Merge 4.14.20 into android-4.14
author Greg Kroah-Hartman <gregkh@google.com>
Sat, 17 Feb 2018 13:54:49 +0000 (14:54 +0100)
committer Greg Kroah-Hartman <gregkh@google.com>
Sat, 17 Feb 2018 13:54:49 +0000 (14:54 +0100)
Changes in 4.14.20
watchdog: indydog: Add dependency on SGI_HAS_INDYDOG
powerpc/pseries: include linux/types.h in asm/hvcall.h
cifs: Fix missing put_xid in cifs_file_strict_mmap
cifs: Fix autonegotiate security settings mismatch
CIFS: zero sensitive data when freeing
cpufreq: mediatek: add mediatek related projects into blacklist
dmaengine: dmatest: fix container_of member in dmatest_callback
sched/wait: Fix add_wait_queue() behavioral change
watchdog: gpio_wdt: set WDOG_HW_RUNNING in gpio_wdt_stop
arm64: Define cputype macros for Falkor CPU
arm64: Add software workaround for Falkor erratum 1041
KVM MMU: check pending exception before injecting APF
sched/rt: Use container_of() to get root domain in rto_push_irq_work_func()
sched/rt: Up the root domain ref count when passing it around via IPIs
drm/i915: Add .get_hw_state() method for planes
drm/i915: Redo plane sanitation during readout
drm/i915: Fix deadlock in i830_disable_pipe()
dccp: CVE-2017-8824: use-after-free in DCCP code
media: dvb-usb-v2: lmedm04: Improve logic checking of warm start
media: dvb-usb-v2: lmedm04: move ts2020 attach to dm04_lme2510_tuner
media: hdpvr: Fix an error handling path in hdpvr_probe()
arm64: move TASK_* definitions to <asm/processor.h>
arm64: mm: Use non-global mappings for kernel space
arm64: mm: Temporarily disable ARM64_SW_TTBR0_PAN
arm64: mm: Move ASID from TTBR0 to TTBR1
arm64: mm: Remove pre_ttbr0_update_workaround for Falkor erratum #E1003
arm64: mm: Rename post_ttbr0_update_workaround
arm64: mm: Fix and re-enable ARM64_SW_TTBR0_PAN
arm64: mm: Allocate ASIDs in pairs
arm64: mm: Add arm64_kernel_unmapped_at_el0 helper
arm64: mm: Invalidate both kernel and user ASIDs when performing TLBI
arm64: entry: Add exception trampoline page for exceptions from EL0
arm64: mm: Map entry trampoline into trampoline and kernel page tables
arm64: entry: Explicitly pass exception level to kernel_ventry macro
arm64: entry: Hook up entry trampoline to exception vectors
arm64: erratum: Work around Falkor erratum #E1003 in trampoline code
arm64: cpu_errata: Add Kryo to Falkor 1003 errata
arm64: tls: Avoid unconditional zeroing of tpidrro_el0 for native tasks
arm64: entry: Add fake CPU feature for unmapping the kernel at EL0
arm64: kaslr: Put kernel vectors address in separate data page
arm64: use RET instruction for exiting the trampoline
arm64: Kconfig: Add CONFIG_UNMAP_KERNEL_AT_EL0
arm64: Kconfig: Reword UNMAP_KERNEL_AT_EL0 kconfig entry
arm64: Take into account ID_AA64PFR0_EL1.CSV3
arm64: capabilities: Handle duplicate entries for a capability
arm64: mm: Introduce TTBR_ASID_MASK for getting at the ASID in the TTBR
arm64: kpti: Fix the interaction between ASID switching and software PAN
arm64: cputype: Add MIDR values for Cavium ThunderX2 CPUs
arm64: Turn on KPTI only on CPUs that need it
arm64: kpti: Make use of nG dependent on arm64_kernel_unmapped_at_el0()
arm64: mm: Permit transitioning from Global to Non-Global without BBM
arm64: kpti: Add ->enable callback to remap swapper using nG mappings
arm64: Force KPTI to be disabled on Cavium ThunderX
arm64: entry: Reword comment about post_ttbr_update_workaround
arm64: idmap: Use "awx" flags for .idmap.text .pushsection directives
arm64: barrier: Add CSDB macros to control data-value prediction
arm64: Implement array_index_mask_nospec()
arm64: Make USER_DS an inclusive limit
arm64: Use pointer masking to limit uaccess speculation
arm64: entry: Ensure branch through syscall table is bounded under speculation
arm64: uaccess: Prevent speculative use of the current addr_limit
arm64: uaccess: Don't bother eliding access_ok checks in __{get, put}_user
arm64: uaccess: Mask __user pointers for __arch_{clear, copy_*}_user
arm64: futex: Mask __user pointers prior to dereference
arm64: cpufeature: __this_cpu_has_cap() shouldn't stop early
arm64: Run enable method for errata work arounds on late CPUs
arm64: cpufeature: Pass capability structure to ->enable callback
drivers/firmware: Expose psci_get_version through psci_ops structure
arm64: Move post_ttbr_update_workaround to C code
arm64: Add skeleton to harden the branch predictor against aliasing attacks
arm64: Move BP hardening to check_and_switch_context
arm64: KVM: Use per-CPU vector when BP hardening is enabled
arm64: entry: Apply BP hardening for high-priority synchronous exceptions
arm64: entry: Apply BP hardening for suspicious interrupts from EL0
arm64: cputype: Add missing MIDR values for Cortex-A72 and Cortex-A75
arm64: Implement branch predictor hardening for affected Cortex-A CPUs
arm64: Implement branch predictor hardening for Falkor
arm64: Branch predictor hardening for Cavium ThunderX2
arm64: KVM: Increment PC after handling an SMC trap
arm/arm64: KVM: Consolidate the PSCI include files
arm/arm64: KVM: Add PSCI_VERSION helper
arm/arm64: KVM: Add smccc accessors to PSCI code
arm/arm64: KVM: Implement PSCI 1.0 support
arm/arm64: KVM: Advertise SMCCC v1.1
arm64: KVM: Make PSCI_VERSION a fast path
arm/arm64: KVM: Turn kvm_psci_version into a static inline
arm64: KVM: Report SMCCC_ARCH_WORKAROUND_1 BP hardening support
arm64: KVM: Add SMCCC_ARCH_WORKAROUND_1 fast handling
firmware/psci: Expose PSCI conduit
firmware/psci: Expose SMCCC version through psci_ops
arm/arm64: smccc: Make function identifiers an unsigned quantity
arm/arm64: smccc: Implement SMCCC v1.1 inline primitive
arm64: Add ARM_SMCCC_ARCH_WORKAROUND_1 BP hardening support
arm64: Kill PSCI_GET_VERSION as a variant-2 workaround
mtd: cfi: convert inline functions to macros
mtd: nand: brcmnand: Disable prefetch by default
mtd: nand: Fix nand_do_read_oob() return value
mtd: nand: sunxi: Fix ECC strength choice
ubi: Fix race condition between ubi volume creation and udev
ubi: fastmap: Erase outdated anchor PEBs during attach
ubi: block: Fix locking for idr_alloc/idr_remove
ubifs: free the encrypted symlink target
nfs/pnfs: fix nfs_direct_req ref leak when i/o falls back to the mds
nfs41: do not return ENOMEM on LAYOUTUNAVAILABLE
NFS: Add a cond_resched() to nfs_commit_release_pages()
NFS: Fix nfsstat breakage due to LOOKUPP
NFS: commit direct writes even if they fail partially
NFS: reject request for id_legacy key without auxdata
NFS: Fix a race between mmap() and O_DIRECT
kernfs: fix regression in kernfs_fop_write caused by wrong type
ahci: Annotate PCI ids for mobile Intel chipsets as such
ahci: Add PCI ids for Intel Bay Trail, Cherry Trail and Apollo Lake AHCI
ahci: Add Intel Cannon Lake PCH-H PCI ID
crypto: hash - introduce crypto_hash_alg_has_setkey()
crypto: cryptd - pass through absence of ->setkey()
crypto: mcryptd - pass through absence of ->setkey()
crypto: poly1305 - remove ->setkey() method
crypto: hash - annotate algorithms taking optional key
crypto: hash - prevent using keyed hashes without setting key
media: v4l2-ioctl.c: use check_fmt for enum/g/s/try_fmt
media: v4l2-ioctl.c: don't copy back the result for -ENOTTY
media: v4l2-compat-ioctl32.c: add missing VIDIOC_PREPARE_BUF
media: v4l2-compat-ioctl32.c: fix the indentation
media: v4l2-compat-ioctl32.c: move 'helper' functions to __get/put_v4l2_format32
media: v4l2-compat-ioctl32.c: avoid sizeof(type)
media: v4l2-compat-ioctl32.c: copy m.userptr in put_v4l2_plane32
media: v4l2-compat-ioctl32.c: fix ctrl_is_pointer
media: v4l2-compat-ioctl32.c: copy clip list in put_v4l2_window32
media: v4l2-compat-ioctl32.c: drop pr_info for unknown buffer type
media: v4l2-compat-ioctl32.c: don't copy back the result for certain errors
media: v4l2-compat-ioctl32.c: refactor compat ioctl32 logic
media: v4l2-compat-ioctl32.c: make ctrl_is_pointer work for subdevs
crypto: caam - fix endless loop when DECO acquire fails
crypto: sha512-mb - initialize pending lengths correctly
arm: KVM: Fix SMCCC handling of unimplemented SMC/HVC calls
KVM: nVMX: Fix races when sending nested PI while dest enters/leaves L2
KVM: nVMX: Fix bug of injecting L2 exception into L1
KVM: PPC: Book3S HV: Make sure we don't re-enter guest without XIVE loaded
KVM: PPC: Book3S HV: Drop locks before reading guest memory
KVM: arm/arm64: Handle CPU_PM_ENTER_FAILED
KVM: PPC: Book3S PR: Fix broken select due to misspelling
ASoC: rockchip: i2s: fix playback after runtime resume
ASoC: skl: Fix kernel warning due to zero NHTL entry
watchdog: imx2_wdt: restore previous timeout after suspend+resume
Btrfs: raid56: iterate raid56 internal bio with bio_for_each_segment_all
kasan: don't emit builtin calls when sanitization is off
kasan: rework Kconfig settings
media: dvb-frontends: fix i2c access helpers for KASAN
media: ts2020: avoid integer overflows on 32 bit machines
media: cxusb, dib0700: ignore XC2028_I2C_FLUSH
fs/proc/kcore.c: use probe_kernel_read() instead of memcpy()
kernel/async.c: revert "async: simplify lowest_in_progress()"
kernel/relay.c: revert "kernel/relay.c: fix potential memory leak"
pipe: actually allow root to exceed the pipe buffer limits
pipe: fix off-by-one error when checking buffer limits
HID: quirks: Fix keyboard + touchpad on Toshiba Click Mini not working
Bluetooth: btsdio: Do not bind to non-removable BCM43341
Revert "Bluetooth: btusb: fix QCA Rome suspend/resume"
Bluetooth: btusb: Restore QCA Rome suspend/resume fix with a "rewritten" version
ipmi: use dynamic memory for DMI driver override
signal/openrisc: Fix do_unaligned_access to send the proper signal
signal/sh: Ensure si_signo is initialized in do_divide_error
alpha: fix crash if pthread_create races with signal delivery
alpha: osf_sys.c: fix put_tv32 regression
alpha: Fix mixed up args in EXC macro in futex operations
alpha: fix reboot on Avanti platform
alpha: fix formating of stack content
xtensa: fix futex_atomic_cmpxchg_inatomic
EDAC, octeon: Fix an uninitialized variable warning
pinctrl: intel: Initialize GPIO properly when used through irqchip
pinctrl: mcp23s08: fix irq setup order
pinctrl: sx150x: Unregister the pinctrl on release
pinctrl: sx150x: Register pinctrl before adding the gpiochip
pinctrl: sx150x: Add a static gpio/pinctrl pin range mapping
pktcdvd: Fix pkt_setup_dev() error path
pktcdvd: Fix a recently introduced NULL pointer dereference
blk-mq: quiesce queue before freeing queue
clocksource/drivers/stm32: Fix kernel panic with multiple timers
lib/ubsan.c: s/missaligned/misaligned/
lib/ubsan: add type mismatch handler for new GCC/Clang
btrfs: Handle btrfs_set_extent_delalloc failure in fixup worker
objtool: Fix switch-table detection
arm64: dts: marvell: add Ethernet aliases
drm/i915: Avoid PPS HW/SW state mismatch due to rounding
ACPI: sbshc: remove raw pointer from printk() message
acpi, nfit: fix register dimm error handling
ovl: fix failure to fsync lower dir
ovl: take mnt_want_write() for removing impure xattr
mn10300/misalignment: Use SIGSEGV SEGV_MAPERR to report a failed user copy
devpts: fix error handling in devpts_mntget()
ftrace: Remove incorrect setting of glob search field
scsi: core: Ensure that the SCSI error handler gets woken up
rcu: Export init_rcu_head() and destroy_rcu_head() to GPL modules
scsi: lpfc: Fix crash after bad bar setup on driver attachment
scsi: cxlflash: Reset command ioasc
Linux 4.14.20

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
432 files changed:
.gitignore
Documentation/ABI/testing/sysfs-class-dual-role-usb [new file with mode: 0644]
Documentation/ABI/testing/sysfs-fs-f2fs
Documentation/ABI/testing/sysfs-kernel-wakeup_reasons [new file with mode: 0644]
Documentation/admin-guide/kernel-parameters.txt
Documentation/device-mapper/boot.txt [new file with mode: 0644]
Documentation/devicetree/bindings/misc/memory-state-time.txt [new file with mode: 0644]
Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt [new file with mode: 0644]
Documentation/filesystems/proc.txt
Documentation/networking/ip-sysctl.txt
Documentation/scheduler/sched-energy.txt [new file with mode: 0644]
Documentation/scheduler/sched-tune.txt [new file with mode: 0644]
Documentation/sysctl/kernel.txt
Documentation/trace/events-power.txt
Documentation/trace/ftrace.txt
Makefile
arch/arm/Kconfig
arch/arm/Makefile
arch/arm/boot/Makefile
arch/arm/boot/compressed/head.S
arch/arm/boot/dts/Makefile
arch/arm/boot/dts/vexpress-v2p-ca15_a7.dts
arch/arm/common/Kconfig
arch/arm/common/Makefile
arch/arm/common/fiq_glue.S [new file with mode: 0644]
arch/arm/common/fiq_glue_setup.c [new file with mode: 0644]
arch/arm/configs/ranchu_defconfig [new file with mode: 0644]
arch/arm/include/asm/elf.h
arch/arm/include/asm/fiq_glue.h [new file with mode: 0644]
arch/arm/include/asm/topology.h
arch/arm/kernel/kgdb.c
arch/arm/kernel/process.c
arch/arm/kernel/reboot.c
arch/arm/kernel/topology.c
arch/arm/mm/cache-v6.S
arch/arm/mm/fault.c
arch/arm64/Kconfig
arch/arm64/Makefile
arch/arm64/boot/.gitignore
arch/arm64/boot/Makefile
arch/arm64/boot/dts/Makefile
arch/arm64/boot/dts/arm/juno-r2.dts
arch/arm64/boot/dts/arm/juno-sched-energy.dtsi [new file with mode: 0644]
arch/arm64/boot/dts/arm/juno.dts
arch/arm64/boot/dts/hisilicon/hi6220.dtsi
arch/arm64/configs/defconfig
arch/arm64/configs/ranchu64_defconfig [new file with mode: 0644]
arch/arm64/include/asm/elf.h
arch/arm64/include/asm/topology.h
arch/arm64/kernel/io.c
arch/arm64/kernel/process.c
arch/arm64/kernel/topology.c
arch/arm64/kernel/vdso/gettimeofday.S
arch/arm64/mm/dma-mapping.c
arch/arm64/mm/init.c
arch/x86/Makefile
arch/x86/configs/i386_ranchu_defconfig [new file with mode: 0644]
arch/x86/configs/x86_64_ranchu_defconfig [new file with mode: 0644]
build.config.goldfish.arm [new file with mode: 0644]
build.config.goldfish.arm64 [new file with mode: 0644]
build.config.goldfish.mips [new file with mode: 0644]
build.config.goldfish.mips64 [new file with mode: 0644]
build.config.goldfish.x86 [new file with mode: 0644]
build.config.goldfish.x86_64 [new file with mode: 0644]
crypto/api.c
drivers/android/binder.c
drivers/android/binder_alloc.c
drivers/android/binder_alloc.h
drivers/android/binder_trace.h
drivers/base/arch_topology.c
drivers/base/power/main.c
drivers/base/power/wakeup.c
drivers/base/syscore.c
drivers/cpufreq/arm_big_little.c
drivers/cpufreq/cpufreq-dt.c
drivers/cpufreq/cpufreq.c
drivers/cpuidle/cpuidle.c
drivers/cpuidle/governors/menu.c
drivers/dma-buf/dma-fence.c
drivers/dma-buf/sw_sync.c
drivers/input/Kconfig
drivers/input/Makefile
drivers/input/keyboard/goldfish_events.c
drivers/input/keycombo.c [new file with mode: 0644]
drivers/input/keyreset.c [new file with mode: 0644]
drivers/input/misc/Kconfig
drivers/input/misc/Makefile
drivers/input/misc/gpio_axis.c [new file with mode: 0644]
drivers/input/misc/gpio_event.c [new file with mode: 0644]
drivers/input/misc/gpio_input.c [new file with mode: 0644]
drivers/input/misc/gpio_matrix.c [new file with mode: 0644]
drivers/input/misc/gpio_output.c [new file with mode: 0644]
drivers/input/misc/keychord.c [new file with mode: 0644]
drivers/md/Kconfig
drivers/md/Makefile
drivers/md/dm-android-verity.c [new file with mode: 0644]
drivers/md/dm-android-verity.h [new file with mode: 0644]
drivers/md/dm-ioctl.c
drivers/md/dm-linear.c
drivers/md/dm-table.c
drivers/md/dm-verity-fec.c
drivers/md/dm-verity-fec.h
drivers/md/dm-verity-target.c
drivers/md/dm-verity.h
drivers/md/dm.h
drivers/misc/Kconfig
drivers/misc/Makefile
drivers/misc/memory_state_time.c [new file with mode: 0644]
drivers/misc/uid_sys_stats.c [new file with mode: 0644]
drivers/mmc/core/Kconfig
drivers/mmc/core/core.c
drivers/mmc/core/host.c
drivers/mmc/core/mmc.c
drivers/mmc/core/queue.c
drivers/mmc/core/sd.c
drivers/mmc/core/sdio.c
drivers/mmc/core/sdio_bus.c
drivers/net/tun.c
drivers/net/wireless/ti/wlcore/init.c
drivers/nfc/fdp/i2c.c
drivers/nfc/st21nfca/dep.c
drivers/nfc/st21nfca/se.c
drivers/of/fdt.c
drivers/power/supply/power_supply_sysfs.c
drivers/rtc/rtc-palmas.c
drivers/staging/android/Kconfig
drivers/staging/android/Makefile
drivers/staging/android/ashmem.c
drivers/staging/android/fiq_debugger/Kconfig [new file with mode: 0644]
drivers/staging/android/fiq_debugger/Makefile [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger.c [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger.h [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger_arm.c [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger_priv.h [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_watchdog.c [new file with mode: 0644]
drivers/staging/android/fiq_debugger/fiq_watchdog.h [new file with mode: 0644]
drivers/staging/goldfish/Kconfig
drivers/staging/goldfish/Makefile
drivers/staging/goldfish/goldfish_audio.c
drivers/staging/goldfish/goldfish_sync_timeline.c [new file with mode: 0644]
drivers/staging/goldfish/goldfish_sync_timeline_fence.c [new file with mode: 0644]
drivers/staging/goldfish/goldfish_sync_timeline_fence.h [new file with mode: 0644]
drivers/thermal/hisi_thermal.c
drivers/usb/gadget/Kconfig
drivers/usb/gadget/composite.c
drivers/usb/gadget/configfs.c
drivers/usb/gadget/function/Makefile
drivers/usb/gadget/function/f_accessory.c [new file with mode: 0644]
drivers/usb/gadget/function/f_audio_source.c [new file with mode: 0644]
drivers/usb/gadget/function/f_midi.c
drivers/usb/phy/Kconfig
drivers/usb/phy/Makefile
drivers/usb/phy/class-dual-role.c [new file with mode: 0644]
drivers/usb/phy/otg-wakelock.c [new file with mode: 0644]
drivers/video/fbdev/goldfishfb.c
fs/Kconfig
fs/Makefile
fs/attr.c
fs/coredump.c
fs/crypto/Makefile
fs/crypto/crypto.c
fs/crypto/fname.c
fs/crypto/fscrypt_private.h
fs/crypto/hooks.c [new file with mode: 0644]
fs/crypto/keyinfo.c
fs/crypto/policy.c
fs/dcache.c
fs/eventpoll.c
fs/exec.c
fs/ext4/ext4.h
fs/ext4/inline.c
fs/ext4/inode.c
fs/ext4/readpage.c
fs/ext4/super.c
fs/f2fs/acl.c
fs/f2fs/checkpoint.c
fs/f2fs/data.c
fs/f2fs/debug.c
fs/f2fs/dir.c
fs/f2fs/f2fs.h
fs/f2fs/file.c
fs/f2fs/gc.c
fs/f2fs/inline.c
fs/f2fs/inode.c
fs/f2fs/namei.c
fs/f2fs/node.c
fs/f2fs/node.h
fs/f2fs/recovery.c
fs/f2fs/segment.c
fs/f2fs/segment.h
fs/f2fs/shrinker.c
fs/f2fs/super.c
fs/f2fs/sysfs.c
fs/f2fs/xattr.c
fs/fs-writeback.c
fs/fs_struct.c
fs/fuse/dev.c
fs/fuse/dir.c
fs/fuse/fuse_i.h
fs/inode.c
fs/internal.h
fs/mpage.c
fs/namei.c
fs/namespace.c
fs/notify/fanotify/fanotify_user.c
fs/notify/inotify/inotify_user.c
fs/open.c
fs/pnode.c
fs/pnode.h
fs/proc/task_mmu.c
fs/proc_namespace.c
fs/pstore/ram.c
fs/read_write.c
fs/sdcardfs/Kconfig [new file with mode: 0644]
fs/sdcardfs/Makefile [new file with mode: 0644]
fs/sdcardfs/dentry.c [new file with mode: 0644]
fs/sdcardfs/derived_perm.c [new file with mode: 0644]
fs/sdcardfs/file.c [new file with mode: 0644]
fs/sdcardfs/inode.c [new file with mode: 0644]
fs/sdcardfs/lookup.c [new file with mode: 0644]
fs/sdcardfs/main.c [new file with mode: 0644]
fs/sdcardfs/mmap.c [new file with mode: 0644]
fs/sdcardfs/multiuser.h [new file with mode: 0644]
fs/sdcardfs/packagelist.c [new file with mode: 0644]
fs/sdcardfs/sdcardfs.h [new file with mode: 0644]
fs/sdcardfs/super.c [new file with mode: 0644]
fs/squashfs/Kconfig
fs/squashfs/Makefile
fs/squashfs/block.c
fs/squashfs/cache.c
fs/squashfs/decompressor.c
fs/squashfs/file.c
fs/squashfs/file_cache.c [deleted file]
fs/squashfs/file_direct.c
fs/squashfs/lz4_wrapper.c
fs/squashfs/lzo_wrapper.c
fs/squashfs/page_actor.c
fs/squashfs/page_actor.h
fs/squashfs/squashfs.h
fs/squashfs/squashfs_fs_sb.h
fs/squashfs/super.c
fs/squashfs/xz_wrapper.c
fs/squashfs/zlib_wrapper.c
fs/super.c
fs/sync.c
fs/ubifs/crypto.c
fs/ubifs/ioctl.c
fs/ubifs/super.c
fs/ubifs/ubifs.h
fs/ubifs/xattr.c
fs/userfaultfd.c
fs/utimes.c
fs/xattr.c
include/linux/amba/mmci.h
include/linux/android_aid.h [new file with mode: 0644]
include/linux/arch_topology.h
include/linux/bpf.h
include/linux/cgroup_subsys.h
include/linux/cpufreq.h
include/linux/cpuidle.h
include/linux/crypto.h
include/linux/dcache.h
include/linux/device-mapper.h
include/linux/dma-fence.h
include/linux/f2fs_fs.h
include/linux/fs.h
include/linux/fscrypt.h [new file with mode: 0644]
include/linux/fscrypt_common.h [deleted file]
include/linux/fscrypt_notsupp.h
include/linux/fscrypt_supp.h
include/linux/fsnotify.h
include/linux/ftrace.h
include/linux/gpio_event.h [new file with mode: 0644]
include/linux/initramfs.h [new file with mode: 0644]
include/linux/ipv6.h
include/linux/keychord.h [new file with mode: 0644]
include/linux/keycombo.h [new file with mode: 0644]
include/linux/keyreset.h [new file with mode: 0644]
include/linux/lsm_hooks.h
include/linux/memory-state-time.h [new file with mode: 0644]
include/linux/mm.h
include/linux/mm_types.h
include/linux/mmc/host.h
include/linux/mmc/pm.h
include/linux/mmc/sdio_func.h
include/linux/mount.h
include/linux/namei.h
include/linux/netfilter/xt_qtaguid.h [new file with mode: 0644]
include/linux/netfilter/xt_quota2.h [new file with mode: 0644]
include/linux/of_fdt.h
include/linux/perf_event.h
include/linux/power_supply.h
include/linux/pstore_ram.h
include/linux/sched.h
include/linux/sched/cpufreq.h
include/linux/sched/sysctl.h
include/linux/sched/topology.h
include/linux/sched/wake_q.h
include/linux/sched/xacct.h
include/linux/sched_energy.h [new file with mode: 0644]
include/linux/security.h
include/linux/suspend.h
include/linux/task_io_accounting.h
include/linux/task_io_accounting_ops.h
include/linux/usb/class-dual-role.h [new file with mode: 0644]
include/linux/usb/composite.h
include/linux/usb/f_accessory.h [new file with mode: 0644]
include/linux/wakeup_reason.h [new file with mode: 0644]
include/net/addrconf.h
include/net/tcp.h
include/trace/events/android_fs.h [new file with mode: 0644]
include/trace/events/android_fs_template.h [new file with mode: 0644]
include/trace/events/f2fs.h
include/trace/events/gpu.h [new file with mode: 0644]
include/trace/events/net.h
include/trace/events/power.h
include/trace/events/preemptirq.h [new file with mode: 0644]
include/trace/events/sched.h
include/uapi/linux/android/binder.h
include/uapi/linux/bpf.h
include/uapi/linux/fuse.h
include/uapi/linux/ipv6.h
include/uapi/linux/keychord.h [new file with mode: 0644]
include/uapi/linux/magic.h
include/uapi/linux/netfilter/xt_IDLETIMER.h
include/uapi/linux/prctl.h
include/uapi/linux/usb/f_accessory.h [new file with mode: 0644]
init/Kconfig
init/Makefile
init/do_mounts.c
init/do_mounts.h
init/do_mounts_dm.c [new file with mode: 0644]
init/initramfs.c
init/noinitramfs.c
ipc/mqueue.c
kernel/bpf/arraymap.c
kernel/bpf/devmap.c
kernel/bpf/hashtab.c
kernel/bpf/inode.c
kernel/bpf/lpm_trie.c
kernel/bpf/sockmap.c
kernel/bpf/stackmap.c
kernel/bpf/syscall.c
kernel/cgroup/cgroup-v1.c
kernel/cgroup/cpuset.c
kernel/configs/android-fetch-configs.sh [new file with mode: 0755]
kernel/cpu.c
kernel/debug/kdb/kdb_io.c
kernel/events/core.c
kernel/power/Makefile
kernel/power/process.c
kernel/power/suspend.c
kernel/power/wakeup_reason.c [new file with mode: 0644]
kernel/sched/Makefile
kernel/sched/autogroup.c
kernel/sched/autogroup.h
kernel/sched/core.c
kernel/sched/cpufreq_schedutil.c
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/debug.c
kernel/sched/energy.c [new file with mode: 0644]
kernel/sched/fair.c
kernel/sched/features.h
kernel/sched/idle.c
kernel/sched/idle_task.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c
kernel/sched/topology.c
kernel/sched/tune.c [new file with mode: 0644]
kernel/sched/tune.h [new file with mode: 0644]
kernel/sched/walt.c [new file with mode: 0644]
kernel/sched/walt.h [new file with mode: 0644]
kernel/sys.c
kernel/sysctl.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/gpu-traces.c [new file with mode: 0644]
kernel/trace/trace_functions_graph.c
kernel/trace/trace_irqsoff.c
kernel/trace/trace_output.c
mm/madvise.c
mm/mempolicy.c
mm/mlock.c
mm/mmap.c
mm/mprotect.c
mm/shmem.c
net/Kconfig
net/bluetooth/af_bluetooth.c
net/ipv4/Makefile
net/ipv4/af_inet.c
net/ipv4/netfilter/nf_socket_ipv4.c
net/ipv4/sysctl_net_ipv4.c
net/ipv4/sysfs_net_ipv4.c [new file with mode: 0644]
net/ipv4/tcp_input.c
net/ipv4/tcp_output.c
net/ipv6/addrconf.c
net/ipv6/af_inet6.c
net/ipv6/exthdrs_core.c
net/ipv6/netfilter/nf_socket_ipv6.c
net/ipv6/route.c
net/netfilter/Kconfig
net/netfilter/Makefile
net/netfilter/xt_IDLETIMER.c
net/netfilter/xt_bpf.c
net/netfilter/xt_qtaguid.c [new file with mode: 0644]
net/netfilter/xt_qtaguid_internal.h [new file with mode: 0644]
net/netfilter/xt_qtaguid_print.c [new file with mode: 0644]
net/netfilter/xt_qtaguid_print.h [new file with mode: 0644]
net/netfilter/xt_quota2.c [new file with mode: 0644]
net/netfilter/xt_socket.c
net/nfc/hci/core.c
net/rfkill/Kconfig
net/rfkill/core.c
net/wireless/scan.c
net/xfrm/xfrm_algo.c
net/xfrm/xfrm_policy.c
net/xfrm/xfrm_state.c
net/xfrm/xfrm_user.c
scripts/Makefile.clean
scripts/Makefile.lib
scripts/Makefile.modinst
security/Kconfig
security/commoncap.c
security/inode.c
security/security.c
security/selinux/hooks.c
security/selinux/include/classmap.h
security/selinux/include/objsec.h

index f6050b88e95b5b59e2c08dbef865d92049956e58..be92dfa89957c112c7231d3b3c637cd46736e354 100644 (file)
@@ -122,3 +122,6 @@ all.config
 
 # Kdevelop4
 *.kdev4
+
+# fetched Android config fragments
+kernel/configs/android-*.cfg
diff --git a/Documentation/ABI/testing/sysfs-class-dual-role-usb b/Documentation/ABI/testing/sysfs-class-dual-role-usb
new file mode 100644 (file)
index 0000000..a900fd7
--- /dev/null
@@ -0,0 +1,71 @@
+What:          /sys/class/dual_role_usb/.../
+Date:          June 2015
+Contact:       Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               Provide a generic interface to monitor and change
+               the state of dual role usb ports. The name here
+               refers to the name mentioned in the
+               dual_role_phy_desc that is passed while registering
+               the dual_role_phy_instance through
+               devm_dual_role_instance_register.
+
+What:           /sys/class/dual_role_usb/.../supported_modes
+Date:           June 2015
+Contact:        Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               This is a static node; once initialized, it is
+               not expected to change during runtime. "dfp"
+               refers to "downstream facing port" i.e. port can
+               only act as host. "ufp" refers to "upstream
+               facing port" i.e. port can only act as device.
+               "dfp ufp" refers to "dual role port" i.e. the port
+               can either be a host port or a device port.
+
+What:          /sys/class/dual_role_usb/.../mode
+Date:          June 2015
+Contact:       Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               The mode node refers to the current mode in which the
+               port is operating. "dfp" for host ports. "ufp" for device
+               ports and "none" when cable is not connected.
+
+               On devices where the USB mode is software-controllable,
+               userspace can change the mode by writing "dfp" or "ufp".
+               On devices where the USB mode is fixed in hardware,
+               this attribute is read-only.
+
+What:          /sys/class/dual_role_usb/.../power_role
+Date:          June 2015
+Contact:       Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               The power_role node indicates whether the port
+               is "sink"ing or "source"ing power, or "none" if
+               the port is not connected.
+
+               On devices implementing USB Power Delivery,
+               userspace can control the power role by writing "sink" or
+               "source". On devices without USB-PD, this attribute is
+               read-only.
+
+What:          /sys/class/dual_role_usb/.../data_role
+Date:          June 2015
+Contact:       Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               The data_role node mentions whether the port
+               is acting as "host" or "device" for USB data connection.
+               "none" if there is no active data link.
+
+               On devices implementing USB Power Delivery, userspace
+               can control the data role by writing "host" or "device".
+               On devices without USB-PD, this attribute is read-only.
+
+What:          /sys/class/dual_role_usb/.../powers_vconn
+Date:          June 2015
+Contact:       Badhri Jagan Sridharan<badhri@google.com>
+Description:
+               The powers_vconn node indicates whether the port
+               is supplying power to the VCONN pin.
+
+               On devices with software control of VCONN,
+               userspace can disable the power supply to VCONN by writing "n",
+               or enable the power supply by writing "y".
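To illustrate how userspace drives this ABI, the following minimal C
sketch switches a software-controllable port into host mode. The port
name "otg_default" is hypothetical; real names come from the
dual_role_phy_desc passed at registration.

	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		/* Path is illustrative; the last directory is the registered port name. */
		const char *path = "/sys/class/dual_role_usb/otg_default/mode";
		const char *mode = "dfp";	/* "dfp" = host, "ufp" = device, per the ABI above */
		int fd = open(path, O_WRONLY);

		if (fd < 0) {
			perror("open");
			return 1;
		}
		if (write(fd, mode, strlen(mode)) < 0)
			perror("write");	/* expected to fail where the mode is fixed in hardware */
		close(fd);
		return 0;
	}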
index 11b7f4ebea7c4b6a04d2ce3894ff1a8d9907e60a..a7799c2fca2855eced4740cb6a82e5f26a448287 100644 (file)
@@ -51,6 +51,18 @@ Description:
                 Controls the dirty page count condition for the in-place-update
                 policies.
 
+What:          /sys/fs/f2fs/<disk>/min_hot_blocks
+Date:          March 2017
+Contact:       "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+                Controls the dirty page count condition for redefining hot data.
+
+What:          /sys/fs/f2fs/<disk>/min_ssr_sections
+Date:          October 2017
+Contact:       "Chao Yu" <yuchao0@huawei.com>
+Description:
+                Controls the free section threshold to trigger SSR allocation.
+
 What:          /sys/fs/f2fs/<disk>/max_small_discards
 Date:          November 2013
 Contact:       "Jaegeuk Kim" <jaegeuk.kim@samsung.com>
@@ -102,6 +114,12 @@ Contact:   "Jaegeuk Kim" <jaegeuk@kernel.org>
 Description:
                 Controls the idle timing.
 
+What:          /sys/fs/f2fs/<disk>/iostat_enable
+Date:          August 2017
+Contact:       "Chao Yu" <yuchao0@huawei.com>
+Description:
+                Controls whether IO statistics collection is enabled or disabled.
+
 What:          /sys/fs/f2fs/<disk>/ra_nid_pages
 Date:          October 2015
 Contact:       "Chao Yu" <chao2.yu@samsung.com>
@@ -122,6 +140,12 @@ Contact:   "Shuoran Liu" <liushuoran@huawei.com>
 Description:
                 Shows total written kbytes issued to disk.
 
+What:          /sys/fs/f2fs/<disk>/feature
+Date:          July 2017
+Contact:       "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description:
+                Shows all enabled features in the current device.
+
 What:          /sys/fs/f2fs/<disk>/inject_rate
 Date:          May 2016
 Contact:       "Sheng Yong" <shengyong1@huawei.com>
@@ -138,7 +162,18 @@ What:              /sys/fs/f2fs/<disk>/reserved_blocks
 Date:          June 2017
 Contact:       "Chao Yu" <yuchao0@huawei.com>
 Description:
-                Controls current reserved blocks in system.
+                Controls the target number of reserved blocks in the system.
+                The threshold is soft: it may exceed the currently available
+                user space.
+
+What:          /sys/fs/f2fs/<disk>/current_reserved_blocks
+Date:          October 2017
+Contact:       "Yunlong Song" <yunlong.song@huawei.com>
+Contact:       "Chao Yu" <yuchao0@huawei.com>
+Description:
+                Shows the current number of reserved blocks in the system. It may
+                temporarily be smaller than target_reserved_blocks, but it will
+                gradually increase to target_reserved_blocks as more free blocks
+                are freed by the user.
 
 What:          /sys/fs/f2fs/<disk>/gc_urgent
 Date:          August 2017
diff --git a/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons b/Documentation/ABI/testing/sysfs-kernel-wakeup_reasons
new file mode 100644 (file)
index 0000000..acb19b9
--- /dev/null
@@ -0,0 +1,16 @@
+What:          /sys/kernel/wakeup_reasons/last_resume_reason
+Date:          February 2014
+Contact:       Ruchi Kandoi <kandoiruchi@google.com>
+Description:
+               The /sys/kernel/wakeup_reasons/last_resume_reason file is
+               used to report the wakeup reasons after the system exits suspend.
+
+What:          /sys/kernel/wakeup_reasons/last_suspend_time
+Date:          March 2015
+Contact:       jinqian <jinqian@google.com>
+Description:
+               The /sys/kernel/wakeup_reasons/last_suspend_time file is
+               used to report the time spent in the last suspend cycle. It
+               contains two numbers (in seconds) separated by a space. The
+               first number is the time spent in the suspend and resume
+               processes; the second number is the time spent in the sleep state.
\ No newline at end of file
index c76afdcafbef0d6d2ddcc2be0683bfeef15f6275..1b5fd9d2586946dda31d5aed3e37249c1f7f2548 100644 (file)
 
        dis_ucode_ldr   [X86] Disable the microcode loader.
 
+       dm=             [DM] Allows early creation of a device-mapper device.
+                       See Documentation/device-mapper/boot.txt.
+
        dma_debug=off   If the kernel is compiled with DMA_API_DEBUG support,
                        this option disables the debugging code at boot.
 
diff --git a/Documentation/device-mapper/boot.txt b/Documentation/device-mapper/boot.txt
new file mode 100644 (file)
index 0000000..adcaad5
--- /dev/null
@@ -0,0 +1,42 @@
+Boot time creation of mapped devices
+===================================
+
+It is possible to configure a device mapper device to act as the root
+device for your system in two ways.
+
+The first is to build an initial ramdisk which boots to a minimal
+userspace which configures the device, then pivot_root(8) in to it.
+
+For simple device mapper configurations, it is possible to boot directly
+using the following kernel command line:
+
+dm="<name> <uuid> <ro>,table line 1,...,table line n"
+
+name = the name to associate with the device
+       after boot, udev, if used, will use that name to label
+       the device node.
+uuid = may be 'none' or the UUID desired for the device.
+ro = may be "ro" or "rw".  If "ro", the device and device table will be
+       marked read-only.
+
+Each table line may be as normal when using the dmsetup tool except for
+two variations:
+1. Any use of commas will be interpreted as a newline
+2. Quotation marks cannot be escaped and cannot be used without
+   terminating the dm= argument.
+
+Unless renamed by udev, the device node created will be dm-0 as the
+first minor number for the device-mapper is used during early creation.
+
+Example
+=======
+
+- Booting to a linear array made up of user-mode linux block devices:
+
+  dm="lroot none 0, 0 4096 linear 98:16 0, 4096 4096 linear 98:32 0" \
+  root=/dev/dm-0
+
+This will boot to a rw dm-linear target of 8192 sectors split across two
+block devices identified by their major:minor numbers.  After boot, udev
+will rename this target to /dev/mapper/lroot (depending on the rules).
+No uuid was assigned.
diff --git a/Documentation/devicetree/bindings/misc/memory-state-time.txt b/Documentation/devicetree/bindings/misc/memory-state-time.txt
new file mode 100644 (file)
index 0000000..c99a506
--- /dev/null
@@ -0,0 +1,8 @@
+Memory bandwidth and frequency state tracking
+
+Required properties:
+- compatible : should be:
+       "memory-state-time"
+- freq-tbl: Should contain entries with each frequency in Hz.
+- bw-buckets: Should contain upper-bound limits for each bandwidth bucket in Mbps.
+       Must match the framework power_profile.xml for the device.
diff --git a/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt b/Documentation/devicetree/bindings/scheduler/sched-energy-costs.txt
new file mode 100644 (file)
index 0000000..2ceb202
--- /dev/null
@@ -0,0 +1,378 @@
+===========================================================
+Energy cost bindings for Energy Aware Scheduling
+===========================================================
+
+===========================================================
+1 - Introduction
+===========================================================
+
+This note specifies bindings required for energy-aware scheduling
+(EAS)[1]. Historically, the scheduler's primary objective has been
+performance.  EAS aims to provide an alternative objective - energy
+efficiency. EAS relies on a simple platform energy cost model to
+guide scheduling decisions.  The model only considers the CPU
+subsystem.
+
+This note is aligned with the definition of the layout of physical
+CPUs in the system as described in the ARM topology binding
+description [2]. The concept is applicable to any system so long as
+the cost model data is provided for those processing elements in
+that system's topology that EAS is required to service.
+
+Processing elements refer to hardware threads, CPUs and clusters of
+related CPUs in increasing order of hierarchy.
+
+EAS requires two key cost metrics - busy costs and idle costs. Busy
+costs consist of a list of compute capacities for the processing
+element in question and the corresponding power consumption at that
+capacity.  Idle costs consist of a list of power consumption values
+for each idle state [C-state] that the processing element supports.
+For a detailed description of these metrics, their derivation and
+their use see [3].
+
+These cost metrics are required for processing elements in all
+scheduling domain levels that EAS is required to service.
+
+===========================================================
+2 - energy-costs node
+===========================================================
+
+Energy costs for the processing elements in scheduling domains that
+EAS is required to service are defined in the energy-costs node
+which acts as a container for the actual per processing element cost
+nodes. A single energy-costs node is required for a given system.
+
+- energy-costs node
+
+       Usage: Required
+
+       Description: The energy-costs node is a container node and
+       its sub-nodes describe costs for each processing element at
+       all scheduling domain levels that EAS is required to
+       service.
+
+       Node name must be "energy-costs".
+
+       The energy-costs node's parent node must be the cpus node.
+
+       The energy-costs node's child nodes can be:
+
+       - one or more cost nodes.
+
+       Any other configuration is considered invalid.
+
+The energy-costs node can only contain a single type of child node
+whose bindings are described in paragraph 4.
+
+===========================================================
+3 - energy-costs node child nodes naming convention
+===========================================================
+
+energy-costs child nodes must follow a naming convention where the
+node name must be "thread-costN", "core-costN", "cluster-costN" or
+"system-costN" depending on whether the costs in the node are for a
+thread, core, cluster or the system as a whole.  N (where
+N = {0, 1, ...}) is the node number and has no bearing on the OS's
+logical thread, core or cluster index.
+
+===========================================================
+4 - cost node bindings
+===========================================================
+
+Bindings for cost nodes are defined as follows:
+
+- system-cost node
+
+       Description: Optional. Must be declared within an energy-costs
+       node. A system should contain no more than one system-cost node.
+
+       Systems with no modelled system cost should not provide this
+       node.
+
+       The system-cost node name must be "system-costN" as
+       described in 3 above.
+
+       A system-cost node must be a leaf node with no children.
+
+       Properties for system-cost nodes are described in paragraph
+       5 below.
+
+       Any other configuration is considered invalid.
+
+- cluster-cost node
+
+       Description: must be declared within an energy-costs node. A
+       system can contain multiple clusters and each cluster
+       serviced by EAS must have a corresponding cluster-costs
+       node.
+
+       The cluster-cost node name must be "cluster-costN" as
+       described in 3 above.
+
+       A cluster-cost node must be a leaf node with no children.
+
+       Properties for cluster-cost nodes are described in paragraph
+       5 below.
+
+       Any other configuration is considered invalid.
+
+- core-cost node
+
+       Description: must be declared within an energy-costs node. A
+       system can contain multiple cores and each core serviced by
+       EAS must have a corresponding core-cost node.
+
+       The core-cost node name must be "core-costN" as described in
+       3 above.
+
+       A core-cost node must be a leaf node with no children.
+
+       Properties for core-cost nodes are described in paragraph
+       5 below.
+
+       Any other configuration is considered invalid.
+
+- thread-cost node
+
+       Description: must be declared within an energy-costs node. A
+       system can contain cores with multiple hardware threads and
+       each thread serviced by EAS must have a corresponding
+       thread-cost node.
+
+       The thread-cost node name must be "thread-costN" as described in
+       3 above.
+
+       A thread-cost node must be a leaf node with no children.
+
+       Properties for thread-cost nodes are described in paragraph
+       5 below.
+
+       Any other configuration is considered invalid.
+
+===========================================================
+5 - Cost node properties
+==========================================================
+
+All cost node types must have only the following properties:
+
+- busy-cost-data
+
+       Usage: required
+       Value type: An array of 2-item tuples. Each item is of type
+       u32.
+       Definition: The first item in the tuple is the capacity
+       value as described in [3]. The second item in the tuple is
+       the energy cost value as described in [3].
+
+- idle-cost-data
+
+       Usage: required
+       Value type: An array of 1-item tuples. The item is of type
+       u32.
+       Definition: The item in the tuple is the energy cost value
+       as described in [3].
+
+===========================================================
+6 - Extensions to the cpu node
+===========================================================
+
+The cpu node is extended with a property that establishes the
+connection between the processing element represented by the cpu
+node and the cost-nodes associated with this processing element.
+
+The connection is expressed in line with the topological hierarchy
+that this processing element belongs to starting with the level in
+the hierarchy that this processing element itself belongs to through
+to the highest level that EAS is required to service.  The
+connection cannot be sparse and must be contiguous from the
+processing element's level through to the highest desired level. The
+highest desired level must be the same for all processing elements.
+
+Example: Given that a cpu node may represent a thread that is a part
+of a core, this property may contain multiple elements which
+associate the thread with cost nodes describing the costs for the
+thread itself, the core the thread belongs to, the cluster the core
+belongs to and so on. The elements must be ordered from the lowest
+level nodes to the highest desired level that EAS must service. The
+highest desired level must be the same for all cpu nodes. The
+elements must not be sparse: there must be elements for the current
+thread, the next level of hierarchy (core) and so on without any
+'holes'.
+
+Example: Given that a cpu node may represent a core that is a part
+of a cluster of related cpus this property may contain multiple
+elements which associate the core with cost nodes describing the
+costs for the core itself, the cluster the core belongs to and so
+on. The elements must be ordered from the lowest level nodes to the
+highest desired level that EAS must service. The highest desired
+level must be the same for all cpu nodes. The elements must not be
+sparse: there must be elements for the current thread, the next
+level of hierarchy (core) and so on without any 'holes'.
+
+If the system comprises hierarchical clusters of clusters, this
+property will contain multiple associations with the relevant number
+of cluster elements in hierarchical order.
+
+Property added to the cpu node:
+
+- sched-energy-costs
+
+       Usage: required
+       Value type: List of phandles
+       Definition: a list of phandles to specific cost nodes in the
+       energy-costs parent node that correspond to the processing
+       element represented by this cpu node in hierarchical order
+       of topology.
+
+       The order of phandles in the list is significant. The first
+       phandle is to the current processing element's own cost
+       node.  Subsequent phandles are to higher hierarchical level
+       cost nodes up until the maximum level that EAS is to
+       service.
+
+       All cpu nodes must have the same highest level cost node.
+
+       The phandle list must not be sparsely populated with handles
+       to non-contiguous hierarchical levels. See commentary above
+       for clarity.
+
+       Any other configuration is invalid.
+
+===========================================================
+7 - Example dts
+===========================================================
+
+Example 1 (ARM 64-bit, 6-cpu system, two clusters of cpus, one
+cluster of 2 Cortex-A57 cpus, one cluster of 4 Cortex-A53 cpus):
+
+cpus {
+       #address-cells = <2>;
+       #size-cells = <0>;
+       .
+       .
+       .
+       A57_0: cpu@0 {
+               compatible = "arm,cortex-a57","arm,armv8";
+               reg = <0x0 0x0>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A57_L2>;
+               clocks = <&scpi_dvfs 0>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+       };
+
+       A57_1: cpu@1 {
+               compatible = "arm,cortex-a57","arm,armv8";
+               reg = <0x0 0x1>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A57_L2>;
+               clocks = <&scpi_dvfs 0>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
+       };
+
+       A53_0: cpu@100 {
+               compatible = "arm,cortex-a53","arm,armv8";
+               reg = <0x0 0x100>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A53_L2>;
+               clocks = <&scpi_dvfs 1>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+       };
+
+       A53_1: cpu@101 {
+               compatible = "arm,cortex-a53","arm,armv8";
+               reg = <0x0 0x101>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A53_L2>;
+               clocks = <&scpi_dvfs 1>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+       };
+
+       A53_2: cpu@102 {
+               compatible = "arm,cortex-a53","arm,armv8";
+               reg = <0x0 0x102>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A53_L2>;
+               clocks = <&scpi_dvfs 1>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+       };
+
+       A53_3: cpu@103 {
+               compatible = "arm,cortex-a53","arm,armv8";
+               reg = <0x0 0x103>;
+               device_type = "cpu";
+               enable-method = "psci";
+               next-level-cache = <&A53_L2>;
+               clocks = <&scpi_dvfs 1>;
+               cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+               sched-energy-costs = <&CPU_COST_1 &CLUSTER_COST_1>;
+       };
+
+       energy-costs {
+               CPU_COST_0: core-cost0 {
+                       busy-cost-data = <
+                               417   168
+                               579   251
+                               744   359
+                               883   479
+                               1024  616
+                       >;
+                       idle-cost-data = <
+                               15
+                               0
+                       >;
+               };
+               CPU_COST_1: core-cost1 {
+                       busy-cost-data = <
+                               235 33
+                               302 46
+                               368 61
+                               406 76
+                               447 93
+                       >;
+                       idle-cost-data = <
+                               6
+                               0
+                       >;
+               };
+               CLUSTER_COST_0: cluster-cost0 {
+                       busy-cost-data = <
+                               417   24
+                               579   32
+                               744   43
+                               883   49
+                               1024  64
+                       >;
+                       idle-cost-data = <
+                               65
+                               24
+                       >;
+               };
+               CLUSTER_COST_1: cluster-cost1 {
+                       busy-cost-data = <
+                               235 26
+                               303 30
+                               368 39
+                               406 47
+                               447 57
+                       >;
+                       idle-cost-data = <
+                               56
+                               17
+                       >;
+               };
+       };
+};
+
+===============================================================================
+[1] https://lkml.org/lkml/2015/5/12/728
+[2] Documentation/devicetree/bindings/topology.txt
+[3] Documentation/scheduler/sched-energy.txt
index adba21b5ada7b53fb9d0811039afba37141ed099..99ca8e30a4cad4bd93a641e0154f7dccb63247e0 100644 (file)
@@ -396,6 +396,8 @@ is not associated with a file:
  [stack]                  = the stack of the main process
  [vdso]                   = the "virtual dynamic shared object",
                             the kernel system call handler
+ [anon:<name>]            = an anonymous mapping that has been
+                            named by userspace
 
  or if empty, the mapping is anonymous.
 
@@ -424,6 +426,7 @@ KernelPageSize:        4 kB
 MMUPageSize:           4 kB
 Locked:                0 kB
 VmFlags: rd ex mr mw me dw
+Name:           name from userspace
 
 the first of these lines shows the same information as is displayed for the
 mapping in /proc/PID/maps.  The remaining lines show the size of the mapping
@@ -496,6 +499,9 @@ Note that there is no guarantee that every flag and associated mnemonic will
 be present in all further kernel releases. Things get changed, the flags may
 be vanished or the reverse -- new added.
 
+The "Name" field will only be present on a mapping that has been named by
+userspace, and will show the name passed in by userspace.
+
 This file is only present if the CONFIG_MMU kernel configuration option is
 enabled.
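The "[anon:<name>]" entries and the "Name:" smaps field come from the
Android change that lets userspace name anonymous VMAs (see the
include/uapi/linux/prctl.h change in the file list above). A minimal
sketch, assuming the PR_SET_VMA / PR_SET_VMA_ANON_NAME values from that
patch:

	#include <stdio.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>

	/* Assumed from the Android prctl.h patch; not part of the stock 4.14 uapi. */
	#ifndef PR_SET_VMA
	#define PR_SET_VMA		0x53564d41
	#define PR_SET_VMA_ANON_NAME	0
	#endif

	int main(void)
	{
		size_t len = 4096;
		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		if (p == MAP_FAILED)
			return 1;
		/* A string literal keeps the name valid for the life of the mapping. */
		if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
			  (unsigned long)p, len, (unsigned long)"my-heap"))
			perror("prctl");
		/* The region then appears as [anon:my-heap] in /proc/PID/maps. */
		return 0;
	}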
 
index 77f4de59dc9ceb3cdb36692d1ea41e1d861468b0..b0930d4b099b53a361817ce88d9983cb5429c905 100644 (file)
@@ -608,6 +608,16 @@ tcp_fastopen_blackhole_timeout_sec - INTEGER
        initial value when the blackhole issue goes away.
        By default, it is set to 1hr.
 
+tcp_fwmark_accept - BOOLEAN
+       If set, incoming connections to listening sockets that do not have a
+       socket mark will set the mark of the accepting socket to the fwmark of
+       the incoming SYN packet. This will cause all packets on that connection
+       (starting from the first SYNACK) to be sent with that fwmark. The
+       listening socket's mark is unchanged. Listening sockets that already
+       have a fwmark set via setsockopt(SOL_SOCKET, SO_MARK, ...) are
+       unaffected.
+       Default: 0
+
 tcp_syn_retries - INTEGER
        Number of times initial SYNs for an active TCP connection attempt
        will be retransmitted. Should not be higher than 127. Default value
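As a hedged sketch of the setsockopt() escape hatch mentioned for
tcp_fwmark_accept above, a listening socket that wants to keep its own
fwmark rather than inherit one from incoming SYNs can set SO_MARK
explicitly (the mark value 7 is arbitrary, and SO_MARK requires
CAP_NET_ADMIN):

	#include <stdio.h>
	#include <sys/socket.h>

	#ifndef SO_MARK
	#define SO_MARK 36	/* asm-generic value; may differ on some architectures */
	#endif

	/* Create a listener whose mark tcp_fwmark_accept will leave untouched. */
	int make_marked_listener(void)
	{
		unsigned int mark = 7;
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;
		if (setsockopt(fd, SOL_SOCKET, SO_MARK, &mark, sizeof(mark)) < 0)
			perror("setsockopt(SO_MARK)");
		/* bind() and listen() follow as usual. */
		return fd;
	}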
diff --git a/Documentation/scheduler/sched-energy.txt b/Documentation/scheduler/sched-energy.txt
new file mode 100644 (file)
index 0000000..dab2f90
--- /dev/null
@@ -0,0 +1,362 @@
+Energy cost model for energy-aware scheduling (EXPERIMENTAL)
+
+Introduction
+=============
+
+The basic energy model uses platform energy data stored in sched_group_energy
+data structures attached to the sched_groups in the sched_domain hierarchy. The
+energy cost model offers two functions that can be used to guide scheduling
+decisions:
+
+1.     static unsigned int sched_group_energy(struct energy_env *eenv)
+2.     static int energy_diff(struct energy_env *eenv)
+
+sched_group_energy() estimates the energy consumed by all cpus in a specific
+sched_group including any shared resources owned exclusively by this group of
+cpus. Resources shared with other cpus are excluded (e.g. later level caches).
+
+energy_diff() estimates the total energy impact of a utilization change. That
+is, adding, removing, or migrating utilization (tasks).
+
+Both functions use a struct energy_env to specify the scenario to be evaluated:
+
+       struct energy_env {
+               struct sched_group      *sg_top;
+               struct sched_group      *sg_cap;
+               int                     cap_idx;
+               int                     util_delta;
+               int                     src_cpu;
+               int                     dst_cpu;
+               int                     energy;
+       };
+
+sg_top: sched_group to be evaluated. Not used by energy_diff().
+
+sg_cap: sched_group covering the cpus in the same frequency domain. Set by
+sched_group_energy().
+
+cap_idx: Capacity state to be used for energy calculations. Set by
+find_new_capacity().
+
+util_delta: Amount of utilization to be added, removed, or migrated.
+
+src_cpu: Source cpu from where 'util_delta' utilization is removed. Should be
+-1 if no source (e.g. task wake-up).
+
+dst_cpu: Destination cpu where 'util_delta' utilization is added. Should be -1
+if utilization is removed (e.g. terminating tasks).
+
+energy: Result of sched_group_energy().
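Both functions are static to the scheduler, so the struct is only ever
filled in by kernel-internal code. As an illustrative sketch (the
wrapper name is made up and the interpretation of the return value is
not specified here), evaluating a task migration could look like:

	/* Sketch of scheduler-internal usage; relies on the definitions above. */
	static int energy_diff_for_migration(int src_cpu, int dst_cpu, int util)
	{
		struct energy_env eenv = {
			.sg_top		= NULL,		/* not used by energy_diff() */
			.util_delta	= util,		/* utilization being moved */
			.src_cpu	= src_cpu,	/* -1 would mean no source (wake-up) */
			.dst_cpu	= dst_cpu,	/* -1 would mean utilization removed */
		};

		/* Estimated energy impact of moving 'util' from src_cpu to dst_cpu. */
		return energy_diff(&eenv);
	}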
+
+The metric used to represent utilization is the actual per-entity running time
+averaged over time using a geometric series. Very similar to the existing
+per-entity load-tracking, but _not_ scaled by task priority and capped by the
+capacity of the cpu. The latter property does mean that utilization may
+underestimate the compute requirements of tasks on fully/over-utilized cpus.
+The greatest potential for energy savings without affecting performance too much
+is in scenarios where the system isn't fully utilized. If the system is deemed
+fully utilized, load-balancing should instead be done with task load (which
+includes task priority) in the interest of fairness and performance.
+
+
+Background and Terminology
+===========================
+
+To make it clear from the start:
+
+energy = [joule] (resource like a battery on powered devices)
+power = energy/time = [joule/second] = [watt]
+
+The goal of energy-aware scheduling is to minimize energy, while still getting
+the job done. That is, we want to maximize:
+
+       performance [inst/s]
+       --------------------
+           power [W]
+
+which is equivalent to minimizing:
+
+       energy [J]
+       -----------
+       instruction
+
+while still getting 'good' performance. It is essentially an alternative
+optimization objective to the current performance-only objective for the
+scheduler. This alternative considers two objectives: energy-efficiency and
+performance. Hence, there needs to be a user controllable knob to switch the
+objective. Since it is early days, this is currently a sched_feature
+(ENERGY_AWARE).
+
+The idea behind introducing an energy cost model is to allow the scheduler to
+evaluate the implications of its decisions rather than blindly applying
+energy-saving techniques that may only have positive effects on some platforms. At
+the same time, the energy cost model must be as simple as possible to minimize
+the scheduler latency impact.
+
+Platform topology
+------------------
+
+The system topology (cpus, caches, and NUMA information, not peripherals) is
+represented in the scheduler by the sched_domain hierarchy which has
+sched_groups attached at each level that covers one or more cpus (see
+sched-domains.txt for more details). To add energy awareness to the scheduler
+we need to consider power and frequency domains.
+
+Power domain:
+
+A power domain is a part of the system that can be powered on/off
+independently. Power domains are typically organized in a hierarchy where you
+may be able to power down just a cpu or a group of cpus along with any
+associated resources (e.g.  shared caches). Powering up a cpu means that all
+power domains it is a part of in the hierarchy must be powered up. Hence, it is
+more expensive to power up the first cpu that belongs to a higher level power
+domain than powering up additional cpus in the same high level domain. Two
+level power domain hierarchy example:
+
+               Power source
+                        +-------------------------------+----...
+per group PD            G                               G
+                        |           +----------+        |
+                   +--------+-------| Shared   |  (other groups)
+per-cpu PD         G        G       | resource |
+                   |        |       +----------+
+               +-------+ +-------+
+               | CPU 0 | | CPU 1 |
+               +-------+ +-------+
+
+Frequency domain:
+
+Frequency domains (P-states) typically cover the same group of cpus as one of
+the power domain levels. That is, there might be several smaller power domains
+sharing the same frequency (P-state) or there might be a power domain spanning
+multiple frequency domains.
+
+From a scheduling point of view there is no need to know the actual frequencies
+[Hz]. All the scheduler cares about is the compute capacity available at the
+current state (P-state) the cpu is in and any other available states. For that
+reason, and to also factor in any cpu micro-architecture differences, compute
+capacity scaling states are called 'capacity states' in this document. For SMP
+systems this is equivalent to P-states. For mixed micro-architecture systems
+(like ARM big.LITTLE) it is P-states scaled according to the micro-architecture
+performance relative to the other cpus in the system.
+
+Energy modelling:
+------------------
+
+Due to the hierarchical nature of the power domains, the most obvious way to
+model energy costs is to associate power and energy costs with domains (groups
+of cpus). Energy costs of shared resources are associated with the group of
+cpus that share the resources; only the cost of powering the cpu itself and
+any private resources (e.g. private L1 caches) is associated with the per-cpu
+groups (lowest level).
+
+For example, for an SMP system with per-cpu power domains and a cluster level
+(group of cpus) power domain we get the overall energy costs to be:
+
+       energy = energy_cluster + n * energy_cpu
+
+where 'n' is the number of cpus powered up and energy_cluster is the cost paid
+as soon as any cpu in the cluster is powered up.
+
+The power and frequency domains can naturally be mapped onto the existing
+sched_domain hierarchy and sched_groups by adding the necessary data to the
+existing data structures.
+
+The energy model considers energy consumption from two contributors (shown in
+the illustration below):
+
+1. Busy energy: Energy consumed while a cpu and the higher level groups that it
+belongs to are busy running tasks. Busy energy is associated with the state of
+the cpu, not an event. The time the cpu spends in this state varies. Thus, the
+most obvious platform parameter for this contribution is busy power
+(energy/time).
+
+2. Idle energy: Energy consumed while a cpu and higher level groups that it
+belongs to are idle (in a C-state). Like busy energy, idle energy is associated
+with the state of the cpu. Thus, the platform parameter for this contribution
+is idle power (energy/time).
+
+Energy consumed during transitions from an idle-state (C-state) to a busy state
+(P-state) or going the other way is ignored by the model to simplify the energy
+model calculations.
+
+
+       Power
+       ^
+       |            busy->idle             idle->busy
+       |            transition             transition
+       |
+       |                _                      __
+       |               / \                    /  \__________________
+       |______________/   \                  /
+       |                   \                /
+       |  Busy              \    Idle      /        Busy
+       |  low P-state        \____________/         high P-state
+       |
+       +------------------------------------------------------------> time
+
+Busy    |--------------|                          |-----------------|
+
+Wakeup                 |------|            |------|
+
+Idle                          |------------|
+
+
+The basic algorithm
+====================
+
+The basic idea is to determine the total energy impact when utilization is
+added or removed by estimating the impact at each level in the sched_domain
+hierarchy starting from the bottom (sched_group contains just a single cpu).
+The energy cost comes from busy time (sched_group is awake because one or more
+cpus are busy) and idle time (in an idle-state). Energy model numbers account
+for energy costs associated with all cpus in the sched_group as a group.
+
+       for_each_domain(cpu, sd) {
+               sg = sched_group_of(cpu)
+               energy_before = curr_util(sg) * busy_power(sg)
+                               + (1-curr_util(sg)) * idle_power(sg)
+               energy_after = new_util(sg) * busy_power(sg)
+                               + (1-new_util(sg)) * idle_power(sg)
+               energy_diff += energy_before - energy_after
+
+       }
+
+       return energy_diff
+
+{curr, new}_util: The cpu utilization at the lowest level and the overall
+non-idle time for the entire group for higher levels. Utilization is in the
+range 0.0 to 1.0 in the pseudo-code.
+
+busy_power: The power consumption of the sched_group when busy.
+
+idle_power: The power consumption of the sched_group when idle.
+
+Note: It is a fundamental assumption that the utilization is (roughly) scale
+invariant. Task utilization tracking factors in any frequency scaling and
+performance scaling differences due to different cpu micro-architectures such
+that task utilization can be used across the entire system.
+
+
+Platform energy data
+=====================
+
+struct sched_group_energy can be attached to sched_groups in the sched_domain
+hierarchy and has the following members:
+
+cap_states:
+       List of struct capacity_state representing the supported capacity states
+       (P-states). struct capacity_state has two members: cap and power, which
+       represent the compute capacity and the busy power of the state. The
+       list must be ordered by capacity low->high.
+
+nr_cap_states:
+       Number of capacity states in cap_states list.
+
+idle_states:
+       List of struct idle_state containing the idle_state power cost for each
+       idle-state supported by the system, ordered by shallowest state first.
+       All states must be included at all levels in the hierarchy, i.e. a
+       sched_group spanning just a single cpu must also include coupled
+       idle-states (cluster states). In addition to the cpuidle idle-states,
+       the list must also contain an entry for idling using the arch
+       default idle (arch_idle_cpu()). Although this state may not be a true
+       hardware idle-state, it is considered the shallowest idle-state in the
+       energy model and must be the first entry. cpus may enter this state
+       (possibly 'active idling') if cpuidle decides not to enter a cpuidle
+       idle-state. Default idle may not be used when cpuidle is enabled.
+       In this case, it should just be a copy of the first cpuidle idle-state.
+
+nr_idle_states:
+       Number of idle states in idle_states list.
+
+There are no unit requirements for the energy cost data. Data can be normalized
+against any reference; however, the normalization must be consistent across all
+energy cost data. That is, one bogo-joule/watt must be the same quantity for
+all data, but we don't care what it actually is.
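+
+As a rough sketch only (not the exact kernel definitions), the members
+described above could be laid out as follows; the capacity and power numbers
+for the hypothetical 'little' cpu are invented purely for illustration:
+
+       /* sketch: field names follow the description above */
+       struct capacity_state {
+               unsigned long cap;      /* compute capacity of the state */
+               unsigned long power;    /* busy power of the state (bogo-watts) */
+       };
+
+       struct idle_state {
+               unsigned long power;    /* idle power of the state (bogo-watts) */
+       };
+
+       struct sched_group_energy {
+               unsigned int nr_cap_states;
+               struct capacity_state *cap_states;      /* ordered low -> high capacity */
+               unsigned int nr_idle_states;
+               struct idle_state *idle_states;         /* ordered shallowest first */
+       };
+
+       static struct capacity_state little_cap_states[] = {
+               { .cap =  430, .power = 190 },
+               { .cap =  860, .power = 500 },
+               { .cap = 1024, .power = 750 },
+       };
+
+       static struct idle_state little_idle_states[] = {
+               { .power = 25 },        /* arch default idle ('active idle') */
+               { .power = 10 },        /* cpu off */
+               { .power =  0 },        /* cluster off (coupled state) */
+       };
+
+       static struct sched_group_energy little_cpu_energy = {
+               .nr_cap_states  = 3,
+               .cap_states     = little_cap_states,
+               .nr_idle_states = 3,
+               .idle_states    = little_idle_states,
+       };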
+
+A recipe for platform characterization
+=======================================
+
+Obtaining the actual model data for a particular platform requires some way of
+measuring power/energy. There isn't a tool to help with this (yet). This
+section provides a recipe for use as reference. It covers the steps used to
+characterize the ARM TC2 development platform. This sort of measurement is
+expected to be done anyway when tuning cpuidle and cpufreq for a given
+platform.
+
+The energy model needs two types of data (struct sched_group_energy holds
+these) for each sched_group where energy costs should be taken into account:
+
+1. Capacity state information
+
+A list containing the compute capacity and power consumption when fully
+utilized attributed to the group as a whole for each available capacity state.
+At the lowest level (group contains just a single cpu) this is the power of the
+cpu alone without including power consumed by resources shared with other cpus.
+It basically needs to fit the basic modelling approach described in the
+"Background and Terminology" section:
+
+       energy_system = energy_shared + n * energy_cpu
+
+for a system containing 'n' busy cpus. Only 'energy_cpu' should be included at
+the lowest level. 'energy_shared' is included at the next level which
+represents the group of cpus among which the resources are shared.
+
+This model is, of course, a simplification of reality. Thus, power/energy
+attributions might not always exactly represent how the hardware is designed.
+Also, busy power is likely to depend on the workload. It is therefore
+recommended to use a representative mix of workloads when characterizing the
+capacity states.
+
+If the group has no capacity scaling support, the list will contain a single
+state where power is the busy power attributed to the group. The capacity
+should be set to a default value (1024).
+
+When frequency domains include multiple power domains, the group representing
+the frequency domain and all child groups share capacity states. This must be
+indicated by setting the SD_SHARE_CAP_STATES sched_domain flag. All groups at
+all levels that share the capacity state must have the list of capacity states
+with the power set to the contribution of the individual group.
+
+2. Idle power information
+
+Stored in the idle_states list. The power number is the group idle power
+consumption in each idle state, as well as when the group is idle but has not
+entered an idle-state ('active idle' as mentioned earlier). Due to the way the
+energy model is defined, the idle power of the deepest group idle state can
+alternatively be accounted for in the parent group busy power. In that case the
+group idle state power values are offset such that the idle power of the
+deepest state is zero. This is less intuitive, but it is easier to measure, as
+the idle power consumed by the group and the busy/idle power of the parent
+group cannot be distinguished without per-group measurement points.
+
+Measuring capacity states and idle power:
+
+The capacity states' capacity and power can be estimated by running a benchmark
+workload at each available capacity state. By restricting the benchmark to run
+on subsets of cpus it is possible to extrapolate the power consumption of
+shared resources.
+
+ARM TC2 has two clusters of two and three cpus respectively. Each cluster has a
+shared L2 cache. TC2 has on-chip energy counters per cluster. Running a
+benchmark workload on just one cpu in a cluster means that power is consumed in
+the cluster (higher level group) and a single cpu (lowest level group). Adding
+another benchmark task to another cpu increases the power consumption by the
+amount consumed by the additional cpu. Hence, it is possible to extrapolate the
+cluster busy power.
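+
+As a made-up numerical illustration of that extrapolation: if the cluster
+energy counter reports 700 bogo-watts with the benchmark running on one cpu
+and 1000 bogo-watts with it running on two cpus, then:
+
+       cpu busy power     = 1000 - 700 = 300
+       cluster busy power =  700 - 300 = 400
+
+assuming the workload loads each cpu equally and the shared-resource power
+does not change when the second cpu is added.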
+
+For platforms that don't have energy counters or equivalent instrumentation
+built-in, it may be possible to use an external DAQ to acquire similar data.
+
+If the benchmark includes some performance score (for example the sysbench cpu
+benchmark), this can be used to record the compute capacity.
+
+Measuring idle power requires insight into the idle state implementation on the
+particular platform, in particular whether the platform has coupled idle-states
+(or package states). To measure non-coupled per-cpu idle-states it is necessary
+to keep one cpu busy so that any shared resources stay alive, in order to
+isolate the idle power of the cpu from the idle/busy power of the shared
+resources. The cpu can be tricked into different per-cpu idle states by
+disabling the other states. Based on various combinations of measurements with
+specific cpus busy and specific idle-states disabled it is possible to
+extrapolate the idle-state power.
diff --git a/Documentation/scheduler/sched-tune.txt b/Documentation/scheduler/sched-tune.txt
new file mode 100644 (file)
index 0000000..5df0ea3
--- /dev/null
@@ -0,0 +1,413 @@
+             Central, scheduler-driven, power-performance control
+                               (EXPERIMENTAL)
+
+Abstract
+========
+
+The topic of a single simple power-performance tunable that is wholly
+scheduler centric and has well-defined and predictable properties has come up
+on several occasions in the past [1,2]. With techniques such as scheduler-driven
+DVFS [3], we now have a good framework for implementing such a tunable. This
+document describes the overall ideas behind its design and implementation.
+
+
+Table of Contents
+=================
+
+1. Motivation
+2. Introduction
+3. Signal Boosting Strategy
+4. OPP selection using boosted CPU utilization
+5. Per task group boosting
+6. Per-task wakeup-placement-strategy Selection
+7. Questions and Answers
+   - What about "auto" mode?
+   - What about boosting on a congested system?
+   - How are multiple groups of tasks with different boost values managed?
+8. References
+
+
+1. Motivation
+=============
+
+Sched-DVFS [3] is an event-driven cpufreq governor which allows the
+scheduler to select the optimal DVFS operating point (OPP) for running a task
+allocated to a CPU. Later, the cpufreq maintainers introduced a similar
+governor, schedutil. The introduction of schedutil also enables running
+workloads at the most energy efficient OPPs.
+
+However, sometimes it may be desired to intentionally boost the performance of
+a workload even if that could imply a reasonable increase in energy
+consumption. For example, in order to reduce the response time of a task, we
+may want to run the task at a higher OPP than the one that is actually required
+by its CPU bandwidth demand.
+
+This last requirement is especially important if we consider that one of the
+main goals of the utilization-driven governor component is to replace all
+currently available CPUFreq policies. Since sched-DVFS and schedutil are event
+based, as opposed to the sampling driven governors we currently have, they are
+already more responsive at selecting the optimal OPP to run tasks allocated to
+a CPU. However, just tracking the actual task load demand may not be enough
+from a performance standpoint.  For example, it is not possible to get
+behaviors similar to those provided by the "performance" and "interactive"
+CPUFreq governors.
+
+This document describes an implementation of a tunable, stacked on top of the
+utilization-driven governors which extends their functionality to support task
+performance boosting.
+
+By "performance boosting" we mean the reduction of the time required to
+complete a task activation, i.e. the time elapsed from a task wakeup to its
+next deactivation (e.g. because it goes back to sleep or it terminates).  For
+example, if we consider a simple periodic task which executes the same workload
+for 5[s] every 20[s] while running at a certain OPP, a boosted execution of
+that task must complete each of its activations in less than 5[s].
+
+A previous attempt [5] to introduce such a boosting feature has not been
+successful mainly because of the complexity of the proposed solution. Previous
+versions of the approach described in this document exposed a single simple
+interface to user-space.  This single tunable knob allowed the tuning of
+system wide scheduler behaviours ranging from energy efficiency at one end
+through to incremental performance boosting at the other end.  This first
+tunable affects all tasks. However, that is not useful for Android products,
+so in this version only a more advanced extension of the concept is provided,
+which uses CGroups to boost the performance of only selected tasks while using
+the energy efficient default for all others.
+
+The rest of this document introduces in more detail the proposed solution,
+which has been named SchedTune.
+
+
+2. Introduction
+===============
+
+SchedTune exposes a simple user-space interface through a new CGroup
+controller, 'stune', which provides two power-performance tunables
+per group:
+
+  /<stune cgroup mount point>/schedtune.prefer_idle
+  /<stune cgroup mount point>/schedtune.boost
+
+The CGroup implementation permits arbitrary user-space defined task
+classification to tune the scheduler for different goals depending on the
+specific nature of the task, e.g. background vs interactive vs low-priority.
+
+More details are given in section 5.
+
+2.1 Boosting
+============
+
+The boost value is expressed as an integer in the range [-100..0..100].
+
+A value of 0 (default) configures the CFS scheduler for maximum energy
+efficiency. This means that sched-DVFS runs the tasks at the minimum OPP
+required to satisfy their workload demand.
+
+A value of 100 configures the scheduler for maximum performance, which
+translates to the selection of the maximum OPP on that CPU.
+
+A value of -100 configures the scheduler for minimum performance, which
+translates to the selection of the minimum OPP on that CPU.
+
+Intermediate values can be used to suit other scenarios, for example to
+satisfy interactive response requirements or to react to other system events
+(battery level, etc.).
+
+The overall design of the SchedTune module is built on top of "Per-Entity Load
+Tracking" (PELT) signals and sched-DVFS by introducing a bias on the Operating
+Performance Point (OPP) selection.
+
+Each time a task is allocated on a CPU, cpufreq is given the opportunity to tune
+the operating frequency of that CPU to better match the workload demand. The
+selection of the actual OPP being activated is influenced by the boost value
+for the task CGroup.
+
+This simple biasing approach leverages existing frameworks, which means minimal
+modifications to the scheduler, and yet it makes it possible to achieve a range
+of different behaviours, all from a single simple tunable knob.
+
+In EAS schedulers, we use boosted task and CPU utilization for energy
+calculation and energy-aware task placement.
+
+2.2 prefer_idle
+===============
+
+This is a flag which indicates to the scheduler that userspace would like
+the scheduler to focus on energy or to focus on performance.
+
+A value of 0 (default) signals to the CFS scheduler that tasks in this group
+can be placed according to the energy-aware wakeup strategy.
+
+A value of 1 signals to the CFS scheduler that tasks in this group should be
+placed to minimise wakeup latency.
+
+The value is combined with the boost value: task placement will not be
+boost-aware, however CPU OPP selection is still boost-aware.
+
+Android platforms typically use this flag for application tasks which the
+user is currently interacting with.
+
+
+3. Signal Boosting Strategy
+===========================
+
+The whole PELT machinery works based on the value of a few load tracking signals
+which basically track the CPU bandwidth requirements for tasks and the capacity
+of CPUs. The basic idea behind the SchedTune knob is to artificially inflate
+some of these load tracking signals to make a task or RQ appear more demanding
+than it actually is.
+
+Which signals have to be inflated depends on the specific "consumer".  However,
+independently of the specific (signal, consumer) pair, it is important to
+define a simple and possibly consistent strategy for the concept of boosting a
+signal.
+
+A boosting strategy defines how the "abstract" user-space defined
+sched_cfs_boost value is translated into an internal "margin" value to be added
+to a signal to get its inflated value:
+
+  margin         := boosting_strategy(sched_cfs_boost, signal)
+  boosted_signal := signal + margin
+
+Different boosting strategies were identified and analyzed before selecting the
+one found to be most effective.
+
+Signal Proportional Compensation (SPC)
+--------------------------------------
+
+In this boosting strategy the sched_cfs_boost value is used to compute a
+margin which is proportional to the complement of the original signal.
+When a signal has a maximum possible value, its complement is defined as
+the delta between the actual value and that maximum.
+
+Since the tunable implementation uses signals which have SCHED_LOAD_SCALE as
+the maximum possible value, the margin becomes:
+
+       margin := sched_cfs_boost * (SCHED_LOAD_SCALE - signal)
+
+Using this boosting strategy:
+- a 100% sched_cfs_boost means that the signal is scaled to the maximum value
+- each value in the range of sched_cfs_boost effectively inflates the signal in
+  question by a quantity which is proportional to its headroom below the
+  maximum value.
+
+For example, by applying the SPC boosting strategy to the selection of the OPP
+to run a task it is possible to achieve these behaviors:
+
+-   0% boosting: run the task at the minimum OPP required by its workload
+- 100% boosting: run the task at the maximum OPP available for the CPU
+-  50% boosting: run at the half-way OPP between minimum and maximum
+
+Which means that, at 50% boosting, a task will be scheduled to run at half of
+the maximum theoretically achievable performance on the specific target
+platform.
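+
+A minimal user-space sketch of this margin computation (C), assuming an
+integer boost value in the range [0..100] and SCHED_LOAD_SCALE of 1024;
+negative boost values and kernel fixed-point details are ignored here:
+
+       #include <stdio.h>
+
+       #define SCHED_LOAD_SCALE 1024UL
+
+       /*
+        * Signal Proportional Compensation: the margin is 'boost' percent of
+        * the headroom between the signal and its maximum possible value.
+        */
+       static unsigned long spc_margin(unsigned long signal, unsigned int boost_pct)
+       {
+               return (SCHED_LOAD_SCALE - signal) * boost_pct / 100;
+       }
+
+       static unsigned long boosted(unsigned long signal, unsigned int boost_pct)
+       {
+               return signal + spc_margin(signal, boost_pct);
+       }
+
+       int main(void)
+       {
+               unsigned long signal = 400;     /* hypothetical utilization signal */
+
+               printf("  0%% boost: %lu\n", boosted(signal, 0));       /*  400 */
+               printf(" 50%% boost: %lu\n", boosted(signal, 50));      /*  712 */
+               printf("100%% boost: %lu\n", boosted(signal, 100));     /* 1024 */
+               return 0;
+       }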
+
+A graphical representation of an SPC boosted signal is shown in the
+following figure where:
+ a) "-" represents the original signal
+ b) "b" represents a 50% boosted signal
+ c) "p" represents a 100% boosted signal
+
+
+   ^
+   |  SCHED_LOAD_SCALE
+   +-----------------------------------------------------------------+
+   |pppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppppp
+   |
+   |                                             boosted_signal
+   |                                          bbbbbbbbbbbbbbbbbbbbbbbb
+   |
+   |                                            original signal
+   |                  bbbbbbbbbbbbbbbbbbbbbbbb+----------------------+
+   |                                          |
+   |bbbbbbbbbbbbbbbbbb                        |
+   |                                          |
+   |                                          |
+   |                                          |
+   |                  +-----------------------+
+   |                  |
+   |                  |
+   |                  |
+   |------------------+
+   |
+   |
+   +----------------------------------------------------------------------->
+
+The plot above shows a ramped load signal (labelled 'original signal') and its
+boosted equivalent. For each step of the original signal the boosted signal
+corresponding to a 50% boost is midway between the original signal and the
+upper bound. Boosting by 100% generates a boosted signal which is always
+saturated to the upper bound.
+
+
+4. OPP selection using boosted CPU utilization
+==============================================
+
+It is worth calling out that the implementation does not introduce any new load
+signals. Instead, it provides an API to tune existing signals. This tuning is
+done on demand and only in scheduler code paths where it is sensible to do so.
+The new API calls are defined to return either the default signal or a boosted
+one, depending on the value of sched_cfs_boost. This is a clean and
+non-invasive modification of the existing code paths.
+
+The signal representing a CPU's utilization is boosted according to the
+previously described SPC boosting strategy. To sched-DVFS, this allows a CPU
+(i.e. a CFS run-queue) to appear more used than it actually is.
+
+Thus, with the sched_cfs_boost enabled we have the following main functions to
+get the current utilization of a CPU:
+
+  cpu_util()
+  boosted_cpu_util()
+
+The new boosted_cpu_util() is similar to cpu_util(), but returns a boosted
+utilization signal which is a function of the sched_cfs_boost value.
+
+This function is used in the CFS scheduler code paths where sched-DVFS needs to
+decide the OPP to run a CPU at.
+For example, this allows selecting the highest OPP for a CPU which has
+the boost value set to 100%.
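+
+As a rough sketch (not the kernel implementation), the boosted utilization can
+then be mapped onto an OPP by picking the lowest capacity state that covers
+it; the capacity/frequency table below is invented for illustration:
+
+       /* pick the lowest frequency whose capacity covers the boosted utilization */
+       static unsigned int pick_freq_khz(unsigned long boosted_util)
+       {
+               static const struct {
+                       unsigned long cap;
+                       unsigned int freq_khz;
+               } opp[] = {
+                       {  430,  600000 },
+                       {  860, 1200000 },
+                       { 1024, 1500000 },
+               };
+               unsigned int i;
+
+               for (i = 0; i < sizeof(opp) / sizeof(opp[0]); i++)
+                       if (opp[i].cap >= boosted_util)
+                               return opp[i].freq_khz;
+
+               return opp[i - 1].freq_khz;     /* saturate at the highest OPP */
+       }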
+
+
+5. Per task group boosting
+==========================
+
+On battery powered devices there usually are many background services which are
+long running and need energy efficient scheduling. On the other hand, some
+applications are more performance sensitive and require an interactive
+response and/or maximum performance, regardless of the energy cost.
+
+To better service such scenarios, the SchedTune implementation has an extension
+that provides a more fine grained boosting interface.
+
+A new CGroup controller, namely "schedtune", can be enabled which allows
+user-space to define and configure task groups with different boosting values.
+Tasks that require special performance can be put into separate CGroups.
+The value of the boost associated with the tasks in such a group can be
+specified using a single knob exposed by the CGroup controller:
+
+   schedtune.boost
+
+This knob allows the definition of a boost value that is to be used for
+SPC boosting of all tasks attached to this group.
+
+The current schedtune controller implementation is really simple and has these
+main characteristics:
+
+  1) It is only possible to create a 1-level-deep hierarchy
+
+     The root control group defines the system-wide boost value to be applied
+     by default to all tasks. Its direct subgroups are named "boost groups" and
+     they define the boost value for a specific set of tasks.
+     Further nested subgroups are not allowed since they do not have a sensible
+     meaning from a user-space standpoint.
+
+  2) It is possible to define only a limited number of "boost groups"
+
+     This number is defined at compile time and by default configured to 16.
+     This is a design decision motivated by two main reasons:
+     a) In a real system we do not expect utilization scenarios with more than
+        a few boost groups. For example, a reasonable collection of groups
+        could be just "background", "interactive" and "performance".
+     b) It simplifies the implementation considerably, especially for the code
+        which has to compute the per-CPU boosting once there are multiple
+        RUNNABLE tasks with different boost values.
+
+Such a simple design should allow servicing the main utilization scenarios identified
+so far. It provides a simple interface which can be used to manage the
+power-performance of all tasks or only selected tasks.
+Moreover, this interface can be easily integrated by user-space run-times (e.g.
+Android, ChromeOS) to implement a QoS solution for task boosting based on task
+classification, which has been a long-standing requirement.
+
+Setup and usage
+---------------
+
+0. Use a kernel with CONFIG_SCHED_TUNE support enabled
+
+1. Check that the "schedtune" CGroup controller is available:
+
+   root@linaro-nano:~# cat /proc/cgroups
+   #subsys_name        hierarchy       num_cgroups     enabled
+   cpuset      0               1               1
+   cpu         0               1               1
+   schedtune   0               1               1
+
+2. Mount a tmpfs to create the CGroups mount point (Optional)
+
+   root@linaro-nano:~# sudo mount -t tmpfs cgroups /sys/fs/cgroup
+
+3. Mount the "schedtune" controller
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune
+   root@linaro-nano:~# sudo mount -t cgroup -o schedtune stune /sys/fs/cgroup/stune
+
+4. Create task groups and configure their specific boost value (Optional)
+
+   For example, here we create a "performance" boost group configured to boost
+   all its tasks to 100%:
+
+   root@linaro-nano:~# mkdir /sys/fs/cgroup/stune/performance
+   root@linaro-nano:~# echo 100 > /sys/fs/cgroup/stune/performance/schedtune.boost
+
+5. Move tasks into the boost group
+
+   For example, the following moves the task with PID $TASKPID (and all its
+   threads) into the "performance" boost group:
+
+   root@linaro-nano:~# echo $TASKPID > /sys/fs/cgroup/stune/performance/cgroup.procs
+
+This simple configuration allows only the threads of the $TASKPID task to run,
+when needed, at the highest OPP on the most capable CPU of the system.
+
+
+6. Per-task wakeup-placement-strategy Selection
+===============================================
+
+Many devices have a number of CFS tasks in use which require an absolute
+minimum wakeup latency, and many tasks for which wakeup latency is not
+important.
+
+For touch-driven environments, removing additional wakeup latency can be
+critical.
+
+When you use the SchedTune CGroup controller, you have access to a second
+parameter which allows a group to be marked such that energy-aware task
+placement is bypassed for tasks belonging to that group.
+
+prefer_idle=0 (default - use energy-aware task placement if available)
+prefer_idle=1 (never use energy-aware task placement for these tasks)
+
+Since the regular wakeup task placement algorithm in CFS is biased for
+performance, this has the effect of restoring minimum wakeup latency
+for the desired tasks whilst still allowing energy-aware wakeup placement
+to save energy for other tasks.
+
+
+7. Questions and Answers
+========================
+
+What about "auto" mode?
+-----------------------
+
+The 'auto' mode as described in [5] can be implemented by interfacing SchedTune
+with some suitable user-space element. This element could use the exposed
+system-wide or cgroup based interface.
+
+How are multiple groups of tasks with different boost values managed?
+---------------------------------------------------------------------
+
+The current SchedTune implementation keeps track of the boosted RUNNABLE tasks
+on a CPU. The CPU utilization seen by the scheduler-driven cpufreq governors
+(and used to select an appropriate OPP) is boosted with a value which is the
+maximum of the boost values of the currently RUNNABLE tasks in its RQ.
+
+This allows cpufreq to boost a CPU only while there are boosted tasks ready
+to run and switch back to the energy efficient mode as soon as the last boosted
+task is dequeued.
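+
+A simplified sketch of that max aggregation, using hypothetical per-group
+bookkeeping (negative boost values and the root group default are ignored
+here for brevity):
+
+       /*
+        * The boost applied to a CPU is the maximum boost value among the
+        * boost groups that currently have RUNNABLE tasks on that CPU.
+        */
+       static int cpu_boost(const int *group_boost, const int *group_nr_running,
+                            int nr_groups)
+       {
+               int i, boost = 0;
+
+               for (i = 0; i < nr_groups; i++)
+                       if (group_nr_running[i] > 0 && group_boost[i] > boost)
+                               boost = group_boost[i];
+
+               return boost;
+       }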
+
+
+8. References
+=============
+[1] http://lwn.net/Articles/552889
+[2] http://lkml.org/lkml/2012/5/18/91
+[3] http://lkml.org/lkml/2015/6/26/620
index 694968c7523cc28620c8ac51a28a33dc1b14336e..b757d6eb365bb63b38f7263a34400e8afdd3102f 100644 (file)
@@ -653,7 +653,8 @@ allowed to execute.
 perf_event_paranoid:
 
 Controls use of the performance events system by unprivileged
-users (without CAP_SYS_ADMIN).  The default value is 2.
+users (without CAP_SYS_ADMIN).  The default value is 3 if
+CONFIG_SECURITY_PERF_EVENTS_RESTRICT is set, or 2 otherwise.
 
  -1: Allow use of (almost) all events by all users
      Ignore mlock limit after perf_event_mlock_kb without CAP_IPC_LOCK
@@ -661,6 +662,7 @@ users (without CAP_SYS_ADMIN).  The default value is 2.
      Disallow raw tracepoint access by users without CAP_SYS_ADMIN
 >=1: Disallow CPU event access by users without CAP_SYS_ADMIN
 >=2: Disallow kernel profiling by users without CAP_SYS_ADMIN
+>=3: Disallow all event access by users without CAP_SYS_ADMIN
 
 ==============================================================
 
index 21d514ced212436ea70aa5a218e20a5f8407523e..4d817d5acc4090a7001b7ca4ae505d7884d1e3a3 100644 (file)
@@ -25,6 +25,7 @@ cpufreq.
 
 cpu_idle               "state=%lu cpu_id=%lu"
 cpu_frequency          "state=%lu cpu_id=%lu"
+cpu_frequency_limits   "min=%lu max=%lu cpu_id=%lu"
 
 A suspend event is used to indicate the system going in and out of the
 suspend mode:
index d4601df6e72e7d39090b27541a9a29e390ef95a7..f2fcbb7a70c653ccf4402fe7229e3a7c29cd84d7 100644 (file)
@@ -2407,6 +2407,35 @@ will produce:
  1)   1.449 us    |             }
 
 
+You can disable the hierarchical function call formatting and instead print a
+flat list of function entry and return events.  This uses the format described
+in the Output Formatting section and respects all the trace options that
+control that formatting.  Hierarchical formatting is the default.
+
+       hierarchical: echo nofuncgraph-flat > trace_options
+       flat: echo funcgraph-flat > trace_options
+
+  ie:
+
+  # tracer: function_graph
+  #
+  # entries-in-buffer/entries-written: 68355/68355   #P:2
+  #
+  #                              _-----=> irqs-off
+  #                             / _----=> need-resched
+  #                            | / _---=> hardirq/softirq
+  #                            || / _--=> preempt-depth
+  #                            ||| /     delay
+  #           TASK-PID   CPU#  ||||    TIMESTAMP  FUNCTION
+  #              | |       |   ||||       |         |
+                sh-1806  [001] d...   198.843443: graph_ent: func=_raw_spin_lock
+                sh-1806  [001] d...   198.843445: graph_ent: func=__raw_spin_lock
+                sh-1806  [001] d..1   198.843447: graph_ret: func=__raw_spin_lock
+                sh-1806  [001] d..1   198.843449: graph_ret: func=_raw_spin_lock
+                sh-1806  [001] d..1   198.843451: graph_ent: func=_raw_spin_unlock_irqrestore
+                sh-1806  [001] d...   198.843453: graph_ret: func=_raw_spin_unlock_irqrestore
+
+
 You might find other useful features for this tracer in the
 following "dynamic ftrace" section such as tracing only specific
 functions or tasks.
index 33176140f1336e494780e416cfd343ea76fe76f0..bd9d4a358d0a2206b2f0912a7086c0c1d1741575 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -700,7 +700,8 @@ KBUILD_CFLAGS += $(stackp-flag)
 
 ifeq ($(cc-name),clang)
 ifneq ($(CROSS_COMPILE),)
-CLANG_TARGET   := --target=$(notdir $(CROSS_COMPILE:%-=%))
+CLANG_TRIPLE   ?= $(CROSS_COMPILE)
+CLANG_TARGET   := --target=$(notdir $(CLANG_TRIPLE:%-=%))
 GCC_TOOLCHAIN  := $(realpath $(dir $(shell which $(LD)))/..)
 endif
 ifneq ($(GCC_TOOLCHAIN),)
@@ -713,6 +714,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, unused-variable)
 KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier)
 KBUILD_CFLAGS += $(call cc-disable-warning, gnu)
 KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member)
+KBUILD_CFLAGS += $(call cc-disable-warning, duplicate-decl-specifier)
 # Quiet clang warning: comparison of unsigned expression < 0 is always false
 KBUILD_CFLAGS += $(call cc-disable-warning, tautological-compare)
 # CLANG uses a _MergedGlobals as optimization, but this breaks modpost, as the
index d1346a16076033acb8d3d60e5de846d7ee2e0e3b..858638134bfa79a6f47f6c3b02d1f7ca0214d564 100644 (file)
@@ -1826,6 +1826,15 @@ config XEN
        help
          Say Y if you want to run Linux in a Virtual Machine on Xen on ARM.
 
+config ARM_FLUSH_CONSOLE_ON_RESTART
+       bool "Force flush the console on restart"
+       help
+         If the console is locked while the system is rebooted, the messages
+         in the temporary logbuffer would not have propagated to all the
+         console drivers. This option forces the console lock to be
+         released if it failed to be acquired, which will cause all the
+         pending messages to be flushed.
+
 endmenu
 
 menu "Boot options"
@@ -1854,6 +1863,21 @@ config DEPRECATED_PARAM_STRUCT
          This was deprecated in 2001 and announced to live on for 5 years.
          Some old boot loaders still use this way.
 
+config BUILD_ARM_APPENDED_DTB_IMAGE
+       bool "Build a concatenated zImage/dtb by default"
+       depends on OF
+       help
+         Enabling this option will cause a concatenated zImage and list of
+         DTBs to be built by default (instead of a standalone zImage.)
+         The image will be built in arch/arm/boot/zImage-dtb
+
+config BUILD_ARM_APPENDED_DTB_IMAGE_NAMES
+       string "Default dtb names"
+       depends on BUILD_ARM_APPENDED_DTB_IMAGE
+       help
+         Space separated list of names of dtbs to append when
+         building a concatenated zImage-dtb.
+
 # Compressed boot loader in ROM.  Yes, we really want to ask about
 # TEXT and BSS so we preserve their values in the config files.
 config ZBOOT_ROM_TEXT
index 36ae4454554ce1a11cc65848c60569bf8b3e0abe..bc805d702d824b1286f6d76d2c161d1e87dbf62a 100644 (file)
@@ -303,6 +303,8 @@ libs-y                              := arch/arm/lib/ $(libs-y)
 boot := arch/arm/boot
 ifeq ($(CONFIG_XIP_KERNEL),y)
 KBUILD_IMAGE := $(boot)/xipImage
+else ifeq ($(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE := $(boot)/zImage-dtb
 else
 KBUILD_IMAGE := $(boot)/zImage
 endif
@@ -356,6 +358,9 @@ ifeq ($(CONFIG_VDSO),y)
        $(Q)$(MAKE) $(build)=arch/arm/vdso $@
 endif
 
+zImage-dtb: vmlinux scripts dtbs
+       $(Q)$(MAKE) $(build)=$(boot) MACHINE=$(MACHINE) $(boot)/$@
+
 # We use MRPROPER_FILES and CLEAN_FILES now
 archclean:
        $(Q)$(MAKE) $(clean)=$(boot)
index 50f8d1be7fcbe7cdcecb37f54f2b24f514a65974..da75630c440da6528577d1561f2d3d4143506a26 100644 (file)
@@ -16,6 +16,7 @@ OBJCOPYFLAGS  :=-O binary -R .comment -S
 ifneq ($(MACHINE),)
 include $(MACHINE)/Makefile.boot
 endif
+include $(srctree)/arch/arm/boot/dts/Makefile
 
 # Note: the following conditions must always be true:
 #   ZRELADDR == virt_to_phys(PAGE_OFFSET + TEXT_OFFSET)
@@ -29,6 +30,14 @@ export ZRELADDR INITRD_PHYS PARAMS_PHYS
 
 targets := Image zImage xipImage bootpImage uImage
 
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
 ifeq ($(CONFIG_XIP_KERNEL),y)
 
 $(obj)/xipImage: vmlinux FORCE
@@ -55,6 +64,10 @@ $(obj)/compressed/vmlinux: $(obj)/Image FORCE
 $(obj)/zImage: $(obj)/compressed/vmlinux FORCE
        $(call if_changed,objcopy)
 
+$(obj)/zImage-dtb:     $(obj)/zImage $(DTB_OBJS) FORCE
+       $(call if_changed,cat)
+       @echo '  Kernel: $@ is ready'
+
 endif
 
 ifneq ($(LOADADDR),)
index 8a756870c238435af684215c653f54a739f4f1a5..5b9e2d4bc1b3fb06c85a40132170b0c2dad4fe30 100644 (file)
@@ -794,6 +794,8 @@ __armv7_mmu_cache_on:
                bic     r6, r6, #1 << 31        @ 32-bit translation system
                bic     r6, r6, #(7 << 0) | (1 << 4)    @ use only ttbr0
                mcrne   p15, 0, r3, c2, c0, 0   @ load page table pointer
+               mcrne   p15, 0, r0, c8, c7, 0   @ flush I,D TLBs
+               mcr     p15, 0, r0, c7, c5, 4   @ ISB
                mcrne   p15, 0, r1, c3, c0, 0   @ load domain access control
                mcrne   p15, 0, r6, c2, c0, 2   @ load ttb control
 #endif
index eff87a3445662c767bf6f4b24a94620148ab11e3..86e591cc25674563f024f60dc644ac08007fdbad 100644 (file)
@@ -1074,5 +1074,15 @@ endif
 dtstree                := $(srctree)/$(src)
 dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(wildcard $(dtstree)/*.dts))
 
-always         := $(dtb-y)
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+
+targets += dtbs dtbs_install
+targets += $(DTB_LIST)
+
+always         := $(DTB_LIST)
 clean-files    := *.dtb
index a4c7713edfcd5c9807cf5c4827978303aa8e1c95..c5c365f35baa2eb354aad663d09b404e686ca11d 100644 (file)
@@ -41,6 +41,7 @@
                        cci-control-port = <&cci_control1>;
                        cpu-idle-states = <&CLUSTER_SLEEP_BIG>;
                        capacity-dmips-mhz = <1024>;
+                       sched-energy-costs = <&CPU_COST_A15 &CLUSTER_COST_A15>;
                };
 
                cpu1: cpu@1 {
@@ -50,6 +51,7 @@
                        cci-control-port = <&cci_control1>;
                        cpu-idle-states = <&CLUSTER_SLEEP_BIG>;
                        capacity-dmips-mhz = <1024>;
+                       sched-energy-costs = <&CPU_COST_A15 &CLUSTER_COST_A15>;
                };
 
                cpu2: cpu@2 {
@@ -59,6 +61,7 @@
                        cci-control-port = <&cci_control2>;
                        cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
                        capacity-dmips-mhz = <516>;
+                       sched-energy-costs = <&CPU_COST_A7 &CLUSTER_COST_A7>;
                };
 
                cpu3: cpu@3 {
@@ -68,6 +71,7 @@
                        cci-control-port = <&cci_control2>;
                        cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
                        capacity-dmips-mhz = <516>;
+                       sched-energy-costs = <&CPU_COST_A7 &CLUSTER_COST_A7>;
                };
 
                cpu4: cpu@4 {
@@ -77,6 +81,7 @@
                        cci-control-port = <&cci_control2>;
                        cpu-idle-states = <&CLUSTER_SLEEP_LITTLE>;
                        capacity-dmips-mhz = <516>;
+                       sched-energy-costs = <&CPU_COST_A7 &CLUSTER_COST_A7>;
                };
 
                idle-states {
                                min-residency-us = <2500>;
                        };
                };
+
+               energy-costs {
+                       CPU_COST_A15: core-cost0 {
+                               busy-cost-data = <
+                                       426    2021
+                                       512    2312
+                                       597    2756
+                                       682    3125
+                                       768    3524
+                                       853    3846
+                                       938    5177
+                                       1024   6997
+                               >;
+                               idle-cost-data = <
+                                       0
+                                       0
+                                       0
+                               >;
+                       };
+                       CPU_COST_A7: core-cost1 {
+                               busy-cost-data = <
+                                       150    187
+                                       172    275
+                                       215    334
+                                       258    407
+                                       301    447
+                                       344    549
+                                       387    761
+                                       430    1024
+                               >;
+                               idle-cost-data = <
+                                       0
+                                       0
+                                       0
+                               >;
+                       };
+                       CLUSTER_COST_A15: cluster-cost0 {
+                               busy-cost-data = <
+                                       426    7920
+                                       512    8165
+                                       597    8172
+                                       682    8195
+                                       768    8265
+                                       853    8446
+                                       938    11426
+                                       1024   15200
+                               >;
+                               idle-cost-data = <
+                                        70
+                                        70
+                                        25
+                               >;
+                       };
+                       CLUSTER_COST_A7: cluster-cost1 {
+                               busy-cost-data = <
+                                       150    2967
+                                       172    2792
+                                       215    2810
+                                       258    2815
+                                       301    2919
+                                       344    2847
+                                       387    3917
+                                       430    4905
+                               >;
+                               idle-cost-data = <
+                                       25
+                                       25
+                                       10
+                               >;
+                       };
+               };
        };
 
        memory@80000000 {
index e5ad0708849a2f57d14d6e1f8141f6469d44aeeb..f07f47943bf381218a04320df087ec2a1f232e21 100644 (file)
@@ -15,3 +15,7 @@ config SHARP_PARAM
 
 config SHARP_SCOOP
        bool
+
+config FIQ_GLUE
+       bool
+       select FIQ
index 70b4a14ed9937aad24ff77f5dd16dd0a52934538..10b506469c95a80a38aa8afc20cd77af3a63bf0a 100644 (file)
@@ -5,6 +5,7 @@
 
 obj-y                          += firmware.o
 
+obj-$(CONFIG_FIQ_GLUE)         += fiq_glue.o fiq_glue_setup.o
 obj-$(CONFIG_SA1111)           += sa1111.o
 obj-$(CONFIG_DMABOUNCE)                += dmabounce.o
 obj-$(CONFIG_SHARP_LOCOMO)     += locomo.o
diff --git a/arch/arm/common/fiq_glue.S b/arch/arm/common/fiq_glue.S
new file mode 100644 (file)
index 0000000..24b42ce
--- /dev/null
@@ -0,0 +1,118 @@
+/*
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/linkage.h>
+#include <asm/assembler.h>
+
+               .text
+
+               .global fiq_glue_end
+
+               /* fiq stack: r0-r15,cpsr,spsr of interrupted mode */
+
+ENTRY(fiq_glue)
+               /* store pc, cpsr from previous mode, reserve space for spsr */
+               mrs     r12, spsr
+               sub     lr, lr, #4
+               subs    r10, #1
+               bne     nested_fiq
+
+               str     r12, [sp, #-8]!
+               str     lr, [sp, #-4]!
+
+               /* store r8-r14 from previous mode */
+               sub     sp, sp, #(7 * 4)
+               stmia   sp, {r8-r14}^
+               nop
+
+               /* store r0-r7 from previous mode */
+               stmfd   sp!, {r0-r7}
+
+               /* setup func(data,regs) arguments */
+               mov     r0, r9
+               mov     r1, sp
+               mov     r3, r8
+
+               mov     r7, sp
+
+               /* Get sp and lr from non-user modes */
+               and     r4, r12, #MODE_MASK
+               cmp     r4, #USR_MODE
+               beq     fiq_from_usr_mode
+
+               mov     r7, sp
+               orr     r4, r4, #(PSR_I_BIT | PSR_F_BIT)
+               msr     cpsr_c, r4
+               str     sp, [r7, #(4 * 13)]
+               str     lr, [r7, #(4 * 14)]
+               mrs     r5, spsr
+               str     r5, [r7, #(4 * 17)]
+
+               cmp     r4, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+               /* use fiq stack if we reenter this mode */
+               subne   sp, r7, #(4 * 3)
+
+fiq_from_usr_mode:
+               msr     cpsr_c, #(SVC_MODE | PSR_I_BIT | PSR_F_BIT)
+               mov     r2, sp
+               sub     sp, r7, #12
+               stmfd   sp!, {r2, ip, lr}
+               /* call func(data,regs) */
+               blx     r3
+               ldmfd   sp, {r2, ip, lr}
+               mov     sp, r2
+
+               /* restore/discard saved state */
+               cmp     r4, #USR_MODE
+               beq     fiq_from_usr_mode_exit
+
+               msr     cpsr_c, r4
+               ldr     sp, [r7, #(4 * 13)]
+               ldr     lr, [r7, #(4 * 14)]
+               msr     spsr_cxsf, r5
+
+fiq_from_usr_mode_exit:
+               msr     cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+
+               ldmfd   sp!, {r0-r7}
+               ldr     lr, [sp, #(4 * 7)]
+               ldr     r12, [sp, #(4 * 8)]
+               add     sp, sp, #(10 * 4)
+exit_fiq:
+               msr     spsr_cxsf, r12
+               add     r10, #1
+               cmp     r11, #0
+               moveqs  pc, lr
+               bx      r11 /* jump to custom fiq return function */
+
+nested_fiq:
+               orr     r12, r12, #(PSR_F_BIT)
+               b       exit_fiq
+
+fiq_glue_end:
+
+ENTRY(fiq_glue_setup) /* func, data, sp, smc call number */
+               stmfd           sp!, {r4}
+               mrs             r4, cpsr
+               msr             cpsr_c, #(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)
+               movs            r8, r0
+               mov             r9, r1
+               mov             sp, r2
+               mov             r11, r3
+               moveq           r10, #0
+               movne           r10, #1
+               msr             cpsr_c, r4
+               ldmfd           sp!, {r4}
+               bx              lr
+
diff --git a/arch/arm/common/fiq_glue_setup.c b/arch/arm/common/fiq_glue_setup.c
new file mode 100644 (file)
index 0000000..8cb1b61
--- /dev/null
@@ -0,0 +1,147 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/slab.h>
+#include <asm/fiq.h>
+#include <asm/fiq_glue.h>
+
+extern unsigned char fiq_glue, fiq_glue_end;
+extern void fiq_glue_setup(void *func, void *data, void *sp,
+                          fiq_return_handler_t fiq_return_handler);
+
+static struct fiq_handler fiq_debbuger_fiq_handler = {
+       .name = "fiq_glue",
+};
+DEFINE_PER_CPU(void *, fiq_stack);
+static struct fiq_glue_handler *current_handler;
+static fiq_return_handler_t fiq_return_handler;
+static DEFINE_MUTEX(fiq_glue_lock);
+
+static void fiq_glue_setup_helper(void *info)
+{
+       struct fiq_glue_handler *handler = info;
+       fiq_glue_setup(handler->fiq, handler,
+               __get_cpu_var(fiq_stack) + THREAD_START_SP,
+               fiq_return_handler);
+}
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler)
+{
+       int ret;
+       int cpu;
+
+       if (!handler || !handler->fiq)
+               return -EINVAL;
+
+       mutex_lock(&fiq_glue_lock);
+       if (fiq_stack) {
+               ret = -EBUSY;
+               goto err_busy;
+       }
+
+       for_each_possible_cpu(cpu) {
+               void *stack;
+               stack = (void *)__get_free_pages(GFP_KERNEL, THREAD_SIZE_ORDER);
+               if (WARN_ON(!stack)) {
+                       ret = -ENOMEM;
+                       goto err_alloc_fiq_stack;
+               }
+               per_cpu(fiq_stack, cpu) = stack;
+       }
+
+       ret = claim_fiq(&fiq_debbuger_fiq_handler);
+       if (WARN_ON(ret))
+               goto err_claim_fiq;
+
+       current_handler = handler;
+       on_each_cpu(fiq_glue_setup_helper, handler, true);
+       set_fiq_handler(&fiq_glue, &fiq_glue_end - &fiq_glue);
+
+       mutex_unlock(&fiq_glue_lock);
+       return 0;
+
+err_claim_fiq:
+err_alloc_fiq_stack:
+       for_each_possible_cpu(cpu) {
+               __free_pages(per_cpu(fiq_stack, cpu), THREAD_SIZE_ORDER);
+               per_cpu(fiq_stack, cpu) = NULL;
+       }
+err_busy:
+       mutex_unlock(&fiq_glue_lock);
+       return ret;
+}
+
+static void fiq_glue_update_return_handler(void (*fiq_return)(void))
+{
+       fiq_return_handler = fiq_return;
+       if (current_handler)
+               on_each_cpu(fiq_glue_setup_helper, current_handler, true);
+}
+
+int fiq_glue_set_return_handler(void (*fiq_return)(void))
+{
+       int ret;
+
+       mutex_lock(&fiq_glue_lock);
+       if (fiq_return_handler) {
+               ret = -EBUSY;
+               goto err_busy;
+       }
+       fiq_glue_update_return_handler(fiq_return);
+       ret = 0;
+err_busy:
+       mutex_unlock(&fiq_glue_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(fiq_glue_set_return_handler);
+
+int fiq_glue_clear_return_handler(void (*fiq_return)(void))
+{
+       int ret;
+
+       mutex_lock(&fiq_glue_lock);
+       if (WARN_ON(fiq_return_handler != fiq_return)) {
+               ret = -EINVAL;
+               goto err_inval;
+       }
+       fiq_glue_update_return_handler(NULL);
+       ret = 0;
+err_inval:
+       mutex_unlock(&fiq_glue_lock);
+
+       return ret;
+}
+EXPORT_SYMBOL(fiq_glue_clear_return_handler);
+
+/**
+ * fiq_glue_resume - Restore fiqs after suspend or low power idle states
+ *
+ * This must be called before calling local_fiq_enable after returning from a
+ * power state where the fiq mode registers were lost. If a driver provided
+ * a resume hook when it registered the handler it will be called.
+ */
+
+void fiq_glue_resume(void)
+{
+       if (!current_handler)
+               return;
+       fiq_glue_setup(current_handler->fiq, current_handler,
+               __get_cpu_var(fiq_stack) + THREAD_START_SP,
+               fiq_return_handler);
+       if (current_handler->resume)
+               current_handler->resume(current_handler);
+}
+
diff --git a/arch/arm/configs/ranchu_defconfig b/arch/arm/configs/ranchu_defconfig
new file mode 100644 (file)
index 0000000..461a85a
--- /dev/null
@@ -0,0 +1,314 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CPUSETS=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+CONFIG_PROFILING=y
+CONFIG_OPROFILE=y
+CONFIG_ARCH_MMAP_RND_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+# CONFIG_IOSCHED_CFQ is not set
+CONFIG_ARCH_VIRT=y
+CONFIG_ARM_KERNMEM_PERMS=y
+CONFIG_SMP=y
+CONFIG_PREEMPT=y
+CONFIG_AEABI=y
+CONFIG_HIGHMEM=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_CMDLINE="console=ttyAMA0"
+CONFIG_VFP=y
+CONFIG_NEON=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_MTD=y
+CONFIG_MTD_CMDLINE_PARTS=y
+CONFIG_MTD_BLOCK=y
+CONFIG_MTD_CFI=y
+CONFIG_MTD_CFI_INTELEXT=y
+CONFIG_MTD_CFI_AMDSTD=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMSC911X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_USB_USBNET=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+CONFIG_SERIO_AMBAKMI=y
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_EHCI_HCD=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_RTC_CLASS=y
+CONFIG_RTC_DRV_PL031=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DETECT_HUNG_TASK=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_ENABLE_DEFAULT_TRACERS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_VIRTUALIZATION=y
index 8c5ca92a87a990ea69d58b9167299fc53d2d9e92..51f0d2417fa999d581d2fc36a6ece701b8419c9c 100644 (file)
@@ -113,8 +113,12 @@ int dump_task_regs(struct task_struct *t, elf_gregset_t *elfregs);
 #define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE      4096
 
-/* This is the base location for PIE (ET_DYN with INTERP) loads. */
-#define ELF_ET_DYN_BASE                0x400000UL
+/* This is the location that an ET_DYN program is loaded if exec'ed.  Typical
+   use of this is to invoke "./ld.so someprog" to test out a new version of
+   the loader.  We need to make sure that it is out of the way of the program
+   that it will "exec", and that there is sufficient room for the brk.  */
+
+#define ELF_ET_DYN_BASE        (TASK_SIZE / 3 * 2)
 
 /* When the program starts, a1 contains a pointer to a function to be 
    registered with atexit, as per the SVR4 ABI.  A value of 0 means we 
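As a rough worked example of the new define (the TASK_SIZE value below is an assumption chosen for illustration, not taken from this tree): with a 3 GB user split where TASK_SIZE is 0xC0000000, the expression places PIE binaries at the 2 GB mark, well clear of the low text region of a program the loader might exec and of the brk area above it.

/* Illustration only: EXAMPLE_TASK_SIZE is an assumed value, not this tree's TASK_SIZE. */
#define EXAMPLE_TASK_SIZE	0xC0000000UL
#define EXAMPLE_ET_DYN_BASE	(EXAMPLE_TASK_SIZE / 3 * 2)	/* == 0x80000000UL */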
diff --git a/arch/arm/include/asm/fiq_glue.h b/arch/arm/include/asm/fiq_glue.h
new file mode 100644 (file)
index 0000000..a9e244f
--- /dev/null
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __ASM_FIQ_GLUE_H
+#define __ASM_FIQ_GLUE_H
+
+struct fiq_glue_handler {
+       void (*fiq)(struct fiq_glue_handler *h, void *regs, void *svc_sp);
+       void (*resume)(struct fiq_glue_handler *h);
+};
+typedef void (*fiq_return_handler_t)(void);
+
+int fiq_glue_register_handler(struct fiq_glue_handler *handler);
+int fiq_glue_set_return_handler(fiq_return_handler_t fiq_return);
+int fiq_glue_clear_return_handler(fiq_return_handler_t fiq_return);
+
+#ifdef CONFIG_FIQ_GLUE
+void fiq_glue_resume(void);
+#else
+static inline void fiq_glue_resume(void) {}
+#endif
+
+#endif
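A minimal sketch of a client of this interface (the example_* names and the init hook are hypothetical; only struct fiq_glue_handler and fiq_glue_register_handler() come from the header above):

#include <asm/fiq_glue.h>

/* Runs in FIQ mode on a dedicated stack: keep it short, no sleeping, no locks. */
static void example_fiq(struct fiq_glue_handler *h, void *regs, void *svc_sp)
{
	/* acknowledge the FIQ source and capture whatever state is needed */
}

/* Called from fiq_glue_resume() after a power state that lost the FIQ mode registers. */
static void example_resume(struct fiq_glue_handler *h)
{
	/* re-arm the FIQ source */
}

static struct fiq_glue_handler example_handler = {
	.fiq	= example_fiq,
	.resume	= example_resume,
};

static int __init example_fiq_init(void)
{
	return fiq_glue_register_handler(&example_handler);
}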
index f59ab9bcbaf956b8cac5568e62c54de17a7cd648..2a786f54d8b8b26af5b17f00c3373e09da3e30f8 100644 (file)
@@ -25,6 +25,17 @@ void init_cpu_topology(void);
 void store_cpu_topology(unsigned int cpuid);
 const struct cpumask *cpu_coregroup_mask(int cpu);
 
+#include <linux/arch_topology.h>
+
+/* Replace task scheduler's default frequency-invariant accounting */
+#define arch_scale_freq_capacity topology_get_freq_scale
+
+/* Replace task scheduler's default cpu-invariant accounting */
+#define arch_scale_cpu_capacity topology_get_cpu_scale
+
+/* Enable topology flag updates */
+#define arch_update_cpu_topology topology_update_cpu_topology
+
 #else
 
 static inline void init_cpu_topology(void) { }
index caa0dbe3dc6156b6f1d36db227ab11346157322a..923a725ab9b585d74ace759389894d1ea5041a07 100644 (file)
@@ -141,6 +141,8 @@ int kgdb_arch_handle_exception(int exception_vector, int signo,
 
 static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
 {
+       if (user_mode(regs))
+               return -1;
        kgdb_handle_exception(1, SIGTRAP, 0, regs);
 
        return 0;
@@ -148,6 +150,8 @@ static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
 
 static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
 {
+       if (user_mode(regs))
+               return -1;
        compiled_break = 1;
        kgdb_handle_exception(1, SIGTRAP, 0, regs);
 
index d96714e1858c466ff8e055d84df9828349469d80..4b675a80f523d47b2ff7930fdcb751bd161beb71 100644 (file)
@@ -94,6 +94,77 @@ void arch_cpu_idle_exit(void)
        ledtrig_cpu(CPU_LED_IDLE_END);
 }
 
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+       int     i, j;
+       int     nlines;
+       u32     *p;
+
+       /*
+        * don't attempt to dump non-kernel addresses or
+        * values that are probably just small negative numbers
+        */
+       if (addr < PAGE_OFFSET || addr > -256UL)
+               return;
+
+       printk("\n%s: %#lx:\n", name, addr);
+
+       /*
+        * round address down to a 32 bit boundary
+        * and always dump a multiple of 32 bytes
+        */
+       p = (u32 *)(addr & ~(sizeof(u32) - 1));
+       nbytes += (addr & (sizeof(u32) - 1));
+       nlines = (nbytes + 31) / 32;
+
+
+       for (i = 0; i < nlines; i++) {
+               /*
+                * just display low 16 bits of address to keep
+                * each line of the dump < 80 characters
+                */
+               printk("%04lx ", (unsigned long)p & 0xffff);
+               for (j = 0; j < 8; j++) {
+                       u32     data;
+                       if (probe_kernel_address(p, data)) {
+                               printk(" ********");
+                       } else {
+                               printk(" %08x", data);
+                       }
+                       ++p;
+               }
+               printk("\n");
+       }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+       mm_segment_t fs;
+
+       fs = get_fs();
+       set_fs(KERNEL_DS);
+       show_data(regs->ARM_pc - nbytes, nbytes * 2, "PC");
+       show_data(regs->ARM_lr - nbytes, nbytes * 2, "LR");
+       show_data(regs->ARM_sp - nbytes, nbytes * 2, "SP");
+       show_data(regs->ARM_ip - nbytes, nbytes * 2, "IP");
+       show_data(regs->ARM_fp - nbytes, nbytes * 2, "FP");
+       show_data(regs->ARM_r0 - nbytes, nbytes * 2, "R0");
+       show_data(regs->ARM_r1 - nbytes, nbytes * 2, "R1");
+       show_data(regs->ARM_r2 - nbytes, nbytes * 2, "R2");
+       show_data(regs->ARM_r3 - nbytes, nbytes * 2, "R3");
+       show_data(regs->ARM_r4 - nbytes, nbytes * 2, "R4");
+       show_data(regs->ARM_r5 - nbytes, nbytes * 2, "R5");
+       show_data(regs->ARM_r6 - nbytes, nbytes * 2, "R6");
+       show_data(regs->ARM_r7 - nbytes, nbytes * 2, "R7");
+       show_data(regs->ARM_r8 - nbytes, nbytes * 2, "R8");
+       show_data(regs->ARM_r9 - nbytes, nbytes * 2, "R9");
+       show_data(regs->ARM_r10 - nbytes, nbytes * 2, "R10");
+       set_fs(fs);
+}
+
 void __show_regs(struct pt_regs *regs)
 {
        unsigned long flags;
@@ -185,6 +256,8 @@ void __show_regs(struct pt_regs *regs)
                printk("Control: %08x%s\n", ctrl, buf);
        }
 #endif
+
+       show_extra_register_data(regs, 128);
 }
 
 void show_regs(struct pt_regs * regs)
index 3b2aa9a9fe268d45335f781c4aa22cf573753a1b..c74249136f18d732d8f548eaf58f423365ae2be3 100644 (file)
@@ -6,6 +6,7 @@
  * it under the terms of the GNU General Public License version 2 as
  * published by the Free Software Foundation.
  */
+#include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/delay.h>
 #include <linux/reboot.h>
@@ -125,6 +126,31 @@ void machine_power_off(void)
                pm_power_off();
 }
 
+#ifdef CONFIG_ARM_FLUSH_CONSOLE_ON_RESTART
+void arm_machine_flush_console(void)
+{
+       printk("\n");
+       pr_emerg("Restarting %s\n", linux_banner);
+       if (console_trylock()) {
+               console_unlock();
+               return;
+       }
+
+       mdelay(50);
+
+       local_irq_disable();
+       if (!console_trylock())
+               pr_emerg("arm_restart: Console was locked! Busting\n");
+       else
+               pr_emerg("arm_restart: Console was locked!\n");
+       console_unlock();
+}
+#else
+void arm_machine_flush_console(void)
+{
+}
+#endif
+
 /*
  * Restart requires that the secondary CPUs stop performing any activity
  * while the primary CPU resets the system. Systems with a single CPU can
@@ -141,6 +167,10 @@ void machine_restart(char *cmd)
        local_irq_disable();
        smp_send_stop();
 
+       /* Flush the console to make sure all the relevant messages make it
+        * out to the console drivers */
+       arm_machine_flush_console();
+
        if (arm_pm_restart)
                arm_pm_restart(reboot_mode, cmd);
        else
index 24ac3cab411d94a186390aa1fedb16b53e1d3a73..a86e1057bd3645f6fe7979bb03c98237a4851085 100644 (file)
 #include <linux/sched/topology.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/sched_energy.h>
 
 #include <asm/cpu.h>
 #include <asm/cputype.h>
 #include <asm/topology.h>
 
+/* sd energy functions */
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+       struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];
+       unsigned long capacity;
+       int max_cap_idx;
+
+       if (!sge) {
+               pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
+               return NULL;
+       }
+
+       max_cap_idx = sge->nr_cap_states - 1;
+       capacity = sge->cap_states[max_cap_idx].cap;
+
+       printk_deferred("cpu=%d set cpu scale %lu from energy model\n",
+                       cpu, capacity);
+
+       topology_set_cpu_scale(cpu, capacity);
+
+       return sge;
+}
+
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+       struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
+
+       if (!sge) {
+               pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
+               return NULL;
+       }
+
+       return sge;
+}
+
 /*
  * cpu capacity scale management
  */
@@ -169,10 +207,26 @@ static void __init parse_dt_topology(void)
  */
 static void update_cpu_capacity(unsigned int cpu)
 {
-       if (!cpu_capacity(cpu) || cap_from_dt)
-               return;
+       const struct sched_group_energy *sge;
+       unsigned long capacity;
 
-       topology_set_cpu_scale(cpu, cpu_capacity(cpu) / middle_capacity);
+       sge = cpu_core_energy(cpu);
+
+       if (sge) {
+               int max_cap_idx;
+
+               max_cap_idx = sge->nr_cap_states - 1;
+               capacity = sge->cap_states[max_cap_idx].cap;
+
+               printk_deferred("cpu=%d set cpu scale %lu from energy model\n",
+                               cpu, capacity);
+       } else {
+               if (!cpu_capacity(cpu) || cap_from_dt)
+                       return;
+               capacity = cpu_capacity(cpu) / middle_capacity;
+       }
+
+       topology_set_cpu_scale(cpu, capacity);
 
        pr_info("CPU%u: update cpu_capacity %lu\n",
                cpu, topology_get_cpu_scale(NULL, cpu));
@@ -278,23 +332,37 @@ void store_cpu_topology(unsigned int cpuid)
 
        update_cpu_capacity(cpuid);
 
+       topology_detect_flags();
+
        pr_info("CPU%u: thread %d, cpu %d, socket %d, mpidr %x\n",
                cpuid, cpu_topology[cpuid].thread_id,
                cpu_topology[cpuid].core_id,
                cpu_topology[cpuid].socket_id, mpidr);
 }
 
+#ifdef CONFIG_SCHED_MC
+static int core_flags(void)
+{
+       return cpu_core_flags() | topology_core_flags();
+}
+
 static inline int cpu_corepower_flags(void)
 {
-       return SD_SHARE_PKG_RESOURCES  | SD_SHARE_POWERDOMAIN;
+       return topology_core_flags()
+               | SD_SHARE_PKG_RESOURCES | SD_SHARE_POWERDOMAIN;
+}
+#endif
+
+static int cpu_flags(void)
+{
+       return topology_cpu_flags();
 }
 
 static struct sched_domain_topology_level arm_topology[] = {
 #ifdef CONFIG_SCHED_MC
-       { cpu_corepower_mask, cpu_corepower_flags, SD_INIT_NAME(GMC) },
-       { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) },
+       { cpu_coregroup_mask, core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
 #endif
-       { cpu_cpu_mask, SD_INIT_NAME(DIE) },
+       { cpu_cpu_mask, cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) },
        { NULL, },
 };
 
@@ -322,4 +390,6 @@ void __init init_cpu_topology(void)
 
        /* Set scheduler topology descriptor */
        set_sched_topology(arm_topology);
+
+       init_sched_energy_costs();
 }
index 24659952c2784de64a53dc2e889ab616bd19b12b..11da0f50a1fef80c0789e653f8307ccee434e5c6 100644 (file)
@@ -270,6 +270,11 @@ v6_dma_clean_range:
  *     - end     - virtual end address of region
  */
 ENTRY(v6_dma_flush_range)
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+       sub     r2, r1, r0
+       cmp     r2, #CONFIG_CACHE_FLUSH_RANGE_LIMIT
+       bhi     v6_dma_flush_dcache_all
+#endif
 #ifdef CONFIG_DMA_CACHE_RWFO
        ldrb    r2, [r0]                @ read for ownership
        strb    r2, [r0]                @ write for ownership
@@ -292,6 +297,18 @@ ENTRY(v6_dma_flush_range)
        mcr     p15, 0, r0, c7, c10, 4          @ drain write buffer
        ret     lr
 
+#ifdef CONFIG_CACHE_FLUSH_RANGE_LIMIT
+v6_dma_flush_dcache_all:
+       mov     r0, #0
+#ifdef HARVARD_CACHE
+       mcr     p15, 0, r0, c7, c14, 0          @ D cache clean+invalidate
+#else
+       mcr     p15, 0, r0, c7, c15, 0          @ Cache clean+invalidate
+#endif
+       mcr     p15, 0, r0, c7, c10, 4          @ drain write buffer
+       mov     pc, lr
+#endif
+
 /*
  *     dma_map_area(start, size, dir)
  *     - start - kernel virtual start address
index 42f585379e19c97fcb6abe1004d9b469f90215ce..6123d126e5ae6e2744d9065d0d3dfb88657ee839 100644 (file)
@@ -274,10 +274,10 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
                local_irq_enable();
 
        /*
-        * If we're in an interrupt or have no user
+        * If we're in an interrupt, or have no irqs, or have no user
         * context, we must not take the fault..
         */
-       if (faulthandler_disabled() || !mm)
+       if (faulthandler_disabled() || irqs_disabled() || !mm)
                goto no_context;
 
        if (user_mode(regs))
index c2abb4e88ff2fa954a56de3b1d421763e9b83337..cc904e0a1a5945547de8234ddbaa5b380da5d456 100644 (file)
@@ -1081,6 +1081,23 @@ config CMDLINE
          entering them here. As a minimum, you should specify the
          root device (e.g. root=/dev/nfs).
 
+choice
+       prompt "Kernel command line type" if CMDLINE != ""
+       default CMDLINE_FROM_BOOTLOADER
+
+config CMDLINE_FROM_BOOTLOADER
+       bool "Use bootloader kernel arguments if available"
+       help
+         Uses the command-line options passed by the boot loader. If
+         the boot loader doesn't provide any, the default kernel command
+         string provided in CMDLINE will be used.
+
+config CMDLINE_EXTEND
+       bool "Extend bootloader kernel arguments"
+       help
+         The command-line arguments provided by the boot loader will be
+         appended to the default kernel command string.
+
 config CMDLINE_FORCE
        bool "Always use the default kernel command string"
        help
@@ -1088,6 +1105,7 @@ config CMDLINE_FORCE
          loader passes other arguments to the kernel.
          This is useful if you cannot or don't want to change the
          command-line options your boot loader passes to the kernel.
+endchoice
 
 config EFI_STUB
        bool
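To make the interaction of the new choice concrete, here is a hypothetical configuration and its effect (the values are assumptions for illustration; the combination rule follows the help texts above):

# .config fragment (illustrative values)
CONFIG_CMDLINE="console=ttyAMA0 loglevel=7"
CONFIG_CMDLINE_EXTEND=y
# Bootloader passes:       root=/dev/mmcblk0p2
# Effective command line:  console=ttyAMA0 loglevel=7 root=/dev/mmcblk0p2
# With CMDLINE_FROM_BOOTLOADER instead, the bootloader string alone is used
# and CONFIG_CMDLINE only serves as a fallback when none is provided.
# With CMDLINE_FORCE, CONFIG_CMDLINE alone is used.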
@@ -1120,6 +1138,41 @@ config DMI
          However, even with this option, the resultant kernel should
          continue to boot on existing non-UEFI platforms.
 
+config BUILD_ARM64_APPENDED_DTB_IMAGE
+       bool "Build a concatenated Image.gz/dtb by default"
+       depends on OF
+       help
+         Enabling this option will cause a concatenated Image.gz and list of
+         DTBs to be built by default (instead of a standalone Image.gz).
+         The image will be built in arch/arm64/boot/Image.gz-dtb.
+
+choice
+       prompt "Appended DTB Kernel Image name"
+       depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+       help
+         Enabling this option will cause a specific kernel image (Image or
+         Image.gz) to be used for final image creation.
+         The image will be built in arch/arm64/boot/IMAGE-NAME-dtb.
+
+       config IMG_GZ_DTB
+               bool "Image.gz-dtb"
+       config IMG_DTB
+               bool "Image-dtb"
+endchoice
+
+config BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME
+       string
+       depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+       default "Image.gz-dtb" if IMG_GZ_DTB
+       default "Image-dtb" if IMG_DTB
+
+config BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES
+       string "Default dtb names"
+       depends on BUILD_ARM64_APPENDED_DTB_IMAGE
+       help
+         Space-separated list of dtb names to append when building a
+         concatenated Image.gz-dtb.
+
 endmenu
 
 menu "Userspace binary formats"
index 7318165cfc90b5b4e36048fbafd9287a1cf3516f..3132997f668ebf59d0c53b2f37ff4b1406b5cdcc 100644 (file)
@@ -49,9 +49,17 @@ $(warning Detected assembler with broken .inst; disassembly will be unreliable)
   endif
 endif
 
-KBUILD_CFLAGS  += -mgeneral-regs-only $(lseinstr) $(brokengasinst)
+ifeq ($(cc-name),clang)
+# This is a workaround for https://bugs.llvm.org/show_bug.cgi?id=30792.
+# TODO: revert when this is fixed in LLVM.
+KBUILD_CFLAGS  += -mno-implicit-float
+else
+KBUILD_CFLAGS  += -mgeneral-regs-only
+endif
+KBUILD_CFLAGS  += $(lseinstr) $(brokengasinst)
 KBUILD_CFLAGS  += -fno-asynchronous-unwind-tables
 KBUILD_CFLAGS  += $(call cc-option, -mpc-relative-literal-loads)
+KBUILD_CFLAGS  += -fno-pic
 KBUILD_AFLAGS  += $(lseinstr) $(brokengasinst)
 
 KBUILD_CFLAGS  += $(call cc-option,-mabi=lp64)
@@ -95,6 +103,10 @@ else
 TEXT_OFFSET := 0x00080000
 endif
 
+ifeq ($(cc-name),clang)
+KBUILD_CFLAGS += $(call cc-disable-warning, asm-operand-widths)
+endif
+
 # KASAN_SHADOW_OFFSET = VA_START + (1 << (VA_BITS - 3)) - (1 << 61)
 # in 32-bit arithmetic
 KASAN_SHADOW_OFFSET := $(shell printf "0x%08x00000000\n" $$(( \
@@ -114,10 +126,15 @@ core-$(CONFIG_EFI_STUB) += $(objtree)/drivers/firmware/efi/libstub/lib.a
 
 # Default target when executing plain make
 boot           := arch/arm64/boot
+ifeq ($(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE),y)
+KBUILD_IMAGE   := $(boot)/$(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME))
+else
 KBUILD_IMAGE   := $(boot)/Image.gz
+endif
+
 KBUILD_DTBS    := dtbs
 
-all:   Image.gz $(KBUILD_DTBS)
+all:   Image.gz $(KBUILD_DTBS) $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_KERNEL_IMAGE_NAME))
 
 
 Image: vmlinux
@@ -140,6 +157,9 @@ dtbs: prepare scripts
 dtbs_install:
        $(Q)$(MAKE) $(dtbinst)=$(boot)/dts
 
+Image-dtb Image.gz-dtb: vmlinux scripts dtbs
+       $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@
+
 PHONY += vdso_install
 vdso_install:
        $(Q)$(MAKE) $(build)=arch/arm64/kernel/vdso $@
index 8dab0bb6ae667c5f89da8aca74a0a3bb61dcfaa9..34e35209fc2ed7b8d21d1306e15fe6563181952c 100644 (file)
@@ -1,2 +1,4 @@
 Image
+Image-dtb
 Image.gz
+Image.gz-dtb
index 1f012c506434360f764655a89eb4719c2aabd0f9..2c8cb864315e30b238b997e129dcb8fde133d00f 100644 (file)
 # Based on the ia64 boot/Makefile.
 #
 
+include $(srctree)/arch/arm64/boot/dts/Makefile
+
 OBJCOPYFLAGS_Image :=-O binary -R .note -R .note.gnu.build-id -R .comment -S
 
 targets := Image Image.gz
 
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+DTB_OBJS := $(addprefix $(obj)/dts/,$(DTB_LIST))
+
 $(obj)/Image: vmlinux FORCE
        $(call if_changed,objcopy)
 
 $(obj)/Image.bz2: $(obj)/Image FORCE
        $(call if_changed,bzip2)
 
+$(obj)/Image-dtb: $(obj)/Image $(DTB_OBJS) FORCE
+       $(call if_changed,cat)
+
 $(obj)/Image.gz: $(obj)/Image FORCE
        $(call if_changed,gzip)
 
@@ -36,6 +49,9 @@ $(obj)/Image.lzma: $(obj)/Image FORCE
 $(obj)/Image.lzo: $(obj)/Image FORCE
        $(call if_changed,lzo)
 
+$(obj)/Image.gz-dtb: $(obj)/Image.gz $(DTB_OBJS) FORCE
+       $(call if_changed,cat)
+
 install:
        $(CONFIG_SHELL) $(srctree)/$(src)/install.sh $(KERNELRELEASE) \
        $(obj)/Image System.map "$(INSTALL_PATH)"
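As a sketch of how the list expansion above plays out (the board names are made up for illustration, not taken from this tree):

# Illustration only:
# CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES="vendor/board-a vendor/board-b"
# gives:
#   DTB_NAMES := vendor/board-a vendor/board-b
#   DTB_LIST  := vendor/board-a.dtb vendor/board-b.dtb
#   DTB_OBJS  := $(obj)/dts/vendor/board-a.dtb $(obj)/dts/vendor/board-b.dtb
# "make Image.gz-dtb" then concatenates Image.gz with those blobs via the
# if_changed,cat rule, producing arch/arm64/boot/Image.gz-dtb.
# If the names string is empty, DTB_LIST falls back to $(dtb-y).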
index c6684ab8e201fc5dca8ce29726e8b3d8508e6fbd..db5a70876487ddc6591512994181f34df1ffa1e8 100644 (file)
@@ -32,3 +32,17 @@ dtstree              := $(srctree)/$(src)
 dtb-$(CONFIG_OF_ALL_DTBS) := $(patsubst $(dtstree)/%.dts,%.dtb, $(foreach d,$(dts-dirs), $(wildcard $(dtstree)/$(d)/*.dts)))
 
 always         := $(dtb-y)
+
+targets += dtbs
+
+DTB_NAMES := $(subst $\",,$(CONFIG_BUILD_ARM64_APPENDED_DTB_IMAGE_NAMES))
+ifneq ($(DTB_NAMES),)
+DTB_LIST := $(addsuffix .dtb,$(DTB_NAMES))
+else
+DTB_LIST := $(dtb-y)
+endif
+targets += $(DTB_LIST)
+
+dtbs: $(addprefix $(obj)/, $(DTB_LIST))
+
+clean-files := dts/*.dtb *.dtb
index b39b6d6ec5aa1be93c88e7a988bdbae3cfd40a9b..d2467e478ec3d75e36924eb1a3cbdda814cf999d 100644 (file)
@@ -98,6 +98,7 @@
                        next-level-cache = <&A72_L2>;
                        clocks = <&scpi_dvfs 0>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A72 &CLUSTER_COST_A72>;
                        capacity-dmips-mhz = <1024>;
                };
 
                        next-level-cache = <&A72_L2>;
                        clocks = <&scpi_dvfs 0>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A72 &CLUSTER_COST_A72>;
                        capacity-dmips-mhz = <1024>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53R2 &CLUSTER_COST_A53R2>;
                        capacity-dmips-mhz = <485>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53R2 &CLUSTER_COST_A53R2>;
                        capacity-dmips-mhz = <485>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53R2 &CLUSTER_COST_A53R2>;
                        capacity-dmips-mhz = <485>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53R2 &CLUSTER_COST_A53R2>;
                        capacity-dmips-mhz = <485>;
                };
 
                        cache-line-size = <64>;
                        cache-sets = <1024>;
                };
+               /include/ "juno-sched-energy.dtsi"
        };
 
        pmu_a72 {
diff --git a/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi b/arch/arm64/boot/dts/arm/juno-sched-energy.dtsi
new file mode 100644 (file)
index 0000000..7ffb255
--- /dev/null
@@ -0,0 +1,123 @@
+/*
+ * ARM JUNO specific energy cost model data. There are no unit requirements for
+ * the data. Data can be normalized to any reference point, but the
+ * normalization must be consistent. That is, one bogo-joule/watt must be the
+ * same quantity for all data, but we don't care what it is.
+ */
+
+energy-costs {
+       /* Juno r0 Energy */
+       CPU_COST_A57: core-cost0 {
+               busy-cost-data = <
+                       417   168
+                       579   251
+                       744   359
+                       883   479
+                       1024   616
+               >;
+               idle-cost-data = <
+                       15
+                       15
+                        0
+                        0
+               >;
+       };
+       CPU_COST_A53: core-cost1 {
+               busy-cost-data = <
+                       235    33
+                       302    46
+                       368    61
+                       406    76
+                       447    93
+               >;
+               idle-cost-data = <
+                     6
+                     6
+                     0
+                     0
+               >;
+       };
+       CLUSTER_COST_A57: cluster-cost0 {
+               busy-cost-data = <
+                       417    24
+                       579    32
+                       744    43
+                       883    49
+                       1024    64
+               >;
+               idle-cost-data = <
+                        65
+                        65
+                        65
+                        24
+               >;
+       };
+       CLUSTER_COST_A53: cluster-cost1 {
+               busy-cost-data = <
+                       235    26
+                       303    30
+                       368    39
+                       406    47
+                       447    57
+               >;
+               idle-cost-data = <
+                       56
+                       56
+                       56
+                       17
+               >;
+       };
+       /* Juno r2 Energy */
+       CPU_COST_A72: core-cost2 {
+               busy-cost-data = <
+                       501   174
+                       849   344
+                       1024   526
+               >;
+               idle-cost-data = <
+                     48
+                     48
+                      0
+                      0
+               >;
+       };
+       CPU_COST_A53R2: core-cost3 {
+               busy-cost-data = <
+                       276    37
+                       501    59
+                       593   117
+               >;
+               idle-cost-data = <
+                     33
+                     33
+                     0
+                     0
+               >;
+       };
+       CLUSTER_COST_A72: cluster-cost2 {
+               busy-cost-data = <
+                       501    48
+                       849    73
+                       1024   107
+               >;
+               idle-cost-data = <
+                        48
+                        48
+                        48
+                        18
+               >;
+       };
+       CLUSTER_COST_A53R2: cluster-cost3 {
+               busy-cost-data = <
+                       276    41
+                       501    86
+                       593   107
+               >;
+               idle-cost-data = <
+                       41
+                       41
+                       41
+                       14
+               >;
+       };
+};
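For readers unfamiliar with the table format above, the sketch below gives the conventional EAS reading of these properties; the annotation is explanatory only and the ellipses stand for the remaining rows:

/* Interpretation sketch (annotation added for illustration):
 *	busy-cost-data = <
 *		417   168	// <capacity power>: relative power when busy
 *		...		//  at the operating point with capacity 417
 *	>;
 *	idle-cost-data = <
 *		15		// relative power per idle state, one entry
 *		...		//  per successively deeper state
 *	>;
 */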
index c9236c4b967d2f461b91730e0b8f3f941b32d307..ae5306a0ca267823007c8b957cfd00cf7afd4265 100644 (file)
@@ -97,6 +97,7 @@
                        next-level-cache = <&A57_L2>;
                        clocks = <&scpi_dvfs 0>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>;
                        capacity-dmips-mhz = <1024>;
                };
 
                        next-level-cache = <&A57_L2>;
                        clocks = <&scpi_dvfs 0>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A57 &CLUSTER_COST_A57>;
                        capacity-dmips-mhz = <1024>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>;
                        capacity-dmips-mhz = <578>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>;
                        capacity-dmips-mhz = <578>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>;
                        capacity-dmips-mhz = <578>;
                };
 
                        next-level-cache = <&A53_L2>;
                        clocks = <&scpi_dvfs 1>;
                        cpu-idle-states = <&CPU_SLEEP_0 &CLUSTER_SLEEP_0>;
+                       sched-energy-costs = <&CPU_COST_A53 &CLUSTER_COST_A53>;
                        capacity-dmips-mhz = <578>;
                };
 
                        cache-line-size = <64>;
                        cache-sets = <1024>;
                };
+               /include/ "juno-sched-energy.dtsi"
        };
 
        pmu_a57 {
index ff1dc89f599e6b8cadecb8a7ce18b4cba8ad590a..66d48e35d66da095b95f610aa4fd0423c0df1abc 100644 (file)
@@ -92,7 +92,9 @@
                        cooling-max-level = <0>;
                        #cooling-cells = <2>; /* min followed by max */
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
                        dynamic-power-coefficient = <311>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu1: cpu@1 {
                        next-level-cache = <&CLUSTER0_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu2: cpu@2 {
                        next-level-cache = <&CLUSTER0_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu3: cpu@3 {
                        next-level-cache = <&CLUSTER0_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu4: cpu@100 {
                        next-level-cache = <&CLUSTER1_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu5: cpu@101 {
                        next-level-cache = <&CLUSTER1_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu6: cpu@102 {
                        next-level-cache = <&CLUSTER1_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                cpu7: cpu@103 {
                        next-level-cache = <&CLUSTER1_L2>;
                        operating-points-v2 = <&cpu_opp_table>;
                        cpu-idle-states = <&CPU_SLEEP &CLUSTER_SLEEP>;
+                       sched-energy-costs = <&CPU_COST &CLUSTER_COST &SYSTEM_COST>;
+                       capacity-dmips-mhz = <1024>;
                };
 
                CLUSTER0_L2: l2-cache0 {
                CLUSTER1_L2: l2-cache1 {
                        compatible = "cache";
                };
+
+               energy-costs {
+                       SYSTEM_COST: system-cost0 {
+                               busy-cost-data = <
+                                       1024   0
+                               >;
+                               idle-cost-data = <
+                                       0
+                                       0
+                                       0
+                                       0
+                               >;
+                       };
+                       CLUSTER_COST: cluster-cost0 {
+                               busy-cost-data = <
+                                       178   16
+                                       369   29
+                                       622   47
+                                       819   75
+                                       1024  112
+                               >;
+                               idle-cost-data = <
+                                       107
+                                       107
+                                        47
+                                        0
+                               >;
+                       };
+                       CPU_COST: core-cost0 {
+                               busy-cost-data = <
+                                       178   69
+                                       369   125
+                                       622   224
+                                       819   367
+                                       1024  670
+                               >;
+                               idle-cost-data = <
+                                       15
+                                       15
+                                        0
+                                        0
+                               >;
+                       };
+               };
        };
 
        cpu_opp_table: cpu_opp_table {
index 34480e9af2e71d4ae8dfe694b3f2a918d3d9872b..36e21112113e2db2c6074cd265cd3ae69fc6216d 100644 (file)
@@ -4,6 +4,7 @@ CONFIG_AUDIT=y
 CONFIG_NO_HZ_IDLE=y
 CONFIG_HIGH_RES_TIMERS=y
 CONFIG_IRQ_TIME_ACCOUNTING=y
+CONFIG_SCHED_WALT=y
 CONFIG_BSD_PROCESS_ACCT=y
 CONFIG_BSD_PROCESS_ACCT_V3=y
 CONFIG_TASKSTATS=y
@@ -24,8 +25,9 @@ CONFIG_CGROUP_CPUACCT=y
 CONFIG_CGROUP_PERF=y
 CONFIG_USER_NS=y
 CONFIG_SCHED_AUTOGROUP=y
+CONFIG_SCHED_TUNE=y
+CONFIG_DEFAULT_USE_ENERGY_AWARE=y
 CONFIG_BLK_DEV_INITRD=y
-CONFIG_KALLSYMS_ALL=y
 # CONFIG_COMPAT_BRK is not set
 CONFIG_PROFILING=y
 CONFIG_JUMP_LABEL=y
@@ -69,13 +71,13 @@ CONFIG_HOTPLUG_PCI_ACPI=y
 CONFIG_PCI_LAYERSCAPE=y
 CONFIG_PCI_HISI=y
 CONFIG_PCIE_QCOM=y
-CONFIG_PCIE_KIRIN=y
 CONFIG_PCIE_ARMADA_8K=y
+CONFIG_PCIE_KIRIN=y
 CONFIG_PCI_AARDVARK=y
 CONFIG_PCIE_RCAR=y
-CONFIG_PCIE_ROCKCHIP=m
 CONFIG_PCI_HOST_GENERIC=y
 CONFIG_PCI_XGENE=y
+CONFIG_PCIE_ROCKCHIP=m
 CONFIG_ARM64_VA_BITS_48=y
 CONFIG_SCHED_MC=y
 CONFIG_NUMA=y
@@ -93,6 +95,12 @@ CONFIG_HIBERNATION=y
 CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
 CONFIG_ARM_CPUIDLE=y
 CONFIG_CPU_FREQ=y
+CONFIG_CPU_FREQ_STAT=y
+CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL=y
+CONFIG_CPU_FREQ_GOV_POWERSAVE=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_CPU_FREQ_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y
 CONFIG_CPUFREQ_DT=y
 CONFIG_ARM_BIG_LITTLE_CPUFREQ=y
 CONFIG_ARM_SCPI_CPUFREQ=y
@@ -140,11 +148,10 @@ CONFIG_BT_HIDP=m
 CONFIG_BT_LEDS=y
 # CONFIG_BT_DEBUGFS is not set
 CONFIG_BT_HCIUART=m
-CONFIG_BT_HCIUART_LL=y
-CONFIG_CFG80211=m
-CONFIG_MAC80211=m
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
 CONFIG_MAC80211_LEDS=y
-CONFIG_RFKILL=m
+CONFIG_RFKILL=y
 CONFIG_NET_9P=y
 CONFIG_NET_9P_VIRTIO=y
 CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
@@ -210,21 +217,16 @@ CONFIG_REALTEK_PHY=m
 CONFIG_ROCKCHIP_PHY=y
 CONFIG_USB_PEGASUS=m
 CONFIG_USB_RTL8150=m
-CONFIG_USB_RTL8152=m
-CONFIG_USB_USBNET=m
-CONFIG_USB_NET_DM9601=m
-CONFIG_USB_NET_SR9800=m
-CONFIG_USB_NET_SMSC75XX=m
-CONFIG_USB_NET_SMSC95XX=m
-CONFIG_USB_NET_PLUSB=m
-CONFIG_USB_NET_MCS7830=m
+CONFIG_USB_RTL8152=y
 CONFIG_BRCMFMAC=m
+CONFIG_RTL_CARDS=m
 CONFIG_WL18XX=m
 CONFIG_WLCORE_SDIO=m
+CONFIG_USB_NET_RNDIS_WLAN=y
 CONFIG_INPUT_EVDEV=y
 CONFIG_KEYBOARD_ADC=m
-CONFIG_KEYBOARD_CROS_EC=y
 CONFIG_KEYBOARD_GPIO=y
+CONFIG_KEYBOARD_CROS_EC=y
 CONFIG_INPUT_MISC=y
 CONFIG_INPUT_PM8941_PWRKEY=y
 CONFIG_INPUT_HISI_POWERKEY=y
@@ -275,20 +277,20 @@ CONFIG_I2C_UNIPHIER_F=y
 CONFIG_I2C_RCAR=y
 CONFIG_I2C_CROS_EC_TUNNEL=y
 CONFIG_SPI=y
-CONFIG_SPI_MESON_SPICC=m
-CONFIG_SPI_MESON_SPIFC=m
 CONFIG_SPI_BCM2835=m
 CONFIG_SPI_BCM2835AUX=m
+CONFIG_SPI_MESON_SPICC=m
+CONFIG_SPI_MESON_SPIFC=m
 CONFIG_SPI_ORION=y
 CONFIG_SPI_PL022=y
-CONFIG_SPI_QUP=y
 CONFIG_SPI_ROCKCHIP=y
+CONFIG_SPI_QUP=y
 CONFIG_SPI_S3C64XX=y
 CONFIG_SPI_SPIDEV=m
 CONFIG_SPMI=y
-CONFIG_PINCTRL_IPQ8074=y
 CONFIG_PINCTRL_SINGLE=y
 CONFIG_PINCTRL_MAX77620=y
+CONFIG_PINCTRL_IPQ8074=y
 CONFIG_PINCTRL_MSM8916=y
 CONFIG_PINCTRL_MSM8994=y
 CONFIG_PINCTRL_MSM8996=y
@@ -313,9 +315,8 @@ CONFIG_SENSORS_INA2XX=m
 CONFIG_THERMAL_GOV_POWER_ALLOCATOR=y
 CONFIG_CPU_THERMAL=y
 CONFIG_THERMAL_EMULATION=y
-CONFIG_BRCMSTB_THERMAL=m
-CONFIG_EXYNOS_THERMAL=y
 CONFIG_ROCKCHIP_THERMAL=m
+CONFIG_EXYNOS_THERMAL=y
 CONFIG_WATCHDOG=y
 CONFIG_S3C2410_WATCHDOG=y
 CONFIG_MESON_GXBB_WATCHDOG=m
@@ -334,9 +335,9 @@ CONFIG_MFD_MAX77620=y
 CONFIG_MFD_SPMI_PMIC=y
 CONFIG_MFD_RK808=y
 CONFIG_MFD_SEC_CORE=y
+CONFIG_REGULATOR_FIXED_VOLTAGE=y
 CONFIG_REGULATOR_AXP20X=y
 CONFIG_REGULATOR_FAN53555=y
-CONFIG_REGULATOR_FIXED_VOLTAGE=y
 CONFIG_REGULATOR_GPIO=y
 CONFIG_REGULATOR_HI6421V530=y
 CONFIG_REGULATOR_HI655X=y
@@ -346,16 +347,13 @@ CONFIG_REGULATOR_QCOM_SMD_RPM=y
 CONFIG_REGULATOR_QCOM_SPMI=y
 CONFIG_REGULATOR_RK808=y
 CONFIG_REGULATOR_S2MPS11=y
+CONFIG_RC_DEVICES=y
+CONFIG_IR_MESON=m
 CONFIG_MEDIA_SUPPORT=m
 CONFIG_MEDIA_CAMERA_SUPPORT=y
 CONFIG_MEDIA_ANALOG_TV_SUPPORT=y
 CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y
 CONFIG_MEDIA_CONTROLLER=y
-CONFIG_MEDIA_RC_SUPPORT=y
-CONFIG_RC_CORE=m
-CONFIG_RC_DEVICES=y
-CONFIG_RC_DECODERS=y
-CONFIG_IR_MESON=m
 CONFIG_VIDEO_V4L2_SUBDEV_API=y
 # CONFIG_DVB_NET is not set
 CONFIG_V4L_MEM2MEM_DRIVERS=y
@@ -393,7 +391,6 @@ CONFIG_FB_ARMCLCD=y
 CONFIG_BACKLIGHT_GENERIC=m
 CONFIG_BACKLIGHT_PWM=m
 CONFIG_BACKLIGHT_LP855X=m
-CONFIG_FRAMEBUFFER_CONSOLE=y
 CONFIG_LOGO=y
 # CONFIG_LOGO_LINUX_MONO is not set
 # CONFIG_LOGO_LINUX_VGA16 is not set
@@ -492,7 +489,6 @@ CONFIG_XEN_GRANT_DEV_ALLOC=y
 CONFIG_COMMON_CLK_RK808=y
 CONFIG_COMMON_CLK_SCPI=y
 CONFIG_COMMON_CLK_CS2000_CP=y
-CONFIG_COMMON_CLK_S2MPS11=y
 CONFIG_CLK_QORIQ=y
 CONFIG_COMMON_CLK_PWM=y
 CONFIG_COMMON_CLK_QCOM=y
@@ -531,13 +527,13 @@ CONFIG_PWM_MESON=m
 CONFIG_PWM_ROCKCHIP=y
 CONFIG_PWM_SAMSUNG=y
 CONFIG_PWM_TEGRA=m
-CONFIG_PHY_RCAR_GEN3_USB2=y
-CONFIG_PHY_HI6220_USB=y
+CONFIG_PHY_XGENE=y
 CONFIG_PHY_SUN4I_USB=y
-CONFIG_PHY_ROCKCHIP_INNO_USB2=y
+CONFIG_PHY_HI6220_USB=y
+CONFIG_PHY_RCAR_GEN3_USB2=y
 CONFIG_PHY_ROCKCHIP_EMMC=y
+CONFIG_PHY_ROCKCHIP_INNO_USB2=y
 CONFIG_PHY_ROCKCHIP_PCIE=m
-CONFIG_PHY_XGENE=y
 CONFIG_PHY_TEGRA_XUSB=y
 CONFIG_QCOM_L2_PMU=y
 CONFIG_QCOM_L3_PMU=y
@@ -579,29 +575,27 @@ CONFIG_VIRTUALIZATION=y
 CONFIG_KVM=y
 CONFIG_PRINTK_TIME=y
 CONFIG_DEBUG_INFO=y
-CONFIG_DEBUG_FS=y
 CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
-CONFIG_LOCKUP_DETECTOR=y
-# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
 # CONFIG_DEBUG_PREEMPT is not set
-# CONFIG_FTRACE is not set
+CONFIG_PROVE_LOCKING=y
+CONFIG_FUNCTION_TRACER=y
+CONFIG_IRQSOFF_TRACER=y
+CONFIG_PREEMPT_TRACER=y
+CONFIG_SCHED_TRACER=y
 CONFIG_MEMTEST=y
 CONFIG_SECURITY=y
 CONFIG_CRYPTO_ECHAINIV=y
 CONFIG_CRYPTO_ANSI_CPRNG=y
 CONFIG_ARM64_CRYPTO=y
-CONFIG_CRYPTO_SHA256_ARM64=m
 CONFIG_CRYPTO_SHA512_ARM64=m
 CONFIG_CRYPTO_SHA1_ARM64_CE=y
 CONFIG_CRYPTO_SHA2_ARM64_CE=y
 CONFIG_CRYPTO_GHASH_ARM64_CE=y
 CONFIG_CRYPTO_CRCT10DIF_ARM64_CE=m
 CONFIG_CRYPTO_CRC32_ARM64_CE=m
-CONFIG_CRYPTO_AES_ARM64=m
-CONFIG_CRYPTO_AES_ARM64_CE=m
 CONFIG_CRYPTO_AES_ARM64_CE_CCM=y
 CONFIG_CRYPTO_AES_ARM64_CE_BLK=y
-CONFIG_CRYPTO_AES_ARM64_NEON_BLK=m
 CONFIG_CRYPTO_CHACHA20_NEON=m
 CONFIG_CRYPTO_AES_ARM64_BS=m
diff --git a/arch/arm64/configs/ranchu64_defconfig b/arch/arm64/configs/ranchu64_defconfig
new file mode 100644 (file)
index 0000000..51c3bfc
--- /dev/null
@@ -0,0 +1,310 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+# CONFIG_SWAP is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_BSD_PROCESS_ACCT_V3=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_IKCONFIG=y
+CONFIG_IKCONFIG_PROC=y
+CONFIG_LOG_BUF_SHIFT=14
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_SCHED_AUTOGROUP=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_PROFILING=y
+CONFIG_ARCH_MMAP_RND_BITS=24
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+# CONFIG_BLK_DEV_BSG is not set
+# CONFIG_IOSCHED_DEADLINE is not set
+CONFIG_ARCH_VEXPRESS=y
+CONFIG_NR_CPUS=4
+CONFIG_PREEMPT=y
+CONFIG_KSM=y
+CONFIG_SECCOMP=y
+CONFIG_ARMV8_DEPRECATED=y
+CONFIG_SWP_EMULATION=y
+CONFIG_CP15_BARRIER_EMULATION=y
+CONFIG_SETEND_EMULATION=y
+CONFIG_CMDLINE="console=ttyAMA0"
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_COMPAT=y
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_LRO is not set
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_RPFILTER=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_TARGET_ECN=y
+CONFIG_IP_NF_TARGET_TTL=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_MATCH_AH=y
+CONFIG_IP6_NF_MATCH_EUI64=y
+CONFIG_IP6_NF_MATCH_FRAG=y
+CONFIG_IP6_NF_MATCH_OPTS=y
+CONFIG_IP6_NF_MATCH_HL=y
+CONFIG_IP6_NF_MATCH_IPV6HEADER=y
+CONFIG_IP6_NF_MATCH_MH=y
+CONFIG_IP6_NF_MATCH_RT=y
+CONFIG_IP6_NF_TARGET_HL=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_BRIDGE=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+# CONFIG_WIRELESS is not set
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_SCSI=y
+# CONFIG_SCSI_PROC_FS is not set
+CONFIG_BLK_DEV_SD=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_SMC91X=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+# CONFIG_WLAN is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_INPUT_TABLET=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO_SERPORT is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_AMBA_PL011=y
+CONFIG_SERIAL_AMBA_PL011_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+# CONFIG_HW_RANDOM is not set
+CONFIG_BATTERY_GOLDFISH=y
+# CONFIG_HWMON is not set
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_FB=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_FB_SIMPLE=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+CONFIG_LOGO=y
+# CONFIG_LOGO_LINUX_MONO is not set
+# CONFIG_LOGO_LINUX_VGA16 is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+# CONFIG_USB_SUPPORT is not set
+CONFIG_RTC_CLASS=y
+CONFIG_VIRTIO_MMIO=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_TIMED_GPIO=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SW_SYNC_USER=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+# CONFIG_IOMMU_SUPPORT is not set
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_EXT2_FS=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_FUSE_FS=y
+CONFIG_CUSE=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+# CONFIG_MISC_FILESYSTEMS is not set
+CONFIG_NFS_FS=y
+CONFIG_ROOT_NFS=y
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_DEBUG_INFO=y
+CONFIG_DEBUG_FS=y
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_PANIC_TIMEOUT=5
+# CONFIG_SCHED_DEBUG is not set
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+# CONFIG_FTRACE is not set
+CONFIG_ATOMIC64_SELFTEST=y
+CONFIG_DEBUG_RODATA=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
index 33be513ef24c84093c9a50441663896b3f04376f..36d9863cb3cb6ad07caa2165dd4fd2aa7ea09670 100644 (file)
@@ -173,7 +173,7 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm,
 #ifdef CONFIG_COMPAT
 
 /* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */
-#define COMPAT_ELF_ET_DYN_BASE         0x000400000UL
+#define COMPAT_ELF_ET_DYN_BASE         (2 * TASK_SIZE_32 / 3)
 
 /* AArch32 registers. */
 #define COMPAT_ELF_NGREG               18
index b3202284568b2e86708b6bc59800053995aa739b..13a00ece862bb3f19422612d0ea2dfb98dd68085 100644 (file)
@@ -33,6 +33,17 @@ int pcibus_to_node(struct pci_bus *bus);
 
 #endif /* CONFIG_NUMA */
 
+#include <linux/arch_topology.h>
+
+/* Replace task scheduler's default frequency-invariant accounting */
+#define arch_scale_freq_capacity topology_get_freq_scale
+
+/* Replace task scheduler's default cpu-invariant accounting */
+#define arch_scale_cpu_capacity topology_get_cpu_scale
+
+/* Enable topology flag updates */
+#define arch_update_cpu_topology topology_update_cpu_topology
+
 #include <asm-generic/topology.h>
 
 #endif /* _ASM_ARM_TOPOLOGY_H */
index 354be2a872ae81bd58db3506cd0c7f77a62cd402..79b17384efface023d0331594df8554de67efaf8 100644 (file)
@@ -25,8 +25,7 @@
  */
 void __memcpy_fromio(void *to, const volatile void __iomem *from, size_t count)
 {
-       while (count && (!IS_ALIGNED((unsigned long)from, 8) ||
-                        !IS_ALIGNED((unsigned long)to, 8))) {
+       while (count && !IS_ALIGNED((unsigned long)from, 8)) {
                *(u8 *)to = __raw_readb(from);
                from++;
                to++;
@@ -54,23 +53,22 @@ EXPORT_SYMBOL(__memcpy_fromio);
  */
 void __memcpy_toio(volatile void __iomem *to, const void *from, size_t count)
 {
-       while (count && (!IS_ALIGNED((unsigned long)to, 8) ||
-                        !IS_ALIGNED((unsigned long)from, 8))) {
-               __raw_writeb(*(volatile u8 *)from, to);
+       while (count && !IS_ALIGNED((unsigned long)to, 8)) {
+               __raw_writeb(*(u8 *)from, to);
                from++;
                to++;
                count--;
        }
 
        while (count >= 8) {
-               __raw_writeq(*(volatile u64 *)from, to);
+               __raw_writeq(*(u64 *)from, to);
                from += 8;
                to += 8;
                count -= 8;
        }
 
        while (count) {
-               __raw_writeb(*(volatile u8 *)from, to);
+               __raw_writeb(*(u8 *)from, to);
                from++;
                to++;
                count--;
index 9e773732520cd55237c3501017d7cccf916dd0d2..07de304210e3378860d666d62b966b863eaabec5 100644 (file)
@@ -170,6 +170,70 @@ void machine_restart(char *cmd)
        while (1);
 }
 
+/*
+ * dump a block of kernel memory from around the given address
+ */
+static void show_data(unsigned long addr, int nbytes, const char *name)
+{
+       int     i, j;
+       int     nlines;
+       u32     *p;
+
+       /*
+        * don't attempt to dump non-kernel addresses or
+        * values that are probably just small negative numbers
+        */
+       if (addr < PAGE_OFFSET || addr > -256UL)
+               return;
+
+       printk("\n%s: %#lx:\n", name, addr);
+
+       /*
+        * round address down to a 32 bit boundary
+        * and always dump a multiple of 32 bytes
+        */
+       p = (u32 *)(addr & ~(sizeof(u32) - 1));
+       nbytes += (addr & (sizeof(u32) - 1));
+       nlines = (nbytes + 31) / 32;
+
+
+       for (i = 0; i < nlines; i++) {
+               /*
+                * just display low 16 bits of address to keep
+                * each line of the dump < 80 characters
+                */
+               printk("%04lx ", (unsigned long)p & 0xffff);
+               for (j = 0; j < 8; j++) {
+                       u32     data;
+                       if (probe_kernel_address(p, data)) {
+                               printk(" ********");
+                       } else {
+                               printk(" %08x", data);
+                       }
+                       ++p;
+               }
+               printk("\n");
+       }
+}
+
+static void show_extra_register_data(struct pt_regs *regs, int nbytes)
+{
+       mm_segment_t fs;
+       unsigned int i;
+
+       fs = get_fs();
+       set_fs(KERNEL_DS);
+       show_data(regs->pc - nbytes, nbytes * 2, "PC");
+       show_data(regs->regs[30] - nbytes, nbytes * 2, "LR");
+       show_data(regs->sp - nbytes, nbytes * 2, "SP");
+       for (i = 0; i < 30; i++) {
+               char name[4];
+               snprintf(name, sizeof(name), "X%u", i);
+               show_data(regs->regs[i] - nbytes, nbytes * 2, name);
+       }
+       set_fs(fs);
+}
+
 void __show_regs(struct pt_regs *regs)
 {
        int i, top_reg;
@@ -205,6 +269,9 @@ void __show_regs(struct pt_regs *regs)
 
                pr_cont("\n");
        }
+       if (!user_mode(regs))
+               show_extra_register_data(regs, 128);
+       printk("\n");
 }
 
 void show_regs(struct pt_regs * regs)
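
show_data() rounds the start address down to a 4-byte boundary and widens the requested byte count so only whole 32-byte lines are printed. A small standalone check of that arithmetic, using a made-up kernel address and assuming a 64-bit build:

#include <stdio.h>

int main(void)
{
	unsigned long addr = 0xffff000008abcd3eUL;	/* arbitrary, word-unaligned */
	int nbytes = 128;
	unsigned long p = addr & ~(sizeof(unsigned int) - 1);	/* ...cd3c */
	int nlines;

	nbytes += addr & (sizeof(unsigned int) - 1);	/* grow by the dropped bytes */
	nlines = (nbytes + 31) / 32;			/* round up to whole lines */

	printf("start=%#lx lines=%d (%d bytes shown)\n", p, nlines, nlines * 32);
	return 0;
}
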
index 8d48b233e6ce5d09cd84db6a21a52f4d0dd68e97..35dbdfaa6129ed0737fb7bb5af01c75722080a60 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/of.h>
 #include <linux/sched.h>
 #include <linux/sched/topology.h>
+#include <linux/sched_energy.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 
@@ -187,6 +188,8 @@ static int __init parse_dt_topology(void)
        if (!map)
                goto out;
 
+       init_sched_energy_costs();
+
        ret = parse_cluster(map, 0);
        if (ret != 0)
                goto out_map;
@@ -280,8 +283,89 @@ void store_cpu_topology(unsigned int cpuid)
 
 topology_populated:
        update_siblings_masks(cpuid);
+       topology_detect_flags();
+}
+
+#ifdef CONFIG_SCHED_SMT
+static int smt_flags(void)
+{
+       return cpu_smt_flags() | topology_smt_flags();
+}
+#endif
+
+#ifdef CONFIG_SCHED_MC
+static int core_flags(void)
+{
+       return cpu_core_flags() | topology_core_flags();
+}
+#endif
+
+static int cpu_flags(void)
+{
+       return topology_cpu_flags();
+}
+
+static inline
+const struct sched_group_energy * const cpu_core_energy(int cpu)
+{
+       struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL0];
+       unsigned long capacity;
+       int max_cap_idx;
+
+       if (!sge) {
+               pr_warn("Invalid sched_group_energy for CPU%d\n", cpu);
+               return NULL;
+       }
+
+       max_cap_idx = sge->nr_cap_states - 1;
+       capacity = sge->cap_states[max_cap_idx].cap;
+
+       printk_deferred("cpu=%d set cpu scale %lu from energy model\n",
+                       cpu, capacity);
+
+       topology_set_cpu_scale(cpu, capacity);
+
+       return sge;
+}
+
+static inline
+const struct sched_group_energy * const cpu_cluster_energy(int cpu)
+{
+       struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL1];
+
+       if (!sge) {
+               pr_warn("Invalid sched_group_energy for Cluster%d\n", cpu);
+               return NULL;
+       }
+
+       return sge;
+}
+
+static inline
+const struct sched_group_energy * const cpu_system_energy(int cpu)
+{
+       struct sched_group_energy *sge = sge_array[cpu][SD_LEVEL2];
+
+       if (!sge) {
+               pr_warn("Invalid sched_group_energy for System%d\n", cpu);
+               return NULL;
+       }
+
+       return sge;
 }
 
+static struct sched_domain_topology_level arm64_topology[] = {
+#ifdef CONFIG_SCHED_SMT
+       { cpu_smt_mask, smt_flags, SD_INIT_NAME(SMT) },
+#endif
+#ifdef CONFIG_SCHED_MC
+       { cpu_coregroup_mask, core_flags, cpu_core_energy, SD_INIT_NAME(MC) },
+#endif
+       { cpu_cpu_mask, cpu_flags, cpu_cluster_energy, SD_INIT_NAME(DIE) },
+       { cpu_cpu_mask, NULL, cpu_system_energy, SD_INIT_NAME(SYS) },
+       { NULL, }
+};
+
 static void __init reset_cpu_topology(void)
 {
        unsigned int cpu;
@@ -310,4 +394,6 @@ void __init init_cpu_topology(void)
         */
        if (of_have_populated_dt() && parse_dt_topology())
                reset_cpu_topology();
+       else
+               set_sched_topology(arm64_topology);
 }
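
cpu_core_energy() above derives the CPU capacity from the last (highest) capacity state of the per-CPU energy table and feeds it to topology_set_cpu_scale(). A minimal userspace model of that selection with an invented table follows; the struct layout is illustrative only, not the kernel's sched_group_energy.

#include <stdio.h>

struct cap_state { unsigned long cap; unsigned long power; };

struct energy_table {
	int nr_cap_states;
	const struct cap_state *cap_states;	/* sorted, lowest to highest */
};

/* Hypothetical table for one big core: the last entry is the maximum OPP. */
static const struct cap_state big_states[] = {
	{  381, 100 }, {  578, 181 }, {  779, 301 }, { 1024, 616 },
};
static const struct energy_table big = { 4, big_states };

int main(void)
{
	/* Same selection as cpu_core_energy(): max capacity = last state. */
	int max_cap_idx = big.nr_cap_states - 1;
	unsigned long capacity = big.cap_states[max_cap_idx].cap;

	printf("cpu scale from energy model: %lu\n", capacity);	/* 1024 */
	return 0;
}
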
index 76320e9209651fd307659dcbab8092ff7c1c09e2..c39872a7b03c3e152315781c753f3e6b186524ed 100644 (file)
@@ -309,7 +309,7 @@ ENTRY(__kernel_clock_getres)
        b.ne    4f
        ldr     x2, 6f
 2:
-       cbz     w1, 3f
+       cbz     x1, 3f
        stp     xzr, x2, [x1]
 
 3:     /* res == NULL. */
index 614af886b7ef4f7348470f8746ea822e8665401a..115b32639a3cbfcb4a76fd777717ee55edaf2bf5 100644 (file)
@@ -166,7 +166,7 @@ static void *__dma_alloc(struct device *dev, size_t size,
        /* create a coherent mapping */
        page = virt_to_page(ptr);
        coherent_ptr = dma_common_contiguous_remap(page, size, VM_USERMAP,
-                                                  prot, NULL);
+                                                  prot, __builtin_return_address(0));
        if (!coherent_ptr)
                goto no_map;
 
index 00e7b900ca4193e83dfa7de7dd506984afe90bce..885402869696b8276ee4c4485f879a60b4242613 100644 (file)
@@ -285,9 +285,11 @@ static void __init zone_sizes_init(unsigned long min, unsigned long max)
 #endif /* CONFIG_NUMA */
 
 #ifdef CONFIG_HAVE_ARCH_PFN_VALID
+#define PFN_MASK ((1UL << (64 - PAGE_SHIFT)) - 1)
+
 int pfn_valid(unsigned long pfn)
 {
-       return memblock_is_map_memory(pfn << PAGE_SHIFT);
+       return (pfn & PFN_MASK) == pfn && memblock_is_map_memory(pfn << PAGE_SHIFT);
 }
 EXPORT_SYMBOL(pfn_valid);
 #endif
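
The new mask rejects page frame numbers whose top bits would simply be shifted out of a 64-bit address, which previously allowed a corrupted pfn to alias a valid mapping. A runnable illustration, assuming 4 KiB pages:

#include <stdio.h>

#define PAGE_SHIFT 12	/* 4 KiB pages assumed for the illustration */
#define PFN_MASK ((1UL << (64 - PAGE_SHIFT)) - 1)

int main(void)
{
	unsigned long good = 0x80000;			/* pfn of the 2 GiB mark */
	unsigned long bad  = good | (1UL << 60);	/* corrupted high bits */

	/*
	 * Without the mask both pfns shift to the same 64-bit address,
	 * because bit 60 + PAGE_SHIFT wraps off the top of the shift.
	 */
	printf("same address after shift: %d\n",
	       (good << PAGE_SHIFT) == (bad << PAGE_SHIFT));	/* 1 */

	/* The added check rejects the corrupted value before the lookup. */
	printf("good in range: %d, bad in range: %d\n",
	       (good & PFN_MASK) == good, (bad & PFN_MASK) == bad);	/* 1, 0 */
	return 0;
}
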
index 504b1a4535acaa252dedc6e6cf8f9031c2caf2f8..c4b896661654d35664f29aca6a4ae3e696882838 100644 (file)
@@ -111,6 +111,8 @@ else
         KBUILD_CFLAGS += $(call cc-option,-mno-80387)
         KBUILD_CFLAGS += $(call cc-option,-mno-fp-ret-in-387)
 
+        KBUILD_CFLAGS += -fno-pic
+
         # By default gcc and clang use a stack alignment of 16 bytes for x86.
         # However the standard kernel entry on x86-64 leaves the stack on an
         # 8-byte boundary. If the compiler isn't informed about the actual
diff --git a/arch/x86/configs/i386_ranchu_defconfig b/arch/x86/configs/i386_ranchu_defconfig
new file mode 100644 (file)
index 0000000..18d3675
--- /dev/null
@@ -0,0 +1,422 @@
+# CONFIG_64BIT is not set
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_X86_BIGSMP=y
+CONFIG_MCORE2=y
+CONFIG_X86_GENERIC=y
+CONFIG_HPET_TIMER=y
+CONFIG_NR_CPUS=512
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_REBOOTFIXUPS=y
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CMA_SIZE_MBYTES=16
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+CONFIG_FRAME_WARN=2048
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_AES_586=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/arch/x86/configs/x86_64_ranchu_defconfig b/arch/x86/configs/x86_64_ranchu_defconfig
new file mode 100644 (file)
index 0000000..7eff300
--- /dev/null
@@ -0,0 +1,417 @@
+# CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_POSIX_MQUEUE=y
+CONFIG_AUDIT=y
+CONFIG_NO_HZ=y
+CONFIG_HIGH_RES_TIMERS=y
+CONFIG_BSD_PROCESS_ACCT=y
+CONFIG_TASKSTATS=y
+CONFIG_TASK_DELAY_ACCT=y
+CONFIG_TASK_XACCT=y
+CONFIG_TASK_IO_ACCOUNTING=y
+CONFIG_CGROUPS=y
+CONFIG_CGROUP_DEBUG=y
+CONFIG_CGROUP_FREEZER=y
+CONFIG_CGROUP_CPUACCT=y
+CONFIG_CGROUP_SCHED=y
+CONFIG_RT_GROUP_SCHED=y
+CONFIG_BLK_DEV_INITRD=y
+CONFIG_CC_OPTIMIZE_FOR_SIZE=y
+CONFIG_SYSCTL_SYSCALL=y
+CONFIG_KALLSYMS_ALL=y
+CONFIG_EMBEDDED=y
+# CONFIG_COMPAT_BRK is not set
+CONFIG_ARCH_MMAP_RND_BITS=32
+CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16
+CONFIG_PARTITION_ADVANCED=y
+CONFIG_OSF_PARTITION=y
+CONFIG_AMIGA_PARTITION=y
+CONFIG_MAC_PARTITION=y
+CONFIG_BSD_DISKLABEL=y
+CONFIG_MINIX_SUBPARTITION=y
+CONFIG_SOLARIS_X86_PARTITION=y
+CONFIG_UNIXWARE_DISKLABEL=y
+CONFIG_SGI_PARTITION=y
+CONFIG_SUN_PARTITION=y
+CONFIG_KARMA_PARTITION=y
+CONFIG_SMP=y
+CONFIG_MCORE2=y
+CONFIG_MAXSMP=y
+CONFIG_PREEMPT=y
+# CONFIG_X86_MCE is not set
+CONFIG_X86_MSR=y
+CONFIG_X86_CPUID=y
+CONFIG_KSM=y
+CONFIG_CMA=y
+# CONFIG_MTRR_SANITIZER is not set
+CONFIG_EFI=y
+CONFIG_EFI_STUB=y
+CONFIG_HZ_100=y
+CONFIG_PHYSICAL_START=0x100000
+CONFIG_PM_AUTOSLEEP=y
+CONFIG_PM_WAKELOCKS=y
+CONFIG_PM_WAKELOCKS_LIMIT=0
+# CONFIG_PM_WAKELOCKS_GC is not set
+CONFIG_PM_DEBUG=y
+CONFIG_CPU_FREQ=y
+# CONFIG_CPU_FREQ_STAT is not set
+CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y
+CONFIG_CPU_FREQ_GOV_USERSPACE=y
+CONFIG_PCI_MMCONFIG=y
+CONFIG_PCIEPORTBUS=y
+# CONFIG_PCIEASPM is not set
+CONFIG_PCCARD=y
+CONFIG_YENTA=y
+CONFIG_HOTPLUG_PCI=y
+# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set
+CONFIG_BINFMT_MISC=y
+CONFIG_IA32_EMULATION=y
+CONFIG_NET=y
+CONFIG_PACKET=y
+CONFIG_UNIX=y
+CONFIG_XFRM_USER=y
+CONFIG_NET_KEY=y
+CONFIG_INET=y
+CONFIG_IP_MULTICAST=y
+CONFIG_IP_ADVANCED_ROUTER=y
+CONFIG_IP_MULTIPLE_TABLES=y
+CONFIG_IP_ROUTE_MULTIPATH=y
+CONFIG_IP_ROUTE_VERBOSE=y
+CONFIG_IP_PNP=y
+CONFIG_IP_PNP_DHCP=y
+CONFIG_IP_PNP_BOOTP=y
+CONFIG_IP_PNP_RARP=y
+CONFIG_IP_MROUTE=y
+CONFIG_IP_PIMSM_V1=y
+CONFIG_IP_PIMSM_V2=y
+CONFIG_SYN_COOKIES=y
+CONFIG_INET_ESP=y
+# CONFIG_INET_XFRM_MODE_BEET is not set
+# CONFIG_INET_LRO is not set
+CONFIG_INET_DIAG_DESTROY=y
+CONFIG_IPV6_ROUTER_PREF=y
+CONFIG_IPV6_ROUTE_INFO=y
+CONFIG_IPV6_OPTIMISTIC_DAD=y
+CONFIG_INET6_AH=y
+CONFIG_INET6_ESP=y
+CONFIG_INET6_IPCOMP=y
+CONFIG_IPV6_MIP6=y
+CONFIG_IPV6_MULTIPLE_TABLES=y
+CONFIG_NETLABEL=y
+CONFIG_NETFILTER=y
+CONFIG_NF_CONNTRACK=y
+CONFIG_NF_CONNTRACK_SECMARK=y
+CONFIG_NF_CONNTRACK_EVENTS=y
+CONFIG_NF_CT_PROTO_DCCP=y
+CONFIG_NF_CT_PROTO_SCTP=y
+CONFIG_NF_CT_PROTO_UDPLITE=y
+CONFIG_NF_CONNTRACK_AMANDA=y
+CONFIG_NF_CONNTRACK_FTP=y
+CONFIG_NF_CONNTRACK_H323=y
+CONFIG_NF_CONNTRACK_IRC=y
+CONFIG_NF_CONNTRACK_NETBIOS_NS=y
+CONFIG_NF_CONNTRACK_PPTP=y
+CONFIG_NF_CONNTRACK_SANE=y
+CONFIG_NF_CONNTRACK_TFTP=y
+CONFIG_NF_CT_NETLINK=y
+CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y
+CONFIG_NETFILTER_XT_TARGET_CONNMARK=y
+CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y
+CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y
+CONFIG_NETFILTER_XT_TARGET_MARK=y
+CONFIG_NETFILTER_XT_TARGET_NFLOG=y
+CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y
+CONFIG_NETFILTER_XT_TARGET_TPROXY=y
+CONFIG_NETFILTER_XT_TARGET_TRACE=y
+CONFIG_NETFILTER_XT_TARGET_SECMARK=y
+CONFIG_NETFILTER_XT_TARGET_TCPMSS=y
+CONFIG_NETFILTER_XT_MATCH_COMMENT=y
+CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_CONNMARK=y
+CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y
+CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y
+CONFIG_NETFILTER_XT_MATCH_HELPER=y
+CONFIG_NETFILTER_XT_MATCH_IPRANGE=y
+CONFIG_NETFILTER_XT_MATCH_LENGTH=y
+CONFIG_NETFILTER_XT_MATCH_LIMIT=y
+CONFIG_NETFILTER_XT_MATCH_MAC=y
+CONFIG_NETFILTER_XT_MATCH_MARK=y
+CONFIG_NETFILTER_XT_MATCH_POLICY=y
+CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y
+CONFIG_NETFILTER_XT_MATCH_QTAGUID=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA=y
+CONFIG_NETFILTER_XT_MATCH_QUOTA2=y
+CONFIG_NETFILTER_XT_MATCH_SOCKET=y
+CONFIG_NETFILTER_XT_MATCH_STATE=y
+CONFIG_NETFILTER_XT_MATCH_STATISTIC=y
+CONFIG_NETFILTER_XT_MATCH_STRING=y
+CONFIG_NETFILTER_XT_MATCH_TIME=y
+CONFIG_NETFILTER_XT_MATCH_U32=y
+CONFIG_NF_CONNTRACK_IPV4=y
+CONFIG_IP_NF_IPTABLES=y
+CONFIG_IP_NF_MATCH_AH=y
+CONFIG_IP_NF_MATCH_ECN=y
+CONFIG_IP_NF_MATCH_TTL=y
+CONFIG_IP_NF_FILTER=y
+CONFIG_IP_NF_TARGET_REJECT=y
+CONFIG_IP_NF_MANGLE=y
+CONFIG_IP_NF_RAW=y
+CONFIG_IP_NF_SECURITY=y
+CONFIG_IP_NF_ARPTABLES=y
+CONFIG_IP_NF_ARPFILTER=y
+CONFIG_IP_NF_ARP_MANGLE=y
+CONFIG_NF_CONNTRACK_IPV6=y
+CONFIG_IP6_NF_IPTABLES=y
+CONFIG_IP6_NF_FILTER=y
+CONFIG_IP6_NF_TARGET_REJECT=y
+CONFIG_IP6_NF_MANGLE=y
+CONFIG_IP6_NF_RAW=y
+CONFIG_NET_SCHED=y
+CONFIG_NET_SCH_HTB=y
+CONFIG_NET_CLS_U32=y
+CONFIG_NET_EMATCH=y
+CONFIG_NET_EMATCH_U32=y
+CONFIG_NET_CLS_ACT=y
+CONFIG_CFG80211=y
+CONFIG_MAC80211=y
+CONFIG_MAC80211_LEDS=y
+CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug"
+CONFIG_DMA_CMA=y
+CONFIG_CONNECTOR=y
+CONFIG_BLK_DEV_LOOP=y
+CONFIG_BLK_DEV_RAM=y
+CONFIG_BLK_DEV_RAM_SIZE=8192
+CONFIG_VIRTIO_BLK=y
+CONFIG_BLK_DEV_SD=y
+CONFIG_BLK_DEV_SR=y
+CONFIG_BLK_DEV_SR_VENDOR=y
+CONFIG_CHR_DEV_SG=y
+CONFIG_SCSI_CONSTANTS=y
+CONFIG_SCSI_SPI_ATTRS=y
+CONFIG_SCSI_ISCSI_ATTRS=y
+# CONFIG_SCSI_LOWLEVEL is not set
+CONFIG_ATA=y
+CONFIG_SATA_AHCI=y
+CONFIG_ATA_PIIX=y
+CONFIG_PATA_AMD=y
+CONFIG_PATA_OLDPIIX=y
+CONFIG_PATA_SCH=y
+CONFIG_PATA_MPIIX=y
+CONFIG_ATA_GENERIC=y
+CONFIG_MD=y
+CONFIG_BLK_DEV_MD=y
+CONFIG_BLK_DEV_DM=y
+CONFIG_DM_DEBUG=y
+CONFIG_DM_CRYPT=y
+CONFIG_DM_MIRROR=y
+CONFIG_DM_ZERO=y
+CONFIG_DM_UEVENT=y
+CONFIG_DM_VERITY=y
+CONFIG_DM_VERITY_FEC=y
+CONFIG_NETDEVICES=y
+CONFIG_NETCONSOLE=y
+CONFIG_TUN=y
+CONFIG_VIRTIO_NET=y
+CONFIG_BNX2=y
+CONFIG_TIGON3=y
+CONFIG_NET_TULIP=y
+CONFIG_E100=y
+CONFIG_E1000=y
+CONFIG_E1000E=y
+CONFIG_SKY2=y
+CONFIG_NE2K_PCI=y
+CONFIG_FORCEDETH=y
+CONFIG_8139TOO=y
+# CONFIG_8139TOO_PIO is not set
+CONFIG_R8169=y
+CONFIG_FDDI=y
+CONFIG_PPP=y
+CONFIG_PPP_BSDCOMP=y
+CONFIG_PPP_DEFLATE=y
+CONFIG_PPP_MPPE=y
+CONFIG_USB_USBNET=y
+CONFIG_INPUT_POLLDEV=y
+# CONFIG_INPUT_MOUSEDEV_PSAUX is not set
+CONFIG_INPUT_EVDEV=y
+CONFIG_INPUT_KEYRESET=y
+# CONFIG_KEYBOARD_ATKBD is not set
+CONFIG_KEYBOARD_GOLDFISH_EVENTS=y
+# CONFIG_INPUT_MOUSE is not set
+CONFIG_INPUT_JOYSTICK=y
+CONFIG_JOYSTICK_XPAD=y
+CONFIG_JOYSTICK_XPAD_FF=y
+CONFIG_JOYSTICK_XPAD_LEDS=y
+CONFIG_INPUT_TABLET=y
+CONFIG_TABLET_USB_ACECAD=y
+CONFIG_TABLET_USB_AIPTEK=y
+CONFIG_TABLET_USB_GTCO=y
+CONFIG_TABLET_USB_HANWANG=y
+CONFIG_TABLET_USB_KBTAB=y
+CONFIG_INPUT_TOUCHSCREEN=y
+CONFIG_INPUT_MISC=y
+CONFIG_INPUT_KEYCHORD=y
+CONFIG_INPUT_UINPUT=y
+CONFIG_INPUT_GPIO=y
+# CONFIG_SERIO is not set
+# CONFIG_VT is not set
+# CONFIG_LEGACY_PTYS is not set
+CONFIG_SERIAL_NONSTANDARD=y
+# CONFIG_DEVMEM is not set
+# CONFIG_DEVKMEM is not set
+CONFIG_SERIAL_8250=y
+CONFIG_SERIAL_8250_CONSOLE=y
+CONFIG_VIRTIO_CONSOLE=y
+CONFIG_NVRAM=y
+CONFIG_I2C_I801=y
+CONFIG_BATTERY_GOLDFISH=y
+CONFIG_WATCHDOG=y
+CONFIG_MEDIA_SUPPORT=y
+CONFIG_AGP=y
+CONFIG_AGP_AMD64=y
+CONFIG_AGP_INTEL=y
+CONFIG_DRM=y
+CONFIG_FB_MODE_HELPERS=y
+CONFIG_FB_TILEBLITTING=y
+CONFIG_FB_EFI=y
+CONFIG_FB_GOLDFISH=y
+CONFIG_BACKLIGHT_LCD_SUPPORT=y
+# CONFIG_LCD_CLASS_DEVICE is not set
+CONFIG_SOUND=y
+CONFIG_SND=y
+CONFIG_HIDRAW=y
+CONFIG_UHID=y
+CONFIG_HID_A4TECH=y
+CONFIG_HID_ACRUX=y
+CONFIG_HID_ACRUX_FF=y
+CONFIG_HID_APPLE=y
+CONFIG_HID_BELKIN=y
+CONFIG_HID_CHERRY=y
+CONFIG_HID_CHICONY=y
+CONFIG_HID_PRODIKEYS=y
+CONFIG_HID_CYPRESS=y
+CONFIG_HID_DRAGONRISE=y
+CONFIG_DRAGONRISE_FF=y
+CONFIG_HID_EMS_FF=y
+CONFIG_HID_ELECOM=y
+CONFIG_HID_EZKEY=y
+CONFIG_HID_HOLTEK=y
+CONFIG_HID_KEYTOUCH=y
+CONFIG_HID_KYE=y
+CONFIG_HID_UCLOGIC=y
+CONFIG_HID_WALTOP=y
+CONFIG_HID_GYRATION=y
+CONFIG_HID_TWINHAN=y
+CONFIG_HID_KENSINGTON=y
+CONFIG_HID_LCPOWER=y
+CONFIG_HID_LOGITECH=y
+CONFIG_HID_LOGITECH_DJ=y
+CONFIG_LOGITECH_FF=y
+CONFIG_LOGIRUMBLEPAD2_FF=y
+CONFIG_LOGIG940_FF=y
+CONFIG_HID_MAGICMOUSE=y
+CONFIG_HID_MICROSOFT=y
+CONFIG_HID_MONTEREY=y
+CONFIG_HID_MULTITOUCH=y
+CONFIG_HID_NTRIG=y
+CONFIG_HID_ORTEK=y
+CONFIG_HID_PANTHERLORD=y
+CONFIG_PANTHERLORD_FF=y
+CONFIG_HID_PETALYNX=y
+CONFIG_HID_PICOLCD=y
+CONFIG_HID_PRIMAX=y
+CONFIG_HID_ROCCAT=y
+CONFIG_HID_SAITEK=y
+CONFIG_HID_SAMSUNG=y
+CONFIG_HID_SONY=y
+CONFIG_HID_SPEEDLINK=y
+CONFIG_HID_SUNPLUS=y
+CONFIG_HID_GREENASIA=y
+CONFIG_GREENASIA_FF=y
+CONFIG_HID_SMARTJOYPLUS=y
+CONFIG_SMARTJOYPLUS_FF=y
+CONFIG_HID_TIVO=y
+CONFIG_HID_TOPSEED=y
+CONFIG_HID_THRUSTMASTER=y
+CONFIG_HID_WACOM=y
+CONFIG_HID_WIIMOTE=y
+CONFIG_HID_ZEROPLUS=y
+CONFIG_HID_ZYDACRON=y
+CONFIG_HID_PID=y
+CONFIG_USB_HIDDEV=y
+CONFIG_USB_ANNOUNCE_NEW_DEVICES=y
+CONFIG_USB_MON=y
+CONFIG_USB_EHCI_HCD=y
+# CONFIG_USB_EHCI_TT_NEWSCHED is not set
+CONFIG_USB_OHCI_HCD=y
+CONFIG_USB_UHCI_HCD=y
+CONFIG_USB_PRINTER=y
+CONFIG_USB_STORAGE=y
+CONFIG_USB_OTG_WAKELOCK=y
+CONFIG_EDAC=y
+CONFIG_RTC_CLASS=y
+# CONFIG_RTC_HCTOSYS is not set
+CONFIG_DMADEVICES=y
+CONFIG_VIRTIO_PCI=y
+CONFIG_STAGING=y
+CONFIG_ASHMEM=y
+CONFIG_ANDROID_LOW_MEMORY_KILLER=y
+CONFIG_SYNC=y
+CONFIG_SW_SYNC=y
+CONFIG_SYNC_FILE=y
+CONFIG_ION=y
+CONFIG_GOLDFISH_AUDIO=y
+CONFIG_SND_HDA_INTEL=y
+CONFIG_GOLDFISH=y
+CONFIG_GOLDFISH_PIPE=y
+CONFIG_GOLDFISH_SYNC=y
+CONFIG_ANDROID=y
+CONFIG_ANDROID_BINDER_IPC=y
+CONFIG_ISCSI_IBFT_FIND=y
+CONFIG_EXT4_FS=y
+CONFIG_EXT4_FS_SECURITY=y
+CONFIG_QUOTA=y
+CONFIG_QUOTA_NETLINK_INTERFACE=y
+# CONFIG_PRINT_QUOTA_WARNING is not set
+CONFIG_FUSE_FS=y
+CONFIG_ISO9660_FS=y
+CONFIG_JOLIET=y
+CONFIG_ZISOFS=y
+CONFIG_MSDOS_FS=y
+CONFIG_VFAT_FS=y
+CONFIG_PROC_KCORE=y
+CONFIG_TMPFS=y
+CONFIG_TMPFS_POSIX_ACL=y
+CONFIG_HUGETLBFS=y
+CONFIG_PSTORE=y
+CONFIG_PSTORE_CONSOLE=y
+CONFIG_PSTORE_RAM=y
+# CONFIG_NETWORK_FILESYSTEMS is not set
+CONFIG_NLS_DEFAULT="utf8"
+CONFIG_NLS_CODEPAGE_437=y
+CONFIG_NLS_ASCII=y
+CONFIG_NLS_ISO8859_1=y
+CONFIG_NLS_UTF8=y
+CONFIG_PRINTK_TIME=y
+CONFIG_DEBUG_INFO=y
+# CONFIG_ENABLE_WARN_DEPRECATED is not set
+# CONFIG_ENABLE_MUST_CHECK is not set
+# CONFIG_UNUSED_SYMBOLS is not set
+CONFIG_MAGIC_SYSRQ=y
+CONFIG_DEBUG_MEMORY_INIT=y
+CONFIG_PANIC_TIMEOUT=5
+CONFIG_SCHEDSTATS=y
+CONFIG_TIMER_STATS=y
+CONFIG_SCHED_TRACER=y
+CONFIG_BLK_DEV_IO_TRACE=y
+CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
+CONFIG_KEYS=y
+CONFIG_SECURITY=y
+CONFIG_SECURITY_NETWORK=y
+CONFIG_SECURITY_SELINUX=y
+CONFIG_CRYPTO_TWOFISH=y
+CONFIG_ASYMMETRIC_KEY_TYPE=y
+CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y
+CONFIG_X509_CERTIFICATE_PARSER=y
+CONFIG_PKCS7_MESSAGE_PARSER=y
+CONFIG_PKCS7_TEST_KEY=y
+# CONFIG_VIRTUALIZATION is not set
+CONFIG_CRC_T10DIF=y
diff --git a/build.config.goldfish.arm b/build.config.goldfish.arm
new file mode 100644 (file)
index 0000000..866da93
--- /dev/null
@@ -0,0 +1,12 @@
+ARCH=arm
+BRANCH=android-4.4
+CROSS_COMPILE=arm-linux-androidkernel-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/arm/arm-linux-androideabi-4.9/bin
+FILES="
+arch/arm/boot/zImage
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.arm64 b/build.config.goldfish.arm64
new file mode 100644 (file)
index 0000000..9c963cf
--- /dev/null
@@ -0,0 +1,12 @@
+ARCH=arm64
+BRANCH=android-4.4
+CROSS_COMPILE=aarch64-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/bin
+FILES="
+arch/arm64/boot/Image
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.mips b/build.config.goldfish.mips
new file mode 100644 (file)
index 0000000..8af53d2
--- /dev/null
@@ -0,0 +1,11 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.mips64 b/build.config.goldfish.mips64
new file mode 100644 (file)
index 0000000..2a33d36
--- /dev/null
@@ -0,0 +1,11 @@
+ARCH=mips
+BRANCH=android-4.4
+CROSS_COMPILE=mips64el-linux-android-
+DEFCONFIG=ranchu64_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/mips/mips64el-linux-android-4.9/bin
+FILES="
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.x86 b/build.config.goldfish.x86
new file mode 100644 (file)
index 0000000..f86253f
--- /dev/null
@@ -0,0 +1,12 @@
+ARCH=x86
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=i386_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
diff --git a/build.config.goldfish.x86_64 b/build.config.goldfish.x86_64
new file mode 100644 (file)
index 0000000..e173886
--- /dev/null
@@ -0,0 +1,12 @@
+ARCH=x86_64
+BRANCH=android-4.4
+CROSS_COMPILE=x86_64-linux-android-
+DEFCONFIG=x86_64_ranchu_defconfig
+EXTRA_CMDS=''
+KERNEL_DIR=common
+LINUX_GCC_CROSS_COMPILE_PREBUILTS_BIN=prebuilts/gcc/linux-x86/x86/x86_64-linux-android-4.9/bin
+FILES="
+arch/x86/boot/bzImage
+vmlinux
+System.map
+"
index 941cd4c6c7ecbbb02348dc99f2a0fbed4576421d..2a2479d168aacbe3e31799a2c713614b71e6805b 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/sched/signal.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/completion.h>
 #include "internal.h"
 
 LIST_HEAD(crypto_alg_list);
@@ -595,5 +596,17 @@ int crypto_has_alg(const char *name, u32 type, u32 mask)
 }
 EXPORT_SYMBOL_GPL(crypto_has_alg);
 
+void crypto_req_done(struct crypto_async_request *req, int err)
+{
+       struct crypto_wait *wait = req->data;
+
+       if (err == -EINPROGRESS)
+               return;
+
+       wait->err = err;
+       complete(&wait->completion);
+}
+EXPORT_SYMBOL_GPL(crypto_req_done);
+
 MODULE_DESCRIPTION("Cryptographic core API");
 MODULE_LICENSE("GPL");
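
crypto_req_done() is the completion-callback half of the synchronous-wait helpers introduced by this series. A hedged sketch of the intended calling pattern in kernel context is below; it assumes the companion DECLARE_CRYPTO_WAIT()/crypto_wait_req() helpers from the same API and an already-allocated skcipher request, so it is an illustration rather than a drop-in function.

#include <crypto/skcipher.h>
#include <linux/crypto.h>

/* Issue one encryption and sleep until the (possibly async) driver is done. */
static int encrypt_sync(struct skcipher_request *req)
{
	DECLARE_CRYPTO_WAIT(wait);		/* on-stack completion + err slot */

	skcipher_request_set_callback(req,
				      CRYPTO_TFM_REQ_MAY_BACKLOG |
				      CRYPTO_TFM_REQ_MAY_SLEEP,
				      crypto_req_done, &wait);

	/*
	 * crypto_wait_req() turns -EINPROGRESS/-EBUSY into a sleep on
	 * wait.completion; crypto_req_done() wakes it up and hands back
	 * the final status through wait.err.
	 */
	return crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
}
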
index 2ef8bd29e1887095059d18d0882a8321c63ffed3..539e9c64f02710046177343bd3fec91f0f97b91d 100644 (file)
@@ -77,6 +77,7 @@
 #endif
 
 #include <uapi/linux/android/binder.h>
+#include <uapi/linux/sched/types.h>
 #include "binder_alloc.h"
 #include "binder_trace.h"
 
@@ -351,10 +352,14 @@ struct binder_error {
  *                        and by @lock)
  * @has_async_transaction: async transaction to node in progress
  *                        (protected by @lock)
+ * @sched_policy:         minimum scheduling policy for node
+ *                        (invariant after initialized)
  * @accept_fds:           file descriptor operations supported for node
  *                        (invariant after initialized)
  * @min_priority:         minimum scheduling priority
  *                        (invariant after initialized)
+ * @inherit_rt:           inherit RT scheduling policy from caller
+ *                        (invariant after initialized)
  * @async_todo:           list of async work items
  *                        (protected by @proc->inner_lock)
  *
@@ -390,6 +395,8 @@ struct binder_node {
                /*
                 * invariant after initialization
                 */
+               u8 sched_policy:2;
+               u8 inherit_rt:1;
                u8 accept_fds:1;
                u8 min_priority;
        };
@@ -463,6 +470,22 @@ enum binder_deferred_state {
        BINDER_DEFERRED_RELEASE      = 0x04,
 };
 
+/**
+ * struct binder_priority - scheduler policy and priority
+ * @sched_policy:           scheduler policy
+ * @prio:                   [100..139] for SCHED_NORMAL, [0..99] for FIFO/RT
+ *
+ * The binder driver supports inheriting the following scheduler policies:
+ * SCHED_NORMAL
+ * SCHED_BATCH
+ * SCHED_FIFO
+ * SCHED_RR
+ */
+struct binder_priority {
+       unsigned int sched_policy;
+       int prio;
+};
+
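
The prio ranges documented above come from the standard nice/rt-priority conversions that to_userspace_prio()/to_kernel_prio() implement later in this patch. A standalone check of the mapping, with the relevant constants restated locally at their mainline values:

#include <stdio.h>

#define MAX_USER_RT_PRIO 100
#define DEFAULT_PRIO     120				/* MAX_RT_PRIO + 20 */
#define NICE_TO_PRIO(n)  ((n) + DEFAULT_PRIO)		/* -20..19 -> 100..139 */
#define PRIO_TO_NICE(p)  ((p) - DEFAULT_PRIO)

int main(void)
{
	/* Fair policies: nice -20..19 maps onto kernel prio 100..139. */
	printf("nice -20 -> prio %d, nice 19 -> prio %d\n",
	       NICE_TO_PRIO(-20), NICE_TO_PRIO(19));

	/*
	 * RT policies: user rtprio 1..99 maps onto kernel prio 98..0,
	 * i.e. a higher user value is a numerically lower kernel prio.
	 */
	printf("rtprio 99 -> prio %d, rtprio 1 -> prio %d\n",
	       MAX_USER_RT_PRIO - 1 - 99, MAX_USER_RT_PRIO - 1 - 1);
	return 0;
}
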
 /**
  * struct binder_proc - binder process bookkeeping
  * @proc_node:            element for binder_procs list
@@ -493,8 +516,6 @@ enum binder_deferred_state {
  *                        (protected by @inner_lock)
  * @todo:                 list of work for this process
  *                        (protected by @inner_lock)
- * @wait:                 wait queue head to wait for proc work
- *                        (invariant after initialized)
  * @stats:                per-process binder statistics
  *                        (atomics, no lock needed)
  * @delivered_death:      list of delivered death notification
@@ -537,14 +558,13 @@ struct binder_proc {
        bool is_dead;
 
        struct list_head todo;
-       wait_queue_head_t wait;
        struct binder_stats stats;
        struct list_head delivered_death;
        int max_threads;
        int requested_threads;
        int requested_threads_started;
        int tmp_ref;
-       long default_priority;
+       struct binder_priority default_priority;
        struct dentry *debugfs_entry;
        struct binder_alloc alloc;
        struct binder_context *context;
@@ -579,6 +599,8 @@ enum {
  *                        (protected by @proc->inner_lock)
  * @todo:                 list of work to do for this thread
  *                        (protected by @proc->inner_lock)
+ * @process_todo:         whether work in @todo should be processed
+ *                        (protected by @proc->inner_lock)
  * @return_error:         transaction errors reported by this thread
  *                        (only accessed by this thread)
  * @reply_error:          transaction errors reported by target thread
@@ -592,6 +614,7 @@ enum {
  * @is_dead:              thread is dead and awaiting free
  *                        when outstanding transactions are cleaned up
  *                        (protected by @proc->inner_lock)
+ * @task:                 struct task_struct for this thread
  *
  * Bookkeeping structure for binder threads.
  */
@@ -604,12 +627,14 @@ struct binder_thread {
        bool looper_need_return; /* can be written by other thread */
        struct binder_transaction *transaction_stack;
        struct list_head todo;
+       bool process_todo;
        struct binder_error return_error;
        struct binder_error reply_error;
        wait_queue_head_t wait;
        struct binder_stats stats;
        atomic_t tmp_ref;
        bool is_dead;
+       struct task_struct *task;
 };
 
 struct binder_transaction {
@@ -626,8 +651,9 @@ struct binder_transaction {
        struct binder_buffer *buffer;
        unsigned int    code;
        unsigned int    flags;
-       long    priority;
-       long    saved_priority;
+       struct binder_priority  priority;
+       struct binder_priority  saved_priority;
+       bool    set_priority_called;
        kuid_t  sender_euid;
        /**
         * @lock:  protects @from, @to_proc, and @to_thread
@@ -789,6 +815,16 @@ static bool binder_worklist_empty(struct binder_proc *proc,
        return ret;
 }
 
+/**
+ * binder_enqueue_work_ilocked() - Add an item to the work list
+ * @work:         struct binder_work to add to list
+ * @target_list:  list to add work to
+ *
+ * Adds the work to the specified list. Asserts that work
+ * is not already on a list.
+ *
+ * Requires the proc->inner_lock to be held.
+ */
 static void
 binder_enqueue_work_ilocked(struct binder_work *work,
                           struct list_head *target_list)
@@ -799,22 +835,56 @@ binder_enqueue_work_ilocked(struct binder_work *work,
 }
 
 /**
- * binder_enqueue_work() - Add an item to the work list
- * @proc:         binder_proc associated with list
+ * binder_enqueue_deferred_thread_work_ilocked() - Add deferred thread work
+ * @thread:       thread to queue work to
  * @work:         struct binder_work to add to list
- * @target_list:  list to add work to
  *
- * Adds the work to the specified list. Asserts that work
- * is not already on a list.
+ * Adds the work to the todo list of the thread. Doesn't set the process_todo
+ * flag, which means that (if it wasn't already set) the thread will go to
+ * sleep without handling this work when it calls read.
+ *
+ * Requires the proc->inner_lock to be held.
  */
 static void
-binder_enqueue_work(struct binder_proc *proc,
-                   struct binder_work *work,
-                   struct list_head *target_list)
+binder_enqueue_deferred_thread_work_ilocked(struct binder_thread *thread,
+                                           struct binder_work *work)
 {
-       binder_inner_proc_lock(proc);
-       binder_enqueue_work_ilocked(work, target_list);
-       binder_inner_proc_unlock(proc);
+       binder_enqueue_work_ilocked(work, &thread->todo);
+}
+
+/**
+ * binder_enqueue_thread_work_ilocked() - Add an item to the thread work list
+ * @thread:       thread to queue work to
+ * @work:         struct binder_work to add to list
+ *
+ * Adds the work to the todo list of the thread, and enables processing
+ * of the todo queue.
+ *
+ * Requires the proc->inner_lock to be held.
+ */
+static void
+binder_enqueue_thread_work_ilocked(struct binder_thread *thread,
+                                  struct binder_work *work)
+{
+       binder_enqueue_work_ilocked(work, &thread->todo);
+       thread->process_todo = true;
+}
+
+/**
+ * binder_enqueue_thread_work() - Add an item to the thread work list
+ * @thread:       thread to queue work to
+ * @work:         struct binder_work to add to list
+ *
+ * Adds the work to the todo list of the thread, and enables processing
+ * of the todo queue.
+ */
+static void
+binder_enqueue_thread_work(struct binder_thread *thread,
+                          struct binder_work *work)
+{
+       binder_inner_proc_lock(thread->proc);
+       binder_enqueue_thread_work_ilocked(thread, work);
+       binder_inner_proc_unlock(thread->proc);
 }
 
 static void
@@ -940,7 +1010,7 @@ err:
 static bool binder_has_work_ilocked(struct binder_thread *thread,
                                    bool do_proc_work)
 {
-       return !binder_worklist_empty_ilocked(&thread->todo) ||
+       return thread->process_todo ||
                thread->looper_need_return ||
                (do_proc_work &&
                 !binder_worklist_empty_ilocked(&thread->proc->todo));
@@ -1064,22 +1134,145 @@ static void binder_wakeup_proc_ilocked(struct binder_proc *proc)
        binder_wakeup_thread_ilocked(proc, thread, /* sync = */false);
 }
 
-static void binder_set_nice(long nice)
+static bool is_rt_policy(int policy)
 {
-       long min_nice;
+       return policy == SCHED_FIFO || policy == SCHED_RR;
+}
+
+static bool is_fair_policy(int policy)
+{
+       return policy == SCHED_NORMAL || policy == SCHED_BATCH;
+}
+
+static bool binder_supported_policy(int policy)
+{
+       return is_fair_policy(policy) || is_rt_policy(policy);
+}
+
+static int to_userspace_prio(int policy, int kernel_priority)
+{
+       if (is_fair_policy(policy))
+               return PRIO_TO_NICE(kernel_priority);
+       else
+               return MAX_USER_RT_PRIO - 1 - kernel_priority;
+}
+
+static int to_kernel_prio(int policy, int user_priority)
+{
+       if (is_fair_policy(policy))
+               return NICE_TO_PRIO(user_priority);
+       else
+               return MAX_USER_RT_PRIO - 1 - user_priority;
+}
 
-       if (can_nice(current, nice)) {
-               set_user_nice(current, nice);
+static void binder_do_set_priority(struct task_struct *task,
+                                  struct binder_priority desired,
+                                  bool verify)
+{
+       int priority; /* user-space prio value */
+       bool has_cap_nice;
+       unsigned int policy = desired.sched_policy;
+
+       if (task->policy == policy && task->normal_prio == desired.prio)
                return;
+
+       has_cap_nice = has_capability_noaudit(task, CAP_SYS_NICE);
+
+       priority = to_userspace_prio(policy, desired.prio);
+
+       if (verify && is_rt_policy(policy) && !has_cap_nice) {
+               long max_rtprio = task_rlimit(task, RLIMIT_RTPRIO);
+
+               if (max_rtprio == 0) {
+                       policy = SCHED_NORMAL;
+                       priority = MIN_NICE;
+               } else if (priority > max_rtprio) {
+                       priority = max_rtprio;
+               }
        }
-       min_nice = rlimit_to_nice(rlimit(RLIMIT_NICE));
-       binder_debug(BINDER_DEBUG_PRIORITY_CAP,
-                    "%d: nice value %ld not allowed use %ld instead\n",
-                     current->pid, nice, min_nice);
-       set_user_nice(current, min_nice);
-       if (min_nice <= MAX_NICE)
+
+       if (verify && is_fair_policy(policy) && !has_cap_nice) {
+               long min_nice = rlimit_to_nice(task_rlimit(task, RLIMIT_NICE));
+
+               if (min_nice > MAX_NICE) {
+                       binder_user_error("%d RLIMIT_NICE not set\n",
+                                         task->pid);
+                       return;
+               } else if (priority < min_nice) {
+                       priority = min_nice;
+               }
+       }
+
+       if (policy != desired.sched_policy ||
+           to_kernel_prio(policy, priority) != desired.prio)
+               binder_debug(BINDER_DEBUG_PRIORITY_CAP,
+                            "%d: priority %d not allowed, using %d instead\n",
+                             task->pid, desired.prio,
+                             to_kernel_prio(policy, priority));
+
+       trace_binder_set_priority(task->tgid, task->pid, task->normal_prio,
+                                 to_kernel_prio(policy, priority),
+                                 desired.prio);
+
+       /* Set the actual priority */
+       if (task->policy != policy || is_rt_policy(policy)) {
+               struct sched_param params;
+
+               params.sched_priority = is_rt_policy(policy) ? priority : 0;
+
+               sched_setscheduler_nocheck(task,
+                                          policy | SCHED_RESET_ON_FORK,
+                                          &params);
+       }
+       if (is_fair_policy(policy))
+               set_user_nice(task, priority);
+}
+
+static void binder_set_priority(struct task_struct *task,
+                               struct binder_priority desired)
+{
+       binder_do_set_priority(task, desired, /* verify = */ true);
+}
+
+static void binder_restore_priority(struct task_struct *task,
+                                   struct binder_priority desired)
+{
+       binder_do_set_priority(task, desired, /* verify = */ false);
+}
+
+static void binder_transaction_priority(struct task_struct *task,
+                                       struct binder_transaction *t,
+                                       struct binder_priority node_prio,
+                                       bool inherit_rt)
+{
+       struct binder_priority desired_prio = t->priority;
+
+       if (t->set_priority_called)
                return;
-       binder_user_error("%d RLIMIT_NICE not set\n", current->pid);
+
+       t->set_priority_called = true;
+       t->saved_priority.sched_policy = task->policy;
+       t->saved_priority.prio = task->normal_prio;
+
+       if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) {
+               desired_prio.prio = NICE_TO_PRIO(0);
+               desired_prio.sched_policy = SCHED_NORMAL;
+       }
+
+       if (node_prio.prio < t->priority.prio ||
+           (node_prio.prio == t->priority.prio &&
+            node_prio.sched_policy == SCHED_FIFO)) {
+               /*
+                * In case the minimum priority on the node is
+                * higher (lower value), use that priority. If
+                * the priority is the same, but the node uses
+                * SCHED_FIFO, prefer SCHED_FIFO, since it can
+                * run unbounded, unlike SCHED_RR.
+                */
+               desired_prio = node_prio;
+       }
+
+       binder_set_priority(task, desired_prio);
 }
 
 static struct binder_node *binder_get_node_ilocked(struct binder_proc *proc,
@@ -1132,6 +1325,7 @@ static struct binder_node *binder_init_node_ilocked(
        binder_uintptr_t ptr = fp ? fp->binder : 0;
        binder_uintptr_t cookie = fp ? fp->cookie : 0;
        __u32 flags = fp ? fp->flags : 0;
+       s8 priority;
 
        assert_spin_locked(&proc->inner_lock);
 
@@ -1164,8 +1358,12 @@ static struct binder_node *binder_init_node_ilocked(
        node->ptr = ptr;
        node->cookie = cookie;
        node->work.type = BINDER_WORK_NODE;
-       node->min_priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+       priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK;
+       node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >>
+               FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT;
+       node->min_priority = to_kernel_prio(node->sched_policy, priority);
        node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS);
+       node->inherit_rt = !!(flags & FLAT_BINDER_FLAG_INHERIT_RT);
        spin_lock_init(&node->lock);
        INIT_LIST_HEAD(&node->work.entry);
        INIT_LIST_HEAD(&node->async_todo);
@@ -1228,6 +1426,17 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
                        node->local_strong_refs++;
                if (!node->has_strong_ref && target_list) {
                        binder_dequeue_work_ilocked(&node->work);
+                       /*
+                        * Note: this function is the only place where we queue
+                        * directly to a thread->todo without using the
+                        * corresponding binder_enqueue_thread_work() helper
+                        * functions; in this case it's ok to not set the
+                        * process_todo flag, since we know this node work will
+                        * always be followed by other work that starts queue
+                        * processing: in case of synchronous transactions, a
+                        * BR_REPLY or BR_ERROR; in case of oneway
+                        * transactions, a BR_TRANSACTION_COMPLETE.
+                        */
                        binder_enqueue_work_ilocked(&node->work, target_list);
                }
        } else {
@@ -1239,6 +1448,9 @@ static int binder_inc_node_nilocked(struct binder_node *node, int strong,
                                        node->debug_id);
                                return -EINVAL;
                        }
+                       /*
+                        * See comment above
+                        */
                        binder_enqueue_work_ilocked(&node->work, target_list);
                }
        }
@@ -1928,9 +2140,9 @@ static void binder_send_failed_reply(struct binder_transaction *t,
                        binder_pop_transaction_ilocked(target_thread, t);
                        if (target_thread->reply_error.cmd == BR_OK) {
                                target_thread->reply_error.cmd = error_code;
-                               binder_enqueue_work_ilocked(
-                                       &target_thread->reply_error.work,
-                                       &target_thread->todo);
+                               binder_enqueue_thread_work_ilocked(
+                                       target_thread,
+                                       &target_thread->reply_error.work);
                                wake_up_interruptible(&target_thread->wait);
                        } else {
                                WARN(1, "Unexpected reply error: %u\n",
@@ -2569,18 +2781,20 @@ static bool binder_proc_transaction(struct binder_transaction *t,
                                    struct binder_proc *proc,
                                    struct binder_thread *thread)
 {
-       struct list_head *target_list = NULL;
        struct binder_node *node = t->buffer->target_node;
+       struct binder_priority node_prio;
        bool oneway = !!(t->flags & TF_ONE_WAY);
-       bool wakeup = true;
+       bool pending_async = false;
 
        BUG_ON(!node);
        binder_node_lock(node);
+       node_prio.prio = node->min_priority;
+       node_prio.sched_policy = node->sched_policy;
+
        if (oneway) {
                BUG_ON(thread);
                if (node->has_async_transaction) {
-                       target_list = &node->async_todo;
-                       wakeup = false;
+                       pending_async = true;
                } else {
                        node->has_async_transaction = 1;
                }
@@ -2594,19 +2808,20 @@ static bool binder_proc_transaction(struct binder_transaction *t,
                return false;
        }
 
-       if (!thread && !target_list)
+       if (!thread && !pending_async)
                thread = binder_select_thread_ilocked(proc);
 
-       if (thread)
-               target_list = &thread->todo;
-       else if (!target_list)
-               target_list = &proc->todo;
-       else
-               BUG_ON(target_list != &node->async_todo);
-
-       binder_enqueue_work_ilocked(&t->work, target_list);
+       if (thread) {
+               binder_transaction_priority(thread->task, t, node_prio,
+                                           node->inherit_rt);
+               binder_enqueue_thread_work_ilocked(thread, &t->work);
+       } else if (!pending_async) {
+               binder_enqueue_work_ilocked(&t->work, &proc->todo);
+       } else {
+               binder_enqueue_work_ilocked(&t->work, &node->async_todo);
+       }
 
-       if (wakeup)
+       if (!pending_async)
                binder_wakeup_thread_ilocked(proc, thread, !oneway /* sync */);
 
        binder_inner_proc_unlock(proc);
@@ -2721,7 +2936,6 @@ static void binder_transaction(struct binder_proc *proc,
                }
                thread->transaction_stack = in_reply_to->to_parent;
                binder_inner_proc_unlock(proc);
-               binder_set_nice(in_reply_to->saved_priority);
                target_thread = binder_get_txn_from_and_acq_inner(in_reply_to);
                if (target_thread == NULL) {
                        return_error = BR_DEAD_REPLY;
@@ -2886,7 +3100,15 @@ static void binder_transaction(struct binder_proc *proc,
        t->to_thread = target_thread;
        t->code = tr->code;
        t->flags = tr->flags;
-       t->priority = task_nice(current);
+       if (!(t->flags & TF_ONE_WAY) &&
+           binder_supported_policy(current->policy)) {
+               /* Inherit supported policies for synchronous transactions */
+               t->priority.sched_policy = current->policy;
+               t->priority.prio = current->normal_prio;
+       } else {
+               /* Otherwise, fall back to the default priority */
+               t->priority = target_proc->default_priority;
+       }
 
        trace_binder_transaction(reply, t, target_node);
 
@@ -3101,10 +3323,10 @@ static void binder_transaction(struct binder_proc *proc,
                }
        }
        tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE;
-       binder_enqueue_work(proc, tcomplete, &thread->todo);
        t->work.type = BINDER_WORK_TRANSACTION;
 
        if (reply) {
+               binder_enqueue_thread_work(thread, tcomplete);
                binder_inner_proc_lock(target_proc);
                if (target_thread->is_dead) {
                        binder_inner_proc_unlock(target_proc);
@@ -3112,13 +3334,22 @@ static void binder_transaction(struct binder_proc *proc,
                }
                BUG_ON(t->buffer->async_transaction != 0);
                binder_pop_transaction_ilocked(target_thread, in_reply_to);
-               binder_enqueue_work_ilocked(&t->work, &target_thread->todo);
+               binder_enqueue_thread_work_ilocked(target_thread, &t->work);
                binder_inner_proc_unlock(target_proc);
                wake_up_interruptible_sync(&target_thread->wait);
+               binder_restore_priority(current, in_reply_to->saved_priority);
                binder_free_transaction(in_reply_to);
        } else if (!(t->flags & TF_ONE_WAY)) {
                BUG_ON(t->buffer->async_transaction != 0);
                binder_inner_proc_lock(proc);
+               /*
+                * Defer the TRANSACTION_COMPLETE, so we don't return to
+                * userspace immediately; this allows the target process to
+                * immediately start processing this transaction, reducing
+                * latency. We will then return the TRANSACTION_COMPLETE when
+                * the target replies (or there is an error).
+                */
+               binder_enqueue_deferred_thread_work_ilocked(thread, tcomplete);
                t->need_reply = 1;
                t->from_parent = thread->transaction_stack;
                thread->transaction_stack = t;
@@ -3132,6 +3363,7 @@ static void binder_transaction(struct binder_proc *proc,
        } else {
                BUG_ON(target_node == NULL);
                BUG_ON(t->buffer->async_transaction != 1);
+               binder_enqueue_thread_work(thread, tcomplete);
                if (!binder_proc_transaction(t, target_proc, NULL))
                        goto err_dead_proc_or_thread;
        }
@@ -3209,16 +3441,13 @@ err_invalid_target_handle:
 
        BUG_ON(thread->return_error.cmd != BR_OK);
        if (in_reply_to) {
+               binder_restore_priority(current, in_reply_to->saved_priority);
                thread->return_error.cmd = BR_TRANSACTION_COMPLETE;
-               binder_enqueue_work(thread->proc,
-                                   &thread->return_error.work,
-                                   &thread->todo);
+               binder_enqueue_thread_work(thread, &thread->return_error.work);
                binder_send_failed_reply(in_reply_to, return_error);
        } else {
                thread->return_error.cmd = return_error;
-               binder_enqueue_work(thread->proc,
-                                   &thread->return_error.work,
-                                   &thread->todo);
+               binder_enqueue_thread_work(thread, &thread->return_error.work);
        }
 }
 
@@ -3522,10 +3751,9 @@ static int binder_thread_write(struct binder_proc *proc,
                                        WARN_ON(thread->return_error.cmd !=
                                                BR_OK);
                                        thread->return_error.cmd = BR_ERROR;
-                                       binder_enqueue_work(
-                                               thread->proc,
-                                               &thread->return_error.work,
-                                               &thread->todo);
+                                       binder_enqueue_thread_work(
+                                               thread,
+                                               &thread->return_error.work);
                                        binder_debug(
                                                BINDER_DEBUG_FAILED_TRANSACTION,
                                                "%d:%d BC_REQUEST_DEATH_NOTIFICATION failed\n",
@@ -3605,9 +3833,9 @@ static int binder_thread_write(struct binder_proc *proc,
                                        if (thread->looper &
                                            (BINDER_LOOPER_STATE_REGISTERED |
                                             BINDER_LOOPER_STATE_ENTERED))
-                                               binder_enqueue_work_ilocked(
-                                                               &death->work,
-                                                               &thread->todo);
+                                               binder_enqueue_thread_work_ilocked(
+                                                               thread,
+                                                               &death->work);
                                        else {
                                                binder_enqueue_work_ilocked(
                                                                &death->work,
@@ -3662,8 +3890,8 @@ static int binder_thread_write(struct binder_proc *proc,
                                if (thread->looper &
                                        (BINDER_LOOPER_STATE_REGISTERED |
                                         BINDER_LOOPER_STATE_ENTERED))
-                                       binder_enqueue_work_ilocked(
-                                               &death->work, &thread->todo);
+                                       binder_enqueue_thread_work_ilocked(
+                                               thread, &death->work);
                                else {
                                        binder_enqueue_work_ilocked(
                                                        &death->work,
@@ -3794,7 +4022,7 @@ retry:
                        wait_event_interruptible(binder_user_error_wait,
                                                 binder_stop_on_user_error < 2);
                }
-               binder_set_nice(proc->default_priority);
+               binder_restore_priority(current, proc->default_priority);
        }
 
        if (non_block) {
@@ -3837,6 +4065,8 @@ retry:
                        break;
                }
                w = binder_dequeue_work_head_ilocked(list);
+               if (binder_worklist_empty_ilocked(&thread->todo))
+                       thread->process_todo = false;
 
                switch (w->type) {
                case BINDER_WORK_TRANSACTION: {
@@ -4006,16 +4236,14 @@ retry:
                BUG_ON(t->buffer == NULL);
                if (t->buffer->target_node) {
                        struct binder_node *target_node = t->buffer->target_node;
+                       struct binder_priority node_prio;
 
                        tr.target.ptr = target_node->ptr;
                        tr.cookie =  target_node->cookie;
-                       t->saved_priority = task_nice(current);
-                       if (t->priority < target_node->min_priority &&
-                           !(t->flags & TF_ONE_WAY))
-                               binder_set_nice(t->priority);
-                       else if (!(t->flags & TF_ONE_WAY) ||
-                                t->saved_priority > target_node->min_priority)
-                               binder_set_nice(target_node->min_priority);
+                       node_prio.sched_policy = target_node->sched_policy;
+                       node_prio.prio = target_node->min_priority;
+                       binder_transaction_priority(current, t, node_prio,
+                                                   target_node->inherit_rt);
                        cmd = BR_TRANSACTION;
                } else {
                        tr.target.ptr = 0;
@@ -4193,6 +4421,8 @@ static struct binder_thread *binder_get_thread_ilocked(
        binder_stats_created(BINDER_STAT_THREAD);
        thread->proc = proc;
        thread->pid = current->pid;
+       get_task_struct(current);
+       thread->task = current;
        atomic_set(&thread->tmp_ref, 0);
        init_waitqueue_head(&thread->wait);
        INIT_LIST_HEAD(&thread->todo);
@@ -4243,6 +4473,7 @@ static void binder_free_thread(struct binder_thread *thread)
        BUG_ON(!list_empty(&thread->todo));
        binder_stats_deleted(BINDER_STAT_THREAD);
        binder_proc_dec_tmpref(thread->proc);
+       put_task_struct(thread->task);
        kfree(thread);
 }
 
@@ -4680,7 +4911,14 @@ static int binder_open(struct inode *nodp, struct file *filp)
        proc->tsk = current->group_leader;
        mutex_init(&proc->files_lock);
        INIT_LIST_HEAD(&proc->todo);
-       proc->default_priority = task_nice(current);
+       if (binder_supported_policy(current->policy)) {
+               proc->default_priority.sched_policy = current->policy;
+               proc->default_priority.prio = current->normal_prio;
+       } else {
+               proc->default_priority.sched_policy = SCHED_NORMAL;
+               proc->default_priority.prio = NICE_TO_PRIO(0);
+       }
+
        binder_dev = container_of(filp->private_data, struct binder_device,
                                  miscdev);
        proc->context = &binder_dev->context;
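The default priority stored here is no longer a bare nice value but a (policy, priority) pair. A sketch of the structure this hunk assumes (added earlier in the patch series, not shown in this hunk):

        struct binder_priority {
                unsigned int sched_policy;      /* SCHED_NORMAL, SCHED_FIFO, SCHED_RR, ... */
                int prio;                       /* kernel-internal priority, e.g. NICE_TO_PRIO(nice) for CFS */
        };

NICE_TO_PRIO(0) is the kernel-internal priority for nice 0 (120), so tasks with an unsupported scheduling policy fall back to an ordinary CFS task at nice 0.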
@@ -4974,13 +5212,14 @@ static void print_binder_transaction_ilocked(struct seq_file *m,
        spin_lock(&t->lock);
        to_proc = t->to_proc;
        seq_printf(m,
-                  "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %ld r%d",
+                  "%s %d: %p from %d:%d to %d:%d code %x flags %x pri %d:%d r%d",
                   prefix, t->debug_id, t,
                   t->from ? t->from->proc->pid : 0,
                   t->from ? t->from->pid : 0,
                   to_proc ? to_proc->pid : 0,
                   t->to_thread ? t->to_thread->pid : 0,
-                  t->code, t->flags, t->priority, t->need_reply);
+                  t->code, t->flags, t->priority.sched_policy,
+                  t->priority.prio, t->need_reply);
        spin_unlock(&t->lock);
 
        if (proc != to_proc) {
@@ -5098,8 +5337,9 @@ static void print_binder_node_nilocked(struct seq_file *m,
        hlist_for_each_entry(ref, &node->refs, node_entry)
                count++;
 
-       seq_printf(m, "  node %d: u%016llx c%016llx hs %d hw %d ls %d lw %d is %d iw %d tr %d",
+       seq_printf(m, "  node %d: u%016llx c%016llx pri %d:%d hs %d hw %d ls %d lw %d is %d iw %d tr %d",
                   node->debug_id, (u64)node->ptr, (u64)node->cookie,
+                  node->sched_policy, node->min_priority,
                   node->has_strong_ref, node->has_weak_ref,
                   node->local_strong_refs, node->local_weak_refs,
                   node->internal_strong_refs, count, node->tmp_refs);
index 6cb14826867605fe1c3d25aa64eb0f242ad73e66..ba6d8d23f20658563cfdea89d850965cf371ee4d 100644 (file)
@@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc,
 }
 
 static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
-                                   void *start, void *end,
-                                   struct vm_area_struct *vma)
+                                   void *start, void *end)
 {
        void *page_addr;
        unsigned long user_page_addr;
        struct binder_lru_page *page;
+       struct vm_area_struct *vma = NULL;
        struct mm_struct *mm = NULL;
        bool need_mm = false;
 
@@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
                }
        }
 
-       if (!vma && need_mm && mmget_not_zero(alloc->vma_vm_mm))
+       if (need_mm && mmget_not_zero(alloc->vma_vm_mm))
                mm = alloc->vma_vm_mm;
 
        if (mm) {
@@ -281,6 +281,9 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate,
                        goto err_vm_insert_page_failed;
                }
 
+               if (index + 1 > alloc->pages_high)
+                       alloc->pages_high = index + 1;
+
                trace_binder_alloc_page_end(alloc, index);
                /* vm_insert_page does not seem to increment the refcount */
        }
@@ -437,7 +440,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc,
        if (end_page_addr > has_page_addr)
                end_page_addr = has_page_addr;
        ret = binder_update_page_range(alloc, 1,
-           (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL);
+           (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr);
        if (ret)
                return ERR_PTR(ret);
 
@@ -478,7 +481,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc,
 err_alloc_buf_struct_failed:
        binder_update_page_range(alloc, 0,
                                 (void *)PAGE_ALIGN((uintptr_t)buffer->data),
-                                end_page_addr, NULL);
+                                end_page_addr);
        return ERR_PTR(-ENOMEM);
 }
 
@@ -562,8 +565,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc,
                                   alloc->pid, buffer->data,
                                   prev->data, next ? next->data : NULL);
                binder_update_page_range(alloc, 0, buffer_start_page(buffer),
-                                        buffer_start_page(buffer) + PAGE_SIZE,
-                                        NULL);
+                                        buffer_start_page(buffer) + PAGE_SIZE);
        }
        list_del(&buffer->entry);
        kfree(buffer);
@@ -600,8 +602,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc,
 
        binder_update_page_range(alloc, 0,
                (void *)PAGE_ALIGN((uintptr_t)buffer->data),
-               (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK),
-               NULL);
+               (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK));
 
        rb_erase(&buffer->rb_node, &alloc->allocated_buffers);
        buffer->free = 1;
@@ -855,6 +856,7 @@ void binder_alloc_print_pages(struct seq_file *m,
        }
        mutex_unlock(&alloc->mutex);
        seq_printf(m, "  pages: %d:%d:%d\n", active, lru, free);
+       seq_printf(m, "  pages high watermark: %zu\n", alloc->pages_high);
 }
 
 /**
@@ -984,7 +986,7 @@ binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
        return ret;
 }
 
-struct shrinker binder_shrinker = {
+static struct shrinker binder_shrinker = {
        .count_objects = binder_shrink_count,
        .scan_objects = binder_shrink_scan,
        .seeks = DEFAULT_SEEKS,
index 2dd33b6df1044e64b785a6193bc30b84ddf9d1c5..0b145307f1fd1cfdc76fd03c819bb3c533c28414 100644 (file)
@@ -92,6 +92,7 @@ struct binder_lru_page {
  * @pages:              array of binder_lru_page
  * @buffer_size:        size of address space specified via mmap
  * @pid:                pid for associated binder_proc (invariant after init)
+ * @pages_high:         high watermark of offset in @pages
  *
  * Bookkeeping structure for per-proc address space management for binder
  * buffers. It is normally initialized during binder_init() and binder_mmap()
@@ -112,6 +113,7 @@ struct binder_alloc {
        size_t buffer_size;
        uint32_t buffer_free;
        int pid;
+       size_t pages_high;
 };
 
 #ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST
index 76e3b9c8a8a291e60edb23b43b62ff8585c2befd..b11dffc521e85734dd53f8ed64e10dc799f610a6 100644 (file)
@@ -85,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done);
 DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done);
 DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done);
 
+TRACE_EVENT(binder_set_priority,
+       TP_PROTO(int proc, int thread, unsigned int old_prio,
+                unsigned int desired_prio, unsigned int new_prio),
+       TP_ARGS(proc, thread, old_prio, new_prio, desired_prio),
+
+       TP_STRUCT__entry(
+               __field(int, proc)
+               __field(int, thread)
+               __field(unsigned int, old_prio)
+               __field(unsigned int, new_prio)
+               __field(unsigned int, desired_prio)
+       ),
+       TP_fast_assign(
+               __entry->proc = proc;
+               __entry->thread = thread;
+               __entry->old_prio = old_prio;
+               __entry->new_prio = new_prio;
+               __entry->desired_prio = desired_prio;
+       ),
+       TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d",
+                 __entry->proc, __entry->thread, __entry->old_prio,
+                 __entry->new_prio, __entry->desired_prio)
+);
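For reference, a caller of the new tracepoint is expected to look roughly like this (illustrative; the actual call site is in binder_set_priority(), which is not part of this hunk):

        trace_binder_set_priority(task_tgid_nr(task), task_pid_nr(task),
                                  task->normal_prio, desired.prio, priority);

All three priority arguments are kernel-internal values, matching the new "pri %d:%d" fields added to the debugfs transaction and node dumps above.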
+
 TRACE_EVENT(binder_wait_for_work,
        TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo),
        TP_ARGS(proc_work, transaction_stack, thread_todo),
index 6df7d6676a48104267b5e739c6e58b85a00724e1..cf92aa817dac3305ea5be55e175fc9f8dd4592bf 100644 (file)
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/sched/topology.h>
+#include <linux/cpuset.h>
+#include <linux/sched_energy.h>
 
-static DEFINE_MUTEX(cpu_scale_mutex);
-static DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
+void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
+                        unsigned long max_freq)
 {
-       return per_cpu(cpu_scale, cpu);
+       unsigned long scale;
+       int i;
+
+       scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
+
+       for_each_cpu(i, cpus)
+               per_cpu(freq_scale, i) = scale;
 }
 
+static DEFINE_MUTEX(cpu_scale_mutex);
+DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
+
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity)
 {
        per_cpu(cpu_scale, cpu) = capacity;
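freq_scale is a fixed-point ratio against SCHED_CAPACITY_SCALE (1 << SCHED_CAPACITY_SHIFT = 1024). As a worked example, a policy currently at cur_freq = 1200000 kHz with max_freq = 2000000 kHz yields

        scale = (1200000 << 10) / 2000000 = 614

so every CPU in that policy reports roughly 60% of its maximum capacity to the scheduler until the next frequency change.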
@@ -44,6 +55,9 @@ static ssize_t cpu_capacity_show(struct device *dev,
        return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
 }
 
+static void update_topology_flags_workfn(struct work_struct *work);
+static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
+
 static ssize_t cpu_capacity_store(struct device *dev,
                                  struct device_attribute *attr,
                                  const char *buf,
@@ -54,10 +68,15 @@ static ssize_t cpu_capacity_store(struct device *dev,
        int i;
        unsigned long new_capacity;
        ssize_t ret;
+       cpumask_var_t mask;
 
        if (!count)
                return 0;
 
+       /* don't allow changes if sched-group-energy is installed */
+       if (sched_energy_installed(this_cpu))
+               return -EINVAL;
+
        ret = kstrtoul(buf, 0, &new_capacity);
        if (ret)
                return ret;
@@ -65,10 +84,41 @@ static ssize_t cpu_capacity_store(struct device *dev,
                return -EINVAL;
 
        mutex_lock(&cpu_scale_mutex);
-       for_each_cpu(i, &cpu_topology[this_cpu].core_sibling)
+
+       if (new_capacity < SCHED_CAPACITY_SCALE) {
+               int highest_score_cpu = 0;
+
+               if (!alloc_cpumask_var(&mask, GFP_KERNEL)) {
+                       mutex_unlock(&cpu_scale_mutex);
+                       return -ENOMEM;
+               }
+
+               cpumask_andnot(mask, cpu_online_mask,
+                               topology_core_cpumask(this_cpu));
+
+               for_each_cpu(i, mask) {
+                       if (topology_get_cpu_scale(NULL, i) ==
+                                       SCHED_CAPACITY_SCALE) {
+                               highest_score_cpu = 1;
+                               break;
+                       }
+               }
+
+               free_cpumask_var(mask);
+
+               if (!highest_score_cpu) {
+                       mutex_unlock(&cpu_scale_mutex);
+                       return -EINVAL;
+               }
+       }
+
+       for_each_cpu(i, topology_core_cpumask(this_cpu))
                topology_set_cpu_scale(i, new_capacity);
        mutex_unlock(&cpu_scale_mutex);
 
+       if (topology_detect_flags())
+               schedule_work(&update_topology_flags_work);
+
        return count;
 }
 
@@ -93,6 +143,185 @@ static int register_cpu_capacity_sysctl(void)
 }
 subsys_initcall(register_cpu_capacity_sysctl);
 
+enum asym_cpucap_type { no_asym, asym_thread, asym_core, asym_die };
+static enum asym_cpucap_type asym_cpucap = no_asym;
+enum share_cap_type { no_share_cap, share_cap_thread, share_cap_core, share_cap_die};
+static enum share_cap_type share_cap = no_share_cap;
+
+#ifdef CONFIG_CPU_FREQ
+int detect_share_cap_flag(void)
+{
+       int cpu;
+       enum share_cap_type share_cap_level = no_share_cap;
+       struct cpufreq_policy *policy;
+
+       for_each_possible_cpu(cpu) {
+               policy = cpufreq_cpu_get(cpu);
+
+               if (!policy)
+                       return 0;
+
+               if (cpumask_equal(topology_sibling_cpumask(cpu),
+                                 policy->related_cpus)) {
+                       share_cap_level = share_cap_thread;
+                       continue;
+               }
+
+               if (cpumask_equal(topology_core_cpumask(cpu),
+                                 policy->related_cpus)) {
+                       share_cap_level = share_cap_core;
+                       continue;
+               }
+
+               if (cpumask_equal(cpu_cpu_mask(cpu),
+                                 policy->related_cpus)) {
+                       share_cap_level = share_cap_die;
+                       continue;
+               }
+       }
+
+       if (share_cap != share_cap_level) {
+               share_cap = share_cap_level;
+               return 1;
+       }
+
+       return 0;
+}
+#else
+int detect_share_cap_flag(void) { return 0; }
+#endif
+
+/*
+ * Walk cpu topology to determine sched_domain flags.
+ *
+ * SD_ASYM_CPUCAPACITY: Indicates the lowest level that spans all cpu
+ * capacities found in the system for all cpus, i.e. the flag is set
+ * at the same level for all systems. The current algorithm implements
+ * this by looking for higher capacities, which doesn't work for all
+ * conceivable topology, but don't complicate things until it is
+ * necessary.
+ */
+int topology_detect_flags(void)
+{
+       unsigned long max_capacity, capacity;
+       enum asym_cpucap_type asym_level = no_asym;
+       int cpu, die_cpu, core, thread, flags_changed = 0;
+
+       for_each_possible_cpu(cpu) {
+               max_capacity = 0;
+
+               if (asym_level >= asym_thread)
+                       goto check_core;
+
+               for_each_cpu(thread, topology_sibling_cpumask(cpu)) {
+                       capacity = topology_get_cpu_scale(NULL, thread);
+
+                       if (capacity > max_capacity) {
+                               if (max_capacity != 0)
+                                       asym_level = asym_thread;
+
+                               max_capacity = capacity;
+                       }
+               }
+
+check_core:
+               if (asym_level >= asym_core)
+                       goto check_die;
+
+               for_each_cpu(core, topology_core_cpumask(cpu)) {
+                       capacity = topology_get_cpu_scale(NULL, core);
+
+                       if (capacity > max_capacity) {
+                               if (max_capacity != 0)
+                                       asym_level = asym_core;
+
+                               max_capacity = capacity;
+                       }
+               }
+check_die:
+               for_each_possible_cpu(die_cpu) {
+                       capacity = topology_get_cpu_scale(NULL, die_cpu);
+
+                       if (capacity > max_capacity) {
+                               if (max_capacity != 0) {
+                                       asym_level = asym_die;
+                                       goto done;
+                               }
+
+                               max_capacity = capacity;
+                       }
+               }
+       }
+
+done:
+       if (asym_cpucap != asym_level) {
+               asym_cpucap = asym_level;
+               flags_changed = 1;
+               pr_debug("topology flag change detected\n");
+       }
+
+       if (detect_share_cap_flag())
+               flags_changed = 1;
+
+       return flags_changed;
+}
+
+int topology_smt_flags(void)
+{
+       int flags = 0;
+
+       if (asym_cpucap == asym_thread)
+               flags |= SD_ASYM_CPUCAPACITY;
+
+       if (share_cap == share_cap_thread)
+               flags |= SD_SHARE_CAP_STATES;
+
+       return flags;
+}
+
+int topology_core_flags(void)
+{
+       int flags = 0;
+
+       if (asym_cpucap == asym_core)
+               flags |= SD_ASYM_CPUCAPACITY;
+
+       if (share_cap == share_cap_core)
+               flags |= SD_SHARE_CAP_STATES;
+
+       return flags;
+}
+
+int topology_cpu_flags(void)
+{
+       int flags = 0;
+
+       if (asym_cpucap == asym_die)
+               flags |= SD_ASYM_CPUCAPACITY;
+
+       if (share_cap == share_cap_die)
+               flags |= SD_SHARE_CAP_STATES;
+
+       return flags;
+}
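These three flag helpers only become visible to the scheduler once an architecture wires them into its sched_domain topology table. A hypothetical table (the arch-side wiring is an assumption here, not part of this file):

        static struct sched_domain_topology_level example_topology[] = {
        #ifdef CONFIG_SCHED_MC
                { cpu_coregroup_mask, topology_core_flags, SD_INIT_NAME(MC) },
        #endif
                { cpu_cpu_mask, topology_cpu_flags, SD_INIT_NAME(DIE) },
                { NULL, },
        };

With that in place, SD_ASYM_CPUCAPACITY and SD_SHARE_CAP_STATES land on whichever level topology_detect_flags() decided they belong to.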
+
+static int update_topology = 0;
+
+int topology_update_cpu_topology(void)
+{
+       return update_topology;
+}
+
+/*
+ * Updating the sched_domains can't be done directly from cpufreq callbacks
+ * due to locking, so queue the work for later.
+ */
+static void update_topology_flags_workfn(struct work_struct *work)
+{
+       update_topology = 1;
+       rebuild_sched_domains();
+       pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
+       update_topology = 0;
+}
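The update_topology flag matters because partition_sched_domains() consults arch_update_cpu_topology() to decide whether the existing sched_domains can simply be reused; forcing it to return non-zero for the duration of the rebuild makes the scheduler pick up the new flags. The arch side is assumed to forward the hook, roughly:

        int arch_update_cpu_topology(void)
        {
                return topology_update_cpu_topology();
        }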
+
 static u32 capacity_scale;
 static u32 *raw_capacity;
 
@@ -115,13 +344,12 @@ void topology_normalize_cpu_scale(void)
        pr_debug("cpu_capacity: capacity_scale=%u\n", capacity_scale);
        mutex_lock(&cpu_scale_mutex);
        for_each_possible_cpu(cpu) {
-               pr_debug("cpu_capacity: cpu=%d raw_capacity=%u\n",
-                        cpu, raw_capacity[cpu]);
                capacity = (raw_capacity[cpu] << SCHED_CAPACITY_SHIFT)
                        / capacity_scale;
                topology_set_cpu_scale(cpu, capacity);
-               pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu\n",
-                       cpu, topology_get_cpu_scale(NULL, cpu));
+               pr_debug("cpu_capacity: CPU%d cpu_capacity=%lu raw_capacity=%u\n",
+                       cpu, topology_get_cpu_scale(NULL, cpu),
+                       raw_capacity[cpu]);
        }
        mutex_unlock(&cpu_scale_mutex);
 }
@@ -129,14 +357,19 @@ void topology_normalize_cpu_scale(void)
 bool __init topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu)
 {
        static bool cap_parsing_failed;
-       int ret;
+       int ret = 0;
        u32 cpu_capacity;
 
        if (cap_parsing_failed)
                return false;
 
-       ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
+       /* override capacity-dmips-mhz if we have sched-energy-costs */
+       if (of_find_property(cpu_node, "sched-energy-costs", NULL))
+               cpu_capacity = topology_get_cpu_scale(NULL, cpu);
+       else
+               ret = of_property_read_u32(cpu_node, "capacity-dmips-mhz",
                                   &cpu_capacity);
+
        if (!ret) {
                if (!raw_capacity) {
                        raw_capacity = kcalloc(num_possible_cpus(),
@@ -198,6 +431,8 @@ init_cpu_capacity_callback(struct notifier_block *nb,
 
        if (cpumask_empty(cpus_to_visit)) {
                topology_normalize_cpu_scale();
+               if (topology_detect_flags())
+                       schedule_work(&update_topology_flags_work);
                free_raw_capacity();
                pr_debug("cpu_capacity: parsing done\n");
                schedule_work(&parsing_done_work);
@@ -212,6 +447,8 @@ static struct notifier_block init_cpu_capacity_notifier __initdata = {
 
 static int __init register_cpufreq_notifier(void)
 {
+       int ret;
+
        /*
         * on ACPI-based systems we need to use the default cpu capacity
         * until we have the necessary code to parse the cpu capacity, so
@@ -227,8 +464,13 @@ static int __init register_cpufreq_notifier(void)
 
        cpumask_copy(cpus_to_visit, cpu_possible_mask);
 
-       return cpufreq_register_notifier(&init_cpu_capacity_notifier,
-                                        CPUFREQ_POLICY_NOTIFIER);
+       ret = cpufreq_register_notifier(&init_cpu_capacity_notifier,
+                                       CPUFREQ_POLICY_NOTIFIER);
+
+       if (ret)
+               free_cpumask_var(cpus_to_visit);
+
+       return ret;
 }
 core_initcall(register_cpufreq_notifier);
 
@@ -236,6 +478,7 @@ static void __init parsing_done_workfn(struct work_struct *work)
 {
        cpufreq_unregister_notifier(&init_cpu_capacity_notifier,
                                         CPUFREQ_POLICY_NOTIFIER);
+       free_cpumask_var(cpus_to_visit);
 }
 
 #else
index 770b1539a083d111ba1a65b569d3a75eaa81dd38..e24f6500786e32f0c2a3d585c88975db2cdd85bf 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/cpufreq.h>
 #include <linux/cpuidle.h>
 #include <linux/timer.h>
+#include <linux/wakeup_reason.h>
 
 #include "../base.h"
 #include "power.h"
@@ -1455,6 +1456,7 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
        pm_callback_t callback = NULL;
        const char *info = NULL;
        int error = 0;
+       char suspend_abort[MAX_SUSPEND_ABORT_LEN];
        DECLARE_DPM_WATCHDOG_ON_STACK(wd);
 
        TRACE_DEVICE(dev);
@@ -1475,6 +1477,9 @@ static int __device_suspend(struct device *dev, pm_message_t state, bool async)
                pm_wakeup_event(dev, 0);
 
        if (pm_wakeup_pending()) {
+               pm_get_active_wakeup_sources(suspend_abort,
+                       MAX_SUSPEND_ABORT_LEN);
+               log_suspend_abort_reason(suspend_abort);
                async_error = -EBUSY;
                goto Complete;
        }
index cdd6f256da597cb2abad069538d09723b2250d87..b932d7f755043bbc053da2c1ed1c219cb5ebf324 100644 (file)
@@ -15,6 +15,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/pm_wakeirq.h>
+#include <linux/types.h>
 #include <trace/events/power.h>
 
 #include "power.h"
@@ -805,6 +806,37 @@ void pm_wakeup_dev_event(struct device *dev, unsigned int msec, bool hard)
 }
 EXPORT_SYMBOL_GPL(pm_wakeup_dev_event);
 
+void pm_get_active_wakeup_sources(char *pending_wakeup_source, size_t max)
+{
+       struct wakeup_source *ws, *last_active_ws = NULL;
+       int len = 0;
+       bool active = false;
+
+       rcu_read_lock();
+       list_for_each_entry_rcu(ws, &wakeup_sources, entry) {
+               if (ws->active && len < max) {
+                       if (!active)
+                               len += scnprintf(pending_wakeup_source, max,
+                                               "Pending Wakeup Sources: ");
+                       len += scnprintf(pending_wakeup_source + len, max - len,
+                               "%s ", ws->name);
+                       active = true;
+               } else if (!active &&
+                          (!last_active_ws ||
+                           ktime_to_ns(ws->last_time) >
+                           ktime_to_ns(last_active_ws->last_time))) {
+                       last_active_ws = ws;
+               }
+       }
+       if (!active && last_active_ws) {
+               scnprintf(pending_wakeup_source, max,
+                               "Last active Wakeup Source: %s",
+                               last_active_ws->name);
+       }
+       rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(pm_get_active_wakeup_sources);
+
 void pm_print_active_wakeup_sources(void)
 {
        struct wakeup_source *ws;
@@ -1018,7 +1050,7 @@ static int print_wakeup_source_stats(struct seq_file *m,
                active_time = 0;
        }
 
-       seq_printf(m, "%-12s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
+       seq_printf(m, "%-32s\t%lu\t\t%lu\t\t%lu\t\t%lu\t\t%lld\t\t%lld\t\t%lld\t\t%lld\t\t%lld\n",
                   ws->name, active_count, ws->event_count,
                   ws->wakeup_count, ws->expire_count,
                   ktime_to_ms(active_time), ktime_to_ms(total_time),
@@ -1039,7 +1071,7 @@ static int wakeup_sources_stats_show(struct seq_file *m, void *unused)
        struct wakeup_source *ws;
        int srcuidx;
 
-       seq_puts(m, "name\t\tactive_count\tevent_count\twakeup_count\t"
+       seq_puts(m, "name\t\t\t\t\tactive_count\tevent_count\twakeup_count\t"
                "expire_count\tactive_since\ttotal_time\tmax_time\t"
                "last_change\tprevent_suspend_time\n");
 
index 8d98a329f6ea63a2daf179bb3f15e5307c6a0d13..96c34a95cc625929a08ca0e76c13b9b830a4907c 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/module.h>
 #include <linux/suspend.h>
 #include <trace/events/power.h>
+#include <linux/wakeup_reason.h>
 
 static LIST_HEAD(syscore_ops_list);
 static DEFINE_MUTEX(syscore_ops_lock);
@@ -75,6 +76,8 @@ int syscore_suspend(void)
        return 0;
 
  err_out:
+       log_suspend_abort_reason("System core suspend callback %pF failed",
+               ops->suspend);
        pr_err("PM: System core suspend callback %pF failed.\n", ops->suspend);
 
        list_for_each_entry_continue(ops, &syscore_ops_list, node)
index 17504129fd778164aadcdbbee89dbbbdb40d22b4..0c41ab3b16eb82fa3daa0ab8e991c1059da30cb7 100644 (file)
@@ -213,6 +213,7 @@ static int bL_cpufreq_set_target(struct cpufreq_policy *policy,
 {
        u32 cpu = policy->cpu, cur_cluster, new_cluster, actual_cluster;
        unsigned int freqs_new;
+       int ret;
 
        cur_cluster = cpu_to_cluster(cpu);
        new_cluster = actual_cluster = per_cpu(physical_cluster, cpu);
@@ -229,7 +230,14 @@ static int bL_cpufreq_set_target(struct cpufreq_policy *policy,
                }
        }
 
-       return bL_cpufreq_set_rate(cpu, actual_cluster, new_cluster, freqs_new);
+       ret = bL_cpufreq_set_rate(cpu, actual_cluster, new_cluster, freqs_new);
+
+       if (!ret) {
+               arch_set_freq_scale(policy->related_cpus, freqs_new,
+                                   policy->cpuinfo.max_freq);
+       }
+
+       return ret;
 }
 
 static inline u32 get_table_count(struct cpufreq_frequency_table *table)
index d83ab94d041a89cd35529ac5cb43c542f9282dad..545946ad07527df11459b2d35011774f94f81cac 100644 (file)
@@ -43,9 +43,17 @@ static struct freq_attr *cpufreq_dt_attr[] = {
 static int set_target(struct cpufreq_policy *policy, unsigned int index)
 {
        struct private_data *priv = policy->driver_data;
+       unsigned long freq = policy->freq_table[index].frequency;
+       int ret;
+
+       ret = dev_pm_opp_set_rate(priv->cpu_dev, freq * 1000);
 
-       return dev_pm_opp_set_rate(priv->cpu_dev,
-                                  policy->freq_table[index].frequency * 1000);
+       if (!ret) {
+               arch_set_freq_scale(policy->related_cpus, freq,
+                                   policy->cpuinfo.max_freq);
+       }
+
+       return ret;
 }
 
 /*
index ea43b147a7fe93adc3c8229327532debec429486..183e1edaeece583fb30f5551af8e1973477707f8 100644 (file)
@@ -2232,6 +2232,7 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy,
 
        policy->min = new_policy->min;
        policy->max = new_policy->max;
+       trace_cpu_frequency_limits(policy->max, policy->min, policy->cpu);
 
        policy->cached_target_freq = UINT_MAX;
 
@@ -2430,6 +2431,17 @@ int cpufreq_boost_enabled(void)
 }
 EXPORT_SYMBOL_GPL(cpufreq_boost_enabled);
 
+/*********************************************************************
+ *               FREQUENCY INVARIANT ACCOUNTING SUPPORT              *
+ *********************************************************************/
+
+__weak void arch_set_freq_scale(struct cpumask *cpus,
+                               unsigned long cur_freq,
+                               unsigned long max_freq)
+{
+}
+EXPORT_SYMBOL_GPL(arch_set_freq_scale);
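The weak stub keeps cpufreq building on architectures that do not implement frequency invariance. An architecture that does opt in is expected to expose the per-cpu value maintained by drivers/base/arch_topology.c through an accessor along these lines (a sketch mirroring topology_get_freq_scale() in include/linux/arch_topology.h):

        static inline unsigned long topology_get_freq_scale(int cpu)
        {
                return per_cpu(freq_scale, cpu);
        }

and point the scheduler's arch_scale_freq_capacity() hook at it, so utilization is scaled by cur_freq/max_freq.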
+
 /*********************************************************************
  *               REGISTER / UNREGISTER CPUFREQ DRIVER                *
  *********************************************************************/
index ed4df58a855e14057611746016ac65eba9466a16..e47e1410176837434ab7bb4587fb55cf3e317d32 100644 (file)
@@ -212,7 +212,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
        }
 
        /* Take note of the planned idle state. */
-       sched_idle_set_state(target_state);
+       sched_idle_set_state(target_state, index);
 
        trace_cpu_idle_rcuidle(index, dev->cpu);
        time_start = ns_to_ktime(local_clock());
@@ -226,7 +226,7 @@ int cpuidle_enter_state(struct cpuidle_device *dev, struct cpuidle_driver *drv,
        trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, dev->cpu);
 
        /* The cpu is no longer idle or about to enter idle. */
-       sched_idle_set_state(NULL);
+       sched_idle_set_state(NULL, -1);
 
        if (broadcast) {
                if (WARN_ON_ONCE(!irqs_disabled()))
index 48eaf2879228371fa92fa0f314d7e3407295b78d..a99d5620056d8803fe88de7ab272229eb92c6168 100644 (file)
@@ -180,7 +180,12 @@ static inline int performance_multiplier(unsigned long nr_iowaiters, unsigned lo
 
        /* for higher loadavg, we are more reluctant */
 
-       mult += 2 * get_loadavg(load);
+       /*
+        * this doesn't work as intended - it is almost always 0, but can
+        * sometimes, depending on workload, spike very high into the hundreds
+        * even when the average cpu load is under 10%.
+        */
+       /* mult += 2 * get_loadavg(); */
 
        /* for IO wait tasks (per cpu!) we add 5x each */
        mult += 10 * nr_iowaiters;
index 9a302799040e4529bc5d08ce7c17343421ea19b6..33ea68b45654f734c83656c6920aff9698324c26 100644 (file)
@@ -329,8 +329,12 @@ dma_fence_remove_callback(struct dma_fence *fence, struct dma_fence_cb *cb)
        spin_lock_irqsave(fence->lock, flags);
 
        ret = !list_empty(&cb->node);
-       if (ret)
+       if (ret) {
                list_del_init(&cb->node);
+               if (list_empty(&fence->cb_list))
+                       if (fence->ops->disable_signaling)
+                               fence->ops->disable_signaling(fence);
+       }
 
        spin_unlock_irqrestore(fence->lock, flags);
 
index 24f83f9eeaedce12904ce8190edd1e5ed0b67e76..ebc11961d64901fcfb2236e5ee3577b93e59f65a 100644 (file)
@@ -169,6 +169,13 @@ static bool timeline_fence_enable_signaling(struct dma_fence *fence)
        return true;
 }
 
+static void timeline_fence_disable_signaling(struct dma_fence *fence)
+{
+       struct sync_pt *pt = dma_fence_to_sync_pt(fence);
+
+       list_del_init(&pt->link);
+}
+
 static void timeline_fence_value_str(struct dma_fence *fence,
                                    char *str, int size)
 {
@@ -187,6 +194,7 @@ static const struct dma_fence_ops timeline_fence_ops = {
        .get_driver_name = timeline_fence_get_driver_name,
        .get_timeline_name = timeline_fence_get_timeline_name,
        .enable_signaling = timeline_fence_enable_signaling,
+       .disable_signaling = timeline_fence_disable_signaling,
        .signaled = timeline_fence_signaled,
        .wait = dma_fence_default_wait,
        .release = timeline_fence_release,
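Both hunks rely on a new optional callback in struct dma_fence_ops; its declaration is added elsewhere in this series and is assumed to read:

        /* called when the last callback is removed, to undo enable_signaling */
        void (*disable_signaling)(struct dma_fence *fence);

For sw_sync the implementation above simply unlinks the sync_pt from the timeline, so a fence whose callbacks have all been removed is no longer tracked for signaling.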
index ff80377987795df3bdd3755e55806499ea54d6d2..724715e4f8bc58113bec5b106c6959c177018332 100644 (file)
@@ -184,6 +184,19 @@ config INPUT_APMPOWER
          To compile this driver as a module, choose M here: the
          module will be called apm-power.
 
+config INPUT_KEYRESET
+       bool "Reset key"
+       depends on INPUT
+       select INPUT_KEYCOMBO
+       ---help---
+         Say Y here if you want to reboot when certain keys are pressed.
+
+config INPUT_KEYCOMBO
+       bool "Key combo"
+       depends on INPUT
+       ---help---
+         Say Y here if you want to take an action when certain keys are pressed.
+
 comment "Input Device Drivers"
 
 source "drivers/input/keyboard/Kconfig"
index 40de6a7be641d6c6c4a9729531abbc009812e0aa..f0351af763bd58cb5653731440258214f1f31111 100644 (file)
@@ -27,5 +27,7 @@ obj-$(CONFIG_INPUT_TOUCHSCREEN)       += touchscreen/
 obj-$(CONFIG_INPUT_MISC)       += misc/
 
 obj-$(CONFIG_INPUT_APMPOWER)   += apm-power.o
+obj-$(CONFIG_INPUT_KEYRESET)   += keyreset.o
+obj-$(CONFIG_INPUT_KEYCOMBO)   += keycombo.o
 
 obj-$(CONFIG_RMI4_CORE)                += rmi4/
index f6e643b589b616c61d1751005fedb3288f0ad82c..c877e56a9bd5760f0ef9151460a6fd5f7d99a6b2 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/interrupt.h>
 #include <linux/types.h>
 #include <linux/input.h>
+#include <linux/input/mt.h>
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
@@ -24,6 +25,8 @@
 #include <linux/io.h>
 #include <linux/acpi.h>
 
+#define GOLDFISH_MAX_FINGERS 5
+
 enum {
        REG_READ        = 0x00,
        REG_SET_PAGE    = 0x00,
@@ -52,7 +55,21 @@ static irqreturn_t events_interrupt(int irq, void *dev_id)
        value = __raw_readl(edev->addr + REG_READ);
 
        input_event(edev->input, type, code, value);
-       input_sync(edev->input);
+       /*
+        * Send an extra (EV_SYN, SYN_REPORT, 0x0) event if a key was
+        * pressed. Some keyboard device drivers may only send the EV_KEY
+        * event and not EV_SYN.
+        * Note that sending an extra SYN_REPORT is neither necessary nor
+        * correct protocol for other devices such as touchscreens, which
+        * send their own SYN_REPORT once sufficient event information has
+        * been collected (e.g., for touchscreens, when pressure and X/Y
+        * coordinates have been received). Hence, we only send this extra
+        * SYN_REPORT if type == EV_KEY.
+        */
+       if (type == EV_KEY) {
+               input_sync(edev->input);
+       }
        return IRQ_HANDLED;
 }
 
@@ -154,6 +171,15 @@ static int events_probe(struct platform_device *pdev)
 
        input_dev->name = edev->name;
        input_dev->id.bustype = BUS_HOST;
+       /*
+        * Set the Goldfish Device to be multi-touch.
+        * In the Ranchu kernel, there is multi-touch-specific code for
+        * handling ABS_MT_SLOT events; see
+        * drivers/input/input.c:input_handle_abs_event.
+        * If we do not issue input_mt_init_slots, the kernel will filter
+        * out the needed ABS_MT_SLOT events when the screen is touched
+        * in more than one place, preventing multi-touch with more than
+        * one finger from working.
+        */
+       input_mt_init_slots(input_dev, GOLDFISH_MAX_FINGERS, 0);
 
        events_import_bits(edev, input_dev->evbit, EV_SYN, EV_MAX);
        events_import_bits(edev, input_dev->keybit, EV_KEY, KEY_MAX);
diff --git a/drivers/input/keycombo.c b/drivers/input/keycombo.c
new file mode 100644 (file)
index 0000000..2fba451
--- /dev/null
@@ -0,0 +1,261 @@
+/* drivers/input/keycombo.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keycombo.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+struct keycombo_state {
+       struct input_handler input_handler;
+       unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+       unsigned long upbit[BITS_TO_LONGS(KEY_CNT)];
+       unsigned long key[BITS_TO_LONGS(KEY_CNT)];
+       spinlock_t lock;
+       struct  workqueue_struct *wq;
+       int key_down_target;
+       int key_down;
+       int key_up;
+       struct delayed_work key_down_work;
+       int delay;
+       struct work_struct key_up_work;
+       void (*key_up_fn)(void *);
+       void (*key_down_fn)(void *);
+       void *priv;
+       int key_is_down;
+       struct wakeup_source combo_held_wake_source;
+       struct wakeup_source combo_up_wake_source;
+};
+
+static void do_key_down(struct work_struct *work)
+{
+       struct delayed_work *dwork = container_of(work, struct delayed_work,
+                                                                       work);
+       struct keycombo_state *state = container_of(dwork,
+                                       struct keycombo_state, key_down_work);
+       if (state->key_down_fn)
+               state->key_down_fn(state->priv);
+}
+
+static void do_key_up(struct work_struct *work)
+{
+       struct keycombo_state *state = container_of(work, struct keycombo_state,
+                                                               key_up_work);
+       if (state->key_up_fn)
+               state->key_up_fn(state->priv);
+       __pm_relax(&state->combo_up_wake_source);
+}
+
+static void keycombo_event(struct input_handle *handle, unsigned int type,
+               unsigned int code, int value)
+{
+       unsigned long flags;
+       struct keycombo_state *state = handle->private;
+
+       if (type != EV_KEY)
+               return;
+
+       if (code >= KEY_MAX)
+               return;
+
+       if (!test_bit(code, state->keybit))
+               return;
+
+       spin_lock_irqsave(&state->lock, flags);
+       if (!test_bit(code, state->key) == !value)
+               goto done;
+       __change_bit(code, state->key);
+       if (test_bit(code, state->upbit)) {
+               if (value)
+                       state->key_up++;
+               else
+                       state->key_up--;
+       } else {
+               if (value)
+                       state->key_down++;
+               else
+                       state->key_down--;
+       }
+       if (state->key_down == state->key_down_target && state->key_up == 0) {
+               __pm_stay_awake(&state->combo_held_wake_source);
+               state->key_is_down = 1;
+               if (queue_delayed_work(state->wq, &state->key_down_work,
+                                                               state->delay))
+                       pr_debug("Key down work already queued!");
+       } else if (state->key_is_down) {
+               if (!cancel_delayed_work(&state->key_down_work)) {
+                       __pm_stay_awake(&state->combo_up_wake_source);
+                       queue_work(state->wq, &state->key_up_work);
+               }
+               __pm_relax(&state->combo_held_wake_source);
+               state->key_is_down = 0;
+       }
+done:
+       spin_unlock_irqrestore(&state->lock, flags);
+}
+
+static int keycombo_connect(struct input_handler *handler,
+               struct input_dev *dev,
+               const struct input_device_id *id)
+{
+       int i;
+       int ret;
+       struct input_handle *handle;
+       struct keycombo_state *state =
+               container_of(handler, struct keycombo_state, input_handler);
+       for (i = 0; i < KEY_MAX; i++) {
+               if (test_bit(i, state->keybit) && test_bit(i, dev->keybit))
+                       break;
+       }
+       if (i == KEY_MAX)
+               return -ENODEV;
+
+       handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+       if (!handle)
+               return -ENOMEM;
+
+       handle->dev = dev;
+       handle->handler = handler;
+       handle->name = KEYCOMBO_NAME;
+       handle->private = state;
+
+       ret = input_register_handle(handle);
+       if (ret)
+               goto err_input_register_handle;
+
+       ret = input_open_device(handle);
+       if (ret)
+               goto err_input_open_device;
+
+       return 0;
+
+err_input_open_device:
+       input_unregister_handle(handle);
+err_input_register_handle:
+       kfree(handle);
+       return ret;
+}
+
+static void keycombo_disconnect(struct input_handle *handle)
+{
+       input_close_device(handle);
+       input_unregister_handle(handle);
+       kfree(handle);
+}
+
+static const struct input_device_id keycombo_ids[] = {
+               {
+                               .flags = INPUT_DEVICE_ID_MATCH_EVBIT,
+                               .evbit = { BIT_MASK(EV_KEY) },
+               },
+               { },
+};
+MODULE_DEVICE_TABLE(input, keycombo_ids);
+
+static int keycombo_probe(struct platform_device *pdev)
+{
+       int ret;
+       int key, *keyp;
+       struct keycombo_state *state;
+       struct keycombo_platform_data *pdata = pdev->dev.platform_data;
+
+       if (!pdata)
+               return -EINVAL;
+
+       state = kzalloc(sizeof(*state), GFP_KERNEL);
+       if (!state)
+               return -ENOMEM;
+
+       spin_lock_init(&state->lock);
+       keyp = pdata->keys_down;
+       while ((key = *keyp++)) {
+               if (key >= KEY_MAX)
+                       continue;
+               state->key_down_target++;
+               __set_bit(key, state->keybit);
+       }
+       if (pdata->keys_up) {
+               keyp = pdata->keys_up;
+               while ((key = *keyp++)) {
+                       if (key >= KEY_MAX)
+                               continue;
+                       __set_bit(key, state->keybit);
+                       __set_bit(key, state->upbit);
+               }
+       }
+
+       state->wq = alloc_ordered_workqueue("keycombo", 0);
+       if (!state->wq)
+               return -ENOMEM;
+
+       state->priv = pdata->priv;
+
+       if (pdata->key_down_fn)
+               state->key_down_fn = pdata->key_down_fn;
+       INIT_DELAYED_WORK(&state->key_down_work, do_key_down);
+
+       if (pdata->key_up_fn)
+               state->key_up_fn = pdata->key_up_fn;
+       INIT_WORK(&state->key_up_work, do_key_up);
+
+       wakeup_source_init(&state->combo_held_wake_source, "key combo");
+       wakeup_source_init(&state->combo_up_wake_source, "key combo up");
+       state->delay = msecs_to_jiffies(pdata->key_down_delay);
+
+       state->input_handler.event = keycombo_event;
+       state->input_handler.connect = keycombo_connect;
+       state->input_handler.disconnect = keycombo_disconnect;
+       state->input_handler.name = KEYCOMBO_NAME;
+       state->input_handler.id_table = keycombo_ids;
+       ret = input_register_handler(&state->input_handler);
+       if (ret) {
+               kfree(state);
+               return ret;
+       }
+       platform_set_drvdata(pdev, state);
+       return 0;
+}
+
+int keycombo_remove(struct platform_device *pdev)
+{
+       struct keycombo_state *state = platform_get_drvdata(pdev);
+       input_unregister_handler(&state->input_handler);
+       destroy_workqueue(state->wq);
+       kfree(state);
+       return 0;
+}
+
+
+struct platform_driver keycombo_driver = {
+               .driver.name = KEYCOMBO_NAME,
+               .probe = keycombo_probe,
+               .remove = keycombo_remove,
+};
+
+static int __init keycombo_init(void)
+{
+       return platform_driver_register(&keycombo_driver);
+}
+
+static void __exit keycombo_exit(void)
+{
+       return platform_driver_unregister(&keycombo_driver);
+}
+
+module_init(keycombo_init);
+module_exit(keycombo_exit);
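keycombo is configured purely through platform data. The layout assumed by keycombo_probe() above (from include/linux/keycombo.h, which is not part of this diff) is roughly:

        struct keycombo_platform_data {
                void (*key_down_fn)(void *priv);
                void (*key_up_fn)(void *priv);
                void *priv;
                int key_down_delay;     /* ms the combo must be held */
                int *keys_up;           /* keys that must stay released, 0-terminated */
                int keys_down[];        /* keys that must all be held, 0-terminated */
        };

key_down_fn fires after key_down_delay once every key in keys_down is held and no key in keys_up is pressed; key_up_fn fires when the combo is released after key_down_fn has already run.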
diff --git a/drivers/input/keyreset.c b/drivers/input/keyreset.c
new file mode 100644 (file)
index 0000000..7e5222a
--- /dev/null
@@ -0,0 +1,144 @@
+/* drivers/input/keyreset.c
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/input.h>
+#include <linux/keyreset.h>
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/reboot.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/syscalls.h>
+#include <linux/keycombo.h>
+
+struct keyreset_state {
+       int restart_requested;
+       int (*reset_fn)(void);
+       struct platform_device *pdev_child;
+       struct work_struct restart_work;
+};
+
+static void do_restart(struct work_struct *unused)
+{
+       orderly_reboot();
+}
+
+static void do_reset_fn(void *priv)
+{
+       struct keyreset_state *state = priv;
+       if (state->restart_requested)
+               panic("keyboard reset failed, %d", state->restart_requested);
+       if (state->reset_fn) {
+               state->restart_requested = state->reset_fn();
+       } else {
+               pr_info("keyboard reset\n");
+               schedule_work(&state->restart_work);
+               state->restart_requested = 1;
+       }
+}
+
+static int keyreset_probe(struct platform_device *pdev)
+{
+       int ret = -ENOMEM;
+       struct keycombo_platform_data *pdata_child;
+       struct keyreset_platform_data *pdata = pdev->dev.platform_data;
+       int up_size = 0, down_size = 0, size;
+       int key, *keyp;
+       struct keyreset_state *state;
+
+       if (!pdata)
+               return -EINVAL;
+       state = devm_kzalloc(&pdev->dev, sizeof(*state), GFP_KERNEL);
+       if (!state)
+               return -ENOMEM;
+
+       state->pdev_child = platform_device_alloc(KEYCOMBO_NAME,
+                                                       PLATFORM_DEVID_AUTO);
+       if (!state->pdev_child)
+               return -ENOMEM;
+       state->pdev_child->dev.parent = &pdev->dev;
+       INIT_WORK(&state->restart_work, do_restart);
+
+       keyp = pdata->keys_down;
+       while ((key = *keyp++)) {
+               if (key >= KEY_MAX)
+                       continue;
+               down_size++;
+       }
+       if (pdata->keys_up) {
+               keyp = pdata->keys_up;
+               while ((key = *keyp++)) {
+                       if (key >= KEY_MAX)
+                               continue;
+                       up_size++;
+               }
+       }
+       size = sizeof(struct keycombo_platform_data)
+                       + sizeof(int) * (down_size + 1);
+       pdata_child = devm_kzalloc(&pdev->dev, size, GFP_KERNEL);
+       if (!pdata_child)
+               goto error;
+       memcpy(pdata_child->keys_down, pdata->keys_down,
+                                               sizeof(int) * down_size);
+       if (up_size > 0) {
+               pdata_child->keys_up = devm_kzalloc(&pdev->dev, up_size + 1,
+                                                               GFP_KERNEL);
+               if (!pdata_child->keys_up)
+                       goto error;
+               memcpy(pdata_child->keys_up, pdata->keys_up,
+                                                       sizeof(int) * up_size);
+               if (!pdata_child->keys_up)
+                       goto error;
+       }
+       state->reset_fn = pdata->reset_fn;
+       pdata_child->key_down_fn = do_reset_fn;
+       pdata_child->priv = state;
+       pdata_child->key_down_delay = pdata->key_down_delay;
+       ret = platform_device_add_data(state->pdev_child, pdata_child, size);
+       if (ret)
+               goto error;
+       platform_set_drvdata(pdev, state);
+       return platform_device_add(state->pdev_child);
+error:
+       platform_device_put(state->pdev_child);
+       return ret;
+}
+
+int keyreset_remove(struct platform_device *pdev)
+{
+       struct keyreset_state *state = platform_get_drvdata(pdev);
+       platform_device_put(state->pdev_child);
+       return 0;
+}
+
+
+struct platform_driver keyreset_driver = {
+       .driver.name = KEYRESET_NAME,
+       .probe = keyreset_probe,
+       .remove = keyreset_remove,
+};
+
+static int __init keyreset_init(void)
+{
+       return platform_driver_register(&keyreset_driver);
+}
+
+static void __exit keyreset_exit(void)
+{
+       return platform_driver_unregister(&keyreset_driver);
+}
+
+module_init(keyreset_init);
+module_exit(keyreset_exit);
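keyreset registers a child keycombo device to do the actual key tracking and, by default, schedules orderly_reboot() once the combo has been held for key_down_delay. The optional reset_fn hook lets a platform substitute its own reset path; from the logic in do_reset_fn() above, its contract is roughly:

        /* Illustrative only: a platform-supplied hook assigned to pdata->reset_fn. */
        static int example_reset_fn(void)
        {
                /* return 0 if the reset was handled; non-zero marks it as pending/failed */
                return 0;
        }

A non-zero return (or the default orderly_reboot() path) is remembered in restart_requested, and a further combo press while a restart is still pending triggers panic().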
index 9f082a388388b8aa542905d5c58b126c70b99cfb..4b269b3326366223e95859354155c5408d066046 100644 (file)
@@ -367,6 +367,17 @@ config INPUT_ATI_REMOTE2
          To compile this driver as a module, choose M here: the module will be
          called ati_remote2.
 
+config INPUT_KEYCHORD
+       tristate "Key chord input driver support"
+       help
+         Say Y here if you want to enable the key chord driver
+         accessible at /dev/keychord.  This driver can be used
+         for receiving notifications when client-specified key
+         combinations are pressed.
+
+         To compile this driver as a module, choose M here: the
+         module will be called keychord.
+
 config INPUT_KEYSPAN_REMOTE
        tristate "Keyspan DMR USB remote control"
        depends on USB_ARCH_HAS_HCD
@@ -535,6 +546,11 @@ config INPUT_SGI_BTNS
          To compile this driver as a module, choose M here: the
          module will be called sgi_btns.
 
+config INPUT_GPIO
+       tristate "GPIO driver support"
+       help
+         Say Y here if you want to support GPIO-based keys, wheels, etc.
+
 config HP_SDC_RTC
        tristate "HP SDC Real Time Clock"
        depends on (GSC || HP300) && SERIO
index 4b6118d313fe78458c40e6885f3e9590b8276ccf..2ecb6869e4b656f9d7f286f565bf652bd23cee52 100644 (file)
@@ -38,10 +38,12 @@ obj-$(CONFIG_INPUT_GP2A)            += gp2ap002a00f.o
 obj-$(CONFIG_INPUT_GPIO_BEEPER)                += gpio-beeper.o
 obj-$(CONFIG_INPUT_GPIO_TILT_POLLED)   += gpio_tilt_polled.o
 obj-$(CONFIG_INPUT_GPIO_DECODER)       += gpio_decoder.o
+obj-$(CONFIG_INPUT_GPIO)               += gpio_event.o gpio_matrix.o gpio_input.o gpio_output.o gpio_axis.o
 obj-$(CONFIG_INPUT_HISI_POWERKEY)      += hisi_powerkey.o
 obj-$(CONFIG_HP_SDC_RTC)               += hp_sdc_rtc.o
 obj-$(CONFIG_INPUT_IMS_PCU)            += ims-pcu.o
 obj-$(CONFIG_INPUT_IXP4XX_BEEPER)      += ixp4xx-beeper.o
+obj-$(CONFIG_INPUT_KEYCHORD)           += keychord.o
 obj-$(CONFIG_INPUT_KEYSPAN_REMOTE)     += keyspan_remote.o
 obj-$(CONFIG_INPUT_KXTJ9)              += kxtj9.o
 obj-$(CONFIG_INPUT_M68K_BEEP)          += m68kspkr.o
diff --git a/drivers/input/misc/gpio_axis.c b/drivers/input/misc/gpio_axis.c
new file mode 100644 (file)
index 0000000..0acf4a5
--- /dev/null
@@ -0,0 +1,192 @@
+/* drivers/input/misc/gpio_axis.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct gpio_axis_state {
+       struct gpio_event_input_devs *input_devs;
+       struct gpio_event_axis_info *info;
+       uint32_t pos;
+};
+
+uint16_t gpio_axis_4bit_gray_map_table[] = {
+       [0x0] = 0x0, [0x1] = 0x1, /* 0000 0001 */
+       [0x3] = 0x2, [0x2] = 0x3, /* 0011 0010 */
+       [0x6] = 0x4, [0x7] = 0x5, /* 0110 0111 */
+       [0x5] = 0x6, [0x4] = 0x7, /* 0101 0100 */
+       [0xc] = 0x8, [0xd] = 0x9, /* 1100 1101 */
+       [0xf] = 0xa, [0xe] = 0xb, /* 1111 1110 */
+       [0xa] = 0xc, [0xb] = 0xd, /* 1010 1011 */
+       [0x9] = 0xe, [0x8] = 0xf, /* 1001 1000 */
+};
+uint16_t gpio_axis_4bit_gray_map(struct gpio_event_axis_info *info, uint16_t in)
+{
+       return gpio_axis_4bit_gray_map_table[in];
+}
+
+uint16_t gpio_axis_5bit_singletrack_map_table[] = {
+       [0x10] = 0x00, [0x14] = 0x01, [0x1c] = 0x02, /*     10000 10100 11100 */
+       [0x1e] = 0x03, [0x1a] = 0x04, [0x18] = 0x05, /*     11110 11010 11000 */
+       [0x08] = 0x06, [0x0a] = 0x07, [0x0e] = 0x08, /*    01000 01010 01110  */
+       [0x0f] = 0x09, [0x0d] = 0x0a, [0x0c] = 0x0b, /*    01111 01101 01100  */
+       [0x04] = 0x0c, [0x05] = 0x0d, [0x07] = 0x0e, /*   00100 00101 00111   */
+       [0x17] = 0x0f, [0x16] = 0x10, [0x06] = 0x11, /*   10111 10110 00110   */
+       [0x02] = 0x12, [0x12] = 0x13, [0x13] = 0x14, /*  00010 10010 10011    */
+       [0x1b] = 0x15, [0x0b] = 0x16, [0x03] = 0x17, /*  11011 01011 00011    */
+       [0x01] = 0x18, [0x09] = 0x19, [0x19] = 0x1a, /* 00001 01001 11001     */
+       [0x1d] = 0x1b, [0x15] = 0x1c, [0x11] = 0x1d, /* 11101 10101 10001     */
+};
+uint16_t gpio_axis_5bit_singletrack_map(
+       struct gpio_event_axis_info *info, uint16_t in)
+{
+       return gpio_axis_5bit_singletrack_map_table[in];
+}
+
+static void gpio_event_update_axis(struct gpio_axis_state *as, int report)
+{
+       struct gpio_event_axis_info *ai = as->info;
+       int i;
+       int change;
+       uint16_t state = 0;
+       uint16_t pos;
+       uint16_t old_pos = as->pos;
+       for (i = ai->count - 1; i >= 0; i--)
+               state = (state << 1) | gpio_get_value(ai->gpio[i]);
+       pos = ai->map(ai, state);
+       if (ai->flags & GPIOEAF_PRINT_RAW)
+               pr_info("axis %d-%d raw %x, pos %d -> %d\n",
+                       ai->type, ai->code, state, old_pos, pos);
+       if (report && pos != old_pos) {
+               if (ai->type == EV_REL) {
+                       change = (ai->decoded_size + pos - old_pos) %
+                                 ai->decoded_size;
+                       if (change > ai->decoded_size / 2)
+                               change -= ai->decoded_size;
+                       if (change == ai->decoded_size / 2) {
+                               if (ai->flags & GPIOEAF_PRINT_EVENT)
+                                       pr_info("axis %d-%d unknown direction, "
+                                               "pos %d -> %d\n", ai->type,
+                                               ai->code, old_pos, pos);
+                               change = 0; /* no closest direction */
+                       }
+                       if (ai->flags & GPIOEAF_PRINT_EVENT)
+                               pr_info("axis %d-%d change %d\n",
+                                       ai->type, ai->code, change);
+                       input_report_rel(as->input_devs->dev[ai->dev],
+                                               ai->code, change);
+               } else {
+                       if (ai->flags & GPIOEAF_PRINT_EVENT)
+                               pr_info("axis %d-%d now %d\n",
+                                       ai->type, ai->code, pos);
+                       input_event(as->input_devs->dev[ai->dev],
+                                       ai->type, ai->code, pos);
+               }
+               input_sync(as->input_devs->dev[ai->dev]);
+       }
+       as->pos = pos;
+}
+
+static irqreturn_t gpio_axis_irq_handler(int irq, void *dev_id)
+{
+       struct gpio_axis_state *as = dev_id;
+       gpio_event_update_axis(as, 1);
+       return IRQ_HANDLED;
+}
+
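+/*
+ * Sub-driver entry point: SUSPEND/RESUME toggle the per-GPIO interrupts,
+ * INIT allocates the axis state, claims each GPIO and requests a
+ * rising+falling edge IRQ on it, and the tail of the function (shared by
+ * UNINIT and the error labels) releases everything in reverse order.
+ */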
+int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+                        struct gpio_event_info *info, void **data, int func)
+{
+       int ret;
+       int i;
+       int irq;
+       struct gpio_event_axis_info *ai;
+       struct gpio_axis_state *as;
+
+       ai = container_of(info, struct gpio_event_axis_info, info);
+       if (func == GPIO_EVENT_FUNC_SUSPEND) {
+               for (i = 0; i < ai->count; i++)
+                       disable_irq(gpio_to_irq(ai->gpio[i]));
+               return 0;
+       }
+       if (func == GPIO_EVENT_FUNC_RESUME) {
+               for (i = 0; i < ai->count; i++)
+                       enable_irq(gpio_to_irq(ai->gpio[i]));
+               return 0;
+       }
+
+       if (func == GPIO_EVENT_FUNC_INIT) {
+               *data = as = kmalloc(sizeof(*as), GFP_KERNEL);
+               if (as == NULL) {
+                       ret = -ENOMEM;
+                       goto err_alloc_axis_state_failed;
+               }
+               as->input_devs = input_devs;
+               as->info = ai;
+               if (ai->dev >= input_devs->count) {
+                       pr_err("gpio_event_axis: bad device index %d >= %d "
+                               "for %d:%d\n", ai->dev, input_devs->count,
+                               ai->type, ai->code);
+                       ret = -EINVAL;
+                       goto err_bad_device_index;
+               }
+
+               input_set_capability(input_devs->dev[ai->dev],
+                                    ai->type, ai->code);
+               if (ai->type == EV_ABS) {
+                       input_set_abs_params(input_devs->dev[ai->dev], ai->code,
+                                            0, ai->decoded_size - 1, 0, 0);
+               }
+               for (i = 0; i < ai->count; i++) {
+                       ret = gpio_request(ai->gpio[i], "gpio_event_axis");
+                       if (ret < 0)
+                               goto err_request_gpio_failed;
+                       ret = gpio_direction_input(ai->gpio[i]);
+                       if (ret < 0)
+                               goto err_gpio_direction_input_failed;
+                       ret = irq = gpio_to_irq(ai->gpio[i]);
+                       if (ret < 0)
+                               goto err_get_irq_num_failed;
+                       ret = request_irq(irq, gpio_axis_irq_handler,
+                                         IRQF_TRIGGER_RISING |
+                                         IRQF_TRIGGER_FALLING,
+                                         "gpio_event_axis", as);
+                       if (ret < 0)
+                               goto err_request_irq_failed;
+               }
+               gpio_event_update_axis(as, 0);
+               return 0;
+       }
+
+       ret = 0;
+       as = *data;
+       for (i = ai->count - 1; i >= 0; i--) {
+               free_irq(gpio_to_irq(ai->gpio[i]), as);
+err_request_irq_failed:
+err_get_irq_num_failed:
+err_gpio_direction_input_failed:
+               gpio_free(ai->gpio[i]);
+err_request_gpio_failed:
+               ;
+       }
+err_bad_device_index:
+       kfree(as);
+       *data = NULL;
+err_alloc_axis_state_failed:
+       return ret;
+}
diff --git a/drivers/input/misc/gpio_event.c b/drivers/input/misc/gpio_event.c
new file mode 100644 (file)
index 0000000..90f07eb
--- /dev/null
@@ -0,0 +1,228 @@
+/* drivers/input/misc/gpio_event.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/input.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/platform_device.h>
+#include <linux/slab.h>
+
+struct gpio_event {
+       struct gpio_event_input_devs *input_devs;
+       const struct gpio_event_platform_data *info;
+       void *state[0];
+};
+
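+/*
+ * ->event() handler shared by every input device created by this driver:
+ * find which device the event was written to and pass it on to each
+ * sub-driver (gpio_event_info) that registered an event callback.
+ */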
+static int gpio_input_event(
+       struct input_dev *dev, unsigned int type, unsigned int code, int value)
+{
+       int i;
+       int devnr;
+       int ret = 0;
+       int tmp_ret;
+       struct gpio_event_info **ii;
+       struct gpio_event *ip = input_get_drvdata(dev);
+
+       for (devnr = 0; devnr < ip->input_devs->count; devnr++)
+               if (ip->input_devs->dev[devnr] == dev)
+                       break;
+       if (devnr == ip->input_devs->count) {
+               pr_err("gpio_input_event: unknown device %p\n", dev);
+               return -EIO;
+       }
+
+       for (i = 0, ii = ip->info->info; i < ip->info->info_count; i++, ii++) {
+               if ((*ii)->event) {
+                       tmp_ret = (*ii)->event(ip->input_devs, *ii,
+                                               &ip->state[i],
+                                               devnr, type, code, value);
+                       if (tmp_ret)
+                               ret = tmp_ret;
+               }
+       }
+       return ret;
+}
+
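+/*
+ * Call ->func() on every configured gpio_event_info.  INIT and RESUME walk
+ * the list forwards; UNINIT and SUSPEND (and the error unwind, which enters
+ * the second loop through the goto labels) walk it backwards so sub-drivers
+ * are torn down in reverse order.  Entries marked no_suspend are skipped for
+ * suspend and resume.
+ */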
+static int gpio_event_call_all_func(struct gpio_event *ip, int func)
+{
+       int i;
+       int ret;
+       struct gpio_event_info **ii;
+
+       if (func == GPIO_EVENT_FUNC_INIT || func == GPIO_EVENT_FUNC_RESUME) {
+               ii = ip->info->info;
+               for (i = 0; i < ip->info->info_count; i++, ii++) {
+                       if ((*ii)->func == NULL) {
+                               ret = -ENODEV;
+                               pr_err("gpio_event_probe: Incomplete pdata, "
+                                       "no function\n");
+                               goto err_no_func;
+                       }
+                       if (func == GPIO_EVENT_FUNC_RESUME && (*ii)->no_suspend)
+                               continue;
+                       ret = (*ii)->func(ip->input_devs, *ii, &ip->state[i],
+                                         func);
+                       if (ret) {
+                               pr_err("gpio_event_probe: function failed\n");
+                               goto err_func_failed;
+                       }
+               }
+               return 0;
+       }
+
+       ret = 0;
+       i = ip->info->info_count;
+       ii = ip->info->info + i;
+       while (i > 0) {
+               i--;
+               ii--;
+               if ((func & ~1) == GPIO_EVENT_FUNC_SUSPEND && (*ii)->no_suspend)
+                       continue;
+               (*ii)->func(ip->input_devs, *ii, &ip->state[i], func & ~1);
+err_func_failed:
+err_no_func:
+               ;
+       }
+       return ret;
+}
+
+static void __maybe_unused gpio_event_suspend(struct gpio_event *ip)
+{
+       gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_SUSPEND);
+       if (ip->info->power)
+               ip->info->power(ip->info, 0);
+}
+
+static void __maybe_unused gpio_event_resume(struct gpio_event *ip)
+{
+       if (ip->info->power)
+               ip->info->power(ip->info, 1);
+       gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_RESUME);
+}
+
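+/*
+ * Probe path.  A board file is expected to register a platform device named
+ * GPIO_EVENT_DEV_NAME whose platform_data is a struct gpio_event_platform_data
+ * carrying a device name (or a NULL-terminated names[] list for several input
+ * devices), an info[]/info_count array of sub-driver descriptors and an
+ * optional power() callback; the field names here are inferred from their use
+ * below, the full definitions live in include/linux/gpio_event.h.
+ */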
+static int gpio_event_probe(struct platform_device *pdev)
+{
+       int err;
+       struct gpio_event *ip;
+       struct gpio_event_platform_data *event_info;
+       int dev_count = 1;
+       int i;
+       int registered = 0;
+
+       event_info = pdev->dev.platform_data;
+       if (event_info == NULL) {
+               pr_err("gpio_event_probe: No pdata\n");
+               return -ENODEV;
+       }
+       if ((!event_info->name && !event_info->names[0]) ||
+           !event_info->info || !event_info->info_count) {
+               pr_err("gpio_event_probe: Incomplete pdata\n");
+               return -ENODEV;
+       }
+       if (!event_info->name)
+               while (event_info->names[dev_count])
+                       dev_count++;
+       ip = kzalloc(sizeof(*ip) +
+                    sizeof(ip->state[0]) * event_info->info_count +
+                    sizeof(*ip->input_devs) +
+                    sizeof(ip->input_devs->dev[0]) * dev_count, GFP_KERNEL);
+       if (ip == NULL) {
+               err = -ENOMEM;
+               pr_err("gpio_event_probe: Failed to allocate private data\n");
+               goto err_kp_alloc_failed;
+       }
+       ip->input_devs = (void *)&ip->state[event_info->info_count];
+       platform_set_drvdata(pdev, ip);
+
+       for (i = 0; i < dev_count; i++) {
+               struct input_dev *input_dev = input_allocate_device();
+               if (input_dev == NULL) {
+                       err = -ENOMEM;
+                       pr_err("gpio_event_probe: "
+                               "Failed to allocate input device\n");
+                       goto err_input_dev_alloc_failed;
+               }
+               input_set_drvdata(input_dev, ip);
+               input_dev->name = event_info->name ?
+                                       event_info->name : event_info->names[i];
+               input_dev->event = gpio_input_event;
+               ip->input_devs->dev[i] = input_dev;
+       }
+       ip->input_devs->count = dev_count;
+       ip->info = event_info;
+       if (event_info->power)
+               ip->info->power(ip->info, 1);
+
+       err = gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_INIT);
+       if (err)
+               goto err_call_all_func_failed;
+
+       for (i = 0; i < dev_count; i++) {
+               err = input_register_device(ip->input_devs->dev[i]);
+               if (err) {
+                       pr_err("gpio_event_probe: Unable to register %s "
+                               "input device\n", ip->input_devs->dev[i]->name);
+                       goto err_input_register_device_failed;
+               }
+               registered++;
+       }
+
+       return 0;
+
+err_input_register_device_failed:
+       gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+err_call_all_func_failed:
+       if (event_info->power)
+               ip->info->power(ip->info, 0);
+       for (i = 0; i < registered; i++)
+               input_unregister_device(ip->input_devs->dev[i]);
+       for (i = dev_count - 1; i >= registered; i--) {
+               input_free_device(ip->input_devs->dev[i]);
+err_input_dev_alloc_failed:
+               ;
+       }
+       kfree(ip);
+err_kp_alloc_failed:
+       return err;
+}
+
+static int gpio_event_remove(struct platform_device *pdev)
+{
+       struct gpio_event *ip = platform_get_drvdata(pdev);
+       int i;
+
+       gpio_event_call_all_func(ip, GPIO_EVENT_FUNC_UNINIT);
+       if (ip->info->power)
+               ip->info->power(ip->info, 0);
+       for (i = 0; i < ip->input_devs->count; i++)
+               input_unregister_device(ip->input_devs->dev[i]);
+       kfree(ip);
+       return 0;
+}
+
+static struct platform_driver gpio_event_driver = {
+       .probe          = gpio_event_probe,
+       .remove         = gpio_event_remove,
+       .driver         = {
+               .name   = GPIO_EVENT_DEV_NAME,
+       },
+};
+
+module_platform_driver(gpio_event_driver);
+
+MODULE_DESCRIPTION("GPIO Event Driver");
+MODULE_LICENSE("GPL");
+
diff --git a/drivers/input/misc/gpio_input.c b/drivers/input/misc/gpio_input.c
new file mode 100644 (file)
index 0000000..5875d73
--- /dev/null
@@ -0,0 +1,390 @@
+/* drivers/input/misc/gpio_input.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/input.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+#include <linux/pm_wakeup.h>
+
+enum {
+       DEBOUNCE_UNSTABLE     = BIT(0), /* Got irq, while debouncing */
+       DEBOUNCE_PRESSED      = BIT(1),
+       DEBOUNCE_NOTPRESSED   = BIT(2),
+       DEBOUNCE_WAIT_IRQ     = BIT(3), /* Stable irq state */
+       DEBOUNCE_POLL         = BIT(4), /* Stable polling state */
+
+       DEBOUNCE_UNKNOWN =
+               DEBOUNCE_PRESSED | DEBOUNCE_NOTPRESSED,
+};
+
+struct gpio_key_state {
+       struct gpio_input_state *ds;
+       uint8_t debounce;
+};
+
+struct gpio_input_state {
+       struct gpio_event_input_devs *input_devs;
+       const struct gpio_event_input_info *info;
+       struct hrtimer timer;
+       int use_irq;
+       int debounce_count;
+       spinlock_t irq_lock;
+       struct wakeup_source *ws;
+       struct gpio_key_state key_state[0];
+};
+
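+/*
+ * Debounce/poll timer.  Every key not parked in DEBOUNCE_WAIT_IRQ is
+ * sampled: keys still bouncing are given another debounce period, and a key
+ * that reads the same value on two consecutive passes is reported to the
+ * input core and then parked on its IRQ (or left in the polling set).  The
+ * timer re-arms while any key is being debounced, or unconditionally in
+ * polling mode; otherwise the wakeup source is released.
+ */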
+static enum hrtimer_restart gpio_event_input_timer_func(struct hrtimer *timer)
+{
+       int i;
+       int pressed;
+       struct gpio_input_state *ds =
+               container_of(timer, struct gpio_input_state, timer);
+       unsigned gpio_flags = ds->info->flags;
+       unsigned npolarity;
+       int nkeys = ds->info->keymap_size;
+       const struct gpio_event_direct_entry *key_entry;
+       struct gpio_key_state *key_state;
+       unsigned long irqflags;
+       uint8_t debounce;
+       bool sync_needed;
+
+       key_entry = ds->info->keymap;
+       key_state = ds->key_state;
+       sync_needed = false;
+       spin_lock_irqsave(&ds->irq_lock, irqflags);
+       for (i = 0; i < nkeys; i++, key_entry++, key_state++) {
+               debounce = key_state->debounce;
+               if (debounce & DEBOUNCE_WAIT_IRQ)
+                       continue;
+               if (key_state->debounce & DEBOUNCE_UNSTABLE) {
+                       debounce = key_state->debounce = DEBOUNCE_UNKNOWN;
+                       enable_irq(gpio_to_irq(key_entry->gpio));
+                       if (gpio_flags & GPIOEDF_PRINT_KEY_UNSTABLE)
+                               pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+                                       "(%d) continue debounce\n",
+                                       ds->info->type, key_entry->code,
+                                       i, key_entry->gpio);
+               }
+               npolarity = !(gpio_flags & GPIOEDF_ACTIVE_HIGH);
+               pressed = gpio_get_value(key_entry->gpio) ^ npolarity;
+               if (debounce & DEBOUNCE_POLL) {
+                       if (pressed == !(debounce & DEBOUNCE_PRESSED)) {
+                               ds->debounce_count++;
+                               key_state->debounce = DEBOUNCE_UNKNOWN;
+                               if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+                                       pr_info("gpio_keys_scan_keys: key %x-"
+                                               "%x, %d (%d) start debounce\n",
+                                               ds->info->type, key_entry->code,
+                                               i, key_entry->gpio);
+                       }
+                       continue;
+               }
+               if (pressed && (debounce & DEBOUNCE_NOTPRESSED)) {
+                       if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+                               pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+                                       "(%d) debounce pressed 1\n",
+                                       ds->info->type, key_entry->code,
+                                       i, key_entry->gpio);
+                       key_state->debounce = DEBOUNCE_PRESSED;
+                       continue;
+               }
+               if (!pressed && (debounce & DEBOUNCE_PRESSED)) {
+                       if (gpio_flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+                               pr_info("gpio_keys_scan_keys: key %x-%x, %d "
+                                       "(%d) debounce pressed 0\n",
+                                       ds->info->type, key_entry->code,
+                                       i, key_entry->gpio);
+                       key_state->debounce = DEBOUNCE_NOTPRESSED;
+                       continue;
+               }
+               /* key is stable */
+               ds->debounce_count--;
+               if (ds->use_irq)
+                       key_state->debounce |= DEBOUNCE_WAIT_IRQ;
+               else
+                       key_state->debounce |= DEBOUNCE_POLL;
+               if (gpio_flags & GPIOEDF_PRINT_KEYS)
+                       pr_info("gpio_keys_scan_keys: key %x-%x, %d (%d) "
+                               "changed to %d\n", ds->info->type,
+                               key_entry->code, i, key_entry->gpio, pressed);
+               input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+                           key_entry->code, pressed);
+               sync_needed = true;
+       }
+       if (sync_needed) {
+               for (i = 0; i < ds->input_devs->count; i++)
+                       input_sync(ds->input_devs->dev[i]);
+       }
+
+       if (ds->debounce_count)
+               hrtimer_start(timer, ds->info->debounce_time, HRTIMER_MODE_REL);
+       else if (!ds->use_irq)
+               hrtimer_start(timer, ds->info->poll_time, HRTIMER_MODE_REL);
+       else
+               __pm_relax(ds->ws);
+
+       spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+       return HRTIMER_NORESTART;
+}
+
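+/*
+ * Per-key interrupt handler.  With debouncing enabled the interrupt hands
+ * the key over to the debounce timer (taking the wakeup source when the
+ * first key starts debouncing, and masking the IRQ again if it fires while
+ * the key is still unstable); without debouncing the new state is read and
+ * reported directly from interrupt context.
+ */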
+static irqreturn_t gpio_event_input_irq_handler(int irq, void *dev_id)
+{
+       struct gpio_key_state *ks = dev_id;
+       struct gpio_input_state *ds = ks->ds;
+       int keymap_index = ks - ds->key_state;
+       const struct gpio_event_direct_entry *key_entry;
+       unsigned long irqflags;
+       int pressed;
+
+       if (!ds->use_irq)
+               return IRQ_HANDLED;
+
+       key_entry = &ds->info->keymap[keymap_index];
+
+       if (ds->info->debounce_time) {
+               spin_lock_irqsave(&ds->irq_lock, irqflags);
+               if (ks->debounce & DEBOUNCE_WAIT_IRQ) {
+                       ks->debounce = DEBOUNCE_UNKNOWN;
+                       if (ds->debounce_count++ == 0) {
+                               __pm_stay_awake(ds->ws);
+                               hrtimer_start(
+                                       &ds->timer, ds->info->debounce_time,
+                                       HRTIMER_MODE_REL);
+                       }
+                       if (ds->info->flags & GPIOEDF_PRINT_KEY_DEBOUNCE)
+                               pr_info("gpio_event_input_irq_handler: "
+                                       "key %x-%x, %d (%d) start debounce\n",
+                                       ds->info->type, key_entry->code,
+                                       keymap_index, key_entry->gpio);
+               } else {
+                       disable_irq_nosync(irq);
+                       ks->debounce = DEBOUNCE_UNSTABLE;
+               }
+               spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+       } else {
+               pressed = gpio_get_value(key_entry->gpio) ^
+                       !(ds->info->flags & GPIOEDF_ACTIVE_HIGH);
+               if (ds->info->flags & GPIOEDF_PRINT_KEYS)
+                       pr_info("gpio_event_input_irq_handler: key %x-%x, %d "
+                               "(%d) changed to %d\n",
+                               ds->info->type, key_entry->code, keymap_index,
+                               key_entry->gpio, pressed);
+               input_event(ds->input_devs->dev[key_entry->dev], ds->info->type,
+                           key_entry->code, pressed);
+               input_sync(ds->input_devs->dev[key_entry->dev]);
+       }
+       return IRQ_HANDLED;
+}
+
+static int gpio_event_input_request_irqs(struct gpio_input_state *ds)
+{
+       int i;
+       int err;
+       unsigned int irq;
+       unsigned long req_flags = IRQF_TRIGGER_RISING | IRQF_TRIGGER_FALLING;
+
+       for (i = 0; i < ds->info->keymap_size; i++) {
+               err = irq = gpio_to_irq(ds->info->keymap[i].gpio);
+               if (err < 0)
+                       goto err_gpio_get_irq_num_failed;
+               err = request_irq(irq, gpio_event_input_irq_handler,
+                                 req_flags, "gpio_keys", &ds->key_state[i]);
+               if (err) {
+                       pr_err("gpio_event_input_request_irqs: request_irq "
+                               "failed for input %d, irq %d\n",
+                               ds->info->keymap[i].gpio, irq);
+                       goto err_request_irq_failed;
+               }
+               if (ds->info->info.no_suspend) {
+                       err = enable_irq_wake(irq);
+                       if (err) {
+                               pr_err("gpio_event_input_request_irqs: "
+                                       "enable_irq_wake failed for input %d, "
+                                       "irq %d\n",
+                                       ds->info->keymap[i].gpio, irq);
+                               goto err_enable_irq_wake_failed;
+                       }
+               }
+       }
+       return 0;
+
+       for (i = ds->info->keymap_size - 1; i >= 0; i--) {
+               irq = gpio_to_irq(ds->info->keymap[i].gpio);
+               if (ds->info->info.no_suspend)
+                       disable_irq_wake(irq);
+err_enable_irq_wake_failed:
+               free_irq(irq, &ds->key_state[i]);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+               ;
+       }
+       return err;
+}
+
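+/*
+ * Direct-input sub-driver entry point: SUSPEND masks the key IRQs (when in
+ * interrupt mode) and cancels the timer, RESUME unmasks them and kicks the
+ * timer, INIT allocates the state, registers a wakeup source, claims the key
+ * GPIOs and falls back to polling if requesting the interrupts fails.  The
+ * tail of the function handles UNINIT and the error unwind.
+ */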
+int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data, int func)
+{
+       int ret;
+       int i;
+       unsigned long irqflags;
+       struct gpio_event_input_info *di;
+       struct gpio_input_state *ds = *data;
+       char *wlname;
+
+       di = container_of(info, struct gpio_event_input_info, info);
+
+       if (func == GPIO_EVENT_FUNC_SUSPEND) {
+               if (ds->use_irq)
+                       for (i = 0; i < di->keymap_size; i++)
+                               disable_irq(gpio_to_irq(di->keymap[i].gpio));
+               hrtimer_cancel(&ds->timer);
+               return 0;
+       }
+       if (func == GPIO_EVENT_FUNC_RESUME) {
+               spin_lock_irqsave(&ds->irq_lock, irqflags);
+               if (ds->use_irq)
+                       for (i = 0; i < di->keymap_size; i++)
+                               enable_irq(gpio_to_irq(di->keymap[i].gpio));
+               hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+               spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+               return 0;
+       }
+
+       if (func == GPIO_EVENT_FUNC_INIT) {
+               if (ktime_to_ns(di->poll_time) <= 0)
+                       di->poll_time = ktime_set(0, 20 * NSEC_PER_MSEC);
+
+               *data = ds = kzalloc(sizeof(*ds) + sizeof(ds->key_state[0]) *
+                                       di->keymap_size, GFP_KERNEL);
+               if (ds == NULL) {
+                       ret = -ENOMEM;
+                       pr_err("gpio_event_input_func: "
+                               "Failed to allocate private data\n");
+                       goto err_ds_alloc_failed;
+               }
+               ds->debounce_count = di->keymap_size;
+               ds->input_devs = input_devs;
+               ds->info = di;
+               wlname = kasprintf(GFP_KERNEL, "gpio_input:%s%s",
+                                  input_devs->dev[0]->name,
+                                  (input_devs->count > 1) ? "..." : "");
+
+               ds->ws = wakeup_source_register(wlname);
+               kfree(wlname);
+               if (!ds->ws) {
+                       ret = -ENOMEM;
+                       pr_err("gpio_event_input_func: "
+                               "Failed to allocate wakeup source\n");
+                       goto err_ws_failed;
+               }
+
+               spin_lock_init(&ds->irq_lock);
+
+               for (i = 0; i < di->keymap_size; i++) {
+                       int dev = di->keymap[i].dev;
+                       if (dev >= input_devs->count) {
+                               pr_err("gpio_event_input_func: bad device "
+                                       "index %d >= %d for key code %d\n",
+                                       dev, input_devs->count,
+                                       di->keymap[i].code);
+                               ret = -EINVAL;
+                               goto err_bad_keymap;
+                       }
+                       input_set_capability(input_devs->dev[dev], di->type,
+                                            di->keymap[i].code);
+                       ds->key_state[i].ds = ds;
+                       ds->key_state[i].debounce = DEBOUNCE_UNKNOWN;
+               }
+
+               for (i = 0; i < di->keymap_size; i++) {
+                       ret = gpio_request(di->keymap[i].gpio, "gpio_kp_in");
+                       if (ret) {
+                               pr_err("gpio_event_input_func: gpio_request "
+                                       "failed for %d\n", di->keymap[i].gpio);
+                               goto err_gpio_request_failed;
+                       }
+                       ret = gpio_direction_input(di->keymap[i].gpio);
+                       if (ret) {
+                               pr_err("gpio_event_input_func: "
+                                       "gpio_direction_input failed for %d\n",
+                                       di->keymap[i].gpio);
+                               goto err_gpio_configure_failed;
+                       }
+               }
+
+               ret = gpio_event_input_request_irqs(ds);
+
+               spin_lock_irqsave(&ds->irq_lock, irqflags);
+               ds->use_irq = ret == 0;
+
+               pr_info("GPIO Input Driver: Start gpio inputs for %s%s in %s "
+                       "mode\n", input_devs->dev[0]->name,
+                       (input_devs->count > 1) ? "..." : "",
+                       ret == 0 ? "interrupt" : "polling");
+
+               hrtimer_init(&ds->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+               ds->timer.function = gpio_event_input_timer_func;
+               hrtimer_start(&ds->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+               spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+               return 0;
+       }
+
+       ret = 0;
+       spin_lock_irqsave(&ds->irq_lock, irqflags);
+       hrtimer_cancel(&ds->timer);
+       if (ds->use_irq) {
+               for (i = di->keymap_size - 1; i >= 0; i--) {
+                       int irq = gpio_to_irq(di->keymap[i].gpio);
+                       if (ds->info->info.no_suspend)
+                               disable_irq_wake(irq);
+                       free_irq(irq, &ds->key_state[i]);
+               }
+       }
+       spin_unlock_irqrestore(&ds->irq_lock, irqflags);
+
+       for (i = di->keymap_size - 1; i >= 0; i--) {
+err_gpio_configure_failed:
+               gpio_free(di->keymap[i].gpio);
+err_gpio_request_failed:
+               ;
+       }
+err_bad_keymap:
+       wakeup_source_unregister(ds->ws);
+err_ws_failed:
+       kfree(ds);
+err_ds_alloc_failed:
+       return ret;
+}
diff --git a/drivers/input/misc/gpio_matrix.c b/drivers/input/misc/gpio_matrix.c
new file mode 100644 (file)
index 0000000..08769dd
--- /dev/null
@@ -0,0 +1,440 @@
+/* drivers/input/misc/gpio_matrix.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+#include <linux/hrtimer.h>
+#include <linux/interrupt.h>
+#include <linux/slab.h>
+
+struct gpio_kp {
+       struct gpio_event_input_devs *input_devs;
+       struct gpio_event_matrix_info *keypad_info;
+       struct hrtimer timer;
+       struct wakeup_source wake_src;
+       int current_output;
+       unsigned int use_irq:1;
+       unsigned int key_state_changed:1;
+       unsigned int last_key_state_changed:1;
+       unsigned int some_keys_pressed:2;
+       unsigned int disabled_irq:1;
+       unsigned long keys_pressed[0];
+};
+
+static void clear_phantom_key(struct gpio_kp *kp, int out, int in)
+{
+       struct gpio_event_matrix_info *mi = kp->keypad_info;
+       int key_index = out * mi->ninputs + in;
+       unsigned short keyentry = mi->keymap[key_index];
+       unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+       unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+       if (!test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+               if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+                       pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+                               "cleared\n", keycode, out, in,
+                               mi->output_gpios[out], mi->input_gpios[in]);
+               __clear_bit(key_index, kp->keys_pressed);
+       } else {
+               if (mi->flags & GPIOKPF_PRINT_PHANTOM_KEYS)
+                       pr_info("gpiomatrix: phantom key %x, %d-%d (%d-%d) "
+                               "not cleared\n", keycode, out, in,
+                               mi->output_gpios[out], mi->input_gpios[in]);
+       }
+}
+
+static int restore_keys_for_input(struct gpio_kp *kp, int out, int in)
+{
+       int rv = 0;
+       int key_index;
+
+       key_index = out * kp->keypad_info->ninputs + in;
+       while (out < kp->keypad_info->noutputs) {
+               if (test_bit(key_index, kp->keys_pressed)) {
+                       rv = 1;
+                       clear_phantom_key(kp, out, in);
+               }
+               key_index += kp->keypad_info->ninputs;
+               out++;
+       }
+       return rv;
+}
+
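+/*
+ * Matrix keypads without per-key diodes can report "ghost" keys: once three
+ * keys forming three corners of a rectangle in the matrix are down, the
+ * fourth corner also reads as pressed.  When three or more keys appear
+ * pressed, clear the suspect crossings that the input core has not already
+ * seen as real key-down events (see clear_phantom_key()).
+ */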
+static void remove_phantom_keys(struct gpio_kp *kp)
+{
+       int out, in, inp;
+       int key_index;
+
+       if (kp->some_keys_pressed < 3)
+               return;
+
+       for (out = 0; out < kp->keypad_info->noutputs; out++) {
+               inp = -1;
+               key_index = out * kp->keypad_info->ninputs;
+               for (in = 0; in < kp->keypad_info->ninputs; in++, key_index++) {
+                       if (test_bit(key_index, kp->keys_pressed)) {
+                               if (inp == -1) {
+                                       inp = in;
+                                       continue;
+                               }
+                               if (inp >= 0) {
+                                       if (!restore_keys_for_input(kp, out + 1,
+                                                                       inp))
+                                               break;
+                                       clear_phantom_key(kp, out, inp);
+                                       inp = -2;
+                               }
+                               restore_keys_for_input(kp, out, in);
+                       }
+               }
+       }
+}
+
+static void report_key(struct gpio_kp *kp, int key_index, int out, int in)
+{
+       struct gpio_event_matrix_info *mi = kp->keypad_info;
+       int pressed = test_bit(key_index, kp->keys_pressed);
+       unsigned short keyentry = mi->keymap[key_index];
+       unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+       unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+
+       if (pressed != test_bit(keycode, kp->input_devs->dev[dev]->key)) {
+               if (keycode == KEY_RESERVED) {
+                       if (mi->flags & GPIOKPF_PRINT_UNMAPPED_KEYS)
+                               pr_info("gpiomatrix: unmapped key, %d-%d "
+                                       "(%d-%d) changed to %d\n",
+                                       out, in, mi->output_gpios[out],
+                                       mi->input_gpios[in], pressed);
+               } else {
+                       if (mi->flags & GPIOKPF_PRINT_MAPPED_KEYS)
+                               pr_info("gpiomatrix: key %x, %d-%d (%d-%d) "
+                                       "changed to %d\n", keycode,
+                                       out, in, mi->output_gpios[out],
+                                       mi->input_gpios[in], pressed);
+                       input_report_key(kp->input_devs->dev[dev], keycode, pressed);
+               }
+       }
+}
+
+static void report_sync(struct gpio_kp *kp)
+{
+       int i;
+
+       for (i = 0; i < kp->input_devs->count; i++)
+               input_sync(kp->input_devs->dev[i]);
+}
+
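+/*
+ * Keypad scan state machine.  Each run samples the input lines for the row
+ * driven on the previous run, then drives the next output row active and
+ * waits settle_time.  After the last row it optionally runs an extra
+ * debounce pass, reports any key changes, and either re-arms the poll timer
+ * (while keys are down or in polling mode) or drives all outputs active,
+ * re-enables the input IRQs and releases the wakeup source.
+ */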
+static enum hrtimer_restart gpio_keypad_timer_func(struct hrtimer *timer)
+{
+       int out, in;
+       int key_index;
+       int gpio;
+       struct gpio_kp *kp = container_of(timer, struct gpio_kp, timer);
+       struct gpio_event_matrix_info *mi = kp->keypad_info;
+       unsigned gpio_keypad_flags = mi->flags;
+       unsigned polarity = !!(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH);
+
+       out = kp->current_output;
+       if (out == mi->noutputs) {
+               out = 0;
+               kp->last_key_state_changed = kp->key_state_changed;
+               kp->key_state_changed = 0;
+               kp->some_keys_pressed = 0;
+       } else {
+               key_index = out * mi->ninputs;
+               for (in = 0; in < mi->ninputs; in++, key_index++) {
+                       gpio = mi->input_gpios[in];
+                       if (gpio_get_value(gpio) ^ !polarity) {
+                               if (kp->some_keys_pressed < 3)
+                                       kp->some_keys_pressed++;
+                               kp->key_state_changed |= !__test_and_set_bit(
+                                               key_index, kp->keys_pressed);
+                       } else
+                               kp->key_state_changed |= __test_and_clear_bit(
+                                               key_index, kp->keys_pressed);
+               }
+               gpio = mi->output_gpios[out];
+               if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+                       gpio_set_value(gpio, !polarity);
+               else
+                       gpio_direction_input(gpio);
+               out++;
+       }
+       kp->current_output = out;
+       if (out < mi->noutputs) {
+               gpio = mi->output_gpios[out];
+               if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+                       gpio_set_value(gpio, polarity);
+               else
+                       gpio_direction_output(gpio, polarity);
+               hrtimer_start(timer, mi->settle_time, HRTIMER_MODE_REL);
+               return HRTIMER_NORESTART;
+       }
+       if (gpio_keypad_flags & GPIOKPF_DEBOUNCE) {
+               if (kp->key_state_changed) {
+                       hrtimer_start(&kp->timer, mi->debounce_delay,
+                                     HRTIMER_MODE_REL);
+                       return HRTIMER_NORESTART;
+               }
+               kp->key_state_changed = kp->last_key_state_changed;
+       }
+       if (kp->key_state_changed) {
+               if (gpio_keypad_flags & GPIOKPF_REMOVE_SOME_PHANTOM_KEYS)
+                       remove_phantom_keys(kp);
+               key_index = 0;
+               for (out = 0; out < mi->noutputs; out++)
+                       for (in = 0; in < mi->ninputs; in++, key_index++)
+                               report_key(kp, key_index, out, in);
+               report_sync(kp);
+       }
+       if (!kp->use_irq || kp->some_keys_pressed) {
+               hrtimer_start(timer, mi->poll_time, HRTIMER_MODE_REL);
+               return HRTIMER_NORESTART;
+       }
+
+       /* No keys are pressed, reenable interrupt */
+       for (out = 0; out < mi->noutputs; out++) {
+               if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+                       gpio_set_value(mi->output_gpios[out], polarity);
+               else
+                       gpio_direction_output(mi->output_gpios[out], polarity);
+       }
+       for (in = 0; in < mi->ninputs; in++)
+               enable_irq(gpio_to_irq(mi->input_gpios[in]));
+       __pm_relax(&kp->wake_src);
+       return HRTIMER_NORESTART;
+}
+
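+/*
+ * Any input-line interrupt masks all input IRQs, parks the output lines
+ * (all of which are driven active while idle so that any key press raises an
+ * interrupt), takes the wakeup source and kicks off a scan.  Interrupts that
+ * arrive before the handler is fully set up are only recorded via
+ * disabled_irq.
+ */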
+static irqreturn_t gpio_keypad_irq_handler(int irq_in, void *dev_id)
+{
+       int i;
+       struct gpio_kp *kp = dev_id;
+       struct gpio_event_matrix_info *mi = kp->keypad_info;
+       unsigned gpio_keypad_flags = mi->flags;
+
+       if (!kp->use_irq) {
+               /* ignore interrupt while registering the handler */
+               kp->disabled_irq = 1;
+               disable_irq_nosync(irq_in);
+               return IRQ_HANDLED;
+       }
+
+       for (i = 0; i < mi->ninputs; i++)
+               disable_irq_nosync(gpio_to_irq(mi->input_gpios[i]));
+       for (i = 0; i < mi->noutputs; i++) {
+               if (gpio_keypad_flags & GPIOKPF_DRIVE_INACTIVE)
+                       gpio_set_value(mi->output_gpios[i],
+                               !(gpio_keypad_flags & GPIOKPF_ACTIVE_HIGH));
+               else
+                       gpio_direction_input(mi->output_gpios[i]);
+       }
+       __pm_stay_awake(&kp->wake_src);
+       hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+       return IRQ_HANDLED;
+}
+
+static int gpio_keypad_request_irqs(struct gpio_kp *kp)
+{
+       int i;
+       int err;
+       unsigned int irq;
+       unsigned long request_flags;
+       struct gpio_event_matrix_info *mi = kp->keypad_info;
+
+       switch (mi->flags & (GPIOKPF_ACTIVE_HIGH|GPIOKPF_LEVEL_TRIGGERED_IRQ)) {
+       default:
+               request_flags = IRQF_TRIGGER_FALLING;
+               break;
+       case GPIOKPF_ACTIVE_HIGH:
+               request_flags = IRQF_TRIGGER_RISING;
+               break;
+       case GPIOKPF_LEVEL_TRIGGERED_IRQ:
+               request_flags = IRQF_TRIGGER_LOW;
+               break;
+       case GPIOKPF_LEVEL_TRIGGERED_IRQ | GPIOKPF_ACTIVE_HIGH:
+               request_flags = IRQF_TRIGGER_HIGH;
+               break;
+       }
+
+       for (i = 0; i < mi->ninputs; i++) {
+               err = irq = gpio_to_irq(mi->input_gpios[i]);
+               if (err < 0)
+                       goto err_gpio_get_irq_num_failed;
+               err = request_irq(irq, gpio_keypad_irq_handler, request_flags,
+                                 "gpio_kp", kp);
+               if (err) {
+                       pr_err("gpiomatrix: request_irq failed for input %d, "
+                               "irq %d\n", mi->input_gpios[i], irq);
+                       goto err_request_irq_failed;
+               }
+               err = enable_irq_wake(irq);
+               if (err) {
+                       pr_err("gpiomatrix: set_irq_wake failed for input %d, "
+                               "irq %d\n", mi->input_gpios[i], irq);
+               }
+               disable_irq(irq);
+               if (kp->disabled_irq) {
+                       kp->disabled_irq = 0;
+                       enable_irq(irq);
+               }
+       }
+       return 0;
+
+       for (i = mi->ninputs - 1; i >= 0; i--) {
+               free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+err_request_irq_failed:
+err_gpio_get_irq_num_failed:
+               ;
+       }
+       return err;
+}
+
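+/*
+ * Matrix sub-driver entry point: SUSPEND/RESUME are currently no-ops (see
+ * the TODO), INIT validates the platform data, allocates the scan state and
+ * keys_pressed bitmap, claims the output and input GPIOs, and starts the
+ * scan timer in interrupt or polling mode.  The tail of the function handles
+ * UNINIT and the error unwind.
+ */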
+int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+       struct gpio_event_info *info, void **data, int func)
+{
+       int i;
+       int err;
+       int key_count;
+       struct gpio_kp *kp;
+       struct gpio_event_matrix_info *mi;
+
+       mi = container_of(info, struct gpio_event_matrix_info, info);
+       if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME) {
+               /* TODO: disable scanning */
+               return 0;
+       }
+
+       if (func == GPIO_EVENT_FUNC_INIT) {
+               if (mi->keymap == NULL ||
+                  mi->input_gpios == NULL ||
+                  mi->output_gpios == NULL) {
+                       err = -ENODEV;
+                       pr_err("gpiomatrix: Incomplete pdata\n");
+                       goto err_invalid_platform_data;
+               }
+               key_count = mi->ninputs * mi->noutputs;
+
+               *data = kp = kzalloc(sizeof(*kp) + sizeof(kp->keys_pressed[0]) *
+                                    BITS_TO_LONGS(key_count), GFP_KERNEL);
+               if (kp == NULL) {
+                       err = -ENOMEM;
+                       pr_err("gpiomatrix: Failed to allocate private data\n");
+                       goto err_kp_alloc_failed;
+               }
+               kp->input_devs = input_devs;
+               kp->keypad_info = mi;
+               for (i = 0; i < key_count; i++) {
+                       unsigned short keyentry = mi->keymap[i];
+                       unsigned short keycode = keyentry & MATRIX_KEY_MASK;
+                       unsigned short dev = keyentry >> MATRIX_CODE_BITS;
+                       if (dev >= input_devs->count) {
+                               pr_err("gpiomatrix: bad device index %d >= "
+                                       "%d for key code %d\n",
+                                       dev, input_devs->count, keycode);
+                               err = -EINVAL;
+                               goto err_bad_keymap;
+                       }
+                       if (keycode && keycode <= KEY_MAX)
+                               input_set_capability(input_devs->dev[dev],
+                                                       EV_KEY, keycode);
+               }
+
+               for (i = 0; i < mi->noutputs; i++) {
+                       err = gpio_request(mi->output_gpios[i], "gpio_kp_out");
+                       if (err) {
+                               pr_err("gpiomatrix: gpio_request failed for "
+                                       "output %d\n", mi->output_gpios[i]);
+                               goto err_request_output_gpio_failed;
+                       }
+                       if (gpio_cansleep(mi->output_gpios[i])) {
+                               pr_err("gpiomatrix: unsupported output gpio %d,"
+                                       " can sleep\n", mi->output_gpios[i]);
+                               err = -EINVAL;
+                               goto err_output_gpio_configure_failed;
+                       }
+                       if (mi->flags & GPIOKPF_DRIVE_INACTIVE)
+                               err = gpio_direction_output(mi->output_gpios[i],
+                                       !(mi->flags & GPIOKPF_ACTIVE_HIGH));
+                       else
+                               err = gpio_direction_input(mi->output_gpios[i]);
+                       if (err) {
+                               pr_err("gpiomatrix: gpio_configure failed for "
+                                       "output %d\n", mi->output_gpios[i]);
+                               goto err_output_gpio_configure_failed;
+                       }
+               }
+               for (i = 0; i < mi->ninputs; i++) {
+                       err = gpio_request(mi->input_gpios[i], "gpio_kp_in");
+                       if (err) {
+                               pr_err("gpiomatrix: gpio_request failed for "
+                                       "input %d\n", mi->input_gpios[i]);
+                               goto err_request_input_gpio_failed;
+                       }
+                       err = gpio_direction_input(mi->input_gpios[i]);
+                       if (err) {
+                               pr_err("gpiomatrix: gpio_direction_input failed"
+                                       " for input %d\n", mi->input_gpios[i]);
+                               goto err_gpio_direction_input_failed;
+                       }
+               }
+               kp->current_output = mi->noutputs;
+               kp->key_state_changed = 1;
+
+               hrtimer_init(&kp->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+               kp->timer.function = gpio_keypad_timer_func;
+               wakeup_source_init(&kp->wake_src, "gpio_kp");
+               err = gpio_keypad_request_irqs(kp);
+               kp->use_irq = err == 0;
+
+               pr_info("GPIO Matrix Keypad Driver: Start keypad matrix for "
+                       "%s%s in %s mode\n", input_devs->dev[0]->name,
+                       (input_devs->count > 1) ? "..." : "",
+                       kp->use_irq ? "interrupt" : "polling");
+
+               if (kp->use_irq)
+                       __pm_stay_awake(&kp->wake_src);
+               hrtimer_start(&kp->timer, ktime_set(0, 0), HRTIMER_MODE_REL);
+
+               return 0;
+       }
+
+       err = 0;
+       kp = *data;
+
+       if (kp->use_irq)
+               for (i = mi->ninputs - 1; i >= 0; i--)
+                       free_irq(gpio_to_irq(mi->input_gpios[i]), kp);
+
+       hrtimer_cancel(&kp->timer);
+       wakeup_source_trash(&kp->wake_src);
+       for (i = mi->ninputs - 1; i >= 0; i--) {
+err_gpio_direction_input_failed:
+               gpio_free(mi->input_gpios[i]);
+err_request_input_gpio_failed:
+               ;
+       }
+       for (i = mi->noutputs - 1; i >= 0; i--) {
+err_output_gpio_configure_failed:
+               gpio_free(mi->output_gpios[i]);
+err_request_output_gpio_failed:
+               ;
+       }
+err_bad_keymap:
+       kfree(kp);
+err_kp_alloc_failed:
+err_invalid_platform_data:
+       return err;
+}
diff --git a/drivers/input/misc/gpio_output.c b/drivers/input/misc/gpio_output.c
new file mode 100644 (file)
index 0000000..2aac2fa
--- /dev/null
@@ -0,0 +1,97 @@
+/* drivers/input/misc/gpio_output.c
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/gpio.h>
+#include <linux/gpio_event.h>
+
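+/*
+ * Mirror input events of the configured type onto the mapped GPIOs,
+ * inverting the value for active-low outputs.
+ */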
+int gpio_event_output_event(
+       struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+       void **data, unsigned int dev, unsigned int type,
+       unsigned int code, int value)
+{
+       int i;
+       struct gpio_event_output_info *oi;
+       oi = container_of(info, struct gpio_event_output_info, info);
+       if (type != oi->type)
+               return 0;
+       if (!(oi->flags & GPIOEDF_ACTIVE_HIGH))
+               value = !value;
+       for (i = 0; i < oi->keymap_size; i++)
+               if (dev == oi->keymap[i].dev && code == oi->keymap[i].code)
+                       gpio_set_value(oi->keymap[i].gpio, value);
+       return 0;
+}
+
+int gpio_event_output_func(
+       struct gpio_event_input_devs *input_devs, struct gpio_event_info *info,
+       void **data, int func)
+{
+       int ret;
+       int i;
+       struct gpio_event_output_info *oi;
+       oi = container_of(info, struct gpio_event_output_info, info);
+
+       if (func == GPIO_EVENT_FUNC_SUSPEND || func == GPIO_EVENT_FUNC_RESUME)
+               return 0;
+
+       if (func == GPIO_EVENT_FUNC_INIT) {
+               int output_level = !(oi->flags & GPIOEDF_ACTIVE_HIGH);
+
+               for (i = 0; i < oi->keymap_size; i++) {
+                       int dev = oi->keymap[i].dev;
+                       if (dev >= input_devs->count) {
+                               pr_err("gpio_event_output_func: bad device "
+                                       "index %d >= %d for key code %d\n",
+                                       dev, input_devs->count,
+                                       oi->keymap[i].code);
+                               ret = -EINVAL;
+                               goto err_bad_keymap;
+                       }
+                       input_set_capability(input_devs->dev[dev], oi->type,
+                                            oi->keymap[i].code);
+               }
+
+               for (i = 0; i < oi->keymap_size; i++) {
+                       ret = gpio_request(oi->keymap[i].gpio,
+                                          "gpio_event_output");
+                       if (ret) {
+                               pr_err("gpio_event_output_func: gpio_request "
+                                       "failed for %d\n", oi->keymap[i].gpio);
+                               goto err_gpio_request_failed;
+                       }
+                       ret = gpio_direction_output(oi->keymap[i].gpio,
+                                                   output_level);
+                       if (ret) {
+                               pr_err("gpio_event_output_func: "
+                                       "gpio_direction_output failed for %d\n",
+                                       oi->keymap[i].gpio);
+                               goto err_gpio_direction_output_failed;
+                       }
+               }
+               return 0;
+       }
+
+       ret = 0;
+       for (i = oi->keymap_size - 1; i >= 0; i--) {
+err_gpio_direction_output_failed:
+               gpio_free(oi->keymap[i].gpio);
+err_gpio_request_failed:
+               ;
+       }
+err_bad_keymap:
+       return ret;
+}
+
diff --git a/drivers/input/misc/keychord.c b/drivers/input/misc/keychord.c
new file mode 100644 (file)
index 0000000..b09ecf7
--- /dev/null
@@ -0,0 +1,467 @@
+/*
+ *  drivers/input/misc/keychord.c
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/poll.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/keychord.h>
+#include <linux/sched.h>
+
+#define KEYCHORD_NAME          "keychord"
+#define BUFFER_SIZE                    16
+
+MODULE_AUTHOR("Mike Lockwood <lockwood@android.com>");
+MODULE_DESCRIPTION("Key chord input driver");
+MODULE_SUPPORTED_DEVICE("keychord");
+MODULE_LICENSE("GPL");
+
+#define NEXT_KEYCHORD(kc) ((struct input_keychord *) \
+               ((char *)kc + sizeof(struct input_keychord) + \
+               kc->count * sizeof(kc->keycodes[0])))
+
+struct keychord_device {
+       struct input_handler    input_handler;
+       int                     registered;
+
+       /* list of keychords to monitor */
+       struct input_keychord   *keychords;
+       int                     keychord_count;
+
+       /* bitmask of keys contained in our keychords */
+       unsigned long keybit[BITS_TO_LONGS(KEY_CNT)];
+       /* current state of the keys */
+       unsigned long keystate[BITS_TO_LONGS(KEY_CNT)];
+       /* number of keys that are currently pressed */
+       int key_down;
+
+       /* second input_device_id is needed for null termination */
+       struct input_device_id  device_ids[2];
+
+       spinlock_t              lock;
+       wait_queue_head_t       waitq;
+       unsigned char           head;
+       unsigned char           tail;
+       __u16                   buff[BUFFER_SIZE];
+       /* Bit to serialize writes to this device */
+#define KEYCHORD_BUSY                  0x01
+       unsigned long           flags;
+       wait_queue_head_t       write_waitq;
+};
+
+static int check_keychord(struct keychord_device *kdev,
+               struct input_keychord *keychord)
+{
+       int i;
+
+       if (keychord->count != kdev->key_down)
+               return 0;
+
+       for (i = 0; i < keychord->count; i++) {
+               if (!test_bit(keychord->keycodes[i], kdev->keystate))
+                       return 0;
+       }
+
+       /* we have a match */
+       return 1;
+}
+
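+/*
+ * Input core callback: track the state of every monitored key and, when a
+ * key press completes one of the configured chords (see check_keychord()),
+ * queue that chord's id in the circular buffer and wake up any readers.
+ */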
+static void keychord_event(struct input_handle *handle, unsigned int type,
+                          unsigned int code, int value)
+{
+       struct keychord_device *kdev = handle->private;
+       struct input_keychord *keychord;
+       unsigned long flags;
+       int i, got_chord = 0;
+
+       if (type != EV_KEY || code >= KEY_MAX)
+               return;
+
+       spin_lock_irqsave(&kdev->lock, flags);
+       /* do nothing if key state did not change */
+       if (!test_bit(code, kdev->keystate) == !value)
+               goto done;
+       __change_bit(code, kdev->keystate);
+       if (value)
+               kdev->key_down++;
+       else
+               kdev->key_down--;
+
+       /* don't notify on key up */
+       if (!value)
+               goto done;
+       /* ignore this event if it is not one of the keys we are monitoring */
+       if (!test_bit(code, kdev->keybit))
+               goto done;
+
+       keychord = kdev->keychords;
+       if (!keychord)
+               goto done;
+
+       /* check to see if the keyboard state matches any keychords */
+       for (i = 0; i < kdev->keychord_count; i++) {
+               if (check_keychord(kdev, keychord)) {
+                       kdev->buff[kdev->head] = keychord->id;
+                       kdev->head = (kdev->head + 1) % BUFFER_SIZE;
+                       got_chord = 1;
+                       break;
+               }
+               /* skip to next keychord */
+               keychord = NEXT_KEYCHORD(keychord);
+       }
+
+done:
+       spin_unlock_irqrestore(&kdev->lock, flags);
+
+       if (got_chord) {
+               pr_info("keychord: got keychord id %d. Any tasks: %d\n",
+                       keychord->id,
+                       !list_empty_careful(&kdev->waitq.head));
+               wake_up_interruptible(&kdev->waitq);
+       }
+}
+
+static int keychord_connect(struct input_handler *handler,
+                                         struct input_dev *dev,
+                                         const struct input_device_id *id)
+{
+       int i, ret;
+       struct input_handle *handle;
+       struct keychord_device *kdev =
+               container_of(handler, struct keychord_device, input_handler);
+
+       /*
+        * ignore this input device if it does not contain any keycodes
+        * that we are monitoring
+        */
+       for (i = 0; i < KEY_MAX; i++) {
+               if (test_bit(i, kdev->keybit) && test_bit(i, dev->keybit))
+                       break;
+       }
+       if (i == KEY_MAX)
+               return -ENODEV;
+
+       handle = kzalloc(sizeof(*handle), GFP_KERNEL);
+       if (!handle)
+               return -ENOMEM;
+
+       handle->dev = dev;
+       handle->handler = handler;
+       handle->name = KEYCHORD_NAME;
+       handle->private = kdev;
+
+       ret = input_register_handle(handle);
+       if (ret)
+               goto err_input_register_handle;
+
+       ret = input_open_device(handle);
+       if (ret)
+               goto err_input_open_device;
+
+       pr_info("keychord: using input dev %s for fevent\n", dev->name);
+       return 0;
+
+err_input_open_device:
+       input_unregister_handle(handle);
+err_input_register_handle:
+       kfree(handle);
+       return ret;
+}
+
+static void keychord_disconnect(struct input_handle *handle)
+{
+       input_close_device(handle);
+       input_unregister_handle(handle);
+       kfree(handle);
+}
+
+/*
+ * keychord_read is used to read keychord events from the driver
+ */
+static ssize_t keychord_read(struct file *file, char __user *buffer,
+               size_t count, loff_t *ppos)
+{
+       struct keychord_device *kdev = file->private_data;
+       __u16   id;
+       int retval;
+       unsigned long flags;
+
+       if (count < sizeof(id))
+               return -EINVAL;
+       count = sizeof(id);
+
+       if (kdev->head == kdev->tail && (file->f_flags & O_NONBLOCK))
+               return -EAGAIN;
+
+       retval = wait_event_interruptible(kdev->waitq,
+                       kdev->head != kdev->tail);
+       if (retval)
+               return retval;
+
+       spin_lock_irqsave(&kdev->lock, flags);
+       /* pop a keychord ID off the queue */
+       id = kdev->buff[kdev->tail];
+       kdev->tail = (kdev->tail + 1) % BUFFER_SIZE;
+       spin_unlock_irqrestore(&kdev->lock, flags);
+
+       if (copy_to_user(buffer, &id, count))
+               return -EFAULT;
+
+       return count;
+}
+
+/*
+ * Serializes writes on a device. A mutex with mutex_lock_interruptible()
+ * could be used for this particular use case as well; it is a matter of
+ * preference.
+ */
+static int
+keychord_write_lock(struct keychord_device *kdev)
+{
+       int ret;
+       unsigned long flags;
+
+       spin_lock_irqsave(&kdev->lock, flags);
+       while (kdev->flags & KEYCHORD_BUSY) {
+               spin_unlock_irqrestore(&kdev->lock, flags);
+               ret = wait_event_interruptible(kdev->write_waitq,
+                              ((kdev->flags & KEYCHORD_BUSY) == 0));
+               if (ret)
+                       return ret;
+               spin_lock_irqsave(&kdev->lock, flags);
+       }
+       kdev->flags |= KEYCHORD_BUSY;
+       spin_unlock_irqrestore(&kdev->lock, flags);
+       return 0;
+}
+
+static void
+keychord_write_unlock(struct keychord_device *kdev)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&kdev->lock, flags);
+       kdev->flags &= ~KEYCHORD_BUSY;
+       spin_unlock_irqrestore(&kdev->lock, flags);
+       wake_up_interruptible(&kdev->write_waitq);
+}
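
For comparison, a minimal hypothetical sketch of the mutex-based alternative that the comment before keychord_write_lock() alludes to. This is not part of the patch; a real implementation would embed the mutex in struct keychord_device rather than use a file-scope one.

#include <linux/mutex.h>

/* Hypothetical mutex-based equivalent of the KEYCHORD_BUSY flag lock above. */
static DEFINE_MUTEX(keychord_write_mutex);

static int keychord_write_lock_alt(struct keychord_device *kdev)
{
	/* Returns -EINTR if a signal arrives while waiting, 0 on success. */
	return mutex_lock_interruptible(&keychord_write_mutex);
}

static void keychord_write_unlock_alt(struct keychord_device *kdev)
{
	mutex_unlock(&keychord_write_mutex);
}
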
+
+/*
+ * keychord_write is used to configure the driver
+ */
+static ssize_t keychord_write(struct file *file, const char __user *buffer,
+               size_t count, loff_t *ppos)
+{
+       struct keychord_device *kdev = file->private_data;
+       struct input_keychord *keychords = NULL;
+       struct input_keychord *keychord;
+       int ret, i, key;
+       unsigned long flags;
+       size_t resid = count;
+       size_t key_bytes;
+
+       if (count < sizeof(struct input_keychord))
+               return -EINVAL;
+       keychords = kzalloc(count, GFP_KERNEL);
+       if (!keychords)
+               return -ENOMEM;
+
+       /* read list of keychords from userspace */
+       if (copy_from_user(keychords, buffer, count)) {
+               kfree(keychords);
+               return -EFAULT;
+       }
+
+       /*
+        * Serialize writes to this device to prevent various races.
+        * 1) writers racing here could do duplicate input_unregister_handler()
+        *    calls, resulting in attempting to unlink a node from a list that
+        *    does not exist.
+        * 2) writers racing here could do duplicate input_register_handler() calls
+        *    below, resulting in a duplicate insertion of a node into the list.
+        * 3) a double kfree of keychords can occur (in the event that
+        *    input_register_handler() fails below).
+        */
+       ret = keychord_write_lock(kdev);
+       if (ret) {
+               kfree(keychords);
+               return ret;
+       }
+
+       /* unregister handler before changing configuration */
+       if (kdev->registered) {
+               input_unregister_handler(&kdev->input_handler);
+               kdev->registered = 0;
+       }
+
+       spin_lock_irqsave(&kdev->lock, flags);
+       /* clear any existing configuration */
+       kfree(kdev->keychords);
+       kdev->keychords = NULL;
+       kdev->keychord_count = 0;
+       kdev->key_down = 0;
+       memset(kdev->keybit, 0, sizeof(kdev->keybit));
+       memset(kdev->keystate, 0, sizeof(kdev->keystate));
+       kdev->head = kdev->tail = 0;
+
+       keychord = keychords;
+
+       while (resid > 0) {
+               /* Is the entire keychord entry header present ? */
+               if (resid < sizeof(struct input_keychord)) {
+                       pr_err("keychord: Insufficient bytes present for header %zu\n",
+                              resid);
+                       goto err_unlock_return;
+               }
+               resid -= sizeof(struct input_keychord);
+               if (keychord->count <= 0) {
+                       pr_err("keychord: invalid keycode count %d\n",
+                               keychord->count);
+                       goto err_unlock_return;
+               }
+               key_bytes = keychord->count * sizeof(keychord->keycodes[0]);
+               /* Do we have all the expected keycodes ? */
+               if (resid < key_bytes) {
+                       pr_err("keychord: Insufficient bytes present for keycount %zu\n",
+                              resid);
+                       goto err_unlock_return;
+               }
+               resid -= key_bytes;
+
+               if (keychord->version != KEYCHORD_VERSION) {
+                       pr_err("keychord: unsupported version %d\n",
+                               keychord->version);
+                       goto err_unlock_return;
+               }
+
+               /* keep track of the keys we are monitoring in keybit */
+               for (i = 0; i < keychord->count; i++) {
+                       key = keychord->keycodes[i];
+                       if (key < 0 || key >= KEY_CNT) {
+                               pr_err("keychord: keycode %d out of range\n",
+                                       key);
+                               goto err_unlock_return;
+                       }
+                       __set_bit(key, kdev->keybit);
+               }
+
+               kdev->keychord_count++;
+               keychord = NEXT_KEYCHORD(keychord);
+       }
+
+       kdev->keychords = keychords;
+       spin_unlock_irqrestore(&kdev->lock, flags);
+
+       ret = input_register_handler(&kdev->input_handler);
+       if (ret) {
+               kfree(keychords);
+               kdev->keychords = NULL;
+               keychord_write_unlock(kdev);
+               return ret;
+       }
+       kdev->registered = 1;
+
+       keychord_write_unlock(kdev);
+
+       return count;
+
+err_unlock_return:
+       spin_unlock_irqrestore(&kdev->lock, flags);
+       kfree(keychords);
+       keychord_write_unlock(kdev);
+       return -EINVAL;
+}
+
+static unsigned int keychord_poll(struct file *file, poll_table *wait)
+{
+       struct keychord_device *kdev = file->private_data;
+
+       poll_wait(file, &kdev->waitq, wait);
+
+       if (kdev->head != kdev->tail)
+               return POLLIN | POLLRDNORM;
+
+       return 0;
+}
+
+static int keychord_open(struct inode *inode, struct file *file)
+{
+       struct keychord_device *kdev;
+
+       kdev = kzalloc(sizeof(struct keychord_device), GFP_KERNEL);
+       if (!kdev)
+               return -ENOMEM;
+
+       spin_lock_init(&kdev->lock);
+       init_waitqueue_head(&kdev->waitq);
+       init_waitqueue_head(&kdev->write_waitq);
+
+       kdev->input_handler.event = keychord_event;
+       kdev->input_handler.connect = keychord_connect;
+       kdev->input_handler.disconnect = keychord_disconnect;
+       kdev->input_handler.name = KEYCHORD_NAME;
+       kdev->input_handler.id_table = kdev->device_ids;
+
+       kdev->device_ids[0].flags = INPUT_DEVICE_ID_MATCH_EVBIT;
+       __set_bit(EV_KEY, kdev->device_ids[0].evbit);
+
+       file->private_data = kdev;
+
+       return 0;
+}
+
+static int keychord_release(struct inode *inode, struct file *file)
+{
+       struct keychord_device *kdev = file->private_data;
+
+       if (kdev->registered)
+               input_unregister_handler(&kdev->input_handler);
+       kfree(kdev->keychords);
+       kfree(kdev);
+
+       return 0;
+}
+
+static const struct file_operations keychord_fops = {
+       .owner          = THIS_MODULE,
+       .open           = keychord_open,
+       .release        = keychord_release,
+       .read           = keychord_read,
+       .write          = keychord_write,
+       .poll           = keychord_poll,
+};
+
+static struct miscdevice keychord_misc = {
+       .fops           = &keychord_fops,
+       .name           = KEYCHORD_NAME,
+       .minor          = MISC_DYNAMIC_MINOR,
+};
+
+static int __init keychord_init(void)
+{
+       return misc_register(&keychord_misc);
+}
+
+static void __exit keychord_exit(void)
+{
+       misc_deregister(&keychord_misc);
+}
+
+module_init(keychord_init);
+module_exit(keychord_exit);
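
For context, a hypothetical userspace sketch of driving this interface (not part of this patch): a single write() installs the whole keychord configuration and read() blocks until a chord fires, returning its id. The struct layout used here (version, id, count, keycodes[]) and the version value of 1 are assumptions based on the driver's uapi header.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>
#include <linux/input.h>

int main(void)
{
	/* version 1, id 42, two keycodes: VOLUMEDOWN + POWER (assumed layout) */
	uint16_t chord[] = { 1, 42, 2, KEY_VOLUMEDOWN, KEY_POWER };
	uint16_t id;
	int fd = open("/dev/keychord", O_RDWR);

	if (fd < 0)
		return 1;
	/* one write replaces the whole configuration */
	if (write(fd, chord, sizeof(chord)) != sizeof(chord))
		return 1;
	/* read() returns the id of the chord that matched */
	if (read(fd, &id, sizeof(id)) == sizeof(id))
		printf("keychord %u fired\n", id);
	close(fd);
	return 0;
}
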
index 4a249ee86364c488ba31d2c660e690379e85e1f6..6ac0297db28c7e2b8d6d84725fe3dcf60d41c122 100644 (file)
@@ -460,6 +460,21 @@ config DM_VERITY
 
          If unsure, say N.
 
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+       bool "Prefetch size 128"
+
+config DM_VERITY_HASH_PREFETCH_MIN_SIZE
+       int "Verity hash prefetch minimum size"
+       depends on DM_VERITY
+       range 1 4096
+       default 128 if DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+       default 1
+       ---help---
+         This sets the minimum number of hash blocks to prefetch for dm-verity.
+         For devices like eMMC, a larger prefetch size such as 128 can improve
+         performance, at the cost of increased memory consumption for keeping
+         more hashes in RAM.
+
 config DM_VERITY_FEC
        bool "Verity forward error correction support"
        depends on DM_VERITY
@@ -540,4 +555,21 @@ config DM_ZONED
 
          If unsure, say N.
 
+config DM_ANDROID_VERITY
+       bool "Android verity target support"
+       depends on DM_VERITY
+       depends on X509_CERTIFICATE_PARSER
+       depends on SYSTEM_TRUSTED_KEYRING
+       depends on PUBLIC_KEY_ALGO_RSA
+       depends on KEYS
+       depends on ASYMMETRIC_KEY_TYPE
+       depends on ASYMMETRIC_PUBLIC_KEY_SUBTYPE
+       select DM_VERITY_HASH_PREFETCH_MIN_SIZE_128
+       ---help---
+         This device-mapper target is virtually a VERITY target. This
+         target is set up by reading the metadata contents piggybacked
+         onto the actual data blocks in the block device. The signature
+         of the metadata contents is verified against the key included
+         in the system keyring. Upon success, the underlying verity
+         target is set up.
 endif # MD
index e94b6f9be941428890424902c5a8a6b56e13e58c..83109ad30a4897eaa5d069bfb5996ed305533e4c 100644 (file)
@@ -71,3 +71,7 @@ endif
 ifeq ($(CONFIG_DM_VERITY_FEC),y)
 dm-verity-objs                 += dm-verity-fec.o
 endif
+
+ifeq ($(CONFIG_DM_ANDROID_VERITY),y)
+dm-verity-objs                 += dm-android-verity.o
+endif
diff --git a/drivers/md/dm-android-verity.c b/drivers/md/dm-android-verity.c
new file mode 100644 (file)
index 0000000..0dd6924
--- /dev/null
@@ -0,0 +1,949 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/buffer_head.h>
+#include <linux/debugfs.h>
+#include <linux/delay.h>
+#include <linux/device.h>
+#include <linux/device-mapper.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/fcntl.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/key.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/of.h>
+#include <linux/reboot.h>
+#include <linux/string.h>
+#include <linux/vmalloc.h>
+
+#include <asm/setup.h>
+#include <crypto/hash.h>
+#include <crypto/public_key.h>
+#include <crypto/sha.h>
+#include <keys/asymmetric-type.h>
+#include <keys/system_keyring.h>
+
+#include "dm-verity.h"
+#include "dm-android-verity.h"
+
+static char verifiedbootstate[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritymode[VERITY_COMMANDLINE_PARAM_LENGTH];
+static char veritykeyid[VERITY_DEFAULT_KEY_ID_LENGTH];
+static char buildvariant[BUILD_VARIANT];
+
+static bool target_added;
+static bool verity_enabled = true;
+struct dentry *debug_dir;
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+
+static struct target_type android_verity_target = {
+       .name                   = "android-verity",
+       .version                = {1, 0, 0},
+       .module                 = THIS_MODULE,
+       .ctr                    = android_verity_ctr,
+       .dtr                    = verity_dtr,
+       .map                    = verity_map,
+       .status                 = verity_status,
+       .prepare_ioctl          = verity_prepare_ioctl,
+       .iterate_devices        = verity_iterate_devices,
+       .io_hints               = verity_io_hints,
+};
+
+static int __init verified_boot_state_param(char *line)
+{
+       strlcpy(verifiedbootstate, line, sizeof(verifiedbootstate));
+       return 1;
+}
+
+__setup("androidboot.verifiedbootstate=", verified_boot_state_param);
+
+static int __init verity_mode_param(char *line)
+{
+       strlcpy(veritymode, line, sizeof(veritymode));
+       return 1;
+}
+
+__setup("androidboot.veritymode=", verity_mode_param);
+
+static int __init verity_keyid_param(char *line)
+{
+       strlcpy(veritykeyid, line, sizeof(veritykeyid));
+       return 1;
+}
+
+__setup("veritykeyid=", verity_keyid_param);
+
+static int __init verity_buildvariant(char *line)
+{
+       strlcpy(buildvariant, line, sizeof(buildvariant));
+       return 1;
+}
+
+__setup("buildvariant=", verity_buildvariant);
+
+static inline bool default_verity_key_id(void)
+{
+       return veritykeyid[0] != '\0';
+}
+
+static inline bool is_eng(void)
+{
+       static const char typeeng[]  = "eng";
+
+       return !strncmp(buildvariant, typeeng, sizeof(typeeng));
+}
+
+static inline bool is_userdebug(void)
+{
+       static const char typeuserdebug[]  = "userdebug";
+
+       return !strncmp(buildvariant, typeuserdebug, sizeof(typeuserdebug));
+}
+
+static inline bool is_unlocked(void)
+{
+       static const char unlocked[] = "orange";
+
+       return !strncmp(verifiedbootstate, unlocked, sizeof(unlocked));
+}
+
+static int table_extract_mpi_array(struct public_key_signature *pks,
+                               const void *data, size_t len)
+{
+       MPI mpi = mpi_read_raw_data(data, len);
+
+       if (!mpi) {
+               DMERR("Error while allocating mpi array");
+               return -ENOMEM;
+       }
+
+       pks->mpi[0] = mpi;
+       pks->nr_mpi = 1;
+       return 0;
+}
+
+static struct public_key_signature *table_make_digest(
+                                               enum hash_algo hash,
+                                               const void *table,
+                                               unsigned long table_len)
+{
+       struct public_key_signature *pks = NULL;
+       struct crypto_shash *tfm;
+       struct shash_desc *desc;
+       size_t digest_size, desc_size;
+       int ret;
+
+       /* Allocate the hashing algorithm we're going to need and find out how
+        * big the hash operational data will be.
+        */
+       tfm = crypto_alloc_shash(hash_algo_name[hash], 0, 0);
+       if (IS_ERR(tfm))
+               return ERR_CAST(tfm);
+
+       desc_size = crypto_shash_descsize(tfm) + sizeof(*desc);
+       digest_size = crypto_shash_digestsize(tfm);
+
+       /* We allocate the hash operational data storage on the end of our
+        * context data and the digest output buffer on the end of that.
+        */
+       ret = -ENOMEM;
+       pks = kzalloc(digest_size + sizeof(*pks) + desc_size, GFP_KERNEL);
+       if (!pks)
+               goto error;
+
+       pks->pkey_hash_algo = hash;
+       pks->digest = (u8 *)pks + sizeof(*pks) + desc_size;
+       pks->digest_size = digest_size;
+
+       desc = (struct shash_desc *)(pks + 1);
+       desc->tfm = tfm;
+       desc->flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+
+       ret = crypto_shash_init(desc);
+       if (ret < 0)
+               goto error;
+
+       ret = crypto_shash_finup(desc, table, table_len, pks->digest);
+       if (ret < 0)
+               goto error;
+
+       crypto_free_shash(tfm);
+       return pks;
+
+error:
+       kfree(pks);
+       crypto_free_shash(tfm);
+       return ERR_PTR(ret);
+}
+
+static int read_block_dev(struct bio_read *payload, struct block_device *bdev,
+               sector_t offset, int length)
+{
+       struct bio *bio;
+       int err = 0, i;
+
+       payload->number_of_pages = DIV_ROUND_UP(length, PAGE_SIZE);
+
+       bio = bio_alloc(GFP_KERNEL, payload->number_of_pages);
+       if (!bio) {
+               DMERR("Error while allocating bio");
+               return -ENOMEM;
+       }
+
+       bio->bi_bdev = bdev;
+       bio->bi_iter.bi_sector = offset;
+
+       payload->page_io = kzalloc(sizeof(struct page *) *
+               payload->number_of_pages, GFP_KERNEL);
+       if (!payload->page_io) {
+               DMERR("page_io array alloc failed");
+               err = -ENOMEM;
+               goto free_bio;
+       }
+
+       for (i = 0; i < payload->number_of_pages; i++) {
+               payload->page_io[i] = alloc_page(GFP_KERNEL);
+               if (!payload->page_io[i]) {
+                       DMERR("alloc_page failed");
+                       err = -ENOMEM;
+                       goto free_pages;
+               }
+               if (!bio_add_page(bio, payload->page_io[i], PAGE_SIZE, 0)) {
+                       DMERR("bio_add_page error");
+                       err = -EIO;
+                       goto free_pages;
+               }
+       }
+
+       if (!submit_bio_wait(READ, bio))
+               /* success */
+               goto free_bio;
+       DMERR("bio read failed");
+       err = -EIO;
+
+free_pages:
+       for (i = 0; i < payload->number_of_pages; i++)
+               if (payload->page_io[i])
+                       __free_page(payload->page_io[i]);
+       kfree(payload->page_io);
+free_bio:
+       bio_put(bio);
+       return err;
+}
+
+static inline u64 fec_div_round_up(u64 x, u64 y)
+{
+       u64 remainder;
+
+       return div64_u64_rem(x, y, &remainder) +
+               (remainder > 0 ? 1 : 0);
+}
+
+static inline void populate_fec_metadata(struct fec_header *header,
+                               struct fec_ecc_metadata *ecc)
+{
+       ecc->blocks = fec_div_round_up(le64_to_cpu(header->inp_size),
+                       FEC_BLOCK_SIZE);
+       ecc->roots = le32_to_cpu(header->roots);
+       ecc->start = le64_to_cpu(header->inp_size);
+}
+
+static inline int validate_fec_header(struct fec_header *header, u64 offset)
+{
+       /*
+        * Move offset to make the sanity check work for the backup header
+        * as well.
+        */
+       offset -= offset % FEC_BLOCK_SIZE;
+       if (le32_to_cpu(header->magic) != FEC_MAGIC ||
+               le32_to_cpu(header->version) != FEC_VERSION ||
+               le32_to_cpu(header->size) != sizeof(struct fec_header) ||
+               le32_to_cpu(header->roots) == 0 ||
+               le32_to_cpu(header->roots) >= FEC_RSM)
+               return -EINVAL;
+
+       return 0;
+}
+
+static int extract_fec_header(dev_t dev, struct fec_header *fec,
+                               struct fec_ecc_metadata *ecc)
+{
+       u64 device_size;
+       struct bio_read payload;
+       int i, err = 0;
+       struct block_device *bdev;
+
+       bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+       if (IS_ERR_OR_NULL(bdev)) {
+               DMERR("bdev get error");
+               return PTR_ERR(bdev);
+       }
+
+       device_size = i_size_read(bdev->bd_inode);
+
+       /* fec metadata size is a power of 2 and PAGE_SIZE
+        * is a power of 2 as well.
+        */
+       BUG_ON(FEC_BLOCK_SIZE > PAGE_SIZE);
+       /* 512 byte sector alignment */
+       BUG_ON(((device_size - FEC_BLOCK_SIZE) % (1 << SECTOR_SHIFT)) != 0);
+
+       err = read_block_dev(&payload, bdev, (device_size -
+               FEC_BLOCK_SIZE) / (1 << SECTOR_SHIFT), FEC_BLOCK_SIZE);
+       if (err) {
+               DMERR("Error while reading verity metadata");
+               goto error;
+       }
+
+       BUG_ON(sizeof(struct fec_header) > PAGE_SIZE);
+       memcpy(fec, page_address(payload.page_io[0]),
+                       sizeof(*fec));
+
+       ecc->valid = true;
+       if (validate_fec_header(fec, device_size - FEC_BLOCK_SIZE)) {
+               /* Try the backup header */
+               memcpy(fec, page_address(payload.page_io[0]) + FEC_BLOCK_SIZE
+                       - sizeof(*fec),
+                       sizeof(*fec));
+               if (validate_fec_header(fec, device_size -
+                       sizeof(struct fec_header)))
+                       ecc->valid = false;
+       }
+
+       if (ecc->valid)
+               populate_fec_metadata(fec, ecc);
+
+       for (i = 0; i < payload.number_of_pages; i++)
+               __free_page(payload.page_io[i]);
+       kfree(payload.page_io);
+
+error:
+       blkdev_put(bdev, FMODE_READ);
+       return err;
+}
+
+static void find_metadata_offset(struct fec_header *fec,
+               struct block_device *bdev, u64 *metadata_offset)
+{
+       u64 device_size;
+
+       device_size = i_size_read(bdev->bd_inode);
+
+       if (le32_to_cpu(fec->magic) == FEC_MAGIC)
+               *metadata_offset = le64_to_cpu(fec->inp_size) -
+                                       VERITY_METADATA_SIZE;
+       else
+               *metadata_offset = device_size - VERITY_METADATA_SIZE;
+}
+
+static int find_size(dev_t dev, u64 *device_size)
+{
+       struct block_device *bdev;
+
+       bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+       if (IS_ERR_OR_NULL(bdev)) {
+               DMERR("blkdev_get_by_dev failed");
+               return PTR_ERR(bdev);
+       }
+
+       *device_size = i_size_read(bdev->bd_inode);
+       *device_size >>= SECTOR_SHIFT;
+
+       DMINFO("blkdev size in sectors: %llu", *device_size);
+       blkdev_put(bdev, FMODE_READ);
+       return 0;
+}
+
+static int verify_header(struct android_metadata_header *header)
+{
+       int retval = -EINVAL;
+
+       if (is_userdebug() && le32_to_cpu(header->magic_number) ==
+                       VERITY_METADATA_MAGIC_DISABLE)
+               return VERITY_STATE_DISABLE;
+
+       if (!(le32_to_cpu(header->magic_number) ==
+                       VERITY_METADATA_MAGIC_NUMBER) ||
+                       (le32_to_cpu(header->magic_number) ==
+                       VERITY_METADATA_MAGIC_DISABLE)) {
+               DMERR("Incorrect magic number");
+               return retval;
+       }
+
+       if (le32_to_cpu(header->protocol_version) !=
+                       VERITY_METADATA_VERSION) {
+               DMERR("Unsupported version %u",
+                       le32_to_cpu(header->protocol_version));
+               return retval;
+       }
+
+       return 0;
+}
+
+static int extract_metadata(dev_t dev, struct fec_header *fec,
+                               struct android_metadata **metadata,
+                               bool *verity_enabled)
+{
+       struct block_device *bdev;
+       struct android_metadata_header *header;
+       int i;
+       u32 table_length, copy_length, offset;
+       u64 metadata_offset;
+       struct bio_read payload;
+       int err = 0;
+
+       bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+
+       if (IS_ERR_OR_NULL(bdev)) {
+               DMERR("blkdev_get_by_dev failed");
+               return -ENODEV;
+       }
+
+       find_metadata_offset(fec, bdev, &metadata_offset);
+
+       /*
+        * Verity metadata size is a power of 2 and PAGE_SIZE
+        * is a power of 2 as well.
+        * PAGE_SIZE is also a multiple of 512 bytes.
+        */
+       if (VERITY_METADATA_SIZE > PAGE_SIZE)
+               BUG_ON(VERITY_METADATA_SIZE % PAGE_SIZE != 0);
+       /* 512 byte sector alignment */
+       BUG_ON(metadata_offset % (1 << SECTOR_SHIFT) != 0);
+
+       err = read_block_dev(&payload, bdev, metadata_offset /
+               (1 << SECTOR_SHIFT), VERITY_METADATA_SIZE);
+       if (err) {
+               DMERR("Error while reading verity metadata");
+               goto blkdev_release;
+       }
+
+       header = kzalloc(sizeof(*header), GFP_KERNEL);
+       if (!header) {
+               DMERR("kzalloc failed for header");
+               err = -ENOMEM;
+               goto free_payload;
+       }
+
+       memcpy(header, page_address(payload.page_io[0]),
+               sizeof(*header));
+
+       DMINFO("bio magic_number:%u protocol_version:%d table_length:%u",
+               le32_to_cpu(header->magic_number),
+               le32_to_cpu(header->protocol_version),
+               le32_to_cpu(header->table_length));
+
+       err = verify_header(header);
+
+       if (err == VERITY_STATE_DISABLE) {
+               DMERR("Mounting root with verity disabled");
+               *verity_enabled = false;
+               /* We would still have to read the metadata to figure out
+                * the data block size. Or maybe we could map the entire
+                * partition, similar to mounting the device.
+                *
+                * Reset the error as well, since the verity_enabled flag
+                * has been changed.
+                */
+               err = 0;
+       } else if (err)
+               goto free_header;
+
+       *metadata = kzalloc(sizeof(**metadata), GFP_KERNEL);
+       if (!*metadata) {
+               DMERR("kzalloc for metadata failed");
+               err = -ENOMEM;
+               goto free_header;
+       }
+
+       (*metadata)->header = header;
+       table_length = le32_to_cpu(header->table_length);
+
+       if (table_length == 0 ||
+               table_length > (VERITY_METADATA_SIZE -
+                       sizeof(struct android_metadata_header))) {
+               DMERR("table_length too long");
+               err = -EINVAL;
+               goto free_metadata;
+       }
+
+       (*metadata)->verity_table = kzalloc(table_length + 1, GFP_KERNEL);
+
+       if (!(*metadata)->verity_table) {
+               DMERR("kzalloc verity_table failed");
+               err = -ENOMEM;
+               goto free_metadata;
+       }
+
+       if (sizeof(struct android_metadata_header) +
+                       table_length <= PAGE_SIZE) {
+               memcpy((*metadata)->verity_table,
+                       page_address(payload.page_io[0])
+                       + sizeof(struct android_metadata_header),
+                       table_length);
+       } else {
+               copy_length = PAGE_SIZE -
+                       sizeof(struct android_metadata_header);
+               memcpy((*metadata)->verity_table,
+                       page_address(payload.page_io[0])
+                       + sizeof(struct android_metadata_header),
+                       copy_length);
+               table_length -= copy_length;
+               offset = copy_length;
+               i = 1;
+               while (table_length != 0) {
+                       if (table_length > PAGE_SIZE) {
+                               memcpy((*metadata)->verity_table + offset,
+                                       page_address(payload.page_io[i]),
+                                       PAGE_SIZE);
+                               offset += PAGE_SIZE;
+                               table_length -= PAGE_SIZE;
+                       } else {
+                               memcpy((*metadata)->verity_table + offset,
+                                       page_address(payload.page_io[i]),
+                                       table_length);
+                               table_length = 0;
+                       }
+                       i++;
+               }
+       }
+       (*metadata)->verity_table[table_length] = '\0';
+
+       DMINFO("verity_table: %s", (*metadata)->verity_table);
+       goto free_payload;
+
+free_metadata:
+       kfree(*metadata);
+free_header:
+       kfree(header);
+free_payload:
+       for (i = 0; i < payload.number_of_pages; i++)
+               if (payload.page_io[i])
+                       __free_page(payload.page_io[i]);
+       kfree(payload.page_io);
+blkdev_release:
+       blkdev_put(bdev, FMODE_READ);
+       return err;
+}
+
+/* helper functions to extract properties from dts */
+const char *find_dt_value(const char *name)
+{
+       struct device_node *firmware;
+       const char *value;
+
+       firmware = of_find_node_by_path("/firmware/android");
+       if (!firmware)
+               return NULL;
+       value = of_get_property(firmware, name, NULL);
+       of_node_put(firmware);
+
+       return value;
+}
+
+static int verity_mode(void)
+{
+       static const char enforcing[] = "enforcing";
+       static const char verified_mode_prop[] = "veritymode";
+       const char *value;
+
+       value = find_dt_value(verified_mode_prop);
+       if (!value)
+               value = veritymode;
+       if (!strncmp(value, enforcing, sizeof(enforcing) - 1))
+               return DM_VERITY_MODE_RESTART;
+
+       return DM_VERITY_MODE_EIO;
+}
+
+static int verify_verity_signature(char *key_id,
+               struct android_metadata *metadata)
+{
+       key_ref_t key_ref;
+       struct key *key;
+       struct public_key_signature *pks = NULL;
+       int retval = -EINVAL;
+
+       key_ref = keyring_search(make_key_ref(system_trusted_keyring, 1),
+               &key_type_asymmetric, key_id);
+
+       if (IS_ERR(key_ref)) {
+               DMERR("keyring: key not found");
+               return -ENOKEY;
+       }
+
+       key = key_ref_to_ptr(key_ref);
+
+       pks = table_make_digest(HASH_ALGO_SHA256,
+                       (const void *)metadata->verity_table,
+                       le32_to_cpu(metadata->header->table_length));
+
+       if (IS_ERR(pks)) {
+               DMERR("hashing failed");
+               retval = PTR_ERR(pks);
+               pks = NULL;
+               goto error;
+       }
+
+       retval = table_extract_mpi_array(pks, &metadata->header->signature[0],
+                               RSANUMBYTES);
+       if (retval < 0) {
+               DMERR("Error extracting mpi %d", retval);
+               goto error;
+       }
+
+       retval = verify_signature(key, pks);
+       mpi_free(pks->rsa.s);
+error:
+       kfree(pks);
+       key_put(key);
+
+       return retval;
+}
+
+static void handle_error(void)
+{
+       int mode = verity_mode();
+
+       if (mode == DM_VERITY_MODE_RESTART) {
+               DMERR("triggering restart");
+               kernel_restart("dm-verity device corrupted");
+       } else {
+               DMERR("Mounting verity root failed");
+       }
+}
+
+static inline bool test_mult_overflow(sector_t a, u32 b)
+{
+       sector_t r = (sector_t)~0ULL;
+
+       sector_div(r, b);
+       return a > r;
+}
+
+static int add_as_linear_device(struct dm_target *ti, char *dev)
+{
+       /*Move to linear mapping defines*/
+       char *linear_table_args[DM_LINEAR_ARGS] = {dev,
+                                       DM_LINEAR_TARGET_OFFSET};
+       int err = 0;
+
+       android_verity_target.dtr = dm_linear_dtr,
+       android_verity_target.map = dm_linear_map,
+       android_verity_target.status = dm_linear_status,
+       android_verity_target.end_io = dm_linear_end_io,
+       android_verity_target.prepare_ioctl = dm_linear_prepare_ioctl,
+       android_verity_target.iterate_devices = dm_linear_iterate_devices,
+       android_verity_target.direct_access = dm_linear_dax_direct_access,
+       android_verity_target.dax_copy_from_iter = dm_linear_dax_copy_from_iter,
+       android_verity_target.io_hints = NULL;
+
+       set_disk_ro(dm_disk(dm_table_get_md(ti->table)), 0);
+
+       err = dm_linear_ctr(ti, DM_LINEAR_ARGS, linear_table_args);
+
+       if (!err) {
+               DMINFO("Added android-verity as a linear target");
+               target_added = true;
+       } else
+               DMERR("Failed to add android-verity as linear target");
+
+       return err;
+}
+
+static int create_linear_device(struct dm_target *ti, dev_t dev,
+                               char *target_device)
+{
+       u64 device_size = 0;
+       int err = find_size(dev, &device_size);
+
+       if (err) {
+               DMERR("error finding bdev size");
+               handle_error();
+               return err;
+       }
+
+       ti->len = device_size;
+       err = add_as_linear_device(ti, target_device);
+       if (err) {
+               handle_error();
+               return err;
+       }
+       verity_enabled = false;
+       return 0;
+}
+
+/*
+ * Target parameters:
+ *     <key id>        Key id of the public key in the system keyring.
+ *                     Verity metadata's signature would be verified against
+ *                     this. If the key id contains spaces, replace them
+ *                     with '#'.
+ *     <block device>  The block device for which dm-verity is being setup.
+ */
+static int android_verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+{
+       dev_t uninitialized_var(dev);
+       struct android_metadata *metadata = NULL;
+       int err = 0, i, mode;
+       char *key_id, *table_ptr, dummy, *target_device,
+       *verity_table_args[VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS];
+       /* One for specifying number of opt args and one for mode */
+       sector_t data_sectors;
+       u32 data_block_size;
+       unsigned int no_of_args = VERITY_TABLE_ARGS + 2 + VERITY_TABLE_OPT_FEC_ARGS;
+       struct fec_header uninitialized_var(fec);
+       struct fec_ecc_metadata uninitialized_var(ecc);
+       char buf[FEC_ARG_LENGTH], *buf_ptr;
+       unsigned long long tmpll;
+
+       if (argc == 1) {
+               /* Use the default keyid */
+               if (default_verity_key_id())
+                       key_id = veritykeyid;
+               else if (!is_eng()) {
+                       DMERR("veritykeyid= is not set");
+                       handle_error();
+                       return -EINVAL;
+               }
+       } else if (argc == 2)
+               key_id = argv[1];
+       else {
+               DMERR("Incorrect number of arguments");
+               handle_error();
+               return -EINVAL;
+       }
+
+       target_device = argv[0];
+
+       dev = name_to_dev_t(target_device);
+       if (!dev) {
+               DMERR("no dev found for %s", target_device);
+               handle_error();
+               return -EINVAL;
+       }
+
+       if (is_eng())
+               return create_linear_device(ti, dev, target_device);
+
+       strreplace(key_id, '#', ' ');
+
+       DMINFO("key:%s dev:%s", key_id, target_device);
+
+       if (extract_fec_header(dev, &fec, &ecc)) {
+               DMERR("Error while extracting fec header");
+               handle_error();
+               return -EINVAL;
+       }
+
+       err = extract_metadata(dev, &fec, &metadata, &verity_enabled);
+
+       if (err) {
+               /* Allow invalid metadata when the device is unlocked */
+               if (is_unlocked()) {
+                       DMWARN("Allow invalid metadata when unlocked");
+                       return create_linear_device(ti, dev, target_device);
+               }
+               DMERR("Error while extracting metadata");
+               handle_error();
+               goto free_metadata;
+       }
+
+       if (verity_enabled) {
+               err = verify_verity_signature(key_id, metadata);
+
+               if (err) {
+                       DMERR("Signature verification failed");
+                       handle_error();
+                       goto free_metadata;
+               } else
+                       DMINFO("Signature verification success");
+       }
+
+       table_ptr = metadata->verity_table;
+
+       for (i = 0; i < VERITY_TABLE_ARGS; i++) {
+               verity_table_args[i] = strsep(&table_ptr, " ");
+               if (verity_table_args[i] == NULL)
+                       break;
+       }
+
+       if (i != VERITY_TABLE_ARGS) {
+               DMERR("Verity table not in the expected format");
+               err = -EINVAL;
+               handle_error();
+               goto free_metadata;
+       }
+
+       if (sscanf(verity_table_args[5], "%llu%c", &tmpll, &dummy)
+                                                       != 1) {
+               DMERR("Verity table not in the expected format");
+               handle_error();
+               err = -EINVAL;
+               goto free_metadata;
+       }
+
+       if (tmpll > ULONG_MAX) {
+               DMERR("<num_data_blocks> too large. Forgot to turn on CONFIG_LBDAF?");
+               handle_error();
+               err = -EINVAL;
+               goto free_metadata;
+       }
+
+       data_sectors = tmpll;
+
+       if (sscanf(verity_table_args[3], "%u%c", &data_block_size, &dummy)
+                                                               != 1) {
+               DMERR("Verity table not in the expected format");
+               handle_error();
+               err = -EINVAL;
+               goto free_metadata;
+       }
+
+       if (test_mult_overflow(data_sectors, data_block_size >>
+                                                       SECTOR_SHIFT)) {
+               DMERR("data_sectors too large");
+               handle_error();
+               err = -EOVERFLOW;
+               goto free_metadata;
+       }
+
+       data_sectors *= data_block_size >> SECTOR_SHIFT;
+       DMINFO("Data sectors %llu", (unsigned long long)data_sectors);
+
+       /* update target length */
+       ti->len = data_sectors;
+
+       /* Setup linear target and free */
+       if (!verity_enabled) {
+               err = add_as_linear_device(ti, target_device);
+               goto free_metadata;
+       }
+
+       /*substitute data_dev and hash_dev*/
+       verity_table_args[1] = target_device;
+       verity_table_args[2] = target_device;
+
+       mode = verity_mode();
+
+       if (ecc.valid && IS_BUILTIN(CONFIG_DM_VERITY_FEC)) {
+               if (mode) {
+                       err = snprintf(buf, FEC_ARG_LENGTH,
+                               "%u %s " VERITY_TABLE_OPT_FEC_FORMAT,
+                               1 + VERITY_TABLE_OPT_FEC_ARGS,
+                               mode == DM_VERITY_MODE_RESTART ?
+                                       VERITY_TABLE_OPT_RESTART :
+                                       VERITY_TABLE_OPT_LOGGING,
+                               target_device,
+                               ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+                               ecc.roots);
+               } else {
+                       err = snprintf(buf, FEC_ARG_LENGTH,
+                               "%u " VERITY_TABLE_OPT_FEC_FORMAT,
+                               VERITY_TABLE_OPT_FEC_ARGS, target_device,
+                               ecc.start / FEC_BLOCK_SIZE, ecc.blocks,
+                               ecc.roots);
+               }
+       } else if (mode) {
+               err = snprintf(buf, FEC_ARG_LENGTH,
+                       "2 " VERITY_TABLE_OPT_IGNZERO " %s",
+                       mode == DM_VERITY_MODE_RESTART ?
+                       VERITY_TABLE_OPT_RESTART : VERITY_TABLE_OPT_LOGGING);
+       } else {
+               err = snprintf(buf, FEC_ARG_LENGTH, "1 %s",
+                                "ignore_zero_blocks");
+       }
+
+       if (err < 0 || err >= FEC_ARG_LENGTH)
+               goto free_metadata;
+
+       buf_ptr = buf;
+
+       for (i = VERITY_TABLE_ARGS; i < (VERITY_TABLE_ARGS +
+               VERITY_TABLE_OPT_FEC_ARGS + 2); i++) {
+               verity_table_args[i] = strsep(&buf_ptr, " ");
+               if (verity_table_args[i] == NULL) {
+                       no_of_args = i;
+                       break;
+               }
+       }
+
+       err = verity_ctr(ti, no_of_args, verity_table_args);
+
+       if (err)
+               DMERR("android-verity failed to mount as verity target");
+       else {
+               target_added = true;
+               DMINFO("android-verity mounted as verity target");
+       }
+
+free_metadata:
+       if (metadata) {
+               kfree(metadata->header);
+               kfree(metadata->verity_table);
+       }
+       kfree(metadata);
+       return err;
+}
+
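For reference, a hedged illustration of the verity_table string that android_verity_ctr() splits with strsep(): the ten mandatory fields follow the standard dm-verity table layout (which is why VERITY_TABLE_ARGS is 10, and why args[3] and args[5] are parsed as data_block_size and num_data_blocks), after which the constructor substitutes the target device into the device slots and appends the optional-argument string it builds in buf. All concrete values below are made up for illustration.

/*
 * Hypothetical sample of the verity table carried in the android metadata,
 * with made-up values; field meanings follow the standard dm-verity target.
 */
static const char example_verity_table[] =
	"1 "		/* version                               */
	"/dev/sda34 "	/* data_dev (overwritten with argv[0])   */
	"/dev/sda34 "	/* hash_dev (overwritten with argv[0])   */
	"4096 "		/* data_block_size                       */
	"4096 "		/* hash_block_size                       */
	"262144 "	/* num_data_blocks                       */
	"262145 "	/* hash_start_block                      */
	"sha256 "	/* algorithm                             */
	"<root digest> "/* root hash digest                      */
	"<salt>";	/* salt                                  */
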
+static int __init dm_android_verity_init(void)
+{
+       int r;
+       struct dentry *file;
+
+       r = dm_register_target(&android_verity_target);
+       if (r < 0)
+               DMERR("register failed %d", r);
+
+       /* Tracks the status of the last added target */
+       debug_dir = debugfs_create_dir("android_verity", NULL);
+
+       if (IS_ERR_OR_NULL(debug_dir)) {
+               DMERR("Cannot create android_verity debugfs directory: %ld",
+                       PTR_ERR(debug_dir));
+               goto end;
+       }
+
+       file = debugfs_create_bool("target_added", S_IRUGO, debug_dir,
+                               &target_added);
+
+       if (IS_ERR_OR_NULL(file)) {
+               DMERR("Cannot create target_added debugfs file: %ld",
+                       PTR_ERR(file));
+               debugfs_remove_recursive(debug_dir);
+               goto end;
+       }
+
+       file = debugfs_create_bool("verity_enabled", S_IRUGO, debug_dir,
+                               &verity_enabled);
+
+       if (IS_ERR_OR_NULL(file)) {
+               DMERR("Cannot create verity_enabled debugfs file: %ld",
+                       PTR_ERR(file));
+               debugfs_remove_recursive(debug_dir);
+       }
+
+end:
+       return r;
+}
+
+static void __exit dm_android_verity_exit(void)
+{
+       if (!IS_ERR_OR_NULL(debug_dir))
+               debugfs_remove_recursive(debug_dir);
+
+       dm_unregister_target(&android_verity_target);
+}
+
+module_init(dm_android_verity_init);
+module_exit(dm_android_verity_exit);
diff --git a/drivers/md/dm-android-verity.h b/drivers/md/dm-android-verity.h
new file mode 100644 (file)
index 0000000..ed67d56
--- /dev/null
@@ -0,0 +1,127 @@
+/*
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef DM_ANDROID_VERITY_H
+#define DM_ANDROID_VERITY_H
+
+#include <crypto/sha.h>
+
+#define RSANUMBYTES 256
+#define VERITY_METADATA_MAGIC_NUMBER 0xb001b001
+#define VERITY_METADATA_MAGIC_DISABLE 0x46464f56
+#define VERITY_METADATA_VERSION 0
+#define VERITY_STATE_DISABLE 1
+#define DATA_BLOCK_SIZE (4 * 1024)
+#define VERITY_METADATA_SIZE (8 * DATA_BLOCK_SIZE)
+#define VERITY_TABLE_ARGS 10
+#define VERITY_COMMANDLINE_PARAM_LENGTH 20
+#define BUILD_VARIANT 20
+
+/*
+ * <subject>:<sha1-id> is the format for the identifier.
+ * subject can either be the Common Name (CN) + Organization Name (O) or
+ * just the CN if it is prefixed with O.
+ * From https://tools.ietf.org/html/rfc5280#appendix-A
+ * ub-organization-name-length INTEGER ::= 64
+ * ub-common-name-length INTEGER ::= 64
+ *
+ * http://lxr.free-electrons.com/source/crypto/asymmetric_keys/x509_cert_parser.c?v=3.9#L278
+ * ctx->o_size + 2 + ctx->cn_size + 1
+ * + 41 characters for ":" and sha1 id
+ * 64 + 2 + 64 + 1 + 1 + 40 (172)
+ * setting VERITY_DEFAULT_KEY_ID_LENGTH to 200 characters.
+ */
+#define VERITY_DEFAULT_KEY_ID_LENGTH 200
+
+#define FEC_MAGIC 0xFECFECFE
+#define FEC_BLOCK_SIZE (4 * 1024)
+#define FEC_VERSION 0
+#define FEC_RSM 255
+#define FEC_ARG_LENGTH 300
+
+#define VERITY_TABLE_OPT_RESTART "restart_on_corruption"
+#define VERITY_TABLE_OPT_LOGGING "ignore_corruption"
+#define VERITY_TABLE_OPT_IGNZERO "ignore_zero_blocks"
+
+#define VERITY_TABLE_OPT_FEC_FORMAT \
+       "use_fec_from_device %s fec_start %llu fec_blocks %llu fec_roots %u ignore_zero_blocks"
+#define VERITY_TABLE_OPT_FEC_ARGS 9
+
+#define VERITY_DEBUG 0
+
+#define DM_MSG_PREFIX                   "android-verity"
+
+#define DM_LINEAR_ARGS 2
+#define DM_LINEAR_TARGET_OFFSET "0"
+
+/*
+ * There can be two formats:
+ * if fec is present
+ * <data_blocks> <verity_tree> <verity_metadata_32K><fec_data><fec_data_4K>
+ * if fec is not present
+ * <data_blocks> <verity_tree> <verity_metadata_32K>
+ */
+struct fec_header {
+       __le32 magic;
+       __le32 version;
+       __le32 size;
+       __le32 roots;
+       __le32 fec_size;
+       __le64 inp_size;
+       u8 hash[SHA256_DIGEST_SIZE];
+} __attribute__((packed));
+
+struct android_metadata_header {
+       __le32 magic_number;
+       __le32 protocol_version;
+       char signature[RSANUMBYTES];
+       __le32 table_length;
+};
+
+struct android_metadata {
+       struct android_metadata_header *header;
+       char *verity_table;
+};
+
+struct fec_ecc_metadata {
+       bool valid;
+       u32 roots;
+       u64 blocks;
+       u64 rounds;
+       u64 start;
+};
+
+struct bio_read {
+       struct page **page_io;
+       int number_of_pages;
+};
+
+extern struct target_type linear_target;
+
+extern void dm_linear_dtr(struct dm_target *ti);
+extern int dm_linear_map(struct dm_target *ti, struct bio *bio);
+extern int dm_linear_end_io(struct dm_target *ti, struct bio *bio,
+                        blk_status_t *error);
+extern void dm_linear_status(struct dm_target *ti, status_type_t type,
+                       unsigned status_flags, char *result, unsigned maxlen);
+extern int dm_linear_prepare_ioctl(struct dm_target *ti,
+                struct block_device **bdev, fmode_t *mode);
+extern int dm_linear_iterate_devices(struct dm_target *ti,
+                       iterate_devices_callout_fn fn, void *data);
+extern int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv);
+extern long dm_linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+                                 long nr_pages, void **kaddr, pfn_t *pfn);
+extern size_t dm_linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+               void *addr, size_t bytes, struct iov_iter *i);
+#endif /* DM_ANDROID_VERITY_H */
index e52676fa9832c53dcd57dc37eb654e3e7eee0d88..4a94d510aeff990a0bd290b67be5de202a588fc4 100644 (file)
@@ -1992,6 +1992,45 @@ void dm_interface_exit(void)
        dm_hash_exit();
 }
 
+
+/**
+ * dm_ioctl_export - Permanently export a mapped device via the ioctl interface
+ * @md: Pointer to mapped_device
+ * @name: Buffer (size DM_NAME_LEN) for name
+ * @uuid: Buffer (size DM_UUID_LEN) for uuid or NULL if not desired
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+                   const char *uuid)
+{
+       int r = 0;
+       struct hash_cell *hc;
+
+       if (!md) {
+               r = -ENXIO;
+               goto out;
+       }
+
+       /* The name and uuid can only be set once. */
+       mutex_lock(&dm_hash_cells_mutex);
+       hc = dm_get_mdptr(md);
+       mutex_unlock(&dm_hash_cells_mutex);
+       if (hc) {
+               DMERR("%s: already exported", dm_device_name(md));
+               r = -ENXIO;
+               goto out;
+       }
+
+       r = dm_hash_insert(name, uuid, md);
+       if (r) {
+               DMERR("%s: could not bind to '%s'", dm_device_name(md), name);
+               goto out;
+       }
+
+       /* Let udev know we've changed. */
+       dm_kobject_uevent(md, KOBJ_CHANGE, dm_get_event_nr(md));
+out:
+       return r;
+}
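
A minimal hypothetical caller sketch (not part of this patch), showing how a kernel-side user of this export path might bind an already-created mapped device to a fixed name so it appears under /dev/mapper once udev handles the change event; the name and the error handling are illustrative only.

/* Hypothetical example: export an already-created mapped_device. */
static int example_export_md(struct mapped_device *md)
{
	int r;

	/* A NULL uuid means the device is exported without a uuid. */
	r = dm_ioctl_export(md, "android-verity-root", NULL);
	if (r)
		DMERR("export failed: %d", r);
	return r;
}
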
 /**
  * dm_copy_name_and_uuid - Copy mapped device name & uuid into supplied buffers
  * @md: Pointer to mapped_device
index d5f8eff7c11d88a066d1dd83fe4707b33264cfae..e6fd31b03c38a4d4f8684d03f95c36820ae0cebb 100644 (file)
@@ -26,7 +26,7 @@ struct linear_c {
 /*
  * Construct a linear mapping: <dev_path> <offset>
  */
-static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
+int dm_linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
 {
        struct linear_c *lc;
        unsigned long long tmp;
@@ -69,7 +69,7 @@ static int linear_ctr(struct dm_target *ti, unsigned int argc, char **argv)
        return ret;
 }
 
-static void linear_dtr(struct dm_target *ti)
+void dm_linear_dtr(struct dm_target *ti)
 {
        struct linear_c *lc = (struct linear_c *) ti->private;
 
@@ -94,14 +94,14 @@ static void linear_map_bio(struct dm_target *ti, struct bio *bio)
                        linear_map_sector(ti, bio->bi_iter.bi_sector);
 }
 
-static int linear_map(struct dm_target *ti, struct bio *bio)
+int dm_linear_map(struct dm_target *ti, struct bio *bio)
 {
        linear_map_bio(ti, bio);
 
        return DM_MAPIO_REMAPPED;
 }
 
-static int linear_end_io(struct dm_target *ti, struct bio *bio,
+int dm_linear_end_io(struct dm_target *ti, struct bio *bio,
                         blk_status_t *error)
 {
        struct linear_c *lc = ti->private;
@@ -111,8 +111,9 @@ static int linear_end_io(struct dm_target *ti, struct bio *bio,
 
        return DM_ENDIO_DONE;
 }
+EXPORT_SYMBOL_GPL(dm_linear_end_io);
 
-static void linear_status(struct dm_target *ti, status_type_t type,
+void dm_linear_status(struct dm_target *ti, status_type_t type,
                          unsigned status_flags, char *result, unsigned maxlen)
 {
        struct linear_c *lc = (struct linear_c *) ti->private;
@@ -129,7 +130,7 @@ static void linear_status(struct dm_target *ti, status_type_t type,
        }
 }
 
-static int linear_prepare_ioctl(struct dm_target *ti,
+int dm_linear_prepare_ioctl(struct dm_target *ti,
                struct block_device **bdev, fmode_t *mode)
 {
        struct linear_c *lc = (struct linear_c *) ti->private;
@@ -146,7 +147,7 @@ static int linear_prepare_ioctl(struct dm_target *ti,
        return 0;
 }
 
-static int linear_iterate_devices(struct dm_target *ti,
+int dm_linear_iterate_devices(struct dm_target *ti,
                                  iterate_devices_callout_fn fn, void *data)
 {
        struct linear_c *lc = ti->private;
@@ -154,7 +155,7 @@ static int linear_iterate_devices(struct dm_target *ti,
        return fn(ti, lc->dev, lc->start, ti->len, data);
 }
 
-static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
+long dm_linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
                long nr_pages, void **kaddr, pfn_t *pfn)
 {
        long ret;
@@ -169,8 +170,9 @@ static long linear_dax_direct_access(struct dm_target *ti, pgoff_t pgoff,
                return ret;
        return dax_direct_access(dax_dev, pgoff, nr_pages, kaddr, pfn);
 }
+EXPORT_SYMBOL_GPL(dm_linear_dax_direct_access);
 
-static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
+size_t dm_linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
                void *addr, size_t bytes, struct iov_iter *i)
 {
        struct linear_c *lc = ti->private;
@@ -183,21 +185,22 @@ static size_t linear_dax_copy_from_iter(struct dm_target *ti, pgoff_t pgoff,
                return 0;
        return dax_copy_from_iter(dax_dev, pgoff, addr, bytes, i);
 }
+EXPORT_SYMBOL_GPL(dm_linear_dax_copy_from_iter);
 
 static struct target_type linear_target = {
        .name   = "linear",
        .version = {1, 4, 0},
        .features = DM_TARGET_PASSES_INTEGRITY | DM_TARGET_ZONED_HM,
        .module = THIS_MODULE,
-       .ctr    = linear_ctr,
-       .dtr    = linear_dtr,
-       .map    = linear_map,
-       .end_io = linear_end_io,
-       .status = linear_status,
-       .prepare_ioctl = linear_prepare_ioctl,
-       .iterate_devices = linear_iterate_devices,
-       .direct_access = linear_dax_direct_access,
-       .dax_copy_from_iter = linear_dax_copy_from_iter,
+       .ctr    = dm_linear_ctr,
+       .dtr    = dm_linear_dtr,
+       .map    = dm_linear_map,
+       .status = dm_linear_status,
+       .end_io = dm_linear_end_io,
+       .prepare_ioctl = dm_linear_prepare_ioctl,
+       .iterate_devices = dm_linear_iterate_devices,
+       .direct_access = dm_linear_dax_direct_access,
+       .dax_copy_from_iter = dm_linear_dax_copy_from_iter,
 };
 
 int __init dm_linear_init(void)
index 4287fc9f3527f84cf4606b795be92d63503534dc..a02672047d66e05e4277cf90a13cdb159b418e7e 100644 (file)
@@ -11,6 +11,7 @@
 #include <linux/vmalloc.h>
 #include <linux/blkdev.h>
 #include <linux/namei.h>
+#include <linux/mount.h>
 #include <linux/ctype.h>
 #include <linux/string.h>
 #include <linux/slab.h>
index e13f90832b6b54256f88d45ba30cee16660f58ef..776a4f77f76ce6031d54401a23dd39ca89ca6524 100644 (file)
@@ -11,6 +11,7 @@
 
 #include "dm-verity-fec.h"
 #include <linux/math64.h>
+#include <linux/sysfs.h>
 
 #define DM_MSG_PREFIX  "verity-fec"
 
@@ -175,9 +176,11 @@ error:
        if (r < 0 && neras)
                DMERR_LIMIT("%s: FEC %llu: failed to correct: %d",
                            v->data_dev->name, (unsigned long long)rsb, r);
-       else if (r > 0)
+       else if (r > 0) {
                DMWARN_LIMIT("%s: FEC %llu: corrected %d errors",
                             v->data_dev->name, (unsigned long long)rsb, r);
+               atomic_add_unless(&v->fec->corrected, 1, INT_MAX);
+       }
 
        return r;
 }
@@ -545,6 +548,7 @@ unsigned verity_fec_status_table(struct dm_verity *v, unsigned sz,
 void verity_fec_dtr(struct dm_verity *v)
 {
        struct dm_verity_fec *f = v->fec;
+       struct kobject *kobj = &f->kobj_holder.kobj;
 
        if (!verity_fec_is_enabled(v))
                goto out;
@@ -561,6 +565,12 @@ void verity_fec_dtr(struct dm_verity *v)
 
        if (f->dev)
                dm_put_device(v->ti, f->dev);
+
+       if (kobj->state_initialized) {
+               kobject_put(kobj);
+               wait_for_completion(dm_get_completion_from_kobject(kobj));
+       }
+
 out:
        kfree(f);
        v->fec = NULL;
@@ -649,6 +659,28 @@ int verity_fec_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v,
        return 0;
 }
 
+static ssize_t corrected_show(struct kobject *kobj, struct kobj_attribute *attr,
+                             char *buf)
+{
+       struct dm_verity_fec *f = container_of(kobj, struct dm_verity_fec,
+                                              kobj_holder.kobj);
+
+       return sprintf(buf, "%d\n", atomic_read(&f->corrected));
+}
+
+static struct kobj_attribute attr_corrected = __ATTR_RO(corrected);
+
+static struct attribute *fec_attrs[] = {
+       &attr_corrected.attr,
+       NULL
+};
+
+static struct kobj_type fec_ktype = {
+       .sysfs_ops = &kobj_sysfs_ops,
+       .default_attrs = fec_attrs,
+       .release = dm_kobject_release
+};
+
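For context, a hypothetical userspace sketch reading the new counter (not part of this patch). The path assumes the "fec" kobject ends up under the mapped device's block device, e.g. /sys/block/dm-0/fec/corrected, based on how kobject_init_and_add() is parented in verity_fec_ctr() below.

#include <stdio.h>

int main(void)
{
	unsigned int corrected = 0;
	/* Path is an assumption; substitute the right dm-N device. */
	FILE *f = fopen("/sys/block/dm-0/fec/corrected", "r");

	if (!f)
		return 1;
	if (fscanf(f, "%u", &corrected) == 1)
		printf("corrected FEC errors: %u\n", corrected);
	fclose(f);
	return 0;
}
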
 /*
  * Allocate dm_verity_fec for v->fec. Must be called before verity_fec_ctr.
  */
@@ -672,8 +704,10 @@ int verity_fec_ctr_alloc(struct dm_verity *v)
  */
 int verity_fec_ctr(struct dm_verity *v)
 {
+       int r;
        struct dm_verity_fec *f = v->fec;
        struct dm_target *ti = v->ti;
+       struct mapped_device *md = dm_table_get_md(ti->table);
        u64 hash_blocks;
 
        if (!verity_fec_is_enabled(v)) {
@@ -681,6 +715,16 @@ int verity_fec_ctr(struct dm_verity *v)
                return 0;
        }
 
+       /* Create a kobject and sysfs attributes */
+       init_completion(&f->kobj_holder.completion);
+
+       r = kobject_init_and_add(&f->kobj_holder.kobj, &fec_ktype,
+                                &disk_to_dev(dm_disk(md))->kobj, "%s", "fec");
+       if (r) {
+               ti->error = "Cannot create kobject";
+               return r;
+       }
+
        /*
         * FEC is computed over data blocks, possible metadata, and
         * hash blocks. In other words, FEC covers total of fec_blocks
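
Editor's note: the new "fec" kobject is created under the mapped device's disk kobject, so the corrected-block counter should surface as a read-only sysfs attribute under the device's block directory. A userspace sketch follows; the /sys/block/dm-0 path is an assumption derived from disk_to_dev(dm_disk(md)) above, so adjust the dm minor as needed.

    /* Editor's sketch, not part of the patch: read the dm-verity FEC
     * "corrected" counter. The path is assumed from the kobject being
     * added under the disk device's kobject with the name "fec". */
    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            FILE *f = fopen("/sys/block/dm-0/fec/corrected", "r");
            long corrected;

            if (!f) {
                    perror("fopen");
                    return EXIT_FAILURE;
            }
            if (fscanf(f, "%ld", &corrected) == 1)
                    printf("FEC corrected %ld block(s) so far\n", corrected);
            fclose(f);
            return EXIT_SUCCESS;
    }
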
index bb31ce87a933b80d11140f31607f804ff81d209a..4db0cae262eb7ccc489ee3ce44fbde7660626fdc 100644 (file)
@@ -12,6 +12,8 @@
 #ifndef DM_VERITY_FEC_H
 #define DM_VERITY_FEC_H
 
+#include "dm.h"
+#include "dm-core.h"
 #include "dm-verity.h"
 #include <linux/rslib.h>
 
@@ -51,6 +53,8 @@ struct dm_verity_fec {
        mempool_t *extra_pool;  /* mempool for extra buffers */
        mempool_t *output_pool; /* mempool for output */
        struct kmem_cache *cache;       /* cache for buffers */
+       atomic_t corrected;             /* corrected errors */
+       struct dm_kobject_holder kobj_holder;   /* for sysfs attributes */
 };
 
 /* per-bio data */
index bda3caca23ca69af2fe97592aa817a29b87851d6..5c6d441a8a8a08e8f456316d127c54d2c6634e35 100644 (file)
@@ -582,6 +582,7 @@ static void verity_prefetch_io(struct work_struct *work)
                container_of(work, struct dm_verity_prefetch_work, work);
        struct dm_verity *v = pw->v;
        int i;
+       sector_t prefetch_size;
 
        for (i = v->levels - 2; i >= 0; i--) {
                sector_t hash_block_start;
@@ -604,8 +605,14 @@ static void verity_prefetch_io(struct work_struct *work)
                                hash_block_end = v->hash_blocks - 1;
                }
 no_prefetch_cluster:
+               /* for eMMC, it is more efficient to send larger reads */
+               prefetch_size = max((sector_t)CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE,
+                       hash_block_end - hash_block_start + 1);
+               if ((hash_block_start + prefetch_size) >= (v->hash_start + v->hash_blocks)) {
+                       prefetch_size = hash_block_end - hash_block_start + 1;
+               }
                dm_bufio_prefetch(v->bufio, hash_block_start,
-                                 hash_block_end - hash_block_start + 1);
+                                 prefetch_size);
        }
 
        kfree(pw);
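
Editor's note: the hunk above grows each hash prefetch to at least CONFIG_DM_VERITY_HASH_PREFETCH_MIN_SIZE blocks and then clamps it so the read never runs past the end of the hash area. A standalone sketch of that arithmetic follows; the Kconfig symbol is assumed to be introduced elsewhere in this series, and 8 blocks is only an illustrative minimum.

    /* Editor's sketch, not part of the patch. */
    #include <stdio.h>

    #define PREFETCH_MIN_BLOCKS 8ULL        /* stand-in for the Kconfig value */

    static unsigned long long prefetch_blocks(unsigned long long start,
                                              unsigned long long end,
                                              unsigned long long hash_start,
                                              unsigned long long hash_blocks)
    {
            unsigned long long want = end - start + 1;
            unsigned long long size = want > PREFETCH_MIN_BLOCKS ?
                                            want : PREFETCH_MIN_BLOCKS;

            /* never prefetch past the last hash block */
            if (start + size >= hash_start + hash_blocks)
                    size = want;
            return size;
    }

    int main(void)
    {
            /* a 2-block cluster is rounded up to 8 blocks when room remains */
            printf("%llu\n", prefetch_blocks(100, 101, 0, 1000));   /* -> 8 */
            /* near the end of the hash area, fall back to the exact size */
            printf("%llu\n", prefetch_blocks(995, 996, 0, 1000));   /* -> 2 */
            return 0;
    }
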
@@ -632,7 +639,7 @@ static void verity_submit_prefetch(struct dm_verity *v, struct dm_verity_io *io)
  * Bio map function. It allocates dm_verity_io structure and bio vector and
  * fills them. Then it issues prefetches and the I/O.
  */
-static int verity_map(struct dm_target *ti, struct bio *bio)
+int verity_map(struct dm_target *ti, struct bio *bio)
 {
        struct dm_verity *v = ti->private;
        struct dm_verity_io *io;
@@ -677,7 +684,7 @@ static int verity_map(struct dm_target *ti, struct bio *bio)
 /*
  * Status: V (valid) or C (corruption found)
  */
-static void verity_status(struct dm_target *ti, status_type_t type,
+void verity_status(struct dm_target *ti, status_type_t type,
                          unsigned status_flags, char *result, unsigned maxlen)
 {
        struct dm_verity *v = ti->private;
@@ -737,7 +744,7 @@ static void verity_status(struct dm_target *ti, status_type_t type,
        }
 }
 
-static int verity_prepare_ioctl(struct dm_target *ti,
+int verity_prepare_ioctl(struct dm_target *ti,
                struct block_device **bdev, fmode_t *mode)
 {
        struct dm_verity *v = ti->private;
@@ -750,7 +757,7 @@ static int verity_prepare_ioctl(struct dm_target *ti,
        return 0;
 }
 
-static int verity_iterate_devices(struct dm_target *ti,
+int verity_iterate_devices(struct dm_target *ti,
                                  iterate_devices_callout_fn fn, void *data)
 {
        struct dm_verity *v = ti->private;
@@ -758,7 +765,7 @@ static int verity_iterate_devices(struct dm_target *ti,
        return fn(ti, v->data_dev, v->data_start, ti->len, data);
 }
 
-static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
+void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
 {
        struct dm_verity *v = ti->private;
 
@@ -771,7 +778,7 @@ static void verity_io_hints(struct dm_target *ti, struct queue_limits *limits)
        blk_limits_io_min(limits, limits->logical_block_size);
 }
 
-static void verity_dtr(struct dm_target *ti)
+void verity_dtr(struct dm_target *ti)
 {
        struct dm_verity *v = ti->private;
 
@@ -898,7 +905,7 @@ static int verity_parse_opt_args(struct dm_arg_set *as, struct dm_verity *v)
  *     <digest>
  *     <salt>          Hex string or "-" if no salt.
  */
-static int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
+int verity_ctr(struct dm_target *ti, unsigned argc, char **argv)
 {
        struct dm_verity *v;
        struct dm_arg_set as;
index a59e0ada6fd32d01b80c4032a25a67c90b048417..29831d66d9c200e8d155040eb2e3086f1da3bcf4 100644 (file)
@@ -131,4 +131,14 @@ extern int verity_hash(struct dm_verity *v, struct ahash_request *req,
 extern int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io,
                                 sector_t block, u8 *digest, bool *is_zero);
 
+extern void verity_status(struct dm_target *ti, status_type_t type,
+                       unsigned status_flags, char *result, unsigned maxlen);
+extern int verity_prepare_ioctl(struct dm_target *ti,
+                struct block_device **bdev, fmode_t *mode);
+extern int verity_iterate_devices(struct dm_target *ti,
+                               iterate_devices_callout_fn fn, void *data);
+extern void verity_io_hints(struct dm_target *ti, struct queue_limits *limits);
+extern void verity_dtr(struct dm_target *ti);
+extern int verity_ctr(struct dm_target *ti, unsigned argc, char **argv);
+extern int verity_map(struct dm_target *ti, struct bio *bio);
 #endif /* DM_VERITY_H */
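
Editor's note: dropping the static qualifiers and exporting these declarations lets another device-mapper target reuse the stock dm-verity callbacks instead of duplicating them. A hedged sketch of such a wrapper target follows; the "verity-wrapper" name and module boilerplate are illustrative only and are not added by this patch.

    /* Editor's sketch, not part of the patch. */
    #include <linux/module.h>
    #include <linux/device-mapper.h>
    #include "dm-verity.h"

    static struct target_type verity_wrapper_target = {
            .name                   = "verity-wrapper",    /* hypothetical */
            .version                = {1, 0, 0},
            .module                 = THIS_MODULE,
            .ctr                    = verity_ctr,
            .dtr                    = verity_dtr,
            .map                    = verity_map,
            .status                 = verity_status,
            .prepare_ioctl          = verity_prepare_ioctl,
            .iterate_devices        = verity_iterate_devices,
            .io_hints               = verity_io_hints,
    };

    static int __init verity_wrapper_init(void)
    {
            return dm_register_target(&verity_wrapper_target);
    }

    static void __exit verity_wrapper_exit(void)
    {
            dm_unregister_target(&verity_wrapper_target);
    }

    module_init(verity_wrapper_init);
    module_exit(verity_wrapper_exit);
    MODULE_LICENSE("GPL");
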
index 38c84c0a35d47f1bf1b02826c284170d2b8195f4..ab289ce9c3cd8ee5e547de077c54ae0f4b7b8e34 100644 (file)
@@ -80,8 +80,6 @@ void dm_set_md_type(struct mapped_device *md, enum dm_queue_mode type);
 enum dm_queue_mode dm_get_md_type(struct mapped_device *md);
 struct target_type *dm_get_immutable_target_type(struct mapped_device *md);
 
-int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
-
 /*
  * To check the return value from dm_table_find_target().
  */
index 8136dc7e863d7166a64c5ac8bddfa7f606e8c1fa..ce2449ad16881547f1e51f72dec90c33d5bc858f 100644 (file)
@@ -506,6 +506,27 @@ config PCI_ENDPOINT_TEST
            Enable this configuration option to enable the host side test driver
            for PCI Endpoint.
 
+config UID_SYS_STATS
+       bool "Per-UID statistics"
+       depends on PROFILING && TASK_XACCT && TASK_IO_ACCOUNTING
+       help
+         Per-UID CPU time statistics exported to /proc/uid_cputime,
+         per-UID I/O statistics exported to /proc/uid_io, and
+         per-UID process-state control via /proc/uid_procstat.
+
+config UID_SYS_STATS_DEBUG
+       bool "Per-TASK statistics"
+       depends on UID_SYS_STATS
+       default n
+       help
+         Per-task I/O statistics exported to /proc/uid_io.
+
+config MEMORY_STATE_TIME
+       tristate "Memory freq/bandwidth time statistics"
+       depends on PROFILING
+       help
+         Memory time statistics exported to /sys/kernel/memory_state_time
+
 source "drivers/misc/c2port/Kconfig"
 source "drivers/misc/eeprom/Kconfig"
 source "drivers/misc/cb710/Kconfig"
index ad0e64fdba3484445921b8140d4c994f880970d1..63e802aebb03007c6fe601ec4ee75a79939e02d3 100644 (file)
@@ -57,6 +57,9 @@ obj-$(CONFIG_ASPEED_LPC_CTRL) += aspeed-lpc-ctrl.o
 obj-$(CONFIG_ASPEED_LPC_SNOOP) += aspeed-lpc-snoop.o
 obj-$(CONFIG_PCI_ENDPOINT_TEST)        += pci_endpoint_test.o
 
+obj-$(CONFIG_UID_SYS_STATS)    += uid_sys_stats.o
+obj-$(CONFIG_MEMORY_STATE_TIME)        += memory_state_time.o
+
 lkdtm-$(CONFIG_LKDTM)          += lkdtm_core.o
 lkdtm-$(CONFIG_LKDTM)          += lkdtm_bugs.o
 lkdtm-$(CONFIG_LKDTM)          += lkdtm_heap.o
diff --git a/drivers/misc/memory_state_time.c b/drivers/misc/memory_state_time.c
new file mode 100644 (file)
index 0000000..ba94dcf
--- /dev/null
@@ -0,0 +1,462 @@
+/* drivers/misc/memory_state_time.c
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/errno.h>
+#include <linux/hashtable.h>
+#include <linux/kconfig.h>
+#include <linux/kernel.h>
+#include <linux/kobject.h>
+#include <linux/memory-state-time.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/of_platform.h>
+#include <linux/slab.h>
+#include <linux/sysfs.h>
+#include <linux/time.h>
+#include <linux/timekeeping.h>
+#include <linux/workqueue.h>
+
+#define KERNEL_ATTR_RO(_name) \
+static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
+
+#define KERNEL_ATTR_RW(_name) \
+static struct kobj_attribute _name##_attr = \
+       __ATTR(_name, 0644, _name##_show, _name##_store)
+
+#define FREQ_HASH_BITS 4
+DECLARE_HASHTABLE(freq_hash_table, FREQ_HASH_BITS);
+
+static DEFINE_MUTEX(mem_lock);
+
+#define TAG "memory_state_time"
+#define BW_NODE "/soc/memory-state-time"
+#define FREQ_TBL "freq-tbl"
+#define BW_TBL "bw-buckets"
+#define NUM_SOURCES "num-sources"
+
+#define LOWEST_FREQ 2
+
+static int curr_bw;
+static int curr_freq;
+static u32 *bw_buckets;
+static u32 *freq_buckets;
+static int num_freqs;
+static int num_buckets;
+static int registered_bw_sources;
+static u64 last_update;
+static bool init_success;
+static struct workqueue_struct *memory_wq;
+static u32 num_sources = 10;
+static int *bandwidths;
+
+struct freq_entry {
+       int freq;
+       u64 *buckets; /* Bandwidth buckets. */
+       struct hlist_node hash;
+};
+
+struct queue_container {
+       struct work_struct update_state;
+       int value;
+       u64 time_now;
+       int id;
+       struct mutex *lock;
+};
+
+static int find_bucket(int bw)
+{
+       int i;
+
+       if (bw_buckets != NULL) {
+               for (i = 0; i < num_buckets; i++) {
+                       if (bw_buckets[i] > bw) {
+                               pr_debug("Found bucket %d for bandwidth %d\n",
+                                       i, bw);
+                               return i;
+                       }
+               }
+               return num_buckets - 1;
+       }
+       return 0;
+}
+
+static u64 get_time_diff(u64 time_now)
+{
+       u64 ms;
+
+       ms = time_now - last_update;
+       last_update = time_now;
+       return ms;
+}
+
+static ssize_t show_stat_show(struct kobject *kobj,
+               struct kobj_attribute *attr, char *buf)
+{
+       int i, j;
+       int len = 0;
+       struct freq_entry *freq_entry;
+
+       for (i = 0; i < num_freqs; i++) {
+               hash_for_each_possible(freq_hash_table, freq_entry, hash,
+                               freq_buckets[i]) {
+                       if (freq_entry->freq == freq_buckets[i]) {
+                               len += scnprintf(buf + len, PAGE_SIZE - len,
+                                               "%d ", freq_buckets[i]);
+                               if (len >= PAGE_SIZE)
+                                       break;
+                               for (j = 0; j < num_buckets; j++) {
+                                       len += scnprintf(buf + len,
+                                                       PAGE_SIZE - len,
+                                                       "%llu ",
+                                                       freq_entry->buckets[j]);
+                               }
+                               len += scnprintf(buf + len, PAGE_SIZE - len,
+                                               "\n");
+                       }
+               }
+       }
+       pr_debug("Current Time: %llu\n", ktime_get_boot_ns());
+       return len;
+}
+KERNEL_ATTR_RO(show_stat);
+
+static void update_table(u64 time_now)
+{
+       struct freq_entry *freq_entry;
+
+       pr_debug("Last known bw %d freq %d\n", curr_bw, curr_freq);
+       hash_for_each_possible(freq_hash_table, freq_entry, hash, curr_freq) {
+               if (curr_freq == freq_entry->freq) {
+                       freq_entry->buckets[find_bucket(curr_bw)]
+                                       += get_time_diff(time_now);
+                       break;
+               }
+       }
+}
+
+static bool freq_exists(int freq)
+{
+       int i;
+
+       for (i = 0; i < num_freqs; i++) {
+               if (freq == freq_buckets[i])
+                       return true;
+       }
+       return false;
+}
+
+static int calculate_total_bw(int bw, int index)
+{
+       int i;
+       int total_bw = 0;
+
+       pr_debug("memory_state_time New bw %d for id %d\n", bw, index);
+       bandwidths[index] = bw;
+       for (i = 0; i < registered_bw_sources; i++)
+               total_bw += bandwidths[i];
+       return total_bw;
+}
+
+static void freq_update_do_work(struct work_struct *work)
+{
+       struct queue_container *freq_state_update
+                       = container_of(work, struct queue_container,
+                       update_state);
+       if (freq_state_update) {
+               mutex_lock(&mem_lock);
+               update_table(freq_state_update->time_now);
+               curr_freq = freq_state_update->value;
+               mutex_unlock(&mem_lock);
+               kfree(freq_state_update);
+       }
+}
+
+static void bw_update_do_work(struct work_struct *work)
+{
+       struct queue_container *bw_state_update
+                       = container_of(work, struct queue_container,
+                       update_state);
+       if (bw_state_update) {
+               mutex_lock(&mem_lock);
+               update_table(bw_state_update->time_now);
+               curr_bw = calculate_total_bw(bw_state_update->value,
+                               bw_state_update->id);
+               mutex_unlock(&mem_lock);
+               kfree(bw_state_update);
+       }
+}
+
+static void memory_state_freq_update(struct memory_state_update_block *ub,
+               int value)
+{
+       if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+               if (freq_exists(value) && init_success) {
+                       struct queue_container *freq_container
+                               = kmalloc(sizeof(struct queue_container),
+                               GFP_KERNEL);
+                       if (!freq_container)
+                               return;
+                       INIT_WORK(&freq_container->update_state,
+                                       freq_update_do_work);
+                       freq_container->time_now = ktime_get_boot_ns();
+                       freq_container->value = value;
+                       pr_debug("Scheduling freq update in work queue\n");
+                       queue_work(memory_wq, &freq_container->update_state);
+               } else {
+                       pr_debug("Freq does not exist.\n");
+               }
+       }
+}
+
+static void memory_state_bw_update(struct memory_state_update_block *ub,
+               int value)
+{
+       if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+               if (init_success) {
+                       struct queue_container *bw_container
+                               = kmalloc(sizeof(struct queue_container),
+                               GFP_KERNEL);
+                       if (!bw_container)
+                               return;
+                       INIT_WORK(&bw_container->update_state,
+                                       bw_update_do_work);
+                       bw_container->time_now = ktime_get_boot_ns();
+                       bw_container->value = value;
+                       bw_container->id = ub->id;
+                       pr_debug("Scheduling bandwidth update in work queue\n");
+                       queue_work(memory_wq, &bw_container->update_state);
+               }
+       }
+}
+
+struct memory_state_update_block *memory_state_register_frequency_source(void)
+{
+       struct memory_state_update_block *block;
+
+       if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+               pr_debug("Allocating frequency source\n");
+               block = kmalloc(sizeof(struct memory_state_update_block),
+                                       GFP_KERNEL);
+               if (!block)
+                       return NULL;
+               block->update_call = memory_state_freq_update;
+               return block;
+       }
+       pr_err("Config option disabled.\n");
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_frequency_source);
+
+struct memory_state_update_block *memory_state_register_bandwidth_source(void)
+{
+       struct memory_state_update_block *block;
+
+       if (IS_ENABLED(CONFIG_MEMORY_STATE_TIME)) {
+               pr_debug("Allocating bandwidth source %d\n",
+                               registered_bw_sources);
+               block = kmalloc(sizeof(struct memory_state_update_block),
+                                       GFP_KERNEL);
+               if (!block)
+                       return NULL;
+               block->update_call = memory_state_bw_update;
+               if (registered_bw_sources < num_sources) {
+                       block->id = registered_bw_sources++;
+               } else {
+                       pr_err("Unable to allocate source; max number reached\n");
+                       kfree(block);
+                       return NULL;
+               }
+               return block;
+       }
+       pr_err("Config option disabled.\n");
+       return NULL;
+}
+EXPORT_SYMBOL_GPL(memory_state_register_bandwidth_source);
+
+/* Buckets are designated by their maximum bandwidth.
+ * Reads the bucket boundaries supported by the device from the device tree.
+ */
+static int get_bw_buckets(struct device *dev)
+{
+       int ret, lenb;
+       struct device_node *node = dev->of_node;
+
+       of_property_read_u32(node, NUM_SOURCES, &num_sources);
+       if (!of_find_property(node, BW_TBL, &lenb)) {
+               pr_err("Missing %s property\n", BW_TBL);
+               return -ENODATA;
+       }
+
+       bandwidths = devm_kzalloc(dev,
+                       sizeof(*bandwidths) * num_sources, GFP_KERNEL);
+       if (!bandwidths)
+               return -ENOMEM;
+       lenb /= sizeof(*bw_buckets);
+       bw_buckets = devm_kzalloc(dev, lenb * sizeof(*bw_buckets),
+                       GFP_KERNEL);
+       if (!bw_buckets) {
+               devm_kfree(dev, bandwidths);
+               return -ENOMEM;
+       }
+       ret = of_property_read_u32_array(node, BW_TBL, bw_buckets,
+                       lenb);
+       if (ret < 0) {
+               devm_kfree(dev, bandwidths);
+               devm_kfree(dev, bw_buckets);
+               pr_err("Unable to read bandwidth table from device tree.\n");
+               return ret;
+       }
+
+       curr_bw = 0;
+       num_buckets = lenb;
+       return 0;
+}
+
+/* Reads the supported frequencies from the device tree and adds a
+ * struct freq_entry node to the hashtable for each one.
+ */
+static int freq_buckets_init(struct device *dev)
+{
+       struct freq_entry *freq_entry;
+       int i;
+       int ret, lenf;
+       struct device_node *node = dev->of_node;
+
+       if (!of_find_property(node, FREQ_TBL, &lenf)) {
+               pr_err("Missing %s property\n", FREQ_TBL);
+               return -ENODATA;
+       }
+
+       lenf /= sizeof(*freq_buckets);
+       freq_buckets = devm_kzalloc(dev, lenf * sizeof(*freq_buckets),
+                       GFP_KERNEL);
+       if (!freq_buckets)
+               return -ENOMEM;
+       pr_debug("freqs found len %d\n", lenf);
+       ret = of_property_read_u32_array(node, FREQ_TBL, freq_buckets,
+                       lenf);
+       if (ret < 0) {
+               devm_kfree(dev, freq_buckets);
+               pr_err("Unable to read frequency table from device tree.\n");
+               return ret;
+       }
+       pr_debug("ret freq %d\n", ret);
+
+       num_freqs = lenf;
+       curr_freq = freq_buckets[LOWEST_FREQ];
+
+       for (i = 0; i < num_freqs; i++) {
+               freq_entry = devm_kzalloc(dev, sizeof(struct freq_entry),
+                               GFP_KERNEL);
+               if (!freq_entry)
+                       return -ENOMEM;
+               freq_entry->buckets = devm_kzalloc(dev, sizeof(u64)*num_buckets,
+                               GFP_KERNEL);
+               if (!freq_entry->buckets) {
+                       devm_kfree(dev, freq_entry);
+                       return -ENOMEM;
+               }
+               pr_debug("memory_state_time Adding freq to ht %d\n",
+                               freq_buckets[i]);
+               freq_entry->freq = freq_buckets[i];
+               hash_add(freq_hash_table, &freq_entry->hash, freq_buckets[i]);
+       }
+       return 0;
+}
+
+struct kobject *memory_kobj;
+EXPORT_SYMBOL_GPL(memory_kobj);
+
+static struct attribute *memory_attrs[] = {
+       &show_stat_attr.attr,
+       NULL
+};
+
+static struct attribute_group memory_attr_group = {
+       .attrs = memory_attrs,
+};
+
+static int memory_state_time_probe(struct platform_device *pdev)
+{
+       int error;
+
+       error = get_bw_buckets(&pdev->dev);
+       if (error)
+               return error;
+       error = freq_buckets_init(&pdev->dev);
+       if (error)
+               return error;
+       last_update = ktime_get_boot_ns();
+       init_success = true;
+
+       pr_debug("memory_state_time initialized with num_freqs %d\n",
+                       num_freqs);
+       return 0;
+}
+
+static const struct of_device_id match_table[] = {
+       { .compatible = "memory-state-time" },
+       {}
+};
+
+static struct platform_driver memory_state_time_driver = {
+       .probe = memory_state_time_probe,
+       .driver = {
+               .name = "memory-state-time",
+               .of_match_table = match_table,
+               .owner = THIS_MODULE,
+       },
+};
+
+static int __init memory_state_time_init(void)
+{
+       int error;
+
+       hash_init(freq_hash_table);
+       memory_wq = create_singlethread_workqueue("memory_wq");
+       if (!memory_wq) {
+               pr_err("Unable to create workqueue.\n");
+               return -EINVAL;
+       }
+       /*
+        * Create the /sys/kernel/memory_state_time sysfs directory.
+        */
+       memory_kobj = kobject_create_and_add(TAG, kernel_kobj);
+       if (!memory_kobj) {
+               pr_err("Unable to allocate memory_kobj for sysfs directory.\n");
+               error = -ENOMEM;
+               goto wq;
+       }
+       error = sysfs_create_group(memory_kobj, &memory_attr_group);
+       if (error) {
+               pr_err("Unable to create sysfs folder.\n");
+               goto kobj;
+       }
+
+       error = platform_driver_register(&memory_state_time_driver);
+       if (error) {
+               pr_err("Unable to register memory_state_time platform driver.\n");
+               goto group;
+       }
+       return 0;
+
+group: sysfs_remove_group(memory_kobj, &memory_attr_group);
+kobj:  kobject_put(memory_kobj);
+wq:    destroy_workqueue(memory_wq);
+       return error;
+}
+module_init(memory_state_time_init);
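
Editor's note: the module exposes two registration hooks for drivers that know the current DDR frequency and per-master bandwidth. A hedged sketch of a client follows; only memory_state_register_frequency_source(), memory_state_register_bandwidth_source() and the update_call/id members come from the file above, <linux/memory-state-time.h> is assumed to be added elsewhere in this series, and the caller itself is illustrative.

    /* Editor's sketch, not part of the patch. */
    #include <linux/errno.h>
    #include <linux/memory-state-time.h>

    static struct memory_state_update_block *freq_block;
    static struct memory_state_update_block *bw_block;

    static int example_register_sources(void)
    {
            freq_block = memory_state_register_frequency_source();
            bw_block = memory_state_register_bandwidth_source();
            if (!freq_block || !bw_block)
                    return -ENOMEM;
            return 0;
    }

    /* Called by the (hypothetical) DDR driver whenever its state changes. */
    static void example_report(int new_freq, int new_bw)
    {
            if (freq_block)
                    freq_block->update_call(freq_block, new_freq);
            if (bw_block)
                    bw_block->update_call(bw_block, new_bw);
    }
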
diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c
new file mode 100644 (file)
index 0000000..9f5b6d1
--- /dev/null
@@ -0,0 +1,698 @@
+/* drivers/misc/uid_sys_stats.c
+ *
+ * Copyright (C) 2014 - 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/atomic.h>
+#include <linux/err.h>
+#include <linux/hashtable.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/proc_fs.h>
+#include <linux/profile.h>
+#include <linux/rtmutex.h>
+#include <linux/sched/cputime.h>
+#include <linux/seq_file.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+
+#define UID_HASH_BITS  10
+DECLARE_HASHTABLE(hash_table, UID_HASH_BITS);
+
+static DEFINE_RT_MUTEX(uid_lock);
+static struct proc_dir_entry *cpu_parent;
+static struct proc_dir_entry *io_parent;
+static struct proc_dir_entry *proc_parent;
+
+struct io_stats {
+       u64 read_bytes;
+       u64 write_bytes;
+       u64 rchar;
+       u64 wchar;
+       u64 fsync;
+};
+
+#define UID_STATE_FOREGROUND   0
+#define UID_STATE_BACKGROUND   1
+#define UID_STATE_BUCKET_SIZE  2
+
+#define UID_STATE_TOTAL_CURR   2
+#define UID_STATE_TOTAL_LAST   3
+#define UID_STATE_DEAD_TASKS   4
+#define UID_STATE_SIZE         5
+
+#define MAX_TASK_COMM_LEN 256
+
+struct task_entry {
+       char comm[MAX_TASK_COMM_LEN];
+       pid_t pid;
+       struct io_stats io[UID_STATE_SIZE];
+       struct hlist_node hash;
+};
+
+struct uid_entry {
+       uid_t uid;
+       u64 utime;
+       u64 stime;
+       u64 active_utime;
+       u64 active_stime;
+       int state;
+       struct io_stats io[UID_STATE_SIZE];
+       struct hlist_node hash;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+       DECLARE_HASHTABLE(task_entries, UID_HASH_BITS);
+#endif
+};
+
+static u64 compute_write_bytes(struct task_struct *task)
+{
+       if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes)
+               return 0;
+
+       return task->ioac.write_bytes - task->ioac.cancelled_write_bytes;
+}
+
+static void compute_io_bucket_stats(struct io_stats *io_bucket,
+                                       struct io_stats *io_curr,
+                                       struct io_stats *io_last,
+                                       struct io_stats *io_dead)
+{
+       /* Tasks could switch to another UID group, but their io_last in the
+        * previous UID group could still be positive; therefore, do an
+        * overflow check before each update.
+        */
+       int64_t delta;
+
+       delta = io_curr->read_bytes + io_dead->read_bytes -
+               io_last->read_bytes;
+       io_bucket->read_bytes += delta > 0 ? delta : 0;
+       delta = io_curr->write_bytes + io_dead->write_bytes -
+               io_last->write_bytes;
+       io_bucket->write_bytes += delta > 0 ? delta : 0;
+       delta = io_curr->rchar + io_dead->rchar - io_last->rchar;
+       io_bucket->rchar += delta > 0 ? delta : 0;
+       delta = io_curr->wchar + io_dead->wchar - io_last->wchar;
+       io_bucket->wchar += delta > 0 ? delta : 0;
+       delta = io_curr->fsync + io_dead->fsync - io_last->fsync;
+       io_bucket->fsync += delta > 0 ? delta : 0;
+
+       io_last->read_bytes = io_curr->read_bytes;
+       io_last->write_bytes = io_curr->write_bytes;
+       io_last->rchar = io_curr->rchar;
+       io_last->wchar = io_curr->wchar;
+       io_last->fsync = io_curr->fsync;
+
+       memset(io_dead, 0, sizeof(struct io_stats));
+}
+
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+static void get_full_task_comm(struct task_entry *task_entry,
+               struct task_struct *task)
+{
+       int i = 0, offset = 0, len = 0;
+       /* save one byte for terminating null character */
+       int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1;
+       char buf[unused_len];
+       struct mm_struct *mm = task->mm;
+
+       /* fill the first TASK_COMM_LEN bytes with thread name */
+       get_task_comm(task_entry->comm, task);
+       i = strlen(task_entry->comm);
+       while (i < TASK_COMM_LEN)
+               task_entry->comm[i++] = ' ';
+
+       /* next the executable file name */
+       if (mm) {
+               down_read(&mm->mmap_sem);
+               if (mm->exe_file) {
+                       char *pathname = d_path(&mm->exe_file->f_path, buf,
+                                       unused_len);
+
+                       if (!IS_ERR(pathname)) {
+                               len = strlcpy(task_entry->comm + i, pathname,
+                                               unused_len);
+                               i += len;
+                               task_entry->comm[i++] = ' ';
+                               unused_len--;
+                       }
+               }
+               up_read(&mm->mmap_sem);
+       }
+       unused_len -= len;
+
+       /* fill the rest with the command line arguments, replacing each
+        * null or newline character between args in argv with a space */
+       len = get_cmdline(task, buf, unused_len);
+       while (offset < len) {
+               if (buf[offset] != '\0' && buf[offset] != '\n')
+                       task_entry->comm[i++] = buf[offset];
+               else
+                       task_entry->comm[i++] = ' ';
+               offset++;
+       }
+
+       /* get rid of trailing whitespace in case the args were memset to
+        * zero before being rewritten in userspace
+        */
+       while (task_entry->comm[i-1] == ' ')
+               i--;
+       task_entry->comm[i] = '\0';
+}
+
+static struct task_entry *find_task_entry(struct uid_entry *uid_entry,
+               struct task_struct *task)
+{
+       struct task_entry *task_entry;
+
+       hash_for_each_possible(uid_entry->task_entries, task_entry, hash,
+                       task->pid) {
+               if (task->pid == task_entry->pid) {
+                       /* if thread name changed, update the entire command */
+                       int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN)
+                               - task_entry->comm;
+
+                       if (strncmp(task_entry->comm, task->comm, len))
+                               get_full_task_comm(task_entry, task);
+                       return task_entry;
+               }
+       }
+       return NULL;
+}
+
+static struct task_entry *find_or_register_task(struct uid_entry *uid_entry,
+               struct task_struct *task)
+{
+       struct task_entry *task_entry;
+       pid_t pid = task->pid;
+
+       task_entry = find_task_entry(uid_entry, task);
+       if (task_entry)
+               return task_entry;
+
+       task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC);
+       if (!task_entry)
+               return NULL;
+
+       get_full_task_comm(task_entry, task);
+
+       task_entry->pid = pid;
+       hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid);
+
+       return task_entry;
+}
+
+static void remove_uid_tasks(struct uid_entry *uid_entry)
+{
+       struct task_entry *task_entry;
+       unsigned long bkt_task;
+       struct hlist_node *tmp_task;
+
+       hash_for_each_safe(uid_entry->task_entries, bkt_task,
+                       tmp_task, task_entry, hash) {
+               hash_del(&task_entry->hash);
+               kfree(task_entry);
+       }
+}
+
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry)
+{
+       struct task_entry *task_entry;
+       unsigned long bkt_task;
+
+       hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+               memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0,
+                       sizeof(struct io_stats));
+       }
+}
+
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+               struct task_struct *task, int slot)
+{
+       struct task_entry *task_entry = find_or_register_task(uid_entry, task);
+       struct io_stats *task_io_slot = &task_entry->io[slot];
+
+       task_io_slot->read_bytes += task->ioac.read_bytes;
+       task_io_slot->write_bytes += compute_write_bytes(task);
+       task_io_slot->rchar += task->ioac.rchar;
+       task_io_slot->wchar += task->ioac.wchar;
+       task_io_slot->fsync += task->ioac.syscfs;
+}
+
+static void compute_io_uid_tasks(struct uid_entry *uid_entry)
+{
+       struct task_entry *task_entry;
+       unsigned long bkt_task;
+
+       hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+               compute_io_bucket_stats(&task_entry->io[uid_entry->state],
+                                       &task_entry->io[UID_STATE_TOTAL_CURR],
+                                       &task_entry->io[UID_STATE_TOTAL_LAST],
+                                       &task_entry->io[UID_STATE_DEAD_TASKS]);
+       }
+}
+
+static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry)
+{
+       struct task_entry *task_entry;
+       unsigned long bkt_task;
+
+       hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) {
+               /* Separated by comma because space exists in task comm */
+               seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n",
+                               task_entry->comm,
+                               (unsigned long)task_entry->pid,
+                               task_entry->io[UID_STATE_FOREGROUND].rchar,
+                               task_entry->io[UID_STATE_FOREGROUND].wchar,
+                               task_entry->io[UID_STATE_FOREGROUND].read_bytes,
+                               task_entry->io[UID_STATE_FOREGROUND].write_bytes,
+                               task_entry->io[UID_STATE_BACKGROUND].rchar,
+                               task_entry->io[UID_STATE_BACKGROUND].wchar,
+                               task_entry->io[UID_STATE_BACKGROUND].read_bytes,
+                               task_entry->io[UID_STATE_BACKGROUND].write_bytes,
+                               task_entry->io[UID_STATE_FOREGROUND].fsync,
+                               task_entry->io[UID_STATE_BACKGROUND].fsync);
+       }
+}
+#else
+static void remove_uid_tasks(struct uid_entry *uid_entry) {};
+static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {};
+static void add_uid_tasks_io_stats(struct uid_entry *uid_entry,
+               struct task_struct *task, int slot) {};
+static void compute_io_uid_tasks(struct uid_entry *uid_entry) {};
+static void show_io_uid_tasks(struct seq_file *m,
+               struct uid_entry *uid_entry) {}
+#endif
+
+static struct uid_entry *find_uid_entry(uid_t uid)
+{
+       struct uid_entry *uid_entry;
+       hash_for_each_possible(hash_table, uid_entry, hash, uid) {
+               if (uid_entry->uid == uid)
+                       return uid_entry;
+       }
+       return NULL;
+}
+
+static struct uid_entry *find_or_register_uid(uid_t uid)
+{
+       struct uid_entry *uid_entry;
+
+       uid_entry = find_uid_entry(uid);
+       if (uid_entry)
+               return uid_entry;
+
+       uid_entry = kzalloc(sizeof(struct uid_entry), GFP_ATOMIC);
+       if (!uid_entry)
+               return NULL;
+
+       uid_entry->uid = uid;
+#ifdef CONFIG_UID_SYS_STATS_DEBUG
+       hash_init(uid_entry->task_entries);
+#endif
+       hash_add(hash_table, &uid_entry->hash, uid);
+
+       return uid_entry;
+}
+
+static int uid_cputime_show(struct seq_file *m, void *v)
+{
+       struct uid_entry *uid_entry = NULL;
+       struct task_struct *task, *temp;
+       struct user_namespace *user_ns = current_user_ns();
+       u64 utime;
+       u64 stime;
+       unsigned long bkt;
+       uid_t uid;
+
+       rt_mutex_lock(&uid_lock);
+
+       hash_for_each(hash_table, bkt, uid_entry, hash) {
+               uid_entry->active_stime = 0;
+               uid_entry->active_utime = 0;
+       }
+
+       read_lock(&tasklist_lock);
+       do_each_thread(temp, task) {
+               uid = from_kuid_munged(user_ns, task_uid(task));
+               if (!uid_entry || uid_entry->uid != uid)
+                       uid_entry = find_or_register_uid(uid);
+               if (!uid_entry) {
+                       read_unlock(&tasklist_lock);
+                       rt_mutex_unlock(&uid_lock);
+                       pr_err("%s: failed to find the uid_entry for uid %d\n",
+                               __func__, uid);
+                       return -ENOMEM;
+               }
+               task_cputime_adjusted(task, &utime, &stime);
+               uid_entry->active_utime += utime;
+               uid_entry->active_stime += stime;
+       } while_each_thread(temp, task);
+       read_unlock(&tasklist_lock);
+
+       hash_for_each(hash_table, bkt, uid_entry, hash) {
+               u64 total_utime = uid_entry->utime +
+                                                       uid_entry->active_utime;
+               u64 total_stime = uid_entry->stime +
+                                                       uid_entry->active_stime;
+               seq_printf(m, "%d: %llu %llu\n", uid_entry->uid,
+                       ktime_to_ms(total_utime), ktime_to_ms(total_stime));
+       }
+
+       rt_mutex_unlock(&uid_lock);
+       return 0;
+}
+
+static int uid_cputime_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, uid_cputime_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_cputime_fops = {
+       .open           = uid_cputime_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int uid_remove_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_remove_write(struct file *file,
+                       const char __user *buffer, size_t count, loff_t *ppos)
+{
+       struct uid_entry *uid_entry;
+       struct hlist_node *tmp;
+       char uids[128];
+       char *start_uid, *end_uid = NULL;
+       long int uid_start = 0, uid_end = 0;
+
+       if (count >= sizeof(uids))
+               count = sizeof(uids) - 1;
+
+       if (copy_from_user(uids, buffer, count))
+               return -EFAULT;
+
+       uids[count] = '\0';
+       end_uid = uids;
+       start_uid = strsep(&end_uid, "-");
+
+       if (!start_uid || !end_uid)
+               return -EINVAL;
+
+       if (kstrtol(start_uid, 10, &uid_start) != 0 ||
+               kstrtol(end_uid, 10, &uid_end) != 0) {
+               return -EINVAL;
+       }
+       rt_mutex_lock(&uid_lock);
+
+       for (; uid_start <= uid_end; uid_start++) {
+               hash_for_each_possible_safe(hash_table, uid_entry, tmp,
+                                                       hash, (uid_t)uid_start) {
+                       if (uid_start == uid_entry->uid) {
+                               remove_uid_tasks(uid_entry);
+                               hash_del(&uid_entry->hash);
+                               kfree(uid_entry);
+                       }
+               }
+       }
+
+       rt_mutex_unlock(&uid_lock);
+       return count;
+}
+
+static const struct file_operations uid_remove_fops = {
+       .open           = uid_remove_open,
+       .release        = single_release,
+       .write          = uid_remove_write,
+};
+
+
+static void add_uid_io_stats(struct uid_entry *uid_entry,
+                       struct task_struct *task, int slot)
+{
+       struct io_stats *io_slot = &uid_entry->io[slot];
+
+       io_slot->read_bytes += task->ioac.read_bytes;
+       io_slot->write_bytes += compute_write_bytes(task);
+       io_slot->rchar += task->ioac.rchar;
+       io_slot->wchar += task->ioac.wchar;
+       io_slot->fsync += task->ioac.syscfs;
+
+       add_uid_tasks_io_stats(uid_entry, task, slot);
+}
+
+static void update_io_stats_all_locked(void)
+{
+       struct uid_entry *uid_entry = NULL;
+       struct task_struct *task, *temp;
+       struct user_namespace *user_ns = current_user_ns();
+       unsigned long bkt;
+       uid_t uid;
+
+       hash_for_each(hash_table, bkt, uid_entry, hash) {
+               memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+                       sizeof(struct io_stats));
+               set_io_uid_tasks_zero(uid_entry);
+       }
+
+       rcu_read_lock();
+       do_each_thread(temp, task) {
+               uid = from_kuid_munged(user_ns, task_uid(task));
+               if (!uid_entry || uid_entry->uid != uid)
+                       uid_entry = find_or_register_uid(uid);
+               if (!uid_entry)
+                       continue;
+               add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+       } while_each_thread(temp, task);
+       rcu_read_unlock();
+
+       hash_for_each(hash_table, bkt, uid_entry, hash) {
+               compute_io_bucket_stats(&uid_entry->io[uid_entry->state],
+                                       &uid_entry->io[UID_STATE_TOTAL_CURR],
+                                       &uid_entry->io[UID_STATE_TOTAL_LAST],
+                                       &uid_entry->io[UID_STATE_DEAD_TASKS]);
+               compute_io_uid_tasks(uid_entry);
+       }
+}
+
+static void update_io_stats_uid_locked(struct uid_entry *uid_entry)
+{
+       struct task_struct *task, *temp;
+       struct user_namespace *user_ns = current_user_ns();
+
+       memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0,
+               sizeof(struct io_stats));
+       set_io_uid_tasks_zero(uid_entry);
+
+       rcu_read_lock();
+       do_each_thread(temp, task) {
+               if (from_kuid_munged(user_ns, task_uid(task)) != uid_entry->uid)
+                       continue;
+               add_uid_io_stats(uid_entry, task, UID_STATE_TOTAL_CURR);
+       } while_each_thread(temp, task);
+       rcu_read_unlock();
+
+       compute_io_bucket_stats(&uid_entry->io[uid_entry->state],
+                               &uid_entry->io[UID_STATE_TOTAL_CURR],
+                               &uid_entry->io[UID_STATE_TOTAL_LAST],
+                               &uid_entry->io[UID_STATE_DEAD_TASKS]);
+       compute_io_uid_tasks(uid_entry);
+}
+
+
+static int uid_io_show(struct seq_file *m, void *v)
+{
+       struct uid_entry *uid_entry;
+       unsigned long bkt;
+
+       rt_mutex_lock(&uid_lock);
+
+       update_io_stats_all_locked();
+
+       hash_for_each(hash_table, bkt, uid_entry, hash) {
+               seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                               uid_entry->uid,
+                               uid_entry->io[UID_STATE_FOREGROUND].rchar,
+                               uid_entry->io[UID_STATE_FOREGROUND].wchar,
+                               uid_entry->io[UID_STATE_FOREGROUND].read_bytes,
+                               uid_entry->io[UID_STATE_FOREGROUND].write_bytes,
+                               uid_entry->io[UID_STATE_BACKGROUND].rchar,
+                               uid_entry->io[UID_STATE_BACKGROUND].wchar,
+                               uid_entry->io[UID_STATE_BACKGROUND].read_bytes,
+                               uid_entry->io[UID_STATE_BACKGROUND].write_bytes,
+                               uid_entry->io[UID_STATE_FOREGROUND].fsync,
+                               uid_entry->io[UID_STATE_BACKGROUND].fsync);
+
+               show_io_uid_tasks(m, uid_entry);
+       }
+
+       rt_mutex_unlock(&uid_lock);
+       return 0;
+}
+
+static int uid_io_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, uid_io_show, PDE_DATA(inode));
+}
+
+static const struct file_operations uid_io_fops = {
+       .open           = uid_io_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = single_release,
+};
+
+static int uid_procstat_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, NULL, NULL);
+}
+
+static ssize_t uid_procstat_write(struct file *file,
+                       const char __user *buffer, size_t count, loff_t *ppos)
+{
+       struct uid_entry *uid_entry;
+       uid_t uid;
+       int argc, state;
+       char input[128];
+
+       if (count >= sizeof(input))
+               return -EINVAL;
+
+       if (copy_from_user(input, buffer, count))
+               return -EFAULT;
+
+       input[count] = '\0';
+
+       argc = sscanf(input, "%u %d", &uid, &state);
+       if (argc != 2)
+               return -EINVAL;
+
+       if (state != UID_STATE_BACKGROUND && state != UID_STATE_FOREGROUND)
+               return -EINVAL;
+
+       rt_mutex_lock(&uid_lock);
+
+       uid_entry = find_or_register_uid(uid);
+       if (!uid_entry) {
+               rt_mutex_unlock(&uid_lock);
+               return -EINVAL;
+       }
+
+       if (uid_entry->state == state) {
+               rt_mutex_unlock(&uid_lock);
+               return count;
+       }
+
+       update_io_stats_uid_locked(uid_entry);
+
+       uid_entry->state = state;
+
+       rt_mutex_unlock(&uid_lock);
+
+       return count;
+}
+
+static const struct file_operations uid_procstat_fops = {
+       .open           = uid_procstat_open,
+       .release        = single_release,
+       .write          = uid_procstat_write,
+};
+
+static int process_notifier(struct notifier_block *self,
+                       unsigned long cmd, void *v)
+{
+       struct task_struct *task = v;
+       struct uid_entry *uid_entry;
+       u64 utime, stime;
+       uid_t uid;
+
+       if (!task)
+               return NOTIFY_OK;
+
+       rt_mutex_lock(&uid_lock);
+       uid = from_kuid_munged(current_user_ns(), task_uid(task));
+       uid_entry = find_or_register_uid(uid);
+       if (!uid_entry) {
+               pr_err("%s: failed to find uid %d\n", __func__, uid);
+               goto exit;
+       }
+
+       task_cputime_adjusted(task, &utime, &stime);
+       uid_entry->utime += utime;
+       uid_entry->stime += stime;
+
+       add_uid_io_stats(uid_entry, task, UID_STATE_DEAD_TASKS);
+
+exit:
+       rt_mutex_unlock(&uid_lock);
+       return NOTIFY_OK;
+}
+
+static struct notifier_block process_notifier_block = {
+       .notifier_call  = process_notifier,
+};
+
+static int __init proc_uid_sys_stats_init(void)
+{
+       hash_init(hash_table);
+
+       cpu_parent = proc_mkdir("uid_cputime", NULL);
+       if (!cpu_parent) {
+               pr_err("%s: failed to create uid_cputime proc entry\n",
+                       __func__);
+               goto err;
+       }
+
+       proc_create_data("remove_uid_range", 0222, cpu_parent,
+               &uid_remove_fops, NULL);
+       proc_create_data("show_uid_stat", 0444, cpu_parent,
+               &uid_cputime_fops, NULL);
+
+       io_parent = proc_mkdir("uid_io", NULL);
+       if (!io_parent) {
+               pr_err("%s: failed to create uid_io proc entry\n",
+                       __func__);
+               goto err;
+       }
+
+       proc_create_data("stats", 0444, io_parent,
+               &uid_io_fops, NULL);
+
+       proc_parent = proc_mkdir("uid_procstat", NULL);
+       if (!proc_parent) {
+               pr_err("%s: failed to create uid_procstat proc entry\n",
+                       __func__);
+               goto err;
+       }
+
+       proc_create_data("set", 0222, proc_parent,
+               &uid_procstat_fops, NULL);
+
+       profile_event_register(PROFILE_TASK_EXIT, &process_notifier_block);
+
+       return 0;
+
+err:
+       remove_proc_subtree("uid_cputime", NULL);
+       remove_proc_subtree("uid_io", NULL);
+       remove_proc_subtree("uid_procstat", NULL);
+       return -ENOMEM;
+}
+
+early_initcall(proc_uid_sys_stats_init);
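
Editor's note: from userspace the interface is three procfs directories: /proc/uid_cputime (show_uid_stat, remove_uid_range), /proc/uid_io/stats, and /proc/uid_procstat/set, where a write of the form "<uid> <state>" flips a UID between foreground (0) and background (1). A small usage sketch follows; the UID value is illustrative.

    /* Editor's sketch, not part of the patch: userspace usage. */
    #include <stdio.h>

    int main(void)
    {
            FILE *set = fopen("/proc/uid_procstat/set", "w");
            FILE *io;
            char line[512];

            if (set) {
                    /* "<uid> <state>": 0 = foreground, 1 = background */
                    fprintf(set, "10057 0\n");
                    fclose(set);
            }

            io = fopen("/proc/uid_io/stats", "r");
            if (!io)
                    return 1;
            while (fgets(line, sizeof(line), io))
                    fputs(line, stdout);
            fclose(io);
            return 0;
    }
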
index 42e89060cd41ecb766dbc3c4b2c98b6037decae1..b3faf1062d4eb5fb82e8aa1e2f7c749f4e463c64 100644 (file)
@@ -80,3 +80,17 @@ config MMC_TEST
          This driver is only of interest to those developing or
          testing a host driver. Most people should say N here.
 
+config MMC_EMBEDDED_SDIO
+       boolean "MMC embedded SDIO device support"
+       help
+         If you say Y here, support will be added for embedded SDIO
+         devices which do not contain the necessary enumeration
+         support in hardware to be properly detected.
+
+config MMC_PARANOID_SD_INIT
+       bool "Enable paranoid SD card initialization"
+       help
+         If you say Y here, the MMC layer will be extra paranoid
+         about re-trying SD init requests. This can be a useful
+         work-around for buggy controllers and hardware. Enable
+         if you are experiencing issues with SD detection.
index 66c9cf49ad2f11fe59de05eae716b6b797766cc4..41ac8292192268313a773e8d1041f11c7709fc9f 100644 (file)
@@ -2802,6 +2802,22 @@ void mmc_init_context_info(struct mmc_host *host)
        init_waitqueue_head(&host->context_info.wait);
 }
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+void mmc_set_embedded_sdio_data(struct mmc_host *host,
+                               struct sdio_cis *cis,
+                               struct sdio_cccr *cccr,
+                               struct sdio_embedded_func *funcs,
+                               int num_funcs)
+{
+       host->embedded_sdio_data.cis = cis;
+       host->embedded_sdio_data.cccr = cccr;
+       host->embedded_sdio_data.funcs = funcs;
+       host->embedded_sdio_data.num_funcs = num_funcs;
+}
+
+EXPORT_SYMBOL(mmc_set_embedded_sdio_data);
+#endif
+
 static int __init mmc_init(void)
 {
        int ret;
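
Editor's note: mmc_set_embedded_sdio_data() lets board or platform glue hand the core pre-baked CCCR/CIS data for a soldered-down SDIO device, which the attach path changed later in this patch then uses instead of reading the card. A hedged sketch of such glue follows; the IDs and block sizes are illustrative, and struct sdio_embedded_func with its f_class/f_maxblksize members is assumed to be declared in <linux/mmc/sdio_func.h> by this series.

    /* Editor's sketch, not part of the patch: board glue for an embedded
     * SDIO Wi-Fi chip. */
    #include <linux/kernel.h>
    #include <linux/mmc/card.h>
    #include <linux/mmc/host.h>
    #include <linux/mmc/sdio_func.h>
    #include <linux/mmc/sdio_ids.h>

    static struct sdio_cis example_cis = {
            .vendor         = 0x02d0,       /* illustrative vendor/device IDs */
            .device         = 0x4339,
            .blksize        = 512,
            .max_dtr        = 50000000,
    };

    static struct sdio_cccr example_cccr = {
            .multi_block    = 1,
            .wide_bus       = 1,
    };

    static struct sdio_embedded_func example_funcs[] = {
            { .f_class = SDIO_CLASS_WLAN, .f_maxblksize = 512 },
    };

    static void example_board_setup(struct mmc_host *host)
    {
            mmc_set_embedded_sdio_data(host, &example_cis, &example_cccr,
                                       example_funcs, ARRAY_SIZE(example_funcs));
    }
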
index ad88deb2e8f3b046838a3539841717951c33a6c6..f469254bf78a7e2b6a32c449475771ca8a84ee3a 100644 (file)
@@ -429,7 +429,8 @@ int mmc_add_host(struct mmc_host *host)
 #endif
 
        mmc_start_host(host);
-       mmc_register_pm_notifier(host);
+       if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+               mmc_register_pm_notifier(host);
 
        return 0;
 }
@@ -446,7 +447,8 @@ EXPORT_SYMBOL(mmc_add_host);
  */
 void mmc_remove_host(struct mmc_host *host)
 {
-       mmc_unregister_pm_notifier(host);
+       if (!(host->pm_flags & MMC_PM_IGNORE_PM_NOTIFY))
+               mmc_unregister_pm_notifier(host);
        mmc_stop_host(host);
 
 #ifdef CONFIG_DEBUG_FS
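
Editor's note: with this change the core only registers its PM notifier when the host has not asked to be left alone. A hedged sketch of a host driver opting out follows; MMC_PM_IGNORE_PM_NOTIFY itself is assumed to be defined in <linux/mmc/pm.h> elsewhere in this series, and the probe function is illustrative.

    /* Editor's sketch, not part of the patch. */
    #include <linux/module.h>
    #include <linux/platform_device.h>
    #include <linux/mmc/host.h>
    #include <linux/mmc/pm.h>

    static int example_host_probe(struct platform_device *pdev)
    {
            struct mmc_host *mmc = mmc_alloc_host(0, &pdev->dev);

            if (!mmc)
                    return -ENOMEM;

            /* leave suspend/resume of the always-powered embedded card to the
             * SDIO function driver; the core check above looks at pm_flags */
            mmc->pm_flags |= MMC_PM_IGNORE_PM_NOTIFY;

            return mmc_add_host(mmc);
    }
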
index bad5c1bf4ed9f87975f2485810e722d85dbbe7c7..13871e2bcb0ae83fa55cb438b69d4159bae44c3c 100644 (file)
@@ -780,6 +780,7 @@ MMC_DEV_ATTR(manfid, "0x%06x\n", card->cid.manfid);
 MMC_DEV_ATTR(name, "%s\n", card->cid.prod_name);
 MMC_DEV_ATTR(oemid, "0x%04x\n", card->cid.oemid);
 MMC_DEV_ATTR(prv, "0x%x\n", card->cid.prv);
+MMC_DEV_ATTR(rev, "0x%x\n", card->ext_csd.rev);
 MMC_DEV_ATTR(pre_eol_info, "0x%02x\n", card->ext_csd.pre_eol_info);
 MMC_DEV_ATTR(life_time, "0x%02x 0x%02x\n",
        card->ext_csd.device_life_time_est_typ_a,
@@ -838,6 +839,7 @@ static struct attribute *mmc_std_attrs[] = {
        &dev_attr_name.attr,
        &dev_attr_oemid.attr,
        &dev_attr_prv.attr,
+       &dev_attr_rev.attr,
        &dev_attr_pre_eol_info.attr,
        &dev_attr_life_time.attr,
        &dev_attr_serial.attr,
index 0a4e77a5ba33fe7e0009ab1f7f97078a38e1559c..8c4721f1b4bc246f315f214e7add275c4303dc40 100644 (file)
@@ -17,6 +17,8 @@
 
 #include <linux/mmc/card.h>
 #include <linux/mmc/host.h>
+#include <linux/sched/rt.h>
+#include <uapi/linux/sched/types.h>
 
 #include "queue.h"
 #include "block.h"
@@ -43,6 +45,11 @@ static int mmc_queue_thread(void *d)
        struct mmc_queue *mq = d;
        struct request_queue *q = mq->queue;
        struct mmc_context_info *cntx = &mq->card->host->context_info;
+       struct sched_param scheduler_params = {0};
+
+       scheduler_params.sched_priority = 1;
+
+       sched_setscheduler(current, SCHED_FIFO, &scheduler_params);
 
        current->flags |= PF_MEMALLOC;
 
index eb9de21349679914b0bd0b3de068c2685080217b..4bbdfa382e84931c1ea505100ba18eb73b4ff4f8 100644 (file)
@@ -834,6 +834,9 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
        bool reinit)
 {
        int err;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       int retries;
+#endif
 
        if (!reinit) {
                /*
@@ -860,7 +863,26 @@ int mmc_sd_setup_card(struct mmc_host *host, struct mmc_card *card,
                /*
                 * Fetch switch information from card.
                 */
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+               for (retries = 1; retries <= 3; retries++) {
+                       err = mmc_read_switch(card);
+                       if (!err) {
+                               if (retries > 1) {
+                                       printk(KERN_WARNING
+                                              "%s: recovered\n",
+                                              mmc_hostname(host));
+                               }
+                               break;
+                       } else {
+                               printk(KERN_WARNING
+                                      "%s: read switch failed (attempt %d)\n",
+                                      mmc_hostname(host), retries);
+                       }
+               }
+#else
                err = mmc_read_switch(card);
+#endif
+
                if (err)
                        return err;
        }
@@ -1054,14 +1076,33 @@ static int mmc_sd_alive(struct mmc_host *host)
  */
 static void mmc_sd_detect(struct mmc_host *host)
 {
-       int err;
+       int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       int retries = 5;
+#endif
 
        mmc_get_card(host->card);
 
        /*
         * Just check if our card has been removed.
         */
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       while (retries) {
+               err = mmc_send_status(host->card, NULL);
+               if (err) {
+                       retries--;
+                       udelay(5);
+                       continue;
+               }
+               break;
+       }
+       if (!retries) {
+               printk(KERN_ERR "%s(%s): Unable to re-detect card (%d)\n",
+                      __func__, mmc_hostname(host), err);
+       }
+#else
        err = _mmc_detect_card_removed(host);
+#endif
 
        mmc_put_card(host->card);
 
@@ -1120,6 +1161,9 @@ static int mmc_sd_suspend(struct mmc_host *host)
 static int _mmc_sd_resume(struct mmc_host *host)
 {
        int err = 0;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       int retries;
+#endif
 
        mmc_claim_host(host);
 
@@ -1127,7 +1171,23 @@ static int _mmc_sd_resume(struct mmc_host *host)
                goto out;
 
        mmc_power_up(host, host->card->ocr);
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       retries = 5;
+       while (retries) {
+               err = mmc_sd_init_card(host, host->card->ocr, host->card);
+
+               if (err) {
+                       printk(KERN_ERR "%s: Re-init card rc = %d (retries = %d)\n",
+                              mmc_hostname(host), err, retries);
+                       mdelay(5);
+                       retries--;
+                       continue;
+               }
+               break;
+       }
+#else
        err = mmc_sd_init_card(host, host->card->ocr, host->card);
+#endif
        mmc_card_clr_suspended(host->card);
 
 out:
@@ -1202,6 +1262,9 @@ int mmc_attach_sd(struct mmc_host *host)
 {
        int err;
        u32 ocr, rocr;
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       int retries;
+#endif
 
        WARN_ON(!host->claimed);
 
@@ -1237,9 +1300,27 @@ int mmc_attach_sd(struct mmc_host *host)
        /*
         * Detect and init the card.
         */
+#ifdef CONFIG_MMC_PARANOID_SD_INIT
+       retries = 5;
+       while (retries) {
+               err = mmc_sd_init_card(host, rocr, NULL);
+               if (err) {
+                       retries--;
+                       continue;
+               }
+               break;
+       }
+
+       if (!retries) {
+               printk(KERN_ERR "%s: mmc_sd_init_card() failure (err = %d)\n",
+                      mmc_hostname(host), err);
+               goto err;
+       }
+#else
        err = mmc_sd_init_card(host, rocr, NULL);
        if (err)
                goto err;
+#endif
 
        mmc_release_host(host);
        err = mmc_add_card(host->card);
index cc43687ca241918fe40d8511eab5fe6ecb848d52..c42e3cd537f288c477eaa48a705659cda6694024 100644 (file)
 #include "sdio_ops.h"
 #include "sdio_cis.h"
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/sdio_ids.h>
+#endif
+
 static int sdio_read_fbr(struct sdio_func *func)
 {
        int ret;
@@ -706,28 +710,44 @@ try_again:
                goto finish;
        }
 
-       /*
-        * Read the common registers. Note that we should try to
-        * validate whether UHS would work or not.
-        */
-       err = sdio_read_cccr(card, ocr);
-       if (err) {
-               mmc_sdio_resend_if_cond(host, card);
-               if (ocr & R4_18V_PRESENT) {
-                       /* Retry init sequence, but without R4_18V_PRESENT. */
-                       retries = 0;
-                       goto try_again;
-               } else {
-                       goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       if (host->embedded_sdio_data.cccr)
+               memcpy(&card->cccr, host->embedded_sdio_data.cccr, sizeof(struct sdio_cccr));
+       else {
+#endif
+               /*
+                * Read the common registers. Note that we should try to
+                * validate whether UHS would work or not.
+                */
+               err = sdio_read_cccr(card, ocr);
+               if (err) {
+                       mmc_sdio_resend_if_cond(host, card);
+                       if (ocr & R4_18V_PRESENT) {
+                               /* Retry init sequence, but without R4_18V_PRESENT. */
+                               retries = 0;
+                               goto try_again;
+                       } else {
+                               goto remove;
+                       }
                }
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
        }
+#endif
 
-       /*
-        * Read the common CIS tuples.
-        */
-       err = sdio_read_common_cis(card);
-       if (err)
-               goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       if (host->embedded_sdio_data.cis)
+               memcpy(&card->cis, host->embedded_sdio_data.cis, sizeof(struct sdio_cis));
+       else {
+#endif
+               /*
+                * Read the common CIS tuples.
+                */
+               err = sdio_read_common_cis(card);
+               if (err)
+                       goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       }
+#endif
 
        if (oldcard) {
                int same = (card->cis.vendor == oldcard->cis.vendor &&
@@ -1129,14 +1149,36 @@ int mmc_attach_sdio(struct mmc_host *host)
        funcs = (ocr & 0x70000000) >> 28;
        card->sdio_funcs = 0;
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       if (host->embedded_sdio_data.funcs)
+               card->sdio_funcs = funcs = host->embedded_sdio_data.num_funcs;
+#endif
+
        /*
         * Initialize (but don't add) all present functions.
         */
        for (i = 0; i < funcs; i++, card->sdio_funcs++) {
-               err = sdio_init_func(host->card, i + 1);
-               if (err)
-                       goto remove;
-
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+               if (host->embedded_sdio_data.funcs) {
+                       struct sdio_func *tmp;
+
+                       tmp = sdio_alloc_func(host->card);
+                       if (IS_ERR(tmp))
+                               goto remove;
+                       tmp->num = (i + 1);
+                       card->sdio_func[i] = tmp;
+                       tmp->class = host->embedded_sdio_data.funcs[i].f_class;
+                       tmp->max_blksize = host->embedded_sdio_data.funcs[i].f_maxblksize;
+                       tmp->vendor = card->cis.vendor;
+                       tmp->device = card->cis.device;
+               } else {
+#endif
+                       err = sdio_init_func(host->card, i + 1);
+                       if (err)
+                               goto remove;
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+               }
+#endif
                /*
                 * Enable Runtime PM for this func (if supported)
                 */
index 2b32b88949ba40dfb3901cf9588f670b7aad8441..997d556fccf1ca5675317ad6955ea146a3b21003 100644 (file)
 #include "sdio_cis.h"
 #include "sdio_bus.h"
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+#include <linux/mmc/host.h>
+#endif
+
 #define to_sdio_driver(d)      container_of(d, struct sdio_driver, drv)
 
 /* show configuration fields */
@@ -264,7 +268,14 @@ static void sdio_release_func(struct device *dev)
 {
        struct sdio_func *func = dev_to_sdio_func(dev);
 
-       sdio_free_func_cis(func);
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       /*
+        * If this device is embedded then we never allocated
+        * cis tables for this func
+        */
+       if (!func->card->host->embedded_sdio_data.funcs)
+#endif
+               sdio_free_func_cis(func);
 
        kfree(func->info);
        kfree(func->tmpbuf);
index fa51b7b0e9eaba067f27f4195d0e93d05791f545..f7bfce50ba9e6055b7543eb4246165662a3d152d 100644 (file)
@@ -2276,6 +2276,12 @@ static long __tun_chr_ioctl(struct file *file, unsigned int cmd,
        int le;
        int ret;
 
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+       if (cmd != TUNGETIFF && !capable(CAP_NET_ADMIN)) {
+               return -EPERM;
+       }
+#endif
+
        if (cmd == TUNSETIFF || cmd == TUNSETQUEUE || _IOC_TYPE(cmd) == SOCK_IOC_TYPE) {
                if (copy_from_user(&ifr, argp, ifreq_len))
                        return -EFAULT;
index 58898b99d3f74ada971d9f614733b61d13dc69bd..145e10a8be5540b23b014351187367c24bfcf415 100644 (file)
@@ -549,6 +549,11 @@ static int wl12xx_init_ap_role(struct wl1271 *wl, struct wl12xx_vif *wlvif)
 {
        int ret;
 
+       /* Disable filtering */
+       ret = wl1271_acx_group_address_tbl(wl, wlvif, false, NULL, 0);
+       if (ret < 0)
+               return ret;
+
        ret = wl1271_acx_ap_max_tx_retry(wl, wlvif);
        if (ret < 0)
                return ret;
index c4da50e07bbcbe2318a27f55c5e49baf5f3c077b..08a4f82a296533ce086630c105efa2f049a246e4 100644 (file)
@@ -176,6 +176,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb)
                /* Packet that contains a length */
                if (tmp[0] == 0 && tmp[1] == 0) {
                        phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3;
+                       /*
+                        * Ensure next_read_size does not exceed sizeof(tmp)
+                        * for reading that many bytes during next iteration
+                        */
+                       if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) {
+                               dev_dbg(&client->dev, "%s: corrupted packet\n",
+                                       __func__);
+                               phy->next_read_size = 5;
+                               goto flush;
+                       }
                } else {
                        phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD;
 
index fd08be2917e60ab6d4adc5254e3df2090ae6f9f5..3420c5104c9432c82f11ae0bf77e802bf166b107 100644 (file)
@@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev,
 
        atr_req = (struct st21nfca_atr_req *)skb->data;
 
-       if (atr_req->length < sizeof(struct st21nfca_atr_req)) {
+       if (atr_req->length < sizeof(struct st21nfca_atr_req) ||
+           atr_req->length > skb->len) {
                r = -EPROTO;
                goto exit;
        }
index 3a98563d4a121ddc99223ca367ee44a51f8fd92b..6e84e120150ddad98f77f3af3a42b194e13bf37f 100644 (file)
@@ -320,23 +320,33 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host,
                 * AID          81      5 to 16
                 * PARAMETERS   82      0 to 255
                 */
-               if (skb->len < NFC_MIN_AID_LENGTH + 2 &&
+               if (skb->len < NFC_MIN_AID_LENGTH + 2 ||
                    skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG)
                        return -EPROTO;
 
+               /*
+                * Buffer should have enough space for at least
+                * two tag fields + two length fields + aid_len (skb->data[1])
+                */
+               if (skb->len < skb->data[1] + 4)
+                       return -EPROTO;
+
                transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev,
                                                   skb->len - 2, GFP_KERNEL);
 
                transaction->aid_len = skb->data[1];
                memcpy(transaction->aid, &skb->data[2],
                       transaction->aid_len);
+               transaction->params_len = skb->data[transaction->aid_len + 3];
 
-               /* Check next byte is PARAMETERS tag (82) */
+               /* Check next byte is PARAMETERS tag (82) and the length field */
                if (skb->data[transaction->aid_len + 2] !=
-                   NFC_EVT_TRANSACTION_PARAMS_TAG)
+                   NFC_EVT_TRANSACTION_PARAMS_TAG ||
+                   skb->len < transaction->aid_len + transaction->params_len + 4) {
+                       devm_kfree(dev, transaction);
                        return -EPROTO;
+               }
 
-               transaction->params_len = skb->data[transaction->aid_len + 3];
                memcpy(transaction->params, skb->data +
                       transaction->aid_len + 4, transaction->params_len);
 
index ce30c9a588a41d87841289204aeacefc1b7f3cd2..179cbc5929b8b9be96c9c3e2a653f05cf29e51dc 100644 (file)
@@ -1109,42 +1109,66 @@ int __init early_init_dt_scan_memory(unsigned long node, const char *uname,
        return 0;
 }
 
+/*
+ * Convert configs to something easy to use in C code
+ */
+#if defined(CONFIG_CMDLINE_FORCE)
+static const int overwrite_incoming_cmdline = 1;
+static const int read_dt_cmdline;
+static const int concat_cmdline;
+#elif defined(CONFIG_CMDLINE_EXTEND)
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline = 1;
+#else /* CMDLINE_FROM_BOOTLOADER */
+static const int overwrite_incoming_cmdline;
+static const int read_dt_cmdline = 1;
+static const int concat_cmdline;
+#endif
+
+#ifdef CONFIG_CMDLINE
+static const char *config_cmdline = CONFIG_CMDLINE;
+#else
+static const char *config_cmdline = "";
+#endif
+
 int __init early_init_dt_scan_chosen(unsigned long node, const char *uname,
                                     int depth, void *data)
 {
-       int l;
-       const char *p;
+       int l = 0;
+       const char *p = NULL;
+       char *cmdline = data;
 
        pr_debug("search \"chosen\", depth: %d, uname: %s\n", depth, uname);
 
-       if (depth != 1 || !data ||
+       if (depth != 1 || !cmdline ||
            (strcmp(uname, "chosen") != 0 && strcmp(uname, "chosen@0") != 0))
                return 0;
 
        early_init_dt_check_for_initrd(node);
 
-       /* Retrieve command line */
-       p = of_get_flat_dt_prop(node, "bootargs", &l);
-       if (p != NULL && l > 0)
-               strlcpy(data, p, min((int)l, COMMAND_LINE_SIZE));
-
-       /*
-        * CONFIG_CMDLINE is meant to be a default in case nothing else
-        * managed to set the command line, unless CONFIG_CMDLINE_FORCE
-        * is set in which case we override whatever was found earlier.
-        */
-#ifdef CONFIG_CMDLINE
-#if defined(CONFIG_CMDLINE_EXTEND)
-       strlcat(data, " ", COMMAND_LINE_SIZE);
-       strlcat(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#elif defined(CONFIG_CMDLINE_FORCE)
-       strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#else
-       /* No arguments from boot loader, use kernel's  cmdl*/
-       if (!((char *)data)[0])
-               strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
-#endif
-#endif /* CONFIG_CMDLINE */
+       /* Put CONFIG_CMDLINE in if forced or if data had nothing in it to start */
+       if (overwrite_incoming_cmdline || !cmdline[0])
+               strlcpy(cmdline, config_cmdline, COMMAND_LINE_SIZE);
+
+       /* Retrieve command line unless forcing */
+       if (read_dt_cmdline)
+               p = of_get_flat_dt_prop(node, "bootargs", &l);
+
+       if (p != NULL && l > 0) {
+               if (concat_cmdline) {
+                       int cmdline_len;
+                       int copy_len;
+                       strlcat(cmdline, " ", COMMAND_LINE_SIZE);
+                       cmdline_len = strlen(cmdline);
+                       copy_len = COMMAND_LINE_SIZE - cmdline_len - 1;
+                       copy_len = min((int)l, copy_len);
+                       strncpy(cmdline + cmdline_len, p, copy_len);
+                       cmdline[cmdline_len + copy_len] = '\0';
+               } else {
+                       strlcpy(cmdline, p, min((int)l, COMMAND_LINE_SIZE));
+               }
+       }
 
        pr_debug("Command line is: %s\n", (char*)data);
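With the constant flags above, the three Kconfig choices reduce to: CMDLINE_FORCE always uses CONFIG_CMDLINE and never reads the DT bootargs; CMDLINE_EXTEND seeds the buffer with CONFIG_CMDLINE when it starts out empty and then appends the DT bootargs; the default (CMDLINE_FROM_BOOTLOADER) lets the DT bootargs replace whatever is there and only falls back to CONFIG_CMDLINE when the DT provides nothing. A userspace sketch of the same flow, with strlcpy()/strlcat() replaced by snprintf() and purely illustrative command-line strings:

#include <stdio.h>
#include <string.h>

#define COMMAND_LINE_SIZE 256

/* Mirrors early_init_dt_scan_chosen() for the three Kconfig choices.
 * dt_bootargs == NULL models a device tree without a "bootargs" property.
 */
static void build_cmdline(char *cmdline, const char *config_cmdline,
			  const char *dt_bootargs,
			  int overwrite, int read_dt, int concat)
{
	if (overwrite || !cmdline[0])
		snprintf(cmdline, COMMAND_LINE_SIZE, "%s", config_cmdline);

	if (!read_dt || !dt_bootargs || !dt_bootargs[0])
		return;

	if (concat) {
		size_t len = strlen(cmdline);

		snprintf(cmdline + len, COMMAND_LINE_SIZE - len, " %s",
			 dt_bootargs);
	} else {
		snprintf(cmdline, COMMAND_LINE_SIZE, "%s", dt_bootargs);
	}
}

int main(void)
{
	char buf[COMMAND_LINE_SIZE];
	/* hypothetical values, just to show the three behaviours */
	const char *cfg = "loglevel=7";
	const char *dt  = "console=ttyS0 root=/dev/mmcblk0p2";

	buf[0] = '\0';
	build_cmdline(buf, cfg, dt, 1, 0, 0);	/* CMDLINE_FORCE */
	printf("force : %s\n", buf);

	buf[0] = '\0';
	build_cmdline(buf, cfg, dt, 0, 1, 1);	/* CMDLINE_EXTEND */
	printf("extend: %s\n", buf);

	buf[0] = '\0';
	build_cmdline(buf, cfg, dt, 0, 1, 0);	/* CMDLINE_FROM_BOOTLOADER */
	printf("dflt  : %s\n", buf);
	return 0;
}
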
 
index 5204f115970fe721f9bc2c092c663be173a7283f..da589c3decb28348d47b1dc15cbd710ae80f0cad 100644 (file)
@@ -121,7 +121,10 @@ static ssize_t power_supply_show_property(struct device *dev,
        else if (off >= POWER_SUPPLY_PROP_MODEL_NAME)
                return sprintf(buf, "%s\n", value.strval);
 
-       return sprintf(buf, "%d\n", value.intval);
+       if (off == POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT)
+               return sprintf(buf, "%lld\n", value.int64val);
+       else
+               return sprintf(buf, "%d\n", value.intval);
 }
 
 static ssize_t power_supply_store_property(struct device *dev,
@@ -245,6 +248,12 @@ static struct device_attribute power_supply_attrs[] = {
        POWER_SUPPLY_ATTR(precharge_current),
        POWER_SUPPLY_ATTR(charge_term_current),
        POWER_SUPPLY_ATTR(calibrate),
+       /* Local extensions */
+       POWER_SUPPLY_ATTR(usb_hc),
+       POWER_SUPPLY_ATTR(usb_otg),
+       POWER_SUPPLY_ATTR(charge_enabled),
+       /* Local extensions of type int64_t */
+       POWER_SUPPLY_ATTR(charge_counter_ext),
        /* Properties of type `const char *' */
        POWER_SUPPLY_ATTR(model_name),
        POWER_SUPPLY_ATTR(manufacturer),
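
POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT is the one property routed through the int64val branch above, so its sysfs file carries a 64-bit value. Assuming the attribute is exposed under the usual /sys/class/power_supply/<supply>/ directory with the property name as the file name, and using a made-up supply name of "battery", a userspace read could look like this sketch:

#include <inttypes.h>
#include <stdio.h>

int main(void)
{
	/* hypothetical path; the supply name depends on the driver */
	const char *path =
		"/sys/class/power_supply/battery/charge_counter_ext";
	FILE *f = fopen(path, "r");
	int64_t val;

	if (!f) {
		perror(path);
		return 1;
	}
	if (fscanf(f, "%" SCNd64, &val) == 1)
		printf("charge_counter_ext = %" PRId64 "\n", val);
	fclose(f);
	return 0;
}
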
index 4bcfb88674d38b5d3343149708ed188e7d2c707e..34aea38ebfa625d58da61b3811ced7e19e1b8535 100644 (file)
@@ -45,6 +45,42 @@ struct palmas_rtc {
 /* Total number of RTC registers needed to set time*/
 #define PALMAS_NUM_TIME_REGS   (PALMAS_YEARS_REG - PALMAS_SECONDS_REG + 1)
 
+/*
+ * Special bin2bcd mapping to deal with bcd storage of year.
+ *
+ *   0-69                -> 0xD0
+ *  70-99  (1970 - 1999) -> 0xD0 - 0xF9 (correctly rolls to 0x00)
+ * 100-199 (2000 - 2099) -> 0x00 - 0x99 (does not roll to 0xA0 :-( )
+ * 200-229 (2100 - 2129) -> 0xA0 - 0xC9 (really for completeness)
+ * 230-                  -> 0xC9
+ *
+ * Confirmed: the only transition that does not work correctly for this rtc
+ * clock is the transition from 2099 to 2100, it proceeds to 2000. We will
+ * accept this issue since the clock retains and transitions the year correctly
+ * in all other conditions.
+ */
+static unsigned char year_bin2bcd(int val)
+{
+       if (val < 70)
+               return 0xD0;
+       if (val < 100)
+               return bin2bcd(val - 20) | 0x80; /* KISS leverage of bin2bcd */
+       if (val >= 230)
+               return 0xC9;
+       if (val >= 200)
+               return bin2bcd(val - 180) | 0x80;
+       return bin2bcd(val - 100);
+}
+
+static int year_bcd2bin(unsigned char val)
+{
+       if (val >= 0xD0)
+               return bcd2bin(val & 0x7F) + 20;
+       if (val >= 0xA0)
+               return bcd2bin(val & 0x7F) + 180;
+       return bcd2bin(val) + 100;
+}
+
 static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
 {
        unsigned char rtc_data[PALMAS_NUM_TIME_REGS];
@@ -71,7 +107,7 @@ static int palmas_rtc_read_time(struct device *dev, struct rtc_time *tm)
        tm->tm_hour = bcd2bin(rtc_data[2]);
        tm->tm_mday = bcd2bin(rtc_data[3]);
        tm->tm_mon = bcd2bin(rtc_data[4]) - 1;
-       tm->tm_year = bcd2bin(rtc_data[5]) + 100;
+       tm->tm_year = year_bcd2bin(rtc_data[5]);
 
        return ret;
 }
@@ -87,7 +123,7 @@ static int palmas_rtc_set_time(struct device *dev, struct rtc_time *tm)
        rtc_data[2] = bin2bcd(tm->tm_hour);
        rtc_data[3] = bin2bcd(tm->tm_mday);
        rtc_data[4] = bin2bcd(tm->tm_mon + 1);
-       rtc_data[5] = bin2bcd(tm->tm_year - 100);
+       rtc_data[5] = year_bin2bcd(tm->tm_year);
 
        /* Stop RTC while updating the RTC time registers */
        ret = palmas_update_bits(palmas, PALMAS_RTC_BASE, PALMAS_RTC_CTRL_REG,
@@ -142,7 +178,7 @@ static int palmas_rtc_read_alarm(struct device *dev, struct rtc_wkalrm *alm)
        alm->time.tm_hour = bcd2bin(alarm_data[2]);
        alm->time.tm_mday = bcd2bin(alarm_data[3]);
        alm->time.tm_mon = bcd2bin(alarm_data[4]) - 1;
-       alm->time.tm_year = bcd2bin(alarm_data[5]) + 100;
+       alm->time.tm_year = year_bcd2bin(alarm_data[5]);
 
        ret = palmas_read(palmas, PALMAS_RTC_BASE, PALMAS_RTC_INTERRUPTS_REG,
                        &int_val);
@@ -173,7 +209,7 @@ static int palmas_rtc_set_alarm(struct device *dev, struct rtc_wkalrm *alm)
        alarm_data[2] = bin2bcd(alm->time.tm_hour);
        alarm_data[3] = bin2bcd(alm->time.tm_mday);
        alarm_data[4] = bin2bcd(alm->time.tm_mon + 1);
-       alarm_data[5] = bin2bcd(alm->time.tm_year - 100);
+       alarm_data[5] = year_bin2bcd(alm->time.tm_year);
 
        ret = palmas_bulk_write(palmas, PALMAS_RTC_BASE,
                PALMAS_ALARM_SECONDS_REG, alarm_data, PALMAS_NUM_TIME_REGS);
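
The year mapping above is easiest to sanity-check with a few round trips. The sketch below copies the two helpers and adds minimal bin2bcd()/bcd2bin() stand-ins (the kernel provides the real ones), then exercises the boundary years called out in the comment (1970, 1999, 2000, 2099, 2100, 2129):

#include <stdio.h>

/* stand-ins for the kernel's bin2bcd()/bcd2bin() helpers */
static unsigned char bin2bcd(int val)
{
	return (unsigned char)(((val / 10) << 4) | (val % 10));
}

static int bcd2bin(unsigned char val)
{
	return (val >> 4) * 10 + (val & 0x0f);
}

static unsigned char year_bin2bcd(int val)
{
	if (val < 70)
		return 0xD0;
	if (val < 100)
		return bin2bcd(val - 20) | 0x80;
	if (val >= 230)
		return 0xC9;
	if (val >= 200)
		return bin2bcd(val - 180) | 0x80;
	return bin2bcd(val - 100);
}

static int year_bcd2bin(unsigned char val)
{
	if (val >= 0xD0)
		return bcd2bin(val & 0x7F) + 20;
	if (val >= 0xA0)
		return bcd2bin(val & 0x7F) + 180;
	return bcd2bin(val) + 100;
}

int main(void)
{
	/* tm_year values (years since 1900) at the interesting boundaries */
	int years[] = { 70, 99, 100, 199, 200, 229 };
	unsigned i;

	for (i = 0; i < sizeof(years) / sizeof(years[0]); i++) {
		unsigned char bcd = year_bin2bcd(years[i]);

		printf("%d (%d) -> 0x%02X -> %d\n",
		       years[i], 1900 + years[i], bcd, year_bcd2bin(bcd));
	}
	return 0;
}
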
index 71a50b99caff160b9f5603ac4229f87c84cacb09..0a28c7616d539ca67926db2aa91329b1a34711b4 100644 (file)
@@ -16,6 +16,8 @@ config ASHMEM
 
 source "drivers/staging/android/ion/Kconfig"
 
+source "drivers/staging/android/fiq_debugger/Kconfig"
+
 endif # if ANDROID
 
 endmenu
index 7cf1564a49a5eec86acf2c6f09a9196940beef25..9b9b297d7c0ea6d0cee687dbf0894c5a6d98debc 100644 (file)
@@ -1,5 +1,6 @@
 ccflags-y += -I$(src)                  # needed for trace events
 
 obj-y                                  += ion/
+obj-$(CONFIG_FIQ_DEBUGGER)             += fiq_debugger/
 
 obj-$(CONFIG_ASHMEM)                   += ashmem.o
index 372ce9913e6dea373d4cdd06ca51f2419c7a7373..1e7c1826e2212fa78285e6d3a662f0db13832e18 100644 (file)
@@ -401,22 +401,14 @@ static int ashmem_mmap(struct file *file, struct vm_area_struct *vma)
        }
        get_file(asma->file);
 
-       /*
-        * XXX - Reworked to use shmem_zero_setup() instead of
-        * shmem_set_file while we're in staging. -jstultz
-        */
-       if (vma->vm_flags & VM_SHARED) {
-               ret = shmem_zero_setup(vma);
-               if (ret) {
-                       fput(asma->file);
-                       goto out;
-               }
+       if (vma->vm_flags & VM_SHARED)
+               shmem_set_file(vma, asma->file);
+       else {
+               if (vma->vm_file)
+                       fput(vma->vm_file);
+               vma->vm_file = asma->file;
        }
 
-       if (vma->vm_file)
-               fput(vma->vm_file);
-       vma->vm_file = asma->file;
-
 out:
        mutex_unlock(&ashmem_mutex);
        return ret;
@@ -453,9 +445,9 @@ ashmem_shrink_scan(struct shrinker *shrink, struct shrink_control *sc)
                loff_t start = range->pgstart * PAGE_SIZE;
                loff_t end = (range->pgend + 1) * PAGE_SIZE;
 
-               vfs_fallocate(range->asma->file,
-                             FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
-                             start, end - start);
+               range->asma->file->f_op->fallocate(range->asma->file,
+                               FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
+                               start, end - start);
                range->purged = ASHMEM_WAS_PURGED;
                lru_del(range);
 
diff --git a/drivers/staging/android/fiq_debugger/Kconfig b/drivers/staging/android/fiq_debugger/Kconfig
new file mode 100644 (file)
index 0000000..60fc224
--- /dev/null
@@ -0,0 +1,58 @@
+config FIQ_DEBUGGER
+       bool "FIQ Mode Serial Debugger"
+       default n
+       depends on ARM || ARM64
+       help
+         The FIQ serial debugger can accept commands even when the
+         kernel is unresponsive due to being stuck with interrupts
+         disabled.
+
+config FIQ_DEBUGGER_NO_SLEEP
+       bool "Keep serial debugger active"
+       depends on FIQ_DEBUGGER
+       default n
+       help
+         Enables the serial debugger at boot. Passing
+         fiq_debugger.no_sleep on the kernel commandline will
+         override this config option.
+
+config FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+       bool "Don't disable wakeup IRQ when debugger is active"
+       depends on FIQ_DEBUGGER
+       default n
+       help
+         Don't disable the wakeup irq when enabling the uart clock.  This will
+         cause extra interrupts, but it makes the serial debugger usable
+         on some MSM radio builds that ignore the uart clock request in power
+         collapse.
+
+config FIQ_DEBUGGER_CONSOLE
+       bool "Console on FIQ Serial Debugger port"
+       depends on FIQ_DEBUGGER
+       default n
+       help
+         Enables a console so that printk messages are displayed on
+         the debugger serial port as they occur.
+
+config FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+       bool "Put the FIQ debugger into console mode by default"
+       depends on FIQ_DEBUGGER_CONSOLE
+       default n
+       help
+         If enabled, this puts the fiq debugger into console mode by default.
+         Otherwise, the fiq debugger will start out in debug mode.
+
+config FIQ_DEBUGGER_UART_OVERLAY
+       bool "Install uart DT overlay"
+       depends on FIQ_DEBUGGER
+       select OF_OVERLAY
+       default n
+       help
+         If enabled, the fiq debugger calls fiq_debugger_uart_overlay(),
+         which applies the uart_overlay@0 overlay to disable the proper uart.
+
+config FIQ_WATCHDOG
+       bool
+       select FIQ_DEBUGGER
+       select PSTORE_RAM
+       default n
diff --git a/drivers/staging/android/fiq_debugger/Makefile b/drivers/staging/android/fiq_debugger/Makefile
new file mode 100644 (file)
index 0000000..a7ca487
--- /dev/null
@@ -0,0 +1,4 @@
+obj-y                  += fiq_debugger.o
+obj-$(CONFIG_ARM)      += fiq_debugger_arm.o
+obj-$(CONFIG_ARM64)    += fiq_debugger_arm64.o
+obj-$(CONFIG_FIQ_WATCHDOG)     += fiq_watchdog.o
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c
new file mode 100644 (file)
index 0000000..f6a8062
--- /dev/null
@@ -0,0 +1,1246 @@
+/*
+ * drivers/staging/android/fiq_debugger.c
+ *
+ * Serial Debugger Interface accessed through an FIQ interrupt.
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <stdarg.h>
+#include <linux/module.h>
+#include <linux/io.h>
+#include <linux/console.h>
+#include <linux/interrupt.h>
+#include <linux/clk.h>
+#include <linux/platform_device.h>
+#include <linux/kernel_stat.h>
+#include <linux/kmsg_dump.h>
+#include <linux/irq.h>
+#include <linux/delay.h>
+#include <linux/reboot.h>
+#include <linux/sched/signal.h>
+#include <linux/slab.h>
+#include <linux/smp.h>
+#include <linux/timer.h>
+#include <linux/tty.h>
+#include <linux/tty_flip.h>
+
+#ifdef CONFIG_FIQ_GLUE
+#include <asm/fiq_glue.h>
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_UART_OVERLAY
+#include <linux/of.h>
+#endif
+
+#include <linux/uaccess.h>
+
+#include "fiq_debugger.h"
+#include "fiq_debugger_priv.h"
+#include "fiq_debugger_ringbuf.h"
+
+#define DEBUG_MAX 64
+#define MAX_UNHANDLED_FIQ_COUNT 1000000
+
+#define MAX_FIQ_DEBUGGER_PORTS 4
+
+struct fiq_debugger_state {
+#ifdef CONFIG_FIQ_GLUE
+       struct fiq_glue_handler handler;
+#endif
+       struct fiq_debugger_output output;
+
+       int fiq;
+       int uart_irq;
+       int signal_irq;
+       int wakeup_irq;
+       bool wakeup_irq_no_set_wake;
+       struct clk *clk;
+       struct fiq_debugger_pdata *pdata;
+       struct platform_device *pdev;
+
+       char debug_cmd[DEBUG_MAX];
+       int debug_busy;
+       int debug_abort;
+
+       char debug_buf[DEBUG_MAX];
+       int debug_count;
+
+       bool no_sleep;
+       bool debug_enable;
+       bool ignore_next_wakeup_irq;
+       struct timer_list sleep_timer;
+       spinlock_t sleep_timer_lock;
+       bool uart_enabled;
+       struct wakeup_source debugger_wake_src;
+       bool console_enable;
+       int current_cpu;
+       atomic_t unhandled_fiq_count;
+       bool in_fiq;
+
+       struct work_struct work;
+       spinlock_t work_lock;
+       char work_cmd[DEBUG_MAX];
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+       spinlock_t console_lock;
+       struct console console;
+       struct tty_port tty_port;
+       struct fiq_debugger_ringbuf *tty_rbuf;
+       bool syslog_dumping;
+#endif
+
+       unsigned int last_irqs[NR_IRQS];
+       unsigned int last_local_timer_irqs[NR_CPUS];
+};
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+struct tty_driver *fiq_tty_driver;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_NO_SLEEP
+static bool initial_no_sleep = true;
+#else
+static bool initial_no_sleep;
+#endif
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE_DEFAULT_ENABLE
+static bool initial_debug_enable = true;
+static bool initial_console_enable = true;
+#else
+static bool initial_debug_enable;
+static bool initial_console_enable;
+#endif
+
+static bool fiq_kgdb_enable;
+static bool fiq_debugger_disable;
+
+module_param_named(no_sleep, initial_no_sleep, bool, 0644);
+module_param_named(debug_enable, initial_debug_enable, bool, 0644);
+module_param_named(console_enable, initial_console_enable, bool, 0644);
+module_param_named(kgdb_enable, fiq_kgdb_enable, bool, 0644);
+module_param_named(disable, fiq_debugger_disable, bool, 0644);
+
+#ifdef CONFIG_FIQ_DEBUGGER_WAKEUP_IRQ_ALWAYS_ON
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state) {}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state) {}
+#else
+static inline
+void fiq_debugger_enable_wakeup_irq(struct fiq_debugger_state *state)
+{
+       if (state->wakeup_irq < 0)
+               return;
+       enable_irq(state->wakeup_irq);
+       if (!state->wakeup_irq_no_set_wake)
+               enable_irq_wake(state->wakeup_irq);
+}
+static inline
+void fiq_debugger_disable_wakeup_irq(struct fiq_debugger_state *state)
+{
+       if (state->wakeup_irq < 0)
+               return;
+       disable_irq_nosync(state->wakeup_irq);
+       if (!state->wakeup_irq_no_set_wake)
+               disable_irq_wake(state->wakeup_irq);
+}
+#endif
+
+static inline bool fiq_debugger_have_fiq(struct fiq_debugger_state *state)
+{
+       return (state->fiq >= 0);
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_force_irq(struct fiq_debugger_state *state)
+{
+       unsigned int irq = state->signal_irq;
+
+       if (WARN_ON(!fiq_debugger_have_fiq(state)))
+               return;
+       if (state->pdata->force_irq) {
+               state->pdata->force_irq(state->pdev, irq);
+       } else {
+               struct irq_chip *chip = irq_get_chip(irq);
+               if (chip && chip->irq_retrigger)
+                       chip->irq_retrigger(irq_get_irq_data(irq));
+       }
+}
+#endif
+
+static void fiq_debugger_uart_enable(struct fiq_debugger_state *state)
+{
+       if (state->clk)
+               clk_enable(state->clk);
+       if (state->pdata->uart_enable)
+               state->pdata->uart_enable(state->pdev);
+}
+
+static void fiq_debugger_uart_disable(struct fiq_debugger_state *state)
+{
+       if (state->pdata->uart_disable)
+               state->pdata->uart_disable(state->pdev);
+       if (state->clk)
+               clk_disable(state->clk);
+}
+
+static void fiq_debugger_uart_flush(struct fiq_debugger_state *state)
+{
+       if (state->pdata->uart_flush)
+               state->pdata->uart_flush(state->pdev);
+}
+
+static void fiq_debugger_putc(struct fiq_debugger_state *state, char c)
+{
+       state->pdata->uart_putc(state->pdev, c);
+}
+
+static void fiq_debugger_puts(struct fiq_debugger_state *state, char *s)
+{
+       unsigned c;
+       while ((c = *s++)) {
+               if (c == '\n')
+                       fiq_debugger_putc(state, '\r');
+               fiq_debugger_putc(state, c);
+       }
+}
+
+static void fiq_debugger_prompt(struct fiq_debugger_state *state)
+{
+       fiq_debugger_puts(state, "debug> ");
+}
+
+static void fiq_debugger_dump_kernel_log(struct fiq_debugger_state *state)
+{
+       char buf[512];
+       size_t len;
+       struct kmsg_dumper dumper = { .active = true };
+
+
+       kmsg_dump_rewind_nolock(&dumper);
+       while (kmsg_dump_get_line_nolock(&dumper, true, buf,
+                                        sizeof(buf) - 1, &len)) {
+               buf[len] = 0;
+               fiq_debugger_puts(state, buf);
+       }
+}
+
+static void fiq_debugger_printf(struct fiq_debugger_output *output,
+                              const char *fmt, ...)
+{
+       struct fiq_debugger_state *state;
+       char buf[256];
+       va_list ap;
+
+       state = container_of(output, struct fiq_debugger_state, output);
+       va_start(ap, fmt);
+       vsnprintf(buf, sizeof(buf), fmt, ap);
+       va_end(ap);
+
+       fiq_debugger_puts(state, buf);
+}
+
+/* Safe outside fiq context */
+static int fiq_debugger_printf_nfiq(void *cookie, const char *fmt, ...)
+{
+       struct fiq_debugger_state *state = cookie;
+       char buf[256];
+       va_list ap;
+       unsigned long irq_flags;
+
+       va_start(ap, fmt);
+       vsnprintf(buf, 128, fmt, ap);
+       va_end(ap);
+
+       local_irq_save(irq_flags);
+       fiq_debugger_puts(state, buf);
+       fiq_debugger_uart_flush(state);
+       local_irq_restore(irq_flags);
+       return state->debug_abort;
+}
+
+static void fiq_debugger_dump_irqs(struct fiq_debugger_state *state)
+{
+       int n;
+       struct irq_desc *desc;
+
+       fiq_debugger_printf(&state->output,
+                       "irqnr       total  since-last   status  name\n");
+       for_each_irq_desc(n, desc) {
+               struct irqaction *act = desc->action;
+               if (!act && !kstat_irqs(n))
+                       continue;
+               fiq_debugger_printf(&state->output, "%5d: %10u %11u %8x  %s\n", n,
+                       kstat_irqs(n),
+                       kstat_irqs(n) - state->last_irqs[n],
+                       desc->status_use_accessors,
+                       (act && act->name) ? act->name : "???");
+               state->last_irqs[n] = kstat_irqs(n);
+       }
+}
+
+static void fiq_debugger_do_ps(struct fiq_debugger_state *state)
+{
+       struct task_struct *g;
+       struct task_struct *p;
+       unsigned task_state;
+       static const char stat_nam[] = "RSDTtZX";
+
+       fiq_debugger_printf(&state->output, "pid   ppid  prio task            pc\n");
+       read_lock(&tasklist_lock);
+       do_each_thread(g, p) {
+               task_state = p->state ? __ffs(p->state) + 1 : 0;
+               fiq_debugger_printf(&state->output,
+                            "%5d %5d %4d ", p->pid, p->parent->pid, p->prio);
+               fiq_debugger_printf(&state->output, "%-13.13s %c", p->comm,
+                            task_state >= sizeof(stat_nam) ? '?' : stat_nam[task_state]);
+               if (task_state == TASK_RUNNING)
+                       fiq_debugger_printf(&state->output, " running\n");
+               else
+                       fiq_debugger_printf(&state->output, " %08lx\n",
+                                       thread_saved_pc(p));
+       } while_each_thread(g, p);
+       read_unlock(&tasklist_lock);
+}
+
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+       state->syslog_dumping = true;
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+       state->syslog_dumping = false;
+}
+#else
+extern int do_syslog(int type, char __user *bug, int count);
+static void fiq_debugger_begin_syslog_dump(struct fiq_debugger_state *state)
+{
+       do_syslog(5 /* clear */, NULL, 0);
+}
+
+static void fiq_debugger_end_syslog_dump(struct fiq_debugger_state *state)
+{
+       fiq_debugger_dump_kernel_log(state);
+}
+#endif
+
+static void fiq_debugger_do_sysrq(struct fiq_debugger_state *state, char rq)
+{
+       if ((rq == 'g' || rq == 'G') && !fiq_kgdb_enable) {
+               fiq_debugger_printf(&state->output, "sysrq-g blocked\n");
+               return;
+       }
+       fiq_debugger_begin_syslog_dump(state);
+       handle_sysrq(rq);
+       fiq_debugger_end_syslog_dump(state);
+}
+
+#ifdef CONFIG_KGDB
+static void fiq_debugger_do_kgdb(struct fiq_debugger_state *state)
+{
+       if (!fiq_kgdb_enable) {
+               fiq_debugger_printf(&state->output, "kgdb through fiq debugger not enabled\n");
+               return;
+       }
+
+       fiq_debugger_printf(&state->output, "enabling console and triggering kgdb\n");
+       state->console_enable = true;
+       handle_sysrq('g');
+}
+#endif
+
+static void fiq_debugger_schedule_work(struct fiq_debugger_state *state,
+               char *cmd)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&state->work_lock, flags);
+       if (state->work_cmd[0] != '\0') {
+               fiq_debugger_printf(&state->output, "work command processor busy\n");
+               spin_unlock_irqrestore(&state->work_lock, flags);
+               return;
+       }
+
+       strlcpy(state->work_cmd, cmd, sizeof(state->work_cmd));
+       spin_unlock_irqrestore(&state->work_lock, flags);
+
+       schedule_work(&state->work);
+}
+
+static void fiq_debugger_work(struct work_struct *work)
+{
+       struct fiq_debugger_state *state;
+       char work_cmd[DEBUG_MAX];
+       char *cmd;
+       unsigned long flags;
+
+       state = container_of(work, struct fiq_debugger_state, work);
+
+       spin_lock_irqsave(&state->work_lock, flags);
+
+       strlcpy(work_cmd, state->work_cmd, sizeof(work_cmd));
+       state->work_cmd[0] = '\0';
+
+       spin_unlock_irqrestore(&state->work_lock, flags);
+
+       cmd = work_cmd;
+       if (!strncmp(cmd, "reboot", 6)) {
+               cmd += 6;
+               while (*cmd == ' ')
+                       cmd++;
+               if (*cmd != '\0')
+                       kernel_restart(cmd);
+               else
+                       kernel_restart(NULL);
+       } else {
+               fiq_debugger_printf(&state->output, "unknown work command '%s'\n",
+                               work_cmd);
+       }
+}
+
+/* This function CANNOT be called in FIQ context */
+static void fiq_debugger_irq_exec(struct fiq_debugger_state *state, char *cmd)
+{
+       if (!strcmp(cmd, "ps"))
+               fiq_debugger_do_ps(state);
+       if (!strcmp(cmd, "sysrq"))
+               fiq_debugger_do_sysrq(state, 'h');
+       if (!strncmp(cmd, "sysrq ", 6))
+               fiq_debugger_do_sysrq(state, cmd[6]);
+#ifdef CONFIG_KGDB
+       if (!strcmp(cmd, "kgdb"))
+               fiq_debugger_do_kgdb(state);
+#endif
+       if (!strncmp(cmd, "reboot", 6))
+               fiq_debugger_schedule_work(state, cmd);
+}
+
+static void fiq_debugger_help(struct fiq_debugger_state *state)
+{
+       fiq_debugger_printf(&state->output,
+                               "FIQ Debugger commands:\n"
+                               " pc            PC status\n"
+                               " regs          Register dump\n"
+                               " allregs       Extended Register dump\n"
+                               " bt            Stack trace\n"
+                               " reboot [<c>]  Reboot with command <c>\n"
+                               " reset [<c>]   Hard reset with command <c>\n"
+                               " irqs          Interrupt status\n"
+                               " kmsg          Kernel log\n"
+                               " version       Kernel version\n");
+       fiq_debugger_printf(&state->output,
+                               " sleep         Allow sleep while in FIQ\n"
+                               " nosleep       Disable sleep while in FIQ\n"
+                               " console       Switch terminal to console\n"
+                               " cpu           Current CPU\n"
+                               " cpu <number>  Switch to CPU<number>\n");
+       fiq_debugger_printf(&state->output,
+                               " ps            Process list\n"
+                               " sysrq         sysrq options\n"
+                               " sysrq <param> Execute sysrq with <param>\n");
+#ifdef CONFIG_KGDB
+       fiq_debugger_printf(&state->output,
+                               " kgdb          Enter kernel debugger\n");
+#endif
+}
+
+static void fiq_debugger_take_affinity(void *info)
+{
+       struct fiq_debugger_state *state = info;
+       struct cpumask cpumask;
+
+       cpumask_clear(&cpumask);
+       cpumask_set_cpu(get_cpu(), &cpumask);
+
+       irq_set_affinity(state->uart_irq, &cpumask);
+}
+
+static void fiq_debugger_switch_cpu(struct fiq_debugger_state *state, int cpu)
+{
+       if (!fiq_debugger_have_fiq(state))
+               smp_call_function_single(cpu, fiq_debugger_take_affinity, state,
+                               false);
+       state->current_cpu = cpu;
+}
+
+static bool fiq_debugger_fiq_exec(struct fiq_debugger_state *state,
+                       const char *cmd, const struct pt_regs *regs,
+                       void *svc_sp)
+{
+       bool signal_helper = false;
+
+       if (!strcmp(cmd, "help") || !strcmp(cmd, "?")) {
+               fiq_debugger_help(state);
+       } else if (!strcmp(cmd, "pc")) {
+               fiq_debugger_dump_pc(&state->output, regs);
+       } else if (!strcmp(cmd, "regs")) {
+               fiq_debugger_dump_regs(&state->output, regs);
+       } else if (!strcmp(cmd, "allregs")) {
+               fiq_debugger_dump_allregs(&state->output, regs);
+       } else if (!strcmp(cmd, "bt")) {
+               fiq_debugger_dump_stacktrace(&state->output, regs, 100, svc_sp);
+       } else if (!strncmp(cmd, "reset", 5)) {
+               cmd += 5;
+               while (*cmd == ' ')
+                       cmd++;
+               if (*cmd) {
+                       char tmp_cmd[32];
+                       strlcpy(tmp_cmd, cmd, sizeof(tmp_cmd));
+                       machine_restart(tmp_cmd);
+               } else {
+                       machine_restart(NULL);
+               }
+       } else if (!strcmp(cmd, "irqs")) {
+               fiq_debugger_dump_irqs(state);
+       } else if (!strcmp(cmd, "kmsg")) {
+               fiq_debugger_dump_kernel_log(state);
+       } else if (!strcmp(cmd, "version")) {
+               fiq_debugger_printf(&state->output, "%s\n", linux_banner);
+       } else if (!strcmp(cmd, "sleep")) {
+               state->no_sleep = false;
+               fiq_debugger_printf(&state->output, "enabling sleep\n");
+       } else if (!strcmp(cmd, "nosleep")) {
+               state->no_sleep = true;
+               fiq_debugger_printf(&state->output, "disabling sleep\n");
+       } else if (!strcmp(cmd, "console")) {
+               fiq_debugger_printf(&state->output, "console mode\n");
+               fiq_debugger_uart_flush(state);
+               state->console_enable = true;
+       } else if (!strcmp(cmd, "cpu")) {
+               fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+       } else if (!strncmp(cmd, "cpu ", 4)) {
+               unsigned long cpu = 0;
+               if (kstrtoul(cmd + 4, 10, &cpu) == 0)
+                       fiq_debugger_switch_cpu(state, cpu);
+               else
+                       fiq_debugger_printf(&state->output, "invalid cpu\n");
+               fiq_debugger_printf(&state->output, "cpu %d\n", state->current_cpu);
+       } else {
+               if (state->debug_busy) {
+                       fiq_debugger_printf(&state->output,
+                               "command processor busy. trying to abort.\n");
+                       state->debug_abort = -1;
+               } else {
+                       strcpy(state->debug_cmd, cmd);
+                       state->debug_busy = 1;
+               }
+
+               return true;
+       }
+       if (!state->console_enable)
+               fiq_debugger_prompt(state);
+
+       return signal_helper;
+}
+
+static void fiq_debugger_sleep_timer_expired(unsigned long data)
+{
+       struct fiq_debugger_state *state = (struct fiq_debugger_state *)data;
+       unsigned long flags;
+
+       spin_lock_irqsave(&state->sleep_timer_lock, flags);
+       if (state->uart_enabled && !state->no_sleep) {
+               if (state->debug_enable && !state->console_enable) {
+                       state->debug_enable = false;
+                       fiq_debugger_printf_nfiq(state,
+                                       "suspending fiq debugger\n");
+               }
+               state->ignore_next_wakeup_irq = true;
+               fiq_debugger_uart_disable(state);
+               state->uart_enabled = false;
+               fiq_debugger_enable_wakeup_irq(state);
+       }
+       __pm_relax(&state->debugger_wake_src);
+       spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static void fiq_debugger_handle_wakeup(struct fiq_debugger_state *state)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&state->sleep_timer_lock, flags);
+       if (state->wakeup_irq >= 0 && state->ignore_next_wakeup_irq) {
+               state->ignore_next_wakeup_irq = false;
+       } else if (!state->uart_enabled) {
+               __pm_stay_awake(&state->debugger_wake_src);
+               fiq_debugger_uart_enable(state);
+               state->uart_enabled = true;
+               fiq_debugger_disable_wakeup_irq(state);
+               mod_timer(&state->sleep_timer, jiffies + HZ / 2);
+       }
+       spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+}
+
+static irqreturn_t fiq_debugger_wakeup_irq_handler(int irq, void *dev)
+{
+       struct fiq_debugger_state *state = dev;
+
+       if (!state->no_sleep)
+               fiq_debugger_puts(state, "WAKEUP\n");
+       fiq_debugger_handle_wakeup(state);
+
+       return IRQ_HANDLED;
+}
+
+static
+void fiq_debugger_handle_console_irq_context(struct fiq_debugger_state *state)
+{
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+       if (state->tty_port.ops) {
+               int i;
+               int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+               for (i = 0; i < count; i++) {
+                       int c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+                       tty_insert_flip_char(&state->tty_port, c, TTY_NORMAL);
+                       if (!fiq_debugger_ringbuf_consume(state->tty_rbuf, 1))
+                               pr_warn("fiq tty failed to consume byte\n");
+               }
+               tty_flip_buffer_push(&state->tty_port);
+       }
+#endif
+}
+
+static void fiq_debugger_handle_irq_context(struct fiq_debugger_state *state)
+{
+       if (!state->no_sleep) {
+               unsigned long flags;
+
+               spin_lock_irqsave(&state->sleep_timer_lock, flags);
+               __pm_stay_awake(&state->debugger_wake_src);
+               mod_timer(&state->sleep_timer, jiffies + HZ * 5);
+               spin_unlock_irqrestore(&state->sleep_timer_lock, flags);
+       }
+       fiq_debugger_handle_console_irq_context(state);
+       if (state->debug_busy) {
+               fiq_debugger_irq_exec(state, state->debug_cmd);
+               if (!state->console_enable)
+                       fiq_debugger_prompt(state);
+               state->debug_busy = 0;
+       }
+}
+
+static int fiq_debugger_getc(struct fiq_debugger_state *state)
+{
+       return state->pdata->uart_getc(state->pdev);
+}
+
+static bool fiq_debugger_handle_uart_interrupt(struct fiq_debugger_state *state,
+                       int this_cpu, const struct pt_regs *regs, void *svc_sp)
+{
+       int c;
+       static int last_c;
+       int count = 0;
+       bool signal_helper = false;
+
+       if (this_cpu != state->current_cpu) {
+               if (state->in_fiq)
+                       return false;
+
+               if (atomic_inc_return(&state->unhandled_fiq_count) !=
+                                       MAX_UNHANDLED_FIQ_COUNT)
+                       return false;
+
+               fiq_debugger_printf(&state->output,
+                       "fiq_debugger: cpu %d not responding, "
+                       "reverting to cpu %d\n", state->current_cpu,
+                       this_cpu);
+
+               atomic_set(&state->unhandled_fiq_count, 0);
+               fiq_debugger_switch_cpu(state, this_cpu);
+               return false;
+       }
+
+       state->in_fiq = true;
+
+       while ((c = fiq_debugger_getc(state)) != FIQ_DEBUGGER_NO_CHAR) {
+               count++;
+               if (!state->debug_enable) {
+                       if ((c == 13) || (c == 10)) {
+                               state->debug_enable = true;
+                               state->debug_count = 0;
+                               fiq_debugger_prompt(state);
+                       }
+               } else if (c == FIQ_DEBUGGER_BREAK) {
+                       state->console_enable = false;
+                       fiq_debugger_puts(state, "fiq debugger mode\n");
+                       state->debug_count = 0;
+                       fiq_debugger_prompt(state);
+#ifdef CONFIG_FIQ_DEBUGGER_CONSOLE
+               } else if (state->console_enable && state->tty_rbuf) {
+                       fiq_debugger_ringbuf_push(state->tty_rbuf, c);
+                       signal_helper = true;
+#endif
+               } else if ((c >= ' ') && (c < 127)) {
+                       if (state->debug_count < (DEBUG_MAX - 1)) {
+                               state->debug_buf[state->debug_count++] = c;
+                               fiq_debugger_putc(state, c);
+                       }
+               } else if ((c == 8) || (c == 127)) {
+                       if (state->debug_count > 0) {
+                               state->debug_count--;
+                               fiq_debugger_putc(state, 8);
+                               fiq_debugger_putc(state, ' ');
+                               fiq_debugger_putc(state, 8);
+                       }
+               } else if ((c == 13) || (c == 10)) {
+                       if (c == '\r' || (c == '\n' && last_c != '\r')) {
+                               fiq_debugger_putc(state, '\r');
+                               fiq_debugger_putc(state, '\n');
+                       }
+                       if (state->debug_count) {
+                               state->debug_buf[state->debug_count] = 0;
+                               state->debug_count = 0;
+                               signal_helper |=
+                                       fiq_debugger_fiq_exec(state,
+                                                       state->debug_buf,
+                                                       regs, svc_sp);
+                       } else {
+                               fiq_debugger_prompt(state);
+                       }
+               }
+               last_c = c;
+       }
+       if (!state->console_enable)
+               fiq_debugger_uart_flush(state);
+       if (state->pdata->fiq_ack)
+               state->pdata->fiq_ack(state->pdev, state->fiq);
+
+       /* poke sleep timer if necessary */
+       if (state->debug_enable && !state->no_sleep)
+               signal_helper = true;
+
+       atomic_set(&state->unhandled_fiq_count, 0);
+       state->in_fiq = false;
+
+       return signal_helper;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_fiq(struct fiq_glue_handler *h,
+               const struct pt_regs *regs, void *svc_sp)
+{
+       struct fiq_debugger_state *state =
+               container_of(h, struct fiq_debugger_state, handler);
+       unsigned int this_cpu = THREAD_INFO(svc_sp)->cpu;
+       bool need_irq;
+
+       need_irq = fiq_debugger_handle_uart_interrupt(state, this_cpu, regs,
+                       svc_sp);
+       if (need_irq)
+               fiq_debugger_force_irq(state);
+}
+#endif
+
+/*
+ * When not using FIQs, we only use this single interrupt as an entry point.
+ * This just effectively takes over the UART interrupt and does all the work
+ * in this context.
+ */
+static irqreturn_t fiq_debugger_uart_irq(int irq, void *dev)
+{
+       struct fiq_debugger_state *state = dev;
+       bool not_done;
+
+       fiq_debugger_handle_wakeup(state);
+
+       /* handle the debugger irq in regular context */
+       not_done = fiq_debugger_handle_uart_interrupt(state, smp_processor_id(),
+                                             get_irq_regs(),
+                                             current_thread_info());
+       if (not_done)
+               fiq_debugger_handle_irq_context(state);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * If FIQs are used, not everything can happen in fiq context.
+ * FIQ handler does what it can and then signals this interrupt to finish the
+ * job in irq context.
+ */
+static irqreturn_t fiq_debugger_signal_irq(int irq, void *dev)
+{
+       struct fiq_debugger_state *state = dev;
+
+       if (state->pdata->force_irq_ack)
+               state->pdata->force_irq_ack(state->pdev, state->signal_irq);
+
+       fiq_debugger_handle_irq_context(state);
+
+       return IRQ_HANDLED;
+}
+
+#ifdef CONFIG_FIQ_GLUE
+static void fiq_debugger_resume(struct fiq_glue_handler *h)
+{
+       struct fiq_debugger_state *state =
+               container_of(h, struct fiq_debugger_state, handler);
+       if (state->pdata->uart_resume)
+               state->pdata->uart_resume(state->pdev);
+}
+#endif
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+struct tty_driver *fiq_debugger_console_device(struct console *co, int *index)
+{
+       *index = co->index;
+       return fiq_tty_driver;
+}
+
+static void fiq_debugger_console_write(struct console *co,
+                               const char *s, unsigned int count)
+{
+       struct fiq_debugger_state *state;
+       unsigned long flags;
+
+       state = container_of(co, struct fiq_debugger_state, console);
+
+       if (!state->console_enable && !state->syslog_dumping)
+               return;
+
+       fiq_debugger_uart_enable(state);
+       spin_lock_irqsave(&state->console_lock, flags);
+       while (count--) {
+               if (*s == '\n')
+                       fiq_debugger_putc(state, '\r');
+               fiq_debugger_putc(state, *s++);
+       }
+       fiq_debugger_uart_flush(state);
+       spin_unlock_irqrestore(&state->console_lock, flags);
+       fiq_debugger_uart_disable(state);
+}
+
+static struct console fiq_debugger_console = {
+       .name = "ttyFIQ",
+       .device = fiq_debugger_console_device,
+       .write = fiq_debugger_console_write,
+       .flags = CON_PRINTBUFFER | CON_ANYTIME | CON_ENABLED,
+};
+
+int fiq_tty_open(struct tty_struct *tty, struct file *filp)
+{
+       int line = tty->index;
+       struct fiq_debugger_state **states = tty->driver->driver_state;
+       struct fiq_debugger_state *state = states[line];
+
+       return tty_port_open(&state->tty_port, tty, filp);
+}
+
+void fiq_tty_close(struct tty_struct *tty, struct file *filp)
+{
+       tty_port_close(tty->port, tty, filp);
+}
+
+int  fiq_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+       int i;
+       int line = tty->index;
+       struct fiq_debugger_state **states = tty->driver->driver_state;
+       struct fiq_debugger_state *state = states[line];
+
+       if (!state->console_enable)
+               return count;
+
+       fiq_debugger_uart_enable(state);
+       spin_lock_irq(&state->console_lock);
+       for (i = 0; i < count; i++)
+               fiq_debugger_putc(state, *buf++);
+       spin_unlock_irq(&state->console_lock);
+       fiq_debugger_uart_disable(state);
+
+       return count;
+}
+
+int  fiq_tty_write_room(struct tty_struct *tty)
+{
+       return 16;
+}
+
+#ifdef CONFIG_CONSOLE_POLL
+static int fiq_tty_poll_init(struct tty_driver *driver, int line, char *options)
+{
+       return 0;
+}
+
+static int fiq_tty_poll_get_char(struct tty_driver *driver, int line)
+{
+       struct fiq_debugger_state **states = driver->driver_state;
+       struct fiq_debugger_state *state = states[line];
+       int c = NO_POLL_CHAR;
+
+       fiq_debugger_uart_enable(state);
+       if (fiq_debugger_have_fiq(state)) {
+               int count = fiq_debugger_ringbuf_level(state->tty_rbuf);
+               if (count > 0) {
+                       c = fiq_debugger_ringbuf_peek(state->tty_rbuf, 0);
+                       fiq_debugger_ringbuf_consume(state->tty_rbuf, 1);
+               }
+       } else {
+               c = fiq_debugger_getc(state);
+               if (c == FIQ_DEBUGGER_NO_CHAR)
+                       c = NO_POLL_CHAR;
+       }
+       fiq_debugger_uart_disable(state);
+
+       return c;
+}
+
+static void fiq_tty_poll_put_char(struct tty_driver *driver, int line, char ch)
+{
+       struct fiq_debugger_state **states = driver->driver_state;
+       struct fiq_debugger_state *state = states[line];
+       fiq_debugger_uart_enable(state);
+       fiq_debugger_putc(state, ch);
+       fiq_debugger_uart_disable(state);
+}
+#endif
+
+static const struct tty_port_operations fiq_tty_port_ops;
+
+static const struct tty_operations fiq_tty_driver_ops = {
+       .write = fiq_tty_write,
+       .write_room = fiq_tty_write_room,
+       .open = fiq_tty_open,
+       .close = fiq_tty_close,
+#ifdef CONFIG_CONSOLE_POLL
+       .poll_init = fiq_tty_poll_init,
+       .poll_get_char = fiq_tty_poll_get_char,
+       .poll_put_char = fiq_tty_poll_put_char,
+#endif
+};
+
+static int fiq_debugger_tty_init(void)
+{
+       int ret;
+       struct fiq_debugger_state **states = NULL;
+
+       states = kzalloc(sizeof(*states) * MAX_FIQ_DEBUGGER_PORTS, GFP_KERNEL);
+       if (!states) {
+               pr_err("Failed to allocate fiq debugger state structures\n");
+               return -ENOMEM;
+       }
+
+       fiq_tty_driver = alloc_tty_driver(MAX_FIQ_DEBUGGER_PORTS);
+       if (!fiq_tty_driver) {
+               pr_err("Failed to allocate fiq debugger tty\n");
+               ret = -ENOMEM;
+               goto err_free_state;
+       }
+
+       fiq_tty_driver->owner           = THIS_MODULE;
+       fiq_tty_driver->driver_name     = "fiq-debugger";
+       fiq_tty_driver->name            = "ttyFIQ";
+       fiq_tty_driver->type            = TTY_DRIVER_TYPE_SERIAL;
+       fiq_tty_driver->subtype         = SERIAL_TYPE_NORMAL;
+       fiq_tty_driver->init_termios    = tty_std_termios;
+       fiq_tty_driver->flags           = TTY_DRIVER_REAL_RAW |
+                                         TTY_DRIVER_DYNAMIC_DEV;
+       fiq_tty_driver->driver_state    = states;
+
+       fiq_tty_driver->init_termios.c_cflag =
+                                       B115200 | CS8 | CREAD | HUPCL | CLOCAL;
+       fiq_tty_driver->init_termios.c_ispeed = 115200;
+       fiq_tty_driver->init_termios.c_ospeed = 115200;
+
+       tty_set_operations(fiq_tty_driver, &fiq_tty_driver_ops);
+
+       ret = tty_register_driver(fiq_tty_driver);
+       if (ret) {
+               pr_err("Failed to register fiq tty: %d\n", ret);
+               goto err_free_tty;
+       }
+
+       pr_info("Registered FIQ tty driver\n");
+       return 0;
+
+err_free_tty:
+       put_tty_driver(fiq_tty_driver);
+       fiq_tty_driver = NULL;
+err_free_state:
+       kfree(states);
+       return ret;
+}
+
+static int fiq_debugger_tty_init_one(struct fiq_debugger_state *state)
+{
+       int ret;
+       struct device *tty_dev;
+       struct fiq_debugger_state **states = fiq_tty_driver->driver_state;
+
+       states[state->pdev->id] = state;
+
+       state->tty_rbuf = fiq_debugger_ringbuf_alloc(1024);
+       if (!state->tty_rbuf) {
+               pr_err("Failed to allocate fiq debugger ringbuf\n");
+               ret = -ENOMEM;
+               goto err;
+       }
+
+       tty_port_init(&state->tty_port);
+       state->tty_port.ops = &fiq_tty_port_ops;
+
+       tty_dev = tty_port_register_device(&state->tty_port, fiq_tty_driver,
+                                          state->pdev->id, &state->pdev->dev);
+       if (IS_ERR(tty_dev)) {
+               pr_err("Failed to register fiq debugger tty device\n");
+               ret = PTR_ERR(tty_dev);
+               goto err;
+       }
+
+       device_set_wakeup_capable(tty_dev, 1);
+
+       pr_info("Registered fiq debugger ttyFIQ%d\n", state->pdev->id);
+
+       return 0;
+
+err:
+       fiq_debugger_ringbuf_free(state->tty_rbuf);
+       state->tty_rbuf = NULL;
+       return ret;
+}
+#endif
+
+static int fiq_debugger_dev_suspend(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+       if (state->pdata->uart_dev_suspend)
+               return state->pdata->uart_dev_suspend(pdev);
+       return 0;
+}
+
+static int fiq_debugger_dev_resume(struct device *dev)
+{
+       struct platform_device *pdev = to_platform_device(dev);
+       struct fiq_debugger_state *state = platform_get_drvdata(pdev);
+
+       if (state->pdata->uart_dev_resume)
+               return state->pdata->uart_dev_resume(pdev);
+       return 0;
+}
+
+static int fiq_debugger_probe(struct platform_device *pdev)
+{
+       int ret;
+       struct fiq_debugger_pdata *pdata = dev_get_platdata(&pdev->dev);
+       struct fiq_debugger_state *state;
+       int fiq;
+       int uart_irq;
+
+       if (pdev->id >= MAX_FIQ_DEBUGGER_PORTS)
+               return -EINVAL;
+
+       if (!pdata->uart_getc || !pdata->uart_putc)
+               return -EINVAL;
+       if ((pdata->uart_enable && !pdata->uart_disable) ||
+           (!pdata->uart_enable && pdata->uart_disable))
+               return -EINVAL;
+
+       fiq = platform_get_irq_byname(pdev, "fiq");
+       uart_irq = platform_get_irq_byname(pdev, "uart_irq");
+
+       /* uart_irq mode and fiq mode are mutually exclusive, but one of them
+        * is required */
+       if ((uart_irq < 0 && fiq < 0) || (uart_irq >= 0 && fiq >= 0))
+               return -EINVAL;
+       if (fiq >= 0 && !pdata->fiq_enable)
+               return -EINVAL;
+
+       state = kzalloc(sizeof(*state), GFP_KERNEL);
+       if (!state)
+               return -ENOMEM;
+
+       state->output.printf = fiq_debugger_printf;
+       setup_timer(&state->sleep_timer, fiq_debugger_sleep_timer_expired,
+                   (unsigned long)state);
+       state->pdata = pdata;
+       state->pdev = pdev;
+       state->no_sleep = initial_no_sleep;
+       state->debug_enable = initial_debug_enable;
+       state->console_enable = initial_console_enable;
+
+       state->fiq = fiq;
+       state->uart_irq = uart_irq;
+       state->signal_irq = platform_get_irq_byname(pdev, "signal");
+       state->wakeup_irq = platform_get_irq_byname(pdev, "wakeup");
+
+       INIT_WORK(&state->work, fiq_debugger_work);
+       spin_lock_init(&state->work_lock);
+
+       platform_set_drvdata(pdev, state);
+
+       spin_lock_init(&state->sleep_timer_lock);
+
+       if (state->wakeup_irq < 0 && fiq_debugger_have_fiq(state))
+               state->no_sleep = true;
+       state->ignore_next_wakeup_irq = !state->no_sleep;
+
+       wakeup_source_init(&state->debugger_wake_src, "serial-debug");
+
+       state->clk = clk_get(&pdev->dev, NULL);
+       if (IS_ERR(state->clk))
+               state->clk = NULL;
+
+       /* do not call pdata->uart_enable here since uart_init may still
+        * need to do some initialization before uart_enable can work.
+        * So, only try to manage the clock during init.
+        */
+       if (state->clk)
+               clk_enable(state->clk);
+
+       if (pdata->uart_init) {
+               ret = pdata->uart_init(pdev);
+               if (ret)
+                       goto err_uart_init;
+       }
+
+       fiq_debugger_printf_nfiq(state,
+                               "<hit enter %sto activate fiq debugger>\n",
+                               state->no_sleep ? "" : "twice ");
+
+#ifdef CONFIG_FIQ_GLUE
+       if (fiq_debugger_have_fiq(state)) {
+               state->handler.fiq = fiq_debugger_fiq;
+               state->handler.resume = fiq_debugger_resume;
+               ret = fiq_glue_register_handler(&state->handler);
+               if (ret) {
+                       pr_err("%s: could not install fiq handler\n", __func__);
+                       goto err_register_irq;
+               }
+
+               pdata->fiq_enable(pdev, state->fiq, 1);
+       } else
+#endif
+       {
+               ret = request_irq(state->uart_irq, fiq_debugger_uart_irq,
+                                 IRQF_NO_SUSPEND, "debug", state);
+               if (ret) {
+                       pr_err("%s: could not install irq handler\n", __func__);
+                       goto err_register_irq;
+               }
+
+               /* for irq-only mode, we want this irq to wake us up, if it
+                * can.
+                */
+               enable_irq_wake(state->uart_irq);
+       }
+
+       if (state->clk)
+               clk_disable(state->clk);
+
+       if (state->signal_irq >= 0) {
+               ret = request_irq(state->signal_irq, fiq_debugger_signal_irq,
+                         IRQF_TRIGGER_RISING, "debug-signal", state);
+               if (ret)
+                       pr_err("serial_debugger: could not install signal_irq\n");
+       }
+
+       if (state->wakeup_irq >= 0) {
+               ret = request_irq(state->wakeup_irq,
+                                 fiq_debugger_wakeup_irq_handler,
+                                 IRQF_TRIGGER_FALLING,
+                                 "debug-wakeup", state);
+               if (ret) {
+                       pr_err("serial_debugger: "
+                               "could not install wakeup irq\n");
+                       state->wakeup_irq = -1;
+               } else {
+                       ret = enable_irq_wake(state->wakeup_irq);
+                       if (ret) {
+                               pr_err("serial_debugger: "
+                                       "could not enable wakeup\n");
+                               state->wakeup_irq_no_set_wake = true;
+                       }
+               }
+       }
+       if (state->no_sleep)
+               fiq_debugger_handle_wakeup(state);
+
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+       spin_lock_init(&state->console_lock);
+       state->console = fiq_debugger_console;
+       state->console.index = pdev->id;
+       if (!console_set_on_cmdline)
+               add_preferred_console(state->console.name,
+                       state->console.index, NULL);
+       register_console(&state->console);
+       fiq_debugger_tty_init_one(state);
+#endif
+       return 0;
+
+err_register_irq:
+       if (pdata->uart_free)
+               pdata->uart_free(pdev);
+err_uart_init:
+       if (state->clk) {
+               clk_disable(state->clk);
+               clk_put(state->clk);
+       }
+       wakeup_source_trash(&state->debugger_wake_src);
+       platform_set_drvdata(pdev, NULL);
+       kfree(state);
+       return ret;
+}
+
+static const struct dev_pm_ops fiq_debugger_dev_pm_ops = {
+       .suspend        = fiq_debugger_dev_suspend,
+       .resume         = fiq_debugger_dev_resume,
+};
+
+static struct platform_driver fiq_debugger_driver = {
+       .probe  = fiq_debugger_probe,
+       .driver = {
+               .name   = "fiq_debugger",
+               .pm     = &fiq_debugger_dev_pm_ops,
+       },
+};
+
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+int fiq_debugger_uart_overlay(void)
+{
+       struct device_node *onp = of_find_node_by_path("/uart_overlay@0");
+       int ret;
+
+       if (!onp) {
+               pr_err("serial_debugger: uart overlay not found\n");
+               return -ENODEV;
+       }
+
+       ret = of_overlay_create(onp);
+       if (ret < 0) {
+               pr_err("serial_debugger: fail to create overlay: %d\n", ret);
+               of_node_put(onp);
+               return ret;
+       }
+
+       pr_info("serial_debugger: uart overlay applied\n");
+       return 0;
+}
+#endif
+
+static int __init fiq_debugger_init(void)
+{
+       if (fiq_debugger_disable) {
+               pr_err("serial_debugger: disabled\n");
+               return -ENODEV;
+       }
+#if defined(CONFIG_FIQ_DEBUGGER_CONSOLE)
+       fiq_debugger_tty_init();
+#endif
+#if defined(CONFIG_FIQ_DEBUGGER_UART_OVERLAY)
+       fiq_debugger_uart_overlay();
+#endif
+       return platform_driver_register(&fiq_debugger_driver);
+}
+
+postcore_initcall(fiq_debugger_init);
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.h b/drivers/staging/android/fiq_debugger/fiq_debugger.h
new file mode 100644 (file)
index 0000000..c9ec4f8
--- /dev/null
@@ -0,0 +1,64 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger.h
+ *
+ * Copyright (C) 2010 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+#define _ARCH_ARM_MACH_TEGRA_FIQ_DEBUGGER_H_
+
+#include <linux/serial_core.h>
+
+#define FIQ_DEBUGGER_NO_CHAR NO_POLL_CHAR
+#define FIQ_DEBUGGER_BREAK 0x00ff0100
+
+#define FIQ_DEBUGGER_FIQ_IRQ_NAME      "fiq"
+#define FIQ_DEBUGGER_SIGNAL_IRQ_NAME   "signal"
+#define FIQ_DEBUGGER_WAKEUP_IRQ_NAME   "wakeup"
+
+/**
+ * struct fiq_debugger_pdata - fiq debugger platform data
+ * @uart_resume:       used to restore uart state right before enabling
+ *                     the fiq.
+ * @uart_enable:       Do the work necessary to communicate with the uart
+ *                     hw (enable clocks, etc.). This must be ref-counted.
+ * @uart_disable:      Do the work necessary to disable the uart hw
+ *                     (disable clocks, etc.). This must be ref-counted.
+ * @uart_dev_suspend:  called during PM suspend, generally not needed
+ *                     for real fiq mode debugger.
+ * @uart_dev_resume:   called during PM resume, generally not needed
+ *                     for real fiq mode debugger.
+ */
+struct fiq_debugger_pdata {
+       int (*uart_init)(struct platform_device *pdev);
+       void (*uart_free)(struct platform_device *pdev);
+       int (*uart_resume)(struct platform_device *pdev);
+       int (*uart_getc)(struct platform_device *pdev);
+       void (*uart_putc)(struct platform_device *pdev, unsigned int c);
+       void (*uart_flush)(struct platform_device *pdev);
+       void (*uart_enable)(struct platform_device *pdev);
+       void (*uart_disable)(struct platform_device *pdev);
+
+       int (*uart_dev_suspend)(struct platform_device *pdev);
+       int (*uart_dev_resume)(struct platform_device *pdev);
+
+       void (*fiq_enable)(struct platform_device *pdev, unsigned int fiq,
+                                                               bool enable);
+       void (*fiq_ack)(struct platform_device *pdev, unsigned int fiq);
+
+       void (*force_irq)(struct platform_device *pdev, unsigned int irq);
+       void (*force_irq_ack)(struct platform_device *pdev, unsigned int irq);
+};
+
+#endif
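
For context, a rough sketch of how a board file is expected to wire up the platform data documented above (not part of this patch; the my_uart_* callbacks, IRQ number 99 and device id 0 are illustrative placeholders):

#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/ioport.h>
#include "fiq_debugger.h"

/* Hypothetical UART hooks; a real board would drive its UART registers here. */
static int my_uart_getc(struct platform_device *pdev)
{
        return FIQ_DEBUGGER_NO_CHAR;    /* no character pending */
}

static void my_uart_putc(struct platform_device *pdev, unsigned int c)
{
        /* write 'c' to the TX register, polling for FIFO space */
}

static struct fiq_debugger_pdata my_fiq_pdata = {
        .uart_getc = my_uart_getc,
        .uart_putc = my_uart_putc,
};

/* fiq_debugger_probe() requires exactly one of the "fiq"/"uart_irq" IRQs. */
static struct resource my_fiq_resources[] = {
        DEFINE_RES_IRQ_NAMED(99, "uart_irq"),
};

static struct platform_device my_fiq_device = {
        .name           = "fiq_debugger",
        .id             = 0,
        .resource       = my_fiq_resources,
        .num_resources  = ARRAY_SIZE(my_fiq_resources),
        .dev = {
                .platform_data = &my_fiq_pdata,
        },
};

/* Registered from board init code with platform_device_register(&my_fiq_device). */
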
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm.c
new file mode 100644 (file)
index 0000000..8b3e013
--- /dev/null
@@ -0,0 +1,240 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <linux/uaccess.h>
+
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(unsigned cpsr)
+{
+       switch (cpsr & MODE_MASK) {
+       case USR_MODE: return "USR";
+       case FIQ_MODE: return "FIQ";
+       case IRQ_MODE: return "IRQ";
+       case SVC_MODE: return "SVC";
+       case ABT_MODE: return "ABT";
+       case UND_MODE: return "UND";
+       case SYSTEM_MODE: return "SYS";
+       default: return "???";
+       }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       output->printf(output, " pc %08x cpsr %08x mode %s\n",
+               regs->ARM_pc, regs->ARM_cpsr, mode_name(regs->ARM_cpsr));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       output->printf(output,
+                       " r0 %08x  r1 %08x  r2 %08x  r3 %08x\n",
+                       regs->ARM_r0, regs->ARM_r1, regs->ARM_r2, regs->ARM_r3);
+       output->printf(output,
+                       " r4 %08x  r5 %08x  r6 %08x  r7 %08x\n",
+                       regs->ARM_r4, regs->ARM_r5, regs->ARM_r6, regs->ARM_r7);
+       output->printf(output,
+                       " r8 %08x  r9 %08x r10 %08x r11 %08x  mode %s\n",
+                       regs->ARM_r8, regs->ARM_r9, regs->ARM_r10, regs->ARM_fp,
+                       mode_name(regs->ARM_cpsr));
+       output->printf(output,
+                       " ip %08x  sp %08x  lr %08x  pc %08x cpsr %08x\n",
+                       regs->ARM_ip, regs->ARM_sp, regs->ARM_lr, regs->ARM_pc,
+                       regs->ARM_cpsr);
+}
+
+struct mode_regs {
+       unsigned long sp_svc;
+       unsigned long lr_svc;
+       unsigned long spsr_svc;
+
+       unsigned long sp_abt;
+       unsigned long lr_abt;
+       unsigned long spsr_abt;
+
+       unsigned long sp_und;
+       unsigned long lr_und;
+       unsigned long spsr_und;
+
+       unsigned long sp_irq;
+       unsigned long lr_irq;
+       unsigned long spsr_irq;
+
+       unsigned long r8_fiq;
+       unsigned long r9_fiq;
+       unsigned long r10_fiq;
+       unsigned long r11_fiq;
+       unsigned long r12_fiq;
+       unsigned long sp_fiq;
+       unsigned long lr_fiq;
+       unsigned long spsr_fiq;
+};
+
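+/*
+ * Capture the banked registers of each privileged ARM mode: switch the CPU
+ * through SVC, ABT, UND, IRQ and FIQ with IRQs/FIQs masked, store each
+ * mode's sp/lr/spsr (plus r8-r12 for FIQ) into *regs, then restore the
+ * caller's original mode.
+ */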
+static void __naked get_mode_regs(struct mode_regs *regs)
+{
+       asm volatile (
+       "mrs    r1, cpsr\n"
+       "msr    cpsr_c, #0xd3 @(SVC_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+       "stmia  r0!, {r13 - r14}\n"
+       "mrs    r2, spsr\n"
+       "msr    cpsr_c, #0xd7 @(ABT_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+       "stmia  r0!, {r2, r13 - r14}\n"
+       "mrs    r2, spsr\n"
+       "msr    cpsr_c, #0xdb @(UND_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+       "stmia  r0!, {r2, r13 - r14}\n"
+       "mrs    r2, spsr\n"
+       "msr    cpsr_c, #0xd2 @(IRQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+       "stmia  r0!, {r2, r13 - r14}\n"
+       "mrs    r2, spsr\n"
+       "msr    cpsr_c, #0xd1 @(FIQ_MODE | PSR_I_BIT | PSR_F_BIT)\n"
+       "stmia  r0!, {r2, r8 - r14}\n"
+       "mrs    r2, spsr\n"
+       "stmia  r0!, {r2}\n"
+       "msr    cpsr_c, r1\n"
+       "bx     lr\n");
+}
+
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       struct mode_regs mode_regs;
+       unsigned long mode = regs->ARM_cpsr & MODE_MASK;
+
+       fiq_debugger_dump_regs(output, regs);
+       get_mode_regs(&mode_regs);
+
+       output->printf(output,
+                       "%csvc: sp %08x  lr %08x  spsr %08x\n",
+                       mode == SVC_MODE ? '*' : ' ',
+                       mode_regs.sp_svc, mode_regs.lr_svc, mode_regs.spsr_svc);
+       output->printf(output,
+                       "%cabt: sp %08x  lr %08x  spsr %08x\n",
+                       mode == ABT_MODE ? '*' : ' ',
+                       mode_regs.sp_abt, mode_regs.lr_abt, mode_regs.spsr_abt);
+       output->printf(output,
+                       "%cund: sp %08x  lr %08x  spsr %08x\n",
+                       mode == UND_MODE ? '*' : ' ',
+                       mode_regs.sp_und, mode_regs.lr_und, mode_regs.spsr_und);
+       output->printf(output,
+                       "%cirq: sp %08x  lr %08x  spsr %08x\n",
+                       mode == IRQ_MODE ? '*' : ' ',
+                       mode_regs.sp_irq, mode_regs.lr_irq, mode_regs.spsr_irq);
+       output->printf(output,
+                       "%cfiq: r8 %08x  r9 %08x  r10 %08x  r11 %08x  r12 %08x\n",
+                       mode == FIQ_MODE ? '*' : ' ',
+                       mode_regs.r8_fiq, mode_regs.r9_fiq, mode_regs.r10_fiq,
+                       mode_regs.r11_fiq, mode_regs.r12_fiq);
+       output->printf(output,
+                       " fiq: sp %08x  lr %08x  spsr %08x\n",
+                       mode_regs.sp_fiq, mode_regs.lr_fiq, mode_regs.spsr_fiq);
+}
+
+struct stacktrace_state {
+       struct fiq_debugger_output *output;
+       unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+       struct stacktrace_state *sts = d;
+
+       if (sts->depth) {
+               sts->output->printf(sts->output,
+                       "  pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+                       frame->pc, frame->pc, frame->lr, frame->lr,
+                       frame->sp, frame->fp);
+               sts->depth--;
+               return 0;
+       }
+       sts->output->printf(sts->output, "  ...\n");
+
+       return sts->depth == 0;
+}
+
+struct frame_tail {
+       struct frame_tail *fp;
+       unsigned long sp;
+       unsigned long lr;
+} __attribute__((packed));
+
+static struct frame_tail *user_backtrace(struct fiq_debugger_output *output,
+                                       struct frame_tail *tail)
+{
+       struct frame_tail buftail[2];
+
+       /* Also check accessibility of one struct frame_tail beyond */
+       if (!access_ok(VERIFY_READ, tail, sizeof(buftail))) {
+               output->printf(output, "  invalid frame pointer %p\n",
+                               tail);
+               return NULL;
+       }
+       if (__copy_from_user_inatomic(buftail, tail, sizeof(buftail))) {
+               output->printf(output,
+                       "  failed to copy frame pointer %p\n", tail);
+               return NULL;
+       }
+
+       output->printf(output, "  %p\n", buftail[0].lr);
+
+       /* frame pointers should strictly progress back up the stack
+        * (towards higher addresses) */
+       if (tail >= buftail[0].fp)
+               return NULL;
+
+       return buftail[0].fp-1;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+               const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+       struct frame_tail *tail;
+       struct thread_info *real_thread_info = THREAD_INFO(ssp);
+       struct stacktrace_state sts;
+
+       sts.depth = depth;
+       sts.output = output;
+       *current_thread_info() = *real_thread_info;
+
+       if (!current)
+               output->printf(output, "current NULL\n");
+       else
+               output->printf(output, "pid: %d  comm: %s\n",
+                       current->pid, current->comm);
+       fiq_debugger_dump_regs(output, regs);
+
+       if (!user_mode(regs)) {
+               struct stackframe frame;
+               frame.fp = regs->ARM_fp;
+               frame.sp = regs->ARM_sp;
+               frame.lr = regs->ARM_lr;
+               frame.pc = regs->ARM_pc;
+               output->printf(output,
+                       "  pc: %p (%pF), lr %p (%pF), sp %p, fp %p\n",
+                       regs->ARM_pc, regs->ARM_pc, regs->ARM_lr, regs->ARM_lr,
+                       regs->ARM_sp, regs->ARM_fp);
+               walk_stackframe(&frame, report_trace, &sts);
+               return;
+       }
+
+       tail = ((struct frame_tail *) regs->ARM_fp) - 1;
+       while (depth-- && tail && !((unsigned long) tail & 3))
+               tail = user_backtrace(output, tail);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c b/drivers/staging/android/fiq_debugger/fiq_debugger_arm64.c
new file mode 100644 (file)
index 0000000..c53f498
--- /dev/null
@@ -0,0 +1,201 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ptrace.h>
+#include <asm/stacktrace.h>
+
+#include "fiq_debugger_priv.h"
+
+static char *mode_name(const struct pt_regs *regs)
+{
+       if (compat_user_mode(regs)) {
+               return "USR";
+       } else {
+               switch (processor_mode(regs)) {
+               case PSR_MODE_EL0t: return "EL0t";
+               case PSR_MODE_EL1t: return "EL1t";
+               case PSR_MODE_EL1h: return "EL1h";
+               case PSR_MODE_EL2t: return "EL2t";
+               case PSR_MODE_EL2h: return "EL2h";
+               default: return "???";
+               }
+       }
+}
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       output->printf(output, " pc %016lx cpsr %08lx mode %s\n",
+               regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch32(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       output->printf(output, " r0 %08x  r1 %08x  r2 %08x  r3 %08x\n",
+                       regs->compat_usr(0), regs->compat_usr(1),
+                       regs->compat_usr(2), regs->compat_usr(3));
+       output->printf(output, " r4 %08x  r5 %08x  r6 %08x  r7 %08x\n",
+                       regs->compat_usr(4), regs->compat_usr(5),
+                       regs->compat_usr(6), regs->compat_usr(7));
+       output->printf(output, " r8 %08x  r9 %08x r10 %08x r11 %08x\n",
+                       regs->compat_usr(8), regs->compat_usr(9),
+                       regs->compat_usr(10), regs->compat_usr(11));
+       output->printf(output, " ip %08x  sp %08x  lr %08x  pc %08x\n",
+                       regs->compat_usr(12), regs->compat_sp,
+                       regs->compat_lr, regs->pc);
+       output->printf(output, " cpsr %08x (%s)\n",
+                       regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs_aarch64(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+
+       output->printf(output, "  x0 %016lx   x1 %016lx\n",
+                       regs->regs[0], regs->regs[1]);
+       output->printf(output, "  x2 %016lx   x3 %016lx\n",
+                       regs->regs[2], regs->regs[3]);
+       output->printf(output, "  x4 %016lx   x5 %016lx\n",
+                       regs->regs[4], regs->regs[5]);
+       output->printf(output, "  x6 %016lx   x7 %016lx\n",
+                       regs->regs[6], regs->regs[7]);
+       output->printf(output, "  x8 %016lx   x9 %016lx\n",
+                       regs->regs[8], regs->regs[9]);
+       output->printf(output, " x10 %016lx  x11 %016lx\n",
+                       regs->regs[10], regs->regs[11]);
+       output->printf(output, " x12 %016lx  x13 %016lx\n",
+                       regs->regs[12], regs->regs[13]);
+       output->printf(output, " x14 %016lx  x15 %016lx\n",
+                       regs->regs[14], regs->regs[15]);
+       output->printf(output, " x16 %016lx  x17 %016lx\n",
+                       regs->regs[16], regs->regs[17]);
+       output->printf(output, " x18 %016lx  x19 %016lx\n",
+                       regs->regs[18], regs->regs[19]);
+       output->printf(output, " x20 %016lx  x21 %016lx\n",
+                       regs->regs[20], regs->regs[21]);
+       output->printf(output, " x22 %016lx  x23 %016lx\n",
+                       regs->regs[22], regs->regs[23]);
+       output->printf(output, " x24 %016lx  x25 %016lx\n",
+                       regs->regs[24], regs->regs[25]);
+       output->printf(output, " x26 %016lx  x27 %016lx\n",
+                       regs->regs[26], regs->regs[27]);
+       output->printf(output, " x28 %016lx  x29 %016lx\n",
+                       regs->regs[28], regs->regs[29]);
+       output->printf(output, " x30 %016lx   sp %016lx\n",
+                       regs->regs[30], regs->sp);
+       output->printf(output, "  pc %016lx cpsr %08x (%s)\n",
+                       regs->pc, regs->pstate, mode_name(regs));
+}
+
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       if (compat_user_mode(regs))
+               fiq_debugger_dump_regs_aarch32(output, regs);
+       else
+               fiq_debugger_dump_regs_aarch64(output, regs);
+}
+
+#define READ_SPECIAL_REG(x) ({ \
+       u64 val; \
+       asm volatile ("mrs %0, " # x : "=r"(val)); \
+       val; \
+})
+
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs)
+{
+       u32 pstate = READ_SPECIAL_REG(CurrentEl);
+       bool in_el2 = (pstate & PSR_MODE_MASK) >= PSR_MODE_EL2t;
+
+       fiq_debugger_dump_regs(output, regs);
+
+       output->printf(output, " sp_el0   %016lx\n",
+                       READ_SPECIAL_REG(sp_el0));
+
+       if (in_el2)
+               output->printf(output, " sp_el1   %016lx\n",
+                               READ_SPECIAL_REG(sp_el1));
+
+       output->printf(output, " elr_el1  %016lx\n",
+                       READ_SPECIAL_REG(elr_el1));
+
+       output->printf(output, " spsr_el1 %08lx\n",
+                       READ_SPECIAL_REG(spsr_el1));
+
+       if (in_el2) {
+               output->printf(output, " spsr_irq %08lx\n",
+                               READ_SPECIAL_REG(spsr_irq));
+               output->printf(output, " spsr_abt %08lx\n",
+                               READ_SPECIAL_REG(spsr_abt));
+               output->printf(output, " spsr_und %08lx\n",
+                               READ_SPECIAL_REG(spsr_und));
+               output->printf(output, " spsr_fiq %08lx\n",
+                               READ_SPECIAL_REG(spsr_fiq));
+               output->printf(output, " elr_el2  %016lx\n",
+                               READ_SPECIAL_REG(elr_el2));
+               output->printf(output, " spsr_el2 %08lx\n",
+                               READ_SPECIAL_REG(spsr_el2));
+       }
+}
+
+struct stacktrace_state {
+       struct fiq_debugger_output *output;
+       unsigned int depth;
+};
+
+static int report_trace(struct stackframe *frame, void *d)
+{
+       struct stacktrace_state *sts = d;
+
+       if (sts->depth) {
+               sts->output->printf(sts->output, "%pF:\n", frame->pc);
+               sts->output->printf(sts->output,
+                               "  pc %016lx   fp %016lx\n",
+                               frame->pc, frame->fp);
+               sts->depth--;
+               return 0;
+       }
+       sts->output->printf(sts->output, "  ...\n");
+
+       return sts->depth == 0;
+}
+
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+               const struct pt_regs *regs, unsigned int depth, void *ssp)
+{
+       struct thread_info *real_thread_info = THREAD_INFO(ssp);
+       struct stacktrace_state sts;
+
+       sts.depth = depth;
+       sts.output = output;
+       *current_thread_info() = *real_thread_info;
+
+       if (!current)
+               output->printf(output, "current NULL\n");
+       else
+               output->printf(output, "pid: %d  comm: %s\n",
+                       current->pid, current->comm);
+       fiq_debugger_dump_regs(output, regs);
+
+       if (!user_mode(regs)) {
+               struct stackframe frame;
+               frame.fp = regs->regs[29];
+               frame.pc = regs->pc;
+               output->printf(output, "\n");
+               walk_stackframe(current, &frame, report_trace, &sts);
+       }
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h b/drivers/staging/android/fiq_debugger/fiq_debugger_priv.h
new file mode 100644 (file)
index 0000000..d5d051f
--- /dev/null
@@ -0,0 +1,37 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ * Author: Colin Cross <ccross@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_DEBUGGER_PRIV_H_
+#define _FIQ_DEBUGGER_PRIV_H_
+
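+/*
+ * Recover the struct thread_info at the base of the stack containing |sp|,
+ * assuming THREAD_SIZE-aligned kernel stacks with thread_info at the bottom.
+ */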
+#define THREAD_INFO(sp) ((struct thread_info *) \
+               ((unsigned long)(sp) & ~(THREAD_SIZE - 1)))
+
+struct fiq_debugger_output {
+       void (*printf)(struct fiq_debugger_output *output, const char *fmt, ...);
+};
+
+struct pt_regs;
+
+void fiq_debugger_dump_pc(struct fiq_debugger_output *output,
+               const struct pt_regs *regs);
+void fiq_debugger_dump_regs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs);
+void fiq_debugger_dump_allregs(struct fiq_debugger_output *output,
+               const struct pt_regs *regs);
+void fiq_debugger_dump_stacktrace(struct fiq_debugger_output *output,
+               const struct pt_regs *regs, unsigned int depth, void *ssp);
+
+#endif
diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h b/drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
new file mode 100644 (file)
index 0000000..10c3c5d
--- /dev/null
@@ -0,0 +1,94 @@
+/*
+ * drivers/staging/android/fiq_debugger/fiq_debugger_ringbuf.h
+ *
+ * simple lockless ringbuffer
+ *
+ * Copyright (C) 2010 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
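+/*
+ * Single producer, single consumer: only the producer advances |head| and
+ * only the consumer advances |tail|; the smp_mb() calls order the data
+ * accesses against the index updates.
+ */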
+struct fiq_debugger_ringbuf {
+       int len;
+       int head;
+       int tail;
+       u8 buf[];
+};
+
+
+static inline struct fiq_debugger_ringbuf *fiq_debugger_ringbuf_alloc(int len)
+{
+       struct fiq_debugger_ringbuf *rbuf;
+
+       rbuf = kzalloc(sizeof(*rbuf) + len, GFP_KERNEL);
+       if (rbuf == NULL)
+               return NULL;
+
+       rbuf->len = len;
+       rbuf->head = 0;
+       rbuf->tail = 0;
+       smp_mb();
+
+       return rbuf;
+}
+
+static inline void fiq_debugger_ringbuf_free(struct fiq_debugger_ringbuf *rbuf)
+{
+       kfree(rbuf);
+}
+
+static inline int fiq_debugger_ringbuf_level(struct fiq_debugger_ringbuf *rbuf)
+{
+       int level = rbuf->head - rbuf->tail;
+
+       if (level < 0)
+               level = rbuf->len + level;
+
+       return level;
+}
+
+static inline int fiq_debugger_ringbuf_room(struct fiq_debugger_ringbuf *rbuf)
+{
+       return rbuf->len - fiq_debugger_ringbuf_level(rbuf) - 1;
+}
+
+static inline u8
+fiq_debugger_ringbuf_peek(struct fiq_debugger_ringbuf *rbuf, int i)
+{
+       return rbuf->buf[(rbuf->tail + i) % rbuf->len];
+}
+
+static inline int
+fiq_debugger_ringbuf_consume(struct fiq_debugger_ringbuf *rbuf, int count)
+{
+       count = min(count, fiq_debugger_ringbuf_level(rbuf));
+
+       rbuf->tail = (rbuf->tail + count) % rbuf->len;
+       smp_mb();
+
+       return count;
+}
+
+static inline int
+fiq_debugger_ringbuf_push(struct fiq_debugger_ringbuf *rbuf, u8 datum)
+{
+       if (fiq_debugger_ringbuf_room(rbuf) == 0)
+               return 0;
+
+       rbuf->buf[rbuf->head] = datum;
+       smp_mb();
+       rbuf->head = (rbuf->head + 1) % rbuf->len;
+       smp_mb();
+
+       return 1;
+}
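
As a usage note, the sketch below mirrors how fiq_debugger.c drives this ring buffer (one producer filling it from the IRQ/FIQ path, one consumer draining it into the TTY); example_rx()/example_drain() are illustrative names only:

#include <linux/tty.h>
#include <linux/tty_flip.h>
#include "fiq_debugger_ringbuf.h"

/* Producer side, e.g. called for each character received in the IRQ path. */
static void example_rx(struct fiq_debugger_ringbuf *rbuf, u8 ch)
{
        if (!fiq_debugger_ringbuf_push(rbuf, ch))
                pr_warn("fiq ringbuf full, dropped 0x%02x\n", ch);
}

/* Consumer side, e.g. a work function draining pending bytes into a tty_port. */
static void example_drain(struct fiq_debugger_ringbuf *rbuf,
                          struct tty_port *port)
{
        int i, level = fiq_debugger_ringbuf_level(rbuf);

        for (i = 0; i < level; i++)
                tty_insert_flip_char(port, fiq_debugger_ringbuf_peek(rbuf, i),
                                     TTY_NORMAL);
        fiq_debugger_ringbuf_consume(rbuf, level);
        tty_flip_buffer_push(port);
}
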
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.c b/drivers/staging/android/fiq_debugger/fiq_watchdog.c
new file mode 100644 (file)
index 0000000..194b541
--- /dev/null
@@ -0,0 +1,56 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/spinlock.h>
+#include <linux/pstore_ram.h>
+
+#include "fiq_watchdog.h"
+#include "fiq_debugger_priv.h"
+
+static DEFINE_RAW_SPINLOCK(fiq_watchdog_lock);
+
+static void fiq_watchdog_printf(struct fiq_debugger_output *output,
+                               const char *fmt, ...)
+{
+       char buf[256];
+       va_list ap;
+       int len;
+
+       va_start(ap, fmt);
+       len = vscnprintf(buf, sizeof(buf), fmt, ap);
+       va_end(ap);
+
+       ramoops_console_write_buf(buf, len);
+}
+
+struct fiq_debugger_output fiq_watchdog_output = {
+       .printf = fiq_watchdog_printf,
+};
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp)
+{
+       char msg[24];
+       int len;
+
+       raw_spin_lock(&fiq_watchdog_lock);
+
+       len = scnprintf(msg, sizeof(msg), "watchdog fiq cpu %d\n",
+                       THREAD_INFO(svc_sp)->cpu);
+       ramoops_console_write_buf(msg, len);
+
+       fiq_debugger_dump_stacktrace(&fiq_watchdog_output, regs, 100, svc_sp);
+
+       raw_spin_unlock(&fiq_watchdog_lock);
+}
diff --git a/drivers/staging/android/fiq_debugger/fiq_watchdog.h b/drivers/staging/android/fiq_debugger/fiq_watchdog.h
new file mode 100644 (file)
index 0000000..c6b507f
--- /dev/null
@@ -0,0 +1,20 @@
+/*
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _FIQ_WATCHDOG_H_
+#define _FIQ_WATCHDOG_H_
+
+void fiq_watchdog_triggered(const struct pt_regs *regs, void *svc_sp);
+
+#endif
index 4e094602437c3dea1c36e7614f808d680a0b4aa1..d293bbc22c792a41d83dddf5e4c5861135ae633e 100644 (file)
@@ -4,6 +4,14 @@ config GOLDFISH_AUDIO
        ---help---
          Emulated audio channel for the Goldfish Android Virtual Device
 
+config GOLDFISH_SYNC
+	tristate "Goldfish AVD Sync Driver"
+	depends on GOLDFISH
+	depends on SW_SYNC
+	depends on SYNC_FILE
+       ---help---
+         Emulated sync fences for the Goldfish Android Virtual Device
+
 config MTD_GOLDFISH_NAND
        tristate "Goldfish NAND device"
        depends on GOLDFISH
index dec34ad58162fdda2dde0969b0e004bc3dbfb01d..3313fce4e940d7be564416ca1485225ab41aba4b 100644 (file)
@@ -4,3 +4,9 @@
 
 obj-$(CONFIG_GOLDFISH_AUDIO) += goldfish_audio.o
 obj-$(CONFIG_MTD_GOLDFISH_NAND)        += goldfish_nand.o
+
+# and sync
+
+ccflags-y := -Idrivers/staging/android
+goldfish_sync-objs := goldfish_sync_timeline_fence.o goldfish_sync_timeline.o
+obj-$(CONFIG_GOLDFISH_SYNC) += goldfish_sync.o
index bd559956f199001dcf2743f5e944111f8236169b..0bb0ee2e691f1a8899ca63a9c5bc93d6a8a7a36b 100644 (file)
@@ -28,6 +28,7 @@
 #include <linux/uaccess.h>
 #include <linux/slab.h>
 #include <linux/goldfish.h>
+#include <linux/acpi.h>
 
 MODULE_AUTHOR("Google, Inc.");
 MODULE_DESCRIPTION("Android QEMU Audio Driver");
@@ -116,6 +117,7 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
                                   size_t count, loff_t *pos)
 {
        struct goldfish_audio *data = fp->private_data;
+       unsigned long irq_flags;
        int length;
        int result = 0;
 
@@ -129,6 +131,10 @@ static ssize_t goldfish_audio_read(struct file *fp, char __user *buf,
                wait_event_interruptible(data->wait, data->buffer_status &
                                         AUDIO_INT_READ_BUFFER_FULL);
 
+               spin_lock_irqsave(&data->lock, irq_flags);
+               data->buffer_status &= ~AUDIO_INT_READ_BUFFER_FULL;
+               spin_unlock_irqrestore(&data->lock, irq_flags);
+
                length = AUDIO_READ(data, AUDIO_READ_BUFFER_AVAILABLE);
 
                /* copy data to user space */
@@ -351,12 +357,19 @@ static const struct of_device_id goldfish_audio_of_match[] = {
 };
 MODULE_DEVICE_TABLE(of, goldfish_audio_of_match);
 
+static const struct acpi_device_id goldfish_audio_acpi_match[] = {
+       { "GFSH0005", 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_audio_acpi_match);
+
 static struct platform_driver goldfish_audio_driver = {
        .probe          = goldfish_audio_probe,
        .remove         = goldfish_audio_remove,
        .driver = {
                .name = "goldfish_audio",
                .of_match_table = goldfish_audio_of_match,
+               .acpi_match_table = ACPI_PTR(goldfish_audio_acpi_match),
        }
 };
 
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline.c b/drivers/staging/goldfish/goldfish_sync_timeline.c
new file mode 100644 (file)
index 0000000..880d6e2
--- /dev/null
@@ -0,0 +1,962 @@
+/*
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/fdtable.h>
+#include <linux/file.h>
+#include <linux/init.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+
+#include <linux/interrupt.h>
+#include <linux/kref.h>
+#include <linux/spinlock.h>
+#include <linux/types.h>
+
+#include <linux/io.h>
+#include <linux/mm.h>
+#include <linux/acpi.h>
+
+#include <linux/string.h>
+
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/dma-fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+#define ERR(...) printk(KERN_ERR __VA_ARGS__);
+
+#define INFO(...) printk(KERN_INFO __VA_ARGS__);
+
+#define DPRINT(...) pr_debug(__VA_ARGS__);
+
+#define DTRACE() DPRINT("%s: enter", __func__)
+
+/* The Goldfish sync driver is designed to provide an interface
+ * between the underlying host's sync device and the kernel's
+ * fence sync framework.
+ * The purpose of the device/driver is to enable lightweight
+ * creation and signaling of timelines and fences
+ * in order to synchronize the guest with host-side graphics events.
+ *
+ * Each time the interrupt trips, the driver
+ * may perform a sync operation.
+ */
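+
+/* Rough flow: the host writes a goldfish_sync_hostcmd into the shared batch
+ * buffer and raises the IRQ; goldfish_sync_interrupt() copies the pending
+ * commands into |to_do| and schedules goldfish_sync_work_item_fn(), which
+ * runs them (e.g. CMD_SYNC_TIMELINE_INC signals the matching timeline).
+ */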
+
+/* The operations are: */
+
+/* Ready signal - used to mark when irq should lower */
+#define CMD_SYNC_READY            0
+
+/* Create a new timeline. writes timeline handle */
+#define CMD_CREATE_SYNC_TIMELINE  1
+
+/* Create a fence object. reads timeline handle and time argument.
+ * Writes fence fd to the SYNC_REG_HANDLE register. */
+#define CMD_CREATE_SYNC_FENCE     2
+
+/* Increments timeline. reads timeline handle and time argument */
+#define CMD_SYNC_TIMELINE_INC     3
+
+/* Destroys a timeline. reads timeline handle */
+#define CMD_DESTROY_SYNC_TIMELINE 4
+
+/* Starts a wait on the host with
+ * the given glsync object and sync thread handle. */
+#define CMD_TRIGGER_HOST_WAIT     5
+
+/* The register layout is: */
+
+#define SYNC_REG_BATCH_COMMAND                0x00 /* host->guest batch commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND           0x04 /* guest->host batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR           0x08 /* communicate physical address of host->guest batch commands */
+#define SYNC_REG_BATCH_COMMAND_ADDR_HIGH      0x0c /* 64-bit part */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR      0x10 /* communicate physical address of guest->host commands */
+#define SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH 0x14 /* 64-bit part */
+#define SYNC_REG_INIT                         0x18 /* signals that the device has been probed */
+
+/* There is an ioctl associated with goldfish sync driver.
+ * Make it conflict with ioctls that are not likely to be used
+ * in the emulator.
+ *
+ * '@' 00-0F   linux/radeonfb.h        conflict!
+ * '@' 00-0F   drivers/video/aty/aty128fb.c    conflict!
+ */
+#define GOLDFISH_SYNC_IOC_MAGIC        '@'
+
+#define GOLDFISH_SYNC_IOC_QUEUE_WORK   _IOWR(GOLDFISH_SYNC_IOC_MAGIC, 0, struct goldfish_sync_ioctl_info)
+
+/* The above definitions (command codes, register layout, ioctl definitions)
+ * need to be in sync with the following files:
+ *
+ * Host-side (emulator):
+ * external/qemu/android/emulation/goldfish_sync.h
+ * external/qemu-android/hw/misc/goldfish_sync.c
+ *
+ * Guest-side (system image):
+ * device/generic/goldfish-opengl/system/egl/goldfish_sync.h
+ * device/generic/goldfish/ueventd.ranchu.rc
+ * platform/build/target/board/generic/sepolicy/file_contexts
+ */
+struct goldfish_sync_hostcmd {
+       /* sorted for alignment */
+       uint64_t handle;
+       uint64_t hostcmd_handle;
+       uint32_t cmd;
+       uint32_t time_arg;
+};
+
+struct goldfish_sync_guestcmd {
+       uint64_t host_command; /* uint64_t for alignment */
+       uint64_t glsync_handle;
+       uint64_t thread_handle;
+       uint64_t guest_timeline_handle;
+};
+
+#define GOLDFISH_SYNC_MAX_CMDS 32
+
+struct goldfish_sync_state {
+       char __iomem *reg_base;
+       int irq;
+
+       /* Spinlock protects |to_do| / |to_do_end|. */
+       spinlock_t lock;
+       /* |mutex_lock| protects all concurrent access
+        * to timelines for both kernel and user space. */
+       struct mutex mutex_lock;
+
+       /* Buffer holding commands issued from host. */
+       struct goldfish_sync_hostcmd to_do[GOLDFISH_SYNC_MAX_CMDS];
+       uint32_t to_do_end;
+
+       /* Addresses for the reading or writing
+        * of individual commands. The host can directly write
+        * to |batch_hostcmd| (and then this driver immediately
+        * copies contents to |to_do|). This driver either replies
+        * through |batch_hostcmd| or simply issues a
+        * guest->host command through |batch_guestcmd|.
+        */
+       struct goldfish_sync_hostcmd *batch_hostcmd;
+       struct goldfish_sync_guestcmd *batch_guestcmd;
+
+       /* Used to give this struct itself to a work queue
+        * function for executing actual sync commands. */
+       struct work_struct work_item;
+};
+
+static struct goldfish_sync_state global_sync_state[1];
+
+struct goldfish_sync_timeline_obj {
+       struct goldfish_sync_timeline *sync_tl;
+       uint32_t current_time;
+       /* We need to be careful about when we deallocate
+        * this |goldfish_sync_timeline_obj| struct.
+        * In order to ensure proper cleanup, we need to
+        * consider the triggered host-side wait that may
+        * still be in flight when the guest close()'s a
+        * goldfish_sync device's sync context fd (and
+        * destroys the |sync_tl| field above).
+        * The host-side wait may raise IRQ
+        * and tell the kernel to increment the timeline _after_
+        * the |sync_tl| has already been set to null.
+        *
+        * From observations on OpenGL apps and CTS tests, this
+        * happens at some very low probability upon context
+        * destruction or process close, but it does happen
+        * and it needs to be handled properly. Otherwise,
+        * if we clean up the surrounding |goldfish_sync_timeline_obj|
+        * too early, any |handle| field of any host->guest command
+        * might not even point to a null |sync_tl| field,
+        * but to garbage memory or even a reclaimed |sync_tl|.
+        * If we do not count such "pending waits" and kfree the object
+        * immediately upon |goldfish_sync_timeline_destroy|,
+        * we might get mysterious RCU stalls after running a long
+        * time because the garbage memory that is being read
+        * happens to be interpretable as a |spinlock_t| struct
+        * that is currently in the locked state.
+        *
+        * To track when to free the |goldfish_sync_timeline_obj|
+        * itself, we maintain a kref.
+        * The kref essentially counts the timeline itself plus
+        * the number of waits in flight. kref_init/kref_put
+        * are issued on
+        * |goldfish_sync_timeline_create|/|goldfish_sync_timeline_destroy|
+        * and kref_get/kref_put are issued on
+        * |goldfish_sync_fence_create|/|goldfish_sync_timeline_inc|.
+        *
+        * The timeline is destroyed after reference count
+        * reaches zero, which would happen after
+        * |goldfish_sync_timeline_destroy| and all pending
+        * |goldfish_sync_timeline_inc|'s are fulfilled.
+        *
+        * NOTE (1): We assume that |fence_create| and
+        * |timeline_inc| calls are 1:1, otherwise the kref scheme
+        * will not work. This is a valid assumption as long
+        * as the host-side virtual device implementation
+        * does not insert any timeline increments
+        * that we did not trigger from here.
+        *
+        * NOTE (2): The use of kref by itself requires no locks,
+        * but this does not mean everything works without locks.
+        * Related timeline operations do require a lock of some sort,
+        * or at least are not proven to work without it.
+        * In particular, we assume that all the operations
+        * done on the |kref| field above are done in contexts where
+        * |global_sync_state->mutex_lock| is held. Do not
+        * remove that lock until everything is proven to work
+        * without it!!! */
+       struct kref kref;
+};
+
+/* We will call |delete_timeline_obj| when the last reference count
+ * of the kref is decremented. This deletes the sync
+ * timeline object along with the wrapper itself. */
+static void delete_timeline_obj(struct kref* kref) {
+       struct goldfish_sync_timeline_obj* obj =
+               container_of(kref, struct goldfish_sync_timeline_obj, kref);
+
+       goldfish_sync_timeline_put_internal(obj->sync_tl);
+       obj->sync_tl = NULL;
+       kfree(obj);
+}
+
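+/* gensym() names new timelines/fences; its callers hold
+ * global_sync_state->mutex_lock (see the comments on the create functions),
+ * so the plain counter is not racy in practice.
+ */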
+static uint64_t gensym_ctr;
+static void gensym(char *dst)
+{
+       sprintf(dst, "goldfish_sync:gensym:%llu", gensym_ctr);
+       gensym_ctr++;
+}
+
+/* |goldfish_sync_timeline_create| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static struct goldfish_sync_timeline_obj*
+goldfish_sync_timeline_create(void)
+{
+
+       char timeline_name[256];
+       struct goldfish_sync_timeline *res_sync_tl = NULL;
+       struct goldfish_sync_timeline_obj *res;
+
+       DTRACE();
+
+       gensym(timeline_name);
+
+       res_sync_tl = goldfish_sync_timeline_create_internal(timeline_name);
+       if (!res_sync_tl) {
+               ERR("Failed to create goldfish_sw_sync timeline.");
+               return NULL;
+       }
+
+       res = kzalloc(sizeof(struct goldfish_sync_timeline_obj), GFP_KERNEL);
+       if (!res) {
+               goldfish_sync_timeline_put_internal(res_sync_tl);
+               return NULL;
+       }
+       res->sync_tl = res_sync_tl;
+       res->current_time = 0;
+       kref_init(&res->kref);
+
+       DPRINT("new timeline_obj=0x%p", res);
+       return res;
+}
+
+/* |goldfish_sync_fence_create| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static int
+goldfish_sync_fence_create(struct goldfish_sync_timeline_obj *obj,
+                                                       uint32_t val)
+{
+
+       int fd;
+       char fence_name[256];
+       struct sync_pt *syncpt = NULL;
+       struct sync_file *sync_file_obj = NULL;
+       struct goldfish_sync_timeline *tl;
+
+       DTRACE();
+
+       if (!obj) return -1;
+
+       tl = obj->sync_tl;
+
+       syncpt = goldfish_sync_pt_create_internal(
+                               tl, sizeof(struct sync_pt) + 4, val);
+       if (!syncpt) {
+               ERR("could not create sync point! "
+                       "goldfish_sync_timeline=0x%p val=%d",
+                          tl, val);
+               return -1;
+       }
+
+       fd = get_unused_fd_flags(O_CLOEXEC);
+       if (fd < 0) {
+               ERR("could not get unused fd for sync fence. "
+                       "errno=%d", fd);
+               goto err_cleanup_pt;
+       }
+
+       gensym(fence_name);
+
+       sync_file_obj = sync_file_create(&syncpt->base);
+       if (!sync_file_obj) {
+               ERR("could not create sync fence! "
+                       "goldfish_sync_timeline=0x%p val=%d sync_pt=0x%p",
+                          tl, val, syncpt);
+               goto err_cleanup_fd_pt;
+       }
+
+       DPRINT("installing sync fence into fd %d sync_file_obj=0x%p",
+                       fd, sync_file_obj);
+       fd_install(fd, sync_file_obj->file);
+       kref_get(&obj->kref);
+
+       return fd;
+
+err_cleanup_fd_pt:
+       put_unused_fd(fd);
+err_cleanup_pt:
+       dma_fence_put(&syncpt->base);
+       return -1;
+}
+
+/* |goldfish_sync_timeline_inc| assumes that |global_sync_state->mutex_lock|
+ * is held. */
+static void
+goldfish_sync_timeline_inc(struct goldfish_sync_timeline_obj *obj, uint32_t inc)
+{
+       DTRACE();
+       /* Just give up if someone else nuked the timeline.
+        * Whoever it was won't care that it doesn't get signaled. */
+       if (!obj) return;
+
+       DPRINT("timeline_obj=0x%p", obj);
+       goldfish_sync_timeline_signal_internal(obj->sync_tl, inc);
+       DPRINT("incremented timeline. increment max_time");
+       obj->current_time += inc;
+
+       /* Here, we will end up deleting the timeline object if it
+        * turns out that this call was a pending increment after
+        * |goldfish_sync_timeline_destroy| was called. */
+       kref_put(&obj->kref, delete_timeline_obj);
+       DPRINT("done");
+}
+
+/* |goldfish_sync_timeline_destroy| assumes
+ * that |global_sync_state->mutex_lock| is held. */
+static void
+goldfish_sync_timeline_destroy(struct goldfish_sync_timeline_obj *obj)
+{
+       DTRACE();
+       /* See description of |goldfish_sync_timeline_obj| for why we
+        * should not immediately destroy |obj| */
+       kref_put(&obj->kref, delete_timeline_obj);
+}
+
+static inline void
+goldfish_sync_cmd_queue(struct goldfish_sync_state *sync_state,
+                                               uint32_t cmd,
+                                               uint64_t handle,
+                                               uint32_t time_arg,
+                                               uint64_t hostcmd_handle)
+{
+       struct goldfish_sync_hostcmd *to_add;
+
+       DTRACE();
+
+       BUG_ON(sync_state->to_do_end == GOLDFISH_SYNC_MAX_CMDS);
+
+       to_add = &sync_state->to_do[sync_state->to_do_end];
+
+       to_add->cmd = cmd;
+       to_add->handle = handle;
+       to_add->time_arg = time_arg;
+       to_add->hostcmd_handle = hostcmd_handle;
+
+       sync_state->to_do_end += 1;
+}
+
+static inline void
+goldfish_sync_hostcmd_reply(struct goldfish_sync_state *sync_state,
+                                                       uint32_t cmd,
+                                                       uint64_t handle,
+                                                       uint32_t time_arg,
+                                                       uint64_t hostcmd_handle)
+{
+       unsigned long irq_flags;
+       struct goldfish_sync_hostcmd *batch_hostcmd =
+               sync_state->batch_hostcmd;
+
+       DTRACE();
+
+       spin_lock_irqsave(&sync_state->lock, irq_flags);
+
+       batch_hostcmd->cmd = cmd;
+       batch_hostcmd->handle = handle;
+       batch_hostcmd->time_arg = time_arg;
+       batch_hostcmd->hostcmd_handle = hostcmd_handle;
+       writel(0, sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+
+       spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+}
+
+static inline void
+goldfish_sync_send_guestcmd(struct goldfish_sync_state *sync_state,
+                                                       uint32_t cmd,
+                                                       uint64_t glsync_handle,
+                                                       uint64_t thread_handle,
+                                                       uint64_t timeline_handle)
+{
+       unsigned long irq_flags;
+       struct goldfish_sync_guestcmd *batch_guestcmd =
+               sync_state->batch_guestcmd;
+
+       DTRACE();
+
+       spin_lock_irqsave(&sync_state->lock, irq_flags);
+
+       batch_guestcmd->host_command = (uint64_t)cmd;
+       batch_guestcmd->glsync_handle = (uint64_t)glsync_handle;
+       batch_guestcmd->thread_handle = (uint64_t)thread_handle;
+       batch_guestcmd->guest_timeline_handle = (uint64_t)timeline_handle;
+       writel(0, sync_state->reg_base + SYNC_REG_BATCH_GUESTCOMMAND);
+
+       spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+}
+
+/* |goldfish_sync_interrupt| handles IRQ raises from the virtual device.
+ * In the context of OpenGL, this interrupt will fire whenever we need
+ * to signal a fence fd in the guest, with the command
+ * |CMD_SYNC_TIMELINE_INC|.
+ * However, because this function will be called in an interrupt context,
+ * it is necessary to do the actual work of signaling off of interrupt context.
+ * The shared work queue is used for this purpose. At the end when
+ * all pending commands are intercepted by the interrupt handler,
+ * we call |schedule_work|, which will later run the actual
+ * desired sync command in |goldfish_sync_work_item_fn|.
+ */
+static irqreturn_t goldfish_sync_interrupt(int irq, void *dev_id)
+{
+
+       struct goldfish_sync_state *sync_state = dev_id;
+
+       uint32_t nextcmd;
+       uint32_t command_r;
+       uint64_t handle_rw;
+       uint32_t time_r;
+       uint64_t hostcmd_handle_rw;
+
+       int count = 0;
+
+       DTRACE();
+
+       spin_lock(&sync_state->lock);
+
+       for (;;) {
+
+               readl(sync_state->reg_base + SYNC_REG_BATCH_COMMAND);
+               nextcmd = sync_state->batch_hostcmd->cmd;
+
+               if (nextcmd == 0)
+                       break;
+
+               command_r = nextcmd;
+               handle_rw = sync_state->batch_hostcmd->handle;
+               time_r = sync_state->batch_hostcmd->time_arg;
+               hostcmd_handle_rw = sync_state->batch_hostcmd->hostcmd_handle;
+
+               goldfish_sync_cmd_queue(
+                               sync_state,
+                               command_r,
+                               handle_rw,
+                               time_r,
+                               hostcmd_handle_rw);
+
+               count++;
+       }
+
+       spin_unlock(&sync_state->lock);
+
+       schedule_work(&sync_state->work_item);
+
+       return (count == 0) ? IRQ_NONE : IRQ_HANDLED;
+}
+
+/* |goldfish_sync_work_item_fn| does the actual work of servicing
+ * host->guest sync commands. This function is triggered whenever
+ * the IRQ for the goldfish sync device is raised. Once it starts
+ * running, it grabs the contents of the buffer containing the
+ * commands it needs to execute (there may be multiple, because
+ * our IRQ is active high and not edge triggered), and then
+ * runs all of them one after the other.
+ */
+static void goldfish_sync_work_item_fn(struct work_struct *input)
+{
+
+       struct goldfish_sync_state *sync_state;
+       int sync_fence_fd;
+
+       struct goldfish_sync_timeline_obj *timeline;
+       uint64_t timeline_ptr;
+
+       uint64_t hostcmd_handle;
+
+       uint32_t cmd;
+       uint64_t handle;
+       uint32_t time_arg;
+
+       struct goldfish_sync_hostcmd *todo;
+       uint32_t todo_end;
+
+       unsigned long irq_flags;
+
+       struct goldfish_sync_hostcmd to_run[GOLDFISH_SYNC_MAX_CMDS];
+       uint32_t i = 0;
+
+       sync_state = container_of(input, struct goldfish_sync_state, work_item);
+
+       mutex_lock(&sync_state->mutex_lock);
+
+       spin_lock_irqsave(&sync_state->lock, irq_flags); {
+
+               todo_end = sync_state->to_do_end;
+
+               DPRINT("num sync todos: %u", sync_state->to_do_end);
+
+               for (i = 0; i < todo_end; i++)
+                       to_run[i] = sync_state->to_do[i];
+
+               /* We expect that commands will come in at a slow enough rate
+                * so that incoming items will not be more than
+                * GOLDFISH_SYNC_MAX_CMDS.
+                *
+                * This is because the way the sync device is used,
+                * it's only for managing buffer data transfers per frame,
+                * with a sequential dependency between putting things in
+                * to_do and taking them out. Once a set of commands is
+                * queued up in to_do, the user of the device waits for
+                * them to be processed before queuing additional commands,
+                * which limits the rate at which commands come in
+                * to the rate at which we take them out here.
+                *
+                * We also don't expect more than MAX_CMDS to be issued
+                * at once; there is a correspondence between the buffers
+                * that need swapping to the display / buffer queue and
+                * particular commands, and we don't expect enough displays
+                * or buffer queues to be in operation at once to overrun
+                * GOLDFISH_SYNC_MAX_CMDS.
+                */
+               sync_state->to_do_end = 0;
+
+       } spin_unlock_irqrestore(&sync_state->lock, irq_flags);
+
+       for (i = 0; i < todo_end; i++) {
+               DPRINT("todo index: %u", i);
+
+               todo = &to_run[i];
+
+               cmd = todo->cmd;
+
+               handle = (uint64_t)todo->handle;
+               time_arg = todo->time_arg;
+               hostcmd_handle = (uint64_t)todo->hostcmd_handle;
+
+               DTRACE();
+
+               timeline = (struct goldfish_sync_timeline_obj *)(uintptr_t)handle;
+
+               switch (cmd) {
+               case CMD_SYNC_READY:
+                       break;
+               case CMD_CREATE_SYNC_TIMELINE:
+                       DPRINT("exec CMD_CREATE_SYNC_TIMELINE: "
+                                       "handle=0x%llx time_arg=%d",
+                                       handle, time_arg);
+                       timeline = goldfish_sync_timeline_create();
+                       timeline_ptr = (uintptr_t)timeline;
+                       goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_TIMELINE,
+                                                                               timeline_ptr,
+                                                                               0,
+                                                                               hostcmd_handle);
+                       DPRINT("sync timeline created: %p", timeline);
+                       break;
+               case CMD_CREATE_SYNC_FENCE:
+                       DPRINT("exec CMD_CREATE_SYNC_FENCE: "
+                                       "handle=0x%llx time_arg=%d",
+                                       handle, time_arg);
+                       sync_fence_fd = goldfish_sync_fence_create(timeline, time_arg);
+                       goldfish_sync_hostcmd_reply(sync_state, CMD_CREATE_SYNC_FENCE,
+                                                                               sync_fence_fd,
+                                                                               0,
+                                                                               hostcmd_handle);
+                       break;
+               case CMD_SYNC_TIMELINE_INC:
+                       DPRINT("exec CMD_SYNC_TIMELINE_INC: "
+                                       "handle=0x%llx time_arg=%d",
+                                       handle, time_arg);
+                       goldfish_sync_timeline_inc(timeline, time_arg);
+                       break;
+               case CMD_DESTROY_SYNC_TIMELINE:
+                       DPRINT("exec CMD_DESTROY_SYNC_TIMELINE: "
+                                       "handle=0x%llx time_arg=%d",
+                                       handle, time_arg);
+                       goldfish_sync_timeline_destroy(timeline);
+                       break;
+               }
+               DPRINT("Done executing sync command");
+       }
+       mutex_unlock(&sync_state->mutex_lock);
+}
+
+/* Guest-side interface: file operations */
+
+/* Goldfish sync context and ioctl info.
+ *
+ * When a sync context is created by open()-ing the goldfish sync device, we
+ * create a sync context (|goldfish_sync_context|).
+ *
+ * Currently, the only data we need to track is the sync timeline itself
+ * along with the current time, both of which are packed up in the
+ * |goldfish_sync_timeline_obj| field. We use a |goldfish_sync_context|
+ * as the filp->private_data.
+ *
+ * Next, when a sync context user requests that work be queued and a fence
+ * fd provided, we use the |goldfish_sync_ioctl_info| struct, which holds
+ * information about which host handles to touch for this particular
+ * queue-work operation. We need to know about the host-side sync thread
+ * and the particular host-side GLsync object. We also possibly write out
+ * a file descriptor.
+ */
+struct goldfish_sync_context {
+       struct goldfish_sync_timeline_obj *timeline;
+};
+
+struct goldfish_sync_ioctl_info {
+       uint64_t host_glsync_handle_in;
+       uint64_t host_syncthread_handle_in;
+       int fence_fd_out;
+};
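+
+/* Illustrative only (not part of the driver): a guest userspace flow for
+ * GOLDFISH_SYNC_IOC_QUEUE_WORK, assuming the ioctl number and the struct
+ * above are exported through the matching UAPI header and the misc device
+ * appears as /dev/goldfish_sync:
+ *
+ *     struct goldfish_sync_ioctl_info info = {
+ *             .host_glsync_handle_in = glsync_handle,
+ *             .host_syncthread_handle_in = syncthread_handle,
+ *     };
+ *     int fd = open("/dev/goldfish_sync", O_RDWR);
+ *
+ *     if (fd >= 0 && ioctl(fd, GOLDFISH_SYNC_IOC_QUEUE_WORK, &info) == 0)
+ *             poll_on_fence(info.fence_fd_out);
+ *
+ * |glsync_handle| and |syncthread_handle| come from the host-side GL stack;
+ * poll_on_fence() is a placeholder for however the caller waits on the
+ * returned fence fd (e.g. poll() or the sync file API).
+ */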
+
+static int goldfish_sync_open(struct inode *inode, struct file *file)
+{
+
+       struct goldfish_sync_context *sync_context;
+
+       DTRACE();
+
+       mutex_lock(&global_sync_state->mutex_lock);
+
+       sync_context = kzalloc(sizeof(struct goldfish_sync_context), GFP_KERNEL);
+
+       if (sync_context == NULL) {
+               ERR("Creation of goldfish sync context failed!");
+               mutex_unlock(&global_sync_state->mutex_lock);
+               return -ENOMEM;
+       }
+
+       sync_context->timeline = NULL;
+
+       file->private_data = sync_context;
+
+       DPRINT("successfully created a sync context @0x%p", sync_context);
+
+       mutex_unlock(&global_sync_state->mutex_lock);
+
+       return 0;
+}
+
+static int goldfish_sync_release(struct inode *inode, struct file *file)
+{
+
+       struct goldfish_sync_context *sync_context;
+
+       DTRACE();
+
+       mutex_lock(&global_sync_state->mutex_lock);
+
+       sync_context = file->private_data;
+
+       if (sync_context->timeline)
+               goldfish_sync_timeline_destroy(sync_context->timeline);
+
+       sync_context->timeline = NULL;
+
+       kfree(sync_context);
+
+       mutex_unlock(&global_sync_state->mutex_lock);
+
+       return 0;
+}
+
+/* |goldfish_sync_ioctl| is the guest-facing interface of goldfish sync
+ * and is used in conjunction with eglCreateSyncKHR to queue up the
+ * actual work of waiting for the EGL sync command to complete,
+ * possibly returning a fence fd to the guest.
+ */
+static long goldfish_sync_ioctl(struct file *file,
+                                                               unsigned int cmd,
+                                                               unsigned long arg)
+{
+       struct goldfish_sync_context *sync_context_data;
+       struct goldfish_sync_timeline_obj *timeline;
+       int fd_out;
+       struct goldfish_sync_ioctl_info ioctl_data;
+
+       DTRACE();
+
+       sync_context_data = file->private_data;
+       fd_out = -1;
+
+       switch (cmd) {
+       case GOLDFISH_SYNC_IOC_QUEUE_WORK:
+
+               DPRINT("exec GOLDFISH_SYNC_IOC_QUEUE_WORK");
+
+               mutex_lock(&global_sync_state->mutex_lock);
+
+               if (copy_from_user(&ioctl_data,
+                                               (void __user *)arg,
+                                               sizeof(ioctl_data))) {
+                       ERR("Failed to copy memory for ioctl_data from user.");
+                       mutex_unlock(&global_sync_state->mutex_lock);
+                       return -EFAULT;
+               }
+
+               if (ioctl_data.host_syncthread_handle_in == 0) {
+                       DPRINT("Error: zero host syncthread handle!!!");
+                       mutex_unlock(&global_sync_state->mutex_lock);
+                       return -EFAULT;
+               }
+
+               if (!sync_context_data->timeline) {
+                       DPRINT("no timeline yet, create one.");
+                       sync_context_data->timeline = goldfish_sync_timeline_create();
+                       DPRINT("timeline: 0x%p", sync_context_data->timeline);
+               }
+
+               timeline = sync_context_data->timeline;
+               fd_out = goldfish_sync_fence_create(timeline,
+                                                                                       timeline->current_time + 1);
+               DPRINT("Created fence with fd %d and current time %u (timeline: 0x%p)",
+                          fd_out,
+                          sync_context_data->timeline->current_time + 1,
+                          sync_context_data->timeline);
+
+               ioctl_data.fence_fd_out = fd_out;
+
+               if (copy_to_user((void __user *)arg,
+                                               &ioctl_data,
+                                               sizeof(ioctl_data))) {
+                       DPRINT("Error, could not copy to user!!!");
+
+                       sys_close(fd_out);
+                       /* We won't be doing an increment, kref_put immediately. */
+                       kref_put(&timeline->kref, delete_timeline_obj);
+                       mutex_unlock(&global_sync_state->mutex_lock);
+                       return -EFAULT;
+               }
+
+               /* We are now about to trigger a host-side wait;
+                * accumulate on |pending_waits|. */
+               goldfish_sync_send_guestcmd(global_sync_state,
+                               CMD_TRIGGER_HOST_WAIT,
+                               ioctl_data.host_glsync_handle_in,
+                               ioctl_data.host_syncthread_handle_in,
+                               (uint64_t)(uintptr_t)(sync_context_data->timeline));
+
+               mutex_unlock(&global_sync_state->mutex_lock);
+               return 0;
+       default:
+               return -ENOTTY;
+       }
+}
+
+static const struct file_operations goldfish_sync_fops = {
+       .owner = THIS_MODULE,
+       .open = goldfish_sync_open,
+       .release = goldfish_sync_release,
+       .unlocked_ioctl = goldfish_sync_ioctl,
+       .compat_ioctl = goldfish_sync_ioctl,
+};
+
+static struct miscdevice goldfish_sync_device = {
+       .name = "goldfish_sync",
+       .fops = &goldfish_sync_fops,
+};
+
+
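+/* Program the guest-physical address of a batch command buffer into the
+ * device's low/high address registers, then read both halves back to check
+ * that the device latched the same 64-bit address.
+ */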
+static bool setup_verify_batch_cmd_addr(struct goldfish_sync_state *sync_state,
+                                                                               void *batch_addr,
+                                                                               uint32_t addr_offset,
+                                                                               uint32_t addr_offset_high)
+{
+       uint64_t batch_addr_phys;
+       uint32_t batch_addr_phys_test_lo;
+       uint32_t batch_addr_phys_test_hi;
+
+       if (!batch_addr) {
+               ERR("Could not use batch command address!");
+               return false;
+       }
+
+       batch_addr_phys = virt_to_phys(batch_addr);
+       writel((uint32_t)(batch_addr_phys),
+                       sync_state->reg_base + addr_offset);
+       writel((uint32_t)(batch_addr_phys >> 32),
+                       sync_state->reg_base + addr_offset_high);
+
+       batch_addr_phys_test_lo =
+               readl(sync_state->reg_base + addr_offset);
+       batch_addr_phys_test_hi =
+               readl(sync_state->reg_base + addr_offset_high);
+
+       if (virt_to_phys(batch_addr) !=
+                       (((uint64_t)batch_addr_phys_test_hi << 32) |
+                        batch_addr_phys_test_lo)) {
+               ERR("Invalid batch command address!");
+               return false;
+       }
+
+       return true;
+}
+
+int goldfish_sync_probe(struct platform_device *pdev)
+{
+       struct resource *ioresource;
+       struct goldfish_sync_state *sync_state = global_sync_state;
+       int status;
+
+       DTRACE();
+
+       sync_state->to_do_end = 0;
+
+       spin_lock_init(&sync_state->lock);
+       mutex_init(&sync_state->mutex_lock);
+
+       platform_set_drvdata(pdev, sync_state);
+
+       ioresource = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       if (ioresource == NULL) {
+               ERR("platform_get_resource failed");
+               return -ENODEV;
+       }
+
+       sync_state->reg_base =
+               devm_ioremap(&pdev->dev, ioresource->start, PAGE_SIZE);
+       if (sync_state->reg_base == NULL) {
+               ERR("Could not ioremap");
+               return -ENOMEM;
+       }
+
+       sync_state->irq = platform_get_irq(pdev, 0);
+       if (sync_state->irq < 0) {
+               ERR("Could not platform_get_irq");
+               return -ENODEV;
+       }
+
+       status = devm_request_irq(&pdev->dev,
+                                                       sync_state->irq,
+                                                       goldfish_sync_interrupt,
+                                                       IRQF_SHARED,
+                                                       pdev->name,
+                                                       sync_state);
+       if (status) {
+               ERR("request_irq failed");
+               return -ENODEV;
+       }
+
+       INIT_WORK(&sync_state->work_item,
+                         goldfish_sync_work_item_fn);
+
+       misc_register(&goldfish_sync_device);
+
+       /* Obtain addresses for batch send/recv of commands. */
+       {
+               struct goldfish_sync_hostcmd *batch_addr_hostcmd;
+               struct goldfish_sync_guestcmd *batch_addr_guestcmd;
+
+               batch_addr_hostcmd =
+                       devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_hostcmd),
+                               GFP_KERNEL);
+               batch_addr_guestcmd =
+                       devm_kzalloc(&pdev->dev, sizeof(struct goldfish_sync_guestcmd),
+                               GFP_KERNEL);
+
+               if (!setup_verify_batch_cmd_addr(sync_state,
+                                       batch_addr_hostcmd,
+                                       SYNC_REG_BATCH_COMMAND_ADDR,
+                                       SYNC_REG_BATCH_COMMAND_ADDR_HIGH)) {
+                       ERR("goldfish_sync: Could not setup batch command address");
+                       return -ENODEV;
+               }
+
+               if (!setup_verify_batch_cmd_addr(sync_state,
+                                       batch_addr_guestcmd,
+                                       SYNC_REG_BATCH_GUESTCOMMAND_ADDR,
+                                       SYNC_REG_BATCH_GUESTCOMMAND_ADDR_HIGH)) {
+                       ERR("goldfish_sync: Could not setup batch guest command address");
+                       return -ENODEV;
+               }
+
+               sync_state->batch_hostcmd = batch_addr_hostcmd;
+               sync_state->batch_guestcmd = batch_addr_guestcmd;
+       }
+
+       INFO("goldfish_sync: Initialized goldfish sync device");
+
+       writel(0, sync_state->reg_base + SYNC_REG_INIT);
+
+       return 0;
+}
+
+static int goldfish_sync_remove(struct platform_device *pdev)
+{
+       struct goldfish_sync_state *sync_state = global_sync_state;
+
+       DTRACE();
+
+       misc_deregister(&goldfish_sync_device);
+       memset(sync_state, 0, sizeof(struct goldfish_sync_state));
+       return 0;
+}
+
+static const struct of_device_id goldfish_sync_of_match[] = {
+       { .compatible = "google,goldfish-sync", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_sync_of_match);
+
+static const struct acpi_device_id goldfish_sync_acpi_match[] = {
+       { "GFSH0006", 0 },
+       { },
+};
+
+MODULE_DEVICE_TABLE(acpi, goldfish_sync_acpi_match);
+
+static struct platform_driver goldfish_sync = {
+       .probe = goldfish_sync_probe,
+       .remove = goldfish_sync_remove,
+       .driver = {
+               .name = "goldfish_sync",
+               .of_match_table = goldfish_sync_of_match,
+               .acpi_match_table = ACPI_PTR(goldfish_sync_acpi_match),
+       }
+};
+
+module_platform_driver(goldfish_sync);
+
+MODULE_AUTHOR("Google, Inc.");
+MODULE_DESCRIPTION("Android QEMU Sync Driver");
+MODULE_LICENSE("GPL");
+MODULE_VERSION("1.0");
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.c b/drivers/staging/goldfish/goldfish_sync_timeline_fence.c
new file mode 100644 (file)
index 0000000..a5bc2de
--- /dev/null
@@ -0,0 +1,254 @@
+#include <linux/slab.h>
+#include <linux/fs.h>
+#include <linux/syscalls.h>
+#include <linux/sync_file.h>
+#include <linux/dma-fence.h>
+
+#include "goldfish_sync_timeline_fence.h"
+
+/*
+ * Timeline-based sync for Goldfish Sync
+ * Based on "Sync File validation framework"
+ * (drivers/dma-buf/sw_sync.c)
+ *
+ * Copyright (C) 2017 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/**
+ * struct goldfish_sync_timeline - sync object
+ * @kref:              reference count of this goldfish_sync_timeline
+ * @name:              name of the goldfish_sync_timeline. Useful for debugging
+ * @child_list_head:   list of children sync_pts for this goldfish_sync_timeline
+ * @child_list_lock:   lock protecting @child_list_head and fence.status
+ * @active_list_head:  list of active (unsignaled/errored) sync_pts
+ */
+struct goldfish_sync_timeline {
+       struct kref             kref;
+       char                    name[32];
+
+       /* protected by child_list_lock */
+       u64                     context;
+       int                     value;
+
+       struct list_head        child_list_head;
+       spinlock_t              child_list_lock;
+
+       struct list_head        active_list_head;
+};
+
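+/* Fences created by goldfish_sync_pt_create_internal() use the timeline's
+ * child_list_lock as their dma_fence lock, so container_of() on fence->lock
+ * recovers the owning goldfish_sync_timeline.
+ */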
+static inline struct goldfish_sync_timeline *goldfish_dma_fence_parent(struct dma_fence *fence)
+{
+       return container_of(fence->lock, struct goldfish_sync_timeline,
+                               child_list_lock);
+}
+
+static const struct dma_fence_ops goldfish_sync_timeline_fence_ops;
+
+static inline struct sync_pt *goldfish_sync_fence_to_sync_pt(struct dma_fence *fence)
+{
+       if (fence->ops != &goldfish_sync_timeline_fence_ops)
+               return NULL;
+       return container_of(fence, struct sync_pt, base);
+}
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name:      sync_timeline name
+ *
+ * Creates a new sync_timeline. Returns the sync_timeline object or NULL in
+ * case of error.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name)
+{
+       struct goldfish_sync_timeline *obj;
+
+       obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+       if (!obj)
+               return NULL;
+
+       kref_init(&obj->kref);
+       obj->context = dma_fence_context_alloc(1);
+       strlcpy(obj->name, name, sizeof(obj->name));
+
+       INIT_LIST_HEAD(&obj->child_list_head);
+       INIT_LIST_HEAD(&obj->active_list_head);
+       spin_lock_init(&obj->child_list_lock);
+
+       return obj;
+}
+
+static void goldfish_sync_timeline_free_internal(struct kref *kref)
+{
+       struct goldfish_sync_timeline *obj =
+               container_of(kref, struct goldfish_sync_timeline, kref);
+
+       kfree(obj);
+}
+
+static void goldfish_sync_timeline_get_internal(
+                                       struct goldfish_sync_timeline *obj)
+{
+       kref_get(&obj->kref);
+}
+
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj)
+{
+       kref_put(&obj->kref, goldfish_sync_timeline_free_internal);
+}
+
+/**
+ * goldfish_sync_timeline_signal_internal() -
+ * signal a status change on a goldfish_sync_timeline
+ * @obj:       sync_timeline to signal
+ * @inc:       num to increment on timeline->value
+ *
+ * A sync implementation should call this any time one of its fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+                                                                                       unsigned int inc)
+{
+       unsigned long flags;
+       struct sync_pt *pt, *next;
+
+       spin_lock_irqsave(&obj->child_list_lock, flags);
+
+       obj->value += inc;
+
+       list_for_each_entry_safe(pt, next, &obj->active_list_head,
+                                active_list) {
+               if (dma_fence_is_signaled_locked(&pt->base))
+                       list_del_init(&pt->active_list);
+       }
+
+       spin_unlock_irqrestore(&obj->child_list_lock, flags);
+}
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @obj:       parent sync_timeline of the new sync_pt
+ * @size:      size to allocate for this pt
+ * @value:     value of the fence
+ *
+ * Creates a new sync_pt as a child of @obj.  @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_pt struct. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt *goldfish_sync_pt_create_internal(
+                                       struct goldfish_sync_timeline *obj, int size,
+                                       unsigned int value)
+{
+       unsigned long flags;
+       struct sync_pt *pt;
+
+       if (size < sizeof(*pt))
+               return NULL;
+
+       pt = kzalloc(size, GFP_KERNEL);
+       if (!pt)
+               return NULL;
+
+       spin_lock_irqsave(&obj->child_list_lock, flags);
+       goldfish_sync_timeline_get_internal(obj);
+       dma_fence_init(&pt->base, &goldfish_sync_timeline_fence_ops, &obj->child_list_lock,
+                  obj->context, value);
+       list_add_tail(&pt->child_list, &obj->child_list_head);
+       INIT_LIST_HEAD(&pt->active_list);
+       spin_unlock_irqrestore(&obj->child_list_lock, flags);
+       return pt;
+}
+
+static const char *goldfish_sync_timeline_fence_get_driver_name(
+                                               struct dma_fence *fence)
+{
+       return "sw_sync";
+}
+
+static const char *goldfish_sync_timeline_fence_get_timeline_name(
+                                               struct dma_fence *fence)
+{
+       struct goldfish_sync_timeline *parent = goldfish_dma_fence_parent(fence);
+
+       return parent->name;
+}
+
+static void goldfish_sync_timeline_fence_release(struct dma_fence *fence)
+{
+       struct sync_pt *pt = goldfish_sync_fence_to_sync_pt(fence);
+       struct goldfish_sync_timeline *parent = goldfish_dma_fence_parent(fence);
+       unsigned long flags;
+
+       spin_lock_irqsave(fence->lock, flags);
+       list_del(&pt->child_list);
+       if (!list_empty(&pt->active_list))
+               list_del(&pt->active_list);
+       spin_unlock_irqrestore(fence->lock, flags);
+
+       goldfish_sync_timeline_put_internal(parent);
+       dma_fence_free(fence);
+}
+
+static bool goldfish_sync_timeline_fence_signaled(struct dma_fence *fence)
+{
+       struct goldfish_sync_timeline *parent = goldfish_dma_fence_parent(fence);
+
+       return fence->seqno <= parent->value;
+}
+
+static bool goldfish_sync_timeline_fence_enable_signaling(struct dma_fence *fence)
+{
+       struct sync_pt *pt = goldfish_sync_fence_to_sync_pt(fence);
+       struct goldfish_sync_timeline *parent = goldfish_dma_fence_parent(fence);
+
+       if (goldfish_sync_timeline_fence_signaled(fence))
+               return false;
+
+       list_add_tail(&pt->active_list, &parent->active_list_head);
+       return true;
+}
+
+static void goldfish_sync_timeline_fence_disable_signaling(struct dma_fence *fence)
+{
+       struct sync_pt *pt = container_of(fence, struct sync_pt, base);
+
+       list_del_init(&pt->active_list);
+}
+
+static void goldfish_sync_timeline_fence_value_str(struct dma_fence *fence,
+                                       char *str, int size)
+{
+       snprintf(str, size, "%d", fence->seqno);
+}
+
+static void goldfish_sync_timeline_fence_timeline_value_str(
+                               struct dma_fence *fence,
+                               char *str, int size)
+{
+       struct goldfish_sync_timeline *parent = goldfish_dma_fence_parent(fence);
+
+       snprintf(str, size, "%d", parent->value);
+}
+
+static const struct dma_fence_ops goldfish_sync_timeline_fence_ops = {
+       .get_driver_name = goldfish_sync_timeline_fence_get_driver_name,
+       .get_timeline_name = goldfish_sync_timeline_fence_get_timeline_name,
+       .enable_signaling = goldfish_sync_timeline_fence_enable_signaling,
+       .disable_signaling = goldfish_sync_timeline_fence_disable_signaling,
+       .signaled = goldfish_sync_timeline_fence_signaled,
+       .wait = dma_fence_default_wait,
+       .release = goldfish_sync_timeline_fence_release,
+       .fence_value_str = goldfish_sync_timeline_fence_value_str,
+       .timeline_value_str = goldfish_sync_timeline_fence_timeline_value_str,
+};
diff --git a/drivers/staging/goldfish/goldfish_sync_timeline_fence.h b/drivers/staging/goldfish/goldfish_sync_timeline_fence.h
new file mode 100644 (file)
index 0000000..638c6fb
--- /dev/null
@@ -0,0 +1,58 @@
+#include <linux/sync_file.h>
+#include <linux/dma-fence.h>
+
+/**
+ * struct sync_pt - sync_pt object
+ * @base: base dma_fence object
+ * @child_list: sync timeline child's list
+ * @active_list: sync timeline active child's list
+ */
+struct sync_pt {
+       struct dma_fence base;
+       struct list_head child_list;
+       struct list_head active_list;
+};
+
+/**
+ * goldfish_sync_timeline_create_internal() - creates a sync object
+ * @name:      goldfish_sync_timeline name
+ *
+ * Creates a new goldfish_sync_timeline.
+ * Returns the goldfish_sync_timeline object or NULL in case of error.
+ */
+struct goldfish_sync_timeline
+*goldfish_sync_timeline_create_internal(const char *name);
+
+/**
+ * goldfish_sync_pt_create_internal() - creates a sync pt
+ * @obj:       parent goldfish_sync_timeline of the new sync_pt
+ * @size:      size to allocate for this pt
+ * @value:     value of the fence
+ *
+ * Creates a new sync_pt as a child of @obj.  @size bytes will be
+ * allocated allowing for implementation specific data to be kept after
+ * the generic sync_pt struct. Returns the sync_pt object or
+ * NULL in case of error.
+ */
+struct sync_pt
+*goldfish_sync_pt_create_internal(struct goldfish_sync_timeline *obj,
+                                                                       int size, unsigned int value);
+
+/**
+ * goldfish_sync_timeline_signal_internal() -
+ * signal a status change on a sync_timeline
+ * @obj:       goldfish_sync_timeline to signal
+ * @inc:       num to increment on timeline->value
+ *
+ * A sync implementation should call this any time one of its fences
+ * has signaled or has an error condition.
+ */
+void goldfish_sync_timeline_signal_internal(struct goldfish_sync_timeline *obj,
+                                                                                       unsigned int inc);
+
+/**
+ * goldfish_sync_timeline_put_internal() - dec refcount of a sync_timeline
+ * and clean up memory if it was the last ref.
+ * @obj:       goldfish_sync_timeline to decref
+ */
+void goldfish_sync_timeline_put_internal(struct goldfish_sync_timeline *obj);
index 6d8906d6547638d91f9aeb804e08c5eaba7c0c2c..2d855a96cdd9886c5f3abbcfb4e1d8a5c1ea8666 100644 (file)
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/io.h>
+#include <linux/of_device.h>
 
 #include "thermal_core.h"
 
-#define TEMP0_TH                       (0x4)
-#define TEMP0_RST_TH                   (0x8)
-#define TEMP0_CFG                      (0xC)
-#define TEMP0_EN                       (0x10)
-#define TEMP0_INT_EN                   (0x14)
-#define TEMP0_INT_CLR                  (0x18)
-#define TEMP0_RST_MSK                  (0x1C)
-#define TEMP0_VALUE                    (0x28)
-
-#define HISI_TEMP_BASE                 (-60000)
-#define HISI_TEMP_RESET                        (100000)
-#define HISI_TEMP_STEP                 (784)
-
-#define HISI_MAX_SENSORS               4
+#define HI6220_TEMP0_LAG                       (0x0)
+#define HI6220_TEMP0_TH                                (0x4)
+#define HI6220_TEMP0_RST_TH                    (0x8)
+#define HI6220_TEMP0_CFG                       (0xC)
+#define HI6220_TEMP0_CFG_SS_MSK                        (0xF000)
+#define HI6220_TEMP0_CFG_HDAK_MSK              (0x30)
+#define HI6220_TEMP0_EN                                (0x10)
+#define HI6220_TEMP0_INT_EN                    (0x14)
+#define HI6220_TEMP0_INT_CLR                   (0x18)
+#define HI6220_TEMP0_RST_MSK                   (0x1C)
+#define HI6220_TEMP0_VALUE                     (0x28)
+
+#define HI3660_OFFSET(chan)            ((chan) * 0x40)
+#define HI3660_TEMP(chan)              (HI3660_OFFSET(chan) + 0x1C)
+#define HI3660_TH(chan)                        (HI3660_OFFSET(chan) + 0x20)
+#define HI3660_LAG(chan)               (HI3660_OFFSET(chan) + 0x28)
+#define HI3660_INT_EN(chan)            (HI3660_OFFSET(chan) + 0x2C)
+#define HI3660_INT_CLR(chan)           (HI3660_OFFSET(chan) + 0x30)
+
+#define HI6220_TEMP_BASE                       (-60000)
+#define HI6220_TEMP_RESET                      (100000)
+#define HI6220_TEMP_STEP                       (785)
+#define HI6220_TEMP_LAG                                (3500)
+
+#define HI3660_TEMP_BASE               (-63780)
+#define HI3660_TEMP_STEP               (205)
+#define HI3660_TEMP_LAG                        (4000)
+
+#define HI6220_DEFAULT_SENSOR          2
+#define HI3660_DEFAULT_SENSOR          1
 
 struct hisi_thermal_sensor {
-       struct hisi_thermal_data *thermal;
        struct thermal_zone_device *tzd;
-
-       long sensor_temp;
        uint32_t id;
        uint32_t thres_temp;
 };
 
 struct hisi_thermal_data {
-       struct mutex thermal_lock;    /* protects register data */
+       int (*get_temp)(struct hisi_thermal_data *data);
+       int (*enable_sensor)(struct hisi_thermal_data *data);
+       int (*disable_sensor)(struct hisi_thermal_data *data);
+       int (*irq_handler)(struct hisi_thermal_data *data);
        struct platform_device *pdev;
        struct clk *clk;
-       struct hisi_thermal_sensor sensors[HISI_MAX_SENSORS];
-
-       int irq, irq_bind_sensor;
-       bool irq_enabled;
-
+       struct hisi_thermal_sensor sensor;
        void __iomem *regs;
+       int irq;
 };
 
 /*
  * The temperature computation on the tsensor is as follow:
  *     Unit: millidegree Celsius
- *     Step: 255/200 (0.7843)
+ *     Step: 200/255 (0.7843)
  *     Temperature base: -60°C
  *
- * The register is programmed in temperature steps, every step is 784
+ * The register is programmed in temperature steps, every step is 785
  * millidegree and begins at -60 000 m°C
  *
  * The temperature from the steps:
  *
- *     Temp = TempBase + (steps x 784)
+ *     Temp = TempBase + (steps x 785)
  *
  * and the steps from the temperature:
  *
- *     steps = (Temp - TempBase) / 784
+ *     steps = (Temp - TempBase) / 785
  *
  */
-static inline int hisi_thermal_step_to_temp(int step)
+static inline int hi6220_thermal_step_to_temp(int step)
 {
-       return HISI_TEMP_BASE + (step * HISI_TEMP_STEP);
+       return HI6220_TEMP_BASE + (step * HI6220_TEMP_STEP);
 }
 
-static inline long hisi_thermal_temp_to_step(long temp)
+static inline int hi6220_thermal_temp_to_step(int temp)
 {
-       return (temp - HISI_TEMP_BASE) / HISI_TEMP_STEP;
+       return DIV_ROUND_UP(temp - HI6220_TEMP_BASE, HI6220_TEMP_STEP);
 }
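+
+/*
+ * Worked example (illustrative): a 65000 m°C threshold maps to
+ * DIV_ROUND_UP(65000 - (-60000), 785) = 160 steps, and converting back
+ * gives -60000 + 160 * 785 = 65600 m°C, i.e. the programmed threshold is
+ * rounded up to the next representable step.
+ */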
 
-static inline long hisi_thermal_round_temp(int temp)
+/*
+ * for Hi3660,
+ *     Step: 189/922 (0.205)
+ *     Temperature base: -63.780°C
+ *
+ * The register is programmed in temperature steps, every step is 205
+ * millidegree and begins at -63 780 m°C
+ */
+static inline int hi3660_thermal_step_to_temp(int step)
 {
-       return hisi_thermal_step_to_temp(
-               hisi_thermal_temp_to_step(temp));
+       return HI3660_TEMP_BASE + step * HI3660_TEMP_STEP;
 }
 
-static long hisi_thermal_get_sensor_temp(struct hisi_thermal_data *data,
-                                        struct hisi_thermal_sensor *sensor)
+static inline int hi3660_thermal_temp_to_step(int temp)
 {
-       long val;
+       return DIV_ROUND_UP(temp - HI3660_TEMP_BASE, HI3660_TEMP_STEP);
+}
 
-       mutex_lock(&data->thermal_lock);
+/*
+ * The lag register contains 5 bits encoding the temperature in steps.
+ *
+ * Each time the temperature crosses the threshold boundary, an
+ * interrupt is raised. It could be when the temperature is going
+ * above the threshold or below. However, if the temperature is
+ * fluctuating around this value due to the load, we can receive
+ * several interrupts, which may not be desired.
+ *
+ * We can set up a lag value representing the delta between the
+ * threshold and the temperature at which an interrupt fires again
+ * when the temperature is decreasing.
+ *
+ * For instance: if the lag is 5°C and the threshold is 65°C, an
+ * interrupt is raised when the temperature reaches 65°C and another
+ * one is raised when it decreases to 60°C (65°C - 5°C).
+ *
+ * A very short lag can lead to an interrupt storm, while a long lag
+ * increases the latency to react to temperature changes.  In our
+ * case, that is not really a problem as we are polling the
+ * temperature.
+ *
+ * [0:4] : lag register
+ *
+ * The temperature is coded in steps, cf. HI6220_TEMP_STEP.
+ *
+ * Min : 0x00 :  0.0 °C
+ * Max : 0x1F : 24.3 °C
+ *
+ * The 'value' parameter is in milliCelsius.
+ */
+static inline void hi6220_thermal_set_lag(void __iomem *addr, int value)
+{
+       writel(DIV_ROUND_UP(value, HI6220_TEMP_STEP) & 0x1F,
+                       addr + HI6220_TEMP0_LAG);
+}
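+
+/*
+ * For example, with the default HI6220_TEMP_LAG of 3500 m°C,
+ * hi6220_thermal_set_lag() programs DIV_ROUND_UP(3500, 785) = 5 steps,
+ * i.e. an effective hysteresis of about 3925 m°C below the alarm threshold.
+ */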
 
-       /* disable interrupt */
-       writel(0x0, data->regs + TEMP0_INT_EN);
-       writel(0x1, data->regs + TEMP0_INT_CLR);
+static inline void hi6220_thermal_alarm_clear(void __iomem *addr, int value)
+{
+       writel(value, addr + HI6220_TEMP0_INT_CLR);
+}
 
-       /* disable module firstly */
-       writel(0x0, data->regs + TEMP0_EN);
+static inline void hi6220_thermal_alarm_enable(void __iomem *addr, int value)
+{
+       writel(value, addr + HI6220_TEMP0_INT_EN);
+}
 
-       /* select sensor id */
-       writel((sensor->id << 12), data->regs + TEMP0_CFG);
+static inline void hi6220_thermal_alarm_set(void __iomem *addr, int temp)
+{
+       writel(hi6220_thermal_temp_to_step(temp) | 0x0FFFFFF00,
+              addr + HI6220_TEMP0_TH);
+}
 
-       /* enable module */
-       writel(0x1, data->regs + TEMP0_EN);
+static inline void hi6220_thermal_reset_set(void __iomem *addr, int temp)
+{
+       writel(hi6220_thermal_temp_to_step(temp), addr + HI6220_TEMP0_RST_TH);
+}
+
+static inline void hi6220_thermal_reset_enable(void __iomem *addr, int value)
+{
+       writel(value, addr + HI6220_TEMP0_RST_MSK);
+}
+
+static inline void hi6220_thermal_enable(void __iomem *addr, int value)
+{
+       writel(value, addr + HI6220_TEMP0_EN);
+}
+
+static inline int hi6220_thermal_get_temperature(void __iomem *addr)
+{
+       return hi6220_thermal_step_to_temp(readl(addr + HI6220_TEMP0_VALUE));
+}
+
+/*
+ * [0:6] lag register
+ *
+ * The temperature is coded in steps, cf. HI3660_TEMP_STEP.
+ *
+ * Min : 0x00 :  0.0 °C
+ * Max : 0x7F : 26.0 °C
+ *
+ */
+static inline void hi3660_thermal_set_lag(void __iomem *addr,
+                                         int id, int value)
+{
+       writel(DIV_ROUND_UP(value, HI3660_TEMP_STEP) & 0x7F,
+                       addr + HI3660_LAG(id));
+}
 
-       usleep_range(3000, 5000);
+static inline void hi3660_thermal_alarm_clear(void __iomem *addr,
+                                             int id, int value)
+{
+       writel(value, addr + HI3660_INT_CLR(id));
+}
 
-       val = readl(data->regs + TEMP0_VALUE);
-       val = hisi_thermal_step_to_temp(val);
+static inline void hi3660_thermal_alarm_enable(void __iomem *addr,
+                                              int id, int value)
+{
+       writel(value, addr + HI3660_INT_EN(id));
+}
 
-       mutex_unlock(&data->thermal_lock);
+static inline void hi3660_thermal_alarm_set(void __iomem *addr,
+                                           int id, int value)
+{
+       writel(value, addr + HI3660_TH(id));
+}
 
-       return val;
+static inline int hi3660_thermal_get_temperature(void __iomem *addr, int id)
+{
+       return hi3660_thermal_step_to_temp(readl(addr + HI3660_TEMP(id)));
 }
 
-static void hisi_thermal_enable_bind_irq_sensor
-                       (struct hisi_thermal_data *data)
+/*
+ * Temperature configuration register - Sensor selection
+ *
+ * Bits [19:12]
+ *
+ * 0x0: local sensor (default)
+ * 0x1: remote sensor 1 (ACPU cluster 1)
+ * 0x2: remote sensor 2 (ACPU cluster 0)
+ * 0x3: remote sensor 3 (G3D)
+ */
+static inline void hi6220_thermal_sensor_select(void __iomem *addr, int sensor)
 {
-       struct hisi_thermal_sensor *sensor;
+       writel((readl(addr + HI6220_TEMP0_CFG) & ~HI6220_TEMP0_CFG_SS_MSK) |
+              (sensor << 12), addr + HI6220_TEMP0_CFG);
+}
 
-       mutex_lock(&data->thermal_lock);
+/*
+ * Temperature configuration register - Hdak conversion polling interval
+ *
+ * Bits [5:4]
+ *
+ * 0x0 :   0.768 ms
+ * 0x1 :   6.144 ms
+ * 0x2 :  49.152 ms
+ * 0x3 : 393.216 ms
+ */
+static inline void hi6220_thermal_hdak_set(void __iomem *addr, int value)
+{
+       writel((readl(addr + HI6220_TEMP0_CFG) & ~HI6220_TEMP0_CFG_HDAK_MSK) |
+              (value << 4), addr + HI6220_TEMP0_CFG);
+}
 
-       sensor = &data->sensors[data->irq_bind_sensor];
+static int hi6220_thermal_irq_handler(struct hisi_thermal_data *data)
+{
+       hi6220_thermal_alarm_clear(data->regs, 1);
+       return 0;
+}
 
-       /* setting the hdak time */
-       writel(0x0, data->regs + TEMP0_CFG);
+static int hi3660_thermal_irq_handler(struct hisi_thermal_data *data)
+{
+       hi3660_thermal_alarm_clear(data->regs, data->sensor.id, 1);
+       return 0;
+}
+
+static int hi6220_thermal_get_temp(struct hisi_thermal_data *data)
+{
+       return hi6220_thermal_get_temperature(data->regs);
+}
+
+static int hi3660_thermal_get_temp(struct hisi_thermal_data *data)
+{
+       return hi3660_thermal_get_temperature(data->regs, data->sensor.id);
+}
+
+static int hi6220_thermal_disable_sensor(struct hisi_thermal_data *data)
+{
+       /* disable sensor module */
+       hi6220_thermal_enable(data->regs, 0);
+       hi6220_thermal_alarm_enable(data->regs, 0);
+       hi6220_thermal_reset_enable(data->regs, 0);
+
+       clk_disable_unprepare(data->clk);
+
+       return 0;
+}
+
+static int hi3660_thermal_disable_sensor(struct hisi_thermal_data *data)
+{
+       /* disable sensor module */
+       hi3660_thermal_alarm_enable(data->regs, data->sensor.id, 0);
+       return 0;
+}
+
+static int hi6220_thermal_enable_sensor(struct hisi_thermal_data *data)
+{
+       struct hisi_thermal_sensor *sensor = &data->sensor;
+       int ret;
+
+       /* enable clock for tsensor */
+       ret = clk_prepare_enable(data->clk);
+       if (ret)
+               return ret;
 
        /* disable module firstly */
-       writel(0x0, data->regs + TEMP0_RST_MSK);
-       writel(0x0, data->regs + TEMP0_EN);
+       hi6220_thermal_reset_enable(data->regs, 0);
+       hi6220_thermal_enable(data->regs, 0);
 
        /* select sensor id */
-       writel((sensor->id << 12), data->regs + TEMP0_CFG);
+       hi6220_thermal_sensor_select(data->regs, sensor->id);
+
+       /* setting the hdak time */
+       hi6220_thermal_hdak_set(data->regs, 0);
+
+       /* setting lag value between current temp and the threshold */
+       hi6220_thermal_set_lag(data->regs, HI6220_TEMP_LAG);
 
        /* enable for interrupt */
-       writel(hisi_thermal_temp_to_step(sensor->thres_temp) | 0x0FFFFFF00,
-              data->regs + TEMP0_TH);
+       hi6220_thermal_alarm_set(data->regs, sensor->thres_temp);
 
-       writel(hisi_thermal_temp_to_step(HISI_TEMP_RESET),
-              data->regs + TEMP0_RST_TH);
+       hi6220_thermal_reset_set(data->regs, HI6220_TEMP_RESET);
 
        /* enable module */
-       writel(0x1, data->regs + TEMP0_RST_MSK);
-       writel(0x1, data->regs + TEMP0_EN);
-
-       writel(0x0, data->regs + TEMP0_INT_CLR);
-       writel(0x1, data->regs + TEMP0_INT_EN);
+       hi6220_thermal_reset_enable(data->regs, 1);
+       hi6220_thermal_enable(data->regs, 1);
 
-       usleep_range(3000, 5000);
+       hi6220_thermal_alarm_clear(data->regs, 0);
+       hi6220_thermal_alarm_enable(data->regs, 1);
 
-       mutex_unlock(&data->thermal_lock);
+       return 0;
 }
 
-static void hisi_thermal_disable_sensor(struct hisi_thermal_data *data)
+static int hi3660_thermal_enable_sensor(struct hisi_thermal_data *data)
 {
-       mutex_lock(&data->thermal_lock);
+       unsigned int value;
+       struct hisi_thermal_sensor *sensor = &data->sensor;
 
-       /* disable sensor module */
-       writel(0x0, data->regs + TEMP0_INT_EN);
-       writel(0x0, data->regs + TEMP0_RST_MSK);
-       writel(0x0, data->regs + TEMP0_EN);
+       /* disable interrupt */
+       hi3660_thermal_alarm_enable(data->regs, sensor->id, 0);
 
-       mutex_unlock(&data->thermal_lock);
-}
+       /* setting lag value between current temp and the threshold */
+       hi3660_thermal_set_lag(data->regs, sensor->id, HI3660_TEMP_LAG);
 
-static int hisi_thermal_get_temp(void *_sensor, int *temp)
-{
-       struct hisi_thermal_sensor *sensor = _sensor;
-       struct hisi_thermal_data *data = sensor->thermal;
+       /* set interrupt threshold */
+       value = hi3660_thermal_temp_to_step(sensor->thres_temp);
+       hi3660_thermal_alarm_set(data->regs, sensor->id, value);
 
-       int sensor_id = -1, i;
-       long max_temp = 0;
+       /* enable interrupt */
+       hi3660_thermal_alarm_clear(data->regs, sensor->id, 1);
+       hi3660_thermal_alarm_enable(data->regs, sensor->id, 1);
 
-       *temp = hisi_thermal_get_sensor_temp(data, sensor);
+       return 0;
+}
 
-       sensor->sensor_temp = *temp;
+static int hi6220_thermal_probe(struct hisi_thermal_data *data)
+{
+       struct platform_device *pdev = data->pdev;
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+       int ret;
 
-       for (i = 0; i < HISI_MAX_SENSORS; i++) {
-               if (!data->sensors[i].tzd)
-                       continue;
+       data->get_temp = hi6220_thermal_get_temp;
+       data->enable_sensor = hi6220_thermal_enable_sensor;
+       data->disable_sensor = hi6220_thermal_disable_sensor;
+       data->irq_handler = hi6220_thermal_irq_handler;
 
-               if (data->sensors[i].sensor_temp >= max_temp) {
-                       max_temp = data->sensors[i].sensor_temp;
-                       sensor_id = i;
-               }
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       data->regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(data->regs)) {
+               dev_err(dev, "failed to get io address\n");
+               return PTR_ERR(data->regs);
        }
 
-       /* If no sensor has been enabled, then skip to enable irq */
-       if (sensor_id == -1)
-               return 0;
-
-       mutex_lock(&data->thermal_lock);
-       data->irq_bind_sensor = sensor_id;
-       mutex_unlock(&data->thermal_lock);
-
-       dev_dbg(&data->pdev->dev, "id=%d, irq=%d, temp=%d, thres=%d\n",
-               sensor->id, data->irq_enabled, *temp, sensor->thres_temp);
-       /*
-        * Bind irq to sensor for two cases:
-        *   Reenable alarm IRQ if temperature below threshold;
-        *   if irq has been enabled, always set it;
-        */
-       if (data->irq_enabled) {
-               hisi_thermal_enable_bind_irq_sensor(data);
-               return 0;
+       data->clk = devm_clk_get(dev, "thermal_clk");
+       if (IS_ERR(data->clk)) {
+               ret = PTR_ERR(data->clk);
+               if (ret != -EPROBE_DEFER)
+                       dev_err(dev, "failed to get thermal clk: %d\n", ret);
+               return ret;
        }
 
-       if (max_temp < sensor->thres_temp) {
-               data->irq_enabled = true;
-               hisi_thermal_enable_bind_irq_sensor(data);
-               enable_irq(data->irq);
-       }
+       data->irq = platform_get_irq(pdev, 0);
+       if (data->irq < 0)
+               return data->irq;
+
+       data->sensor.id = HI6220_DEFAULT_SENSOR;
 
        return 0;
 }
 
-static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
-       .get_temp = hisi_thermal_get_temp,
-};
+static int hi3660_thermal_probe(struct hisi_thermal_data *data)
+{
+       struct platform_device *pdev = data->pdev;
+       struct device *dev = &pdev->dev;
+       struct resource *res;
+
+       data->get_temp = hi3660_thermal_get_temp;
+       data->enable_sensor = hi3660_thermal_enable_sensor;
+       data->disable_sensor = hi3660_thermal_disable_sensor;
+       data->irq_handler = hi3660_thermal_irq_handler;
+
+       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       data->regs = devm_ioremap_resource(dev, res);
+       if (IS_ERR(data->regs)) {
+               dev_err(dev, "failed to get io address\n");
+               return PTR_ERR(data->regs);
+       }
+
+       data->irq = platform_get_irq(pdev, 0);
+       if (data->irq < 0)
+               return data->irq;
+
+       data->sensor.id = HI3660_DEFAULT_SENSOR;
 
-static irqreturn_t hisi_thermal_alarm_irq(int irq, void *dev)
+       return 0;
+}
+
+static int hisi_thermal_get_temp(void *__data, int *temp)
 {
-       struct hisi_thermal_data *data = dev;
+       struct hisi_thermal_data *data = __data;
+       struct hisi_thermal_sensor *sensor = &data->sensor;
+
+       *temp = data->get_temp(data);
 
-       disable_irq_nosync(irq);
-       data->irq_enabled = false;
+       dev_dbg(&data->pdev->dev, "id=%d, temp=%d, thres=%d\n",
+               sensor->id, *temp, sensor->thres_temp);
 
-       return IRQ_WAKE_THREAD;
+       return 0;
 }
 
+static const struct thermal_zone_of_device_ops hisi_of_thermal_ops = {
+       .get_temp = hisi_thermal_get_temp,
+};
+
 static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
 {
        struct hisi_thermal_data *data = dev;
-       struct hisi_thermal_sensor *sensor;
-       int i;
+       struct hisi_thermal_sensor *sensor = &data->sensor;
+       int temp = 0;
 
-       mutex_lock(&data->thermal_lock);
-       sensor = &data->sensors[data->irq_bind_sensor];
+       data->irq_handler(data);
 
-       dev_crit(&data->pdev->dev, "THERMAL ALARM: T > %d\n",
-                sensor->thres_temp);
-       mutex_unlock(&data->thermal_lock);
+       hisi_thermal_get_temp(data, &temp);
 
-       for (i = 0; i < HISI_MAX_SENSORS; i++) {
-               if (!data->sensors[i].tzd)
-                       continue;
+       if (temp >= sensor->thres_temp) {
+               dev_crit(&data->pdev->dev, "THERMAL ALARM: %d > %d\n",
+                        temp, sensor->thres_temp);
 
-               thermal_zone_device_update(data->sensors[i].tzd,
+               thermal_zone_device_update(data->sensor.tzd,
                                           THERMAL_EVENT_UNSPECIFIED);
+
+       } else {
+               dev_crit(&data->pdev->dev, "THERMAL ALARM stopped: %d < %d\n",
+                        temp, sensor->thres_temp);
        }
 
        return IRQ_HANDLED;
@@ -267,17 +474,14 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
 
 static int hisi_thermal_register_sensor(struct platform_device *pdev,
                                        struct hisi_thermal_data *data,
-                                       struct hisi_thermal_sensor *sensor,
-                                       int index)
+                                       struct hisi_thermal_sensor *sensor)
 {
        int ret, i;
        const struct thermal_trip *trip;
 
-       sensor->id = index;
-       sensor->thermal = data;
-
        sensor->tzd = devm_thermal_zone_of_sensor_register(&pdev->dev,
-                               sensor->id, sensor, &hisi_of_thermal_ops);
+                                                          sensor->id, data,
+                                                          &hisi_of_thermal_ops);
        if (IS_ERR(sensor->tzd)) {
                ret = PTR_ERR(sensor->tzd);
                sensor->tzd = NULL;
@@ -290,7 +494,7 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
 
        for (i = 0; i < of_thermal_get_ntrips(sensor->tzd); i++) {
                if (trip[i].type == THERMAL_TRIP_PASSIVE) {
-                       sensor->thres_temp = hisi_thermal_round_temp(trip[i].temperature);
+                       sensor->thres_temp = trip[i].temperature;
                        break;
                }
        }
@@ -299,7 +503,14 @@ static int hisi_thermal_register_sensor(struct platform_device *pdev,
 }
 
 static const struct of_device_id of_hisi_thermal_match[] = {
-       { .compatible = "hisilicon,tsensor" },
+       {
+               .compatible = "hisilicon,tsensor",
+               .data = hi6220_thermal_probe
+       },
+       {
+               .compatible = "hisilicon,hi3660-tsensor",
+               .data = hi3660_thermal_probe
+       },
        { /* end */ }
 };
 MODULE_DEVICE_TABLE(of, of_hisi_thermal_match);
@@ -316,69 +527,51 @@ static void hisi_thermal_toggle_sensor(struct hisi_thermal_sensor *sensor,
 static int hisi_thermal_probe(struct platform_device *pdev)
 {
        struct hisi_thermal_data *data;
-       struct resource *res;
-       int i;
+       int const (*platform_probe)(struct hisi_thermal_data *);
+       struct device *dev = &pdev->dev;
        int ret;
 
-       data = devm_kzalloc(&pdev->dev, sizeof(*data), GFP_KERNEL);
+       data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
        if (!data)
                return -ENOMEM;
 
-       mutex_init(&data->thermal_lock);
        data->pdev = pdev;
+       platform_set_drvdata(pdev, data);
 
-       res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-       data->regs = devm_ioremap_resource(&pdev->dev, res);
-       if (IS_ERR(data->regs)) {
-               dev_err(&pdev->dev, "failed to get io address\n");
-               return PTR_ERR(data->regs);
+       platform_probe = of_device_get_match_data(dev);
+       if (!platform_probe) {
+               dev_err(dev, "failed to get probe func\n");
+               return -EINVAL;
        }
 
-       data->irq = platform_get_irq(pdev, 0);
-       if (data->irq < 0)
-               return data->irq;
-
-       platform_set_drvdata(pdev, data);
-
-       data->clk = devm_clk_get(&pdev->dev, "thermal_clk");
-       if (IS_ERR(data->clk)) {
-               ret = PTR_ERR(data->clk);
-               if (ret != -EPROBE_DEFER)
-                       dev_err(&pdev->dev,
-                               "failed to get thermal clk: %d\n", ret);
+       ret = platform_probe(data);
+       if (ret)
                return ret;
-       }
 
-       /* enable clock for thermal */
-       ret = clk_prepare_enable(data->clk);
+       ret = hisi_thermal_register_sensor(pdev, data,
+                                          &data->sensor);
        if (ret) {
-               dev_err(&pdev->dev, "failed to enable thermal clk: %d\n", ret);
+               dev_err(dev, "failed to register thermal sensor: %d\n", ret);
                return ret;
        }
 
-       hisi_thermal_enable_bind_irq_sensor(data);
-       data->irq_enabled = true;
-
-       for (i = 0; i < HISI_MAX_SENSORS; ++i) {
-               ret = hisi_thermal_register_sensor(pdev, data,
-                                                  &data->sensors[i], i);
-               if (ret)
-                       dev_err(&pdev->dev,
-                               "failed to register thermal sensor: %d\n", ret);
-               else
-                       hisi_thermal_toggle_sensor(&data->sensors[i], true);
+       ret = data->enable_sensor(data);
+       if (ret) {
+               dev_err(dev, "Failed to setup the sensor: %d\n", ret);
+               return ret;
        }
 
-       ret = devm_request_threaded_irq(&pdev->dev, data->irq,
-                                       hisi_thermal_alarm_irq,
-                                       hisi_thermal_alarm_irq_thread,
-                                       0, "hisi_thermal", data);
-       if (ret < 0) {
-               dev_err(&pdev->dev, "failed to request alarm irq: %d\n", ret);
-               return ret;
+       if (data->irq) {
+               ret = devm_request_threaded_irq(dev, data->irq, NULL,
+                               hisi_thermal_alarm_irq_thread,
+                               IRQF_ONESHOT, "hisi_thermal", data);
+               if (ret < 0) {
+                       dev_err(dev, "failed to request alarm irq: %d\n", ret);
+                       return ret;
+               }
        }
 
-       enable_irq(data->irq);
+       hisi_thermal_toggle_sensor(&data->sensor, true);
 
        return 0;
 }
@@ -386,19 +579,11 @@ static int hisi_thermal_probe(struct platform_device *pdev)
 static int hisi_thermal_remove(struct platform_device *pdev)
 {
        struct hisi_thermal_data *data = platform_get_drvdata(pdev);
-       int i;
-
-       for (i = 0; i < HISI_MAX_SENSORS; i++) {
-               struct hisi_thermal_sensor *sensor = &data->sensors[i];
+       struct hisi_thermal_sensor *sensor = &data->sensor;
 
-               if (!sensor->tzd)
-                       continue;
+       hisi_thermal_toggle_sensor(sensor, false);
 
-               hisi_thermal_toggle_sensor(sensor, false);
-       }
-
-       hisi_thermal_disable_sensor(data);
-       clk_disable_unprepare(data->clk);
+       data->disable_sensor(data);
 
        return 0;
 }
@@ -408,10 +593,7 @@ static int hisi_thermal_suspend(struct device *dev)
 {
        struct hisi_thermal_data *data = dev_get_drvdata(dev);
 
-       hisi_thermal_disable_sensor(data);
-       data->irq_enabled = false;
-
-       clk_disable_unprepare(data->clk);
+       data->disable_sensor(data);
 
        return 0;
 }
@@ -419,16 +601,8 @@ static int hisi_thermal_suspend(struct device *dev)
 static int hisi_thermal_resume(struct device *dev)
 {
        struct hisi_thermal_data *data = dev_get_drvdata(dev);
-       int ret;
 
-       ret = clk_prepare_enable(data->clk);
-       if (ret)
-               return ret;
-
-       data->irq_enabled = true;
-       hisi_thermal_enable_bind_irq_sensor(data);
-
-       return 0;
+       return data->enable_sensor(data);
 }
 #endif
 
index 31cce7805eb2e93706468409a39f287fdebd5f5a..f572b645d21b8a1a20a9dc59abc47fef1dc20bf4 100644 (file)
@@ -215,6 +215,12 @@ config USB_F_PRINTER
 config USB_F_TCM
        tristate
 
+config USB_F_AUDIO_SRC
+       tristate
+
+config USB_F_ACC
+       tristate
+
 # this first set of drivers all depend on bulk-capable hardware.
 
 config USB_CONFIGFS
@@ -368,6 +374,30 @@ config USB_CONFIGFS_F_FS
          implemented in kernel space (for instance Ethernet, serial or
          mass storage) and other are implemented in user space.
 
+config USB_CONFIGFS_F_ACC
+       bool "Accessory gadget"
+       depends on USB_CONFIGFS
+       select USB_F_ACC
+       help
+         USB gadget Accessory support
+
+config USB_CONFIGFS_F_AUDIO_SRC
+       bool "Audio Source gadget"
+       depends on USB_CONFIGFS && USB_CONFIGFS_F_ACC
+       depends on SND
+       select SND_PCM
+       select USB_F_AUDIO_SRC
+       help
+         USB gadget Audio Source support
+
+config USB_CONFIGFS_UEVENT
+       bool "Uevent notification of Gadget state"
+       depends on USB_CONFIGFS
+       help
+         Enable uevent notifications to userspace when the gadget
+         state changes. The gadget can be in any of the following
+         three states: "CONNECTED/DISCONNECTED/CONFIGURED"
+
 config USB_CONFIGFS_F_UAC1
        bool "Audio Class 1.0"
        depends on USB_CONFIGFS
index ed9346f0b0005b52da55b4a73073bab7bbf41a62..60d198db0893241c8e7ed7b7cb7a9a0ad436db78 100644 (file)
@@ -2003,6 +2003,12 @@ void composite_disconnect(struct usb_gadget *gadget)
        struct usb_composite_dev        *cdev = get_gadget_data(gadget);
        unsigned long                   flags;
 
+       if (cdev == NULL) {
+               WARN(1, "%s: Calling disconnect on a Gadget that is not connected\n",
+                    __func__);
+               return;
+       }
+
        /* REVISIT:  should we have config and device level
         * disconnect callbacks?
         */
index aeb9f3c4052157e9a99d9c5dceef026970ea76f3..bf2d0ce80c99af5c4b113135e62a553d1b919311 100644 (file)
@@ -9,6 +9,31 @@
 #include "u_f.h"
 #include "u_os_desc.h"
 
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+#include <linux/platform_device.h>
+#include <linux/kdev_t.h>
+#include <linux/usb/ch9.h>
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+extern int acc_ctrlrequest(struct usb_composite_dev *cdev,
+                               const struct usb_ctrlrequest *ctrl);
+void acc_disconnect(void);
+#endif
+static struct class *android_class;
+static struct device *android_device;
+static int index;
+
+struct device *create_function_device(char *name)
+{
+       if (android_device && !IS_ERR(android_device))
+               return device_create(android_class, android_device,
+                       MKDEV(0, index++), NULL, name);
+       else
+               return ERR_PTR(-EINVAL);
+}
+EXPORT_SYMBOL_GPL(create_function_device);
+#endif
+
 int check_user_usb_string(const char *name,
                struct usb_gadget_strings *stringtab_dev)
 {
@@ -60,6 +85,12 @@ struct gadget_info {
        bool use_os_desc;
        char b_vendor_code;
        char qw_sign[OS_STRING_QW_SIGN_LEN];
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+       bool connected;
+       bool sw_connected;
+       struct work_struct work;
+       struct device *dev;
+#endif
 };
 
 static inline struct gadget_info *to_gadget_info(struct config_item *item)
@@ -265,7 +296,7 @@ static ssize_t gadget_dev_desc_UDC_store(struct config_item *item,
 
        mutex_lock(&gi->lock);
 
-       if (!strlen(name)) {
+       if (!strlen(name) || strcmp(name, "none") == 0) {
                ret = unregister_gadget(gi);
                if (ret)
                        goto err;
@@ -1371,6 +1402,60 @@ err_comp_cleanup:
        return ret;
 }
 
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static void android_work(struct work_struct *data)
+{
+       struct gadget_info *gi = container_of(data, struct gadget_info, work);
+       struct usb_composite_dev *cdev = &gi->cdev;
+       char *disconnected[2] = { "USB_STATE=DISCONNECTED", NULL };
+       char *connected[2]    = { "USB_STATE=CONNECTED", NULL };
+       char *configured[2]   = { "USB_STATE=CONFIGURED", NULL };
+       /* 0-connected 1-configured 2-disconnected */
+       bool status[3] = { false, false, false };
+       unsigned long flags;
+       bool uevent_sent = false;
+
+       spin_lock_irqsave(&cdev->lock, flags);
+       if (cdev->config)
+               status[1] = true;
+
+       if (gi->connected != gi->sw_connected) {
+               if (gi->connected)
+                       status[0] = true;
+               else
+                       status[2] = true;
+               gi->sw_connected = gi->connected;
+       }
+       spin_unlock_irqrestore(&cdev->lock, flags);
+
+       if (status[0]) {
+               kobject_uevent_env(&android_device->kobj,
+                                       KOBJ_CHANGE, connected);
+               pr_info("%s: sent uevent %s\n", __func__, connected[0]);
+               uevent_sent = true;
+       }
+
+       if (status[1]) {
+               kobject_uevent_env(&android_device->kobj,
+                                       KOBJ_CHANGE, configured);
+               pr_info("%s: sent uevent %s\n", __func__, configured[0]);
+               uevent_sent = true;
+       }
+
+       if (status[2]) {
+               kobject_uevent_env(&android_device->kobj,
+                                       KOBJ_CHANGE, disconnected);
+               pr_info("%s: sent uevent %s\n", __func__, disconnected[0]);
+               uevent_sent = true;
+       }
+
+       if (!uevent_sent) {
+               pr_info("%s: did not send uevent (%d %d %p)\n", __func__,
+                       gi->connected, gi->sw_connected, cdev->config);
+       }
+}
+#endif
+
 static void configfs_composite_unbind(struct usb_gadget *gadget)
 {
        struct usb_composite_dev        *cdev;
@@ -1390,14 +1475,91 @@ static void configfs_composite_unbind(struct usb_gadget *gadget)
        set_gadget_data(gadget, NULL);
 }
 
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static int android_setup(struct usb_gadget *gadget,
+                       const struct usb_ctrlrequest *c)
+{
+       struct usb_composite_dev *cdev = get_gadget_data(gadget);
+       unsigned long flags;
+       struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+       int value = -EOPNOTSUPP;
+       struct usb_function_instance *fi;
+
+       spin_lock_irqsave(&cdev->lock, flags);
+       if (!gi->connected) {
+               gi->connected = 1;
+               schedule_work(&gi->work);
+       }
+       spin_unlock_irqrestore(&cdev->lock, flags);
+       list_for_each_entry(fi, &gi->available_func, cfs_list) {
+               if (fi != NULL && fi->f != NULL && fi->f->setup != NULL) {
+                       value = fi->f->setup(fi->f, c);
+                       if (value >= 0)
+                               break;
+               }
+       }
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+       if (value < 0)
+               value = acc_ctrlrequest(cdev, c);
+#endif
+
+       if (value < 0)
+               value = composite_setup(gadget, c);
+
+       spin_lock_irqsave(&cdev->lock, flags);
+       if (c->bRequest == USB_REQ_SET_CONFIGURATION &&
+                                               cdev->config) {
+               schedule_work(&gi->work);
+       }
+       spin_unlock_irqrestore(&cdev->lock, flags);
+
+       return value;
+}
+
+static void android_disconnect(struct usb_gadget *gadget)
+{
+       struct usb_composite_dev        *cdev = get_gadget_data(gadget);
+       struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev);
+
+       /* FIXME: There's a race between usb_gadget_udc_stop(), which is likely
+        * to set the gadget driver to NULL in the UDC driver, and this driver's
+        * gadget disconnect fn, which likely checks for the gadget driver to
+        * be a null ptr. It happens that unbind (doing set_gadget_data(NULL))
+        * is called before the gadget driver is set to NULL and the UDC driver
+        * calls the disconnect fn, which results in cdev being a null ptr.
+        */
+       if (cdev == NULL) {
+               WARN(1, "%s: gadget driver already disconnected\n", __func__);
+               return;
+       }
+
+       /* Accessory HID support can be active while the accessory
+        * function is not actually enabled, so we need to inform it
+        * when we are disconnected.
+        */
+
+#ifdef CONFIG_USB_CONFIGFS_F_ACC
+       acc_disconnect();
+#endif
+       gi->connected = 0;
+       schedule_work(&gi->work);
+       composite_disconnect(gadget);
+}
+#endif
+
 static const struct usb_gadget_driver configfs_driver_template = {
        .bind           = configfs_composite_bind,
        .unbind         = configfs_composite_unbind,
-
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+       .setup          = android_setup,
+       .reset          = android_disconnect,
+       .disconnect     = android_disconnect,
+#else
        .setup          = composite_setup,
        .reset          = composite_disconnect,
        .disconnect     = composite_disconnect,
-
+#endif
        .suspend        = composite_suspend,
        .resume         = composite_resume,
 
@@ -1409,6 +1571,89 @@ static const struct usb_gadget_driver configfs_driver_template = {
        .match_existing_only = 1,
 };
 
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+static ssize_t state_show(struct device *pdev, struct device_attribute *attr,
+                       char *buf)
+{
+       struct gadget_info *dev = dev_get_drvdata(pdev);
+       struct usb_composite_dev *cdev;
+       char *state = "DISCONNECTED";
+       unsigned long flags;
+
+       if (!dev)
+               goto out;
+
+       cdev = &dev->cdev;
+
+       if (!cdev)
+               goto out;
+
+       spin_lock_irqsave(&cdev->lock, flags);
+       if (cdev->config)
+               state = "CONFIGURED";
+       else if (dev->connected)
+               state = "CONNECTED";
+       spin_unlock_irqrestore(&cdev->lock, flags);
+out:
+       return sprintf(buf, "%s\n", state);
+}
+
+static DEVICE_ATTR(state, S_IRUGO, state_show, NULL);
+
+static struct device_attribute *android_usb_attributes[] = {
+       &dev_attr_state,
+       NULL
+};
+
+static int android_device_create(struct gadget_info *gi)
+{
+       struct device_attribute **attrs;
+       struct device_attribute *attr;
+
+       INIT_WORK(&gi->work, android_work);
+       android_device = device_create(android_class, NULL,
+                               MKDEV(0, 0), NULL, "android0");
+       if (IS_ERR(android_device))
+               return PTR_ERR(android_device);
+
+       dev_set_drvdata(android_device, gi);
+
+       attrs = android_usb_attributes;
+       while ((attr = *attrs++)) {
+               int err;
+
+               err = device_create_file(android_device, attr);
+               if (err) {
+                       device_destroy(android_device->class,
+                                      android_device->devt);
+                       return err;
+               }
+       }
+
+       return 0;
+}
+
+static void android_device_destroy(void)
+{
+       struct device_attribute **attrs;
+       struct device_attribute *attr;
+
+       attrs = android_usb_attributes;
+       while ((attr = *attrs++))
+               device_remove_file(android_device, attr);
+       device_destroy(android_device->class, android_device->devt);
+}
+#else
+static inline int android_device_create(struct gadget_info *gi)
+{
+       return 0;
+}
+
+static inline void android_device_destroy(void)
+{
+}
+#endif
+
 static struct config_group *gadgets_make(
                struct config_group *group,
                const char *name)
@@ -1460,7 +1705,11 @@ static struct config_group *gadgets_make(
        if (!gi->composite.gadget_driver.function)
                goto err;
 
+       if (android_device_create(gi) < 0)
+               goto err;
+
        return &gi->group;
+
 err:
        kfree(gi);
        return ERR_PTR(-ENOMEM);
@@ -1469,6 +1718,7 @@ err:
 static void gadgets_drop(struct config_group *group, struct config_item *item)
 {
        config_item_put(item);
+       android_device_destroy();
 }
 
 static struct configfs_group_operations gadgets_ops = {
@@ -1508,6 +1758,13 @@ static int __init gadget_cfs_init(void)
        config_group_init(&gadget_subsys.su_group);
 
        ret = configfs_register_subsystem(&gadget_subsys);
+
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+       android_class = class_create(THIS_MODULE, "android_usb");
+       if (IS_ERR(android_class))
+               return PTR_ERR(android_class);
+#endif
+
        return ret;
 }
 module_init(gadget_cfs_init);
@@ -1515,5 +1772,10 @@ module_init(gadget_cfs_init);
 static void __exit gadget_cfs_exit(void)
 {
        configfs_unregister_subsystem(&gadget_subsys);
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+       if (!IS_ERR(android_class))
+               class_destroy(android_class);
+#endif
+
 }
 module_exit(gadget_cfs_exit);
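
The new android_usb class device and the USB_STATE uevents give userspace two ways to observe the gadget. A minimal, illustrative C sketch (assuming the default "android0" device created by android_device_create() above; error handling trimmed):

    /* Hypothetical userspace sketch: read the "state" attribute added above. */
    #include <stdio.h>

    int main(void)
    {
            char state[32] = "";
            FILE *f = fopen("/sys/class/android_usb/android0/state", "r");

            if (!f)
                    return 1;
            /* prints DISCONNECTED, CONNECTED or CONFIGURED */
            if (fgets(state, sizeof(state), f))
                    printf("gadget state: %s", state);
            fclose(f);
            return 0;
    }

Event-driven consumers can instead listen for the KOBJ_CHANGE uevents carrying USB_STATE=CONNECTED, USB_STATE=CONFIGURED or USB_STATE=DISCONNECTED that android_work() emits on the same device.
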
index 5d3a6cf022185f5155e6b25c897df1198a2ff20a..d7d5673d834393f6deefc6eb22da0581d7f26ddd 100644 (file)
@@ -50,3 +50,7 @@ usb_f_printer-y                       := f_printer.o
 obj-$(CONFIG_USB_F_PRINTER)    += usb_f_printer.o
 usb_f_tcm-y                    := f_tcm.o
 obj-$(CONFIG_USB_F_TCM)                += usb_f_tcm.o
+usb_f_audio_source-y            := f_audio_source.o
+obj-$(CONFIG_USB_F_AUDIO_SRC)   += usb_f_audio_source.o
+usb_f_accessory-y               := f_accessory.o
+obj-$(CONFIG_USB_F_ACC)         += usb_f_accessory.o
diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c
new file mode 100644 (file)
index 0000000..7aa2656
--- /dev/null
@@ -0,0 +1,1352 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+/* #define DEBUG */
+/* #define VERBOSE_DEBUG */
+
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/poll.h>
+#include <linux/delay.h>
+#include <linux/wait.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+
+#include <linux/types.h>
+#include <linux/file.h>
+#include <linux/device.h>
+#include <linux/miscdevice.h>
+
+#include <linux/hid.h>
+#include <linux/hiddev.h>
+#include <linux/usb.h>
+#include <linux/usb/ch9.h>
+#include <linux/usb/f_accessory.h>
+
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+
+#define MAX_INST_NAME_LEN        40
+#define BULK_BUFFER_SIZE    16384
+#define ACC_STRING_SIZE     256
+
+#define PROTOCOL_VERSION    2
+
+/* String IDs */
+#define INTERFACE_STRING_INDEX 0
+
+/* number of tx and rx requests to allocate */
+#define TX_REQ_MAX 4
+#define RX_REQ_MAX 2
+
+struct acc_hid_dev {
+       struct list_head        list;
+       struct hid_device *hid;
+       struct acc_dev *dev;
+       /* accessory defined ID */
+       int id;
+       /* HID report descriptor */
+       u8 *report_desc;
+       /* length of HID report descriptor */
+       int report_desc_len;
+       /* number of bytes of report_desc we have received so far */
+       int report_desc_offset;
+};
+
+struct acc_dev {
+       struct usb_function function;
+       struct usb_composite_dev *cdev;
+       spinlock_t lock;
+
+       struct usb_ep *ep_in;
+       struct usb_ep *ep_out;
+
+       /* online indicates state of function_set_alt & function_unbind;
+        * set to 1 when we connect
+        */
+       int online:1;
+
+       /* disconnected indicates state of open & release
+        * Set to 1 when we disconnect.
+        * Not cleared until our file is closed.
+        */
+       int disconnected:1;
+
+       /* strings sent by the host */
+       char manufacturer[ACC_STRING_SIZE];
+       char model[ACC_STRING_SIZE];
+       char description[ACC_STRING_SIZE];
+       char version[ACC_STRING_SIZE];
+       char uri[ACC_STRING_SIZE];
+       char serial[ACC_STRING_SIZE];
+
+       /* for acc_complete_set_string */
+       int string_index;
+
+       /* set to 1 if we have a pending start request */
+       int start_requested;
+
+       int audio_mode;
+
+       /* synchronize access to our device file */
+       atomic_t open_excl;
+
+       struct list_head tx_idle;
+
+       wait_queue_head_t read_wq;
+       wait_queue_head_t write_wq;
+       struct usb_request *rx_req[RX_REQ_MAX];
+       int rx_done;
+
+       /* delayed work for handling ACCESSORY_START */
+       struct delayed_work start_work;
+
+       /* worker for registering and unregistering hid devices */
+       struct work_struct hid_work;
+
+       /* list of active HID devices */
+       struct list_head        hid_list;
+
+       /* list of new HID devices to register */
+       struct list_head        new_hid_list;
+
+       /* list of dead HID devices to unregister */
+       struct list_head        dead_hid_list;
+};
+
+static struct usb_interface_descriptor acc_interface_desc = {
+       .bLength                = USB_DT_INTERFACE_SIZE,
+       .bDescriptorType        = USB_DT_INTERFACE,
+       .bInterfaceNumber       = 0,
+       .bNumEndpoints          = 2,
+       .bInterfaceClass        = USB_CLASS_VENDOR_SPEC,
+       .bInterfaceSubClass     = USB_SUBCLASS_VENDOR_SPEC,
+       .bInterfaceProtocol     = 0,
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_in_desc = {
+       .bLength                = USB_DT_ENDPOINT_SIZE,
+       .bDescriptorType        = USB_DT_ENDPOINT,
+       .bEndpointAddress       = USB_DIR_IN,
+       .bmAttributes           = USB_ENDPOINT_XFER_BULK,
+       .wMaxPacketSize         = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_highspeed_out_desc = {
+       .bLength                = USB_DT_ENDPOINT_SIZE,
+       .bDescriptorType        = USB_DT_ENDPOINT,
+       .bEndpointAddress       = USB_DIR_OUT,
+       .bmAttributes           = USB_ENDPOINT_XFER_BULK,
+       .wMaxPacketSize         = __constant_cpu_to_le16(512),
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_in_desc = {
+       .bLength                = USB_DT_ENDPOINT_SIZE,
+       .bDescriptorType        = USB_DT_ENDPOINT,
+       .bEndpointAddress       = USB_DIR_IN,
+       .bmAttributes           = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_endpoint_descriptor acc_fullspeed_out_desc = {
+       .bLength                = USB_DT_ENDPOINT_SIZE,
+       .bDescriptorType        = USB_DT_ENDPOINT,
+       .bEndpointAddress       = USB_DIR_OUT,
+       .bmAttributes           = USB_ENDPOINT_XFER_BULK,
+};
+
+static struct usb_descriptor_header *fs_acc_descs[] = {
+       (struct usb_descriptor_header *) &acc_interface_desc,
+       (struct usb_descriptor_header *) &acc_fullspeed_in_desc,
+       (struct usb_descriptor_header *) &acc_fullspeed_out_desc,
+       NULL,
+};
+
+static struct usb_descriptor_header *hs_acc_descs[] = {
+       (struct usb_descriptor_header *) &acc_interface_desc,
+       (struct usb_descriptor_header *) &acc_highspeed_in_desc,
+       (struct usb_descriptor_header *) &acc_highspeed_out_desc,
+       NULL,
+};
+
+static struct usb_string acc_string_defs[] = {
+       [INTERFACE_STRING_INDEX].s      = "Android Accessory Interface",
+       {  },   /* end of list */
+};
+
+static struct usb_gadget_strings acc_string_table = {
+       .language               = 0x0409,       /* en-US */
+       .strings                = acc_string_defs,
+};
+
+static struct usb_gadget_strings *acc_strings[] = {
+       &acc_string_table,
+       NULL,
+};
+
+/* temporary variable used between acc_open() and acc_gadget_bind() */
+static struct acc_dev *_acc_dev;
+
+struct acc_instance {
+       struct usb_function_instance func_inst;
+       const char *name;
+};
+
+static inline struct acc_dev *func_to_dev(struct usb_function *f)
+{
+       return container_of(f, struct acc_dev, function);
+}
+
+static struct usb_request *acc_request_new(struct usb_ep *ep, int buffer_size)
+{
+       struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+
+       if (!req)
+               return NULL;
+
+       /* now allocate buffers for the requests */
+       req->buf = kmalloc(buffer_size, GFP_KERNEL);
+       if (!req->buf) {
+               usb_ep_free_request(ep, req);
+               return NULL;
+       }
+
+       return req;
+}
+
+static void acc_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+       if (req) {
+               kfree(req->buf);
+               usb_ep_free_request(ep, req);
+       }
+}
+
+/* add a request to the tail of a list */
+static void req_put(struct acc_dev *dev, struct list_head *head,
+               struct usb_request *req)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       list_add_tail(&req->list, head);
+       spin_unlock_irqrestore(&dev->lock, flags);
+}
+
+/* remove a request from the head of a list */
+static struct usb_request *req_get(struct acc_dev *dev, struct list_head *head)
+{
+       unsigned long flags;
+       struct usb_request *req;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       if (list_empty(head)) {
+               req = 0;
+       } else {
+               req = list_first_entry(head, struct usb_request, list);
+               list_del(&req->list);
+       }
+       spin_unlock_irqrestore(&dev->lock, flags);
+       return req;
+}
+
+static void acc_set_disconnected(struct acc_dev *dev)
+{
+       dev->disconnected = 1;
+}
+
+static void acc_complete_in(struct usb_ep *ep, struct usb_request *req)
+{
+       struct acc_dev *dev = _acc_dev;
+
+       if (req->status == -ESHUTDOWN) {
+               pr_debug("acc_complete_in set disconnected\n");
+               acc_set_disconnected(dev);
+       }
+
+       req_put(dev, &dev->tx_idle, req);
+
+       wake_up(&dev->write_wq);
+}
+
+static void acc_complete_out(struct usb_ep *ep, struct usb_request *req)
+{
+       struct acc_dev *dev = _acc_dev;
+
+       dev->rx_done = 1;
+       if (req->status == -ESHUTDOWN) {
+               pr_debug("acc_complete_out set disconnected\n");
+               acc_set_disconnected(dev);
+       }
+
+       wake_up(&dev->read_wq);
+}
+
+static void acc_complete_set_string(struct usb_ep *ep, struct usb_request *req)
+{
+       struct acc_dev  *dev = ep->driver_data;
+       char *string_dest = NULL;
+       int length = req->actual;
+
+       if (req->status != 0) {
+               pr_err("acc_complete_set_string, err %d\n", req->status);
+               return;
+       }
+
+       switch (dev->string_index) {
+       case ACCESSORY_STRING_MANUFACTURER:
+               string_dest = dev->manufacturer;
+               break;
+       case ACCESSORY_STRING_MODEL:
+               string_dest = dev->model;
+               break;
+       case ACCESSORY_STRING_DESCRIPTION:
+               string_dest = dev->description;
+               break;
+       case ACCESSORY_STRING_VERSION:
+               string_dest = dev->version;
+               break;
+       case ACCESSORY_STRING_URI:
+               string_dest = dev->uri;
+               break;
+       case ACCESSORY_STRING_SERIAL:
+               string_dest = dev->serial;
+               break;
+       }
+       if (string_dest) {
+               unsigned long flags;
+
+               if (length >= ACC_STRING_SIZE)
+                       length = ACC_STRING_SIZE - 1;
+
+               spin_lock_irqsave(&dev->lock, flags);
+               memcpy(string_dest, req->buf, length);
+               /* ensure zero termination */
+               string_dest[length] = 0;
+               spin_unlock_irqrestore(&dev->lock, flags);
+       } else {
+               pr_err("unknown accessory string index %d\n",
+                       dev->string_index);
+       }
+}
+
+static void acc_complete_set_hid_report_desc(struct usb_ep *ep,
+               struct usb_request *req)
+{
+       struct acc_hid_dev *hid = req->context;
+       struct acc_dev *dev = hid->dev;
+       int length = req->actual;
+
+       if (req->status != 0) {
+               pr_err("acc_complete_set_hid_report_desc, err %d\n",
+                       req->status);
+               return;
+       }
+
+       memcpy(hid->report_desc + hid->report_desc_offset, req->buf, length);
+       hid->report_desc_offset += length;
+       if (hid->report_desc_offset == hid->report_desc_len) {
+               /* After we have received the entire report descriptor
+                * we schedule work to initialize the HID device
+                */
+               schedule_work(&dev->hid_work);
+       }
+}
+
+static void acc_complete_send_hid_event(struct usb_ep *ep,
+               struct usb_request *req)
+{
+       struct acc_hid_dev *hid = req->context;
+       int length = req->actual;
+
+       if (req->status != 0) {
+               pr_err("acc_complete_send_hid_event, err %d\n", req->status);
+               return;
+       }
+
+       hid_report_raw_event(hid->hid, HID_INPUT_REPORT, req->buf, length, 1);
+}
+
+static int acc_hid_parse(struct hid_device *hid)
+{
+       struct acc_hid_dev *hdev = hid->driver_data;
+
+       hid_parse_report(hid, hdev->report_desc, hdev->report_desc_len);
+       return 0;
+}
+
+static int acc_hid_start(struct hid_device *hid)
+{
+       return 0;
+}
+
+static void acc_hid_stop(struct hid_device *hid)
+{
+}
+
+static int acc_hid_open(struct hid_device *hid)
+{
+       return 0;
+}
+
+static void acc_hid_close(struct hid_device *hid)
+{
+}
+
+static int acc_hid_raw_request(struct hid_device *hid, unsigned char reportnum,
+       __u8 *buf, size_t len, unsigned char rtype, int reqtype)
+{
+       return 0;
+}
+
+static struct hid_ll_driver acc_hid_ll_driver = {
+       .parse = acc_hid_parse,
+       .start = acc_hid_start,
+       .stop = acc_hid_stop,
+       .open = acc_hid_open,
+       .close = acc_hid_close,
+       .raw_request = acc_hid_raw_request,
+};
+
+static struct acc_hid_dev *acc_hid_new(struct acc_dev *dev,
+               int id, int desc_len)
+{
+       struct acc_hid_dev *hdev;
+
+       hdev = kzalloc(sizeof(*hdev), GFP_ATOMIC);
+       if (!hdev)
+               return NULL;
+       hdev->report_desc = kzalloc(desc_len, GFP_ATOMIC);
+       if (!hdev->report_desc) {
+               kfree(hdev);
+               return NULL;
+       }
+       hdev->dev = dev;
+       hdev->id = id;
+       hdev->report_desc_len = desc_len;
+
+       return hdev;
+}
+
+static struct acc_hid_dev *acc_hid_get(struct list_head *list, int id)
+{
+       struct acc_hid_dev *hid;
+
+       list_for_each_entry(hid, list, list) {
+               if (hid->id == id)
+                       return hid;
+       }
+       return NULL;
+}
+
+static int acc_register_hid(struct acc_dev *dev, int id, int desc_length)
+{
+       struct acc_hid_dev *hid;
+       unsigned long flags;
+
+       /* report descriptor length must be > 0 */
+       if (desc_length <= 0)
+               return -EINVAL;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       /* replace HID if one already exists with this ID */
+       hid = acc_hid_get(&dev->hid_list, id);
+       if (!hid)
+               hid = acc_hid_get(&dev->new_hid_list, id);
+       if (hid)
+               list_move(&hid->list, &dev->dead_hid_list);
+
+       hid = acc_hid_new(dev, id, desc_length);
+       if (!hid) {
+               spin_unlock_irqrestore(&dev->lock, flags);
+               return -ENOMEM;
+       }
+
+       list_add(&hid->list, &dev->new_hid_list);
+       spin_unlock_irqrestore(&dev->lock, flags);
+
+       /* schedule work to register the HID device */
+       schedule_work(&dev->hid_work);
+       return 0;
+}
+
+static int acc_unregister_hid(struct acc_dev *dev, int id)
+{
+       struct acc_hid_dev *hid;
+       unsigned long flags;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       hid = acc_hid_get(&dev->hid_list, id);
+       if (!hid)
+               hid = acc_hid_get(&dev->new_hid_list, id);
+       if (!hid) {
+               spin_unlock_irqrestore(&dev->lock, flags);
+               return -EINVAL;
+       }
+
+       list_move(&hid->list, &dev->dead_hid_list);
+       spin_unlock_irqrestore(&dev->lock, flags);
+
+       schedule_work(&dev->hid_work);
+       return 0;
+}
+
+static int create_bulk_endpoints(struct acc_dev *dev,
+                               struct usb_endpoint_descriptor *in_desc,
+                               struct usb_endpoint_descriptor *out_desc)
+{
+       struct usb_composite_dev *cdev = dev->cdev;
+       struct usb_request *req;
+       struct usb_ep *ep;
+       int i;
+
+       DBG(cdev, "create_bulk_endpoints dev: %p\n", dev);
+
+       ep = usb_ep_autoconfig(cdev->gadget, in_desc);
+       if (!ep) {
+               DBG(cdev, "usb_ep_autoconfig for ep_in failed\n");
+               return -ENODEV;
+       }
+       DBG(cdev, "usb_ep_autoconfig for ep_in got %s\n", ep->name);
+       ep->driver_data = dev;          /* claim the endpoint */
+       dev->ep_in = ep;
+
+       ep = usb_ep_autoconfig(cdev->gadget, out_desc);
+       if (!ep) {
+               DBG(cdev, "usb_ep_autoconfig for ep_out failed\n");
+               return -ENODEV;
+       }
+       DBG(cdev, "usb_ep_autoconfig for ep_out got %s\n", ep->name);
+       ep->driver_data = dev;          /* claim the endpoint */
+       dev->ep_out = ep;
+
+       /* now allocate requests for our endpoints */
+       for (i = 0; i < TX_REQ_MAX; i++) {
+               req = acc_request_new(dev->ep_in, BULK_BUFFER_SIZE);
+               if (!req)
+                       goto fail;
+               req->complete = acc_complete_in;
+               req_put(dev, &dev->tx_idle, req);
+       }
+       for (i = 0; i < RX_REQ_MAX; i++) {
+               req = acc_request_new(dev->ep_out, BULK_BUFFER_SIZE);
+               if (!req)
+                       goto fail;
+               req->complete = acc_complete_out;
+               dev->rx_req[i] = req;
+       }
+
+       return 0;
+
+fail:
+       pr_err("acc_bind() could not allocate requests\n");
+       while ((req = req_get(dev, &dev->tx_idle)))
+               acc_request_free(req, dev->ep_in);
+       for (i = 0; i < RX_REQ_MAX; i++)
+               acc_request_free(dev->rx_req[i], dev->ep_out);
+       return -1;
+}
+
+static ssize_t acc_read(struct file *fp, char __user *buf,
+       size_t count, loff_t *pos)
+{
+       struct acc_dev *dev = fp->private_data;
+       struct usb_request *req;
+       ssize_t r = count;
+       unsigned xfer;
+       int ret = 0;
+
+       pr_debug("acc_read(%zu)\n", count);
+
+       if (dev->disconnected) {
+               pr_debug("acc_read disconnected\n");
+               return -ENODEV;
+       }
+
+       if (count > BULK_BUFFER_SIZE)
+               count = BULK_BUFFER_SIZE;
+
+       /* we will block until we're online */
+       pr_debug("acc_read: waiting for online\n");
+       ret = wait_event_interruptible(dev->read_wq, dev->online);
+       if (ret < 0) {
+               r = ret;
+               goto done;
+       }
+
+       if (dev->rx_done) {
+               /* last req cancelled; try to get it */
+               req = dev->rx_req[0];
+               goto copy_data;
+       }
+
+requeue_req:
+       /* queue a request */
+       req = dev->rx_req[0];
+       req->length = count;
+       dev->rx_done = 0;
+       ret = usb_ep_queue(dev->ep_out, req, GFP_KERNEL);
+       if (ret < 0) {
+               r = -EIO;
+               goto done;
+       } else {
+               pr_debug("rx %p queue\n", req);
+       }
+
+       /* wait for a request to complete */
+       ret = wait_event_interruptible(dev->read_wq, dev->rx_done);
+       if (ret < 0) {
+               r = ret;
+               ret = usb_ep_dequeue(dev->ep_out, req);
+               if (ret != 0) {
+                       /* Cancel failed; there may be data already received.
+                        * It will be retrieved in the next read. */
+                       pr_debug("acc_read: cancelling failed %d\n", ret);
+               }
+               goto done;
+       }
+
+copy_data:
+       dev->rx_done = 0;
+       if (dev->online) {
+               /* If we got a 0-len packet, throw it back and try again. */
+               if (req->actual == 0)
+                       goto requeue_req;
+
+               pr_debug("rx %p %u\n", req, req->actual);
+               xfer = (req->actual < count) ? req->actual : count;
+               r = xfer;
+               if (copy_to_user(buf, req->buf, xfer))
+                       r = -EFAULT;
+       } else
+               r = -EIO;
+
+done:
+       pr_debug("acc_read returning %zd\n", r);
+       return r;
+}
+
+static ssize_t acc_write(struct file *fp, const char __user *buf,
+       size_t count, loff_t *pos)
+{
+       struct acc_dev *dev = fp->private_data;
+       struct usb_request *req = 0;
+       ssize_t r = count;
+       unsigned xfer;
+       int ret;
+
+       pr_debug("acc_write(%zu)\n", count);
+
+       if (!dev->online || dev->disconnected) {
+               pr_debug("acc_write disconnected or not online\n");
+               return -ENODEV;
+       }
+
+       while (count > 0) {
+               if (!dev->online) {
+                       pr_debug("acc_write dev->error\n");
+                       r = -EIO;
+                       break;
+               }
+
+               /* get an idle tx request to use */
+               req = 0;
+               ret = wait_event_interruptible(dev->write_wq,
+                       ((req = req_get(dev, &dev->tx_idle)) || !dev->online));
+               if (!req) {
+                       r = ret;
+                       break;
+               }
+
+               if (count > BULK_BUFFER_SIZE) {
+                       xfer = BULK_BUFFER_SIZE;
+                       /* There will be more TX requests, so don't send a ZLP yet. */
+                       req->zero = 0;
+               } else {
+                       xfer = count;
+                       /* If the data length is a multiple of the
+                        * maxpacket size then send a zero length packet (ZLP).
+                        */
+                       req->zero = ((xfer % dev->ep_in->maxpacket) == 0);
+               }
+               if (copy_from_user(req->buf, buf, xfer)) {
+                       r = -EFAULT;
+                       break;
+               }
+
+               req->length = xfer;
+               ret = usb_ep_queue(dev->ep_in, req, GFP_KERNEL);
+               if (ret < 0) {
+                       pr_debug("acc_write: xfer error %d\n", ret);
+                       r = -EIO;
+                       break;
+               }
+
+               buf += xfer;
+               count -= xfer;
+
+               /* zero this so we don't try to free it on error exit */
+               req = 0;
+       }
+
+       if (req)
+               req_put(dev, &dev->tx_idle, req);
+
+       pr_debug("acc_write returning %zd\n", r);
+       return r;
+}
+
+static long acc_ioctl(struct file *fp, unsigned code, unsigned long value)
+{
+       struct acc_dev *dev = fp->private_data;
+       char *src = NULL;
+       int ret;
+
+       switch (code) {
+       case ACCESSORY_GET_STRING_MANUFACTURER:
+               src = dev->manufacturer;
+               break;
+       case ACCESSORY_GET_STRING_MODEL:
+               src = dev->model;
+               break;
+       case ACCESSORY_GET_STRING_DESCRIPTION:
+               src = dev->description;
+               break;
+       case ACCESSORY_GET_STRING_VERSION:
+               src = dev->version;
+               break;
+       case ACCESSORY_GET_STRING_URI:
+               src = dev->uri;
+               break;
+       case ACCESSORY_GET_STRING_SERIAL:
+               src = dev->serial;
+               break;
+       case ACCESSORY_IS_START_REQUESTED:
+               return dev->start_requested;
+       case ACCESSORY_GET_AUDIO_MODE:
+               return dev->audio_mode;
+       }
+       if (!src)
+               return -EINVAL;
+
+       ret = strlen(src) + 1;
+       if (copy_to_user((void __user *)value, src, ret))
+               ret = -EFAULT;
+       return ret;
+}
+
+static int acc_open(struct inode *ip, struct file *fp)
+{
+       printk(KERN_INFO "acc_open\n");
+       if (atomic_xchg(&_acc_dev->open_excl, 1))
+               return -EBUSY;
+
+       _acc_dev->disconnected = 0;
+       fp->private_data = _acc_dev;
+       return 0;
+}
+
+static int acc_release(struct inode *ip, struct file *fp)
+{
+       printk(KERN_INFO "acc_release\n");
+
+       WARN_ON(!atomic_xchg(&_acc_dev->open_excl, 0));
+       /* indicate that we are disconnected;
+        * we could still be online, so don't touch the online flag
+        */
+       _acc_dev->disconnected = 1;
+       return 0;
+}
+
+/* file operations for /dev/usb_accessory */
+static const struct file_operations acc_fops = {
+       .owner = THIS_MODULE,
+       .read = acc_read,
+       .write = acc_write,
+       .unlocked_ioctl = acc_ioctl,
+       .open = acc_open,
+       .release = acc_release,
+};
+
+static int acc_hid_probe(struct hid_device *hdev,
+               const struct hid_device_id *id)
+{
+       int ret;
+
+       ret = hid_parse(hdev);
+       if (ret)
+               return ret;
+       return hid_hw_start(hdev, HID_CONNECT_DEFAULT);
+}
+
+static struct miscdevice acc_device = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = "usb_accessory",
+       .fops = &acc_fops,
+};
+
+static const struct hid_device_id acc_hid_table[] = {
+       { HID_USB_DEVICE(HID_ANY_ID, HID_ANY_ID) },
+       { }
+};
+
+static struct hid_driver acc_hid_driver = {
+       .name = "USB accessory",
+       .id_table = acc_hid_table,
+       .probe = acc_hid_probe,
+};
+
+static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req)
+{
+       /*
+        * Default no-op function when nothing needs to be done for the
+        * setup request
+        */
+}
+
+int acc_ctrlrequest(struct usb_composite_dev *cdev,
+                               const struct usb_ctrlrequest *ctrl)
+{
+       struct acc_dev  *dev = _acc_dev;
+       int     value = -EOPNOTSUPP;
+       struct acc_hid_dev *hid;
+       int offset;
+       u8 b_requestType = ctrl->bRequestType;
+       u8 b_request = ctrl->bRequest;
+       u16     w_index = le16_to_cpu(ctrl->wIndex);
+       u16     w_value = le16_to_cpu(ctrl->wValue);
+       u16     w_length = le16_to_cpu(ctrl->wLength);
+       unsigned long flags;
+
+/*
+       printk(KERN_INFO "acc_ctrlrequest "
+                       "%02x.%02x v%04x i%04x l%u\n",
+                       b_requestType, b_request,
+                       w_value, w_index, w_length);
+*/
+
+       if (b_requestType == (USB_DIR_OUT | USB_TYPE_VENDOR)) {
+               if (b_request == ACCESSORY_START) {
+                       dev->start_requested = 1;
+                       schedule_delayed_work(
+                               &dev->start_work, msecs_to_jiffies(10));
+                       value = 0;
+                       cdev->req->complete = acc_complete_setup_noop;
+               } else if (b_request == ACCESSORY_SEND_STRING) {
+                       dev->string_index = w_index;
+                       cdev->gadget->ep0->driver_data = dev;
+                       cdev->req->complete = acc_complete_set_string;
+                       value = w_length;
+               } else if (b_request == ACCESSORY_SET_AUDIO_MODE &&
+                               w_index == 0 && w_length == 0) {
+                       dev->audio_mode = w_value;
+                       cdev->req->complete = acc_complete_setup_noop;
+                       value = 0;
+               } else if (b_request == ACCESSORY_REGISTER_HID) {
+                       cdev->req->complete = acc_complete_setup_noop;
+                       value = acc_register_hid(dev, w_value, w_index);
+               } else if (b_request == ACCESSORY_UNREGISTER_HID) {
+                       cdev->req->complete = acc_complete_setup_noop;
+                       value = acc_unregister_hid(dev, w_value);
+               } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) {
+                       spin_lock_irqsave(&dev->lock, flags);
+                       hid = acc_hid_get(&dev->new_hid_list, w_value);
+                       spin_unlock_irqrestore(&dev->lock, flags);
+                       if (!hid) {
+                               value = -EINVAL;
+                               goto err;
+                       }
+                       offset = w_index;
+                       if (offset != hid->report_desc_offset
+                               || offset + w_length > hid->report_desc_len) {
+                               value = -EINVAL;
+                               goto err;
+                       }
+                       cdev->req->context = hid;
+                       cdev->req->complete = acc_complete_set_hid_report_desc;
+                       value = w_length;
+               } else if (b_request == ACCESSORY_SEND_HID_EVENT) {
+                       spin_lock_irqsave(&dev->lock, flags);
+                       hid = acc_hid_get(&dev->hid_list, w_value);
+                       spin_unlock_irqrestore(&dev->lock, flags);
+                       if (!hid) {
+                               value = -EINVAL;
+                               goto err;
+                       }
+                       cdev->req->context = hid;
+                       cdev->req->complete = acc_complete_send_hid_event;
+                       value = w_length;
+               }
+       } else if (b_requestType == (USB_DIR_IN | USB_TYPE_VENDOR)) {
+               if (b_request == ACCESSORY_GET_PROTOCOL) {
+                       *((u16 *)cdev->req->buf) = PROTOCOL_VERSION;
+                       value = sizeof(u16);
+                       cdev->req->complete = acc_complete_setup_noop;
+                       /* clear any string left over from a previous session */
+                       memset(dev->manufacturer, 0, sizeof(dev->manufacturer));
+                       memset(dev->model, 0, sizeof(dev->model));
+                       memset(dev->description, 0, sizeof(dev->description));
+                       memset(dev->version, 0, sizeof(dev->version));
+                       memset(dev->uri, 0, sizeof(dev->uri));
+                       memset(dev->serial, 0, sizeof(dev->serial));
+                       dev->start_requested = 0;
+                       dev->audio_mode = 0;
+               }
+       }
+
+       if (value >= 0) {
+               cdev->req->zero = 0;
+               cdev->req->length = value;
+               value = usb_ep_queue(cdev->gadget->ep0, cdev->req, GFP_ATOMIC);
+               if (value < 0)
+                       ERROR(cdev, "%s setup response queue error\n",
+                               __func__);
+       }
+
+err:
+       if (value == -EOPNOTSUPP)
+               VDBG(cdev,
+                       "unknown class-specific control req "
+                       "%02x.%02x v%04x i%04x l%u\n",
+                       ctrl->bRequestType, ctrl->bRequest,
+                       w_value, w_index, w_length);
+       return value;
+}
+EXPORT_SYMBOL_GPL(acc_ctrlrequest);
+
+static int
+__acc_function_bind(struct usb_configuration *c,
+                       struct usb_function *f, bool configfs)
+{
+       struct usb_composite_dev *cdev = c->cdev;
+       struct acc_dev  *dev = func_to_dev(f);
+       int                     id;
+       int                     ret;
+
+       DBG(cdev, "acc_function_bind dev: %p\n", dev);
+
+       if (configfs) {
+               if (acc_string_defs[INTERFACE_STRING_INDEX].id == 0) {
+                       ret = usb_string_id(c->cdev);
+                       if (ret < 0)
+                               return ret;
+                       acc_string_defs[INTERFACE_STRING_INDEX].id = ret;
+                       acc_interface_desc.iInterface = ret;
+               }
+               dev->cdev = c->cdev;
+       }
+       ret = hid_register_driver(&acc_hid_driver);
+       if (ret)
+               return ret;
+
+       dev->start_requested = 0;
+
+       /* allocate interface ID(s) */
+       id = usb_interface_id(c, f);
+       if (id < 0)
+               return id;
+       acc_interface_desc.bInterfaceNumber = id;
+
+       /* allocate endpoints */
+       ret = create_bulk_endpoints(dev, &acc_fullspeed_in_desc,
+                       &acc_fullspeed_out_desc);
+       if (ret)
+               return ret;
+
+       /* support high speed hardware */
+       if (gadget_is_dualspeed(c->cdev->gadget)) {
+               acc_highspeed_in_desc.bEndpointAddress =
+                       acc_fullspeed_in_desc.bEndpointAddress;
+               acc_highspeed_out_desc.bEndpointAddress =
+                       acc_fullspeed_out_desc.bEndpointAddress;
+       }
+
+       DBG(cdev, "%s speed %s: IN/%s, OUT/%s\n",
+                       gadget_is_dualspeed(c->cdev->gadget) ? "dual" : "full",
+                       f->name, dev->ep_in->name, dev->ep_out->name);
+       return 0;
+}
+
+static int
+acc_function_bind_configfs(struct usb_configuration *c,
+                       struct usb_function *f) {
+       return __acc_function_bind(c, f, true);
+}
+
+static void
+kill_all_hid_devices(struct acc_dev *dev)
+{
+       struct acc_hid_dev *hid;
+       struct list_head *entry, *temp;
+       unsigned long flags;
+
+       /* do nothing if usb accessory device doesn't exist */
+       if (!dev)
+               return;
+
+       spin_lock_irqsave(&dev->lock, flags);
+       list_for_each_safe(entry, temp, &dev->hid_list) {
+               hid = list_entry(entry, struct acc_hid_dev, list);
+               list_del(&hid->list);
+               list_add(&hid->list, &dev->dead_hid_list);
+       }
+       list_for_each_safe(entry, temp, &dev->new_hid_list) {
+               hid = list_entry(entry, struct acc_hid_dev, list);
+               list_del(&hid->list);
+               list_add(&hid->list, &dev->dead_hid_list);
+       }
+       spin_unlock_irqrestore(&dev->lock, flags);
+
+       schedule_work(&dev->hid_work);
+}
+
+static void
+acc_hid_unbind(struct acc_dev *dev)
+{
+       hid_unregister_driver(&acc_hid_driver);
+       kill_all_hid_devices(dev);
+}
+
+static void
+acc_function_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+       struct acc_dev  *dev = func_to_dev(f);
+       struct usb_request *req;
+       int i;
+
+       dev->online = 0;                /* clear online flag */
+       wake_up(&dev->read_wq);         /* unblock reads on closure */
+       wake_up(&dev->write_wq);        /* likewise for writes */
+
+       while ((req = req_get(dev, &dev->tx_idle)))
+               acc_request_free(req, dev->ep_in);
+       for (i = 0; i < RX_REQ_MAX; i++)
+               acc_request_free(dev->rx_req[i], dev->ep_out);
+
+       acc_hid_unbind(dev);
+}
+
+static void acc_start_work(struct work_struct *data)
+{
+       char *envp[2] = { "ACCESSORY=START", NULL };
+
+       kobject_uevent_env(&acc_device.this_device->kobj, KOBJ_CHANGE, envp);
+}
+
+static int acc_hid_init(struct acc_hid_dev *hdev)
+{
+       struct hid_device *hid;
+       int ret;
+
+       hid = hid_allocate_device();
+       if (IS_ERR(hid))
+               return PTR_ERR(hid);
+
+       hid->ll_driver = &acc_hid_ll_driver;
+       hid->dev.parent = acc_device.this_device;
+
+       hid->bus = BUS_USB;
+       hid->vendor = HID_ANY_ID;
+       hid->product = HID_ANY_ID;
+       hid->driver_data = hdev;
+       ret = hid_add_device(hid);
+       if (ret) {
+               pr_err("can't add hid device: %d\n", ret);
+               hid_destroy_device(hid);
+               return ret;
+       }
+
+       hdev->hid = hid;
+       return 0;
+}
+
+static void acc_hid_delete(struct acc_hid_dev *hid)
+{
+       kfree(hid->report_desc);
+       kfree(hid);
+}
+
+static void acc_hid_work(struct work_struct *data)
+{
+       struct acc_dev *dev = _acc_dev;
+       struct list_head        *entry, *temp;
+       struct acc_hid_dev *hid;
+       struct list_head        new_list, dead_list;
+       unsigned long flags;
+
+       INIT_LIST_HEAD(&new_list);
+
+       spin_lock_irqsave(&dev->lock, flags);
+
+       /* copy hids that are ready for initialization to new_list */
+       list_for_each_safe(entry, temp, &dev->new_hid_list) {
+               hid = list_entry(entry, struct acc_hid_dev, list);
+               if (hid->report_desc_offset == hid->report_desc_len)
+                       list_move(&hid->list, &new_list);
+       }
+
+       if (list_empty(&dev->dead_hid_list)) {
+               INIT_LIST_HEAD(&dead_list);
+       } else {
+               /* move all of dev->dead_hid_list to dead_list */
+               dead_list.prev = dev->dead_hid_list.prev;
+               dead_list.next = dev->dead_hid_list.next;
+               dead_list.next->prev = &dead_list;
+               dead_list.prev->next = &dead_list;
+               INIT_LIST_HEAD(&dev->dead_hid_list);
+       }
+
+       spin_unlock_irqrestore(&dev->lock, flags);
+
+       /* register new HID devices */
+       list_for_each_safe(entry, temp, &new_list) {
+               hid = list_entry(entry, struct acc_hid_dev, list);
+               if (acc_hid_init(hid)) {
+                       pr_err("can't add HID device %p\n", hid);
+                       acc_hid_delete(hid);
+               } else {
+                       spin_lock_irqsave(&dev->lock, flags);
+                       list_move(&hid->list, &dev->hid_list);
+                       spin_unlock_irqrestore(&dev->lock, flags);
+               }
+       }
+
+       /* remove dead HID devices */
+       list_for_each_safe(entry, temp, &dead_list) {
+               hid = list_entry(entry, struct acc_hid_dev, list);
+               list_del(&hid->list);
+               if (hid->hid)
+                       hid_destroy_device(hid->hid);
+               acc_hid_delete(hid);
+       }
+}
+
+static int acc_function_set_alt(struct usb_function *f,
+               unsigned intf, unsigned alt)
+{
+       struct acc_dev  *dev = func_to_dev(f);
+       struct usb_composite_dev *cdev = f->config->cdev;
+       int ret;
+
+       DBG(cdev, "acc_function_set_alt intf: %d alt: %d\n", intf, alt);
+
+       ret = config_ep_by_speed(cdev->gadget, f, dev->ep_in);
+       if (ret)
+               return ret;
+
+       ret = usb_ep_enable(dev->ep_in);
+       if (ret)
+               return ret;
+
+       ret = config_ep_by_speed(cdev->gadget, f, dev->ep_out);
+       if (ret)
+               return ret;
+
+       ret = usb_ep_enable(dev->ep_out);
+       if (ret) {
+               usb_ep_disable(dev->ep_in);
+               return ret;
+       }
+
+       dev->online = 1;
+       dev->disconnected = 0; /* if online then not disconnected */
+
+       /* readers may be blocked waiting for us to go online */
+       wake_up(&dev->read_wq);
+       return 0;
+}
+
+static void acc_function_disable(struct usb_function *f)
+{
+       struct acc_dev  *dev = func_to_dev(f);
+       struct usb_composite_dev        *cdev = dev->cdev;
+
+       DBG(cdev, "acc_function_disable\n");
+       acc_set_disconnected(dev); /* this now only sets disconnected */
+       dev->online = 0; /* so we now need to clear the online flag here too */
+       usb_ep_disable(dev->ep_in);
+       usb_ep_disable(dev->ep_out);
+
+       /* readers may be blocked waiting for us to go online */
+       wake_up(&dev->read_wq);
+
+       VDBG(cdev, "%s disabled\n", dev->function.name);
+}
+
+static int acc_setup(void)
+{
+       struct acc_dev *dev;
+       int ret;
+
+       dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+       if (!dev)
+               return -ENOMEM;
+
+       spin_lock_init(&dev->lock);
+       init_waitqueue_head(&dev->read_wq);
+       init_waitqueue_head(&dev->write_wq);
+       atomic_set(&dev->open_excl, 0);
+       INIT_LIST_HEAD(&dev->tx_idle);
+       INIT_LIST_HEAD(&dev->hid_list);
+       INIT_LIST_HEAD(&dev->new_hid_list);
+       INIT_LIST_HEAD(&dev->dead_hid_list);
+       INIT_DELAYED_WORK(&dev->start_work, acc_start_work);
+       INIT_WORK(&dev->hid_work, acc_hid_work);
+
+       /* _acc_dev must be set before calling usb_gadget_register_driver */
+       _acc_dev = dev;
+
+       ret = misc_register(&acc_device);
+       if (ret)
+               goto err;
+
+       return 0;
+
+err:
+       kfree(dev);
+       pr_err("USB accessory gadget driver failed to initialize\n");
+       return ret;
+}
+
+void acc_disconnect(void)
+{
+       /* unregister all HID devices if USB is disconnected */
+       kill_all_hid_devices(_acc_dev);
+}
+EXPORT_SYMBOL_GPL(acc_disconnect);
+
+static void acc_cleanup(void)
+{
+       misc_deregister(&acc_device);
+       kfree(_acc_dev);
+       _acc_dev = NULL;
+}
+static struct acc_instance *to_acc_instance(struct config_item *item)
+{
+       return container_of(to_config_group(item), struct acc_instance,
+               func_inst.group);
+}
+
+static void acc_attr_release(struct config_item *item)
+{
+       struct acc_instance *fi_acc = to_acc_instance(item);
+
+       usb_put_function_instance(&fi_acc->func_inst);
+}
+
+static struct configfs_item_operations acc_item_ops = {
+       .release        = acc_attr_release,
+};
+
+static struct config_item_type acc_func_type = {
+       .ct_item_ops    = &acc_item_ops,
+       .ct_owner       = THIS_MODULE,
+};
+
+static struct acc_instance *to_fi_acc(struct usb_function_instance *fi)
+{
+       return container_of(fi, struct acc_instance, func_inst);
+}
+
+static int acc_set_inst_name(struct usb_function_instance *fi, const char *name)
+{
+       struct acc_instance *fi_acc;
+       char *ptr;
+       int name_len;
+
+       name_len = strlen(name) + 1;
+       if (name_len > MAX_INST_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       ptr = kstrndup(name, name_len, GFP_KERNEL);
+       if (!ptr)
+               return -ENOMEM;
+
+       fi_acc = to_fi_acc(fi);
+       fi_acc->name = ptr;
+       return 0;
+}
+
+static void acc_free_inst(struct usb_function_instance *fi)
+{
+       struct acc_instance *fi_acc;
+
+       fi_acc = to_fi_acc(fi);
+       kfree(fi_acc->name);
+       acc_cleanup();
+}
+
+static struct usb_function_instance *acc_alloc_inst(void)
+{
+       struct acc_instance *fi_acc;
+       struct acc_dev *dev;
+       int err;
+
+       fi_acc = kzalloc(sizeof(*fi_acc), GFP_KERNEL);
+       if (!fi_acc)
+               return ERR_PTR(-ENOMEM);
+       fi_acc->func_inst.set_inst_name = acc_set_inst_name;
+       fi_acc->func_inst.free_func_inst = acc_free_inst;
+
+       err = acc_setup();
+       if (err) {
+               kfree(fi_acc);
+               pr_err("Error setting ACCESSORY\n");
+               return ERR_PTR(err);
+       }
+
+       config_group_init_type_name(&fi_acc->func_inst.group,
+                                       "", &acc_func_type);
+       dev = _acc_dev;
+       return  &fi_acc->func_inst;
+}
+
+static void acc_free(struct usb_function *f)
+{
+       /* no-op: no function-specific resource allocation in acc_alloc() */
+}
+
+int acc_ctrlrequest_configfs(struct usb_function *f,
+                       const struct usb_ctrlrequest *ctrl) {
+       if (f->config != NULL && f->config->cdev != NULL)
+               return acc_ctrlrequest(f->config->cdev, ctrl);
+       else
+               return -1;
+}
+
+static struct usb_function *acc_alloc(struct usb_function_instance *fi)
+{
+       struct acc_dev *dev = _acc_dev;
+
+       pr_info("acc_alloc\n");
+
+       dev->function.name = "accessory";
+       dev->function.strings = acc_strings;
+       dev->function.fs_descriptors = fs_acc_descs;
+       dev->function.hs_descriptors = hs_acc_descs;
+       dev->function.bind = acc_function_bind_configfs;
+       dev->function.unbind = acc_function_unbind;
+       dev->function.set_alt = acc_function_set_alt;
+       dev->function.disable = acc_function_disable;
+       dev->function.free_func = acc_free;
+       dev->function.setup = acc_ctrlrequest_configfs;
+
+       return &dev->function;
+}
+DECLARE_USB_FUNCTION_INIT(accessory, acc_alloc_inst, acc_alloc);
+MODULE_LICENSE("GPL");
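
For context, once the accessory function is bound and enabled, an Android userspace daemon typically drives it through the misc device registered above. A hypothetical sketch, assuming the uapi header linux/usb/f_accessory.h (which defines the ioctl values handled by acc_ioctl()) is exported to userspace; error handling trimmed:

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <unistd.h>
    #include <linux/usb/f_accessory.h>      /* uapi ioctls used by acc_ioctl() */

    int main(void)
    {
            char manufacturer[256] = "";
            char buf[16384];                /* matches BULK_BUFFER_SIZE above */
            ssize_t n;
            int fd = open("/dev/usb_accessory", O_RDWR);

            if (fd < 0)
                    return 1;

            /* valid once the host has issued ACCESSORY_SEND_STRING requests */
            if (ioctl(fd, ACCESSORY_GET_STRING_MANUFACTURER, manufacturer) >= 0)
                    printf("host manufacturer: %s\n", manufacturer);

            n = read(fd, buf, sizeof(buf)); /* blocks until the function is online */
            if (n > 0)
                    write(fd, buf, n);      /* echo the first transfer back */

            close(fd);
            return 0;
    }
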
diff --git a/drivers/usb/gadget/function/f_audio_source.c b/drivers/usb/gadget/function/f_audio_source.c
new file mode 100644 (file)
index 0000000..8124af3
--- /dev/null
@@ -0,0 +1,1071 @@
+/*
+ * Gadget Function Driver for USB audio source device
+ *
+ * Copyright (C) 2012 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/device.h>
+#include <linux/usb/audio.h>
+#include <linux/wait.h>
+#include <linux/pm_qos.h>
+#include <sound/core.h>
+#include <sound/initval.h>
+#include <sound/pcm.h>
+
+#include <linux/usb.h>
+#include <linux/usb_usual.h>
+#include <linux/usb/ch9.h>
+#include <linux/configfs.h>
+#include <linux/usb/composite.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#define SAMPLE_RATE 44100
+#define FRAMES_PER_MSEC (SAMPLE_RATE / 1000)
+
+#define IN_EP_MAX_PACKET_SIZE 256
+
+/* Number of requests to allocate */
+#define IN_EP_REQ_COUNT 4
+
+#define AUDIO_AC_INTERFACE     0
+#define AUDIO_AS_INTERFACE     1
+#define AUDIO_NUM_INTERFACES   2
+#define MAX_INST_NAME_LEN     40
+
+/* B.3.1  Standard AC Interface Descriptor */
+static struct usb_interface_descriptor ac_interface_desc = {
+       .bLength =              USB_DT_INTERFACE_SIZE,
+       .bDescriptorType =      USB_DT_INTERFACE,
+       .bNumEndpoints =        0,
+       .bInterfaceClass =      USB_CLASS_AUDIO,
+       .bInterfaceSubClass =   USB_SUBCLASS_AUDIOCONTROL,
+};
+
+DECLARE_UAC_AC_HEADER_DESCRIPTOR(2);
+
+#define UAC_DT_AC_HEADER_LENGTH        UAC_DT_AC_HEADER_SIZE(AUDIO_NUM_INTERFACES)
+/* 1 input terminal, 1 output terminal and 1 feature unit */
+#define UAC_DT_TOTAL_LENGTH (UAC_DT_AC_HEADER_LENGTH \
+       + UAC_DT_INPUT_TERMINAL_SIZE + UAC_DT_OUTPUT_TERMINAL_SIZE \
+       + UAC_DT_FEATURE_UNIT_SIZE(0))
+/* B.3.2  Class-Specific AC Interface Descriptor */
+static struct uac1_ac_header_descriptor_2 ac_header_desc = {
+       .bLength =              UAC_DT_AC_HEADER_LENGTH,
+       .bDescriptorType =      USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype =   UAC_HEADER,
+       .bcdADC =               __constant_cpu_to_le16(0x0100),
+       .wTotalLength =         __constant_cpu_to_le16(UAC_DT_TOTAL_LENGTH),
+       .bInCollection =        AUDIO_NUM_INTERFACES,
+       .baInterfaceNr = {
+               [0] =           AUDIO_AC_INTERFACE,
+               [1] =           AUDIO_AS_INTERFACE,
+       }
+};
+
+#define INPUT_TERMINAL_ID      1
+static struct uac_input_terminal_descriptor input_terminal_desc = {
+       .bLength =              UAC_DT_INPUT_TERMINAL_SIZE,
+       .bDescriptorType =      USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype =   UAC_INPUT_TERMINAL,
+       .bTerminalID =          INPUT_TERMINAL_ID,
+       .wTerminalType =        UAC_INPUT_TERMINAL_MICROPHONE,
+       .bAssocTerminal =       0,
+       .wChannelConfig =       0x3,
+};
+
+DECLARE_UAC_FEATURE_UNIT_DESCRIPTOR(0);
+
+#define FEATURE_UNIT_ID                2
+static struct uac_feature_unit_descriptor_0 feature_unit_desc = {
+       .bLength                = UAC_DT_FEATURE_UNIT_SIZE(0),
+       .bDescriptorType        = USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype     = UAC_FEATURE_UNIT,
+       .bUnitID                = FEATURE_UNIT_ID,
+       .bSourceID              = INPUT_TERMINAL_ID,
+       .bControlSize           = 2,
+};
+
+#define OUTPUT_TERMINAL_ID     3
+static struct uac1_output_terminal_descriptor output_terminal_desc = {
+       .bLength                = UAC_DT_OUTPUT_TERMINAL_SIZE,
+       .bDescriptorType        = USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype     = UAC_OUTPUT_TERMINAL,
+       .bTerminalID            = OUTPUT_TERMINAL_ID,
+       .wTerminalType          = UAC_TERMINAL_STREAMING,
+       .bAssocTerminal         = FEATURE_UNIT_ID,
+       .bSourceID              = FEATURE_UNIT_ID,
+};
+
+/* B.4.1  Standard AS Interface Descriptor */
+static struct usb_interface_descriptor as_interface_alt_0_desc = {
+       .bLength =              USB_DT_INTERFACE_SIZE,
+       .bDescriptorType =      USB_DT_INTERFACE,
+       .bAlternateSetting =    0,
+       .bNumEndpoints =        0,
+       .bInterfaceClass =      USB_CLASS_AUDIO,
+       .bInterfaceSubClass =   USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+static struct usb_interface_descriptor as_interface_alt_1_desc = {
+       .bLength =              USB_DT_INTERFACE_SIZE,
+       .bDescriptorType =      USB_DT_INTERFACE,
+       .bAlternateSetting =    1,
+       .bNumEndpoints =        1,
+       .bInterfaceClass =      USB_CLASS_AUDIO,
+       .bInterfaceSubClass =   USB_SUBCLASS_AUDIOSTREAMING,
+};
+
+/* B.4.2  Class-Specific AS Interface Descriptor */
+static struct uac1_as_header_descriptor as_header_desc = {
+       .bLength =              UAC_DT_AS_HEADER_SIZE,
+       .bDescriptorType =      USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype =   UAC_AS_GENERAL,
+       .bTerminalLink =        INPUT_TERMINAL_ID,
+       .bDelay =               1,
+       .wFormatTag =           UAC_FORMAT_TYPE_I_PCM,
+};
+
+DECLARE_UAC_FORMAT_TYPE_I_DISCRETE_DESC(1);
+
+static struct uac_format_type_i_discrete_descriptor_1 as_type_i_desc = {
+       .bLength =              UAC_FORMAT_TYPE_I_DISCRETE_DESC_SIZE(1),
+       .bDescriptorType =      USB_DT_CS_INTERFACE,
+       .bDescriptorSubtype =   UAC_FORMAT_TYPE,
+       .bFormatType =          UAC_FORMAT_TYPE_I,
+       .bSubframeSize =        2,
+       .bBitResolution =       16,
+       .bSamFreqType =         1,
+};
+
+/* Standard ISO IN Endpoint Descriptor for highspeed */
+static struct usb_endpoint_descriptor hs_as_in_ep_desc  = {
+       .bLength =              USB_DT_ENDPOINT_AUDIO_SIZE,
+       .bDescriptorType =      USB_DT_ENDPOINT,
+       .bEndpointAddress =     USB_DIR_IN,
+       .bmAttributes =         USB_ENDPOINT_SYNC_SYNC
+                               | USB_ENDPOINT_XFER_ISOC,
+       .wMaxPacketSize =       __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+       .bInterval =            4, /* poll 1 per millisecond */
+};
+
+/* Standard ISO IN Endpoint Descriptor for fullspeed */
+static struct usb_endpoint_descriptor fs_as_in_ep_desc  = {
+       .bLength =              USB_DT_ENDPOINT_AUDIO_SIZE,
+       .bDescriptorType =      USB_DT_ENDPOINT,
+       .bEndpointAddress =     USB_DIR_IN,
+       .bmAttributes =         USB_ENDPOINT_SYNC_SYNC
+                               | USB_ENDPOINT_XFER_ISOC,
+       .wMaxPacketSize =       __constant_cpu_to_le16(IN_EP_MAX_PACKET_SIZE),
+       .bInterval =            1, /* poll 1 per millisecond */
+};
+
+/* Class-specific AS ISO IN Endpoint Descriptor */
+static struct uac_iso_endpoint_descriptor as_iso_in_desc = {
+       .bLength =              UAC_ISO_ENDPOINT_DESC_SIZE,
+       .bDescriptorType =      USB_DT_CS_ENDPOINT,
+       .bDescriptorSubtype =   UAC_EP_GENERAL,
+       .bmAttributes =         1,
+       .bLockDelayUnits =      1,
+       .wLockDelay =           __constant_cpu_to_le16(1),
+};
+
+static struct usb_descriptor_header *hs_audio_desc[] = {
+       (struct usb_descriptor_header *)&ac_interface_desc,
+       (struct usb_descriptor_header *)&ac_header_desc,
+
+       (struct usb_descriptor_header *)&input_terminal_desc,
+       (struct usb_descriptor_header *)&output_terminal_desc,
+       (struct usb_descriptor_header *)&feature_unit_desc,
+
+       (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+       (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+       (struct usb_descriptor_header *)&as_header_desc,
+
+       (struct usb_descriptor_header *)&as_type_i_desc,
+
+       (struct usb_descriptor_header *)&hs_as_in_ep_desc,
+       (struct usb_descriptor_header *)&as_iso_in_desc,
+       NULL,
+};
+
+static struct usb_descriptor_header *fs_audio_desc[] = {
+       (struct usb_descriptor_header *)&ac_interface_desc,
+       (struct usb_descriptor_header *)&ac_header_desc,
+
+       (struct usb_descriptor_header *)&input_terminal_desc,
+       (struct usb_descriptor_header *)&output_terminal_desc,
+       (struct usb_descriptor_header *)&feature_unit_desc,
+
+       (struct usb_descriptor_header *)&as_interface_alt_0_desc,
+       (struct usb_descriptor_header *)&as_interface_alt_1_desc,
+       (struct usb_descriptor_header *)&as_header_desc,
+
+       (struct usb_descriptor_header *)&as_type_i_desc,
+
+       (struct usb_descriptor_header *)&fs_as_in_ep_desc,
+       (struct usb_descriptor_header *)&as_iso_in_desc,
+       NULL,
+};
+
+static struct snd_pcm_hardware audio_hw_info = {
+       .info =                 SNDRV_PCM_INFO_MMAP |
+                               SNDRV_PCM_INFO_MMAP_VALID |
+                               SNDRV_PCM_INFO_BATCH |
+                               SNDRV_PCM_INFO_INTERLEAVED |
+                               SNDRV_PCM_INFO_BLOCK_TRANSFER,
+
+       .formats                = SNDRV_PCM_FMTBIT_S16_LE,
+       .channels_min           = 2,
+       .channels_max           = 2,
+       .rate_min               = SAMPLE_RATE,
+       .rate_max               = SAMPLE_RATE,
+
+       .buffer_bytes_max =     1024 * 1024,
+       .period_bytes_min =     64,
+       .period_bytes_max =     512 * 1024,
+       .periods_min =          2,
+       .periods_max =          1024,
+};
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_config {
+       int     card;
+       int     device;
+};
+
+struct audio_dev {
+       struct usb_function             func;
+       struct snd_card                 *card;
+       struct snd_pcm                  *pcm;
+       struct snd_pcm_substream *substream;
+
+       struct list_head                idle_reqs;
+       struct usb_ep                   *in_ep;
+
+       spinlock_t                      lock;
+
+       /* beginning, end and current position in our buffer */
+       void                            *buffer_start;
+       void                            *buffer_end;
+       void                            *buffer_pos;
+
+       /* byte size of a "period" */
+       unsigned int                    period;
+       /* bytes sent since last call to snd_pcm_period_elapsed */
+       unsigned int                    period_offset;
+       /* time we started playing */
+       ktime_t                         start_time;
+       /* number of frames sent since start_time */
+       s64                             frames_sent;
+       struct audio_source_config      *config;
+       /* for creating and issuing QoS requests */
+       struct pm_qos_request pm_qos;
+};
+
+static inline struct audio_dev *func_to_audio(struct usb_function *f)
+{
+       return container_of(f, struct audio_dev, func);
+}
+
+/*-------------------------------------------------------------------------*/
+
+struct audio_source_instance {
+       struct usb_function_instance func_inst;
+       const char *name;
+       struct audio_source_config *config;
+       struct device *audio_device;
+};
+
+static void audio_source_attr_release(struct config_item *item);
+
+static struct configfs_item_operations audio_source_item_ops = {
+       .release        = audio_source_attr_release,
+};
+
+static struct config_item_type audio_source_func_type = {
+       .ct_item_ops    = &audio_source_item_ops,
+       .ct_owner       = THIS_MODULE,
+};
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+               struct device_attribute *attr, char *buf);
+
+static DEVICE_ATTR(pcm, S_IRUGO, audio_source_pcm_show, NULL);
+
+static struct device_attribute *audio_source_function_attributes[] = {
+       &dev_attr_pcm,
+       NULL
+};
+
+/*--------------------------------------------------------------------------*/
+
+static struct usb_request *audio_request_new(struct usb_ep *ep, int buffer_size)
+{
+       struct usb_request *req = usb_ep_alloc_request(ep, GFP_KERNEL);
+
+       if (!req)
+               return NULL;
+
+       req->buf = kmalloc(buffer_size, GFP_KERNEL);
+       if (!req->buf) {
+               usb_ep_free_request(ep, req);
+               return NULL;
+       }
+       req->length = buffer_size;
+       return req;
+}
+
+static void audio_request_free(struct usb_request *req, struct usb_ep *ep)
+{
+       if (req) {
+               kfree(req->buf);
+               usb_ep_free_request(ep, req);
+       }
+}
+
+static void audio_req_put(struct audio_dev *audio, struct usb_request *req)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&audio->lock, flags);
+       list_add_tail(&req->list, &audio->idle_reqs);
+       spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static struct usb_request *audio_req_get(struct audio_dev *audio)
+{
+       unsigned long flags;
+       struct usb_request *req;
+
+       spin_lock_irqsave(&audio->lock, flags);
+       if (list_empty(&audio->idle_reqs)) {
+               req = NULL;
+       } else {
+               req = list_first_entry(&audio->idle_reqs, struct usb_request,
+                               list);
+               list_del(&req->list);
+       }
+       spin_unlock_irqrestore(&audio->lock, flags);
+       return req;
+}
+
+/* send the appropriate number of packets to match our bitrate */
+static void audio_send(struct audio_dev *audio)
+{
+       struct snd_pcm_runtime *runtime;
+       struct usb_request *req;
+       int length, length1, length2, ret;
+       s64 msecs;
+       s64 frames;
+       ktime_t now;
+
+       /* audio->substream will be null if we have been closed */
+       if (!audio->substream)
+               return;
+       /* audio->buffer_pos will be null if we have been stopped */
+       if (!audio->buffer_pos)
+               return;
+
+       runtime = audio->substream->runtime;
+
+       /* compute number of frames to send */
+       now = ktime_get();
+       msecs = div_s64((ktime_to_ns(now) - ktime_to_ns(audio->start_time)),
+                       1000000);
+       frames = div_s64((msecs * SAMPLE_RATE), 1000);
+
+       /* Readjust our frames_sent if we fall too far behind.
+        * If we get too far behind it is better to drop some frames than
+        * to keep sending data too fast in an attempt to catch up.
+        */
+       if (frames - audio->frames_sent > 10 * FRAMES_PER_MSEC)
+               audio->frames_sent = frames - FRAMES_PER_MSEC;
+
+       frames -= audio->frames_sent;
+
+       /* We need to send something to keep the pipeline going */
+       if (frames <= 0)
+               frames = FRAMES_PER_MSEC;
+
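+       /* queue idle requests until enough frames are in flight, one packet each */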
+       while (frames > 0) {
+               req = audio_req_get(audio);
+               if (!req)
+                       break;
+
+               length = frames_to_bytes(runtime, frames);
+               if (length > IN_EP_MAX_PACKET_SIZE)
+                       length = IN_EP_MAX_PACKET_SIZE;
+
+               if (audio->buffer_pos + length > audio->buffer_end)
+                       length1 = audio->buffer_end - audio->buffer_pos;
+               else
+                       length1 = length;
+               memcpy(req->buf, audio->buffer_pos, length1);
+               if (length1 < length) {
+                       /* Wrap around and copy remaining length
+                        * at beginning of buffer.
+                        */
+                       length2 = length - length1;
+                       memcpy(req->buf + length1, audio->buffer_start,
+                                       length2);
+                       audio->buffer_pos = audio->buffer_start + length2;
+               } else {
+                       audio->buffer_pos += length1;
+                       if (audio->buffer_pos >= audio->buffer_end)
+                               audio->buffer_pos = audio->buffer_start;
+               }
+
+               req->length = length;
+               ret = usb_ep_queue(audio->in_ep, req, GFP_ATOMIC);
+               if (ret < 0) {
+                       pr_err("usb_ep_queue failed ret: %d\n", ret);
+                       audio_req_put(audio, req);
+                       break;
+               }
+
+               frames -= bytes_to_frames(runtime, length);
+               audio->frames_sent += bytes_to_frames(runtime, length);
+       }
+}
+
+static void audio_control_complete(struct usb_ep *ep, struct usb_request *req)
+{
+       /* nothing to do here */
+}
+
+static void audio_data_complete(struct usb_ep *ep, struct usb_request *req)
+{
+       struct audio_dev *audio = req->context;
+
+       pr_debug("audio_data_complete req->status %d req->actual %d\n",
+               req->status, req->actual);
+
+       audio_req_put(audio, req);
+
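+       /* buffer_start is cleared when playback stops, so skip progress accounting then */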
+       if (!audio->buffer_start || req->status)
+               return;
+
+       audio->period_offset += req->actual;
+       if (audio->period_offset >= audio->period) {
+               snd_pcm_period_elapsed(audio->substream);
+               audio->period_offset = 0;
+       }
+       audio_send(audio);
+}
+
+static int audio_set_endpoint_req(struct usb_function *f,
+               const struct usb_ctrlrequest *ctrl)
+{
+       int value = -EOPNOTSUPP;
+       u16 ep = le16_to_cpu(ctrl->wIndex);
+       u16 len = le16_to_cpu(ctrl->wLength);
+       u16 w_value = le16_to_cpu(ctrl->wValue);
+
+       pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+                       ctrl->bRequest, w_value, len, ep);
+
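+       /* accept (and ignore) the written value; our sample rate is fixed */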
+       switch (ctrl->bRequest) {
+       case UAC_SET_CUR:
+       case UAC_SET_MIN:
+       case UAC_SET_MAX:
+       case UAC_SET_RES:
+               value = len;
+               break;
+       default:
+               break;
+       }
+
+       return value;
+}
+
+static int audio_get_endpoint_req(struct usb_function *f,
+               const struct usb_ctrlrequest *ctrl)
+{
+       struct usb_composite_dev *cdev = f->config->cdev;
+       int value = -EOPNOTSUPP;
+       u8 ep = ((le16_to_cpu(ctrl->wIndex) >> 8) & 0xFF);
+       u16 len = le16_to_cpu(ctrl->wLength);
+       u16 w_value = le16_to_cpu(ctrl->wValue);
+       u8 *buf = cdev->req->buf;
+
+       pr_debug("bRequest 0x%x, w_value 0x%04x, len %d, endpoint %d\n",
+                       ctrl->bRequest, w_value, len, ep);
+
+       if (w_value == UAC_EP_CS_ATTR_SAMPLE_RATE << 8) {
+               switch (ctrl->bRequest) {
+               case UAC_GET_CUR:
+               case UAC_GET_MIN:
+               case UAC_GET_MAX:
+               case UAC_GET_RES:
+                       /* return our sample rate */
+                       buf[0] = (u8)SAMPLE_RATE;
+                       buf[1] = (u8)(SAMPLE_RATE >> 8);
+                       buf[2] = (u8)(SAMPLE_RATE >> 16);
+                       value = 3;
+                       break;
+               default:
+                       break;
+               }
+       }
+
+       return value;
+}
+
+static int
+audio_setup(struct usb_function *f, const struct usb_ctrlrequest *ctrl)
+{
+       struct usb_composite_dev *cdev = f->config->cdev;
+       struct usb_request *req = cdev->req;
+       int value = -EOPNOTSUPP;
+       u16 w_index = le16_to_cpu(ctrl->wIndex);
+       u16 w_value = le16_to_cpu(ctrl->wValue);
+       u16 w_length = le16_to_cpu(ctrl->wLength);
+
+       /* composite driver infrastructure handles everything; interface
+        * activation uses set_alt().
+        */
+       switch (ctrl->bRequestType) {
+       case USB_DIR_OUT | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+               value = audio_set_endpoint_req(f, ctrl);
+               break;
+
+       case USB_DIR_IN | USB_TYPE_CLASS | USB_RECIP_ENDPOINT:
+               value = audio_get_endpoint_req(f, ctrl);
+               break;
+       }
+
+       /* respond with data transfer or status phase? */
+       if (value >= 0) {
+               pr_debug("audio req%02x.%02x v%04x i%04x l%d\n",
+                       ctrl->bRequestType, ctrl->bRequest,
+                       w_value, w_index, w_length);
+               req->zero = 0;
+               req->length = value;
+               req->complete = audio_control_complete;
+               value = usb_ep_queue(cdev->gadget->ep0, req, GFP_ATOMIC);
+               if (value < 0)
+                       pr_err("audio response on err %d\n", value);
+       }
+
+       /* device either stalls (value < 0) or reports success */
+       return value;
+}
+
+static int audio_set_alt(struct usb_function *f, unsigned intf, unsigned alt)
+{
+       struct audio_dev *audio = func_to_audio(f);
+       struct usb_composite_dev *cdev = f->config->cdev;
+       int ret;
+
+       pr_debug("audio_set_alt intf %d, alt %d\n", intf, alt);
+
+       ret = config_ep_by_speed(cdev->gadget, f, audio->in_ep);
+       if (ret)
+               return ret;
+
+       usb_ep_enable(audio->in_ep);
+       return 0;
+}
+
+static void audio_disable(struct usb_function *f)
+{
+       struct audio_dev        *audio = func_to_audio(f);
+
+       pr_debug("audio_disable\n");
+       usb_ep_disable(audio->in_ep);
+}
+
+static void audio_free_func(struct usb_function *f)
+{
+       /* no-op */
+}
+
+/*-------------------------------------------------------------------------*/
+
+static void audio_build_desc(struct audio_dev *audio)
+{
+       u8 *sam_freq;
+       int rate;
+
+       /* Set channel numbers */
+       input_terminal_desc.bNrChannels = 2;
+       as_type_i_desc.bNrChannels = 2;
+
+       /* Set sample rates */
+       rate = SAMPLE_RATE;
+       sam_freq = as_type_i_desc.tSamFreq[0];
+       memcpy(sam_freq, &rate, 3);
+}
+
+
+static int snd_card_setup(struct usb_configuration *c,
+       struct audio_source_config *config);
+static struct audio_source_instance *to_fi_audio_source(
+       const struct usb_function_instance *fi);
+
+
+/* audio function driver setup/binding */
+static int
+audio_bind(struct usb_configuration *c, struct usb_function *f)
+{
+       struct usb_composite_dev *cdev = c->cdev;
+       struct audio_dev *audio = func_to_audio(f);
+       int status;
+       struct usb_ep *ep;
+       struct usb_request *req;
+       int i;
+       int err;
+
+       if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+               struct audio_source_instance *fi_audio =
+                               to_fi_audio_source(f->fi);
+               struct audio_source_config *config =
+                               fi_audio->config;
+
+               err = snd_card_setup(c, config);
+               if (err)
+                       return err;
+       }
+
+       audio_build_desc(audio);
+
+       /* allocate instance-specific interface IDs, and patch descriptors */
+       status = usb_interface_id(c, f);
+       if (status < 0)
+               goto fail;
+       ac_interface_desc.bInterfaceNumber = status;
+
+       /* AUDIO_AC_INTERFACE */
+       ac_header_desc.baInterfaceNr[0] = status;
+
+       status = usb_interface_id(c, f);
+       if (status < 0)
+               goto fail;
+       as_interface_alt_0_desc.bInterfaceNumber = status;
+       as_interface_alt_1_desc.bInterfaceNumber = status;
+
+       /* AUDIO_AS_INTERFACE */
+       ac_header_desc.baInterfaceNr[1] = status;
+
+       status = -ENODEV;
+
+       /* allocate our endpoint */
+       ep = usb_ep_autoconfig(cdev->gadget, &fs_as_in_ep_desc);
+       if (!ep)
+               goto fail;
+       audio->in_ep = ep;
+       ep->driver_data = audio; /* claim */
+
+       if (gadget_is_dualspeed(c->cdev->gadget))
+               hs_as_in_ep_desc.bEndpointAddress =
+                       fs_as_in_ep_desc.bEndpointAddress;
+
+       f->fs_descriptors = fs_audio_desc;
+       f->hs_descriptors = hs_audio_desc;
+
+       for (i = 0, status = 0; i < IN_EP_REQ_COUNT && status == 0; i++) {
+               req = audio_request_new(ep, IN_EP_MAX_PACKET_SIZE);
+               if (req) {
+                       req->context = audio;
+                       req->complete = audio_data_complete;
+                       audio_req_put(audio, req);
+               } else {
+                       status = -ENOMEM;
+               }
+       }
+
+fail:
+       return status;
+}
+
+static void
+audio_unbind(struct usb_configuration *c, struct usb_function *f)
+{
+       struct audio_dev *audio = func_to_audio(f);
+       struct usb_request *req;
+
+       while ((req = audio_req_get(audio)))
+               audio_request_free(req, audio->in_ep);
+
+       snd_card_free_when_closed(audio->card);
+       audio->card = NULL;
+       audio->pcm = NULL;
+       audio->substream = NULL;
+       audio->in_ep = NULL;
+
+       if (IS_ENABLED(CONFIG_USB_CONFIGFS)) {
+               struct audio_source_instance *fi_audio =
+                               to_fi_audio_source(f->fi);
+               struct audio_source_config *config =
+                               fi_audio->config;
+
+               config->card = -1;
+               config->device = -1;
+       }
+}
+
+static void audio_pcm_playback_start(struct audio_dev *audio)
+{
+       audio->start_time = ktime_get();
+       audio->frames_sent = 0;
+       audio_send(audio);
+}
+
+static void audio_pcm_playback_stop(struct audio_dev *audio)
+{
+       unsigned long flags;
+
+       spin_lock_irqsave(&audio->lock, flags);
+       audio->buffer_start = NULL;
+       audio->buffer_end = NULL;
+       audio->buffer_pos = NULL;
+       spin_unlock_irqrestore(&audio->lock, flags);
+}
+
+static int audio_pcm_open(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct audio_dev *audio = substream->private_data;
+
+       runtime->private_data = audio;
+       runtime->hw = audio_hw_info;
+       snd_pcm_limit_hw_rates(runtime);
+       runtime->hw.channels_max = 2;
+
+       audio->substream = substream;
+
+       /* Add the QoS request and set the latency to 0 */
+       pm_qos_add_request(&audio->pm_qos, PM_QOS_CPU_DMA_LATENCY, 0);
+
+       return 0;
+}
+
+static int audio_pcm_close(struct snd_pcm_substream *substream)
+{
+       struct audio_dev *audio = substream->private_data;
+       unsigned long flags;
+
+       spin_lock_irqsave(&audio->lock, flags);
+
+       /* Remove the QoS request */
+       pm_qos_remove_request(&audio->pm_qos);
+
+       audio->substream = NULL;
+       spin_unlock_irqrestore(&audio->lock, flags);
+
+       return 0;
+}
+
+static int audio_pcm_hw_params(struct snd_pcm_substream *substream,
+                               struct snd_pcm_hw_params *params)
+{
+       unsigned int channels = params_channels(params);
+       unsigned int rate = params_rate(params);
+
+       if (rate != SAMPLE_RATE)
+               return -EINVAL;
+       if (channels != 2)
+               return -EINVAL;
+
+       return snd_pcm_lib_alloc_vmalloc_buffer(substream,
+               params_buffer_bytes(params));
+}
+
+static int audio_pcm_hw_free(struct snd_pcm_substream *substream)
+{
+       return snd_pcm_lib_free_vmalloc_buffer(substream);
+}
+
+static int audio_pcm_prepare(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct audio_dev *audio = runtime->private_data;
+
+       audio->period = snd_pcm_lib_period_bytes(substream);
+       audio->period_offset = 0;
+       audio->buffer_start = runtime->dma_area;
+       audio->buffer_end = audio->buffer_start
+               + snd_pcm_lib_buffer_bytes(substream);
+       audio->buffer_pos = audio->buffer_start;
+
+       return 0;
+}
+
+static snd_pcm_uframes_t audio_pcm_pointer(struct snd_pcm_substream *substream)
+{
+       struct snd_pcm_runtime *runtime = substream->runtime;
+       struct audio_dev *audio = runtime->private_data;
+       ssize_t bytes = audio->buffer_pos - audio->buffer_start;
+
+       /* return offset of next frame to fill in our buffer */
+       return bytes_to_frames(runtime, bytes);
+}
+
+static int audio_pcm_playback_trigger(struct snd_pcm_substream *substream,
+                                       int cmd)
+{
+       struct audio_dev *audio = substream->runtime->private_data;
+       int ret = 0;
+
+       switch (cmd) {
+       case SNDRV_PCM_TRIGGER_START:
+       case SNDRV_PCM_TRIGGER_RESUME:
+               audio_pcm_playback_start(audio);
+               break;
+
+       case SNDRV_PCM_TRIGGER_STOP:
+       case SNDRV_PCM_TRIGGER_SUSPEND:
+               audio_pcm_playback_stop(audio);
+               break;
+
+       default:
+               ret = -EINVAL;
+       }
+
+       return ret;
+}
+
+static struct audio_dev _audio_dev = {
+       .func = {
+               .name = "audio_source",
+               .bind = audio_bind,
+               .unbind = audio_unbind,
+               .set_alt = audio_set_alt,
+               .setup = audio_setup,
+               .disable = audio_disable,
+               .free_func = audio_free_func,
+       },
+       .lock = __SPIN_LOCK_UNLOCKED(_audio_dev.lock),
+       .idle_reqs = LIST_HEAD_INIT(_audio_dev.idle_reqs),
+};
+
+static struct snd_pcm_ops audio_playback_ops = {
+       .open           = audio_pcm_open,
+       .close          = audio_pcm_close,
+       .ioctl          = snd_pcm_lib_ioctl,
+       .hw_params      = audio_pcm_hw_params,
+       .hw_free        = audio_pcm_hw_free,
+       .prepare        = audio_pcm_prepare,
+       .trigger        = audio_pcm_playback_trigger,
+       .pointer        = audio_pcm_pointer,
+};
+
+int audio_source_bind_config(struct usb_configuration *c,
+               struct audio_source_config *config)
+{
+       struct audio_dev *audio;
+       int err;
+
+       config->card = -1;
+       config->device = -1;
+
+       audio = &_audio_dev;
+
+       err = snd_card_setup(c, config);
+       if (err)
+               return err;
+
+       err = usb_add_function(c, &audio->func);
+       if (err)
+               goto add_fail;
+
+       return 0;
+
+add_fail:
+       snd_card_free(audio->card);
+       return err;
+}
+
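+/* Create and register the ALSA card and PCM device backing the audio source function */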
+static int snd_card_setup(struct usb_configuration *c,
+               struct audio_source_config *config)
+{
+       struct audio_dev *audio;
+       struct snd_card *card;
+       struct snd_pcm *pcm;
+       int err;
+
+       audio = &_audio_dev;
+
+       err = snd_card_new(&c->cdev->gadget->dev,
+                       SNDRV_DEFAULT_IDX1, SNDRV_DEFAULT_STR1,
+                       THIS_MODULE, 0, &card);
+       if (err)
+               return err;
+
+       err = snd_pcm_new(card, "USB audio source", 0, 1, 0, &pcm);
+       if (err)
+               goto pcm_fail;
+
+       pcm->private_data = audio;
+       pcm->info_flags = 0;
+       audio->pcm = pcm;
+
+       strlcpy(pcm->name, "USB gadget audio", sizeof(pcm->name));
+
+       snd_pcm_set_ops(pcm, SNDRV_PCM_STREAM_PLAYBACK, &audio_playback_ops);
+       snd_pcm_lib_preallocate_pages_for_all(pcm, SNDRV_DMA_TYPE_DEV,
+                               NULL, 0, 64 * 1024);
+
+       strlcpy(card->driver, "audio_source", sizeof(card->driver));
+       strlcpy(card->shortname, card->driver, sizeof(card->shortname));
+       strlcpy(card->longname, "USB accessory audio source",
+               sizeof(card->longname));
+
+       err = snd_card_register(card);
+       if (err)
+               goto register_fail;
+
+       config->card = pcm->card->number;
+       config->device = pcm->device;
+       audio->card = card;
+       return 0;
+
+register_fail:
+pcm_fail:
+       snd_card_free(card);
+       return err;
+}
+
+static struct audio_source_instance *to_audio_source_instance(
+                                       struct config_item *item)
+{
+       return container_of(to_config_group(item), struct audio_source_instance,
+               func_inst.group);
+}
+
+static struct audio_source_instance *to_fi_audio_source(
+                                       const struct usb_function_instance *fi)
+{
+       return container_of(fi, struct audio_source_instance, func_inst);
+}
+
+static void audio_source_attr_release(struct config_item *item)
+{
+       struct audio_source_instance *fi_audio = to_audio_source_instance(item);
+
+       usb_put_function_instance(&fi_audio->func_inst);
+}
+
+static int audio_source_set_inst_name(struct usb_function_instance *fi,
+                                       const char *name)
+{
+       struct audio_source_instance *fi_audio;
+       char *ptr;
+       int name_len;
+
+       name_len = strlen(name) + 1;
+       if (name_len > MAX_INST_NAME_LEN)
+               return -ENAMETOOLONG;
+
+       ptr = kstrndup(name, name_len, GFP_KERNEL);
+       if (!ptr)
+               return -ENOMEM;
+
+       fi_audio = to_fi_audio_source(fi);
+       fi_audio->name = ptr;
+
+       return 0;
+}
+
+static void audio_source_free_inst(struct usb_function_instance *fi)
+{
+       struct audio_source_instance *fi_audio;
+
+       fi_audio = to_fi_audio_source(fi);
+       device_destroy(fi_audio->audio_device->class,
+                       fi_audio->audio_device->devt);
+       kfree(fi_audio->name);
+       kfree(fi_audio->config);
+}
+
+static ssize_t audio_source_pcm_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct audio_source_instance *fi_audio = dev_get_drvdata(dev);
+       struct audio_source_config *config = fi_audio->config;
+
+       /* print PCM card and device numbers */
+       return sprintf(buf, "%d %d\n", config->card, config->device);
+}
+
+struct device *create_function_device(char *name);
+
+static struct usb_function_instance *audio_source_alloc_inst(void)
+{
+       struct audio_source_instance *fi_audio;
+       struct device_attribute **attrs;
+       struct device_attribute *attr;
+       struct device *dev;
+       void *err_ptr;
+       int err = 0;
+
+       fi_audio = kzalloc(sizeof(*fi_audio), GFP_KERNEL);
+       if (!fi_audio)
+               return ERR_PTR(-ENOMEM);
+
+       fi_audio->func_inst.set_inst_name = audio_source_set_inst_name;
+       fi_audio->func_inst.free_func_inst = audio_source_free_inst;
+
+       fi_audio->config = kzalloc(sizeof(struct audio_source_config),
+                                                       GFP_KERNEL);
+       if (!fi_audio->config) {
+               err_ptr = ERR_PTR(-ENOMEM);
+               goto fail_audio;
+       }
+
+       config_group_init_type_name(&fi_audio->func_inst.group, "",
+                                               &audio_source_func_type);
+       dev = create_function_device("f_audio_source");
+
+       if (IS_ERR(dev)) {
+               err_ptr = dev;
+               goto fail_audio_config;
+       }
+
+       fi_audio->config->card = -1;
+       fi_audio->config->device = -1;
+       fi_audio->audio_device = dev;
+
+       attrs = audio_source_function_attributes;
+       if (attrs) {
+               while ((attr = *attrs++) && !err)
+                       err = device_create_file(dev, attr);
+               if (err) {
+                       err_ptr = ERR_PTR(-EINVAL);
+                       goto fail_device;
+               }
+       }
+
+       dev_set_drvdata(dev, fi_audio);
+       _audio_dev.config = fi_audio->config;
+
+       return  &fi_audio->func_inst;
+
+fail_device:
+       device_destroy(dev->class, dev->devt);
+fail_audio_config:
+       kfree(fi_audio->config);
+fail_audio:
+       kfree(fi_audio);
+       return err_ptr;
+}
+
+static struct usb_function *audio_source_alloc(struct usb_function_instance *fi)
+{
+       return &_audio_dev.func;
+}
+
+DECLARE_USB_FUNCTION_INIT(audio_source, audio_source_alloc_inst,
+                       audio_source_alloc);
+MODULE_LICENSE("GPL");
index 5d3d7941d2c2227f5ae9d9f219d34bfd0156665f..7d2e5e6e6ba2ee868bff352001471145b9d7f19b 100644 (file)
@@ -1207,6 +1207,65 @@ static void f_midi_free_inst(struct usb_function_instance *f)
        kfree(opts);
 }
 
+#ifdef CONFIG_USB_CONFIGFS_UEVENT
+extern struct device *create_function_device(char *name);
+static ssize_t alsa_show(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct usb_function_instance *fi_midi = dev_get_drvdata(dev);
+       struct f_midi *midi;
+
+       if (!fi_midi || !fi_midi->f)
+               dev_warn(dev, "f_midi: function not set\n");
+
+       if (fi_midi && fi_midi->f) {
+               midi = func_to_midi(fi_midi->f);
+               if (midi->rmidi && midi->rmidi->card)
+                       return sprintf(buf, "%d %d\n",
+                       midi->rmidi->card->number, midi->rmidi->device);
+       }
+
+       /* no ALSA card/device to report yet */
+       return sprintf(buf, "%d %d\n", -1, -1);
+}
+
+static DEVICE_ATTR(alsa, S_IRUGO, alsa_show, NULL);
+
+static struct device_attribute *alsa_function_attributes[] = {
+       &dev_attr_alsa,
+       NULL
+};
+
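+/* Expose an "alsa" attribute on the configfs function device reporting card/device numbers */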
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+       struct device *dev;
+       struct device_attribute **attrs;
+       struct device_attribute *attr;
+       int err = 0;
+
+       dev = create_function_device("f_midi");
+       if (IS_ERR(dev))
+               return PTR_ERR(dev);
+
+       attrs = alsa_function_attributes;
+       if (attrs) {
+               while ((attr = *attrs++) && !err)
+                       err = device_create_file(dev, attr);
+               if (err) {
+                       device_destroy(dev->class, dev->devt);
+                       return -EINVAL;
+               }
+       }
+       dev_set_drvdata(dev, fi);
+       return 0;
+}
+#else
+static int create_alsa_device(struct usb_function_instance *fi)
+{
+       return 0;
+}
+#endif
+
 static struct usb_function_instance *f_midi_alloc_inst(void)
 {
        struct f_midi_opts *opts;
@@ -1224,6 +1283,11 @@ static struct usb_function_instance *f_midi_alloc_inst(void)
        opts->in_ports = 1;
        opts->out_ports = 1;
 
+       if (create_alsa_device(&opts->func_inst)) {
+               kfree(opts);
+               return ERR_PTR(-ENODEV);
+       }
+
        config_group_init_type_name(&opts->func_inst.group, "",
                                    &midi_func_type);
 
@@ -1242,6 +1306,7 @@ static void f_midi_free(struct usb_function *f)
                kfree(midi->id);
                kfifo_free(&midi->in_req_fifo);
                kfree(midi);
+               opts->func_inst.f = NULL;
                --opts->refcnt;
        }
        mutex_unlock(&opts->lock);
@@ -1328,6 +1393,7 @@ static struct usb_function *f_midi_alloc(struct usb_function_instance *fi)
        midi->func.disable      = f_midi_disable;
        midi->func.free_func    = f_midi_free;
 
+       fi->f = &midi->func;
        return &midi->func;
 
 setup_fail:
index aff702c0eb9fb5b83e210258d21f52827d75e134..15e822e55a726170e8f1c665da2c92c7cc6ebc3d 100644 (file)
@@ -7,6 +7,14 @@ config USB_PHY
        select EXTCON
        def_bool n
 
+config USB_OTG_WAKELOCK
+       bool "Hold a wakelock when USB connected"
+       depends on PM_WAKELOCKS
+       select USB_OTG_UTILS
+       help
+         Select this to automatically hold a wakelock when USB is
+         connected, preventing suspend.
+
 #
 # USB Transceiver Drivers
 #
@@ -202,4 +210,13 @@ config USB_ULPI_VIEWPORT
          Provides read/write operations to the ULPI phy register set for
          controllers with a viewport register (e.g. Chipidea/ARC controllers).
 
+config DUAL_ROLE_USB_INTF
+       bool "Generic DUAL ROLE sysfs interface"
+       depends on SYSFS && USB_PHY
+       help
+         A generic sysfs interface to track and change the state of
+         dual role USB PHYs. USB PHY drivers can register with this
+         interface to expose their capabilities to userspace and
+         thereby allow userspace to change the port mode.
+
 endmenu
index 0c40ccc906311f6876cb3e0035b8650bc66d010e..68867d61128201d20ae68e021f4de9a984ff423f 100644 (file)
@@ -4,6 +4,8 @@
 #
 obj-$(CONFIG_USB_PHY)                  += phy.o
 obj-$(CONFIG_OF)                       += of.o
+obj-$(CONFIG_USB_OTG_WAKELOCK)         += otg-wakelock.o
+obj-$(CONFIG_DUAL_ROLE_USB_INTF)       += class-dual-role.o
 
 # transceiver drivers, keep the list sorted
 
diff --git a/drivers/usb/phy/class-dual-role.c b/drivers/usb/phy/class-dual-role.c
new file mode 100644 (file)
index 0000000..51fcb54
--- /dev/null
@@ -0,0 +1,529 @@
+/*
+ * class-dual-role.c
+ *
+ * Copyright (C) 2015 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/ctype.h>
+#include <linux/device.h>
+#include <linux/usb/class-dual-role.h>
+#include <linux/err.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/stat.h>
+#include <linux/types.h>
+
+#define DUAL_ROLE_NOTIFICATION_TIMEOUT 2000
+
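+/*
+ * Each registered instance shows up as /sys/class/dual_role_usb/<name>/ with
+ * attributes such as "mode", "power_role" and "data_role"; writing one of the
+ * recognised strings (e.g. "dfp" to "mode") asks the PHY driver to switch
+ * roles.
+ */
+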
+static ssize_t dual_role_store_property(struct device *dev,
+                                       struct device_attribute *attr,
+                                       const char *buf, size_t count);
+static ssize_t dual_role_show_property(struct device *dev,
+                                      struct device_attribute *attr,
+                                      char *buf);
+
+#define DUAL_ROLE_ATTR(_name)                          \
+{                                                      \
+       .attr = { .name = #_name },                     \
+       .show = dual_role_show_property,                \
+       .store = dual_role_store_property,              \
+}
+
+static struct device_attribute dual_role_attrs[] = {
+       DUAL_ROLE_ATTR(supported_modes),
+       DUAL_ROLE_ATTR(mode),
+       DUAL_ROLE_ATTR(power_role),
+       DUAL_ROLE_ATTR(data_role),
+       DUAL_ROLE_ATTR(powers_vconn),
+};
+
+struct class *dual_role_class;
+EXPORT_SYMBOL_GPL(dual_role_class);
+
+static struct device_type dual_role_dev_type;
+
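+/* Duplicate a string while converting it to all upper- or all lower-case */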
+static char *kstrdupcase(const char *str, gfp_t gfp, bool to_upper)
+{
+       char *ret, *ustr;
+
+       ustr = ret = kmalloc(strlen(str) + 1, gfp);
+
+       if (!ret)
+               return NULL;
+
+       while (*str)
+               *ustr++ = to_upper ? toupper(*str++) : tolower(*str++);
+
+       *ustr = 0;
+
+       return ret;
+}
+
+static void dual_role_changed_work(struct work_struct *work)
+{
+       struct dual_role_phy_instance *dual_role =
+           container_of(work, struct dual_role_phy_instance,
+                        changed_work);
+
+       dev_dbg(&dual_role->dev, "%s\n", __func__);
+       kobject_uevent(&dual_role->dev.kobj, KOBJ_CHANGE);
+}
+
+void dual_role_instance_changed(struct dual_role_phy_instance *dual_role)
+{
+       dev_dbg(&dual_role->dev, "%s\n", __func__);
+       pm_wakeup_event(&dual_role->dev, DUAL_ROLE_NOTIFICATION_TIMEOUT);
+       schedule_work(&dual_role->changed_work);
+}
+EXPORT_SYMBOL_GPL(dual_role_instance_changed);
+
+int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+                          enum dual_role_property prop,
+                          unsigned int *val)
+{
+       return dual_role->desc->get_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_get_property);
+
+int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+                          enum dual_role_property prop,
+                          const unsigned int *val)
+{
+       if (!dual_role->desc->set_property)
+               return -ENODEV;
+
+       return dual_role->desc->set_property(dual_role, prop, val);
+}
+EXPORT_SYMBOL_GPL(dual_role_set_property);
+
+int dual_role_property_is_writeable(struct dual_role_phy_instance *dual_role,
+                                   enum dual_role_property prop)
+{
+       if (!dual_role->desc->property_is_writeable)
+               return -ENODEV;
+
+       return dual_role->desc->property_is_writeable(dual_role, prop);
+}
+EXPORT_SYMBOL_GPL(dual_role_property_is_writeable);
+
+static void dual_role_dev_release(struct device *dev)
+{
+       struct dual_role_phy_instance *dual_role =
+           container_of(dev, struct dual_role_phy_instance, dev);
+       pr_debug("device: '%s': %s\n", dev_name(dev), __func__);
+       kfree(dual_role);
+}
+
+static struct dual_role_phy_instance *__must_check
+__dual_role_register(struct device *parent,
+                    const struct dual_role_phy_desc *desc)
+{
+       struct device *dev;
+       struct dual_role_phy_instance *dual_role;
+       int rc;
+
+       dual_role = kzalloc(sizeof(*dual_role), GFP_KERNEL);
+       if (!dual_role)
+               return ERR_PTR(-ENOMEM);
+
+       dev = &dual_role->dev;
+
+       device_initialize(dev);
+
+       dev->class = dual_role_class;
+       dev->type = &dual_role_dev_type;
+       dev->parent = parent;
+       dev->release = dual_role_dev_release;
+       dev_set_drvdata(dev, dual_role);
+       dual_role->desc = desc;
+
+       rc = dev_set_name(dev, "%s", desc->name);
+       if (rc)
+               goto dev_set_name_failed;
+
+       INIT_WORK(&dual_role->changed_work, dual_role_changed_work);
+
+       rc = device_init_wakeup(dev, true);
+       if (rc)
+               goto wakeup_init_failed;
+
+       rc = device_add(dev);
+       if (rc)
+               goto device_add_failed;
+
+       dual_role_instance_changed(dual_role);
+
+       return dual_role;
+
+device_add_failed:
+       device_init_wakeup(dev, false);
+wakeup_init_failed:
+dev_set_name_failed:
+       put_device(dev);
+
+       return ERR_PTR(rc);
+}
+
+static void dual_role_instance_unregister(struct dual_role_phy_instance
+                                         *dual_role)
+{
+       cancel_work_sync(&dual_role->changed_work);
+       device_init_wakeup(&dual_role->dev, false);
+       device_unregister(&dual_role->dev);
+}
+
+static void devm_dual_role_release(struct device *dev, void *res)
+{
+       struct dual_role_phy_instance **dual_role = res;
+
+       dual_role_instance_unregister(*dual_role);
+}
+
+struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+                                const struct dual_role_phy_desc *desc)
+{
+       struct dual_role_phy_instance **ptr, *dual_role;
+
+       ptr = devres_alloc(devm_dual_role_release, sizeof(*ptr), GFP_KERNEL);
+
+       if (!ptr)
+               return ERR_PTR(-ENOMEM);
+       dual_role = __dual_role_register(parent, desc);
+       if (IS_ERR(dual_role)) {
+               devres_free(ptr);
+       } else {
+               *ptr = dual_role;
+               devres_add(parent, ptr);
+       }
+       return dual_role;
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_register);
+
+static int devm_dual_role_match(struct device *dev, void *res, void *data)
+{
+       struct dual_role_phy_instance **r = res;
+
+       if (WARN_ON(!r || !*r))
+               return 0;
+
+       return *r == data;
+}
+
+void devm_dual_role_instance_unregister(struct device *dev,
+                                       struct dual_role_phy_instance
+                                       *dual_role)
+{
+       int rc;
+
+       rc = devres_release(dev, devm_dual_role_release,
+                           devm_dual_role_match, dual_role);
+       WARN_ON(rc);
+}
+EXPORT_SYMBOL_GPL(devm_dual_role_instance_unregister);
+
+void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role)
+{
+       return dual_role->drv_data;
+}
+EXPORT_SYMBOL_GPL(dual_role_get_drvdata);
+
+/***************** Device attribute functions **************************/
+
+/* port type */
+static char *supported_modes_text[] = {
+       "ufp dfp", "dfp", "ufp"
+};
+
+/* current mode */
+static char *mode_text[] = {
+       "ufp", "dfp", "none"
+};
+
+/* Power role */
+static char *pr_text[] = {
+       "source", "sink", "none"
+};
+
+/* Data role */
+static char *dr_text[] = {
+       "host", "device", "none"
+};
+
+/* Vconn supply */
+static char *vconn_supply_text[] = {
+       "n", "y"
+};
+
+static ssize_t dual_role_show_property(struct device *dev,
+                                      struct device_attribute *attr, char *buf)
+{
+       ssize_t ret = 0;
+       struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
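+       /* the attribute's position in dual_role_attrs doubles as the property id */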
+       const ptrdiff_t off = attr - dual_role_attrs;
+       unsigned int value;
+
+       if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+               value = dual_role->desc->supported_modes;
+       } else {
+               ret = dual_role_get_property(dual_role, off, &value);
+
+               if (ret < 0) {
+                       if (ret == -ENODATA)
+                               dev_dbg(dev,
+                                       "driver has no data for `%s' property\n",
+                                       attr->attr.name);
+                       else if (ret != -ENODEV)
+                               dev_err(dev,
+                                       "driver failed to report `%s' property: %zd\n",
+                                       attr->attr.name, ret);
+                       return ret;
+               }
+       }
+
+       if (off == DUAL_ROLE_PROP_SUPPORTED_MODES) {
+               BUILD_BUG_ON(DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL !=
+                       ARRAY_SIZE(supported_modes_text));
+               if (value < DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL)
+                       return snprintf(buf, PAGE_SIZE, "%s\n",
+                                       supported_modes_text[value]);
+               else
+                       return -EIO;
+       } else if (off == DUAL_ROLE_PROP_MODE) {
+               BUILD_BUG_ON(DUAL_ROLE_PROP_MODE_TOTAL !=
+                       ARRAY_SIZE(mode_text));
+               if (value < DUAL_ROLE_PROP_MODE_TOTAL)
+                       return snprintf(buf, PAGE_SIZE, "%s\n",
+                                       mode_text[value]);
+               else
+                       return -EIO;
+       } else if (off == DUAL_ROLE_PROP_PR) {
+               BUILD_BUG_ON(DUAL_ROLE_PROP_PR_TOTAL != ARRAY_SIZE(pr_text));
+               if (value < DUAL_ROLE_PROP_PR_TOTAL)
+                       return snprintf(buf, PAGE_SIZE, "%s\n",
+                                       pr_text[value]);
+               else
+                       return -EIO;
+       } else if (off == DUAL_ROLE_PROP_DR) {
+               BUILD_BUG_ON(DUAL_ROLE_PROP_DR_TOTAL != ARRAY_SIZE(dr_text));
+               if (value < DUAL_ROLE_PROP_DR_TOTAL)
+                       return snprintf(buf, PAGE_SIZE, "%s\n",
+                                       dr_text[value]);
+               else
+                       return -EIO;
+       } else if (off == DUAL_ROLE_PROP_VCONN_SUPPLY) {
+               BUILD_BUG_ON(DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL !=
+                               ARRAY_SIZE(vconn_supply_text));
+               if (value < DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL)
+                       return snprintf(buf, PAGE_SIZE, "%s\n",
+                                       vconn_supply_text[value]);
+               else
+                       return -EIO;
+       } else
+               return -EIO;
+}
+
+static ssize_t dual_role_store_property(struct device *dev,
+                                       struct device_attribute *attr,
+                                       const char *buf, size_t count)
+{
+       ssize_t ret;
+       struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+       const ptrdiff_t off = attr - dual_role_attrs;
+       unsigned int value;
+       int total, i;
+       char *dup_buf, **text_array;
+       bool result = false;
+
+       dup_buf = kstrdupcase(buf, GFP_KERNEL, false);
+       if (!dup_buf)
+               return -ENOMEM;
+
+       switch (off) {
+       case DUAL_ROLE_PROP_MODE:
+               total = DUAL_ROLE_PROP_MODE_TOTAL;
+               text_array = mode_text;
+               break;
+       case DUAL_ROLE_PROP_PR:
+               total = DUAL_ROLE_PROP_PR_TOTAL;
+               text_array = pr_text;
+               break;
+       case DUAL_ROLE_PROP_DR:
+               total = DUAL_ROLE_PROP_DR_TOTAL;
+               text_array = dr_text;
+               break;
+       case DUAL_ROLE_PROP_VCONN_SUPPLY:
+               ret = strtobool(dup_buf, &result);
+               value = result;
+               if (!ret)
+                       goto setprop;
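+               /* fall through - an unparsable boolean is treated as invalid */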
+       default:
+               ret = -EINVAL;
+               goto error;
+       }
+
+       for (i = 0; i <= total; i++) {
+               if (i == total) {
+                       ret = -ENOTSUPP;
+                       goto error;
+               }
+               if (!strncmp(*(text_array + i), dup_buf,
+                            strlen(*(text_array + i)))) {
+                       value = i;
+                       break;
+               }
+       }
+
+setprop:
+       ret = dual_role->desc->set_property(dual_role, off, &value);
+
+error:
+       kfree(dup_buf);
+
+       if (ret < 0)
+               return ret;
+
+       return count;
+}
+
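+/*
+ * supported_modes is always visible; other properties are shown only if the
+ * PHY driver declares them, and become writeable when the driver says so.
+ */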
+static umode_t dual_role_attr_is_visible(struct kobject *kobj,
+                                        struct attribute *attr, int attrno)
+{
+       struct device *dev = container_of(kobj, struct device, kobj);
+       struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+       umode_t mode = S_IRUSR | S_IRGRP | S_IROTH;
+       int i;
+
+       if (attrno == DUAL_ROLE_PROP_SUPPORTED_MODES)
+               return mode;
+
+       for (i = 0; i < dual_role->desc->num_properties; i++) {
+               int property = dual_role->desc->properties[i];
+
+               if (property == attrno) {
+                       if (dual_role->desc->property_is_writeable &&
+                           dual_role_property_is_writeable(dual_role, property)
+                           > 0)
+                               mode |= S_IWUSR;
+
+                       return mode;
+               }
+       }
+
+       return 0;
+}
+
+static struct attribute *__dual_role_attrs[ARRAY_SIZE(dual_role_attrs) + 1];
+
+static struct attribute_group dual_role_attr_group = {
+       .attrs = __dual_role_attrs,
+       .is_visible = dual_role_attr_is_visible,
+};
+
+static const struct attribute_group *dual_role_attr_groups[] = {
+       &dual_role_attr_group,
+       NULL,
+};
+
+void dual_role_init_attrs(struct device_type *dev_type)
+{
+       int i;
+
+       dev_type->groups = dual_role_attr_groups;
+
+       for (i = 0; i < ARRAY_SIZE(dual_role_attrs); i++)
+               __dual_role_attrs[i] = &dual_role_attrs[i].attr;
+}
+
+int dual_role_uevent(struct device *dev, struct kobj_uevent_env *env)
+{
+       struct dual_role_phy_instance *dual_role = dev_get_drvdata(dev);
+       int ret = 0, j;
+       char *prop_buf;
+       char *attrname;
+
+       dev_dbg(dev, "uevent\n");
+
+       if (!dual_role || !dual_role->desc) {
+               dev_dbg(dev, "No dual_role phy yet\n");
+               return ret;
+       }
+
+       dev_dbg(dev, "DUAL_ROLE_NAME=%s\n", dual_role->desc->name);
+
+       ret = add_uevent_var(env, "DUAL_ROLE_NAME=%s", dual_role->desc->name);
+       if (ret)
+               return ret;
+
+       prop_buf = (char *)get_zeroed_page(GFP_KERNEL);
+       if (!prop_buf)
+               return -ENOMEM;
+
+       for (j = 0; j < dual_role->desc->num_properties; j++) {
+               struct device_attribute *attr;
+               char *line;
+
+               attr = &dual_role_attrs[dual_role->desc->properties[j]];
+
+               ret = dual_role_show_property(dev, attr, prop_buf);
+               if (ret == -ENODEV || ret == -ENODATA) {
+                       ret = 0;
+                       continue;
+               }
+
+               if (ret < 0)
+                       goto out;
+               line = strnchr(prop_buf, PAGE_SIZE, '\n');
+               if (line)
+                       *line = 0;
+
+               attrname = kstrdupcase(attr->attr.name, GFP_KERNEL, true);
+               if (!attrname) {
+                       ret = -ENOMEM;
+                       goto out;
+               }
+
+               dev_dbg(dev, "prop %s=%s\n", attrname, prop_buf);
+
+               ret = add_uevent_var(env, "DUAL_ROLE_%s=%s", attrname,
+                                    prop_buf);
+               kfree(attrname);
+               if (ret)
+                       goto out;
+       }
+
+out:
+       free_page((unsigned long)prop_buf);
+
+       return ret;
+}
+
+/******************* Module Init ***********************************/
+
+static int __init dual_role_class_init(void)
+{
+       dual_role_class = class_create(THIS_MODULE, "dual_role_usb");
+
+       if (IS_ERR(dual_role_class))
+               return PTR_ERR(dual_role_class);
+
+       dual_role_class->dev_uevent = dual_role_uevent;
+       dual_role_init_attrs(&dual_role_dev_type);
+
+       return 0;
+}
+
+static void __exit dual_role_class_exit(void)
+{
+       class_destroy(dual_role_class);
+}
+
+subsys_initcall(dual_role_class_init);
+module_exit(dual_role_class_exit);
diff --git a/drivers/usb/phy/otg-wakelock.c b/drivers/usb/phy/otg-wakelock.c
new file mode 100644 (file)
index 0000000..ecd7410
--- /dev/null
@@ -0,0 +1,170 @@
+/*
+ * otg-wakelock.c
+ *
+ * Copyright (C) 2011 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/notifier.h>
+#include <linux/spinlock.h>
+#include <linux/usb/otg.h>
+
+#define TEMPORARY_HOLD_TIME    2000
+
+static bool enabled = true;
+static struct usb_phy *otgwl_xceiv;
+static struct notifier_block otgwl_nb;
+
+/*
+ * otgwl_spinlock is held while the VBUS lock is grabbed or dropped and the
+ * held field is updated to match.
+ */
+
+static DEFINE_SPINLOCK(otgwl_spinlock);
+
+/*
+ * Only one lock exists so far, but the name, wakeup source and "held" flag
+ * belong together, so keep them in a struct.
+ */
+
+struct otgwl_lock {
+       char name[40];
+       struct wakeup_source wakesrc;
+       bool held;
+};
+
+/*
+ * VBUS present lock.  Also used as a timed lock on charger
+ * connect/disconnect and USB host disconnect, to allow the system
+ * to react to the change in power.
+ */
+
+static struct otgwl_lock vbus_lock;
+
+static void otgwl_hold(struct otgwl_lock *lock)
+{
+       if (!lock->held) {
+               __pm_stay_awake(&lock->wakesrc);
+               lock->held = true;
+       }
+}
+
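+/* arm a timed wakeup event instead of holding the lock indefinitely */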
+static void otgwl_temporary_hold(struct otgwl_lock *lock)
+{
+       __pm_wakeup_event(&lock->wakesrc, TEMPORARY_HOLD_TIME);
+       lock->held = false;
+}
+
+static void otgwl_drop(struct otgwl_lock *lock)
+{
+       if (lock->held) {
+               __pm_relax(&lock->wakesrc);
+               lock->held = false;
+       }
+}
+
+static void otgwl_handle_event(unsigned long event)
+{
+       unsigned long irqflags;
+
+       spin_lock_irqsave(&otgwl_spinlock, irqflags);
+
+       if (!enabled) {
+               otgwl_drop(&vbus_lock);
+               spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+               return;
+       }
+
+       switch (event) {
+       case USB_EVENT_VBUS:
+       case USB_EVENT_ENUMERATED:
+               otgwl_hold(&vbus_lock);
+               break;
+
+       case USB_EVENT_NONE:
+       case USB_EVENT_ID:
+       case USB_EVENT_CHARGER:
+               otgwl_temporary_hold(&vbus_lock);
+               break;
+
+       default:
+               break;
+       }
+
+       spin_unlock_irqrestore(&otgwl_spinlock, irqflags);
+}
+
+static int otgwl_otg_notifications(struct notifier_block *nb,
+                                  unsigned long event, void *unused)
+{
+       otgwl_handle_event(event);
+       return NOTIFY_OK;
+}
+
+static int set_enabled(const char *val, const struct kernel_param *kp)
+{
+       int rv = param_set_bool(val, kp);
+
+       if (rv)
+               return rv;
+
+       if (otgwl_xceiv)
+               otgwl_handle_event(otgwl_xceiv->last_event);
+
+       return 0;
+}
+
+static struct kernel_param_ops enabled_param_ops = {
+       .set = set_enabled,
+       .get = param_get_bool,
+};
+
+module_param_cb(enabled, &enabled_param_ops, &enabled, 0644);
+MODULE_PARM_DESC(enabled, "enable wakelock when VBUS present");
+
+static int __init otg_wakelock_init(void)
+{
+       int ret;
+       struct usb_phy *phy;
+
+       phy = usb_get_phy(USB_PHY_TYPE_USB2);
+
+       if (IS_ERR(phy)) {
+               pr_err("%s: No USB transceiver found\n", __func__);
+               return PTR_ERR(phy);
+       }
+       otgwl_xceiv = phy;
+
+       snprintf(vbus_lock.name, sizeof(vbus_lock.name), "vbus-%s",
+                dev_name(otgwl_xceiv->dev));
+       wakeup_source_init(&vbus_lock.wakesrc, vbus_lock.name);
+
+       otgwl_nb.notifier_call = otgwl_otg_notifications;
+       ret = usb_register_notifier(otgwl_xceiv, &otgwl_nb);
+
+       if (ret) {
+               pr_err("%s: usb_register_notifier on transceiver %s failed\n",
+                      __func__, dev_name(otgwl_xceiv->dev));
+               otgwl_xceiv = NULL;
+               wakeup_source_trash(&vbus_lock.wakesrc);
+               return ret;
+       }
+
+       otgwl_handle_event(otgwl_xceiv->last_event);
+       return ret;
+}
+
+late_initcall(otg_wakelock_init);
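For orientation, otg-wakelock only consumes USB PHY events. Below is a minimal sketch of the producer side, assuming a PHY driver that owns the struct usb_phy; my_phy_vbus_irq() and its wiring are illustrative and not part of this patch.

#include <linux/interrupt.h>
#include <linux/notifier.h>
#include <linux/usb/phy.h>

/* Hypothetical VBUS interrupt handler inside a USB PHY driver. */
static irqreturn_t my_phy_vbus_irq(int irq, void *data)
{
        struct usb_phy *phy = data;

        /* Record the event so late subscribers (and set_enabled() above) can
         * replay it via phy->last_event. */
        phy->last_event = USB_EVENT_VBUS;

        /* otgwl_otg_notifications() runs from this chain and takes vbus_lock. */
        atomic_notifier_call_chain(&phy->notifier, USB_EVENT_VBUS, NULL);

        return IRQ_HANDLED;
}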
index 7f6c9e6cfc6c99d8d9912db5d2f78242923f51c5..1e56b50e408234f826fd28fdaa1bee0807911dcf 100644 (file)
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/platform_device.h>
+#include <linux/acpi.h>
 
 enum {
        FB_GET_WIDTH        = 0x00,
@@ -234,7 +235,7 @@ static int goldfish_fb_probe(struct platform_device *pdev)
        fb->fb.var.activate     = FB_ACTIVATE_NOW;
        fb->fb.var.height       = readl(fb->reg_base + FB_GET_PHYS_HEIGHT);
        fb->fb.var.width        = readl(fb->reg_base + FB_GET_PHYS_WIDTH);
-       fb->fb.var.pixclock     = 10000;
+       fb->fb.var.pixclock     = 0;
 
        fb->fb.var.red.offset = 11;
        fb->fb.var.red.length = 5;
@@ -304,12 +305,25 @@ static int goldfish_fb_remove(struct platform_device *pdev)
        return 0;
 }
 
+static const struct of_device_id goldfish_fb_of_match[] = {
+       { .compatible = "google,goldfish-fb", },
+       {},
+};
+MODULE_DEVICE_TABLE(of, goldfish_fb_of_match);
+
+static const struct acpi_device_id goldfish_fb_acpi_match[] = {
+       { "GFSH0004", 0 },
+       { },
+};
+MODULE_DEVICE_TABLE(acpi, goldfish_fb_acpi_match);
 
 static struct platform_driver goldfish_fb_driver = {
        .probe          = goldfish_fb_probe,
        .remove         = goldfish_fb_remove,
        .driver = {
-               .name = "goldfish_fb"
+               .name = "goldfish_fb",
+               .of_match_table = goldfish_fb_of_match,
+               .acpi_match_table = ACPI_PTR(goldfish_fb_acpi_match),
        }
 };
 
index 7aee6d699fd6b38949df0563281473c3afb445c2..121fabf122f2834e1742686042b966d0986af3d1 100644 (file)
@@ -227,6 +227,7 @@ source "fs/orangefs/Kconfig"
 source "fs/adfs/Kconfig"
 source "fs/affs/Kconfig"
 source "fs/ecryptfs/Kconfig"
+source "fs/sdcardfs/Kconfig"
 source "fs/hfs/Kconfig"
 source "fs/hfsplus/Kconfig"
 source "fs/befs/Kconfig"
index ef772f1eaff80ade97232f502bb4c8915e87cdac..1e34e4bc81a33d3c9eb818ee28c6012d79d95e48 100644 (file)
@@ -4,7 +4,7 @@
 #
 # 14 Sep 2000, Christoph Hellwig <hch@infradead.org>
 # Rewritten to use lists instead of if-statements.
-# 
+#
 
 obj-y :=       open.o read_write.o file_table.o super.o \
                char_dev.o stat.o exec.o pipe.o namei.o fcntl.o \
@@ -62,7 +62,7 @@ obj-y                         += devpts/
 
 obj-$(CONFIG_PROFILING)                += dcookies.o
 obj-$(CONFIG_DLM)              += dlm/
+
 # Do not add any filesystems before this line
 obj-$(CONFIG_FSCACHE)          += fscache/
 obj-$(CONFIG_REISERFS_FS)      += reiserfs/
@@ -84,6 +84,7 @@ obj-$(CONFIG_ISO9660_FS)      += isofs/
 obj-$(CONFIG_HFSPLUS_FS)       += hfsplus/ # Before hfs to find wrapped HFS+
 obj-$(CONFIG_HFS_FS)           += hfs/
 obj-$(CONFIG_ECRYPT_FS)                += ecryptfs/
+obj-$(CONFIG_SDCARD_FS)                += sdcardfs/
 obj-$(CONFIG_VXFS_FS)          += freevxfs/
 obj-$(CONFIG_NFS_FS)           += nfs/
 obj-$(CONFIG_EXPORTFS)         += exportfs/
index 12ffdb6fb63c27dfe17101d67a867bdfd83b1916..f5510d993284be8e55788ee4642c9265012d5492 100644 (file)
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -202,7 +202,7 @@ EXPORT_SYMBOL(setattr_copy);
  * the file open for write, as there can be no conflicting delegation in
  * that case.
  */
-int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+int notify_change2(struct vfsmount *mnt, struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
 {
        struct inode *inode = dentry->d_inode;
        umode_t mode = inode->i_mode;
@@ -226,7 +226,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
                        return -EPERM;
 
                if (!inode_owner_or_capable(inode)) {
-                       error = inode_permission(inode, MAY_WRITE);
+                       error = inode_permission2(mnt, inode, MAY_WRITE);
                        if (error)
                                return error;
                }
@@ -309,7 +309,9 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
        if (error)
                return error;
 
-       if (inode->i_op->setattr)
+       if (mnt && inode->i_op->setattr2)
+               error = inode->i_op->setattr2(mnt, dentry, attr);
+       else if (inode->i_op->setattr)
                error = inode->i_op->setattr(dentry, attr);
        else
                error = simple_setattr(dentry, attr);
@@ -322,4 +324,10 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de
 
        return error;
 }
+EXPORT_SYMBOL(notify_change2);
+
+int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **delegated_inode)
+{
+       return notify_change2(NULL, dentry, attr, delegated_inode);
+}
 EXPORT_SYMBOL(notify_change);
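For the consumer side of this change: assuming the companion header change in this series that adds a ->setattr2 member (taking a struct vfsmount) to struct inode_operations, a wrapping filesystem such as sdcardfs can key attribute-change policy off the mount the file was reached through. The myfs_* names below are illustrative and the body is a minimal stand-in, not the sdcardfs implementation.

#include <linux/fs.h>
#include <linux/mount.h>

static int myfs_setattr2(struct vfsmount *mnt, struct dentry *dentry,
                         struct iattr *ia)
{
        struct inode *inode = d_inode(dentry);
        int err;

        /* A real wrapper would consult 'mnt' here (e.g. a per-mount gid/mask
         * policy) before accepting the change. */
        err = setattr_prepare(dentry, ia);
        if (err)
                return err;

        setattr_copy(inode, ia);
        mark_inode_dirty(inode);
        return 0;
}

static int myfs_setattr(struct dentry *dentry, struct iattr *ia)
{
        /* Legacy entry point: callers going through notify_change() have no
         * mount to pass, matching the NULL forwarded above. */
        return myfs_setattr2(NULL, dentry, ia);
}

static const struct inode_operations myfs_iops = {
        .setattr  = myfs_setattr,
        .setattr2 = myfs_setattr2,
};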
index 52c63d6c9143f6fe9370ffd2101b0976f9345c28..4b15f407c1c0a600ebae8c152ad02852ffa7d231 100644 (file)
@@ -747,7 +747,7 @@ void do_coredump(const siginfo_t *siginfo)
                        goto close_fail;
                if (!(cprm.file->f_mode & FMODE_CAN_WRITE))
                        goto close_fail;
-               if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
+               if (do_truncate2(cprm.file->f_path.mnt, cprm.file->f_path.dentry, 0, 0, cprm.file))
                        goto close_fail;
        }
 
index 9f6607f17b53bfeba9d7940380928439f4e5b466..cb496989a6b693fe0b13cb2c737a7643288d5bc9 100644 (file)
@@ -1,4 +1,4 @@
 obj-$(CONFIG_FS_ENCRYPTION)    += fscrypto.o
 
-fscrypto-y := crypto.o fname.o policy.o keyinfo.o
+fscrypto-y := crypto.o fname.o hooks.o keyinfo.o policy.o
 fscrypto-$(CONFIG_BLOCK) += bio.o
index d262a93d9b31cb35a9055c443b67e681f0552bf6..732a786cce9deabe490410ee6dfb15c72fc8f048 100644 (file)
@@ -126,21 +126,6 @@ struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags)
 }
 EXPORT_SYMBOL(fscrypt_get_ctx);
 
-/**
- * page_crypt_complete() - completion callback for page crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void page_crypt_complete(struct crypto_async_request *req, int res)
-{
-       struct fscrypt_completion_result *ecr = req->data;
-
-       if (res == -EINPROGRESS)
-               return;
-       ecr->res = res;
-       complete(&ecr->completion);
-}
-
 int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
                           u64 lblk_num, struct page *src_page,
                           struct page *dest_page, unsigned int len,
@@ -151,7 +136,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
                u8 padding[FS_IV_SIZE - sizeof(__le64)];
        } iv;
        struct skcipher_request *req = NULL;
-       DECLARE_FS_COMPLETION_RESULT(ecr);
+       DECLARE_CRYPTO_WAIT(wait);
        struct scatterlist dst, src;
        struct fscrypt_info *ci = inode->i_crypt_info;
        struct crypto_skcipher *tfm = ci->ci_ctfm;
@@ -179,7 +164,7 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
 
        skcipher_request_set_callback(
                req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-               page_crypt_complete, &ecr);
+               crypto_req_done, &wait);
 
        sg_init_table(&dst, 1);
        sg_set_page(&dst, dest_page, len, offs);
@@ -187,14 +172,9 @@ int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw,
        sg_set_page(&src, src_page, len, offs);
        skcipher_request_set_crypt(req, &src, &dst, len, &iv);
        if (rw == FS_DECRYPT)
-               res = crypto_skcipher_decrypt(req);
+               res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
        else
-               res = crypto_skcipher_encrypt(req);
-       if (res == -EINPROGRESS || res == -EBUSY) {
-               BUG_ON(req->base.data != &ecr);
-               wait_for_completion(&ecr.completion);
-               res = ecr.res;
-       }
+               res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
        skcipher_request_free(req);
        if (res) {
                printk_ratelimited(KERN_ERR
@@ -340,7 +320,7 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags)
                return -ECHILD;
 
        dir = dget_parent(dentry);
-       if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) {
+       if (!IS_ENCRYPTED(d_inode(dir))) {
                dput(dir);
                return 0;
        }
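The change above (and the matching fname.c and keyinfo.c hunks below) replaces the private fscrypt_completion_result machinery with the generic DECLARE_CRYPTO_WAIT()/crypto_req_done()/crypto_wait_req() helpers from <linux/crypto.h>. A self-contained sketch of that idiom, with illustrative names and the transform, scatterlists and IV assumed to be set up by the caller:

#include <crypto/skcipher.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>

static int run_skcipher_sync(struct crypto_skcipher *tfm,
                             struct scatterlist *src, struct scatterlist *dst,
                             unsigned int len, void *iv)
{
        DECLARE_CRYPTO_WAIT(wait);
        struct skcipher_request *req;
        int err;

        req = skcipher_request_alloc(tfm, GFP_NOFS);
        if (!req)
                return -ENOMEM;

        /* crypto_req_done() completes 'wait'; crypto_wait_req() folds the
         * synchronous and -EINPROGRESS/-EBUSY asynchronous cases together,
         * which is exactly the boilerplate the hunks above delete. */
        skcipher_request_set_callback(req,
                        CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
                        crypto_req_done, &wait);
        skcipher_request_set_crypt(req, src, dst, len, iv);
        err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
        skcipher_request_free(req);
        return err;
}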
index 8606da1df0aa7e129810e3d73b86f50fa2216270..305541bcd108389695c5c20e37350d76936abb2d 100644 (file)
 #include <linux/ratelimit.h>
 #include "fscrypt_private.h"
 
-/**
- * fname_crypt_complete() - completion callback for filename crypto
- * @req: The asynchronous cipher request context
- * @res: The result of the cipher operation
- */
-static void fname_crypt_complete(struct crypto_async_request *req, int res)
-{
-       struct fscrypt_completion_result *ecr = req->data;
-
-       if (res == -EINPROGRESS)
-               return;
-       ecr->res = res;
-       complete(&ecr->completion);
-}
-
 /**
  * fname_encrypt() - encrypt a filename
  *
@@ -41,7 +26,7 @@ static int fname_encrypt(struct inode *inode,
                        const struct qstr *iname, struct fscrypt_str *oname)
 {
        struct skcipher_request *req = NULL;
-       DECLARE_FS_COMPLETION_RESULT(ecr);
+       DECLARE_CRYPTO_WAIT(wait);
        struct fscrypt_info *ci = inode->i_crypt_info;
        struct crypto_skcipher *tfm = ci->ci_ctfm;
        int res = 0;
@@ -77,17 +62,12 @@ static int fname_encrypt(struct inode *inode,
        }
        skcipher_request_set_callback(req,
                        CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-                       fname_crypt_complete, &ecr);
+                       crypto_req_done, &wait);
        sg_init_one(&sg, oname->name, cryptlen);
        skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv);
 
        /* Do the encryption */
-       res = crypto_skcipher_encrypt(req);
-       if (res == -EINPROGRESS || res == -EBUSY) {
-               /* Request is being completed asynchronously; wait for it */
-               wait_for_completion(&ecr.completion);
-               res = ecr.res;
-       }
+       res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
        skcipher_request_free(req);
        if (res < 0) {
                printk_ratelimited(KERN_ERR
@@ -111,7 +91,7 @@ static int fname_decrypt(struct inode *inode,
                                struct fscrypt_str *oname)
 {
        struct skcipher_request *req = NULL;
-       DECLARE_FS_COMPLETION_RESULT(ecr);
+       DECLARE_CRYPTO_WAIT(wait);
        struct scatterlist src_sg, dst_sg;
        struct fscrypt_info *ci = inode->i_crypt_info;
        struct crypto_skcipher *tfm = ci->ci_ctfm;
@@ -132,7 +112,7 @@ static int fname_decrypt(struct inode *inode,
        }
        skcipher_request_set_callback(req,
                CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-               fname_crypt_complete, &ecr);
+               crypto_req_done, &wait);
 
        /* Initialize IV */
        memset(iv, 0, FS_CRYPTO_BLOCK_SIZE);
@@ -141,11 +121,7 @@ static int fname_decrypt(struct inode *inode,
        sg_init_one(&src_sg, iname->name, iname->len);
        sg_init_one(&dst_sg, oname->name, oname->len);
        skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv);
-       res = crypto_skcipher_decrypt(req);
-       if (res == -EINPROGRESS || res == -EBUSY) {
-               wait_for_completion(&ecr.completion);
-               res = ecr.res;
-       }
+       res = crypto_wait_req(crypto_skcipher_decrypt(req), &wait);
        skcipher_request_free(req);
        if (res < 0) {
                printk_ratelimited(KERN_ERR
@@ -383,8 +359,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname,
        memset(fname, 0, sizeof(struct fscrypt_name));
        fname->usr_fname = iname;
 
-       if (!dir->i_sb->s_cop->is_encrypted(dir) ||
-                               fscrypt_is_dot_dotdot(iname)) {
+       if (!IS_ENCRYPTED(dir) || fscrypt_is_dot_dotdot(iname)) {
                fname->disk_name.name = (unsigned char *)iname->name;
                fname->disk_name.len = iname->len;
                return 0;
index 092e9dad14144e8b71e9432597c3646d76735000..c0b4f5597e1a3ee772d1c6ecba8f3b73033b74ba 100644 (file)
@@ -12,7 +12,8 @@
 #ifndef _FSCRYPT_PRIVATE_H
 #define _FSCRYPT_PRIVATE_H
 
-#include <linux/fscrypt_supp.h>
+#define __FS_HAS_ENCRYPTION 1
+#include <linux/fscrypt.h>
 #include <crypto/hash.h>
 
 /* Encryption parameters */
@@ -70,16 +71,6 @@ typedef enum {
 #define FS_CTX_REQUIRES_FREE_ENCRYPT_FL                0x00000001
 #define FS_CTX_HAS_BOUNCE_BUFFER_FL            0x00000002
 
-struct fscrypt_completion_result {
-       struct completion completion;
-       int res;
-};
-
-#define DECLARE_FS_COMPLETION_RESULT(ecr) \
-       struct fscrypt_completion_result ecr = { \
-               COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 }
-
-
 /* crypto.c */
 extern int fscrypt_initialize(unsigned int cop_flags);
 extern struct workqueue_struct *fscrypt_read_workqueue;
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
new file mode 100644 (file)
index 0000000..9f5fb2e
--- /dev/null
@@ -0,0 +1,112 @@
+/*
+ * fs/crypto/hooks.c
+ *
+ * Encryption hooks for higher-level filesystem operations.
+ */
+
+#include <linux/ratelimit.h>
+#include "fscrypt_private.h"
+
+/**
+ * fscrypt_file_open - prepare to open a possibly-encrypted regular file
+ * @inode: the inode being opened
+ * @filp: the struct file being set up
+ *
+ * Currently, an encrypted regular file can only be opened if its encryption key
+ * is available; access to the raw encrypted contents is not supported.
+ * Therefore, we first set up the inode's encryption key (if not already done)
+ * and return an error if it's unavailable.
+ *
+ * We also verify that if the parent directory (from the path via which the file
+ * is being opened) is encrypted, then the inode being opened uses the same
+ * encryption policy.  This is needed as part of the enforcement that all files
+ * in an encrypted directory tree use the same encryption policy, as a
+ * protection against certain types of offline attacks.  Note that this check is
+ * needed even when opening an *unencrypted* file, since it's forbidden to have
+ * an unencrypted file in an encrypted directory.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ */
+int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+       int err;
+       struct dentry *dir;
+
+       err = fscrypt_require_key(inode);
+       if (err)
+               return err;
+
+       dir = dget_parent(file_dentry(filp));
+       if (IS_ENCRYPTED(d_inode(dir)) &&
+           !fscrypt_has_permitted_context(d_inode(dir), inode)) {
+               pr_warn_ratelimited("fscrypt: inconsistent encryption contexts: %lu/%lu",
+                                   d_inode(dir)->i_ino, inode->i_ino);
+               err = -EPERM;
+       }
+       dput(dir);
+       return err;
+}
+EXPORT_SYMBOL_GPL(fscrypt_file_open);
+
+int __fscrypt_prepare_link(struct inode *inode, struct inode *dir)
+{
+       int err;
+
+       err = fscrypt_require_key(dir);
+       if (err)
+               return err;
+
+       if (!fscrypt_has_permitted_context(dir, inode))
+               return -EPERM;
+
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_link);
+
+int __fscrypt_prepare_rename(struct inode *old_dir, struct dentry *old_dentry,
+                            struct inode *new_dir, struct dentry *new_dentry,
+                            unsigned int flags)
+{
+       int err;
+
+       err = fscrypt_require_key(old_dir);
+       if (err)
+               return err;
+
+       err = fscrypt_require_key(new_dir);
+       if (err)
+               return err;
+
+       if (old_dir != new_dir) {
+               if (IS_ENCRYPTED(new_dir) &&
+                   !fscrypt_has_permitted_context(new_dir,
+                                                  d_inode(old_dentry)))
+                       return -EPERM;
+
+               if ((flags & RENAME_EXCHANGE) &&
+                   IS_ENCRYPTED(old_dir) &&
+                   !fscrypt_has_permitted_context(old_dir,
+                                                  d_inode(new_dentry)))
+                       return -EPERM;
+       }
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_rename);
+
+int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry)
+{
+       int err = fscrypt_get_encryption_info(dir);
+
+       if (err)
+               return err;
+
+       if (fscrypt_has_encryption_key(dir)) {
+               spin_lock(&dentry->d_lock);
+               dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY;
+               spin_unlock(&dentry->d_lock);
+       }
+
+       d_set_d_op(dentry, &fscrypt_d_ops);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(__fscrypt_prepare_lookup);
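These hooks centralize key and policy checks that filesystems previously open-coded. A minimal sketch of the call site fscrypt_file_open() expects, modelled on how ext4 and f2fs call it from their ->open methods; myfs_file_open() is illustrative and assumes the filesystem includes <linux/fscrypt.h> with __FS_HAS_ENCRYPTION set, as the ext4.h hunk below does.

static int myfs_file_open(struct inode *inode, struct file *filp)
{
        int err;

        if (IS_ENCRYPTED(inode)) {
                /* -ENOKEY if the key is missing, -EPERM if the encryption
                 * policy does not match the parent directory's. */
                err = fscrypt_file_open(inode, filp);
                if (err)
                        return err;
        }
        return generic_file_open(inode, filp);
}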
index a38630214058214dec6c30eca64f74f3f7f693df..5e6e846f5a24dde322846fe54f642a40fbdc2f3c 100644 (file)
 
 static struct crypto_shash *essiv_hash_tfm;
 
-static void derive_crypt_complete(struct crypto_async_request *req, int rc)
-{
-       struct fscrypt_completion_result *ecr = req->data;
-
-       if (rc == -EINPROGRESS)
-               return;
-
-       ecr->res = rc;
-       complete(&ecr->completion);
-}
-
 /**
  * derive_key_aes() - Derive a key using AES-128-ECB
  * @deriving_key: Encryption key used for derivation.
@@ -43,7 +32,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
 {
        int res = 0;
        struct skcipher_request *req = NULL;
-       DECLARE_FS_COMPLETION_RESULT(ecr);
+       DECLARE_CRYPTO_WAIT(wait);
        struct scatterlist src_sg, dst_sg;
        struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0);
 
@@ -60,7 +49,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
        }
        skcipher_request_set_callback(req,
                        CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP,
-                       derive_crypt_complete, &ecr);
+                       crypto_req_done, &wait);
        res = crypto_skcipher_setkey(tfm, deriving_key,
                                        FS_AES_128_ECB_KEY_SIZE);
        if (res < 0)
@@ -70,11 +59,7 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE],
        sg_init_one(&dst_sg, derived_raw_key, source_key->size);
        skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size,
                                   NULL);
-       res = crypto_skcipher_encrypt(req);
-       if (res == -EINPROGRESS || res == -EBUSY) {
-               wait_for_completion(&ecr.completion);
-               res = ecr.res;
-       }
+       res = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);
 out:
        skcipher_request_free(req);
        crypto_free_skcipher(tfm);
@@ -274,7 +259,7 @@ int fscrypt_get_encryption_info(struct inode *inode)
        res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
        if (res < 0) {
                if (!fscrypt_dummy_context_enabled(inode) ||
-                   inode->i_sb->s_cop->is_encrypted(inode))
+                   IS_ENCRYPTED(inode))
                        return res;
                /* Fake up a context for an unencrypted directory */
                memset(&ctx, 0, sizeof(ctx));
@@ -374,7 +359,7 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci)
        struct fscrypt_info *prev;
 
        if (ci == NULL)
-               ci = ACCESS_ONCE(inode->i_crypt_info);
+               ci = READ_ONCE(inode->i_crypt_info);
        if (ci == NULL)
                return;
 
index a120649beeca4e5e9b0e54d6e7ed16de7fb47f17..c6d431a5cce932fd80385415fe598aca0f74ba05 100644 (file)
@@ -110,7 +110,7 @@ int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg)
        struct fscrypt_policy policy;
        int res;
 
-       if (!inode->i_sb->s_cop->is_encrypted(inode))
+       if (!IS_ENCRYPTED(inode))
                return -ENODATA;
 
        res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx));
@@ -167,11 +167,11 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child)
                return 1;
 
        /* No restrictions if the parent directory is unencrypted */
-       if (!cops->is_encrypted(parent))
+       if (!IS_ENCRYPTED(parent))
                return 1;
 
        /* Encrypted directories must not contain unencrypted files */
-       if (!cops->is_encrypted(child))
+       if (!IS_ENCRYPTED(child))
                return 0;
 
        /*
index 34c852af215c0f1ff2d73800ed0bc02c9a2f0794..694f5da1a0a9d316b3730202ebd2a8bda18c79b3 100644 (file)
@@ -3223,6 +3223,7 @@ char *d_absolute_path(const struct path *path,
                return ERR_PTR(error);
        return res;
 }
+EXPORT_SYMBOL(d_absolute_path);
 
 /*
  * same as __d_path but appends "(deleted)" for unlinked files.
index 2fabd19cdeea76e32f51f3c84348a22b9fe22843..a7d394704a55f2e48c7bb63f2259c8c558fa075a 100644 (file)
@@ -34,6 +34,7 @@
 #include <linux/mutex.h>
 #include <linux/anon_inodes.h>
 #include <linux/device.h>
+#include <linux/freezer.h>
 #include <linux/uaccess.h>
 #include <asm/io.h>
 #include <asm/mman.h>
@@ -1826,7 +1827,8 @@ fetch_events:
                        }
 
                        spin_unlock_irqrestore(&ep->lock, flags);
-                       if (!schedule_hrtimeout_range(to, slack, HRTIMER_MODE_ABS))
+                       if (!freezable_schedule_hrtimeout_range(to, slack,
+                                                               HRTIMER_MODE_ABS))
                                timed_out = 1;
 
                        spin_lock_irqsave(&ep->lock, flags);
index acec119fcc3141f8bf5e615e89bfbd95e7892e2f..ea3c4924dd6bdd8b5a859c542ec0a31d034f4717 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1304,7 +1304,7 @@ EXPORT_SYMBOL(flush_old_exec);
 void would_dump(struct linux_binprm *bprm, struct file *file)
 {
        struct inode *inode = file_inode(file);
-       if (inode_permission(inode, MAY_READ) < 0) {
+       if (inode_permission2(file->f_path.mnt, inode, MAY_READ) < 0) {
                struct user_namespace *old, *user_ns;
                bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
 
index 58a0304566dbbd19742ca11d4e28af10cfd4af3f..27f38bb5046d6a7cf9d016be023083373ec34a54 100644 (file)
 #include <linux/percpu_counter.h>
 #include <linux/ratelimit.h>
 #include <crypto/hash.h>
-#ifdef CONFIG_EXT4_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
 #include <linux/falloc.h>
 #include <linux/percpu-rwsem.h>
 #ifdef __KERNEL__
 #include <linux/compat.h>
 #endif
 
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_EXT4_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
 /*
  * The fourth extended filesystem constants/structures
  */
index fd9501977f1c7a5d9db13ab9241f9efe6ee47962..4e63507f5ce101caa6d488c37ca14653e1c50511 100644 (file)
@@ -18,6 +18,7 @@
 #include "ext4.h"
 #include "xattr.h"
 #include "truncate.h"
+#include <trace/events/android_fs.h>
 
 #define EXT4_XATTR_SYSTEM_DATA "data"
 #define EXT4_MIN_INLINE_DATA_SIZE      ((sizeof(__le32) * EXT4_N_BLOCKS))
@@ -504,6 +505,17 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
                return -EAGAIN;
        }
 
+       if (trace_android_fs_dataread_start_enabled()) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_dataread_start(inode, page_offset(page),
+                                               PAGE_SIZE, current->pid,
+                                               path, current->comm);
+       }
+
        /*
         * Current inline data can only exist in the 1st page,
         * So for all the other pages, just set them uptodate.
@@ -515,6 +527,8 @@ int ext4_readpage_inline(struct inode *inode, struct page *page)
                SetPageUptodate(page);
        }
 
+       trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE);
+
        up_read(&EXT4_I(inode)->xattr_sem);
 
        unlock_page(page);
index ea2ccc524bd98bb0dbb26207bd4a21ebe5528b37..1c258158d8efaed76fba5fa8c44009be6e40e320 100644 (file)
@@ -46,6 +46,7 @@
 #include "truncate.h"
 
 #include <trace/events/ext4.h>
+#include <trace/events/android_fs.h>
 
 #define MPAGE_DA_EXTENT_TAIL 0x01
 
@@ -1252,6 +1253,16 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
        if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
                return -EIO;
 
+       if (trace_android_fs_datawrite_start_enabled()) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_datawrite_start(inode, pos, len,
+                                                current->pid, path,
+                                                current->comm);
+       }
        trace_ext4_write_begin(inode, pos, len, flags);
        /*
         * Reserve one block more for addition to orphan list in case
@@ -1389,6 +1400,7 @@ static int ext4_write_end(struct file *file,
        int ret = 0, ret2;
        int i_size_changed = 0;
 
+       trace_android_fs_datawrite_end(inode, pos, len);
        trace_ext4_write_end(inode, pos, len, copied);
        if (ext4_has_inline_data(inode)) {
                ret = ext4_write_inline_data_end(inode, pos, len,
@@ -1493,6 +1505,7 @@ static int ext4_journalled_write_end(struct file *file,
        unsigned from, to;
        int size_changed = 0;
 
+       trace_android_fs_datawrite_end(inode, pos, len);
        trace_ext4_journalled_write_end(inode, pos, len, copied);
        from = pos & (PAGE_SIZE - 1);
        to = from + len;
@@ -3033,6 +3046,16 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
                                        len, flags, pagep, fsdata);
        }
        *fsdata = (void *)0;
+       if (trace_android_fs_datawrite_start_enabled()) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_datawrite_start(inode, pos, len,
+                                                current->pid,
+                                                path, current->comm);
+       }
        trace_ext4_da_write_begin(inode, pos, len, flags);
 
        if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
@@ -3151,6 +3174,7 @@ static int ext4_da_write_end(struct file *file,
                return ext4_write_end(file, mapping, pos,
                                      len, copied, page, fsdata);
 
+       trace_android_fs_datawrite_end(inode, pos, len);
        trace_ext4_da_write_end(inode, pos, len, copied);
        start = pos & (PAGE_SIZE - 1);
        end = start + copied - 1;
@@ -3789,6 +3813,7 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        size_t count = iov_iter_count(iter);
        loff_t offset = iocb->ki_pos;
        ssize_t ret;
+       int rw = iov_iter_rw(iter);
 
 #ifdef CONFIG_EXT4_FS_ENCRYPTION
        if (ext4_encrypted_inode(inode) && S_ISREG(inode->i_mode))
@@ -3809,12 +3834,42 @@ static ssize_t ext4_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
        if (WARN_ON_ONCE(IS_DAX(inode)))
                return 0;
 
+       if (trace_android_fs_dataread_start_enabled() &&
+           (rw == READ)) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_dataread_start(inode, offset, count,
+                                               current->pid, path,
+                                               current->comm);
+       }
+       if (trace_android_fs_datawrite_start_enabled() &&
+           (rw == WRITE)) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_datawrite_start(inode, offset, count,
+                                                current->pid, path,
+                                                current->comm);
+       }
        trace_ext4_direct_IO_enter(inode, offset, count, iov_iter_rw(iter));
        if (iov_iter_rw(iter) == READ)
                ret = ext4_direct_IO_read(iocb, iter);
        else
                ret = ext4_direct_IO_write(iocb, iter);
        trace_ext4_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), ret);
+
+       if (trace_android_fs_dataread_start_enabled() &&
+           (rw == READ))
+               trace_android_fs_dataread_end(inode, offset, count);
+       if (trace_android_fs_datawrite_start_enabled() &&
+           (rw == WRITE))
+               trace_android_fs_datawrite_end(inode, offset, count);
+
        return ret;
 }
 
@@ -4599,10 +4654,13 @@ void ext4_set_inode_flags(struct inode *inode)
                new_fl |= S_DIRSYNC;
        if (test_opt(inode->i_sb, DAX) && S_ISREG(inode->i_mode) &&
            !ext4_should_journal_data(inode) && !ext4_has_inline_data(inode) &&
-           !ext4_encrypted_inode(inode))
+           !(flags & EXT4_ENCRYPT_FL))
                new_fl |= S_DAX;
+       if (flags & EXT4_ENCRYPT_FL)
+               new_fl |= S_ENCRYPTED;
        inode_set_flags(inode, new_fl,
-                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX);
+                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|S_DAX|
+                       S_ENCRYPTED);
 }
 
 static blkcnt_t ext4_inode_blocks(struct ext4_inode *raw_inode,
index 9ffa6fad18dbef1528f3ddbacc67fafff370cf03..df22fcb3c41c2922fdf94602f3e412290fbb932b 100644 (file)
@@ -46,6 +46,7 @@
 #include <linux/cleancache.h>
 
 #include "ext4.h"
+#include <trace/events/android_fs.h>
 
 static inline bool ext4_bio_encrypted(struct bio *bio)
 {
@@ -56,6 +57,17 @@ static inline bool ext4_bio_encrypted(struct bio *bio)
 #endif
 }
 
+static void
+ext4_trace_read_completion(struct bio *bio)
+{
+       struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+       if (first_page != NULL)
+               trace_android_fs_dataread_end(first_page->mapping->host,
+                                             page_offset(first_page),
+                                             bio->bi_iter.bi_size);
+}
+
 /*
  * I/O completion handler for multipage BIOs.
  *
@@ -73,6 +85,9 @@ static void mpage_end_io(struct bio *bio)
        struct bio_vec *bv;
        int i;
 
+       if (trace_android_fs_dataread_start_enabled())
+               ext4_trace_read_completion(bio);
+
        if (ext4_bio_encrypted(bio)) {
                if (bio->bi_status) {
                        fscrypt_release_ctx(bio->bi_private);
@@ -96,6 +111,30 @@ static void mpage_end_io(struct bio *bio)
        bio_put(bio);
 }
 
+static void
+ext4_submit_bio_read(struct bio *bio)
+{
+       if (trace_android_fs_dataread_start_enabled()) {
+               struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+               if (first_page != NULL) {
+                       char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+                       path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   first_page->mapping->host);
+                       trace_android_fs_dataread_start(
+                               first_page->mapping->host,
+                               page_offset(first_page),
+                               bio->bi_iter.bi_size,
+                               current->pid,
+                               path,
+                               current->comm);
+               }
+       }
+       submit_bio(bio);
+}
+
 int ext4_mpage_readpages(struct address_space *mapping,
                         struct list_head *pages, struct page *page,
                         unsigned nr_pages)
@@ -236,7 +275,7 @@ int ext4_mpage_readpages(struct address_space *mapping,
                 */
                if (bio && (last_block_in_bio != blocks[0] - 1)) {
                submit_and_realloc:
-                       submit_bio(bio);
+                       ext4_submit_bio_read(bio);
                        bio = NULL;
                }
                if (bio == NULL) {
@@ -269,14 +308,14 @@ int ext4_mpage_readpages(struct address_space *mapping,
                if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
                     (relative_block == map.m_len)) ||
                    (first_hole != blocks_per_page)) {
-                       submit_bio(bio);
+                       ext4_submit_bio_read(bio);
                        bio = NULL;
                } else
                        last_block_in_bio = blocks[blocks_per_page - 1];
                goto next_page;
        confused:
                if (bio) {
-                       submit_bio(bio);
+                       ext4_submit_bio_read(bio);
                        bio = NULL;
                }
                if (!PageUptodate(page))
@@ -289,6 +328,6 @@ int ext4_mpage_readpages(struct address_space *mapping,
        }
        BUG_ON(pages && !list_empty(pages));
        if (bio)
-               submit_bio(bio);
+               ext4_submit_bio_read(bio);
        return 0;
 }
index f29351c666109752c09b64ba9f9ea47f55095971..e6a5df2ec8d16cd60792ac2db1c5d42c805bd60e 100644 (file)
@@ -1181,7 +1181,8 @@ static int ext4_set_context(struct inode *inode, const void *ctx, size_t len,
                        ext4_clear_inode_state(inode,
                                        EXT4_STATE_MAY_INLINE_DATA);
                        /*
-                        * Update inode->i_flags - e.g. S_DAX may get disabled
+                        * Update inode->i_flags - S_ENCRYPTED will be enabled,
+                        * S_DAX may be disabled
                         */
                        ext4_set_inode_flags(inode);
                }
@@ -1206,7 +1207,10 @@ retry:
                                    ctx, len, 0);
        if (!res) {
                ext4_set_inode_flag(inode, EXT4_INODE_ENCRYPT);
-               /* Update inode->i_flags - e.g. S_DAX may get disabled */
+               /*
+                * Update inode->i_flags - S_ENCRYPTED will be enabled,
+                * S_DAX may be disabled
+                */
                ext4_set_inode_flags(inode);
                res = ext4_mark_inode_dirty(handle, inode);
                if (res)
@@ -1237,14 +1241,9 @@ static const struct fscrypt_operations ext4_cryptops = {
        .get_context            = ext4_get_context,
        .set_context            = ext4_set_context,
        .dummy_context          = ext4_dummy_context,
-       .is_encrypted           = ext4_encrypted_inode,
        .empty_dir              = ext4_empty_dir,
        .max_namelen            = ext4_max_namelen,
 };
-#else
-static const struct fscrypt_operations ext4_cryptops = {
-       .is_encrypted           = ext4_encrypted_inode,
-};
 #endif
 
 #ifdef CONFIG_QUOTA
@@ -4001,7 +4000,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->s_op = &ext4_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
+#ifdef CONFIG_EXT4_FS_ENCRYPTION
        sb->s_cop = &ext4_cryptops;
+#endif
 #ifdef CONFIG_QUOTA
        sb->dq_op = &ext4_quota_operations;
        if (ext4_has_feature_quota(sb))
index 436b3a1464d9bd29c8756750b3a38154b2e1c313..2bb7c9fc5144aba162b01ba72295994f9fd23f00 100644 (file)
@@ -250,6 +250,9 @@ static int __f2fs_set_acl(struct inode *inode, int type,
 
 int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type)
 {
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        return __f2fs_set_acl(inode, type, acl, NULL);
 }
 
index 04fe1df052b2b9e6a0c3f662797c3f46727ccdbe..a30024f2a567a9a34c95bd7c3f3506e8aae319fd 100644 (file)
@@ -29,7 +29,6 @@ struct kmem_cache *inode_entry_slab;
 void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io)
 {
        set_ckpt_flags(sbi, CP_ERROR_FLAG);
-       sbi->sb->s_flags |= MS_RDONLY;
        if (!end_io)
                f2fs_flush_merged_writes(sbi);
 }
@@ -401,24 +400,23 @@ const struct address_space_operations f2fs_meta_aops = {
 #endif
 };
 
-static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
+static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino,
+                                               unsigned int devidx, int type)
 {
        struct inode_management *im = &sbi->im[type];
        struct ino_entry *e, *tmp;
 
        tmp = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS);
-retry:
+
        radix_tree_preload(GFP_NOFS | __GFP_NOFAIL);
 
        spin_lock(&im->ino_lock);
        e = radix_tree_lookup(&im->ino_root, ino);
        if (!e) {
                e = tmp;
-               if (radix_tree_insert(&im->ino_root, ino, e)) {
-                       spin_unlock(&im->ino_lock);
-                       radix_tree_preload_end();
-                       goto retry;
-               }
+               if (unlikely(radix_tree_insert(&im->ino_root, ino, e)))
+                       f2fs_bug_on(sbi, 1);
+
                memset(e, 0, sizeof(struct ino_entry));
                e->ino = ino;
 
@@ -426,6 +424,10 @@ retry:
                if (type != ORPHAN_INO)
                        im->ino_num++;
        }
+
+       if (type == FLUSH_INO)
+               f2fs_set_bit(devidx, (char *)&e->dirty_device);
+
        spin_unlock(&im->ino_lock);
        radix_tree_preload_end();
 
@@ -454,7 +456,7 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
 {
        /* add new dirty ino entry into list */
-       __add_ino_entry(sbi, ino, type);
+       __add_ino_entry(sbi, ino, 0, type);
 }
 
 void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type)
@@ -480,7 +482,7 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
        struct ino_entry *e, *tmp;
        int i;
 
-       for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) {
+       for (i = all ? ORPHAN_INO : APPEND_INO; i < MAX_INO_ENTRY; i++) {
                struct inode_management *im = &sbi->im[i];
 
                spin_lock(&im->ino_lock);
@@ -494,6 +496,27 @@ void release_ino_entry(struct f2fs_sb_info *sbi, bool all)
        }
 }
 
+void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+                                       unsigned int devidx, int type)
+{
+       __add_ino_entry(sbi, ino, devidx, type);
+}
+
+bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+                                       unsigned int devidx, int type)
+{
+       struct inode_management *im = &sbi->im[type];
+       struct ino_entry *e;
+       bool is_dirty = false;
+
+       spin_lock(&im->ino_lock);
+       e = radix_tree_lookup(&im->ino_root, ino);
+       if (e && f2fs_test_bit(devidx, (char *)&e->dirty_device))
+               is_dirty = true;
+       spin_unlock(&im->ino_lock);
+       return is_dirty;
+}
+
 int acquire_orphan_inode(struct f2fs_sb_info *sbi)
 {
        struct inode_management *im = &sbi->im[ORPHAN_INO];
@@ -530,7 +553,7 @@ void release_orphan_inode(struct f2fs_sb_info *sbi)
 void add_orphan_inode(struct inode *inode)
 {
        /* add new orphan ino entry into list */
-       __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO);
+       __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, 0, ORPHAN_INO);
        update_inode_page(inode);
 }
 
@@ -554,7 +577,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino)
                return err;
        }
 
-       __add_ino_entry(sbi, ino, ORPHAN_INO);
+       __add_ino_entry(sbi, ino, 0, ORPHAN_INO);
 
        inode = f2fs_iget_retry(sbi->sb, ino);
        if (IS_ERR(inode)) {
@@ -590,6 +613,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
        block_t start_blk, orphan_blocks, i, j;
        unsigned int s_flags = sbi->sb->s_flags;
        int err = 0;
+#ifdef CONFIG_QUOTA
+       int quota_enabled;
+#endif
 
        if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG))
                return 0;
@@ -602,8 +628,9 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 #ifdef CONFIG_QUOTA
        /* Needed for iput() to work correctly and not trash data */
        sbi->sb->s_flags |= MS_ACTIVE;
+
        /* Turn on quotas so that they are updated correctly */
-       f2fs_enable_quota_files(sbi);
+       quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
 #endif
 
        start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi);
@@ -631,7 +658,8 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi)
 out:
 #ifdef CONFIG_QUOTA
        /* Turn quotas off */
-       f2fs_quota_off_umount(sbi->sb);
+       if (quota_enabled)
+               f2fs_quota_off_umount(sbi->sb);
 #endif
        sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 
@@ -986,7 +1014,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi)
                                update_inode_page(inode);
                        iput(inode);
                }
-       };
+       }
        return 0;
 }
 
@@ -1146,6 +1174,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        struct super_block *sb = sbi->sb;
        struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE);
        u64 kbytes_written;
+       int err;
 
        /* Flush all the NAT/SIT pages */
        while (get_pages(sbi, F2FS_DIRTY_META)) {
@@ -1239,6 +1268,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        if (unlikely(f2fs_cp_error(sbi)))
                return -EIO;
 
+       /* flush all device cache */
+       err = f2fs_flush_device_cache(sbi);
+       if (err)
+               return err;
+
        /* write out checkpoint buffer at block 0 */
        update_meta_page(sbi, ckpt, start_blk++);
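Taken together, the checkpoint.c hunks add per-device dirty tracking: writers record which device an inode dirtied through the new FLUSH_INO ino-entry list, and do_checkpoint() issues f2fs_flush_device_cache() before the checkpoint goes out. A minimal sketch of how the two new helpers pair up on a multi-device setup; the myfs_* wrappers are illustrative and assume f2fs.h is included.

/* Write path: remember that 'inode' dirtied device 'devidx'. */
static void myfs_note_write(struct f2fs_sb_info *sbi, struct inode *inode,
                            unsigned int devidx)
{
        set_dirty_device(sbi, inode->i_ino, devidx, FLUSH_INO);
}

/* fsync/flush path: only devices that actually saw writes need a cache flush. */
static bool myfs_device_needs_flush(struct f2fs_sb_info *sbi,
                                    struct inode *inode, unsigned int devidx)
{
        return is_dirty_device(sbi, inode->i_ino, devidx, FLUSH_INO);
}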
 
index 36b535207c88906efae946f71195781497be87f5..823c842a7414f33effd6ce07c04e204d2e44dcc8 100644 (file)
@@ -29,6 +29,7 @@
 #include "segment.h"
 #include "trace.h"
 #include <trace/events/f2fs.h>
+#include <trace/events/android_fs.h>
 
 static bool __is_cp_guaranteed(struct page *page)
 {
@@ -173,7 +174,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr,
 {
        struct bio *bio;
 
-       bio = f2fs_bio_alloc(npages);
+       bio = f2fs_bio_alloc(sbi, npages, true);
 
        f2fs_target_device(sbi, blk_addr, bio);
        bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io;
@@ -418,8 +419,8 @@ next:
 
        bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
 
-       /* set submitted = 1 as a return value */
-       fio->submitted = 1;
+       /* set submitted = true as a return value */
+       fio->submitted = true;
 
        inc_page_count(sbi, WB_DATA_TYPE(bio_page));
 
@@ -473,7 +474,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
                f2fs_wait_on_block_writeback(sbi, blkaddr);
        }
 
-       bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES));
+       bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
        if (!bio) {
                if (ctx)
                        fscrypt_release_ctx(ctx);
@@ -833,6 +834,13 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
        struct f2fs_map_blocks map;
        int err = 0;
 
+       /* convert inline data for Direct I/O */
+       if (iocb->ki_flags & IOCB_DIRECT) {
+               err = f2fs_convert_inline_inode(inode);
+               if (err)
+                       return err;
+       }
+
        if (is_inode_flag_set(inode, FI_NO_PREALLOC))
                return 0;
 
@@ -845,15 +853,11 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
 
        map.m_next_pgofs = NULL;
 
-       if (iocb->ki_flags & IOCB_DIRECT) {
-               err = f2fs_convert_inline_inode(inode);
-               if (err)
-                       return err;
+       if (iocb->ki_flags & IOCB_DIRECT)
                return f2fs_map_blocks(inode, &map, 1,
                        __force_buffered_io(inode, WRITE) ?
                                F2FS_GET_BLOCK_PRE_AIO :
                                F2FS_GET_BLOCK_PRE_DIO);
-       }
        if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
                err = f2fs_convert_inline_inode(inode);
                if (err)
@@ -1334,7 +1338,7 @@ static int f2fs_read_data_pages(struct file *file,
                        struct address_space *mapping,
                        struct list_head *pages, unsigned nr_pages)
 {
-       struct inode *inode = file->f_mapping->host;
+       struct inode *inode = mapping->host;
        struct page *page = list_last_entry(pages, struct page, lru);
 
        trace_f2fs_readpages(inode, page, nr_pages);
@@ -1495,6 +1499,7 @@ static int __write_data_page(struct page *page, bool *submitted,
        int err = 0;
        struct f2fs_io_info fio = {
                .sbi = sbi,
+               .ino = inode->i_ino,
                .type = DATA,
                .op = REQ_OP_WRITE,
                .op_flags = wbc_to_write_flags(wbc),
@@ -1566,8 +1571,11 @@ write:
                        err = do_write_data_page(&fio);
                }
        }
+
+       down_write(&F2FS_I(inode)->i_sem);
        if (F2FS_I(inode)->last_disk_size < psize)
                F2FS_I(inode)->last_disk_size = psize;
+       up_write(&F2FS_I(inode)->i_sem);
 
 done:
        if (err && err != -ENOENT)
@@ -1935,8 +1943,24 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping,
        block_t blkaddr = NULL_ADDR;
        int err = 0;
 
+       if (trace_android_fs_datawrite_start_enabled()) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_datawrite_start(inode, pos, len,
+                                                current->pid, path,
+                                                current->comm);
+       }
        trace_f2fs_write_begin(inode, pos, len, flags);
 
+       if (f2fs_is_atomic_file(inode) &&
+                       !available_free_memory(sbi, INMEM_PAGES)) {
+               err = -ENOMEM;
+               goto fail;
+       }
+
        /*
         * We should check this at this moment to avoid deadlock on inode page
         * and #0 page. The locking rule for inline_data conversion should be:
@@ -1952,7 +1976,7 @@ repeat:
         * Do not use grab_cache_page_write_begin() to avoid deadlock due to
         * wait_for_stable_page. Will wait that below with our IO control.
         */
-       page = pagecache_get_page(mapping, index,
+       page = f2fs_pagecache_get_page(mapping, index,
                                FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
        if (!page) {
                err = -ENOMEM;
@@ -2014,6 +2038,8 @@ repeat:
 fail:
        f2fs_put_page(page, 1);
        f2fs_write_failed(mapping, pos + len);
+       if (f2fs_is_atomic_file(inode))
+               drop_inmem_pages_all(sbi);
        return err;
 }
 
@@ -2024,6 +2050,7 @@ static int f2fs_write_end(struct file *file,
 {
        struct inode *inode = page->mapping->host;
 
+       trace_android_fs_datawrite_end(inode, pos, len);
        trace_f2fs_write_end(inode, pos, len, copied);
 
        /*
@@ -2082,6 +2109,29 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
 
        trace_f2fs_direct_IO_enter(inode, offset, count, rw);
 
+       if (trace_android_fs_dataread_start_enabled() &&
+           (rw == READ)) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_dataread_start(inode, offset,
+                                               count, current->pid, path,
+                                               current->comm);
+       }
+       if (trace_android_fs_datawrite_start_enabled() &&
+           (rw == WRITE)) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_datawrite_start(inode, offset, count,
+                                                current->pid, path,
+                                                current->comm);
+       }
+
        down_read(&F2FS_I(inode)->dio_rwsem[rw]);
        err = blockdev_direct_IO(iocb, inode, iter, get_data_block_dio);
        up_read(&F2FS_I(inode)->dio_rwsem[rw]);
@@ -2096,6 +2146,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
                }
        }
 
+       if (trace_android_fs_dataread_start_enabled() &&
+           (rw == READ))
+               trace_android_fs_dataread_end(inode, offset, count);
+       if (trace_android_fs_datawrite_start_enabled() &&
+           (rw == WRITE))
+               trace_android_fs_datawrite_end(inode, offset, count);
+
        trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
 
        return err;
index 87f449845f5f9a8010ee13c40548fb6620802872..ecada84252680f70770a7edeb71f89337ccdc018 100644 (file)
@@ -45,9 +45,18 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS);
        si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META);
        si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA);
+       si->ndirty_qdata = get_pages(sbi, F2FS_DIRTY_QDATA);
        si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA);
        si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE];
        si->ndirty_files = sbi->ndirty_inode[FILE_INODE];
+
+       si->nquota_files = 0;
+       if (f2fs_sb_has_quota_ino(sbi->sb)) {
+               for (i = 0; i < MAXQUOTAS; i++) {
+                       if (f2fs_qf_ino(sbi->sb, i))
+                               si->nquota_files++;
+               }
+       }
        si->ndirty_all = sbi->ndirty_inode[DIRTY_META];
        si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES);
        si->aw_cnt = atomic_read(&sbi->aw_cnt);
@@ -61,6 +70,8 @@ static void update_general_status(struct f2fs_sb_info *sbi)
                        atomic_read(&SM_I(sbi)->fcc_info->issued_flush);
                si->nr_flushing =
                        atomic_read(&SM_I(sbi)->fcc_info->issing_flush);
+               si->flush_list_empty =
+                       llist_empty(&SM_I(sbi)->fcc_info->issue_list);
        }
        if (SM_I(sbi) && SM_I(sbi)->dcc_info) {
                si->nr_discarded =
@@ -96,9 +107,9 @@ static void update_general_status(struct f2fs_sb_info *sbi)
        si->dirty_nats = NM_I(sbi)->dirty_nat_cnt;
        si->sits = MAIN_SEGS(sbi);
        si->dirty_sits = SIT_I(sbi)->dirty_sentries;
-       si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST];
+       si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID];
        si->avail_nids = NM_I(sbi)->available_nids;
-       si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST];
+       si->alloc_nids = NM_I(sbi)->nid_cnt[PREALLOC_NID];
        si->bg_gc = sbi->bg_gc;
        si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg)
                * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg)
@@ -231,14 +242,14 @@ get_cache:
        }
 
        /* free nids */
-       si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] +
-                               NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) *
+       si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID] +
+                               NM_I(sbi)->nid_cnt[PREALLOC_NID]) *
                                sizeof(struct free_nid);
        si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry);
        si->cache_mem += NM_I(sbi)->dirty_nat_cnt *
                                        sizeof(struct nat_entry_set);
        si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages);
-       for (i = 0; i <= ORPHAN_INO; i++)
+       for (i = 0; i < MAX_INO_ENTRY; i++)
                si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry);
        si->cache_mem += atomic_read(&sbi->total_ext_tree) *
                                                sizeof(struct extent_tree);
@@ -262,9 +273,10 @@ static int stat_show(struct seq_file *s, void *v)
        list_for_each_entry(si, &f2fs_stat_list, stat_list) {
                update_general_status(si->sbi);
 
-               seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n",
+               seq_printf(s, "\n=====[ partition info(%pg). #%d, %s, CP: %s]=====\n",
                        si->sbi->sb->s_bdev, i++,
-                       f2fs_readonly(si->sbi->sb) ? "RO": "RW");
+                       f2fs_readonly(si->sbi->sb) ? "RO": "RW",
+                       f2fs_cp_error(si->sbi) ? "Error": "Good");
                seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ",
                           si->sit_area_segs, si->nat_area_segs);
                seq_printf(s, "[SSA: %d] [MAIN: %d",
@@ -349,10 +361,11 @@ static int stat_show(struct seq_file *s, void *v)
                seq_printf(s, "  - Inner Struct Count: tree: %d(%d), node: %d\n",
                                si->ext_tree, si->zombie_tree, si->ext_node);
                seq_puts(s, "\nBalancing F2FS Async:\n");
-               seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), "
+               seq_printf(s, "  - IO (CP: %4d, Data: %4d, Flush: (%4d %4d %4d), "
                        "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n",
                           si->nr_wb_cp_data, si->nr_wb_data,
                           si->nr_flushing, si->nr_flushed,
+                          si->flush_list_empty,
                           si->nr_discarding, si->nr_discarded,
                           si->nr_discard_cmd, si->undiscard_blks);
                seq_printf(s, "  - inmem: %4d, atomic IO: %4d (Max. %4d), "
@@ -365,6 +378,8 @@ static int stat_show(struct seq_file *s, void *v)
                           si->ndirty_dent, si->ndirty_dirs, si->ndirty_all);
                seq_printf(s, "  - datas: %4d in files:%4d\n",
                           si->ndirty_data, si->ndirty_files);
+               seq_printf(s, "  - quota datas: %4d in quota files:%4d\n",
+                          si->ndirty_qdata, si->nquota_files);
                seq_printf(s, "  - meta: %4d in %4d\n",
                           si->ndirty_meta, si->meta_pages);
                seq_printf(s, "  - imeta: %4d\n",
index c0c933ad43c8dce1910fe18615def9c96853bacb..2d98d877c09dada99dfae36b45f85b3da75af095 100644 (file)
  */
 #include <linux/fs.h>
 #include <linux/f2fs_fs.h>
+#include <linux/sched/signal.h>
 #include "f2fs.h"
 #include "node.h"
 #include "acl.h"
 #include "xattr.h"
+#include <trace/events/f2fs.h>
 
 static unsigned long dir_blocks(struct inode *inode)
 {
@@ -847,6 +849,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
        struct f2fs_dentry_block *dentry_blk = NULL;
        struct page *dentry_page = NULL;
        struct file_ra_state *ra = &file->f_ra;
+       loff_t start_pos = ctx->pos;
        unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK);
        struct f2fs_dentry_ptr d;
        struct fscrypt_str fstr = FSTR_INIT(NULL, 0);
@@ -855,24 +858,32 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
        if (f2fs_encrypted_inode(inode)) {
                err = fscrypt_get_encryption_info(inode);
                if (err && err != -ENOKEY)
-                       return err;
+                       goto out;
 
                err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr);
                if (err < 0)
-                       return err;
+                       goto out;
        }
 
        if (f2fs_has_inline_dentry(inode)) {
                err = f2fs_read_inline_dir(file, ctx, &fstr);
-               goto out;
+               goto out_free;
        }
 
-       /* readahead for multi pages of dir */
-       if (npages - n > 1 && !ra_has_index(ra, n))
-               page_cache_sync_readahead(inode->i_mapping, ra, file, n,
+       for (; n < npages; n++, ctx->pos = n * NR_DENTRY_IN_BLOCK) {
+
+               /* allow readdir() to be interrupted */
+               if (fatal_signal_pending(current)) {
+                       err = -ERESTARTSYS;
+                       goto out_free;
+               }
+               cond_resched();
+
+               /* readahead for multi pages of dir */
+               if (npages - n > 1 && !ra_has_index(ra, n))
+                       page_cache_sync_readahead(inode->i_mapping, ra, file, n,
                                min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES));
 
-       for (; n < npages; n++) {
                dentry_page = get_lock_data_page(inode, n, false);
                if (IS_ERR(dentry_page)) {
                        err = PTR_ERR(dentry_page);
@@ -880,7 +891,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
                                err = 0;
                                continue;
                        } else {
-                               goto out;
+                               goto out_free;
                        }
                }
 
@@ -896,12 +907,13 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx)
                        break;
                }
 
-               ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK;
                kunmap(dentry_page);
                f2fs_put_page(dentry_page, 1);
        }
-out:
+out_free:
        fscrypt_fname_free_buffer(&fstr);
+out:
+       trace_f2fs_readdir(inode, start_pos, ctx->pos, err);
        return err < 0 ? err : 0;
 }
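
Editor's note: the rewritten f2fs_readdir() above folds the readahead into the per-block loop and makes the walk killable. A minimal sketch of that loop pattern in isolation (the helper name and the elided dentry emission are illustrative, not from the patch):

	/* Sketch: interruptible per-block directory walk. */
	static int walk_dir_blocks(pgoff_t n, pgoff_t npages)
	{
		for (; n < npages; n++) {
			/* let a long readdir() be killed instead of spinning */
			if (fatal_signal_pending(current))
				return -ERESTARTSYS;
			cond_resched();		/* yield between blocks */

			/* ... read block n and emit its dentries ... */
		}
		return 0;
	}
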
 
index 4b4a72f392be4be76575d9d485cce0ced2adfdfe..f4e094e816c63df79bd40f62b097147826037dae 100644 (file)
 #include <linux/bio.h>
 #include <linux/blkdev.h>
 #include <linux/quotaops.h>
-#ifdef CONFIG_F2FS_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
 #include <crypto/hash.h>
 
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_F2FS_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
 #ifdef CONFIG_F2FS_CHECK_FS
 #define f2fs_bug_on(sbi, condition)    BUG_ON(condition)
 #else
@@ -46,6 +44,8 @@
 enum {
        FAULT_KMALLOC,
        FAULT_PAGE_ALLOC,
+       FAULT_PAGE_GET,
+       FAULT_ALLOC_BIO,
        FAULT_ALLOC_NID,
        FAULT_ORPHAN,
        FAULT_BLOCK,
@@ -93,6 +93,7 @@ extern char *fault_name[FAULT_MAX];
 #define F2FS_MOUNT_GRPQUOTA            0x00100000
 #define F2FS_MOUNT_PRJQUOTA            0x00200000
 #define F2FS_MOUNT_QUOTA               0x00400000
+#define F2FS_MOUNT_INLINE_XATTR_SIZE   0x00800000
 
 #define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option)
 #define set_opt(sbi, option)   ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option)
@@ -118,6 +119,8 @@ struct f2fs_mount_info {
 #define F2FS_FEATURE_EXTRA_ATTR                0x0008
 #define F2FS_FEATURE_PRJQUOTA          0x0010
 #define F2FS_FEATURE_INODE_CHKSUM      0x0020
+#define F2FS_FEATURE_FLEXIBLE_INLINE_XATTR     0x0040
+#define F2FS_FEATURE_QUOTA_INO         0x0080
 
 #define F2FS_HAS_FEATURE(sb, mask)                                     \
        ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0)
@@ -147,7 +150,7 @@ enum {
 #define BATCHED_TRIM_BLOCKS(sbi)       \
                (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg)
 #define MAX_DISCARD_BLOCKS(sbi)                BLKS_PER_SEC(sbi)
-#define DISCARD_ISSUE_RATE             8
+#define DEF_MAX_DISCARD_REQUEST                8       /* issue 8 discards per round */
 #define DEF_MIN_DISCARD_ISSUE_TIME     50      /* 50 ms, if exists */
 #define DEF_MAX_DISCARD_ISSUE_TIME     60000   /* 60 s, if no candidates */
 #define DEF_CP_INTERVAL                        60      /* 60 secs */
@@ -158,7 +161,6 @@ struct cp_control {
        __u64 trim_start;
        __u64 trim_end;
        __u64 trim_minlen;
-       __u64 trimmed;
 };
 
 /*
@@ -177,12 +179,14 @@ enum {
        ORPHAN_INO,             /* for orphan ino list */
        APPEND_INO,             /* for append ino list */
        UPDATE_INO,             /* for update ino list */
+       FLUSH_INO,              /* for multiple device flushing */
        MAX_INO_ENTRY,          /* max. list */
 };
 
 struct ino_entry {
-       struct list_head list;  /* list head */
-       nid_t ino;              /* inode number */
+       struct list_head list;          /* list head */
+       nid_t ino;                      /* inode number */
+       unsigned int dirty_device;      /* dirty device bitmap */
 };
 
 /* for the list of inodes to be GCed */
@@ -206,10 +210,6 @@ struct discard_entry {
 #define plist_idx(blk_num)     ((blk_num) >= MAX_PLIST_NUM ?           \
                                        (MAX_PLIST_NUM - 1) : (blk_num - 1))
 
-#define P_ACTIVE       0x01
-#define P_TRIM         0x02
-#define plist_issue(tag)       (((tag) & P_ACTIVE) || ((tag) & P_TRIM))
-
 enum {
        D_PREP,
        D_SUBMIT,
@@ -241,12 +241,32 @@ struct discard_cmd {
        int error;                      /* bio error */
 };
 
+enum {
+       DPOLICY_BG,
+       DPOLICY_FORCE,
+       DPOLICY_FSTRIM,
+       DPOLICY_UMOUNT,
+       MAX_DPOLICY,
+};
+
+struct discard_policy {
+       int type;                       /* type of discard */
+       unsigned int min_interval;      /* used for candidates exist */
+       unsigned int max_interval;      /* used for candidates not exist */
+       unsigned int max_requests;      /* # of discards issued per round */
+       unsigned int io_aware_gran;     /* minimum granularity for discards not to be I/O aware */
+       bool io_aware;                  /* issue discard in idle time */
+       bool sync;                      /* submit discard with REQ_SYNC flag */
+       unsigned int granularity;       /* discard granularity */
+};
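
Editor's note: struct discard_policy parameterizes the discard thread per DPOLICY_* type. init_discard_policy() is only declared later in this diff, so the initializer below is a hedged sketch built from the defaults defined earlier (DEF_MIN/MAX_DISCARD_ISSUE_TIME, DEF_MAX_DISCARD_REQUEST); the per-field choices are assumptions, not the patch's actual policy table.

	/* Illustrative only: one plausible background-discard policy. */
	static void example_bg_policy(struct discard_policy *dpolicy)
	{
		dpolicy->type = DPOLICY_BG;
		dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;	/* 50 ms while candidates exist */
		dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;	/* back off to 60 s when idle */
		dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;	/* at most 8 discards per round */
		dpolicy->io_aware = true;				/* yield to foreground I/O */
		dpolicy->io_aware_gran = MAX_PLIST_NUM;			/* assumption: every list is I/O aware */
		dpolicy->sync = true;					/* submit with REQ_SYNC */
		dpolicy->granularity = 1;				/* assumption: default granularity */
	}
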
+
 struct discard_cmd_control {
        struct task_struct *f2fs_issue_discard; /* discard thread */
        struct list_head entry_list;            /* 4KB discard entry list */
        struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */
        unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */
        struct list_head wait_list;             /* store on-flushing entries */
+       struct list_head fstrim_list;           /* in-flight discard from fstrim */
        wait_queue_head_t discard_wait_queue;   /* waiting queue for wake-up */
        unsigned int discard_wake;              /* to wake up discard thread */
        struct mutex cmd_lock;
@@ -379,11 +399,14 @@ struct f2fs_flush_device {
 
 /* for inline stuff */
 #define DEF_INLINE_RESERVED_SIZE       1
+#define DEF_MIN_INLINE_SIZE            1
 static inline int get_extra_isize(struct inode *inode);
-#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \
-                               (CUR_ADDRS_PER_INODE(inode) - \
-                               DEF_INLINE_RESERVED_SIZE - \
-                               F2FS_INLINE_XATTR_ADDRS))
+static inline int get_inline_xattr_addrs(struct inode *inode);
+#define F2FS_INLINE_XATTR_ADDRS(inode) get_inline_xattr_addrs(inode)
+#define MAX_INLINE_DATA(inode) (sizeof(__le32) *                       \
+                               (CUR_ADDRS_PER_INODE(inode) -           \
+                               F2FS_INLINE_XATTR_ADDRS(inode) -        \
+                               DEF_INLINE_RESERVED_SIZE))
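
Editor's note: with F2FS_INLINE_XATTR_ADDRS() now per-inode, the inline-data capacity shrinks or grows with the inline xattr reservation. A worked example, assuming the long-standing on-disk constants (not shown in this hunk) DEF_ADDRS_PER_INODE == 923 and DEFAULT_INLINE_XATTR_ADDRS == 50, for an inode without extra attributes:

	MAX_INLINE_DATA = sizeof(__le32) * (923 - 50 - 1) = 4 * 872 = 3488 bytes
	/* shrinking i_inline_xattr_size to 0 would raise this to 4 * 922 = 3688 bytes */
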
 
 /* for inline dir */
 #define NR_INLINE_DENTRY(inode)        (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \
@@ -583,6 +606,7 @@ struct f2fs_inode_info {
 #endif
        struct list_head dirty_list;    /* dirty list for dirs and files */
        struct list_head gdirty_list;   /* linked in global dirty list */
+       struct list_head inmem_ilist;   /* list for inmem inodes */
        struct list_head inmem_pages;   /* inmemory pages managed by f2fs */
        struct task_struct *inmem_task; /* store inmemory task */
        struct mutex inmem_lock;        /* lock for inmemory pages */
@@ -593,6 +617,7 @@ struct f2fs_inode_info {
 
        int i_extra_isize;              /* size of extra space located in i_addr */
        kprojid_t i_projid;             /* id for project quota */
+       int i_inline_xattr_size;        /* inline xattr size */
 };
 
 static inline void get_extent_info(struct extent_info *ext,
@@ -666,10 +691,13 @@ static inline void __try_update_largest_extent(struct inode *inode,
        }
 }
 
-enum nid_list {
-       FREE_NID_LIST,
-       ALLOC_NID_LIST,
-       MAX_NID_LIST,
+/*
+ * For free nid management
+ */
+enum nid_state {
+       FREE_NID,               /* newly added to free nid list */
+       PREALLOC_NID,           /* it is preallocated */
+       MAX_NID_STATE,
 };
 
 struct f2fs_nm_info {
@@ -692,8 +720,8 @@ struct f2fs_nm_info {
 
        /* free node ids management */
        struct radix_tree_root free_nid_root;/* root of the free_nid cache */
-       struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */
-       unsigned int nid_cnt[MAX_NID_LIST];     /* the number of free node id */
+       struct list_head free_nid_list;         /* list for free nids excluding preallocated nids */
+       unsigned int nid_cnt[MAX_NID_STATE];    /* the number of free node id */
        spinlock_t nid_list_lock;       /* protect nid lists ops */
        struct mutex build_lock;        /* lock for build free nids */
        unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE];
@@ -771,6 +799,7 @@ enum {
 struct flush_cmd {
        struct completion wait;
        struct llist_node llnode;
+       nid_t ino;
        int ret;
 };
 
@@ -789,6 +818,8 @@ struct f2fs_sm_info {
        struct dirty_seglist_info *dirty_info;  /* dirty segment information */
        struct curseg_info *curseg_array;       /* active segment information */
 
+       struct rw_semaphore curseg_lock;        /* for preventing curseg change */
+
        block_t seg0_blkaddr;           /* block address of 0'th segment */
        block_t main_blkaddr;           /* start block address of main area */
        block_t ssa_blkaddr;            /* start block address of SSA area */
@@ -810,6 +841,7 @@ struct f2fs_sm_info {
        unsigned int min_ipu_util;      /* in-place-update threshold */
        unsigned int min_fsync_blocks;  /* threshold for fsync */
        unsigned int min_hot_blocks;    /* threshold for hot block allocation */
+       unsigned int min_ssr_sections;  /* threshold to trigger SSR allocation */
 
        /* for flush command control */
        struct flush_cmd_control *fcc_info;
@@ -831,6 +863,7 @@ struct f2fs_sm_info {
 enum count_type {
        F2FS_DIRTY_DENTS,
        F2FS_DIRTY_DATA,
+       F2FS_DIRTY_QDATA,
        F2FS_DIRTY_NODES,
        F2FS_DIRTY_META,
        F2FS_INMEM_PAGES,
@@ -879,6 +912,18 @@ enum need_lock_type {
        LOCK_RETRY,
 };
 
+enum cp_reason_type {
+       CP_NO_NEEDED,
+       CP_NON_REGULAR,
+       CP_HARDLINK,
+       CP_SB_NEED_CP,
+       CP_WRONG_PINO,
+       CP_NO_SPC_ROLL,
+       CP_NODE_NEED_CP,
+       CP_FASTBOOT_MODE,
+       CP_SPEC_LOG_NUM,
+};
+
 enum iostat_type {
        APP_DIRECT_IO,                  /* app direct IOs */
        APP_BUFFERED_IO,                /* app buffered IOs */
@@ -898,6 +943,7 @@ enum iostat_type {
 
 struct f2fs_io_info {
        struct f2fs_sb_info *sbi;       /* f2fs_sb_info pointer */
+       nid_t ino;              /* inode number */
        enum page_type type;    /* contains DATA/NODE/META/META_FLUSH */
        enum temp_type temp;    /* contains HOT/WARM/COLD */
        int op;                 /* contains REQ_OP_ */
@@ -942,6 +988,7 @@ enum inode_type {
        DIR_INODE,                      /* for dirty dir inode */
        FILE_INODE,                     /* for dirty regular/symlink inode */
        DIRTY_META,                     /* for all dirtied inode metadata */
+       ATOMIC_FILE,                    /* for all atomic files */
        NR_INODE_TYPE,
 };
 
@@ -1044,12 +1091,15 @@ struct f2fs_sb_info {
        loff_t max_file_blocks;                 /* max block index of file */
        int active_logs;                        /* # of active logs */
        int dir_level;                          /* directory level */
+       int inline_xattr_size;                  /* inline xattr size */
+       unsigned int trigger_ssr_threshold;     /* threshold to trigger ssr */
 
        block_t user_block_count;               /* # of user blocks */
        block_t total_valid_block_count;        /* # of valid blocks */
        block_t discard_blks;                   /* discard command candidates */
        block_t last_valid_block_count;         /* for recovery */
        block_t reserved_blocks;                /* configurable reserved blocks */
+       block_t current_reserved_blocks;        /* current reserved blocks */
 
        u32 s_next_generation;                  /* for NFS support */
 
@@ -1115,6 +1165,8 @@ struct f2fs_sb_info {
        struct list_head s_list;
        int s_ndevs;                            /* number of devices */
        struct f2fs_dev_info *devs;             /* for device list */
+       unsigned int dirty_device;              /* for checkpoint data flush */
+       spinlock_t dev_lock;                    /* protect dirty_device */
        struct mutex umount_mutex;
        unsigned int shrinker_run_no;
 
@@ -1178,8 +1230,7 @@ static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type)
 
 static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type)
 {
-       struct timespec ts = {sbi->interval_time[type], 0};
-       unsigned long interval = timespec_to_jiffies(&ts);
+       unsigned long interval = sbi->interval_time[type] * HZ;
 
        return time_after(jiffies, sbi->last_time[type] + interval);
 }
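
Editor's note: the timespec round trip is replaced by a direct seconds-to-jiffies multiply. For example, with interval_time[CP_TIME] left at DEF_CP_INTERVAL (60 s, defined earlier in this file) the check is effectively:

	/* same whole-second deadline, without building a struct timespec per call */
	return time_after(jiffies, sbi->last_time[CP_TIME] + 60 * HZ);
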
@@ -1346,6 +1397,13 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp)
        return le64_to_cpu(cp->checkpoint_ver);
 }
 
+static inline unsigned long f2fs_qf_ino(struct super_block *sb, int type)
+{
+       if (type < F2FS_MAX_QUOTAS)
+               return le32_to_cpu(F2FS_SB(sb)->raw_super->qf_ino[type]);
+       return 0;
+}
+
 static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp)
 {
        size_t crc_offset = le32_to_cpu(cp->checksum_offset);
@@ -1524,7 +1582,8 @@ static inline int inc_valid_block_count(struct f2fs_sb_info *sbi,
 
        spin_lock(&sbi->stat_lock);
        sbi->total_valid_block_count += (block_t)(*count);
-       avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks;
+       avail_user_block_count = sbi->user_block_count -
+                                       sbi->current_reserved_blocks;
        if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) {
                diff = sbi->total_valid_block_count - avail_user_block_count;
                *count -= diff;
@@ -1558,6 +1617,10 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi,
        f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count);
        f2fs_bug_on(sbi, inode->i_blocks < sectors);
        sbi->total_valid_block_count -= (block_t)count;
+       if (sbi->reserved_blocks &&
+               sbi->current_reserved_blocks < sbi->reserved_blocks)
+               sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+                                       sbi->current_reserved_blocks + count);
        spin_unlock(&sbi->stat_lock);
        f2fs_i_blocks_write(inode, count, false, true);
 }
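
Editor's note: freed blocks now refill the configurable reservation before becoming visible to users again. The invariant these hunks maintain, restated as a sketch:

	/* 0 <= current_reserved_blocks <= reserved_blocks at all times,  */
	/* and the space userspace may consume is:                        */
	avail_user_block_count = sbi->user_block_count -
					sbi->current_reserved_blocks;
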
@@ -1578,6 +1641,8 @@ static inline void inode_inc_dirty_pages(struct inode *inode)
        atomic_inc(&F2FS_I(inode)->dirty_pages);
        inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+       if (IS_NOQUOTA(inode))
+               inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
 }
 
 static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type)
@@ -1594,6 +1659,8 @@ static inline void inode_dec_dirty_pages(struct inode *inode)
        atomic_dec(&F2FS_I(inode)->dirty_pages);
        dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ?
                                F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA);
+       if (IS_NOQUOTA(inode))
+               dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_QDATA);
 }
 
 static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type)
@@ -1701,10 +1768,17 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi,
                        return ret;
        }
 
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(sbi, FAULT_BLOCK)) {
+               f2fs_show_injection_info(FAULT_BLOCK);
+               goto enospc;
+       }
+#endif
+
        spin_lock(&sbi->stat_lock);
 
        valid_block_count = sbi->total_valid_block_count + 1;
-       if (unlikely(valid_block_count + sbi->reserved_blocks >
+       if (unlikely(valid_block_count + sbi->current_reserved_blocks >
                                                sbi->user_block_count)) {
                spin_unlock(&sbi->stat_lock);
                goto enospc;
@@ -1747,6 +1821,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
 
        sbi->total_valid_node_count--;
        sbi->total_valid_block_count--;
+       if (sbi->reserved_blocks &&
+               sbi->current_reserved_blocks < sbi->reserved_blocks)
+               sbi->current_reserved_blocks++;
 
        spin_unlock(&sbi->stat_lock);
 
@@ -1793,6 +1870,19 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping,
        return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS);
 }
 
+static inline struct page *f2fs_pagecache_get_page(
+                               struct address_space *mapping, pgoff_t index,
+                               int fgp_flags, gfp_t gfp_mask)
+{
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) {
+               f2fs_show_injection_info(FAULT_PAGE_GET);
+               return NULL;
+       }
+#endif
+       return pagecache_get_page(mapping, index, fgp_flags, gfp_mask);
+}
+
 static inline void f2fs_copy_page(struct page *src, struct page *dst)
 {
        char *src_kaddr = kmap(src);
@@ -1842,15 +1932,25 @@ static inline void *f2fs_kmem_cache_alloc(struct kmem_cache *cachep,
        return entry;
 }
 
-static inline struct bio *f2fs_bio_alloc(int npages)
+static inline struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi,
+                                               int npages, bool no_fail)
 {
        struct bio *bio;
 
-       /* No failure on bio allocation */
-       bio = bio_alloc(GFP_NOIO, npages);
-       if (!bio)
-               bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
-       return bio;
+       if (no_fail) {
+               /* No failure on bio allocation */
+               bio = bio_alloc(GFP_NOIO, npages);
+               if (!bio)
+                       bio = bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
+               return bio;
+       }
+#ifdef CONFIG_F2FS_FAULT_INJECTION
+       if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
+               f2fs_show_injection_info(FAULT_ALLOC_BIO);
+               return NULL;
+       }
+#endif
+       return bio_alloc(GFP_KERNEL, npages);
 }
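
Editor's note: callers that can tolerate allocation failure pass no_fail == false and must now handle a NULL return, which the FAULT_ALLOC_BIO injection point can also force. A hedged caller sketch with an illustrative function name:

	static int example_alloc_read_bio(struct f2fs_sb_info *sbi, int npages)
	{
		struct bio *bio = f2fs_bio_alloc(sbi, npages, false);

		if (!bio)
			return -ENOMEM;	/* write paths pass no_fail == true instead */
		/* ... fill in bi_iter, add pages, submit ... */
		bio_put(bio);		/* drop the reference; nothing is submitted in this sketch */
		return 0;
	}
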
 
 static inline void f2fs_radix_tree_insert(struct radix_tree_root *root,
@@ -2160,25 +2260,20 @@ static inline int f2fs_has_inline_xattr(struct inode *inode)
 
 static inline unsigned int addrs_per_inode(struct inode *inode)
 {
-       if (f2fs_has_inline_xattr(inode))
-               return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS;
-       return CUR_ADDRS_PER_INODE(inode);
+       return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS(inode);
 }
 
-static inline void *inline_xattr_addr(struct page *page)
+static inline void *inline_xattr_addr(struct inode *inode, struct page *page)
 {
        struct f2fs_inode *ri = F2FS_INODE(page);
 
        return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE -
-                                       F2FS_INLINE_XATTR_ADDRS]);
+                                       F2FS_INLINE_XATTR_ADDRS(inode)]);
 }
 
 static inline int inline_xattr_size(struct inode *inode)
 {
-       if (f2fs_has_inline_xattr(inode))
-               return F2FS_INLINE_XATTR_ADDRS << 2;
-       else
-               return 0;
+       return get_inline_xattr_addrs(inode) * sizeof(__le32);
 }
 
 static inline int f2fs_has_inline_data(struct inode *inode)
@@ -2259,9 +2354,10 @@ static inline void clear_file(struct inode *inode, int type)
 
 static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
 {
+       bool ret;
+
        if (dsync) {
                struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-               bool ret;
 
                spin_lock(&sbi->inode_lock[DIRTY_META]);
                ret = list_empty(&F2FS_I(inode)->gdirty_list);
@@ -2272,7 +2368,12 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
                        file_keep_isize(inode) ||
                        i_size_read(inode) & PAGE_MASK)
                return false;
-       return F2FS_I(inode)->last_disk_size == i_size_read(inode);
+
+       down_read(&F2FS_I(inode)->i_sem);
+       ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
+       up_read(&F2FS_I(inode)->i_sem);
+
+       return ret;
 }
 
 static inline int f2fs_readonly(struct super_block *sb)
@@ -2322,6 +2423,12 @@ static inline int get_extra_isize(struct inode *inode)
        return F2FS_I(inode)->i_extra_isize / sizeof(__le32);
 }
 
+static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb);
+static inline int get_inline_xattr_addrs(struct inode *inode)
+{
+       return F2FS_I(inode)->i_inline_xattr_size;
+}
+
 #define get_inode_mode(i) \
        ((is_inode_flag_set(i, FI_ACL_MODE)) ? \
         (F2FS_I(i)->i_acl_mode) : ((i)->i_mode))
@@ -2450,7 +2557,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode)
  */
 int f2fs_inode_dirtied(struct inode *inode, bool sync);
 void f2fs_inode_synced(struct inode *inode);
-void f2fs_enable_quota_files(struct f2fs_sb_info *sbi);
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly);
 void f2fs_quota_off_umount(struct super_block *sb);
 int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover);
 int f2fs_sync_fs(struct super_block *sb, int sync);
@@ -2478,7 +2585,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni);
 pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs);
 int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode);
 int truncate_inode_blocks(struct inode *inode, pgoff_t from);
-int truncate_xattr_node(struct inode *inode, struct page *page);
+int truncate_xattr_node(struct inode *inode);
 int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino);
 int remove_inode_page(struct inode *inode);
 struct page *new_inode_page(struct inode *inode);
@@ -2513,19 +2620,22 @@ void destroy_node_manager_caches(void);
  */
 bool need_SSR(struct f2fs_sb_info *sbi);
 void register_inmem_page(struct inode *inode, struct page *page);
+void drop_inmem_pages_all(struct f2fs_sb_info *sbi);
 void drop_inmem_pages(struct inode *inode);
 void drop_inmem_page(struct inode *inode, struct page *page);
 int commit_inmem_pages(struct inode *inode);
 void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
 void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
-int f2fs_issue_flush(struct f2fs_sb_info *sbi);
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
 int create_flush_cmd_control(struct f2fs_sb_info *sbi);
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
 void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free);
 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr);
 bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr);
-void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new);
+void init_discard_policy(struct discard_policy *dpolicy, int discard_type,
+                                               unsigned int granularity);
 void stop_discard_thread(struct f2fs_sb_info *sbi);
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount);
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi);
 void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc);
 void release_discard_addrs(struct f2fs_sb_info *sbi);
 int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra);
@@ -2580,6 +2690,10 @@ void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
 void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type);
 void release_ino_entry(struct f2fs_sb_info *sbi, bool all);
 bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode);
+void set_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+                                       unsigned int devidx, int type);
+bool is_dirty_device(struct f2fs_sb_info *sbi, nid_t ino,
+                                       unsigned int devidx, int type);
 int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi);
 int acquire_orphan_inode(struct f2fs_sb_info *sbi);
 void release_orphan_inode(struct f2fs_sb_info *sbi);
@@ -2667,14 +2781,16 @@ struct f2fs_stat_info {
        unsigned long long hit_largest, hit_cached, hit_rbtree;
        unsigned long long hit_total, total_ext;
        int ext_tree, zombie_tree, ext_node;
-       int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta;
+       int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta;
+       int ndirty_data, ndirty_qdata;
        int inmem_pages;
-       unsigned int ndirty_dirs, ndirty_files, ndirty_all;
+       unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all;
        int nats, dirty_nats, sits, dirty_sits;
        int free_nids, avail_nids, alloc_nids;
        int total_count, utilization;
        int bg_gc, nr_wb_cp_data, nr_wb_data;
-       int nr_flushing, nr_flushed, nr_discarding, nr_discarded;
+       int nr_flushing, nr_flushed, flush_list_empty;
+       int nr_discarding, nr_discarded;
        int nr_discard_cmd;
        unsigned int undiscard_blks;
        int inline_xattr, inline_inode, inline_dir, append, update, orphans;
@@ -2949,6 +3065,7 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode)
 {
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
        file_set_encrypt(inode);
+       inode->i_flags |= S_ENCRYPTED;
 #endif
 }
 
@@ -2982,6 +3099,16 @@ static inline int f2fs_sb_has_inode_chksum(struct super_block *sb)
        return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM);
 }
 
+static inline int f2fs_sb_has_flexible_inline_xattr(struct super_block *sb)
+{
+       return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_FLEXIBLE_INLINE_XATTR);
+}
+
+static inline int f2fs_sb_has_quota_ino(struct super_block *sb)
+{
+       return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_QUOTA_INO);
+}
+
 #ifdef CONFIG_BLK_DEV_ZONED
 static inline int get_blkz_type(struct f2fs_sb_info *sbi,
                        struct block_device *bdev, block_t blkaddr)
index b8372095ba0a6e1b906b1a7ff40b46e3beb2d4c0..25038cfc921732ebcf410e6994865075d126da99 100644 (file)
@@ -53,6 +53,11 @@ static int f2fs_vm_page_mkwrite(struct vm_fault *vmf)
        struct dnode_of_data dn;
        int err;
 
+       if (unlikely(f2fs_cp_error(sbi))) {
+               err = -EIO;
+               goto err;
+       }
+
        sb_start_pagefault(inode->i_sb);
 
        f2fs_bug_on(sbi, f2fs_has_inline_data(inode));
@@ -114,6 +119,7 @@ out_sem:
 out:
        sb_end_pagefault(inode->i_sb);
        f2fs_update_time(sbi, REQ_TIME);
+err:
        return block_page_mkwrite_return(err);
 }
 
@@ -138,27 +144,29 @@ static int get_parent_ino(struct inode *inode, nid_t *pino)
        return 1;
 }
 
-static inline bool need_do_checkpoint(struct inode *inode)
+static inline enum cp_reason_type need_do_checkpoint(struct inode *inode)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
-       bool need_cp = false;
+       enum cp_reason_type cp_reason = CP_NO_NEEDED;
 
-       if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1)
-               need_cp = true;
+       if (!S_ISREG(inode->i_mode))
+               cp_reason = CP_NON_REGULAR;
+       else if (inode->i_nlink != 1)
+               cp_reason = CP_HARDLINK;
        else if (is_sbi_flag_set(sbi, SBI_NEED_CP))
-               need_cp = true;
+               cp_reason = CP_SB_NEED_CP;
        else if (file_wrong_pino(inode))
-               need_cp = true;
+               cp_reason = CP_WRONG_PINO;
        else if (!space_for_roll_forward(sbi))
-               need_cp = true;
+               cp_reason = CP_NO_SPC_ROLL;
        else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino))
-               need_cp = true;
+               cp_reason = CP_NODE_NEED_CP;
        else if (test_opt(sbi, FASTBOOT))
-               need_cp = true;
+               cp_reason = CP_FASTBOOT_MODE;
        else if (sbi->active_logs == 2)
-               need_cp = true;
+               cp_reason = CP_SPEC_LOG_NUM;
 
-       return need_cp;
+       return cp_reason;
 }
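
Editor's note: CP_NO_NEEDED, as the first enumerator, is 0, so the return value still works as a truth value while the tracepoint records the concrete reason. The f2fs_do_sync_file() hunk below uses it exactly like this:

	cp_reason = need_do_checkpoint(inode);
	if (cp_reason)		/* any non-zero reason forces a checkpoint */
		ret = f2fs_sync_fs(inode->i_sb, 1);
	...
	trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
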
 
 static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino)
@@ -193,7 +201,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        nid_t ino = inode->i_ino;
        int ret = 0;
-       bool need_cp = false;
+       enum cp_reason_type cp_reason = 0;
        struct writeback_control wbc = {
                .sync_mode = WB_SYNC_ALL,
                .nr_to_write = LONG_MAX,
@@ -212,7 +220,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end,
        clear_inode_flag(inode, FI_NEED_IPU);
 
        if (ret) {
-               trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+               trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
                return ret;
        }
 
@@ -243,10 +251,10 @@ go_write:
         * sudden-power-off.
         */
        down_read(&F2FS_I(inode)->i_sem);
-       need_cp = need_do_checkpoint(inode);
+       cp_reason = need_do_checkpoint(inode);
        up_read(&F2FS_I(inode)->i_sem);
 
-       if (need_cp) {
+       if (cp_reason) {
                /* all the dirty node pages should be flushed for POR */
                ret = f2fs_sync_fs(inode->i_sb, 1);
 
@@ -294,19 +302,24 @@ sync_nodes:
        remove_ino_entry(sbi, ino, APPEND_INO);
        clear_inode_flag(inode, FI_APPEND_WRITE);
 flush_out:
-       remove_ino_entry(sbi, ino, UPDATE_INO);
-       clear_inode_flag(inode, FI_UPDATE_WRITE);
        if (!atomic)
-               ret = f2fs_issue_flush(sbi);
+               ret = f2fs_issue_flush(sbi, inode->i_ino);
+       if (!ret) {
+               remove_ino_entry(sbi, ino, UPDATE_INO);
+               clear_inode_flag(inode, FI_UPDATE_WRITE);
+               remove_ino_entry(sbi, ino, FLUSH_INO);
+       }
        f2fs_update_time(sbi, REQ_TIME);
 out:
-       trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret);
+       trace_f2fs_sync_file_exit(inode, cp_reason, datasync, ret);
        f2fs_trace_ios(NULL, 1);
        return ret;
 }
 
 int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
 {
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(file)))))
+               return -EIO;
        return f2fs_do_sync_file(file, start, end, datasync, false);
 }
 
@@ -443,6 +456,9 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma)
        struct inode *inode = file_inode(file);
        int err;
 
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        /* we don't need to use inline_data strictly */
        err = f2fs_convert_inline_inode(inode);
        if (err)
@@ -629,6 +645,9 @@ int f2fs_truncate(struct inode *inode)
 {
        int err;
 
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                                S_ISLNK(inode->i_mode)))
                return 0;
@@ -728,6 +747,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
        int err;
        bool size_changed = false;
 
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        err = setattr_prepare(dentry, attr);
        if (err)
                return err;
@@ -780,6 +802,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
                        inode->i_mtime = inode->i_ctime = current_time(inode);
                }
 
+               down_write(&F2FS_I(inode)->i_sem);
+               F2FS_I(inode)->last_disk_size = i_size_read(inode);
+               up_write(&F2FS_I(inode)->i_sem);
+
                size_changed = true;
        }
 
@@ -850,7 +876,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end)
                err = get_dnode_of_data(&dn, pg_start, LOOKUP_NODE);
                if (err) {
                        if (err == -ENOENT) {
-                               pg_start++;
+                               pg_start = get_next_page_offset(&dn, pg_start);
                                continue;
                        }
                        return err;
@@ -1165,11 +1191,14 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        if (ret)
                goto out;
 
+       /* avoid gc operation during block exchange */
+       down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
        truncate_pagecache(inode, offset);
 
        ret = f2fs_do_collapse(inode, pg_start, pg_end);
        if (ret)
-               goto out;
+               goto out_unlock;
 
        /* write out all moved pages, if possible */
        filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX);
@@ -1181,7 +1210,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len)
        ret = truncate_blocks(inode, new_size, true);
        if (!ret)
                f2fs_i_size_write(inode, new_size);
-
+out_unlock:
+       up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
 out:
        up_write(&F2FS_I(inode)->i_mmap_sem);
        return ret;
@@ -1364,6 +1394,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
        if (ret)
                goto out;
 
+       /* avoid gc operation during block exchange */
+       down_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
+
        truncate_pagecache(inode, offset);
 
        pg_start = offset >> PAGE_SHIFT;
@@ -1391,6 +1424,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len)
 
        if (!ret)
                f2fs_i_size_write(inode, new_size);
+
+       up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
 out:
        up_write(&F2FS_I(inode)->i_mmap_sem);
        return ret;
@@ -1440,8 +1475,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset,
                new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end;
        }
 
-       if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size)
-               f2fs_i_size_write(inode, new_size);
+       if (new_size > i_size_read(inode)) {
+               if (mode & FALLOC_FL_KEEP_SIZE)
+                       file_set_keep_isize(inode);
+               else
+                       f2fs_i_size_write(inode, new_size);
+       }
 
        return err;
 }
@@ -1452,6 +1491,9 @@ static long f2fs_fallocate(struct file *file, int mode,
        struct inode *inode = file_inode(file);
        long ret = 0;
 
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        /* f2fs only support ->fallocate for regular file */
        if (!S_ISREG(inode->i_mode))
                return -EINVAL;
@@ -1485,8 +1527,6 @@ static long f2fs_fallocate(struct file *file, int mode,
        if (!ret) {
                inode->i_mtime = inode->i_ctime = current_time(inode);
                f2fs_mark_inode_dirty_sync(inode, false);
-               if (mode & FALLOC_FL_KEEP_SIZE)
-                       file_set_keep_isize(inode);
                f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
        }
 
@@ -1888,6 +1928,9 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 {
        struct inode *inode = file_inode(filp);
 
+       if (!f2fs_sb_has_crypto(inode->i_sb))
+               return -EOPNOTSUPP;
+
        f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
 
        return fscrypt_ioctl_set_policy(filp, (const void __user *)arg);
@@ -1895,6 +1938,8 @@ static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg)
 
 static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg)
 {
+       if (!f2fs_sb_has_crypto(file_inode(filp)->i_sb))
+               return -EOPNOTSUPP;
        return fscrypt_ioctl_get_policy(filp, (void __user *)arg);
 }
 
@@ -2250,9 +2295,13 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        }
 
        inode_lock(src);
+       down_write(&F2FS_I(src)->dio_rwsem[WRITE]);
        if (src != dst) {
-               if (!inode_trylock(dst)) {
-                       ret = -EBUSY;
+               ret = -EBUSY;
+               if (!inode_trylock(dst))
+                       goto out;
+               if (!down_write_trylock(&F2FS_I(dst)->dio_rwsem[WRITE])) {
+                       inode_unlock(dst);
                        goto out;
                }
        }
@@ -2312,9 +2361,12 @@ static int f2fs_move_file_range(struct file *file_in, loff_t pos_in,
        }
        f2fs_unlock_op(sbi);
 out_unlock:
-       if (src != dst)
+       if (src != dst) {
+               up_write(&F2FS_I(dst)->dio_rwsem[WRITE]);
                inode_unlock(dst);
+       }
 out:
+       up_write(&F2FS_I(src)->dio_rwsem[WRITE]);
        inode_unlock(src);
        return ret;
 }
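
Editor's note: the source's dio_rwsem is taken unconditionally and the destination's only via trylock, mirroring the existing inode_trylock(dst). Presumably this avoids an ABBA deadlock when two move_range ioctls run with src and dst swapped, at the cost of returning -EBUSY for userspace to retry; this rationale is an inference, not spelled out in the patch.

	/* Lock order (sketch): src inode -> src dio_rwsem -> dst inode (try) -> dst dio_rwsem (try) */
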
@@ -2630,6 +2682,9 @@ static int f2fs_ioc_fssetxattr(struct file *filp, unsigned long arg)
 
 long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
 {
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
+               return -EIO;
+
        switch (cmd) {
        case F2FS_IOC_GETFLAGS:
                return f2fs_ioc_getflags(filp, arg);
@@ -2687,6 +2742,9 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
        struct blk_plug plug;
        ssize_t ret;
 
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(inode))))
+               return -EIO;
+
        inode_lock(inode);
        ret = generic_write_checks(iocb, from);
        if (ret > 0) {
index bfe6a8ccc3a0207d51e0f11878de931958b699c1..5d5bba462f26390512a50c4359ebc99b3b3481dc 100644 (file)
@@ -267,16 +267,6 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno)
        return UINT_MAX - ((100 * (100 - u) * age) / (100 + u));
 }
 
-static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi,
-                                               unsigned int segno)
-{
-       unsigned int valid_blocks =
-                       get_valid_blocks(sbi, segno, true);
-
-       return IS_DATASEG(get_seg_entry(sbi, segno)->type) ?
-                               valid_blocks * 2 : valid_blocks;
-}
-
 static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
                        unsigned int segno, struct victim_sel_policy *p)
 {
@@ -285,7 +275,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi,
 
        /* alloc_mode == LFS */
        if (p->gc_mode == GC_GREEDY)
-               return get_greedy_cost(sbi, segno);
+               return get_valid_blocks(sbi, segno, true);
        else
                return get_cb_cost(sbi, segno);
 }
@@ -466,10 +456,10 @@ static int check_valid_map(struct f2fs_sb_info *sbi,
        struct seg_entry *sentry;
        int ret;
 
-       mutex_lock(&sit_i->sentry_lock);
+       down_read(&sit_i->sentry_lock);
        sentry = get_seg_entry(sbi, segno);
        ret = f2fs_test_bit(offset, sentry->cur_valid_map);
-       mutex_unlock(&sit_i->sentry_lock);
+       up_read(&sit_i->sentry_lock);
        return ret;
 }
 
@@ -608,6 +598,7 @@ static void move_data_block(struct inode *inode, block_t bidx,
 {
        struct f2fs_io_info fio = {
                .sbi = F2FS_I_SB(inode),
+               .ino = inode->i_ino,
                .type = DATA,
                .temp = COLD,
                .op = REQ_OP_READ,
@@ -659,8 +650,8 @@ static void move_data_block(struct inode *inode, block_t bidx,
        allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr,
                                        &sum, CURSEG_COLD_DATA, NULL, false);
 
-       fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr,
-                                       FGP_LOCK | FGP_CREAT, GFP_NOFS);
+       fio.encrypted_page = f2fs_pagecache_get_page(META_MAPPING(fio.sbi),
+                               newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS);
        if (!fio.encrypted_page) {
                err = -ENOMEM;
                goto recover_block;
@@ -738,6 +729,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type,
        } else {
                struct f2fs_io_info fio = {
                        .sbi = F2FS_I_SB(inode),
+                       .ino = inode->i_ino,
                        .type = DATA,
                        .temp = COLD,
                        .op = REQ_OP_WRITE,
@@ -840,10 +832,17 @@ next_step:
                                continue;
                        }
 
+                       if (!down_write_trylock(
+                               &F2FS_I(inode)->dio_rwsem[WRITE])) {
+                               iput(inode);
+                               continue;
+                       }
+
                        start_bidx = start_bidx_of_node(nofs, inode);
                        data_page = get_read_data_page(inode,
                                        start_bidx + ofs_in_node, REQ_RAHEAD,
                                        true);
+                       up_write(&F2FS_I(inode)->dio_rwsem[WRITE]);
                        if (IS_ERR(data_page)) {
                                iput(inode);
                                continue;
@@ -901,10 +900,10 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim,
        struct sit_info *sit_i = SIT_I(sbi);
        int ret;
 
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
        ret = DIRTY_I(sbi)->v_ops->get_victim(sbi, victim, gc_type,
                                              NO_CHECK_TYPE, LFS);
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
        return ret;
 }
 
@@ -952,8 +951,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
                /*
                 * this is to avoid deadlock:
                 * - lock_page(sum_page)         - f2fs_replace_block
-                *  - check_valid_map()            - mutex_lock(sentry_lock)
-                *   - mutex_lock(sentry_lock)     - change_curseg()
+                *  - check_valid_map()            - down_write(sentry_lock)
+                *   - down_read(sentry_lock)     - change_curseg()
                 *                                  - lock_page(sum_page)
                 */
                if (type == SUM_TYPE_NODE)
index 8322e4e7bb3fc432aabc8067b7ab459274c6d326..364114ad21477fa7dccdbd219d66a0e6edd8ecde 100644 (file)
@@ -13,6 +13,7 @@
 
 #include "f2fs.h"
 #include "node.h"
+#include <trace/events/android_fs.h>
 
 bool f2fs_may_inline_data(struct inode *inode)
 {
@@ -85,14 +86,29 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
 {
        struct page *ipage;
 
+       if (trace_android_fs_dataread_start_enabled()) {
+               char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+               path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   inode);
+               trace_android_fs_dataread_start(inode, page_offset(page),
+                                               PAGE_SIZE, current->pid,
+                                               path, current->comm);
+       }
+
        ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino);
        if (IS_ERR(ipage)) {
+               trace_android_fs_dataread_end(inode, page_offset(page),
+                                             PAGE_SIZE);
                unlock_page(page);
                return PTR_ERR(ipage);
        }
 
        if (!f2fs_has_inline_data(inode)) {
                f2fs_put_page(ipage, 1);
+               trace_android_fs_dataread_end(inode, page_offset(page),
+                                             PAGE_SIZE);
                return -EAGAIN;
        }
 
@@ -104,6 +120,8 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page)
        if (!PageUptodate(page))
                SetPageUptodate(page);
        f2fs_put_page(ipage, 1);
+       trace_android_fs_dataread_end(inode, page_offset(page),
+                                     PAGE_SIZE);
        unlock_page(page);
        return 0;
 }
@@ -112,6 +130,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page)
 {
        struct f2fs_io_info fio = {
                .sbi = F2FS_I_SB(dn->inode),
+               .ino = dn->inode->i_ino,
                .type = DATA,
                .op = REQ_OP_WRITE,
                .op_flags = REQ_SYNC | REQ_PRIO,
index 50c88e37ed668106b20cf4993c4bff6b11f95009..b4c4f2b2530404d5dc5b60bafab86a910c1a7879 100644 (file)
@@ -43,8 +43,11 @@ void f2fs_set_inode_flags(struct inode *inode)
                new_fl |= S_NOATIME;
        if (flags & FS_DIRSYNC_FL)
                new_fl |= S_DIRSYNC;
+       if (f2fs_encrypted_inode(inode))
+               new_fl |= S_ENCRYPTED;
        inode_set_flags(inode, new_fl,
-                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
+                       S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC|
+                       S_ENCRYPTED);
 }
 
 static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri)
@@ -232,6 +235,23 @@ static int do_read_inode(struct inode *inode)
        fi->i_extra_isize = f2fs_has_extra_attr(inode) ?
                                        le16_to_cpu(ri->i_extra_isize) : 0;
 
+       if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+               f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+               fi->i_inline_xattr_size = le16_to_cpu(ri->i_inline_xattr_size);
+       } else if (f2fs_has_inline_xattr(inode) ||
+                               f2fs_has_inline_dentry(inode)) {
+               fi->i_inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+       } else {
+
+               /*
+                * Previous inline data or directory always reserved 200 bytes
+                * in inode layout, even if inline_xattr is disabled. In order
+                * to keep inline_dentry's structure for backward compatibility,
+                * we get the space back only from inline_data.
+                */
+               fi->i_inline_xattr_size = 0;
+       }
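
Editor's note: the branches above reduce to a small decision table for the per-inode reservation (DEFAULT_INLINE_XATTR_ADDRS is the legacy 200-byte reservation mentioned in the comment; the exact constant is not shown in this hunk):

	/* flexible_inline_xattr feature set    -> ri->i_inline_xattr_size (read from disk)
	 * legacy inline xattr or inline dentry -> DEFAULT_INLINE_XATTR_ADDRS
	 * neither                              -> 0 (space returned to inline data)
	 */
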
+
        /* check data exist */
        if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode))
                __recover_inline_status(inode, node_page);
@@ -384,6 +404,10 @@ int update_inode(struct inode *inode, struct page *node_page)
        if (f2fs_has_extra_attr(inode)) {
                ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize);
 
+               if (f2fs_sb_has_flexible_inline_xattr(F2FS_I_SB(inode)->sb))
+                       ri->i_inline_xattr_size =
+                               cpu_to_le16(F2FS_I(inode)->i_inline_xattr_size);
+
                if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) &&
                        F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize,
                                                                i_projid)) {
@@ -480,6 +504,7 @@ void f2fs_evict_inode(struct inode *inode)
 
        remove_ino_entry(sbi, inode->i_ino, APPEND_INO);
        remove_ino_entry(sbi, inode->i_ino, UPDATE_INO);
+       remove_ino_entry(sbi, inode->i_ino, FLUSH_INO);
 
        sb_start_intwrite(inode->i_sb);
        set_inode_flag(inode, FI_NO_ALLOC);
@@ -519,8 +544,10 @@ no_delete:
        stat_dec_inline_dir(inode);
        stat_dec_inline_inode(inode);
 
-       if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG))
+       if (likely(!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)))
                f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE));
+       else
+               f2fs_inode_synced(inode);
 
        /* ino == 0, if f2fs_new_inode() failed */
        if (inode->i_ino)
index a4dab98c4b7ba7c2d2b69086e5b439c36d323382..28bdf8828e73cdaa785ff988c94d61aead3af547 100644 (file)
@@ -29,6 +29,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
        nid_t ino;
        struct inode *inode;
        bool nid_free = false;
+       int xattr_size = 0;
        int err;
 
        inode = new_inode(dir->i_sb);
@@ -86,11 +87,23 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
 
        if (test_opt(sbi, INLINE_XATTR))
                set_inode_flag(inode, FI_INLINE_XATTR);
+
        if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
                set_inode_flag(inode, FI_INLINE_DATA);
        if (f2fs_may_inline_dentry(inode))
                set_inode_flag(inode, FI_INLINE_DENTRY);
 
+       if (f2fs_sb_has_flexible_inline_xattr(sbi->sb)) {
+               f2fs_bug_on(sbi, !f2fs_has_extra_attr(inode));
+               if (f2fs_has_inline_xattr(inode))
+                       xattr_size = sbi->inline_xattr_size;
+               /* Otherwise, will be 0 */
+       } else if (f2fs_has_inline_xattr(inode) ||
+                               f2fs_has_inline_dentry(inode)) {
+               xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
+       }
+       F2FS_I(inode)->i_inline_xattr_size = xattr_size;
+
        f2fs_init_extent_tree(inode, NULL);
 
        stat_inc_inline_xattr(inode);
@@ -177,6 +190,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
        nid_t ino = 0;
        int err;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        err = dquot_initialize(dir);
        if (err)
                return err;
@@ -221,6 +237,9 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir,
        struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
        int err;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        if (f2fs_encrypted_inode(dir) &&
                        !fscrypt_has_permitted_context(dir, inode))
                return -EPERM;
@@ -331,12 +350,15 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
        struct inode *inode = NULL;
        struct f2fs_dir_entry *de;
        struct page *page;
-       nid_t ino;
+       struct dentry *new;
+       nid_t ino = -1;
        int err = 0;
        unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir));
 
+       trace_f2fs_lookup_start(dir, dentry, flags);
+
        if (f2fs_encrypted_inode(dir)) {
-               int res = fscrypt_get_encryption_info(dir);
+               err = fscrypt_get_encryption_info(dir);
 
                /*
                 * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is
@@ -346,18 +368,22 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
                if (fscrypt_has_encryption_key(dir))
                        fscrypt_set_encrypted_dentry(dentry);
                fscrypt_set_d_op(dentry);
-               if (res && res != -ENOKEY)
-                       return ERR_PTR(res);
+               if (err && err != -ENOKEY)
+                       goto out;
        }
 
-       if (dentry->d_name.len > F2FS_NAME_LEN)
-               return ERR_PTR(-ENAMETOOLONG);
+       if (dentry->d_name.len > F2FS_NAME_LEN) {
+               err = -ENAMETOOLONG;
+               goto out;
+       }
 
        de = f2fs_find_entry(dir, &dentry->d_name, &page);
        if (!de) {
-               if (IS_ERR(page))
-                       return (struct dentry *)page;
-               return d_splice_alias(inode, dentry);
+               if (IS_ERR(page)) {
+                       err = PTR_ERR(page);
+                       goto out;
+               }
+               goto out_splice;
        }
 
        ino = le32_to_cpu(de->ino);
@@ -365,19 +391,21 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
        f2fs_put_page(page, 0);
 
        inode = f2fs_iget(dir->i_sb, ino);
-       if (IS_ERR(inode))
-               return ERR_CAST(inode);
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               goto out;
+       }
 
        if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) {
                err = __recover_dot_dentries(dir, root_ino);
                if (err)
-                       goto err_out;
+                       goto out_iput;
        }
 
        if (f2fs_has_inline_dots(inode)) {
                err = __recover_dot_dentries(inode, dir->i_ino);
                if (err)
-                       goto err_out;
+                       goto out_iput;
        }
        if (f2fs_encrypted_inode(dir) &&
            (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) &&
@@ -386,12 +414,18 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry,
                         "Inconsistent encryption contexts: %lu/%lu",
                         dir->i_ino, inode->i_ino);
                err = -EPERM;
-               goto err_out;
+               goto out_iput;
        }
-       return d_splice_alias(inode, dentry);
-
-err_out:
+out_splice:
+       new = d_splice_alias(inode, dentry);
+       if (IS_ERR(new))
+               err = PTR_ERR(new);
+       trace_f2fs_lookup_end(dir, dentry, ino, err);
+       return new;
+out_iput:
        iput(inode);
+out:
+       trace_f2fs_lookup_end(dir, dentry, ino, err);
        return ERR_PTR(err);
 }
 
@@ -405,7 +439,13 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry)
 
        trace_f2fs_unlink_enter(dir, dentry);
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        err = dquot_initialize(dir);
+       if (err)
+               return err;
+       err = dquot_initialize(inode);
        if (err)
                return err;
 
@@ -460,6 +500,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry,
        struct fscrypt_symlink_data *sd = NULL;
        int err;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        if (f2fs_encrypted_inode(dir)) {
                err = fscrypt_get_encryption_info(dir);
                if (err)
@@ -566,6 +609,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
        struct inode *inode;
        int err;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        err = dquot_initialize(dir);
        if (err)
                return err;
@@ -618,6 +664,9 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry,
        struct inode *inode;
        int err = 0;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        err = dquot_initialize(dir);
        if (err)
                return err;
@@ -712,6 +761,9 @@ out:
 
 static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 {
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+               return -EIO;
+
        if (f2fs_encrypted_inode(dir)) {
                int err = fscrypt_get_encryption_info(dir);
                if (err)
@@ -723,6 +775,9 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
 
 static int f2fs_create_whiteout(struct inode *dir, struct inode **whiteout)
 {
+       if (unlikely(f2fs_cp_error(F2FS_I_SB(dir))))
+               return -EIO;
+
        return __f2fs_tmpfile(dir, NULL, S_IFCHR | WHITEOUT_MODE, whiteout);
 }
 
@@ -742,6 +797,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
        bool is_old_inline = f2fs_has_inline_dentry(old_dir);
        int err = -ENOENT;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        if ((f2fs_encrypted_inode(old_dir) &&
                        !fscrypt_has_encryption_key(old_dir)) ||
                        (f2fs_encrypted_inode(new_dir) &&
@@ -767,6 +825,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (err)
                goto out;
 
+       if (new_inode) {
+               err = dquot_initialize(new_inode);
+               if (err)
+                       goto out;
+       }
+
        old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page);
        if (!old_entry) {
                if (IS_ERR(old_page))
@@ -935,6 +999,9 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry,
        int old_nlink = 0, new_nlink = 0;
        int err = -ENOENT;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return -EIO;
+
        if ((f2fs_encrypted_inode(old_dir) &&
                        !fscrypt_has_encryption_key(old_dir)) ||
                        (f2fs_encrypted_inode(new_dir) &&
index fca87835a1da3f19a36a651f71493397e0fd7f8c..fe1fc662af2a8cf78516ae65b8f95f76997efa28 100644 (file)
@@ -46,7 +46,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
         * give 25%, 25%, 50%, 50%, 50% memory for each components respectively
         */
        if (type == FREE_NIDS) {
-               mem_size = (nm_i->nid_cnt[FREE_NID_LIST] *
+               mem_size = (nm_i->nid_cnt[FREE_NID] *
                                sizeof(struct free_nid)) >> PAGE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2);
        } else if (type == NAT_ENTRIES) {
@@ -63,7 +63,7 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
        } else if (type == INO_ENTRIES) {
                int i;
 
-               for (i = 0; i <= UPDATE_INO; i++)
+               for (i = 0; i < MAX_INO_ENTRY; i++)
                        mem_size += sbi->im[i].ino_num *
                                                sizeof(struct ino_entry);
                mem_size >>= PAGE_SHIFT;
@@ -74,6 +74,10 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type)
                                atomic_read(&sbi->total_ext_node) *
                                sizeof(struct extent_node)) >> PAGE_SHIFT;
                res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1);
+       } else if (type == INMEM_PAGES) {
+               /* it allows 20% / total_ram for inmemory pages */
+               mem_size = get_pages(sbi, F2FS_INMEM_PAGES);
+               res = mem_size < (val.totalram / 5);
        } else {
                if (!sbi->sb->s_bdi->wb.dirty_exceeded)
                        return true;
@@ -134,6 +138,44 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid)
        return dst_page;
 }
 
+static struct nat_entry *__alloc_nat_entry(nid_t nid, bool no_fail)
+{
+       struct nat_entry *new;
+
+       if (no_fail)
+               new = f2fs_kmem_cache_alloc(nat_entry_slab,
+                                               GFP_NOFS | __GFP_ZERO);
+       else
+               new = kmem_cache_alloc(nat_entry_slab,
+                                               GFP_NOFS | __GFP_ZERO);
+       if (new) {
+               nat_set_nid(new, nid);
+               nat_reset_flag(new);
+       }
+       return new;
+}
+
+static void __free_nat_entry(struct nat_entry *e)
+{
+       kmem_cache_free(nat_entry_slab, e);
+}
+
+/* must be locked by nat_tree_lock */
+static struct nat_entry *__init_nat_entry(struct f2fs_nm_info *nm_i,
+       struct nat_entry *ne, struct f2fs_nat_entry *raw_ne, bool no_fail)
+{
+       if (no_fail)
+               f2fs_radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne);
+       else if (radix_tree_insert(&nm_i->nat_root, nat_get_nid(ne), ne))
+               return NULL;
+
+       if (raw_ne)
+               node_info_from_raw_nat(&ne->ni, raw_ne);
+       list_add_tail(&ne->list, &nm_i->nat_entries);
+       nm_i->nat_cnt++;
+       return ne;
+}
+
 static struct nat_entry *__lookup_nat_cache(struct f2fs_nm_info *nm_i, nid_t n)
 {
        return radix_tree_lookup(&nm_i->nat_root, n);
@@ -150,7 +192,7 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e)
        list_del(&e->list);
        radix_tree_delete(&nm_i->nat_root, nat_get_nid(e));
        nm_i->nat_cnt--;
-       kmem_cache_free(nat_entry_slab, e);
+       __free_nat_entry(e);
 }
 
 static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i,
@@ -246,49 +288,29 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino)
        return need_update;
 }
 
-static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid,
-                                                               bool no_fail)
-{
-       struct nat_entry *new;
-
-       if (no_fail) {
-               new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
-               f2fs_radix_tree_insert(&nm_i->nat_root, nid, new);
-       } else {
-               new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS);
-               if (!new)
-                       return NULL;
-               if (radix_tree_insert(&nm_i->nat_root, nid, new)) {
-                       kmem_cache_free(nat_entry_slab, new);
-                       return NULL;
-               }
-       }
-
-       memset(new, 0, sizeof(struct nat_entry));
-       nat_set_nid(new, nid);
-       nat_reset_flag(new);
-       list_add_tail(&new->list, &nm_i->nat_entries);
-       nm_i->nat_cnt++;
-       return new;
-}
-
+/* must be locked by nat_tree_lock */
 static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid,
                                                struct f2fs_nat_entry *ne)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
-       struct nat_entry *e;
+       struct nat_entry *new, *e;
 
+       new = __alloc_nat_entry(nid, false);
+       if (!new)
+               return;
+
+       down_write(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, nid);
-       if (!e) {
-               e = grab_nat_entry(nm_i, nid, false);
-               if (e)
-                       node_info_from_raw_nat(&e->ni, ne);
-       } else {
+       if (!e)
+               e = __init_nat_entry(nm_i, new, ne, false);
+       else
                f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) ||
                                nat_get_blkaddr(e) !=
                                        le32_to_cpu(ne->block_addr) ||
                                nat_get_version(e) != ne->version);
-       }
+       up_write(&nm_i->nat_tree_lock);
+       if (e != new)
+               __free_nat_entry(new);
 }
 
 static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
@@ -296,11 +318,12 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct nat_entry *e;
+       struct nat_entry *new = __alloc_nat_entry(ni->nid, true);
 
        down_write(&nm_i->nat_tree_lock);
        e = __lookup_nat_cache(nm_i, ni->nid);
        if (!e) {
-               e = grab_nat_entry(nm_i, ni->nid, true);
+               e = __init_nat_entry(nm_i, new, NULL, true);
                copy_node_info(&e->ni, ni);
                f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR);
        } else if (new_blkaddr == NEW_ADDR) {
@@ -312,6 +335,9 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
                copy_node_info(&e->ni, ni);
                f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR);
        }
+       /* let's free early to reduce memory consumption */
+       if (e != new)
+               __free_nat_entry(new);
 
        /* sanity check */
        f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr);
@@ -327,10 +353,6 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni,
        if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) {
                unsigned char version = nat_get_version(e);
                nat_set_version(e, inc_node_version(version));
-
-               /* in order to reuse the nid */
-               if (nm_i->next_scan_nid > ni->nid)
-                       nm_i->next_scan_nid = ni->nid;
        }
 
        /* change address */
@@ -424,9 +446,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni)
        f2fs_put_page(page, 1);
 cache:
        /* cache nat entry */
-       down_write(&nm_i->nat_tree_lock);
        cache_nat_entry(sbi, nid, &ne);
-       up_write(&nm_i->nat_tree_lock);
 }
 
 /*
@@ -962,7 +982,8 @@ fail:
        return err > 0 ? 0 : err;
 }
 
-int truncate_xattr_node(struct inode *inode, struct page *page)
+/* caller must lock inode page */
+int truncate_xattr_node(struct inode *inode)
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        nid_t nid = F2FS_I(inode)->i_xattr_nid;
@@ -978,10 +999,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page)
 
        f2fs_i_xnid_write(inode, 0);
 
-       set_new_dnode(&dn, inode, page, npage, nid);
-
-       if (page)
-               dn.inode_page_locked = true;
+       set_new_dnode(&dn, inode, NULL, npage, nid);
        truncate_node(&dn);
        return 0;
 }
@@ -1000,7 +1018,7 @@ int remove_inode_page(struct inode *inode)
        if (err)
                return err;
 
-       err = truncate_xattr_node(inode, dn.inode_page);
+       err = truncate_xattr_node(inode);
        if (err) {
                f2fs_put_dnode(&dn);
                return err;
@@ -1220,7 +1238,8 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino)
        if (!inode)
                return;
 
-       page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0);
+       page = f2fs_pagecache_get_page(inode->i_mapping, 0,
+                                       FGP_LOCK|FGP_NOWAIT, 0);
        if (!page)
                goto iput_out;
 
@@ -1244,37 +1263,6 @@ iput_out:
        iput(inode);
 }
 
-void move_node_page(struct page *node_page, int gc_type)
-{
-       if (gc_type == FG_GC) {
-               struct f2fs_sb_info *sbi = F2FS_P_SB(node_page);
-               struct writeback_control wbc = {
-                       .sync_mode = WB_SYNC_ALL,
-                       .nr_to_write = 1,
-                       .for_reclaim = 0,
-               };
-
-               set_page_dirty(node_page);
-               f2fs_wait_on_page_writeback(node_page, NODE, true);
-
-               f2fs_bug_on(sbi, PageWriteback(node_page));
-               if (!clear_page_dirty_for_io(node_page))
-                       goto out_page;
-
-               if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc))
-                       unlock_page(node_page);
-               goto release_page;
-       } else {
-               /* set page dirty and write it */
-               if (!PageWriteback(node_page))
-                       set_page_dirty(node_page);
-       }
-out_page:
-       unlock_page(node_page);
-release_page:
-       f2fs_put_page(node_page, 0);
-}
-
 static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino)
 {
        pgoff_t index, end;
@@ -1344,6 +1332,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
        struct node_info ni;
        struct f2fs_io_info fio = {
                .sbi = sbi,
+               .ino = ino_of_node(page),
                .type = NODE,
                .op = REQ_OP_WRITE,
                .op_flags = wbc_to_write_flags(wbc),
@@ -1416,6 +1405,37 @@ redirty_out:
        return AOP_WRITEPAGE_ACTIVATE;
 }
 
+void move_node_page(struct page *node_page, int gc_type)
+{
+       if (gc_type == FG_GC) {
+               struct writeback_control wbc = {
+                       .sync_mode = WB_SYNC_ALL,
+                       .nr_to_write = 1,
+                       .for_reclaim = 0,
+               };
+
+               set_page_dirty(node_page);
+               f2fs_wait_on_page_writeback(node_page, NODE, true);
+
+               f2fs_bug_on(F2FS_P_SB(node_page), PageWriteback(node_page));
+               if (!clear_page_dirty_for_io(node_page))
+                       goto out_page;
+
+               if (__write_node_page(node_page, false, NULL,
+                                       &wbc, false, FS_GC_NODE_IO))
+                       unlock_page(node_page);
+               goto release_page;
+       } else {
+               /* set page dirty and write it */
+               if (!PageWriteback(node_page))
+                       set_page_dirty(node_page);
+       }
+out_page:
+       unlock_page(node_page);
+release_page:
+       f2fs_put_page(node_page, 0);
+}
+
 static int f2fs_write_node_page(struct page *page,
                                struct writeback_control *wbc)
 {
@@ -1761,35 +1781,54 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i,
        return radix_tree_lookup(&nm_i->free_nid_root, n);
 }
 
-static int __insert_nid_to_list(struct f2fs_sb_info *sbi,
-                       struct free_nid *i, enum nid_list list, bool new)
+static int __insert_free_nid(struct f2fs_sb_info *sbi,
+                       struct free_nid *i, enum nid_state state)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-       if (new) {
-               int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
-               if (err)
-                       return err;
-       }
+       int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i);
+       if (err)
+               return err;
 
-       f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
-                                               i->state != NID_ALLOC);
-       nm_i->nid_cnt[list]++;
-       list_add_tail(&i->list, &nm_i->nid_list[list]);
+       f2fs_bug_on(sbi, state != i->state);
+       nm_i->nid_cnt[state]++;
+       if (state == FREE_NID)
+               list_add_tail(&i->list, &nm_i->free_nid_list);
        return 0;
 }
 
-static void __remove_nid_from_list(struct f2fs_sb_info *sbi,
-                       struct free_nid *i, enum nid_list list, bool reuse)
+static void __remove_free_nid(struct f2fs_sb_info *sbi,
+                       struct free_nid *i, enum nid_state state)
+{
+       struct f2fs_nm_info *nm_i = NM_I(sbi);
+
+       f2fs_bug_on(sbi, state != i->state);
+       nm_i->nid_cnt[state]--;
+       if (state == FREE_NID)
+               list_del(&i->list);
+       radix_tree_delete(&nm_i->free_nid_root, i->nid);
+}
+
+static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i,
+                       enum nid_state org_state, enum nid_state dst_state)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
 
-       f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW :
-                                               i->state != NID_ALLOC);
-       nm_i->nid_cnt[list]--;
-       list_del(&i->list);
-       if (!reuse)
-               radix_tree_delete(&nm_i->free_nid_root, i->nid);
+       f2fs_bug_on(sbi, org_state != i->state);
+       i->state = dst_state;
+       nm_i->nid_cnt[org_state]--;
+       nm_i->nid_cnt[dst_state]++;
+
+       switch (dst_state) {
+       case PREALLOC_NID:
+               list_del(&i->list);
+               break;
+       case FREE_NID:
+               list_add_tail(&i->list, &nm_i->free_nid_list);
+               break;
+       default:
+               BUG_ON(1);
+       }
 }
 
 /* return if the nid is recognized as free */
@@ -1807,7 +1846,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
        i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS);
        i->nid = nid;
-       i->state = NID_NEW;
+       i->state = FREE_NID;
 
        if (radix_tree_preload(GFP_NOFS))
                goto err;
@@ -1820,7 +1859,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
                 *  - f2fs_create
                 *   - f2fs_new_inode
                 *    - alloc_nid
-                *     - __insert_nid_to_list(ALLOC_NID_LIST)
+                *     - __insert_nid_to_list(PREALLOC_NID)
                 *                     - f2fs_balance_fs_bg
                 *                      - build_free_nids
                 *                       - __build_free_nids
@@ -1833,8 +1872,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
                 *     - new_node_page
                 *      - set_node_addr
                 *  - alloc_nid_done
-                *   - __remove_nid_from_list(ALLOC_NID_LIST)
-                *                         - __insert_nid_to_list(FREE_NID_LIST)
+                *   - __remove_nid_from_list(PREALLOC_NID)
+                *                         - __insert_nid_to_list(FREE_NID)
                 */
                ne = __lookup_nat_cache(nm_i, nid);
                if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) ||
@@ -1843,13 +1882,13 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build)
 
                e = __lookup_free_nid_list(nm_i, nid);
                if (e) {
-                       if (e->state == NID_NEW)
+                       if (e->state == FREE_NID)
                                ret = true;
                        goto err_out;
                }
        }
        ret = true;
-       err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true);
+       err = __insert_free_nid(sbi, i, FREE_NID);
 err_out:
        spin_unlock(&nm_i->nid_list_lock);
        radix_tree_preload_end();
@@ -1867,8 +1906,8 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid)
 
        spin_lock(&nm_i->nid_list_lock);
        i = __lookup_free_nid_list(nm_i, nid);
-       if (i && i->state == NID_NEW) {
-               __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+       if (i && i->state == FREE_NID) {
+               __remove_free_nid(sbi, i, FREE_NID);
                need_free = true;
        }
        spin_unlock(&nm_i->nid_list_lock);
@@ -1887,15 +1926,18 @@ static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid,
        if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap))
                return;
 
-       if (set)
+       if (set) {
+               if (test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+                       return;
                __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
-       else
-               __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
-
-       if (set)
                nm_i->free_nid_count[nat_ofs]++;
-       else if (!build)
-               nm_i->free_nid_count[nat_ofs]--;
+       } else {
+               if (!test_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]))
+                       return;
+               __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]);
+               if (!build)
+                       nm_i->free_nid_count[nat_ofs]--;
+       }
 }
 
 static void scan_nat_page(struct f2fs_sb_info *sbi,
@@ -1930,12 +1972,32 @@ static void scan_nat_page(struct f2fs_sb_info *sbi,
        }
 }
 
-static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+static void scan_curseg_cache(struct f2fs_sb_info *sbi)
 {
-       struct f2fs_nm_info *nm_i = NM_I(sbi);
        struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
        struct f2fs_journal *journal = curseg->journal;
+       int i;
+
+       down_read(&curseg->journal_rwsem);
+       for (i = 0; i < nats_in_cursum(journal); i++) {
+               block_t addr;
+               nid_t nid;
+
+               addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
+               nid = le32_to_cpu(nid_in_journal(journal, i));
+               if (addr == NULL_ADDR)
+                       add_free_nid(sbi, nid, true);
+               else
+                       remove_free_nid(sbi, nid);
+       }
+       up_read(&curseg->journal_rwsem);
+}
+
+static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
+{
+       struct f2fs_nm_info *nm_i = NM_I(sbi);
        unsigned int i, idx;
+       nid_t nid;
 
        down_read(&nm_i->nat_tree_lock);
 
@@ -1945,40 +2007,27 @@ static void scan_free_nid_bits(struct f2fs_sb_info *sbi)
                if (!nm_i->free_nid_count[i])
                        continue;
                for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) {
-                       nid_t nid;
-
-                       if (!test_bit_le(idx, nm_i->free_nid_bitmap[i]))
-                               continue;
+                       idx = find_next_bit_le(nm_i->free_nid_bitmap[i],
+                                               NAT_ENTRY_PER_BLOCK, idx);
+                       if (idx >= NAT_ENTRY_PER_BLOCK)
+                               break;
 
                        nid = i * NAT_ENTRY_PER_BLOCK + idx;
                        add_free_nid(sbi, nid, true);
 
-                       if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS)
+                       if (nm_i->nid_cnt[FREE_NID] >= MAX_FREE_NIDS)
                                goto out;
                }
        }
 out:
-       down_read(&curseg->journal_rwsem);
-       for (i = 0; i < nats_in_cursum(journal); i++) {
-               block_t addr;
-               nid_t nid;
+       scan_curseg_cache(sbi);
 
-               addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
-               nid = le32_to_cpu(nid_in_journal(journal, i));
-               if (addr == NULL_ADDR)
-                       add_free_nid(sbi, nid, true);
-               else
-                       remove_free_nid(sbi, nid);
-       }
-       up_read(&curseg->journal_rwsem);
        up_read(&nm_i->nat_tree_lock);
 }
 
 static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
 {
        struct f2fs_nm_info *nm_i = NM_I(sbi);
-       struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA);
-       struct f2fs_journal *journal = curseg->journal;
        int i = 0;
        nid_t nid = nm_i->next_scan_nid;
 
@@ -1986,7 +2035,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
                nid = 0;
 
        /* Enough entries */
-       if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK)
+       if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
                return;
 
        if (!sync && !available_free_memory(sbi, FREE_NIDS))
@@ -1996,7 +2045,7 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
                /* try to find free nids in free_nid_bitmap */
                scan_free_nid_bits(sbi);
 
-               if (nm_i->nid_cnt[FREE_NID_LIST])
+               if (nm_i->nid_cnt[FREE_NID] >= NAT_ENTRY_PER_BLOCK)
                        return;
        }
 
@@ -2024,18 +2073,8 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount)
        nm_i->next_scan_nid = nid;
 
        /* find free nids from current sum_pages */
-       down_read(&curseg->journal_rwsem);
-       for (i = 0; i < nats_in_cursum(journal); i++) {
-               block_t addr;
+       scan_curseg_cache(sbi);
 
-               addr = le32_to_cpu(nat_in_journal(journal, i).block_addr);
-               nid = le32_to_cpu(nid_in_journal(journal, i));
-               if (addr == NULL_ADDR)
-                       add_free_nid(sbi, nid, true);
-               else
-                       remove_free_nid(sbi, nid);
-       }
-       up_read(&curseg->journal_rwsem);
        up_read(&nm_i->nat_tree_lock);
 
        ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid),
@@ -2073,15 +2112,13 @@ retry:
        }
 
        /* We should not use stale free nids created by build_free_nids */
-       if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) {
-               f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST]));
-               i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
+       if (nm_i->nid_cnt[FREE_NID] && !on_build_free_nids(nm_i)) {
+               f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list));
+               i = list_first_entry(&nm_i->free_nid_list,
                                        struct free_nid, list);
                *nid = i->nid;
 
-               __remove_nid_from_list(sbi, i, FREE_NID_LIST, true);
-               i->state = NID_ALLOC;
-               __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false);
+               __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID);
                nm_i->available_nids--;
 
                update_free_nid_bitmap(sbi, *nid, false, false);
@@ -2107,7 +2144,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid)
        spin_lock(&nm_i->nid_list_lock);
        i = __lookup_free_nid_list(nm_i, nid);
        f2fs_bug_on(sbi, !i);
-       __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+       __remove_free_nid(sbi, i, PREALLOC_NID);
        spin_unlock(&nm_i->nid_list_lock);
 
        kmem_cache_free(free_nid_slab, i);
@@ -2130,12 +2167,10 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid)
        f2fs_bug_on(sbi, !i);
 
        if (!available_free_memory(sbi, FREE_NIDS)) {
-               __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false);
+               __remove_free_nid(sbi, i, PREALLOC_NID);
                need_free = true;
        } else {
-               __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true);
-               i->state = NID_NEW;
-               __insert_nid_to_list(sbi, i, FREE_NID_LIST, false);
+               __move_free_nid(sbi, i, PREALLOC_NID, FREE_NID);
        }
 
        nm_i->available_nids++;
@@ -2154,20 +2189,19 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink)
        struct free_nid *i, *next;
        int nr = nr_shrink;
 
-       if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+       if (nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
                return 0;
 
        if (!mutex_trylock(&nm_i->build_lock))
                return 0;
 
        spin_lock(&nm_i->nid_list_lock);
-       list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST],
-                                                                       list) {
+       list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) {
                if (nr_shrink <= 0 ||
-                               nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS)
+                               nm_i->nid_cnt[FREE_NID] <= MAX_FREE_NIDS)
                        break;
 
-               __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+               __remove_free_nid(sbi, i, FREE_NID);
                kmem_cache_free(free_nid_slab, i);
                nr_shrink--;
        }
@@ -2193,8 +2227,8 @@ void recover_inline_xattr(struct inode *inode, struct page *page)
                goto update_inode;
        }
 
-       dst_addr = inline_xattr_addr(ipage);
-       src_addr = inline_xattr_addr(page);
+       dst_addr = inline_xattr_addr(inode, ipage);
+       src_addr = inline_xattr_addr(inode, page);
        inline_size = inline_xattr_size(inode);
 
        f2fs_wait_on_page_writeback(ipage, NODE, true);
@@ -2283,6 +2317,12 @@ retry:
        dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR);
        if (dst->i_inline & F2FS_EXTRA_ATTR) {
                dst->i_extra_isize = src->i_extra_isize;
+
+               if (f2fs_sb_has_flexible_inline_xattr(sbi->sb) &&
+                       F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
+                                                       i_inline_xattr_size))
+                       dst->i_inline_xattr_size = src->i_inline_xattr_size;
+
                if (f2fs_sb_has_project_quota(sbi->sb) &&
                        F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize),
                                                                i_projid))
@@ -2354,8 +2394,8 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi)
 
                ne = __lookup_nat_cache(nm_i, nid);
                if (!ne) {
-                       ne = grab_nat_entry(nm_i, nid, true);
-                       node_info_from_raw_nat(&ne->ni, &raw_ne);
+                       ne = __alloc_nat_entry(nid, true);
+                       __init_nat_entry(nm_i, ne, &raw_ne, true);
                }
 
                /*
@@ -2401,15 +2441,17 @@ static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid,
        unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK;
        struct f2fs_nat_block *nat_blk = page_address(page);
        int valid = 0;
-       int i;
+       int i = 0;
 
        if (!enabled_nat_bits(sbi, NULL))
                return;
 
-       for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) {
-               if (start_nid == 0 && i == 0)
-                       valid++;
-               if (nat_blk->entries[i].block_addr)
+       if (nat_index == 0) {
+               valid = 1;
+               i = 1;
+       }
+       for (; i < NAT_ENTRY_PER_BLOCK; i++) {
+               if (nat_blk->entries[i].block_addr != NULL_ADDR)
                        valid++;
        }
        if (valid == 0) {
@@ -2604,7 +2646,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi)
                __set_bit_le(i, nm_i->nat_block_bitmap);
 
                nid = i * NAT_ENTRY_PER_BLOCK;
-               last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK;
+               last_nid = nid + NAT_ENTRY_PER_BLOCK;
 
                spin_lock(&NM_I(sbi)->nid_list_lock);
                for (; nid < last_nid; nid++)
@@ -2639,16 +2681,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi)
        /* not used nids: 0, node, meta, (and root counted as valid node) */
        nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count -
                                                        F2FS_RESERVED_NODE_NUM;
-       nm_i->nid_cnt[FREE_NID_LIST] = 0;
-       nm_i->nid_cnt[ALLOC_NID_LIST] = 0;
+       nm_i->nid_cnt[FREE_NID] = 0;
+       nm_i->nid_cnt[PREALLOC_NID] = 0;
        nm_i->nat_cnt = 0;
        nm_i->ram_thresh = DEF_RAM_THRESHOLD;
        nm_i->ra_nid_pages = DEF_RA_NID_PAGES;
        nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD;
 
        INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC);
-       INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]);
-       INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]);
+       INIT_LIST_HEAD(&nm_i->free_nid_list);
        INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO);
        INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO);
        INIT_LIST_HEAD(&nm_i->nat_entries);
@@ -2740,16 +2781,15 @@ void destroy_node_manager(struct f2fs_sb_info *sbi)
 
        /* destroy free nid list */
        spin_lock(&nm_i->nid_list_lock);
-       list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST],
-                                                                       list) {
-               __remove_nid_from_list(sbi, i, FREE_NID_LIST, false);
+       list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) {
+               __remove_free_nid(sbi, i, FREE_NID);
                spin_unlock(&nm_i->nid_list_lock);
                kmem_cache_free(free_nid_slab, i);
                spin_lock(&nm_i->nid_list_lock);
        }
-       f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]);
-       f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]);
-       f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST]));
+       f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID]);
+       f2fs_bug_on(sbi, nm_i->nid_cnt[PREALLOC_NID]);
+       f2fs_bug_on(sbi, !list_empty(&nm_i->free_nid_list));
        spin_unlock(&nm_i->nid_list_lock);
 
        /* destroy nat cache */
index bb53e9955ff2ab8456dd75c0d83ebdf9e6ef3b55..0ee3e5ff49a30b68d4b3040efa79642d989b19e4 100644 (file)
@@ -140,6 +140,7 @@ enum mem_type {
        DIRTY_DENTS,    /* indicates dirty dentry pages */
        INO_ENTRIES,    /* indicates inode entries */
        EXTENT_CACHE,   /* indicates extent cache */
+       INMEM_PAGES,    /* indicates inmemory pages */
        BASE_CHECK,     /* check kernel status */
 };
 
@@ -150,18 +151,10 @@ struct nat_entry_set {
        unsigned int entry_cnt;         /* the # of nat entries in set */
 };
 
-/*
- * For free nid mangement
- */
-enum nid_state {
-       NID_NEW,        /* newly added to free nid list */
-       NID_ALLOC       /* it is allocated */
-};
-
 struct free_nid {
        struct list_head list;  /* for free node id list */
        nid_t nid;              /* node id */
-       int state;              /* in use or not: NID_NEW or NID_ALLOC */
+       int state;              /* in use or not: FREE_NID or PREALLOC_NID */
 };
 
 static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
@@ -170,12 +163,11 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid)
        struct free_nid *fnid;
 
        spin_lock(&nm_i->nid_list_lock);
-       if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) {
+       if (nm_i->nid_cnt[FREE_NID] <= 0) {
                spin_unlock(&nm_i->nid_list_lock);
                return;
        }
-       fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST],
-                                               struct free_nid, list);
+       fnid = list_first_entry(&nm_i->free_nid_list, struct free_nid, list);
        *nid = fnid->nid;
        spin_unlock(&nm_i->nid_list_lock);
 }
index 9626758bc76242ca4a91057924d7492e6c217a85..92c57ace1939b0a5d086cee4366f3f2168926c36 100644 (file)
@@ -594,6 +594,9 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
        int ret = 0;
        unsigned long s_flags = sbi->sb->s_flags;
        bool need_writecp = false;
+#ifdef CONFIG_QUOTA
+       int quota_enabled;
+#endif
 
        if (s_flags & MS_RDONLY) {
                f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs");
@@ -604,7 +607,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
        /* Needed for iput() to work correctly and not trash data */
        sbi->sb->s_flags |= MS_ACTIVE;
        /* Turn on quotas so that they are updated correctly */
-       f2fs_enable_quota_files(sbi);
+       quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
 #endif
 
        fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
@@ -665,7 +668,8 @@ skip:
 out:
 #ifdef CONFIG_QUOTA
        /* Turn quotas off */
-       f2fs_quota_off_umount(sbi->sb);
+       if (quota_enabled)
+               f2fs_quota_off_umount(sbi->sb);
 #endif
        sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 
index c695ff462ee6990f1abd4d27458b9ad4bf89c025..c117e0913f2a3b4a7c2573fc4fd2ece85f31f075 100644 (file)
@@ -181,11 +181,12 @@ bool need_SSR(struct f2fs_sb_info *sbi)
                return true;
 
        return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs +
-                                               2 * reserved_sections(sbi));
+                       SM_I(sbi)->min_ssr_sections + reserved_sections(sbi));
 }
 
 void register_inmem_page(struct inode *inode, struct page *page)
 {
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_inode_info *fi = F2FS_I(inode);
        struct inmem_pages *new;
 
@@ -204,6 +205,10 @@ void register_inmem_page(struct inode *inode, struct page *page)
        mutex_lock(&fi->inmem_lock);
        get_page(page);
        list_add_tail(&new->list, &fi->inmem_pages);
+       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+       if (list_empty(&fi->inmem_ilist))
+               list_add_tail(&fi->inmem_ilist, &sbi->inode_list[ATOMIC_FILE]);
+       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
        inc_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES);
        mutex_unlock(&fi->inmem_lock);
 
@@ -262,12 +267,41 @@ next:
        return err;
 }
 
+void drop_inmem_pages_all(struct f2fs_sb_info *sbi)
+{
+       struct list_head *head = &sbi->inode_list[ATOMIC_FILE];
+       struct inode *inode;
+       struct f2fs_inode_info *fi;
+next:
+       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+       if (list_empty(head)) {
+               spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+               return;
+       }
+       fi = list_first_entry(head, struct f2fs_inode_info, inmem_ilist);
+       inode = igrab(&fi->vfs_inode);
+       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
+
+       if (inode) {
+               drop_inmem_pages(inode);
+               iput(inode);
+       }
+       congestion_wait(BLK_RW_ASYNC, HZ/50);
+       cond_resched();
+       goto next;
+}
+
 void drop_inmem_pages(struct inode *inode)
 {
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_inode_info *fi = F2FS_I(inode);
 
        mutex_lock(&fi->inmem_lock);
        __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
+       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+       if (!list_empty(&fi->inmem_ilist))
+               list_del_init(&fi->inmem_ilist);
+       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
        mutex_unlock(&fi->inmem_lock);
 
        clear_inode_flag(inode, FI_ATOMIC_FILE);
@@ -313,6 +347,7 @@ static int __commit_inmem_pages(struct inode *inode,
        struct inmem_pages *cur, *tmp;
        struct f2fs_io_info fio = {
                .sbi = sbi,
+               .ino = inode->i_ino,
                .type = DATA,
                .op = REQ_OP_WRITE,
                .op_flags = REQ_SYNC | REQ_PRIO,
@@ -398,6 +433,10 @@ int commit_inmem_pages(struct inode *inode)
                /* drop all uncommitted pages */
                __revoke_inmem_pages(inode, &fi->inmem_pages, true, false);
        }
+       spin_lock(&sbi->inode_lock[ATOMIC_FILE]);
+       if (!list_empty(&fi->inmem_ilist))
+               list_del_init(&fi->inmem_ilist);
+       spin_unlock(&sbi->inode_lock[ATOMIC_FILE]);
        mutex_unlock(&fi->inmem_lock);
 
        clear_inode_flag(inode, FI_ATOMIC_COMMIT);
@@ -472,7 +511,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
 static int __submit_flush_wait(struct f2fs_sb_info *sbi,
                                struct block_device *bdev)
 {
-       struct bio *bio = f2fs_bio_alloc(0);
+       struct bio *bio = f2fs_bio_alloc(sbi, 0, true);
        int ret;
 
        bio->bi_opf = REQ_OP_WRITE | REQ_SYNC | REQ_PREFLUSH;
@@ -485,15 +524,17 @@ static int __submit_flush_wait(struct f2fs_sb_info *sbi,
        return ret;
 }
 
-static int submit_flush_wait(struct f2fs_sb_info *sbi)
+static int submit_flush_wait(struct f2fs_sb_info *sbi, nid_t ino)
 {
-       int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev);
+       int ret = 0;
        int i;
 
-       if (!sbi->s_ndevs || ret)
-               return ret;
+       if (!sbi->s_ndevs)
+               return __submit_flush_wait(sbi, sbi->sb->s_bdev);
 
-       for (i = 1; i < sbi->s_ndevs; i++) {
+       for (i = 0; i < sbi->s_ndevs; i++) {
+               if (!is_dirty_device(sbi, ino, i, FLUSH_INO))
+                       continue;
                ret = __submit_flush_wait(sbi, FDEV(i).bdev);
                if (ret)
                        break;
@@ -519,7 +560,9 @@ repeat:
                fcc->dispatch_list = llist_del_all(&fcc->issue_list);
                fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list);
 
-               ret = submit_flush_wait(sbi);
+               cmd = llist_entry(fcc->dispatch_list, struct flush_cmd, llnode);
+
+               ret = submit_flush_wait(sbi, cmd->ino);
                atomic_inc(&fcc->issued_flush);
 
                llist_for_each_entry_safe(cmd, next,
@@ -537,7 +580,7 @@ repeat:
        goto repeat;
 }
 
-int f2fs_issue_flush(struct f2fs_sb_info *sbi)
+int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino)
 {
        struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info;
        struct flush_cmd cmd;
@@ -547,19 +590,20 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
                return 0;
 
        if (!test_opt(sbi, FLUSH_MERGE)) {
-               ret = submit_flush_wait(sbi);
+               ret = submit_flush_wait(sbi, ino);
                atomic_inc(&fcc->issued_flush);
                return ret;
        }
 
-       if (atomic_inc_return(&fcc->issing_flush) == 1) {
-               ret = submit_flush_wait(sbi);
+       if (atomic_inc_return(&fcc->issing_flush) == 1 || sbi->s_ndevs > 1) {
+               ret = submit_flush_wait(sbi, ino);
                atomic_dec(&fcc->issing_flush);
 
                atomic_inc(&fcc->issued_flush);
                return ret;
        }
 
+       cmd.ino = ino;
        init_completion(&cmd.wait);
 
        llist_add(&cmd.llnode, &fcc->issue_list);
@@ -583,7 +627,7 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi)
                } else {
                        struct flush_cmd *tmp, *next;
 
-                       ret = submit_flush_wait(sbi);
+                       ret = submit_flush_wait(sbi, ino);
 
                        llist_for_each_entry_safe(tmp, next, list, llnode) {
                                if (tmp == &cmd) {
@@ -653,6 +697,28 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free)
        }
 }
 
+int f2fs_flush_device_cache(struct f2fs_sb_info *sbi)
+{
+       int ret = 0, i;
+
+       if (!sbi->s_ndevs)
+               return 0;
+
+       for (i = 1; i < sbi->s_ndevs; i++) {
+               if (!f2fs_test_bit(i, (char *)&sbi->dirty_device))
+                       continue;
+               ret = __submit_flush_wait(sbi, FDEV(i).bdev);
+               if (ret)
+                       break;
+
+               spin_lock(&sbi->dev_lock);
+               f2fs_clear_bit(i, (char *)&sbi->dirty_device);
+               spin_unlock(&sbi->dev_lock);
+       }
+
+       return ret;
+}
+
 static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno,
                enum dirty_type dirty_type)
 {
@@ -794,6 +860,8 @@ static void __remove_discard_cmd(struct f2fs_sb_info *sbi,
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
 
+       trace_f2fs_remove_discard(dc->bdev, dc->start, dc->len);
+
        f2fs_bug_on(sbi, dc->ref);
 
        if (dc->error == -EOPNOTSUPP)
@@ -845,10 +913,14 @@ void __check_sit_bitmap(struct f2fs_sb_info *sbi,
 
 /* this function is copied from blkdev_issue_discard from block/blk-lib.c */
 static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
-                               struct discard_cmd *dc)
+                                               struct discard_policy *dpolicy,
+                                               struct discard_cmd *dc)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+                                       &(dcc->fstrim_list) : &(dcc->wait_list);
        struct bio *bio = NULL;
+       int flag = dpolicy->sync ? REQ_SYNC : 0;
 
        if (dc->state != D_PREP)
                return;
@@ -867,9 +939,9 @@ static void __submit_discard_cmd(struct f2fs_sb_info *sbi,
                if (bio) {
                        bio->bi_private = dc;
                        bio->bi_end_io = f2fs_submit_discard_endio;
-                       bio->bi_opf |= REQ_SYNC;
+                       bio->bi_opf |= flag;
                        submit_bio(bio);
-                       list_move_tail(&dc->list, &dcc->wait_list);
+                       list_move_tail(&dc->list, wait_list);
                        __check_sit_bitmap(sbi, dc->start, dc->start + dc->len);
 
                        f2fs_update_iostat(sbi, FS_DISCARD, 1);
@@ -886,7 +958,7 @@ static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi,
                                struct rb_node *insert_parent)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       struct rb_node **p = &dcc->root.rb_node;
+       struct rb_node **p;
        struct rb_node *parent = NULL;
        struct discard_cmd *dc = NULL;
 
@@ -1054,58 +1126,107 @@ static int __queue_discard_cmd(struct f2fs_sb_info *sbi,
        return 0;
 }
 
-static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond)
+static void __issue_discard_cmd_range(struct f2fs_sb_info *sbi,
+                                       struct discard_policy *dpolicy,
+                                       unsigned int start, unsigned int end)
+{
+       struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       struct discard_cmd *prev_dc = NULL, *next_dc = NULL;
+       struct rb_node **insert_p = NULL, *insert_parent = NULL;
+       struct discard_cmd *dc;
+       struct blk_plug plug;
+       int issued;
+
+next:
+       issued = 0;
+
+       mutex_lock(&dcc->cmd_lock);
+       f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+
+       dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root,
+                                       NULL, start,
+                                       (struct rb_entry **)&prev_dc,
+                                       (struct rb_entry **)&next_dc,
+                                       &insert_p, &insert_parent, true);
+       if (!dc)
+               dc = next_dc;
+
+       blk_start_plug(&plug);
+
+       while (dc && dc->lstart <= end) {
+               struct rb_node *node;
+
+               if (dc->len < dpolicy->granularity)
+                       goto skip;
+
+               if (dc->state != D_PREP) {
+                       list_move_tail(&dc->list, &dcc->fstrim_list);
+                       goto skip;
+               }
+
+               __submit_discard_cmd(sbi, dpolicy, dc);
+
+               if (++issued >= dpolicy->max_requests) {
+                       start = dc->lstart + dc->len;
+
+                       blk_finish_plug(&plug);
+                       mutex_unlock(&dcc->cmd_lock);
+
+                       schedule();
+
+                       goto next;
+               }
+skip:
+               node = rb_next(&dc->rb_node);
+               dc = rb_entry_safe(node, struct discard_cmd, rb_node);
+
+               if (fatal_signal_pending(current))
+                       break;
+       }
+
+       blk_finish_plug(&plug);
+       mutex_unlock(&dcc->cmd_lock);
+}
+
+static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
+                                       struct discard_policy *dpolicy)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
        struct list_head *pend_list;
        struct discard_cmd *dc, *tmp;
        struct blk_plug plug;
-       int iter = 0, issued = 0;
-       int i;
+       int i, iter = 0, issued = 0;
        bool io_interrupted = false;
 
-       mutex_lock(&dcc->cmd_lock);
-       f2fs_bug_on(sbi,
-               !__check_rb_tree_consistence(sbi, &dcc->root));
-       blk_start_plug(&plug);
-       for (i = MAX_PLIST_NUM - 1;
-                       i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
+       for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+               if (i + 1 < dpolicy->granularity)
+                       break;
                pend_list = &dcc->pend_list[i];
+
+               mutex_lock(&dcc->cmd_lock);
+               f2fs_bug_on(sbi, !__check_rb_tree_consistence(sbi, &dcc->root));
+               blk_start_plug(&plug);
                list_for_each_entry_safe(dc, tmp, pend_list, list) {
                        f2fs_bug_on(sbi, dc->state != D_PREP);
 
-                       /* Hurry up to finish fstrim */
-                       if (dcc->pend_list_tag[i] & P_TRIM) {
-                               __submit_discard_cmd(sbi, dc);
-                               issued++;
-
-                               if (fatal_signal_pending(current))
-                                       break;
-                               continue;
-                       }
-
-                       if (!issue_cond) {
-                               __submit_discard_cmd(sbi, dc);
-                               issued++;
-                               continue;
-                       }
-
-                       if (is_idle(sbi)) {
-                               __submit_discard_cmd(sbi, dc);
-                               issued++;
-                       } else {
+                       if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
+                                                               !is_idle(sbi)) {
                                io_interrupted = true;
+                               goto skip;
                        }
 
-                       if (++iter >= DISCARD_ISSUE_RATE)
-                               goto out;
+                       __submit_discard_cmd(sbi, dpolicy, dc);
+                       issued++;
+skip:
+                       if (++iter >= dpolicy->max_requests)
+                               break;
                }
-               if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM)
-                       dcc->pend_list_tag[i] &= (~P_TRIM);
+               blk_finish_plug(&plug);
+               mutex_unlock(&dcc->cmd_lock);
+
+               if (iter >= dpolicy->max_requests)
+                       break;
        }
-out:
-       blk_finish_plug(&plug);
-       mutex_unlock(&dcc->cmd_lock);
 
        if (!issued && io_interrupted)
                issued = -1;
@@ -1113,12 +1234,13 @@ out:
        return issued;
 }
 
-static void __drop_discard_cmd(struct f2fs_sb_info *sbi)
+static bool __drop_discard_cmd(struct f2fs_sb_info *sbi)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
        struct list_head *pend_list;
        struct discard_cmd *dc, *tmp;
        int i;
+       bool dropped = false;
 
        mutex_lock(&dcc->cmd_lock);
        for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
@@ -1126,39 +1248,58 @@ static void __drop_discard_cmd(struct f2fs_sb_info *sbi)
                list_for_each_entry_safe(dc, tmp, pend_list, list) {
                        f2fs_bug_on(sbi, dc->state != D_PREP);
                        __remove_discard_cmd(sbi, dc);
+                       dropped = true;
                }
        }
        mutex_unlock(&dcc->cmd_lock);
+
+       return dropped;
 }
 
-static void __wait_one_discard_bio(struct f2fs_sb_info *sbi,
+static unsigned int __wait_one_discard_bio(struct f2fs_sb_info *sbi,
                                                        struct discard_cmd *dc)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
+       unsigned int len = 0;
 
        wait_for_completion_io(&dc->wait);
        mutex_lock(&dcc->cmd_lock);
        f2fs_bug_on(sbi, dc->state != D_DONE);
        dc->ref--;
-       if (!dc->ref)
+       if (!dc->ref) {
+               if (!dc->error)
+                       len = dc->len;
                __remove_discard_cmd(sbi, dc);
+       }
        mutex_unlock(&dcc->cmd_lock);
+
+       return len;
 }
 
-static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond)
+static unsigned int __wait_discard_cmd_range(struct f2fs_sb_info *sbi,
+                                               struct discard_policy *dpolicy,
+                                               block_t start, block_t end)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       struct list_head *wait_list = &(dcc->wait_list);
+       struct list_head *wait_list = (dpolicy->type == DPOLICY_FSTRIM) ?
+                                       &(dcc->fstrim_list) : &(dcc->wait_list);
        struct discard_cmd *dc, *tmp;
        bool need_wait;
+       unsigned int trimmed = 0;
 
 next:
        need_wait = false;
 
        mutex_lock(&dcc->cmd_lock);
        list_for_each_entry_safe(dc, tmp, wait_list, list) {
-               if (!wait_cond || (dc->state == D_DONE && !dc->ref)) {
+               if (dc->lstart + dc->len <= start || end <= dc->lstart)
+                       continue;
+               if (dc->len < dpolicy->granularity)
+                       continue;
+               if (dc->state == D_DONE && !dc->ref) {
                        wait_for_completion_io(&dc->wait);
+                       if (!dc->error)
+                               trimmed += dc->len;
                        __remove_discard_cmd(sbi, dc);
                } else {
                        dc->ref++;
@@ -1169,9 +1310,17 @@ next:
        mutex_unlock(&dcc->cmd_lock);
 
        if (need_wait) {
-               __wait_one_discard_bio(sbi, dc);
+               trimmed += __wait_one_discard_bio(sbi, dc);
                goto next;
        }
+
+       return trimmed;
+}
+
+static void __wait_all_discard_cmd(struct f2fs_sb_info *sbi,
+                                               struct discard_policy *dpolicy)
+{
+       __wait_discard_cmd_range(sbi, dpolicy, 0, UINT_MAX);
 }
 
 /* This should be covered by global mutex, &sit_i->sentry_lock */
@@ -1209,23 +1358,19 @@ void stop_discard_thread(struct f2fs_sb_info *sbi)
        }
 }
 
-/* This comes from f2fs_put_super and f2fs_trim_fs */
-void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount)
-{
-       __issue_discard_cmd(sbi, false);
-       __drop_discard_cmd(sbi);
-       __wait_discard_cmd(sbi, !umount);
-}
-
-static void mark_discard_range_all(struct f2fs_sb_info *sbi)
+/* This comes from f2fs_put_super */
+bool f2fs_wait_discard_bios(struct f2fs_sb_info *sbi)
 {
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-       int i;
+       struct discard_policy dpolicy;
+       bool dropped;
 
-       mutex_lock(&dcc->cmd_lock);
-       for (i = 0; i < MAX_PLIST_NUM; i++)
-               dcc->pend_list_tag[i] |= P_TRIM;
-       mutex_unlock(&dcc->cmd_lock);
+       init_discard_policy(&dpolicy, DPOLICY_UMOUNT, dcc->discard_granularity);
+       __issue_discard_cmd(sbi, &dpolicy);
+       dropped = __drop_discard_cmd(sbi);
+       __wait_all_discard_cmd(sbi, &dpolicy);
+
+       return dropped;
 }
 
 static int issue_discard_thread(void *data)
@@ -1233,12 +1378,16 @@ static int issue_discard_thread(void *data)
        struct f2fs_sb_info *sbi = data;
        struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
        wait_queue_head_t *q = &dcc->discard_wait_queue;
+       struct discard_policy dpolicy;
        unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
        int issued;
 
        set_freezable();
 
        do {
+               init_discard_policy(&dpolicy, DPOLICY_BG,
+                                       dcc->discard_granularity);
+
                wait_event_interruptible_timeout(*q,
                                kthread_should_stop() || freezing(current) ||
                                dcc->discard_wake,
@@ -1251,17 +1400,18 @@ static int issue_discard_thread(void *data)
                if (dcc->discard_wake) {
                        dcc->discard_wake = 0;
                        if (sbi->gc_thread && sbi->gc_thread->gc_urgent)
-                               mark_discard_range_all(sbi);
+                               init_discard_policy(&dpolicy,
+                                                       DPOLICY_FORCE, 1);
                }
 
                sb_start_intwrite(sbi->sb);
 
-               issued = __issue_discard_cmd(sbi, true);
+               issued = __issue_discard_cmd(sbi, &dpolicy);
                if (issued) {
-                       __wait_discard_cmd(sbi, true);
-                       wait_ms = DEF_MIN_DISCARD_ISSUE_TIME;
+                       __wait_all_discard_cmd(sbi, &dpolicy);
+                       wait_ms = dpolicy.min_interval;
                } else {
-                       wait_ms = DEF_MAX_DISCARD_ISSUE_TIME;
+                       wait_ms = dpolicy.max_interval;
                }
 
                sb_end_intwrite(sbi->sb);
@@ -1525,7 +1675,6 @@ find_next:
 
                        f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos,
                                                                        len);
-                       cpc->trimmed += len;
                        total_len += len;
                } else {
                        next_pos = find_next_bit_le(entry->discard_map,
@@ -1546,6 +1695,37 @@ skip:
        wake_up_discard_thread(sbi, false);
 }
 
+void init_discard_policy(struct discard_policy *dpolicy,
+                               int discard_type, unsigned int granularity)
+{
+       /* common policy */
+       dpolicy->type = discard_type;
+       dpolicy->sync = true;
+       dpolicy->granularity = granularity;
+
+       if (discard_type == DPOLICY_BG) {
+               dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+               dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+               dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+               dpolicy->io_aware_gran = MAX_PLIST_NUM;
+               dpolicy->io_aware = true;
+       } else if (discard_type == DPOLICY_FORCE) {
+               dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
+               dpolicy->max_interval = DEF_MAX_DISCARD_ISSUE_TIME;
+               dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+               dpolicy->io_aware_gran = MAX_PLIST_NUM;
+               dpolicy->io_aware = true;
+       } else if (discard_type == DPOLICY_FSTRIM) {
+               dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+               dpolicy->io_aware_gran = MAX_PLIST_NUM;
+               dpolicy->io_aware = false;
+       } else if (discard_type == DPOLICY_UMOUNT) {
+               dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
+               dpolicy->io_aware_gran = MAX_PLIST_NUM;
+               dpolicy->io_aware = false;
+       }
+}
+
 static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 {
        dev_t dev = sbi->sb->s_bdev->bd_dev;
@@ -1563,12 +1743,10 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi)
 
        dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY;
        INIT_LIST_HEAD(&dcc->entry_list);
-       for (i = 0; i < MAX_PLIST_NUM; i++) {
+       for (i = 0; i < MAX_PLIST_NUM; i++)
                INIT_LIST_HEAD(&dcc->pend_list[i]);
-               if (i >= dcc->discard_granularity - 1)
-                       dcc->pend_list_tag[i] |= P_ACTIVE;
-       }
        INIT_LIST_HEAD(&dcc->wait_list);
+       INIT_LIST_HEAD(&dcc->fstrim_list);
        mutex_init(&dcc->cmd_lock);
        atomic_set(&dcc->issued_discard, 0);
        atomic_set(&dcc->issing_discard, 0);
@@ -1716,16 +1894,6 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del)
                get_sec_entry(sbi, segno)->valid_blocks += del;
 }
 
-void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new)
-{
-       update_sit_entry(sbi, new, 1);
-       if (GET_SEGNO(sbi, old) != NULL_SEGNO)
-               update_sit_entry(sbi, old, -1);
-
-       locate_dirty_segment(sbi, GET_SEGNO(sbi, old));
-       locate_dirty_segment(sbi, GET_SEGNO(sbi, new));
-}
-
 void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
 {
        unsigned int segno = GET_SEGNO(sbi, addr);
@@ -1736,14 +1904,14 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr)
                return;
 
        /* add it into sit main buffer */
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
 
        update_sit_entry(sbi, addr, -1);
 
        /* add it into dirty seglist */
        locate_dirty_segment(sbi, segno);
 
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
 }
 
 bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
@@ -1756,7 +1924,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
        if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR)
                return true;
 
-       mutex_lock(&sit_i->sentry_lock);
+       down_read(&sit_i->sentry_lock);
 
        segno = GET_SEGNO(sbi, blkaddr);
        se = get_seg_entry(sbi, segno);
@@ -1765,7 +1933,7 @@ bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr)
        if (f2fs_test_bit(offset, se->ckpt_valid_map))
                is_cp = true;
 
-       mutex_unlock(&sit_i->sentry_lock);
+       up_read(&sit_i->sentry_lock);
 
        return is_cp;
 }
@@ -1823,12 +1991,8 @@ struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno)
 void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr)
 {
        struct page *page = grab_meta_page(sbi, blk_addr);
-       void *dst = page_address(page);
 
-       if (src)
-               memcpy(dst, src, PAGE_SIZE);
-       else
-               memset(dst, 0, PAGE_SIZE);
+       memcpy(page_address(page), src, PAGE_SIZE);
        set_page_dirty(page);
        f2fs_put_page(page, 1);
 }
@@ -1927,7 +2091,6 @@ find_other_zone:
        }
        secno = left_start;
 skip_left:
-       hint = secno;
        segno = GET_SEG_FROM_SEC(sbi, secno);
        zoneno = GET_ZONE_FROM_SEC(sbi, secno);
 
@@ -2162,12 +2325,16 @@ void allocate_new_segments(struct f2fs_sb_info *sbi)
        unsigned int old_segno;
        int i;
 
+       down_write(&SIT_I(sbi)->sentry_lock);
+
        for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
                curseg = CURSEG_I(sbi, i);
                old_segno = curseg->segno;
                SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true);
                locate_dirty_segment(sbi, old_segno);
        }
+
+       up_write(&SIT_I(sbi)->sentry_lock);
 }
 
 static const struct segment_allocation default_salloc_ops = {
@@ -2179,14 +2346,14 @@ bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        __u64 trim_start = cpc->trim_start;
        bool has_candidate = false;
 
-       mutex_lock(&SIT_I(sbi)->sentry_lock);
+       down_write(&SIT_I(sbi)->sentry_lock);
        for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) {
                if (add_discard_addrs(sbi, cpc, true)) {
                        has_candidate = true;
                        break;
                }
        }
-       mutex_unlock(&SIT_I(sbi)->sentry_lock);
+       up_write(&SIT_I(sbi)->sentry_lock);
 
        cpc->trim_start = trim_start;
        return has_candidate;
@@ -2196,14 +2363,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 {
        __u64 start = F2FS_BYTES_TO_BLK(range->start);
        __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1;
-       unsigned int start_segno, end_segno;
+       unsigned int start_segno, end_segno, cur_segno;
+       block_t start_block, end_block;
        struct cp_control cpc;
+       struct discard_policy dpolicy;
+       unsigned long long trimmed = 0;
        int err = 0;
 
        if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
                return -EINVAL;
 
-       cpc.trimmed = 0;
        if (end <= MAIN_BLKADDR(sbi))
                goto out;
 
@@ -2217,12 +2386,14 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
        start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start);
        end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 :
                                                GET_SEGNO(sbi, end);
+
        cpc.reason = CP_DISCARD;
        cpc.trim_minlen = max_t(__u64, 1, F2FS_BYTES_TO_BLK(range->minlen));
 
        /* do checkpoint to issue discard commands safely */
-       for (; start_segno <= end_segno; start_segno = cpc.trim_end + 1) {
-               cpc.trim_start = start_segno;
+       for (cur_segno = start_segno; cur_segno <= end_segno;
+                                       cur_segno = cpc.trim_end + 1) {
+               cpc.trim_start = cur_segno;
 
                if (sbi->discard_blks == 0)
                        break;
@@ -2230,7 +2401,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
                        cpc.trim_end = end_segno;
                else
                        cpc.trim_end = min_t(unsigned int,
-                               rounddown(start_segno +
+                               rounddown(cur_segno +
                                BATCHED_TRIM_SEGMENTS(sbi),
                                sbi->segs_per_sec) - 1, end_segno);
 
@@ -2242,11 +2413,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
 
                schedule();
        }
-       /* It's time to issue all the filed discards */
-       mark_discard_range_all(sbi);
-       f2fs_wait_discard_bios(sbi, false);
+
+       start_block = START_BLOCK(sbi, start_segno);
+       end_block = START_BLOCK(sbi, min(cur_segno, end_segno) + 1);
+
+       init_discard_policy(&dpolicy, DPOLICY_FSTRIM, cpc.trim_minlen);
+       __issue_discard_cmd_range(sbi, &dpolicy, start_block, end_block);
+       trimmed = __wait_discard_cmd_range(sbi, &dpolicy,
+                                       start_block, end_block);
 out:
-       range->len = F2FS_BLK_TO_BYTES(cpc.trimmed);
+       range->len = F2FS_BLK_TO_BYTES(trimmed);
        return err;
 }
 
@@ -2258,6 +2434,18 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type)
        return false;
 }
 
+int rw_hint_to_seg_type(enum rw_hint hint)
+{
+       switch (hint) {
+       case WRITE_LIFE_SHORT:
+               return CURSEG_HOT_DATA;
+       case WRITE_LIFE_EXTREME:
+               return CURSEG_COLD_DATA;
+       default:
+               return CURSEG_WARM_DATA;
+       }
+}
+
 static int __get_segment_type_2(struct f2fs_io_info *fio)
 {
        if (fio->type == DATA)
@@ -2292,7 +2480,7 @@ static int __get_segment_type_6(struct f2fs_io_info *fio)
                        return CURSEG_COLD_DATA;
                if (is_inode_flag_set(inode, FI_HOT_DATA))
                        return CURSEG_HOT_DATA;
-               return CURSEG_WARM_DATA;
+               return rw_hint_to_seg_type(inode->i_write_hint);
        } else {
                if (IS_DNODE(fio->page))
                        return is_cold_node(fio->page) ? CURSEG_WARM_NODE :
@@ -2336,8 +2524,10 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
        struct sit_info *sit_i = SIT_I(sbi);
        struct curseg_info *curseg = CURSEG_I(sbi, type);
 
+       down_read(&SM_I(sbi)->curseg_lock);
+
        mutex_lock(&curseg->curseg_mutex);
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
 
        *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
 
@@ -2354,15 +2544,26 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
 
        stat_inc_block_count(sbi, curseg);
 
+       /*
+        * SIT information should be updated before segment allocation,
+        * since SSR needs latest valid block information.
+        */
+       update_sit_entry(sbi, *new_blkaddr, 1);
+       if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
+               update_sit_entry(sbi, old_blkaddr, -1);
+
        if (!__has_curseg_space(sbi, type))
                sit_i->s_ops->allocate_segment(sbi, type, false);
+
        /*
-        * SIT information should be updated after segment allocation,
-        * since we need to keep dirty segments precisely under SSR.
+        * segment dirty status should be updated after segment allocation,
+        * so we just need to update status only one time after previous
+        * segment being closed.
         */
-       refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr);
+       locate_dirty_segment(sbi, GET_SEGNO(sbi, old_blkaddr));
+       locate_dirty_segment(sbi, GET_SEGNO(sbi, *new_blkaddr));
 
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
 
        if (page && IS_NODESEG(type)) {
                fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg));
@@ -2382,6 +2583,29 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page,
        }
 
        mutex_unlock(&curseg->curseg_mutex);
+
+       up_read(&SM_I(sbi)->curseg_lock);
+}
+
+static void update_device_state(struct f2fs_io_info *fio)
+{
+       struct f2fs_sb_info *sbi = fio->sbi;
+       unsigned int devidx;
+
+       if (!sbi->s_ndevs)
+               return;
+
+       devidx = f2fs_target_device_index(sbi, fio->new_blkaddr);
+
+       /* update device state for fsync */
+       set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO);
+
+       /* update device state for checkpoint */
+       if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) {
+               spin_lock(&sbi->dev_lock);
+               f2fs_set_bit(devidx, (char *)&sbi->dirty_device);
+               spin_unlock(&sbi->dev_lock);
+       }
 }
 
 static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
@@ -2398,6 +2622,8 @@ reallocate:
        if (err == -EAGAIN) {
                fio->old_blkaddr = fio->new_blkaddr;
                goto reallocate;
+       } else if (!err) {
+               update_device_state(fio);
        }
 }
 
@@ -2458,12 +2684,26 @@ int rewrite_data_page(struct f2fs_io_info *fio)
        stat_inc_inplace_blocks(fio->sbi);
 
        err = f2fs_submit_page_bio(fio);
+       if (!err)
+               update_device_state(fio);
 
        f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE);
 
        return err;
 }
 
+static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi,
+                                               unsigned int segno)
+{
+       int i;
+
+       for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) {
+               if (CURSEG_I(sbi, i)->segno == segno)
+                       break;
+       }
+       return i;
+}
+
 void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
                                block_t old_blkaddr, block_t new_blkaddr,
                                bool recover_curseg, bool recover_newaddr)
@@ -2479,6 +2719,8 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
        se = get_seg_entry(sbi, segno);
        type = se->type;
 
+       down_write(&SM_I(sbi)->curseg_lock);
+
        if (!recover_curseg) {
                /* for recovery flow */
                if (se->valid_blocks == 0 && !IS_CURSEG(sbi, segno)) {
@@ -2488,14 +2730,19 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
                                type = CURSEG_WARM_DATA;
                }
        } else {
-               if (!IS_CURSEG(sbi, segno))
+               if (IS_CURSEG(sbi, segno)) {
+                       /* se->type is volatile as SSR allocation */
+                       type = __f2fs_get_curseg(sbi, segno);
+                       f2fs_bug_on(sbi, type == NO_CHECK_TYPE);
+               } else {
                        type = CURSEG_WARM_DATA;
+               }
        }
 
        curseg = CURSEG_I(sbi, type);
 
        mutex_lock(&curseg->curseg_mutex);
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
 
        old_cursegno = curseg->segno;
        old_blkoff = curseg->next_blkoff;
@@ -2527,8 +2774,9 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
                curseg->next_blkoff = old_blkoff;
        }
 
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
        mutex_unlock(&curseg->curseg_mutex);
+       up_write(&SM_I(sbi)->curseg_lock);
 }
 
 void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn,
@@ -2982,7 +3230,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc)
        bool to_journal = true;
        struct seg_entry *se;
 
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
 
        if (!sit_i->dirty_sentries)
                goto out;
@@ -3076,7 +3324,7 @@ out:
 
                cpc->trim_start = trim_start;
        }
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
 
        set_prefree_as_free_segments(sbi);
 }
@@ -3169,7 +3417,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
        sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
        sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
        sit_i->mounted_time = ktime_get_real_seconds();
-       mutex_init(&sit_i->sentry_lock);
+       init_rwsem(&sit_i->sentry_lock);
        return 0;
 }
 
@@ -3410,7 +3658,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
        struct sit_info *sit_i = SIT_I(sbi);
        unsigned int segno;
 
-       mutex_lock(&sit_i->sentry_lock);
+       down_write(&sit_i->sentry_lock);
 
        sit_i->min_mtime = LLONG_MAX;
 
@@ -3427,7 +3675,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi)
                        sit_i->min_mtime = mtime;
        }
        sit_i->max_mtime = get_mtime(sbi);
-       mutex_unlock(&sit_i->sentry_lock);
+       up_write(&sit_i->sentry_lock);
 }
 
 int build_segment_manager(struct f2fs_sb_info *sbi)
@@ -3460,11 +3708,14 @@ int build_segment_manager(struct f2fs_sb_info *sbi)
        sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
        sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
        sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS;
+       sm_info->min_ssr_sections = reserved_sections(sbi);
 
        sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS;
 
        INIT_LIST_HEAD(&sm_info->sit_entry_set);
 
+       init_rwsem(&sm_info->curseg_lock);
+
        if (!f2fs_readonly(sbi->sb)) {
                err = create_flush_cmd_control(sbi);
                if (err)
index e0a6cc23ace3a66ddcd5975c0e3f5ee14ace3946..d1d394cdf61dd9ffbde608a941463c98b4d4f0b2 100644
@@ -231,7 +231,7 @@ struct sit_info {
        unsigned long *dirty_sentries_bitmap;   /* bitmap for dirty sentries */
        unsigned int dirty_sentries;            /* # of dirty sentries */
        unsigned int sents_per_block;           /* # of SIT entries per block */
-       struct mutex sentry_lock;               /* to protect SIT cache */
+       struct rw_semaphore sentry_lock;        /* to protect SIT cache */
        struct seg_entry *sentries;             /* SIT segment-level cache */
        struct sec_entry *sec_entries;          /* SIT section-level cache */
 
@@ -497,6 +497,33 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi)
        return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi));
 }
 
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+{
+       unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+                                       get_pages(sbi, F2FS_DIRTY_DENTS);
+       unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+       unsigned int segno, left_blocks;
+       int i;
+
+       /* check current node segment */
+       for (i = CURSEG_HOT_NODE; i <= CURSEG_COLD_NODE; i++) {
+               segno = CURSEG_I(sbi, i)->segno;
+               left_blocks = sbi->blocks_per_seg -
+                       get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+
+               if (node_blocks > left_blocks)
+                       return false;
+       }
+
+       /* check current data segment */
+       segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno;
+       left_blocks = sbi->blocks_per_seg -
+                       get_seg_entry(sbi, segno)->ckpt_valid_blocks;
+       if (dent_blocks > left_blocks)
+               return false;
+       return true;
+}
+
 static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
                                        int freed, int needed)
 {
@@ -507,6 +534,9 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
                return false;
 
+       if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
+                       has_curseg_enough_space(sbi))
+               return false;
        return (free_sections(sbi) + freed) <=
                (node_secs + 2 * dent_secs + imeta_secs +
                reserved_sections(sbi) + needed);
@@ -731,7 +761,7 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type)
 static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi,
                                                unsigned int secno)
 {
-       if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >=
+       if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >
                                                sbi->fggc_threshold)
                return true;
        return false;
@@ -796,8 +826,9 @@ static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force)
                goto wake_up;
 
        mutex_lock(&dcc->cmd_lock);
-       for (i = MAX_PLIST_NUM - 1;
-                       i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) {
+       for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
+               if (i + 1 < dcc->discard_granularity)
+                       break;
                if (!list_empty(&dcc->pend_list[i])) {
                        wakeup = true;
                        break;
index 5c60fc28ec758894a56d7b73e1e830d03fe0f0fb..0b5664a1a6cc85c2420142359364e3c32b0e782e 100644
@@ -28,7 +28,7 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi)
 
 static unsigned long __count_free_nids(struct f2fs_sb_info *sbi)
 {
-       long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS;
+       long count = NM_I(sbi)->nid_cnt[FREE_NID] - MAX_FREE_NIDS;
 
        return count > 0 ? count : 0;
 }
index 933c3d529e6531c827c2237c97f816103c6d4ef0..a6c5dd450002daa7d1c43f414d0ffbaa17ff1b04 100644
@@ -44,6 +44,8 @@ static struct kmem_cache *f2fs_inode_cachep;
 char *fault_name[FAULT_MAX] = {
        [FAULT_KMALLOC]         = "kmalloc",
        [FAULT_PAGE_ALLOC]      = "page alloc",
+       [FAULT_PAGE_GET]        = "page get",
+       [FAULT_ALLOC_BIO]       = "alloc bio",
        [FAULT_ALLOC_NID]       = "alloc nid",
        [FAULT_ORPHAN]          = "orphan",
        [FAULT_BLOCK]           = "no more block",
@@ -92,6 +94,7 @@ enum {
        Opt_disable_ext_identify,
        Opt_inline_xattr,
        Opt_noinline_xattr,
+       Opt_inline_xattr_size,
        Opt_inline_data,
        Opt_inline_dentry,
        Opt_noinline_dentry,
@@ -141,6 +144,7 @@ static match_table_t f2fs_tokens = {
        {Opt_disable_ext_identify, "disable_ext_identify"},
        {Opt_inline_xattr, "inline_xattr"},
        {Opt_noinline_xattr, "noinline_xattr"},
+       {Opt_inline_xattr_size, "inline_xattr_size=%u"},
        {Opt_inline_data, "inline_data"},
        {Opt_inline_dentry, "inline_dentry"},
        {Opt_noinline_dentry, "noinline_dentry"},
@@ -209,6 +213,12 @@ static int f2fs_set_qf_name(struct super_block *sb, int qtype,
                        "quota options when quota turned on");
                return -EINVAL;
        }
+       if (f2fs_sb_has_quota_ino(sb)) {
+               f2fs_msg(sb, KERN_INFO,
+                       "QUOTA feature is enabled, so ignore qf_name");
+               return 0;
+       }
+
        qname = match_strdup(args);
        if (!qname) {
                f2fs_msg(sb, KERN_ERR,
@@ -287,6 +297,18 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi)
                        return -1;
                }
        }
+
+       if (f2fs_sb_has_quota_ino(sbi->sb) && sbi->s_jquota_fmt) {
+               f2fs_msg(sbi->sb, KERN_INFO,
+                       "QUOTA feature is enabled, so ignore jquota_fmt");
+               sbi->s_jquota_fmt = 0;
+       }
+       if (f2fs_sb_has_quota_ino(sbi->sb) && sb_rdonly(sbi->sb)) {
+               f2fs_msg(sbi->sb, KERN_INFO,
+                        "Filesystem with quota feature cannot be mounted RDWR "
+                        "without CONFIG_QUOTA");
+               return -1;
+       }
        return 0;
 }
 #endif
@@ -383,6 +405,12 @@ static int parse_options(struct super_block *sb, char *options)
                case Opt_noinline_xattr:
                        clear_opt(sbi, INLINE_XATTR);
                        break;
+               case Opt_inline_xattr_size:
+                       if (args->from && match_int(args, &arg))
+                               return -EINVAL;
+                       set_opt(sbi, INLINE_XATTR_SIZE);
+                       sbi->inline_xattr_size = arg;
+                       break;
 #else
                case Opt_user_xattr:
                        f2fs_msg(sb, KERN_INFO,
@@ -604,6 +632,24 @@ static int parse_options(struct super_block *sb, char *options)
                                F2FS_IO_SIZE_KB(sbi));
                return -EINVAL;
        }
+
+       if (test_opt(sbi, INLINE_XATTR_SIZE)) {
+               if (!test_opt(sbi, INLINE_XATTR)) {
+                       f2fs_msg(sb, KERN_ERR,
+                                       "inline_xattr_size option should be "
+                                       "set with inline_xattr option");
+                       return -EINVAL;
+               }
+               if (!sbi->inline_xattr_size ||
+                       sbi->inline_xattr_size >= DEF_ADDRS_PER_INODE -
+                                       F2FS_TOTAL_EXTRA_ATTR_SIZE -
+                                       DEF_INLINE_RESERVED_SIZE -
+                                       DEF_MIN_INLINE_SIZE) {
+                       f2fs_msg(sb, KERN_ERR,
+                                       "inline xattr size is out of range");
+                       return -EINVAL;
+               }
+       }
        return 0;
 }
 
@@ -618,13 +664,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
        init_once((void *) fi);
 
        /* Initialize f2fs-specific inode info */
-       fi->vfs_inode.i_version = 1;
        atomic_set(&fi->dirty_pages, 0);
        fi->i_current_depth = 1;
        fi->i_advise = 0;
        init_rwsem(&fi->i_sem);
        INIT_LIST_HEAD(&fi->dirty_list);
        INIT_LIST_HEAD(&fi->gdirty_list);
+       INIT_LIST_HEAD(&fi->inmem_ilist);
        INIT_LIST_HEAD(&fi->inmem_pages);
        mutex_init(&fi->inmem_lock);
        init_rwsem(&fi->dio_rwsem[READ]);
@@ -673,7 +719,6 @@ static int f2fs_drop_inode(struct inode *inode)
 
                        sb_end_intwrite(inode->i_sb);
 
-                       fscrypt_put_encryption_info(inode, NULL);
                        spin_lock(&inode->i_lock);
                        atomic_dec(&inode->i_count);
                }
@@ -781,6 +826,7 @@ static void f2fs_put_super(struct super_block *sb)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        int i;
+       bool dropped;
 
        f2fs_quota_off_umount(sb);
 
@@ -801,9 +847,9 @@ static void f2fs_put_super(struct super_block *sb)
        }
 
        /* be sure to wait for any on-going discard commands */
-       f2fs_wait_discard_bios(sbi, true);
+       dropped = f2fs_wait_discard_bios(sbi);
 
-       if (f2fs_discard_en(sbi) && !sbi->discard_blks) {
+       if (f2fs_discard_en(sbi) && !sbi->discard_blks && !dropped) {
                struct cp_control cpc = {
                        .reason = CP_UMOUNT | CP_TRIMMED,
                };
@@ -858,6 +904,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync)
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
        int err = 0;
 
+       if (unlikely(f2fs_cp_error(sbi)))
+               return 0;
+
        trace_f2fs_sync_fs(sb, sync);
 
        if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
@@ -957,7 +1006,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf)
        buf->f_blocks = total_count - start_count;
        buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count;
        buf->f_bavail = user_block_count - valid_user_blocks(sbi) -
-                                               sbi->reserved_blocks;
+                                               sbi->current_reserved_blocks;
 
        avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM;
 
@@ -1046,6 +1095,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
                seq_puts(seq, ",inline_xattr");
        else
                seq_puts(seq, ",noinline_xattr");
+       if (test_opt(sbi, INLINE_XATTR_SIZE))
+               seq_printf(seq, ",inline_xattr_size=%u",
+                                       sbi->inline_xattr_size);
 #endif
 #ifdef CONFIG_F2FS_FS_POSIX_ACL
        if (test_opt(sbi, POSIX_ACL))
@@ -1108,6 +1160,7 @@ static void default_options(struct f2fs_sb_info *sbi)
 {
        /* init some FS parameters */
        sbi->active_logs = NR_CURSEG_TYPE;
+       sbi->inline_xattr_size = DEFAULT_INLINE_XATTR_ADDRS;
 
        set_opt(sbi, BG_GC);
        set_opt(sbi, INLINE_XATTR);
@@ -1136,6 +1189,9 @@ static void default_options(struct f2fs_sb_info *sbi)
 #endif
 }
 
+#ifdef CONFIG_QUOTA
+static int f2fs_enable_quotas(struct super_block *sb);
+#endif
 static int f2fs_remount(struct super_block *sb, int *flags, char *data)
 {
        struct f2fs_sb_info *sbi = F2FS_SB(sb);
@@ -1202,6 +1258,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        if (f2fs_readonly(sb) && (*flags & MS_RDONLY))
                goto skip;
 
+#ifdef CONFIG_QUOTA
        if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) {
                err = dquot_suspend(sb, -1);
                if (err < 0)
@@ -1209,9 +1266,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
        } else {
                /* dquot_resume needs RW */
                sb->s_flags &= ~MS_RDONLY;
-               dquot_resume(sb, -1);
+               if (sb_any_quota_suspended(sb)) {
+                       dquot_resume(sb, -1);
+               } else if (f2fs_sb_has_quota_ino(sb)) {
+                       err = f2fs_enable_quotas(sb);
+                       if (err)
+                               goto restore_opts;
+               }
        }
-
+#endif
        /* disallow enable/disable extent_cache dynamically */
        if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) {
                err = -EINVAL;
@@ -1320,8 +1383,13 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data,
                tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread);
 repeat:
                page = read_mapping_page(mapping, blkidx, NULL);
-               if (IS_ERR(page))
+               if (IS_ERR(page)) {
+                       if (PTR_ERR(page) == -ENOMEM) {
+                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               goto repeat;
+                       }
                        return PTR_ERR(page);
+               }
 
                lock_page(page);
 
@@ -1364,11 +1432,16 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
        while (towrite > 0) {
                tocopy = min_t(unsigned long, sb->s_blocksize - offset,
                                                                towrite);
-
+retry:
                err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
                                                        &page, NULL);
-               if (unlikely(err))
+               if (unlikely(err)) {
+                       if (err == -ENOMEM) {
+                               congestion_wait(BLK_RW_ASYNC, HZ/50);
+                               goto retry;
+                       }
                        break;
+               }
 
                kaddr = kmap_atomic(page);
                memcpy(kaddr + offset, data, tocopy);
@@ -1385,8 +1458,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
        }
 
        if (len == towrite)
-               return 0;
-       inode->i_version++;
+               return err;
        inode->i_mtime = inode->i_ctime = current_time(inode);
        f2fs_mark_inode_dirty_sync(inode, false);
        return len - towrite;
@@ -1408,19 +1480,91 @@ static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type)
                                                sbi->s_jquota_fmt, type);
 }
 
-void f2fs_enable_quota_files(struct f2fs_sb_info *sbi)
+int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly)
 {
-       int i, ret;
+       int enabled = 0;
+       int i, err;
+
+       if (f2fs_sb_has_quota_ino(sbi->sb) && rdonly) {
+               err = f2fs_enable_quotas(sbi->sb);
+               if (err) {
+                       f2fs_msg(sbi->sb, KERN_ERR,
+                                       "Cannot turn on quota_ino: %d", err);
+                       return 0;
+               }
+               return 1;
+       }
 
        for (i = 0; i < MAXQUOTAS; i++) {
                if (sbi->s_qf_names[i]) {
-                       ret = f2fs_quota_on_mount(sbi, i);
-                       if (ret < 0)
-                               f2fs_msg(sbi->sb, KERN_ERR,
-                                       "Cannot turn on journaled "
-                                       "quota: error %d", ret);
+                       err = f2fs_quota_on_mount(sbi, i);
+                       if (!err) {
+                               enabled = 1;
+                               continue;
+                       }
+                       f2fs_msg(sbi->sb, KERN_ERR,
+                               "Cannot turn on quotas: %d on %d", err, i);
+               }
+       }
+       return enabled;
+}
+
+static int f2fs_quota_enable(struct super_block *sb, int type, int format_id,
+                            unsigned int flags)
+{
+       struct inode *qf_inode;
+       unsigned long qf_inum;
+       int err;
+
+       BUG_ON(!f2fs_sb_has_quota_ino(sb));
+
+       qf_inum = f2fs_qf_ino(sb, type);
+       if (!qf_inum)
+               return -EPERM;
+
+       qf_inode = f2fs_iget(sb, qf_inum);
+       if (IS_ERR(qf_inode)) {
+               f2fs_msg(sb, KERN_ERR,
+                       "Bad quota inode %u:%lu", type, qf_inum);
+               return PTR_ERR(qf_inode);
+       }
+
+       /* Don't account quota for quota files to avoid recursion */
+       qf_inode->i_flags |= S_NOQUOTA;
+       err = dquot_enable(qf_inode, type, format_id, flags);
+       iput(qf_inode);
+       return err;
+}
+
+static int f2fs_enable_quotas(struct super_block *sb)
+{
+       int type, err = 0;
+       unsigned long qf_inum;
+       bool quota_mopt[MAXQUOTAS] = {
+               test_opt(F2FS_SB(sb), USRQUOTA),
+               test_opt(F2FS_SB(sb), GRPQUOTA),
+               test_opt(F2FS_SB(sb), PRJQUOTA),
+       };
+
+       sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NOLIST_DIRTY;
+       for (type = 0; type < MAXQUOTAS; type++) {
+               qf_inum = f2fs_qf_ino(sb, type);
+               if (qf_inum) {
+                       err = f2fs_quota_enable(sb, type, QFMT_VFS_V1,
+                               DQUOT_USAGE_ENABLED |
+                               (quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
+                       if (err) {
+                               f2fs_msg(sb, KERN_ERR,
+                                       "Failed to enable quota tracking "
+                                       "(type=%d, err=%d). Please run "
+                                       "fsck to fix.", type, err);
+                               for (type--; type >= 0; type--)
+                                       dquot_quota_off(sb, type);
+                               return err;
+                       }
                }
        }
+       return 0;
 }
 
 static int f2fs_quota_sync(struct super_block *sb, int type)
@@ -1491,7 +1635,7 @@ static int f2fs_quota_off(struct super_block *sb, int type)
        f2fs_quota_sync(sb, type);
 
        err = dquot_quota_off(sb, type);
-       if (err)
+       if (err || f2fs_sb_has_quota_ino(sb))
                goto out_put;
 
        inode_lock(inode);
@@ -1594,14 +1738,9 @@ static const struct fscrypt_operations f2fs_cryptops = {
        .key_prefix     = "f2fs:",
        .get_context    = f2fs_get_context,
        .set_context    = f2fs_set_context,
-       .is_encrypted   = f2fs_encrypted_inode,
        .empty_dir      = f2fs_empty_dir,
        .max_namelen    = f2fs_max_namelen,
 };
-#else
-static const struct fscrypt_operations f2fs_cryptops = {
-       .is_encrypted   = f2fs_encrypted_inode,
-};
 #endif
 
 static struct inode *f2fs_nfs_get_inode(struct super_block *sb,
@@ -1656,7 +1795,7 @@ static loff_t max_file_blocks(void)
 
        /*
         * note: previously, result is equal to (DEF_ADDRS_PER_INODE -
-        * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more
+        * DEFAULT_INLINE_XATTR_ADDRS), but now f2fs try to reserve more
         * space in inode.i_addr, it will be more safe to reassign
         * result as zero.
         */
@@ -1965,6 +2104,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi)
                for (j = HOT; j < NR_TEMP_TYPE; j++)
                        mutex_init(&sbi->wio_mutex[i][j]);
        spin_lock_init(&sbi->cp_lock);
+
+       sbi->dirty_device = 0;
+       spin_lock_init(&sbi->dev_lock);
 }
 
 static int init_percpu_info(struct f2fs_sb_info *sbi)
@@ -2315,12 +2457,17 @@ try_onemore:
 
 #ifdef CONFIG_QUOTA
        sb->dq_op = &f2fs_quota_operations;
-       sb->s_qcop = &f2fs_quotactl_ops;
+       if (f2fs_sb_has_quota_ino(sb))
+               sb->s_qcop = &dquot_quotactl_sysfile_ops;
+       else
+               sb->s_qcop = &f2fs_quotactl_ops;
        sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ;
 #endif
 
        sb->s_op = &f2fs_sops;
+#ifdef CONFIG_F2FS_FS_ENCRYPTION
        sb->s_cop = &f2fs_cryptops;
+#endif
        sb->s_xattr = f2fs_xattr_handlers;
        sb->s_export_op = &f2fs_export_ops;
        sb->s_magic = F2FS_SUPER_MAGIC;
@@ -2411,6 +2558,7 @@ try_onemore:
                                le64_to_cpu(sbi->ckpt->valid_block_count);
        sbi->last_valid_block_count = sbi->total_valid_block_count;
        sbi->reserved_blocks = 0;
+       sbi->current_reserved_blocks = 0;
 
        for (i = 0; i < NR_INODE_TYPE; i++) {
                INIT_LIST_HEAD(&sbi->inode_list[i]);
@@ -2485,10 +2633,24 @@ try_onemore:
        if (err)
                goto free_root_inode;
 
+#ifdef CONFIG_QUOTA
+       /*
+        * Turn on quotas which were not enabled for read-only mounts if
+        * filesystem has quota feature, so that they are updated correctly.
+        */
+       if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb)) {
+               err = f2fs_enable_quotas(sb);
+               if (err) {
+                       f2fs_msg(sb, KERN_ERR,
+                               "Cannot turn on quotas: error %d", err);
+                       goto free_sysfs;
+               }
+       }
+#endif
        /* if there are nt orphan nodes free them */
        err = recover_orphan_inodes(sbi);
        if (err)
-               goto free_sysfs;
+               goto free_meta;
 
        /* recover fsynced data */
        if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
@@ -2522,7 +2684,7 @@ try_onemore:
                        err = -EINVAL;
                        f2fs_msg(sb, KERN_ERR,
                                "Need to recover fsync data");
-                       goto free_sysfs;
+                       goto free_meta;
                }
        }
 skip_recovery:
@@ -2556,6 +2718,10 @@ skip_recovery:
        return 0;
 
 free_meta:
+#ifdef CONFIG_QUOTA
+       if (f2fs_sb_has_quota_ino(sb) && !sb_rdonly(sb))
+               f2fs_quota_off_umount(sbi->sb);
+#endif
        f2fs_sync_inode_meta(sbi);
        /*
         * Some dirty meta pages can be produced by recover_orphan_inodes()
@@ -2564,7 +2730,9 @@ free_meta:
         * falls into an infinite loop in sync_meta_pages().
         */
        truncate_inode_pages_final(META_MAPPING(sbi));
+#ifdef CONFIG_QUOTA
 free_sysfs:
+#endif
        f2fs_unregister_sysfs(sbi);
 free_root_inode:
        dput(sb->s_root);
index e2c258f717cd11214eb9fe9dd7f4bb202ccd6bc4..9835348b6e5d200b71112c2eb65ad101d5bd3be0 100644
@@ -30,7 +30,7 @@ enum {
        FAULT_INFO_RATE,        /* struct f2fs_fault_info */
        FAULT_INFO_TYPE,        /* struct f2fs_fault_info */
 #endif
-       RESERVED_BLOCKS,
+       RESERVED_BLOCKS,        /* struct f2fs_sb_info */
 };
 
 struct f2fs_attr {
@@ -63,6 +63,13 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type)
        return NULL;
 }
 
+static ssize_t dirty_segments_show(struct f2fs_attr *a,
+               struct f2fs_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%llu\n",
+               (unsigned long long)(dirty_segments(sbi)));
+}
+
 static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a,
                struct f2fs_sb_info *sbi, char *buf)
 {
@@ -100,10 +107,22 @@ static ssize_t features_show(struct f2fs_attr *a,
        if (f2fs_sb_has_inode_chksum(sb))
                len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
                                len ? ", " : "", "inode_checksum");
+       if (f2fs_sb_has_flexible_inline_xattr(sb))
+               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+                               len ? ", " : "", "flexible_inline_xattr");
+       if (f2fs_sb_has_quota_ino(sb))
+               len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+                               len ? ", " : "", "quota_ino");
        len += snprintf(buf + len, PAGE_SIZE - len, "\n");
        return len;
 }
 
+static ssize_t current_reserved_blocks_show(struct f2fs_attr *a,
+                                       struct f2fs_sb_info *sbi, char *buf)
+{
+       return snprintf(buf, PAGE_SIZE, "%u\n", sbi->current_reserved_blocks);
+}
+
 static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
                        struct f2fs_sb_info *sbi, char *buf)
 {
@@ -143,34 +162,22 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a,
 #endif
        if (a->struct_type == RESERVED_BLOCKS) {
                spin_lock(&sbi->stat_lock);
-               if ((unsigned long)sbi->total_valid_block_count + t >
-                               (unsigned long)sbi->user_block_count) {
+               if (t > (unsigned long)sbi->user_block_count) {
                        spin_unlock(&sbi->stat_lock);
                        return -EINVAL;
                }
                *ui = t;
+               sbi->current_reserved_blocks = min(sbi->reserved_blocks,
+                               sbi->user_block_count - valid_user_blocks(sbi));
                spin_unlock(&sbi->stat_lock);
                return count;
        }
 
        if (!strcmp(a->attr.name, "discard_granularity")) {
-               struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info;
-               int i;
-
                if (t == 0 || t > MAX_PLIST_NUM)
                        return -EINVAL;
                if (t == *ui)
                        return count;
-
-               mutex_lock(&dcc->cmd_lock);
-               for (i = 0; i < MAX_PLIST_NUM; i++) {
-                       if (i >= t - 1)
-                               dcc->pend_list_tag[i] |= P_ACTIVE;
-                       else
-                               dcc->pend_list_tag[i] &= (~P_ACTIVE);
-               }
-               mutex_unlock(&dcc->cmd_lock);
-
                *ui = t;
                return count;
        }
@@ -222,6 +229,8 @@ enum feat_id {
        FEAT_EXTRA_ATTR,
        FEAT_PROJECT_QUOTA,
        FEAT_INODE_CHECKSUM,
+       FEAT_FLEXIBLE_INLINE_XATTR,
+       FEAT_QUOTA_INO,
 };
 
 static ssize_t f2fs_feature_show(struct f2fs_attr *a,
@@ -234,6 +243,8 @@ static ssize_t f2fs_feature_show(struct f2fs_attr *a,
        case FEAT_EXTRA_ATTR:
        case FEAT_PROJECT_QUOTA:
        case FEAT_INODE_CHECKSUM:
+       case FEAT_FLEXIBLE_INLINE_XATTR:
+       case FEAT_QUOTA_INO:
                return snprintf(buf, PAGE_SIZE, "supported\n");
        }
        return 0;
@@ -279,6 +290,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks);
 F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks);
+F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ssr_sections, min_ssr_sections);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages);
 F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio);
@@ -291,8 +303,10 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable);
 F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate);
 F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type);
 #endif
+F2FS_GENERAL_RO_ATTR(dirty_segments);
 F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes);
 F2FS_GENERAL_RO_ATTR(features);
+F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
 
 #ifdef CONFIG_F2FS_FS_ENCRYPTION
 F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO);
@@ -304,6 +318,8 @@ F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE);
 F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR);
 F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA);
 F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM);
+F2FS_FEATURE_RO_ATTR(flexible_inline_xattr, FEAT_FLEXIBLE_INLINE_XATTR);
+F2FS_FEATURE_RO_ATTR(quota_ino, FEAT_QUOTA_INO);
 
 #define ATTR_LIST(name) (&f2fs_attr_##name.attr)
 static struct attribute *f2fs_attrs[] = {
@@ -321,6 +337,7 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(min_ipu_util),
        ATTR_LIST(min_fsync_blocks),
        ATTR_LIST(min_hot_blocks),
+       ATTR_LIST(min_ssr_sections),
        ATTR_LIST(max_victim_search),
        ATTR_LIST(dir_level),
        ATTR_LIST(ram_thresh),
@@ -333,9 +350,11 @@ static struct attribute *f2fs_attrs[] = {
        ATTR_LIST(inject_rate),
        ATTR_LIST(inject_type),
 #endif
+       ATTR_LIST(dirty_segments),
        ATTR_LIST(lifetime_write_kbytes),
        ATTR_LIST(features),
        ATTR_LIST(reserved_blocks),
+       ATTR_LIST(current_reserved_blocks),
        NULL,
 };
 
@@ -350,6 +369,8 @@ static struct attribute *f2fs_feat_attrs[] = {
        ATTR_LIST(extra_attr),
        ATTR_LIST(project_quota),
        ATTR_LIST(inode_checksum),
+       ATTR_LIST(flexible_inline_xattr),
+       ATTR_LIST(quota_ino),
        NULL,
 };
 
index 7c65540148f8b1b787c54e8eaff3a6813f4a5f9e..ec8961ef8cacfbcf2dcc5807ea11d7eb860294ec 100644
@@ -217,12 +217,12 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index,
        return entry;
 }
 
-static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr,
-                                       void **last_addr, int index,
-                                       size_t len, const char *name)
+static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode,
+                               void *base_addr, void **last_addr, int index,
+                               size_t len, const char *name)
 {
        struct f2fs_xattr_entry *entry;
-       unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2;
+       unsigned int inline_size = inline_xattr_size(inode);
 
        list_for_each_xattr(entry, base_addr) {
                if ((void *)entry + sizeof(__u32) > base_addr + inline_size ||
@@ -241,12 +241,54 @@ static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr,
        return entry;
 }
 
+static int read_inline_xattr(struct inode *inode, struct page *ipage,
+                                                       void *txattr_addr)
+{
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       unsigned int inline_size = inline_xattr_size(inode);
+       struct page *page = NULL;
+       void *inline_addr;
+
+       if (ipage) {
+               inline_addr = inline_xattr_addr(inode, ipage);
+       } else {
+               page = get_node_page(sbi, inode->i_ino);
+               if (IS_ERR(page))
+                       return PTR_ERR(page);
+
+               inline_addr = inline_xattr_addr(inode, page);
+       }
+       memcpy(txattr_addr, inline_addr, inline_size);
+       f2fs_put_page(page, 1);
+
+       return 0;
+}
+
+static int read_xattr_block(struct inode *inode, void *txattr_addr)
+{
+       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
+       nid_t xnid = F2FS_I(inode)->i_xattr_nid;
+       unsigned int inline_size = inline_xattr_size(inode);
+       struct page *xpage;
+       void *xattr_addr;
+
+       /* The inode already has an extended attribute block. */
+       xpage = get_node_page(sbi, xnid);
+       if (IS_ERR(xpage))
+               return PTR_ERR(xpage);
+
+       xattr_addr = page_address(xpage);
+       memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE);
+       f2fs_put_page(xpage, 1);
+
+       return 0;
+}
+
 static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
                                unsigned int index, unsigned int len,
                                const char *name, struct f2fs_xattr_entry **xe,
                                void **base_addr)
 {
-       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        void *cur_addr, *txattr_addr, *last_addr = NULL;
        nid_t xnid = F2FS_I(inode)->i_xattr_nid;
        unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0;
@@ -263,23 +305,11 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 
        /* read from inline xattr */
        if (inline_size) {
-               struct page *page = NULL;
-               void *inline_addr;
-
-               if (ipage) {
-                       inline_addr = inline_xattr_addr(ipage);
-               } else {
-                       page = get_node_page(sbi, inode->i_ino);
-                       if (IS_ERR(page)) {
-                               err = PTR_ERR(page);
-                               goto out;
-                       }
-                       inline_addr = inline_xattr_addr(page);
-               }
-               memcpy(txattr_addr, inline_addr, inline_size);
-               f2fs_put_page(page, 1);
+               err = read_inline_xattr(inode, ipage, txattr_addr);
+               if (err)
+                       goto out;
 
-               *xe = __find_inline_xattr(txattr_addr, &last_addr,
+               *xe = __find_inline_xattr(inode, txattr_addr, &last_addr,
                                                index, len, name);
                if (*xe)
                        goto check;
@@ -287,19 +317,9 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
 
        /* read from xattr node block */
        if (xnid) {
-               struct page *xpage;
-               void *xattr_addr;
-
-               /* The inode already has an extended attribute block. */
-               xpage = get_node_page(sbi, xnid);
-               if (IS_ERR(xpage)) {
-                       err = PTR_ERR(xpage);
+               err = read_xattr_block(inode, txattr_addr);
+               if (err)
                        goto out;
-               }
-
-               xattr_addr = page_address(xpage);
-               memcpy(txattr_addr + inline_size, xattr_addr, size);
-               f2fs_put_page(xpage, 1);
        }
 
        if (last_addr)
@@ -324,7 +344,6 @@ out:
 static int read_all_xattrs(struct inode *inode, struct page *ipage,
                                                        void **base_addr)
 {
-       struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        struct f2fs_xattr_header *header;
        nid_t xnid = F2FS_I(inode)->i_xattr_nid;
        unsigned int size = VALID_XATTR_BLOCK_SIZE;
@@ -339,38 +358,16 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage,
 
        /* read from inline xattr */
        if (inline_size) {
-               struct page *page = NULL;
-               void *inline_addr;
-
-               if (ipage) {
-                       inline_addr = inline_xattr_addr(ipage);
-               } else {
-                       page = get_node_page(sbi, inode->i_ino);
-                       if (IS_ERR(page)) {
-                               err = PTR_ERR(page);
-                               goto fail;
-                       }
-                       inline_addr = inline_xattr_addr(page);
-               }
-               memcpy(txattr_addr, inline_addr, inline_size);
-               f2fs_put_page(page, 1);
+               err = read_inline_xattr(inode, ipage, txattr_addr);
+               if (err)
+                       goto fail;
        }
 
        /* read from xattr node block */
        if (xnid) {
-               struct page *xpage;
-               void *xattr_addr;
-
-               /* The inode already has an extended attribute block. */
-               xpage = get_node_page(sbi, xnid);
-               if (IS_ERR(xpage)) {
-                       err = PTR_ERR(xpage);
+               err = read_xattr_block(inode, txattr_addr);
+               if (err)
                        goto fail;
-               }
-
-               xattr_addr = page_address(xpage);
-               memcpy(txattr_addr + inline_size, xattr_addr, size);
-               f2fs_put_page(xpage, 1);
        }
 
        header = XATTR_HDR(txattr_addr);
@@ -392,10 +389,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 {
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        size_t inline_size = inline_xattr_size(inode);
+       struct page *in_page = NULL;
        void *xattr_addr;
+       void *inline_addr = NULL;
        struct page *xpage;
        nid_t new_nid = 0;
-       int err;
+       int err = 0;
 
        if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid)
                if (!alloc_nid(sbi, &new_nid))
@@ -403,30 +402,30 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
 
        /* write to inline xattr */
        if (inline_size) {
-               struct page *page = NULL;
-               void *inline_addr;
-
                if (ipage) {
-                       inline_addr = inline_xattr_addr(ipage);
-                       f2fs_wait_on_page_writeback(ipage, NODE, true);
-                       set_page_dirty(ipage);
+                       inline_addr = inline_xattr_addr(inode, ipage);
                } else {
-                       page = get_node_page(sbi, inode->i_ino);
-                       if (IS_ERR(page)) {
+                       in_page = get_node_page(sbi, inode->i_ino);
+                       if (IS_ERR(in_page)) {
                                alloc_nid_failed(sbi, new_nid);
-                               return PTR_ERR(page);
+                               return PTR_ERR(in_page);
                        }
-                       inline_addr = inline_xattr_addr(page);
-                       f2fs_wait_on_page_writeback(page, NODE, true);
+                       inline_addr = inline_xattr_addr(inode, in_page);
                }
-               memcpy(inline_addr, txattr_addr, inline_size);
-               f2fs_put_page(page, 1);
 
+               f2fs_wait_on_page_writeback(ipage ? ipage : in_page,
+                                                       NODE, true);
                /* no need to use xattr node block */
                if (hsize <= inline_size) {
-                       err = truncate_xattr_node(inode, ipage);
+                       err = truncate_xattr_node(inode);
                        alloc_nid_failed(sbi, new_nid);
-                       return err;
+                       if (err) {
+                               f2fs_put_page(in_page, 1);
+                               return err;
+                       }
+                       memcpy(inline_addr, txattr_addr, inline_size);
+                       set_page_dirty(ipage ? ipage : in_page);
+                       goto in_page_out;
                }
        }
 
@@ -435,7 +434,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
                xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid);
                if (IS_ERR(xpage)) {
                        alloc_nid_failed(sbi, new_nid);
-                       return PTR_ERR(xpage);
+                       goto in_page_out;
                }
                f2fs_bug_on(sbi, new_nid);
                f2fs_wait_on_page_writeback(xpage, NODE, true);
@@ -445,17 +444,24 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize,
                xpage = new_node_page(&dn, XATTR_NODE_OFFSET);
                if (IS_ERR(xpage)) {
                        alloc_nid_failed(sbi, new_nid);
-                       return PTR_ERR(xpage);
+                       goto in_page_out;
                }
                alloc_nid_done(sbi, new_nid);
        }
-
        xattr_addr = page_address(xpage);
+
+       if (inline_size)
+               memcpy(inline_addr, txattr_addr, inline_size);
        memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE);
+
+       if (inline_size)
+               set_page_dirty(ipage ? ipage : in_page);
        set_page_dirty(xpage);
-       f2fs_put_page(xpage, 1);
 
-       return 0;
+       f2fs_put_page(xpage, 1);
+in_page_out:
+       f2fs_put_page(in_page, 1);
+       return err;
 }
 
 int f2fs_getxattr(struct inode *inode, int index, const char *name,
@@ -681,6 +687,10 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
        struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
        int err;
 
+       err = dquot_initialize(inode);
+       if (err)
+               return err;
+
        /* this case is only from init_inode_metadata */
        if (ipage)
                return __f2fs_setxattr(inode, index, name, value,
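
The write_all_xattrs() rework above keeps the inline-xattr node page (in_page) held until both the inline area and the xattr node block have been updated, and routes every exit through the in_page_out label so the page reference is dropped exactly once. A minimal sketch of that acquire/single-exit shape, for orientation only (grab_page() and the update_*() helpers are made-up names, not f2fs APIs):

    /* Illustrative only: one label releases the page on every path. */
    static int update_two_places(struct inode *inode)
    {
            struct page *page;
            int err = 0;

            page = grab_page(inode);                /* hypothetical helper */
            if (IS_ERR(page))
                    return PTR_ERR(page);

            err = update_first_copy(inode, page);   /* hypothetical helper */
            if (err)
                    goto out;

            err = update_second_copy(inode, page);  /* hypothetical helper */
            if (!err)
                    set_page_dirty(page);
    out:
            f2fs_put_page(page, 1);                 /* single release point */
            return err;
    }
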
index 245c430a2e418b1ccaea562b8ce0f7425bc66c3e..3a76082c2c6b7837da8a86a0e7c9b5ea51671c41 100644 (file)
@@ -2111,7 +2111,7 @@ void __mark_inode_dirty(struct inode *inode, int flags)
            (dirtytime && (inode->i_state & I_DIRTY_INODE)))
                return;
 
-       if (unlikely(block_dump))
+       if (unlikely(block_dump > 1))
                block_dump___mark_inode_dirty(inode);
 
        spin_lock(&inode->i_lock);
index be0250788b737c7633ae155b608502daf3b7c4da..987c95b950f62470167e8e933b05dbfd1b52d84d 100644 (file)
@@ -45,6 +45,7 @@ void set_fs_pwd(struct fs_struct *fs, const struct path *path)
        if (old_pwd.dentry)
                path_put(&old_pwd);
 }
+EXPORT_SYMBOL(set_fs_pwd);
 
 static inline int replace_path(struct path *p, const struct path *old, const struct path *new)
 {
@@ -90,6 +91,7 @@ void free_fs_struct(struct fs_struct *fs)
        path_put(&fs->pwd);
        kmem_cache_free(fs_cachep, fs);
 }
+EXPORT_SYMBOL(free_fs_struct);
 
 void exit_fs(struct task_struct *tsk)
 {
@@ -128,6 +130,7 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
        }
        return fs;
 }
+EXPORT_SYMBOL_GPL(copy_fs_struct);
 
 int unshare_fs_struct(void)
 {
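
set_fs_pwd(), free_fs_struct() and copy_fs_struct() are exported above so that modular, stacked filesystems added later in this merge (sdcardfs) can run lower-filesystem operations under a private fs_struct. A hedged sketch of that usage, assuming a caller that temporarily installs a copied fs_struct on the current task (swap_fs() and with_private_pwd() are illustrative names):

    #include <linux/fs_struct.h>
    #include <linux/sched/task.h>
    #include <linux/path.h>

    /* Illustrative: swap current->fs under task_lock(). */
    static struct fs_struct *swap_fs(struct fs_struct *new_fs)
    {
            struct fs_struct *old_fs = current->fs;

            task_lock(current);
            current->fs = new_fs;
            task_unlock(current);
            return old_fs;
    }

    static int with_private_pwd(const struct path *pwd)
    {
            struct fs_struct *copied, *saved;

            copied = copy_fs_struct(current->fs);   /* EXPORT_SYMBOL_GPL above */
            if (!copied)
                    return -ENOMEM;

            set_fs_pwd(copied, pwd);                /* exported above */
            saved = swap_fs(copied);
            /* ... perform lookups relative to pwd ... */
            swap_fs(saved);
            free_fs_struct(copied);                 /* exported above */
            return 0;
    }
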
index 13c65dd2d37d1ab1af358f82b42c43ba8c2cc2de..032485010a05ac672c13ac15340e1d0c21a38b98 100644 (file)
@@ -14,6 +14,7 @@
 #include <linux/sched/signal.h>
 #include <linux/uio.h>
 #include <linux/miscdevice.h>
+#include <linux/namei.h>
 #include <linux/pagemap.h>
 #include <linux/file.h>
 #include <linux/slab.h>
@@ -21,6 +22,7 @@
 #include <linux/swap.h>
 #include <linux/splice.h>
 #include <linux/sched.h>
+#include <linux/freezer.h>
 
 MODULE_ALIAS_MISCDEV(FUSE_MINOR);
 MODULE_ALIAS("devname:fuse");
@@ -454,7 +456,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req)
         * Either request is already in userspace, or it was forced.
         * Wait it out.
         */
-       wait_event(req->waitq, test_bit(FR_FINISHED, &req->flags));
+       while (!test_bit(FR_FINISHED, &req->flags))
+               wait_event_freezable(req->waitq,
+                               test_bit(FR_FINISHED, &req->flags));
 }
 
 static void __fuse_request_send(struct fuse_conn *fc, struct fuse_req *req)
@@ -1887,6 +1891,10 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud,
                cs->move_pages = 0;
 
        err = copy_out_args(cs, &req->out, nbytes);
+       if (req->in.h.opcode == FUSE_CANONICAL_PATH) {
+               req->out.h.error = kern_path((char *)req->out.args[0].value, 0,
+                                                       req->canonical_path);
+       }
        fuse_copy_finish(cs);
 
        spin_lock(&fpq->lock);
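
request_wait_answer() above now loops on wait_event_freezable() instead of an unconditional wait_event(), so a task stuck waiting for a FUSE reply can enter the freezer during suspend rather than blocking it; the outer loop re-checks FR_FINISHED because a freezable wait can return early (for example around a freeze/thaw cycle or with a signal pending) before the request is actually done. The same pattern in isolation, as a sketch (only wait_event_freezable() and test_bit() are real APIs here):

    #include <linux/freezer.h>
    #include <linux/wait.h>

    /* Illustrative: wait for a completion bit without blocking suspend. */
    static void wait_for_done(wait_queue_head_t *wq, unsigned long *flags, int bit)
    {
            /* keep waiting until the condition really holds */
            while (!test_bit(bit, flags))
                    wait_event_freezable(*wq, test_bit(bit, flags));
    }
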
index 24967382a7b15271ae0646c3cb5522b3e58c37d0..cebd108dc3ce21ce62fd57dd8dddb9a9a551136c 100644 (file)
@@ -262,6 +262,50 @@ invalid:
        goto out;
 }
 
+/*
+ * Get the canonical path. Since we must translate to a path, this must be done
+ * in the context of the userspace daemon; however, the userspace daemon cannot
+ * look up paths on its own. Instead, we handle the lookup as a special case
+ * inside the write request.
+ */
+static void fuse_dentry_canonical_path(const struct path *path, struct path *canonical_path) {
+       struct inode *inode = path->dentry->d_inode;
+       struct fuse_conn *fc = get_fuse_conn(inode);
+       struct fuse_req *req;
+       int err;
+       char *path_name;
+
+       req = fuse_get_req(fc, 1);
+       err = PTR_ERR(req);
+       if (IS_ERR(req))
+               goto default_path;
+
+       path_name = (char*)__get_free_page(GFP_KERNEL);
+       if (!path_name) {
+               fuse_put_request(fc, req);
+               goto default_path;
+       }
+
+       req->in.h.opcode = FUSE_CANONICAL_PATH;
+       req->in.h.nodeid = get_node_id(inode);
+       req->in.numargs = 0;
+       req->out.numargs = 1;
+       req->out.args[0].size = PATH_MAX;
+       req->out.args[0].value = path_name;
+       req->canonical_path = canonical_path;
+       req->out.argvar = 1;
+       fuse_request_send(fc, req);
+       err = req->out.h.error;
+       fuse_put_request(fc, req);
+       free_page((unsigned long)path_name);
+       if (!err)
+               return;
+default_path:
+       canonical_path->dentry = path->dentry;
+       canonical_path->mnt = path->mnt;
+       path_get(canonical_path);
+}
+
 static int invalid_nodeid(u64 nodeid)
 {
        return !nodeid || nodeid == FUSE_ROOT_ID;
@@ -284,11 +328,13 @@ const struct dentry_operations fuse_dentry_operations = {
        .d_revalidate   = fuse_dentry_revalidate,
        .d_init         = fuse_dentry_init,
        .d_release      = fuse_dentry_release,
+       .d_canonical_path = fuse_dentry_canonical_path,
 };
 
 const struct dentry_operations fuse_root_dentry_operations = {
        .d_init         = fuse_dentry_init,
        .d_release      = fuse_dentry_release,
+       .d_canonical_path = fuse_dentry_canonical_path,
 };
 
 int fuse_valid_type(int m)
index d5773ca67ad2bbc36155a30fafa596dd7e44502a..61581f54a482ba5cda0c2a4f221dd84ac8dd9cf0 100644 (file)
@@ -370,6 +370,9 @@ struct fuse_req {
        /** Inode used in the request or NULL */
        struct inode *inode;
 
+       /** Path used for completing d_canonical_path */
+       struct path *canonical_path;
+
        /** AIO control block */
        struct fuse_io_priv *io;
 
index d1e35b53bb23b80db7077500f63eeec9bce6bb28..f9497e62b9263485ef9f2207e46ac52e13facb2d 100644 (file)
@@ -1793,7 +1793,7 @@ int dentry_needs_remove_privs(struct dentry *dentry)
        return mask;
 }
 
-static int __remove_privs(struct dentry *dentry, int kill)
+static int __remove_privs(struct vfsmount *mnt, struct dentry *dentry, int kill)
 {
        struct iattr newattrs;
 
@@ -1802,7 +1802,7 @@ static int __remove_privs(struct dentry *dentry, int kill)
         * Note we call this on write, so notify_change will not
         * encounter any conflicting delegations:
         */
-       return notify_change(dentry, &newattrs, NULL);
+       return notify_change2(mnt, dentry, &newattrs, NULL);
 }
 
 /*
@@ -1824,7 +1824,7 @@ int file_remove_privs(struct file *file)
        if (kill < 0)
                return kill;
        if (kill)
-               error = __remove_privs(dentry, kill);
+               error = __remove_privs(file->f_path.mnt, dentry, kill);
        if (!error)
                inode_has_no_xattr(inode);
 
index 48cee21b4f146f5da86b3e485a1f9c155c33a5fe..b4338451c1e41a837b28d1698c05d927c18ef81c 100644 (file)
@@ -90,9 +90,11 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
+extern int do_remount_sb2(struct vfsmount *, struct super_block *, int,
+                                                               void *, int);
 extern bool trylock_super(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
-                              int, const char *, void *);
+                              int, const char *, struct vfsmount *, void *);
 extern struct super_block *user_get_super(dev_t);
 
 /*
index b7e7f570733ad0766afe5d7f116e787c7ebf21f4..93e3cf7bf27c7b524caf3f0f4584416452e98bee 100644 (file)
 #include <linux/cleancache.h>
 #include "internal.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/android_fs.h>
+
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_datawrite_end);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_start);
+EXPORT_TRACEPOINT_SYMBOL(android_fs_dataread_end);
+
 /*
  * I/O completion handler for multipage BIOs.
  *
@@ -49,6 +57,16 @@ static void mpage_end_io(struct bio *bio)
        struct bio_vec *bv;
        int i;
 
+       if (trace_android_fs_dataread_end_enabled() &&
+           (bio_data_dir(bio) == READ)) {
+               struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+               if (first_page != NULL)
+                       trace_android_fs_dataread_end(first_page->mapping->host,
+                                                     page_offset(first_page),
+                                                     bio->bi_iter.bi_size);
+       }
+
        bio_for_each_segment_all(bv, bio, i) {
                struct page *page = bv->bv_page;
                page_endio(page, op_is_write(bio_op(bio)),
@@ -60,6 +78,24 @@ static void mpage_end_io(struct bio *bio)
 
 static struct bio *mpage_bio_submit(int op, int op_flags, struct bio *bio)
 {
+       if (trace_android_fs_dataread_start_enabled() && (op == REQ_OP_READ)) {
+               struct page *first_page = bio->bi_io_vec[0].bv_page;
+
+               if (first_page != NULL) {
+                       char *path, pathbuf[MAX_TRACE_PATHBUF_LEN];
+
+                       path = android_fstrace_get_pathname(pathbuf,
+                                                   MAX_TRACE_PATHBUF_LEN,
+                                                   first_page->mapping->host);
+                       trace_android_fs_dataread_start(
+                               first_page->mapping->host,
+                               page_offset(first_page),
+                               bio->bi_iter.bi_size,
+                               current->pid,
+                               path,
+                               current->comm);
+               }
+       }
        bio->bi_end_io = mpage_end_io;
        bio_set_op_attrs(bio, op, op_flags);
        guard_bio_eod(op, bio);
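
Both tracing hooks above check the auto-generated trace_android_fs_dataread_*_enabled() helpers before doing any work, so the pathname lookup and argument marshalling are only paid for when the tracepoint is live. That *_enabled() helper exists for every TRACE_EVENT and is backed by a static key; a generic sketch of the guard pattern (my_read_start and build_debug_path() are hypothetical, not part of this patch):

    /* Illustrative: guard expensive tracepoint arguments. */
    static void start_read(struct inode *inode, loff_t pos, size_t len)
    {
            if (trace_my_read_start_enabled()) {
                    char buf[256];
                    const char *path;

                    path = build_debug_path(buf, sizeof(buf), inode); /* hypothetical */
                    trace_my_read_start(inode, pos, len, path);
            }
            /* ... issue the I/O as usual ... */
    }
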
index 62a0db6e6725e73ed93bb1a014a18fed42072444..11aae5599575ef407f5ec1ecf41aeaeffef9c261 100644 (file)
@@ -376,9 +376,11 @@ EXPORT_SYMBOL(generic_permission);
  * flag in inode->i_opflags, that says "this has not special
  * permission function, use the fast case".
  */
-static inline int do_inode_permission(struct inode *inode, int mask)
+static inline int do_inode_permission(struct vfsmount *mnt, struct inode *inode, int mask)
 {
        if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
+               if (likely(mnt && inode->i_op->permission2))
+                       return inode->i_op->permission2(mnt, inode, mask);
                if (likely(inode->i_op->permission))
                        return inode->i_op->permission(inode, mask);
 
@@ -402,7 +404,7 @@ static inline int do_inode_permission(struct inode *inode, int mask)
  * This does not check for a read-only file system.  You probably want
  * inode_permission().
  */
-int __inode_permission(struct inode *inode, int mask)
+int __inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
 {
        int retval;
 
@@ -422,7 +424,7 @@ int __inode_permission(struct inode *inode, int mask)
                        return -EACCES;
        }
 
-       retval = do_inode_permission(inode, mask);
+       retval = do_inode_permission(mnt, inode, mask);
        if (retval)
                return retval;
 
@@ -430,7 +432,14 @@ int __inode_permission(struct inode *inode, int mask)
        if (retval)
                return retval;
 
-       return security_inode_permission(inode, mask);
+       retval = security_inode_permission(inode, mask);
+       return retval;
+}
+EXPORT_SYMBOL(__inode_permission2);
+
+int __inode_permission(struct inode *inode, int mask)
+{
+       return __inode_permission2(NULL, inode, mask);
 }
 EXPORT_SYMBOL(__inode_permission);
 
@@ -465,14 +474,20 @@ static int sb_permission(struct super_block *sb, struct inode *inode, int mask)
  *
  * When checking for MAY_APPEND, MAY_WRITE must also be set in @mask.
  */
-int inode_permission(struct inode *inode, int mask)
+int inode_permission2(struct vfsmount *mnt, struct inode *inode, int mask)
 {
        int retval;
 
        retval = sb_permission(inode->i_sb, inode, mask);
        if (retval)
                return retval;
-       return __inode_permission(inode, mask);
+       return __inode_permission2(mnt, inode, mask);
+}
+EXPORT_SYMBOL(inode_permission2);
+
+int inode_permission(struct inode *inode, int mask)
+{
+       return inode_permission2(NULL, inode, mask);
 }
 EXPORT_SYMBOL(inode_permission);
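
The pattern used throughout this namei.c change is worth calling out: each entry point gains a *2 variant that threads the struct vfsmount down to the permission check (so a mount-aware ->permission2() implementation, such as the one sdcardfs provides, can see which mount the access came through), while the original function remains as a NULL-mount wrapper so existing callers and exports keep working. Reduced to a sketch with made-up names (vfs_frob/vfs_frob2 and ->frob/->frob2 are illustrative):

    /* Illustrative: add a mount-aware variant without breaking callers. */
    int vfs_frob2(struct vfsmount *mnt, struct inode *inode, int mask)
    {
            if (mnt && inode->i_op->frob2)          /* hypothetical op */
                    return inode->i_op->frob2(mnt, inode, mask);
            return inode->i_op->frob(inode, mask);  /* hypothetical op */
    }
    EXPORT_SYMBOL(vfs_frob2);

    int vfs_frob(struct inode *inode, int mask)
    {
            return vfs_frob2(NULL, inode, mask);    /* legacy entry point */
    }
    EXPORT_SYMBOL(vfs_frob);
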
 
@@ -1667,13 +1682,13 @@ out:
 static inline int may_lookup(struct nameidata *nd)
 {
        if (nd->flags & LOOKUP_RCU) {
-               int err = inode_permission(nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
+               int err = inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC|MAY_NOT_BLOCK);
                if (err != -ECHILD)
                        return err;
                if (unlazy_walk(nd))
                        return -ECHILD;
        }
-       return inode_permission(nd->inode, MAY_EXEC);
+       return inode_permission2(nd->path.mnt, nd->inode, MAY_EXEC);
 }
 
 static inline int handle_dots(struct nameidata *nd, int type)
@@ -2449,6 +2464,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
 /**
  * lookup_one_len - filesystem helper to lookup single pathname component
  * @name:      pathname component to lookup
+ * @mnt:       mount we are looking up on
  * @base:      base directory to lookup from
  * @len:       maximum length @len should be interpreted to
  *
@@ -2457,7 +2473,7 @@ EXPORT_SYMBOL(vfs_path_lookup);
  *
  * The caller must hold base->i_mutex.
  */
-struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+struct dentry *lookup_one_len2(const char *name, struct vfsmount *mnt, struct dentry *base, int len)
 {
        struct qstr this;
        unsigned int c;
@@ -2491,12 +2507,18 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
                        return ERR_PTR(err);
        }
 
-       err = inode_permission(base->d_inode, MAY_EXEC);
+       err = inode_permission2(mnt, base->d_inode, MAY_EXEC);
        if (err)
                return ERR_PTR(err);
 
        return __lookup_hash(&this, base, 0);
 }
+EXPORT_SYMBOL(lookup_one_len2);
+
+struct dentry *lookup_one_len(const char *name, struct dentry *base, int len)
+{
+       return lookup_one_len2(name, NULL, base, len);
+}
 EXPORT_SYMBOL(lookup_one_len);
 
 /**
@@ -2774,7 +2796,7 @@ EXPORT_SYMBOL(__check_sticky);
  * 11. We don't allow removal of NFS sillyrenamed files; it's handled by
  *     nfs_async_unlink().
  */
-static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
+static int may_delete(struct vfsmount *mnt, struct inode *dir, struct dentry *victim, bool isdir)
 {
        struct inode *inode = d_backing_inode(victim);
        int error;
@@ -2786,7 +2808,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
        BUG_ON(victim->d_parent->d_inode != dir);
        audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
        if (IS_APPEND(dir))
@@ -2818,7 +2840,7 @@ static int may_delete(struct inode *dir, struct dentry *victim, bool isdir)
  *  4. We should have write and exec permissions on dir
  *  5. We can't do it if dir is immutable (done in permission())
  */
-static inline int may_create(struct inode *dir, struct dentry *child)
+static inline int may_create(struct vfsmount *mnt, struct inode *dir, struct dentry *child)
 {
        struct user_namespace *s_user_ns;
        audit_inode_child(dir, child, AUDIT_TYPE_CHILD_CREATE);
@@ -2830,7 +2852,7 @@ static inline int may_create(struct inode *dir, struct dentry *child)
        if (!kuid_has_mapping(s_user_ns, current_fsuid()) ||
            !kgid_has_mapping(s_user_ns, current_fsgid()))
                return -EOVERFLOW;
-       return inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       return inode_permission2(mnt, dir, MAY_WRITE | MAY_EXEC);
 }
 
 /*
@@ -2877,10 +2899,10 @@ void unlock_rename(struct dentry *p1, struct dentry *p2)
 }
 EXPORT_SYMBOL(unlock_rename);
 
-int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
-               bool want_excl)
+int vfs_create2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry,
+               umode_t mode, bool want_excl)
 {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt, dir, dentry);
        if (error)
                return error;
 
@@ -2896,6 +2918,13 @@ int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
                fsnotify_create(dir, dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_create2);
+
+int vfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
+               bool want_excl)
+{
+       return vfs_create2(NULL, dir, dentry, mode, want_excl);
+}
 EXPORT_SYMBOL(vfs_create);
 
 bool may_open_dev(const struct path *path)
@@ -2907,6 +2936,7 @@ bool may_open_dev(const struct path *path)
 static int may_open(const struct path *path, int acc_mode, int flag)
 {
        struct dentry *dentry = path->dentry;
+       struct vfsmount *mnt = path->mnt;
        struct inode *inode = dentry->d_inode;
        int error;
 
@@ -2931,7 +2961,7 @@ static int may_open(const struct path *path, int acc_mode, int flag)
                break;
        }
 
-       error = inode_permission(inode, MAY_OPEN | acc_mode);
+       error = inode_permission2(mnt, inode, MAY_OPEN | acc_mode);
        if (error)
                return error;
 
@@ -2966,7 +2996,7 @@ static int handle_truncate(struct file *filp)
        if (!error)
                error = security_path_truncate(path);
        if (!error) {
-               error = do_truncate(path->dentry, 0,
+               error = do_truncate2(path->mnt, path->dentry, 0,
                                    ATTR_MTIME|ATTR_CTIME|ATTR_OPEN,
                                    filp);
        }
@@ -2993,7 +3023,7 @@ static int may_o_create(const struct path *dir, struct dentry *dentry, umode_t m
            !kgid_has_mapping(s_user_ns, current_fsgid()))
                return -EOVERFLOW;
 
-       error = inode_permission(dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
+       error = inode_permission2(dir->mnt, dir->dentry->d_inode, MAY_WRITE | MAY_EXEC);
        if (error)
                return error;
 
@@ -3406,7 +3436,8 @@ struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode, int open_flag)
        int error;
 
        /* we want directory to be writable */
-       error = inode_permission(dir, MAY_WRITE | MAY_EXEC);
+       error = inode_permission2(ERR_PTR(-EOPNOTSUPP), dir,
+                                       MAY_WRITE | MAY_EXEC);
        if (error)
                goto out_err;
        error = -EOPNOTSUPP;
@@ -3684,9 +3715,9 @@ inline struct dentry *user_path_create(int dfd, const char __user *pathname,
 }
 EXPORT_SYMBOL(user_path_create);
 
-int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+int vfs_mknod2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
 {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt, dir, dentry);
 
        if (error)
                return error;
@@ -3710,6 +3741,12 @@ int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
                fsnotify_create(dir, dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_mknod2);
+
+int vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev)
+{
+       return vfs_mknod2(NULL, dir, dentry, mode, dev);
+}
 EXPORT_SYMBOL(vfs_mknod);
 
 static int may_mknod(umode_t mode)
@@ -3752,12 +3789,12 @@ retry:
                goto out;
        switch (mode & S_IFMT) {
                case 0: case S_IFREG:
-                       error = vfs_create(path.dentry->d_inode,dentry,mode,true);
+                       error = vfs_create2(path.mnt, path.dentry->d_inode,dentry,mode,true);
                        if (!error)
                                ima_post_path_mknod(dentry);
                        break;
                case S_IFCHR: case S_IFBLK:
-                       error = vfs_mknod(path.dentry->d_inode,dentry,mode,
+                       error = vfs_mknod2(path.mnt, path.dentry->d_inode,dentry,mode,
                                        new_decode_dev(dev));
                        break;
                case S_IFIFO: case S_IFSOCK:
@@ -3778,9 +3815,9 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d
        return sys_mknodat(AT_FDCWD, filename, mode, dev);
 }
 
-int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+int vfs_mkdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, umode_t mode)
 {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt, dir, dentry);
        unsigned max_links = dir->i_sb->s_max_links;
 
        if (error)
@@ -3802,6 +3839,12 @@ int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
                fsnotify_mkdir(dir, dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_mkdir2);
+
+int vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+       return vfs_mkdir2(NULL, dir, dentry, mode);
+}
 EXPORT_SYMBOL(vfs_mkdir);
 
 SYSCALL_DEFINE3(mkdirat, int, dfd, const char __user *, pathname, umode_t, mode)
@@ -3820,7 +3863,7 @@ retry:
                mode &= ~current_umask();
        error = security_path_mkdir(&path, dentry, mode);
        if (!error)
-               error = vfs_mkdir(path.dentry->d_inode, dentry, mode);
+               error = vfs_mkdir2(path.mnt, path.dentry->d_inode, dentry, mode);
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
@@ -3834,9 +3877,9 @@ SYSCALL_DEFINE2(mkdir, const char __user *, pathname, umode_t, mode)
        return sys_mkdirat(AT_FDCWD, pathname, mode);
 }
 
-int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+int vfs_rmdir2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry)
 {
-       int error = may_delete(dir, dentry, 1);
+       int error = may_delete(mnt, dir, dentry, 1);
 
        if (error)
                return error;
@@ -3871,6 +3914,12 @@ out:
                d_delete(dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_rmdir2);
+
+int vfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       return vfs_rmdir2(NULL, dir, dentry);
+}
 EXPORT_SYMBOL(vfs_rmdir);
 
 static long do_rmdir(int dfd, const char __user *pathname)
@@ -3916,7 +3965,7 @@ retry:
        error = security_path_rmdir(&path, dentry);
        if (error)
                goto exit3;
-       error = vfs_rmdir(path.dentry->d_inode, dentry);
+       error = vfs_rmdir2(path.mnt, path.dentry->d_inode, dentry);
 exit3:
        dput(dentry);
 exit2:
@@ -3955,10 +4004,10 @@ SYSCALL_DEFINE1(rmdir, const char __user *, pathname)
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
  */
-int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+int vfs_unlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
 {
        struct inode *target = dentry->d_inode;
-       int error = may_delete(dir, dentry, 0);
+       int error = may_delete(mnt, dir, dentry, 0);
 
        if (error)
                return error;
@@ -3993,6 +4042,12 @@ out:
 
        return error;
 }
+EXPORT_SYMBOL(vfs_unlink2);
+
+int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegated_inode)
+{
+       return vfs_unlink2(NULL, dir, dentry, delegated_inode);
+}
 EXPORT_SYMBOL(vfs_unlink);
 
 /*
@@ -4040,7 +4095,7 @@ retry_deleg:
                error = security_path_unlink(&path, dentry);
                if (error)
                        goto exit2;
-               error = vfs_unlink(path.dentry->d_inode, dentry, &delegated_inode);
+               error = vfs_unlink2(path.mnt, path.dentry->d_inode, dentry, &delegated_inode);
 exit2:
                dput(dentry);
        }
@@ -4090,9 +4145,9 @@ SYSCALL_DEFINE1(unlink, const char __user *, pathname)
        return do_unlinkat(AT_FDCWD, pathname);
 }
 
-int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+int vfs_symlink2(struct vfsmount *mnt, struct inode *dir, struct dentry *dentry, const char *oldname)
 {
-       int error = may_create(dir, dentry);
+       int error = may_create(mnt, dir, dentry);
 
        if (error)
                return error;
@@ -4109,6 +4164,12 @@ int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
                fsnotify_create(dir, dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_symlink2);
+
+int vfs_symlink(struct inode *dir, struct dentry *dentry, const char *oldname)
+{
+       return vfs_symlink2(NULL, dir, dentry, oldname);
+}
 EXPORT_SYMBOL(vfs_symlink);
 
 SYSCALL_DEFINE3(symlinkat, const char __user *, oldname,
@@ -4131,7 +4192,7 @@ retry:
 
        error = security_path_symlink(&path, dentry, from->name);
        if (!error)
-               error = vfs_symlink(path.dentry->d_inode, dentry, from->name);
+               error = vfs_symlink2(path.mnt, path.dentry->d_inode, dentry, from->name);
        done_path_create(&path, dentry);
        if (retry_estale(error, lookup_flags)) {
                lookup_flags |= LOOKUP_REVAL;
@@ -4166,7 +4227,7 @@ SYSCALL_DEFINE2(symlink, const char __user *, oldname, const char __user *, newn
  * be appropriate for callers that expect the underlying filesystem not
  * to be NFS exported.
  */
-int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+int vfs_link2(struct vfsmount *mnt, struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
 {
        struct inode *inode = old_dentry->d_inode;
        unsigned max_links = dir->i_sb->s_max_links;
@@ -4175,7 +4236,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
        if (!inode)
                return -ENOENT;
 
-       error = may_create(dir, new_dentry);
+       error = may_create(mnt, dir, new_dentry);
        if (error)
                return error;
 
@@ -4225,6 +4286,12 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de
                fsnotify_link(dir, inode, new_dentry);
        return error;
 }
+EXPORT_SYMBOL(vfs_link2);
+
+int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry, struct inode **delegated_inode)
+{
+       return vfs_link2(NULL, old_dentry, dir, new_dentry, delegated_inode);
+}
 EXPORT_SYMBOL(vfs_link);
 
 /*
@@ -4280,7 +4347,7 @@ retry:
        error = security_path_link(old_path.dentry, &new_path, new_dentry);
        if (error)
                goto out_dput;
-       error = vfs_link(old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
+       error = vfs_link2(old_path.mnt, old_path.dentry, new_path.dentry->d_inode, new_dentry, &delegated_inode);
 out_dput:
        done_path_create(&new_path, new_dentry);
        if (delegated_inode) {
@@ -4356,7 +4423,8 @@ SYSCALL_DEFINE2(link, const char __user *, oldname, const char __user *, newname
  *        ->i_mutex on parents, which works but leads to some truly excessive
  *        locking].
  */
-int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+int vfs_rename2(struct vfsmount *mnt,
+              struct inode *old_dir, struct dentry *old_dentry,
               struct inode *new_dir, struct dentry *new_dentry,
               struct inode **delegated_inode, unsigned int flags)
 {
@@ -4371,19 +4439,19 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
        if (source == target)
                return 0;
 
-       error = may_delete(old_dir, old_dentry, is_dir);
+       error = may_delete(mnt, old_dir, old_dentry, is_dir);
        if (error)
                return error;
 
        if (!target) {
-               error = may_create(new_dir, new_dentry);
+               error = may_create(mnt, new_dir, new_dentry);
        } else {
                new_is_dir = d_is_dir(new_dentry);
 
                if (!(flags & RENAME_EXCHANGE))
-                       error = may_delete(new_dir, new_dentry, is_dir);
+                       error = may_delete(mnt, new_dir, new_dentry, is_dir);
                else
-                       error = may_delete(new_dir, new_dentry, new_is_dir);
+                       error = may_delete(mnt, new_dir, new_dentry, new_is_dir);
        }
        if (error)
                return error;
@@ -4397,12 +4465,12 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
         */
        if (new_dir != old_dir) {
                if (is_dir) {
-                       error = inode_permission(source, MAY_WRITE);
+                       error = inode_permission2(mnt, source, MAY_WRITE);
                        if (error)
                                return error;
                }
                if ((flags & RENAME_EXCHANGE) && new_is_dir) {
-                       error = inode_permission(target, MAY_WRITE);
+                       error = inode_permission2(mnt, target, MAY_WRITE);
                        if (error)
                                return error;
                }
@@ -4479,6 +4547,14 @@ out:
 
        return error;
 }
+EXPORT_SYMBOL(vfs_rename2);
+
+int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+              struct inode *new_dir, struct dentry *new_dentry,
+              struct inode **delegated_inode, unsigned int flags)
+{
+       return vfs_rename2(NULL, old_dir, old_dentry, new_dir, new_dentry, delegated_inode, flags);
+}
 EXPORT_SYMBOL(vfs_rename);
 
 SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname,
@@ -4592,7 +4668,7 @@ retry_deleg:
                                     &new_path, new_dentry, flags);
        if (error)
                goto exit5;
-       error = vfs_rename(old_path.dentry->d_inode, old_dentry,
+       error = vfs_rename2(old_path.mnt, old_path.dentry->d_inode, old_dentry,
                           new_path.dentry->d_inode, new_dentry,
                           &delegated_inode, flags);
 exit5:
@@ -4637,7 +4713,7 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna
 
 int vfs_whiteout(struct inode *dir, struct dentry *dentry)
 {
-       int error = may_create(dir, dentry);
+       int error = may_create(NULL, dir, dentry);
        if (error)
                return error;
 
index adae9ffce91d895ba9cbd7e51db997bedb5389b4..6b2fbd7852a61dad65727c0d5490c2c3f0ca79a2 100644 (file)
@@ -226,6 +226,7 @@ static struct mount *alloc_vfsmnt(const char *name)
                mnt->mnt_count = 1;
                mnt->mnt_writers = 0;
 #endif
+               mnt->mnt.data = NULL;
 
                INIT_HLIST_NODE(&mnt->mnt_hash);
                INIT_LIST_HEAD(&mnt->mnt_child);
@@ -637,6 +638,7 @@ int sb_prepare_remount_readonly(struct super_block *sb)
 
 static void free_vfsmnt(struct mount *mnt)
 {
+       kfree(mnt->mnt.data);
        kfree_const(mnt->mnt_devname);
 #ifdef CONFIG_SMP
        free_percpu(mnt->mnt_pcp);
@@ -1031,10 +1033,18 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void
        if (!mnt)
                return ERR_PTR(-ENOMEM);
 
+       if (type->alloc_mnt_data) {
+               mnt->mnt.data = type->alloc_mnt_data();
+               if (!mnt->mnt.data) {
+                       mnt_free_id(mnt);
+                       free_vfsmnt(mnt);
+                       return ERR_PTR(-ENOMEM);
+               }
+       }
        if (flags & SB_KERNMOUNT)
                mnt->mnt.mnt_flags = MNT_INTERNAL;
 
-       root = mount_fs(type, flags, name, data);
+       root = mount_fs(type, flags, name, &mnt->mnt, data);
        if (IS_ERR(root)) {
                mnt_free_id(mnt);
                free_vfsmnt(mnt);
@@ -1078,6 +1088,14 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root,
        if (!mnt)
                return ERR_PTR(-ENOMEM);
 
+       if (sb->s_op->clone_mnt_data) {
+               mnt->mnt.data = sb->s_op->clone_mnt_data(old->mnt.data);
+               if (!mnt->mnt.data) {
+                       err = -ENOMEM;
+                       goto out_free;
+               }
+       }
+
        if (flag & (CL_SLAVE | CL_PRIVATE | CL_SHARED_TO_SLAVE))
                mnt->mnt_group_id = 0; /* not a peer of original */
        else
@@ -2329,8 +2347,14 @@ static int do_remount(struct path *path, int ms_flags, int sb_flags,
                err = change_mount_flags(path->mnt, ms_flags);
        else if (!capable(CAP_SYS_ADMIN))
                err = -EPERM;
-       else
-               err = do_remount_sb(sb, sb_flags, data, 0);
+       else {
+               err = do_remount_sb2(path->mnt, sb, sb_flags, data, 0);
+               namespace_lock();
+               lock_mount_hash();
+               propagate_remount(mnt);
+               unlock_mount_hash();
+               namespace_unlock();
+       }
        if (!err) {
                lock_mount_hash();
                mnt_flags |= mnt->mnt.mnt_flags & ~MNT_USER_SETTABLE_MASK;
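
vfs_kern_mount() and clone_mnt() above give each struct vfsmount an optional private data blob, allocated through type->alloc_mnt_data() and duplicated through s_op->clone_mnt_data(), and do_remount() now pushes updated options to propagated mounts via propagate_remount()/->copy_mnt_data(). A hedged sketch of a filesystem opting in, assuming the file_system_type and super_operations extensions added elsewhere in this merge (struct myfs_mount_options and the myfs_* names are illustrative):

    #include <linux/fs.h>
    #include <linux/slab.h>
    #include <linux/string.h>

    /* Illustrative per-mount options blob. */
    struct myfs_mount_options {
            unsigned int mask;
            bool reserve_space;
    };

    static void *myfs_alloc_mnt_data(void)
    {
            return kzalloc(sizeof(struct myfs_mount_options), GFP_KERNEL);
    }

    static void *myfs_clone_mnt_data(void *data)
    {
            return kmemdup(data, sizeof(struct myfs_mount_options), GFP_KERNEL);
    }

    static void myfs_copy_mnt_data(void *data, void *newdata)
    {
            /* first argument is the destination mount's data */
            memcpy(data, newdata, sizeof(struct myfs_mount_options));
    }

    static struct file_system_type myfs_type = {
            .name           = "myfs",
            /* .mount, .kill_sb, ... */
            .alloc_mnt_data = myfs_alloc_mnt_data,  /* used by vfs_kern_mount() */
    };

    static const struct super_operations myfs_sops = {
            /* ... */
            .clone_mnt_data = myfs_clone_mnt_data,  /* used by clone_mnt() */
            .copy_mnt_data  = myfs_copy_mnt_data,   /* used by propagate_remount() */
    };
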
index 9752e7270e616d3641ca0f4fe9b0dd90ccd1ef1c..685e39e7b452a66f3b0381cf763552fde2422723 100644 (file)
@@ -495,7 +495,7 @@ static int fanotify_find_path(int dfd, const char __user *filename,
        }
 
        /* you can only watch an inode if you have read permissions on it */
-       ret = inode_permission(path->dentry->d_inode, MAY_READ);
+       ret = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
        if (ret)
                path_put(path);
 out:
index 7cc7d3fb1862fcb557933e63ec66361cf8bcfb7a..5c3caeaf0502ef78ff42d84111fd8ba944467df7 100644 (file)
@@ -335,7 +335,7 @@ static int inotify_find_inode(const char __user *dirname, struct path *path, uns
        if (error)
                return error;
        /* you can only watch an inode if you have read permissions on it */
-       error = inode_permission(path->dentry->d_inode, MAY_READ);
+       error = inode_permission2(path->mnt, path->dentry->d_inode, MAY_READ);
        if (error)
                path_put(path);
        return error;
@@ -671,6 +671,8 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        struct fsnotify_group *group;
        struct inode *inode;
        struct path path;
+       struct path alteredpath;
+       struct path *canonical_path = &path;
        struct fd f;
        int ret;
        unsigned flags = 0;
@@ -710,13 +712,22 @@ SYSCALL_DEFINE3(inotify_add_watch, int, fd, const char __user *, pathname,
        if (ret)
                goto fput_and_out;
 
+       /* support stacked filesystems */
+       if(path.dentry && path.dentry->d_op) {
+               if (path.dentry->d_op->d_canonical_path) {
+                       path.dentry->d_op->d_canonical_path(&path, &alteredpath);
+                       canonical_path = &alteredpath;
+                       path_put(&path);
+               }
+       }
+
        /* inode held in place by reference to path; group by fget on fd */
-       inode = path.dentry->d_inode;
+       inode = canonical_path->dentry->d_inode;
        group = f.file->private_data;
 
        /* create/update an inode mark */
        ret = inotify_update_watch(group, inode, mask);
-       path_put(&path);
+       path_put(canonical_path);
 fput_and_out:
        fdput(f);
        return ret;
index 7ea118471dce59996ae3d3d9b9e3b9d156baffba..39c24f10e501018da565978f60f2bea7b5844714 100644 (file)
--- a/fs/open.c
+++ b/fs/open.c
@@ -34,8 +34,8 @@
 
 #include "internal.h"
 
-int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
-       struct file *filp)
+int do_truncate2(struct vfsmount *mnt, struct dentry *dentry, loff_t length,
+               unsigned int time_attrs, struct file *filp)
 {
        int ret;
        struct iattr newattrs;
@@ -60,18 +60,25 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
 
        inode_lock(dentry->d_inode);
        /* Note any delegations or leases have already been broken: */
-       ret = notify_change(dentry, &newattrs, NULL);
+       ret = notify_change2(mnt, dentry, &newattrs, NULL);
        inode_unlock(dentry->d_inode);
        return ret;
 }
+int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs,
+       struct file *filp)
+{
+       return do_truncate2(NULL, dentry, length, time_attrs, filp);
+}
 
 long vfs_truncate(const struct path *path, loff_t length)
 {
        struct inode *inode;
+       struct vfsmount *mnt;
        struct dentry *upperdentry;
        long error;
 
        inode = path->dentry->d_inode;
+       mnt = path->mnt;
 
        /* For directories it's -EISDIR, for other non-regulars - -EINVAL */
        if (S_ISDIR(inode->i_mode))
@@ -83,7 +90,7 @@ long vfs_truncate(const struct path *path, loff_t length)
        if (error)
                goto out;
 
-       error = inode_permission(inode, MAY_WRITE);
+       error = inode_permission2(mnt, inode, MAY_WRITE);
        if (error)
                goto mnt_drop_write_and_out;
 
@@ -117,7 +124,7 @@ long vfs_truncate(const struct path *path, loff_t length)
        if (!error)
                error = security_path_truncate(path);
        if (!error)
-               error = do_truncate(path->dentry, length, 0, NULL);
+               error = do_truncate2(mnt, path->dentry, length, 0, NULL);
 
 put_write_and_out:
        put_write_access(upperdentry->d_inode);
@@ -166,6 +173,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
 {
        struct inode *inode;
        struct dentry *dentry;
+       struct vfsmount *mnt;
        struct fd f;
        int error;
 
@@ -182,6 +190,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
                small = 0;
 
        dentry = f.file->f_path.dentry;
+       mnt = f.file->f_path.mnt;
        inode = dentry->d_inode;
        error = -EINVAL;
        if (!S_ISREG(inode->i_mode) || !(f.file->f_mode & FMODE_WRITE))
@@ -202,7 +211,7 @@ static long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
        if (!error)
                error = security_path_truncate(&f.file->f_path);
        if (!error)
-               error = do_truncate(dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
+               error = do_truncate2(mnt, dentry, length, ATTR_MTIME|ATTR_CTIME, f.file);
        sb_end_write(inode->i_sb);
 out_putf:
        fdput(f);
@@ -356,6 +365,7 @@ SYSCALL_DEFINE3(faccessat, int, dfd, const char __user *, filename, int, mode)
        struct cred *override_cred;
        struct path path;
        struct inode *inode;
+       struct vfsmount *mnt;
        int res;
        unsigned int lookup_flags = LOOKUP_FOLLOW;
 
@@ -386,6 +396,7 @@ retry:
                goto out;
 
        inode = d_backing_inode(path.dentry);
+       mnt = path.mnt;
 
        if ((mode & MAY_EXEC) && S_ISREG(inode->i_mode)) {
                /*
@@ -397,7 +408,7 @@ retry:
                        goto out_path_release;
        }
 
-       res = inode_permission(inode, mode | MAY_ACCESS);
+       res = inode_permission2(mnt, inode, mode | MAY_ACCESS);
        /* SuS v2 requires we report a read only fs too */
        if (res || !(mode & S_IWOTH) || special_file(inode->i_mode))
                goto out_path_release;
@@ -441,7 +452,7 @@ retry:
        if (error)
                goto out;
 
-       error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+       error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto dput_and_out;
 
@@ -470,7 +481,8 @@ SYSCALL_DEFINE1(fchdir, unsigned int, fd)
        if (!d_can_lookup(f.file->f_path.dentry))
                goto out_putf;
 
-       error = inode_permission(file_inode(f.file), MAY_EXEC | MAY_CHDIR);
+       error = inode_permission2(f.file->f_path.mnt, file_inode(f.file),
+                               MAY_EXEC | MAY_CHDIR);
        if (!error)
                set_fs_pwd(current->fs, &f.file->f_path);
 out_putf:
@@ -489,7 +501,7 @@ retry:
        if (error)
                goto out;
 
-       error = inode_permission(path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
+       error = inode_permission2(path.mnt, path.dentry->d_inode, MAY_EXEC | MAY_CHDIR);
        if (error)
                goto dput_and_out;
 
@@ -529,7 +541,7 @@ retry_deleg:
                goto out_unlock;
        newattrs.ia_mode = (mode & S_IALLUGO) | (inode->i_mode & ~S_IALLUGO);
        newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
-       error = notify_change(path->dentry, &newattrs, &delegated_inode);
+       error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
 out_unlock:
        inode_unlock(inode);
        if (delegated_inode) {
@@ -609,7 +621,7 @@ retry_deleg:
        inode_lock(inode);
        error = security_path_chown(path, uid, gid);
        if (!error)
-               error = notify_change(path->dentry, &newattrs, &delegated_inode);
+               error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
        inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
index 53d411a371ce8d7efd060ee3e9ad37053a54e4ab..386884dd6d97c8fd5674a0eae378205d8d3b43f1 100644 (file)
@@ -607,3 +607,32 @@ int propagate_umount(struct list_head *list)
 
        return 0;
 }
+
+/*
+ *  Iterates over all slaves, and slaves of slaves.
+ */
+static struct mount *next_descendent(struct mount *root, struct mount *cur)
+{
+       if (!IS_MNT_NEW(cur) && !list_empty(&cur->mnt_slave_list))
+               return first_slave(cur);
+       do {
+               if (cur->mnt_slave.next != &cur->mnt_master->mnt_slave_list)
+                       return next_slave(cur);
+               cur = cur->mnt_master;
+       } while (cur != root);
+       return NULL;
+}
+
+void propagate_remount(struct mount *mnt)
+{
+       struct mount *m = mnt;
+       struct super_block *sb = mnt->mnt.mnt_sb;
+
+       if (sb->s_op->copy_mnt_data) {
+               m = next_descendent(mnt, m);
+               while (m) {
+                       sb->s_op->copy_mnt_data(m->mnt.data, mnt->mnt.data);
+                       m = next_descendent(mnt, m);
+               }
+       }
+}
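
propagate_remount() above copies the remounted mount's per-mount data into every mount that receives propagation from it. next_descendent() is an iterative pre-order walk over the slave hierarchy: descend into the first slave when one exists, otherwise step to the next sibling slave, and climb back toward the root when a slave list is exhausted. A generic restatement of that traversal on a made-up node type (struct node and its fields are illustrative; the real code walks mnt_slave_list/mnt_slave/mnt_master):

    #include <linux/list.h>

    struct node {
            struct node      *parent;
            struct list_head  children;     /* heads the list of children */
            struct list_head  sibling;      /* links this node among siblings */
    };

    /* Illustrative: first child, else next sibling, else climb and retry. */
    static struct node *next_descendent(struct node *root, struct node *cur)
    {
            if (!list_empty(&cur->children))
                    return list_first_entry(&cur->children, struct node, sibling);
            while (cur != root) {
                    if (!list_is_last(&cur->sibling, &cur->parent->children))
                            return list_next_entry(cur, sibling);
                    cur = cur->parent;
            }
            return NULL;
    }

Calling it repeatedly, starting with cur == root, visits every descendant exactly once and returns NULL when the walk is complete.
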
index dc87e65becd21eb0235775fedf28d8a21702f548..a9a6576540addb55510a41df67cce73339460208 100644 (file)
@@ -44,6 +44,7 @@ int propagate_mnt(struct mount *, struct mountpoint *, struct mount *,
 int propagate_umount(struct list_head *);
 int propagate_mount_busy(struct mount *, int);
 void propagate_mount_unlock(struct mount *);
+void propagate_remount(struct mount *);
 void mnt_release_group_id(struct mount *);
 int get_dominating_id(struct mount *mnt, const struct path *root);
 unsigned int mnt_get_count(struct mount *mnt);
index 6744bd706ecf018f0db0a3e335449945b523ea74..323d379d367b66a1e199e4841ae802e8c7afcac6 100644 (file)
@@ -130,6 +130,56 @@ static void release_task_mempolicy(struct proc_maps_private *priv)
 }
 #endif
 
+static void seq_print_vma_name(struct seq_file *m, struct vm_area_struct *vma)
+{
+       const char __user *name = vma_get_anon_name(vma);
+       struct mm_struct *mm = vma->vm_mm;
+
+       unsigned long page_start_vaddr;
+       unsigned long page_offset;
+       unsigned long num_pages;
+       unsigned long max_len = NAME_MAX;
+       int i;
+
+       page_start_vaddr = (unsigned long)name & PAGE_MASK;
+       page_offset = (unsigned long)name - page_start_vaddr;
+       num_pages = DIV_ROUND_UP(page_offset + max_len, PAGE_SIZE);
+
+       seq_puts(m, "[anon:");
+
+       for (i = 0; i < num_pages; i++) {
+               int len;
+               int write_len;
+               const char *kaddr;
+               long pages_pinned;
+               struct page *page;
+
+               pages_pinned = get_user_pages_remote(current, mm,
+                               page_start_vaddr, 1, 0, &page, NULL, NULL);
+               if (pages_pinned < 1) {
+                       seq_puts(m, "<fault>]");
+                       return;
+               }
+
+               kaddr = (const char *)kmap(page);
+               len = min(max_len, PAGE_SIZE - page_offset);
+               write_len = strnlen(kaddr + page_offset, len);
+               seq_write(m, kaddr + page_offset, write_len);
+               kunmap(page);
+               put_page(page);
+
+               /* if strnlen hit a null terminator then we're done */
+               if (write_len != len)
+                       break;
+
+               max_len -= len;
+               page_offset = 0;
+               page_start_vaddr += PAGE_SIZE;
+       }
+
+       seq_putc(m, ']');
+}
+
 static void vma_stop(struct proc_maps_private *priv)
 {
        struct mm_struct *mm = priv->mm;
@@ -349,8 +399,15 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid)
                        goto done;
                }
 
-               if (is_stack(vma))
+               if (is_stack(vma)) {
                        name = "[stack]";
+                       goto done;
+               }
+
+               if (vma_get_anon_name(vma)) {
+                       seq_pad(m, ' ');
+                       seq_print_vma_name(m, vma);
+               }
        }
 
 done:
@@ -798,6 +855,11 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 
        if (!rollup_mode) {
                show_map_vma(m, vma, is_pid);
+               if (vma_get_anon_name(vma)) {
+                       seq_puts(m, "Name:           ");
+                       seq_print_vma_name(m, vma);
+                       seq_putc(m, '\n');
+               }
        } else if (last_vma) {
                show_vma_header_prefix(
                        m, mss->first_vma_start, vma->vm_end, 0, 0, 0, 0);
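
show_map_vma() and show_smap() above append "[anon:<name>]" to anonymous mappings that userspace has named; since the name string lives in user memory, seq_print_vma_name() pins and kmap()s it one page at a time with get_user_pages_remote(). The name itself is normally set with the PR_SET_VMA prctl that accompanies this patch series; a hedged userspace example, assuming that prctl is available (the constants below match the Android definition but are not part of this diff):

    #include <stdio.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/prctl.h>

    #ifndef PR_SET_VMA
    #define PR_SET_VMA              0x53564d41      /* Android-specific prctl */
    #define PR_SET_VMA_ANON_NAME    0
    #endif

    int main(void)
    {
            size_t len = 4096;
            void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                           MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

            if (p == MAP_FAILED)
                    return 1;

            /* /proc/self/maps then shows the region as "[anon:example heap]" */
            prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, "example heap");

            memset(p, 0, len);
            getchar();      /* pause here and inspect /proc/<pid>/maps */
            return 0;
    }
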
index 7626ee11b06c67edac5d9c021516ff6ea3390b98..b859aaeecb270676a6ba94c57e1c309f61f7b97e 100644 (file)
@@ -121,7 +121,9 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt)
        if (err)
                goto out;
        show_mnt_opts(m, mnt);
-       if (sb->s_op->show_options)
+       if (sb->s_op->show_options2)
+                       err = sb->s_op->show_options2(mnt, m, mnt_path.dentry);
+       else if (sb->s_op->show_options)
                err = sb->s_op->show_options(m, mnt_path.dentry);
        seq_puts(m, " 0 0\n");
 out:
@@ -183,7 +185,9 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt)
        err = show_sb_opts(m, sb);
        if (err)
                goto out;
-       if (sb->s_op->show_options)
+       if (sb->s_op->show_options2) {
+               err = sb->s_op->show_options2(mnt, m, mnt->mnt_root);
+       } else if (sb->s_op->show_options)
                err = sb->s_op->show_options(m, mnt->mnt_root);
        seq_putc(m, '\n');
 out:
index 7125b398d312e71714d96d4ed51baeff2e149601..379b53546fe82c724f253b577656dd81dad58e56 100644 (file)
@@ -707,6 +707,12 @@ static int ramoops_parse_dt(struct platform_device *pdev,
        return 0;
 }
 
+void notrace ramoops_console_write_buf(const char *buf, size_t size)
+{
+       struct ramoops_context *cxt = &oops_cxt;
+       persistent_ram_write(cxt->cprz, buf, size);
+}
+
 static int ramoops_probe(struct platform_device *pdev)
 {
        struct device *dev = &pdev->dev;
index 0046d72efe94667619b8d6016abb087364ceb8d6..62b9c341afa9e70bf35101f345032c484a2d3d8d 100644 (file)
@@ -455,6 +455,8 @@ ssize_t vfs_read(struct file *file, char __user *buf, size_t count, loff_t *pos)
        return ret;
 }
 
+EXPORT_SYMBOL(vfs_read);
+
 static ssize_t new_sync_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
 {
        struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len };
@@ -553,6 +555,8 @@ ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_
        return ret;
 }
 
+EXPORT_SYMBOL(vfs_write);
+
 static inline loff_t file_pos_read(struct file *file)
 {
        return file->f_pos;
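
vfs_read() and vfs_write() are exported above so that a modular stacking filesystem (sdcardfs below) can forward I/O to the file it wraps on the lower filesystem. A reduced sketch of such a forwarding read, assuming a wrapper that stores the lower struct file (myfs_lower_file() is a made-up accessor):

    #include <linux/fs.h>
    #include <linux/fs_stack.h>

    /* Illustrative: forward a read to the lower file and mirror atime. */
    static ssize_t myfs_read(struct file *file, char __user *buf,
                             size_t count, loff_t *ppos)
    {
            struct file *lower = myfs_lower_file(file);     /* hypothetical */
            ssize_t ret;

            ret = vfs_read(lower, buf, count, ppos);        /* exported above */
            if (ret >= 0)
                    fsstack_copy_attr_atime(file_inode(file),
                                            file_inode(lower));
            return ret;
    }
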
diff --git a/fs/sdcardfs/Kconfig b/fs/sdcardfs/Kconfig
new file mode 100644 (file)
index 0000000..a1c1033
--- /dev/null
@@ -0,0 +1,13 @@
+config SDCARD_FS
+       tristate "sdcard file system"
+       depends on CONFIGFS_FS
+       default n
+       help
+         Sdcardfs is based on the Wrapfs file system.
+
+config SDCARD_FS_FADV_NOACTIVE
+       bool "sdcardfs fadvise noactive support"
+       depends on FADV_NOACTIVE
+       default y
+       help
+         Sdcardfs supports fadvise noactive mode.
diff --git a/fs/sdcardfs/Makefile b/fs/sdcardfs/Makefile
new file mode 100644 (file)
index 0000000..b84fbb2
--- /dev/null
@@ -0,0 +1,7 @@
+SDCARDFS_VERSION="0.1"
+
+EXTRA_CFLAGS += -DSDCARDFS_VERSION=\"$(SDCARDFS_VERSION)\"
+
+obj-$(CONFIG_SDCARD_FS) += sdcardfs.o
+
+sdcardfs-y := dentry.o file.o inode.o main.o super.o lookup.o mmap.o packagelist.o derived_perm.o
diff --git a/fs/sdcardfs/dentry.c b/fs/sdcardfs/dentry.c
new file mode 100644 (file)
index 0000000..e9426a6
--- /dev/null
@@ -0,0 +1,193 @@
+/*
+ * fs/sdcardfs/dentry.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include "linux/ctype.h"
+
+/*
+ * returns: -ERRNO if error (returned to user)
+ *          0: tell VFS to invalidate dentry
+ *          1: dentry is valid
+ */
+static int sdcardfs_d_revalidate(struct dentry *dentry, unsigned int flags)
+{
+       int err = 1;
+       struct path parent_lower_path, lower_path;
+       struct dentry *parent_dentry = NULL;
+       struct dentry *parent_lower_dentry = NULL;
+       struct dentry *lower_cur_parent_dentry = NULL;
+       struct dentry *lower_dentry = NULL;
+       struct inode *inode;
+       struct sdcardfs_inode_data *data;
+
+       if (flags & LOOKUP_RCU)
+               return -ECHILD;
+
+       spin_lock(&dentry->d_lock);
+       if (IS_ROOT(dentry)) {
+               spin_unlock(&dentry->d_lock);
+               return 1;
+       }
+       spin_unlock(&dentry->d_lock);
+
+       /* check uninitialized obb_dentry and
+        * whether the base obbpath has been changed or not
+        */
+       if (is_obbpath_invalid(dentry)) {
+               d_drop(dentry);
+               return 0;
+       }
+
+       parent_dentry = dget_parent(dentry);
+       sdcardfs_get_lower_path(parent_dentry, &parent_lower_path);
+       sdcardfs_get_real_lower(dentry, &lower_path);
+       parent_lower_dentry = parent_lower_path.dentry;
+       lower_dentry = lower_path.dentry;
+       lower_cur_parent_dentry = dget_parent(lower_dentry);
+
+       if ((lower_dentry->d_flags & DCACHE_OP_REVALIDATE)) {
+               err = lower_dentry->d_op->d_revalidate(lower_dentry, flags);
+               if (err == 0) {
+                       d_drop(dentry);
+                       goto out;
+               }
+       }
+
+       spin_lock(&lower_dentry->d_lock);
+       if (d_unhashed(lower_dentry)) {
+               spin_unlock(&lower_dentry->d_lock);
+               d_drop(dentry);
+               err = 0;
+               goto out;
+       }
+       spin_unlock(&lower_dentry->d_lock);
+
+       if (parent_lower_dentry != lower_cur_parent_dentry) {
+               d_drop(dentry);
+               err = 0;
+               goto out;
+       }
+
+       if (dentry < lower_dentry) {
+               spin_lock(&dentry->d_lock);
+               spin_lock_nested(&lower_dentry->d_lock, DENTRY_D_LOCK_NESTED);
+       } else {
+               spin_lock(&lower_dentry->d_lock);
+               spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
+       }
+
+       if (!qstr_case_eq(&dentry->d_name, &lower_dentry->d_name)) {
+               __d_drop(dentry);
+               err = 0;
+       }
+
+       if (dentry < lower_dentry) {
+               spin_unlock(&lower_dentry->d_lock);
+               spin_unlock(&dentry->d_lock);
+       } else {
+               spin_unlock(&dentry->d_lock);
+               spin_unlock(&lower_dentry->d_lock);
+       }
+       if (!err)
+               goto out;
+
+       /* If our top's inode is gone, we may be out of date */
+       inode = igrab(d_inode(dentry));
+       if (inode) {
+               data = top_data_get(SDCARDFS_I(inode));
+               if (!data || data->abandoned) {
+                       d_drop(dentry);
+                       err = 0;
+               }
+               if (data)
+                       data_put(data);
+               iput(inode);
+       }
+
+out:
+       dput(parent_dentry);
+       dput(lower_cur_parent_dentry);
+       sdcardfs_put_lower_path(parent_dentry, &parent_lower_path);
+       sdcardfs_put_real_lower(dentry, &lower_path);
+       return err;
+}
+
+static void sdcardfs_d_release(struct dentry *dentry)
+{
+       /* release and reset the lower paths */
+       if (has_graft_path(dentry))
+               sdcardfs_put_reset_orig_path(dentry);
+       sdcardfs_put_reset_lower_path(dentry);
+       free_dentry_private_data(dentry);
+}
+
+static int sdcardfs_hash_ci(const struct dentry *dentry,
+                               struct qstr *qstr)
+{
+       /*
+        * This function is a copy of vfat_hashi.
+        * FIXME Should we support national language?
+        *       Refer to vfat_hashi()
+        * struct nls_table *t = MSDOS_SB(dentry->d_sb)->nls_io;
+        */
+       const unsigned char *name;
+       unsigned int len;
+       unsigned long hash;
+
+       name = qstr->name;
+       len = qstr->len;
+
+       hash = init_name_hash(dentry);
+       while (len--)
+               hash = partial_name_hash(tolower(*name++), hash);
+       qstr->hash = end_name_hash(hash);
+
+       return 0;
+}
+
+/*
+ * Case insensitive compare of two vfat names.
+ */
+static int sdcardfs_cmp_ci(const struct dentry *dentry,
+               unsigned int len, const char *str, const struct qstr *name)
+{
+       /* FIXME Should we support national language? */
+
+       if (name->len == len) {
+               if (str_n_case_eq(name->name, str, len))
+                       return 0;
+       }
+       return 1;
+}
+
+static void sdcardfs_canonical_path(const struct path *path,
+                               struct path *actual_path)
+{
+       sdcardfs_get_real_lower(path->dentry, actual_path);
+}
+
+const struct dentry_operations sdcardfs_ci_dops = {
+       .d_revalidate   = sdcardfs_d_revalidate,
+       .d_release      = sdcardfs_d_release,
+       .d_hash = sdcardfs_hash_ci,
+       .d_compare      = sdcardfs_cmp_ci,
+       .d_canonical_path = sdcardfs_canonical_path,
+};
+
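
One detail of sdcardfs_d_revalidate() above worth noting: before comparing the upper and lower dentry names it takes both d_locks, always locking the dentry with the lower address first (and using spin_lock_nested() for the second), so two tasks revalidating the same pair can never deadlock on each other. The ordering idiom in isolation, as a sketch:

    #include <linux/dcache.h>
    #include <linux/spinlock.h>

    /* Illustrative: lock two dentries in a stable (address) order. */
    static void lock_dentry_pair(struct dentry *a, struct dentry *b)
    {
            if (a < b) {
                    spin_lock(&a->d_lock);
                    spin_lock_nested(&b->d_lock, DENTRY_D_LOCK_NESTED);
            } else {
                    spin_lock(&b->d_lock);
                    spin_lock_nested(&a->d_lock, DENTRY_D_LOCK_NESTED);
            }
    }

    static void unlock_dentry_pair(struct dentry *a, struct dentry *b)
    {
            /* release in the reverse order of acquisition */
            if (a < b) {
                    spin_unlock(&b->d_lock);
                    spin_unlock(&a->d_lock);
            } else {
                    spin_unlock(&a->d_lock);
                    spin_unlock(&b->d_lock);
            }
    }
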
diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c
new file mode 100644 (file)
index 0000000..0b3b223
--- /dev/null
@@ -0,0 +1,472 @@
+/*
+ * fs/sdcardfs/derived_perm.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/* copy derived state from parent inode */
+static void inherit_derived_state(struct inode *parent, struct inode *child)
+{
+       struct sdcardfs_inode_info *pi = SDCARDFS_I(parent);
+       struct sdcardfs_inode_info *ci = SDCARDFS_I(child);
+
+       ci->data->perm = PERM_INHERIT;
+       ci->data->userid = pi->data->userid;
+       ci->data->d_uid = pi->data->d_uid;
+       ci->data->under_android = pi->data->under_android;
+       ci->data->under_cache = pi->data->under_cache;
+       ci->data->under_obb = pi->data->under_obb;
+}
+
+/* helper function for derived state */
+void setup_derived_state(struct inode *inode, perm_t perm, userid_t userid,
+                                       uid_t uid)
+{
+       struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+
+       info->data->perm = perm;
+       info->data->userid = userid;
+       info->data->d_uid = uid;
+       info->data->under_android = false;
+       info->data->under_cache = false;
+       info->data->under_obb = false;
+}
+
+/* While renaming, there is a point where we want the path from dentry,
+ * but the name from newdentry
+ */
+void get_derived_permission_new(struct dentry *parent, struct dentry *dentry,
+                               const struct qstr *name)
+{
+       struct sdcardfs_inode_info *info = SDCARDFS_I(d_inode(dentry));
+       struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+       struct sdcardfs_inode_data *parent_data = parent_info->data;
+       appid_t appid;
+       unsigned long user_num;
+       int err;
+       struct qstr q_Android = QSTR_LITERAL("Android");
+       struct qstr q_data = QSTR_LITERAL("data");
+       struct qstr q_obb = QSTR_LITERAL("obb");
+       struct qstr q_media = QSTR_LITERAL("media");
+       struct qstr q_cache = QSTR_LITERAL("cache");
+
+       /* By default, each inode inherits from its parent.
+        * The properties are maintained in its private fields
+        * because the inode attributes will be modified with those of
+        * its lower inode.
+        * These values are used by our custom permission call instead
+        * of using the inode permissions.
+        */
+
+       inherit_derived_state(d_inode(parent), d_inode(dentry));
+
+       /* Files don't get special labels */
+       if (!S_ISDIR(d_inode(dentry)->i_mode)) {
+               set_top(info, parent_info);
+               return;
+       }
+       /* Derive custom permissions based on parent and current node */
+       switch (parent_data->perm) {
+       case PERM_INHERIT:
+       case PERM_ANDROID_PACKAGE_CACHE:
+               set_top(info, parent_info);
+               break;
+       case PERM_PRE_ROOT:
+               /* Legacy internal layout places users at top level */
+               info->data->perm = PERM_ROOT;
+               err = kstrtoul(name->name, 10, &user_num);
+               if (err)
+                       info->data->userid = 0;
+               else
+                       info->data->userid = user_num;
+               break;
+       case PERM_ROOT:
+               /* Assume masked off by default. */
+               if (qstr_case_eq(name, &q_Android)) {
+                       /* App-specific directories inside; let anyone traverse */
+                       info->data->perm = PERM_ANDROID;
+                       info->data->under_android = true;
+               } else {
+                       set_top(info, parent_info);
+               }
+               break;
+       case PERM_ANDROID:
+               if (qstr_case_eq(name, &q_data)) {
+                       /* App-specific directories inside; let anyone traverse */
+                       info->data->perm = PERM_ANDROID_DATA;
+               } else if (qstr_case_eq(name, &q_obb)) {
+                       /* App-specific directories inside; let anyone traverse */
+                       info->data->perm = PERM_ANDROID_OBB;
+                       info->data->under_obb = true;
+                       /* Single OBB directory is always shared */
+               } else if (qstr_case_eq(name, &q_media)) {
+                       /* App-specific directories inside; let anyone traverse */
+                       info->data->perm = PERM_ANDROID_MEDIA;
+               } else {
+                       set_top(info, parent_info);
+               }
+               break;
+       case PERM_ANDROID_OBB:
+       case PERM_ANDROID_DATA:
+       case PERM_ANDROID_MEDIA:
+               info->data->perm = PERM_ANDROID_PACKAGE;
+               appid = get_appid(name->name);
+               if (appid != 0 && !is_excluded(name->name, parent_data->userid))
+                       info->data->d_uid =
+                               multiuser_get_uid(parent_data->userid, appid);
+               break;
+       case PERM_ANDROID_PACKAGE:
+               if (qstr_case_eq(name, &q_cache)) {
+                       info->data->perm = PERM_ANDROID_PACKAGE_CACHE;
+                       info->data->under_cache = true;
+               }
+               set_top(info, parent_info);
+               break;
+       }
+}
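
To make the labeling above concrete, consider a lookup walk on a multiuser mount whose root carries PERM_PRE_ROOT (a sketch; the package name is hypothetical, and get_appid()/is_excluded() are assumed to return a non-zero, non-excluded appid for it):

    0/                        -> PERM_ROOT, userid parsed as 0 by kstrtoul()
    0/Android/                -> PERM_ANDROID, under_android = true
    0/Android/data/           -> PERM_ANDROID_DATA
    0/Android/data/com.foo/   -> PERM_ANDROID_PACKAGE, d_uid = multiuser_get_uid(0, get_appid("com.foo"))
    0/Android/data/com.foo/f  -> regular file: stays PERM_INHERIT, set_top() points it at the package directory
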
+
+void get_derived_permission(struct dentry *parent, struct dentry *dentry)
+{
+       get_derived_permission_new(parent, dentry, &dentry->d_name);
+}
+
+static appid_t get_type(const char *name)
+{
+       const char *ext = strrchr(name, '.');
+       appid_t id;
+
+       if (ext && ext[0]) {
+               ext = &ext[1];
+               id = get_ext_gid(ext);
+               return id?:AID_MEDIA_RW;
+       }
+       return AID_MEDIA_RW;
+}
+
+void fixup_lower_ownership(struct dentry *dentry, const char *name)
+{
+       struct path path;
+       struct inode *inode;
+       struct inode *delegated_inode = NULL;
+       int error;
+       struct sdcardfs_inode_info *info;
+       struct sdcardfs_inode_data *info_d;
+       struct sdcardfs_inode_data *info_top;
+       perm_t perm;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       uid_t uid = sbi->options.fs_low_uid;
+       gid_t gid = sbi->options.fs_low_gid;
+       struct iattr newattrs;
+
+       if (!sbi->options.gid_derivation)
+               return;
+
+       info = SDCARDFS_I(d_inode(dentry));
+       info_d = info->data;
+       perm = info_d->perm;
+       if (info_d->under_obb) {
+               perm = PERM_ANDROID_OBB;
+       } else if (info_d->under_cache) {
+               perm = PERM_ANDROID_PACKAGE_CACHE;
+       } else if (perm == PERM_INHERIT) {
+               info_top = top_data_get(info);
+               perm = info_top->perm;
+               data_put(info_top);
+       }
+
+       switch (perm) {
+       case PERM_ROOT:
+       case PERM_ANDROID:
+       case PERM_ANDROID_DATA:
+       case PERM_ANDROID_MEDIA:
+       case PERM_ANDROID_PACKAGE:
+       case PERM_ANDROID_PACKAGE_CACHE:
+               uid = multiuser_get_uid(info_d->userid, uid);
+               break;
+       case PERM_ANDROID_OBB:
+               uid = AID_MEDIA_OBB;
+               break;
+       case PERM_PRE_ROOT:
+       default:
+               break;
+       }
+       switch (perm) {
+       case PERM_ROOT:
+       case PERM_ANDROID:
+       case PERM_ANDROID_DATA:
+       case PERM_ANDROID_MEDIA:
+               if (S_ISDIR(d_inode(dentry)->i_mode))
+                       gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+               else
+                       gid = multiuser_get_uid(info_d->userid, get_type(name));
+               break;
+       case PERM_ANDROID_OBB:
+               gid = AID_MEDIA_OBB;
+               break;
+       case PERM_ANDROID_PACKAGE:
+               if (uid_is_app(info_d->d_uid))
+                       gid = multiuser_get_ext_gid(info_d->d_uid);
+               else
+                       gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+               break;
+       case PERM_ANDROID_PACKAGE_CACHE:
+               if (uid_is_app(info_d->d_uid))
+                       gid = multiuser_get_ext_cache_gid(info_d->d_uid);
+               else
+                       gid = multiuser_get_uid(info_d->userid, AID_MEDIA_RW);
+               break;
+       case PERM_PRE_ROOT:
+       default:
+               break;
+       }
+
+       sdcardfs_get_lower_path(dentry, &path);
+       inode = d_inode(path.dentry);
+       if (d_inode(path.dentry)->i_gid.val != gid || d_inode(path.dentry)->i_uid.val != uid) {
+retry_deleg:
+               newattrs.ia_valid = ATTR_GID | ATTR_UID | ATTR_FORCE;
+               newattrs.ia_uid = make_kuid(current_user_ns(), uid);
+               newattrs.ia_gid = make_kgid(current_user_ns(), gid);
+               if (!S_ISDIR(inode->i_mode))
+                       newattrs.ia_valid |=
+                               ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV;
+               inode_lock(inode);
+               error = security_path_chown(&path, newattrs.ia_uid, newattrs.ia_gid);
+               if (!error)
+                       error = notify_change2(path.mnt, path.dentry, &newattrs, &delegated_inode);
+               inode_unlock(inode);
+               if (delegated_inode) {
+                       error = break_deleg_wait(&delegated_inode);
+                       if (!error)
+                               goto retry_deleg;
+               }
+               if (error)
+                       pr_debug("sdcardfs: Failed to touch up lower fs gid/uid for %s\n", name);
+       }
+       sdcardfs_put_lower_path(dentry, &path);
+}
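
Putting the two switches together, a worked case (a sketch: the numeric values assume the conventional Android multiuser layout where multiuser_get_uid(userid, id) is userid * 100000 + id, and an fs_low_uid of AID_MEDIA_RW = 1023, neither of which is spelled out in this patch): a regular file song.mp3 under a PERM_ANDROID_MEDIA directory for Android user 10 is pushed down to the lower filesystem as

    uid = multiuser_get_uid(10, fs_low_uid)            /* e.g. 10 * 100000 + 1023 = 1001023 */
    gid = multiuser_get_uid(10, get_type("song.mp3"))  /* get_type() falls back to AID_MEDIA_RW
                                                           when no per-extension gid is configured */

before notify_change2() applies the chown on the lower inode.
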
+
+static int descendant_may_need_fixup(struct sdcardfs_inode_data *data,
+               struct limit_search *limit)
+{
+       if (data->perm == PERM_ROOT)
+               return (limit->flags & BY_USERID) ?
+                               data->userid == limit->userid : 1;
+       if (data->perm == PERM_PRE_ROOT || data->perm == PERM_ANDROID)
+               return 1;
+       return 0;
+}
+
+static int needs_fixup(perm_t perm)
+{
+       if (perm == PERM_ANDROID_DATA || perm == PERM_ANDROID_OBB
+                       || perm == PERM_ANDROID_MEDIA)
+               return 1;
+       return 0;
+}
+
+static void __fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit, int depth)
+{
+       struct dentry *child;
+       struct sdcardfs_inode_info *info;
+
+       /*
+        * All paths will terminate their recursion on hitting PERM_ANDROID_OBB,
+        * PERM_ANDROID_MEDIA, or PERM_ANDROID_DATA. This happens at a depth of
+        * at most 3.
+        */
+       WARN(depth > 3, "%s: Max expected depth exceeded!\n", __func__);
+       spin_lock_nested(&dentry->d_lock, depth);
+       if (!d_inode(dentry)) {
+               spin_unlock(&dentry->d_lock);
+               return;
+       }
+       info = SDCARDFS_I(d_inode(dentry));
+
+       if (needs_fixup(info->data->perm)) {
+               list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+                       spin_lock_nested(&child->d_lock, depth + 1);
+                       if (!(limit->flags & BY_NAME) || qstr_case_eq(&child->d_name, &limit->name)) {
+                               if (d_inode(child)) {
+                                       get_derived_permission(dentry, child);
+                                       fixup_tmp_permissions(d_inode(child));
+                                       spin_unlock(&child->d_lock);
+                                       break;
+                               }
+                       }
+                       spin_unlock(&child->d_lock);
+               }
+       } else if (descendant_may_need_fixup(info->data, limit)) {
+               list_for_each_entry(child, &dentry->d_subdirs, d_child) {
+                       __fixup_perms_recursive(child, limit, depth + 1);
+               }
+       }
+       spin_unlock(&dentry->d_lock);
+}
+
+void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit)
+{
+       __fixup_perms_recursive(dentry, limit, 0);
+}
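
A minimal caller sketch for the wrapper above, limiting the walk to a single user via the limit_search fields exercised in __fixup_perms_recursive() (flags and userid; sb is assumed to be the caller's sdcardfs super_block):

	struct limit_search limit = {
		.flags  = BY_USERID,
		.userid = 10,
	};

	fixup_perms_recursive(sb->s_root, &limit);
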
+
+/* main function for updating derived permission */
+inline void update_derived_permission_lock(struct dentry *dentry)
+{
+       struct dentry *parent;
+
+       if (!dentry || !d_inode(dentry)) {
+               pr_err("sdcardfs: %s: invalid dentry\n", __func__);
+               return;
+       }
+       /* FIXME:
+        * 1. need to check whether the dentry is updated or not
+        * 2. remove the root dentry update
+        */
+       if (!IS_ROOT(dentry)) {
+               parent = dget_parent(dentry);
+               if (parent) {
+                       get_derived_permission(parent, dentry);
+                       dput(parent);
+               }
+       }
+       fixup_tmp_permissions(d_inode(dentry));
+}
+
+int need_graft_path(struct dentry *dentry)
+{
+       int ret = 0;
+       struct dentry *parent = dget_parent(dentry);
+       struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       struct qstr obb = QSTR_LITERAL("obb");
+
+       if (parent_info->data->perm == PERM_ANDROID &&
+                       qstr_case_eq(&dentry->d_name, &obb)) {
+
+               /* /Android/obb is the base obbpath of DERIVED_UNIFIED */
+               if (!(sbi->options.multiuser == false
+                               && parent_info->data->userid == 0)) {
+                       ret = 1;
+               }
+       }
+       dput(parent);
+       return ret;
+}
+
+int is_obbpath_invalid(struct dentry *dent)
+{
+       int ret = 0;
+       struct sdcardfs_dentry_info *di = SDCARDFS_D(dent);
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dent->d_sb);
+       char *path_buf, *obbpath_s;
+       int need_put = 0;
+       struct path lower_path;
+
+       /* Check whether the base obbpath has been changed.
+        * This routine can check an uninitialized obb dentry as well.
+        * Regarding the uninitialized obb, refer to sdcardfs_mkdir().
+        */
+       spin_lock(&di->lock);
+       if (di->orig_path.dentry) {
+               if (!di->lower_path.dentry) {
+                       ret = 1;
+               } else {
+                       path_get(&di->lower_path);
+
+                       path_buf = kmalloc(PATH_MAX, GFP_ATOMIC);
+                       if (!path_buf) {
+                               ret = 1;
+                               pr_err("sdcardfs: failed to allocate path_buf in %s.\n", __func__);
+                       } else {
+                               obbpath_s = d_path(&di->lower_path, path_buf, PATH_MAX);
+                               if (d_unhashed(di->lower_path.dentry) ||
+                                       !str_case_eq(sbi->obbpath_s, obbpath_s)) {
+                                       ret = 1;
+                               }
+                               kfree(path_buf);
+                       }
+
+                       pathcpy(&lower_path, &di->lower_path);
+                       need_put = 1;
+               }
+       }
+       spin_unlock(&di->lock);
+       if (need_put)
+               path_put(&lower_path);
+       return ret;
+}
+
+int is_base_obbpath(struct dentry *dentry)
+{
+       int ret = 0;
+       struct dentry *parent = dget_parent(dentry);
+       struct sdcardfs_inode_info *parent_info = SDCARDFS_I(d_inode(parent));
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       struct qstr q_obb = QSTR_LITERAL("obb");
+
+       spin_lock(&SDCARDFS_D(dentry)->lock);
+       if (sbi->options.multiuser) {
+               if (parent_info->data->perm == PERM_PRE_ROOT &&
+                               qstr_case_eq(&dentry->d_name, &q_obb)) {
+                       ret = 1;
+               }
+       } else  if (parent_info->data->perm == PERM_ANDROID &&
+                       qstr_case_eq(&dentry->d_name, &q_obb)) {
+               ret = 1;
+       }
+       spin_unlock(&SDCARDFS_D(dentry)->lock);
+       return ret;
+}
+
+/* The lower_path will be stored to the dentry's orig_path
+ * and the base obbpath will be copied to the lower_path variable.
+ * If an error is returned, there is no change in the lower_path.
+ * returns: -ERRNO if error (0: no error)
+ */
+int setup_obb_dentry(struct dentry *dentry, struct path *lower_path)
+{
+       int err = 0;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       struct path obbpath;
+
+       /* A local obb dentry must have its own orig_path to support rmdir
+        * and mkdir of itself. Usually, we expect that the sbi->obbpath
+        * is available at this stage.
+        */
+       sdcardfs_set_orig_path(dentry, lower_path);
+
+       err = kern_path(sbi->obbpath_s,
+                       LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &obbpath);
+
+       if (!err) {
+               /* the obbpath base has been found */
+               pathcpy(lower_path, &obbpath);
+       } else {
+               /* If the sbi->obbpath is not available, we could optionally
+                * set up the lower_path with its orig_path.
+                * But the current implementation just returns an error
+                * because the sdcard daemon also regards this case as
+                * a lookup failure.
+                */
+               pr_info("sdcardfs: the sbi->obbpath is not available\n");
+       }
+       return err;
+}
+
+
diff --git a/fs/sdcardfs/file.c b/fs/sdcardfs/file.c
new file mode 100644 (file)
index 0000000..5ac0b0b
--- /dev/null
@@ -0,0 +1,449 @@
+/*
+ * fs/sdcardfs/file.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+#include <linux/backing-dev.h>
+#endif
+
+static ssize_t sdcardfs_read(struct file *file, char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       int err;
+       struct file *lower_file;
+       struct dentry *dentry = file->f_path.dentry;
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+       struct backing_dev_info *bdi;
+#endif
+
+       lower_file = sdcardfs_lower_file(file);
+
+#ifdef CONFIG_SDCARD_FS_FADV_NOACTIVE
+       if (file->f_mode & FMODE_NOACTIVE) {
+               if (!(lower_file->f_mode & FMODE_NOACTIVE)) {
+                       bdi = lower_file->f_mapping->backing_dev_info;
+                       lower_file->f_ra.ra_pages = bdi->ra_pages * 2;
+                       spin_lock(&lower_file->f_lock);
+                       lower_file->f_mode |= FMODE_NOACTIVE;
+                       spin_unlock(&lower_file->f_lock);
+               }
+       }
+#endif
+
+       err = vfs_read(lower_file, buf, count, ppos);
+       /* update our inode atime upon a successful lower read */
+       if (err >= 0)
+               fsstack_copy_attr_atime(d_inode(dentry),
+                                       file_inode(lower_file));
+
+       return err;
+}
+
+static ssize_t sdcardfs_write(struct file *file, const char __user *buf,
+                           size_t count, loff_t *ppos)
+{
+       int err;
+       struct file *lower_file;
+       struct dentry *dentry = file->f_path.dentry;
+
+       /* check disk space */
+       if (!check_min_free_space(dentry, count, 0)) {
+               pr_err("No minimum free space.\n");
+               return -ENOSPC;
+       }
+
+       lower_file = sdcardfs_lower_file(file);
+       err = vfs_write(lower_file, buf, count, ppos);
+       /* update our inode times+sizes upon a successful lower write */
+       if (err >= 0) {
+               fsstack_copy_inode_size(d_inode(dentry),
+                                       file_inode(lower_file));
+               fsstack_copy_attr_times(d_inode(dentry),
+                                       file_inode(lower_file));
+       }
+
+       return err;
+}
+
+static int sdcardfs_readdir(struct file *file, struct dir_context *ctx)
+{
+       int err;
+       struct file *lower_file = NULL;
+       struct dentry *dentry = file->f_path.dentry;
+
+       lower_file = sdcardfs_lower_file(file);
+
+       lower_file->f_pos = file->f_pos;
+       err = iterate_dir(lower_file, ctx);
+       file->f_pos = lower_file->f_pos;
+       if (err >= 0)           /* copy the atime */
+               fsstack_copy_attr_atime(d_inode(dentry),
+                                       file_inode(lower_file));
+       return err;
+}
+
+static long sdcardfs_unlocked_ioctl(struct file *file, unsigned int cmd,
+                                 unsigned long arg)
+{
+       long err = -ENOTTY;
+       struct file *lower_file;
+       const struct cred *saved_cred = NULL;
+       struct dentry *dentry = file->f_path.dentry;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+       lower_file = sdcardfs_lower_file(file);
+
+       /* XXX: use vfs_ioctl if/when VFS exports it */
+       if (!lower_file || !lower_file->f_op)
+               goto out;
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file)));
+
+       if (lower_file->f_op->unlocked_ioctl)
+               err = lower_file->f_op->unlocked_ioctl(lower_file, cmd, arg);
+
+       /* some ioctls can change inode attributes (EXT2_IOC_SETFLAGS) */
+       if (!err)
+               sdcardfs_copy_and_fix_attrs(file_inode(file),
+                                     file_inode(lower_file));
+       REVERT_CRED(saved_cred);
+out:
+       return err;
+}
+
+#ifdef CONFIG_COMPAT
+static long sdcardfs_compat_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg)
+{
+       long err = -ENOTTY;
+       struct file *lower_file;
+       const struct cred *saved_cred = NULL;
+       struct dentry *dentry = file->f_path.dentry;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+       lower_file = sdcardfs_lower_file(file);
+
+       /* XXX: use vfs_ioctl if/when VFS exports it */
+       if (!lower_file || !lower_file->f_op)
+               goto out;
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(file_inode(file)));
+
+       if (lower_file->f_op->compat_ioctl)
+               err = lower_file->f_op->compat_ioctl(lower_file, cmd, arg);
+
+       REVERT_CRED(saved_cred);
+out:
+       return err;
+}
+#endif
+
+static int sdcardfs_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       int err = 0;
+       bool willwrite;
+       struct file *lower_file;
+       const struct vm_operations_struct *saved_vm_ops = NULL;
+
+       /* this might be deferred to mmap's writepage */
+       willwrite = ((vma->vm_flags | VM_SHARED | VM_WRITE) == vma->vm_flags);
+
+       /*
+        * File systems which do not implement ->writepage may use
+        * generic_file_readonly_mmap as their ->mmap op.  If you call
+        * generic_file_readonly_mmap with VM_WRITE, you'd get -EINVAL.
+        * But we cannot call the lower ->mmap op, so we can't tell whether
+        * writeable mappings would work.  Therefore, our only choice is to
+        * check if the lower file system supports ->writepage, and if
+        * not, return -EINVAL (the same error that
+        * generic_file_readonly_mmap returns in that case).
+        */
+       lower_file = sdcardfs_lower_file(file);
+       if (willwrite && !lower_file->f_mapping->a_ops->writepage) {
+               err = -EINVAL;
+               pr_err("sdcardfs: lower file system does not support writeable mmap\n");
+               goto out;
+       }
+
+       /*
+        * find and save lower vm_ops.
+        *
+        * XXX: the VFS should have a cleaner way of finding the lower vm_ops
+        */
+       if (!SDCARDFS_F(file)->lower_vm_ops) {
+               err = lower_file->f_op->mmap(lower_file, vma);
+               if (err) {
+                       pr_err("sdcardfs: lower mmap failed %d\n", err);
+                       goto out;
+               }
+               saved_vm_ops = vma->vm_ops; /* save: came from lower ->mmap */
+       }
+
+       /*
+        * Next 3 lines are all I need from generic_file_mmap.  I definitely
+        * don't want its test for ->readpage which returns -ENOEXEC.
+        */
+       file_accessed(file);
+       vma->vm_ops = &sdcardfs_vm_ops;
+
+       file->f_mapping->a_ops = &sdcardfs_aops; /* set our aops */
+       if (!SDCARDFS_F(file)->lower_vm_ops) /* save for our ->fault */
+               SDCARDFS_F(file)->lower_vm_ops = saved_vm_ops;
+       vma->vm_private_data = file;
+       get_file(lower_file);
+       vma->vm_file = lower_file;
+
+out:
+       return err;
+}
+
+static int sdcardfs_open(struct inode *inode, struct file *file)
+{
+       int err = 0;
+       struct file *lower_file = NULL;
+       struct path lower_path;
+       struct dentry *dentry = file->f_path.dentry;
+       struct dentry *parent = dget_parent(dentry);
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       const struct cred *saved_cred = NULL;
+
+       /* don't open unhashed/deleted files */
+       if (d_unhashed(dentry)) {
+               err = -ENOENT;
+               goto out_err;
+       }
+
+       if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+               err = -EACCES;
+               goto out_err;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(inode));
+
+       file->private_data =
+               kzalloc(sizeof(struct sdcardfs_file_info), GFP_KERNEL);
+       if (!SDCARDFS_F(file)) {
+               err = -ENOMEM;
+               goto out_revert_cred;
+       }
+
+       /* open lower object and link sdcardfs's file struct to lower's */
+       sdcardfs_get_lower_path(file->f_path.dentry, &lower_path);
+       lower_file = dentry_open(&lower_path, file->f_flags, current_cred());
+       path_put(&lower_path);
+       if (IS_ERR(lower_file)) {
+               err = PTR_ERR(lower_file);
+               lower_file = sdcardfs_lower_file(file);
+               if (lower_file) {
+                       sdcardfs_set_lower_file(file, NULL);
+                       fput(lower_file); /* fput calls dput for lower_dentry */
+               }
+       } else {
+               sdcardfs_set_lower_file(file, lower_file);
+       }
+
+       if (err)
+               kfree(SDCARDFS_F(file));
+       else
+               sdcardfs_copy_and_fix_attrs(inode, sdcardfs_lower_inode(inode));
+
+out_revert_cred:
+       REVERT_CRED(saved_cred);
+out_err:
+       dput(parent);
+       return err;
+}
+
+static int sdcardfs_flush(struct file *file, fl_owner_t id)
+{
+       int err = 0;
+       struct file *lower_file = NULL;
+
+       lower_file = sdcardfs_lower_file(file);
+       if (lower_file && lower_file->f_op && lower_file->f_op->flush) {
+               filemap_write_and_wait(file->f_mapping);
+               err = lower_file->f_op->flush(lower_file, id);
+       }
+
+       return err;
+}
+
+/* release all lower object references & free the file info structure */
+static int sdcardfs_file_release(struct inode *inode, struct file *file)
+{
+       struct file *lower_file;
+
+       lower_file = sdcardfs_lower_file(file);
+       if (lower_file) {
+               sdcardfs_set_lower_file(file, NULL);
+               fput(lower_file);
+       }
+
+       kfree(SDCARDFS_F(file));
+       return 0;
+}
+
+static int sdcardfs_fsync(struct file *file, loff_t start, loff_t end,
+                       int datasync)
+{
+       int err;
+       struct file *lower_file;
+       struct path lower_path;
+       struct dentry *dentry = file->f_path.dentry;
+
+       err = __generic_file_fsync(file, start, end, datasync);
+       if (err)
+               goto out;
+
+       lower_file = sdcardfs_lower_file(file);
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       err = vfs_fsync_range(lower_file, start, end, datasync);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+out:
+       return err;
+}
+
+static int sdcardfs_fasync(int fd, struct file *file, int flag)
+{
+       int err = 0;
+       struct file *lower_file = NULL;
+
+       lower_file = sdcardfs_lower_file(file);
+       if (lower_file->f_op && lower_file->f_op->fasync)
+               err = lower_file->f_op->fasync(fd, lower_file, flag);
+
+       return err;
+}
+
+/*
+ * Sdcardfs cannot use generic_file_llseek as ->llseek, because it would
+ * only set the offset of the upper file.  So we have to implement our
+ * own method to set both the upper and lower file offsets
+ * consistently.
+ */
+static loff_t sdcardfs_file_llseek(struct file *file, loff_t offset, int whence)
+{
+       int err;
+       struct file *lower_file;
+
+       err = generic_file_llseek(file, offset, whence);
+       if (err < 0)
+               goto out;
+
+       lower_file = sdcardfs_lower_file(file);
+       err = generic_file_llseek(lower_file, offset, whence);
+
+out:
+       return err;
+}
+
+/*
+ * Sdcardfs read_iter, redirect modified iocb to lower read_iter
+ */
+ssize_t sdcardfs_read_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       int err;
+       struct file *file = iocb->ki_filp, *lower_file;
+
+       lower_file = sdcardfs_lower_file(file);
+       if (!lower_file->f_op->read_iter) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       get_file(lower_file); /* prevent lower_file from being released */
+       iocb->ki_filp = lower_file;
+       err = lower_file->f_op->read_iter(iocb, iter);
+       iocb->ki_filp = file;
+       fput(lower_file);
+       /* update upper inode atime as needed */
+       if (err >= 0 || err == -EIOCBQUEUED)
+               fsstack_copy_attr_atime(file->f_path.dentry->d_inode,
+                                       file_inode(lower_file));
+out:
+       return err;
+}
+
+/*
+ * Sdcardfs write_iter, redirect modified iocb to lower write_iter
+ */
+ssize_t sdcardfs_write_iter(struct kiocb *iocb, struct iov_iter *iter)
+{
+       int err;
+       struct file *file = iocb->ki_filp, *lower_file;
+
+       lower_file = sdcardfs_lower_file(file);
+       if (!lower_file->f_op->write_iter) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       get_file(lower_file); /* prevent lower_file from being released */
+       iocb->ki_filp = lower_file;
+       err = lower_file->f_op->write_iter(iocb, iter);
+       iocb->ki_filp = file;
+       fput(lower_file);
+       /* update upper inode times/sizes as needed */
+       if (err >= 0 || err == -EIOCBQUEUED) {
+               fsstack_copy_inode_size(file->f_path.dentry->d_inode,
+                                       file_inode(lower_file));
+               fsstack_copy_attr_times(file->f_path.dentry->d_inode,
+                                       file_inode(lower_file));
+       }
+out:
+       return err;
+}
+
+const struct file_operations sdcardfs_main_fops = {
+       .llseek         = generic_file_llseek,
+       .read           = sdcardfs_read,
+       .write          = sdcardfs_write,
+       .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = sdcardfs_compat_ioctl,
+#endif
+       .mmap           = sdcardfs_mmap,
+       .open           = sdcardfs_open,
+       .flush          = sdcardfs_flush,
+       .release        = sdcardfs_file_release,
+       .fsync          = sdcardfs_fsync,
+       .fasync         = sdcardfs_fasync,
+       .read_iter      = sdcardfs_read_iter,
+       .write_iter     = sdcardfs_write_iter,
+};
+
+/* trimmed directory options */
+const struct file_operations sdcardfs_dir_fops = {
+       .llseek         = sdcardfs_file_llseek,
+       .read           = generic_read_dir,
+       .iterate        = sdcardfs_readdir,
+       .unlocked_ioctl = sdcardfs_unlocked_ioctl,
+#ifdef CONFIG_COMPAT
+       .compat_ioctl   = sdcardfs_compat_ioctl,
+#endif
+       .open           = sdcardfs_open,
+       .release        = sdcardfs_file_release,
+       .flush          = sdcardfs_flush,
+       .fsync          = sdcardfs_fsync,
+       .fasync         = sdcardfs_fasync,
+};
diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c
new file mode 100644 (file)
index 0000000..b432586
--- /dev/null
@@ -0,0 +1,921 @@
+/*
+ * fs/sdcardfs/inode.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/fs_struct.h>
+#include <linux/ratelimit.h>
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+               struct sdcardfs_inode_data *data)
+{
+       struct cred *cred;
+       const struct cred *old_cred;
+       uid_t uid;
+
+       cred = prepare_creds();
+       if (!cred)
+               return NULL;
+
+       if (sbi->options.gid_derivation) {
+               if (data->under_obb)
+                       uid = AID_MEDIA_OBB;
+               else
+                       uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid);
+       } else {
+               uid = sbi->options.fs_low_uid;
+       }
+       cred->fsuid = make_kuid(&init_user_ns, uid);
+       cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid);
+
+       old_cred = override_creds(cred);
+
+       return old_cred;
+}
+
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred)
+{
+       const struct cred *cur_cred;
+
+       cur_cred = current->cred;
+       revert_creds(old_cred);
+       put_cred(cur_cred);
+}
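
These two helpers are only meant to be reached through the OVERRIDE_CRED()/REVERT_CRED() macros; every inode operation below brackets its lower-filesystem work the same way, roughly (a condensed sketch with the error handling omitted):

	const struct cred *saved_cred = NULL;

	/* switch fsuid/fsgid to the derived owner before touching the lower fs */
	OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));

	/* ... vfs_create2()/vfs_unlink2()/vfs_mkdir2() on the lower path ... */

	/* restore the caller's credentials on every exit path */
	REVERT_CRED(saved_cred);
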
+
+static int sdcardfs_create(struct inode *dir, struct dentry *dentry,
+                        umode_t mode, bool want_excl)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct vfsmount *lower_dentry_mnt;
+       struct dentry *lower_parent_dentry = NULL;
+       struct path lower_path;
+       const struct cred *saved_cred = NULL;
+       struct fs_struct *saved_fs;
+       struct fs_struct *copied_fs;
+
+       if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+               err = -EACCES;
+               goto out_eacces;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_dentry_mnt = lower_path.mnt;
+       lower_parent_dentry = lock_parent(lower_dentry);
+
+       /* set the permission bits of the mode field to 0664 */
+       mode = (mode & S_IFMT) | 00664;
+
+       /* temporarily change umask for lower fs write */
+       saved_fs = current->fs;
+       copied_fs = copy_fs_struct(current->fs);
+       if (!copied_fs) {
+               err = -ENOMEM;
+               goto out_unlock;
+       }
+       current->fs = copied_fs;
+       current->fs->umask = 0;
+       err = vfs_create2(lower_dentry_mnt, d_inode(lower_parent_dentry), lower_dentry, mode, want_excl);
+       if (err)
+               goto out;
+
+       err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path,
+                       SDCARDFS_I(dir)->data->userid);
+       if (err)
+               goto out;
+       fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+       fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+       fixup_lower_ownership(dentry, dentry->d_name.name);
+
+out:
+       current->fs = saved_fs;
+       free_fs_struct(copied_fs);
+out_unlock:
+       unlock_dir(lower_parent_dentry);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       REVERT_CRED(saved_cred);
+out_eacces:
+       return err;
+}
+
+#if 0
+static int sdcardfs_link(struct dentry *old_dentry, struct inode *dir,
+                      struct dentry *new_dentry)
+{
+       struct dentry *lower_old_dentry;
+       struct dentry *lower_new_dentry;
+       struct dentry *lower_dir_dentry;
+       u64 file_size_save;
+       int err;
+       struct path lower_old_path, lower_new_path;
+
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+       file_size_save = i_size_read(d_inode(old_dentry));
+       sdcardfs_get_lower_path(old_dentry, &lower_old_path);
+       sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+       lower_old_dentry = lower_old_path.dentry;
+       lower_new_dentry = lower_new_path.dentry;
+       lower_dir_dentry = lock_parent(lower_new_dentry);
+
+       err = vfs_link(lower_old_dentry, d_inode(lower_dir_dentry),
+                      lower_new_dentry, NULL);
+       if (err || !d_inode(lower_new_dentry))
+               goto out;
+
+       err = sdcardfs_interpose(new_dentry, dir->i_sb, &lower_new_path);
+       if (err)
+               goto out;
+       fsstack_copy_attr_times(dir, d_inode(lower_new_dentry));
+       fsstack_copy_inode_size(dir, d_inode(lower_new_dentry));
+       set_nlink(d_inode(old_dentry),
+                 sdcardfs_lower_inode(d_inode(old_dentry))->i_nlink);
+       i_size_write(d_inode(new_dentry), file_size_save);
+out:
+       unlock_dir(lower_dir_dentry);
+       sdcardfs_put_lower_path(old_dentry, &lower_old_path);
+       sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+       REVERT_CRED();
+       return err;
+}
+#endif
+
+static int sdcardfs_unlink(struct inode *dir, struct dentry *dentry)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct vfsmount *lower_mnt;
+       struct inode *lower_dir_inode = sdcardfs_lower_inode(dir);
+       struct dentry *lower_dir_dentry;
+       struct path lower_path;
+       const struct cred *saved_cred = NULL;
+
+       if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+               err = -EACCES;
+               goto out_eacces;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_mnt = lower_path.mnt;
+       dget(lower_dentry);
+       lower_dir_dentry = lock_parent(lower_dentry);
+
+       err = vfs_unlink2(lower_mnt, lower_dir_inode, lower_dentry, NULL);
+
+       /*
+        * Note: unlinking on top of NFS can cause silly-renamed files.
+        * Trying to delete such files results in EBUSY from NFS
+        * below.  Silly-renamed files will get deleted by NFS later on, so
+        * we just need to detect them here and treat such EBUSY errors as
+        * if the upper file was successfully deleted.
+        */
+       if (err == -EBUSY && lower_dentry->d_flags & DCACHE_NFSFS_RENAMED)
+               err = 0;
+       if (err)
+               goto out;
+       fsstack_copy_attr_times(dir, lower_dir_inode);
+       fsstack_copy_inode_size(dir, lower_dir_inode);
+       set_nlink(d_inode(dentry),
+                 sdcardfs_lower_inode(d_inode(dentry))->i_nlink);
+       d_inode(dentry)->i_ctime = dir->i_ctime;
+       d_drop(dentry); /* this is needed, else LTP fails (VFS won't do it) */
+out:
+       unlock_dir(lower_dir_dentry);
+       dput(lower_dentry);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       REVERT_CRED(saved_cred);
+out_eacces:
+       return err;
+}
+
+#if 0
+static int sdcardfs_symlink(struct inode *dir, struct dentry *dentry,
+                         const char *symname)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct dentry *lower_parent_dentry = NULL;
+       struct path lower_path;
+
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_parent_dentry = lock_parent(lower_dentry);
+
+       err = vfs_symlink(d_inode(lower_parent_dentry), lower_dentry, symname);
+       if (err)
+               goto out;
+       err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path);
+       if (err)
+               goto out;
+       fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+       fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+
+out:
+       unlock_dir(lower_parent_dentry);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       REVERT_CRED();
+       return err;
+}
+#endif
+
+static int touch(char *abs_path, mode_t mode)
+{
+       struct file *filp = filp_open(abs_path, O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW, mode);
+
+       if (IS_ERR(filp)) {
+               if (PTR_ERR(filp) == -EEXIST) {
+                       return 0;
+               } else {
+                       pr_err("sdcardfs: failed to open(%s): %ld\n",
+                                               abs_path, PTR_ERR(filp));
+                       return PTR_ERR(filp);
+               }
+       }
+       filp_close(filp, current->files);
+       return 0;
+}
+
+static int sdcardfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
+{
+       int err;
+       int make_nomedia_in_obb = 0;
+       struct dentry *lower_dentry;
+       struct vfsmount *lower_mnt;
+       struct dentry *lower_parent_dentry = NULL;
+       struct path lower_path;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+       const struct cred *saved_cred = NULL;
+       struct sdcardfs_inode_data *pd = SDCARDFS_I(dir)->data;
+       int touch_err = 0;
+       struct fs_struct *saved_fs;
+       struct fs_struct *copied_fs;
+       struct qstr q_obb = QSTR_LITERAL("obb");
+       struct qstr q_data = QSTR_LITERAL("data");
+
+       if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+               err = -EACCES;
+               goto out_eacces;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+       /* check disk space */
+       if (!check_min_free_space(dentry, 0, 1)) {
+               pr_err("sdcardfs: No minimum free space.\n");
+               err = -ENOSPC;
+               goto out_revert;
+       }
+
+       /* the lower_dentry is negative here */
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_mnt = lower_path.mnt;
+       lower_parent_dentry = lock_parent(lower_dentry);
+
+       /* set the permission bits of the mode field to 0775 */
+       mode = (mode & S_IFMT) | 00775;
+
+       /* temporarily change umask for lower fs write */
+       saved_fs = current->fs;
+       copied_fs = copy_fs_struct(current->fs);
+       if (!copied_fs) {
+               err = -ENOMEM;
+               unlock_dir(lower_parent_dentry);
+               goto out_unlock;
+       }
+       current->fs = copied_fs;
+       current->fs->umask = 0;
+       err = vfs_mkdir2(lower_mnt, d_inode(lower_parent_dentry), lower_dentry, mode);
+
+       if (err) {
+               unlock_dir(lower_parent_dentry);
+               goto out;
+       }
+
+       /* if it is a local obb dentry, setup it with the base obbpath */
+       if (need_graft_path(dentry)) {
+
+               err = setup_obb_dentry(dentry, &lower_path);
+               if (err) {
+                       /* If the sbi->obbpath is not available, the lower_path won't be
+                        * changed by setup_obb_dentry(), but the lower path is saved to
+                        * its orig_path. This dentry will be revalidated later.
+                        * For now, the lower_path should be NULL.
+                        */
+                       sdcardfs_put_reset_lower_path(dentry);
+
+                       /* The newly created lower path, which was saved to its orig_path,
+                        * or the lower_path, is the base obbpath;
+                        * therefore, an additional path_get is required.
+                        */
+                       path_get(&lower_path);
+               } else
+                       make_nomedia_in_obb = 1;
+       }
+
+       err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path, pd->userid);
+       if (err) {
+               unlock_dir(lower_parent_dentry);
+               goto out;
+       }
+
+       fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+       fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+       /* update number of links on parent directory */
+       set_nlink(dir, sdcardfs_lower_inode(dir)->i_nlink);
+       fixup_lower_ownership(dentry, dentry->d_name.name);
+       unlock_dir(lower_parent_dentry);
+       if ((!sbi->options.multiuser) && (qstr_case_eq(&dentry->d_name, &q_obb))
+               && (pd->perm == PERM_ANDROID) && (pd->userid == 0))
+               make_nomedia_in_obb = 1;
+
+       /* When creating /Android/data and /Android/obb, mark them as .nomedia */
+       if (make_nomedia_in_obb ||
+               ((pd->perm == PERM_ANDROID)
+                               && (qstr_case_eq(&dentry->d_name, &q_data)))) {
+               REVERT_CRED(saved_cred);
+               OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(d_inode(dentry)));
+               set_fs_pwd(current->fs, &lower_path);
+               touch_err = touch(".nomedia", 0664);
+               if (touch_err) {
+                       pr_err("sdcardfs: failed to create .nomedia in %s: %d\n",
+                                                       lower_path.dentry->d_name.name, touch_err);
+                       goto out;
+               }
+       }
+out:
+       current->fs = saved_fs;
+       free_fs_struct(copied_fs);
+out_unlock:
+       sdcardfs_put_lower_path(dentry, &lower_path);
+out_revert:
+       REVERT_CRED(saved_cred);
+out_eacces:
+       return err;
+}
+
+static int sdcardfs_rmdir(struct inode *dir, struct dentry *dentry)
+{
+       struct dentry *lower_dentry;
+       struct dentry *lower_dir_dentry;
+       struct vfsmount *lower_mnt;
+       int err;
+       struct path lower_path;
+       const struct cred *saved_cred = NULL;
+
+       if (!check_caller_access_to_name(dir, &dentry->d_name)) {
+               err = -EACCES;
+               goto out_eacces;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+       /* sdcardfs_get_real_lower(): in the case of removing a user's obb dentry,
+        * the dentry on the original path should be deleted.
+        */
+       sdcardfs_get_real_lower(dentry, &lower_path);
+
+       lower_dentry = lower_path.dentry;
+       lower_mnt = lower_path.mnt;
+       lower_dir_dentry = lock_parent(lower_dentry);
+
+       err = vfs_rmdir2(lower_mnt, d_inode(lower_dir_dentry), lower_dentry);
+       if (err)
+               goto out;
+
+       d_drop(dentry); /* drop our dentry on success (why not VFS's job?) */
+       if (d_inode(dentry))
+               clear_nlink(d_inode(dentry));
+       fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry));
+       fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry));
+       set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink);
+
+out:
+       unlock_dir(lower_dir_dentry);
+       sdcardfs_put_real_lower(dentry, &lower_path);
+       REVERT_CRED(saved_cred);
+out_eacces:
+       return err;
+}
+
+#if 0
+static int sdcardfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
+                       dev_t dev)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct dentry *lower_parent_dentry = NULL;
+       struct path lower_path;
+
+       OVERRIDE_CRED(SDCARDFS_SB(dir->i_sb));
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_parent_dentry = lock_parent(lower_dentry);
+
+       err = vfs_mknod(d_inode(lower_parent_dentry), lower_dentry, mode, dev);
+       if (err)
+               goto out;
+
+       err = sdcardfs_interpose(dentry, dir->i_sb, &lower_path);
+       if (err)
+               goto out;
+       fsstack_copy_attr_times(dir, sdcardfs_lower_inode(dir));
+       fsstack_copy_inode_size(dir, d_inode(lower_parent_dentry));
+
+out:
+       unlock_dir(lower_parent_dentry);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       REVERT_CRED();
+       return err;
+}
+#endif
+
+/*
+ * The locking rules in sdcardfs_rename are complex.  We could use a simpler
+ * superblock-level name-space lock for renames and copy-ups.
+ */
+static int sdcardfs_rename(struct inode *old_dir, struct dentry *old_dentry,
+                        struct inode *new_dir, struct dentry *new_dentry,
+                        unsigned int flags)
+{
+       int err = 0;
+       struct dentry *lower_old_dentry = NULL;
+       struct dentry *lower_new_dentry = NULL;
+       struct dentry *lower_old_dir_dentry = NULL;
+       struct dentry *lower_new_dir_dentry = NULL;
+       struct vfsmount *lower_mnt = NULL;
+       struct dentry *trap = NULL;
+       struct path lower_old_path, lower_new_path;
+       const struct cred *saved_cred = NULL;
+
+       if (flags)
+               return -EINVAL;
+
+       if (!check_caller_access_to_name(old_dir, &old_dentry->d_name) ||
+               !check_caller_access_to_name(new_dir, &new_dentry->d_name)) {
+               err = -EACCES;
+               goto out_eacces;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(old_dir->i_sb), saved_cred, SDCARDFS_I(new_dir));
+
+       sdcardfs_get_real_lower(old_dentry, &lower_old_path);
+       sdcardfs_get_lower_path(new_dentry, &lower_new_path);
+       lower_old_dentry = lower_old_path.dentry;
+       lower_new_dentry = lower_new_path.dentry;
+       lower_mnt = lower_old_path.mnt;
+       lower_old_dir_dentry = dget_parent(lower_old_dentry);
+       lower_new_dir_dentry = dget_parent(lower_new_dentry);
+
+       trap = lock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+       /* source should not be ancestor of target */
+       if (trap == lower_old_dentry) {
+               err = -EINVAL;
+               goto out;
+       }
+       /* target should not be ancestor of source */
+       if (trap == lower_new_dentry) {
+               err = -ENOTEMPTY;
+               goto out;
+       }
+
+       err = vfs_rename2(lower_mnt,
+                        d_inode(lower_old_dir_dentry), lower_old_dentry,
+                        d_inode(lower_new_dir_dentry), lower_new_dentry,
+                        NULL, 0);
+       if (err)
+               goto out;
+
+       /* Copy attrs from lower dir, except i_uid/i_gid */
+       sdcardfs_copy_and_fix_attrs(new_dir, d_inode(lower_new_dir_dentry));
+       fsstack_copy_inode_size(new_dir, d_inode(lower_new_dir_dentry));
+
+       if (new_dir != old_dir) {
+               sdcardfs_copy_and_fix_attrs(old_dir, d_inode(lower_old_dir_dentry));
+               fsstack_copy_inode_size(old_dir, d_inode(lower_old_dir_dentry));
+       }
+       get_derived_permission_new(new_dentry->d_parent, old_dentry, &new_dentry->d_name);
+       fixup_tmp_permissions(d_inode(old_dentry));
+       fixup_lower_ownership(old_dentry, new_dentry->d_name.name);
+       d_invalidate(old_dentry); /* Can't fixup ownership recursively :( */
+out:
+       unlock_rename(lower_old_dir_dentry, lower_new_dir_dentry);
+       dput(lower_old_dir_dentry);
+       dput(lower_new_dir_dentry);
+       sdcardfs_put_real_lower(old_dentry, &lower_old_path);
+       sdcardfs_put_lower_path(new_dentry, &lower_new_path);
+       REVERT_CRED(saved_cred);
+out_eacces:
+       return err;
+}
+
+#if 0
+static int sdcardfs_readlink(struct dentry *dentry, char __user *buf, int bufsiz)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct path lower_path;
+       /* XXX readlink does not require overriding credentials */
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       if (!d_inode(lower_dentry)->i_op ||
+           !d_inode(lower_dentry)->i_op->readlink) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = d_inode(lower_dentry)->i_op->readlink(lower_dentry,
+                                                   buf, bufsiz);
+       if (err < 0)
+               goto out;
+       fsstack_copy_attr_atime(d_inode(dentry), d_inode(lower_dentry));
+
+out:
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       return err;
+}
+#endif
+
+#if 0
+static const char *sdcardfs_follow_link(struct dentry *dentry, void **cookie)
+{
+       char *buf;
+       int len = PAGE_SIZE, err;
+       mm_segment_t old_fs;
+
+       /* This is freed by the put_link method assuming a successful call. */
+       buf = kmalloc(len, GFP_KERNEL);
+       if (!buf) {
+               buf = ERR_PTR(-ENOMEM);
+               return buf;
+       }
+
+       /* read the symlink, and then we will follow it */
+       old_fs = get_fs();
+       set_fs(KERNEL_DS);
+       err = sdcardfs_readlink(dentry, buf, len);
+       set_fs(old_fs);
+       if (err < 0) {
+               kfree(buf);
+               buf = ERR_PTR(err);
+       } else {
+               buf[err] = '\0';
+       }
+       return *cookie = buf;
+}
+#endif
+
+static int sdcardfs_permission_wrn(struct inode *inode, int mask)
+{
+       WARN_RATELIMIT(1, "sdcardfs does not support permission. Use permission2.\n");
+       return -EINVAL;
+}
+
+void copy_attrs(struct inode *dest, const struct inode *src)
+{
+       dest->i_mode = src->i_mode;
+       dest->i_uid = src->i_uid;
+       dest->i_gid = src->i_gid;
+       dest->i_rdev = src->i_rdev;
+       dest->i_atime = src->i_atime;
+       dest->i_mtime = src->i_mtime;
+       dest->i_ctime = src->i_ctime;
+       dest->i_blkbits = src->i_blkbits;
+       dest->i_flags = src->i_flags;
+#ifdef CONFIG_FS_POSIX_ACL
+       dest->i_acl = src->i_acl;
+#endif
+#ifdef CONFIG_SECURITY
+       dest->i_security = src->i_security;
+#endif
+}
+
+static int sdcardfs_permission(struct vfsmount *mnt, struct inode *inode, int mask)
+{
+       int err;
+       struct inode tmp;
+       struct sdcardfs_inode_data *top = top_data_get(SDCARDFS_I(inode));
+
+       if (IS_ERR(mnt))
+               return PTR_ERR(mnt);
+
+       if (!top)
+               return -EINVAL;
+
+       /*
+        * Permission check on sdcardfs inode.
+        * The calling process should have AID_SDCARD_RW permission.
+        * Since generic_permission only needs i_mode, i_uid,
+        * i_gid, and i_sb, we can create a fake inode to pass
+        * this information down.
+        *
+        * The underlying code may attempt to take locks in some
+        * cases for features we're not using, but if that changes,
+        * locks must be dealt with to avoid undefined behavior.
+        */
+       copy_attrs(&tmp, inode);
+       tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+       tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, inode->i_sb, top));
+       tmp.i_mode = (inode->i_mode & S_IFMT)
+                       | get_mode(mnt, SDCARDFS_I(inode), top);
+       data_put(top);
+       tmp.i_sb = inode->i_sb;
+       if (IS_POSIXACL(inode))
+               pr_warn("%s: This may be undefined behavior...\n", __func__);
+       err = generic_permission(&tmp, mask);
+       /* XXX
+        * The original sdcardfs code calls inode_permission(lower_inode, ...)
+        * to check the inode permission. But doing so here seems like
+        * duplicated work, because the functions called after this one,
+        * such as vfs_create, vfs_unlink, and vfs_rename,
+        * do exactly the same thing, i.e., they call inode_permission().
+        * So we just let them do it.
+        * If there is any security hole, just uncomment the following if block.
+        */
+#if 0
+       if (!err) {
+               /*
+                * Permission check on lower_inode(=EXT4).
+                * we check it with AID_MEDIA_RW permission
+                */
+               struct inode *lower_inode;
+
+               OVERRIDE_CRED(SDCARDFS_SB(inode->sb));
+
+               lower_inode = sdcardfs_lower_inode(inode);
+               err = inode_permission(lower_inode, mask);
+
+               REVERT_CRED();
+       }
+#endif
+       return err;
+
+}
+
+static int sdcardfs_setattr_wrn(struct dentry *dentry, struct iattr *ia)
+{
+       WARN_RATELIMIT(1, "sdcardfs does not support setattr. Use setattr2.\n");
+       return -EINVAL;
+}
+
+static int sdcardfs_setattr(struct vfsmount *mnt, struct dentry *dentry, struct iattr *ia)
+{
+       int err;
+       struct dentry *lower_dentry;
+       struct vfsmount *lower_mnt;
+       struct inode *inode;
+       struct inode *lower_inode;
+       struct path lower_path;
+       struct iattr lower_ia;
+       struct dentry *parent;
+       struct inode tmp;
+       struct dentry tmp_d;
+       struct sdcardfs_inode_data *top;
+
+       const struct cred *saved_cred = NULL;
+
+       inode = d_inode(dentry);
+       top = top_data_get(SDCARDFS_I(inode));
+
+       if (!top)
+               return -EINVAL;
+
+       /*
+        * Permission check on sdcardfs inode.
+        * Calling process should have AID_SDCARD_RW permission
+        * Since generic_permission only needs i_mode, i_uid,
+        * i_gid, and i_sb, we can create a fake inode to pass
+        * this information down in.
+        *
+        * The underlying code may attempt to take locks in some
+        * cases for features we're not using, but if that changes,
+        * locks must be dealt with to avoid undefined behavior.
+        *
+        */
+       copy_attrs(&tmp, inode);
+       tmp.i_uid = make_kuid(&init_user_ns, top->d_uid);
+       tmp.i_gid = make_kgid(&init_user_ns, get_gid(mnt, dentry->d_sb, top));
+       tmp.i_mode = (inode->i_mode & S_IFMT)
+                       | get_mode(mnt, SDCARDFS_I(inode), top);
+       tmp.i_size = i_size_read(inode);
+       data_put(top);
+       tmp.i_sb = inode->i_sb;
+       tmp_d.d_inode = &tmp;
+
+       /*
+        * Check if user has permission to change dentry.  We don't check if
+        * this user can change the lower inode: that should happen when
+        * calling notify_change on the lower inode.
+        */
+       /* prepare our own lower struct iattr (with the lower file) */
+       memcpy(&lower_ia, ia, sizeof(lower_ia));
+       /* Allow touch to update timestamps. A previous permission check
+        * ensures we have write access. Changes to mode, owner, and group
+        * are ignored.
+        */
+       ia->ia_valid |= ATTR_FORCE;
+       err = setattr_prepare(&tmp_d, ia);
+
+       if (!err) {
+               /* check the Android group ID */
+               parent = dget_parent(dentry);
+               if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name))
+                       err = -EACCES;
+               dput(parent);
+       }
+
+       if (err)
+               goto out_err;
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED(SDCARDFS_SB(dentry->d_sb), saved_cred, SDCARDFS_I(inode));
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       lower_dentry = lower_path.dentry;
+       lower_mnt = lower_path.mnt;
+       lower_inode = sdcardfs_lower_inode(inode);
+
+       if (ia->ia_valid & ATTR_FILE)
+               lower_ia.ia_file = sdcardfs_lower_file(ia->ia_file);
+
+       lower_ia.ia_valid &= ~(ATTR_UID | ATTR_GID | ATTR_MODE);
+
+       /*
+        * If shrinking, first truncate upper level to cancel writing dirty
+        * pages beyond the new eof; and also if its maxbytes is more
+        * limiting (fail with -EFBIG before making any change to the lower
+        * level).  There is no need to vmtruncate the upper level
+        * afterwards in the other cases: we fsstack_copy_inode_size from
+        * the lower level.
+        */
+       if (ia->ia_valid & ATTR_SIZE) {
+               err = inode_newsize_ok(&tmp, ia->ia_size);
+               if (err) {
+                       goto out;
+               }
+               truncate_setsize(inode, ia->ia_size);
+       }
+
+       /*
+        * mode change is for clearing setuid/setgid bits. Allow lower fs
+        * to interpret this in its own way.
+        */
+       if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID))
+               lower_ia.ia_valid &= ~ATTR_MODE;
+
+       /* notify the (possibly copied-up) lower inode */
+       /*
+        * Note: we use d_inode(lower_dentry), because lower_inode may be
+        * unlinked (no inode->i_sb and i_ino==0).  This happens if someone
+        * tries to open(), unlink(), then ftruncate() a file.
+        */
+       inode_lock(d_inode(lower_dentry));
+       err = notify_change2(lower_mnt, lower_dentry, &lower_ia, /* note: lower_ia */
+                       NULL);
+       inode_unlock(d_inode(lower_dentry));
+       if (err)
+               goto out;
+
+       /* get attributes from the lower inode and update derived permissions */
+       sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+
+       /*
+        * Not running fsstack_copy_inode_size(inode, lower_inode), because
+        * VFS should update our inode size, and notify_change on
+        * lower_inode should update its size.
+        */
+
+out:
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       REVERT_CRED(saved_cred);
+out_err:
+       return err;
+}
+
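+/*
+ * Fill the kstat handed back to userspace: size, times and block counts come
+ * from the lower filesystem's stat, while mode, uid and gid are replaced with
+ * the derived sdcardfs values from the top data.
+ */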
+static int sdcardfs_fillattr(struct vfsmount *mnt, struct inode *inode,
+                               struct kstat *lower_stat, struct kstat *stat)
+{
+       struct sdcardfs_inode_info *info = SDCARDFS_I(inode);
+       struct sdcardfs_inode_data *top = top_data_get(info);
+       struct super_block *sb = inode->i_sb;
+
+       if (!top)
+               return -EINVAL;
+
+       stat->dev = inode->i_sb->s_dev;
+       stat->ino = inode->i_ino;
+       stat->mode = (inode->i_mode  & S_IFMT) | get_mode(mnt, info, top);
+       stat->nlink = inode->i_nlink;
+       stat->uid = make_kuid(&init_user_ns, top->d_uid);
+       stat->gid = make_kgid(&init_user_ns, get_gid(mnt, sb, top));
+       stat->rdev = inode->i_rdev;
+       stat->size = lower_stat->size;
+       stat->atime = lower_stat->atime;
+       stat->mtime = lower_stat->mtime;
+       stat->ctime = lower_stat->ctime;
+       stat->blksize = lower_stat->blksize;
+       stat->blocks = lower_stat->blocks;
+       data_put(top);
+       return 0;
+}
+
+static int sdcardfs_getattr(const struct path *path, struct kstat *stat,
+                               u32 request_mask, unsigned int flags)
+{
+       struct vfsmount *mnt = path->mnt;
+       struct dentry *dentry = path->dentry;
+       struct kstat lower_stat;
+       struct path lower_path;
+       struct dentry *parent;
+       int err;
+
+       parent = dget_parent(dentry);
+       if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+               dput(parent);
+               return -EACCES;
+       }
+       dput(parent);
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       err = vfs_getattr(&lower_path, &lower_stat, request_mask, flags);
+       if (err)
+               goto out;
+       sdcardfs_copy_and_fix_attrs(d_inode(dentry),
+                             d_inode(lower_path.dentry));
+       err = sdcardfs_fillattr(mnt, d_inode(dentry), &lower_stat, stat);
+out:
+       sdcardfs_put_lower_path(dentry, &lower_path);
+       return err;
+}
+
+const struct inode_operations sdcardfs_symlink_iops = {
+       .permission2    = sdcardfs_permission,
+       .setattr2       = sdcardfs_setattr,
+       /* XXX Following operations are implemented,
+        *     but FUSE(sdcard) or FAT do not support them.
+        *     These methods are *NOT* perfectly tested.
+       .readlink       = sdcardfs_readlink,
+       .follow_link    = sdcardfs_follow_link,
+       .put_link       = kfree_put_link,
+        */
+};
+
+const struct inode_operations sdcardfs_dir_iops = {
+       .create         = sdcardfs_create,
+       .lookup         = sdcardfs_lookup,
+       .permission     = sdcardfs_permission_wrn,
+       .permission2    = sdcardfs_permission,
+       .unlink         = sdcardfs_unlink,
+       .mkdir          = sdcardfs_mkdir,
+       .rmdir          = sdcardfs_rmdir,
+       .rename         = sdcardfs_rename,
+       .setattr        = sdcardfs_setattr_wrn,
+       .setattr2       = sdcardfs_setattr,
+       .getattr        = sdcardfs_getattr,
+       /* XXX Following operations are implemented,
+        *     but FUSE(sdcard) or FAT do not support them.
+        *     These methods are *NOT* perfectly tested.
+       .symlink        = sdcardfs_symlink,
+       .link           = sdcardfs_link,
+       .mknod          = sdcardfs_mknod,
+        */
+};
+
+const struct inode_operations sdcardfs_main_iops = {
+       .permission     = sdcardfs_permission_wrn,
+       .permission2    = sdcardfs_permission,
+       .setattr        = sdcardfs_setattr_wrn,
+       .setattr2       = sdcardfs_setattr,
+       .getattr        = sdcardfs_getattr,
+};
diff --git a/fs/sdcardfs/lookup.c b/fs/sdcardfs/lookup.c
new file mode 100644 (file)
index 0000000..7dab5f7
--- /dev/null
@@ -0,0 +1,467 @@
+/*
+ * fs/sdcardfs/lookup.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/delay.h>
+
+/* The dentry cache is just so we have properly sized dentries */
+static struct kmem_cache *sdcardfs_dentry_cachep;
+
+int sdcardfs_init_dentry_cache(void)
+{
+       sdcardfs_dentry_cachep =
+               kmem_cache_create("sdcardfs_dentry",
+                                 sizeof(struct sdcardfs_dentry_info),
+                                 0, SLAB_RECLAIM_ACCOUNT, NULL);
+
+       return sdcardfs_dentry_cachep ? 0 : -ENOMEM;
+}
+
+void sdcardfs_destroy_dentry_cache(void)
+{
+       kmem_cache_destroy(sdcardfs_dentry_cachep);
+}
+
+void free_dentry_private_data(struct dentry *dentry)
+{
+       if (!dentry || !dentry->d_fsdata)
+               return;
+       kmem_cache_free(sdcardfs_dentry_cachep, dentry->d_fsdata);
+       dentry->d_fsdata = NULL;
+}
+
+/* allocate new dentry private data */
+int new_dentry_private_data(struct dentry *dentry)
+{
+       struct sdcardfs_dentry_info *info = SDCARDFS_D(dentry);
+
+       /* use zalloc to init dentry_info.lower_path */
+       info = kmem_cache_zalloc(sdcardfs_dentry_cachep, GFP_ATOMIC);
+       if (!info)
+               return -ENOMEM;
+
+       spin_lock_init(&info->lock);
+       dentry->d_fsdata = info;
+
+       return 0;
+}
+
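+/*
+ * Key used with iget5_locked(): an sdcardfs inode is identified by the lower
+ * inode *and* the Android userid, so the same lower inode can back a separate
+ * sdcardfs inode per user (see sdcardfs_inode_test()).
+ */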
+struct inode_data {
+       struct inode *lower_inode;
+       userid_t id;
+};
+
+static int sdcardfs_inode_test(struct inode *inode, void *candidate_data)
+{
+       struct inode *current_lower_inode = sdcardfs_lower_inode(inode);
+       userid_t current_userid = SDCARDFS_I(inode)->data->userid;
+
+       if (current_lower_inode == ((struct inode_data *)candidate_data)->lower_inode &&
+                       current_userid == ((struct inode_data *)candidate_data)->id)
+               return 1; /* found a match */
+       else
+               return 0; /* no match */
+}
+
+static int sdcardfs_inode_set(struct inode *inode, void *lower_inode)
+{
+       /* we do actual inode initialization in sdcardfs_iget */
+       return 0;
+}
+
+struct inode *sdcardfs_iget(struct super_block *sb, struct inode *lower_inode, userid_t id)
+{
+       struct sdcardfs_inode_info *info;
+       struct inode_data data;
+       struct inode *inode; /* the new inode to return */
+
+       if (!igrab(lower_inode))
+               return ERR_PTR(-ESTALE);
+
+       data.id = id;
+       data.lower_inode = lower_inode;
+       inode = iget5_locked(sb, /* our superblock */
+                            /*
+                             * hashval: we use inode number, but we can
+                             * also use "(unsigned long)lower_inode"
+                             * instead.
+                             */
+                            lower_inode->i_ino, /* hashval */
+                            sdcardfs_inode_test, /* inode comparison function */
+                            sdcardfs_inode_set, /* inode init function */
+                            &data); /* data passed to test+set fxns */
+       if (!inode) {
+               iput(lower_inode);
+               return ERR_PTR(-ENOMEM);
+       }
+       /* if found a cached inode, then just return it (after iput) */
+       if (!(inode->i_state & I_NEW)) {
+               iput(lower_inode);
+               return inode;
+       }
+
+       /* initialize new inode */
+       info = SDCARDFS_I(inode);
+
+       inode->i_ino = lower_inode->i_ino;
+       sdcardfs_set_lower_inode(inode, lower_inode);
+
+       inode->i_version++;
+
+       /* use different set of inode ops for symlinks & directories */
+       if (S_ISDIR(lower_inode->i_mode))
+               inode->i_op = &sdcardfs_dir_iops;
+       else if (S_ISLNK(lower_inode->i_mode))
+               inode->i_op = &sdcardfs_symlink_iops;
+       else
+               inode->i_op = &sdcardfs_main_iops;
+
+       /* use different set of file ops for directories */
+       if (S_ISDIR(lower_inode->i_mode))
+               inode->i_fop = &sdcardfs_dir_fops;
+       else
+               inode->i_fop = &sdcardfs_main_fops;
+
+       inode->i_mapping->a_ops = &sdcardfs_aops;
+
+       inode->i_atime.tv_sec = 0;
+       inode->i_atime.tv_nsec = 0;
+       inode->i_mtime.tv_sec = 0;
+       inode->i_mtime.tv_nsec = 0;
+       inode->i_ctime.tv_sec = 0;
+       inode->i_ctime.tv_nsec = 0;
+
+       /* properly initialize special inodes */
+       if (S_ISBLK(lower_inode->i_mode) || S_ISCHR(lower_inode->i_mode) ||
+           S_ISFIFO(lower_inode->i_mode) || S_ISSOCK(lower_inode->i_mode))
+               init_special_inode(inode, lower_inode->i_mode,
+                                  lower_inode->i_rdev);
+
+       /* all well, copy inode attributes */
+       sdcardfs_copy_and_fix_attrs(inode, lower_inode);
+       fsstack_copy_inode_size(inode, lower_inode);
+
+       unlock_new_inode(inode);
+       return inode;
+}
+
+/*
+ * Helper interpose routine, called directly by ->lookup to handle
+ * spliced dentries.
+ */
+static struct dentry *__sdcardfs_interpose(struct dentry *dentry,
+                                        struct super_block *sb,
+                                        struct path *lower_path,
+                                        userid_t id)
+{
+       struct inode *inode;
+       struct inode *lower_inode;
+       struct super_block *lower_sb;
+       struct dentry *ret_dentry;
+
+       lower_inode = d_inode(lower_path->dentry);
+       lower_sb = sdcardfs_lower_super(sb);
+
+       /* check that the lower file system didn't cross a mount point */
+       if (lower_inode->i_sb != lower_sb) {
+               ret_dentry = ERR_PTR(-EXDEV);
+               goto out;
+       }
+
+       /*
+        * We allocate our new inode below by calling sdcardfs_iget,
+        * which will initialize some of the new inode's fields
+        */
+
+       /* inherit lower inode number for sdcardfs's inode */
+       inode = sdcardfs_iget(sb, lower_inode, id);
+       if (IS_ERR(inode)) {
+               ret_dentry = ERR_CAST(inode);
+               goto out;
+       }
+
+       ret_dentry = d_splice_alias(inode, dentry);
+       dentry = ret_dentry ?: dentry;
+       if (!IS_ERR(dentry))
+               update_derived_permission_lock(dentry);
+out:
+       return ret_dentry;
+}
+
+/*
+ * Connect an sdcardfs dentry/inode with its lower counterparts.  This is
+ * the classic stackable file system "vnode interposition" action.
+ *
+ * @dentry: sdcardfs's dentry which interposes on the lower one
+ * @sb: sdcardfs's super_block
+ * @lower_path: the lower path (caller does path_get/put)
+ */
+int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+                    struct path *lower_path, userid_t id)
+{
+       struct dentry *ret_dentry;
+
+       ret_dentry = __sdcardfs_interpose(dentry, sb, lower_path, id);
+       return PTR_ERR(ret_dentry);
+}
+
+struct sdcardfs_name_data {
+       struct dir_context ctx;
+       const struct qstr *to_find;
+       char *name;
+       bool found;
+};
+
+static int sdcardfs_name_match(struct dir_context *ctx, const char *name,
+               int namelen, loff_t offset, u64 ino, unsigned int d_type)
+{
+       struct sdcardfs_name_data *buf = container_of(ctx, struct sdcardfs_name_data, ctx);
+       struct qstr candidate = QSTR_INIT(name, namelen);
+
+       if (qstr_case_eq(buf->to_find, &candidate)) {
+               memcpy(buf->name, name, namelen);
+               buf->name[namelen] = 0;
+               buf->found = true;
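+               /* a non-zero return stops iterate_dir() once we have a match */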
+               return 1;
+       }
+       return 0;
+}
+
+/*
+ * Main driver function for sdcardfs's lookup.
+ *
+ * Returns the dentry to use (or NULL), or an ERR_PTR if an error occurred.
+ * lower_parent_path carries the lower parent's <dentry,mnt>, provided by the caller.
+ */
+static struct dentry *__sdcardfs_lookup(struct dentry *dentry,
+               unsigned int flags, struct path *lower_parent_path, userid_t id)
+{
+       int err = 0;
+       struct vfsmount *lower_dir_mnt;
+       struct dentry *lower_dir_dentry = NULL;
+       struct dentry *lower_dentry;
+       const struct qstr *name;
+       struct path lower_path;
+       struct qstr dname;
+       struct dentry *ret_dentry = NULL;
+       struct sdcardfs_sb_info *sbi;
+
+       sbi = SDCARDFS_SB(dentry->d_sb);
+       /* must initialize dentry operations */
+       d_set_d_op(dentry, &sdcardfs_ci_dops);
+
+       if (IS_ROOT(dentry))
+               goto out;
+
+       name = &dentry->d_name;
+
+       /* now start the actual lookup procedure */
+       lower_dir_dentry = lower_parent_path->dentry;
+       lower_dir_mnt = lower_parent_path->mnt;
+
+       /* Use vfs_path_lookup to check if the dentry exists or not */
+       err = vfs_path_lookup(lower_dir_dentry, lower_dir_mnt, name->name, 0,
+                               &lower_path);
+       /* no exact match on the lower fs: fall back to a case-insensitive scan */
+       if (err == -ENOENT) {
+               struct file *file;
+               const struct cred *cred = current_cred();
+
+               struct sdcardfs_name_data buffer = {
+                       .ctx.actor = sdcardfs_name_match,
+                       .to_find = name,
+                       .name = __getname(),
+                       .found = false,
+               };
+
+               if (!buffer.name) {
+                       err = -ENOMEM;
+                       goto out;
+               }
+               file = dentry_open(lower_parent_path, O_RDONLY, cred);
+               if (IS_ERR(file)) {
+                       err = PTR_ERR(file);
+                       goto put_name;
+               }
+               err = iterate_dir(file, &buffer.ctx);
+               fput(file);
+               if (err)
+                       goto put_name;
+
+               if (buffer.found)
+                       err = vfs_path_lookup(lower_dir_dentry,
+                                               lower_dir_mnt,
+                                               buffer.name, 0,
+                                               &lower_path);
+               else
+                       err = -ENOENT;
+put_name:
+               __putname(buffer.name);
+       }
+
+       /* no error: handle positive dentries */
+       if (!err) {
+               /* check if the dentry is an obb dentry
+                * if true, the lower_inode must be replaced with
+                * the inode of the graft path
+                */
+
+               if (need_graft_path(dentry)) {
+
+                       /* setup_obb_dentry()
+                        * The lower_path will be stored in the dentry's orig_path,
+                        * and the base obbpath will be copied to the lower_path variable.
+                        * If an error is returned, lower_path is left unchanged.
+                        * Returns -ERRNO on error (0: no error).
+                        */
+                       err = setup_obb_dentry(dentry, &lower_path);
+
+                       if (err) {
+                               /* If the sbi->obbpath is not available, we could
+                                * set up the lower_path with its orig_path instead,
+                                * but the current implementation just returns an error
+                                * because the sdcard daemon also regards this case as
+                                * a lookup failure.
+                                */
+                               pr_info("sdcardfs: base obbpath is not available\n");
+                               sdcardfs_put_reset_orig_path(dentry);
+                               goto out;
+                       }
+               }
+
+               sdcardfs_set_lower_path(dentry, &lower_path);
+               ret_dentry =
+                       __sdcardfs_interpose(dentry, dentry->d_sb, &lower_path, id);
+               if (IS_ERR(ret_dentry)) {
+                       err = PTR_ERR(ret_dentry);
+                        /* path_put underlying path on error */
+                       sdcardfs_put_reset_lower_path(dentry);
+               }
+               goto out;
+       }
+
+       /*
+        * We don't consider ENOENT an error, and we want to return a
+        * negative dentry.
+        */
+       if (err && err != -ENOENT)
+               goto out;
+
+       /* instantiate a new negative dentry */
+       dname.name = name->name;
+       dname.len = name->len;
+
+       /* See if the low-level filesystem might want
+        * to use its own hash
+        */
+       lower_dentry = d_hash_and_lookup(lower_dir_dentry, &dname);
+       if (IS_ERR(lower_dentry))
+               return lower_dentry;
+
+       if (!lower_dentry) {
+               /* We called vfs_path_lookup earlier, and did not get a negative
+                * dentry then. Don't confuse the lower filesystem by forcing
+                * one on it now...
+                */
+               err = -ENOENT;
+               goto out;
+       }
+
+       lower_path.dentry = lower_dentry;
+       lower_path.mnt = mntget(lower_dir_mnt);
+       sdcardfs_set_lower_path(dentry, &lower_path);
+
+       /*
+        * If the intent is to create a file, then don't return an error, so
+        * the VFS will continue the process of making this negative dentry
+        * into a positive one.
+        */
+       if (flags & (LOOKUP_CREATE|LOOKUP_RENAME_TARGET))
+               err = 0;
+
+out:
+       if (err)
+               return ERR_PTR(err);
+       return ret_dentry;
+}
+
+/*
+ * On success:
+ * fills the dentry object with appropriate values and returns NULL
+ * (or the dentry to use).
+ * On failure (== error):
+ * returns an error pointer.
+ *
+ * @dir : parent inode
+ * @dentry : target dentry to look up; we should set each of its fields
+ *          (dentry->d_name is initialized already)
+ * @flags : lookup flags
+ */
+struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+                            unsigned int flags)
+{
+       struct dentry *ret = NULL, *parent;
+       struct path lower_parent_path;
+       int err = 0;
+       const struct cred *saved_cred = NULL;
+
+       parent = dget_parent(dentry);
+
+       if (!check_caller_access_to_name(d_inode(parent), &dentry->d_name)) {
+               ret = ERR_PTR(-EACCES);
+               goto out_err;
+       }
+
+       /* save current_cred and override it */
+       OVERRIDE_CRED_PTR(SDCARDFS_SB(dir->i_sb), saved_cred, SDCARDFS_I(dir));
+
+       sdcardfs_get_lower_path(parent, &lower_parent_path);
+
+       /* allocate dentry private data.  We free it in ->d_release */
+       err = new_dentry_private_data(dentry);
+       if (err) {
+               ret = ERR_PTR(err);
+               goto out;
+       }
+
+       ret = __sdcardfs_lookup(dentry, flags, &lower_parent_path,
+                               SDCARDFS_I(dir)->data->userid);
+       if (IS_ERR(ret))
+               goto out;
+       if (ret)
+               dentry = ret;
+       if (d_inode(dentry)) {
+               fsstack_copy_attr_times(d_inode(dentry),
+                                       sdcardfs_lower_inode(d_inode(dentry)));
+               /* get derived permission */
+               get_derived_permission(parent, dentry);
+               fixup_tmp_permissions(d_inode(dentry));
+               fixup_lower_ownership(dentry, dentry->d_name.name);
+       }
+       /* update parent directory's atime */
+       fsstack_copy_attr_atime(d_inode(parent),
+                               sdcardfs_lower_inode(d_inode(parent)));
+
+out:
+       sdcardfs_put_lower_path(parent, &lower_parent_path);
+       REVERT_CRED(saved_cred);
+out_err:
+       dput(parent);
+       return ret;
+}
diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c
new file mode 100644 (file)
index 0000000..e4fd3fb
--- /dev/null
@@ -0,0 +1,491 @@
+/*
+ * fs/sdcardfs/main.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/parser.h>
+
+enum {
+       Opt_fsuid,
+       Opt_fsgid,
+       Opt_gid,
+       Opt_debug,
+       Opt_mask,
+       Opt_multiuser,
+       Opt_userid,
+       Opt_reserved_mb,
+       Opt_gid_derivation,
+       Opt_default_normal,
+       Opt_err,
+};
+
+static const match_table_t sdcardfs_tokens = {
+       {Opt_fsuid, "fsuid=%u"},
+       {Opt_fsgid, "fsgid=%u"},
+       {Opt_gid, "gid=%u"},
+       {Opt_debug, "debug"},
+       {Opt_mask, "mask=%u"},
+       {Opt_userid, "userid=%d"},
+       {Opt_multiuser, "multiuser"},
+       {Opt_gid_derivation, "derive_gid"},
+       {Opt_default_normal, "default_normal"},
+       {Opt_reserved_mb, "reserved_mb=%u"},
+       {Opt_err, NULL}
+};
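+
+/*
+ * Illustrative mount invocation (the paths and numeric ids below are
+ * examples only, not values mandated by this code):
+ *
+ *   mount -t sdcardfs -o fsuid=1023,fsgid=1023,gid=1015,mask=6,userid=0 \
+ *         /data/media /mnt/runtime/default/emulated
+ */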
+
+static int parse_options(struct super_block *sb, char *options, int silent,
+                               int *debug, struct sdcardfs_vfsmount_options *vfsopts,
+                               struct sdcardfs_mount_options *opts)
+{
+       char *p;
+       substring_t args[MAX_OPT_ARGS];
+       int option;
+
+       /* by default, we use AID_MEDIA_RW as uid, gid */
+       opts->fs_low_uid = AID_MEDIA_RW;
+       opts->fs_low_gid = AID_MEDIA_RW;
+       vfsopts->mask = 0;
+       opts->multiuser = false;
+       opts->fs_user_id = 0;
+       vfsopts->gid = 0;
+       /* by default, 0MB is reserved */
+       opts->reserved_mb = 0;
+       /* by default, gid derivation is off */
+       opts->gid_derivation = false;
+       opts->default_normal = false;
+
+       *debug = 0;
+
+       if (!options)
+               return 0;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+
+               if (!*p)
+                       continue;
+
+               token = match_token(p, sdcardfs_tokens, args);
+
+               switch (token) {
+               case Opt_debug:
+                       *debug = 1;
+                       break;
+               case Opt_fsuid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       opts->fs_low_uid = option;
+                       break;
+               case Opt_fsgid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       opts->fs_low_gid = option;
+                       break;
+               case Opt_gid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       vfsopts->gid = option;
+                       break;
+               case Opt_userid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       opts->fs_user_id = option;
+                       break;
+               case Opt_mask:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       vfsopts->mask = option;
+                       break;
+               case Opt_multiuser:
+                       opts->multiuser = true;
+                       break;
+               case Opt_reserved_mb:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       opts->reserved_mb = option;
+                       break;
+               case Opt_gid_derivation:
+                       opts->gid_derivation = true;
+                       break;
+               case Opt_default_normal:
+                       opts->default_normal = true;
+                       break;
+               /* unknown option */
+               default:
+                       if (!silent)
+                               pr_err("Unrecognized mount option \"%s\" or missing value\n", p);
+                       return -EINVAL;
+               }
+       }
+
+       if (*debug) {
+               pr_info("sdcardfs : options - debug:%d\n", *debug);
+               pr_info("sdcardfs : options - uid:%d\n",
+                                                       opts->fs_low_uid);
+               pr_info("sdcardfs : options - gid:%d\n",
+                                                       opts->fs_low_gid);
+       }
+
+       return 0;
+}
+
+int parse_options_remount(struct super_block *sb, char *options, int silent,
+                               struct sdcardfs_vfsmount_options *vfsopts)
+{
+       char *p;
+       substring_t args[MAX_OPT_ARGS];
+       int option;
+       int debug = 0;
+
+       if (!options)
+               return 0;
+
+       while ((p = strsep(&options, ",")) != NULL) {
+               int token;
+
+               if (!*p)
+                       continue;
+
+               token = match_token(p, sdcardfs_tokens, args);
+
+               switch (token) {
+               case Opt_debug:
+                       debug = 1;
+                       break;
+               case Opt_gid:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       vfsopts->gid = option;
+
+                       break;
+               case Opt_mask:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       vfsopts->mask = option;
+                       break;
+               case Opt_default_normal:
+               case Opt_multiuser:
+               case Opt_userid:
+               case Opt_fsuid:
+               case Opt_fsgid:
+               case Opt_reserved_mb:
+                       pr_warn("Option \"%s\" can't be changed during remount\n", p);
+                       break;
+               /* unknown option */
+               default:
+                       if (!silent)
+                               pr_err("Unrecognized mount option \"%s\" or missing value\n", p);
+                       return -EINVAL;
+               }
+       }
+
+       if (debug) {
+               pr_info("sdcardfs : options - debug:%d\n", debug);
+               pr_info("sdcardfs : options - gid:%d\n", vfsopts->gid);
+               pr_info("sdcardfs : options - mask:%d\n", vfsopts->mask);
+       }
+
+       return 0;
+}
+
+#if 0
+/*
+ * our custom d_alloc_root work-alike
+ *
+ * we can't use d_alloc_root if we want to use our own interpose function
+ * unchanged, so we simply call our own "fake" d_alloc_root
+ */
+static struct dentry *sdcardfs_d_alloc_root(struct super_block *sb)
+{
+       struct dentry *ret = NULL;
+
+       if (sb) {
+               static const struct qstr name = {
+                       .name = "/",
+                       .len = 1
+               };
+
+               ret = d_alloc(NULL, &name);
+               if (ret) {
+                       d_set_d_op(ret, &sdcardfs_ci_dops);
+                       ret->d_sb = sb;
+                       ret->d_parent = ret;
+               }
+       }
+       return ret;
+}
+#endif
+
+DEFINE_MUTEX(sdcardfs_super_list_lock);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list_lock);
+LIST_HEAD(sdcardfs_super_list);
+EXPORT_SYMBOL_GPL(sdcardfs_super_list);
+
+/*
+ * There is no need to lock the sdcardfs_super_info's rwsem as there is no
+ * way anyone can have a reference to the superblock at this point in time.
+ */
+static int sdcardfs_read_super(struct vfsmount *mnt, struct super_block *sb,
+               const char *dev_name, void *raw_data, int silent)
+{
+       int err = 0;
+       int debug;
+       struct super_block *lower_sb;
+       struct path lower_path;
+       struct sdcardfs_sb_info *sb_info;
+       struct sdcardfs_vfsmount_options *mnt_opt = mnt->data;
+       struct inode *inode;
+
+       pr_info("sdcardfs version 2.0\n");
+
+       if (!dev_name) {
+               pr_err("sdcardfs: read_super: missing dev_name argument\n");
+               err = -EINVAL;
+               goto out;
+       }
+
+       pr_info("sdcardfs: dev_name -> %s\n", dev_name);
+       pr_info("sdcardfs: options -> %s\n", (char *)raw_data);
+       pr_info("sdcardfs: mnt -> %p\n", mnt);
+
+       /* parse lower path */
+       err = kern_path(dev_name, LOOKUP_FOLLOW | LOOKUP_DIRECTORY,
+                       &lower_path);
+       if (err) {
+               pr_err("sdcardfs: error accessing lower directory '%s'\n", dev_name);
+               goto out;
+       }
+
+       /* allocate superblock private data */
+       sb->s_fs_info = kzalloc(sizeof(struct sdcardfs_sb_info), GFP_KERNEL);
+       if (!SDCARDFS_SB(sb)) {
+               pr_crit("sdcardfs: read_super: out of memory\n");
+               err = -ENOMEM;
+               goto out_free;
+       }
+
+       sb_info = sb->s_fs_info;
+       /* parse options */
+       err = parse_options(sb, raw_data, silent, &debug, mnt_opt, &sb_info->options);
+       if (err) {
+               pr_err("sdcardfs: invalid options\n");
+               goto out_freesbi;
+       }
+
+       /* set the lower superblock field of upper superblock */
+       lower_sb = lower_path.dentry->d_sb;
+       atomic_inc(&lower_sb->s_active);
+       sdcardfs_set_lower_super(sb, lower_sb);
+
+       /* inherit maxbytes from lower file system */
+       sb->s_maxbytes = lower_sb->s_maxbytes;
+
+       /*
+        * Our c/m/atime granularity is 1 ns because we may stack on file
+        * systems whose granularity is as good.
+        */
+       sb->s_time_gran = 1;
+
+       sb->s_magic = SDCARDFS_SUPER_MAGIC;
+       sb->s_op = &sdcardfs_sops;
+
+       /* get a new inode and allocate our root dentry */
+       inode = sdcardfs_iget(sb, d_inode(lower_path.dentry), 0);
+       if (IS_ERR(inode)) {
+               err = PTR_ERR(inode);
+               goto out_sput;
+       }
+       sb->s_root = d_make_root(inode);
+       if (!sb->s_root) {
+               err = -ENOMEM;
+               goto out_iput;
+       }
+       d_set_d_op(sb->s_root, &sdcardfs_ci_dops);
+
+       /* link the upper and lower dentries */
+       sb->s_root->d_fsdata = NULL;
+       err = new_dentry_private_data(sb->s_root);
+       if (err)
+               goto out_freeroot;
+
+       /* set the lower dentries for s_root */
+       sdcardfs_set_lower_path(sb->s_root, &lower_path);
+
+       /*
+        * No need to call interpose because we already have a positive
+        * dentry, which was instantiated by d_make_root.  Just need to
+        * d_rehash it.
+        */
+       d_rehash(sb->s_root);
+
+       /* setup permission policy */
+       sb_info->obbpath_s = kzalloc(PATH_MAX, GFP_KERNEL);
+       mutex_lock(&sdcardfs_super_list_lock);
+       if (sb_info->options.multiuser) {
+               setup_derived_state(d_inode(sb->s_root), PERM_PRE_ROOT,
+                               sb_info->options.fs_user_id, AID_ROOT);
+               snprintf(sb_info->obbpath_s, PATH_MAX, "%s/obb", dev_name);
+       } else {
+               setup_derived_state(d_inode(sb->s_root), PERM_ROOT,
+                               sb_info->options.fs_user_id, AID_ROOT);
+               snprintf(sb_info->obbpath_s, PATH_MAX, "%s/Android/obb", dev_name);
+       }
+       fixup_tmp_permissions(d_inode(sb->s_root));
+       sb_info->sb = sb;
+       list_add(&sb_info->list, &sdcardfs_super_list);
+       mutex_unlock(&sdcardfs_super_list_lock);
+
+       if (!silent)
+               pr_info("sdcardfs: mounted on top of %s type %s\n",
+                               dev_name, lower_sb->s_type->name);
+       goto out; /* all is well */
+
+       /* no longer needed: free_dentry_private_data(sb->s_root); */
+out_freeroot:
+       dput(sb->s_root);
+out_iput:
+       iput(inode);
+out_sput:
+       /* drop refs we took earlier */
+       atomic_dec(&lower_sb->s_active);
+out_freesbi:
+       kfree(SDCARDFS_SB(sb));
+       sb->s_fs_info = NULL;
+out_free:
+       path_put(&lower_path);
+
+out:
+       return err;
+}
+
+struct sdcardfs_mount_private {
+       struct vfsmount *mnt;
+       const char *dev_name;
+       void *raw_data;
+};
+
+static int __sdcardfs_fill_super(
+       struct super_block *sb,
+       void *_priv, int silent)
+{
+       struct sdcardfs_mount_private *priv = _priv;
+
+       return sdcardfs_read_super(priv->mnt,
+               sb, priv->dev_name, priv->raw_data, silent);
+}
+
+static struct dentry *sdcardfs_mount(struct vfsmount *mnt,
+               struct file_system_type *fs_type, int flags,
+                           const char *dev_name, void *raw_data)
+{
+       struct sdcardfs_mount_private priv = {
+               .mnt = mnt,
+               .dev_name = dev_name,
+               .raw_data = raw_data
+       };
+
+       return mount_nodev(fs_type, flags,
+               &priv, __sdcardfs_fill_super);
+}
+
+static struct dentry *sdcardfs_mount_wrn(struct file_system_type *fs_type,
+                   int flags, const char *dev_name, void *raw_data)
+{
+       WARN(1, "sdcardfs does not support mount. Use mount2.\n");
+       return ERR_PTR(-EINVAL);
+}
+
+void *sdcardfs_alloc_mnt_data(void)
+{
+       return kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+}
+
+void sdcardfs_kill_sb(struct super_block *sb)
+{
+       struct sdcardfs_sb_info *sbi;
+
+       if (sb->s_magic == SDCARDFS_SUPER_MAGIC) {
+               sbi = SDCARDFS_SB(sb);
+               mutex_lock(&sdcardfs_super_list_lock);
+               list_del(&sbi->list);
+               mutex_unlock(&sdcardfs_super_list_lock);
+       }
+       kill_anon_super(sb);
+}
+
+static struct file_system_type sdcardfs_fs_type = {
+       .owner          = THIS_MODULE,
+       .name           = SDCARDFS_NAME,
+       .mount          = sdcardfs_mount_wrn,
+       .mount2         = sdcardfs_mount,
+       .alloc_mnt_data = sdcardfs_alloc_mnt_data,
+       .kill_sb        = sdcardfs_kill_sb,
+       .fs_flags       = 0,
+};
+MODULE_ALIAS_FS(SDCARDFS_NAME);
+
+static int __init init_sdcardfs_fs(void)
+{
+       int err;
+
+       pr_info("Registering sdcardfs " SDCARDFS_VERSION "\n");
+
+       err = sdcardfs_init_inode_cache();
+       if (err)
+               goto out;
+       err = sdcardfs_init_dentry_cache();
+       if (err)
+               goto out;
+       err = packagelist_init();
+       if (err)
+               goto out;
+       err = register_filesystem(&sdcardfs_fs_type);
+out:
+       if (err) {
+               sdcardfs_destroy_inode_cache();
+               sdcardfs_destroy_dentry_cache();
+               packagelist_exit();
+       }
+       return err;
+}
+
+static void __exit exit_sdcardfs_fs(void)
+{
+       sdcardfs_destroy_inode_cache();
+       sdcardfs_destroy_dentry_cache();
+       packagelist_exit();
+       unregister_filesystem(&sdcardfs_fs_type);
+       pr_info("Completed sdcardfs module unload\n");
+}
+
+/* Original wrapfs authors */
+MODULE_AUTHOR("Erez Zadok, Filesystems and Storage Lab, Stony Brook University (http://www.fsl.cs.sunysb.edu/)");
+
+/* Original sdcardfs authors */
+MODULE_AUTHOR("Woojoong Lee, Daeho Jeong, Kitae Lee, Yeongjin Gil System Memory Lab., Samsung Electronics");
+
+/* Current maintainer */
+MODULE_AUTHOR("Daniel Rosenberg, Google");
+MODULE_DESCRIPTION("Sdcardfs " SDCARDFS_VERSION);
+MODULE_LICENSE("GPL");
+
+module_init(init_sdcardfs_fs);
+module_exit(exit_sdcardfs_fs);
diff --git a/fs/sdcardfs/mmap.c b/fs/sdcardfs/mmap.c
new file mode 100644 (file)
index 0000000..2847c0e
--- /dev/null
@@ -0,0 +1,87 @@
+/*
+ * fs/sdcardfs/mmap.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+static int sdcardfs_fault(struct vm_fault *vmf)
+{
+       int err;
+       struct file *file;
+       const struct vm_operations_struct *lower_vm_ops;
+
+       file = (struct file *)vmf->vma->vm_private_data;
+       lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+       BUG_ON(!lower_vm_ops);
+
+       err = lower_vm_ops->fault(vmf);
+       return err;
+}
+
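+/*
+ * Each vma keeps its own reference on the sdcardfs file stashed in
+ * vm_private_data: ->open takes it, ->close drops it, so the file stays
+ * alive for the mapping's lifetime.
+ */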
+static void sdcardfs_vm_open(struct vm_area_struct *vma)
+{
+       struct file *file = (struct file *)vma->vm_private_data;
+
+       get_file(file);
+}
+
+static void sdcardfs_vm_close(struct vm_area_struct *vma)
+{
+       struct file *file = (struct file *)vma->vm_private_data;
+
+       fput(file);
+}
+
+static int sdcardfs_page_mkwrite(struct vm_fault *vmf)
+{
+       int err = 0;
+       struct file *file;
+       const struct vm_operations_struct *lower_vm_ops;
+
+       file = (struct file *)vmf->vma->vm_private_data;
+       lower_vm_ops = SDCARDFS_F(file)->lower_vm_ops;
+       BUG_ON(!lower_vm_ops);
+       if (!lower_vm_ops->page_mkwrite)
+               goto out;
+
+       err = lower_vm_ops->page_mkwrite(vmf);
+out:
+       return err;
+}
+
+static ssize_t sdcardfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+       /*
+        * This function should never be called directly.  We need it
+        * to exist, to get past a check in open_check_o_direct(),
+        * which is called from do_last().
+        */
+       return -EINVAL;
+}
+
+const struct address_space_operations sdcardfs_aops = {
+       .direct_IO      = sdcardfs_direct_IO,
+};
+
+const struct vm_operations_struct sdcardfs_vm_ops = {
+       .fault          = sdcardfs_fault,
+       .page_mkwrite   = sdcardfs_page_mkwrite,
+       .open           = sdcardfs_vm_open,
+       .close          = sdcardfs_vm_close,
+};
diff --git a/fs/sdcardfs/multiuser.h b/fs/sdcardfs/multiuser.h
new file mode 100644 (file)
index 0000000..85341e7
--- /dev/null
@@ -0,0 +1,53 @@
+/*
+ * fs/sdcardfs/multiuser.h
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#define AID_USER_OFFSET     100000 /* offset for uid ranges for each user */
+#define AID_APP_START        10000 /* first app user */
+#define AID_APP_END          19999 /* last app user */
+#define AID_CACHE_GID_START  20000 /* start of gids for apps to mark cached data */
+#define AID_EXT_GID_START    30000 /* start of gids for apps to mark external data */
+#define AID_EXT_CACHE_GID_START 40000 /* start of gids for apps to mark external cached data */
+#define AID_EXT_CACHE_GID_END 49999   /* end of gids for apps to mark external cached data */
+#define AID_SHARED_GID_START 50000 /* start of gids for apps in each user to share */
+
+typedef uid_t userid_t;
+typedef uid_t appid_t;
+
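+/*
+ * Each Android user owns a block of AID_USER_OFFSET (100000) ids.  A worked
+ * example of the mapping below: user 10 running app id 10005 gets
+ * uid 10 * 100000 + (10005 % 100000) = 1010005.
+ */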
+static inline uid_t multiuser_get_uid(userid_t user_id, appid_t app_id)
+{
+       return (user_id * AID_USER_OFFSET) + (app_id % AID_USER_OFFSET);
+}
+
+static inline bool uid_is_app(uid_t uid)
+{
+       appid_t appid = uid % AID_USER_OFFSET;
+
+       return appid >= AID_APP_START && appid <= AID_APP_END;
+}
+
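+/*
+ * The two helpers below shift an app uid into the external-storage gid
+ * ranges, e.g. (illustrative) uid 10005 maps to ext cache gid
+ * 10005 - 10000 + 40000 = 40005 and ext gid 10005 - 10000 + 30000 = 30005.
+ */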
+static inline gid_t multiuser_get_ext_cache_gid(uid_t uid)
+{
+       return uid - AID_APP_START + AID_EXT_CACHE_GID_START;
+}
+
+static inline gid_t multiuser_get_ext_gid(uid_t uid)
+{
+       return uid - AID_APP_START + AID_EXT_GID_START;
+}
diff --git a/fs/sdcardfs/packagelist.c b/fs/sdcardfs/packagelist.c
new file mode 100644 (file)
index 0000000..6da0c21
--- /dev/null
@@ -0,0 +1,881 @@
+/*
+ * fs/sdcardfs/packagelist.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+#include <linux/hashtable.h>
+#include <linux/ctype.h>
+#include <linux/delay.h>
+#include <linux/radix-tree.h>
+#include <linux/dcache.h>
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include <linux/configfs.h>
+
+struct hashtable_entry {
+       struct hlist_node hlist;
+       struct hlist_node dlist; /* for deletion cleanup */
+       struct qstr key;
+       atomic_t value;
+};
+
+static DEFINE_HASHTABLE(package_to_appid, 8);
+static DEFINE_HASHTABLE(package_to_userid, 8);
+static DEFINE_HASHTABLE(ext_to_groupid, 8);
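+/*
+ * package_to_appid maps a package name to its appid, package_to_userid
+ * records the userids for which a package is excluded, and ext_to_groupid
+ * maps a file extension to a single gid (see insert_ext_gid_entry_locked()).
+ */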
+
+
+static struct kmem_cache *hashtable_entry_cachep;
+
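+/* Like full_name_hash(), but folds case so the qstr keys hash case-insensitively. */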
+static unsigned int full_name_case_hash(const void *salt, const unsigned char *name, unsigned int len)
+{
+       unsigned long hash = init_name_hash(salt);
+
+       while (len--)
+               hash = partial_name_hash(tolower(*name++), hash);
+       return end_name_hash(hash);
+}
+
+static inline void qstr_init(struct qstr *q, const char *name)
+{
+       q->name = name;
+       q->len = strlen(q->name);
+       q->hash = full_name_case_hash(0, q->name, q->len);
+}
+
+static inline int qstr_copy(const struct qstr *src, struct qstr *dest)
+{
+       dest->name = kstrdup(src->name, GFP_KERNEL);
+       dest->hash_len = src->hash_len;
+       return !!dest->name;
+}
+
+
+static appid_t __get_appid(const struct qstr *key)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = key->hash;
+       appid_t ret_id;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key)) {
+                       ret_id = atomic_read(&hash_cur->value);
+                       rcu_read_unlock();
+                       return ret_id;
+               }
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
+appid_t get_appid(const char *key)
+{
+       struct qstr q;
+
+       qstr_init(&q, key);
+       return __get_appid(&q);
+}
+
+static appid_t __get_ext_gid(const struct qstr *key)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = key->hash;
+       appid_t ret_id;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key)) {
+                       ret_id = atomic_read(&hash_cur->value);
+                       rcu_read_unlock();
+                       return ret_id;
+               }
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
+appid_t get_ext_gid(const char *key)
+{
+       struct qstr q;
+
+       qstr_init(&q, key);
+       return __get_ext_gid(&q);
+}
+
+static appid_t __is_excluded(const struct qstr *app_name, userid_t user)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = app_name->hash;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+               if (atomic_read(&hash_cur->value) == user &&
+                               qstr_case_eq(app_name, &hash_cur->key)) {
+                       rcu_read_unlock();
+                       return 1;
+               }
+       }
+       rcu_read_unlock();
+       return 0;
+}
+
+appid_t is_excluded(const char *key, userid_t user)
+{
+       struct qstr q;
+
+       qstr_init(&q, key);
+       return __is_excluded(&q, user);
+}
+
+/* Kernel has already enforced everything we returned through
+ * derive_permissions_locked(), so this is used to lock down access
+ * even further, such as enforcing that apps hold sdcard_rw.
+ */
+int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name)
+{
+       struct qstr q_autorun = QSTR_LITERAL("autorun.inf");
+       struct qstr q__android_secure = QSTR_LITERAL(".android_secure");
+       struct qstr q_android_secure = QSTR_LITERAL("android_secure");
+
+       /* Always block security-sensitive files at root */
+       if (parent_node && SDCARDFS_I(parent_node)->data->perm == PERM_ROOT) {
+               if (qstr_case_eq(name, &q_autorun)
+                       || qstr_case_eq(name, &q__android_secure)
+                       || qstr_case_eq(name, &q_android_secure)) {
+                       return 0;
+               }
+       }
+
+       /* Root always has access; access for any other UIDs should always
+        * be controlled through packages.list.
+        */
+       if (from_kuid(&init_user_ns, current_fsuid()) == 0)
+               return 1;
+
+       /* No extra permissions to enforce */
+       return 1;
+}
+
+static struct hashtable_entry *alloc_hashtable_entry(const struct qstr *key,
+               appid_t value)
+{
+       struct hashtable_entry *ret = kmem_cache_alloc(hashtable_entry_cachep,
+                       GFP_KERNEL);
+       if (!ret)
+               return NULL;
+       INIT_HLIST_NODE(&ret->dlist);
+       INIT_HLIST_NODE(&ret->hlist);
+
+       if (!qstr_copy(key, &ret->key)) {
+               kmem_cache_free(hashtable_entry_cachep, ret);
+               return NULL;
+       }
+
+       atomic_set(&ret->value, value);
+       return ret;
+}
+
+static int insert_packagelist_appid_entry_locked(const struct qstr *key, appid_t value)
+{
+       struct hashtable_entry *hash_cur;
+       struct hashtable_entry *new_entry;
+       unsigned int hash = key->hash;
+
+       hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key)) {
+                       atomic_set(&hash_cur->value, value);
+                       return 0;
+               }
+       }
+       new_entry = alloc_hashtable_entry(key, value);
+       if (!new_entry)
+               return -ENOMEM;
+       hash_add_rcu(package_to_appid, &new_entry->hlist, hash);
+       return 0;
+}
+
+static int insert_ext_gid_entry_locked(const struct qstr *key, appid_t value)
+{
+       struct hashtable_entry *hash_cur;
+       struct hashtable_entry *new_entry;
+       unsigned int hash = key->hash;
+
+       /* An extension can only belong to one gid */
+       hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key))
+                       return -EINVAL;
+       }
+       new_entry = alloc_hashtable_entry(key, value);
+       if (!new_entry)
+               return -ENOMEM;
+       hash_add_rcu(ext_to_groupid, &new_entry->hlist, hash);
+       return 0;
+}
+
+static int insert_userid_exclude_entry_locked(const struct qstr *key, userid_t value)
+{
+       struct hashtable_entry *hash_cur;
+       struct hashtable_entry *new_entry;
+       unsigned int hash = key->hash;
+
+       /* Only insert if not already present */
+       hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+               if (atomic_read(&hash_cur->value) == value &&
+                               qstr_case_eq(key, &hash_cur->key))
+                       return 0;
+       }
+       new_entry = alloc_hashtable_entry(key, value);
+       if (!new_entry)
+               return -ENOMEM;
+       hash_add_rcu(package_to_userid, &new_entry->hlist, hash);
+       return 0;
+}
+
+static void fixup_all_perms_name(const struct qstr *key)
+{
+       struct sdcardfs_sb_info *sbinfo;
+       struct limit_search limit = {
+               .flags = BY_NAME,
+               .name = QSTR_INIT(key->name, key->len),
+       };
+       list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+               if (sbinfo_has_sdcard_magic(sbinfo))
+                       fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+       }
+}
+
+static void fixup_all_perms_name_userid(const struct qstr *key, userid_t userid)
+{
+       struct sdcardfs_sb_info *sbinfo;
+       struct limit_search limit = {
+               .flags = BY_NAME | BY_USERID,
+               .name = QSTR_INIT(key->name, key->len),
+               .userid = userid,
+       };
+       list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+               if (sbinfo_has_sdcard_magic(sbinfo))
+                       fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+       }
+}
+
+static void fixup_all_perms_userid(userid_t userid)
+{
+       struct sdcardfs_sb_info *sbinfo;
+       struct limit_search limit = {
+               .flags = BY_USERID,
+               .userid = userid,
+       };
+       list_for_each_entry(sbinfo, &sdcardfs_super_list, list) {
+               if (sbinfo_has_sdcard_magic(sbinfo))
+                       fixup_perms_recursive(sbinfo->sb->s_root, &limit);
+       }
+}
+
+static int insert_packagelist_entry(const struct qstr *key, appid_t value)
+{
+       int err;
+
+       mutex_lock(&sdcardfs_super_list_lock);
+       err = insert_packagelist_appid_entry_locked(key, value);
+       if (!err)
+               fixup_all_perms_name(key);
+       mutex_unlock(&sdcardfs_super_list_lock);
+
+       return err;
+}
+
+static int insert_ext_gid_entry(const struct qstr *key, appid_t value)
+{
+       int err;
+
+       mutex_lock(&sdcardfs_super_list_lock);
+       err = insert_ext_gid_entry_locked(key, value);
+       mutex_unlock(&sdcardfs_super_list_lock);
+
+       return err;
+}
+
+static int insert_userid_exclude_entry(const struct qstr *key, userid_t value)
+{
+       int err;
+
+       mutex_lock(&sdcardfs_super_list_lock);
+       err = insert_userid_exclude_entry_locked(key, value);
+       if (!err)
+               fixup_all_perms_name_userid(key, value);
+       mutex_unlock(&sdcardfs_super_list_lock);
+
+       return err;
+}
+
+static void free_hashtable_entry(struct hashtable_entry *entry)
+{
+       kfree(entry->key.name);
+       kmem_cache_free(hashtable_entry_cachep, entry);
+}
+
+static void remove_packagelist_entry_locked(const struct qstr *key)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = key->hash;
+       struct hlist_node *h_t;
+       HLIST_HEAD(free_list);
+
+       hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key)) {
+                       hash_del_rcu(&hash_cur->hlist);
+                       hlist_add_head(&hash_cur->dlist, &free_list);
+               }
+       }
+       hash_for_each_possible_rcu(package_to_appid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key)) {
+                       hash_del_rcu(&hash_cur->hlist);
+                       hlist_add_head(&hash_cur->dlist, &free_list);
+                       break;
+               }
+       }
+       synchronize_rcu();
+       hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+               free_hashtable_entry(hash_cur);
+}
+
+static void remove_packagelist_entry(const struct qstr *key)
+{
+       mutex_lock(&sdcardfs_super_list_lock);
+       remove_packagelist_entry_locked(key);
+       fixup_all_perms_name(key);
+       mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_ext_gid_entry_locked(const struct qstr *key, gid_t group)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = key->hash;
+
+       hash_for_each_possible_rcu(ext_to_groupid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key) && atomic_read(&hash_cur->value) == group) {
+                       hash_del_rcu(&hash_cur->hlist);
+                       synchronize_rcu();
+                       free_hashtable_entry(hash_cur);
+                       break;
+               }
+       }
+}
+
+static void remove_ext_gid_entry(const struct qstr *key, gid_t group)
+{
+       mutex_lock(&sdcardfs_super_list_lock);
+       remove_ext_gid_entry_locked(key, group);
+       mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_all_entry_locked(userid_t userid)
+{
+       struct hashtable_entry *hash_cur;
+       struct hlist_node *h_t;
+       HLIST_HEAD(free_list);
+       int i;
+
+       hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+               if (atomic_read(&hash_cur->value) == userid) {
+                       hash_del_rcu(&hash_cur->hlist);
+                       hlist_add_head(&hash_cur->dlist, &free_list);
+               }
+       }
+       synchronize_rcu();
+       hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist) {
+               free_hashtable_entry(hash_cur);
+       }
+}
+
+static void remove_userid_all_entry(userid_t userid)
+{
+       mutex_lock(&sdcardfs_super_list_lock);
+       remove_userid_all_entry_locked(userid);
+       fixup_all_perms_userid(userid);
+       mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void remove_userid_exclude_entry_locked(const struct qstr *key, userid_t userid)
+{
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = key->hash;
+
+       hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(key, &hash_cur->key) &&
+                               atomic_read(&hash_cur->value) == userid) {
+                       hash_del_rcu(&hash_cur->hlist);
+                       synchronize_rcu();
+                       free_hashtable_entry(hash_cur);
+                       break;
+               }
+       }
+}
+
+static void remove_userid_exclude_entry(const struct qstr *key, userid_t userid)
+{
+       mutex_lock(&sdcardfs_super_list_lock);
+       remove_userid_exclude_entry_locked(key, userid);
+       fixup_all_perms_name_userid(key, userid);
+       mutex_unlock(&sdcardfs_super_list_lock);
+}
+
+static void packagelist_destroy(void)
+{
+       struct hashtable_entry *hash_cur;
+       struct hlist_node *h_t;
+       HLIST_HEAD(free_list);
+       int i;
+
+       mutex_lock(&sdcardfs_super_list_lock);
+       hash_for_each_rcu(package_to_appid, i, hash_cur, hlist) {
+               hash_del_rcu(&hash_cur->hlist);
+               hlist_add_head(&hash_cur->dlist, &free_list);
+       }
+       hash_for_each_rcu(package_to_userid, i, hash_cur, hlist) {
+               hash_del_rcu(&hash_cur->hlist);
+               hlist_add_head(&hash_cur->dlist, &free_list);
+       }
+       synchronize_rcu();
+       hlist_for_each_entry_safe(hash_cur, h_t, &free_list, dlist)
+               free_hashtable_entry(hash_cur);
+       mutex_unlock(&sdcardfs_super_list_lock);
+       pr_info("sdcardfs: destroyed packagelist pkgld\n");
+}
+
+#define SDCARDFS_CONFIGFS_ATTR(_pfx, _name)                    \
+static struct configfs_attribute _pfx##attr_##_name = {        \
+       .ca_name        = __stringify(_name),           \
+       .ca_mode        = S_IRUGO | S_IWUGO,            \
+       .ca_owner       = THIS_MODULE,                  \
+       .show           = _pfx##_name##_show,           \
+       .store          = _pfx##_name##_store,          \
+}
+
+#define SDCARDFS_CONFIGFS_ATTR_RO(_pfx, _name)                 \
+static struct configfs_attribute _pfx##attr_##_name = {        \
+       .ca_name        = __stringify(_name),           \
+       .ca_mode        = S_IRUGO,                      \
+       .ca_owner       = THIS_MODULE,                  \
+       .show           = _pfx##_name##_show,           \
+}
+
+#define SDCARDFS_CONFIGFS_ATTR_WO(_pfx, _name)                 \
+static struct configfs_attribute _pfx##attr_##_name = {        \
+       .ca_name        = __stringify(_name),           \
+       .ca_mode        = S_IWUGO,                      \
+       .ca_owner       = THIS_MODULE,                  \
+       .store          = _pfx##_name##_store,          \
+}
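
For reference, here is what one of these helper macros expands to; SDCARDFS_CONFIGFS_ATTR(package_details_, appid), used further below, generates approximately the following (editorial illustration, not part of the patch):

static struct configfs_attribute package_details_attr_appid = {
	.ca_name	= "appid",
	.ca_mode	= S_IRUGO | S_IWUGO,
	.ca_owner	= THIS_MODULE,
	.show		= package_details_appid_show,
	.store		= package_details_appid_store,
};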
+
+struct package_details {
+       struct config_item item;
+       struct qstr name;
+};
+
+static inline struct package_details *to_package_details(struct config_item *item)
+{
+       return item ? container_of(item, struct package_details, item) : NULL;
+}
+
+static ssize_t package_details_appid_show(struct config_item *item, char *page)
+{
+       return scnprintf(page, PAGE_SIZE, "%u\n", __get_appid(&to_package_details(item)->name));
+}
+
+static ssize_t package_details_appid_store(struct config_item *item,
+                                      const char *page, size_t count)
+{
+       unsigned int tmp;
+       int ret;
+
+       ret = kstrtouint(page, 10, &tmp);
+       if (ret)
+               return ret;
+
+       ret = insert_packagelist_entry(&to_package_details(item)->name, tmp);
+
+       if (ret)
+               return ret;
+
+       return count;
+}
+
+static ssize_t package_details_excluded_userids_show(struct config_item *item,
+                                     char *page)
+{
+       struct package_details *package_details = to_package_details(item);
+       struct hashtable_entry *hash_cur;
+       unsigned int hash = package_details->name.hash;
+       int count = 0;
+
+       rcu_read_lock();
+       hash_for_each_possible_rcu(package_to_userid, hash_cur, hlist, hash) {
+               if (qstr_case_eq(&package_details->name, &hash_cur->key))
+                       count += scnprintf(page + count, PAGE_SIZE - count,
+                                       "%d ", atomic_read(&hash_cur->value));
+       }
+       rcu_read_unlock();
+       if (count)
+               count--;
+       count += scnprintf(page + count, PAGE_SIZE - count, "\n");
+       return count;
+}
+
+static ssize_t package_details_excluded_userids_store(struct config_item *item,
+                                      const char *page, size_t count)
+{
+       unsigned int tmp;
+       int ret;
+
+       ret = kstrtouint(page, 10, &tmp);
+       if (ret)
+               return ret;
+
+       ret = insert_userid_exclude_entry(&to_package_details(item)->name, tmp);
+
+       if (ret)
+               return ret;
+
+       return count;
+}
+
+static ssize_t package_details_clear_userid_store(struct config_item *item,
+                                      const char *page, size_t count)
+{
+       unsigned int tmp;
+       int ret;
+
+       ret = kstrtouint(page, 10, &tmp);
+       if (ret)
+               return ret;
+       remove_userid_exclude_entry(&to_package_details(item)->name, tmp);
+       return count;
+}
+
+static void package_details_release(struct config_item *item)
+{
+       struct package_details *package_details = to_package_details(item);
+
+       pr_info("sdcardfs: removing %s\n", package_details->name.name);
+       remove_packagelist_entry(&package_details->name);
+       kfree(package_details->name.name);
+       kfree(package_details);
+}
+
+SDCARDFS_CONFIGFS_ATTR(package_details_, appid);
+SDCARDFS_CONFIGFS_ATTR(package_details_, excluded_userids);
+SDCARDFS_CONFIGFS_ATTR_WO(package_details_, clear_userid);
+
+static struct configfs_attribute *package_details_attrs[] = {
+       &package_details_attr_appid,
+       &package_details_attr_excluded_userids,
+       &package_details_attr_clear_userid,
+       NULL,
+};
+
+static struct configfs_item_operations package_details_item_ops = {
+       .release = package_details_release,
+};
+
+static struct config_item_type package_appid_type = {
+       .ct_item_ops    = &package_details_item_ops,
+       .ct_attrs       = package_details_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
+struct extensions_value {
+       struct config_group group;
+       unsigned int num;
+};
+
+struct extension_details {
+       struct config_item item;
+       struct qstr name;
+       unsigned int num;
+};
+
+static inline struct extensions_value *to_extensions_value(struct config_item *item)
+{
+       return item ? container_of(to_config_group(item), struct extensions_value, group) : NULL;
+}
+
+static inline struct extension_details *to_extension_details(struct config_item *item)
+{
+       return item ? container_of(item, struct extension_details, item) : NULL;
+}
+
+static void extension_details_release(struct config_item *item)
+{
+       struct extension_details *extension_details = to_extension_details(item);
+
+       pr_info("sdcardfs: No longer mapping %s files to gid %d\n",
+                       extension_details->name.name, extension_details->num);
+       remove_ext_gid_entry(&extension_details->name, extension_details->num);
+       kfree(extension_details->name.name);
+       kfree(extension_details);
+}
+
+static struct configfs_item_operations extension_details_item_ops = {
+       .release = extension_details_release,
+};
+
+static struct config_item_type extension_details_type = {
+       .ct_item_ops = &extension_details_item_ops,
+       .ct_owner = THIS_MODULE,
+};
+
+static struct config_item *extension_details_make_item(struct config_group *group, const char *name)
+{
+       struct extensions_value *extensions_value = to_extensions_value(&group->cg_item);
+       struct extension_details *extension_details = kzalloc(sizeof(struct extension_details), GFP_KERNEL);
+       const char *tmp;
+       int ret;
+
+       if (!extension_details)
+               return ERR_PTR(-ENOMEM);
+
+       tmp = kstrdup(name, GFP_KERNEL);
+       if (!tmp) {
+               kfree(extension_details);
+               return ERR_PTR(-ENOMEM);
+       }
+       qstr_init(&extension_details->name, tmp);
+       ret = insert_ext_gid_entry(&extension_details->name, extensions_value->num);
+
+       if (ret) {
+               kfree(extension_details->name.name);
+               kfree(extension_details);
+               return ERR_PTR(ret);
+       }
+       config_item_init_type_name(&extension_details->item, name, &extension_details_type);
+
+       return &extension_details->item;
+}
+
+static struct configfs_group_operations extensions_value_group_ops = {
+       .make_item = extension_details_make_item,
+};
+
+static struct config_item_type extensions_name_type = {
+       .ct_group_ops   = &extensions_value_group_ops,
+       .ct_owner       = THIS_MODULE,
+};
+
+static struct config_group *extensions_make_group(struct config_group *group, const char *name)
+{
+       struct extensions_value *extensions_value;
+       unsigned int tmp;
+       int ret;
+
+       extensions_value = kzalloc(sizeof(struct extensions_value), GFP_KERNEL);
+       if (!extensions_value)
+               return ERR_PTR(-ENOMEM);
+       ret = kstrtouint(name, 10, &tmp);
+       if (ret) {
+               kfree(extensions_value);
+               return ERR_PTR(ret);
+       }
+
+       extensions_value->num = tmp;
+       config_group_init_type_name(&extensions_value->group, name,
+                                               &extensions_name_type);
+       return &extensions_value->group;
+}
+
+static void extensions_drop_group(struct config_group *group, struct config_item *item)
+{
+       struct extensions_value *value = to_extensions_value(item);
+
+       pr_info("sdcardfs: No longer mapping any files to gid %d\n", value->num);
+       kfree(value);
+}
+
+static struct configfs_group_operations extensions_group_ops = {
+       .make_group     = extensions_make_group,
+       .drop_item      = extensions_drop_group,
+};
+
+static struct config_item_type extensions_type = {
+       .ct_group_ops   = &extensions_group_ops,
+       .ct_owner       = THIS_MODULE,
+};
+
+struct config_group extension_group = {
+       .cg_item = {
+               .ci_namebuf = "extensions",
+               .ci_type = &extensions_type,
+       },
+};
+
+static struct config_item *packages_make_item(struct config_group *group, const char *name)
+{
+       struct package_details *package_details;
+       const char *tmp;
+
+       package_details = kzalloc(sizeof(struct package_details), GFP_KERNEL);
+       if (!package_details)
+               return ERR_PTR(-ENOMEM);
+       tmp = kstrdup(name, GFP_KERNEL);
+       if (!tmp) {
+               kfree(package_details);
+               return ERR_PTR(-ENOMEM);
+       }
+       qstr_init(&package_details->name, tmp);
+       config_item_init_type_name(&package_details->item, name,
+                                               &package_appid_type);
+
+       return &package_details->item;
+}
+
+static ssize_t packages_list_show(struct config_item *item, char *page)
+{
+       struct hashtable_entry *hash_cur_app;
+       struct hashtable_entry *hash_cur_user;
+       int i;
+       int count = 0, written = 0;
+       const char errormsg[] = "<truncated>\n";
+       unsigned int hash;
+
+       rcu_read_lock();
+       hash_for_each_rcu(package_to_appid, i, hash_cur_app, hlist) {
+               written = scnprintf(page + count, PAGE_SIZE - sizeof(errormsg) - count, "%s %d\n",
+                                       hash_cur_app->key.name, atomic_read(&hash_cur_app->value));
+               hash = hash_cur_app->key.hash;
+               hash_for_each_possible_rcu(package_to_userid, hash_cur_user, hlist, hash) {
+                       if (qstr_case_eq(&hash_cur_app->key, &hash_cur_user->key)) {
+                               written += scnprintf(page + count + written - 1,
+                                       PAGE_SIZE - sizeof(errormsg) - count - written + 1,
+                                       " %d\n", atomic_read(&hash_cur_user->value)) - 1;
+                       }
+               }
+               if (count + written == PAGE_SIZE - sizeof(errormsg) - 1) {
+                       count += scnprintf(page + count, PAGE_SIZE - count, errormsg);
+                       break;
+               }
+               count += written;
+       }
+       rcu_read_unlock();
+
+       return count;
+}
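
Each line of the resulting packages_gid.list output therefore has the form "<package> <appid>" followed by any excluded userids registered for that package, for example (values hypothetical): "com.example.app 10012 0 10".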
+
+static ssize_t packages_remove_userid_store(struct config_item *item,
+                                      const char *page, size_t count)
+{
+       unsigned int tmp;
+       int ret;
+
+       ret = kstrtouint(page, 10, &tmp);
+       if (ret)
+               return ret;
+       remove_userid_all_entry(tmp);
+       return count;
+}
+
+static struct configfs_attribute packages_attr_packages_gid_list = {
+       .ca_name        = "packages_gid.list",
+       .ca_mode        = S_IRUGO,
+       .ca_owner       = THIS_MODULE,
+       .show           = packages_list_show,
+};
+
+SDCARDFS_CONFIGFS_ATTR_WO(packages_, remove_userid);
+
+static struct configfs_attribute *packages_attrs[] = {
+       &packages_attr_packages_gid_list,
+       &packages_attr_remove_userid,
+       NULL,
+};
+
+/*
+ * Note that, since no extra work is required on ->drop_item(),
+ * no ->drop_item() is provided.
+ */
+static struct configfs_group_operations packages_group_ops = {
+       .make_item      = packages_make_item,
+};
+
+static struct config_item_type packages_type = {
+       .ct_group_ops   = &packages_group_ops,
+       .ct_attrs       = packages_attrs,
+       .ct_owner       = THIS_MODULE,
+};
+
+struct config_group *sd_default_groups[] = {
+       &extension_group,
+       NULL,
+};
+
+static struct configfs_subsystem sdcardfs_packages = {
+       .su_group = {
+               .cg_item = {
+                       .ci_namebuf = "sdcardfs",
+                       .ci_type = &packages_type,
+               },
+       },
+};
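
Putting the pieces together, the configfs tree exposed by this subsystem looks roughly as follows (editorial sketch; it assumes configfs is mounted at /config, and the package and extension names are illustrative):

/config/sdcardfs/
	packages_gid.list         read-only, rendered by packages_list_show()
	remove_userid             write-only, drops all entries for a userid
	extensions/<gid>/<ext>    maps a file extension to a gid
	<package>/appid           per-package appid mapping
	<package>/excluded_userids
	<package>/clear_userid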
+
+static int configfs_sdcardfs_init(void)
+{
+       int ret, i;
+       struct configfs_subsystem *subsys = &sdcardfs_packages;
+
+       config_group_init(&subsys->su_group);
+       for (i = 0; sd_default_groups[i]; i++) {
+               config_group_init(sd_default_groups[i]);
+               configfs_add_default_group(sd_default_groups[i], &subsys->su_group);
+       }
+       mutex_init(&subsys->su_mutex);
+       ret = configfs_register_subsystem(subsys);
+       if (ret) {
+               pr_err("Error %d while registering subsystem %s\n",
+                      ret,
+                      subsys->su_group.cg_item.ci_namebuf);
+       }
+       return ret;
+}
+
+static void configfs_sdcardfs_exit(void)
+{
+       configfs_unregister_subsystem(&sdcardfs_packages);
+}
+
+int packagelist_init(void)
+{
+       hashtable_entry_cachep =
+               kmem_cache_create("packagelist_hashtable_entry",
+                                       sizeof(struct hashtable_entry), 0, 0, NULL);
+       if (!hashtable_entry_cachep) {
+               pr_err("sdcardfs: failed creating pkgl_hashtable entry slab cache\n");
+               return -ENOMEM;
+       }
+
+       configfs_sdcardfs_init();
+       return 0;
+}
+
+void packagelist_exit(void)
+{
+       configfs_sdcardfs_exit();
+       packagelist_destroy();
+       kmem_cache_destroy(hashtable_entry_cachep);
+}
diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h
new file mode 100644 (file)
index 0000000..610466a
--- /dev/null
@@ -0,0 +1,677 @@
+/*
+ * fs/sdcardfs/sdcardfs.h
+ *
+ * The sdcardfs v2.0
+ *   This file system replaces the sdcard daemon on Android
+ *   On version 2.0, some of the daemon functions have been ported
+ *   to support the multi-user concepts of Android 4.4
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS, which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#ifndef _SDCARDFS_H_
+#define _SDCARDFS_H_
+
+#include <linux/dcache.h>
+#include <linux/file.h>
+#include <linux/fs.h>
+#include <linux/aio.h>
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/mount.h>
+#include <linux/namei.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/fs_stack.h>
+#include <linux/magic.h>
+#include <linux/uaccess.h>
+#include <linux/slab.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/security.h>
+#include <linux/string.h>
+#include <linux/list.h>
+#include "multiuser.h"
+
+/* the file system name */
+#define SDCARDFS_NAME "sdcardfs"
+
+/* sdcardfs root inode number */
+#define SDCARDFS_ROOT_INO     1
+
+/* useful for tracking code reachability */
+#define UDBG pr_default("DBG:%s:%s:%d\n", __FILE__, __func__, __LINE__)
+
+#define SDCARDFS_DIRENT_SIZE 256
+
+/* temporary static uid settings for development */
+#define AID_ROOT             0 /* uid for accessing /mnt/sdcard & extSdcard */
+#define AID_MEDIA_RW      1023 /* internal media storage write access */
+
+#define AID_SDCARD_RW     1015 /* external storage write access */
+#define AID_SDCARD_R      1028 /* external storage read access */
+#define AID_SDCARD_PICS   1033 /* external storage photos access */
+#define AID_SDCARD_AV     1034 /* external storage audio/video access */
+#define AID_SDCARD_ALL    1035 /* access all users external storage */
+#define AID_MEDIA_OBB     1059  /* obb files */
+
+#define AID_SDCARD_IMAGE  1057
+
+#define AID_PACKAGE_INFO  1027
+
+
+/*
+ * Permissions are handled by our permission function.
+ * We don't want anyone who happens to look at our inode value to prematurely
+ * block access, so store more permissive values. These are probably never
+ * used.
+ */
+#define fixup_tmp_permissions(x)       \
+       do {                                            \
+               (x)->i_uid = make_kuid(&init_user_ns,   \
+                               SDCARDFS_I(x)->data->d_uid);    \
+               (x)->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);   \
+               (x)->i_mode = ((x)->i_mode & S_IFMT) | 0775;\
+       } while (0)
+
+/* OVERRIDE_CRED() and REVERT_CRED()
+ *     OVERRIDE_CRED()
+ *             backs up the original task->cred
+ *             and modifies task->cred->fsuid/fsgid to the specified values.
+ *     REVERT_CRED()
+ *             restores the original task->cred->fsuid/fsgid.
+ * These two macros should be used as a pair, and OVERRIDE_CRED() should be
+ * placed at the beginning of a function, right after the variable declarations.
+ */
+#define OVERRIDE_CRED(sdcardfs_sbi, saved_cred, info)          \
+       do {    \
+               saved_cred = override_fsids(sdcardfs_sbi, info->data);  \
+               if (!saved_cred)        \
+                       return -ENOMEM; \
+       } while (0)
+
+#define OVERRIDE_CRED_PTR(sdcardfs_sbi, saved_cred, info)      \
+       do {    \
+               saved_cred = override_fsids(sdcardfs_sbi, info->data);  \
+               if (!saved_cred)        \
+                       return ERR_PTR(-ENOMEM);        \
+       } while (0)
+
+#define REVERT_CRED(saved_cred)        revert_fsids(saved_cred)
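
A minimal usage sketch (editorial, not part of the patch; the function name is hypothetical) showing how the two macros are intended to pair inside an int-returning sdcardfs operation:

static int sdcardfs_example_op(struct dentry *dentry)
{
	const struct cred *saved_cred = NULL;
	struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);

	/* switches fsuid/fsgid; returns -ENOMEM from this function on failure */
	OVERRIDE_CRED(sbi, saved_cred, SDCARDFS_I(d_inode(dentry)));
	/* ... operate on the lower filesystem with the overridden credentials ... */
	REVERT_CRED(saved_cred);
	return 0;
}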
+
+/* Android 5.0 support */
+
+/* Permission mode for a specific node. Controls how file permissions
+ * are derived for child nodes.
+ */
+typedef enum {
+       /* Nothing special; this node should just inherit from its parent. */
+       PERM_INHERIT,
+       /* This node is one level above a normal root; used for legacy layouts
+        * which use the first level to represent user_id.
+        */
+       PERM_PRE_ROOT,
+       /* This node is "/" */
+       PERM_ROOT,
+       /* This node is "/Android" */
+       PERM_ANDROID,
+       /* This node is "/Android/data" */
+       PERM_ANDROID_DATA,
+       /* This node is "/Android/obb" */
+       PERM_ANDROID_OBB,
+       /* This node is "/Android/media" */
+       PERM_ANDROID_MEDIA,
+       /* This node is "/Android/[data|media|obb]/[package]" */
+       PERM_ANDROID_PACKAGE,
+       /* This node is "/Android/[data|media|obb]/[package]/cache" */
+       PERM_ANDROID_PACKAGE_CACHE,
+} perm_t;
+
+struct sdcardfs_sb_info;
+struct sdcardfs_mount_options;
+struct sdcardfs_inode_info;
+struct sdcardfs_inode_data;
+
+/* Do not directly use this function. Use OVERRIDE_CRED() instead. */
+const struct cred *override_fsids(struct sdcardfs_sb_info *sbi,
+                       struct sdcardfs_inode_data *data);
+/* Do not directly use this function, use REVERT_CRED() instead. */
+void revert_fsids(const struct cred *old_cred);
+
+/* operations vectors defined in specific files */
+extern const struct file_operations sdcardfs_main_fops;
+extern const struct file_operations sdcardfs_dir_fops;
+extern const struct inode_operations sdcardfs_main_iops;
+extern const struct inode_operations sdcardfs_dir_iops;
+extern const struct inode_operations sdcardfs_symlink_iops;
+extern const struct super_operations sdcardfs_sops;
+extern const struct dentry_operations sdcardfs_ci_dops;
+extern const struct address_space_operations sdcardfs_aops, sdcardfs_dummy_aops;
+extern const struct vm_operations_struct sdcardfs_vm_ops;
+
+extern int sdcardfs_init_inode_cache(void);
+extern void sdcardfs_destroy_inode_cache(void);
+extern int sdcardfs_init_dentry_cache(void);
+extern void sdcardfs_destroy_dentry_cache(void);
+extern int new_dentry_private_data(struct dentry *dentry);
+extern void free_dentry_private_data(struct dentry *dentry);
+extern struct dentry *sdcardfs_lookup(struct inode *dir, struct dentry *dentry,
+                               unsigned int flags);
+extern struct inode *sdcardfs_iget(struct super_block *sb,
+                                struct inode *lower_inode, userid_t id);
+extern int sdcardfs_interpose(struct dentry *dentry, struct super_block *sb,
+                           struct path *lower_path, userid_t id);
+
+/* file private data */
+struct sdcardfs_file_info {
+       struct file *lower_file;
+       const struct vm_operations_struct *lower_vm_ops;
+};
+
+struct sdcardfs_inode_data {
+       struct kref refcount;
+       bool abandoned;
+
+       perm_t perm;
+       userid_t userid;
+       uid_t d_uid;
+       bool under_android;
+       bool under_cache;
+       bool under_obb;
+};
+
+/* sdcardfs inode data in memory */
+struct sdcardfs_inode_info {
+       struct inode *lower_inode;
+       /* state derived based on current position in hierarchy */
+       struct sdcardfs_inode_data *data;
+
+       /* top folder for ownership */
+       spinlock_t top_lock;
+       struct sdcardfs_inode_data *top_data;
+
+       struct inode vfs_inode;
+};
+
+
+/* sdcardfs dentry data in memory */
+struct sdcardfs_dentry_info {
+       spinlock_t lock;        /* protects lower_path */
+       struct path lower_path;
+       struct path orig_path;
+};
+
+struct sdcardfs_mount_options {
+       uid_t fs_low_uid;
+       gid_t fs_low_gid;
+       userid_t fs_user_id;
+       bool multiuser;
+       bool gid_derivation;
+       bool default_normal;
+       unsigned int reserved_mb;
+};
+
+struct sdcardfs_vfsmount_options {
+       gid_t gid;
+       mode_t mask;
+};
+
+extern int parse_options_remount(struct super_block *sb, char *options, int silent,
+               struct sdcardfs_vfsmount_options *vfsopts);
+
+/* sdcardfs super-block data in memory */
+struct sdcardfs_sb_info {
+       struct super_block *sb;
+       struct super_block *lower_sb;
+       /* derived perm policy: some of the options have been added
+        * to sdcardfs_mount_options (Android 4.4 support)
+        */
+       struct sdcardfs_mount_options options;
+       spinlock_t lock;        /* protects obbpath */
+       char *obbpath_s;
+       struct path obbpath;
+       void *pkgl_id;
+       struct list_head list;
+};
+
+/*
+ * inode to private data
+ *
+ * Since we use containers and the struct inode is _inside_ the
+ * sdcardfs_inode_info structure, SDCARDFS_I will always return a valid
+ * non-NULL pointer, given a non-NULL inode pointer.
+ */
+static inline struct sdcardfs_inode_info *SDCARDFS_I(const struct inode *inode)
+{
+       return container_of(inode, struct sdcardfs_inode_info, vfs_inode);
+}
+
+/* dentry to private data */
+#define SDCARDFS_D(dent) ((struct sdcardfs_dentry_info *)(dent)->d_fsdata)
+
+/* superblock to private data */
+#define SDCARDFS_SB(super) ((struct sdcardfs_sb_info *)(super)->s_fs_info)
+
+/* file to private Data */
+#define SDCARDFS_F(file) ((struct sdcardfs_file_info *)((file)->private_data))
+
+/* file to lower file */
+static inline struct file *sdcardfs_lower_file(const struct file *f)
+{
+       return SDCARDFS_F(f)->lower_file;
+}
+
+static inline void sdcardfs_set_lower_file(struct file *f, struct file *val)
+{
+       SDCARDFS_F(f)->lower_file = val;
+}
+
+/* inode to lower inode. */
+static inline struct inode *sdcardfs_lower_inode(const struct inode *i)
+{
+       return SDCARDFS_I(i)->lower_inode;
+}
+
+static inline void sdcardfs_set_lower_inode(struct inode *i, struct inode *val)
+{
+       SDCARDFS_I(i)->lower_inode = val;
+}
+
+/* superblock to lower superblock */
+static inline struct super_block *sdcardfs_lower_super(
+       const struct super_block *sb)
+{
+       return SDCARDFS_SB(sb)->lower_sb;
+}
+
+static inline void sdcardfs_set_lower_super(struct super_block *sb,
+                                         struct super_block *val)
+{
+       SDCARDFS_SB(sb)->lower_sb = val;
+}
+
+/* path based (dentry/mnt) macros */
+static inline void pathcpy(struct path *dst, const struct path *src)
+{
+       dst->dentry = src->dentry;
+       dst->mnt = src->mnt;
+}
+
+/* The sdcardfs_get_##pname functions call path_get();
+ * therefore, the caller must call the corresponding path_put functions.
+ */
+#define SDCARDFS_DENT_FUNC(pname) \
+static inline void sdcardfs_get_##pname(const struct dentry *dent, \
+                                       struct path *pname) \
+{ \
+       spin_lock(&SDCARDFS_D(dent)->lock); \
+       pathcpy(pname, &SDCARDFS_D(dent)->pname); \
+       path_get(pname); \
+       spin_unlock(&SDCARDFS_D(dent)->lock); \
+       return; \
+} \
+static inline void sdcardfs_put_##pname(const struct dentry *dent, \
+                                       struct path *pname) \
+{ \
+       path_put(pname); \
+       return; \
+} \
+static inline void sdcardfs_set_##pname(const struct dentry *dent, \
+                                       struct path *pname) \
+{ \
+       spin_lock(&SDCARDFS_D(dent)->lock); \
+       pathcpy(&SDCARDFS_D(dent)->pname, pname); \
+       spin_unlock(&SDCARDFS_D(dent)->lock); \
+       return; \
+} \
+static inline void sdcardfs_reset_##pname(const struct dentry *dent) \
+{ \
+       spin_lock(&SDCARDFS_D(dent)->lock); \
+       SDCARDFS_D(dent)->pname.dentry = NULL; \
+       SDCARDFS_D(dent)->pname.mnt = NULL; \
+       spin_unlock(&SDCARDFS_D(dent)->lock); \
+       return; \
+} \
+static inline void sdcardfs_put_reset_##pname(const struct dentry *dent) \
+{ \
+       struct path pname; \
+       spin_lock(&SDCARDFS_D(dent)->lock); \
+       if (SDCARDFS_D(dent)->pname.dentry) { \
+               pathcpy(&pname, &SDCARDFS_D(dent)->pname); \
+               SDCARDFS_D(dent)->pname.dentry = NULL; \
+               SDCARDFS_D(dent)->pname.mnt = NULL; \
+               spin_unlock(&SDCARDFS_D(dent)->lock); \
+               path_put(&pname); \
+       } else \
+               spin_unlock(&SDCARDFS_D(dent)->lock); \
+       return; \
+}
+
+SDCARDFS_DENT_FUNC(lower_path)
+SDCARDFS_DENT_FUNC(orig_path)
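
For example, SDCARDFS_DENT_FUNC(lower_path) above generates sdcardfs_get_lower_path(), sdcardfs_put_lower_path(), sdcardfs_set_lower_path(), sdcardfs_reset_lower_path() and sdcardfs_put_reset_lower_path(). A sketch of the expected get/put pairing (editorial, helper name hypothetical):

static void example_peek_lower(struct dentry *dentry)
{
	struct path lower_path;

	sdcardfs_get_lower_path(dentry, &lower_path);	/* takes a path_get() reference */
	/* ... use lower_path.dentry / lower_path.mnt ... */
	sdcardfs_put_lower_path(dentry, &lower_path);	/* drops the reference */
}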
+
+static inline bool sbinfo_has_sdcard_magic(struct sdcardfs_sb_info *sbinfo)
+{
+       return sbinfo && sbinfo->sb
+                       && sbinfo->sb->s_magic == SDCARDFS_SUPER_MAGIC;
+}
+
+static inline struct sdcardfs_inode_data *data_get(
+               struct sdcardfs_inode_data *data)
+{
+       if (data)
+               kref_get(&data->refcount);
+       return data;
+}
+
+static inline struct sdcardfs_inode_data *top_data_get(
+               struct sdcardfs_inode_info *info)
+{
+       struct sdcardfs_inode_data *top_data;
+
+       spin_lock(&info->top_lock);
+       top_data = data_get(info->top_data);
+       spin_unlock(&info->top_lock);
+       return top_data;
+}
+
+extern void data_release(struct kref *ref);
+
+static inline void data_put(struct sdcardfs_inode_data *data)
+{
+       kref_put(&data->refcount, data_release);
+}
+
+static inline void release_own_data(struct sdcardfs_inode_info *info)
+{
+       /*
+        * This happens exactly once per inode. At this point, the inode that
+        * originally held this data is about to be freed, and all references
+        * to it are held as a top value, and will likely be released soon.
+        */
+       info->data->abandoned = true;
+       data_put(info->data);
+}
+
+static inline void set_top(struct sdcardfs_inode_info *info,
+                       struct sdcardfs_inode_info *top_owner)
+{
+       struct sdcardfs_inode_data *old_top;
+       struct sdcardfs_inode_data *new_top = NULL;
+
+       if (top_owner)
+               new_top = top_data_get(top_owner);
+
+       spin_lock(&info->top_lock);
+       old_top = info->top_data;
+       info->top_data = new_top;
+       if (old_top)
+               data_put(old_top);
+       spin_unlock(&info->top_lock);
+}
+
+static inline int get_gid(struct vfsmount *mnt,
+               struct super_block *sb,
+               struct sdcardfs_inode_data *data)
+{
+       struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(sb);
+
+       if (vfsopts->gid == AID_SDCARD_RW && !sbi->options.default_normal)
+               /* As an optimization, certain trusted system components only run
+                * as owner but operate across all users. Since we're now handing
+                * out the sdcard_rw GID only to trusted apps, we're okay relaxing
+                * the user boundary enforcement for the default view. The UIDs
+                * assigned to app directories are still multiuser aware.
+                */
+               return AID_SDCARD_RW;
+       else
+               return multiuser_get_uid(data->userid, vfsopts->gid);
+}
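
A worked example, assuming the usual multiuser.h convention multiuser_get_uid(userid, id) = userid * 100000 + id % 100000: for a node owned by userid 10 on a mount with gid=1015 (AID_SDCARD_RW), get_gid() returns 1015 when default_normal is unset, and 10 * 100000 + 1015 = 1001015 when default_normal is set.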
+
+static inline int get_mode(struct vfsmount *mnt,
+               struct sdcardfs_inode_info *info,
+               struct sdcardfs_inode_data *data)
+{
+       int owner_mode;
+       int filtered_mode;
+       struct sdcardfs_vfsmount_options *opts = mnt->data;
+       int visible_mode = 0775 & ~opts->mask;
+
+       if (data->perm == PERM_PRE_ROOT) {
+               /* Top of multi-user view should always be visible to ensure
+               * secondary users can traverse inside.
+               */
+               visible_mode = 0711;
+       } else if (data->under_android) {
+               /* Block "other" access to Android directories, since only apps
+               * belonging to a specific user should be in there; we still
+               * leave +x open for the default view.
+               */
+               if (opts->gid == AID_SDCARD_RW)
+                       visible_mode = visible_mode & ~0006;
+               else
+                       visible_mode = visible_mode & ~0007;
+       }
+       owner_mode = info->lower_inode->i_mode & 0700;
+       filtered_mode = visible_mode & (owner_mode | (owner_mode >> 3) | (owner_mode >> 6));
+       return filtered_mode;
+}
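
To make the masking concrete: with mask=0007 and a non-AID_SDCARD_RW gid on a node under /Android, visible_mode starts as 0775 & ~0007 = 0770 and the under_android branch leaves it at 0770; if the lower inode's owner bits are 0500 (r-x), the spread owner_mode | owner_mode >> 3 | owner_mode >> 6 is 0555, so get_mode() reports 0770 & 0555 = 0550.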
+
+static inline int has_graft_path(const struct dentry *dent)
+{
+       int ret = 0;
+
+       spin_lock(&SDCARDFS_D(dent)->lock);
+       if (SDCARDFS_D(dent)->orig_path.dentry != NULL)
+               ret = 1;
+       spin_unlock(&SDCARDFS_D(dent)->lock);
+
+       return ret;
+}
+
+static inline void sdcardfs_get_real_lower(const struct dentry *dent,
+                                               struct path *real_lower)
+{
+       /* in case of a local obb dentry
+        * the orig_path should be returned
+        */
+       if (has_graft_path(dent))
+               sdcardfs_get_orig_path(dent, real_lower);
+       else
+               sdcardfs_get_lower_path(dent, real_lower);
+}
+
+static inline void sdcardfs_put_real_lower(const struct dentry *dent,
+                                               struct path *real_lower)
+{
+       if (has_graft_path(dent))
+               sdcardfs_put_orig_path(dent, real_lower);
+       else
+               sdcardfs_put_lower_path(dent, real_lower);
+}
+
+extern struct mutex sdcardfs_super_list_lock;
+extern struct list_head sdcardfs_super_list;
+
+/* for packagelist.c */
+extern appid_t get_appid(const char *app_name);
+extern appid_t get_ext_gid(const char *app_name);
+extern appid_t is_excluded(const char *app_name, userid_t userid);
+extern int check_caller_access_to_name(struct inode *parent_node, const struct qstr *name);
+extern int packagelist_init(void);
+extern void packagelist_exit(void);
+
+/* for derived_perm.c */
+#define BY_NAME                (1 << 0)
+#define BY_USERID      (1 << 1)
+struct limit_search {
+       unsigned int flags;
+       struct qstr name;
+       userid_t userid;
+};
+
+extern void setup_derived_state(struct inode *inode, perm_t perm,
+                       userid_t userid, uid_t uid);
+extern void get_derived_permission(struct dentry *parent, struct dentry *dentry);
+extern void get_derived_permission_new(struct dentry *parent, struct dentry *dentry, const struct qstr *name);
+extern void fixup_perms_recursive(struct dentry *dentry, struct limit_search *limit);
+
+extern void update_derived_permission_lock(struct dentry *dentry);
+void fixup_lower_ownership(struct dentry *dentry, const char *name);
+extern int need_graft_path(struct dentry *dentry);
+extern int is_base_obbpath(struct dentry *dentry);
+extern int is_obbpath_invalid(struct dentry *dentry);
+extern int setup_obb_dentry(struct dentry *dentry, struct path *lower_path);
+
+/* locking helpers */
+static inline struct dentry *lock_parent(struct dentry *dentry)
+{
+       struct dentry *dir = dget_parent(dentry);
+
+       inode_lock_nested(d_inode(dir), I_MUTEX_PARENT);
+       return dir;
+}
+
+static inline void unlock_dir(struct dentry *dir)
+{
+       inode_unlock(d_inode(dir));
+       dput(dir);
+}
+
+static inline int prepare_dir(const char *path_s, uid_t uid, gid_t gid, mode_t mode)
+{
+       int err;
+       struct dentry *dent;
+       struct iattr attrs;
+       struct path parent;
+
+       dent = kern_path_locked(path_s, &parent);
+       if (IS_ERR(dent)) {
+               err = PTR_ERR(dent);
+               if (err == -EEXIST)
+                       err = 0;
+               goto out_unlock;
+       }
+
+       err = vfs_mkdir2(parent.mnt, d_inode(parent.dentry), dent, mode);
+       if (err) {
+               if (err == -EEXIST)
+                       err = 0;
+               goto out_dput;
+       }
+
+       attrs.ia_uid = make_kuid(&init_user_ns, uid);
+       attrs.ia_gid = make_kgid(&init_user_ns, gid);
+       attrs.ia_valid = ATTR_UID | ATTR_GID;
+       inode_lock(d_inode(dent));
+       notify_change2(parent.mnt, dent, &attrs, NULL);
+       inode_unlock(d_inode(dent));
+
+out_dput:
+       dput(dent);
+
+out_unlock:
+       /* parent dentry locked by lookup_create */
+       inode_unlock(d_inode(parent.dentry));
+       path_put(&parent);
+       return err;
+}
+
+/*
+ * Return 1 if the disk has enough free space, otherwise 0.
+ * We assume that files cannot be overwritten.
+ */
+static inline int check_min_free_space(struct dentry *dentry, size_t size, int dir)
+{
+       int err;
+       struct path lower_path;
+       struct kstatfs statfs;
+       u64 avail;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+       if (sbi->options.reserved_mb) {
+               /* Get fs stat of lower filesystem. */
+               sdcardfs_get_lower_path(dentry, &lower_path);
+               err = vfs_statfs(&lower_path, &statfs);
+               sdcardfs_put_lower_path(dentry, &lower_path);
+
+               if (unlikely(err))
+                       return 0;
+
+               /* Invalid statfs information. */
+               if (unlikely(statfs.f_bsize == 0))
+                       return 0;
+
+               /* If checking a directory, set size to f_bsize. */
+               if (unlikely(dir))
+                       size = statfs.f_bsize;
+
+               /* available size */
+               avail = statfs.f_bavail * statfs.f_bsize;
+
+               /* not enough space */
+               if ((u64)size > avail)
+                       return 0;
+
+               /* enough space */
+               if ((avail - size) > (sbi->options.reserved_mb * 1024 * 1024))
+                       return 1;
+
+               return 0;
+       } else
+               return 1;
+}
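
For instance, with reserved_mb = 100 a 1 MB write is only considered to have enough space while the lower filesystem reports more than 101 MB available (the 100 MB reserve plus the write itself).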
+
+/*
+ * Copies attrs and maintains sdcardfs managed attrs
+ * Since our permission check handles all special permissions, set those to be open
+ */
+static inline void sdcardfs_copy_and_fix_attrs(struct inode *dest, const struct inode *src)
+{
+       dest->i_mode = (src->i_mode  & S_IFMT) | S_IRWXU | S_IRWXG |
+                       S_IROTH | S_IXOTH; /* 0775 */
+       dest->i_uid = make_kuid(&init_user_ns, SDCARDFS_I(dest)->data->d_uid);
+       dest->i_gid = make_kgid(&init_user_ns, AID_SDCARD_RW);
+       dest->i_rdev = src->i_rdev;
+       dest->i_atime = src->i_atime;
+       dest->i_mtime = src->i_mtime;
+       dest->i_ctime = src->i_ctime;
+       dest->i_blkbits = src->i_blkbits;
+       dest->i_flags = src->i_flags;
+       set_nlink(dest, src->i_nlink);
+}
+
+static inline bool str_case_eq(const char *s1, const char *s2)
+{
+       return !strcasecmp(s1, s2);
+}
+
+static inline bool str_n_case_eq(const char *s1, const char *s2, size_t len)
+{
+       return !strncasecmp(s1, s2, len);
+}
+
+static inline bool qstr_case_eq(const struct qstr *q1, const struct qstr *q2)
+{
+       return q1->len == q2->len && str_case_eq(q1->name, q2->name);
+}
+
+#define QSTR_LITERAL(string) QSTR_INIT(string, sizeof(string)-1)
+
+#endif /* not _SDCARDFS_H_ */
diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c
new file mode 100644 (file)
index 0000000..cffcdb1
--- /dev/null
@@ -0,0 +1,331 @@
+/*
+ * fs/sdcardfs/super.c
+ *
+ * Copyright (c) 2013 Samsung Electronics Co. Ltd
+ *   Authors: Daeho Jeong, Woojoong Lee, Seunghwan Hyun,
+ *               Sunghwan Yun, Sungjong Seo
+ *
+ * This program has been developed as a stackable file system based on
+ * the WrapFS, which was written by
+ *
+ * Copyright (c) 1998-2011 Erez Zadok
+ * Copyright (c) 2009     Shrikar Archak
+ * Copyright (c) 2003-2011 Stony Brook University
+ * Copyright (c) 2003-2011 The Research Foundation of SUNY
+ *
+ * This file is dual licensed.  It may be redistributed and/or modified
+ * under the terms of the Apache 2.0 License OR version 2 of the GNU
+ * General Public License.
+ */
+
+#include "sdcardfs.h"
+
+/*
+ * The inode cache is used with alloc_inode for both our inode info and the
+ * vfs inode.
+ */
+static struct kmem_cache *sdcardfs_inode_cachep;
+
+/*
+ * To support the top references, we must track some data separately.
+ * An sdcardfs_inode_info always has a reference to its data, and once set up,
+ * also has a reference to its top. The top may be itself, in which case it
+ * holds two references to its data. When top is changed, it takes a ref to the
+ * new data and then drops the ref to the old data.
+ */
+static struct kmem_cache *sdcardfs_inode_data_cachep;
+
+void data_release(struct kref *ref)
+{
+       struct sdcardfs_inode_data *data =
+               container_of(ref, struct sdcardfs_inode_data, refcount);
+
+       kmem_cache_free(sdcardfs_inode_data_cachep, data);
+}
+
+/* final actions when unmounting a file system */
+static void sdcardfs_put_super(struct super_block *sb)
+{
+       struct sdcardfs_sb_info *spd;
+       struct super_block *s;
+
+       spd = SDCARDFS_SB(sb);
+       if (!spd)
+               return;
+
+       if (spd->obbpath_s) {
+               kfree(spd->obbpath_s);
+               path_put(&spd->obbpath);
+       }
+
+       /* decrement lower super references */
+       s = sdcardfs_lower_super(sb);
+       sdcardfs_set_lower_super(sb, NULL);
+       atomic_dec(&s->s_active);
+
+       kfree(spd);
+       sb->s_fs_info = NULL;
+}
+
+static int sdcardfs_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+       int err;
+       struct path lower_path;
+       u32 min_blocks;
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(dentry->d_sb);
+
+       sdcardfs_get_lower_path(dentry, &lower_path);
+       err = vfs_statfs(&lower_path, buf);
+       sdcardfs_put_lower_path(dentry, &lower_path);
+
+       if (sbi->options.reserved_mb) {
+               /* Invalid statfs information. */
+               if (buf->f_bsize == 0) {
+                       pr_err("Returned block size is zero.\n");
+                       return -EINVAL;
+               }
+
+               min_blocks = ((sbi->options.reserved_mb * 1024 * 1024)/buf->f_bsize);
+               buf->f_blocks -= min_blocks;
+
+               if (buf->f_bavail > min_blocks)
+                       buf->f_bavail -= min_blocks;
+               else
+                       buf->f_bavail = 0;
+
+               /* Make reserved blocks invisible to media storage */
+               buf->f_bfree = buf->f_bavail;
+       }
+
+       /* set return buf to our f/s to avoid confusing user-level utils */
+       buf->f_type = SDCARDFS_SUPER_MAGIC;
+
+       return err;
+}
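
Worked example: with reserved_mb = 100 and a 4 KiB lower block size, min_blocks = 100 * 1024 * 1024 / 4096 = 25600, so 25600 blocks are hidden from both f_blocks and f_bavail, and f_bfree is clamped to the adjusted f_bavail.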
+
+/*
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs(struct super_block *sb, int *flags, char *options)
+{
+       int err = 0;
+
+       /*
+        * The VFS will take care of "ro" and "rw" flags among others.  We
+        * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+        * SILENT, but anything else left over is an error.
+        */
+       if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT)) != 0) {
+               pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+               err = -EINVAL;
+       }
+
+       return err;
+}
+
+/*
+ * @mnt: mount point we are remounting
+ * @sb: superblock we are remounting
+ * @flags: numeric mount options
+ * @options: mount options string
+ */
+static int sdcardfs_remount_fs2(struct vfsmount *mnt, struct super_block *sb,
+                                               int *flags, char *options)
+{
+       int err = 0;
+
+       /*
+        * The VFS will take care of "ro" and "rw" flags among others.  We
+        * can safely accept a few flags (RDONLY, MANDLOCK), and honor
+        * SILENT, but anything else left over is an error.
+        */
+       if ((*flags & ~(MS_RDONLY | MS_MANDLOCK | MS_SILENT | MS_REMOUNT)) != 0) {
+               pr_err("sdcardfs: remount flags 0x%x unsupported\n", *flags);
+               err = -EINVAL;
+       }
+       pr_info("Remount options were %s for vfsmnt %p.\n", options, mnt);
+       err = parse_options_remount(sb, options, *flags & ~MS_SILENT, mnt->data);
+
+       return err;
+}
+
+static void *sdcardfs_clone_mnt_data(void *data)
+{
+       struct sdcardfs_vfsmount_options *opt = kmalloc(sizeof(struct sdcardfs_vfsmount_options), GFP_KERNEL);
+       struct sdcardfs_vfsmount_options *old = data;
+
+       if (!opt)
+               return NULL;
+       opt->gid = old->gid;
+       opt->mask = old->mask;
+       return opt;
+}
+
+static void sdcardfs_copy_mnt_data(void *data, void *newdata)
+{
+       struct sdcardfs_vfsmount_options *old = data;
+       struct sdcardfs_vfsmount_options *new = newdata;
+
+       old->gid = new->gid;
+       old->mask = new->mask;
+}
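
These per-vfsmount options (gid and mask) are what let a single sdcardfs superblock be exposed through several mounts with different visibility settings, e.g. Android's separate storage views; clone_mnt_data() gives each new mount its own copy and remount_fs2() updates it in place via parse_options_remount().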
+
+/*
+ * Called by iput() when the inode reference count reached zero
+ * and the inode is not hashed anywhere.  Used to clear anything
+ * that needs to be, before the inode is completely destroyed and put
+ * on the inode free list.
+ */
+static void sdcardfs_evict_inode(struct inode *inode)
+{
+       struct inode *lower_inode;
+
+       truncate_inode_pages(&inode->i_data, 0);
+       set_top(SDCARDFS_I(inode), NULL);
+       clear_inode(inode);
+       /*
+        * Decrement a reference to a lower_inode, which was incremented
+        * by our sdcardfs_iget() when the inode was created initially.
+        */
+       lower_inode = sdcardfs_lower_inode(inode);
+       sdcardfs_set_lower_inode(inode, NULL);
+       iput(lower_inode);
+}
+
+static struct inode *sdcardfs_alloc_inode(struct super_block *sb)
+{
+       struct sdcardfs_inode_info *i;
+       struct sdcardfs_inode_data *d;
+
+       i = kmem_cache_alloc(sdcardfs_inode_cachep, GFP_KERNEL);
+       if (!i)
+               return NULL;
+
+       /* memset everything up to the inode to 0 */
+       memset(i, 0, offsetof(struct sdcardfs_inode_info, vfs_inode));
+
+       d = kmem_cache_alloc(sdcardfs_inode_data_cachep,
+                                       GFP_KERNEL | __GFP_ZERO);
+       if (!d) {
+               kmem_cache_free(sdcardfs_inode_cachep, i);
+               return NULL;
+       }
+
+       i->data = d;
+       kref_init(&d->refcount);
+       i->top_data = d;
+       spin_lock_init(&i->top_lock);
+       kref_get(&d->refcount);
+
+       i->vfs_inode.i_version = 1;
+       return &i->vfs_inode;
+}
+
+static void i_callback(struct rcu_head *head)
+{
+       struct inode *inode = container_of(head, struct inode, i_rcu);
+
+       release_own_data(SDCARDFS_I(inode));
+       kmem_cache_free(sdcardfs_inode_cachep, SDCARDFS_I(inode));
+}
+
+static void sdcardfs_destroy_inode(struct inode *inode)
+{
+       call_rcu(&inode->i_rcu, i_callback);
+}
+
+/* sdcardfs inode cache constructor */
+static void init_once(void *obj)
+{
+       struct sdcardfs_inode_info *i = obj;
+
+       inode_init_once(&i->vfs_inode);
+}
+
+int sdcardfs_init_inode_cache(void)
+{
+       sdcardfs_inode_cachep =
+               kmem_cache_create("sdcardfs_inode_cache",
+                                 sizeof(struct sdcardfs_inode_info), 0,
+                                 SLAB_RECLAIM_ACCOUNT, init_once);
+
+       if (!sdcardfs_inode_cachep)
+               return -ENOMEM;
+
+       sdcardfs_inode_data_cachep =
+               kmem_cache_create("sdcardfs_inode_data_cache",
+                                 sizeof(struct sdcardfs_inode_data), 0,
+                                 SLAB_RECLAIM_ACCOUNT, NULL);
+       if (!sdcardfs_inode_data_cachep) {
+               kmem_cache_destroy(sdcardfs_inode_cachep);
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+/* sdcardfs inode cache destructor */
+void sdcardfs_destroy_inode_cache(void)
+{
+       kmem_cache_destroy(sdcardfs_inode_data_cachep);
+       kmem_cache_destroy(sdcardfs_inode_cachep);
+}
+
+/*
+ * ->umount_begin is used mainly by network filesystems such as NFS to kill
+ * pending RPC tasks so that a subsequent unmount can succeed; here we simply
+ * forward the call to the lower filesystem.
+ */
+static void sdcardfs_umount_begin(struct super_block *sb)
+{
+       struct super_block *lower_sb;
+
+       lower_sb = sdcardfs_lower_super(sb);
+       if (lower_sb && lower_sb->s_op && lower_sb->s_op->umount_begin)
+               lower_sb->s_op->umount_begin(lower_sb);
+}
+
+static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m,
+                       struct dentry *root)
+{
+       struct sdcardfs_sb_info *sbi = SDCARDFS_SB(root->d_sb);
+       struct sdcardfs_mount_options *opts = &sbi->options;
+       struct sdcardfs_vfsmount_options *vfsopts = mnt->data;
+
+       if (opts->fs_low_uid != 0)
+               seq_printf(m, ",fsuid=%u", opts->fs_low_uid);
+       if (opts->fs_low_gid != 0)
+               seq_printf(m, ",fsgid=%u", opts->fs_low_gid);
+       if (vfsopts->gid != 0)
+               seq_printf(m, ",gid=%u", vfsopts->gid);
+       if (opts->multiuser)
+               seq_puts(m, ",multiuser");
+       if (vfsopts->mask)
+               seq_printf(m, ",mask=%u", vfsopts->mask);
+       if (opts->fs_user_id)
+               seq_printf(m, ",userid=%u", opts->fs_user_id);
+       if (opts->gid_derivation)
+               seq_puts(m, ",derive_gid");
+       if (opts->default_normal)
+               seq_puts(m, ",default_normal");
+       if (opts->reserved_mb != 0)
+               seq_printf(m, ",reserved=%uMB", opts->reserved_mb);
+
+       return 0;
+}
+
+const struct super_operations sdcardfs_sops = {
+       .put_super      = sdcardfs_put_super,
+       .statfs         = sdcardfs_statfs,
+       .remount_fs     = sdcardfs_remount_fs,
+       .remount_fs2    = sdcardfs_remount_fs2,
+       .clone_mnt_data = sdcardfs_clone_mnt_data,
+       .copy_mnt_data  = sdcardfs_copy_mnt_data,
+       .evict_inode    = sdcardfs_evict_inode,
+       .umount_begin   = sdcardfs_umount_begin,
+       .show_options2  = sdcardfs_show_options,
+       .alloc_inode    = sdcardfs_alloc_inode,
+       .destroy_inode  = sdcardfs_destroy_inode,
+       .drop_inode     = generic_delete_inode,
+};
index 1adb3346b9d6aa659cf5aaf5f82afe1d81a8118f..6c81bf62006712510fe0bb93b9733162d00e9724 100644 (file)
@@ -25,34 +25,6 @@ config SQUASHFS
 
          If unsure, say N.
 
-choice
-       prompt "File decompression options"
-       depends on SQUASHFS
-       help
-         Squashfs now supports two options for decompressing file
-         data.  Traditionally Squashfs has decompressed into an
-         intermediate buffer and then memcopied it into the page cache.
-         Squashfs now supports the ability to decompress directly into
-         the page cache.
-
-         If unsure, select "Decompress file data into an intermediate buffer"
-
-config SQUASHFS_FILE_CACHE
-       bool "Decompress file data into an intermediate buffer"
-       help
-         Decompress file data into an intermediate buffer and then
-         memcopy it into the page cache.
-
-config SQUASHFS_FILE_DIRECT
-       bool "Decompress files directly into the page cache"
-       help
-         Directly decompress file data into the page cache.
-         Doing so can significantly improve performance because
-         it eliminates a memcpy and it also removes the lock contention
-         on the single buffer.
-
-endchoice
-
 choice
        prompt "Decompressor parallelisation options"
        depends on SQUASHFS
index 7bd9b8b856d0bf61f49358625eb26b2912c92fc8..b9a2990a6c19fb6f7488ec7f9d4211f076afb61e 100644 (file)
@@ -6,8 +6,7 @@
 obj-$(CONFIG_SQUASHFS) += squashfs.o
 squashfs-y += block.o cache.o dir.o export.o file.o fragment.o id.o inode.o
 squashfs-y += namei.o super.o symlink.o decompressor.o
-squashfs-$(CONFIG_SQUASHFS_FILE_CACHE) += file_cache.o
-squashfs-$(CONFIG_SQUASHFS_FILE_DIRECT) += file_direct.o page_actor.o
+squashfs-y += file_direct.o page_actor.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_SINGLE) += decompressor_single.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI) += decompressor_multi.o
 squashfs-$(CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU) += decompressor_multi_percpu.o
index 2751476e6b6e85e094b33f717093ba0251af999e..dfc2e7641c309530492f0c525dbe799bd2618e92 100644 (file)
 
 #include <linux/fs.h>
 #include <linux/vfs.h>
+#include <linux/bio.h>
 #include <linux/slab.h>
 #include <linux/string.h>
+#include <linux/pagemap.h>
 #include <linux/buffer_head.h>
 #include <linux/bio.h>
+#include <linux/workqueue.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "decompressor.h"
 #include "page_actor.h"
 
-/*
- * Read the metadata block length, this is stored in the first two
- * bytes of the metadata block.
- */
-static struct buffer_head *get_block_length(struct super_block *sb,
-                       u64 *cur_index, int *offset, int *length)
+static struct workqueue_struct *squashfs_read_wq;
+
+struct squashfs_read_request {
+       struct super_block *sb;
+       u64 index;
+       int length;
+       int compressed;
+       int offset;
+       u64 read_end;
+       struct squashfs_page_actor *output;
+       enum {
+               SQUASHFS_COPY,
+               SQUASHFS_DECOMPRESS,
+               SQUASHFS_METADATA,
+       } data_processing;
+       bool synchronous;
+
+       /*
+        * If the read is synchronous, it is possible to retrieve information
+        * about the request by setting these pointers.
+        */
+       int *res;
+       int *bytes_read;
+       int *bytes_uncompressed;
+
+       int nr_buffers;
+       struct buffer_head **bh;
+       struct work_struct offload;
+};
+
+struct squashfs_bio_request {
+       struct buffer_head **bh;
+       int nr_buffers;
+};
+
+static int squashfs_bio_submit(struct squashfs_read_request *req);
+
+int squashfs_init_read_wq(void)
 {
-       struct squashfs_sb_info *msblk = sb->s_fs_info;
-       struct buffer_head *bh;
+       squashfs_read_wq = create_workqueue("SquashFS read wq");
+       return !!squashfs_read_wq;
+}
+
+void squashfs_destroy_read_wq(void)
+{
+       flush_workqueue(squashfs_read_wq);
+       destroy_workqueue(squashfs_read_wq);
+}
+
+static void free_read_request(struct squashfs_read_request *req, int error)
+{
+       if (!req->synchronous)
+               squashfs_page_actor_free(req->output, error);
+       if (req->res)
+               *(req->res) = error;
+       kfree(req->bh);
+       kfree(req);
+}
 
-       bh = sb_bread(sb, *cur_index);
-       if (bh == NULL)
-               return NULL;
-
-       if (msblk->devblksize - *offset == 1) {
-               *length = (unsigned char) bh->b_data[*offset];
-               put_bh(bh);
-               bh = sb_bread(sb, ++(*cur_index));
-               if (bh == NULL)
-                       return NULL;
-               *length |= (unsigned char) bh->b_data[0] << 8;
-               *offset = 1;
-       } else {
-               *length = (unsigned char) bh->b_data[*offset] |
-                       (unsigned char) bh->b_data[*offset + 1] << 8;
-               *offset += 2;
-
-               if (*offset == msblk->devblksize) {
-                       put_bh(bh);
-                       bh = sb_bread(sb, ++(*cur_index));
-                       if (bh == NULL)
-                               return NULL;
-                       *offset = 0;
+static void squashfs_process_blocks(struct squashfs_read_request *req)
+{
+       int error = 0;
+       int bytes, i, length;
+       struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+       struct squashfs_page_actor *actor = req->output;
+       struct buffer_head **bh = req->bh;
+       int nr_buffers = req->nr_buffers;
+
+       for (i = 0; i < nr_buffers; ++i) {
+               if (!bh[i])
+                       continue;
+               wait_on_buffer(bh[i]);
+               if (!buffer_uptodate(bh[i]))
+                       error = -EIO;
+       }
+       if (error)
+               goto cleanup;
+
+       if (req->data_processing == SQUASHFS_METADATA) {
+               /* Extract the length of the metadata block */
+               if (req->offset != msblk->devblksize - 1) {
+                       length = le16_to_cpup((__le16 *)
+                                       (bh[0]->b_data + req->offset));
+               } else {
+                       length = (unsigned char)bh[0]->b_data[req->offset];
+                       length |= (unsigned char)bh[1]->b_data[0] << 8;
+               }
+               req->compressed = SQUASHFS_COMPRESSED(length);
+               req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS
+                                                      : SQUASHFS_COPY;
+               length = SQUASHFS_COMPRESSED_SIZE(length);
+               if (req->index + length + 2 > req->read_end) {
+                       for (i = 0; i < nr_buffers; ++i)
+                               put_bh(bh[i]);
+                       kfree(bh);
+                       req->length = length;
+                       req->index += 2;
+                       squashfs_bio_submit(req);
+                       return;
+               }
+               req->length = length;
+               req->offset = (req->offset + 2) % PAGE_SIZE;
+               if (req->offset < 2) {
+                       put_bh(bh[0]);
+                       ++bh;
+                       --nr_buffers;
                }
        }
+       if (req->bytes_read)
+               *(req->bytes_read) = req->length;
 
-       return bh;
+       if (req->data_processing == SQUASHFS_COPY) {
+               squashfs_bh_to_actor(bh, nr_buffers, req->output, req->offset,
+                       req->length, msblk->devblksize);
+       } else if (req->data_processing == SQUASHFS_DECOMPRESS) {
+               req->length = squashfs_decompress(msblk, bh, nr_buffers,
+                       req->offset, req->length, actor);
+               if (req->length < 0) {
+                       error = -EIO;
+                       goto cleanup;
+               }
+       }
+
+       /* Last page may have trailing bytes not filled */
+       bytes = req->length % PAGE_SIZE;
+       if (bytes && actor->page[actor->pages - 1])
+               zero_user_segment(actor->page[actor->pages - 1], bytes,
+                                 PAGE_SIZE);
+
+cleanup:
+       if (req->bytes_uncompressed)
+               *(req->bytes_uncompressed) = req->length;
+       if (error) {
+               for (i = 0; i < nr_buffers; ++i)
+                       if (bh[i])
+                               put_bh(bh[i]);
+       }
+       free_read_request(req, error);
 }
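For reference, the SQUASHFS_METADATA branch above decodes the standard squashfs two-byte metadata header: a little-endian 16-bit value whose low bits give the on-disk length and whose top bit, when set, marks the block as stored uncompressed. A hedged sketch with a made-up header value, using the same macros as the code above:

	__le16 raw = cpu_to_le16(0x0012);		/* illustrative on-disk header */
	int header = le16_to_cpup(&raw);
	int compressed = SQUASHFS_COMPRESSED(header);	/* true: top bit clear */
	int size = SQUASHFS_COMPRESSED_SIZE(header);	/* 0x12 -> 18 bytes on disk */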
 
+static void read_wq_handler(struct work_struct *work)
+{
+       squashfs_process_blocks(container_of(work,
+                   struct squashfs_read_request, offload));
+}
 
-/*
- * Read and decompress a metadata block or datablock.  Length is non-zero
- * if a datablock is being read (the size is stored elsewhere in the
- * filesystem), otherwise the length is obtained from the first two bytes of
- * the metadata block.  A bit in the length field indicates if the block
- * is stored uncompressed in the filesystem (usually because compression
- * generated a larger block - this does occasionally happen with compression
- * algorithms).
- */
-int squashfs_read_data(struct super_block *sb, u64 index, int length,
-               u64 *next_index, struct squashfs_page_actor *output)
+static void squashfs_bio_end_io(struct bio *bio)
 {
-       struct squashfs_sb_info *msblk = sb->s_fs_info;
-       struct buffer_head **bh;
-       int offset = index & ((1 << msblk->devblksize_log2) - 1);
-       u64 cur_index = index >> msblk->devblksize_log2;
-       int bytes, compressed, b = 0, k = 0, avail, i;
+       int i;
+       blk_status_t error = bio->bi_status;
+       struct squashfs_bio_request *bio_req = bio->bi_private;
+
+       bio_put(bio);
+
+       for (i = 0; i < bio_req->nr_buffers; ++i) {
+               if (!bio_req->bh[i])
+                       continue;
+               if (!error)
+                       set_buffer_uptodate(bio_req->bh[i]);
+               else
+                       clear_buffer_uptodate(bio_req->bh[i]);
+               unlock_buffer(bio_req->bh[i]);
+       }
+       kfree(bio_req);
+}
+
+static int bh_is_optional(struct squashfs_read_request *req, int idx)
+{
+       int start_idx, end_idx;
+       struct squashfs_sb_info *msblk = req->sb->s_fs_info;
 
-       bh = kcalloc(((output->length + msblk->devblksize - 1)
-               >> msblk->devblksize_log2) + 1, sizeof(*bh), GFP_KERNEL);
-       if (bh == NULL)
+       start_idx = (idx * msblk->devblksize - req->offset) >> PAGE_SHIFT;
+       end_idx = ((idx + 1) * msblk->devblksize - req->offset + 1) >> PAGE_SHIFT;
+       if (start_idx >= req->output->pages)
+               return 1;
+       if (start_idx < 0)
+               start_idx = end_idx;
+       if (end_idx >= req->output->pages)
+               end_idx = start_idx;
+       return !req->output->page[start_idx] && !req->output->page[end_idx];
+}
+
+static int actor_getblks(struct squashfs_read_request *req, u64 block)
+{
+       int i;
+
+       req->bh = kmalloc_array(req->nr_buffers, sizeof(*(req->bh)), GFP_NOIO);
+       if (!req->bh)
                return -ENOMEM;
 
-       if (length) {
+       for (i = 0; i < req->nr_buffers; ++i) {
                /*
-                * Datablock.
+                * When dealing with an uncompressed block, the actor may
+                * contain NULL pages. There's no need to read the buffers
+                * associated with these pages.
                 */
-               bytes = -offset;
-               compressed = SQUASHFS_COMPRESSED_BLOCK(length);
-               length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
-               if (next_index)
-                       *next_index = index + length;
-
-               TRACE("Block @ 0x%llx, %scompressed size %d, src size %d\n",
-                       index, compressed ? "" : "un", length, output->length);
-
-               if (length < 0 || length > output->length ||
-                               (index + length) > msblk->bytes_used)
-                       goto read_failure;
-
-               for (b = 0; bytes < length; b++, cur_index++) {
-                       bh[b] = sb_getblk(sb, cur_index);
-                       if (bh[b] == NULL)
-                               goto block_release;
-                       bytes += msblk->devblksize;
+               if (!req->compressed && bh_is_optional(req, i)) {
+                       req->bh[i] = NULL;
+                       continue;
                }
-               ll_rw_block(REQ_OP_READ, 0, b, bh);
-       } else {
-               /*
-                * Metadata block.
-                */
-               if ((index + 2) > msblk->bytes_used)
-                       goto read_failure;
+               req->bh[i] = sb_getblk(req->sb, block + i);
+               if (!req->bh[i]) {
+                       while (--i) {
+                               if (req->bh[i])
+                                       put_bh(req->bh[i]);
+                       }
+                       return -1;
+               }
+       }
+       return 0;
+}
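bh_is_optional() above maps a buffer index back onto the page(s) it straddles so that, for uncompressed blocks, buffers covering only missing (NULL) pages are never read. A worked example with illustrative values (1 KiB device blocks, 4 KiB pages, req->offset == 0):

	int devblksize = 1024, offset = 0, idx = 5;			/* illustrative */
	int start_idx = (idx * devblksize - offset) >> PAGE_SHIFT;	/* 5120 >> 12 == 1 */
	int end_idx = ((idx + 1) * devblksize - offset + 1) >> PAGE_SHIFT; /* 6145 >> 12 == 1 */
	/* Buffer 5 only touches page[1]; if that slot is NULL it can be skipped. */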
 
-               bh[0] = get_block_length(sb, &cur_index, &offset, &length);
-               if (bh[0] == NULL)
-                       goto read_failure;
-               b = 1;
+static int squashfs_bio_submit(struct squashfs_read_request *req)
+{
+       struct bio *bio = NULL;
+       struct buffer_head *bh;
+       struct squashfs_bio_request *bio_req = NULL;
+       int b = 0, prev_block = 0;
+       struct squashfs_sb_info *msblk = req->sb->s_fs_info;
 
-               bytes = msblk->devblksize - offset;
-               compressed = SQUASHFS_COMPRESSED(length);
-               length = SQUASHFS_COMPRESSED_SIZE(length);
-               if (next_index)
-                       *next_index = index + length + 2;
+       u64 read_start = round_down(req->index, msblk->devblksize);
+       u64 read_end = round_up(req->index + req->length, msblk->devblksize);
+       sector_t block = read_start >> msblk->devblksize_log2;
+       sector_t block_end = read_end >> msblk->devblksize_log2;
+       int offset = read_start - round_down(req->index, PAGE_SIZE);
+       int nr_buffers = block_end - block;
+       int blksz = msblk->devblksize;
+       int bio_max_pages = nr_buffers > BIO_MAX_PAGES ? BIO_MAX_PAGES
+                                                      : nr_buffers;
 
-               TRACE("Block @ 0x%llx, %scompressed size %d\n", index,
-                               compressed ? "" : "un", length);
+       /* Setup the request */
+       req->read_end = read_end;
+       req->offset = req->index - read_start;
+       req->nr_buffers = nr_buffers;
+       if (actor_getblks(req, block) < 0)
+               goto getblk_failed;
 
-               if (length < 0 || length > output->length ||
-                                       (index + length) > msblk->bytes_used)
-                       goto block_release;
+       /* Create and submit the BIOs */
+       for (b = 0; b < nr_buffers; ++b, offset += blksz) {
+               bh = req->bh[b];
+               if (!bh || !trylock_buffer(bh))
+                       continue;
+               if (buffer_uptodate(bh)) {
+                       unlock_buffer(bh);
+                       continue;
+               }
+               offset %= PAGE_SIZE;
 
-               for (; bytes < length; b++) {
-                       bh[b] = sb_getblk(sb, ++cur_index);
-                       if (bh[b] == NULL)
-                               goto block_release;
-                       bytes += msblk->devblksize;
+               /* Append the buffer to the current BIO if it is contiguous */
+               if (bio && bio_req && prev_block + 1 == b) {
+                       if (bio_add_page(bio, bh->b_page, blksz, offset)) {
+                               bio_req->nr_buffers += 1;
+                               prev_block = b;
+                               continue;
+                       }
                }
-               ll_rw_block(REQ_OP_READ, 0, b - 1, bh + 1);
+
+               /* Otherwise, submit the current BIO and create a new one */
+               if (bio)
+                       submit_bio(bio);
+               bio_req = kcalloc(1, sizeof(struct squashfs_bio_request),
+                                 GFP_NOIO);
+               if (!bio_req)
+                       goto req_alloc_failed;
+               bio_req->bh = &req->bh[b];
+               bio = bio_alloc(GFP_NOIO, bio_max_pages);
+               if (!bio)
+                       goto bio_alloc_failed;
+               bio_set_dev(bio, req->sb->s_bdev);
+               bio->bi_iter.bi_sector = (block + b)
+                                      << (msblk->devblksize_log2 - 9);
+               bio_set_op_attrs(bio, REQ_OP_READ, 0);
+               bio->bi_private = bio_req;
+               bio->bi_end_io = squashfs_bio_end_io;
+
+               bio_add_page(bio, bh->b_page, blksz, offset);
+               bio_req->nr_buffers += 1;
+               prev_block = b;
        }
+       if (bio)
+               submit_bio(bio);
 
-       for (i = 0; i < b; i++) {
-               wait_on_buffer(bh[i]);
-               if (!buffer_uptodate(bh[i]))
-                       goto block_release;
+       if (req->synchronous)
+               squashfs_process_blocks(req);
+       else {
+               INIT_WORK(&req->offload, read_wq_handler);
+               schedule_work(&req->offload);
        }
+       return 0;
 
-       if (compressed) {
-               length = squashfs_decompress(msblk, bh, b, offset, length,
-                       output);
-               if (length < 0)
-                       goto read_failure;
-       } else {
-               /*
-                * Block is uncompressed.
-                */
-               int in, pg_offset = 0;
-               void *data = squashfs_first_page(output);
-
-               for (bytes = length; k < b; k++) {
-                       in = min(bytes, msblk->devblksize - offset);
-                       bytes -= in;
-                       while (in) {
-                               if (pg_offset == PAGE_SIZE) {
-                                       data = squashfs_next_page(output);
-                                       pg_offset = 0;
-                               }
-                               avail = min_t(int, in, PAGE_SIZE -
-                                               pg_offset);
-                               memcpy(data + pg_offset, bh[k]->b_data + offset,
-                                               avail);
-                               in -= avail;
-                               pg_offset += avail;
-                               offset += avail;
-                       }
-                       offset = 0;
-                       put_bh(bh[k]);
-               }
-               squashfs_finish_page(output);
+bio_alloc_failed:
+       kfree(bio_req);
+req_alloc_failed:
+       unlock_buffer(bh);
+       while (--nr_buffers >= b)
+               if (req->bh[nr_buffers])
+                       put_bh(req->bh[nr_buffers]);
+       while (--b >= 0)
+               if (req->bh[b])
+                       wait_on_buffer(req->bh[b]);
+getblk_failed:
+       free_read_request(req, -ENOMEM);
+       return -ENOMEM;
+}
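The sector arithmetic in squashfs_bio_submit() converts device blocks to 512-byte sectors by shifting with (devblksize_log2 - 9). A quick illustrative calculation, assuming 4 KiB device blocks:

	/* devblksize_log2 == 12: each device block spans 8 sectors,
	 * so device block 10 starts at sector 10 << (12 - 9) == 80. */
	sector_t first_sector = 10 << (12 - 9);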
+
+static int read_metadata_block(struct squashfs_read_request *req,
+                              u64 *next_index)
+{
+       int ret, error, bytes_read = 0, bytes_uncompressed = 0;
+       struct squashfs_sb_info *msblk = req->sb->s_fs_info;
+
+       if (req->index + 2 > msblk->bytes_used) {
+               free_read_request(req, -EINVAL);
+               return -EINVAL;
+       }
+       req->length = 2;
+
+       /* Do not read beyond the end of the device */
+       if (req->index + req->length > msblk->bytes_used)
+               req->length = msblk->bytes_used - req->index;
+       req->data_processing = SQUASHFS_METADATA;
+
+       /*
+        * Reading metadata is always synchronous because we don't know the
+        * length in advance and the function is expected to update
+        * 'next_index' and return the length.
+        */
+       req->synchronous = true;
+       req->res = &error;
+       req->bytes_read = &bytes_read;
+       req->bytes_uncompressed = &bytes_uncompressed;
+
+       TRACE("Metadata block @ 0x%llx, %scompressed size %d, src size %d\n",
+             req->index, req->compressed ? "" : "un", bytes_read,
+             req->output->length);
+
+       ret = squashfs_bio_submit(req);
+       if (ret)
+               return ret;
+       if (error)
+               return error;
+       if (next_index)
+               *next_index += 2 + bytes_read;
+       return bytes_uncompressed;
+}
+
+static int read_data_block(struct squashfs_read_request *req, int length,
+                          u64 *next_index, bool synchronous)
+{
+       int ret, error = 0, bytes_uncompressed = 0, bytes_read = 0;
+
+       req->compressed = SQUASHFS_COMPRESSED_BLOCK(length);
+       req->length = length = SQUASHFS_COMPRESSED_SIZE_BLOCK(length);
+       req->data_processing = req->compressed ? SQUASHFS_DECOMPRESS
+                                              : SQUASHFS_COPY;
+
+       req->synchronous = synchronous;
+       if (synchronous) {
+               req->res = &error;
+               req->bytes_read = &bytes_read;
+               req->bytes_uncompressed = &bytes_uncompressed;
+       }
+
+       TRACE("Data block @ 0x%llx, %scompressed size %d, src size %d\n",
+             req->index, req->compressed ? "" : "un", req->length,
+             req->output->length);
+
+       ret = squashfs_bio_submit(req);
+       if (ret)
+               return ret;
+       if (synchronous)
+               ret = error ? error : bytes_uncompressed;
+       if (next_index)
+               *next_index += length;
+       return ret;
+}
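Datablock lengths use the wider encoding handled above: SQUASHFS_COMPRESSED_BIT_BLOCK in the length word, when set, means the block is stored uncompressed, and SQUASHFS_COMPRESSED_SIZE_BLOCK() masks it off. A hedged sketch with an invented length word:

	int word = SQUASHFS_COMPRESSED_BIT_BLOCK | 8192;	/* illustrative value */
	int is_compressed = SQUASHFS_COMPRESSED_BLOCK(word);	/* false: stored uncompressed */
	int on_disk = SQUASHFS_COMPRESSED_SIZE_BLOCK(word);	/* 8192 bytes */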
+
+/*
+ * Read and decompress a metadata block or datablock.  Length is non-zero
+ * if a datablock is being read (the size is stored elsewhere in the
+ * filesystem), otherwise the length is obtained from the first two bytes of
+ * the metadata block.  A bit in the length field indicates if the block
+ * is stored uncompressed in the filesystem (usually because compression
+ * generated a larger block - this does occasionally happen with compression
+ * algorithms).
+ */
+static int __squashfs_read_data(struct super_block *sb, u64 index, int length,
+       u64 *next_index, struct squashfs_page_actor *output, bool sync)
+{
+       struct squashfs_read_request *req;
+
+       req = kcalloc(1, sizeof(struct squashfs_read_request), GFP_KERNEL);
+       if (!req) {
+               if (!sync)
+                       squashfs_page_actor_free(output, -ENOMEM);
+               return -ENOMEM;
+       }
+
+       req->sb = sb;
+       req->index = index;
+       req->output = output;
+
+       if (next_index)
+               *next_index = index;
+
+       if (length)
+               length = read_data_block(req, length, next_index, sync);
+       else
+               length = read_metadata_block(req, next_index);
+
+       if (length < 0) {
+               ERROR("squashfs_read_data failed to read block 0x%llx\n",
+                     (unsigned long long)index);
+               return -EIO;
        }
 
-       kfree(bh);
        return length;
+}
 
-block_release:
-       for (; k < b; k++)
-               put_bh(bh[k]);
+int squashfs_read_data(struct super_block *sb, u64 index, int length,
+       u64 *next_index, struct squashfs_page_actor *output)
+{
+       return __squashfs_read_data(sb, index, length, next_index, output,
+                                   true);
+}
+
+int squashfs_read_data_async(struct super_block *sb, u64 index, int length,
+       u64 *next_index, struct squashfs_page_actor *output)
+{
 
-read_failure:
-       ERROR("squashfs_read_data failed to read block 0x%llx\n",
-                                       (unsigned long long) index);
-       kfree(bh);
-       return -EIO;
+       return __squashfs_read_data(sb, index, length, next_index, output,
+                                   false);
 }
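To make the split visible: squashfs_read_data() drives squashfs_process_blocks() synchronously and returns the decompressed length, while squashfs_read_data_async() only queues the BIOs and relies on the page actor's release callback to complete the pages. A minimal sketch of the synchronous shape, mirroring how squashfs_read_table() in cache.c uses it (sb, block, length and actor stand in for a caller's values):

	int res = squashfs_read_data(sb, block,
			length | SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
	if (res < 0)
		return ERR_PTR(res);	/* I/O or decompression failure */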
index 23813c078cc9527f547c345ba01ce31dafd570ab..05e42441d1065df13838c0fc4af5db085e4c1570 100644 (file)
@@ -209,17 +209,14 @@ void squashfs_cache_put(struct squashfs_cache_entry *entry)
  */
 void squashfs_cache_delete(struct squashfs_cache *cache)
 {
-       int i, j;
+       int i;
 
        if (cache == NULL)
                return;
 
        for (i = 0; i < cache->entries; i++) {
-               if (cache->entry[i].data) {
-                       for (j = 0; j < cache->pages; j++)
-                               kfree(cache->entry[i].data[j]);
-                       kfree(cache->entry[i].data);
-               }
+               if (cache->entry[i].page)
+                       free_page_array(cache->entry[i].page, cache->pages);
                kfree(cache->entry[i].actor);
        }
 
@@ -236,7 +233,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache)
 struct squashfs_cache *squashfs_cache_init(char *name, int entries,
        int block_size)
 {
-       int i, j;
+       int i;
        struct squashfs_cache *cache = kzalloc(sizeof(*cache), GFP_KERNEL);
 
        if (cache == NULL) {
@@ -268,22 +265,13 @@ struct squashfs_cache *squashfs_cache_init(char *name, int entries,
                init_waitqueue_head(&cache->entry[i].wait_queue);
                entry->cache = cache;
                entry->block = SQUASHFS_INVALID_BLK;
-               entry->data = kcalloc(cache->pages, sizeof(void *), GFP_KERNEL);
-               if (entry->data == NULL) {
+               entry->page = alloc_page_array(cache->pages, GFP_KERNEL);
+               if (!entry->page) {
                        ERROR("Failed to allocate %s cache entry\n", name);
                        goto cleanup;
                }
-
-               for (j = 0; j < cache->pages; j++) {
-                       entry->data[j] = kmalloc(PAGE_SIZE, GFP_KERNEL);
-                       if (entry->data[j] == NULL) {
-                               ERROR("Failed to allocate %s buffer\n", name);
-                               goto cleanup;
-                       }
-               }
-
-               entry->actor = squashfs_page_actor_init(entry->data,
-                                               cache->pages, 0);
+               entry->actor = squashfs_page_actor_init(entry->page,
+                       cache->pages, 0, NULL);
                if (entry->actor == NULL) {
                        ERROR("Failed to allocate %s cache entry\n", name);
                        goto cleanup;
@@ -314,18 +302,20 @@ int squashfs_copy_data(void *buffer, struct squashfs_cache_entry *entry,
                return min(length, entry->length - offset);
 
        while (offset < entry->length) {
-               void *buff = entry->data[offset / PAGE_SIZE]
-                               + (offset % PAGE_SIZE);
+               void *buff = kmap_atomic(entry->page[offset / PAGE_SIZE])
+                            + (offset % PAGE_SIZE);
                int bytes = min_t(int, entry->length - offset,
                                PAGE_SIZE - (offset % PAGE_SIZE));
 
                if (bytes >= remaining) {
                        memcpy(buffer, buff, remaining);
+                       kunmap_atomic(buff);
                        remaining = 0;
                        break;
                }
 
                memcpy(buffer, buff, bytes);
+               kunmap_atomic(buff);
                buffer += bytes;
                remaining -= bytes;
                offset += bytes;
@@ -416,43 +406,38 @@ struct squashfs_cache_entry *squashfs_get_datablock(struct super_block *sb,
 void *squashfs_read_table(struct super_block *sb, u64 block, int length)
 {
        int pages = (length + PAGE_SIZE - 1) >> PAGE_SHIFT;
-       int i, res;
-       void *table, *buffer, **data;
+       struct page **page;
+       void *buff;
+       int res;
        struct squashfs_page_actor *actor;
 
-       table = buffer = kmalloc(length, GFP_KERNEL);
-       if (table == NULL)
+       page = alloc_page_array(pages, GFP_KERNEL);
+       if (!page)
                return ERR_PTR(-ENOMEM);
 
-       data = kcalloc(pages, sizeof(void *), GFP_KERNEL);
-       if (data == NULL) {
-               res = -ENOMEM;
-               goto failed;
-       }
-
-       actor = squashfs_page_actor_init(data, pages, length);
+       actor = squashfs_page_actor_init(page, pages, length, NULL);
        if (actor == NULL) {
                res = -ENOMEM;
-               goto failed2;
+               goto failed;
        }
 
-       for (i = 0; i < pages; i++, buffer += PAGE_SIZE)
-               data[i] = buffer;
-
        res = squashfs_read_data(sb, block, length |
                SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, actor);
 
-       kfree(data);
-       kfree(actor);
-
        if (res < 0)
-               goto failed;
+               goto failed2;
 
-       return table;
+       buff = kmalloc(length, GFP_KERNEL);
+       if (!buff)
+               goto failed2;
+       squashfs_actor_to_buf(actor, buff, length);
+       squashfs_page_actor_free(actor, 0);
+       free_page_array(page, pages);
+       return buff;
 
 failed2:
-       kfree(data);
+       squashfs_page_actor_free(actor, 0);
 failed:
-       kfree(table);
+       free_page_array(page, pages);
        return ERR_PTR(res);
 }
index 836639810ea01720982176ddd1aeb22d7e40835a..86831aaedb9e876582ece5e90bb0810d28994d47 100644 (file)
@@ -24,7 +24,8 @@
 #include <linux/types.h>
 #include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/buffer_head.h>
+#include <linux/highmem.h>
+#include <linux/fs.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -101,40 +102,44 @@ const struct squashfs_decompressor *squashfs_lookup_decompressor(int id)
 static void *get_comp_opts(struct super_block *sb, unsigned short flags)
 {
        struct squashfs_sb_info *msblk = sb->s_fs_info;
-       void *buffer = NULL, *comp_opts;
+       void *comp_opts, *buffer = NULL;
+       struct page *page;
        struct squashfs_page_actor *actor = NULL;
        int length = 0;
 
+       if (!SQUASHFS_COMP_OPTS(flags))
+               return squashfs_comp_opts(msblk, buffer, length);
+
        /*
         * Read decompressor specific options from file system if present
         */
-       if (SQUASHFS_COMP_OPTS(flags)) {
-               buffer = kmalloc(PAGE_SIZE, GFP_KERNEL);
-               if (buffer == NULL) {
-                       comp_opts = ERR_PTR(-ENOMEM);
-                       goto out;
-               }
-
-               actor = squashfs_page_actor_init(&buffer, 1, 0);
-               if (actor == NULL) {
-                       comp_opts = ERR_PTR(-ENOMEM);
-                       goto out;
-               }
-
-               length = squashfs_read_data(sb,
-                       sizeof(struct squashfs_super_block), 0, NULL, actor);
-
-               if (length < 0) {
-                       comp_opts = ERR_PTR(length);
-                       goto out;
-               }
+
+       page = alloc_page(GFP_KERNEL);
+       if (!page)
+               return ERR_PTR(-ENOMEM);
+
+       actor = squashfs_page_actor_init(&page, 1, 0, NULL);
+       if (actor == NULL) {
+               comp_opts = ERR_PTR(-ENOMEM);
+               goto actor_error;
+       }
+
+       length = squashfs_read_data(sb,
+               sizeof(struct squashfs_super_block), 0, NULL, actor);
+
+       if (length < 0) {
+               comp_opts = ERR_PTR(length);
+               goto read_error;
        }
 
+       buffer = kmap_atomic(page);
        comp_opts = squashfs_comp_opts(msblk, buffer, length);
+       kunmap_atomic(buffer);
 
-out:
-       kfree(actor);
-       kfree(buffer);
+read_error:
+       squashfs_page_actor_free(actor, 0);
+actor_error:
+       __free_page(page);
        return comp_opts;
 }
 
index 13d80947bf9e6adac348878e3494b38cdd206099..bb2e77ee4209cb38f7f95e06d39c52624e84c0b4 100644 (file)
@@ -47,6 +47,7 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
+#include <linux/mm_inline.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
@@ -438,6 +439,21 @@ static int squashfs_readpage_fragment(struct page *page)
        return res;
 }
 
+static int squashfs_readpages_fragment(struct page *page,
+       struct list_head *readahead_pages, struct address_space *mapping)
+{
+       if (!page) {
+               page = lru_to_page(readahead_pages);
+               list_del(&page->lru);
+               if (add_to_page_cache_lru(page, mapping, page->index,
+                       mapping_gfp_constraint(mapping, GFP_KERNEL))) {
+                       put_page(page);
+                       return 0;
+               }
+       }
+       return squashfs_readpage_fragment(page);
+}
+
 static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
 {
        struct inode *inode = page->mapping->host;
@@ -450,54 +466,105 @@ static int squashfs_readpage_sparse(struct page *page, int index, int file_end)
        return 0;
 }
 
-static int squashfs_readpage(struct file *file, struct page *page)
+static int squashfs_readpages_sparse(struct page *page,
+       struct list_head *readahead_pages, int index, int file_end,
+       struct address_space *mapping)
 {
-       struct inode *inode = page->mapping->host;
+       if (!page) {
+               page = lru_to_page(readahead_pages);
+               list_del(&page->lru);
+               if (add_to_page_cache_lru(page, mapping, page->index,
+                       mapping_gfp_constraint(mapping, GFP_KERNEL))) {
+                       put_page(page);
+                       return 0;
+               }
+       }
+       return squashfs_readpage_sparse(page, index, file_end);
+}
+
+static int __squashfs_readpages(struct file *file, struct page *page,
+       struct list_head *readahead_pages, unsigned int nr_pages,
+       struct address_space *mapping)
+{
+       struct inode *inode = mapping->host;
        struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-       int index = page->index >> (msblk->block_log - PAGE_SHIFT);
        int file_end = i_size_read(inode) >> msblk->block_log;
        int res;
-       void *pageaddr;
 
-       TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
-                               page->index, squashfs_i(inode)->start);
+       do {
+               struct page *cur_page = page ? page
+                                            : lru_to_page(readahead_pages);
+               int page_index = cur_page->index;
+               int index = page_index >> (msblk->block_log - PAGE_SHIFT);
+
+               if (page_index >= ((i_size_read(inode) + PAGE_SIZE - 1) >>
+                                               PAGE_SHIFT))
+                       return 1;
+
+               if (index < file_end || squashfs_i(inode)->fragment_block ==
+                                               SQUASHFS_INVALID_BLK) {
+                       u64 block = 0;
+                       int bsize = read_blocklist(inode, index, &block);
+
+                       if (bsize < 0)
+                               return -1;
+
+                       if (bsize == 0) {
+                               res = squashfs_readpages_sparse(page,
+                                       readahead_pages, index, file_end,
+                                       mapping);
+                       } else {
+                               res = squashfs_readpages_block(page,
+                                       readahead_pages, &nr_pages, mapping,
+                                       page_index, block, bsize);
+                       }
+               } else {
+                       res = squashfs_readpages_fragment(page,
+                               readahead_pages, mapping);
+               }
+               if (res)
+                       return 0;
+               page = NULL;
+       } while (readahead_pages && !list_empty(readahead_pages));
+
+       return 0;
+}
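The index computation above folds a page index down to a squashfs block index with (block_log - PAGE_SHIFT). An illustrative calculation, assuming the default 128 KiB block size (block_log == 17) and 4 KiB pages:

	/* 32 pages per block: page index 100 belongs to block 100 >> 5 == 3. */
	int index = 100 >> (17 - 12);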
+
+static int squashfs_readpage(struct file *file, struct page *page)
+{
+       int ret;
 
-       if (page->index >= ((i_size_read(inode) + PAGE_SIZE - 1) >>
-                                       PAGE_SHIFT))
-               goto out;
+       TRACE("Entered squashfs_readpage, page index %lx, start block %llx\n",
+             page->index, squashfs_i(page->mapping->host)->start);
 
-       if (index < file_end || squashfs_i(inode)->fragment_block ==
-                                       SQUASHFS_INVALID_BLK) {
-               u64 block = 0;
-               int bsize = read_blocklist(inode, index, &block);
-               if (bsize < 0)
-                       goto error_out;
+       get_page(page);
 
-               if (bsize == 0)
-                       res = squashfs_readpage_sparse(page, index, file_end);
+       ret = __squashfs_readpages(file, page, NULL, 1, page->mapping);
+       if (ret) {
+               flush_dcache_page(page);
+               if (ret < 0)
+                       SetPageError(page);
                else
-                       res = squashfs_readpage_block(page, block, bsize);
-       } else
-               res = squashfs_readpage_fragment(page);
-
-       if (!res)
-               return 0;
-
-error_out:
-       SetPageError(page);
-out:
-       pageaddr = kmap_atomic(page);
-       memset(pageaddr, 0, PAGE_SIZE);
-       kunmap_atomic(pageaddr);
-       flush_dcache_page(page);
-       if (!PageError(page))
-               SetPageUptodate(page);
-       unlock_page(page);
+                       SetPageUptodate(page);
+               zero_user_segment(page, 0, PAGE_SIZE);
+               unlock_page(page);
+               put_page(page);
+       }
 
        return 0;
 }
 
+static int squashfs_readpages(struct file *file, struct address_space *mapping,
+                             struct list_head *pages, unsigned int nr_pages)
+{
+       TRACE("Entered squashfs_readpages, %u pages, first page index %lx\n",
+               nr_pages, lru_to_page(pages)->index);
+       __squashfs_readpages(file, NULL, pages, nr_pages, mapping);
+       return 0;
+}
+
 
 const struct address_space_operations squashfs_aops = {
-       .readpage = squashfs_readpage
+       .readpage = squashfs_readpage,
+       .readpages = squashfs_readpages,
 };
diff --git a/fs/squashfs/file_cache.c b/fs/squashfs/file_cache.c
deleted file mode 100644 (file)
index f2310d2..0000000
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright (c) 2013
- * Phillip Lougher <phillip@squashfs.org.uk>
- *
- * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
- */
-
-#include <linux/fs.h>
-#include <linux/vfs.h>
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/string.h>
-#include <linux/pagemap.h>
-#include <linux/mutex.h>
-
-#include "squashfs_fs.h"
-#include "squashfs_fs_sb.h"
-#include "squashfs_fs_i.h"
-#include "squashfs.h"
-
-/* Read separately compressed datablock and memcopy into page cache */
-int squashfs_readpage_block(struct page *page, u64 block, int bsize)
-{
-       struct inode *i = page->mapping->host;
-       struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
-               block, bsize);
-       int res = buffer->error;
-
-       if (res)
-               ERROR("Unable to read page, block %llx, size %x\n", block,
-                       bsize);
-       else
-               squashfs_copy_cache(page, buffer, buffer->length, 0);
-
-       squashfs_cache_put(buffer);
-       return res;
-}
index cb485d8e0e91b1b2ff1cb9b0330339c51c15b8a4..dc87f77ce11e4a08cb6423b06e79d450a6cafacf 100644 (file)
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/pagemap.h>
 #include <linux/mutex.h>
+#include <linux/mm_inline.h>
 
 #include "squashfs_fs.h"
 #include "squashfs_fs_sb.h"
 #include "squashfs.h"
 #include "page_actor.h"
 
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-       int pages, struct page **page);
+static void release_actor_pages(struct page **page, int pages, int error)
+{
+       int i;
 
-/* Read separately compressed datablock directly into page cache */
-int squashfs_readpage_block(struct page *target_page, u64 block, int bsize)
+       for (i = 0; i < pages; i++) {
+               if (!page[i])
+                       continue;
+               flush_dcache_page(page[i]);
+               if (!error)
+                       SetPageUptodate(page[i]);
+               else {
+                       SetPageError(page[i]);
+                       zero_user_segment(page[i], 0, PAGE_SIZE);
+               }
+               unlock_page(page[i]);
+               put_page(page[i]);
+       }
+       kfree(page);
+}
 
+/*
+ * Create a "page actor" which will kmap and kunmap the
+ * page cache pages appropriately within the decompressor
+ */
+static struct squashfs_page_actor *actor_from_page_cache(
+       unsigned int actor_pages, struct page *target_page,
+       struct list_head *rpages, unsigned int *nr_pages, int start_index,
+       struct address_space *mapping)
 {
-       struct inode *inode = target_page->mapping->host;
-       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
-
-       int file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
-       int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
-       int start_index = target_page->index & ~mask;
-       int end_index = start_index | mask;
-       int i, n, pages, missing_pages, bytes, res = -ENOMEM;
        struct page **page;
        struct squashfs_page_actor *actor;
-       void *pageaddr;
-
-       if (end_index > file_end)
-               end_index = file_end;
-
-       pages = end_index - start_index + 1;
-
-       page = kmalloc_array(pages, sizeof(void *), GFP_KERNEL);
-       if (page == NULL)
-               return res;
-
-       /*
-        * Create a "page actor" which will kmap and kunmap the
-        * page cache pages appropriately within the decompressor
-        */
-       actor = squashfs_page_actor_init_special(page, pages, 0);
-       if (actor == NULL)
-               goto out;
-
-       /* Try to grab all the pages covered by the Squashfs block */
-       for (missing_pages = 0, i = 0, n = start_index; i < pages; i++, n++) {
-               page[i] = (n == target_page->index) ? target_page :
-                       grab_cache_page_nowait(target_page->mapping, n);
+       int i, n;
+       gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
+
+       page = kmalloc_array(actor_pages, sizeof(void *), GFP_KERNEL);
+       if (!page)
+               return NULL;
+
+       for (i = 0, n = start_index; i < actor_pages; i++, n++) {
+               if (target_page == NULL && rpages && !list_empty(rpages)) {
+                       struct page *cur_page = lru_to_page(rpages);
+
+                       if (cur_page->index < start_index + actor_pages) {
+                               list_del(&cur_page->lru);
+                               --(*nr_pages);
+                               if (add_to_page_cache_lru(cur_page, mapping,
+                                                         cur_page->index, gfp))
+                                       put_page(cur_page);
+                               else
+                                       target_page = cur_page;
+                       } else
+                               rpages = NULL;
+               }
 
-               if (page[i] == NULL) {
-                       missing_pages++;
-                       continue;
+               if (target_page && target_page->index == n) {
+                       page[i] = target_page;
+                       target_page = NULL;
+               } else {
+                       page[i] = grab_cache_page_nowait(mapping, n);
+                       if (page[i] == NULL)
+                               continue;
                }
 
                if (PageUptodate(page[i])) {
                        unlock_page(page[i]);
                        put_page(page[i]);
                        page[i] = NULL;
-                       missing_pages++;
                }
        }
 
-       if (missing_pages) {
-               /*
-                * Couldn't get one or more pages, this page has either
-                * been VM reclaimed, but others are still in the page cache
-                * and uptodate, or we're racing with another thread in
-                * squashfs_readpage also trying to grab them.  Fall back to
-                * using an intermediate buffer.
-                */
-               res = squashfs_read_cache(target_page, block, bsize, pages,
-                                                               page);
-               if (res < 0)
-                       goto mark_errored;
-
-               goto out;
+       actor = squashfs_page_actor_init(page, actor_pages, 0,
+                       release_actor_pages);
+       if (!actor) {
+               release_actor_pages(page, actor_pages, -ENOMEM);
+               kfree(page);
+               return NULL;
        }
-
-       /* Decompress directly into the page cache buffers */
-       res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor);
-       if (res < 0)
-               goto mark_errored;
-
-       /* Last page may have trailing bytes not filled */
-       bytes = res % PAGE_SIZE;
-       if (bytes) {
-               pageaddr = kmap_atomic(page[pages - 1]);
-               memset(pageaddr + bytes, 0, PAGE_SIZE - bytes);
-               kunmap_atomic(pageaddr);
-       }
-
-       /* Mark pages as uptodate, unlock and release */
-       for (i = 0; i < pages; i++) {
-               flush_dcache_page(page[i]);
-               SetPageUptodate(page[i]);
-               unlock_page(page[i]);
-               if (page[i] != target_page)
-                       put_page(page[i]);
-       }
-
-       kfree(actor);
-       kfree(page);
-
-       return 0;
-
-mark_errored:
-       /* Decompression failed, mark pages as errored.  Target_page is
-        * dealt with by the caller
-        */
-       for (i = 0; i < pages; i++) {
-               if (page[i] == NULL || page[i] == target_page)
-                       continue;
-               flush_dcache_page(page[i]);
-               SetPageError(page[i]);
-               unlock_page(page[i]);
-               put_page(page[i]);
-       }
-
-out:
-       kfree(actor);
-       kfree(page);
-       return res;
+       return actor;
 }
 
+int squashfs_readpages_block(struct page *target_page,
+                            struct list_head *readahead_pages,
+                            unsigned int *nr_pages,
+                            struct address_space *mapping,
+                            int page_index, u64 block, int bsize)
 
-static int squashfs_read_cache(struct page *target_page, u64 block, int bsize,
-       int pages, struct page **page)
 {
-       struct inode *i = target_page->mapping->host;
-       struct squashfs_cache_entry *buffer = squashfs_get_datablock(i->i_sb,
-                                                block, bsize);
-       int bytes = buffer->length, res = buffer->error, n, offset = 0;
-       void *pageaddr;
-
-       if (res) {
-               ERROR("Unable to read page, block %llx, size %x\n", block,
-                       bsize);
-               goto out;
-       }
-
-       for (n = 0; n < pages && bytes > 0; n++,
-                       bytes -= PAGE_SIZE, offset += PAGE_SIZE) {
-               int avail = min_t(int, bytes, PAGE_SIZE);
-
-               if (page[n] == NULL)
-                       continue;
+       struct squashfs_page_actor *actor;
+       struct inode *inode = mapping->host;
+       struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info;
+       int start_index, end_index, file_end, actor_pages, res;
+       int mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1;
 
-               pageaddr = kmap_atomic(page[n]);
-               squashfs_copy_data(pageaddr, buffer, offset, avail);
-               memset(pageaddr + avail, 0, PAGE_SIZE - avail);
-               kunmap_atomic(pageaddr);
-               flush_dcache_page(page[n]);
-               SetPageUptodate(page[n]);
-               unlock_page(page[n]);
-               if (page[n] != target_page)
-                       put_page(page[n]);
+       /*
+        * If readpage() is called on an uncompressed datablock, we can just
+        * read the pages instead of fetching the whole block.
+        * This greatly improves performance when a process keeps doing
+        * random reads because we only fetch the necessary data.
+        * The readahead algorithm will take care of doing speculative reads
+        * if necessary.
+        * We can't read more than one block even if readahead provides more
+        * pages, because we don't know yet whether the next block is
+        * compressed.
+        */
+       if (bsize && !SQUASHFS_COMPRESSED_BLOCK(bsize)) {
+               u64 block_end = block + msblk->block_size;
+
+               block += (page_index & mask) * PAGE_SIZE;
+               actor_pages = (block_end - block) / PAGE_SIZE;
+               if (*nr_pages < actor_pages)
+                       actor_pages = *nr_pages;
+               start_index = page_index;
+               bsize = min_t(int, bsize, (PAGE_SIZE * actor_pages)
+                                         | SQUASHFS_COMPRESSED_BIT_BLOCK);
+       } else {
+               file_end = (i_size_read(inode) - 1) >> PAGE_SHIFT;
+               start_index = page_index & ~mask;
+               end_index = start_index | mask;
+               if (end_index > file_end)
+                       end_index = file_end;
+               actor_pages = end_index - start_index + 1;
        }
 
-out:
-       squashfs_cache_put(buffer);
-       return res;
+       actor = actor_from_page_cache(actor_pages, target_page,
+                                     readahead_pages, nr_pages, start_index,
+                                     mapping);
+       if (!actor)
+               return -ENOMEM;
+
+       res = squashfs_read_data_async(inode->i_sb, block, bsize, NULL,
+                                      actor);
+       return res < 0 ? res : 0;
 }
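A worked example of the uncompressed fast path above, with illustrative numbers (128 KiB block, block_log == 17, 4 KiB pages, so mask == 31): for page_index == 37, the read is advanced five pages into the block and only the remaining pages of that block are covered by the actor.

	int mask = (1 << (17 - 12)) - 1;		/* 31 */
	int page_index = 37;				/* illustrative */
	u64 skip = (page_index & mask) * 4096;		/* 20480 bytes into the block */
	int actor_pages = (131072 - skip) / 4096;	/* 27 pages left to read */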
index 95da653665485974cb8281fab13eb6515f6d857b..5d8512534d9353d391e133809011bc67238749c7 100644 (file)
@@ -94,39 +94,17 @@ static int lz4_uncompress(struct squashfs_sb_info *msblk, void *strm,
        struct buffer_head **bh, int b, int offset, int length,
        struct squashfs_page_actor *output)
 {
+       int res;
        struct squashfs_lz4 *stream = strm;
-       void *buff = stream->input, *data;
-       int avail, i, bytes = length, res;
-
-       for (i = 0; i < b; i++) {
-               avail = min(bytes, msblk->devblksize - offset);
-               memcpy(buff, bh[i]->b_data + offset, avail);
-               buff += avail;
-               bytes -= avail;
-               offset = 0;
-               put_bh(bh[i]);
-       }
 
+       squashfs_bh_to_buf(bh, b, stream->input, offset, length,
+               msblk->devblksize);
        res = LZ4_decompress_safe(stream->input, stream->output,
                length, output->length);
 
        if (res < 0)
                return -EIO;
-
-       bytes = res;
-       data = squashfs_first_page(output);
-       buff = stream->output;
-       while (data) {
-               if (bytes <= PAGE_SIZE) {
-                       memcpy(data, buff, bytes);
-                       break;
-               }
-               memcpy(data, buff, PAGE_SIZE);
-               buff += PAGE_SIZE;
-               bytes -= PAGE_SIZE;
-               data = squashfs_next_page(output);
-       }
-       squashfs_finish_page(output);
+       squashfs_buf_to_actor(stream->output, output, res);
 
        return res;
 }
index 934c17e965908eccff7e23729f8b9f27a0425628..2c844d53a59e07e57a3731aca9167f363c18f76f 100644 (file)
@@ -79,45 +79,19 @@ static int lzo_uncompress(struct squashfs_sb_info *msblk, void *strm,
        struct buffer_head **bh, int b, int offset, int length,
        struct squashfs_page_actor *output)
 {
-       struct squashfs_lzo *stream = strm;
-       void *buff = stream->input, *data;
-       int avail, i, bytes = length, res;
+       int res;
        size_t out_len = output->length;
+       struct squashfs_lzo *stream = strm;
 
-       for (i = 0; i < b; i++) {
-               avail = min(bytes, msblk->devblksize - offset);
-               memcpy(buff, bh[i]->b_data + offset, avail);
-               buff += avail;
-               bytes -= avail;
-               offset = 0;
-               put_bh(bh[i]);
-       }
-
+       squashfs_bh_to_buf(bh, b, stream->input, offset, length,
+               msblk->devblksize);
        res = lzo1x_decompress_safe(stream->input, (size_t)length,
                                        stream->output, &out_len);
        if (res != LZO_E_OK)
-               goto failed;
+               return -EIO;
+       squashfs_buf_to_actor(stream->output, output, out_len);
 
-       res = bytes = (int)out_len;
-       data = squashfs_first_page(output);
-       buff = stream->output;
-       while (data) {
-               if (bytes <= PAGE_SIZE) {
-                       memcpy(data, buff, bytes);
-                       break;
-               } else {
-                       memcpy(data, buff, PAGE_SIZE);
-                       buff += PAGE_SIZE;
-                       bytes -= PAGE_SIZE;
-                       data = squashfs_next_page(output);
-               }
-       }
-       squashfs_finish_page(output);
-
-       return res;
-
-failed:
-       return -EIO;
+       return out_len;
 }
 
 const struct squashfs_decompressor squashfs_lzo_comp_ops = {
index 9b7b1b6a78926b605843119ab04ed5cc9524181a..e348f5647fbdcc028305039b3fef86d961260555 100644 (file)
@@ -9,39 +9,11 @@
 #include <linux/kernel.h>
 #include <linux/slab.h>
 #include <linux/pagemap.h>
+#include <linux/buffer_head.h>
 #include "page_actor.h"
 
-/*
- * This file contains implementations of page_actor for decompressing into
- * an intermediate buffer, and for decompressing directly into the
- * page cache.
- *
- * Calling code should avoid sleeping between calls to squashfs_first_page()
- * and squashfs_finish_page().
- */
-
-/* Implementation of page_actor for decompressing into intermediate buffer */
-static void *cache_first_page(struct squashfs_page_actor *actor)
-{
-       actor->next_page = 1;
-       return actor->buffer[0];
-}
-
-static void *cache_next_page(struct squashfs_page_actor *actor)
-{
-       if (actor->next_page == actor->pages)
-               return NULL;
-
-       return actor->buffer[actor->next_page++];
-}
-
-static void cache_finish_page(struct squashfs_page_actor *actor)
-{
-       /* empty */
-}
-
-struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
-       int pages, int length)
+struct squashfs_page_actor *squashfs_page_actor_init(struct page **page,
+       int pages, int length, void (*release_pages)(struct page **, int, int))
 {
        struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
 
@@ -49,52 +21,133 @@ struct squashfs_page_actor *squashfs_page_actor_init(void **buffer,
                return NULL;
 
        actor->length = length ? : pages * PAGE_SIZE;
-       actor->buffer = buffer;
+       actor->page = page;
        actor->pages = pages;
        actor->next_page = 0;
-       actor->squashfs_first_page = cache_first_page;
-       actor->squashfs_next_page = cache_next_page;
-       actor->squashfs_finish_page = cache_finish_page;
+       actor->pageaddr = NULL;
+       actor->release_pages = release_pages;
        return actor;
 }
 
-/* Implementation of page_actor for decompressing directly into page cache. */
-static void *direct_first_page(struct squashfs_page_actor *actor)
+void squashfs_page_actor_free(struct squashfs_page_actor *actor, int error)
+{
+       if (!actor)
+               return;
+
+       if (actor->release_pages)
+               actor->release_pages(actor->page, actor->pages, error);
+       kfree(actor);
+}
+
+void squashfs_actor_to_buf(struct squashfs_page_actor *actor, void *buf,
+       int length)
 {
-       actor->next_page = 1;
-       return actor->pageaddr = kmap_atomic(actor->page[0]);
+       void *pageaddr;
+       int pos = 0, avail, i;
+
+       for (i = 0; i < actor->pages && pos < length; ++i) {
+               avail = min_t(int, length - pos, PAGE_SIZE);
+               if (actor->page[i]) {
+                       pageaddr = kmap_atomic(actor->page[i]);
+                       memcpy(buf + pos, pageaddr, avail);
+                       kunmap_atomic(pageaddr);
+               }
+               pos += avail;
+       }
 }
 
-static void *direct_next_page(struct squashfs_page_actor *actor)
+void squashfs_buf_to_actor(void *buf, struct squashfs_page_actor *actor,
+       int length)
 {
-       if (actor->pageaddr)
-               kunmap_atomic(actor->pageaddr);
+       void *pageaddr;
+       int pos = 0, avail, i;
+
+       for (i = 0; i < actor->pages && pos < length; ++i) {
+               avail = min_t(int, length - pos, PAGE_SIZE);
+               if (actor->page[i]) {
+                       pageaddr = kmap_atomic(actor->page[i]);
+                       memcpy(pageaddr, buf + pos, avail);
+                       kunmap_atomic(pageaddr);
+               }
+               pos += avail;
+       }
+}
 
-       return actor->pageaddr = actor->next_page == actor->pages ? NULL :
-               kmap_atomic(actor->page[actor->next_page++]);
+void squashfs_bh_to_actor(struct buffer_head **bh, int nr_buffers,
+       struct squashfs_page_actor *actor, int offset, int length, int blksz)
+{
+       void *kaddr = NULL;
+       int bytes = 0, pgoff = 0, b = 0, p = 0, avail, i;
+
+       while (bytes < length) {
+               if (actor->page[p]) {
+                       kaddr = kmap_atomic(actor->page[p]);
+                       while (pgoff < PAGE_SIZE && bytes < length) {
+                               avail = min_t(int, blksz - offset,
+                                               PAGE_SIZE - pgoff);
+                               memcpy(kaddr + pgoff, bh[b]->b_data + offset,
+                                      avail);
+                               pgoff += avail;
+                               bytes += avail;
+                               offset = (offset + avail) % blksz;
+                               if (!offset) {
+                                       put_bh(bh[b]);
+                                       ++b;
+                               }
+                       }
+                       kunmap_atomic(kaddr);
+                       pgoff = 0;
+               } else {
+                       for (i = 0; i < PAGE_SIZE / blksz; ++i) {
+                               if (bh[b])
+                                       put_bh(bh[b]);
+                               ++b;
+                       }
+                       bytes += PAGE_SIZE;
+               }
+               ++p;
+       }
 }
 
-static void direct_finish_page(struct squashfs_page_actor *actor)
+void squashfs_bh_to_buf(struct buffer_head **bh, int nr_buffers, void *buf,
+       int offset, int length, int blksz)
 {
-       if (actor->pageaddr)
-               kunmap_atomic(actor->pageaddr);
+       int i, avail, bytes = 0;
+
+       for (i = 0; i < nr_buffers && bytes < length; ++i) {
+               avail = min_t(int, length - bytes, blksz - offset);
+               if (bh[i]) {
+                       memcpy(buf + bytes, bh[i]->b_data + offset, avail);
+                       put_bh(bh[i]);
+               }
+               bytes += avail;
+               offset = 0;
+       }
 }
 
-struct squashfs_page_actor *squashfs_page_actor_init_special(struct page **page,
-       int pages, int length)
+void free_page_array(struct page **page, int nr_pages)
 {
-       struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
+       int i;
 
-       if (actor == NULL)
-               return NULL;
+       for (i = 0; i < nr_pages; ++i)
+               __free_page(page[i]);
+       kfree(page);
+}
 
-       actor->length = length ? : pages * PAGE_SIZE;
-       actor->page = page;
-       actor->pages = pages;
-       actor->next_page = 0;
-       actor->pageaddr = NULL;
-       actor->squashfs_first_page = direct_first_page;
-       actor->squashfs_next_page = direct_next_page;
-       actor->squashfs_finish_page = direct_finish_page;
-       return actor;
+struct page **alloc_page_array(int nr_pages, int gfp_mask)
+{
+       int i;
+       struct page **page;
+
+       page = kcalloc(nr_pages, sizeof(struct page *), gfp_mask);
+       if (!page)
+               return NULL;
+       for (i = 0; i < nr_pages; ++i) {
+               page[i] = alloc_page(gfp_mask);
+               if (!page[i]) {
+                       free_page_array(page, i);
+                       return NULL;
+               }
+       }
+       return page;
 }
index 98537eab27e270d8b04f04b7d0db2ee519e21d46..aa1ed790b5a387fce482a79a442dd10d391815c8 100644 (file)
@@ -5,77 +5,61 @@
  * Phillip Lougher <phillip@squashfs.org.uk>
  *
  * This work is licensed under the terms of the GNU GPL, version 2. See
- * the COPYING file in the top-level directory.
+ * the COPYING file in the top-level directory.
  */
 
-#ifndef CONFIG_SQUASHFS_FILE_DIRECT
 struct squashfs_page_actor {
-       void    **page;
+       struct page     **page;
+       void    *pageaddr;
        int     pages;
        int     length;
        int     next_page;
+       void    (*release_pages)(struct page **, int, int);
 };
 
-static inline struct squashfs_page_actor *squashfs_page_actor_init(void **page,
-       int pages, int length)
-{
-       struct squashfs_page_actor *actor = kmalloc(sizeof(*actor), GFP_KERNEL);
-
-       if (actor == NULL)
-               return NULL;
+extern struct squashfs_page_actor *squashfs_page_actor_init(struct page **,
+       int, int, void (*)(struct page **, int, int));
+extern void squashfs_page_actor_free(struct squashfs_page_actor *, int);
 
-       actor->length = length ? : pages * PAGE_SIZE;
-       actor->page = page;
-       actor->pages = pages;
-       actor->next_page = 0;
-       return actor;
-}
+extern void squashfs_actor_to_buf(struct squashfs_page_actor *, void *, int);
+extern void squashfs_buf_to_actor(void *, struct squashfs_page_actor *, int);
+extern void squashfs_bh_to_actor(struct buffer_head **, int,
+       struct squashfs_page_actor *, int, int, int);
+extern void squashfs_bh_to_buf(struct buffer_head **, int, void *, int, int,
+       int);
 
+/*
+ * Calling code should avoid sleeping between calls to squashfs_first_page()
+ * and squashfs_finish_page().
+ */
 static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
 {
        actor->next_page = 1;
-       return actor->page[0];
+       return actor->pageaddr = actor->page[0] ? kmap_atomic(actor->page[0])
+                                               : NULL;
 }
 
 static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
 {
-       return actor->next_page == actor->pages ? NULL :
-               actor->page[actor->next_page++];
-}
+       if (!IS_ERR_OR_NULL(actor->pageaddr))
+               kunmap_atomic(actor->pageaddr);
 
-static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
-{
-       /* empty */
-}
-#else
-struct squashfs_page_actor {
-       union {
-               void            **buffer;
-               struct page     **page;
-       };
-       void    *pageaddr;
-       void    *(*squashfs_first_page)(struct squashfs_page_actor *);
-       void    *(*squashfs_next_page)(struct squashfs_page_actor *);
-       void    (*squashfs_finish_page)(struct squashfs_page_actor *);
-       int     pages;
-       int     length;
-       int     next_page;
-};
+       if (actor->next_page == actor->pages)
+               return actor->pageaddr = ERR_PTR(-ENODATA);
 
-extern struct squashfs_page_actor *squashfs_page_actor_init(void **, int, int);
-extern struct squashfs_page_actor *squashfs_page_actor_init_special(struct page
-                                                        **, int, int);
-static inline void *squashfs_first_page(struct squashfs_page_actor *actor)
-{
-       return actor->squashfs_first_page(actor);
-}
-static inline void *squashfs_next_page(struct squashfs_page_actor *actor)
-{
-       return actor->squashfs_next_page(actor);
+       actor->pageaddr = actor->page[actor->next_page] ?
+           kmap_atomic(actor->page[actor->next_page]) : NULL;
+       ++actor->next_page;
+       return actor->pageaddr;
 }
+
 static inline void squashfs_finish_page(struct squashfs_page_actor *actor)
 {
-       actor->squashfs_finish_page(actor);
+       if (!IS_ERR_OR_NULL(actor->pageaddr))
+               kunmap_atomic(actor->pageaddr);
 }
-#endif
+
+extern struct page **alloc_page_array(int, int);
+extern void free_page_array(struct page **, int);
+
 #endif
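
A sketch of the traversal pattern the reworked actor expects (illustrative only; the consumer below is hypothetical). squashfs_next_page() now returns ERR_PTR(-ENODATA) once the pages are exhausted and NULL for a missing page, and the mappings are kmap_atomic()-based, so callers must not sleep between squashfs_first_page() and squashfs_finish_page():

static void example_walk_actor(struct squashfs_page_actor *actor)
{
	void *addr = squashfs_first_page(actor);

	while (!IS_ERR(addr)) {
		if (addr)				/* NULL means no page at this index */
			memset(addr, 0, PAGE_SIZE);	/* consume/fill the mapped page */
		addr = squashfs_next_page(actor);	/* ERR_PTR(-ENODATA) at the end */
	}
	squashfs_finish_page(actor);
}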
index 887d6d270080a6d8d945868c5bc1265d3f38f93c..f4faab52a879009074586838d7c55fa519a5d202 100644 (file)
 #define WARNING(s, args...)    pr_warn("SQUASHFS: "s, ## args)
 
 /* block.c */
+extern int squashfs_init_read_wq(void);
+extern void squashfs_destroy_read_wq(void);
 extern int squashfs_read_data(struct super_block *, u64, int, u64 *,
                                struct squashfs_page_actor *);
+extern int squashfs_read_data_async(struct super_block *, u64, int, u64 *,
+                               struct squashfs_page_actor *);
 
 /* cache.c */
 extern struct squashfs_cache *squashfs_cache_init(char *, int, int);
@@ -70,8 +74,9 @@ extern __le64 *squashfs_read_fragment_index_table(struct super_block *,
 void squashfs_copy_cache(struct page *, struct squashfs_cache_entry *, int,
                                int);
 
-/* file_xxx.c */
-extern int squashfs_readpage_block(struct page *, u64, int);
+/* file_direct.c */
+extern int squashfs_readpages_block(struct page *, struct list_head *,
+       unsigned int *, struct address_space *, int, u64, int);
 
 /* id.c */
 extern int squashfs_get_id(struct super_block *, unsigned int, unsigned int *);
index 1da565cb50c3d0f1e652671dd7a81577b598ca2f..8a6995de02773d0387ab36070c1ed1aac8228e6b 100644 (file)
@@ -49,7 +49,7 @@ struct squashfs_cache_entry {
        int                     num_waiters;
        wait_queue_head_t       wait_queue;
        struct squashfs_cache   *cache;
-       void                    **data;
+       struct page             **page;
        struct squashfs_page_actor      *actor;
 };
 
index cf01e15a7b16dff288e2479014d20e0d787096d5..e2a0a7342bf80b37bf01bc3ecd422f0a566636b1 100644 (file)
@@ -444,9 +444,15 @@ static int __init init_squashfs_fs(void)
        if (err)
                return err;
 
+       if (!squashfs_init_read_wq()) {
+               destroy_inodecache();
+               return -ENOMEM;
+       }
+
        err = register_filesystem(&squashfs_fs_type);
        if (err) {
                destroy_inodecache();
+               squashfs_destroy_read_wq();
                return err;
        }
 
@@ -460,6 +466,7 @@ static void __exit exit_squashfs_fs(void)
 {
        unregister_filesystem(&squashfs_fs_type);
        destroy_inodecache();
+       squashfs_destroy_read_wq();
 }
 
 
index 6bfaef73d06527f82dcc70ffe33f465a767c4bc2..2f7be1fb167cfa2ce630f947b8cc716fa221bdbe 100644 (file)
@@ -55,7 +55,7 @@ static void *squashfs_xz_comp_opts(struct squashfs_sb_info *msblk,
        struct comp_opts *opts;
        int err = 0, n;
 
-       opts = kmalloc(sizeof(*opts), GFP_KERNEL);
+       opts = kmalloc(sizeof(*opts), GFP_ATOMIC);
        if (opts == NULL) {
                err = -ENOMEM;
                goto out2;
@@ -136,6 +136,7 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
        enum xz_ret xz_err;
        int avail, total = 0, k = 0;
        struct squashfs_xz *stream = strm;
+       void *buf = NULL;
 
        xz_dec_reset(stream->state);
        stream->buf.in_pos = 0;
@@ -156,12 +157,20 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
                if (stream->buf.out_pos == stream->buf.out_size) {
                        stream->buf.out = squashfs_next_page(output);
-                       if (stream->buf.out != NULL) {
+                       if (!IS_ERR(stream->buf.out)) {
                                stream->buf.out_pos = 0;
                                total += PAGE_SIZE;
                        }
                }
 
+               if (!stream->buf.out) {
+                       if (!buf) {
+                               buf = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+                               if (!buf)
+                                       goto out;
+                       }
+                       stream->buf.out = buf;
+               }
                xz_err = xz_dec_run(stream->state, &stream->buf);
 
                if (stream->buf.in_pos == stream->buf.in_size && k < b)
@@ -173,11 +182,13 @@ static int squashfs_xz_uncompress(struct squashfs_sb_info *msblk, void *strm,
        if (xz_err != XZ_STREAM_END || k < b)
                goto out;
 
+       kfree(buf);
        return total + stream->buf.out_pos;
 
 out:
        for (; k < b; k++)
                put_bh(bh[k]);
+       kfree(buf);
 
        return -EIO;
 }
index 2ec24d128bce0856ec89b46c3ec0476b5de78c87..d917c728422b5120c4eddecebcd847735c38b8cd 100644 (file)
@@ -66,6 +66,7 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
        struct buffer_head **bh, int b, int offset, int length,
        struct squashfs_page_actor *output)
 {
+       void *buf = NULL;
        int zlib_err, zlib_init = 0, k = 0;
        z_stream *stream = strm;
 
@@ -84,10 +85,19 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
 
                if (stream->avail_out == 0) {
                        stream->next_out = squashfs_next_page(output);
-                       if (stream->next_out != NULL)
+                       if (!IS_ERR(stream->next_out))
                                stream->avail_out = PAGE_SIZE;
                }
 
+               if (!stream->next_out) {
+                       if (!buf) {
+                               buf = kmalloc(PAGE_SIZE, GFP_ATOMIC);
+                               if (!buf)
+                                       goto out;
+                       }
+                       stream->next_out = buf;
+               }
+
                if (!zlib_init) {
                        zlib_err = zlib_inflateInit(stream);
                        if (zlib_err != Z_OK) {
@@ -115,11 +125,13 @@ static int zlib_uncompress(struct squashfs_sb_info *msblk, void *strm,
        if (k < b)
                goto out;
 
+       kfree(buf);
        return stream->total_out;
 
 out:
        for (; k < b; k++)
                put_bh(bh[k]);
+       kfree(buf);
 
        return -EIO;
 }
index 994db21f59bf58d3f83d750f8524d1c35849706b..5dcfc793ce3e74f934fb54af947412996d388f34 100644 (file)
@@ -800,7 +800,8 @@ rescan:
 }
 
 /**
- *     do_remount_sb - asks filesystem to change mount options.
+ *     do_remount_sb2 - asks filesystem to change mount options.
+ *     @mnt:   mount we are looking at
  *     @sb:    superblock in question
  *     @sb_flags: revised superblock flags
  *     @data:  the rest of options
@@ -808,7 +809,7 @@ rescan:
  *
  *     Alters the mount options of a mounted file system.
  */
-int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
+int do_remount_sb2(struct vfsmount *mnt, struct super_block *sb, int sb_flags, void *data, int force)
 {
        int retval;
        int remount_ro;
@@ -850,7 +851,16 @@ int do_remount_sb(struct super_block *sb, int sb_flags, void *data, int force)
                }
        }
 
-       if (sb->s_op->remount_fs) {
+       if (mnt && sb->s_op->remount_fs2) {
+               retval = sb->s_op->remount_fs2(mnt, sb, &sb_flags, data);
+               if (retval) {
+                       if (!force)
+                               goto cancel_readonly;
+                       /* If forced remount, go ahead despite any errors */
+                       WARN(1, "forced remount of a %s fs returned %i\n",
+                            sb->s_type->name, retval);
+               }
+       } else if (sb->s_op->remount_fs) {
                retval = sb->s_op->remount_fs(sb, &sb_flags, data);
                if (retval) {
                        if (!force)
@@ -882,12 +892,17 @@ cancel_readonly:
        return retval;
 }
 
+int do_remount_sb(struct super_block *sb, int flags, void *data, int force)
+{
+       return do_remount_sb2(NULL, sb, flags, data, force);
+}
+
 static void do_emergency_remount(struct work_struct *work)
 {
        struct super_block *sb, *p = NULL;
 
        spin_lock(&sb_lock);
-       list_for_each_entry(sb, &super_blocks, s_list) {
+       list_for_each_entry_reverse(sb, &super_blocks, s_list) {
                if (hlist_unhashed(&sb->s_instances))
                        continue;
                sb->s_count++;
@@ -1203,7 +1218,7 @@ struct dentry *mount_single(struct file_system_type *fs_type,
 EXPORT_SYMBOL(mount_single);
 
 struct dentry *
-mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
+mount_fs(struct file_system_type *type, int flags, const char *name, struct vfsmount *mnt, void *data)
 {
        struct dentry *root;
        struct super_block *sb;
@@ -1220,7 +1235,10 @@ mount_fs(struct file_system_type *type, int flags, const char *name, void *data)
                        goto out_free_secdata;
        }
 
-       root = type->mount(type, flags, name, data);
+       if (type->mount2)
+               root = type->mount2(mnt, type, flags, name, data);
+       else
+               root = type->mount(type, flags, name, data);
        if (IS_ERR(root)) {
                error = PTR_ERR(root);
                goto out_free_secdata;
index 83ac79a960dd1aea9aa79932bbb08de662e7abab..12f2aa594c5035e45a31de3211c7c1080b732e07 100644 (file)
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -9,7 +9,7 @@
 #include <linux/slab.h>
 #include <linux/export.h>
 #include <linux/namei.h>
-#include <linux/sched.h>
+#include <linux/sched/xacct.h>
 #include <linux/writeback.h>
 #include <linux/syscalls.h>
 #include <linux/linkage.h>
@@ -219,6 +219,7 @@ static int do_fsync(unsigned int fd, int datasync)
        if (f.file) {
                ret = vfs_fsync(f.file, datasync);
                fdput(f);
+               inc_syscfs(current);
        }
        return ret;
 }
index 16a5d5c82073cc38a132c98e392788e3c6db74b4..616a688f5d8fd738369b96de15a0c6f27689dce8 100644 (file)
@@ -88,7 +88,6 @@ const struct fscrypt_operations ubifs_crypt_operations = {
        .key_prefix             = "ubifs:",
        .get_context            = ubifs_crypt_get_context,
        .set_context            = ubifs_crypt_set_context,
-       .is_encrypted           = __ubifs_crypt_is_encrypted,
        .empty_dir              = ubifs_crypt_empty_dir,
        .max_namelen            = ubifs_crypt_max_namelen,
 };
index fdc311246807a4c3a018ff53e06f44614f5c6800..0164bcc827f891cf7d3e2cbead41989336244786 100644 (file)
@@ -38,7 +38,8 @@ void ubifs_set_inode_flags(struct inode *inode)
 {
        unsigned int flags = ubifs_inode(inode)->flags;
 
-       inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC);
+       inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_DIRSYNC |
+                           S_ENCRYPTED);
        if (flags & UBIFS_SYNC_FL)
                inode->i_flags |= S_SYNC;
        if (flags & UBIFS_APPEND_FL)
@@ -47,6 +48,8 @@ void ubifs_set_inode_flags(struct inode *inode)
                inode->i_flags |= S_IMMUTABLE;
        if (flags & UBIFS_DIRSYNC_FL)
                inode->i_flags |= S_DIRSYNC;
+       if (flags & UBIFS_CRYPT_FL)
+               inode->i_flags |= S_ENCRYPTED;
 }
 
 /*
index 5496b17b959c702408c933c3a98c460042e00543..7503e7cdf8702a61ce91576316bfce10bd63e113 100644 (file)
@@ -2007,12 +2007,6 @@ static struct ubifs_info *alloc_ubifs_info(struct ubi_volume_desc *ubi)
        return c;
 }
 
-#ifndef CONFIG_UBIFS_FS_ENCRYPTION
-const struct fscrypt_operations ubifs_crypt_operations = {
-       .is_encrypted           = __ubifs_crypt_is_encrypted,
-};
-#endif
-
 static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
 {
        struct ubifs_info *c = sb->s_fs_info;
@@ -2055,7 +2049,9 @@ static int ubifs_fill_super(struct super_block *sb, void *data, int silent)
                sb->s_maxbytes = c->max_inode_sz = MAX_LFS_FILESIZE;
        sb->s_op = &ubifs_super_operations;
        sb->s_xattr = ubifs_xattr_handlers;
+#ifdef CONFIG_UBIFS_FS_ENCRYPTION
        sb->s_cop = &ubifs_crypt_operations;
+#endif
 
        mutex_lock(&c->umount_mutex);
        err = mount_ubifs(c);
index cd43651f173141dd854d07ffc140a953cab92f10..63c7468147eb9b573db8270f55e8840febe99ffc 100644 (file)
 #include <linux/backing-dev.h>
 #include <linux/security.h>
 #include <linux/xattr.h>
-#ifdef CONFIG_UBIFS_FS_ENCRYPTION
-#include <linux/fscrypt_supp.h>
-#else
-#include <linux/fscrypt_notsupp.h>
-#endif
 #include <linux/random.h>
+
+#define __FS_HAS_ENCRYPTION IS_ENABLED(CONFIG_UBIFS_FS_ENCRYPTION)
+#include <linux/fscrypt.h>
+
 #include "ubifs-media.h"
 
 /* Version of this UBIFS implementation */
@@ -1835,18 +1834,13 @@ int ubifs_decrypt(const struct inode *inode, struct ubifs_data_node *dn,
 
 extern const struct fscrypt_operations ubifs_crypt_operations;
 
-static inline bool __ubifs_crypt_is_encrypted(struct inode *inode)
+static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
 {
-       struct ubifs_inode *ui = ubifs_inode(inode);
+       const struct ubifs_inode *ui = ubifs_inode(inode);
 
        return ui->flags & UBIFS_CRYPT_FL;
 }
 
-static inline bool ubifs_crypt_is_encrypted(const struct inode *inode)
-{
-       return __ubifs_crypt_is_encrypted((struct inode *)inode);
-}
-
 /* Normal UBIFS messages */
 __printf(2, 3)
 void ubifs_msg(const struct ubifs_info *c, const char *fmt, ...);
index c13eae819cbc889ffe245dc5928278b1ccc2f4c5..5ddc89d564fd4827f3d95beb6e4ce64d8fe963c3 100644 (file)
@@ -170,6 +170,7 @@ static int create_xattr(struct ubifs_info *c, struct inode *host,
        err = ubifs_jnl_update(c, host, nm, inode, 0, 1);
        if (err)
                goto out_cancel;
+       ubifs_set_inode_flags(host);
        mutex_unlock(&host_ui->ui_mutex);
 
        ubifs_release_budget(c, &req);
index 5aa392eae1c3c2e4ceb39db2da63d55d0b27a38a..5811bb06a0cc5d27d5106104fbc4c6408851c64c 100644 (file)
@@ -877,7 +877,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
                                 new_flags, vma->anon_vma,
                                 vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
-                                NULL_VM_UFFD_CTX);
+                                NULL_VM_UFFD_CTX,
+                                vma_get_anon_name(vma));
                if (prev)
                        vma = prev;
                else
@@ -1420,7 +1421,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
-                                ((struct vm_userfaultfd_ctx){ ctx }));
+                                ((struct vm_userfaultfd_ctx){ ctx }),
+                                vma_get_anon_name(vma));
                if (prev) {
                        vma = prev;
                        goto next;
@@ -1578,7 +1580,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
                prev = vma_merge(mm, prev, start, vma_end, new_flags,
                                 vma->anon_vma, vma->vm_file, vma->vm_pgoff,
                                 vma_policy(vma),
-                                NULL_VM_UFFD_CTX);
+                                NULL_VM_UFFD_CTX,
+                                vma_get_anon_name(vma));
                if (prev) {
                        vma = prev;
                        goto next;
index e4b3d7c2c9f55182049b475117cf3fe2c06e76d6..4f3b158f04a48504bb81eae264b9425f663867e6 100644 (file)
@@ -88,7 +88,7 @@ static int utimes_common(const struct path *path, struct timespec64 *times)
        }
 retry_deleg:
        inode_lock(inode);
-       error = notify_change(path->dentry, &newattrs, &delegated_inode);
+       error = notify_change2(path->mnt, path->dentry, &newattrs, &delegated_inode);
        inode_unlock(inode);
        if (delegated_inode) {
                error = break_deleg_wait(&delegated_inode);
index 61cd28ba25f364df5af103277924befb6d4a39a0..bf6f6e761b886e629220ead421dd298e3e88d5ad 100644 (file)
@@ -131,7 +131,7 @@ xattr_permission(struct inode *inode, const char *name, int mask)
                        return -EPERM;
        }
 
-       return inode_permission(inode, mask);
+       return inode_permission2(ERR_PTR(-EOPNOTSUPP), inode, mask);
 }
 
 int
index da8357ba11bcb45b52a2c23d2e0aedb9c51096e0..02dc2f3a76f74a472ab1dde8567fe21ad06487c7 100644 (file)
@@ -6,6 +6,15 @@
 #define AMBA_MMCI_H
 
 #include <linux/mmc/host.h>
+#include <linux/mmc/card.h>
+#include <linux/mmc/sdio_func.h>
+
+struct embedded_sdio_data {
+        struct sdio_cis cis;
+        struct sdio_cccr cccr;
+        struct sdio_embedded_func *funcs;
+        int num_funcs;
+};
 
 /**
  * struct mmci_platform_data - platform configuration for the MMCI
@@ -32,6 +41,7 @@ struct mmci_platform_data {
        int     gpio_wp;
        int     gpio_cd;
        bool    cd_invert;
+       struct embedded_sdio_data *embedded_sdio;
 };
 
 #endif
diff --git a/include/linux/android_aid.h b/include/linux/android_aid.h
new file mode 100644 (file)
index 0000000..6f1fa17
--- /dev/null
@@ -0,0 +1,28 @@
+/* include/linux/android_aid.h
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_ANDROID_AID_H
+#define _LINUX_ANDROID_AID_H
+
+/* AIDs that the kernel treats differently */
+#define AID_OBSOLETE_000 KGIDT_INIT(3001)  /* was NET_BT_ADMIN */
+#define AID_OBSOLETE_001 KGIDT_INIT(3002)  /* was NET_BT */
+#define AID_INET         KGIDT_INIT(3003)
+#define AID_NET_RAW      KGIDT_INIT(3004)
+#define AID_NET_ADMIN    KGIDT_INIT(3005)
+#define AID_NET_BW_STATS KGIDT_INIT(3006)  /* read bandwidth statistics */
+#define AID_NET_BW_ACCT  KGIDT_INIT(3007)  /* change bandwidth statistics accounting */
+
+#endif
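
These group IDs are typically consulted from networking code; a hedged sketch of such a check (the helper below is hypothetical, in the style of the paranoid-network checks that consume this header):

static inline bool example_current_may_use_sockets(void)
{
	/* Allow CAP_NET_RAW holders, or members of the Android "inet" group. */
	return capable(CAP_NET_RAW) || in_egroup_p(AID_INET);
}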
index d4fcb0efb896c3e4a40cb527c4da531bc58b3831..e7fe03600c0212f1dc4842119ffcb367d98d8fdb 100644 (file)
@@ -6,15 +6,35 @@
 #define _LINUX_ARCH_TOPOLOGY_H_
 
 #include <linux/types.h>
+#include <linux/percpu.h>
 
 void topology_normalize_cpu_scale(void);
+int topology_detect_flags(void);
+int topology_smt_flags(void);
+int topology_core_flags(void);
+int topology_cpu_flags(void);
+int topology_update_cpu_topology(void);
 
 struct device_node;
 bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
 
+DECLARE_PER_CPU(unsigned long, cpu_scale);
+
 struct sched_domain;
-unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu);
+static inline
+unsigned long topology_get_cpu_scale(struct sched_domain *sd, int cpu)
+{
+       return per_cpu(cpu_scale, cpu);
+}
 
 void topology_set_cpu_scale(unsigned int cpu, unsigned long capacity);
 
+DECLARE_PER_CPU(unsigned long, freq_scale);
+
+static inline
+unsigned long topology_get_freq_scale(struct sched_domain *sd, int cpu)
+{
+       return per_cpu(freq_scale, cpu);
+}
+
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
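
One way a consumer can apply the new per-CPU scale factors; a sketch only (the helper name and the shift by SCHED_CAPACITY_SHIFT are assumptions about how the frequency-invariance factor is meant to be used, not code from this patch):

static inline unsigned long example_freq_scaled(int cpu, unsigned long util)
{
	/* freq_scale is cur_freq/max_freq expressed on the SCHED_CAPACITY_SCALE (1024) range. */
	return (util * topology_get_freq_scale(NULL, cpu)) >> SCHED_CAPACITY_SHIFT;
}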
index 5c5be80ce802cea43190d08df64f29eee95f73f9..0961516d9de0f8b31e51abc591bc3a3138fa1a60 100644 (file)
@@ -198,6 +198,9 @@ struct bpf_prog_aux {
        struct bpf_map **used_maps;
        struct bpf_prog *prog;
        struct user_struct *user;
+#ifdef CONFIG_SECURITY
+       void *security;
+#endif
        union {
                struct work_struct work;
                struct rcu_head rcu;
@@ -253,6 +256,9 @@ int bpf_prog_test_run_skb(struct bpf_prog *prog, const union bpf_attr *kattr,
 #ifdef CONFIG_BPF_SYSCALL
 DECLARE_PER_CPU(int, bpf_prog_active);
 
+extern const struct file_operations bpf_map_fops;
+extern const struct file_operations bpf_prog_fops;
+
 #define BPF_PROG_TYPE(_id, _ops) \
        extern const struct bpf_verifier_ops _ops;
 #define BPF_MAP_TYPE(_id, _ops) \
@@ -282,11 +288,11 @@ void bpf_map_area_free(void *base);
 
 extern int sysctl_unprivileged_bpf_disabled;
 
-int bpf_map_new_fd(struct bpf_map *map);
+int bpf_map_new_fd(struct bpf_map *map, int flags);
 int bpf_prog_new_fd(struct bpf_prog *prog);
 
 int bpf_obj_pin_user(u32 ufd, const char __user *pathname);
-int bpf_obj_get_user(const char __user *pathname);
+int bpf_obj_get_user(const char __user *pathname, int flags);
 
 int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value);
 int bpf_percpu_array_copy(struct bpf_map *map, void *key, void *value);
@@ -305,6 +311,8 @@ int bpf_fd_htab_map_update_elem(struct bpf_map *map, struct file *map_file,
                                void *key, void *value, u64 map_flags);
 int bpf_fd_htab_map_lookup_elem(struct bpf_map *map, void *key, u32 *value);
 
+int bpf_get_file_flag(int flags);
+
 /* memcpy that is used with 8-byte aligned pointers, power-of-8 size and
  * forced to use 'long' read/writes to try to atomically copy long counters.
  * Best-effort only.  No barriers here, since it _will_ race with concurrent
@@ -381,7 +389,7 @@ static inline void __bpf_prog_uncharge(struct user_struct *user, u32 pages)
 {
 }
 
-static inline int bpf_obj_get_user(const char __user *pathname)
+static inline int bpf_obj_get_user(const char __user *pathname, int flags)
 {
        return -EOPNOTSUPP;
 }
index acb77dcff3b41dbb5d265ee358b1a6958d809c11..8996c092568bc233a2aa7df33717f8c9690b95b6 100644 (file)
@@ -21,6 +21,10 @@ SUBSYS(cpu)
 SUBSYS(cpuacct)
 #endif
 
+#if IS_ENABLED(CONFIG_SCHED_TUNE)
+SUBSYS(schedtune)
+#endif
+
 #if IS_ENABLED(CONFIG_BLK_CGROUP)
 SUBSYS(io)
 #endif
index cbf85c4c745f856b6c5e8f4f6d6c3dad80b887c7..065f3a8eb48615becdb299e0b50241794799facf 100644 (file)
@@ -920,6 +920,9 @@ static inline bool policy_has_boost_freq(struct cpufreq_policy *policy)
 extern void arch_freq_prepare_all(void);
 extern unsigned int arch_freq_get_on_cpu(int cpu);
 
+extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
+                               unsigned long max_freq);
+
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
 extern struct freq_attr cpufreq_freq_attr_scaling_boost_freqs;
index 8f7788d23b5732a3fb5c2b1d704b05d370d41954..b7c12ee882b978f9adca1b4ac024283b49ecc294 100644 (file)
@@ -214,7 +214,7 @@ static inline void cpuidle_use_deepest_state(bool enable)
 #endif
 
 /* kernel/sched/idle.c */
-extern void sched_idle_set_state(struct cpuidle_state *idle_state);
+extern void sched_idle_set_state(struct cpuidle_state *idle_state, int index);
 extern void default_idle_call(void);
 
 #ifdef CONFIG_ARCH_NEEDS_CPU_IDLE_COUPLED
index cc36484d29e1671826776ac4bcd3446a96474b1c..29c4257f9c5b14c3c66e2e57eef573f9a2ff7a10 100644 (file)
@@ -24,6 +24,7 @@
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/uaccess.h>
+#include <linux/completion.h>
 
 /*
  * Autoloaded crypto modules should only use a prefixed name to avoid allowing
@@ -475,6 +476,45 @@ struct crypto_alg {
        struct module *cra_module;
 } CRYPTO_MINALIGN_ATTR;
 
+/*
+ * A helper struct for waiting for completion of async crypto ops
+ */
+struct crypto_wait {
+       struct completion completion;
+       int err;
+};
+
+/*
+ * Macro for declaring a crypto op async wait object on stack
+ */
+#define DECLARE_CRYPTO_WAIT(_wait) \
+       struct crypto_wait _wait = { \
+               COMPLETION_INITIALIZER_ONSTACK((_wait).completion), 0 }
+
+/*
+ * Async ops completion helper functioons
+ */
+void crypto_req_done(struct crypto_async_request *req, int err);
+
+static inline int crypto_wait_req(int err, struct crypto_wait *wait)
+{
+       switch (err) {
+       case -EINPROGRESS:
+       case -EBUSY:
+               wait_for_completion(&wait->completion);
+               reinit_completion(&wait->completion);
+               err = wait->err;
+               break;
+       }
+
+       return err;
+}
+
+static inline void crypto_init_wait(struct crypto_wait *wait)
+{
+       init_completion(&wait->completion);
+}
+
 /*
  * Algorithm registration interface.
  */
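
The intended call pattern for these wait helpers, sketched for a hypothetical skcipher request (request setup elided; crypto_req_done() is registered as the completion callback and crypto_wait_req() blocks on -EINPROGRESS/-EBUSY):

	DECLARE_CRYPTO_WAIT(wait);
	int err;

	/* Complete the on-stack wait object when the async request finishes. */
	skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				      crypto_req_done, &wait);
	err = crypto_wait_req(crypto_skcipher_encrypt(req), &wait);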
index f05a659cdf348a0f2efa2f11b1a932a8f3181482..d8fcc02e378be17e2f83deaa974d7b700cdc6751 100644 (file)
@@ -149,6 +149,7 @@ struct dentry_operations {
        int (*d_manage)(const struct path *, bool);
        struct dentry *(*d_real)(struct dentry *, const struct inode *,
                                 unsigned int, unsigned int);
+       void (*d_canonical_path)(const struct path *, struct path *);
 } ____cacheline_aligned;
 
 /*
index a5538433c927abebad75ec6763a3e6ab8f1cc0e2..a79c930d1dc475078145431fc4a77e3ed2c443f2 100644 (file)
@@ -430,6 +430,12 @@ void dm_put(struct mapped_device *md);
 void dm_set_mdptr(struct mapped_device *md, void *ptr);
 void *dm_get_mdptr(struct mapped_device *md);
 
+/*
+ * Export the device via the ioctl interface (uses mdptr).
+ */
+int dm_ioctl_export(struct mapped_device *md, const char *name,
+                   const char *uuid);
+
 /*
  * A device can still be used while suspended, but I/O is deferred.
  */
@@ -459,6 +465,13 @@ union map_info *dm_get_rq_mapinfo(struct request *rq);
 
 struct queue_limits *dm_get_queue_limits(struct mapped_device *md);
 
+void dm_lock_md_type(struct mapped_device *md);
+void dm_unlock_md_type(struct mapped_device *md);
+void dm_set_md_type(struct mapped_device *md, unsigned type);
+unsigned dm_get_md_type(struct mapped_device *md);
+int dm_setup_md_queue(struct mapped_device *md, struct dm_table *t);
+unsigned dm_table_get_type(struct dm_table *t);
+
 /*
  * Geometry functions.
  */
index 171895072435bd7efa0303e401a3f907d12cdc7d..6385ecd031559e602c936a523fb9ccd0542cfe14 100644 (file)
@@ -111,6 +111,7 @@ struct dma_fence_cb {
  * @get_driver_name: returns the driver name.
  * @get_timeline_name: return the name of the context this fence belongs to.
  * @enable_signaling: enable software signaling of fence.
+ * @disable_signaling: disable software signaling of fence (optional).
  * @signaled: [optional] peek whether the fence is signaled, can be null.
  * @wait: custom wait implementation, or dma_fence_default_wait.
  * @release: [optional] called on destruction of fence, can be null
@@ -170,6 +171,7 @@ struct dma_fence_ops {
        const char * (*get_driver_name)(struct dma_fence *fence);
        const char * (*get_timeline_name)(struct dma_fence *fence);
        bool (*enable_signaling)(struct dma_fence *fence);
+       void (*disable_signaling)(struct dma_fence *fence);
        bool (*signaled)(struct dma_fence *fence);
        signed long (*wait)(struct dma_fence *fence,
                            bool intr, signed long timeout);
index 2a0c453d72354ef6394cf06d2831326ff6bf7aaf..43e98d30d2df210bba056e89ae64e0d2eeddd6a4 100644 (file)
@@ -36,6 +36,8 @@
 #define F2FS_NODE_INO(sbi)     ((sbi)->node_ino_num)
 #define F2FS_META_INO(sbi)     ((sbi)->meta_ino_num)
 
+#define F2FS_MAX_QUOTAS                3
+
 #define F2FS_IO_SIZE(sbi)      (1 << (sbi)->write_io_size_bits) /* Blocks */
 #define F2FS_IO_SIZE_KB(sbi)   (1 << ((sbi)->write_io_size_bits + 2)) /* KB */
 #define F2FS_IO_SIZE_BYTES(sbi)        (1 << ((sbi)->write_io_size_bits + 12)) /* B */
@@ -108,7 +110,8 @@ struct f2fs_super_block {
        __u8 encryption_level;          /* versioning level for encryption */
        __u8 encrypt_pw_salt[16];       /* Salt used for string2key algorithm */
        struct f2fs_device devs[MAX_DEVICES];   /* device list */
-       __u8 reserved[327];             /* valid reserved region */
+       __le32 qf_ino[F2FS_MAX_QUOTAS]; /* quota inode numbers */
+       __u8 reserved[315];             /* valid reserved region */
 } __packed;
 
 /*
@@ -184,7 +187,8 @@ struct f2fs_extent {
 } __packed;
 
 #define F2FS_NAME_LEN          255
-#define F2FS_INLINE_XATTR_ADDRS        50      /* 200 bytes for inline xattrs */
+/* 200 bytes for inline xattrs by default */
+#define DEFAULT_INLINE_XATTR_ADDRS     50
 #define DEF_ADDRS_PER_INODE    923     /* Address Pointers in an Inode */
 #define CUR_ADDRS_PER_INODE(inode)     (DEF_ADDRS_PER_INODE - \
                                        get_extra_isize(inode))
@@ -238,7 +242,7 @@ struct f2fs_inode {
        union {
                struct {
                        __le16 i_extra_isize;   /* extra inode attribute size */
-                       __le16 i_padding;       /* padding */
+                       __le16 i_inline_xattr_size;     /* inline xattr size, unit: 4 bytes */
                        __le32 i_projid;        /* project id */
                        __le32 i_inode_checksum;/* inode meta checksum */
                        __le32 i_extra_end[0];  /* for attribute size calculation */
index 440281f8564d8aadb3fd8d7e2286d6e9189824f3..92a33ab261ebca5fbe7632d8e3bad64c362d6377 100644 (file)
@@ -1596,13 +1596,21 @@ extern bool inode_owner_or_capable(const struct inode *inode);
  * VFS helper functions..
  */
 extern int vfs_create(struct inode *, struct dentry *, umode_t, bool);
+extern int vfs_create2(struct vfsmount *, struct inode *, struct dentry *, umode_t, bool);
 extern int vfs_mkdir(struct inode *, struct dentry *, umode_t);
+extern int vfs_mkdir2(struct vfsmount *, struct inode *, struct dentry *, umode_t);
 extern int vfs_mknod(struct inode *, struct dentry *, umode_t, dev_t);
+extern int vfs_mknod2(struct vfsmount *, struct inode *, struct dentry *, umode_t, dev_t);
 extern int vfs_symlink(struct inode *, struct dentry *, const char *);
+extern int vfs_symlink2(struct vfsmount *, struct inode *, struct dentry *, const char *);
 extern int vfs_link(struct dentry *, struct inode *, struct dentry *, struct inode **);
+extern int vfs_link2(struct vfsmount *, struct dentry *, struct inode *, struct dentry *, struct inode **);
 extern int vfs_rmdir(struct inode *, struct dentry *);
+extern int vfs_rmdir2(struct vfsmount *, struct inode *, struct dentry *);
 extern int vfs_unlink(struct inode *, struct dentry *, struct inode **);
+extern int vfs_unlink2(struct vfsmount *, struct inode *, struct dentry *, struct inode **);
 extern int vfs_rename(struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
+extern int vfs_rename2(struct vfsmount *, struct inode *, struct dentry *, struct inode *, struct dentry *, struct inode **, unsigned int);
 extern int vfs_whiteout(struct inode *, struct dentry *);
 
 extern struct dentry *vfs_tmpfile(struct dentry *dentry, umode_t mode,
@@ -1733,6 +1741,7 @@ struct inode_operations {
        struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
        const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
        int (*permission) (struct inode *, int);
+       int (*permission2) (struct vfsmount *, struct inode *, int);
        struct posix_acl * (*get_acl)(struct inode *, int);
 
        int (*readlink) (struct dentry *, char __user *,int);
@@ -1747,7 +1756,8 @@ struct inode_operations {
        int (*rename) (struct inode *, struct dentry *,
                        struct inode *, struct dentry *, unsigned int);
        int (*setattr) (struct dentry *, struct iattr *);
-       int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
+       int (*setattr2) (struct vfsmount *, struct dentry *, struct iattr *);
+       int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
        ssize_t (*listxattr) (struct dentry *, char *, size_t);
        int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
                      u64 len);
@@ -1815,9 +1825,13 @@ struct super_operations {
        int (*unfreeze_fs) (struct super_block *);
        int (*statfs) (struct dentry *, struct kstatfs *);
        int (*remount_fs) (struct super_block *, int *, char *);
+       int (*remount_fs2) (struct vfsmount *, struct super_block *, int *, char *);
+       void *(*clone_mnt_data) (void *);
+       void (*copy_mnt_data) (void *, void *);
        void (*umount_begin) (struct super_block *);
 
        int (*show_options)(struct seq_file *, struct dentry *);
+       int (*show_options2)(struct vfsmount *,struct seq_file *, struct dentry *);
        int (*show_devname)(struct seq_file *, struct dentry *);
        int (*show_path)(struct seq_file *, struct dentry *);
        int (*show_stats)(struct seq_file *, struct dentry *);
@@ -1854,6 +1868,7 @@ struct super_operations {
 #else
 #define S_DAX          0       /* Make all the DAX code disappear */
 #endif
+#define S_ENCRYPTED    16384   /* Encrypted file (using fs/crypto/) */
 
 /*
  * Note that nosuid etc flags are inode-specific: setting some file-system
@@ -1893,6 +1908,7 @@ static inline bool sb_rdonly(const struct super_block *sb) { return sb->s_flags
 #define IS_AUTOMOUNT(inode)    ((inode)->i_flags & S_AUTOMOUNT)
 #define IS_NOSEC(inode)                ((inode)->i_flags & S_NOSEC)
 #define IS_DAX(inode)          ((inode)->i_flags & S_DAX)
+#define IS_ENCRYPTED(inode)    ((inode)->i_flags & S_ENCRYPTED)
 
 #define IS_WHITEOUT(inode)     (S_ISCHR(inode->i_mode) && \
                                 (inode)->i_rdev == WHITEOUT_DEV)
@@ -2075,6 +2091,9 @@ struct file_system_type {
 #define FS_RENAME_DOES_D_MOVE  32768   /* FS will handle d_move() during rename() internally. */
        struct dentry *(*mount) (struct file_system_type *, int,
                       const char *, void *);
+       struct dentry *(*mount2) (struct vfsmount *, struct file_system_type *, int,
+                              const char *, void *);
+       void *(*alloc_mnt_data) (void);
        void (*kill_sb) (struct super_block *);
        struct module *owner;
        struct file_system_type * next;
@@ -2376,6 +2395,8 @@ struct filename {
 extern long vfs_truncate(const struct path *, loff_t);
 extern int do_truncate(struct dentry *, loff_t start, unsigned int time_attrs,
                       struct file *filp);
+extern int do_truncate2(struct vfsmount *, struct dentry *, loff_t start,
+                       unsigned int time_attrs, struct file *filp);
 extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
                        loff_t len);
 extern long do_sys_open(int dfd, const char __user *filename, int flags,
@@ -2680,8 +2701,11 @@ extern void emergency_remount(void);
 extern sector_t bmap(struct inode *, sector_t);
 #endif
 extern int notify_change(struct dentry *, struct iattr *, struct inode **);
+extern int notify_change2(struct vfsmount *, struct dentry *, struct iattr *, struct inode **);
 extern int inode_permission(struct inode *, int);
+extern int inode_permission2(struct vfsmount *, struct inode *, int);
 extern int __inode_permission(struct inode *, int);
+extern int __inode_permission2(struct vfsmount *, struct inode *, int);
 extern int generic_permission(struct inode *, int);
 extern int __check_sticky(struct inode *dir, struct inode *inode);
 
diff --git a/include/linux/fscrypt.h b/include/linux/fscrypt.h
new file mode 100644 (file)
index 0000000..08b4b40
--- /dev/null
@@ -0,0 +1,294 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * fscrypt.h: declarations for per-file encryption
+ *
+ * Filesystems that implement per-file encryption include this header
+ * file with __FS_HAS_ENCRYPTION set according to whether that filesystem
+ * is being built with encryption support or not.
+ *
+ * Copyright (C) 2015, Google, Inc.
+ *
+ * Written by Michael Halcrow, 2015.
+ * Modified by Jaegeuk Kim, 2015.
+ */
+#ifndef _LINUX_FSCRYPT_H
+#define _LINUX_FSCRYPT_H
+
+#include <linux/key.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/bio.h>
+#include <linux/dcache.h>
+#include <crypto/skcipher.h>
+#include <uapi/linux/fs.h>
+
+#define FS_CRYPTO_BLOCK_SIZE           16
+
+struct fscrypt_info;
+
+struct fscrypt_ctx {
+       union {
+               struct {
+                       struct page *bounce_page;       /* Ciphertext page */
+                       struct page *control_page;      /* Original page  */
+               } w;
+               struct {
+                       struct bio *bio;
+                       struct work_struct work;
+               } r;
+               struct list_head free_list;     /* Free list */
+       };
+       u8 flags;                               /* Flags */
+};
+
+/**
+ * For encrypted symlinks, the ciphertext length is stored at the beginning
+ * of the string in little-endian format.
+ */
+struct fscrypt_symlink_data {
+       __le16 len;
+       char encrypted_path[1];
+} __packed;
+
+struct fscrypt_str {
+       unsigned char *name;
+       u32 len;
+};
+
+struct fscrypt_name {
+       const struct qstr *usr_fname;
+       struct fscrypt_str disk_name;
+       u32 hash;
+       u32 minor_hash;
+       struct fscrypt_str crypto_buf;
+};
+
+#define FSTR_INIT(n, l)                { .name = n, .len = l }
+#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
+#define fname_name(p)          ((p)->disk_name.name)
+#define fname_len(p)           ((p)->disk_name.len)
+
+/*
+ * fscrypt superblock flags
+ */
+#define FS_CFLG_OWN_PAGES (1U << 1)
+
+/*
+ * crypto operations for filesystems
+ */
+struct fscrypt_operations {
+       unsigned int flags;
+       const char *key_prefix;
+       int (*get_context)(struct inode *, void *, size_t);
+       int (*set_context)(struct inode *, const void *, size_t, void *);
+       bool (*dummy_context)(struct inode *);
+       bool (*empty_dir)(struct inode *);
+       unsigned (*max_namelen)(struct inode *);
+};
+
+/* Maximum value for the third parameter of fscrypt_operations.set_context(). */
+#define FSCRYPT_SET_CONTEXT_MAX_SIZE   28
+
+static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
+{
+       if (inode->i_sb->s_cop->dummy_context &&
+                               inode->i_sb->s_cop->dummy_context(inode))
+               return true;
+       return false;
+}
+
+static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
+                                       u32 filenames_mode)
+{
+       if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
+           filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
+               return true;
+
+       if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
+           filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
+               return true;
+
+       return false;
+}
+
+static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
+{
+       if (str->len == 1 && str->name[0] == '.')
+               return true;
+
+       if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
+               return true;
+
+       return false;
+}
+
+#if __FS_HAS_ENCRYPTION
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+       return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
+}
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+       return (inode->i_crypt_info != NULL);
+}
+
+#include <linux/fscrypt_supp.h>
+
+#else /* !__FS_HAS_ENCRYPTION */
+
+static inline struct page *fscrypt_control_page(struct page *page)
+{
+       WARN_ON_ONCE(1);
+       return ERR_PTR(-EINVAL);
+}
+
+static inline bool fscrypt_has_encryption_key(const struct inode *inode)
+{
+       return 0;
+}
+
+#include <linux/fscrypt_notsupp.h>
+#endif /* __FS_HAS_ENCRYPTION */
+
+/**
+ * fscrypt_require_key - require an inode's encryption key
+ * @inode: the inode we need the key for
+ *
+ * If the inode is encrypted, set up its encryption key if not already done.
+ * Then require that the key be present and return -ENOKEY otherwise.
+ *
+ * No locks are needed, and the key will live as long as the struct inode --- so
+ * it won't go away from under you.
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ * if a problem occurred while setting up the encryption key.
+ */
+static inline int fscrypt_require_key(struct inode *inode)
+{
+       if (IS_ENCRYPTED(inode)) {
+               int err = fscrypt_get_encryption_info(inode);
+
+               if (err)
+                       return err;
+               if (!fscrypt_has_encryption_key(inode))
+                       return -ENOKEY;
+       }
+       return 0;
+}
+
+/**
+ * fscrypt_prepare_link - prepare to link an inode into a possibly-encrypted directory
+ * @old_dentry: an existing dentry for the inode being linked
+ * @dir: the target directory
+ * @dentry: negative dentry for the target filename
+ *
+ * A new link can only be added to an encrypted directory if the directory's
+ * encryption key is available --- since otherwise we'd have no way to encrypt
+ * the filename.  Therefore, we first set up the directory's encryption key (if
+ * not already done) and return an error if it's unavailable.
+ *
+ * We also verify that the link will not violate the constraint that all files
+ * in an encrypted directory tree use the same encryption policy.
+ *
+ * Return: 0 on success, -ENOKEY if the directory's encryption key is missing,
+ * -EPERM if the link would result in an inconsistent encryption policy, or
+ * another -errno code.
+ */
+static inline int fscrypt_prepare_link(struct dentry *old_dentry,
+                                      struct inode *dir,
+                                      struct dentry *dentry)
+{
+       if (IS_ENCRYPTED(dir))
+               return __fscrypt_prepare_link(d_inode(old_dentry), dir);
+       return 0;
+}
+
+/**
+ * fscrypt_prepare_rename - prepare for a rename between possibly-encrypted directories
+ * @old_dir: source directory
+ * @old_dentry: dentry for source file
+ * @new_dir: target directory
+ * @new_dentry: dentry for target location (may be negative unless exchanging)
+ * @flags: rename flags (we care at least about %RENAME_EXCHANGE)
+ *
+ * Prepare for ->rename() where the source and/or target directories may be
+ * encrypted.  A new link can only be added to an encrypted directory if the
+ * directory's encryption key is available --- since otherwise we'd have no way
+ * to encrypt the filename.  A rename to an existing name, on the other hand,
+ * *is* cryptographically possible without the key.  However, we take the more
+ * conservative approach and just forbid all no-key renames.
+ *
+ * We also verify that the rename will not violate the constraint that all files
+ * in an encrypted directory tree use the same encryption policy.
+ *
+ * Return: 0 on success, -ENOKEY if an encryption key is missing, -EPERM if the
+ * rename would cause inconsistent encryption policies, or another -errno code.
+ */
+static inline int fscrypt_prepare_rename(struct inode *old_dir,
+                                        struct dentry *old_dentry,
+                                        struct inode *new_dir,
+                                        struct dentry *new_dentry,
+                                        unsigned int flags)
+{
+       if (IS_ENCRYPTED(old_dir) || IS_ENCRYPTED(new_dir))
+               return __fscrypt_prepare_rename(old_dir, old_dentry,
+                                               new_dir, new_dentry, flags);
+       return 0;
+}
+
+/**
+ * fscrypt_prepare_lookup - prepare to lookup a name in a possibly-encrypted directory
+ * @dir: directory being searched
+ * @dentry: filename being looked up
+ * @flags: lookup flags
+ *
+ * Prepare for ->lookup() in a directory which may be encrypted.  Lookups can be
+ * done with or without the directory's encryption key; without the key,
+ * filenames are presented in encrypted form.  Therefore, we'll try to set up
+ * the directory's encryption key, but even without it the lookup can continue.
+ *
+ * To allow invalidating stale dentries if the directory's encryption key is
+ * added later, we also install a custom ->d_revalidate() method and use the
+ * DCACHE_ENCRYPTED_WITH_KEY flag to indicate whether a given dentry is a
+ * plaintext name (flag set) or a ciphertext name (flag cleared).
+ *
+ * Return: 0 on success, -errno if a problem occurred while setting up the
+ * encryption key
+ */
+static inline int fscrypt_prepare_lookup(struct inode *dir,
+                                        struct dentry *dentry,
+                                        unsigned int flags)
+{
+       if (IS_ENCRYPTED(dir))
+               return __fscrypt_prepare_lookup(dir, dentry);
+       return 0;
+}
+
+/**
+ * fscrypt_prepare_setattr - prepare to change a possibly-encrypted inode's attributes
+ * @dentry: dentry through which the inode is being changed
+ * @attr: attributes to change
+ *
+ * Prepare for ->setattr() on a possibly-encrypted inode.  On an encrypted file,
+ * most attribute changes are allowed even without the encryption key.  However,
+ * without the encryption key we do have to forbid truncates.  This is needed
+ * because the size being truncated to may not be a multiple of the filesystem
+ * block size, and in that case we'd have to decrypt the final block, zero the
+ * portion past i_size, and re-encrypt it.  (We *could* allow truncating to a
+ * filesystem block boundary, but it's simpler to just forbid all truncates ---
+ * and we already forbid all other contents modifications without the key.)
+ *
+ * Return: 0 on success, -ENOKEY if the key is missing, or another -errno code
+ * if a problem occurred while setting up the encryption key.
+ */
+static inline int fscrypt_prepare_setattr(struct dentry *dentry,
+                                         struct iattr *attr)
+{
+       if (attr->ia_valid & ATTR_SIZE)
+               return fscrypt_require_key(d_inode(dentry));
+       return 0;
+}
+
+#endif /* _LINUX_FSCRYPT_H */
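
The prepare hooks above are meant to be called at the top of the corresponding inode operations, before any on-disk change is made; a sketch for a hypothetical "examplefs" ->link handler:

static int examplefs_link(struct dentry *old_dentry, struct inode *dir,
			  struct dentry *dentry)
{
	int err = fscrypt_prepare_link(old_dentry, dir, dentry);

	if (err)
		return err;
	/* ... filesystem-specific link work ... */
	return 0;
}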
diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h
deleted file mode 100644 (file)
index 854d724..0000000
+++ /dev/null
@@ -1,142 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * fscrypt_common.h: common declarations for per-file encryption
- *
- * Copyright (C) 2015, Google, Inc.
- *
- * Written by Michael Halcrow, 2015.
- * Modified by Jaegeuk Kim, 2015.
- */
-
-#ifndef _LINUX_FSCRYPT_COMMON_H
-#define _LINUX_FSCRYPT_COMMON_H
-
-#include <linux/key.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/bio.h>
-#include <linux/dcache.h>
-#include <crypto/skcipher.h>
-#include <uapi/linux/fs.h>
-
-#define FS_CRYPTO_BLOCK_SIZE           16
-
-struct fscrypt_info;
-
-struct fscrypt_ctx {
-       union {
-               struct {
-                       struct page *bounce_page;       /* Ciphertext page */
-                       struct page *control_page;      /* Original page  */
-               } w;
-               struct {
-                       struct bio *bio;
-                       struct work_struct work;
-               } r;
-               struct list_head free_list;     /* Free list */
-       };
-       u8 flags;                               /* Flags */
-};
-
-/**
- * For encrypted symlinks, the ciphertext length is stored at the beginning
- * of the string in little-endian format.
- */
-struct fscrypt_symlink_data {
-       __le16 len;
-       char encrypted_path[1];
-} __packed;
-
-struct fscrypt_str {
-       unsigned char *name;
-       u32 len;
-};
-
-struct fscrypt_name {
-       const struct qstr *usr_fname;
-       struct fscrypt_str disk_name;
-       u32 hash;
-       u32 minor_hash;
-       struct fscrypt_str crypto_buf;
-};
-
-#define FSTR_INIT(n, l)                { .name = n, .len = l }
-#define FSTR_TO_QSTR(f)                QSTR_INIT((f)->name, (f)->len)
-#define fname_name(p)          ((p)->disk_name.name)
-#define fname_len(p)           ((p)->disk_name.len)
-
-/*
- * fscrypt superblock flags
- */
-#define FS_CFLG_OWN_PAGES (1U << 1)
-
-/*
- * crypto opertions for filesystems
- */
-struct fscrypt_operations {
-       unsigned int flags;
-       const char *key_prefix;
-       int (*get_context)(struct inode *, void *, size_t);
-       int (*set_context)(struct inode *, const void *, size_t, void *);
-       bool (*dummy_context)(struct inode *);
-       bool (*is_encrypted)(struct inode *);
-       bool (*empty_dir)(struct inode *);
-       unsigned (*max_namelen)(struct inode *);
-};
-
-/* Maximum value for the third parameter of fscrypt_operations.set_context(). */
-#define FSCRYPT_SET_CONTEXT_MAX_SIZE   28
-
-static inline bool fscrypt_dummy_context_enabled(struct inode *inode)
-{
-       if (inode->i_sb->s_cop->dummy_context &&
-                               inode->i_sb->s_cop->dummy_context(inode))
-               return true;
-       return false;
-}
-
-static inline bool fscrypt_valid_enc_modes(u32 contents_mode,
-                                       u32 filenames_mode)
-{
-       if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC &&
-           filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS)
-               return true;
-
-       if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS &&
-           filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS)
-               return true;
-
-       return false;
-}
-
-static inline bool fscrypt_is_dot_dotdot(const struct qstr *str)
-{
-       if (str->len == 1 && str->name[0] == '.')
-               return true;
-
-       if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.')
-               return true;
-
-       return false;
-}
-
-static inline struct page *fscrypt_control_page(struct page *page)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-       return ((struct fscrypt_ctx *)page_private(page))->w.control_page;
-#else
-       WARN_ON_ONCE(1);
-       return ERR_PTR(-EINVAL);
-#endif
-}
-
-static inline int fscrypt_has_encryption_key(const struct inode *inode)
-{
-#if IS_ENABLED(CONFIG_FS_ENCRYPTION)
-       return (inode->i_crypt_info != NULL);
-#else
-       return 0;
-#endif
-}
-
-#endif /* _LINUX_FSCRYPT_COMMON_H */
index 19609ceea350333c76156e337b975f16be62dcce..63e58808519aaeb60aea66287b40963668a8c6bc 100644 (file)
@@ -4,13 +4,16 @@
  *
  * This stubs out the fscrypt functions for filesystems configured without
  * encryption support.
+ *
+ * Do not include this file directly. Use fscrypt.h instead!
  */
+#ifndef _LINUX_FSCRYPT_H
+#error "Incorrect include of linux/fscrypt_notsupp.h!"
+#endif
 
 #ifndef _LINUX_FSCRYPT_NOTSUPP_H
 #define _LINUX_FSCRYPT_NOTSUPP_H
 
-#include <linux/fscrypt_common.h>
-
 /* crypto.c */
 static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode,
                                                  gfp_t gfp_flags)
@@ -98,7 +101,7 @@ static inline int fscrypt_setup_filename(struct inode *dir,
                                         const struct qstr *iname,
                                         int lookup, struct fscrypt_name *fname)
 {
-       if (dir->i_sb->s_cop->is_encrypted(dir))
+       if (IS_ENCRYPTED(dir))
                return -EOPNOTSUPP;
 
        memset(fname, 0, sizeof(struct fscrypt_name));
@@ -175,4 +178,34 @@ static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk,
        return -EOPNOTSUPP;
 }
 
+/* hooks.c */
+
+static inline int fscrypt_file_open(struct inode *inode, struct file *filp)
+{
+       if (IS_ENCRYPTED(inode))
+               return -EOPNOTSUPP;
+       return 0;
+}
+
+static inline int __fscrypt_prepare_link(struct inode *inode,
+                                        struct inode *dir)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_rename(struct inode *old_dir,
+                                          struct dentry *old_dentry,
+                                          struct inode *new_dir,
+                                          struct dentry *new_dentry,
+                                          unsigned int flags)
+{
+       return -EOPNOTSUPP;
+}
+
+static inline int __fscrypt_prepare_lookup(struct inode *dir,
+                                          struct dentry *dentry)
+{
+       return -EOPNOTSUPP;
+}
+
 #endif /* _LINUX_FSCRYPT_NOTSUPP_H */
index 5153dce22f09c12e17a726350e9c55417abc2fed..cf9e9fc02f0afe7348404ca769927d587a63ec64 100644 (file)
@@ -2,14 +2,15 @@
 /*
  * fscrypt_supp.h
  *
- * This is included by filesystems configured with encryption support.
+ * Do not include this file directly. Use fscrypt.h instead!
  */
+#ifndef _LINUX_FSCRYPT_H
+#error "Incorrect include of linux/fscrypt_supp.h!"
+#endif
 
 #ifndef _LINUX_FSCRYPT_SUPP_H
 #define _LINUX_FSCRYPT_SUPP_H
 
-#include <linux/fscrypt_common.h>
-
 /* crypto.c */
 extern struct kmem_cache *fscrypt_info_cachep;
 extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t);
@@ -143,4 +144,14 @@ extern void fscrypt_pullback_bio_page(struct page **, bool);
 extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t,
                                 unsigned int);
 
+/* hooks.c */
+extern int fscrypt_file_open(struct inode *inode, struct file *filp);
+extern int __fscrypt_prepare_link(struct inode *inode, struct inode *dir);
+extern int __fscrypt_prepare_rename(struct inode *old_dir,
+                                   struct dentry *old_dentry,
+                                   struct inode *new_dir,
+                                   struct dentry *new_dentry,
+                                   unsigned int flags);
+extern int __fscrypt_prepare_lookup(struct inode *dir, struct dentry *dentry);
+
 #endif /* _LINUX_FSCRYPT_SUPP_H */
index bdaf22582f6ea5c59a1eb9e4e7c319854087df42..4636b8f8893e1908fd013fcca7f5aeec02e82546 100644 (file)
@@ -214,12 +214,19 @@ static inline void fsnotify_modify(struct file *file)
 static inline void fsnotify_open(struct file *file)
 {
        const struct path *path = &file->f_path;
+       struct path lower_path;
        struct inode *inode = path->dentry->d_inode;
        __u32 mask = FS_OPEN;
 
        if (S_ISDIR(inode->i_mode))
                mask |= FS_ISDIR;
 
+       if (path->dentry->d_op && path->dentry->d_op->d_canonical_path) {
+               path->dentry->d_op->d_canonical_path(path, &lower_path);
+               fsnotify_parent(&lower_path, NULL, mask);
+               fsnotify(lower_path.dentry->d_inode, mask, &lower_path, FSNOTIFY_EVENT_PATH, NULL, 0);
+               path_put(&lower_path);
+       }
        fsnotify_parent(path, NULL, mask);
        fsnotify(inode, mask, path, FSNOTIFY_EVENT_PATH, NULL, 0);
 }
index e54d257983f28c4e395d9a7bf871652e7f89c3de..07edd89dca53e378b0609490196350d809d96e84 100644 (file)
@@ -743,7 +743,8 @@ static inline unsigned long get_lock_parent_ip(void)
   static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { }
 #endif
 
-#ifdef CONFIG_PREEMPT_TRACER
+#if defined(CONFIG_PREEMPT_TRACER) || \
+       (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
   extern void trace_preempt_on(unsigned long a0, unsigned long a1);
   extern void trace_preempt_off(unsigned long a0, unsigned long a1);
 #else
diff --git a/include/linux/gpio_event.h b/include/linux/gpio_event.h
new file mode 100644 (file)
index 0000000..2613fc5
--- /dev/null
@@ -0,0 +1,170 @@
+/* include/linux/gpio_event.h
+ *
+ * Copyright (C) 2007 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_GPIO_EVENT_H
+#define _LINUX_GPIO_EVENT_H
+
+#include <linux/input.h>
+
+struct gpio_event_input_devs {
+       int count;
+       struct input_dev *dev[];
+};
+enum {
+       GPIO_EVENT_FUNC_UNINIT  = 0x0,
+       GPIO_EVENT_FUNC_INIT    = 0x1,
+       GPIO_EVENT_FUNC_SUSPEND = 0x2,
+       GPIO_EVENT_FUNC_RESUME  = 0x3,
+};
+struct gpio_event_info {
+       int (*func)(struct gpio_event_input_devs *input_devs,
+                   struct gpio_event_info *info,
+                   void **data, int func);
+       int (*event)(struct gpio_event_input_devs *input_devs,
+                    struct gpio_event_info *info,
+                    void **data, unsigned int dev, unsigned int type,
+                    unsigned int code, int value); /* out events */
+       bool no_suspend;
+};
+
+struct gpio_event_platform_data {
+       const char *name;
+       struct gpio_event_info **info;
+       size_t info_count;
+       int (*power)(const struct gpio_event_platform_data *pdata, bool on);
+       const char *names[]; /* If name is NULL, names contains a NULL- */
+                            /* terminated list of input devices to create */
+};
+
+#define GPIO_EVENT_DEV_NAME "gpio-event"
+
+/* Key matrix */
+
+enum gpio_event_matrix_flags {
+       /* unset: drive active output low, set: drive active output high */
+       GPIOKPF_ACTIVE_HIGH              = 1U << 0,
+       GPIOKPF_DEBOUNCE                 = 1U << 1,
+       GPIOKPF_REMOVE_SOME_PHANTOM_KEYS = 1U << 2,
+       GPIOKPF_REMOVE_PHANTOM_KEYS      = GPIOKPF_REMOVE_SOME_PHANTOM_KEYS |
+                                          GPIOKPF_DEBOUNCE,
+       GPIOKPF_DRIVE_INACTIVE           = 1U << 3,
+       GPIOKPF_LEVEL_TRIGGERED_IRQ      = 1U << 4,
+       GPIOKPF_PRINT_UNMAPPED_KEYS      = 1U << 16,
+       GPIOKPF_PRINT_MAPPED_KEYS        = 1U << 17,
+       GPIOKPF_PRINT_PHANTOM_KEYS       = 1U << 18,
+};
+
+#define MATRIX_CODE_BITS (10)
+#define MATRIX_KEY_MASK ((1U << MATRIX_CODE_BITS) - 1)
+#define MATRIX_KEY(dev, code) \
+       (((dev) << MATRIX_CODE_BITS) | (code & MATRIX_KEY_MASK))
+
+extern int gpio_event_matrix_func(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data, int func);
+struct gpio_event_matrix_info {
+       /* initialize to gpio_event_matrix_func */
+       struct gpio_event_info info;
+       /* size must be ninputs * noutputs */
+       const unsigned short *keymap;
+       unsigned int *input_gpios;
+       unsigned int *output_gpios;
+       unsigned int ninputs;
+       unsigned int noutputs;
+       /* time to wait before reading inputs after driving each output */
+       ktime_t settle_time;
+       /* time to wait before scanning the keypad a second time */
+       ktime_t debounce_delay;
+       ktime_t poll_time;
+       unsigned flags;
+};
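/*
 * Illustrative sketch (not part of this patch): a board file might describe a
 * 2x2 key matrix roughly like this, using MATRIX_KEY() to encode the target
 * input device index and keycode into each keymap slot. GPIO numbers and key
 * assignments below are made up.
 */
static unsigned int demo_output_gpios[] = { 34, 35 };  /* rows (hypothetical) */
static unsigned int demo_input_gpios[]  = { 42, 43 };  /* columns (hypothetical) */

static const unsigned short demo_keymap[] = {
        /* row 0 */ MATRIX_KEY(0, KEY_VOLUMEUP), MATRIX_KEY(0, KEY_VOLUMEDOWN),
        /* row 1 */ MATRIX_KEY(0, KEY_POWER),    MATRIX_KEY(0, KEY_HOME),
};

static struct gpio_event_matrix_info demo_matrix_info = {
        .info.func    = gpio_event_matrix_func,
        .keymap       = demo_keymap,
        .output_gpios = demo_output_gpios,
        .input_gpios  = demo_input_gpios,
        .noutputs     = ARRAY_SIZE(demo_output_gpios),
        .ninputs      = ARRAY_SIZE(demo_input_gpios),
        .flags        = GPIOKPF_ACTIVE_HIGH | GPIOKPF_REMOVE_PHANTOM_KEYS,
};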
+
+/* Directly connected inputs and outputs */
+
+enum gpio_event_direct_flags {
+       GPIOEDF_ACTIVE_HIGH         = 1U << 0,
+/*     GPIOEDF_USE_DOWN_IRQ        = 1U << 1, */
+/*     GPIOEDF_USE_IRQ             = (1U << 2) | GPIOIDF_USE_DOWN_IRQ, */
+       GPIOEDF_PRINT_KEYS          = 1U << 8,
+       GPIOEDF_PRINT_KEY_DEBOUNCE  = 1U << 9,
+       GPIOEDF_PRINT_KEY_UNSTABLE  = 1U << 10,
+};
+
+struct gpio_event_direct_entry {
+       uint32_t gpio:16;
+       uint32_t code:10;
+       uint32_t dev:6;
+};
+
+/* inputs */
+extern int gpio_event_input_func(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data, int func);
+struct gpio_event_input_info {
+       /* initialize to gpio_event_input_func */
+       struct gpio_event_info info;
+       ktime_t debounce_time;
+       ktime_t poll_time;
+       uint16_t flags;
+       uint16_t type;
+       const struct gpio_event_direct_entry *keymap;
+       size_t keymap_size;
+};
+
+/* outputs */
+extern int gpio_event_output_func(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data, int func);
+extern int gpio_event_output_event(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data,
+                       unsigned int dev, unsigned int type,
+                       unsigned int code, int value);
+struct gpio_event_output_info {
+       /* initialize to gpio_event_output_func and gpio_event_output_event */
+       struct gpio_event_info info;
+       uint16_t flags;
+       uint16_t type;
+       const struct gpio_event_direct_entry *keymap;
+       size_t keymap_size;
+};
+
+
+/* axes */
+
+enum gpio_event_axis_flags {
+       GPIOEAF_PRINT_UNKNOWN_DIRECTION  = 1U << 16,
+       GPIOEAF_PRINT_RAW                = 1U << 17,
+       GPIOEAF_PRINT_EVENT              = 1U << 18,
+};
+
+extern int gpio_event_axis_func(struct gpio_event_input_devs *input_devs,
+                       struct gpio_event_info *info, void **data, int func);
+struct gpio_event_axis_info {
+       /* initialize to gpio_event_axis_func */
+       struct gpio_event_info info;
+       uint8_t  count; /* number of gpios for this axis */
+       uint8_t  dev; /* device index when using multiple input devices */
+       uint8_t  type; /* EV_REL or EV_ABS */
+       uint16_t code;
+       uint16_t decoded_size;
+       uint16_t (*map)(struct gpio_event_axis_info *info, uint16_t in);
+       uint32_t *gpio;
+       uint32_t flags;
+};
+#define gpio_axis_2bit_gray_map gpio_axis_4bit_gray_map
+#define gpio_axis_3bit_gray_map gpio_axis_4bit_gray_map
+uint16_t gpio_axis_4bit_gray_map(
+                       struct gpio_event_axis_info *info, uint16_t in);
+uint16_t gpio_axis_5bit_singletrack_map(
+                       struct gpio_event_axis_info *info, uint16_t in);
+
+#endif
diff --git a/include/linux/initramfs.h b/include/linux/initramfs.h
new file mode 100644 (file)
index 0000000..fc7da63
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ * include/linux/initramfs.h
+ *
+ * Copyright (C) 2015, Google
+ * Rom Lemarchand <romlem@android.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ */
+
+#ifndef _LINUX_INITRAMFS_H
+#define _LINUX_INITRAMFS_H
+
+#include <linux/kconfig.h>
+
+#if IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+
+int __init default_rootfs(void);
+
+#endif
+
+#endif /* _LINUX_INITRAMFS_H */
index 067a6fa675eda0c1f330b6a2b446a71f200bd0cb..3e3893f1b596cca3da3e1e887f82c34eb724832b 100644 (file)
@@ -42,6 +42,7 @@ struct ipv6_devconf {
        __s32           accept_ra_rt_info_max_plen;
 #endif
 #endif
+       __s32           accept_ra_rt_table;
        __s32           proxy_ndp;
        __s32           accept_source_route;
        __s32           accept_ra_from_local;
diff --git a/include/linux/keychord.h b/include/linux/keychord.h
new file mode 100644 (file)
index 0000000..08cf540
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ *  Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef __LINUX_KEYCHORD_H_
+#define __LINUX_KEYCHORD_H_
+
+#include <uapi/linux/keychord.h>
+
+#endif /* __LINUX_KEYCHORD_H_ */
diff --git a/include/linux/keycombo.h b/include/linux/keycombo.h
new file mode 100644 (file)
index 0000000..c6db262
--- /dev/null
@@ -0,0 +1,36 @@
+/*
+ * include/linux/keycombo.h - platform data structure for keycombo driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYCOMBO_H
+#define _LINUX_KEYCOMBO_H
+
+#define KEYCOMBO_NAME "keycombo"
+
+/*
+ * if key_down_fn and key_up_fn are both present, you are guaranteed that
+ * key_down_fn will return before key_up_fn is called, and that key_up_fn
+ * is called iff key_down_fn is called.
+ */
+struct keycombo_platform_data {
+       void (*key_down_fn)(void *);
+       void (*key_up_fn)(void *);
+       void *priv;
+       int key_down_delay; /* Time in ms */
+       int *keys_up;
+       int keys_down[]; /* 0 terminated */
+};
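/*
 * Illustrative sketch (not part of this patch): a board file would terminate
 * keys_down with 0 and provide a callback to run once the combination has
 * been held for key_down_delay milliseconds. KEY_* constants come from
 * <linux/input.h>; the "demo_" names are hypothetical. Statically initializing
 * the flexible keys_down[] array relies on the GCC extension the kernel
 * already uses for such platform data.
 */
static void demo_combo_pressed(void *priv)
{
        pr_info("keycombo: volume-down + power held\n");
}

static struct keycombo_platform_data demo_keycombo_pdata = {
        .key_down_fn    = demo_combo_pressed,
        .key_down_delay = 500,  /* ms the combo must be held */
        .keys_down      = { KEY_VOLUMEDOWN, KEY_POWER, 0 },
};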
+
+#endif /* _LINUX_KEYCOMBO_H */
diff --git a/include/linux/keyreset.h b/include/linux/keyreset.h
new file mode 100644 (file)
index 0000000..2e34afa
--- /dev/null
@@ -0,0 +1,29 @@
+/*
+ * include/linux/keyreset.h - platform data structure for resetkeys driver
+ *
+ * Copyright (C) 2014 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _LINUX_KEYRESET_H
+#define _LINUX_KEYRESET_H
+
+#define KEYRESET_NAME "keyreset"
+
+struct keyreset_platform_data {
+       int (*reset_fn)(void);
+       int key_down_delay;
+       int *keys_up;
+       int keys_down[]; /* 0 terminated */
+};
+
+#endif /* _LINUX_KEYRESET_H */
index c9258124e41757187cdb8b2f83c5901966345902..7161d8e7ee79246ffca220805826f883f26d7ddd 100644 (file)
  *     @inode we wish to get the security context of.
  *     @ctx is a pointer in which to place the allocated security context.
  *     @ctxlen points to the place to put the length of @ctx.
+ *
+ * Security hooks for using the eBPF maps and programs functionalities through
+ * eBPF syscalls.
+ *
+ * @bpf:
+ *     Do an initial check for all bpf syscalls after the attribute is copied
+ *     into the kernel. The actual security module can implement its own
+ *     rules to check the specific cmd it needs.
+ *
+ * @bpf_map:
+ *     Do a check when the kernel generates and returns a file descriptor for
+ *     eBPF maps.
+ *
+ *     @map: bpf map that we want to access
+ *     @mask: the access flags
+ *
+ * @bpf_prog:
+ *     Do a check when the kernel generates and returns a file descriptor for
+ *     eBPF programs.
+ *
+ *     @prog: bpf prog that userspace wants to use.
+ *
+ * @bpf_map_alloc_security:
+ *     Initialize the security field inside bpf map.
+ *
+ * @bpf_map_free_security:
+ *     Clean up the security information stored inside bpf map.
+ *
+ * @bpf_prog_alloc_security:
+ *     Initialize the security field inside bpf program.
+ *
+ * @bpf_prog_free_security:
+ *     Clean up the security information stored inside bpf prog.
+ *
  */
 union security_list_options {
        int (*binder_set_context_mgr)(struct task_struct *mgr);
@@ -1682,6 +1716,17 @@ union security_list_options {
                                struct audit_context *actx);
        void (*audit_rule_free)(void *lsmrule);
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+       int (*bpf)(int cmd, union bpf_attr *attr,
+                                unsigned int size);
+       int (*bpf_map)(struct bpf_map *map, fmode_t fmode);
+       int (*bpf_prog)(struct bpf_prog *prog);
+       int (*bpf_map_alloc_security)(struct bpf_map *map);
+       void (*bpf_map_free_security)(struct bpf_map *map);
+       int (*bpf_prog_alloc_security)(struct bpf_prog_aux *aux);
+       void (*bpf_prog_free_security)(struct bpf_prog_aux *aux);
+#endif /* CONFIG_BPF_SYSCALL */
 };
 
 struct security_hook_heads {
@@ -1901,6 +1946,15 @@ struct security_hook_heads {
        struct list_head audit_rule_match;
        struct list_head audit_rule_free;
 #endif /* CONFIG_AUDIT */
+#ifdef CONFIG_BPF_SYSCALL
+       struct list_head bpf;
+       struct list_head bpf_map;
+       struct list_head bpf_prog;
+       struct list_head bpf_map_alloc_security;
+       struct list_head bpf_map_free_security;
+       struct list_head bpf_prog_alloc_security;
+       struct list_head bpf_prog_free_security;
+#endif /* CONFIG_BPF_SYSCALL */
 } __randomize_layout;
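/*
 * Illustrative sketch (not part of this patch): a security module wires the
 * new bpf hooks up like any other LSM hook. "mylsm" and the handler name are
 * hypothetical; the registration calls are the stock 4.14 LSM interfaces.
 */
static int mylsm_bpf(int cmd, union bpf_attr *attr, unsigned int size)
{
        return 0;       /* allow every bpf(2) command */
}

static struct security_hook_list mylsm_hooks[] = {
        LSM_HOOK_INIT(bpf, mylsm_bpf),
};

static int __init mylsm_init(void)
{
        security_add_hooks(mylsm_hooks, ARRAY_SIZE(mylsm_hooks), "mylsm");
        return 0;
}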
 
 /*
diff --git a/include/linux/memory-state-time.h b/include/linux/memory-state-time.h
new file mode 100644 (file)
index 0000000..d2212b0
--- /dev/null
@@ -0,0 +1,42 @@
+/* include/linux/memory-state-time.h
+ *
+ * Copyright (C) 2016 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/workqueue.h>
+
+#define UPDATE_MEMORY_STATE(BLOCK, VALUE) BLOCK->update_call(BLOCK, VALUE)
+
+struct memory_state_update_block;
+
+typedef void (*memory_state_update_fn_t)(struct memory_state_update_block *ub,
+               int value);
+
+/* This struct is populated when you pass it to a memory_state_register*
+ * function. The update_call function is used for an update and is defined by
+ * the memory_state_update_fn_t typedef.
+ */
+struct memory_state_update_block {
+       memory_state_update_fn_t update_call;
+       int id;
+};
+
+/* Register a frequency struct memory_state_update_block to provide updates to
+ * memory_state_time about frequency changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_frequency_source(void);
+
+/* Register a bandwidth struct memory_state_update_block to provide updates to
+ * memory_state_time about bandwidth changes using its update_call function.
+ */
+struct memory_state_update_block *memory_state_register_bandwidth_source(void);
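/*
 * Illustrative sketch (not part of this patch): a memory/bus driver would
 * register an update block once and then report frequency changes through the
 * UPDATE_MEMORY_STATE() macro. The "demo_" names are hypothetical.
 */
static struct memory_state_update_block *demo_freq_block;

static void demo_memfreq_changed(int new_freq_khz)
{
        if (demo_freq_block)
                UPDATE_MEMORY_STATE(demo_freq_block, new_freq_khz);
}

static int __init demo_memfreq_init(void)
{
        demo_freq_block = memory_state_register_frequency_source();
        return demo_freq_block ? 0 : -ENOMEM;
}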
index f50deada0f5c53790e268a4b23489fb681f9e696..4ea83366021557973bdd45ad274a234026eaba49 100644 (file)
@@ -1223,6 +1223,8 @@ extern void pagefault_out_of_memory(void);
 
 extern void show_free_areas(unsigned int flags, nodemask_t *nodemask);
 
+void shmem_set_file(struct vm_area_struct *vma, struct file *file);
+
 extern bool can_do_mlock(void);
 extern int user_shm_lock(size_t, struct user_struct *);
 extern void user_shm_unlock(size_t, struct user_struct *);
@@ -2094,7 +2096,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
 extern struct vm_area_struct *vma_merge(struct mm_struct *,
        struct vm_area_struct *prev, unsigned long addr, unsigned long end,
        unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
-       struct mempolicy *, struct vm_userfaultfd_ctx);
+       struct mempolicy *, struct vm_userfaultfd_ctx, const char __user *);
 extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
 extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
        unsigned long addr, int new_below);
index c85f11dafd56064c5c77a4e39cd4ad7a74f65755..13673691ff6422a926cf6b5570cecdefe65e7cde 100644 (file)
@@ -312,11 +312,18 @@ struct vm_area_struct {
        /*
         * For areas with an address space and backing store,
         * linkage into the address_space->i_mmap interval tree.
+        *
+        * For private anonymous mappings, a pointer to a null terminated string
+        * in the user process containing the name given to the vma, or NULL
+        * if unnamed.
         */
-       struct {
-               struct rb_node rb;
-               unsigned long rb_subtree_last;
-       } shared;
+       union {
+               struct {
+                       struct rb_node rb;
+                       unsigned long rb_subtree_last;
+               } shared;
+               const char __user *anon_name;
+       };
 
        /*
         * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -663,4 +670,13 @@ typedef struct {
        unsigned long val;
 } swp_entry_t;
 
+/* Return the name for an anonymous mapping or NULL for a file-backed mapping */
+static inline const char __user *vma_get_anon_name(struct vm_area_struct *vma)
+{
+       if (vma->vm_file)
+               return NULL;
+
+       return vma->anon_name;
+}
+
 #endif /* _LINUX_MM_TYPES_H */
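/*
 * Illustrative note (not part of this patch): in the Android patchset the
 * anon_name field is set from userspace via prctl(PR_SET_VMA,
 * PR_SET_VMA_ANON_NAME, ...); those request constants live elsewhere in that
 * patchset and are assumed here, not defined by this header.
 *
 *   prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
 *         (unsigned long)addr, length, (unsigned long)"dalvik-heap");
 *
 * vma_get_anon_name() then lets /proc/<pid>/maps-style readers print a name
 * for otherwise anonymous mappings.
 */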
index 9a43763a68adb3e998ec942300688f99e065489b..227961c5011b523e820b78c11cb70e0f33506504 100644 (file)
@@ -439,6 +439,15 @@ struct mmc_host {
        bool                    cqe_enabled;
        bool                    cqe_on;
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+       struct {
+               struct sdio_cis                 *cis;
+               struct sdio_cccr                *cccr;
+               struct sdio_embedded_func       *funcs;
+               int                             num_funcs;
+       } embedded_sdio_data;
+#endif
+
        unsigned long           private[0] ____cacheline_aligned;
 };
 
@@ -451,6 +460,14 @@ void mmc_free_host(struct mmc_host *);
 int mmc_of_parse(struct mmc_host *host);
 int mmc_of_parse_voltage(struct device_node *np, u32 *mask);
 
+#ifdef CONFIG_MMC_EMBEDDED_SDIO
+extern void mmc_set_embedded_sdio_data(struct mmc_host *host,
+                                      struct sdio_cis *cis,
+                                      struct sdio_cccr *cccr,
+                                      struct sdio_embedded_func *funcs,
+                                      int num_funcs);
+#endif
+
 static inline void *mmc_priv(struct mmc_host *host)
 {
        return (void *)host->private;
index 4a139204c20c0bb8aab7a7759c74e7e5d5cde9a4..6e2d6a135c7e0d75f830af3a429fb5bc02ca07b2 100644 (file)
@@ -26,5 +26,6 @@ typedef unsigned int mmc_pm_flag_t;
 
 #define MMC_PM_KEEP_POWER      (1 << 0)        /* preserve card power during suspend */
 #define MMC_PM_WAKE_SDIO_IRQ   (1 << 1)        /* wake up host system on SDIO IRQ assertion */
+#define MMC_PM_IGNORE_PM_NOTIFY        (1 << 2)        /* ignore mmc pm notify */
 
 #endif /* LINUX_MMC_PM_H */
index 97ca105347a6c5e608297ae1a3562925dc6f6834..f466f381ad25cfb87ed96dd3457b168ee42c1c21 100644 (file)
@@ -22,6 +22,14 @@ struct sdio_func;
 
 typedef void (sdio_irq_handler_t)(struct sdio_func *);
 
+/*
+ * Structure used to hold embedded SDIO device data from platform layer
+ */
+struct sdio_embedded_func {
+       uint8_t f_class;
+       uint32_t f_maxblksize;
+};
+
 /*
  * SDIO function CIS tuple (unknown to the core)
  */
index 45b1f56c6c2f9fb0db10fc912251955af04fc05e..1ff21c19b0b9f9e8c6d6ef182bc1829f6f04a4be 100644 (file)
@@ -68,6 +68,7 @@ struct vfsmount {
        struct dentry *mnt_root;        /* root of the mounted tree */
        struct super_block *mnt_sb;     /* pointer to superblock */
        int mnt_flags;
+       void *data;
 } __randomize_layout;
 
 struct file; /* forward dec */
index a982bb7cd4806f887811b4928e1154f654cd0474..db4f5c09a7da7951792632aec8b201a2fb4cec0a 100644 (file)
@@ -80,8 +80,11 @@ extern struct dentry *user_path_create(int, const char __user *, struct path *,
 extern void done_path_create(struct path *, struct dentry *);
 extern struct dentry *kern_path_locked(const char *, struct path *);
 extern int kern_path_mountpoint(int, const char *, struct path *, unsigned int);
+extern int vfs_path_lookup(struct dentry *, struct vfsmount *,
+               const char *, unsigned int, struct path *);
 
 extern struct dentry *lookup_one_len(const char *, struct dentry *, int);
+extern struct dentry *lookup_one_len2(const char *, struct vfsmount *mnt, struct dentry *, int);
 extern struct dentry *lookup_one_len_unlocked(const char *, struct dentry *, int);
 
 extern int follow_down_one(struct path *);
diff --git a/include/linux/netfilter/xt_qtaguid.h b/include/linux/netfilter/xt_qtaguid.h
new file mode 100644 (file)
index 0000000..1c67155
--- /dev/null
@@ -0,0 +1,14 @@
+#ifndef _XT_QTAGUID_MATCH_H
+#define _XT_QTAGUID_MATCH_H
+
+/* For now we just replace the xt_owner.
+ * FIXME: make iptables aware of qtaguid. */
+#include <linux/netfilter/xt_owner.h>
+
+#define XT_QTAGUID_UID    XT_OWNER_UID
+#define XT_QTAGUID_GID    XT_OWNER_GID
+#define XT_QTAGUID_SOCKET XT_OWNER_SOCKET
+#define xt_qtaguid_match_info xt_owner_match_info
+
+int qtaguid_untag(struct socket *sock, bool kernel);
+#endif /* _XT_QTAGUID_MATCH_H */
diff --git a/include/linux/netfilter/xt_quota2.h b/include/linux/netfilter/xt_quota2.h
new file mode 100644 (file)
index 0000000..eadc690
--- /dev/null
@@ -0,0 +1,25 @@
+#ifndef _XT_QUOTA_H
+#define _XT_QUOTA_H
+
+enum xt_quota_flags {
+       XT_QUOTA_INVERT    = 1 << 0,
+       XT_QUOTA_GROW      = 1 << 1,
+       XT_QUOTA_PACKET    = 1 << 2,
+       XT_QUOTA_NO_CHANGE = 1 << 3,
+       XT_QUOTA_MASK      = 0x0F,
+};
+
+struct xt_quota_counter;
+
+struct xt_quota_mtinfo2 {
+       char name[15];
+       u_int8_t flags;
+
+       /* Comparison-invariant */
+       aligned_u64 quota;
+
+       /* Used internally by the kernel */
+       struct xt_quota_counter *master __attribute__((aligned(8)));
+};
+
+#endif /* _XT_QUOTA_H */
index 013c5418aeecfdbdfb5804c6f69547b2aa57c463..064bac7a464bc60c4d0572f3dfff78136fec9f08 100644 (file)
@@ -66,6 +66,27 @@ extern unsigned long of_get_flat_dt_root(void);
 extern int of_get_flat_dt_size(void);
 extern uint32_t of_get_flat_dt_phandle(unsigned long node);
 
+/*
+ * early_init_dt_scan_chosen - scan the device tree for ramdisk and bootargs
+ *
+ * The boot arguments will be placed into the memory pointed to by @data.
+ * That memory should be COMMAND_LINE_SIZE big and initialized to be a valid
+ * (possibly empty) string.  Logic for what will be in @data after this
+ * function finishes:
+ *
+ * - CONFIG_CMDLINE_FORCE=true
+ *     CONFIG_CMDLINE
+ * - CONFIG_CMDLINE_EXTEND=true, @data is non-empty string
+ *     @data + dt bootargs (even if dt bootargs are empty)
+ * - CONFIG_CMDLINE_EXTEND=true, @data is empty string
+ *     CONFIG_CMDLINE + dt bootargs (even if dt bootargs are empty)
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=non-empty:
+ *     dt bootargs
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is non-empty string
+ *     @data is left unchanged
+ * - CMDLINE_FROM_BOOTLOADER=true, dt bootargs=empty, @data is empty string
+ *     CONFIG_CMDLINE (or "" if that's not defined)
+ */
 extern int early_init_dt_scan_chosen(unsigned long node, const char *uname,
                                     int depth, void *data);
 extern int early_init_dt_scan_memory(unsigned long node, const char *uname,
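/*
 * Illustrative sketch (not part of this patch): the combination rules listed
 * above boil down to roughly the following, assuming CONFIG_CMDLINE is defined
 * and @data arrives holding the bootloader-provided command line. This is a
 * simplification of what early_init_dt_scan_chosen() actually does.
 */
static void demo_combine_cmdline(char *data, const char *dt_bootargs)
{
#if defined(CONFIG_CMDLINE_FORCE)
        /* the kernel config always wins */
        strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#elif defined(CONFIG_CMDLINE_EXTEND)
        /* fall back to CONFIG_CMDLINE if empty, then append DT bootargs */
        if (!data[0])
                strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
        strlcat(data, " ", COMMAND_LINE_SIZE);
        strlcat(data, dt_bootargs, COMMAND_LINE_SIZE);
#else   /* CMDLINE_FROM_BOOTLOADER */
        if (dt_bootargs[0])
                strlcpy(data, dt_bootargs, COMMAND_LINE_SIZE);
        else if (!data[0])
                strlcpy(data, CONFIG_CMDLINE, COMMAND_LINE_SIZE);
#endif
}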
index 8e22f24ded6a3ad0e2accd96fe781be16c1987e4..b7fecdfa6de58d78c235b67ab8ed399729055808 100644 (file)
@@ -1165,6 +1165,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
                                 void __user *buffer, size_t *lenp, loff_t *ppos);
 
+static inline bool perf_paranoid_any(void)
+{
+       return sysctl_perf_event_paranoid > 2;
+}
+
 static inline bool perf_paranoid_tracepoint_raw(void)
 {
        return sysctl_perf_event_paranoid > -1;
index 79e90b3d32888fe6fd5e3abff958a346738d4ab2..e2706432f1c61cdacf7a6e6f9fdd5c89339b8118 100644 (file)
@@ -18,6 +18,7 @@
 #include <linux/leds.h>
 #include <linux/spinlock.h>
 #include <linux/notifier.h>
+#include <linux/types.h>
 
 /*
  * All voltages, currents, charges, energies, time and temperatures in uV,
@@ -149,6 +150,12 @@ enum power_supply_property {
        POWER_SUPPLY_PROP_PRECHARGE_CURRENT,
        POWER_SUPPLY_PROP_CHARGE_TERM_CURRENT,
        POWER_SUPPLY_PROP_CALIBRATE,
+       /* Local extensions */
+       POWER_SUPPLY_PROP_USB_HC,
+       POWER_SUPPLY_PROP_USB_OTG,
+       POWER_SUPPLY_PROP_CHARGE_ENABLED,
+       /* Local extensions of type int64_t */
+       POWER_SUPPLY_PROP_CHARGE_COUNTER_EXT,
        /* Properties of type `const char *' */
        POWER_SUPPLY_PROP_MODEL_NAME,
        POWER_SUPPLY_PROP_MANUFACTURER,
@@ -177,6 +184,7 @@ enum power_supply_notifier_events {
 union power_supply_propval {
        int intval;
        const char *strval;
+       int64_t int64val;
 };
 
 struct device_node;
index 9395f06e837217f05d039eab57c39d71ae0fd9cb..10fe2e28701faad19811d1d26ef05644571aafc3 100644 (file)
@@ -80,6 +80,8 @@ void persistent_ram_free_old(struct persistent_ram_zone *prz);
 ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz,
        char *str, size_t len);
 
+void ramoops_console_write_buf(const char *buf, size_t size);
+
 /*
  * Ramoops platform data
  * @mem_size   memory size for ramoops
index fdf74f27acf1e9801051c5b6c22f1ec5008b9f42..30c35a2ecb46a8deb5e48ec98c4468f9f9bf62de 100644 (file)
@@ -166,6 +166,15 @@ struct task_group;
 /* Task command name length: */
 #define TASK_COMM_LEN                  16
 
+enum task_event {
+       PUT_PREV_TASK   = 0,
+       PICK_NEXT_TASK  = 1,
+       TASK_WAKE       = 2,
+       TASK_MIGRATE    = 3,
+       TASK_UPDATE     = 4,
+       IRQ_UPDATE      = 5,
+};
+
 extern cpumask_var_t                   cpu_isolated_map;
 
 extern void scheduler_tick(void);
@@ -410,6 +419,41 @@ struct sched_entity {
 #endif
 };
 
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX  5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+       /*
+        * 'mark_start' marks the beginning of an event (task waking up, task
+        * starting to execute, task being preempted) within a window
+        *
+        * 'sum' represents how runnable a task has been within current
+        * window. It incorporates both running time and wait time and is
+        * frequency scaled.
+        *
+        * 'sum_history' keeps track of history of 'sum' seen over previous
+        * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+        * ignored.
+        *
+        * 'demand' represents maximum sum seen over previous
+        * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+        * demand for tasks.
+        *
+        * 'curr_window' represents task's contribution to cpu busy time
+        * statistics (rq->curr_runnable_sum) in current window
+        *
+        * 'prev_window' represents task's contribution to cpu busy time
+        * statistics (rq->prev_runnable_sum) in previous window
+        */
+       u64 mark_start;
+       u32 sum, demand;
+       u32 sum_history[RAVG_HIST_SIZE_MAX];
+       u32 curr_window, prev_window;
+       u16 active_windows;
+};
+#endif
+
 struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                   timeout;
@@ -562,6 +606,16 @@ struct task_struct {
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
+#ifdef CONFIG_SCHED_WALT
+       struct ravg ravg;
+       /*
+        * 'init_load_pct' represents the initial task load assigned to children
+        * of this task
+        */
+       u32 init_load_pct;
+       u64 last_sleep_ts;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
 #endif
index d1ad3d825561118c814320b444267a92a87fcd70..0b55834efd46495d7e1b3bf0983592426984138c 100644 (file)
@@ -12,8 +12,6 @@
 #define SCHED_CPUFREQ_DL       (1U << 1)
 #define SCHED_CPUFREQ_IOWAIT   (1U << 2)
 
-#define SCHED_CPUFREQ_RT_DL    (SCHED_CPUFREQ_RT | SCHED_CPUFREQ_DL)
-
 #ifdef CONFIG_CPU_FREQ
 struct update_util_data {
        void (*func)(struct update_util_data *data, u64 time, unsigned int flags);
index d6a18a3839cc281c6c7c61f4e7cb4281a5087c66..e076ff8179b209dbe26e0f09e32e40d446f12c3a 100644 (file)
@@ -21,8 +21,16 @@ enum { sysctl_hung_task_timeout_secs = 0 };
 
 extern unsigned int sysctl_sched_latency;
 extern unsigned int sysctl_sched_min_granularity;
+extern unsigned int sysctl_sched_sync_hint_enable;
+extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
 
 enum sched_tunable_scaling {
        SCHED_TUNABLESCALING_NONE,
index cf257c2e728d622ec7db40f69620d2ed39ddcf7b..e0161c3da0da6c002920aee12dbe33fa7feea76b 100644 (file)
@@ -26,6 +26,7 @@
 #define SD_PREFER_SIBLING      0x1000  /* Prefer to place tasks in a sibling domain */
 #define SD_OVERLAP             0x2000  /* sched_domains of this level overlap */
 #define SD_NUMA                        0x4000  /* cross-node balancing */
+#define SD_SHARE_CAP_STATES    0x8000  /* Domain members share capacity state */
 
 /*
  * Increase resolution of cpu_capacity calculations
@@ -66,12 +67,30 @@ struct sched_domain_attr {
 
 extern int sched_domain_level_max;
 
+struct capacity_state {
+       unsigned long cap;      /* compute capacity */
+       unsigned long power;    /* power consumption at this compute capacity */
+};
+
+struct idle_state {
+       unsigned long power;     /* power consumption in this idle state */
+};
+
+struct sched_group_energy {
+       unsigned int nr_idle_states;    /* number of idle states */
+       struct idle_state *idle_states; /* ptr to idle state array */
+       unsigned int nr_cap_states;     /* number of capacity states */
+       struct capacity_state *cap_states; /* ptr to capacity state array */
+};
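/*
 * Illustrative sketch (not part of this patch): arch topology code is expected
 * to provide per-cluster tables like the following and hand them back through
 * a sched_domain_energy_f callback. Capacity and power numbers here are made
 * up purely for illustration.
 */
static struct idle_state demo_idle_states[] = {
        { .power = 10 },        /* e.g. WFI */
        { .power =  0 },        /* e.g. cluster off */
};

static struct capacity_state demo_cap_states[] = {
        { .cap =  446, .power =  80 },
        { .cap = 1024, .power = 320 },
};

static struct sched_group_energy demo_energy = {
        .nr_idle_states = ARRAY_SIZE(demo_idle_states),
        .idle_states    = demo_idle_states,
        .nr_cap_states  = ARRAY_SIZE(demo_cap_states),
        .cap_states     = demo_cap_states,
};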
+
 struct sched_group;
 
 struct sched_domain_shared {
        atomic_t        ref;
        atomic_t        nr_busy_cpus;
        int             has_idle_cores;
+
+       bool            overutilized;
 };
 
 struct sched_domain {
@@ -173,6 +192,8 @@ bool cpus_share_cache(int this_cpu, int that_cpu);
 
 typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
 typedef int (*sched_domain_flags_f)(void);
+typedef
+const struct sched_group_energy * const(*sched_domain_energy_f)(int cpu);
 
 #define SDTL_OVERLAP   0x01
 
@@ -186,6 +207,7 @@ struct sd_data {
 struct sched_domain_topology_level {
        sched_domain_mask_f mask;
        sched_domain_flags_f sd_flags;
+       sched_domain_energy_f energy;
        int                 flags;
        int                 numa_level;
        struct sd_data      data;
index 10b19a192b2d000121eff7dc91e8f224918d5fec..a3661e93da6f7afe5b24e3827a111e0c67d37f48 100644 (file)
@@ -34,6 +34,7 @@
 struct wake_q_head {
        struct wake_q_node *first;
        struct wake_q_node **lastp;
+       int count;
 };
 
 #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
@@ -45,6 +46,7 @@ static inline void wake_q_init(struct wake_q_head *head)
 {
        head->first = WAKE_Q_TAIL;
        head->lastp = &head->first;
+       head->count = 0;
 }
 
 extern void wake_q_add(struct wake_q_head *head,
index c078f0a94ceca7ddae6630fc6640f3553d5d8a42..9544c9d9d53465a6c6ee7aebf48c287fde4b5c41 100644 (file)
@@ -28,6 +28,11 @@ static inline void inc_syscw(struct task_struct *tsk)
 {
        tsk->ioac.syscw++;
 }
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+       tsk->ioac.syscfs++;
+}
 #else
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
@@ -44,6 +49,10 @@ static inline void inc_syscr(struct task_struct *tsk)
 static inline void inc_syscw(struct task_struct *tsk)
 {
 }
+
+static inline void inc_syscfs(struct task_struct *tsk)
+{
+}
 #endif
 
 #endif /* _LINUX_SCHED_XACCT_H */
diff --git a/include/linux/sched_energy.h b/include/linux/sched_energy.h
new file mode 100644 (file)
index 0000000..83d7178
--- /dev/null
@@ -0,0 +1,41 @@
+#ifndef _LINUX_SCHED_ENERGY_H
+#define _LINUX_SCHED_ENERGY_H
+
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+/*
+ * There doesn't seem to be an NR_CPUS style max number of sched domain
+ * levels so here's an arbitrary constant one for the moment.
+ *
+ * The levels alluded to here correspond to entries in struct
+ * sched_domain_topology_level that are meant to be populated by arch
+ * specific code (topology.c).
+ */
+#define NR_SD_LEVELS 8
+
+#define SD_LEVEL0   0
+#define SD_LEVEL1   1
+#define SD_LEVEL2   2
+#define SD_LEVEL3   3
+#define SD_LEVEL4   4
+#define SD_LEVEL5   5
+#define SD_LEVEL6   6
+#define SD_LEVEL7   7
+
+/*
+ * Convenience macro for iterating through said sd levels.
+ */
+#define for_each_possible_sd_level(level)                  \
+       for (level = 0; level < NR_SD_LEVELS; level++)
+
+extern struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+#ifdef CONFIG_GENERIC_ARCH_TOPOLOGY
+void init_sched_energy_costs(void);
+int sched_energy_installed(int cpu);
+#else
+static inline void init_sched_energy_costs(void) {}
+#endif
+
+#endif
index ce6265960d6c430a90e1ad3c3749d0a438ecaca9..73f1ef625d40c900430778fab29f8bad6cd2e029 100644 (file)
@@ -1730,6 +1730,54 @@ static inline void securityfs_remove(struct dentry *dentry)
 
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+union bpf_attr;
+struct bpf_map;
+struct bpf_prog;
+struct bpf_prog_aux;
+#ifdef CONFIG_SECURITY
+extern int security_bpf(int cmd, union bpf_attr *attr, unsigned int size);
+extern int security_bpf_map(struct bpf_map *map, fmode_t fmode);
+extern int security_bpf_prog(struct bpf_prog *prog);
+extern int security_bpf_map_alloc(struct bpf_map *map);
+extern void security_bpf_map_free(struct bpf_map *map);
+extern int security_bpf_prog_alloc(struct bpf_prog_aux *aux);
+extern void security_bpf_prog_free(struct bpf_prog_aux *aux);
+#else
+static inline int security_bpf(int cmd, union bpf_attr *attr,
+                                            unsigned int size)
+{
+       return 0;
+}
+
+static inline int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+       return 0;
+}
+
+static inline int security_bpf_prog(struct bpf_prog *prog)
+{
+       return 0;
+}
+
+static inline int security_bpf_map_alloc(struct bpf_map *map)
+{
+       return 0;
+}
+
+static inline void security_bpf_map_free(struct bpf_map *map)
+{ }
+
+static inline int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+       return 0;
+}
+
+static inline void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{ }
+#endif /* CONFIG_SECURITY */
+#endif /* CONFIG_BPF_SYSCALL */
+
 #ifdef CONFIG_SECURITY
 
 static inline char *alloc_secdata(void)
index d60b0f5c38d504e52a2c1c2726476def808595c7..31b0b27fb375c5ff233177b52cede85067270de8 100644 (file)
@@ -442,6 +442,7 @@ extern bool pm_get_wakeup_count(unsigned int *count, bool block);
 extern bool pm_save_wakeup_count(unsigned int count);
 extern void pm_wakep_autosleep_enabled(bool set);
 extern void pm_print_active_wakeup_sources(void);
+extern void pm_get_active_wakeup_sources(char *pending_sources, size_t max);
 
 static inline void lock_system_sleep(void)
 {
index 6f6acce064dea535b0a4e6fe84d20a99a3c9a9dd..bb26108ca23c011524567ca3d6456858fc3b642f 100644 (file)
@@ -19,6 +19,8 @@ struct task_io_accounting {
        u64 syscr;
        /* # of write syscalls */
        u64 syscw;
+       /* # of fsync syscalls */
+       u64 syscfs;
 #endif /* CONFIG_TASK_XACCT */
 
 #ifdef CONFIG_TASK_IO_ACCOUNTING
index bb5498bcdd961d2a998dd302b03074adc5ce29ef..733ab62ae14130dc62c8d62e48e45002aa149781 100644 (file)
@@ -97,6 +97,7 @@ static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
        dst->wchar += src->wchar;
        dst->syscr += src->syscr;
        dst->syscw += src->syscw;
+       dst->syscfs += src->syscfs;
 }
 #else
 static inline void task_chr_io_accounting_add(struct task_io_accounting *dst,
diff --git a/include/linux/usb/class-dual-role.h b/include/linux/usb/class-dual-role.h
new file mode 100644 (file)
index 0000000..c6df223
--- /dev/null
@@ -0,0 +1,129 @@
+#ifndef __LINUX_CLASS_DUAL_ROLE_H__
+#define __LINUX_CLASS_DUAL_ROLE_H__
+
+#include <linux/workqueue.h>
+#include <linux/errno.h>
+#include <linux/types.h>
+
+struct device;
+
+enum dual_role_supported_modes {
+       DUAL_ROLE_SUPPORTED_MODES_DFP_AND_UFP = 0,
+       DUAL_ROLE_SUPPORTED_MODES_DFP,
+       DUAL_ROLE_SUPPORTED_MODES_UFP,
+/*The following should be the last element*/
+       DUAL_ROLE_PROP_SUPPORTED_MODES_TOTAL,
+};
+
+enum {
+       DUAL_ROLE_PROP_MODE_UFP = 0,
+       DUAL_ROLE_PROP_MODE_DFP,
+       DUAL_ROLE_PROP_MODE_NONE,
+/*The following should be the last element*/
+       DUAL_ROLE_PROP_MODE_TOTAL,
+};
+
+enum {
+       DUAL_ROLE_PROP_PR_SRC = 0,
+       DUAL_ROLE_PROP_PR_SNK,
+       DUAL_ROLE_PROP_PR_NONE,
+/*The following should be the last element*/
+       DUAL_ROLE_PROP_PR_TOTAL,
+
+};
+
+enum {
+       DUAL_ROLE_PROP_DR_HOST = 0,
+       DUAL_ROLE_PROP_DR_DEVICE,
+       DUAL_ROLE_PROP_DR_NONE,
+/*The following should be the last element*/
+       DUAL_ROLE_PROP_DR_TOTAL,
+};
+
+enum {
+       DUAL_ROLE_PROP_VCONN_SUPPLY_NO = 0,
+       DUAL_ROLE_PROP_VCONN_SUPPLY_YES,
+/*The following should be the last element*/
+       DUAL_ROLE_PROP_VCONN_SUPPLY_TOTAL,
+};
+
+enum dual_role_property {
+       DUAL_ROLE_PROP_SUPPORTED_MODES = 0,
+       DUAL_ROLE_PROP_MODE,
+       DUAL_ROLE_PROP_PR,
+       DUAL_ROLE_PROP_DR,
+       DUAL_ROLE_PROP_VCONN_SUPPLY,
+};
+
+struct dual_role_phy_instance;
+
+/* Description of typec port */
+struct dual_role_phy_desc {
+       /* /sys/class/dual_role_usb/<name>/ */
+       const char *name;
+       enum dual_role_supported_modes supported_modes;
+       enum dual_role_property *properties;
+       size_t num_properties;
+
+       /* Callback for "cat /sys/class/dual_role_usb/<name>/<property>" */
+       int (*get_property)(struct dual_role_phy_instance *dual_role,
+                            enum dual_role_property prop,
+                            unsigned int *val);
+       /* Callback for "echo <value> >
+        *                      /sys/class/dual_role_usb/<name>/<property>" */
+       int (*set_property)(struct dual_role_phy_instance *dual_role,
+                            enum dual_role_property prop,
+                            const unsigned int *val);
+       /* Decides whether userspace can change a specific property */
+       int (*property_is_writeable)(struct dual_role_phy_instance *dual_role,
+                                     enum dual_role_property prop);
+};
+
+struct dual_role_phy_instance {
+       const struct dual_role_phy_desc *desc;
+
+       /* Driver private data */
+       void *drv_data;
+
+       struct device dev;
+       struct work_struct changed_work;
+};
+
+#if IS_ENABLED(CONFIG_DUAL_ROLE_USB_INTF)
+extern void dual_role_instance_changed(struct dual_role_phy_instance
+                                      *dual_role);
+extern struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+                                const struct dual_role_phy_desc *desc);
+extern void devm_dual_role_instance_unregister(struct device *dev,
+                                              struct dual_role_phy_instance
+                                              *dual_role);
+extern int dual_role_get_property(struct dual_role_phy_instance *dual_role,
+                                 enum dual_role_property prop,
+                                 unsigned int *val);
+extern int dual_role_set_property(struct dual_role_phy_instance *dual_role,
+                                 enum dual_role_property prop,
+                                 const unsigned int *val);
+extern int dual_role_property_is_writeable(struct dual_role_phy_instance
+                                          *dual_role,
+                                          enum dual_role_property prop);
+extern void *dual_role_get_drvdata(struct dual_role_phy_instance *dual_role);
+#else /* CONFIG_DUAL_ROLE_USB_INTF */
+static inline void dual_role_instance_changed(struct dual_role_phy_instance
+                                      *dual_role){}
+static inline struct dual_role_phy_instance *__must_check
+devm_dual_role_instance_register(struct device *parent,
+                                const struct dual_role_phy_desc *desc)
+{
+       return ERR_PTR(-ENOSYS);
+}
+static inline void devm_dual_role_instance_unregister(struct device *dev,
+                                              struct dual_role_phy_instance
+                                              *dual_role){}
+static inline void *dual_role_get_drvdata(struct dual_role_phy_instance
+               *dual_role)
+{
+       return ERR_PTR(-ENOSYS);
+}
+#endif /* CONFIG_DUAL_ROLE_USB_INTF */
+#endif /* __LINUX_CLASS_DUAL_ROLE_H__ */
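/*
 * Illustrative sketch (not part of this patch): a Type-C port driver would
 * describe its port and register an instance roughly like this. All "demo_"
 * names are hypothetical and error handling is trimmed.
 */
static enum dual_role_property demo_props[] = {
        DUAL_ROLE_PROP_MODE,
        DUAL_ROLE_PROP_PR,
        DUAL_ROLE_PROP_DR,
};

static int demo_get_property(struct dual_role_phy_instance *dual_role,
                             enum dual_role_property prop, unsigned int *val)
{
        *val = DUAL_ROLE_PROP_MODE_NONE;        /* minimal stub: nothing attached */
        return 0;
}

static const struct dual_role_phy_desc demo_desc = {
        .name            = "otg_default",
        .supported_modes = DUAL_ROLE_SUPPORTED_MODES_DFP_AND_UFP,
        .properties      = demo_props,
        .num_properties  = ARRAY_SIZE(demo_props),
        .get_property    = demo_get_property,
};

static int demo_probe(struct device *dev)
{
        struct dual_role_phy_instance *inst;

        inst = devm_dual_role_instance_register(dev, &demo_desc);
        return PTR_ERR_OR_ZERO(inst);
}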
index f665d2ceac20587a285428202a7ba17de23b5734..09da0c80497d7823b5b62e40c489b358565b5f60 100644 (file)
@@ -583,6 +583,7 @@ struct usb_function_instance {
        struct config_group group;
        struct list_head cfs_list;
        struct usb_function_driver *fd;
+       struct usb_function *f;
        int (*set_inst_name)(struct usb_function_instance *inst,
                              const char *name);
        void (*free_func_inst)(struct usb_function_instance *inst);
diff --git a/include/linux/usb/f_accessory.h b/include/linux/usb/f_accessory.h
new file mode 100644 (file)
index 0000000..ebe3c4d
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef __LINUX_USB_F_ACCESSORY_H
+#define __LINUX_USB_F_ACCESSORY_H
+
+#include <uapi/linux/usb/f_accessory.h>
+
+#endif /* __LINUX_USB_F_ACCESSORY_H */
diff --git a/include/linux/wakeup_reason.h b/include/linux/wakeup_reason.h
new file mode 100644 (file)
index 0000000..d84d8c3
--- /dev/null
@@ -0,0 +1,32 @@
+/*
+ * include/linux/wakeup_reason.h
+ *
+ * Logs the reason which caused the kernel to resume
+ * from the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef _LINUX_WAKEUP_REASON_H
+#define _LINUX_WAKEUP_REASON_H
+
+#define MAX_SUSPEND_ABORT_LEN 256
+
+void log_wakeup_reason(int irq);
+int check_wakeup_reason(int irq);
+
+#ifdef CONFIG_SUSPEND
+void log_suspend_abort_reason(const char *fmt, ...);
+#else
+static inline void log_suspend_abort_reason(const char *fmt, ...) { }
+#endif
+
+#endif /* _LINUX_WAKEUP_REASON_H */
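/*
 * Illustrative sketch (not part of this patch): a driver that aborts suspend
 * can record why, so userspace can read the reason back after the failed
 * attempt. demo_transfer_in_progress() is a hypothetical helper standing in
 * for whatever busy check the driver already has.
 */
static int demo_suspend(struct device *dev)
{
        if (demo_transfer_in_progress(dev)) {
                log_suspend_abort_reason("demo-uart: transfer in progress");
                return -EBUSY;
        }
        return 0;
}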
index 35f5aabd432ff8212b99094e13bb65b64830782f..bcd9b88bc4e8c58c4f48e3acfa6a410ca34c5716 100644 (file)
@@ -261,6 +261,8 @@ static inline bool ipv6_is_mld(struct sk_buff *skb, int nexthdr, int offset)
 void addrconf_prefix_rcv(struct net_device *dev,
                         u8 *opt, int len, bool sllao);
 
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table);
+
 /*
  *     anycast prototypes (anycast.c)
  */
index 0a13574134b8b34ac1ddedd08615a8ff0f439181..44d5d2e8204cf70cfc2c9796d2f1adb4e89eec11 100644 (file)
@@ -275,6 +275,7 @@ extern int sysctl_tcp_autocorking;
 extern int sysctl_tcp_invalid_ratelimit;
 extern int sysctl_tcp_pacing_ss_ratio;
 extern int sysctl_tcp_pacing_ca_ratio;
+extern int sysctl_tcp_default_init_rwnd;
 
 extern atomic_long_t tcp_memory_allocated;
 extern struct percpu_counter tcp_sockets_allocated;
diff --git a/include/trace/events/android_fs.h b/include/trace/events/android_fs.h
new file mode 100644 (file)
index 0000000..4950953
--- /dev/null
@@ -0,0 +1,65 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM android_fs
+
+#if !defined(_TRACE_ANDROID_FS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_H
+
+#include <linux/tracepoint.h>
+#include <trace/events/android_fs_template.h>
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_dataread_start,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+                pid_t pid, char *pathname, char *command),
+       TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_dataread_end,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+       TP_ARGS(inode, offset, bytes));
+
+DEFINE_EVENT(android_fs_data_start_template, android_fs_datawrite_start,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+                pid_t pid, char *pathname, char *command),
+       TP_ARGS(inode, offset, bytes, pid, pathname, command));
+
+DEFINE_EVENT(android_fs_data_end_template, android_fs_datawrite_end,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+            TP_ARGS(inode, offset, bytes));
+
+#endif /* _TRACE_ANDROID_FS_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
+#ifndef ANDROID_FSTRACE_GET_PATHNAME
+#define ANDROID_FSTRACE_GET_PATHNAME
+
+/* Sizes an on-stack array, so be careful if sizing this up! */
+#define MAX_TRACE_PATHBUF_LEN  256
+
+static inline char *
+android_fstrace_get_pathname(char *buf, int buflen, struct inode *inode)
+{
+       char *path;
+       struct dentry *d;
+
+       /*
+        * d_obtain_alias() will either iput() if it locates an existing
+        * dentry or transfer the reference to the new dentry created.
+        * So get an extra reference here.
+        */
+       ihold(inode);
+       d = d_obtain_alias(inode);
+       if (likely(!IS_ERR(d))) {
+               path = dentry_path_raw(d, buf, buflen);
+               if (unlikely(IS_ERR(path))) {
+                       strcpy(buf, "ERROR");
+                       path = buf;
+               }
+               dput(d);
+       } else {
+               strcpy(buf, "ERROR");
+               path = buf;
+       }
+       return path;
+}
+#endif
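/*
 * Illustrative sketch (not part of this patch): a filesystem read path would
 * typically guard the (relatively expensive) pathname lookup with the
 * tracepoint's *_enabled() check and pair the start/end events, roughly as the
 * Android ext4/f2fs changes do. The "demo_" names are hypothetical.
 */
static void demo_trace_read(struct inode *inode, loff_t pos, int count)
{
        if (trace_android_fs_dataread_start_enabled()) {
                char buf[MAX_TRACE_PATHBUF_LEN], *path;

                path = android_fstrace_get_pathname(buf, sizeof(buf), inode);
                trace_android_fs_dataread_start(inode, pos, count,
                                                current->pid, path,
                                                current->comm);
        }
        /* ... perform the actual read ... */
        trace_android_fs_dataread_end(inode, pos, count);
}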
diff --git a/include/trace/events/android_fs_template.h b/include/trace/events/android_fs_template.h
new file mode 100644 (file)
index 0000000..b23d17b
--- /dev/null
@@ -0,0 +1,64 @@
+#if !defined(_TRACE_ANDROID_FS_TEMPLATE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ANDROID_FS_TEMPLATE_H
+
+#include <linux/tracepoint.h>
+
+DECLARE_EVENT_CLASS(android_fs_data_start_template,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes,
+                pid_t pid, char *pathname, char *command),
+       TP_ARGS(inode, offset, bytes, pid, pathname, command),
+       TP_STRUCT__entry(
+               __string(pathbuf, pathname);
+               __field(loff_t, offset);
+               __field(int,    bytes);
+               __field(loff_t, i_size);
+               __string(cmdline, command);
+               __field(pid_t,  pid);
+               __field(ino_t,  ino);
+       ),
+       TP_fast_assign(
+               {
+                       /*
+                        * Replace the spaces in filenames and cmdlines
+                        * because this screws up the tooling that parses
+                        * the traces.
+                        */
+                       __assign_str(pathbuf, pathname);
+                       (void)strreplace(__get_str(pathbuf), ' ', '_');
+                       __entry->offset         = offset;
+                       __entry->bytes          = bytes;
+                       __entry->i_size         = i_size_read(inode);
+                       __assign_str(cmdline, command);
+                       (void)strreplace(__get_str(cmdline), ' ', '_');
+                       __entry->pid            = pid;
+                       __entry->ino            = inode->i_ino;
+               }
+       ),
+       TP_printk("entry_name %s, offset %llu, bytes %d, cmdline %s,"
+                 " pid %d, i_size %llu, ino %lu",
+                 __get_str(pathbuf), __entry->offset, __entry->bytes,
+                 __get_str(cmdline), __entry->pid, __entry->i_size,
+                 (unsigned long) __entry->ino)
+);
+
+DECLARE_EVENT_CLASS(android_fs_data_end_template,
+       TP_PROTO(struct inode *inode, loff_t offset, int bytes),
+       TP_ARGS(inode, offset, bytes),
+       TP_STRUCT__entry(
+               __field(ino_t,  ino);
+               __field(loff_t, offset);
+               __field(int,    bytes);
+       ),
+       TP_fast_assign(
+               {
+                       __entry->ino            = inode->i_ino;
+                       __entry->offset         = offset;
+                       __entry->bytes          = bytes;
+               }
+       ),
+       TP_printk("ino %lu, offset %llu, bytes %d",
+                 (unsigned long) __entry->ino,
+                 __entry->offset, __entry->bytes)
+);
+
+#endif /* _TRACE_ANDROID_FS_TEMPLATE_H */
index 7ab40491485bc0b9604aa07fae9fa4a3575509e1..8f8dd42fa57bd39c0a07cb15ece2249e8bb8928e 100644 (file)
@@ -137,6 +137,18 @@ TRACE_DEFINE_ENUM(CP_TRIMMED);
                { CP_UMOUNT,    "Umount" },                             \
                { CP_TRIMMED,   "Trimmed" })
 
+#define show_fsync_cpreason(type)                                      \
+       __print_symbolic(type,                                          \
+               { CP_NO_NEEDED,         "no needed" },                  \
+               { CP_NON_REGULAR,       "non regular" },                \
+               { CP_HARDLINK,          "hardlink" },                   \
+               { CP_SB_NEED_CP,        "sb needs cp" },                \
+               { CP_WRONG_PINO,        "wrong pino" },                 \
+               { CP_NO_SPC_ROLL,       "no space roll forward" },      \
+               { CP_NODE_NEED_CP,      "node needs cp" },              \
+               { CP_FASTBOOT_MODE,     "fastboot mode" },              \
+               { CP_SPEC_LOG_NUM,      "log type is 2" })
+
 struct victim_sel_policy;
 struct f2fs_map_blocks;
 
@@ -211,14 +223,14 @@ DEFINE_EVENT(f2fs__inode, f2fs_sync_file_enter,
 
 TRACE_EVENT(f2fs_sync_file_exit,
 
-       TP_PROTO(struct inode *inode, int need_cp, int datasync, int ret),
+       TP_PROTO(struct inode *inode, int cp_reason, int datasync, int ret),
 
-       TP_ARGS(inode, need_cp, datasync, ret),
+       TP_ARGS(inode, cp_reason, datasync, ret),
 
        TP_STRUCT__entry(
                __field(dev_t,  dev)
                __field(ino_t,  ino)
-               __field(int,    need_cp)
+               __field(int,    cp_reason)
                __field(int,    datasync)
                __field(int,    ret)
        ),
@@ -226,15 +238,15 @@ TRACE_EVENT(f2fs_sync_file_exit,
        TP_fast_assign(
                __entry->dev            = inode->i_sb->s_dev;
                __entry->ino            = inode->i_ino;
-               __entry->need_cp        = need_cp;
+               __entry->cp_reason      = cp_reason;
                __entry->datasync       = datasync;
                __entry->ret            = ret;
        ),
 
-       TP_printk("dev = (%d,%d), ino = %lu, checkpoint is %s, "
+       TP_printk("dev = (%d,%d), ino = %lu, cp_reason: %s, "
                "datasync = %d, ret = %d",
                show_dev_ino(__entry),
-               __entry->need_cp ? "needed" : "not needed",
+               show_fsync_cpreason(__entry->cp_reason),
                __entry->datasync,
                __entry->ret)
 );
@@ -729,6 +741,91 @@ TRACE_EVENT(f2fs_get_victim,
                __entry->free)
 );
 
+TRACE_EVENT(f2fs_lookup_start,
+
+       TP_PROTO(struct inode *dir, struct dentry *dentry, unsigned int flags),
+
+       TP_ARGS(dir, dentry, flags),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(const char *,   name)
+               __field(unsigned int, flags)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = dir->i_sb->s_dev;
+               __entry->ino    = dir->i_ino;
+               __entry->name   = dentry->d_name.name;
+               __entry->flags  = flags;
+       ),
+
+       TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u",
+               show_dev_ino(__entry),
+               __entry->name,
+               __entry->flags)
+);
+
+TRACE_EVENT(f2fs_lookup_end,
+
+       TP_PROTO(struct inode *dir, struct dentry *dentry, nid_t ino,
+               int err),
+
+       TP_ARGS(dir, dentry, ino, err),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(const char *,   name)
+               __field(nid_t,  cino)
+               __field(int,    err)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = dir->i_sb->s_dev;
+               __entry->ino    = dir->i_ino;
+               __entry->name   = dentry->d_name.name;
+               __entry->cino   = ino;
+               __entry->err    = err;
+       ),
+
+       TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d",
+               show_dev_ino(__entry),
+               __entry->name,
+               __entry->cino,
+               __entry->err)
+);
+
+TRACE_EVENT(f2fs_readdir,
+
+       TP_PROTO(struct inode *dir, loff_t start_pos, loff_t end_pos, int err),
+
+       TP_ARGS(dir, start_pos, end_pos, err),
+
+       TP_STRUCT__entry(
+               __field(dev_t,  dev)
+               __field(ino_t,  ino)
+               __field(loff_t, start)
+               __field(loff_t, end)
+               __field(int,    err)
+       ),
+
+       TP_fast_assign(
+               __entry->dev    = dir->i_sb->s_dev;
+               __entry->ino    = dir->i_ino;
+               __entry->start  = start_pos;
+               __entry->end    = end_pos;
+               __entry->err    = err;
+       ),
+
+       TP_printk("dev = (%d,%d), ino = %lu, start_pos:%llu, end_pos:%llu, err:%d",
+               show_dev_ino(__entry),
+               __entry->start,
+               __entry->end,
+               __entry->err)
+);
+
 TRACE_EVENT(f2fs_fallocate,
 
        TP_PROTO(struct inode *inode, int mode,
@@ -1287,6 +1384,13 @@ DEFINE_EVENT(f2fs_discard, f2fs_issue_discard,
        TP_ARGS(dev, blkstart, blklen)
 );
 
+DEFINE_EVENT(f2fs_discard, f2fs_remove_discard,
+
+       TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen),
+
+       TP_ARGS(dev, blkstart, blklen)
+);
+
 TRACE_EVENT(f2fs_issue_reset_zone,
 
        TP_PROTO(struct block_device *dev, block_t blkstart),
diff --git a/include/trace/events/gpu.h b/include/trace/events/gpu.h
new file mode 100644 (file)
index 0000000..7e15cdf
--- /dev/null
@@ -0,0 +1,143 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM gpu
+
+#if !defined(_TRACE_GPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_GPU_H
+
+#include <linux/tracepoint.h>
+#include <linux/time.h>
+
+#define show_secs_from_ns(ns) \
+       ({ \
+               u64 t = ns + (NSEC_PER_USEC / 2); \
+               do_div(t, NSEC_PER_SEC); \
+               t; \
+       })
+
+#define show_usecs_from_ns(ns) \
+       ({ \
+       u64 t = ns + (NSEC_PER_USEC / 2); \
+               u32 rem; \
+               do_div(t, NSEC_PER_USEC); \
+               rem = do_div(t, USEC_PER_SEC); \
+       })
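A quick worked example of the two helpers above (editor's illustration, not part of the patch): with timestamp = 1234567890 ns, the half-microsecond rounding gives 1234568390; show_secs_from_ns() divides by NSEC_PER_SEC and yields 1, while show_usecs_from_ns() first divides by NSEC_PER_USEC (giving 1234568) and then takes the remainder of a division by USEC_PER_SEC, yielding 234568, so the "%llu.%06lu" format used below prints the pair as 1.234568.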
+
+/*
+ * The gpu_sched_switch event indicates that a switch from one GPU context to
+ * another occurred on one of the GPU hardware blocks.
+ *
+ * The gpu_name argument identifies the GPU hardware block.  Each independently
+ * scheduled GPU hardware block should have a different name.  This may be used
+ * in different ways for different GPUs.  For example, if a GPU includes
+ * multiple processing cores it may use names "GPU 0", "GPU 1", etc.  If a GPU
+ * includes a separately scheduled 2D and 3D hardware block, it might use the
+ * names "2D" and "3D".
+ *
+ * The timestamp argument is the timestamp at which the switch occurred on the
+ * GPU. These timestamps are in units of nanoseconds and must use
+ * approximately the same time as sched_clock, though they need not come from
+ * any CPU clock. The timestamps for a single hardware block must be
+ * monotonically nondecreasing.  This means that if a variable compensation
+ * offset is used to translate from some other clock to the sched_clock, then
+ * care must be taken when increasing that offset, and doing so may result in
+ * multiple events with the same timestamp.
+ *
+ * The next_ctx_id argument identifies the next context that was running on
+ * the GPU hardware block.  A value of 0 indicates that the hardware block
+ * will be idle.
+ *
+ * The next_prio argument indicates the priority of the next context at the
+ * time of the event.  The exact numeric values may mean different things for
+ * different GPUs, but they should follow the rule that lower values indicate a
+ * higher priority.
+ *
+ * The next_job_id argument identifies the batch of work that the GPU will be
+ * working on.  This should correspond to a job_id that was previously traced
+ * as a gpu_job_enqueue event when the batch of work was created.
+ */
+TRACE_EVENT(gpu_sched_switch,
+
+       TP_PROTO(const char *gpu_name, u64 timestamp,
+               u32 next_ctx_id, s32 next_prio, u32 next_job_id),
+
+       TP_ARGS(gpu_name, timestamp, next_ctx_id, next_prio, next_job_id),
+
+       TP_STRUCT__entry(
+               __string(       gpu_name,       gpu_name        )
+               __field(        u64,            timestamp       )
+               __field(        u32,            next_ctx_id     )
+               __field(        s32,            next_prio       )
+               __field(        u32,            next_job_id     )
+       ),
+
+       TP_fast_assign(
+               __assign_str(gpu_name, gpu_name);
+               __entry->timestamp = timestamp;
+               __entry->next_ctx_id = next_ctx_id;
+               __entry->next_prio = next_prio;
+               __entry->next_job_id = next_job_id;
+       ),
+
+       TP_printk("gpu_name=%s ts=%llu.%06lu next_ctx_id=%lu next_prio=%ld "
+               "next_job_id=%lu",
+               __get_str(gpu_name),
+               (unsigned long long)show_secs_from_ns(__entry->timestamp),
+               (unsigned long)show_usecs_from_ns(__entry->timestamp),
+               (unsigned long)__entry->next_ctx_id,
+               (long)__entry->next_prio,
+               (unsigned long)__entry->next_job_id)
+);
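The comment above specifies the event's contract but no call site ships with it. As a rough, hypothetical sketch in kernel C (every name prefixed with example_ is an assumption; only trace_gpu_sched_switch() comes from this header), a driver scheduling a single 3D block might emit the event like this:

	/* Hypothetical driver-side sketch; CREATE_TRACE_POINTS must be defined
	 * in exactly one compilation unit before including the trace header. */
	#define CREATE_TRACE_POINTS
	#include <trace/events/gpu.h>

	struct example_ctx { u32 id; s32 prio; u32 next_job_id; };
	struct example_job { u32 id; };

	static void example_gpu_context_switch(struct example_ctx *next,
					       struct example_job *job, u64 gpu_ts_ns)
	{
		/* gpu_ts_ns must track sched_clock(); next == NULL means the block goes idle */
		trace_gpu_sched_switch("3D", gpu_ts_ns,
				       next ? next->id : 0,
				       next ? next->prio : 0,
				       job ? job->id : 0);
	}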
+
+/*
+ * The gpu_job_enqueue event indicates that a batch of work has been queued up
+ * to be processed by the GPU.  This event is not intended to indicate that
+ * the batch of work has been submitted to the GPU hardware, but rather that
+ * it has been submitted to the GPU kernel driver.
+ *
+ * This event should be traced on the thread that initiated the work being
+ * queued.  For example, if a batch of work is submitted to the kernel by a
+ * userland thread, the event should be traced on that thread.
+ *
+ * The ctx_id field identifies the GPU context in which the batch of work
+ * being queued is to be run.
+ *
+ * The job_id field identifies the batch of work being queued within the given
+ * GPU context.  The first batch of work submitted for a given GPU context
+ * should have a job_id of 0, and each subsequent batch of work should
+ * increment the job_id by 1.
+ *
+ * The type field identifies the type of the job being enqueued.  The job
+ * types may be different for different GPU hardware.  For example, a GPU may
+ * differentiate between "2D", "3D", and "compute" jobs.
+ */
+TRACE_EVENT(gpu_job_enqueue,
+
+       TP_PROTO(u32 ctx_id, u32 job_id, const char *type),
+
+       TP_ARGS(ctx_id, job_id, type),
+
+       TP_STRUCT__entry(
+               __field(        u32,            ctx_id          )
+               __field(        u32,            job_id          )
+               __string(       type,           type            )
+       ),
+
+       TP_fast_assign(
+               __entry->ctx_id = ctx_id;
+               __entry->job_id = job_id;
+               __assign_str(type, type);
+       ),
+
+       TP_printk("ctx_id=%lu job_id=%lu type=%s",
+               (unsigned long)__entry->ctx_id,
+               (unsigned long)__entry->job_id,
+               __get_str(type))
+);
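Continuing the hypothetical sketch above, the enqueue side would be traced on the submitting thread, with job_id starting at 0 and incrementing by 1 per context as the comment requires (the next_job_id counter is an assumed driver field, not part of this header):

	static void example_gpu_submit(struct example_ctx *ctx)
	{
		u32 job_id = ctx->next_job_id++;	/* assumed per-context counter */

		trace_gpu_job_enqueue(ctx->id, job_id, "3D");
	}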
+
+#undef show_secs_from_ns
+#undef show_usecs_from_ns
+
+#endif /* _TRACE_GPU_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
index 9c886739246ae4e91d8369f4c690bccd56f27912..f1a300c8ef8517775f9ad286ee1440f1cc3854f8 100644 (file)
@@ -58,7 +58,7 @@ TRACE_EVENT(net_dev_start_xmit,
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),
 
-       TP_printk("dev=%s queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
+       TP_printk("dev=%s queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d len=%u data_len=%u network_offset=%d transport_offset_valid=%d transport_offset=%d tx_flags=%d gso_size=%d gso_segs=%d gso_type=%#x",
                  __get_str(name), __entry->queue_mapping, __entry->skbaddr,
                  __entry->vlan_tagged, __entry->vlan_proto, __entry->vlan_tci,
                  __entry->protocol, __entry->ip_summed, __entry->len,
@@ -91,7 +91,7 @@ TRACE_EVENT(net_dev_xmit,
                __assign_str(name, dev->name);
        ),
 
-       TP_printk("dev=%s skbaddr=%p len=%u rc=%d",
+       TP_printk("dev=%s skbaddr=%pK len=%u rc=%d",
                __get_str(name), __entry->skbaddr, __entry->len, __entry->rc)
 );
 
@@ -113,7 +113,7 @@ DECLARE_EVENT_CLASS(net_dev_template,
                __assign_str(name, skb->dev->name);
        ),
 
-       TP_printk("dev=%s skbaddr=%p len=%u",
+       TP_printk("dev=%s skbaddr=%pK len=%u",
                __get_str(name), __entry->skbaddr, __entry->len)
 )
 
@@ -192,7 +192,7 @@ DECLARE_EVENT_CLASS(net_dev_rx_verbose_template,
                __entry->gso_type = skb_shinfo(skb)->gso_type;
        ),
 
-       TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%p vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
+       TP_printk("dev=%s napi_id=%#x queue_mapping=%u skbaddr=%pK vlan_tagged=%d vlan_proto=0x%04x vlan_tci=0x%04x protocol=0x%04x ip_summed=%d hash=0x%08x l4_hash=%d len=%u data_len=%u truesize=%u mac_header_valid=%d mac_header=%d nr_frags=%d gso_size=%d gso_type=%#x",
                  __get_str(name), __entry->napi_id, __entry->queue_mapping,
                  __entry->skbaddr, __entry->vlan_tagged, __entry->vlan_proto,
                  __entry->vlan_tci, __entry->protocol, __entry->ip_summed,
index 908977d69783b8e4ba1c1f775a5c51bc9f0befd7..3eac1f9f66b929293bbc2ac839b95da73cc21460 100644 (file)
@@ -148,6 +148,31 @@ DEFINE_EVENT(cpu, cpu_frequency,
        TP_ARGS(frequency, cpu_id)
 );
 
+TRACE_EVENT(cpu_frequency_limits,
+
+       TP_PROTO(unsigned int max_freq, unsigned int min_freq,
+               unsigned int cpu_id),
+
+       TP_ARGS(max_freq, min_freq, cpu_id),
+
+       TP_STRUCT__entry(
+               __field(        u32,            min_freq        )
+               __field(        u32,            max_freq        )
+               __field(        u32,            cpu_id          )
+       ),
+
+       TP_fast_assign(
+               __entry->min_freq = min_freq;
+               __entry->max_freq = max_freq;
+               __entry->cpu_id = cpu_id;
+       ),
+
+       TP_printk("min=%lu max=%lu cpu_id=%lu",
+                 (unsigned long)__entry->min_freq,
+                 (unsigned long)__entry->max_freq,
+                 (unsigned long)__entry->cpu_id)
+);
+
 TRACE_EVENT(device_pm_callback_start,
 
        TP_PROTO(struct device *dev, const char *pm_ops, int event),
@@ -301,6 +326,25 @@ DEFINE_EVENT(clock, clock_set_rate,
        TP_ARGS(name, state, cpu_id)
 );
 
+TRACE_EVENT(clock_set_parent,
+
+       TP_PROTO(const char *name, const char *parent_name),
+
+       TP_ARGS(name, parent_name),
+
+       TP_STRUCT__entry(
+               __string(       name,           name            )
+               __string(       parent_name,    parent_name     )
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, name);
+               __assign_str(parent_name, parent_name);
+       ),
+
+       TP_printk("%s parent=%s", __get_str(name), __get_str(parent_name))
+);
+
 /*
  * The power domain events are used for power domains transitions
  */
diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h
new file mode 100644 (file)
index 0000000..f5024c5
--- /dev/null
@@ -0,0 +1,70 @@
+#ifdef CONFIG_PREEMPTIRQ_EVENTS
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM preemptirq
+
+#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PREEMPTIRQ_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+#include <linux/string.h>
+#include <asm/sections.h>
+
+DECLARE_EVENT_CLASS(preemptirq_template,
+
+       TP_PROTO(unsigned long ip, unsigned long parent_ip),
+
+       TP_ARGS(ip, parent_ip),
+
+       TP_STRUCT__entry(
+               __field(u32, caller_offs)
+               __field(u32, parent_offs)
+       ),
+
+       TP_fast_assign(
+               __entry->caller_offs = (u32)(ip - (unsigned long)_stext);
+               __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext);
+       ),
+
+       TP_printk("caller=%pF parent=%pF",
+                 (void *)((unsigned long)(_stext) + __entry->caller_offs),
+                 (void *)((unsigned long)(_stext) + __entry->parent_offs))
+);
+
+#ifndef CONFIG_PROVE_LOCKING
+DEFINE_EVENT(preemptirq_template, irq_disable,
+            TP_PROTO(unsigned long ip, unsigned long parent_ip),
+            TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, irq_enable,
+            TP_PROTO(unsigned long ip, unsigned long parent_ip),
+            TP_ARGS(ip, parent_ip));
+#endif
+
+#ifdef CONFIG_DEBUG_PREEMPT
+DEFINE_EVENT(preemptirq_template, preempt_disable,
+            TP_PROTO(unsigned long ip, unsigned long parent_ip),
+            TP_ARGS(ip, parent_ip));
+
+DEFINE_EVENT(preemptirq_template, preempt_enable,
+            TP_PROTO(unsigned long ip, unsigned long parent_ip),
+            TP_ARGS(ip, parent_ip));
+#endif
+
+#endif /* _TRACE_PREEMPTIRQ_H */
+
+#include <trace/define_trace.h>
+
+#else /* !CONFIG_PREEMPTIRQ_EVENTS */
+
+#define trace_irq_enable(...)
+#define trace_irq_disable(...)
+#define trace_preempt_enable(...)
+#define trace_preempt_disable(...)
+#define trace_irq_enable_rcuidle(...)
+#define trace_irq_disable_rcuidle(...)
+#define trace_preempt_enable_rcuidle(...)
+#define trace_preempt_disable_rcuidle(...)
+
+#endif
index da10aa21bebc847bd1c71d1ffed177494bcb774e..7909097a1e5edf35b95064d323e747fc072bd52c 100644 (file)
@@ -226,7 +226,7 @@ DECLARE_EVENT_CLASS(sched_process_template,
 DEFINE_EVENT(sched_process_template, sched_process_free,
             TP_PROTO(struct task_struct *p),
             TP_ARGS(p));
-            
+
 
 /*
  * Tracepoint for a task exiting:
@@ -380,6 +380,30 @@ DEFINE_EVENT(sched_stat_template, sched_stat_blocked,
             TP_PROTO(struct task_struct *tsk, u64 delay),
             TP_ARGS(tsk, delay));
 
+/*
+ * Tracepoint for recording the cause of uninterruptible sleep.
+ */
+TRACE_EVENT(sched_blocked_reason,
+
+       TP_PROTO(struct task_struct *tsk),
+
+       TP_ARGS(tsk),
+
+       TP_STRUCT__entry(
+               __field( pid_t, pid     )
+               __field( void*, caller  )
+               __field( bool, io_wait  )
+       ),
+
+       TP_fast_assign(
+               __entry->pid    = tsk->pid;
+               __entry->caller = (void*)get_wchan(tsk);
+               __entry->io_wait = tsk->in_iowait;
+       ),
+
+       TP_printk("pid=%d iowait=%d caller=%pS", __entry->pid, __entry->io_wait, __entry->caller)
+);
+
 /*
  * Tracepoint for accounting runtime (time the task is executing
  * on a CPU).
@@ -570,6 +594,578 @@ TRACE_EVENT(sched_wake_idle_without_ipi,
 
        TP_printk("cpu=%d", __entry->cpu)
 );
+
+#ifdef CONFIG_SMP
+#ifdef CREATE_TRACE_POINTS
+static inline
+int __trace_sched_cpu(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       struct rq *rq = cfs_rq ? cfs_rq->rq : NULL;
+#else
+       struct rq *rq = cfs_rq ? container_of(cfs_rq, struct rq, cfs) : NULL;
+#endif
+       return rq ? cpu_of(rq)
+                 : task_cpu((container_of(se, struct task_struct, se)));
+}
+
+static inline
+int __trace_sched_path(struct cfs_rq *cfs_rq, char *path, int len)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       int l = path ? len : 0;
+
+       if (cfs_rq && task_group_is_autogroup(cfs_rq->tg))
+               return autogroup_path(cfs_rq->tg, path, l) + 1;
+       else if (cfs_rq && cfs_rq->tg->css.cgroup)
+               return cgroup_path(cfs_rq->tg->css.cgroup, path, l) + 1;
+#endif
+       if (path)
+               strcpy(path, "(null)");
+
+       return strlen("(null)");
+}
+
+static inline
+struct cfs_rq *__trace_sched_group_cfs_rq(struct sched_entity *se)
+{
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       return se->my_q;
+#else
+       return NULL;
+#endif
+}
+#endif /* CREATE_TRACE_POINTS */
+
+/*
+ * Tracepoint for cfs_rq load tracking:
+ */
+TRACE_EVENT(sched_load_cfs_rq,
+
+       TP_PROTO(struct cfs_rq *cfs_rq),
+
+       TP_ARGS(cfs_rq),
+
+       TP_STRUCT__entry(
+               __field(        int,            cpu                     )
+               __dynamic_array(char,           path,
+                               __trace_sched_path(cfs_rq, NULL, 0)     )
+               __field(        unsigned long,  load                    )
+               __field(        unsigned long,  util                    )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = __trace_sched_cpu(cfs_rq, NULL);
+               __trace_sched_path(cfs_rq, __get_dynamic_array(path),
+                                  __get_dynamic_array_len(path));
+               __entry->load   = cfs_rq->runnable_load_avg;
+               __entry->util   = cfs_rq->avg.util_avg;
+       ),
+
+       TP_printk("cpu=%d path=%s load=%lu util=%lu", __entry->cpu,
+                 __get_str(path), __entry->load, __entry->util)
+);
+
+/*
+ * Tracepoint for rt_rq load tracking:
+ */
+struct rt_rq;
+
+TRACE_EVENT(sched_load_rt_rq,
+
+       TP_PROTO(int cpu, struct rt_rq *rt_rq),
+
+       TP_ARGS(cpu, rt_rq),
+
+       TP_STRUCT__entry(
+               __field(        int,            cpu                     )
+               __field(        unsigned long,  util                    )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cpu;
+               __entry->util   = rt_rq->avg.util_avg;
+       ),
+
+       TP_printk("cpu=%d util=%lu", __entry->cpu,
+                 __entry->util)
+);
+
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+
+#define walt_util(util_var, demand_sum) {\
+       u64 sum = demand_sum << SCHED_CAPACITY_SHIFT;\
+       do_div(sum, walt_ravg_window);\
+       util_var = (typeof(util_var))sum;\
+       }
+#endif
+
+/*
+ * Tracepoint for accounting cpu root cfs_rq
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+        TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+        TP_ARGS(cpu, cfs_rq),
+
+        TP_STRUCT__entry(
+                __field( int,   cpu                             )
+                __field( unsigned long, load_avg                )
+                __field( unsigned long, util_avg                )
+                __field( unsigned long, util_avg_pelt           )
+                __field( unsigned long, util_avg_walt           )
+        ),
+
+        TP_fast_assign(
+                __entry->cpu                    = cpu;
+                __entry->load_avg               = cfs_rq->avg.load_avg;
+                __entry->util_avg               = cfs_rq->avg.util_avg;
+                __entry->util_avg_pelt  = cfs_rq->avg.util_avg;
+                __entry->util_avg_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+                walt_util(__entry->util_avg_walt, cpu_rq(cpu)->prev_runnable_sum);
+                if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+                        __entry->util_avg = __entry->util_avg_walt;
+#endif
+        ),
+
+        TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+                          "util_avg_pelt=%lu util_avg_walt=%lu",
+                  __entry->cpu, __entry->load_avg, __entry->util_avg,
+                  __entry->util_avg_pelt, __entry->util_avg_walt)
+);
+
+
+/*
+ * Tracepoint for sched_entity load tracking:
+ */
+TRACE_EVENT(sched_load_se,
+
+       TP_PROTO(struct sched_entity *se),
+
+       TP_ARGS(se),
+
+       TP_STRUCT__entry(
+               __field(        int,            cpu                           )
+               __dynamic_array(char,           path,
+                 __trace_sched_path(__trace_sched_group_cfs_rq(se), NULL, 0) )
+               __array(        char,           comm,   TASK_COMM_LEN         )
+               __field(        pid_t,          pid                           )
+               __field(        unsigned long,  load                          )
+               __field(        unsigned long,  util                          )
+               __field(        unsigned long,  util_pelt                     )
+               __field(        unsigned long,  util_walt                     )
+       ),
+
+       TP_fast_assign(
+               struct cfs_rq *gcfs_rq = __trace_sched_group_cfs_rq(se);
+               struct task_struct *p = gcfs_rq ? NULL
+                                   : container_of(se, struct task_struct, se);
+
+               __entry->cpu = __trace_sched_cpu(gcfs_rq, se);
+               __trace_sched_path(gcfs_rq, __get_dynamic_array(path),
+                                  __get_dynamic_array_len(path));
+               memcpy(__entry->comm, p ? p->comm : "(null)", TASK_COMM_LEN);
+               __entry->pid = p ? p->pid : -1;
+               __entry->load = se->avg.load_avg;
+               __entry->util = se->avg.util_avg;
+               __entry->util_pelt  = __entry->util;
+               __entry->util_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+               if (!se->my_q) {
+                       struct task_struct *p = container_of(se, struct task_struct, se);
+                       walt_util(__entry->util_walt, p->ravg.demand);
+                       if (!walt_disabled && sysctl_sched_use_walt_task_util)
+                               __entry->util = __entry->util_walt;
+               }
+#endif
+       ),
+
+       TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu util_pelt=%lu util_walt=%lu",
+                 __entry->cpu, __get_str(path), __entry->comm,
+                 __entry->pid, __entry->load, __entry->util,
+                 __entry->util_pelt, __entry->util_walt)
+);
+
+/*
+ * Tracepoint for task_group load tracking:
+ */
+#ifdef CONFIG_FAIR_GROUP_SCHED
+TRACE_EVENT(sched_load_tg,
+
+       TP_PROTO(struct cfs_rq *cfs_rq),
+
+       TP_ARGS(cfs_rq),
+
+       TP_STRUCT__entry(
+               __field(        int,    cpu                             )
+               __dynamic_array(char,   path,
+                               __trace_sched_path(cfs_rq, NULL, 0)     )
+               __field(        long,   load                            )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cfs_rq->rq->cpu;
+               __trace_sched_path(cfs_rq, __get_dynamic_array(path),
+                                  __get_dynamic_array_len(path));
+               __entry->load   = atomic_long_read(&cfs_rq->tg->load_avg);
+       ),
+
+       TP_printk("cpu=%d path=%s load=%ld", __entry->cpu, __get_str(path),
+                 __entry->load)
+);
+#endif /* CONFIG_FAIR_GROUP_SCHED */
+
+/*
+ * Tracepoint for accounting CPU  boosted utilization
+ */
+TRACE_EVENT(sched_boost_cpu,
+
+       TP_PROTO(int cpu, unsigned long util, long margin),
+
+       TP_ARGS(cpu, util, margin),
+
+       TP_STRUCT__entry(
+               __field( int,           cpu                     )
+               __field( unsigned long, util                    )
+               __field(long,           margin                  )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu    = cpu;
+               __entry->util   = util;
+               __entry->margin = margin;
+       ),
+
+       TP_printk("cpu=%d util=%lu margin=%ld",
+                 __entry->cpu,
+                 __entry->util,
+                 __entry->margin)
+);
+
+/*
+ * Tracepoint for schedtune_tasks_update
+ */
+TRACE_EVENT(sched_tune_tasks_update,
+
+       TP_PROTO(struct task_struct *tsk, int cpu, int tasks, int idx,
+               int boost, int max_boost),
+
+       TP_ARGS(tsk, cpu, tasks, idx, boost, max_boost),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN   )
+               __field( pid_t,         pid             )
+               __field( int,           cpu             )
+               __field( int,           tasks           )
+               __field( int,           idx             )
+               __field( int,           boost           )
+               __field( int,           max_boost       )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid            = tsk->pid;
+               __entry->cpu            = cpu;
+               __entry->tasks          = tasks;
+               __entry->idx            = idx;
+               __entry->boost          = boost;
+               __entry->max_boost      = max_boost;
+       ),
+
+       TP_printk("pid=%d comm=%s "
+                       "cpu=%d tasks=%d idx=%d boost=%d max_boost=%d",
+               __entry->pid, __entry->comm,
+               __entry->cpu, __entry->tasks, __entry->idx,
+               __entry->boost, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for schedtune_boostgroup_update
+ */
+TRACE_EVENT(sched_tune_boostgroup_update,
+
+       TP_PROTO(int cpu, int variation, int max_boost),
+
+       TP_ARGS(cpu, variation, max_boost),
+
+       TP_STRUCT__entry(
+               __field( int,   cpu             )
+               __field( int,   variation       )
+               __field( int,   max_boost       )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu            = cpu;
+               __entry->variation      = variation;
+               __entry->max_boost      = max_boost;
+       ),
+
+       TP_printk("cpu=%d variation=%d max_boost=%d",
+               __entry->cpu, __entry->variation, __entry->max_boost)
+);
+
+/*
+ * Tracepoint for accounting task boosted utilization
+ */
+TRACE_EVENT(sched_boost_task,
+
+       TP_PROTO(struct task_struct *tsk, unsigned long util, long margin),
+
+       TP_ARGS(tsk, util, margin),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN           )
+               __field( pid_t,         pid                     )
+               __field( unsigned long, util                    )
+               __field( long,          margin                  )
+
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid    = tsk->pid;
+               __entry->util   = util;
+               __entry->margin = margin;
+       ),
+
+       TP_printk("comm=%s pid=%d util=%lu margin=%ld",
+                 __entry->comm, __entry->pid,
+                 __entry->util,
+                 __entry->margin)
+);
+
+/*
+ * Tracepoint for system overutilized flag
+ */
+struct sched_domain;
+TRACE_EVENT_CONDITION(sched_overutilized,
+
+       TP_PROTO(struct sched_domain *sd, bool was_overutilized, bool overutilized),
+
+       TP_ARGS(sd, was_overutilized, overutilized),
+
+       TP_CONDITION(overutilized != was_overutilized),
+
+       TP_STRUCT__entry(
+               __field( bool,  overutilized      )
+               __array( char,  cpulist , 32      )
+       ),
+
+       TP_fast_assign(
+               __entry->overutilized   = overutilized;
+               scnprintf(__entry->cpulist, sizeof(__entry->cpulist), "%*pbl", cpumask_pr_args(sched_domain_span(sd)));
+       ),
+
+       TP_printk("overutilized=%d sd_span=%s",
+               __entry->overutilized ? 1 : 0, __entry->cpulist)
+);
+
+/*
+ * Tracepoint for find_best_target
+ */
+TRACE_EVENT(sched_find_best_target,
+
+       TP_PROTO(struct task_struct *tsk, bool prefer_idle,
+               unsigned long min_util, int start_cpu,
+               int best_idle, int best_active, int target),
+
+       TP_ARGS(tsk, prefer_idle, min_util, start_cpu,
+               best_idle, best_active, target),
+
+       TP_STRUCT__entry(
+               __array( char,  comm,   TASK_COMM_LEN   )
+               __field( pid_t, pid                     )
+               __field( unsigned long, min_util        )
+               __field( bool,  prefer_idle             )
+               __field( int,   start_cpu               )
+               __field( int,   best_idle               )
+               __field( int,   best_active             )
+               __field( int,   target                  )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+               __entry->pid            = tsk->pid;
+               __entry->min_util       = min_util;
+               __entry->prefer_idle    = prefer_idle;
+               __entry->start_cpu      = start_cpu;
+               __entry->best_idle      = best_idle;
+               __entry->best_active    = best_active;
+               __entry->target         = target;
+       ),
+
+       TP_printk("pid=%d comm=%s prefer_idle=%d start_cpu=%d "
+                 "best_idle=%d best_active=%d target=%d",
+               __entry->pid, __entry->comm,
+               __entry->prefer_idle, __entry->start_cpu,
+               __entry->best_idle, __entry->best_active,
+               __entry->target)
+);
+
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+
+       TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+                                               u64 wallclock, u64 irqtime),
+
+       TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        pid_t,  cur_pid                 )
+               __field(        u64,    wallclock               )
+               __field(        u64,    mark_start              )
+               __field(        u64,    delta_m                 )
+               __field(        u64,    win_start               )
+               __field(        u64,    delta                   )
+               __field(        u64,    irqtime                 )
+               __array(    char,   evt, 16                     )
+               __field(unsigned int,   demand                  )
+               __field(unsigned int,   sum                     )
+               __field(         int,   cpu                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        u32,    curr_window             )
+               __field(        u32,    prev_window             )
+               __field(        u64,    nt_cs                   )
+               __field(        u64,    nt_ps                   )
+               __field(        u32,    active_windows          )
+       ),
+
+       TP_fast_assign(
+                       static const char* walt_event_names[] =
+                       {
+                               "PUT_PREV_TASK",
+                               "PICK_NEXT_TASK",
+                               "TASK_WAKE",
+                               "TASK_MIGRATE",
+                               "TASK_UPDATE",
+                               "IRQ_UPDATE"
+                       };
+               __entry->wallclock      = wallclock;
+               __entry->win_start      = rq->window_start;
+               __entry->delta          = (wallclock - rq->window_start);
+               strcpy(__entry->evt, walt_event_names[evt]);
+               __entry->cpu            = rq->cpu;
+               __entry->cur_pid        = rq->curr->pid;
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->mark_start     = p->ravg.mark_start;
+               __entry->delta_m        = (wallclock - p->ravg.mark_start);
+               __entry->demand         = p->ravg.demand;
+               __entry->sum            = p->ravg.sum;
+               __entry->irqtime        = irqtime;
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->curr_window    = p->ravg.curr_window;
+               __entry->prev_window    = p->ravg.prev_window;
+               __entry->nt_cs          = rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = rq->nt_prev_runnable_sum;
+               __entry->active_windows = p->ravg.active_windows;
+       ),
+
+       TP_printk("wallclock=%llu window_start=%llu delta=%llu event=%s cpu=%d cur_pid=%d pid=%d comm=%s"
+               " mark_start=%llu delta=%llu demand=%u sum=%u irqtime=%llu"
+               " curr_runnable_sum=%llu prev_runnable_sum=%llu cur_window=%u"
+               " prev_window=%u nt_curr_runnable_sum=%llu nt_prev_runnable_sum=%llu active_windows=%u",
+               __entry->wallclock, __entry->win_start, __entry->delta,
+               __entry->evt, __entry->cpu, __entry->cur_pid,
+               __entry->pid, __entry->comm, __entry->mark_start,
+               __entry->delta_m, __entry->demand,
+               __entry->sum, __entry->irqtime,
+               __entry->cs, __entry->ps,
+               __entry->curr_window, __entry->prev_window,
+               __entry->nt_cs, __entry->nt_ps,
+               __entry->active_windows
+               )
+);
+
+TRACE_EVENT(walt_update_history,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+                       int evt),
+
+       TP_ARGS(rq, p, runtime, samples, evt),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(unsigned int,   runtime                 )
+               __field(         int,   samples                 )
+               __field(         int,   evt                     )
+               __field(         u64,   demand                  )
+               __field(unsigned int,   walt_avg                )
+               __field(unsigned int,   pelt_avg                )
+               __array(         u32,   hist, RAVG_HIST_SIZE_MAX)
+               __field(         int,   cpu                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->runtime        = runtime;
+               __entry->samples        = samples;
+               __entry->evt            = evt;
+               __entry->demand         = p->ravg.demand;
+               walt_util(__entry->walt_avg,__entry->demand);
+               __entry->pelt_avg       = p->se.avg.util_avg;
+               memcpy(__entry->hist, p->ravg.sum_history,
+                                       RAVG_HIST_SIZE_MAX * sizeof(u32));
+               __entry->cpu            = rq->cpu;
+       ),
+
+       TP_printk("pid=%d comm=%s runtime=%u samples=%d event=%d demand=%llu ravg_window=%u"
+               " walt=%u pelt=%u hist0=%u hist1=%u hist2=%u hist3=%u hist4=%u cpu=%d",
+               __entry->pid, __entry->comm,
+               __entry->runtime, __entry->samples, __entry->evt,
+               __entry->demand,
+               walt_ravg_window,
+               __entry->walt_avg,
+               __entry->pelt_avg,
+               __entry->hist[0], __entry->hist[1],
+               __entry->hist[2], __entry->hist[3],
+               __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p),
+
+       TP_ARGS(rq, p),
+
+       TP_STRUCT__entry(
+               __field(int,            cpu                     )
+               __field(int,            pid                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        s64,    nt_cs                   )
+               __field(        s64,    nt_ps                   )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu            = cpu_of(rq);
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->nt_cs          = (s64)rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = (s64)rq->nt_prev_runnable_sum;
+               __entry->pid            = p->pid;
+       ),
+
+       TP_printk("cpu=%d curr_runnable_sum=%llu prev_runnable_sum=%llu nt_curr_runnable_sum=%lld nt_prev_runnable_sum=%lld pid=%d",
+                 __entry->cpu, __entry->cs, __entry->ps,
+                 __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
+#endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
 /* This part must be outside protection */
index bfaec6903b8bca4b4091a53010dd50e3efeea893..b4723e36b6cf2853f30294ebaf9b5a6761fde835 100644 (file)
@@ -38,9 +38,56 @@ enum {
        BINDER_TYPE_PTR         = B_PACK_CHARS('p', 't', '*', B_TYPE_LARGE),
 };
 
-enum {
+/**
+ * enum flat_binder_object_shifts: shift values for flat_binder_object_flags
+ * @FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT: shift for getting scheduler policy.
+ *
+ */
+enum flat_binder_object_shifts {
+       FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT = 9,
+};
+
+/**
+ * enum flat_binder_object_flags - flags for use in flat_binder_object.flags
+ */
+enum flat_binder_object_flags {
+       /**
+        * @FLAT_BINDER_FLAG_PRIORITY_MASK: bit-mask for min scheduler priority
+        *
+        * These bits can be used to set the minimum scheduler priority
+        * at which transactions into this node should run. Valid values
+        * in these bits depend on the scheduler policy encoded in
+        * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK.
+        *
+        * For SCHED_NORMAL/SCHED_BATCH, the valid range is [-20..19].
+        * For SCHED_FIFO/SCHED_RR, the valid range is [1..99].
+        */
        FLAT_BINDER_FLAG_PRIORITY_MASK = 0xff,
+       /**
+        * @FLAT_BINDER_FLAG_ACCEPTS_FDS: whether the node accepts fds.
+        */
        FLAT_BINDER_FLAG_ACCEPTS_FDS = 0x100,
+       /**
+        * @FLAT_BINDER_FLAG_SCHED_POLICY_MASK: bit-mask for scheduling policy
+        *
+        * These two bits can be used to set the min scheduling policy at which
+        * transactions on this node should run. These match the UAPI
+        * scheduler policy values, eg:
+        * scheduler policy values, e.g.:
+        * 01b: SCHED_FIFO
+        * 10b: SCHED_RR
+        * 11b: SCHED_BATCH
+        */
+       FLAT_BINDER_FLAG_SCHED_POLICY_MASK =
+               3U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT,
+
+       /**
+        * @FLAT_BINDER_FLAG_INHERIT_RT: whether the node inherits RT policy
+        *
+        * Only when set, calls into this node will inherit a real-time
+        * scheduling policy from the caller (for synchronous transactions).
+        */
+       FLAT_BINDER_FLAG_INHERIT_RT = 0x800,
 };
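To make the new bit layout concrete, here is a purely hypothetical userspace sketch (not part of the patch) of composing flags for a node that accepts fds, requests at least SCHED_FIFO priority 50, and opts in to RT inheritance; 01b in the policy bits selects SCHED_FIFO per the comment above, and 50 lies in the FIFO/RR range [1..99]:

	__u32 flags = FLAT_BINDER_FLAG_ACCEPTS_FDS |
		      FLAT_BINDER_FLAG_INHERIT_RT |
		      (1U << FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT) |	/* 01b: SCHED_FIFO */
		      (50 & FLAT_BINDER_FLAG_PRIORITY_MASK);		/* min priority 50 */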
 
 #ifdef BINDER_IPC_32BIT
index 30f2ce76b5170c8ea04f93a8f8f70f4633e4322b..a88b2c458dccf97a71a17a97899e78c20b183c3e 100644 (file)
@@ -176,6 +176,10 @@ enum bpf_attach_type {
 /* Specify numa node during map creation */
 #define BPF_F_NUMA_NODE                (1U << 2)
 
+/* Flags for accessing BPF object */
+#define BPF_F_RDONLY           (1U << 3)
+#define BPF_F_WRONLY           (1U << 4)
+
 union bpf_attr {
        struct { /* anonymous struct used by BPF_MAP_CREATE command */
                __u32   map_type;       /* one of enum bpf_map_type */
@@ -216,6 +220,7 @@ union bpf_attr {
        struct { /* anonymous struct used by BPF_OBJ_* commands */
                __aligned_u64   pathname;
                __u32           bpf_fd;
+               __u32           file_flags;
        };
 
        struct { /* anonymous struct used by BPF_PROG_ATTACH/DETACH commands */
@@ -243,6 +248,7 @@ union bpf_attr {
                        __u32           map_id;
                };
                __u32           next_id;
+               __u32           open_flags;
        };
 
        struct { /* anonymous struct used by BPF_OBJ_GET_INFO_BY_FD */
index 4b5001c57f4642d57559607e1398492f78bf998b..d5ac0ebd82c5ed368af439b282ec098fcd29d418 100644 (file)
@@ -376,6 +376,7 @@ enum fuse_opcode {
        FUSE_READDIRPLUS   = 44,
        FUSE_RENAME2       = 45,
        FUSE_LSEEK         = 46,
+       FUSE_CANONICAL_PATH = 2016,
 
        /* CUSE specific operations */
        CUSE_INIT          = 4096,
index b22a9c4e1b1248d73938fb271e83b516292130a1..8c3ea81d5bd77939a3f8f6a68db9c2529aa8ec7c 100644 (file)
@@ -166,6 +166,7 @@ enum {
        DEVCONF_ACCEPT_DAD,
        DEVCONF_FORCE_TLLAO,
        DEVCONF_NDISC_NOTIFY,
+       DEVCONF_ACCEPT_RA_RT_TABLE,
        DEVCONF_MLDV1_UNSOLICITED_REPORT_INTERVAL,
        DEVCONF_MLDV2_UNSOLICITED_REPORT_INTERVAL,
        DEVCONF_SUPPRESS_FRAG_NDISC,
diff --git a/include/uapi/linux/keychord.h b/include/uapi/linux/keychord.h
new file mode 100644 (file)
index 0000000..ea7cf4d
--- /dev/null
@@ -0,0 +1,52 @@
+/*
+ *  Key chord input driver
+ *
+ * Copyright (C) 2008 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+*/
+
+#ifndef _UAPI_LINUX_KEYCHORD_H_
+#define _UAPI_LINUX_KEYCHORD_H_
+
+#include <linux/input.h>
+
+#define KEYCHORD_VERSION               1
+
+/*
+ * One or more input_keychord structs are written to /dev/keychord
+ * at once to specify the list of keychords to monitor.
+ * Reading /dev/keychord returns the id of a keychord when the
+ * keychord combination is pressed.  A keychord is signalled when
+ * all of the keys in the keycode list are in the pressed state.
+ * The order in which the keys are pressed does not matter.
+ * The keychord will not be signalled if keys not in the keycode
+ * list are pressed.
+ * Keychords will not be signalled on key release events.
+ */
+struct input_keychord {
+       /* should be KEYCHORD_VERSION */
+       __u16 version;
+       /*
+        * client specified ID, returned from read()
+        * when this keychord is pressed.
+        */
+       __u16 id;
+
+       /* number of keycodes in this keychord */
+       __u16 count;
+
+       /* variable length array of keycodes */
+       __u16 keycodes[];
+};
+
+#endif /* _UAPI_LINUX_KEYCHORD_H_ */
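The comment above describes the write/read protocol; the following is a minimal hypothetical userspace sketch (the key choice is arbitrary and error handling is omitted). Because every field of struct input_keychord is a __u16, the request can be laid out as a flat array of __u16 values:

	#include <fcntl.h>
	#include <unistd.h>
	#include <linux/input.h>
	#include <linux/keychord.h>

	/* version, id, count, then the keycodes */
	static const __u16 chord[] = {
		KEYCHORD_VERSION, 1 /* id */, 2 /* count */,
		KEY_VOLUMEDOWN, KEY_POWER,
	};

	int wait_for_chord(void)
	{
		__u16 id = 0;
		int fd = open("/dev/keychord", O_RDWR);

		write(fd, chord, sizeof(chord));
		read(fd, &id, sizeof(id));	/* blocks until the chord is pressed */
		close(fd);
		return id;
	}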
index aa50113ebe5b0bec825ac29ddc8a37fc3a9d90b6..5bdb07fed129b58e5e64d2bc38aaa52b4c3c725c 100644 (file)
@@ -55,6 +55,8 @@
 #define REISER2FS_SUPER_MAGIC_STRING   "ReIsEr2Fs"
 #define REISER2FS_JR_SUPER_MAGIC_STRING        "ReIsEr3Fs"
 
+#define SDCARDFS_SUPER_MAGIC   0x5dca2df5
+
 #define SMB_SUPER_MAGIC                0x517B
 #define CGROUP_SUPER_MAGIC     0x27e0eb
 #define CGROUP2_SUPER_MAGIC    0x63677270
index 3c586a19baeab5ab683c0c6eb03c948f96701710..c82a1c1d53ec014686b70b19366f344c4ceaff3f 100644 (file)
@@ -5,6 +5,7 @@
  * Header file for Xtables timer target module.
  *
  * Copyright (C) 2004, 2010 Nokia Corporation
+ *
  * Written by Timo Teras <ext-timo.teras@nokia.com>
  *
  * Converted to x_tables and forward-ported to 2.6.34
 #include <linux/types.h>
 
 #define MAX_IDLETIMER_LABEL_SIZE 28
+#define NLMSG_MAX_SIZE 64
+
+#define NL_EVENT_TYPE_INACTIVE 0
+#define NL_EVENT_TYPE_ACTIVE 1
 
 struct idletimer_tg_info {
        __u32 timeout;
 
        char label[MAX_IDLETIMER_LABEL_SIZE];
 
+       /* Use netlink messages for notification in addition to sysfs */
+       __u8 send_nl_msg;
+
        /* for kernel module internal use only */
        struct idletimer_tg *timer __attribute__((aligned(8)));
 };
index b640071421f709e783f839b3d4c17344d8c86dac..56c0ed196a1f3bb3d197fcb76327eb3e5628d90a 100644 (file)
@@ -198,4 +198,7 @@ struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER          3
 # define PR_CAP_AMBIENT_CLEAR_ALL      4
 
+#define PR_SET_VMA             0x53564d41
+# define PR_SET_VMA_ANON_NAME          0
+
 #endif /* _LINUX_PRCTL_H */
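PR_SET_VMA_ANON_NAME is meant to be paired with an anonymous mapping; below is a minimal hypothetical userspace sketch (the label string is arbitrary and the constants are assumed to be visible through this header):

	#include <sys/mman.h>
	#include <sys/prctl.h>
	#include <linux/prctl.h>

	static void name_example_mapping(void)
	{
		void *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

		/* On kernels with this feature the region is labelled in
		 * /proc/<pid>/maps; elsewhere the call simply fails. */
		prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
		      (unsigned long)p, 4096, (unsigned long)"example heap");
	}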
diff --git a/include/uapi/linux/usb/f_accessory.h b/include/uapi/linux/usb/f_accessory.h
new file mode 100644 (file)
index 0000000..0baeb7d
--- /dev/null
@@ -0,0 +1,146 @@
+/*
+ * Gadget Function Driver for Android USB accessories
+ *
+ * Copyright (C) 2011 Google, Inc.
+ * Author: Mike Lockwood <lockwood@android.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#ifndef _UAPI_LINUX_USB_F_ACCESSORY_H
+#define _UAPI_LINUX_USB_F_ACCESSORY_H
+
+/* Use Google Vendor ID when in accessory mode */
+#define USB_ACCESSORY_VENDOR_ID 0x18D1
+
+
+/* Product ID to use when in accessory mode */
+#define USB_ACCESSORY_PRODUCT_ID 0x2D00
+
+/* Product ID to use when in accessory mode and adb is enabled */
+#define USB_ACCESSORY_ADB_PRODUCT_ID 0x2D01
+
+/* Indexes for strings sent by the host via ACCESSORY_SEND_STRING */
+#define ACCESSORY_STRING_MANUFACTURER   0
+#define ACCESSORY_STRING_MODEL          1
+#define ACCESSORY_STRING_DESCRIPTION    2
+#define ACCESSORY_STRING_VERSION        3
+#define ACCESSORY_STRING_URI            4
+#define ACCESSORY_STRING_SERIAL         5
+
+/* Control request for retrieving device's protocol version
+ *
+ *     requestType:    USB_DIR_IN | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_GET_PROTOCOL
+ *     value:          0
+ *     index:          0
+ *     data            version number (16 bits little endian)
+ *                     1 for original accessory support
+ *                     2 adds HID and device to host audio support
+ */
+#define ACCESSORY_GET_PROTOCOL  51
+
+/* Control request for host to send a string to the device
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_SEND_STRING
+ *     value:          0
+ *     index:          string ID
+ *     data            zero terminated UTF8 string
+ *
+ *  The device can later retrieve these strings via the
+ *  ACCESSORY_GET_STRING_* ioctls
+ */
+#define ACCESSORY_SEND_STRING   52
+
+/* Control request for starting device in accessory mode.
+ * The host sends this after setting all its strings to the device.
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_START
+ *     value:          0
+ *     index:          0
+ *     data            none
+ */
+#define ACCESSORY_START         53
+
+/* Control request for registering a HID device.
+ * Upon registering, a unique ID is sent by the accessory in the
+ * value parameter. This ID will be used for future commands for
+ * the device
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_REGISTER_HID_DEVICE
+ *     value:          Accessory assigned ID for the HID device
+ *     index:          total length of the HID report descriptor
+ *     data            none
+ */
+#define ACCESSORY_REGISTER_HID         54
+
+/* Control request for unregistering a HID device.
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_REGISTER_HID
+ *     value:          Accessory assigned ID for the HID device
+ *     index:          0
+ *     data            none
+ */
+#define ACCESSORY_UNREGISTER_HID         55
+
+/* Control request for sending the HID report descriptor.
+ * If the HID descriptor is longer than the endpoint zero max packet size,
+ * the descriptor will be sent in multiple ACCESSORY_SET_HID_REPORT_DESC
+ * commands. The data for the descriptor must be sent sequentially
+ * if multiple packets are needed.
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_SET_HID_REPORT_DESC
+ *     value:          Accessory assigned ID for the HID device
+ *     index:          offset of data in descriptor
+ *                      (needed when HID descriptor is too big for one packet)
+ *     data            the HID report descriptor
+ */
+#define ACCESSORY_SET_HID_REPORT_DESC         56
+
+/* Control request for sending HID events.
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_SEND_HID_EVENT
+ *     value:          Accessory assigned ID for the HID device
+ *     index:          0
+ *     data            the HID report for the event
+ */
+#define ACCESSORY_SEND_HID_EVENT         57
+
+/* Control request for setting the audio mode.
+ *
+ *     requestType:    USB_DIR_OUT | USB_TYPE_VENDOR
+ *     request:        ACCESSORY_SET_AUDIO_MODE
+ *     value:          0 - no audio
+ *                     1 - device to host, 44100 16-bit stereo PCM
+ *     index:          0
+ *     data            none
+ */
+#define ACCESSORY_SET_AUDIO_MODE         58
+
+/* ioctls for retrieving strings set by the host */
+#define ACCESSORY_GET_STRING_MANUFACTURER   _IOW('M', 1, char[256])
+#define ACCESSORY_GET_STRING_MODEL          _IOW('M', 2, char[256])
+#define ACCESSORY_GET_STRING_DESCRIPTION    _IOW('M', 3, char[256])
+#define ACCESSORY_GET_STRING_VERSION        _IOW('M', 4, char[256])
+#define ACCESSORY_GET_STRING_URI            _IOW('M', 5, char[256])
+#define ACCESSORY_GET_STRING_SERIAL         _IOW('M', 6, char[256])
+/* returns 1 if there is a start request pending */
+#define ACCESSORY_IS_START_REQUESTED        _IO('M', 7)
+/* returns audio mode (set via the ACCESSORY_SET_AUDIO_MODE control request) */
+#define ACCESSORY_GET_AUDIO_MODE            _IO('M', 8)
+
+#endif /* _UAPI_LINUX_USB_F_ACCESSORY_H */
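As a hypothetical userspace sketch of the ioctl half of this interface (the /dev/usb_accessory node name and the return-value handling are assumptions, not guarantees made by this header):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/usb/f_accessory.h>

	static void dump_host_strings(void)
	{
		char manufacturer[256], model[256];
		int fd = open("/dev/usb_accessory", O_RDWR);

		if (fd < 0)
			return;
		/* These strings were sent by the host via ACCESSORY_SEND_STRING. */
		if (ioctl(fd, ACCESSORY_GET_STRING_MANUFACTURER, manufacturer) >= 0 &&
		    ioctl(fd, ACCESSORY_GET_STRING_MODEL, model) >= 0)
			printf("accessory host: %s %s\n", manufacturer, model);
		close(fd);
	}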
index 46075327c165dd798b0283dd8600b21e2396a61b..cc828a1d223254be35018be72cc89f48b421ccdc 100644 (file)
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING
 
          If in doubt, say N here.
 
+config SCHED_WALT
+       bool "Support window based load tracking"
+       depends on SMP
+       help
+         This feature allows the scheduler to maintain a tunable, window-based
+         set of metrics for tasks and runqueues. These metrics can be used to
+         guide task placement as well as task frequency requirements for
+         cpufreq governors.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
@@ -959,6 +968,39 @@ config SCHED_AUTOGROUP
          desktop applications.  Task group autogeneration is currently based
          upon task session.
 
+config SCHED_TUNE
+       bool "Boosting for CFS tasks (EXPERIMENTAL)"
+       depends on SMP
+       help
+         This option enables support for task classification using a new
+         cgroup controller, schedtune. Schedtune allows tasks to be given
+         a boost value and marked as latency-sensitive or not. This option
+         provides the "schedtune" controller.
+
+         This new controller:
+         1. allows only a two-level hierarchy, where the root defines the
+            system-wide boost value and each of its direct children defines
+            a different "class of tasks" to be boosted with a different value
+         2. supports up to 16 different task classes, each of which can be
+            configured with a different boost value
+
+         Latency-sensitive tasks are not subject to energy-aware wakeup
+         task placement. The boost value assigned to tasks is used to
+         influence task placement and CPU frequency selection (if
+         utilization-driven frequency selection is in use).
+
+         If unsure, say N.
+
+config DEFAULT_USE_ENERGY_AWARE
+       bool "Default to enabling the Energy Aware Scheduler feature"
+       default n
+       help
+         This option defaults the ENERGY_AWARE scheduling feature to true,
+         as without SCHED_DEBUG set this feature can't be enabled or disabled
+         via sysctl.
+
+         Say N if unsure.
+
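For reference, a hypothetical defconfig fragment enabling the options added above; the values are illustrative only, and SCHED_WALT/SCHED_TUNE additionally require SMP:

	CONFIG_SCHED_WALT=y
	CONFIG_SCHED_TUNE=y
	CONFIG_DEFAULT_USE_ENERGY_AWARE=y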
 config SYSFS_DEPRECATED
        bool "Enable deprecated sysfs features to support old userspace tools"
        depends on SYSFS
index 1dbb23787290081a834162ce35bffbbb96acfaf7..0320e1a0705d20778f016cee1c425b5da278868b 100644 (file)
@@ -6,11 +6,8 @@
 ccflags-y := -fno-function-sections -fno-data-sections
 
 obj-y                          := main.o version.o mounts.o
-ifneq ($(CONFIG_BLK_DEV_INITRD),y)
 obj-y                          += noinitramfs.o
-else
 obj-$(CONFIG_BLK_DEV_INITRD)   += initramfs.o
-endif
 obj-$(CONFIG_GENERIC_CALIBRATE_DELAY) += calibrate.o
 
 ifneq ($(CONFIG_ARCH_INIT_TASK),y)
@@ -21,6 +18,7 @@ mounts-y                      := do_mounts.o
 mounts-$(CONFIG_BLK_DEV_RAM)   += do_mounts_rd.o
 mounts-$(CONFIG_BLK_DEV_INITRD)        += do_mounts_initrd.o
 mounts-$(CONFIG_BLK_DEV_MD)    += do_mounts_md.o
+mounts-$(CONFIG_BLK_DEV_DM)    += do_mounts_dm.o
 
 # dependencies on generated files need to be listed explicitly
 $(obj)/version.o: include/generated/compile.h
index f6d4dd764a52483f0fed18593f7c38e4d1cfc30a..c9ce810d2744d629d57857976fe64975ee715c23 100644 (file)
@@ -566,6 +566,7 @@ void __init prepare_namespace(void)
        wait_for_device_probe();
 
        md_run_setup();
+       dm_run_setup();
 
        if (saved_root_name[0]) {
                root_device_name = saved_root_name;
index 5b05c8f93f476c622f7ec95160c1759fee2d2503..cd201124714b5c35c8776f34a1b325e5a888a306 100644 (file)
@@ -61,3 +61,13 @@ void md_run_setup(void);
 static inline void md_run_setup(void) {}
 
 #endif
+
+#ifdef CONFIG_BLK_DEV_DM
+
+void dm_run_setup(void);
+
+#else
+
+static inline void dm_run_setup(void) {}
+
+#endif
diff --git a/init/do_mounts_dm.c b/init/do_mounts_dm.c
new file mode 100644 (file)
index 0000000..af84b01
--- /dev/null
@@ -0,0 +1,470 @@
+/* do_mounts_dm.c
+ * Copyright (C) 2010 The Chromium OS Authors <chromium-os-dev@chromium.org>
+ *                    All Rights Reserved.
+ * Based on do_mounts_md.c
+ *
+ * This file is released under the GPL.
+ */
+#include <linux/async.h>
+#include <linux/ctype.h>
+#include <linux/device-mapper.h>
+#include <linux/fs.h>
+#include <linux/string.h>
+#include <linux/delay.h>
+
+#include "do_mounts.h"
+
+#define DM_MAX_DEVICES 256
+#define DM_MAX_TARGETS 256
+#define DM_MAX_NAME 32
+#define DM_MAX_UUID 129
+#define DM_NO_UUID "none"
+
+#define DM_MSG_PREFIX "init"
+
+/* Separators used for parsing the dm= argument. */
+#define DM_FIELD_SEP " "
+#define DM_LINE_SEP ","
+#define DM_ANY_SEP DM_FIELD_SEP DM_LINE_SEP
+
+/*
+ * When the device-mapper and any targets are compiled into the kernel
+ * (not a module), one or more device-mappers may be created and used
+ * as the root device at boot time with the parameters given with the
+ * boot line dm=...
+ *
+ * Multiple device-mappers can be stacked by specifying the number of
+ * devices. A device can have multiple targets if the number of
+ * targets is specified.
+ *
+ * TODO(taysom:defect 32847)
+ * In the future, the <num> field will be mandatory.
+ *
+ * <device>        ::= [<num>] <device-mapper>+
+ * <device-mapper> ::= <head> "," <target>+
+ * <head>          ::= <name> <uuid> <mode> [<num>]
+ * <target>        ::= <start> <length> <type> <options> ","
+ * <mode>          ::= "ro" | "rw"
+ * <uuid>          ::= xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx | "none"
+ * <type>          ::= "verity" | "bootcache" | ...
+ *
+ * Example:
+ * 2 vboot none ro 1,
+ *     0 1768000 bootcache
+ *       device=aa55b119-2a47-8c45-946a-5ac57765011f+1
+ *       signature=76e9be054b15884a9fa85973e9cb274c93afadb6
+ *       cache_start=1768000 max_blocks=100000 size_limit=23 max_trace=20000,
+ *   vroot none ro 1,
+ *     0 1740800 verity payload=254:0 hashtree=254:0 hashstart=1740800 alg=sha1
+ *       root_hexdigest=76e9be054b15884a9fa85973e9cb274c93afadb6
+ *       salt=5b3549d54d6c7a3837b9b81ed72e49463a64c03680c47835bef94d768e5646fe
+ *
+ * Notes:
+ *  1. uuid is a label for the device and we set it to "none".
+ *  2. The <num> field will be optional initially and assumed to be 1.
+ *     Once all the scripts that construct these fields have been updated
+ *     to set it, it will be made mandatory.
+ */
+
+struct dm_setup_target {
+       sector_t begin;
+       sector_t length;
+       char *type;
+       char *params;
+       /* simple singly linked list */
+       struct dm_setup_target *next;
+};
+
+struct dm_device {
+       int minor;
+       int ro;
+       char name[DM_MAX_NAME];
+       char uuid[DM_MAX_UUID];
+       unsigned long num_targets;
+       struct dm_setup_target *target;
+       int target_count;
+       struct dm_device *next;
+};
+
+struct dm_option {
+       char *start;
+       char *next;
+       size_t len;
+       char delim;
+};
+
+static struct {
+       unsigned long num_devices;
+       char *str;
+} dm_setup_args __initdata;
+
+static __initdata int dm_early_setup;
+
+static int __init get_dm_option(struct dm_option *opt, const char *accept)
+{
+       char *str = opt->next;
+       char *endp;
+
+       if (!str)
+               return 0;
+
+       str = skip_spaces(str);
+       opt->start = str;
+       endp = strpbrk(str, accept);
+       if (!endp) {  /* act like strchrnul */
+               opt->len = strlen(str);
+               endp = str + opt->len;
+       } else {
+               opt->len = endp - str;
+       }
+       opt->delim = *endp;
+       if (*endp == 0) {
+               /* Don't advance past the nul. */
+               opt->next = endp;
+       } else {
+               opt->next = endp + 1;
+       }
+       return opt->len != 0;
+}
+
+static int __init dm_setup_cleanup(struct dm_device *devices)
+{
+       struct dm_device *dev = devices;
+
+       while (dev) {
+               struct dm_device *old_dev = dev;
+               struct dm_setup_target *target = dev->target;
+               while (target) {
+                       struct dm_setup_target *old_target = target;
+                       kfree(target->type);
+                       kfree(target->params);
+                       target = target->next;
+                       kfree(old_target);
+                       dev->target_count--;
+               }
+               BUG_ON(dev->target_count);
+               dev = dev->next;
+               kfree(old_dev);
+       }
+       return 0;
+}
+
+static char * __init dm_parse_device(struct dm_device *dev, char *str)
+{
+       struct dm_option opt;
+       size_t len;
+
+       /* Grab the logical name of the device to be exported to udev */
+       opt.next = str;
+       if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+               DMERR("failed to parse device name");
+               goto parse_fail;
+       }
+       len = min(opt.len + 1, sizeof(dev->name));
+       strlcpy(dev->name, opt.start, len);  /* includes nul */
+
+       /* Grab the UUID value or "none" */
+       if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+               DMERR("failed to parse device uuid");
+               goto parse_fail;
+       }
+       len = min(opt.len + 1, sizeof(dev->uuid));
+       strlcpy(dev->uuid, opt.start, len);
+
+       /* Determine if the table/device will be read only or read-write */
+       get_dm_option(&opt, DM_ANY_SEP);
+       if (!strncmp("ro", opt.start, opt.len)) {
+               dev->ro = 1;
+       } else if (!strncmp("rw", opt.start, opt.len)) {
+               dev->ro = 0;
+       } else {
+               DMERR("failed to parse table mode");
+               goto parse_fail;
+       }
+
+       /* Optional number field */
+       /* XXX: The <num> field will be mandatory in the next round */
+       if (opt.delim == DM_FIELD_SEP[0]) {
+               if (!get_dm_option(&opt, DM_LINE_SEP))
+                       return NULL;
+               dev->num_targets = simple_strtoul(opt.start, NULL, 10);
+       } else {
+               dev->num_targets = 1;
+       }
+       if (dev->num_targets > DM_MAX_TARGETS) {
+               DMERR("too many targets %lu > %d",
+                       dev->num_targets, DM_MAX_TARGETS);
+       }
+       return opt.next;
+
+parse_fail:
+       return NULL;
+}
+
+static char * __init dm_parse_targets(struct dm_device *dev, char *str)
+{
+       struct dm_option opt;
+       struct dm_setup_target **target = &dev->target;
+       unsigned long num_targets = dev->num_targets;
+       unsigned long i;
+
+       /* Targets are defined as in the dm table format, but with a
+        * comma instead of a newline as the separator. */
+       opt.next = str;
+       for (i = 0; i < num_targets; i++) {
+               *target = kzalloc(sizeof(struct dm_setup_target), GFP_KERNEL);
+               if (!*target) {
+                       DMERR("failed to allocate memory for target %s<%ld>",
+                               dev->name, i);
+                       goto parse_fail;
+               }
+               dev->target_count++;
+
+               if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+                       DMERR("failed to parse starting sector"
+                               " for target %s<%ld>", dev->name, i);
+                       goto parse_fail;
+               }
+               (*target)->begin = simple_strtoull(opt.start, NULL, 10);
+
+               if (!get_dm_option(&opt, DM_FIELD_SEP)) {
+                       DMERR("failed to parse length for target %s<%ld>",
+                               dev->name, i);
+                       goto parse_fail;
+               }
+               (*target)->length = simple_strtoull(opt.start, NULL, 10);
+
+               if (get_dm_option(&opt, DM_FIELD_SEP))
+                       (*target)->type = kstrndup(opt.start, opt.len,
+                                                       GFP_KERNEL);
+               if (!((*target)->type)) {
+                       DMERR("failed to parse type for target %s<%ld>",
+                               dev->name, i);
+                       goto parse_fail;
+               }
+               if (get_dm_option(&opt, DM_LINE_SEP))
+                       (*target)->params = kstrndup(opt.start, opt.len,
+                                                       GFP_KERNEL);
+               if (!((*target)->params)) {
+                       DMERR("failed to parse params for target %s<%ld>",
+                               dev->name, i);
+                       goto parse_fail;
+               }
+               target = &((*target)->next);
+       }
+       DMDEBUG("parsed %d targets", dev->target_count);
+
+       return opt.next;
+
+parse_fail:
+       return NULL;
+}
+
+static struct dm_device * __init dm_parse_args(void)
+{
+       struct dm_device *devices = NULL;
+       struct dm_device **tail = &devices;
+       struct dm_device *dev;
+       char *str = dm_setup_args.str;
+       unsigned long num_devices = dm_setup_args.num_devices;
+       unsigned long i;
+
+       if (!str)
+               return NULL;
+       for (i = 0; i < num_devices; i++) {
+               dev = kzalloc(sizeof(*dev), GFP_KERNEL);
+               if (!dev) {
+                       DMERR("failed to allocate memory for dev");
+                       goto error;
+               }
+               *tail = dev;
+               tail = &dev->next;
+               /*
+                * devices are given minor numbers 0 - n-1
+                * in the order they are found in the arg
+                * string.
+                */
+               dev->minor = i;
+               str = dm_parse_device(dev, str);
+               if (!str)       /* NULL indicates error in parsing, bail */
+                       goto error;
+
+               str = dm_parse_targets(dev, str);
+               if (!str)
+                       goto error;
+       }
+       return devices;
+error:
+       dm_setup_cleanup(devices);
+       return NULL;
+}
+
+/*
+ * Parse the command-line parameters given to our kernel, but do not
+ * actually try to invoke the DM device now; that is handled by
+ * dm_setup_drives() after the low-level disk drivers have initialised.
+ * The dm format is described at the top of the file.
+ *
+ * Because dm minor numbers are assigned in ascending order starting with 0,
+ * you can assume the first device is /dev/dm-0, the next device is /dev/dm-1,
+ * and so forth.
+ */
+static int __init dm_setup(char *str)
+{
+       struct dm_option opt;
+       unsigned long num_devices;
+
+       if (!str) {
+               DMDEBUG("str is NULL");
+               goto parse_fail;
+       }
+       opt.next = str;
+       if (!get_dm_option(&opt, DM_FIELD_SEP))
+               goto parse_fail;
+       if (isdigit(opt.start[0])) {    /* XXX: Optional number field */
+               num_devices = simple_strtoul(opt.start, NULL, 10);
+               str = opt.next;
+       } else {
+               num_devices = 1;
+               /* Don't advance str */
+       }
+       if (num_devices > DM_MAX_DEVICES) {
+               DMDEBUG("too many devices %lu > %d",
+                       num_devices, DM_MAX_DEVICES);
+       }
+       dm_setup_args.str = str;
+       dm_setup_args.num_devices = num_devices;
+       DMINFO("will configure %lu devices", num_devices);
+       dm_early_setup = 1;
+       return 1;
+
+parse_fail:
+       DMWARN("Invalid arguments supplied to dm=.");
+       return 0;
+}
+
+static void __init dm_setup_drives(void)
+{
+       struct mapped_device *md = NULL;
+       struct dm_table *table = NULL;
+       struct dm_setup_target *target;
+       struct dm_device *dev;
+       char *uuid;
+       fmode_t fmode = FMODE_READ;
+       struct dm_device *devices;
+
+       devices = dm_parse_args();
+
+       for (dev = devices; dev; dev = dev->next) {
+               if (dm_create(dev->minor, &md)) {
+                       DMDEBUG("failed to create the device");
+                       goto dm_create_fail;
+               }
+               DMDEBUG("created device '%s'", dm_device_name(md));
+
+               /*
+                * In addition to flagging the table below, the disk must be
+                * explicitly set ro/rw.
+                */
+               set_disk_ro(dm_disk(md), dev->ro);
+
+               if (!dev->ro)
+                       fmode |= FMODE_WRITE;
+               if (dm_table_create(&table, fmode, dev->target_count, md)) {
+                       DMDEBUG("failed to create the table");
+                       goto dm_table_create_fail;
+               }
+
+               dm_lock_md_type(md);
+
+               for (target = dev->target; target; target = target->next) {
+                       DMINFO("adding target '%llu %llu %s %s'",
+                              (unsigned long long) target->begin,
+                              (unsigned long long) target->length,
+                              target->type, target->params);
+                       if (dm_table_add_target(table, target->type,
+                                               target->begin,
+                                               target->length,
+                                               target->params)) {
+                               DMDEBUG("failed to add the target"
+                                       " to the table");
+                               goto add_target_fail;
+                       }
+               }
+               if (dm_table_complete(table)) {
+                       DMDEBUG("failed to complete the table");
+                       goto table_complete_fail;
+               }
+
+               /* Suspend the device so that we can bind it to the table. */
+               if (dm_suspend(md, 0)) {
+                       DMDEBUG("failed to suspend the device pre-bind");
+                       goto suspend_fail;
+               }
+
+               /* Initial table load: acquire type of table. */
+               dm_set_md_type(md, dm_table_get_type(table));
+
+               /* Setup md->queue to reflect md's type. */
+               if (dm_setup_md_queue(md, table)) {
+                       DMWARN("unable to set up device queue for new table.");
+                       goto setup_md_queue_fail;
+               }
+
+               /*
+                * Bind the table to the device. This is the only way
+                * to associate md->map with the table and set the disk
+                * capacity directly.
+                */
+               if (dm_swap_table(md, table)) {  /* should return NULL. */
+                       DMDEBUG("failed to bind the device to the table");
+                       goto table_bind_fail;
+               }
+
+               /* Finally, resume and the device should be ready. */
+               if (dm_resume(md)) {
+                       DMDEBUG("failed to resume the device");
+                       goto resume_fail;
+               }
+
+               /* Export the dm device via the ioctl interface */
+               if (!strcmp(DM_NO_UUID, dev->uuid))
+                       uuid = NULL;
+               if (dm_ioctl_export(md, dev->name, uuid)) {
+                       DMDEBUG("failed to export device with given"
+                               " name and uuid");
+                       goto export_fail;
+               }
+
+               dm_unlock_md_type(md);
+
+               DMINFO("dm-%d is ready", dev->minor);
+       }
+       dm_setup_cleanup(devices);
+       return;
+
+export_fail:
+resume_fail:
+table_bind_fail:
+setup_md_queue_fail:
+suspend_fail:
+table_complete_fail:
+add_target_fail:
+       dm_unlock_md_type(md);
+dm_table_create_fail:
+       dm_put(md);
+dm_create_fail:
+       DMWARN("starting dm-%d (%s) failed",
+              dev->minor, dev->name);
+       dm_setup_cleanup(devices);
+}
+
+__setup("dm=", dm_setup);
+
+void __init dm_run_setup(void)
+{
+       if (!dm_early_setup)
+               return;
+       DMINFO("attempting early device configuration.");
+       dm_setup_drives();
+}
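
Illustration (editorial, not part of the merged patch): the dm= grammar documented at the top of do_mounts_dm.c is tokenized by get_dm_option(), using spaces as field separators and commas as line separators. A minimal userspace sketch of that tokenizer, run on a hypothetical single-device argument, might look like this; the real parser additionally switches between the field and line separators so that target params keep their embedded spaces.

    #include <stdio.h>
    #include <string.h>

    struct opt { const char *start; const char *next; size_t len; char delim; };

    static int get_opt(struct opt *o, const char *accept)
    {
            const char *s = o->next;
            const char *end;

            if (!s)
                    return 0;
            while (*s == ' ')                       /* skip_spaces() */
                    s++;
            o->start = s;
            end = strpbrk(s, accept);               /* find the next separator */
            if (!end)                               /* act like strchrnul() */
                    end = s + strlen(s);
            o->len = (size_t)(end - s);
            o->delim = *end;
            o->next = *end ? end + 1 : end;         /* don't advance past the NUL */
            return o->len != 0;
    }

    int main(void)
    {
            /* Hypothetical argument in the <device-mapper> grammar above. */
            struct opt o = { .next = "vroot none ro 1, 0 1740800 verity payload=254:0" };

            while (get_opt(&o, " ,"))
                    printf("token: '%.*s' delim: '%c'\n", (int)o.len, o.start, o.delim);
            return 0;
    }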
index 7046feffef6b5852102a8c021afcaeb8420d3a74..5ea7f1b5ec44b12c8ea15e1ce2507cdcc4b9454e 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/syscalls.h>
 #include <linux/utime.h>
 #include <linux/file.h>
+#include <linux/initramfs.h>
 
 static ssize_t __init xwrite(int fd, const char *p, size_t count)
 {
@@ -607,10 +608,29 @@ static void __init clean_rootfs(void)
 }
 #endif
 
+static int __initdata do_skip_initramfs;
+
+static int __init skip_initramfs_param(char *str)
+{
+       if (*str)
+               return 0;
+       do_skip_initramfs = 1;
+       return 1;
+}
+__setup("skip_initramfs", skip_initramfs_param);
+
 static int __init populate_rootfs(void)
 {
+       char *err;
+
+       if (do_skip_initramfs) {
+               if (initrd_start)
+                       free_initrd();
+               return default_rootfs();
+       }
+
        /* Load the built in initramfs */
-       char *err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
+       err = unpack_to_rootfs(__initramfs_start, __initramfs_size);
        if (err)
                panic("%s", err); /* Failed to decompress INTERNAL initramfs */
        /* If available load the bootloader supplied initrd */
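
Usage note (editorial, not part of the patch): skip_initramfs is a bare flag; skip_initramfs_param() rejects any "skip_initramfs=<value>" form, so it must appear on the kernel command line exactly as "skip_initramfs". When present, populate_rootfs() frees a bootloader-supplied initrd (if any) and falls back to default_rootfs(), which pairs with the early dm= device-mapper setup above. A hypothetical boot line (all values are placeholders) might read:

    skip_initramfs rootwait ro init=/init root=/dev/dm-0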
index 267739d851791b0a3e4526a3be5fb4944d8b507e..bcc8bcb053eeb05dc095780fcbc276fa2f311bd4 100644 (file)
 #include <linux/stat.h>
 #include <linux/kdev_t.h>
 #include <linux/syscalls.h>
+#include <linux/kconfig.h>
+#include <linux/initramfs.h>
 
 /*
  * Create a simple rootfs that is similar to the default initramfs
  */
-static int __init default_rootfs(void)
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
+static
+#endif
+int __init default_rootfs(void)
 {
        int err;
 
@@ -49,4 +54,6 @@ out:
        printk(KERN_WARNING "Failed to create a rootfs\n");
        return err;
 }
+#if !IS_BUILTIN(CONFIG_BLK_DEV_INITRD)
 rootfs_initcall(default_rootfs);
+#endif
index d240256263103f89972ab5e62201549a7074a829..5f46c15009dabfffc3776991db5fe3469e342855 100644 (file)
@@ -747,7 +747,7 @@ static struct file *do_create(struct ipc_namespace *ipc_ns, struct inode *dir,
        }
 
        mode &= ~current_umask();
-       ret = vfs_create(dir, path->dentry, mode, true);
+       ret = vfs_create2(path->mnt, dir, path->dentry, mode, true);
        path->dentry->d_fsdata = NULL;
        if (ret)
                return ERR_PTR(ret);
@@ -763,7 +763,7 @@ static struct file *do_open(struct path *path, int oflag)
        if ((oflag & O_ACCMODE) == (O_RDWR | O_WRONLY))
                return ERR_PTR(-EINVAL);
        acc = oflag2acc[oflag & O_ACCMODE];
-       if (inode_permission(d_inode(path->dentry), acc))
+       if (inode_permission2(path->mnt, d_inode(path->dentry), acc))
                return ERR_PTR(-EACCES);
        return dentry_open(path, oflag, current_cred());
 }
@@ -792,7 +792,7 @@ static int do_mq_open(const char __user *u_name, int oflag, umode_t mode,
        ro = mnt_want_write(mnt);       /* we'll drop it in any case */
        error = 0;
        inode_lock(d_inode(root));
-       path.dentry = lookup_one_len(name->name, root, strlen(name->name));
+       path.dentry = lookup_one_len2(name->name, mnt, root, strlen(name->name));
        if (IS_ERR(path.dentry)) {
                error = PTR_ERR(path.dentry);
                goto out_putfd;
@@ -872,7 +872,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
        if (err)
                goto out_name;
        inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT);
-       dentry = lookup_one_len(name->name, mnt->mnt_root,
+       dentry = lookup_one_len2(name->name, mnt, mnt->mnt_root,
                                strlen(name->name));
        if (IS_ERR(dentry)) {
                err = PTR_ERR(dentry);
@@ -884,7 +884,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name)
                err = -ENOENT;
        } else {
                ihold(inode);
-               err = vfs_unlink(d_inode(dentry->d_parent), dentry, NULL);
+               err = vfs_unlink2(mnt, d_inode(dentry->d_parent), dentry, NULL);
        }
        dput(dentry);
 
index a4ae1ca44a5765fdbec078f2bb846ceecfe1508f..49ff958e88836e0f85ddda3e1f64fa43c3706f13 100644 (file)
@@ -19,6 +19,9 @@
 
 #include "map_in_map.h"
 
+#define ARRAY_CREATE_FLAG_MASK \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 static void bpf_array_free_percpu(struct bpf_array *array)
 {
        int i;
@@ -57,7 +60,8 @@ static struct bpf_map *array_map_alloc(union bpf_attr *attr)
 
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size == 0 || attr->map_flags & ~BPF_F_NUMA_NODE ||
+           attr->value_size == 0 ||
+           attr->map_flags & ~ARRAY_CREATE_FLAG_MASK ||
            (percpu && numa_node != NUMA_NO_NODE))
                return ERR_PTR(-EINVAL);
 
index e745d6a88224f5b5e9a8241dc7dee5d35e4832de..ebdef54bf7df2b898e7d03a5ddc3b0c8a7554bbb 100644 (file)
@@ -50,6 +50,9 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 
+#define DEV_CREATE_FLAG_MASK \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_dtab_netdev {
        struct net_device *dev;
        struct bpf_dtab *dtab;
@@ -83,7 +86,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+           attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);
 
        dtab = kzalloc(sizeof(*dtab), GFP_USER);
index 6533f08d1238e136895a5cf0665be31d7b23df51..e469e05c8e83bc3256378644e3f3c26555651261 100644 (file)
@@ -18,8 +18,9 @@
 #include "bpf_lru_list.h"
 #include "map_in_map.h"
 
-#define HTAB_CREATE_FLAG_MASK \
-       (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE)
+#define HTAB_CREATE_FLAG_MASK                                          \
+       (BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU | BPF_F_NUMA_NODE |    \
+        BPF_F_RDONLY | BPF_F_WRONLY)
 
 struct bucket {
        struct hlist_nulls_head head;
index be1dde967208eb19d1f92872275df5fdce437024..01aaef1a77c5af164660b0f75ac99e4bd7c55a52 100644 (file)
@@ -295,7 +295,7 @@ out:
 }
 
 static void *bpf_obj_do_get(const struct filename *pathname,
-                           enum bpf_type *type)
+                           enum bpf_type *type, int flags)
 {
        struct inode *inode;
        struct path path;
@@ -307,7 +307,7 @@ static void *bpf_obj_do_get(const struct filename *pathname,
                return ERR_PTR(ret);
 
        inode = d_backing_inode(path.dentry);
-       ret = inode_permission(inode, MAY_WRITE);
+       ret = inode_permission(inode, ACC_MODE(flags));
        if (ret)
                goto out;
 
@@ -326,18 +326,23 @@ out:
        return ERR_PTR(ret);
 }
 
-int bpf_obj_get_user(const char __user *pathname)
+int bpf_obj_get_user(const char __user *pathname, int flags)
 {
        enum bpf_type type = BPF_TYPE_UNSPEC;
        struct filename *pname;
        int ret = -ENOENT;
+       int f_flags;
        void *raw;
 
+       f_flags = bpf_get_file_flag(flags);
+       if (f_flags < 0)
+               return f_flags;
+
        pname = getname(pathname);
        if (IS_ERR(pname))
                return PTR_ERR(pname);
 
-       raw = bpf_obj_do_get(pname, &type);
+       raw = bpf_obj_do_get(pname, &type, f_flags);
        if (IS_ERR(raw)) {
                ret = PTR_ERR(raw);
                goto out;
@@ -346,7 +351,7 @@ int bpf_obj_get_user(const char __user *pathname)
        if (type == BPF_TYPE_PROG)
                ret = bpf_prog_new_fd(raw);
        else if (type == BPF_TYPE_MAP)
-               ret = bpf_map_new_fd(raw);
+               ret = bpf_map_new_fd(raw, f_flags);
        else
                goto out;
 
index 1b767844a76ff54b1e99c0c191964269a34c85a7..2dbd74582b6566e178c86545834b54960539de05 100644 (file)
@@ -406,7 +406,8 @@ static int trie_delete_elem(struct bpf_map *map, void *key)
 #define LPM_KEY_SIZE_MAX       LPM_KEY_SIZE(LPM_DATA_SIZE_MAX)
 #define LPM_KEY_SIZE_MIN       LPM_KEY_SIZE(LPM_DATA_SIZE_MIN)
 
-#define LPM_CREATE_FLAG_MASK   (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE)
+#define LPM_CREATE_FLAG_MASK   (BPF_F_NO_PREALLOC | BPF_F_NUMA_NODE |  \
+                                BPF_F_RDONLY | BPF_F_WRONLY)
 
 static struct bpf_map *trie_alloc(union bpf_attr *attr)
 {
index dbd7b322a86b5c3eb7150d2cc886223da6387b70..624c707668550e059400380197ee29f087670f4d 100644 (file)
@@ -41,6 +41,9 @@
 #include <net/strparser.h>
 #include <net/tcp.h>
 
+#define SOCK_CREATE_FLAG_MASK \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct bpf_stab {
        struct bpf_map map;
        struct sock **sock_map;
@@ -508,7 +511,7 @@ static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
 
        /* check sanity of attributes */
        if (attr->max_entries == 0 || attr->key_size != 4 ||
-           attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
+           attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);
 
        if (attr->value_size > KMALLOC_MAX_SIZE)
index 135be433e9a0fb9a3c18e96db336cf1817be1c1c..a15bc636cc98ff7b2fe55b0d50b5432e9d4b50de 100644 (file)
@@ -11,6 +11,9 @@
 #include <linux/perf_event.h>
 #include "percpu_freelist.h"
 
+#define STACK_CREATE_FLAG_MASK \
+       (BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+
 struct stack_map_bucket {
        struct pcpu_freelist_node fnode;
        u32 hash;
@@ -60,7 +63,7 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
        if (!capable(CAP_SYS_ADMIN))
                return ERR_PTR(-EPERM);
 
-       if (attr->map_flags & ~BPF_F_NUMA_NODE)
+       if (attr->map_flags & ~STACK_CREATE_FLAG_MASK)
                return ERR_PTR(-EINVAL);
 
        /* check sanity of attributes */
index 25d074920a009ff682d97bf88e68f466c79bd564..499e00fc805d520251af3713e9495e593b6b5b12 100644 (file)
@@ -31,6 +31,8 @@
 #define IS_FD_HASH(map) ((map)->map_type == BPF_MAP_TYPE_HASH_OF_MAPS)
 #define IS_FD_MAP(map) (IS_FD_ARRAY(map) || IS_FD_HASH(map))
 
+#define BPF_OBJ_FLAG_MASK   (BPF_F_RDONLY | BPF_F_WRONLY)
+
 DEFINE_PER_CPU(int, bpf_prog_active);
 static DEFINE_IDR(prog_idr);
 static DEFINE_SPINLOCK(prog_idr_lock);
@@ -207,6 +209,7 @@ static void bpf_map_free_deferred(struct work_struct *work)
        struct bpf_map *map = container_of(work, struct bpf_map, work);
 
        bpf_map_uncharge_memlock(map);
+       security_bpf_map_free(map);
        /* implementation dependent freeing */
        map->ops->map_free(map);
 }
@@ -291,17 +294,54 @@ static void bpf_map_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_map_fops = {
+static ssize_t bpf_dummy_read(struct file *filp, char __user *buf, size_t siz,
+                             loff_t *ppos)
+{
+       /* We need this handler such that alloc_file() enables
+        * f_mode with FMODE_CAN_READ.
+        */
+       return -EINVAL;
+}
+
+static ssize_t bpf_dummy_write(struct file *filp, const char __user *buf,
+                              size_t siz, loff_t *ppos)
+{
+       /* We need this handler such that alloc_file() enables
+        * f_mode with FMODE_CAN_WRITE.
+        */
+       return -EINVAL;
+}
+
+const struct file_operations bpf_map_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_map_show_fdinfo,
 #endif
        .release        = bpf_map_release,
+       .read           = bpf_dummy_read,
+       .write          = bpf_dummy_write,
 };
 
-int bpf_map_new_fd(struct bpf_map *map)
+int bpf_map_new_fd(struct bpf_map *map, int flags)
 {
+       int ret;
+
+       ret = security_bpf_map(map, OPEN_FMODE(flags));
+       if (ret < 0)
+               return ret;
+
        return anon_inode_getfd("bpf-map", &bpf_map_fops, map,
-                               O_RDWR | O_CLOEXEC);
+                               flags | O_CLOEXEC);
+}
+
+int bpf_get_file_flag(int flags)
+{
+       if ((flags & BPF_F_RDONLY) && (flags & BPF_F_WRONLY))
+               return -EINVAL;
+       if (flags & BPF_F_RDONLY)
+               return O_RDONLY;
+       if (flags & BPF_F_WRONLY)
+               return O_WRONLY;
+       return O_RDWR;
 }
 
 /* helper macro to check that unused fields 'union bpf_attr' are zero */
@@ -318,12 +358,17 @@ static int map_create(union bpf_attr *attr)
 {
        int numa_node = bpf_map_attr_numa_node(attr);
        struct bpf_map *map;
+       int f_flags;
        int err;
 
        err = CHECK_ATTR(BPF_MAP_CREATE);
        if (err)
                return -EINVAL;
 
+       f_flags = bpf_get_file_flag(attr->map_flags);
+       if (f_flags < 0)
+               return f_flags;
+
        if (numa_node != NUMA_NO_NODE &&
            ((unsigned int)numa_node >= nr_node_ids ||
             !node_online(numa_node)))
@@ -337,15 +382,19 @@ static int map_create(union bpf_attr *attr)
        atomic_set(&map->refcnt, 1);
        atomic_set(&map->usercnt, 1);
 
-       err = bpf_map_charge_memlock(map);
+       err = security_bpf_map_alloc(map);
        if (err)
                goto free_map_nouncharge;
 
+       err = bpf_map_charge_memlock(map);
+       if (err)
+               goto free_map_sec;
+
        err = bpf_map_alloc_id(map);
        if (err)
                goto free_map;
 
-       err = bpf_map_new_fd(map);
+       err = bpf_map_new_fd(map, f_flags);
        if (err < 0) {
                /* failed to allocate fd.
                 * bpf_map_put() is needed because the above
@@ -362,6 +411,8 @@ static int map_create(union bpf_attr *attr)
 
 free_map:
        bpf_map_uncharge_memlock(map);
+free_map_sec:
+       security_bpf_map_free(map);
 free_map_nouncharge:
        map->ops->map_free(map);
        return err;
@@ -460,6 +511,11 @@ static int map_lookup_elem(union bpf_attr *attr)
        if (IS_ERR(map))
                return PTR_ERR(map);
 
+       if (!(f.file->f_mode & FMODE_CAN_READ)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
        key = memdup_user(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
@@ -540,6 +596,11 @@ static int map_update_elem(union bpf_attr *attr)
        if (IS_ERR(map))
                return PTR_ERR(map);
 
+       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
        key = memdup_user(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
@@ -623,6 +684,11 @@ static int map_delete_elem(union bpf_attr *attr)
        if (IS_ERR(map))
                return PTR_ERR(map);
 
+       if (!(f.file->f_mode & FMODE_CAN_WRITE)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
        key = memdup_user(ukey, map->key_size);
        if (IS_ERR(key)) {
                err = PTR_ERR(key);
@@ -666,6 +732,11 @@ static int map_get_next_key(union bpf_attr *attr)
        if (IS_ERR(map))
                return PTR_ERR(map);
 
+       if (!(f.file->f_mode & FMODE_CAN_READ)) {
+               err = -EPERM;
+               goto err_put;
+       }
+
        if (ukey) {
                key = memdup_user(ukey, map->key_size);
                if (IS_ERR(key)) {
@@ -820,6 +891,7 @@ static void __bpf_prog_put_rcu(struct rcu_head *rcu)
 
        free_used_maps(aux);
        bpf_prog_uncharge_memlock(aux->prog);
+       security_bpf_prog_free(aux);
        bpf_prog_free(aux->prog);
 }
 
@@ -867,15 +939,23 @@ static void bpf_prog_show_fdinfo(struct seq_file *m, struct file *filp)
 }
 #endif
 
-static const struct file_operations bpf_prog_fops = {
+const struct file_operations bpf_prog_fops = {
 #ifdef CONFIG_PROC_FS
        .show_fdinfo    = bpf_prog_show_fdinfo,
 #endif
        .release        = bpf_prog_release,
+       .read           = bpf_dummy_read,
+       .write          = bpf_dummy_write,
 };
 
 int bpf_prog_new_fd(struct bpf_prog *prog)
 {
+       int ret;
+
+       ret = security_bpf_prog(prog);
+       if (ret < 0)
+               return ret;
+
        return anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog,
                                O_RDWR | O_CLOEXEC);
 }
@@ -1015,10 +1095,14 @@ static int bpf_prog_load(union bpf_attr *attr)
        if (!prog)
                return -ENOMEM;
 
-       err = bpf_prog_charge_memlock(prog);
+       err = security_bpf_prog_alloc(prog->aux);
        if (err)
                goto free_prog_nouncharge;
 
+       err = bpf_prog_charge_memlock(prog);
+       if (err)
+               goto free_prog_sec;
+
        prog->len = attr->insn_cnt;
 
        err = -EFAULT;
@@ -1071,16 +1155,18 @@ free_used_maps:
        free_used_maps(prog->aux);
 free_prog:
        bpf_prog_uncharge_memlock(prog);
+free_prog_sec:
+       security_bpf_prog_free(prog->aux);
 free_prog_nouncharge:
        bpf_prog_free(prog);
        return err;
 }
 
-#define BPF_OBJ_LAST_FIELD bpf_fd
+#define BPF_OBJ_LAST_FIELD file_flags
 
 static int bpf_obj_pin(const union bpf_attr *attr)
 {
-       if (CHECK_ATTR(BPF_OBJ))
+       if (CHECK_ATTR(BPF_OBJ) || attr->file_flags != 0)
                return -EINVAL;
 
        return bpf_obj_pin_user(attr->bpf_fd, u64_to_user_ptr(attr->pathname));
@@ -1088,10 +1174,12 @@ static int bpf_obj_pin(const union bpf_attr *attr)
 
 static int bpf_obj_get(const union bpf_attr *attr)
 {
-       if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0)
+       if (CHECK_ATTR(BPF_OBJ) || attr->bpf_fd != 0 ||
+           attr->file_flags & ~BPF_OBJ_FLAG_MASK)
                return -EINVAL;
 
-       return bpf_obj_get_user(u64_to_user_ptr(attr->pathname));
+       return bpf_obj_get_user(u64_to_user_ptr(attr->pathname),
+                               attr->file_flags);
 }
 
 #ifdef CONFIG_CGROUP_BPF
@@ -1305,20 +1393,26 @@ static int bpf_prog_get_fd_by_id(const union bpf_attr *attr)
        return fd;
 }
 
-#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD map_id
+#define BPF_MAP_GET_FD_BY_ID_LAST_FIELD open_flags
 
 static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
 {
        struct bpf_map *map;
        u32 id = attr->map_id;
+       int f_flags;
        int fd;
 
-       if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID))
+       if (CHECK_ATTR(BPF_MAP_GET_FD_BY_ID) ||
+           attr->open_flags & ~BPF_OBJ_FLAG_MASK)
                return -EINVAL;
 
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;
 
+       f_flags = bpf_get_file_flag(attr->open_flags);
+       if (f_flags < 0)
+               return f_flags;
+
        spin_lock_bh(&map_idr_lock);
        map = idr_find(&map_idr, id);
        if (map)
@@ -1330,7 +1424,7 @@ static int bpf_map_get_fd_by_id(const union bpf_attr *attr)
        if (IS_ERR(map))
                return PTR_ERR(map);
 
-       fd = bpf_map_new_fd(map);
+       fd = bpf_map_new_fd(map, f_flags);
        if (fd < 0)
                bpf_map_put(map);
 
@@ -1467,6 +1561,10 @@ SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, siz
        if (copy_from_user(&attr, uattr, size) != 0)
                return -EFAULT;
 
+       err = security_bpf(cmd, &attr, size);
+       if (err < 0)
+               return err;
+
        switch (cmd) {
        case BPF_MAP_CREATE:
                err = map_create(&attr);
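
To illustrate the BPF_F_RDONLY/BPF_F_WRONLY handling added above (a sketch, not part of the patch; it assumes uapi headers that already define these flags): a map created with BPF_F_RDONLY gets a file descriptor opened O_RDONLY via bpf_get_file_flag(), so a subsequent BPF_MAP_UPDATE_ELEM fails the FMODE_CAN_WRITE check with -EPERM.

    #include <errno.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>
    #include <linux/bpf.h>

    static long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
    {
            return syscall(__NR_bpf, cmd, attr, size);
    }

    int main(void)
    {
            union bpf_attr attr;
            unsigned long long val = 42;
            int fd, key = 0;

            memset(&attr, 0, sizeof(attr));
            attr.map_type = BPF_MAP_TYPE_ARRAY;
            attr.key_size = sizeof(key);
            attr.value_size = sizeof(val);
            attr.max_entries = 1;
            attr.map_flags = BPF_F_RDONLY;          /* fd will be opened O_RDONLY */

            fd = sys_bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
            if (fd < 0)
                    return 1;

            memset(&attr, 0, sizeof(attr));
            attr.map_fd = fd;
            attr.key = (unsigned long)&key;
            attr.value = (unsigned long)&val;

            /* Expected to fail with EPERM: the fd lacks FMODE_CAN_WRITE. */
            if (sys_bpf(BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) < 0)
                    printf("update failed as expected: %s\n", strerror(errno));

            close(fd);
            return 0;
    }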
index 024085daab1aede5958235b0663c19ec667b5836..cc816dcb83c75ec1d430dd75a9e6dc0f134a05a1 100644 (file)
@@ -537,7 +537,8 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of,
        tcred = get_task_cred(task);
        if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) &&
            !uid_eq(cred->euid, tcred->uid) &&
-           !uid_eq(cred->euid, tcred->suid))
+           !uid_eq(cred->euid, tcred->suid) &&
+           !ns_capable(tcred->user_ns, CAP_SYS_NICE))
                ret = -EACCES;
        put_cred(tcred);
        if (ret)
index 4657e2924ecb1a72281cc70e3c46f25a2f2ce962..2982fb7f41e0310d55198e72613e3f0ec15b0d94 100644 (file)
@@ -103,6 +103,7 @@ struct cpuset {
 
        /* user-configured CPUs and Memory Nodes allow to tasks */
        cpumask_var_t cpus_allowed;
+       cpumask_var_t cpus_requested;
        nodemask_t mems_allowed;
 
        /* effective CPUs and Memory Nodes allow to tasks */
@@ -412,7 +413,7 @@ static void cpuset_update_task_spread_flag(struct cpuset *cs,
 
 static int is_cpuset_subset(const struct cpuset *p, const struct cpuset *q)
 {
-       return  cpumask_subset(p->cpus_allowed, q->cpus_allowed) &&
+       return  cpumask_subset(p->cpus_requested, q->cpus_requested) &&
                nodes_subset(p->mems_allowed, q->mems_allowed) &&
                is_cpu_exclusive(p) <= is_cpu_exclusive(q) &&
                is_mem_exclusive(p) <= is_mem_exclusive(q);
@@ -511,7 +512,7 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial)
        cpuset_for_each_child(c, css, par) {
                if ((is_cpu_exclusive(trial) || is_cpu_exclusive(c)) &&
                    c != cur &&
-                   cpumask_intersects(trial->cpus_allowed, c->cpus_allowed))
+                   cpumask_intersects(trial->cpus_requested, c->cpus_requested))
                        goto out;
                if ((is_mem_exclusive(trial) || is_mem_exclusive(c)) &&
                    c != cur &&
@@ -976,17 +977,18 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
        if (!*buf) {
                cpumask_clear(trialcs->cpus_allowed);
        } else {
-               retval = cpulist_parse(buf, trialcs->cpus_allowed);
+               retval = cpulist_parse(buf, trialcs->cpus_requested);
                if (retval < 0)
                        return retval;
 
-               if (!cpumask_subset(trialcs->cpus_allowed,
-                                   top_cpuset.cpus_allowed))
+               if (!cpumask_subset(trialcs->cpus_requested, cpu_present_mask))
                        return -EINVAL;
+
+               cpumask_and(trialcs->cpus_allowed, trialcs->cpus_requested, cpu_active_mask);
        }
 
        /* Nothing to do if the cpus didn't change */
-       if (cpumask_equal(cs->cpus_allowed, trialcs->cpus_allowed))
+       if (cpumask_equal(cs->cpus_requested, trialcs->cpus_requested))
                return 0;
 
        retval = validate_change(cs, trialcs);
@@ -995,6 +997,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs,
 
        spin_lock_irq(&callback_lock);
        cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed);
+       cpumask_copy(cs->cpus_requested, trialcs->cpus_requested);
        spin_unlock_irq(&callback_lock);
 
        /* use trialcs->cpus_allowed as a temp variable */
@@ -1763,7 +1766,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v)
 
        switch (type) {
        case FILE_CPULIST:
-               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_allowed));
+               seq_printf(sf, "%*pbl\n", cpumask_pr_args(cs->cpus_requested));
                break;
        case FILE_MEMLIST:
                seq_printf(sf, "%*pbl\n", nodemask_pr_args(&cs->mems_allowed));
@@ -1953,11 +1956,14 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
                return ERR_PTR(-ENOMEM);
        if (!alloc_cpumask_var(&cs->cpus_allowed, GFP_KERNEL))
                goto free_cs;
+       if (!alloc_cpumask_var(&cs->cpus_requested, GFP_KERNEL))
+               goto free_allowed;
        if (!alloc_cpumask_var(&cs->effective_cpus, GFP_KERNEL))
-               goto free_cpus;
+               goto free_requested;
 
        set_bit(CS_SCHED_LOAD_BALANCE, &cs->flags);
        cpumask_clear(cs->cpus_allowed);
+       cpumask_clear(cs->cpus_requested);
        nodes_clear(cs->mems_allowed);
        cpumask_clear(cs->effective_cpus);
        nodes_clear(cs->effective_mems);
@@ -1966,7 +1972,9 @@ cpuset_css_alloc(struct cgroup_subsys_state *parent_css)
 
        return &cs->css;
 
-free_cpus:
+free_requested:
+       free_cpumask_var(cs->cpus_requested);
+free_allowed:
        free_cpumask_var(cs->cpus_allowed);
 free_cs:
        kfree(cs);
@@ -2029,6 +2037,7 @@ static int cpuset_css_online(struct cgroup_subsys_state *css)
        cs->mems_allowed = parent->mems_allowed;
        cs->effective_mems = parent->mems_allowed;
        cpumask_copy(cs->cpus_allowed, parent->cpus_allowed);
+       cpumask_copy(cs->cpus_requested, parent->cpus_requested);
        cpumask_copy(cs->effective_cpus, parent->cpus_allowed);
        spin_unlock_irq(&callback_lock);
 out_unlock:
@@ -2063,6 +2072,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css)
 
        free_cpumask_var(cs->effective_cpus);
        free_cpumask_var(cs->cpus_allowed);
+       free_cpumask_var(cs->cpus_requested);
        kfree(cs);
 }
 
@@ -2125,8 +2135,10 @@ int __init cpuset_init(void)
 
        BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_allowed, GFP_KERNEL));
        BUG_ON(!alloc_cpumask_var(&top_cpuset.effective_cpus, GFP_KERNEL));
+       BUG_ON(!alloc_cpumask_var(&top_cpuset.cpus_requested, GFP_KERNEL));
 
        cpumask_setall(top_cpuset.cpus_allowed);
+       cpumask_setall(top_cpuset.cpus_requested);
        nodes_setall(top_cpuset.mems_allowed);
        cpumask_setall(top_cpuset.effective_cpus);
        nodes_setall(top_cpuset.effective_mems);
@@ -2259,7 +2271,7 @@ retry:
                goto retry;
        }
 
-       cpumask_and(&new_cpus, cs->cpus_allowed, parent_cs(cs)->effective_cpus);
+       cpumask_and(&new_cpus, cs->cpus_requested, parent_cs(cs)->effective_cpus);
        nodes_and(new_mems, cs->mems_allowed, parent_cs(cs)->effective_mems);
 
        cpus_updated = !cpumask_equal(&new_cpus, cs->effective_cpus);
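
A brief illustration of the cpus_requested change (a sketch, not part of the patch; the cgroup mount point below is hypothetical): the mask written to a cpuset's cpus file is now preserved and read back verbatim, while cpus_allowed is the intersection with the currently active CPUs and is re-evaluated on hotplug.

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    /* Hypothetical cgroup-v1 cpuset path; adjust to the local mount point. */
    #define CPUS_FILE "/dev/cpuset/foreground/cpus"

    int main(void)
    {
            char buf[64];
            ssize_t n;
            int fd = open(CPUS_FILE, O_RDWR);

            if (fd < 0)
                    return 1;

            /* Request CPUs 0-7; CPUs that are present but offline stay recorded. */
            if (write(fd, "0-7", 3) < 0)
                    perror("write");

            lseek(fd, 0, SEEK_SET);
            n = read(fd, buf, sizeof(buf) - 1);
            if (n > 0) {
                    buf[n] = '\0';
                    /* With cpus_requested tracked, this reads back "0-7" even if
                     * only a subset is currently online. */
                    printf("cpuset cpus: %s", buf);
            }
            close(fd);
            return 0;
    }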
diff --git a/kernel/configs/android-fetch-configs.sh b/kernel/configs/android-fetch-configs.sh
new file mode 100755 (executable)
index 0000000..2dcd298
--- /dev/null
@@ -0,0 +1,4 @@
+#!/bin/sh
+
+curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.14.tar.gz | tar xzv
+
index f21bfa3172d8a0c460142ed3ba1c6095f0fdcb48..a42b9eaf57824ce7d5f37743eb4ca26ebb54a55a 100644 (file)
@@ -1133,6 +1133,7 @@ void __weak arch_enable_nonboot_cpus_end(void)
 void enable_nonboot_cpus(void)
 {
        int cpu, error;
+       struct device *cpu_device;
 
        /* Allow everyone to use the CPU hotplug again */
        cpu_maps_update_begin();
@@ -1150,6 +1151,12 @@ void enable_nonboot_cpus(void)
                trace_suspend_resume(TPS("CPU_ON"), cpu, false);
                if (!error) {
                        pr_info("CPU%d is up\n", cpu);
+                       cpu_device = get_cpu_device(cpu);
+                       if (!cpu_device)
+                               pr_err("%s: failed to get cpu%d device\n",
+                                      __func__, cpu);
+                       else
+                               kobject_uevent(&cpu_device->kobj, KOBJ_ONLINE);
                        continue;
                }
                pr_warn("Error taking CPU%d up: %d\n", cpu, error);
index ed5d34925ad0617a40aeed3774b0e393aec03e99..8d28e3062eab315c80bd8fca2643aca1c9ae5db1 100644 (file)
@@ -217,7 +217,7 @@ static char *kdb_read(char *buffer, size_t bufsize)
        int i;
        int diag, dtab_count;
        int key;
-
+       static int last_crlf;
 
        diag = kdbgetintenv("DTABCOUNT", &dtab_count);
        if (diag)
@@ -238,6 +238,9 @@ poll_again:
                return buffer;
        if (key != 9)
                tab = 0;
+       if (key != 10 && key != 13)
+               last_crlf = 0;
+
        switch (key) {
        case 8: /* backspace */
                if (cp > buffer) {
@@ -255,7 +258,12 @@ poll_again:
                        *cp = tmp;
                }
                break;
-       case 13: /* enter */
+       case 10: /* new line */
+       case 13: /* carriage return */
+               /* handle \n after \r */
+               if (last_crlf && last_crlf != key)
+                       break;
+               last_crlf = key;
                *lastchar++ = '\n';
                *lastchar++ = '\0';
                if (!KDB_STATE(KGDB_TRANS)) {
index 8c20af8738ac022d627d7a4a4ebffb4f7b182c5e..f37ff6c46cc92d7a23d7de5e845885362d8d9272 100644 (file)
@@ -397,8 +397,13 @@ static cpumask_var_t perf_online_mask;
  *   0 - disallow raw tracepoint access for unpriv
  *   1 - disallow cpu events for unpriv
  *   2 - disallow kernel profiling for unpriv
+ *   3 - disallow all unpriv perf event use
  */
+#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT
+int sysctl_perf_event_paranoid __read_mostly = 3;
+#else
 int sysctl_perf_event_paranoid __read_mostly = 2;
+#endif
 
 /* Minimum for 512 kiB + 1 user control page */
 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
@@ -9930,6 +9935,9 @@ SYSCALL_DEFINE5(perf_event_open,
        if (flags & ~PERF_FLAG_ALL)
                return -EINVAL;
 
+       if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
        err = perf_copy_attr(attr_uptr, &attr);
        if (err)
                return err;
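
For context (an illustrative sketch, not part of the patch): with CONFIG_SECURITY_PERF_EVENTS_RESTRICT the default paranoid level becomes 3, and perf_event_open() rejects unprivileged callers outright with -EACCES. A userspace check of the sysctl might look like:

    #include <stdio.h>

    int main(void)
    {
            FILE *f = fopen("/proc/sys/kernel/perf_event_paranoid", "r");
            int level;

            if (!f || fscanf(f, "%d", &level) != 1)
                    return 1;
            fclose(f);

            if (level >= 3)
                    printf("unprivileged perf_event_open() is disabled (level %d)\n", level);
            else
                    printf("perf_event_paranoid level is %d\n", level);
            return 0;
    }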
index a3f79f0eef3675917752d09b4aff2d0d04ba776a..5c1743d4d8ef3e482275863db2fee22601e4b5d9 100644 (file)
@@ -15,3 +15,5 @@ obj-$(CONFIG_PM_AUTOSLEEP)    += autosleep.o
 obj-$(CONFIG_PM_WAKELOCKS)     += wakelock.o
 
 obj-$(CONFIG_MAGIC_SYSRQ)      += poweroff.o
+
+obj-$(CONFIG_SUSPEND)  += wakeup_reason.o
index 7381d49a44db5a728dcf56bb3aa08a62b49887bd..c366e3d34a0721708b027e8c3a88d9bd6bb57184 100644 (file)
@@ -22,6 +22,7 @@
 #include <linux/kmod.h>
 #include <trace/events/power.h>
 #include <linux/cpuset.h>
+#include <linux/wakeup_reason.h>
 
 /*
  * Timeout for stopping processes
@@ -38,6 +39,9 @@ static int try_to_freeze_tasks(bool user_only)
        unsigned int elapsed_msecs;
        bool wakeup = false;
        int sleep_usecs = USEC_PER_MSEC;
+#ifdef CONFIG_PM_SLEEP
+       char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+#endif
 
        start = ktime_get_boottime();
 
@@ -67,6 +71,11 @@ static int try_to_freeze_tasks(bool user_only)
                        break;
 
                if (pm_wakeup_pending()) {
+#ifdef CONFIG_PM_SLEEP
+                       pm_get_active_wakeup_sources(suspend_abort,
+                               MAX_SUSPEND_ABORT_LEN);
+                       log_suspend_abort_reason(suspend_abort);
+#endif
                        wakeup = true;
                        break;
                }
@@ -85,26 +94,27 @@ static int try_to_freeze_tasks(bool user_only)
        elapsed = ktime_sub(end, start);
        elapsed_msecs = ktime_to_ms(elapsed);
 
-       if (todo) {
+       if (wakeup) {
                pr_cont("\n");
-               pr_err("Freezing of tasks %s after %d.%03d seconds "
-                      "(%d tasks refusing to freeze, wq_busy=%d):\n",
-                      wakeup ? "aborted" : "failed",
+               pr_err("Freezing of tasks aborted after %d.%03d seconds",
+                      elapsed_msecs / 1000, elapsed_msecs % 1000);
+       } else if (todo) {
+               pr_cont("\n");
+               pr_err("Freezing of tasks failed after %d.%03d seconds"
+                      " (%d tasks refusing to freeze, wq_busy=%d):\n",
                       elapsed_msecs / 1000, elapsed_msecs % 1000,
                       todo - wq_busy, wq_busy);
 
                if (wq_busy)
                        show_workqueue_state();
 
-               if (!wakeup) {
-                       read_lock(&tasklist_lock);
-                       for_each_process_thread(g, p) {
-                               if (p != current && !freezer_should_skip(p)
-                                   && freezing(p) && !frozen(p))
-                                       sched_show_task(p);
-                       }
-                       read_unlock(&tasklist_lock);
+               read_lock(&tasklist_lock);
+               for_each_process_thread(g, p) {
+                       if (p != current && !freezer_should_skip(p)
+                           && freezing(p) && !frozen(p))
+                               sched_show_task(p);
                }
+               read_unlock(&tasklist_lock);
        } else {
                pr_cont("(elapsed %d.%03d seconds) ", elapsed_msecs / 1000,
                        elapsed_msecs % 1000);
index 0685c44994314614780606c6f303e5035165a0af..511e79197ff81d7073ac786890677d4f808eaa1a 100644 (file)
@@ -31,6 +31,7 @@
 #include <trace/events/power.h>
 #include <linux/compiler.h>
 #include <linux/moduleparam.h>
+#include <linux/wakeup_reason.h>
 
 #include "power.h"
 
@@ -389,7 +390,8 @@ void __weak arch_suspend_enable_irqs(void)
  */
 static int suspend_enter(suspend_state_t state, bool *wakeup)
 {
-       int error;
+       char suspend_abort[MAX_SUSPEND_ABORT_LEN];
+       int error, last_dev;
 
        error = platform_suspend_prepare(state);
        if (error)
@@ -397,7 +399,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
        error = dpm_suspend_late(PMSG_SUSPEND);
        if (error) {
+               last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+               last_dev %= REC_FAILED_NUM;
                pr_err("late suspend of devices failed\n");
+               log_suspend_abort_reason("%s device failed to power down",
+                       suspend_stats.failed_devs[last_dev]);
                goto Platform_finish;
        }
        error = platform_suspend_prepare_late(state);
@@ -411,7 +417,11 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
 
        error = dpm_suspend_noirq(PMSG_SUSPEND);
        if (error) {
+               last_dev = suspend_stats.last_failed_dev + REC_FAILED_NUM - 1;
+               last_dev %= REC_FAILED_NUM;
                pr_err("noirq suspend of devices failed\n");
+               log_suspend_abort_reason("noirq suspend of %s device failed",
+                       suspend_stats.failed_devs[last_dev]);
                goto Platform_early_resume;
        }
        error = platform_suspend_prepare_noirq(state);
@@ -422,8 +432,10 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
                goto Platform_wake;
 
        error = disable_nonboot_cpus();
-       if (error || suspend_test(TEST_CPUS))
+       if (error || suspend_test(TEST_CPUS)) {
+               log_suspend_abort_reason("Disabling non-boot cpus failed");
                goto Enable_cpus;
+       }
 
        arch_suspend_disable_irqs();
        BUG_ON(!irqs_disabled());
@@ -438,6 +450,9 @@ static int suspend_enter(suspend_state_t state, bool *wakeup)
                        trace_suspend_resume(TPS("machine_suspend"),
                                state, false);
                } else if (*wakeup) {
+                       pm_get_active_wakeup_sources(suspend_abort,
+                               MAX_SUSPEND_ABORT_LEN);
+                       log_suspend_abort_reason(suspend_abort);
                        error = -EBUSY;
                }
                syscore_resume();
@@ -487,6 +502,7 @@ int suspend_devices_and_enter(suspend_state_t state)
        error = dpm_suspend_start(PMSG_SUSPEND);
        if (error) {
                pr_err("Some devices failed to suspend, or early wake event detected\n");
+               log_suspend_abort_reason("Some devices failed to suspend, or early wake event detected");
                goto Recover_platform;
        }
        suspend_test_finish("suspend devices");
diff --git a/kernel/power/wakeup_reason.c b/kernel/power/wakeup_reason.c
new file mode 100644 (file)
index 0000000..252611f
--- /dev/null
@@ -0,0 +1,225 @@
+/*
+ * kernel/power/wakeup_reason.c
+ *
+ * Logs the reasons which caused the kernel to resume from
+ * the suspend mode.
+ *
+ * Copyright (C) 2014 Google, Inc.
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/wakeup_reason.h>
+#include <linux/kernel.h>
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/io.h>
+#include <linux/kobject.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <linux/spinlock.h>
+#include <linux/notifier.h>
+#include <linux/suspend.h>
+
+
+#define MAX_WAKEUP_REASON_IRQS 32
+static int irq_list[MAX_WAKEUP_REASON_IRQS];
+static int irqcount;
+static bool suspend_abort;
+static char abort_reason[MAX_SUSPEND_ABORT_LEN];
+static struct kobject *wakeup_reason;
+static DEFINE_SPINLOCK(resume_reason_lock);
+
+static ktime_t last_monotime; /* monotonic time before last suspend */
+static ktime_t curr_monotime; /* monotonic time after last suspend */
+static ktime_t last_stime; /* monotonic boottime offset before last suspend */
+static ktime_t curr_stime; /* monotonic boottime offset after last suspend */
+
+static ssize_t last_resume_reason_show(struct kobject *kobj, struct kobj_attribute *attr,
+               char *buf)
+{
+       int irq_no, buf_offset = 0;
+       struct irq_desc *desc;
+       spin_lock(&resume_reason_lock);
+       if (suspend_abort) {
+               buf_offset = sprintf(buf, "Abort: %s", abort_reason);
+       } else {
+               for (irq_no = 0; irq_no < irqcount; irq_no++) {
+                       desc = irq_to_desc(irq_list[irq_no]);
+                       if (desc && desc->action && desc->action->name)
+                               buf_offset += sprintf(buf + buf_offset, "%d %s\n",
+                                               irq_list[irq_no], desc->action->name);
+                       else
+                               buf_offset += sprintf(buf + buf_offset, "%d\n",
+                                               irq_list[irq_no]);
+               }
+       }
+       spin_unlock(&resume_reason_lock);
+       return buf_offset;
+}
+
+static ssize_t last_suspend_time_show(struct kobject *kobj,
+                       struct kobj_attribute *attr, char *buf)
+{
+       struct timespec sleep_time;
+       struct timespec total_time;
+       struct timespec suspend_resume_time;
+
+       /*
+        * total_time is calculated from the monotonic boottime offsets because,
+        * unlike CLOCK_MONOTONIC, they include the time spent in the suspend state.
+        */
+       total_time = ktime_to_timespec(ktime_sub(curr_stime, last_stime));
+
+       /*
+        * suspend_resume_time is the monotonic (CLOCK_MONOTONIC) interval
+        * between just before entering suspend and just after resume.
+        */
+       suspend_resume_time = ktime_to_timespec(ktime_sub(curr_monotime, last_monotime));
+
+       /* sleep_time = total_time - suspend_resume_time */
+       sleep_time = timespec_sub(total_time, suspend_resume_time);
+
+       /* Export suspend_resume_time and sleep_time as a pair. */
+       return sprintf(buf, "%lu.%09lu %lu.%09lu\n",
+                               suspend_resume_time.tv_sec, suspend_resume_time.tv_nsec,
+                               sleep_time.tv_sec, sleep_time.tv_nsec);
+}
+
+static struct kobj_attribute resume_reason = __ATTR_RO(last_resume_reason);
+static struct kobj_attribute suspend_time = __ATTR_RO(last_suspend_time);
+
+static struct attribute *attrs[] = {
+       &resume_reason.attr,
+       &suspend_time.attr,
+       NULL,
+};
+static struct attribute_group attr_group = {
+       .attrs = attrs,
+};
+
+/*
+ * Logs the wakeup reason (IRQ) to the kernel log and
+ * stores the IRQ so it can be exposed to userspace via sysfs.
+ */
+void log_wakeup_reason(int irq)
+{
+       struct irq_desc *desc;
+       desc = irq_to_desc(irq);
+       if (desc && desc->action && desc->action->name)
+               printk(KERN_INFO "Resume caused by IRQ %d, %s\n", irq,
+                               desc->action->name);
+       else
+               printk(KERN_INFO "Resume caused by IRQ %d\n", irq);
+
+       spin_lock(&resume_reason_lock);
+       if (irqcount == MAX_WAKEUP_REASON_IRQS) {
+               spin_unlock(&resume_reason_lock);
+               printk(KERN_WARNING "Resume caused by more than %d IRQs\n",
+                               MAX_WAKEUP_REASON_IRQS);
+               return;
+       }
+
+       irq_list[irqcount++] = irq;
+       spin_unlock(&resume_reason_lock);
+}
+
+int check_wakeup_reason(int irq)
+{
+       int irq_no;
+       int ret = false;
+
+       spin_lock(&resume_reason_lock);
+       for (irq_no = 0; irq_no < irqcount; irq_no++)
+               if (irq_list[irq_no] == irq) {
+                       ret = true;
+                       break;
+       }
+       spin_unlock(&resume_reason_lock);
+       return ret;
+}
+
+void log_suspend_abort_reason(const char *fmt, ...)
+{
+       va_list args;
+
+       spin_lock(&resume_reason_lock);
+
+       /* Suspend abort reason has already been logged. */
+       if (suspend_abort) {
+               spin_unlock(&resume_reason_lock);
+               return;
+       }
+
+       suspend_abort = true;
+       va_start(args, fmt);
+       vsnprintf(abort_reason, MAX_SUSPEND_ABORT_LEN, fmt, args);
+       va_end(args);
+       spin_unlock(&resume_reason_lock);
+}
+
+/* Detects a suspend and clears all the previous wakeup reasons */
+static int wakeup_reason_pm_event(struct notifier_block *notifier,
+               unsigned long pm_event, void *unused)
+{
+       switch (pm_event) {
+       case PM_SUSPEND_PREPARE:
+               spin_lock(&resume_reason_lock);
+               irqcount = 0;
+               suspend_abort = false;
+               spin_unlock(&resume_reason_lock);
+               /* monotonic time since boot */
+               last_monotime = ktime_get();
+               /* monotonic time since boot including the time spent in suspend */
+               last_stime = ktime_get_boottime();
+               break;
+       case PM_POST_SUSPEND:
+               /* monotonic time since boot */
+               curr_monotime = ktime_get();
+               /* monotonic time since boot including the time spent in suspend */
+               curr_stime = ktime_get_boottime();
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block wakeup_reason_pm_notifier_block = {
+       .notifier_call = wakeup_reason_pm_event,
+};
+
+/*
+ * Initializes the sysfs attributes and registers the
+ * pm_event notifier.
+ */
+int __init wakeup_reason_init(void)
+{
+       int retval;
+
+       retval = register_pm_notifier(&wakeup_reason_pm_notifier_block);
+       if (retval)
+               printk(KERN_WARNING "[%s] failed to register PM notifier %d\n",
+                               __func__, retval);
+
+       wakeup_reason = kobject_create_and_add("wakeup_reasons", kernel_kobj);
+       if (!wakeup_reason) {
+               printk(KERN_WARNING "[%s] failed to create a sysfs kobject\n",
+                               __func__);
+               return 1;
+       }
+       retval = sysfs_create_group(wakeup_reason, &attr_group);
+       if (retval) {
+               kobject_put(wakeup_reason);
+               printk(KERN_WARNING "[%s] failed to create a sysfs group %d\n",
+                               __func__, retval);
+       }
+       return 0;
+}
+
+late_initcall(wakeup_reason_init);
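
A small usage sketch (editorial, not part of the patch; it assumes sysfs is mounted at /sys): the kobject created above exposes /sys/kernel/wakeup_reasons/last_resume_reason and /sys/kernel/wakeup_reasons/last_suspend_time; the latter prints the suspend/resume path time and the time actually spent asleep, in seconds, as a pair.

    #include <stdio.h>

    int main(void)
    {
            char reason[256];
            double path_time, sleep_time;
            FILE *f;

            f = fopen("/sys/kernel/wakeup_reasons/last_resume_reason", "r");
            if (f) {
                    /* One line per wakeup IRQ ("<irq> <name>"), or "Abort: <reason>". */
                    while (fgets(reason, sizeof(reason), f))
                            printf("wakeup: %s", reason);
                    fclose(f);
            }

            f = fopen("/sys/kernel/wakeup_reasons/last_suspend_time", "r");
            if (f) {
                    /* Pair printed by last_suspend_time_show(): suspend/resume path
                     * time and time spent asleep, both in seconds. */
                    if (fscanf(f, "%lf %lf", &path_time, &sleep_time) == 2)
                            printf("suspend path: %.3fs, asleep: %.3fs\n",
                                   path_time, sleep_time);
                    fclose(f);
            }
            return 0;
    }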
index a9ee16bbc6931a6e6cb8631f48bc6c19b39215b4..b9207a9caa8616bde019367347305ce2a9de73f1 100644 (file)
@@ -20,9 +20,12 @@ obj-y += core.o loadavg.o clock.o cputime.o
 obj-y += idle_task.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
+obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
+obj-$(CONFIG_SCHED_TUNE) += tune.o
 obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
 obj-$(CONFIG_CPU_FREQ) += cpufreq.o
 obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
index a43df51935386fcd48f9fad8905d722880c8f8c8..c80a47a161a732d79bec761014912131e271ac27 100644 (file)
@@ -260,7 +260,6 @@ out:
 }
 #endif /* CONFIG_PROC_FS */
 
-#ifdef CONFIG_SCHED_DEBUG
 int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
        if (!task_group_is_autogroup(tg))
@@ -268,4 +267,3 @@ int autogroup_path(struct task_group *tg, char *buf, int buflen)
 
        return snprintf(buf, buflen, "%s-%ld", "/autogroup", tg->autogroup->id);
 }
-#endif /* CONFIG_SCHED_DEBUG */
index 27cd22b8982405c5ef2f07c7c4ff50314d220f36..590184bbdc7f1c59f79701064deb148aef553f54 100644 (file)
@@ -56,11 +56,9 @@ autogroup_task_group(struct task_struct *p, struct task_group *tg)
        return tg;
 }
 
-#ifdef CONFIG_SCHED_DEBUG
 static inline int autogroup_path(struct task_group *tg, char *buf, int buflen)
 {
        return 0;
 }
-#endif
 
 #endif /* CONFIG_SCHED_AUTOGROUP */
index 55062461b2fd16fd8c935a5099b4fab22418a8e5..15d9be654a55cc3a2301d7b2092d446f4a3b8d30 100644 (file)
@@ -39,6 +39,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
+#include "walt.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -438,6 +439,8 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
        if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
                return;
 
+       head->count++;
+
        get_task_struct(task);
 
        /*
@@ -447,6 +450,10 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task)
        head->lastp = &node->next;
 }
 
+static int
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+              int sibling_count_hint);
+
 void wake_up_q(struct wake_q_head *head)
 {
        struct wake_q_node *node = head->first;
@@ -461,10 +468,10 @@ void wake_up_q(struct wake_q_head *head)
                task->wake_q.next = NULL;
 
                /*
-                * wake_up_process() implies a wmb() to pair with the queueing
+                * try_to_wake_up() implies a wmb() to pair with the queueing
                 * in wake_q_add() so as not to miss wakeups.
                 */
-               wake_up_process(task);
+               try_to_wake_up(task, TASK_NORMAL, 0, head->count);
                put_task_struct(task);
        }
 }
@@ -1185,6 +1192,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
+
+               walt_fixup_busy_time(p, new_cpu);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -1535,12 +1544,14 @@ out:
  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
  */
 static inline
-int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags)
+int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
+                  int sibling_count_hint)
 {
        lockdep_assert_held(&p->pi_lock);
 
        if (p->nr_cpus_allowed > 1)
-               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags);
+               cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
+                                                    sibling_count_hint);
        else
                cpu = cpumask_any(&p->cpus_allowed);
 
@@ -1947,11 +1958,33 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *
  */
 
+#ifdef CONFIG_SMP
+#ifdef CONFIG_SCHED_WALT
+/* utility function to update walt signals at wakeup */
+static inline void walt_try_to_wake_up(struct task_struct *p)
+{
+       struct rq *rq = cpu_rq(task_cpu(p));
+       struct rq_flags rf;
+       u64 wallclock;
+
+       rq_lock_irqsave(rq, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+       rq_unlock_irqrestore(rq, &rf);
+}
+#else
+#define walt_try_to_wake_up(a) {}
+#endif
+#endif
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
  * @state: the mask of task states that can be woken
  * @wake_flags: wake modifier flags (WF_*)
+ * @sibling_count_hint: A hint at the number of threads that are being woken up
+ *                      in this event.
  *
  * If (@state & @p->state) @p->state = TASK_RUNNING.
  *
@@ -1964,7 +1997,8 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *        %false otherwise.
  */
 static int
-try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
+try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
+              int sibling_count_hint)
 {
        unsigned long flags;
        int cpu, success = 0;
@@ -2042,6 +2076,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
+       walt_try_to_wake_up(p);
+
        p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;
 
@@ -2050,7 +2086,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
                atomic_dec(&task_rq(p)->nr_iowait);
        }
 
-       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags);
+       cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
+                            sibling_count_hint);
        if (task_cpu(p) != cpu) {
                wake_flags |= WF_MIGRATED;
                set_task_cpu(p, cpu);
@@ -2111,6 +2148,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
        trace_sched_waking(p);
 
        if (!task_on_rq_queued(p)) {
+               u64 wallclock = walt_ktime_clock();
+
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+               walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+
                if (p->in_iowait) {
                        delayacct_blkio_end(p);
                        atomic_dec(&rq->nr_iowait);
@@ -2138,13 +2180,13 @@ out:
  */
 int wake_up_process(struct task_struct *p)
 {
-       return try_to_wake_up(p, TASK_NORMAL, 0);
+       return try_to_wake_up(p, TASK_NORMAL, 0, 1);
 }
 EXPORT_SYMBOL(wake_up_process);
 
 int wake_up_state(struct task_struct *p, unsigned int state)
 {
-       return try_to_wake_up(p, state, 0);
+       return try_to_wake_up(p, state, 0, 1);
 }
 
 /*
@@ -2163,7 +2205,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
+#ifdef CONFIG_SCHED_WALT
+       p->last_sleep_ts                = 0;
+#endif
+
        INIT_LIST_HEAD(&p->se.group_node);
+       walt_init_new_task_load(p);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq                    = NULL;
@@ -2440,6 +2487,9 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+       walt_init_new_task_load(p);
+
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
        /*
@@ -2450,13 +2500,15 @@ void wake_up_new_task(struct task_struct *p)
         * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
         * as we're not fully set-up yet.
         */
-       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
 #endif
        rq = __task_rq_lock(p, &rf);
        update_rq_clock(rq);
        post_init_entity_util_avg(&p->se);
 
        activate_task(rq, p, ENQUEUE_NOCLOCK);
+       walt_mark_task_starting(p);
+
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -2911,7 +2963,7 @@ void sched_exec(void)
        int dest_cpu;
 
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0);
+       dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
        if (dest_cpu == smp_processor_id())
                goto unlock;
 
@@ -3010,6 +3062,9 @@ void scheduler_tick(void)
 
        rq_lock(rq, &rf);
 
+       walt_set_window_start(rq, &rf);
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                       walt_ktime_clock(), 0);
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
@@ -3281,6 +3336,7 @@ static void __sched notrace __schedule(bool preempt)
        struct rq_flags rf;
        struct rq *rq;
        int cpu;
+       u64 wallclock;
 
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -3336,10 +3392,17 @@ static void __sched notrace __schedule(bool preempt)
        }
 
        next = pick_next_task(rq, prev, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+       walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
 
        if (likely(prev != next)) {
+#ifdef CONFIG_SCHED_WALT
+               if (!prev->on_rq)
+                       prev->last_sleep_ts = wallclock;
+#endif
                rq->nr_switches++;
                rq->curr = next;
                /*
@@ -3615,7 +3678,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void)
 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
                          void *key)
 {
-       return try_to_wake_up(curr->private, mode, wake_flags);
+       return try_to_wake_up(curr->private, mode, wake_flags, 1);
 }
 EXPORT_SYMBOL(default_wake_function);
 
@@ -5691,6 +5754,9 @@ int sched_cpu_dying(unsigned int cpu)
        sched_ttwu_pending();
 
        rq_lock_irqsave(rq, &rf);
+
+       walt_migrate_sync_cpu(cpu);
+
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
@@ -5916,12 +5982,18 @@ void __init sched_init(void)
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+               rq->cur_irqload = 0;
+               rq->avg_irqload = 0;
+               rq->irqload_ts = 0;
+#endif
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
                rq_attach_root(rq, &def_root_domain);
 #ifdef CONFIG_NO_HZ_COMMON
                rq->last_load_update_tick = jiffies;
+               rq->last_blocked_load_update_tick = jiffies;
                rq->nohz_flags = 0;
 #endif
 #ifdef CONFIG_NO_HZ_FULL
index d6717a3331a1b21bd1be02f034596ab293e8ceac..1f525acba1fb49cf550a0da44a3bb300131ff2e2 100644 (file)
 
 #include "sched.h"
 
+unsigned long boosted_cpu_util(int cpu);
+
 #define SUGOV_KTHREAD_PRIORITY 50
 
 struct sugov_tunables {
        struct gov_attr_set attr_set;
-       unsigned int rate_limit_us;
+       unsigned int up_rate_limit_us;
+       unsigned int down_rate_limit_us;
 };
 
 struct sugov_policy {
@@ -34,7 +37,9 @@ struct sugov_policy {
 
        raw_spinlock_t update_lock;  /* For shared policies */
        u64 last_freq_update_time;
-       s64 freq_update_delay_ns;
+       s64 min_rate_limit_ns;
+       s64 up_rate_delay_ns;
+       s64 down_rate_delay_ns;
        unsigned int next_freq;
        unsigned int cached_raw_freq;
 
@@ -111,8 +116,32 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
                return true;
        }
 
+       /*
+        * No need to recalculate the next frequency for at least
+        * min_rate_limit_us. However, we might still decide to further
+        * rate limit once the direction of the frequency change is
+        * known, according to the separate up/down rate limits.
+        */
+
        delta_ns = time - sg_policy->last_freq_update_time;
-       return delta_ns >= sg_policy->freq_update_delay_ns;
+       return delta_ns >= sg_policy->min_rate_limit_ns;
+}
+
+static bool sugov_up_down_rate_limit(struct sugov_policy *sg_policy, u64 time,
+                                    unsigned int next_freq)
+{
+       s64 delta_ns;
+
+       delta_ns = time - sg_policy->last_freq_update_time;
+
+       if (next_freq > sg_policy->next_freq &&
+           delta_ns < sg_policy->up_rate_delay_ns)
+                       return true;
+
+       if (next_freq < sg_policy->next_freq &&
+           delta_ns < sg_policy->down_rate_delay_ns)
+                       return true;
+
+       return false;
 }
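
To make the gating concrete (illustrative values, not from this patch): with up_rate_limit_us = 500 and down_rate_limit_us = 20000, a request to raise the frequency arriving 300 us after the last committed update is reported as rate limited here and dropped by sugov_update_commit(), while the same request 600 us later is allowed; a request to lower the frequency stays blocked for the full 20 ms window.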
 
 static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
@@ -123,6 +152,9 @@ static void sugov_update_commit(struct sugov_policy *sg_policy, u64 time,
        if (sg_policy->next_freq == next_freq)
                return;
 
+       if (sugov_up_down_rate_limit(sg_policy, time, next_freq))
+               return;
+
        sg_policy->next_freq = next_freq;
        sg_policy->last_freq_update_time = time;
 
@@ -178,13 +210,15 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy,
 
 static void sugov_get_util(unsigned long *util, unsigned long *max, int cpu)
 {
-       struct rq *rq = cpu_rq(cpu);
-       unsigned long cfs_max;
+       unsigned long max_cap, rt;
+
+       max_cap = arch_scale_cpu_capacity(NULL, cpu);
 
-       cfs_max = arch_scale_cpu_capacity(NULL, cpu);
+       rt = sched_get_rt_rq_util(cpu);
 
-       *util = min(rq->cfs.avg.util_avg, cfs_max);
-       *max = cfs_max;
+       *util = boosted_cpu_util(cpu) + rt;
+       *util = min(*util, max_cap);
+       *max = max_cap;
 }
 
 static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time,
@@ -272,7 +306,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
 
        busy = sugov_cpu_is_busy(sg_cpu);
 
-       if (flags & SCHED_CPUFREQ_RT_DL) {
+       if (flags & SCHED_CPUFREQ_DL) {
                next_f = policy->cpuinfo.max_freq;
        } else {
                sugov_get_util(&util, &max, sg_cpu->cpu);
@@ -289,6 +323,7 @@ static void sugov_update_single(struct update_util_data *hook, u64 time,
                        sg_policy->cached_raw_freq = 0;
                }
        }
+
        sugov_update_commit(sg_policy, time, next_f);
 }
 
@@ -317,7 +352,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time)
                        j_sg_cpu->iowait_boost_pending = false;
                        continue;
                }
-               if (j_sg_cpu->flags & SCHED_CPUFREQ_RT_DL)
+               if (j_sg_cpu->flags & SCHED_CPUFREQ_DL)
                        return policy->cpuinfo.max_freq;
 
                j_util = j_sg_cpu->util;
@@ -353,7 +388,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time,
        sg_cpu->last_update = time;
 
        if (sugov_should_update_freq(sg_policy, time)) {
-               if (flags & SCHED_CPUFREQ_RT_DL)
+               if (flags & SCHED_CPUFREQ_DL)
                        next_f = sg_policy->policy->cpuinfo.max_freq;
                else
                        next_f = sugov_next_freq_shared(sg_cpu, time);
@@ -408,15 +443,52 @@ static inline struct sugov_tunables *to_sugov_tunables(struct gov_attr_set *attr
        return container_of(attr_set, struct sugov_tunables, attr_set);
 }
 
-static ssize_t rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+static DEFINE_MUTEX(min_rate_lock);
+
+static void update_min_rate_limit_ns(struct sugov_policy *sg_policy)
+{
+       mutex_lock(&min_rate_lock);
+       sg_policy->min_rate_limit_ns = min(sg_policy->up_rate_delay_ns,
+                                          sg_policy->down_rate_delay_ns);
+       mutex_unlock(&min_rate_lock);
+}
+
+static ssize_t up_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
 {
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
 
-       return sprintf(buf, "%u\n", tunables->rate_limit_us);
+       return sprintf(buf, "%u\n", tunables->up_rate_limit_us);
 }
 
-static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *buf,
-                                  size_t count)
+static ssize_t down_rate_limit_us_show(struct gov_attr_set *attr_set, char *buf)
+{
+       struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+
+       return sprintf(buf, "%u\n", tunables->down_rate_limit_us);
+}
+
+static ssize_t up_rate_limit_us_store(struct gov_attr_set *attr_set,
+                                     const char *buf, size_t count)
+{
+       struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
+       struct sugov_policy *sg_policy;
+       unsigned int rate_limit_us;
+
+       if (kstrtouint(buf, 10, &rate_limit_us))
+               return -EINVAL;
+
+       tunables->up_rate_limit_us = rate_limit_us;
+
+       list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+               sg_policy->up_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+               update_min_rate_limit_ns(sg_policy);
+       }
+
+       return count;
+}
+
+static ssize_t down_rate_limit_us_store(struct gov_attr_set *attr_set,
+                                       const char *buf, size_t count)
 {
        struct sugov_tunables *tunables = to_sugov_tunables(attr_set);
        struct sugov_policy *sg_policy;
@@ -425,18 +497,22 @@ static ssize_t rate_limit_us_store(struct gov_attr_set *attr_set, const char *bu
        if (kstrtouint(buf, 10, &rate_limit_us))
                return -EINVAL;
 
-       tunables->rate_limit_us = rate_limit_us;
+       tunables->down_rate_limit_us = rate_limit_us;
 
-       list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook)
-               sg_policy->freq_update_delay_ns = rate_limit_us * NSEC_PER_USEC;
+       list_for_each_entry(sg_policy, &attr_set->policy_list, tunables_hook) {
+               sg_policy->down_rate_delay_ns = rate_limit_us * NSEC_PER_USEC;
+               update_min_rate_limit_ns(sg_policy);
+       }
 
        return count;
 }
 
-static struct governor_attr rate_limit_us = __ATTR_RW(rate_limit_us);
+static struct governor_attr up_rate_limit_us = __ATTR_RW(up_rate_limit_us);
+static struct governor_attr down_rate_limit_us = __ATTR_RW(down_rate_limit_us);
 
 static struct attribute *sugov_attributes[] = {
-       &rate_limit_us.attr,
+       &up_rate_limit_us.attr,
+       &down_rate_limit_us.attr,
        NULL
 };
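
Assuming the usual schedutil sysfs layout (the path itself is not part of this patch), these two tunables replace the single rate_limit_us file and take decimal microsecond values, e.g. via /sys/devices/system/cpu/cpufreq/policy0/schedutil/up_rate_limit_us and down_rate_limit_us; each store updates the corresponding *_rate_delay_ns and recomputes min_rate_limit_ns for every policy attached to the tunable set.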
 
@@ -583,7 +659,8 @@ static int sugov_init(struct cpufreq_policy *policy)
                goto stop_kthread;
        }
 
-       tunables->rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+       tunables->up_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
+       tunables->down_rate_limit_us = cpufreq_policy_transition_delay_us(policy);
 
        policy->governor_data = sg_policy;
        sg_policy->tunables = tunables;
@@ -642,7 +719,11 @@ static int sugov_start(struct cpufreq_policy *policy)
        struct sugov_policy *sg_policy = policy->governor_data;
        unsigned int cpu;
 
-       sg_policy->freq_update_delay_ns = sg_policy->tunables->rate_limit_us * NSEC_PER_USEC;
+       sg_policy->up_rate_delay_ns =
+               sg_policy->tunables->up_rate_limit_us * NSEC_PER_USEC;
+       sg_policy->down_rate_delay_ns =
+               sg_policy->tunables->down_rate_limit_us * NSEC_PER_USEC;
+       update_min_rate_limit_ns(sg_policy);
        sg_policy->last_freq_update_time = 0;
        sg_policy->next_freq = UINT_MAX;
        sg_policy->work_in_progress = false;
@@ -655,7 +736,7 @@ static int sugov_start(struct cpufreq_policy *policy)
                memset(sg_cpu, 0, sizeof(*sg_cpu));
                sg_cpu->cpu = cpu;
                sg_cpu->sg_policy = sg_policy;
-               sg_cpu->flags = SCHED_CPUFREQ_RT;
+               sg_cpu->flags = SCHED_CPUFREQ_DL;
                sg_cpu->iowait_boost_max = policy->cpuinfo.max_freq;
        }
 
index 14d2dbf97c531db0dd0c1bba3113a35cbe8d3bd9..029b505aca494c58ec224eeb1c6bead63450c169 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/context_tracking.h>
 #include <linux/sched/cputime.h>
 #include "sched.h"
+#include "walt.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
@@ -55,11 +56,18 @@ void irqtime_account_irq(struct task_struct *curr)
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        s64 delta;
        int cpu;
+#ifdef CONFIG_SCHED_WALT
+       u64 wallclock;
+       bool account = true;
+#endif
 
        if (!sched_clock_irqtime)
                return;
 
        cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+       wallclock = sched_clock_cpu(cpu);
+#endif
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
 
@@ -73,6 +81,13 @@ void irqtime_account_irq(struct task_struct *curr)
                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+#ifdef CONFIG_SCHED_WALT
+       else
+               account = false;
+
+       if (account)
+               walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
index 4ae5c1ea90e26bea2253124a8bc17cc3f84c1ce0..f982a3fa825393319047764690dfe21ce3e15fb5 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/slab.h>
 #include <uapi/linux/sched/types.h>
 
+#include "walt.h"
+
 struct dl_bandwidth def_dl_bandwidth;
 
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -1290,6 +1292,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_prio(prio));
        dl_rq->dl_nr_running++;
        add_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        inc_dl_deadline(dl_rq, deadline);
        inc_dl_migration(dl_se, dl_rq);
@@ -1304,6 +1307,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_rq->dl_nr_running);
        dl_rq->dl_nr_running--;
        sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        dec_dl_deadline(dl_rq, dl_se->deadline);
        dec_dl_migration(dl_se, dl_rq);
@@ -1505,7 +1509,8 @@ static void yield_task_dl(struct rq *rq)
 static int find_later_rq(struct task_struct *task);
 
 static int
-select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_dl(struct task_struct *p, int cpu, int sd_flag, int flags,
+                 int sibling_count_hint)
 {
        struct task_struct *curr;
        struct rq *rq;
@@ -2017,7 +2022,9 @@ retry:
        deactivate_task(rq, next_task, 0);
        sub_running_bw(next_task->dl.dl_bw, &rq->dl);
        sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, later_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
        add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
        activate_task(later_rq, next_task, 0);
@@ -2109,7 +2116,9 @@ static void pull_dl_task(struct rq *this_rq)
                        deactivate_task(src_rq, p, 0);
                        sub_running_bw(p->dl.dl_bw, &src_rq->dl);
                        sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        add_rq_bw(p->dl.dl_bw, &this_rq->dl);
                        add_running_bw(p->dl.dl_bw, &this_rq->dl);
                        activate_task(this_rq, p, 0);
index 2f93e4a2d9f623915d0023f9b3a7d8b7d7b95cf7..0a93f253673c2df477825d202c652bafe14b412e 100644 (file)
@@ -266,10 +266,61 @@ set_table_entry(struct ctl_table *entry,
        }
 }
 
+static struct ctl_table *
+sd_alloc_ctl_energy_table(struct sched_group_energy *sge)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(5);
+
+       if (table == NULL)
+               return NULL;
+
+       set_table_entry(&table[0], "nr_idle_states", &sge->nr_idle_states,
+                       sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[1], "idle_states", &sge->idle_states[0].power,
+                       sge->nr_idle_states*sizeof(struct idle_state), 0644,
+                       proc_doulongvec_minmax, false);
+       set_table_entry(&table[2], "nr_cap_states", &sge->nr_cap_states,
+                       sizeof(int), 0644, proc_dointvec_minmax, false);
+       set_table_entry(&table[3], "cap_states", &sge->cap_states[0].cap,
+                       sge->nr_cap_states*sizeof(struct capacity_state), 0644,
+                       proc_doulongvec_minmax, false);
+
+       return table;
+}
+
+static struct ctl_table *
+sd_alloc_ctl_group_table(struct sched_group *sg)
+{
+       struct ctl_table *table = sd_alloc_ctl_entry(2);
+
+       if (table == NULL)
+               return NULL;
+
+       table->procname = kstrdup("energy", GFP_KERNEL);
+       table->mode = 0555;
+       table->child = sd_alloc_ctl_energy_table((struct sched_group_energy *)sg->sge);
+
+       return table;
+}
+
 static struct ctl_table *
 sd_alloc_ctl_domain_table(struct sched_domain *sd)
 {
-       struct ctl_table *table = sd_alloc_ctl_entry(14);
+       struct ctl_table *table;
+       unsigned int nr_entries = 14;
+
+       int i = 0;
+       struct sched_group *sg = sd->groups;
+
+       if (sg->sge) {
+               int nr_sgs = 0;
+
+               do {} while (nr_sgs++, sg = sg->next, sg != sd->groups);
+
+               nr_entries += nr_sgs;
+       }
+
+       table = sd_alloc_ctl_entry(nr_entries);
 
        if (table == NULL)
                return NULL;
@@ -302,7 +353,19 @@ sd_alloc_ctl_domain_table(struct sched_domain *sd)
                sizeof(long), 0644, proc_doulongvec_minmax, false);
        set_table_entry(&table[12], "name", sd->name,
                CORENAME_MAX_SIZE, 0444, proc_dostring, false);
-       /* &table[13] is terminator */
+       sg = sd->groups;
+       if (sg->sge) {
+               char buf[32];
+               struct ctl_table *entry = &table[13];
+
+               do {
+                       snprintf(buf, 32, "group%d", i);
+                       entry->procname = kstrdup(buf, GFP_KERNEL);
+                       entry->mode = 0555;
+                       entry->child = sd_alloc_ctl_group_table(sg);
+               } while (entry++, i++, sg = sg->next, sg != sd->groups);
+       }
+       /* &table[nr_entries-1] is terminator */
 
        return table;
 }
diff --git a/kernel/sched/energy.c b/kernel/sched/energy.c
new file mode 100644 (file)
index 0000000..e82248a
--- /dev/null
@@ -0,0 +1,145 @@
+/*
+ * Obtain energy cost data from DT and populate relevant scheduler data
+ * structures.
+ *
+ * Copyright (C) 2015 ARM Ltd.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+#define pr_fmt(fmt) "sched-energy: " fmt
+
+#define DEBUG
+
+#include <linux/gfp.h>
+#include <linux/of.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/topology.h>
+#include <linux/sched_energy.h>
+#include <linux/stddef.h>
+#include <linux/arch_topology.h>
+
+struct sched_group_energy *sge_array[NR_CPUS][NR_SD_LEVELS];
+
+static void free_resources(void)
+{
+       int cpu, sd_level;
+       struct sched_group_energy *sge;
+
+       for_each_possible_cpu(cpu) {
+               for_each_possible_sd_level(sd_level) {
+                       sge = sge_array[cpu][sd_level];
+                       if (sge) {
+                               kfree(sge->cap_states);
+                               kfree(sge->idle_states);
+                               kfree(sge);
+                       }
+               }
+       }
+}
+
+static inline unsigned long cpu_max_capacity(int cpu)
+{
+       if (!sge_array[cpu][0]->cap_states)
+               return 1024;
+       if (!sge_array[cpu][0]->nr_cap_states)
+               return 1024;
+
+       return sge_array[cpu][0]->cap_states[sge_array[cpu][0]->nr_cap_states-1].cap;
+}
+
+int sched_energy_installed(int cpu)
+{
+       return (sge_array[cpu][0]->cap_states != NULL);
+}
+
+void init_sched_energy_costs(void)
+{
+       struct device_node *cn, *cp;
+       struct capacity_state *cap_states;
+       struct idle_state *idle_states;
+       struct sched_group_energy *sge;
+       const struct property *prop;
+       int sd_level, i, nstates, cpu;
+       const __be32 *val;
+
+       for_each_possible_cpu(cpu) {
+               cn = of_get_cpu_node(cpu, NULL);
+               if (!cn) {
+                       pr_warn("CPU device node missing for CPU %d\n", cpu);
+                       return;
+               }
+
+               if (!of_find_property(cn, "sched-energy-costs", NULL)) {
+                       pr_warn("CPU device node has no sched-energy-costs\n");
+                       return;
+               }
+
+               for_each_possible_sd_level(sd_level) {
+                       cp = of_parse_phandle(cn, "sched-energy-costs", sd_level);
+                       if (!cp)
+                               break;
+
+                       prop = of_find_property(cp, "busy-cost-data", NULL);
+                       if (!prop || !prop->value) {
+                               pr_warn("No busy-cost data, skipping sched_energy init\n");
+                               goto out;
+                       }
+
+                       sge = kcalloc(1, sizeof(struct sched_group_energy),
+                                     GFP_NOWAIT);
+
+                       nstates = (prop->length / sizeof(u32)) / 2;
+                       cap_states = kcalloc(nstates,
+                                            sizeof(struct capacity_state),
+                                            GFP_NOWAIT);
+
+                       for (i = 0, val = prop->value; i < nstates; i++) {
+                               cap_states[i].cap = be32_to_cpup(val++);
+                               cap_states[i].power = be32_to_cpup(val++);
+                       }
+
+                       sge->nr_cap_states = nstates;
+                       sge->cap_states = cap_states;
+
+                       prop = of_find_property(cp, "idle-cost-data", NULL);
+                       if (!prop || !prop->value) {
+                               pr_warn("No idle-cost data, skipping sched_energy init\n");
+                               goto out;
+                       }
+
+                       nstates = (prop->length / sizeof(u32));
+                       idle_states = kcalloc(nstates,
+                                             sizeof(struct idle_state),
+                                             GFP_NOWAIT);
+
+                       for (i = 0, val = prop->value; i < nstates; i++)
+                               idle_states[i].power = be32_to_cpup(val++);
+
+                       sge->nr_idle_states = nstates;
+                       sge->idle_states = idle_states;
+
+                       sge_array[cpu][sd_level] = sge;
+
+                       /* populate cpu scale so that flags get set correctly */
+                       if (sd_level == 0)
+                               topology_set_cpu_scale(cpu, cpu_max_capacity(cpu));
+               }
+       }
+
+       pr_info("Sched-energy-costs installed from DT\n");
+       return;
+
+out:
+       free_resources();
+}
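
A minimal sketch of the device-tree layout this parser expects, with illustrative node names and invented numbers: each CPU node carries a sched-energy-costs phandle list with one entry per sched-domain level, and each referenced node supplies busy-cost-data as <capacity power> pairs plus idle-cost-data as one power value per idle state:

    cpu0: cpu@0 {
            sched-energy-costs = <&CPU_COST_0 &CLUSTER_COST_0>;
    };

    CPU_COST_0: core-cost-0 {
            busy-cost-data = <256 40>, <512 110>, <1024 450>;
            idle-cost-data = <10 0>;
    };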
index 5c09ddf8c8321ca1aa18a6ff7f78d3d2937ec92e..541f482650f70431fe3c84674a40fae3cf01cef7 100644 (file)
@@ -37,6 +37,8 @@
 #include <trace/events/sched.h>
 
 #include "sched.h"
+#include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
 unsigned int sysctl_sched_latency                      = 6000000ULL;
 unsigned int normalized_sysctl_sched_latency           = 6000000ULL;
 
+/*
+ * Enable/disable honoring sync flag in energy-aware wakeups.
+ */
+unsigned int sysctl_sched_sync_hint_enable = 1;
+/*
+ * Enable/disable using cstate knowledge in idle sibling selection
+ */
+unsigned int sysctl_sched_cstate_aware = 1;
+
 /*
  * The initial- and re-scaling of tunables is configurable
  *
@@ -100,6 +111,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity    = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * For asym packing, by default the lower numbered cpu has higher priority.
@@ -966,6 +984,7 @@ update_stats_enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
                        }
 
                        trace_sched_stat_blocked(tsk, delta);
+                       trace_sched_blocked_reason(tsk);
 
                        /*
                         * Blocking time is in units of nanosecs, so shift by
@@ -1430,7 +1449,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -2811,6 +2829,9 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                 * See cpu_util().
                 */
                cpufreq_update_util(rq, 0);
+#ifdef CONFIG_SMP
+               trace_sched_load_avg_cpu(cpu_of(rq), cfs_rq);
+#endif
        }
 }
 
@@ -2967,7 +2988,8 @@ accumulate_sum(u64 delta, int cpu, struct sched_avg *sa,
  */
 static __always_inline int
 ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
-                 unsigned long weight, int running, struct cfs_rq *cfs_rq)
+                 unsigned long weight, int running, struct cfs_rq *cfs_rq,
+                 struct rt_rq *rt_rq)
 {
        u64 delta;
 
@@ -3023,13 +3045,22 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        }
        sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
 
+       if (cfs_rq) {
+               trace_sched_load_cfs_rq(cfs_rq);
+       } else {
+               if (likely(!rt_rq))
+                       trace_sched_load_se(container_of(sa, struct sched_entity, avg));
+               else
+                       trace_sched_load_rt_rq(cpu, rt_rq);
+       }
+
        return 1;
 }
 
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
-       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL);
+       return ___update_load_avg(now, cpu, &se->avg, 0, 0, NULL, NULL);
 }
 
 static int
@@ -3037,7 +3068,7 @@ __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entit
 {
        return ___update_load_avg(now, cpu, &se->avg,
                                  se->on_rq * scale_load_down(se->load.weight),
-                                 cfs_rq->curr == se, NULL);
+                                 cfs_rq->curr == se, NULL, NULL);
 }
 
 static int
@@ -3045,7 +3076,7 @@ __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq)
 {
        return ___update_load_avg(now, cpu, &cfs_rq->avg,
                        scale_load_down(cfs_rq->load.weight),
-                       cfs_rq->curr != NULL, cfs_rq);
+                       cfs_rq->curr != NULL, cfs_rq, NULL);
 }
 
 /*
@@ -3098,6 +3129,8 @@ static inline void update_tg_load_avg(struct cfs_rq *cfs_rq, int force)
                atomic_long_add(delta, &cfs_rq->tg->load_avg);
                cfs_rq->tg_load_avg_contrib = cfs_rq->avg.load_avg;
        }
+
+       trace_sched_load_tg(cfs_rq);
 }
 
 /*
@@ -3266,6 +3299,9 @@ static inline int propagate_entity_load_avg(struct sched_entity *se)
        update_tg_cfs_util(cfs_rq, se);
        update_tg_cfs_load(cfs_rq, se);
 
+       trace_sched_load_cfs_rq(cfs_rq);
+       trace_sched_load_se(se);
+
        return 1;
 }
 
@@ -3380,6 +3416,21 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        return decayed || removed_load;
 }
 
+int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
+{
+       int ret;
+
+       ret = ___update_load_avg(now, cpu, &rt_rq->avg, 0, running, NULL, rt_rq);
+
+       return ret;
+}
+
+unsigned long sched_get_rt_rq_util(int cpu)
+{
+       struct rt_rq *rt_rq = &(cpu_rq(cpu)->rt);
+       return rt_rq->avg.util_avg;
+}
+
 /*
  * Optional action to be done while updating the load average
  */
@@ -3427,6 +3478,8 @@ static void attach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        set_tg_cfs_propagate(cfs_rq);
 
        cfs_rq_util_change(cfs_rq);
+
+       trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /**
@@ -3447,6 +3500,8 @@ static void detach_entity_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *s
        set_tg_cfs_propagate(cfs_rq);
 
        cfs_rq_util_change(cfs_rq);
+
+       trace_sched_load_cfs_rq(cfs_rq);
 }
 
 /* Add the load generated by se into cfs_rq's load average */
@@ -3551,6 +3606,11 @@ update_cfs_rq_load_avg(u64 now, struct cfs_rq *cfs_rq)
        return 0;
 }
 
+int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running)
+{
+       return 0;
+}
+
 #define UPDATE_TG      0x0
 #define SKIP_AGE_LOAD  0x0
 
@@ -4870,6 +4930,48 @@ static inline void hrtick_update(struct rq *rq)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static bool cpu_overutilized(int cpu);
+
+static unsigned long cpu_util(int cpu);
+
+static bool sd_overutilized(struct sched_domain *sd)
+{
+       return sd->shared->overutilized;
+}
+
+static void set_sd_overutilized(struct sched_domain *sd)
+{
+       trace_sched_overutilized(sd, sd->shared->overutilized, true);
+       sd->shared->overutilized = true;
+}
+
+static void clear_sd_overutilized(struct sched_domain *sd)
+{
+       trace_sched_overutilized(sd, sd->shared->overutilized, false);
+       sd->shared->overutilized = false;
+}
+
+static inline void update_overutilized_status(struct rq *rq)
+{
+       struct sched_domain *sd;
+
+       rcu_read_lock();
+       sd = rcu_dereference(rq->sd);
+       if (sd && !sd_overutilized(sd) &&
+           cpu_overutilized(rq->cpu))
+               set_sd_overutilized(sd);
+       rcu_read_unlock();
+}
+
+unsigned long boosted_cpu_util(int cpu);
+#else
+
+#define update_overutilized_status(rq) do {} while (0)
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+
+#endif /* CONFIG_SMP */
+
 /*
  * The enqueue_task method is called before nr_running is
  * increased. Here we update the fair scheduling stats and
@@ -4880,6 +4982,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se;
+       int task_new = !(flags & ENQUEUE_WAKEUP);
 
        /*
         * If in_iowait is set, the code below may not trigger any cpufreq
@@ -4904,6 +5007,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4911,6 +5015,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4919,8 +5024,31 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(se);
        }
 
-       if (!se)
+       /*
+        * Update SchedTune accounting.
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        *
+        * We also do it when we enqueue a throttled task;
+        * we could argue that a throttled task should not boost a CPU,
+        * however:
+        * a) properly implementing CPU boosting while considering throttled
+        *    tasks would greatly increase the complexity of the solution
+        * b) it's not easy to quantify the benefits introduced by
+        *    such a more complex solution.
+        * Thus, for the time being we go for the simple solution and boost
+        * also for throttled RQs.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
+       if (!se) {
                add_nr_running(rq, 1);
+               if (!task_new)
+                       update_overutilized_status(rq);
+               walt_inc_cumulative_runnable_avg(rq, p);
+       }
 
        hrtick_update(rq);
 }
@@ -4951,6 +5079,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -4970,6 +5099,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -4978,8 +5108,19 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(se);
        }
 
-       if (!se)
+       /*
+        * Update SchedTune accounting
+        *
+        * We do it before updating the CPU capacity to ensure the
+        * boost value of the current task is accounted for in the
+        * selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
+       if (!se) {
                sub_nr_running(rq, 1);
+               walt_dec_cumulative_runnable_avg(rq, p);
+       }
 
        hrtick_update(rq);
 }
@@ -5288,16 +5429,6 @@ static unsigned long target_load(int cpu, int type)
        return max(rq->cpu_load[type-1], total);
 }
 
-static unsigned long capacity_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -5328,179 +5459,916 @@ static void record_wakee(struct task_struct *p)
 }
 
 /*
- * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ * Returns the current capacity of cpu after applying both
+ * cpu and freq scaling.
+ */
+unsigned long capacity_curr_of(int cpu)
+{
+       unsigned long max_cap = cpu_rq(cpu)->cpu_capacity_orig;
+       unsigned long scale_freq = arch_scale_freq_capacity(NULL, cpu);
+
+       return cap_scale(max_cap, scale_freq);
+}
+
+static inline bool energy_aware(void)
+{
+       return sched_feat(ENERGY_AWARE);
+}
+
+static int cpu_util_wake(int cpu, struct task_struct *p);
+
+/*
+ * __cpu_norm_util() returns the cpu util relative to a specific capacity,
+ * i.e. its busy ratio, in the range [0..SCHED_CAPACITY_SCALE], which is useful
+ * for energy calculations. Using the scale-invariant util returned by
+ * cpu_util() and approximating scale-invariant util by:
  *
- * A waker of many should wake a different task than the one last awakened
- * at a frequency roughly N times higher than one of its wakees.
+ *   util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time
  *
- * In order to determine whether we should let the load spread vs consolidating
- * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
- * partner, and a factor of lls_size higher frequency in the other.
+ * the normalized util can be found using the specific capacity.
  *
- * With both conditions met, we can be relatively sure that the relationship is
- * non-monogamous, with partner count exceeding socket size.
+ *   capacity = capacity_orig * curr_freq/max_freq
  *
- * Waker/wakee being client/server, worker/dispatcher, interrupt source or
- * whatever is irrelevant, spread criteria is apparent partner count exceeds
- * socket size.
+ *   norm_util = running_time/time ~ util/capacity
  */
-static int wake_wide(struct task_struct *p)
+static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
 {
-       unsigned int master = current->wakee_flips;
-       unsigned int slave = p->wakee_flips;
-       int factor = this_cpu_read(sd_llc_size);
+       if (util >= capacity)
+               return SCHED_CAPACITY_SCALE;
 
-       if (master < slave)
-               swap(master, slave);
-       if (slave < factor || master < slave * factor)
-               return 0;
-       return 1;
+       return (util << SCHED_CAPACITY_SHIFT)/capacity;
 }
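
As a worked example with SCHED_CAPACITY_SCALE = 1024: util = 256 against capacity = 512 gives (256 << 10) / 512 = 512, i.e. a busy ratio of one half at that capacity, while any util at or above the capacity saturates to 1024.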
 
 /*
- * The purpose of wake_affine() is to quickly determine on which CPU we can run
- * soonest. For the purpose of speed we only consider the waking and previous
- * CPU.
- *
- * wake_affine_idle() - only considers 'now', it check if the waking CPU is (or
- *                     will be) idle.
+ * CPU candidates.
  *
- * wake_affine_weight() - considers the weight to reflect the average
- *                       scheduling latency of the CPUs. This seems to work
- *                       for the overloaded case.
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently we support only two possible candidates: the task's previous CPU
+ * and another candidate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
  */
+#define EAS_CPU_PRV    0
+#define EAS_CPU_NXT    1
+#define EAS_CPU_BKP    2
 
-static bool
-wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
-                int this_cpu, int prev_cpu, int sync)
-{
-       if (idle_cpu(this_cpu))
-               return true;
+/*
+ * energy_diff - supports the computation of the estimated energy impact in
+ * moving a "task"'s "util_delta" between different CPU candidates.
+ */
+/*
+ * NOTE: When using or examining WALT task signals, all wakeup
+ * latency is included as busy time for task util.
+ *
+ * This is relevant here because:
+ * When debugging is enabled, it can take as much as 1ms to
+ * write the output to the trace buffer for each eenv
+ * scenario. For periodic tasks where the sleep time is of
+ * a similar order, the WALT task util can be inflated.
+ *
+ * Further, and even without debugging enabled,
+ * task wakeup latency changes depending upon the EAS
+ * wakeup algorithm selected - FIND_BEST_TARGET only does
+ * energy calculations for up to 2 candidate CPUs. When
+ * NO_FIND_BEST_TARGET is configured, we can potentially
+ * do an energy calculation across all CPUS in the system.
+ *
+ * The impact on WALT task util on a Juno board
+ * running a periodic task which only sleeps for 200usec
+ * between 1ms activations has been measured.
+ * (i.e. the wakeup latency induced by energy calculation
+ * and debug output is double the desired sleep time and
+ * almost equivalent to the runtime which is more-or-less
+ * the worst case possible for this test)
+ *
+ * In this scenario, a task which has a PELT util of around
+ * 220 is inflated under WALT to have util around 400.
+ *
+ * This is simply a property of the way WALT includes
+ * wakeup latency in busy time while PELT does not.
+ *
+ * Hence - be careful when enabling DEBUG_EENV_DECISIONS
+ * especially if WALT is the task signal.
+ */
+/*#define DEBUG_EENV_DECISIONS*/
+
+#ifdef DEBUG_EENV_DECISIONS
+/* max of 16 levels of sched groups traversed */
+#define EAS_EENV_DEBUG_LEVELS 16
+
+struct _eenv_debug {
+       unsigned long cap;
+       unsigned long norm_util;
+       unsigned long cap_energy;
+       unsigned long idle_energy;
+       unsigned long this_energy;
+       unsigned long this_busy_energy;
+       unsigned long this_idle_energy;
+       cpumask_t group_cpumask;
+       unsigned long cpu_util[1];
+};
+#endif
 
-       if (sync && cpu_rq(this_cpu)->nr_running == 1)
-               return true;
+struct eenv_cpu {
+       /* CPU ID, must be in cpus_mask */
+       int     cpu_id;
 
-       return false;
-}
+       /*
+        * Index (into sched_group_energy::cap_states) of the OPP the
+        * CPU needs to run at if the task is placed on it.
+        * This includes both the active and blocked load, due to
+        * other tasks on this CPU, as well as the task's own
+        * utilization.
+        */
+       int     cap_idx;
+       int     cap;
 
-static bool
-wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
-                  int this_cpu, int prev_cpu, int sync)
-{
-       s64 this_eff_load, prev_eff_load;
-       unsigned long task_load;
+       /* Estimated system energy */
+       unsigned long energy;
 
-       this_eff_load = target_load(this_cpu, sd->wake_idx);
-       prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+       /* Estimated energy variation wrt EAS_CPU_PRV */
+       long nrg_delta;
 
-       if (sync) {
-               unsigned long current_load = task_h_load(current);
+#ifdef DEBUG_EENV_DECISIONS
+       struct _eenv_debug *debug;
+       int debug_idx;
+#endif /* DEBUG_EENV_DECISIONS */
+};
 
-               if (current_load > this_eff_load)
-                       return true;
+struct energy_env {
+       /* Utilization to move */
+       struct task_struct      *p;
+       unsigned long           util_delta;
+       unsigned long           util_delta_boosted;
 
-               this_eff_load -= current_load;
-       }
+       /* Mask of CPUs candidates to evaluate */
+       cpumask_t               cpus_mask;
 
-       task_load = task_h_load(p);
+       /* CPU candidates to evaluate */
+       struct eenv_cpu *cpu;
+       int eenv_cpu_count;
 
-       this_eff_load += task_load;
-       if (sched_feat(WA_BIAS))
-               this_eff_load *= 100;
-       this_eff_load *= capacity_of(prev_cpu);
+#ifdef DEBUG_EENV_DECISIONS
+       /* pointer to the memory block reserved
+        * for debug on this CPU - there will be
+        * sizeof(struct _eenv_debug) *
+        *  (EAS_CPU_CNT * EAS_EENV_DEBUG_LEVELS)
+        * bytes allocated here.
+        */
+       struct _eenv_debug *debug;
+#endif
+       /*
+        * Index (into energy_env::cpu) of the most energy-efficient CPU for
+        * the specified energy_env::task
+        */
+       int     next_idx;
+       int     max_cpu_count;
 
-       prev_eff_load -= task_load;
-       if (sched_feat(WA_BIAS))
-               prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
-       prev_eff_load *= capacity_of(this_cpu);
+       /* Support data */
+       struct sched_group      *sg_top;
+       struct sched_group      *sg_cap;
+       struct sched_group      *sg;
+};
 
-       return this_eff_load <= prev_eff_load;
-}
+static int cpu_util_wake(int cpu, struct task_struct *p);
 
-static int wake_affine(struct sched_domain *sd, struct task_struct *p,
-                      int prev_cpu, int sync)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
 {
-       int this_cpu = smp_processor_id();
-       bool affine = false;
+       unsigned long max_util = 0;
+       unsigned long util;
+       int cpu;
 
-       if (sched_feat(WA_IDLE) && !affine)
-               affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+       for_each_cpu(cpu, sched_group_span(eenv->sg_cap)) {
+               util = cpu_util_wake(cpu, eenv->p);
 
-       if (sched_feat(WA_WEIGHT) && !affine)
-               affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+               /*
+                * If we are looking at the target CPU specified by the eenv,
+                * then we should add the (estimated) utilization of the task
+                * assuming we will wake it up on that CPU.
+                */
+               if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+                       util += eenv->util_delta_boosted;
 
-       schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
-       if (affine) {
-               schedstat_inc(sd->ttwu_move_affine);
-               schedstat_inc(p->se.statistics.nr_wakeups_affine);
+               max_util = max(max_util, util);
        }
 
-       return affine;
+       return max_util;
 }
 
-static inline int task_util(struct task_struct *p);
-static int cpu_util_wake(int cpu, struct task_struct *p);
+/*
+ * group_norm_util() returns the approximated group util relative to its
+ * current capacity (busy ratio) in the range [0..SCHED_CAPACITY_SCALE] for use
+ * in energy calculations. Since task executions may or may not overlap in time
+ * in the group, the true normalized util is between max(cpu_norm_util(i)) and
+ * sum(cpu_norm_util(i)) when iterating over all cpus i in the group. The
+ * latter is used as the estimate as it leads to a more pessimistic energy
+ * estimate (more busy).
+ */
+static unsigned long
+group_norm_util(struct energy_env *eenv, int cpu_idx)
+{
+       unsigned long capacity = eenv->cpu[cpu_idx].cap;
+       unsigned long util, util_sum = 0;
+       int cpu;
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+       for_each_cpu(cpu, sched_group_span(eenv->sg)) {
+               util = cpu_util_wake(cpu, eenv->p);
+
+               /*
+                * If we are looking at the target CPU specified by the eenv,
+                * then we should add the (estimated) utilization of the task
+                * assuming we will wake it up on that CPU.
+                */
+               if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
+                       util += eenv->util_delta;
+
+               util_sum += __cpu_norm_util(util, capacity);
+       }
+
+       if (util_sum > SCHED_CAPACITY_SCALE)
+               return SCHED_CAPACITY_SCALE;
+       return util_sum;
+}
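
Illustrative numbers: for a two-CPU group whose selected capacity is 512, wake-time utils of 150 and 200 (the latter including util_delta when that CPU is the candidate) normalize to 300 and 400, so the group busy ratio comes out at 700/1024; summing the per-CPU ratios rather than taking their max deliberately errs on the busy, i.e. pessimistic, side, as the comment above notes.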
+
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
 {
-       return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+       const struct sched_group_energy *sge = eenv->sg_cap->sge;
+       unsigned long util = group_max_util(eenv, cpu_idx);
+       int idx, cap_idx;
+
+       cap_idx = sge->nr_cap_states - 1;
+
+       for (idx = 0; idx < sge->nr_cap_states; idx++) {
+               if (sge->cap_states[idx].cap >= util) {
+                       cap_idx = idx;
+                       break;
+               }
+       }
+       /* Keep track of SG's capacity */
+       eenv->cpu[cpu_idx].cap = sge->cap_states[cap_idx].cap;
+       eenv->cpu[cpu_idx].cap_idx = cap_idx;
+
+       return cap_idx;
 }
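
Illustrative numbers: with cap_states capacities of 256, 512, 768 and 1024 and group_max_util() returning 600, the loop picks index 2 (capacity 768), i.e. the lowest OPP that covers the predicted peak utilization; if the util exceeded the largest capacity, cap_idx would keep its initial value of nr_cap_states - 1.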
 
-/*
- * find_idlest_group finds and returns the least busy CPU group within the
- * domain.
- */
-static struct sched_group *
-find_idlest_group(struct sched_domain *sd, struct task_struct *p,
-                 int this_cpu, int sd_flag)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
 {
-       struct sched_group *idlest = NULL, *group = sd->groups;
-       struct sched_group *most_spare_sg = NULL;
-       unsigned long min_runnable_load = ULONG_MAX, this_runnable_load = 0;
-       unsigned long min_avg_load = ULONG_MAX, this_avg_load = 0;
-       unsigned long most_spare = 0, this_spare = 0;
-       int load_idx = sd->forkexec_idx;
-       int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
-       unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
-                               (sd->imbalance_pct-100) / 100;
+       struct sched_group *sg = eenv->sg;
+       int src_in_grp, dst_in_grp;
+       int i, state = INT_MAX;
+       int max_idle_state_idx;
+       long grp_util = 0;
+       int new_state;
 
-       if (sd_flag & SD_BALANCE_WAKE)
-               load_idx = sd->wake_idx;
+       /* Find the shallowest idle state in the sched group. */
+       for_each_cpu(i, sched_group_span(sg))
+               state = min(state, idle_get_state_idx(cpu_rq(i)));
 
-       do {
-               unsigned long load, avg_load, runnable_load;
-               unsigned long spare_cap, max_spare_cap;
-               int local_group;
-               int i;
+       /* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
+       state++;
+       /*
+        * Try to estimate if a deeper idle state is
+        * achievable when we move the task.
+        */
+       for_each_cpu(i, sched_group_span(sg))
+               grp_util += cpu_util(i);
 
-               /* Skip over this group if it has no CPUs allowed */
-               if (!cpumask_intersects(sched_group_span(group),
-                                       &p->cpus_allowed))
-                       continue;
+       src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+                                     sched_group_span(sg));
+       dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+                                     sched_group_span(sg));
+       if (src_in_grp == dst_in_grp) {
+               /*
+                * both CPUs under consideration are in the same group or not in
+                * either group, migration should leave idle state the same.
+                */
+               return state;
+       }
+       /*
+        * add or remove util as appropriate to indicate what group util
+        * will be (worst case - no concurrent execution) after moving the task
+        */
+       grp_util += src_in_grp ? -eenv->util_delta : eenv->util_delta;
 
-               local_group = cpumask_test_cpu(this_cpu,
-                                              sched_group_span(group));
+       if (grp_util >
+               ((long)sg->sgc->max_capacity * (int)sg->group_weight)) {
+               /*
+                * After moving, the group will be fully occupied
+                * so assume it will not be idle at all.
+                */
+               return 0;
+       }
 
+       /*
+        * after moving, this group is at most partly
+        * occupied, so it should have some idle time.
+        */
+       max_idle_state_idx = sg->sge->nr_idle_states - 2;
+       new_state = grp_util * max_idle_state_idx;
+       if (grp_util <= 0) {
+               /* group will have no util, use lowest state */
+               new_state = max_idle_state_idx + 1;
+       } else {
                /*
-                * Tally up the load of all CPUs in the group and find
-                * the group containing the CPU with most spare capacity.
+                * for partially idle, linearly map util to idle
+                * states, excluding the lowest one. This does not
+                * correspond to the state we expect to enter in
+                * reality, but an indication of what might happen.
                 */
-               avg_load = 0;
-               runnable_load = 0;
-               max_spare_cap = 0;
+               new_state = min_t(int, max_idle_state_idx,
+                                 new_state / sg->sgc->max_capacity);
+               new_state = max_idle_state_idx - new_state;
+       }
+       return new_state;
+}
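
To make the utilization-to-idle-state mapping in group_idle_state() concrete, the following stand-alone sketch repeats the same arithmetic; max_capacity, nr_idle_states and grp_util are assumed example values rather than figures from a real platform:

    #include <stdio.h>

    int main(void)
    {
            long max_capacity = 1024;  /* sg->sgc->max_capacity (assumed)     */
            int nr_idle_states = 4;    /* sge->nr_idle_states (assumed)       */
            int max_idle_state_idx = nr_idle_states - 2;
            long grp_util = 512;       /* post-migration group util (assumed) */
            int new_state;

            if (grp_util <= 0) {
                    new_state = max_idle_state_idx + 1;   /* no util: deepest */
            } else {
                    new_state = grp_util * max_idle_state_idx / max_capacity;
                    if (new_state > max_idle_state_idx)
                            new_state = max_idle_state_idx;
                    new_state = max_idle_state_idx - new_state;
            }
            /* Half the capacity in use -> state 1 of {0,1,2}: half-way idle. */
            printf("estimated idle state index: %d\n", new_state);
            return 0;
    }
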
 
-               for_each_cpu(i, sched_group_span(group)) {
-                       /* Bias balancing toward cpus of our domain */
-                       if (local_group)
-                               load = source_load(i, load_idx);
-                       else
-                               load = target_load(i, load_idx);
+#ifdef DEBUG_EENV_DECISIONS
+static struct _eenv_debug *eenv_debug_entry_ptr(struct _eenv_debug *base, int idx);
 
-                       runnable_load += load;
+static void store_energy_calc_debug_info(struct energy_env *eenv, int cpu_idx, int cap_idx, int idle_idx)
+{
+       int debug_idx = eenv->cpu[cpu_idx].debug_idx;
+       unsigned long sg_util, busy_energy, idle_energy;
+       const struct sched_group_energy *sge;
+       struct _eenv_debug *dbg;
+       int cpu;
 
-                       avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+       if (debug_idx < EAS_EENV_DEBUG_LEVELS) {
+               sge = eenv->sg->sge;
+               sg_util = group_norm_util(eenv, cpu_idx);
+               busy_energy   = sge->cap_states[cap_idx].power;
+               busy_energy  *= sg_util;
+               idle_energy   = SCHED_CAPACITY_SCALE - sg_util;
+               idle_energy  *= sge->idle_states[idle_idx].power;
+               /* should we use sg_cap or sg? */
+               dbg = eenv_debug_entry_ptr(eenv->cpu[cpu_idx].debug, debug_idx);
+               dbg->cap = sge->cap_states[cap_idx].cap;
+               dbg->norm_util = sg_util;
+               dbg->cap_energy = sge->cap_states[cap_idx].power;
+               dbg->idle_energy = sge->idle_states[idle_idx].power;
+               dbg->this_energy = busy_energy + idle_energy;
+               dbg->this_busy_energy = busy_energy;
+               dbg->this_idle_energy = idle_energy;
 
-                       spare_cap = capacity_spare_wake(i, p);
+               cpumask_copy(&dbg->group_cpumask,
+                               sched_group_span(eenv->sg));
+
+               for_each_cpu(cpu, &dbg->group_cpumask)
+                       dbg->cpu_util[cpu] = cpu_util(cpu);
+
+               eenv->cpu[cpu_idx].debug_idx = debug_idx+1;
+       }
+}
+#else
+#define store_energy_calc_debug_info(a,b,c,d) {}
+#endif /* DEBUG_EENV_DECISIONS */
+
+/*
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
+ */
+static void calc_sg_energy(struct energy_env *eenv)
+{
+       struct sched_group *sg = eenv->sg;
+       unsigned long busy_energy, idle_energy;
+       unsigned int busy_power, idle_power;
+       unsigned long total_energy = 0;
+       unsigned long sg_util;
+       int cap_idx, idle_idx;
+       int cpu_idx;
+
+       for (cpu_idx = EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; ++cpu_idx) {
+               if (eenv->cpu[cpu_idx].cpu_id == -1)
+                       continue;
+
+               /* Compute ACTIVE energy */
+               cap_idx = find_new_capacity(eenv, cpu_idx);
+               busy_power = sg->sge->cap_states[cap_idx].power;
+               sg_util = group_norm_util(eenv, cpu_idx);
+               busy_energy   = sg_util * busy_power;
+
+               /* Compute IDLE energy */
+               idle_idx = group_idle_state(eenv, cpu_idx);
+               idle_power = sg->sge->idle_states[idle_idx].power;
+               idle_energy   = SCHED_CAPACITY_SCALE - sg_util;
+               idle_energy  *= idle_power;
+
+               total_energy = busy_energy + idle_energy;
+               eenv->cpu[cpu_idx].energy += total_energy;
+
+               store_energy_calc_debug_info(eenv, cpu_idx, cap_idx, idle_idx);
+       }
+}
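
The active/idle split in calc_sg_energy() is a weighted sum: the group's normalised utilization scales the busy power of the selected capacity state, and the remaining idle fraction scales the power of the estimated idle state; the result stays left-shifted by SCHED_CAPACITY_SHIFT until candidates are compared. A rough stand-alone sketch with invented power figures:

    #include <stdio.h>

    #define SCHED_CAPACITY_SHIFT    10
    #define SCHED_CAPACITY_SCALE    (1UL << SCHED_CAPACITY_SHIFT)

    int main(void)
    {
            unsigned long sg_util    = 256;  /* normalised group util (25%)      */
            unsigned long busy_power = 300;  /* power of the chosen OPP (example) */
            unsigned long idle_power = 10;   /* power of the estimated idle state */
            unsigned long busy_energy, idle_energy, total;

            busy_energy = sg_util * busy_power;
            idle_energy = (SCHED_CAPACITY_SCALE - sg_util) * idle_power;
            total = busy_energy + idle_energy;

            /* 76800 + 7680 = 84480 (unscaled); >> SCHED_CAPACITY_SHIFT ~ 82 */
            printf("group energy: %lu (scaled: %lu)\n",
                   total, total >> SCHED_CAPACITY_SHIFT);
            return 0;
    }
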
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ *       which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
+{
+       struct sched_domain *sd;
+       int cpu;
+       struct cpumask visit_cpus;
+       struct sched_group *sg;
+
+       WARN_ON(!eenv->sg_top->sge);
+
+       cpumask_copy(&visit_cpus, sched_group_span(eenv->sg_top));
+
+       while (!cpumask_empty(&visit_cpus)) {
+               struct sched_group *sg_shared_cap = NULL;
+
+               cpu = cpumask_first(&visit_cpus);
+
+               /*
+                * Is the group utilization affected by cpus outside this
+                * sched_group?
+                */
+               sd = rcu_dereference(per_cpu(sd_scs, cpu));
+               if (sd && sd->parent)
+                       sg_shared_cap = sd->parent->groups;
+
+               for_each_domain(cpu, sd) {
+                       sg = sd->groups;
+
+                       /* Has this sched_domain already been visited? */
+                       if (sd->child && group_first_cpu(sg) != cpu)
+                               break;
+
+                       do {
+                               eenv->sg_cap = sg;
+                               if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
+                                       eenv->sg_cap = sg_shared_cap;
+
+                               /*
+                                * Compute the energy for all the candidate
+                                * CPUs in the current visited SG.
+                                */
+                               eenv->sg = sg;
+                               calc_sg_energy(eenv);
+
+                               /* remove CPUs we have just visited */
+                               if (!sd->child)
+                                       cpumask_xor(&visit_cpus, &visit_cpus, sched_group_span(sg));
+
+                               if (cpumask_equal(sched_group_span(sg), sched_group_span(eenv->sg_top)))
+                                       goto next_cpu;
+
+                       } while (sg = sg->next, sg != sd->groups);
+               }
+next_cpu:
+               continue;
+       }
+
+       return 0;
+}
+
+static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
+{
+       return cpu != -1 && cpumask_test_cpu(cpu, sched_group_span(sg));
+}
+
+#ifdef DEBUG_EENV_DECISIONS
+static void dump_eenv_debug(struct energy_env *eenv)
+{
+       int cpu_idx, grp_idx;
+       char cpu_utils[(NR_CPUS*12)+10]="cpu_util: ";
+       char cpulist[64];
+
+       trace_printk("eenv scenario: task=%p %s task_util=%lu prev_cpu=%d",
+                       eenv->p, eenv->p->comm, eenv->util_delta, eenv->cpu[EAS_CPU_PRV].cpu_id);
+
+       for (cpu_idx=EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; cpu_idx++) {
+               if (eenv->cpu[cpu_idx].cpu_id == -1)
+                       continue;
+               trace_printk("---Scenario %d: Place task on cpu %d energy=%lu (%d debug logs at %p)",
+                               cpu_idx+1, eenv->cpu[cpu_idx].cpu_id,
+                               eenv->cpu[cpu_idx].energy >> SCHED_CAPACITY_SHIFT,
+                               eenv->cpu[cpu_idx].debug_idx,
+                               eenv->cpu[cpu_idx].debug);
+               for (grp_idx = 0; grp_idx < eenv->cpu[cpu_idx].debug_idx; grp_idx++) {
+                       struct _eenv_debug *debug;
+                       int cpu, written=0;
+
+                       debug = eenv_debug_entry_ptr(eenv->cpu[cpu_idx].debug, grp_idx);
+                       cpu = scnprintf(cpulist, sizeof(cpulist), "%*pbl", cpumask_pr_args(&debug->group_cpumask));
+
+                       cpu_utils[0] = 0;
+                       /* print out the relevant cpu_util */
+                       for_each_cpu(cpu, &(debug->group_cpumask)) {
+                               char tmp[64];
+                               if (written > sizeof(cpu_utils)-10) {
+                                       cpu_utils[written]=0;
+                                       break;
+                               }
+                               written += snprintf(tmp, sizeof(tmp), "cpu%d(%lu) ", cpu, debug->cpu_util[cpu]);
+                               strcat(cpu_utils, tmp);
+                       }
+                       /* trace the data */
+                       trace_printk("  | %s : cap=%lu nutil=%lu, cap_nrg=%lu, idle_nrg=%lu energy=%lu busy_energy=%lu idle_energy=%lu %s",
+                                       cpulist, debug->cap, debug->norm_util,
+                                       debug->cap_energy, debug->idle_energy,
+                                       debug->this_energy >> SCHED_CAPACITY_SHIFT,
+                                       debug->this_busy_energy >> SCHED_CAPACITY_SHIFT,
+                                       debug->this_idle_energy >> SCHED_CAPACITY_SHIFT,
+                                       cpu_utils);
+
+               }
+               trace_printk("---");
+       }
+       trace_printk("----- done");
+       return;
+}
+#else
+#define dump_eenv_debug(a) {}
+#endif /* DEBUG_EENV_DECISIONS */
+/*
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilization amount and a
+ * collection of possible CPU candidates. The number of candidates
+ * depends upon the selection algorithm used.
+ *
+ * If find_best_target was used to select candidate CPUs, there will
+ * be at most 3 including prev_cpu. If not, we used a brute force
+ * selection which will provide the union of:
+ *  * CPUs belonging to the highest sd which is not overutilized
+ *  * CPUs the task is allowed to run on
+ *  * online CPUs
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the most energy efficient CPU.
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations. The only
+ * exception to this is if we fail to access the energy model via sd_ea, where
+ * we return -1 with the intent of asking the system to use a different
+ * wakeup placement algorithm.
+ *
+ * A value greater than zero means that the most energy efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
+ */
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
+{
+       int last_cpu_idx = eenv->max_cpu_count - 1;
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int sd_cpu = -1;
+       int cpu_idx;
+       int margin;
+
+       sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
+       sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
+       if (!sd)
+               return -1;
+
+       cpumask_clear(&eenv->cpus_mask);
+       for (cpu_idx = EAS_CPU_PRV; cpu_idx < eenv->max_cpu_count; ++cpu_idx) {
+               int cpu = eenv->cpu[cpu_idx].cpu_id;
+
+               if (cpu < 0)
+                       continue;
+               cpumask_set_cpu(cpu, &eenv->cpus_mask);
+       }
+
+       sg = sd->groups;
+       do {
+               /* Skip SGs which do not contain a candidate CPU */
+               if (!cpumask_intersects(&eenv->cpus_mask, sched_group_span(sg)))
+                       continue;
+
+               eenv->sg_top = sg;
+               if (compute_energy(eenv) == -EINVAL)
+                       return EAS_CPU_PRV;
+       } while (sg = sg->next, sg != sd->groups);
+       /* remember - eenv energy values are unscaled */
+
+       /*
+        * Compute the dead-zone margin used to prevent too many task
+        * migrations with negligible energy savings.
+        * An energy saving is considered meaningful if it reduces the energy
+        * consumption of the EAS_CPU_PRV CPU candidate by at least ~1.56%
+        */
+       margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
+
+       /*
+        * By default the EAS_CPU_PRV CPU is considered the most energy
+        * efficient, with a 0 energy variation.
+        */
+       eenv->next_idx = EAS_CPU_PRV;
+       eenv->cpu[EAS_CPU_PRV].nrg_delta = 0;
+
+       dump_eenv_debug(eenv);
+
+       /*
+        * Compare the other CPU candidates to find a CPU which can be
+        * more energy efficient than EAS_CPU_PRV
+        */
+       if (sched_feat(FBT_STRICT_ORDER))
+               last_cpu_idx = EAS_CPU_BKP;
+
+       for (cpu_idx = EAS_CPU_NXT; cpu_idx <= last_cpu_idx; cpu_idx++) {
+               if (eenv->cpu[cpu_idx].cpu_id < 0)
+                       continue;
+               eenv->cpu[cpu_idx].nrg_delta =
+                       eenv->cpu[cpu_idx].energy -
+                       eenv->cpu[EAS_CPU_PRV].energy;
+
+               /* filter energy variations within the dead-zone margin */
+               if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+                       eenv->cpu[cpu_idx].nrg_delta = 0;
+               /* update the schedule candidate with min(nrg_delta) */
+               if (eenv->cpu[cpu_idx].nrg_delta <
+                   eenv->cpu[eenv->next_idx].nrg_delta) {
+                       eenv->next_idx = cpu_idx;
+                       /* break out if we want to stop on first saving candidate */
+                       if (sched_feat(FBT_STRICT_ORDER))
+                               break;
+               }
+       }
+
+       return eenv->next_idx;
+}
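
The dead-zone filtering above works out to a right shift by 6, i.e. about 1/64 (~1.56%) of the prev_cpu candidate's energy; deltas inside that band are treated as no saving at all. A tiny sketch with invented energy figures:

    #include <stdio.h>
    #include <stdlib.h>

    int main(void)
    {
            unsigned long prev_energy = 100000; /* energy of EAS_CPU_PRV (example) */
            unsigned long next_energy =  99200; /* energy of a candidate (example) */
            long margin = prev_energy >> 6;     /* ~1.56% dead zone = 1562         */
            long delta  = (long)next_energy - (long)prev_energy;

            if (labs(delta) < margin)
                    delta = 0;  /* saving too small to justify a migration */

            printf("margin=%ld delta=%ld -> %s\n", margin, delta,
                   delta < 0 ? "migrate" : "stay on prev_cpu");
            return 0;
    }
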
+
+/*
+ * Detect M:N waker/wakee relationships via a switching-frequency heuristic.
+ *
+ * A waker of many should wake a different task than the one last awakened
+ * at a frequency roughly N times higher than one of its wakees.
+ *
+ * In order to determine whether we should let the load spread vs consolidating
+ * to shared cache, we look for a minimum 'flip' frequency of llc_size in one
+ * partner, and a factor of llc_size higher frequency in the other.
+ *
+ * With both conditions met, we can be relatively sure that the relationship is
+ * non-monogamous, with partner count exceeding socket size.
+ *
+ * Waker/wakee being client/server, worker/dispatcher, interrupt source or
+ * whatever is irrelevant, spread criteria is apparent partner count exceeds
+ * socket size.
+ */
+static int wake_wide(struct task_struct *p, int sibling_count_hint)
+{
+       unsigned int master = current->wakee_flips;
+       unsigned int slave = p->wakee_flips;
+       int llc_size = this_cpu_read(sd_llc_size);
+
+       if (sibling_count_hint >= llc_size)
+               return 1;
+
+       if (master < slave)
+               swap(master, slave);
+       if (slave < llc_size || master < slave * llc_size)
+               return 0;
+       return 1;
+}
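
With the flip-count heuristic above and, say, an LLC of 4 CPUs, a waker that has switched wakees 30 times while the wakee has switched 6 times satisfies both tests (slave >= llc_size and master >= slave * llc_size), so the wakeup is spread instead of being pulled onto the waker's cache domain. A stand-alone sketch of the same test (flip counts and llc_size are example values; the sibling_count_hint short-cut is omitted):

    #include <stdio.h>

    static int wide(unsigned int master, unsigned int slave, unsigned int llc_size)
    {
            unsigned int tmp;

            if (master < slave) {           /* same swap as wake_wide() */
                    tmp = master;
                    master = slave;
                    slave = tmp;
            }
            if (slave < llc_size || master < slave * llc_size)
                    return 0;               /* looks 1:1, keep it affine */
            return 1;                       /* looks 1:N, spread the load */
    }

    int main(void)
    {
            printf("master=30 slave=6 llc=4 -> wide=%d\n", wide(30, 6, 4));
            printf("master=5  slave=3 llc=4 -> wide=%d\n", wide(5, 3, 4));
            return 0;
    }
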
+
+/*
+ * The purpose of wake_affine() is to quickly determine on which CPU we can run
+ * soonest. For the purpose of speed we only consider the waking and previous
+ * CPU.
+ *
+ * wake_affine_idle() - only considers 'now', it checks if the waking CPU is (or
+ *                     will be) idle.
+ *
+ * wake_affine_weight() - considers the weight to reflect the average
+ *                       scheduling latency of the CPUs. This seems to work
+ *                       for the overloaded case.
+ */
+
+static bool
+wake_affine_idle(struct sched_domain *sd, struct task_struct *p,
+                int this_cpu, int prev_cpu, int sync)
+{
+       if (idle_cpu(this_cpu))
+               return true;
+
+       if (sync && cpu_rq(this_cpu)->nr_running == 1)
+               return true;
+
+       return false;
+}
+
+static bool
+wake_affine_weight(struct sched_domain *sd, struct task_struct *p,
+                  int this_cpu, int prev_cpu, int sync)
+{
+       s64 this_eff_load, prev_eff_load;
+       unsigned long task_load;
+
+       this_eff_load = target_load(this_cpu, sd->wake_idx);
+       prev_eff_load = source_load(prev_cpu, sd->wake_idx);
+
+       if (sync) {
+               unsigned long current_load = task_h_load(current);
+
+               if (current_load > this_eff_load)
+                       return true;
+
+               this_eff_load -= current_load;
+       }
+
+       task_load = task_h_load(p);
+
+       this_eff_load += task_load;
+       if (sched_feat(WA_BIAS))
+               this_eff_load *= 100;
+       this_eff_load *= capacity_of(prev_cpu);
+
+       prev_eff_load -= task_load;
+       if (sched_feat(WA_BIAS))
+               prev_eff_load *= 100 + (sd->imbalance_pct - 100) / 2;
+       prev_eff_load *= capacity_of(this_cpu);
+
+       return this_eff_load <= prev_eff_load;
+}
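
Put differently, wake_affine_weight() compares the load the waking CPU would carry with the task added against the load prev_cpu would keep with the task removed, each side scaled by the other CPU's capacity, with the WA_BIAS factors tilting the comparison slightly in favour of prev_cpu. A rough worked example with made-up loads, equal capacities and imbalance_pct = 117 (the sync hand-off is omitted):

    #include <stdio.h>

    int main(void)
    {
            long long this_load = 200, prev_load = 300; /* example wake_idx loads */
            long long task_load = 100;                  /* example task_h_load(p) */
            long long capacity = 1024;                  /* both CPUs, for simplicity */
            int imbalance_pct = 117;

            long long this_eff = (this_load + task_load) * 100 * capacity;
            long long prev_eff = (prev_load - task_load) *
                                 (100 + (imbalance_pct - 100) / 2) * capacity;

            /* 300*100 vs 200*108: prev keeps the lower effective load, stay put. */
            printf("this_eff=%lld prev_eff=%lld -> %s\n", this_eff, prev_eff,
                   this_eff <= prev_eff ? "pull to waker" : "stay on prev_cpu");
            return 0;
    }
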
+
+static int wake_affine(struct sched_domain *sd, struct task_struct *p,
+                      int prev_cpu, int sync)
+{
+       int this_cpu = smp_processor_id();
+       bool affine = false;
+
+       if (sched_feat(WA_IDLE) && !affine)
+               affine = wake_affine_idle(sd, p, this_cpu, prev_cpu, sync);
+
+       if (sched_feat(WA_WEIGHT) && !affine)
+               affine = wake_affine_weight(sd, p, this_cpu, prev_cpu, sync);
+
+       schedstat_inc(p->se.statistics.nr_wakeups_affine_attempts);
+       if (affine) {
+               schedstat_inc(sd->ttwu_move_affine);
+               schedstat_inc(p->se.statistics.nr_wakeups_affine);
+       }
+
+       return affine;
+}
+
+static inline unsigned long task_util(struct task_struct *p);
+
+#ifdef CONFIG_SCHED_TUNE
+struct reciprocal_value schedtune_spc_rdiv;
+
+static long
+schedtune_margin(unsigned long signal, long boost)
+{
+       long long margin = 0;
+
+       /*
+        * Signal proportional compensation (SPC)
+        *
+        * The Boost (B) value is used to compute a Margin (M) which is
+        * proportional to the complement of the original Signal (S):
+        *   M = B * (SCHED_CAPACITY_SCALE - S)
+        * The obtained M could be used by the caller to "boost" S.
+        */
+       if (boost >= 0) {
+               margin  = SCHED_CAPACITY_SCALE - signal;
+               margin *= boost;
+       } else
+               margin = -signal * boost;
+
+       margin  = reciprocal_divide(margin, schedtune_spc_rdiv);
+
+       if (boost < 0)
+               margin *= -1;
+       return margin;
+}
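
The SPC margin boosts a signal by a percentage of its headroom, roughly M = boost * (SCHED_CAPACITY_SCALE - S) / 100 for positive boosts and a proportional negative margin otherwise; plain division by 100 is assumed below in place of reciprocal_divide()/schedtune_spc_rdiv. A small sketch:

    #include <stdio.h>

    #define SCHED_CAPACITY_SCALE    1024UL

    /* Plain division stands in for reciprocal_divide()/schedtune_spc_rdiv. */
    static long spc_margin(unsigned long signal, long boost)
    {
            long margin;

            if (boost >= 0)
                    margin = (long)(SCHED_CAPACITY_SCALE - signal) * boost / 100;
            else
                    margin = -((long)signal * -boost / 100);
            return margin;
    }

    int main(void)
    {
            /* util 200 with a 50% boost gains 50% of its 824 headroom: +412. */
            printf("boosted util = %ld\n", 200 + spc_margin(200, 50));
            /* util 400 with a -20% boost loses 20% of itself: -80. */
            printf("deboosted util = %ld\n", 400 + spc_margin(400, -20));
            return 0;
    }
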
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       int boost = schedtune_cpu_boost(cpu);
+
+       if (boost == 0)
+               return 0;
+
+       return schedtune_margin(util, boost);
+}
+
+static inline long
+schedtune_task_margin(struct task_struct *task)
+{
+       int boost = schedtune_task_boost(task);
+       unsigned long util;
+       long margin;
+
+       if (boost == 0)
+               return 0;
+
+       util = task_util(task);
+       margin = schedtune_margin(util, boost);
+
+       return margin;
+}
+
+#else /* CONFIG_SCHED_TUNE */
+
+static inline int
+schedtune_cpu_margin(unsigned long util, int cpu)
+{
+       return 0;
+}
+
+static inline int
+schedtune_task_margin(struct task_struct *task)
+{
+       return 0;
+}
+
+#endif /* CONFIG_SCHED_TUNE */
+
+unsigned long
+boosted_cpu_util(int cpu)
+{
+       unsigned long util = cpu_util_freq(cpu);
+       long margin = schedtune_cpu_margin(util, cpu);
+
+       trace_sched_boost_cpu(cpu, util, margin);
+
+       return util + margin;
+}
+
+static inline unsigned long
+boosted_task_util(struct task_struct *task)
+{
+       unsigned long util = task_util(task);
+       long margin = schedtune_task_margin(task);
+
+       trace_sched_boost_task(task, util, margin);
+
+       return util + margin;
+}
+
+static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+{
+       return capacity_orig_of(cpu) - cpu_util_wake(cpu, p);
+}
+
+/*
+ * find_idlest_group finds and returns the least busy CPU group within the
+ * domain.
+ *
+ * Assumes p is allowed on at least one CPU in sd.
+ */
+static struct sched_group *
+find_idlest_group(struct sched_domain *sd, struct task_struct *p,
+                 int this_cpu, int sd_flag)
+{
+       struct sched_group *idlest = NULL, *group = sd->groups;
+       struct sched_group *most_spare_sg = NULL;
+       unsigned long min_runnable_load = ULONG_MAX;
+       unsigned long this_runnable_load = ULONG_MAX;
+       unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX;
+       unsigned long most_spare = 0, this_spare = 0;
+       int load_idx = sd->forkexec_idx;
+       int imbalance_scale = 100 + (sd->imbalance_pct-100)/2;
+       unsigned long imbalance = scale_load_down(NICE_0_LOAD) *
+                               (sd->imbalance_pct-100) / 100;
+
+       if (sd_flag & SD_BALANCE_WAKE)
+               load_idx = sd->wake_idx;
+
+       do {
+               unsigned long load, avg_load, runnable_load;
+               unsigned long spare_cap, max_spare_cap;
+               int local_group;
+               int i;
+
+               /* Skip over this group if it has no CPUs allowed */
+               if (!cpumask_intersects(sched_group_span(group),
+                                       &p->cpus_allowed))
+                       continue;
+
+               local_group = cpumask_test_cpu(this_cpu,
+                                              sched_group_span(group));
+
+               /*
+                * Tally up the load of all CPUs in the group and find
+                * the group containing the CPU with most spare capacity.
+                */
+               avg_load = 0;
+               runnable_load = 0;
+               max_spare_cap = 0;
+
+               for_each_cpu(i, sched_group_span(group)) {
+                       /* Bias balancing toward cpus of our domain */
+                       if (local_group)
+                               load = source_load(i, load_idx);
+                       else
+                               load = target_load(i, load_idx);
+
+                       runnable_load += load;
+
+                       avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
+
+                       spare_cap = capacity_spare_wake(i, p);
 
                        if (spare_cap > max_spare_cap)
                                max_spare_cap = spare_cap;
@@ -5578,10 +6446,10 @@ skip_spare:
 }
 
 /*
- * find_idlest_cpu - find the idlest cpu among the cpus in group.
+ * find_idlest_group_cpu - find the idlest cpu among the cpus in group.
  */
 static int
-find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
+find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
 {
        unsigned long load, min_load = ULONG_MAX;
        unsigned int min_exit_latency = UINT_MAX;
@@ -5627,7 +6495,54 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
                }
        }
 
-       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+       return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu;
+}
+
+static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p,
+                                 int cpu, int prev_cpu, int sd_flag)
+{
+       int new_cpu = cpu;
+
+       if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed))
+               return prev_cpu;
+
+       while (sd) {
+               struct sched_group *group;
+               struct sched_domain *tmp;
+               int weight;
+
+               if (!(sd->flags & sd_flag)) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               group = find_idlest_group(sd, p, cpu, sd_flag);
+               if (!group) {
+                       sd = sd->child;
+                       continue;
+               }
+
+               new_cpu = find_idlest_group_cpu(group, p, cpu);
+               if (new_cpu == cpu) {
+                       /* Now try balancing at a lower domain level of cpu */
+                       sd = sd->child;
+                       continue;
+               }
+
+               /* Now try balancing at a lower domain level of new_cpu */
+               cpu = new_cpu;
+               weight = sd->span_weight;
+               sd = NULL;
+               for_each_domain(cpu, tmp) {
+                       if (weight <= tmp->span_weight)
+                               break;
+                       if (tmp->flags & sd_flag)
+                               sd = tmp;
+               }
+               /* while loop will break here if sd == NULL */
+       }
+
+       return new_cpu;
 }
 
 #ifdef CONFIG_SCHED_SMT
@@ -5811,7 +6726,7 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t
 /*
  * Try and locate an idle core/thread in the LLC cache domain.
  */
-static int select_idle_sibling(struct task_struct *p, int prev, int target)
+static inline int __select_idle_sibling(struct task_struct *p, int prev, int target)
 {
        struct sched_domain *sd;
        int i;
@@ -5841,88 +6756,708 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        if ((unsigned)i < nr_cpumask_bits)
                return i;
 
-       return target;
-}
+       return target;
+}
+
+static inline int select_idle_sibling_cstate_aware(struct task_struct *p, int prev, int target)
+{
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int best_idle_cpu = -1;
+       int best_idle_cstate = -1;
+       int best_idle_capacity = INT_MAX;
+       int i;
+
+       /*
+        * Iterate the domains and find an eligible idle cpu.
+        */
+       sd = rcu_dereference(per_cpu(sd_llc, target));
+       for_each_lower_domain(sd) {
+               sg = sd->groups;
+               do {
+                       if (!cpumask_intersects(
+                                       sched_group_span(sg), &p->cpus_allowed))
+                               goto next;
+
+                       for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) {
+                               int idle_idx;
+                               unsigned long new_usage;
+                               unsigned long capacity_orig;
+
+                               if (!idle_cpu(i))
+                                       goto next;
+
+                               /* figure out if the task can fit here at all */
+                               new_usage = boosted_task_util(p);
+                               capacity_orig = capacity_orig_of(i);
+
+                               if (new_usage > capacity_orig)
+                                       goto next;
+
+                               /* if the task fits without changing OPP and we
+                                * intended to use this CPU, just proceed
+                                */
+                               if (i == target && new_usage <= capacity_curr_of(target)) {
+                                       return target;
+                               }
+
+                               /* otherwise select CPU with shallowest idle state
+                                * to reduce wakeup latency.
+                                */
+                               idle_idx = idle_get_state_idx(cpu_rq(i));
+
+                               if (idle_idx < best_idle_cstate &&
+                                       capacity_orig <= best_idle_capacity) {
+                                       best_idle_cpu = i;
+                                       best_idle_cstate = idle_idx;
+                                       best_idle_capacity = capacity_orig;
+                               }
+                       }
+       next:
+                       sg = sg->next;
+               } while (sg != sd->groups);
+       }
+
+       if (best_idle_cpu >= 0)
+               target = best_idle_cpu;
+
+       return target;
+}
+
+static int select_idle_sibling(struct task_struct *p, int prev, int target)
+{
+       if (!sysctl_sched_cstate_aware)
+               return __select_idle_sibling(p, prev, target);
+
+       return select_idle_sibling_cstate_aware(p, prev, target);
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               return (p->ravg.demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+       }
+#endif
+       return p->se.avg.util_avg;
+}
+
+/*
+ * cpu_util_wake: Compute cpu utilization with any contributions from
+ * the waking task p removed.
+ */
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+       unsigned long util, capacity;
+
+#ifdef CONFIG_SCHED_WALT
+       /*
+        * WALT does not decay idle tasks in the same manner
+        * as PELT, so it makes little sense to subtract task
+        * utilization from cpu utilization. Instead just use
+        * cpu_util for this case.
+        */
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+               return cpu_util(cpu);
+#endif
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+               return cpu_util(cpu);
+
+       capacity = capacity_orig_of(cpu);
+       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+       return (util >= capacity) ? capacity : util;
+}
+
+static inline int task_fits_capacity(struct task_struct *p, long capacity)
+{
+       return capacity * 1024 > boosted_task_util(p) * capacity_margin;
+}
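
task_fits_capacity() is a fixed-point comparison: with SCHED_CAPACITY_SCALE at 1024 and capacity_margin presumably at its usual 1280, the boosted utilization must stay below roughly 80% of the CPU's original capacity. A tiny sketch with example numbers:

    #include <stdio.h>

    int main(void)
    {
            unsigned long capacity_margin = 1280;  /* assumed ~20% headroom          */
            long capacity = 430;                   /* e.g. a LITTLE CPU (example)    */
            unsigned long boosted_util = 350;      /* boosted_task_util(p) (example) */

            /* Same fixed-point test as task_fits_capacity(). */
            int fits = capacity * 1024 > (long)(boosted_util * capacity_margin);

            /* 440320 vs 448000: 350 of boosted util does not fit in 430. */
            printf("fits=%d\n", fits);
            return 0;
    }
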
+
+static int start_cpu(bool boosted)
+{
+       struct root_domain *rd = cpu_rq(smp_processor_id())->rd;
+
+       return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu;
+}
+
+static inline int find_best_target(struct task_struct *p, int *backup_cpu,
+                                  bool boosted, bool prefer_idle)
+{
+       unsigned long best_idle_min_cap_orig = ULONG_MAX;
+       unsigned long min_util = boosted_task_util(p);
+       unsigned long target_capacity = ULONG_MAX;
+       unsigned long min_wake_util = ULONG_MAX;
+       unsigned long target_max_spare_cap = 0;
+       unsigned long target_util = ULONG_MAX;
+       unsigned long best_active_util = ULONG_MAX;
+       int best_idle_cstate = INT_MAX;
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       int best_active_cpu = -1;
+       int best_idle_cpu = -1;
+       int target_cpu = -1;
+       int cpu, i;
+
+       *backup_cpu = -1;
+
+       /* Find start CPU based on boost value */
+       cpu = start_cpu(boosted);
+       if (cpu < 0)
+               return -1;
+
+       /* Find SD for the start CPU */
+       sd = rcu_dereference(per_cpu(sd_ea, cpu));
+       if (!sd)
+               return -1;
+
+       /* Scan CPUs in all SDs */
+       sg = sd->groups;
+       do {
+               for_each_cpu_and(i, &p->cpus_allowed, sched_group_span(sg)) {
+                       unsigned long capacity_curr = capacity_curr_of(i);
+                       unsigned long capacity_orig = capacity_orig_of(i);
+                       unsigned long wake_util, new_util;
+
+                       if (!cpu_online(i))
+                               continue;
+
+                       if (walt_cpu_high_irqload(i))
+                               continue;
+
+                       /*
+                        * p's blocked utilization is still accounted for on prev_cpu
+                        * so prev_cpu will receive a negative bias due to the double
+                        * accounting. However, the blocked utilization may be zero.
+                        */
+                       wake_util = cpu_util_wake(i, p);
+                       new_util = wake_util + task_util(p);
+
+                       /*
+                        * Ensure minimum capacity to grant the required boost.
+                        * The target CPU can be already at a capacity level higher
+                        * than the one required to boost the task.
+                        */
+                       new_util = max(min_util, new_util);
+                       if (new_util > capacity_orig)
+                               continue;
+
+                       /*
+                        * Case A) Latency sensitive tasks
+                        *
+                        * Unconditionally favoring tasks that prefer idle CPU to
+                        * Unconditionally favor tasks that prefer an idle CPU to
+                        *
+                        * Looking for:
+                        * - an idle CPU, whatever its idle_state is, since
+                        *   the first CPUs we explore are more likely to be
+                        *   reserved for latency sensitive tasks.
+                        * - a non idle CPU where the task fits in its current
+                        *   capacity and has the maximum spare capacity.
+                        * - a non idle CPU with lower contention from other
+                        *   tasks and running at the lowest possible OPP.
+                        *
+                        * The last two goals try to favor a non idle CPU
+                        * where the task can run as if it is "almost alone".
+                        * A maximum spare capacity CPU is favoured since
+                        * the task already fits into that CPU's capacity
+                        * without waiting for an OPP chance.
+                        *
+                        * The following code path is the only one in the CPUs
+                        * exploration loop which is always used by
+                        * prefer_idle tasks. It exits the loop with either a
+                        * best_active_cpu or a target_cpu which should
+                        * represent an optimal choice for latency sensitive
+                        * tasks.
+                        */
+                       if (prefer_idle) {
+
+                               /*
+                                * Case A.1: IDLE CPU
+                                * Return the first IDLE CPU we find.
+                                */
+                               if (idle_cpu(i)) {
+                                       trace_sched_find_best_target(p,
+                                                       prefer_idle, min_util,
+                                                       cpu, best_idle_cpu,
+                                                       best_active_cpu, i);
+
+                                       return i;
+                               }
+
+                               /*
+                                * Case A.2: Target ACTIVE CPU
+                                * Favor CPUs with max spare capacity.
+                                */
+                               if ((capacity_curr > new_util) &&
+                                       (capacity_orig - new_util > target_max_spare_cap)) {
+                                       target_max_spare_cap = capacity_orig - new_util;
+                                       target_cpu = i;
+                                       continue;
+                               }
+                               if (target_cpu != -1)
+                                       continue;
+
+
+                               /*
+                                * Case A.3: Backup ACTIVE CPU
+                                * Favor CPUs with:
+                                * - lower utilization due to other tasks
+                                * - lower utilization with the task in
+                                */
+                               if (wake_util > min_wake_util)
+                                       continue;
+                               if (new_util > best_active_util)
+                                       continue;
+                               min_wake_util = wake_util;
+                               best_active_util = new_util;
+                               best_active_cpu = i;
+                               continue;
+                       }
+
+                       /*
+                        * Enforce EAS mode
+                        *
+                        * For non latency sensitive tasks, skip CPUs that
+                        * will be overutilized by moving the task there.
+                        *
+                        * The goal here is to remain in EAS mode as long as
+                        * possible at least for !prefer_idle tasks.
+                        */
+                       if ((new_util * capacity_margin) >
+                           (capacity_orig * SCHED_CAPACITY_SCALE))
+                               continue;
+
+                       /*
+                        * Case B) Non latency sensitive tasks on IDLE CPUs.
+                        *
+                        * Find an optimal backup IDLE CPU for non latency
+                        * sensitive tasks.
+                        *
+                        * Looking for:
+                        * - minimizing the capacity_orig,
+                        *   i.e. preferring LITTLE CPUs
+                        * - favoring shallowest idle states
+                        *   i.e. avoid to wakeup deep-idle CPUs
+                        *
+                        * The following code path is used by non latency
+                        * sensitive tasks if IDLE CPUs are available. If at
+                        * least one such CPU is available it sets the
+                        * best_idle_cpu to the most suitable idle CPU to be
+                        * selected.
+                        *
+                        * If idle CPUs are available, favour these CPUs to
+                        * improve performances by spreading tasks.
+                        * Indeed, the energy_diff() computed by the caller
+                        * will take care to ensure the minimization of energy
+                        * consumptions without affecting performance.
+                        */
+                       if (idle_cpu(i)) {
+                               int idle_idx = idle_get_state_idx(cpu_rq(i));
+
+                               /* Select idle CPU with lower cap_orig */
+                               if (capacity_orig > best_idle_min_cap_orig)
+                                       continue;
+
+                               /*
+                                * Skip CPUs in deeper idle state, but only
+                                * if they are also less energy efficient.
+                                * IOW, prefer a deep IDLE LITTLE CPU vs a
+                                * shallow idle big CPU.
+                                */
+                               if (sysctl_sched_cstate_aware &&
+                                   best_idle_cstate <= idle_idx)
+                                       continue;
+
+                               /* Keep track of best idle CPU */
+                               best_idle_min_cap_orig = capacity_orig;
+                               best_idle_cstate = idle_idx;
+                               best_idle_cpu = i;
+                               continue;
+                       }
+
+                       /*
+                        * Case C) Non latency sensitive tasks on ACTIVE CPUs.
+                        *
+                        * Pack tasks in the most energy efficient capacities.
+                        *
+                        * This task packing strategy prefers more energy
+                        * efficient CPUs (i.e. pack on smaller maximum
+                        * capacity CPUs) while also trying to spread tasks to
+                        * run them all at the lower OPP.
+                        *
+                        * This assumes for example that it's more energy
+                        * efficient to run two tasks on two CPUs at a lower
+                        * OPP than packing both on a single CPU but running
+                        * that CPU at a higher OPP.
+                        *
+                        * Thus, this case keeps track of the CPU with the
+                        * smallest maximum capacity and highest spare maximum
+                        * capacity.
+                        */
+
+                       /* Favor CPUs with smaller capacity */
+                       if (capacity_orig > target_capacity)
+                               continue;
+
+                       /* Favor CPUs with maximum spare capacity */
+                       if ((capacity_orig - new_util) < target_max_spare_cap)
+                               continue;
+
+                       target_max_spare_cap = capacity_orig - new_util;
+                       target_capacity = capacity_orig;
+                       target_util = new_util;
+                       target_cpu = i;
+               }
+
+       } while (sg = sg->next, sg != sd->groups);
+
+       /*
+        * For non latency sensitive tasks, cases B and C in the previous loop,
+        * we pick the best IDLE CPU only if we were not able to find a target
+        * ACTIVE CPU.
+        *
+        * Policies priorities:
+        *
+        * - prefer_idle tasks:
+        *
+        *   a) IDLE CPU available, we return immediately
+        *   b) ACTIVE CPU where task fits and has the bigger maximum spare
+        *      capacity (i.e. target_cpu)
+        *   c) ACTIVE CPU with less contention due to other tasks
+        *      (i.e. best_active_cpu)
+        *
+        * - NON prefer_idle tasks:
+        *
+        *   a) ACTIVE CPU: target_cpu
+        *   b) IDLE CPU: best_idle_cpu
+        */
+       if (target_cpu == -1)
+               target_cpu = prefer_idle
+                       ? best_active_cpu
+                       : best_idle_cpu;
+       else
+               *backup_cpu = prefer_idle
+               ? best_active_cpu
+               : best_idle_cpu;
+
+       trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+                                    best_idle_cpu, best_active_cpu,
+                                    target_cpu);
+
+       /* it is possible for target and backup
+        * to select same CPU - if so, drop backup
+        */
+       if (*backup_cpu == target_cpu)
+               *backup_cpu = -1;
+
+       return target_cpu;
+}
+
+/*
+ * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
+ * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ *
+ * In that case WAKE_AFFINE doesn't make sense and we'll let
+ * BALANCE_WAKE sort things out.
+ */
+static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+{
+       long min_cap, max_cap;
+
+       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       /* Minimum capacity is close to max, no need to abort wake_affine */
+       if (max_cap - min_cap < max_cap >> 3)
+               return 0;
+
+       /* Bring task utilization in sync with prev_cpu */
+       sync_entity_load_avg(&p->se);
+
+       return task_fits_capacity(p, min_cap);
+}
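
The early bail-out in wake_cap() compares the capacity spread between the two CPUs against one eighth of the largest capacity in the root domain (max_cap >> 3, i.e. 12.5%); only when the system looks asymmetric does the task's actual fit get checked. A tiny sketch with example capacities:

    #include <stdio.h>

    int main(void)
    {
            long prev_cap = 1024, waker_cap = 430; /* example capacity_orig values  */
            long max_cap = 1024;                   /* rd->max_cpu_capacity (example) */
            long min_cap = prev_cap < waker_cap ? prev_cap : waker_cap;

            /* 1024 - 430 = 594 >= 128, so the capacities differ enough that
             * wake_affine is skipped and the task's fit is checked instead. */
            printf("asymmetric=%d\n", !(max_cap - min_cap < max_cap >> 3));
            return 0;
    }
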
+
+static bool cpu_overutilized(int cpu)
+{
+       return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin);
+}
+
+DEFINE_PER_CPU(struct energy_env, eenv_cache);
+
+/* kernels often have NR_CPUS defined to be much
+ * larger than exist in practise on booted systems.
+ * Allocate the cpu array for eenv calculations
+ * at boot time to avoid massive overprovisioning.
+ */
+#ifdef DEBUG_EENV_DECISIONS
+static inline int eenv_debug_size_per_dbg_entry(void)
+{
+       return sizeof(struct _eenv_debug) + (sizeof(unsigned long) * num_possible_cpus());
+}
+
+static inline int eenv_debug_size_per_cpu_entry(void)
+{
+       /* each cpu struct has an array of _eenv_debug structs
+        * which have an array of unsigned longs at the end -
+        * the allocation should be extended so that there are
+        * at least 'num_possible_cpus' entries in the array.
+        */
+       return EAS_EENV_DEBUG_LEVELS * eenv_debug_size_per_dbg_entry();
+}
+/* given a per-_eenv_cpu debug env ptr, get the ptr for a given index */
+static inline struct _eenv_debug *eenv_debug_entry_ptr(struct _eenv_debug *base, int idx)
+{
+       char *ptr = (char *)base;
+       ptr += (idx * eenv_debug_size_per_dbg_entry());
+       return (struct _eenv_debug *)ptr;
+}
+/* given a pointer to the per-cpu global copy of _eenv_debug, get
+ * a pointer to the specified _eenv_cpu debug env.
+ */
+static inline struct _eenv_debug *eenv_debug_percpu_debug_env_ptr(struct _eenv_debug *base, int cpu_idx)
+{
+       char *ptr = (char *)base;
+       ptr += (cpu_idx * eenv_debug_size_per_cpu_entry());
+       return (struct _eenv_debug *)ptr;
+}
+
+static inline int eenv_debug_size(void)
+{
+       return num_possible_cpus() * eenv_debug_size_per_cpu_entry();
+}
+#endif
+
+static inline void alloc_eenv(void)
+{
+       int cpu;
+       int cpu_count = num_possible_cpus();
+
+       for_each_possible_cpu(cpu) {
+               struct energy_env *eenv = &per_cpu(eenv_cache, cpu);
+               eenv->cpu = kmalloc(sizeof(struct eenv_cpu) * cpu_count, GFP_KERNEL);
+               eenv->eenv_cpu_count = cpu_count;
+#ifdef DEBUG_EENV_DECISIONS
+               eenv->debug = (struct _eenv_debug *)kmalloc(eenv_debug_size(), GFP_KERNEL);
+#endif
+       }
+}
+
+static inline void reset_eenv(struct energy_env *eenv)
+{
+       int cpu_count;
+       struct eenv_cpu *cpu;
+#ifdef DEBUG_EENV_DECISIONS
+       struct _eenv_debug *debug;
+       int cpu_idx;
+       debug = eenv->debug;
+#endif
+
+       cpu_count = eenv->eenv_cpu_count;
+       cpu = eenv->cpu;
+       memset(eenv, 0, sizeof(struct energy_env));
+       eenv->cpu = cpu;
+       memset(eenv->cpu, 0, sizeof(struct eenv_cpu)*cpu_count);
+       eenv->eenv_cpu_count = cpu_count;
+
+#ifdef DEBUG_EENV_DECISIONS
+       memset(debug, 0, eenv_debug_size());
+       eenv->debug = debug;
+       for (cpu_idx = 0; cpu_idx < eenv->eenv_cpu_count; cpu_idx++)
+               eenv->cpu[cpu_idx].debug = eenv_debug_percpu_debug_env_ptr(debug, cpu_idx);
+#endif
+}
+/*
+ * get_eenv - reset the eenv struct cached for this CPU
+ *
+ * When the eenv is returned, it is configured to do
+ * energy calculations for the maximum number of CPUs
+ * the task can be placed on. The prev_cpu entry is
+ * filled in here. Callers are responsible for adding
+ * other CPU candidates up to eenv->max_cpu_count.
+ */
+static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu)
+{
+       struct energy_env *eenv;
+       cpumask_t cpumask_possible_cpus;
+       int cpu = smp_processor_id();
+       int i;
+
+       eenv = &(per_cpu(eenv_cache, cpu));
+       reset_eenv(eenv);
+
+       /* populate eenv */
+       eenv->p = p;
+       /* use boosted task util for capacity selection
+        * during energy calculation, but unboosted task
+        * util for group utilization calculations
+        */
+       eenv->util_delta = task_util(p);
+       eenv->util_delta_boosted = boosted_task_util(p);
+
+       cpumask_and(&cpumask_possible_cpus, &p->cpus_allowed, cpu_online_mask);
+       eenv->max_cpu_count = cpumask_weight(&cpumask_possible_cpus);
+
+       for (i=0; i < eenv->max_cpu_count; i++)
+               eenv->cpu[i].cpu_id = -1;
+       eenv->cpu[EAS_CPU_PRV].cpu_id = prev_cpu;
+       eenv->next_idx = EAS_CPU_PRV;
+
+       return eenv;
+}
+
+/*
+ * Needs to be called inside rcu_read_lock critical section.
+ * sd is a pointer to the sched domain we wish to use for an
+ * energy-aware placement option.
+ */
+static int find_energy_efficient_cpu(struct sched_domain *sd,
+                                    struct task_struct *p,
+                                    int cpu, int prev_cpu,
+                                    int sync)
+{
+       int use_fbt = sched_feat(FIND_BEST_TARGET);
+       int cpu_iter, eas_cpu_idx = EAS_CPU_NXT;
+       int energy_cpu = -1;
+       struct energy_env *eenv;
+
+       if (sysctl_sched_sync_hint_enable && sync) {
+               if (cpumask_test_cpu(cpu, &p->cpus_allowed)) {
+                       return cpu;
+               }
+       }
+
+       /* prepopulate energy diff environment */
+       eenv = get_eenv(p, prev_cpu);
+       if (eenv->max_cpu_count < 2)
+               return energy_cpu;
+
+       if (!use_fbt) {
+               /*
+                * using this function outside wakeup balance will not supply
+                * an sd ptr. Instead, fetch the highest level with energy data.
+                */
+               if (!sd)
+                       sd = rcu_dereference(per_cpu(sd_ea, prev_cpu));
+
+               for_each_cpu_and(cpu_iter, &p->cpus_allowed, sched_domain_span(sd)) {
+                       unsigned long spare;
+
+                       /* prev_cpu already in list */
+                       if (cpu_iter == prev_cpu)
+                               continue;
+
+                       spare = capacity_spare_wake(cpu_iter, p);
+
+                       if (spare * 1024 < capacity_margin * task_util(p))
+                               continue;
+
+                       /* Add CPU candidate */
+                       eenv->cpu[eas_cpu_idx++].cpu_id = cpu_iter;
+                       eenv->max_cpu_count = eas_cpu_idx;
+
+                       /* stop adding CPUs if we have no space left */
+                       if (eas_cpu_idx >= eenv->eenv_cpu_count)
+                               break;
+               }
+       } else {
+               int boosted = (schedtune_task_boost(p) > 0);
+               int prefer_idle;
+
+               /*
+                * give compiler a hint that if sched_features
+                * cannot be changed, it is safe to optimise out
+                * all if(prefer_idle) blocks.
+                */
+               prefer_idle = sched_feat(EAS_PREFER_IDLE) ?
+                               (schedtune_prefer_idle(p) > 0) : 0;
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static int cpu_util(int cpu)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
+               eenv->max_cpu_count = EAS_CPU_BKP + 1;
 
-       return (util >= capacity) ? capacity : util;
-}
+               /* Find a cpu with sufficient capacity */
+               eenv->cpu[EAS_CPU_NXT].cpu_id = find_best_target(p,
+                               &eenv->cpu[EAS_CPU_BKP].cpu_id,
+                               boosted, prefer_idle);
 
-static inline int task_util(struct task_struct *p)
-{
-       return p->se.avg.util_avg;
-}
+               /* take note if no backup was found */
+               if (eenv->cpu[EAS_CPU_BKP].cpu_id < 0)
+                       eenv->max_cpu_count = EAS_CPU_BKP;
 
-/*
- * cpu_util_wake: Compute cpu utilization with any contributions from
- * the waking task p removed.
- */
-static int cpu_util_wake(int cpu, struct task_struct *p)
-{
-       unsigned long util, capacity;
+               /* take note if no target was found */
+                if (eenv->cpu[EAS_CPU_NXT].cpu_id < 0)
+                        eenv->max_cpu_count = EAS_CPU_NXT;
+       }
 
-       /* Task has no contribution or is new */
-       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
-               return cpu_util(cpu);
+       if (eenv->max_cpu_count == EAS_CPU_NXT) {
+               /*
+                * we did not find any energy-awareness
+                * candidates beyond prev_cpu, so we will
+                * fall-back to the regular slow-path.
+                */
+               return energy_cpu;
+       }
 
-       capacity = capacity_orig_of(cpu);
-       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+       /* find most energy-efficient CPU */
+       energy_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 :
+                                       eenv->cpu[eenv->next_idx].cpu_id;
 
-       return (util >= capacity) ? capacity : util;
+       return energy_cpu;
 }
 
+static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
+static void nohz_balancer_kick(bool only_update);
+
 /*
- * Disable WAKE_AFFINE in the case where task @p doesn't fit in the
- * capacity of either the waking CPU @cpu or the previous CPU @prev_cpu.
+ * wake_energy: Decide whether we want to use an energy-aware
+ * wakeup task placement or not. This only rules out situations
+ * where we cannot use energy-awareness right now.
  *
- * In that case WAKE_AFFINE doesn't make sense and we'll let
- * BALANCE_WAKE sort things out.
+ * Returns TRUE if we should attempt energy-aware wakeup, FALSE if not.
+ *
+ * Should only be called from select_task_rq_fair inside the RCU
+ * read-side critical section.
  */
-static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
+static inline int wake_energy(struct task_struct *p, int prev_cpu,
+                             int sd_flag, int wake_flags)
 {
-       long min_cap, max_cap;
+       struct sched_domain *sd = NULL;
+       int sync = wake_flags & WF_SYNC;
 
-       min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
-       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+       sd = rcu_dereference_sched(cpu_rq(prev_cpu)->sd);
 
-       /* Minimum capacity is close to max, no need to abort wake_affine */
-       if (max_cap - min_cap < max_cap >> 3)
-               return 0;
+       /*
+        * Check all definite no-energy-awareness conditions
+        */
+       if (!sd)
+               return false;
 
-       /* Bring task utilization in sync with prev_cpu */
-       sync_entity_load_avg(&p->se);
+       if (!energy_aware())
+               return false;
+
+       if (sd_overutilized(sd))
+               return false;
 
-       return min_cap * 1024 < task_util(p) * capacity_margin;
+       /*
+        * we cannot do energy-aware wakeup placement sensibly
+        * for tasks with 0 utilization, so let them be placed
+        * according to the normal strategy.
+        * However, if find_best_target (fbt) is in use we may still
+        * benefit from the heuristics we use there in selecting
+        * candidate CPUs.
+        */
+       if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util(p)))
+               return false;
+
+       if (!sched_feat(EAS_PREFER_IDLE)) {
+               /*
+                * Force prefer-idle tasks into the slow path; this may not
+                * happen if none of the sd flags matched.
+                */
+               if (schedtune_prefer_idle(p) > 0 && !sync)
+                       return false;
+       }
+       return true;
 }
 
 /*
@@ -5938,21 +7473,28 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
  * preempt must be disabled.
  */
 static int
-select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags)
+select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_flags,
+                   int sibling_count_hint)
 {
-       struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
+       struct sched_domain *tmp, *affine_sd = NULL;
+       struct sched_domain *sd = NULL, *energy_sd = NULL;
        int cpu = smp_processor_id();
        int new_cpu = prev_cpu;
        int want_affine = 0;
+       int want_energy = 0;
        int sync = wake_flags & WF_SYNC;
 
+       rcu_read_lock();
+
        if (sd_flag & SD_BALANCE_WAKE) {
                record_wakee(p);
-               want_affine = !wake_wide(p) && !wake_cap(p, cpu, prev_cpu)
-                             && cpumask_test_cpu(cpu, &p->cpus_allowed);
+               want_energy = wake_energy(p, prev_cpu, sd_flag, wake_flags);
+               want_affine = !want_energy &&
+                             !wake_wide(p, sibling_count_hint) &&
+                             !wake_cap(p, cpu, prev_cpu) &&
+                             cpumask_test_cpu(cpu, &p->cpus_allowed);
        }
 
-       rcu_read_lock();
        for_each_domain(cpu, tmp) {
                if (!(tmp->flags & SD_LOAD_BALANCE))
                        break;
@@ -5967,9 +7509,22 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        break;
                }
 
+               /*
+                * If we are able to try an energy-aware wakeup,
+                * select the highest non-overutilized sched domain
+                * which includes this cpu and prev_cpu.
+                *
+                * XXX: maybe we want to not test prev_cpu and only
+                * consider the current one?
+                */
+               if (want_energy &&
+                   !sd_overutilized(tmp) &&
+                   cpumask_test_cpu(prev_cpu, sched_domain_span(tmp)))
+                       energy_sd = tmp;
+
                if (tmp->flags & sd_flag)
                        sd = tmp;
-               else if (!want_affine)
+               else if (!(want_affine || want_energy))
                        break;
        }
 
@@ -5982,47 +7537,38 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
                        new_cpu = cpu;
        }
 
+       if (sd && !(sd_flag & SD_BALANCE_FORK)) {
+               /*
+                * We're going to need the task's util for capacity_spare_wake
+                * in find_idlest_group. Sync it up to prev_cpu's
+                * last_update_time.
+                */
+               sync_entity_load_avg(&p->se);
+       }
+
        if (!sd) {
- pick_cpu:
+pick_cpu:
                if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */
                        new_cpu = select_idle_sibling(p, prev_cpu, new_cpu);
 
-       } else while (sd) {
-               struct sched_group *group;
-               int weight;
-
-               if (!(sd->flags & sd_flag)) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               group = find_idlest_group(sd, p, cpu, sd_flag);
-               if (!group) {
-                       sd = sd->child;
-                       continue;
-               }
-
-               new_cpu = find_idlest_cpu(group, p, cpu);
-               if (new_cpu == -1 || new_cpu == cpu) {
-                       /* Now try balancing at a lower domain level of cpu */
-                       sd = sd->child;
-                       continue;
-               }
+       } else {
+               if (energy_sd)
+                       new_cpu = find_energy_efficient_cpu(energy_sd, p, cpu, prev_cpu, sync);
 
-               /* Now try balancing at a lower domain level of new_cpu */
-               cpu = new_cpu;
-               weight = sd->span_weight;
-               sd = NULL;
-               for_each_domain(cpu, tmp) {
-                       if (weight <= tmp->span_weight)
-                               break;
-                       if (tmp->flags & sd_flag)
-                               sd = tmp;
-               }
-               /* while loop will break here if sd == NULL */
+               /*
+                * If we did an energy-aware placement and had no choices
+                * available, then fall back to the default find_idlest_cpu
+                * choice.
+                */
+               if (!energy_sd || new_cpu == -1)
+                       new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag);
        }
+
        rcu_read_unlock();
 
+#ifdef CONFIG_NO_HZ_COMMON
+       if (nohz_kick_needed(cpu_rq(new_cpu), true))
+               nohz_balancer_kick(true);
+#endif
+
        return new_cpu;
 }
 
@@ -6247,6 +7793,29 @@ preempt:
                set_last_buddy(se);
 }
 
+static inline void update_misfit_task(struct rq *rq, struct task_struct *p)
+{
+#ifdef CONFIG_SMP
+       rq->misfit_task = !task_fits_capacity(p, capacity_of(rq->cpu));
+#endif
+}
+
+static inline void clear_rq_misfit(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       rq->misfit_task = 0;
+#endif
+}
+
+static inline unsigned int rq_has_misfit(struct rq *rq)
+{
+#ifdef CONFIG_SMP
+       return rq->misfit_task;
+#else
+       return 0;
+#endif
+}
+
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -6337,6 +7906,8 @@ again:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
+       update_misfit_task(rq, p);
+
        return p;
 simple:
 #endif
@@ -6354,9 +7925,12 @@ simple:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
+       update_misfit_task(rq, p);
+
        return p;
 
 idle:
+       clear_rq_misfit(rq);
        new_tasks = idle_balance(rq, rf);
 
        /*
@@ -6562,6 +8136,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
 
 enum fbq_type { regular, remote, all };
 
+enum group_type {
+       group_other = 0,
+       group_misfit_task,
+       group_imbalanced,
+       group_overloaded,
+};
+
 #define LBF_ALL_PINNED 0x01
 #define LBF_NEED_BREAK 0x02
 #define LBF_DST_PINNED  0x04
@@ -6580,6 +8161,7 @@ struct lb_env {
        int                     new_dst_cpu;
        enum cpu_idle_type      idle;
        long                    imbalance;
+       unsigned int            src_grp_nr_running;
        /* The set of CPUs under consideration for load-balancing */
        struct cpumask          *cpus;
 
@@ -6590,6 +8172,7 @@ struct lb_env {
        unsigned int            loop_max;
 
        enum fbq_type           fbq_type;
+       enum group_type         src_grp_type;
        struct list_head        tasks;
 };
 
@@ -7003,6 +8586,10 @@ static void update_blocked_averages(int cpu)
                if (cfs_rq_is_decayed(cfs_rq))
                        list_del_leaf_cfs_rq(cfs_rq);
        }
+       update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+#ifdef CONFIG_NO_HZ_COMMON
+       rq->last_blocked_load_update_tick = jiffies;
+#endif
        rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7062,6 +8649,10 @@ static inline void update_blocked_averages(int cpu)
        rq_lock_irqsave(rq, &rf);
        update_rq_clock(rq);
        update_cfs_rq_load_avg(cfs_rq_clock_task(cfs_rq), cfs_rq);
+       update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
+#ifdef CONFIG_NO_HZ_COMMON
+       rq->last_blocked_load_update_tick = jiffies;
+#endif
        rq_unlock_irqrestore(rq, &rf);
 }
 
@@ -7073,12 +8664,6 @@ static unsigned long task_h_load(struct task_struct *p)
 
 /********** Helpers for find_busiest_group ************************/
 
-enum group_type {
-       group_other = 0,
-       group_imbalanced,
-       group_overloaded,
-};
-
 /*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
@@ -7094,6 +8679,7 @@ struct sg_lb_stats {
        unsigned int group_weight;
        enum group_type group_type;
        int group_no_capacity;
+       int group_misfit_task; /* A cpu has a task too big for its capacity */
 #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
        unsigned int nr_preferred_running;
@@ -7110,6 +8696,7 @@ struct sd_lb_stats {
        unsigned long total_running;
        unsigned long total_load;       /* Total load of all groups in sd */
        unsigned long total_capacity;   /* Total capacity of all groups in sd */
+       unsigned long total_util;       /* Total util of all groups in sd */
        unsigned long avg_load; /* Average load across all groups in sd */
 
        struct sg_lb_stats busiest_stat;/* Statistics of the busiest group */
@@ -7130,6 +8717,7 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
                .total_running = 0UL,
                .total_load = 0UL,
                .total_capacity = 0UL,
+               .total_util = 0UL,
                .busiest_stat = {
                        .avg_load = 0UL,
                        .sum_nr_running = 0,
@@ -7215,7 +8803,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 {
        struct sched_domain *child = sd->child;
        struct sched_group *group, *sdg = sd->groups;
-       unsigned long capacity, min_capacity;
+       unsigned long capacity, min_capacity, max_capacity;
        unsigned long interval;
 
        interval = msecs_to_jiffies(sd->balance_interval);
@@ -7229,6 +8817,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
        capacity = 0;
        min_capacity = ULONG_MAX;
+       max_capacity = 0;
 
        if (child->flags & SD_OVERLAP) {
                /*
@@ -7259,6 +8848,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
                        }
 
                        min_capacity = min(capacity, min_capacity);
+                       max_capacity = max(capacity, max_capacity);
                }
        } else  {
                /*
@@ -7272,12 +8862,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
 
                        capacity += sgc->capacity;
                        min_capacity = min(sgc->min_capacity, min_capacity);
+                       max_capacity = max(sgc->max_capacity, max_capacity);
                        group = group->next;
                } while (group != child->groups);
        }
 
        sdg->sgc->capacity = capacity;
        sdg->sgc->min_capacity = min_capacity;
+       sdg->sgc->max_capacity = max_capacity;
 }
 
 /*
@@ -7383,6 +8975,19 @@ group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
                                                ref->sgc->min_capacity * 1024;
 }
 
+/*
+ * group_similar_cpu_capacity: Returns true if the minimum capacities of the
+ * compared groups differ by less than 12.5%.
+ */
+static inline bool
+group_similar_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+       long diff = sg->sgc->min_capacity - ref->sgc->min_capacity;
+       long max = max(sg->sgc->min_capacity, ref->sgc->min_capacity);
+
+       return abs(diff) < max >> 3;
+}
+
 static inline enum
 group_type group_classify(struct sched_group *group,
                          struct sg_lb_stats *sgs)
@@ -7393,6 +8998,9 @@ group_type group_classify(struct sched_group *group,
        if (sg_imbalanced(group))
                return group_imbalanced;
 
+       if (sgs->group_misfit_task)
+               return group_misfit_task;
+
        return group_other;
 }
 
@@ -7404,11 +9012,12 @@ group_type group_classify(struct sched_group *group,
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
  * @overload: Indicate more than one runnable task for any CPU.
+ * @overutilized: Indicate overutilization for any CPU.
+ * @misfit_task: Indicate a misfit (too big for its CPU) task on any CPU.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
                        struct sched_group *group, int load_idx,
                        int local_group, struct sg_lb_stats *sgs,
-                       bool *overload)
+                       bool *overload, bool *overutilized, bool *misfit_task)
 {
        unsigned long load;
        int i, nr_running;
@@ -7442,6 +9051,17 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                 */
                if (!nr_running && idle_cpu(i))
                        sgs->idle_cpus++;
+
+               if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+                   !sgs->group_misfit_task && rq_has_misfit(rq))
+                       sgs->group_misfit_task = capacity_of(i);
+
+               if (cpu_overutilized(i)) {
+                       *overutilized = true;
+
+                       if (rq_has_misfit(rq))
+                               *misfit_task = true;
+               }
        }
 
        /* Adjust by relative CPU capacity of the group */
@@ -7477,6 +9097,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 {
        struct sg_lb_stats *busiest = &sds->busiest_stat;
 
+       /*
+        * Don't try to pull misfit tasks we can't help.
+        */
+       if (sgs->group_type == group_misfit_task &&
+           (!group_smaller_cpu_capacity(sg, sds->local) ||
+            !group_has_capacity(env, &sds->local_stat)))
+               return false;
+
        if (sgs->group_type > busiest->group_type)
                return true;
 
@@ -7499,6 +9127,15 @@ static bool update_sd_pick_busiest(struct lb_env *env,
            group_smaller_cpu_capacity(sds->local, sg))
                return false;
 
+       /*
+        * Candidate sg doesn't face any severe imbalance issues, so
+        * don't disturb it unless the groups are of similar capacity,
+        * where balancing does less harm.
+        */
+       if (sgs->group_type == group_other &&
+               !group_similar_cpu_capacity(sds->local, sg))
+               return false;
+
 asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
@@ -7556,6 +9193,18 @@ static inline enum fbq_type fbq_classify_rq(struct rq *rq)
 }
 #endif /* CONFIG_NUMA_BALANCING */
 
+#ifdef CONFIG_NO_HZ_COMMON
+static struct {
+       cpumask_var_t idle_cpus_mask;
+       atomic_t nr_cpus;
+       unsigned long next_balance;     /* in jiffy units */
+       unsigned long next_update;     /* in jiffy units */
+} nohz ____cacheline_aligned;
+#endif
+
+#define lb_sd_parent(sd) \
+       (sd->parent && sd->parent->groups != sd->parent->groups->next)
+
 /**
  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
  * @env: The load balancing environment.
@@ -7568,11 +9217,35 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
        struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
-       bool overload = false;
+       bool overload = false, overutilized = false, misfit_task = false;
 
        if (child && child->flags & SD_PREFER_SIBLING)
                prefer_sibling = 1;
 
+#ifdef CONFIG_NO_HZ_COMMON
+       if (env->idle == CPU_NEWLY_IDLE) {
+               int cpu;
+
+               /* Update the stats of NOHZ idle CPUs in the sd */
+               for_each_cpu_and(cpu, sched_domain_span(env->sd),
+                                nohz.idle_cpus_mask) {
+                       struct rq *rq = cpu_rq(cpu);
+
+                       /* ... Unless we've already done so since the last tick */
+                       if (time_after(jiffies,
+                                       rq->last_blocked_load_update_tick))
+                               update_blocked_averages(cpu);
+               }
+       }
+       /*
+        * If we've just updated all of the NOHZ idle CPUs, then we can push
+        * back the next nohz.next_update, which will prevent an unnecessary
+        * wakeup for the nohz stats kick
+        */
+       if (cpumask_subset(nohz.idle_cpus_mask, sched_domain_span(env->sd)))
+               nohz.next_update = jiffies + LOAD_AVG_PERIOD;
+#endif
+
        load_idx = get_sd_load_idx(env->sd, env->idle);
 
        do {
@@ -7590,7 +9263,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                }
 
                update_sg_lb_stats(env, sg, load_idx, local_group, sgs,
-                                               &overload);
+                                               &overload, &overutilized,
+                                               &misfit_task);
 
                if (local_group)
                        goto next_group;
@@ -7622,6 +9296,7 @@ next_group:
                sds->total_running += sgs->sum_nr_running;
                sds->total_load += sgs->group_load;
                sds->total_capacity += sgs->group_capacity;
+               sds->total_util += sgs->group_util;
 
                sg = sg->next;
        } while (sg != env->sd->groups);
@@ -7629,11 +9304,52 @@ next_group:
        if (env->sd->flags & SD_NUMA)
                env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 
-       if (!env->sd->parent) {
+       env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
+
+       if (!lb_sd_parent(env->sd)) {
                /* update overload indicator if we are at root domain */
                if (env->dst_rq->rd->overload != overload)
                        env->dst_rq->rd->overload = overload;
        }
+
+       if (overutilized)
+               set_sd_overutilized(env->sd);
+       else
+               clear_sd_overutilized(env->sd);
+
+       /*
+        * If there is a misfit task on one cpu in this sched_domain,
+        * it is likely that the imbalance cannot be sorted out among
+        * the cpus in this sched_domain. In this case set the
+        * overutilized flag at the parent sched_domain.
+        */
+       if (misfit_task) {
+               struct sched_domain *sd = env->sd->parent;
+
+               /*
+                * In case of a misfit task, load balance at the parent
+                * sched domain level will make sense only if the cpus
+                * have a different capacity. If cpus at a domain level have
+                * the same capacity, the misfit task cannot be well
+                * accommodated in any of the cpus and there is no point in
+                * trying a load balance at this level.
+                */
+               while (sd) {
+                       if (sd->flags & SD_ASYM_CPUCAPACITY) {
+                               set_sd_overutilized(sd);
+                               break;
+                       }
+                       sd = sd->parent;
+               }
+       }
+
+       /*
+        * If the domain util is greater than the domain capacity, load balancing
+        * needs to be done at the next sched domain level as well.
+        */
+       if (lb_sd_parent(env->sd) &&
+           sds->total_capacity * 1024 < sds->total_util * capacity_margin)
+               set_sd_overutilized(env->sd->parent);
 }
 
 /**
@@ -7782,8 +9498,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
         * factors in sg capacity and sgs with smaller group_type are
         * skipped when updating the busiest sg:
         */
-       if (busiest->avg_load <= sds->avg_load ||
-           local->avg_load >= sds->avg_load) {
+       if (busiest->group_type != group_misfit_task &&
+           (busiest->avg_load <= sds->avg_load ||
+            local->avg_load >= sds->avg_load)) {
                env->imbalance = 0;
                return fix_small_imbalance(env, sds);
        }
@@ -7817,6 +9534,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                (sds->avg_load - local->avg_load) * local->group_capacity
        ) / SCHED_CAPACITY_SCALE;
 
+       /* Boost imbalance to allow misfit task to be balanced. */
+       if (busiest->group_type == group_misfit_task) {
+               env->imbalance = max_t(long, env->imbalance,
+                                      busiest->group_misfit_task);
+       }
+
        /*
         * if *imbalance is less than the average load per runnable task
         * there is no guarantee that any tasks will be moved so we'll have
@@ -7852,6 +9575,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
         * this level.
         */
        update_sd_lb_stats(env, &sds);
+
+       if (energy_aware() && !sd_overutilized(env->sd))
+               goto out_balanced;
+
        local = &sds.local_stat;
        busiest = &sds.busiest_stat;
 
@@ -7875,11 +9602,18 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
        if (busiest->group_type == group_imbalanced)
                goto force_balance;
 
-       /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
-       if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) &&
+       /*
+        * When dst_cpu is idle, prevent SMP nice and/or asymmetric group
+        * capacities from resulting in underutilization due to avg_load.
+        */
+       if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) &&
            busiest->group_no_capacity)
                goto force_balance;
 
+       /* Misfitting tasks should be dealt with regardless of the avg load */
+       if (busiest->group_type == group_misfit_task)
+               goto force_balance;
+
        /*
         * If the local group is busier than the selected busiest group
         * don't try and pull any tasks.
@@ -7917,6 +9651,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
 
 force_balance:
        /* Looks like there is an imbalance. Compute it */
+       env->src_grp_type = busiest->group_type;
        calculate_imbalance(env, &sds);
        return sds.busiest;
 
@@ -7964,6 +9699,13 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                if (rt > env->fbq_type)
                        continue;
 
+               /*
+                * For ASYM_CPUCAPACITY domains with misfit tasks we ignore
+                * load.
+                */
+               if (env->src_grp_type == group_misfit_task && rq_has_misfit(rq))
+                       return rq;
+
                capacity = capacity_of(i);
 
                wl = weighted_cpuload(rq);
@@ -8033,6 +9775,14 @@ static int need_active_balance(struct lb_env *env)
                        return 1;
        }
 
+       if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
+           (capacity_orig_of(env->src_cpu) < capacity_orig_of(env->dst_cpu)) &&
+           env->src_rq->cfs.h_nr_running == 1 &&
+           cpu_overutilized(env->src_cpu) &&
+           !cpu_overutilized(env->dst_cpu)) {
+               return 1;
+       }
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -8085,7 +9835,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                        int *continue_balancing)
 {
        int ld_moved, cur_ld_moved, active_balance = 0;
-       struct sched_domain *sd_parent = sd->parent;
+       struct sched_domain *sd_parent = lb_sd_parent(sd) ? sd->parent : NULL;
        struct sched_group *group;
        struct rq *busiest;
        struct rq_flags rf;
@@ -8251,7 +10001,8 @@ more_balance:
                 * excessive cache_hot migrations and active balances.
                 */
                if (idle != CPU_NEWLY_IDLE)
-                       sd->nr_balance_failed++;
+                       if (env.src_grp_nr_running > 1)
+                               sd->nr_balance_failed++;
 
                if (need_active_balance(&env)) {
                        unsigned long flags;
@@ -8347,6 +10098,7 @@ static inline unsigned long
 get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
 {
        unsigned long interval = sd->balance_interval;
+       unsigned int cpu;
 
        if (cpu_busy)
                interval *= sd->busy_factor;
@@ -8355,6 +10107,24 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
        interval = msecs_to_jiffies(interval);
        interval = clamp(interval, 1UL, max_load_balance_interval);
 
+       /*
+        * Check if the sched domain is marked as overutilized. We ought to
+        * only do this on systems which have SD_ASYM_CPUCAPACITY, but we
+        * want to do it for all sched domains in those systems, so for now
+        * just use the overutilized flag as a proxy.
+        *
+        * If we are overutilized and we have a misfit task, then we want
+        * to balance as soon as practically possible, so we return the
+        * shortest possible interval (one jiffy).
+        */
+       if (energy_aware() && sd_overutilized(sd)) {
+               /* we know the domain is overutilized, so check it for a misfit task */
+               for_each_cpu(cpu, sched_domain_span(sd)) {
+                       if (rq_has_misfit(cpu_rq(cpu)))
+                               return 1;
+               }
+       }
        return interval;
 }
 
@@ -8588,11 +10358,6 @@ static inline int on_null_domain(struct rq *rq)
  *   needed, they will kick the idle load balancer, which then does idle
  *   load balancing for all the idle CPUs.
  */
-static struct {
-       cpumask_var_t idle_cpus_mask;
-       atomic_t nr_cpus;
-       unsigned long next_balance;     /* in jiffy units */
-} nohz ____cacheline_aligned;
 
 static inline int find_new_ilb(void)
 {
@@ -8609,7 +10374,7 @@ static inline int find_new_ilb(void)
  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
  * CPU (if there is one).
  */
-static void nohz_balancer_kick(void)
+static void nohz_balancer_kick(bool only_update)
 {
        int ilb_cpu;
 
@@ -8622,6 +10387,10 @@ static void nohz_balancer_kick(void)
 
        if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
                return;
+
+       if (only_update)
+               set_bit(NOHZ_STATS_KICK, nohz_flags(ilb_cpu));
+
        /*
         * Use smp_send_reschedule() instead of resched_cpu().
         * This way we generate a sched IPI on the target cpu which
@@ -8709,6 +10478,8 @@ void nohz_balance_enter_idle(int cpu)
        atomic_inc(&nohz.nr_cpus);
        set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
 }
+#else
+static inline void nohz_balancer_kick(bool only_update) {}
 #endif
 
 static DEFINE_SPINLOCK(balancing);
@@ -8740,8 +10511,6 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
        int need_serialize, need_decay = 0;
        u64 max_cost = 0;
 
-       update_blocked_averages(cpu);
-
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                /*
@@ -8756,6 +10525,9 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                }
                max_cost += sd->max_newidle_lb_cost;
 
+               if (energy_aware() && !sd_overutilized(sd))
+                       continue;
+
                if (!(sd->flags & SD_LOAD_BALANCE))
                        continue;
 
@@ -8840,6 +10612,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
 {
        int this_cpu = this_rq->cpu;
        struct rq *rq;
+       struct sched_domain *sd;
        int balance_cpu;
        /* Earliest time when we have to do rebalance again */
        unsigned long next_balance = jiffies + 60*HZ;
@@ -8849,6 +10622,23 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
            !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
                goto end;
 
+       /*
+        * This cpu is going to update the blocked load of idle CPUs either
+        * before doing a rebalancing or just to keep metrics up to date. We
+        * can safely update the next update timestamp.
+        */
+       rcu_read_lock();
+       sd = rcu_dereference(this_rq->sd);
+       /*
+        * Check whether there is a sched_domain available for this cpu.
+        * The last other cpu may have been unplugged since the ILB was
+        * triggered and the sched_domain can now be NULL. The idle balance
+        * sequence will quickly be aborted as there are no more idle CPUs.
+        */
+       if (sd)
+               nohz.next_update = jiffies + msecs_to_jiffies(LOAD_AVG_PERIOD);
+       rcu_read_unlock();
+
        for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
                if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
                        continue;
@@ -8875,7 +10665,15 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle)
                        cpu_load_update_idle(rq);
                        rq_unlock_irq(rq, &rf);
 
-                       rebalance_domains(rq, CPU_IDLE);
+                       update_blocked_averages(balance_cpu);
+                       /*
+                        * This idle load balance softirq may have been
+                        * triggered only to update the blocked load and shares
+                        * of idle CPUs (which we have just done for
+                        * balance_cpu). In that case skip the actual balance.
+                        */
+                       if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_cpu)))
+                               rebalance_domains(rq, idle);
                }
 
                if (time_after(next_balance, rq->next_balance)) {
@@ -8906,7 +10704,7 @@ end:
  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
  *     domain span are idle.
  */
-static inline bool nohz_kick_needed(struct rq *rq)
+static inline bool nohz_kick_needed(struct rq *rq, bool only_update)
 {
        unsigned long now = jiffies;
        struct sched_domain_shared *sds;
@@ -8914,7 +10712,7 @@ static inline bool nohz_kick_needed(struct rq *rq)
        int nr_busy, i, cpu = rq->cpu;
        bool kick = false;
 
-       if (unlikely(rq->idle_balance))
+       if (unlikely(rq->idle_balance) && !only_update)
                return false;
 
        /*
@@ -8931,15 +10729,27 @@ static inline bool nohz_kick_needed(struct rq *rq)
        if (likely(!atomic_read(&nohz.nr_cpus)))
                return false;
 
+       if (only_update) {
+               if (time_before(now, nohz.next_update))
+                       return false;
+               else
+                       return true;
+       }
+
        if (time_before(now, nohz.next_balance))
                return false;
 
-       if (rq->nr_running >= 2)
+       if (rq->nr_running >= 2 &&
+           (!energy_aware() || cpu_overutilized(cpu)))
                return true;
 
+       /* Do idle load balance if there is a misfit task */
+       if (energy_aware())
+               return rq_has_misfit(rq);
+
        rcu_read_lock();
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
-       if (sds) {
+       if (sds && !energy_aware()) {
                /*
                 * XXX: write a coherent comment on why we do this.
                 * See also: http://lkml.kernel.org/r/20111202010832.602203411@sbsiddha-desk.sc.intel.com
@@ -8980,6 +10790,7 @@ unlock:
 }
 #else
 static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { }
+static inline bool nohz_kick_needed(struct rq *rq, bool only_update) { return false; }
 #endif
 
 /*
@@ -9001,7 +10812,14 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h)
         * and abort nohz_idle_balance altogether if we pull some load.
         */
        nohz_idle_balance(this_rq, idle);
+       update_blocked_averages(this_rq->cpu);
+#ifdef CONFIG_NO_HZ_COMMON
+       if (!test_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu)))
+               rebalance_domains(this_rq, idle);
+       clear_bit(NOHZ_STATS_KICK, nohz_flags(this_rq->cpu));
+#else
        rebalance_domains(this_rq, idle);
+#endif
 }
 
 /*
@@ -9016,8 +10834,8 @@ void trigger_load_balance(struct rq *rq)
        if (time_after_eq(jiffies, rq->next_balance))
                raise_softirq(SCHED_SOFTIRQ);
 #ifdef CONFIG_NO_HZ_COMMON
-       if (nohz_kick_needed(rq))
-               nohz_balancer_kick();
+       if (nohz_kick_needed(rq, false))
+               nohz_balancer_kick(false);
 #endif
 }
 
@@ -9053,6 +10871,10 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
 
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
+
+       update_misfit_task(rq, curr);
+
+       update_overutilized_status(rq);
 }
 
 /*
@@ -9596,8 +11418,11 @@ __init void init_sched_fair_class(void)
 
 #ifdef CONFIG_NO_HZ_COMMON
        nohz.next_balance = jiffies;
+       nohz.next_update = jiffies;
        zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
 #endif
+
+       alloc_eenv();
 #endif /* SMP */
 
 }
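
The overutilization check added at the end of update_sd_lb_stats() above compares the domain's summed utilization against its summed capacity scaled by capacity_margin. A minimal standalone sketch of that arithmetic, assuming a capacity_margin of 1280 (roughly 20% headroom); the margin value itself is not defined in this hunk:

#include <stdio.h>

int main(void)
{
        unsigned long capacity_margin = 1280;   /* assumed default, ~20% headroom */
        unsigned long total_capacity = 2048;    /* e.g. two CPUs of capacity 1024 */
        unsigned long total_util = 1700;

        /* same comparison as the lb_sd_parent() check in update_sd_lb_stats() */
        if (total_capacity * 1024 < total_util * capacity_margin)
                printf("domain overutilized: util %lu above ~%lu\n",
                       total_util, total_capacity * 1024 / capacity_margin);

        return 0;
}

With these numbers the domain is flagged once total utilization crosses about 80% of total capacity (1638 out of 2048 here), which is what triggers set_sd_overutilized() on the parent domain.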
index 9552fd5854bffc2c07ec77225a1aed9176da53cb..306333beea5f28935a2586a7bbe498fe8f2b83e1 100644 (file)
@@ -85,3 +85,31 @@ SCHED_FEAT(ATTACH_AGE_LOAD, true)
 SCHED_FEAT(WA_IDLE, true)
 SCHED_FEAT(WA_WEIGHT, true)
 SCHED_FEAT(WA_BIAS, true)
+
+/*
+ * Energy-aware scheduling. Use the platform energy model to guide scheduling
+ * decisions, optimizing for energy efficiency.
+ */
+#ifdef CONFIG_DEFAULT_USE_ENERGY_AWARE
+SCHED_FEAT(ENERGY_AWARE, true)
+#else
+SCHED_FEAT(ENERGY_AWARE, false)
+#endif
+
+/*
+ * Energy aware scheduling algorithm choices:
+ * EAS_PREFER_IDLE
+ *   Direct tasks in a schedtune.prefer_idle=1 group through
+ *   the EAS path for wakeup task placement. Otherwise, put
+ *   those tasks through the mainline slow path.
+ * FIND_BEST_TARGET
+ *   Limit the number of placement options for which we calculate
+ *   energy by using heuristics to select 'best idle' and
+ *   'best active' cpu options.
+ * FBT_STRICT_ORDER
+ *   ON: If the target CPU saves any energy, use that.
+ *   OFF: Use whichever of target or backup saves most.
+ */
+SCHED_FEAT(EAS_PREFER_IDLE, true)
+SCHED_FEAT(FIND_BEST_TARGET, true)
+SCHED_FEAT(FBT_STRICT_ORDER, true)
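
These entries are read through the usual sched_feat() macro, and with CONFIG_SCHED_DEBUG they can normally be toggled at runtime by writing the feature name (or its NO_ prefixed form) to /sys/kernel/debug/sched_features. A toy, userspace-only model of how the wake_energy() gate in the fair.c hunk consults ENERGY_AWARE and FIND_BEST_TARGET; the booleans and helper below are stand-ins, not kernel APIs:

#include <stdbool.h>
#include <stdio.h>

/* stand-ins for the feature flags declared above */
static bool energy_aware = true;        /* ENERGY_AWARE */
static bool find_best_target = true;    /* FIND_BEST_TARGET */

/* toy model of the wake_energy() gate from the fair.c hunk earlier */
static bool use_eas_wakeup(bool sd_overutilized, unsigned long task_util)
{
        if (!energy_aware)
                return false;
        if (sd_overutilized)
                return false;
        /* zero-util tasks only take the EAS path when FIND_BEST_TARGET is on */
        if (!find_best_target && !task_util)
                return false;
        return true;
}

int main(void)
{
        printf("idle domain, util 200 -> %d\n", use_eas_wakeup(false, 200));    /* 1 */
        printf("overutilized domain   -> %d\n", use_eas_wakeup(true, 200));     /* 0 */
        return 0;
}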
index 257f4f0b4532b0428a8ff3c72513b0a54cb0029d..c1dbb2cc1f29c8a936f5a0fe4b1a467c10e2b9b5 100644 (file)
@@ -25,9 +25,10 @@ extern char __cpuidle_text_start[], __cpuidle_text_end[];
  * sched_idle_set_state - Record idle state for the current CPU.
  * @idle_state: State to record.
  */
-void sched_idle_set_state(struct cpuidle_state *idle_state)
+void sched_idle_set_state(struct cpuidle_state *idle_state, int index)
 {
        idle_set_state(this_rq(), idle_state);
+       idle_set_state_idx(this_rq(), index);
 }
 
 static int __read_mostly cpu_idle_force_poll;
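
The extra index recorded by sched_idle_set_state() is what the energy-model plumbing elsewhere in this patch (idle_get_state_idx() in sched.h, the idle_state_power() accessor in topology.c) uses to look up a group's idle power. A rough standalone sketch of that lookup, with made-up power numbers rather than values from any real platform:

#include <stdio.h>

struct idle_state { unsigned long power; };

/* hypothetical per-group idle-state power table */
static struct idle_state idle_states[] = {
        { .power = 50 },        /* shallow, WFI-like state */
        { .power = 10 },        /* deeper cluster-off state */
};

int main(void)
{
        int idle_state_idx = 1; /* as recorded via idle_set_state_idx() */

        /* mirrors the idle_state_power(s, i) accessor defined in topology.c */
        printf("group idle power = %lu\n", idle_states[idle_state_idx].power);
        return 0;
}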
index d518664cce4f1105376610aa0ec35bc3b4cd49d8..609abdf1c93885e8c64c771f6b3ab138bafd52e6 100644 (file)
@@ -10,7 +10,8 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_idle(struct task_struct *p, int cpu, int sd_flag, int flags,
+                   int sibling_count_hint)
 {
        return task_cpu(p); /* IDLE tasks are never migrated */
 }
index 298f62b8662d4a3f0838f04dd7f77e21e39593a5..2b51986afd81cfcb87834648e06c5a4cc7e7c04c 100644 (file)
@@ -9,6 +9,8 @@
 #include <linux/slab.h>
 #include <linux/irq_work.h>
 
+#include "walt.h"
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
@@ -1324,6 +1326,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
                rt_se->timeout = 0;
 
        enqueue_rt_entity(rt_se, flags);
+       walt_inc_cumulative_runnable_avg(rq, p);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1335,6 +1338,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se, flags);
+       walt_dec_cumulative_runnable_avg(rq, p);
 
        dequeue_pushable_task(rq, p);
 }
@@ -1377,7 +1381,8 @@ static void yield_task_rt(struct rq *rq)
 static int find_lowest_rq(struct task_struct *task);
 
 static int
-select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
+                 int sibling_count_hint)
 {
        struct task_struct *curr;
        struct rq *rq;
@@ -1524,6 +1529,8 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
        return p;
 }
 
+extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
+
 static struct task_struct *
 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -1569,6 +1576,10 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 
        queue_push_tasks(rq);
 
+       if (p)
+               update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
+                                       rq->curr->sched_class == &rt_sched_class);
+
        return p;
 }
 
@@ -1576,6 +1587,8 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
        update_curr_rt(rq);
 
+       update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
+
        /*
         * The previous task needs to be made eligible for pushing
         * if it is still active
@@ -1843,7 +1856,9 @@ retry:
        }
 
        deactivate_task(rq, next_task, 0);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, lowest_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(lowest_rq, next_task, 0);
        ret = 1;
 
@@ -2115,7 +2130,9 @@ static void pull_rt_task(struct rq *this_rq)
                        resched = true;
 
                        deactivate_task(src_rq, p, 0);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
@@ -2295,6 +2312,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
        struct sched_rt_entity *rt_se = &p->rt;
 
        update_curr_rt(rq);
+       update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
 
        watchdog(rq, p);
 
index 307c35d3366097cd42701a17e9927f7431c15bef..453ce4530dd9f0cf1821237eff2053e8a7f5dc0d 100644 (file)
@@ -483,6 +483,10 @@ struct cfs_rq {
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
        int runtime_enabled;
        u64 runtime_expires;
@@ -524,6 +528,9 @@ struct rt_rq {
        unsigned long rt_nr_total;
        int overloaded;
        struct plist_head pushable_tasks;
+
+       struct sched_avg avg;
+
 #endif /* CONFIG_SMP */
        int rt_queued;
 
@@ -653,6 +660,9 @@ struct root_domain {
        struct cpupri cpupri;
 
        unsigned long max_cpu_capacity;
+
+       /* First cpu with maximum and minimum original capacity */
+       int max_cap_orig_cpu, min_cap_orig_cpu;
 };
 
 extern struct root_domain def_root_domain;
@@ -694,6 +704,7 @@ struct rq {
 #ifdef CONFIG_NO_HZ_COMMON
 #ifdef CONFIG_SMP
        unsigned long last_load_update_tick;
+       unsigned long last_blocked_load_update_tick;
 #endif /* CONFIG_SMP */
        unsigned long nohz_flags;
 #endif /* CONFIG_NO_HZ_COMMON */
@@ -743,6 +754,8 @@ struct rq {
        struct callback_head *balance_callback;
 
        unsigned char idle_balance;
+
+       unsigned int misfit_task;
        /* For active balancing */
        int active_balance;
        int push_cpu;
@@ -762,6 +775,20 @@ struct rq {
        u64 max_idle_balance_cost;
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+       u64 window_start;
+       u64 curr_runnable_sum;
+       u64 prev_runnable_sum;
+       u64 nt_curr_runnable_sum;
+       u64 nt_prev_runnable_sum;
+       u64 cur_irqload;
+       u64 avg_irqload;
+       u64 irqload_ts;
+       u64 cum_window_demand;
+#endif /* CONFIG_SCHED_WALT */
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
@@ -809,6 +836,7 @@ struct rq {
 #ifdef CONFIG_CPU_IDLE
        /* Must be inspected within a rcu lock section */
        struct cpuidle_state *idle_state;
+       int idle_state_idx;
 #endif
 };
 
@@ -1067,6 +1095,8 @@ DECLARE_PER_CPU(int, sd_llc_id);
 DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DECLARE_PER_CPU(struct sched_domain *, sd_numa);
 DECLARE_PER_CPU(struct sched_domain *, sd_asym);
+DECLARE_PER_CPU(struct sched_domain *, sd_ea);
+DECLARE_PER_CPU(struct sched_domain *, sd_scs);
 
 struct sched_group_capacity {
        atomic_t ref;
@@ -1076,6 +1106,7 @@ struct sched_group_capacity {
         */
        unsigned long capacity;
        unsigned long min_capacity; /* Min per-CPU capacity in group */
+       unsigned long max_capacity; /* Max per-CPU capacity in group */
        unsigned long next_update;
        int imbalance; /* XXX unrelated to capacity but shared group state */
 
@@ -1093,6 +1124,7 @@ struct sched_group {
        unsigned int group_weight;
        struct sched_group_capacity *sgc;
        int asym_prefer_cpu;            /* cpu of highest priority in group */
+       const struct sched_group_energy *sge;
 
        /*
         * The CPUs this group covers.
@@ -1433,7 +1465,8 @@ struct sched_class {
        void (*put_prev_task) (struct rq *rq, struct task_struct *p);
 
 #ifdef CONFIG_SMP
-       int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags);
+       int  (*select_task_rq)(struct task_struct *p, int task_cpu, int sd_flag, int flags,
+                              int sibling_count_hint);
        void (*migrate_task_rq)(struct task_struct *p);
 
        void (*task_woken) (struct rq *this_rq, struct task_struct *task);
@@ -1520,6 +1553,17 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
        SCHED_WARN_ON(!rcu_read_lock_held());
        return rq->idle_state;
 }
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+       rq->idle_state_idx = idle_state_idx;
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+       WARN_ON(!rcu_read_lock_held());
+       return rq->idle_state_idx;
+}
 #else
 static inline void idle_set_state(struct rq *rq,
                                  struct cpuidle_state *idle_state)
@@ -1530,6 +1574,15 @@ static inline struct cpuidle_state *idle_get_state(struct rq *rq)
 {
        return NULL;
 }
+
+static inline void idle_set_state_idx(struct rq *rq, int idle_state_idx)
+{
+}
+
+static inline int idle_get_state_idx(struct rq *rq)
+{
+       return -1;
+}
 #endif
 
 extern void schedule_idle(void);
@@ -1666,6 +1719,7 @@ static inline int hrtick_enabled(struct rq *rq)
 
 #ifdef CONFIG_SMP
 extern void sched_avg_update(struct rq *rq);
+extern unsigned long sched_get_rt_rq_util(int cpu);
 
 #ifndef arch_scale_freq_capacity
 static __always_inline
@@ -1686,6 +1740,91 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+
+#ifdef CONFIG_SCHED_WALT
+#define walt_util(util_var, demand_sum) {\
+       u64 sum = demand_sum << SCHED_CAPACITY_SHIFT;\
+       do_div(sum, walt_ravg_window);\
+       util_var = (typeof(util_var))sum;\
+       }
+#endif
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               walt_util(util, cpu_rq(cpu)->cumulative_runnable_avg);
+       }
+#endif
+       delta += util;
+       if (delta < 0)
+               return 0;
+
+       return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+       return __cpu_util(cpu, 0);
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               walt_util(util, cpu_rq(cpu)->prev_runnable_sum);
+       }
+#endif
+       return (util >= capacity) ? capacity : util;
+}
+
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
        rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -1991,6 +2130,7 @@ extern void cfs_bandwidth_usage_dec(void);
 enum rq_nohz_flag_bits {
        NOHZ_TICK_STOPPED,
        NOHZ_BALANCE_KICK,
+       NOHZ_STATS_KICK
 };
 
 #define nohz_flags(cpu)        (&cpu_rq(cpu)->nohz_flags)
@@ -2002,6 +2142,9 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 
 #ifdef CONFIG_SMP
+
+extern void init_energy_aware_data(int cpu);
+
 static inline
 void __dl_update(struct dl_bw *dl_b, s64 bw)
 {
@@ -2095,6 +2238,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_SCHED_WALT
+
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+       return cpu_of(rq) == task_cpu(p) &&
+              (p->on_rq || p->last_sleep_ts >= rq->window_start);
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
 #ifdef arch_scale_freq_capacity
 #ifndef arch_scale_freq_invariant
 #define arch_scale_freq_invariant()    (true)
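
The walt_util() helper above rescales a WALT demand sum (nanoseconds of tracked busy time within the sampling window) into the same [0..SCHED_CAPACITY_SCALE] range as PELT utilization. A standalone sketch of that conversion, assuming a 20 ms window; the window length is a WALT tunable and is not defined in this hunk:

#include <stdio.h>

int main(void)
{
        unsigned long long walt_ravg_window = 20000000ULL;     /* assumed 20 ms window, in ns */
        unsigned long long demand_sum = 10000000ULL;           /* 10 ms of tracked busy time */

        /* same scaling as walt_util(): (demand << SCHED_CAPACITY_SHIFT) / window */
        unsigned long long util = (demand_sum << 10) / walt_ravg_window;

        printf("util = %llu\n", util);  /* 512, i.e. half of SCHED_CAPACITY_SCALE */
        return 0;
}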
index 45caf90b24cd9693a72b943220ecd0176580f748..7ca03e528c45d3224f8662a09d2a485162e886cf 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "sched.h"
+#include "walt.h"
 
 /*
  * stop-task scheduling class.
@@ -12,7 +13,8 @@
 
 #ifdef CONFIG_SMP
 static int
-select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags)
+select_task_rq_stop(struct task_struct *p, int cpu, int sd_flag, int flags,
+                   int sibling_count_hint)
 {
        return task_cpu(p); /* stop tasks never migrate */
 }
@@ -43,12 +45,14 @@ static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        add_nr_running(rq, 1);
+       walt_inc_cumulative_runnable_avg(rq, p);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        sub_nr_running(rq, 1);
+       walt_dec_cumulative_runnable_avg(rq, p);
 }
 
 static void yield_task_stop(struct rq *rq)
index 659e075ef70b7d5c43dc78567946837e1071f911..ad78fd4a66e5710c2f1b7d8b44364b49e58f878f 100644 (file)
@@ -39,9 +39,6 @@ static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level,
 
        if (!(sd->flags & SD_LOAD_BALANCE)) {
                printk("does not load-balance\n");
-               if (sd->parent)
-                       printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain"
-                                       " has parent");
                return -1;
        }
 
@@ -154,8 +151,12 @@ static inline bool sched_debug(void)
 
 static int sd_degenerate(struct sched_domain *sd)
 {
-       if (cpumask_weight(sched_domain_span(sd)) == 1)
-               return 1;
+       if (cpumask_weight(sched_domain_span(sd)) == 1) {
+               if (sd->groups->sge)
+                       sd->flags &= ~SD_LOAD_BALANCE;
+               else
+                       return 1;
+       }
 
        /* Following flags need at least 2 groups */
        if (sd->flags & (SD_LOAD_BALANCE |
@@ -165,7 +166,8 @@ static int sd_degenerate(struct sched_domain *sd)
                         SD_SHARE_CPUCAPACITY |
                         SD_ASYM_CPUCAPACITY |
                         SD_SHARE_PKG_RESOURCES |
-                        SD_SHARE_POWERDOMAIN)) {
+                        SD_SHARE_POWERDOMAIN |
+                        SD_SHARE_CAP_STATES)) {
                if (sd->groups != sd->groups->next)
                        return 0;
        }
@@ -198,7 +200,12 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                SD_SHARE_CPUCAPACITY |
                                SD_SHARE_PKG_RESOURCES |
                                SD_PREFER_SIBLING |
-                               SD_SHARE_POWERDOMAIN);
+                               SD_SHARE_POWERDOMAIN |
+                               SD_SHARE_CAP_STATES);
+               if (parent->groups->sge) {
+                       parent->flags &= ~SD_LOAD_BALANCE;
+                       return 0;
+               }
                if (nr_node_ids == 1)
                        pflags &= ~SD_SERIALIZE;
        }
@@ -294,6 +301,9 @@ static int init_rootdomain(struct root_domain *rd)
 
        if (cpupri_init(&rd->cpupri) != 0)
                goto free_cpudl;
+
+       rd->max_cap_orig_cpu = rd->min_cap_orig_cpu = -1;
+
        return 0;
 
 free_cpudl:
@@ -405,11 +415,14 @@ DEFINE_PER_CPU(int, sd_llc_id);
 DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
 DEFINE_PER_CPU(struct sched_domain *, sd_numa);
 DEFINE_PER_CPU(struct sched_domain *, sd_asym);
+DEFINE_PER_CPU(struct sched_domain *, sd_ea);
+DEFINE_PER_CPU(struct sched_domain *, sd_scs);
 
 static void update_top_cache_domain(int cpu)
 {
        struct sched_domain_shared *sds = NULL;
        struct sched_domain *sd;
+       struct sched_domain *ea_sd = NULL;
        int id = cpu;
        int size = 1;
 
@@ -430,6 +443,17 @@ static void update_top_cache_domain(int cpu)
 
        sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
        rcu_assign_pointer(per_cpu(sd_asym, cpu), sd);
+
+       for_each_domain(cpu, sd) {
+               if (sd->groups->sge)
+                       ea_sd = sd;
+               else
+                       break;
+       }
+       rcu_assign_pointer(per_cpu(sd_ea, cpu), ea_sd);
+
+       sd = highest_flag_domain(cpu, SD_SHARE_CAP_STATES);
+       rcu_assign_pointer(per_cpu(sd_scs, cpu), sd);
 }
 
 /*
@@ -714,6 +738,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
        sg_span = sched_group_span(sg);
        sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
        sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
+       sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
 }
 
 static int
@@ -962,6 +987,108 @@ next:
        update_group_capacity(sd, cpu);
 }
 
+#define cap_state_power(s,i) (s->cap_states[i].power)
+#define cap_state_cap(s,i) (s->cap_states[i].cap)
+#define idle_state_power(s,i) (s->idle_states[i].power)
+
+static inline int sched_group_energy_equal(const struct sched_group_energy *a,
+               const struct sched_group_energy *b)
+{
+       int i;
+
+       /* check pointers first */
+       if (a == b)
+               return true;
+
+       /* check contents are equivalent */
+       if (a->nr_cap_states != b->nr_cap_states)
+               return false;
+       if (a->nr_idle_states != b->nr_idle_states)
+               return false;
+       for (i = 0; i < a->nr_cap_states; i++) {
+               if (cap_state_power(a,i) !=
+                       cap_state_power(b,i))
+                       return false;
+               if (cap_state_cap(a,i) !=
+                       cap_state_cap(b,i))
+                       return false;
+       }
+       for (i = 0; i < a->nr_idle_states; i++) {
+               if (idle_state_power(a,i) !=
+                       idle_state_power(b,i))
+                       return false;
+       }
+
+       return true;
+}
+
+#define energy_eff(e, n) \
+    ((e->cap_states[n].cap << SCHED_CAPACITY_SHIFT)/e->cap_states[n].power)
+
+static void init_sched_groups_energy(int cpu, struct sched_domain *sd,
+                                    sched_domain_energy_f fn)
+{
+       struct sched_group *sg = sd->groups;
+       const struct sched_group_energy *sge;
+       int i;
+
+       if (!(fn && fn(cpu)))
+               return;
+
+       if (cpu != group_balance_cpu(sg))
+               return;
+
+       if (sd->flags & SD_OVERLAP) {
+               pr_err("BUG: EAS does not support overlapping sd spans\n");
+#ifdef CONFIG_SCHED_DEBUG
+               pr_err("     the %s domain has SD_OVERLAP set\n", sd->name);
+#endif
+               return;
+       }
+
+       if (sd->child && !sd->child->groups->sge) {
+               pr_err("BUG: EAS setup broken for CPU%d\n", cpu);
+#ifdef CONFIG_SCHED_DEBUG
+               pr_err("     energy data on %s but not on %s domain\n",
+                       sd->name, sd->child->name);
+#endif
+               return;
+       }
+
+       sge = fn(cpu);
+
+       /*
+        * Check that the per-cpu provided sd energy data is consistent for all
+        * cpus within the mask.
+        */
+       if (cpumask_weight(sched_group_span(sg)) > 1) {
+               struct cpumask mask;
+
+               cpumask_xor(&mask, sched_group_span(sg), get_cpu_mask(cpu));
+
+               for_each_cpu(i, &mask)
+                       BUG_ON(!sched_group_energy_equal(sge,fn(i)));
+       }
+
+       /*
+        * Check that energy efficiency (capacity/power) is monotonically
+        * decreasing in the capacity state vector with higher indexes.
+        */
+       for (i = 0; i < (sge->nr_cap_states - 1); i++) {
+               if (energy_eff(sge, i) > energy_eff(sge, i+1))
+                       continue;
+#ifdef CONFIG_SCHED_DEBUG
+               pr_warn("WARN: cpu=%d, domain=%s: incr. energy eff %lu[%d]->%lu[%d]\n",
+                       cpu, sd->name, energy_eff(sge, i), i,
+                       energy_eff(sge, i+1), i+1);
+#else
+               pr_warn("WARN: cpu=%d: incr. energy eff %lu[%d]->%lu[%d]\n",
+                       cpu, energy_eff(sge, i), i, energy_eff(sge, i+1), i+1);
+#endif
+       }
+
+       sd->groups->sge = fn(cpu);
+}
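
The energy_eff() check above enforces that capacity/power (scaled by SCHED_CAPACITY_SHIFT) never increases with the capacity-state index; otherwise a higher OPP would be both faster and more efficient, which usually points at bad platform energy data. The following is a minimal user-space sketch of the same check, not part of the patch, with invented cap/power values (real ones come from the platform's sched_domain_topology energy callbacks):

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

struct cap_state { unsigned long cap, power; };

int main(void)
{
	struct cap_state cs[] = {
		{ 430, 100 },	/* low OPP: eff = (430 << 10) / 100  = 4403 */
		{ 710, 220 },	/* mid OPP: eff = (710 << 10) / 220  = 3304 */
		{ 1024, 520 },	/* top OPP: eff = (1024 << 10) / 520 = 2016 */
	};
	int n = sizeof(cs) / sizeof(cs[0]), i;

	for (i = 0; i < n - 1; i++) {
		unsigned long eff_i  = (cs[i].cap << SCHED_CAPACITY_SHIFT) / cs[i].power;
		unsigned long eff_i1 = (cs[i + 1].cap << SCHED_CAPACITY_SHIFT) / cs[i + 1].power;

		/* Efficiency must strictly decrease towards higher OPPs */
		if (eff_i <= eff_i1)
			printf("WARN: efficiency increases from state %d to %d\n", i, i + 1);
	}
	return 0;
}

With the values above the efficiencies are 4403, 3304 and 2016, so no warning is printed.
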
+
 /*
  * Initializers for schedule domains
  * Non-inlined to reduce accumulated stack pressure in build_sched_domains()
@@ -1081,6 +1208,7 @@ static int sched_domains_curr_level;
  *   SD_NUMA                - describes NUMA topologies
  *   SD_SHARE_POWERDOMAIN   - describes shared power domain
  *   SD_ASYM_CPUCAPACITY    - describes mixed capacity topologies
+ *   SD_SHARE_CAP_STATES    - describes shared capacity states
  *
  * Odd one out, which beside describing the topology has a quirk also
  * prescribes the desired behaviour that goes along with it:
@@ -1093,7 +1221,8 @@ static int sched_domains_curr_level;
         SD_NUMA |                      \
         SD_ASYM_PACKING |              \
         SD_ASYM_CPUCAPACITY |          \
-        SD_SHARE_POWERDOMAIN)
+        SD_SHARE_POWERDOMAIN |         \
+        SD_SHARE_CAP_STATES)
 
 static struct sched_domain *
 sd_init(struct sched_domain_topology_level *tl,
@@ -1202,15 +1331,11 @@ sd_init(struct sched_domain_topology_level *tl,
                sd->idle_idx = 1;
        }
 
-       /*
-        * For all levels sharing cache; connect a sched_domain_shared
-        * instance.
-        */
-       if (sd->flags & SD_SHARE_PKG_RESOURCES) {
-               sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
-               atomic_inc(&sd->shared->ref);
+       sd->shared = *per_cpu_ptr(sdd->sds, sd_id);
+       atomic_inc(&sd->shared->ref);
+
+       if (sd->flags & SD_SHARE_PKG_RESOURCES)
                atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
-       }
 
        sd->private = sdd;
 
@@ -1669,8 +1794,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                                *per_cpu_ptr(d.sd, i) = sd;
                        if (tl->flags & SDTL_OVERLAP)
                                sd->flags |= SD_OVERLAP;
-                       if (cpumask_equal(cpu_map, sched_domain_span(sd)))
-                               break;
                }
        }
 
@@ -1690,10 +1813,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 
        /* Calculate CPU capacity for physical packages and nodes */
        for (i = nr_cpumask_bits-1; i >= 0; i--) {
+               struct sched_domain_topology_level *tl = sched_domain_topology;
+
                if (!cpumask_test_cpu(i, cpu_map))
                        continue;
 
-               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) {
+               for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent, tl++) {
+                       init_sched_groups_energy(i, sd, tl->energy);
                        claim_allocations(i, sd);
                        init_sched_groups_capacity(i, sd);
                }
@@ -1702,6 +1828,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
        /* Attach the domains */
        rcu_read_lock();
        for_each_cpu(i, cpu_map) {
+               int max_cpu = READ_ONCE(d.rd->max_cap_orig_cpu);
+               int min_cpu = READ_ONCE(d.rd->min_cap_orig_cpu);
+
                rq = cpu_rq(i);
                sd = *per_cpu_ptr(d.sd, i);
 
@@ -1709,6 +1838,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
                if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
                        WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
 
+               if ((max_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig >
+                   cpu_rq(max_cpu)->cpu_capacity_orig))
+                       WRITE_ONCE(d.rd->max_cap_orig_cpu, i);
+
+               if ((min_cpu < 0) || (cpu_rq(i)->cpu_capacity_orig <
+                   cpu_rq(min_cpu)->cpu_capacity_orig))
+                       WRITE_ONCE(d.rd->min_cap_orig_cpu, i);
+
                cpu_attach_domain(sd, d.rd, i);
        }
        rcu_read_unlock();
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c
new file mode 100644 (file)
index 0000000..2e6ef5f
--- /dev/null
@@ -0,0 +1,559 @@
+#include <linux/cgroup.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <linux/printk.h>
+#include <linux/rcupdate.h>
+#include <linux/slab.h>
+
+#include <trace/events/sched.h>
+
+#include "sched.h"
+#include "tune.h"
+
+bool schedtune_initialized = false;
+extern struct reciprocal_value schedtune_spc_rdiv;
+
+/*
+ * EAS scheduler tunables for task groups.
+ */
+
+/* SchedTune tunables for a group of tasks */
+struct schedtune {
+       /* SchedTune CGroup subsystem */
+       struct cgroup_subsys_state css;
+
+       /* Boost group allocated ID */
+       int idx;
+
+       /* Boost value for tasks on that SchedTune CGroup */
+       int boost;
+
+       /* Hint to bias scheduling of tasks on that SchedTune CGroup
+        * towards idle CPUs */
+       int prefer_idle;
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+       return css ? container_of(css, struct schedtune, css) : NULL;
+}
+
+static inline struct schedtune *task_schedtune(struct task_struct *tsk)
+{
+       return css_st(task_css(tsk, schedtune_cgrp_id));
+}
+
+static inline struct schedtune *parent_st(struct schedtune *st)
+{
+       return css_st(st->css.parent);
+}
+
+/*
+ * SchedTune root control group
+ * The root control group is used to define a system-wide boost tuning,
+ * which is applied to all tasks in the system.
+ * Task-specific boost tuning can be specified by creating and
+ * configuring a child control group under the root one.
+ * By default, system-wide boosting is disabled, i.e. no boosting is applied
+ * to tasks which are not in a child control group.
+ */
+static struct schedtune
+root_schedtune = {
+       .boost  = 0,
+       .prefer_idle = 0,
+};
+
+/*
+ * Maximum number of boost groups to support
+ * When per-task boosting is used we still allow only a limited number of
+ * boost groups for two main reasons:
+ * 1. on a real system we usually have only a few classes of workloads which
+ *    it makes sense to boost with different values (e.g. background vs
+ *    foreground tasks, interactive vs low-priority tasks)
+ * 2. a limited number allows for a simpler and more memory/time-efficient
+ *    implementation, especially for the computation of the per-CPU boost
+ *    value
+ */
+#define BOOSTGROUPS_COUNT 5
+
+/* Array of configured boostgroups */
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+       &root_schedtune,
+       NULL,
+};
+
+/* SchedTune boost groups
+ * Keep track of all the boost groups which impact each CPU, for example when a
+ * CPU has two RUNNABLE tasks belonging to two different boost groups and thus
+ * likely with different boost values.
+ * Since on each system we expect only a limited number of boost groups, here
+ * we use a simple array to keep track of the metrics required to compute the
+ * maximum per-CPU boosting value.
+ */
+struct boost_groups {
+       /* Maximum boost value for all RUNNABLE tasks on a CPU */
+       bool idle;
+       int boost_max;
+       struct {
+               /* The boost for tasks on that boost group */
+               int boost;
+               /* Count of RUNNABLE tasks on that boost group */
+               unsigned tasks;
+       } group[BOOSTGROUPS_COUNT];
+       /* CPU's boost group locking */
+       raw_spinlock_t lock;
+};
+
+/* Boost groups affecting each CPU in the system */
+DEFINE_PER_CPU(struct boost_groups, cpu_boost_groups);
+
+static void
+schedtune_cpu_update(int cpu)
+{
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       int boost_max;
+       int idx;
+
+       /* The root boost group is always active */
+       boost_max = bg->group[0].boost;
+       for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
+               /*
+                * A boost group affects a CPU only if it has
+                * RUNNABLE tasks on that CPU
+                */
+               if (bg->group[idx].tasks == 0)
+                       continue;
+
+               boost_max = max(boost_max, bg->group[idx].boost);
+       }
+       /* Ensure boost_max is non-negative when all cgroup boost values
+        * are negative. This avoids under-accounting of CPU capacity, which
+        * may cause task stacking and frequency spikes. */
+       boost_max = max(boost_max, 0);
+       bg->boost_max = boost_max;
+}
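
A toy illustration of the boost_max rule implemented above, using invented per-group boost values and task counts: only groups that currently have RUNNABLE tasks on the CPU contribute, the root group always counts, and the result is clamped at zero. This is an editorial sketch, not part of the patch:

#include <stdio.h>

#define BOOSTGROUPS_COUNT 5

int main(void)
{
	int boost[BOOSTGROUPS_COUNT] = { -10, 25, 50, 5, 0 };	/* group 0 is root */
	unsigned tasks[BOOSTGROUPS_COUNT] = { 0, 3, 0, 1, 0 };
	int boost_max = boost[0];		/* root group is always active */
	int idx;

	for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx) {
		if (tasks[idx] == 0)		/* group without RUNNABLE tasks is ignored */
			continue;
		if (boost[idx] > boost_max)
			boost_max = boost[idx];
	}
	if (boost_max < 0)			/* never under-provision capacity */
		boost_max = 0;

	printf("boost_max = %d\n", boost_max);	/* prints 25: group 2 (boost 50) has no tasks */
	return 0;
}
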
+
+static int
+schedtune_boostgroup_update(int idx, int boost)
+{
+       struct boost_groups *bg;
+       int cur_boost_max;
+       int old_boost;
+       int cpu;
+
+       /* Update per CPU boost groups */
+       for_each_possible_cpu(cpu) {
+               bg = &per_cpu(cpu_boost_groups, cpu);
+
+               /*
+                * Keep track of current boost values to compute the per CPU
+                * maximum only when it has been affected by the new value of
+                * the updated boost group
+                */
+               cur_boost_max = bg->boost_max;
+               old_boost = bg->group[idx].boost;
+
+               /* Update the boost value of this boost group */
+               bg->group[idx].boost = boost;
+
+               /* Check if this update increases the current max */
+               if (boost > cur_boost_max && bg->group[idx].tasks) {
+                       bg->boost_max = boost;
+                       trace_sched_tune_boostgroup_update(cpu, 1, bg->boost_max);
+                       continue;
+               }
+
+               /* Check if this update has decreased the current max */
+               if (cur_boost_max == old_boost && old_boost > boost) {
+                       schedtune_cpu_update(cpu);
+                       trace_sched_tune_boostgroup_update(cpu, -1, bg->boost_max);
+                       continue;
+               }
+
+               trace_sched_tune_boostgroup_update(cpu, 0, bg->boost_max);
+       }
+
+       return 0;
+}
+
+#define ENQUEUE_TASK  1
+#define DEQUEUE_TASK -1
+
+static inline void
+schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
+{
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       int tasks = bg->group[idx].tasks + task_count;
+
+       /* Update the boosted tasks count without letting it go negative */
+       bg->group[idx].tasks = max(0, tasks);
+
+       trace_sched_tune_tasks_update(p, cpu, tasks, idx,
+                       bg->group[idx].boost, bg->boost_max);
+
+       /* Boost group activation or deactivation on that RQ */
+       if (tasks == 1 || tasks == 0)
+               schedtune_cpu_update(cpu);
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_enqueue_task(struct task_struct *p, int cpu)
+{
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       unsigned long irq_flags;
+       struct schedtune *st;
+       int idx;
+
+       if (unlikely(!schedtune_initialized))
+               return;
+
+       /*
+        * Boost group accounting is protected by a per-cpu lock and requires
+        * interrupts to be disabled to avoid race conditions, for example with
+        * do_exit()::cgroup_exit() and task migration.
+        */
+       raw_spin_lock_irqsave(&bg->lock, irq_flags);
+       rcu_read_lock();
+
+       st = task_schedtune(p);
+       idx = st->idx;
+
+       schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
+       rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+       struct boost_groups *bg;
+       struct rq_flags rq_flags;
+       unsigned int cpu;
+       struct rq *rq;
+       int src_bg; /* Source boost group index */
+       int dst_bg; /* Destination boost group index */
+       int tasks;
+
+       if (unlikely(!schedtune_initialized))
+               return 0;
+
+
+       cgroup_taskset_for_each(task, css, tset) {
+
+               /*
+                * Lock the RQ of the CPU the task is enqueued on, to avoid
+                * races with migration code while the task is being
+                * accounted.
+                */
+               rq = task_rq_lock(task, &rq_flags);
+
+               if (!task->on_rq) {
+                       task_rq_unlock(rq, task, &rq_flags);
+                       continue;
+               }
+
+               /*
+                * Boost group accounting is protected by a per-cpu lock and
+                * requires interrupts to be disabled to avoid race conditions.
+                */
+               cpu = cpu_of(rq);
+               bg = &per_cpu(cpu_boost_groups, cpu);
+               raw_spin_lock(&bg->lock);
+
+               dst_bg = css_st(css)->idx;
+               src_bg = task_schedtune(task)->idx;
+
+               /*
+                * Current task is not changing boostgroup, which can
+                * happen when the new hierarchy is in use.
+                */
+               if (unlikely(dst_bg == src_bg)) {
+                       raw_spin_unlock(&bg->lock);
+                       task_rq_unlock(rq, task, &rq_flags);
+                       continue;
+               }
+
+               /*
+                * This is the case of a RUNNABLE task which is switching its
+                * current boost group.
+                */
+
+               /* Move task from src to dst boost group */
+               tasks = bg->group[src_bg].tasks - 1;
+               bg->group[src_bg].tasks = max(0, tasks);
+               bg->group[dst_bg].tasks += 1;
+
+               raw_spin_unlock(&bg->lock);
+               task_rq_unlock(rq, task, &rq_flags);
+
+               /* Update CPU boost group */
+               if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+                       schedtune_cpu_update(task_cpu(task));
+
+       }
+
+       return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+       /* This can happen only if the SchedTune controller is mounted with
+        * other hierarchies and one of them fails. Since SchedTune is usually
+        * mounted on its own hierarchy, for the time being we do not implement
+        * a proper rollback mechanism. */
+       WARN(1, "SchedTune cancel attach not implemented");
+}
+
+/*
+ * NOTE: This function must be called while holding the lock on the CPU RQ
+ */
+void schedtune_dequeue_task(struct task_struct *p, int cpu)
+{
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       unsigned long irq_flags;
+       struct schedtune *st;
+       int idx;
+
+       if (unlikely(!schedtune_initialized))
+               return;
+
+       /*
+        * Boost group accounting is protected by a per-cpu lock and requires
+        * interrupts to be disabled to avoid race conditions.
+        */
+       raw_spin_lock_irqsave(&bg->lock, irq_flags);
+       rcu_read_lock();
+
+       st = task_schedtune(p);
+       idx = st->idx;
+
+       schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
+       rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+int schedtune_cpu_boost(int cpu)
+{
+       struct boost_groups *bg;
+
+       bg = &per_cpu(cpu_boost_groups, cpu);
+       return bg->boost_max;
+}
+
+int schedtune_task_boost(struct task_struct *p)
+{
+       struct schedtune *st;
+       int task_boost;
+
+       if (unlikely(!schedtune_initialized))
+               return 0;
+
+       /* Get task boost value */
+       rcu_read_lock();
+       st = task_schedtune(p);
+       task_boost = st->boost;
+       rcu_read_unlock();
+
+       return task_boost;
+}
+
+int schedtune_prefer_idle(struct task_struct *p)
+{
+       struct schedtune *st;
+       int prefer_idle;
+
+       if (unlikely(!schedtune_initialized))
+               return 0;
+
+       /* Get prefer_idle value */
+       rcu_read_lock();
+       st = task_schedtune(p);
+       prefer_idle = st->prefer_idle;
+       rcu_read_unlock();
+
+       return prefer_idle;
+}
+
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       struct schedtune *st = css_st(css);
+
+       return st->prefer_idle;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+           u64 prefer_idle)
+{
+       struct schedtune *st = css_st(css);
+       st->prefer_idle = prefer_idle;
+
+       return 0;
+}
+
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+       struct schedtune *st = css_st(css);
+
+       return st->boost;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+           s64 boost)
+{
+       struct schedtune *st = css_st(css);
+
+       if (boost < 0 || boost > 100)
+               return -EINVAL;
+
+       st->boost = boost;
+
+       /* Update CPU boost */
+       schedtune_boostgroup_update(st->idx, st->boost);
+
+       return 0;
+}
+
+static struct cftype files[] = {
+       {
+               .name = "boost",
+               .read_s64 = boost_read,
+               .write_s64 = boost_write,
+       },
+       {
+               .name = "prefer_idle",
+               .read_u64 = prefer_idle_read,
+               .write_u64 = prefer_idle_write,
+       },
+       { }     /* terminate */
+};
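
For reference, once the controller is mounted these cftypes show up as schedtune.boost and schedtune.prefer_idle attribute files in each SchedTune cgroup. The following is a hypothetical user-space usage sketch; the /dev/stune/foreground mount point is an assumption (a common Android convention), not something created by this patch:

#include <stdio.h>

int main(void)
{
	/* Path is assumed; adjust to wherever the controller is mounted */
	FILE *f = fopen("/dev/stune/foreground/schedtune.boost", "w");

	if (!f) {
		perror("fopen");
		return 1;
	}
	fprintf(f, "%d\n", 10);		/* boost_write() accepts values 0..100 */
	fclose(f);
	return 0;
}
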
+
+static int
+schedtune_boostgroup_init(struct schedtune *st)
+{
+       struct boost_groups *bg;
+       int cpu;
+
+       /* Keep track of allocated boost groups */
+       allocated_group[st->idx] = st;
+
+       /* Initialize the per CPU boost groups */
+       for_each_possible_cpu(cpu) {
+               bg = &per_cpu(cpu_boost_groups, cpu);
+               bg->group[st->idx].boost = 0;
+               bg->group[st->idx].tasks = 0;
+               raw_spin_lock_init(&bg->lock);
+       }
+
+       return 0;
+}
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+       struct schedtune *st;
+       int idx;
+
+       if (!parent_css)
+               return &root_schedtune.css;
+
+       /* Allow only single-level hierarchies */
+       if (parent_css != &root_schedtune.css) {
+               pr_err("Nested SchedTune boosting groups not allowed\n");
+               return ERR_PTR(-ENOMEM);
+       }
+
+       /* Allow only a limited number of boosting groups */
+       for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+               if (!allocated_group[idx])
+                       break;
+       if (idx == BOOSTGROUPS_COUNT) {
+               pr_err("Trying to create more than %d SchedTune boosting groups\n",
+                      BOOSTGROUPS_COUNT);
+               return ERR_PTR(-ENOSPC);
+       }
+
+       st = kzalloc(sizeof(*st), GFP_KERNEL);
+       if (!st)
+               goto out;
+
+       /* Initialize per CPUs boost group support */
+       st->idx = idx;
+       if (schedtune_boostgroup_init(st))
+               goto release;
+
+       return &st->css;
+
+release:
+       kfree(st);
+out:
+       return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_boostgroup_release(struct schedtune *st)
+{
+       /* Reset this boost group */
+       schedtune_boostgroup_update(st->idx, 0);
+
+       /* Keep track of allocated boost groups */
+       allocated_group[st->idx] = NULL;
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+       struct schedtune *st = css_st(css);
+
+       schedtune_boostgroup_release(st);
+       kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+       .css_alloc      = schedtune_css_alloc,
+       .css_free       = schedtune_css_free,
+       .can_attach     = schedtune_can_attach,
+       .cancel_attach  = schedtune_cancel_attach,
+       .legacy_cftypes = files,
+       .early_init     = 1,
+};
+
+static inline void
+schedtune_init_cgroups(void)
+{
+       struct boost_groups *bg;
+       int cpu;
+
+       /* Initialize the per CPU boost groups */
+       for_each_possible_cpu(cpu) {
+               bg = &per_cpu(cpu_boost_groups, cpu);
+               memset(bg, 0, sizeof(struct boost_groups));
+               raw_spin_lock_init(&bg->lock);
+       }
+
+       pr_info("schedtune: configured to support %d boost groups\n",
+               BOOSTGROUPS_COUNT);
+
+       schedtune_initialized = true;
+}
+
+/*
+ * Initialize the cgroup structures
+ */
+static int
+schedtune_init(void)
+{
+       schedtune_spc_rdiv = reciprocal_value(100);
+       schedtune_init_cgroups();
+       return 0;
+}
+postcore_initcall(schedtune_init);
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h
new file mode 100644 (file)
index 0000000..e79e1b1
--- /dev/null
@@ -0,0 +1,33 @@
+
+#ifdef CONFIG_SCHED_TUNE
+
+#include <linux/reciprocal_div.h>
+
+/*
+ * System energy normalization constants
+ */
+struct target_nrg {
+       unsigned long min_power;
+       unsigned long max_power;
+       struct reciprocal_value rdiv;
+};
+
+int schedtune_cpu_boost(int cpu);
+int schedtune_task_boost(struct task_struct *tsk);
+
+int schedtune_prefer_idle(struct task_struct *tsk);
+
+void schedtune_enqueue_task(struct task_struct *p, int cpu);
+void schedtune_dequeue_task(struct task_struct *p, int cpu);
+
+#else /* CONFIG_SCHED_TUNE */
+
+#define schedtune_cpu_boost(cpu)  0
+#define schedtune_task_boost(tsk) 0
+
+#define schedtune_prefer_idle(tsk) 0
+
+#define schedtune_enqueue_task(task, cpu) do { } while (0)
+#define schedtune_dequeue_task(task, cpu) do { } while (0)
+
+#endif /* CONFIG_SCHED_TUNE */
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644 (file)
index 0000000..01a411b
--- /dev/null
@@ -0,0 +1,921 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ *             and Todd Kjos
+ */
+
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT            0
+#define WINDOW_STATS_MAX               1
+#define WINDOW_STATS_MAX_RECENT_AVG    2
+#define WINDOW_STATS_AVG               3
+#define WINDOW_STATS_INVALID_POLICY    4
+
+#define EXITING_TASK_MARKER    0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+       WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
+
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
+__read_mostly unsigned int walt_ravg_window =
+                                           (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
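
A worked example (not part of the patch) of the tick rounding used for walt_ravg_window above, assuming for simplicity that TICK_NSEC is exactly NSEC_PER_SEC / HZ (the in-kernel definition rounds slightly differently): the 20 ms default stays 20 ms for HZ values that divide it evenly and shrinks slightly otherwise.

#include <stdio.h>

int main(void)
{
	unsigned int hz[] = { 100, 250, 300, 1000 };
	unsigned int i;

	for (i = 0; i < sizeof(hz) / sizeof(hz[0]); i++) {
		unsigned int tick_nsec = 1000000000u / hz[i];	/* assumed TICK_NSEC */
		unsigned int window = (20000000u / tick_nsec) * tick_nsec;

		/* e.g. HZ=300: tick 3333333 ns, window 19999998 ns (6 ticks) */
		printf("HZ=%4u -> window = %u ns\n", hz[i], window);
	}
	return 0;
}
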
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static __read_mostly bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+       return p->ravg.demand;
+}
+
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+       rq->cum_window_demand += delta;
+       if (unlikely((s64)rq->cum_window_demand < 0))
+               rq->cum_window_demand = 0;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg += p->ravg.demand;
+
+       /*
+        * Add a task's contribution to the cumulative window demand when
+        *
+        * (1) task is enqueued with on_rq = 1, i.e. migration,
+        *     prio/cgroup/class change.
+        * (2) task is waking for the first time in this window.
+        */
+       if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+               fixup_cum_window_demand(rq, p->ravg.demand);
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg -= p->ravg.demand;
+       BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+       /*
+        * on_rq will be 1 for sleeping tasks. So check if the task
+        * is migrating or dequeuing in RUNNING state to change the
+        * prio/cgroup/class.
+        */
+       if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+               fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+                             struct task_struct *p, u64 new_task_load)
+{
+       s64 task_load_delta = (s64)new_task_load - task_load(p);
+
+       rq->cumulative_runnable_avg += task_load_delta;
+       if ((s64)rq->cumulative_runnable_avg < 0)
+               panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+                       task_load_delta, task_load(p));
+
+       fixup_cum_window_demand(rq, task_load_delta);
+}
+
+u64 walt_ktime_clock(void)
+{
+       if (unlikely(walt_ktime_suspended))
+               return ktime_to_ns(ktime_last);
+       return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+       walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+       ktime_last = ktime_get();
+       walt_ktime_suspended = true;
+       return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+       .resume = walt_resume,
+       .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+       register_syscore_ops(&walt_syscore_ops);
+       return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+       if (p->flags & PF_EXITING) {
+               if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+                       p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+       unsigned int adj_window;
+       bool no_walt = walt_disabled;
+
+       get_option(&str, &walt_ravg_window);
+
+       /* Adjust for CONFIG_HZ */
+       adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+       /* Warn if we're a bit too far away from the expected window size */
+       WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+            "tick-adjusted window size %u, original was %u\n", adj_window,
+            walt_ravg_window);
+
+       walt_ravg_window = adj_window;
+
+       walt_disabled = walt_disabled ||
+                       (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+                        walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+       WARN(!no_walt && walt_disabled,
+            "invalid window size, disabling WALT\n");
+
+       return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+       s64 delta;
+       int nr_windows;
+
+       delta = wallclock - rq->window_start;
+       /* If the MPM global timer is cleared, set delta to 0 to avoid a kernel BUG */
+       if (delta < 0) {
+               delta = 0;
+               WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+       }
+
+       if (delta < walt_ravg_window)
+               return;
+
+       nr_windows = div64_u64(delta, walt_ravg_window);
+       rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+       rq->cum_window_demand = rq->cumulative_runnable_avg;
+}
+
+extern unsigned long capacity_curr_of(int cpu);
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+       unsigned long capcurr = capacity_curr_of(cpu_of(rq));
+
+       return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
+}
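
A quick sanity check of the scaling above, with invented numbers: a CPU currently running at half of its maximum capacity (capcurr = 512 out of 1024) turns 10 ms of wall-clock busy time into 5 ms of scaled demand. Editorial sketch only:

#include <stdio.h>

#define SCHED_CAPACITY_SHIFT 10

int main(void)
{
	unsigned long long delta = 10000000ULL;	/* 10 ms of busy time, in ns */
	unsigned long capcurr = 512;		/* current capacity of this CPU */

	/* Same expression as scale_exec_time(): prints 5000000 */
	printf("%llu\n", (delta * capcurr) >> SCHED_CAPACITY_SHIFT);
	return 0;
}
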
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+       if (!walt_io_is_busy)
+               return 0;
+
+       return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+                                u64 delta, u64 wallclock)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags, nr_windows;
+       u64 cur_jiffies_ts;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       /*
+        * cputime (wallclock) uses sched_clock so use the same here for
+        * consistency.
+        */
+       delta += sched_clock() - wallclock;
+       cur_jiffies_ts = get_jiffies_64();
+
+       if (is_idle_task(curr))
+               walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+                                delta);
+
+       nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+       if (nr_windows) {
+               if (nr_windows < 10) {
+                       /* Decay CPU's irqload by 3/4 for each window. */
+                       rq->avg_irqload *= (3 * nr_windows);
+                       rq->avg_irqload = div64_u64(rq->avg_irqload,
+                                                   4 * nr_windows);
+               } else {
+                       rq->avg_irqload = 0;
+               }
+               rq->avg_irqload += rq->cur_irqload;
+               rq->cur_irqload = 0;
+       }
+
+       rq->cur_irqload += delta;
+       rq->irqload_ts = cur_jiffies_ts;
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+       struct rq *rq = cpu_rq(cpu);
+       s64 delta;
+       delta = get_jiffies_64() - rq->irqload_ts;
+
+       /*
+        * The current context can be preempted by an irq, and rq->irqload_ts
+        * can be updated by the irq context, so delta can be negative.
+        * This is okay and we can safely return, since it means there was a
+        * recent irq occurrence.
+        */
+
+       if (delta < WALT_HIGH_IRQ_TIMEOUT)
+               return rq->avg_irqload;
+       else
+               return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+       return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+                                    u64 irqtime, int event)
+{
+       if (is_idle_task(p)) {
+               /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+               if (event == PICK_NEXT_TASK)
+                       return 0;
+
+               /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+               return irqtime || cpu_is_waiting_on_io(rq);
+       }
+
+       if (event == TASK_WAKE)
+               return 0;
+
+       if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+                                        event == TASK_UPDATE)
+               return 1;
+
+       /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+       return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       int new_window, nr_full_windows = 0;
+       int p_is_curr_task = (p == rq->curr);
+       u64 mark_start = p->ravg.mark_start;
+       u64 window_start = rq->window_start;
+       u32 window_size = walt_ravg_window;
+       u64 delta;
+
+       new_window = mark_start < window_start;
+       if (new_window) {
+               nr_full_windows = div64_u64((window_start - mark_start),
+                                               window_size);
+               if (p->ravg.active_windows < USHRT_MAX)
+                       p->ravg.active_windows++;
+       }
+
+       /* Handle per-task window rollover. We don't care about the idle
+        * task or exiting tasks. */
+       if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+               u32 curr_window = 0;
+
+               if (!nr_full_windows)
+                       curr_window = p->ravg.curr_window;
+
+               p->ravg.prev_window = curr_window;
+               p->ravg.curr_window = 0;
+       }
+
+       if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+               /* account_busy_for_cpu_time() = 0, so no update to the
+                * task's current window needs to be made. This could be
+                * for example
+                *
+                *   - a wakeup event on a task within the current
+                *     window (!new_window below, no action required),
+                *   - switching to a new task from idle (PICK_NEXT_TASK)
+                *     in a new window where irqtime is 0 and we aren't
+                *     waiting on IO */
+
+               if (!new_window)
+                       return;
+
+               /* A new window has started. The RQ demand must be rolled
+                * over if p is the current task. */
+               if (p_is_curr_task) {
+                       u64 prev_sum = 0;
+
+                       /* p is either idle task or an exiting task */
+                       if (!nr_full_windows) {
+                               prev_sum = rq->curr_runnable_sum;
+                       }
+
+                       rq->prev_runnable_sum = prev_sum;
+                       rq->curr_runnable_sum = 0;
+               }
+
+               return;
+       }
+
+       if (!new_window) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. No rollover
+                * since we didn't start a new window. An example of this is
+                * when a task starts execution and then sleeps within the
+                * same window. */
+
+               if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+                       delta = wallclock - mark_start;
+               else
+                       delta = irqtime;
+               delta = scale_exec_time(delta, rq);
+               rq->curr_runnable_sum += delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window += delta;
+
+               return;
+       }
+
+       if (!p_is_curr_task) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has also started, but p is not the current task, so the
+                * window is not rolled over - just split up and account
+                * as necessary into curr and prev. The window is only
+                * rolled over when a new window is processed for the current
+                * task.
+                *
+                * Irqtime can't be accounted by a task that isn't the
+                * currently running task. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window += delta;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window = delta;
+               }
+               rq->prev_runnable_sum += delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum += delta;
+               if (!exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. If any of these three above conditions are true
+                * then this busy time can't be accounted as irqtime.
+                *
+                * Busy time for the idle task or exiting tasks need not
+                * be accounted.
+                *
+                * An example of this would be a task that starts execution
+                * and then sleeps once a new window has begun. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window += delta;
+
+                       delta += rq->curr_runnable_sum;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window = delta;
+
+               }
+               /*
+                * Rollover for normal runnable sum is done here by overwriting
+                * the values in prev_runnable_sum and curr_runnable_sum.
+                * Rollover for the new task runnable sum has been completed by
+                * the previous if-else statement.
+                */
+               rq->prev_runnable_sum = delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum = delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (irqtime) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. The current task must be the idle task because
+                * irqtime is not accounted for any other task.
+                *
+                * Irqtime will be accounted each time we process IRQ activity
+                * after a period of idleness, so we know the IRQ busy time
+                * started at wallclock - irqtime. */
+
+               BUG_ON(!is_idle_task(p));
+               mark_start = wallclock - irqtime;
+
+               /* Roll window over. If IRQ busy time was just in the current
+                * window then that is all that need be accounted. */
+               rq->prev_runnable_sum = rq->curr_runnable_sum;
+               if (mark_start > window_start) {
+                       rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+                       return;
+               }
+
+               /* The IRQ busy time spanned multiple windows. Process the
+                * busy time preceding the current window start first. */
+               delta = window_start - mark_start;
+               if (delta > window_size)
+                       delta = window_size;
+               delta = scale_exec_time(delta, rq);
+               rq->prev_runnable_sum += delta;
+
+               /* Process the remaining IRQ busy time in the current window. */
+               delta = wallclock - window_start;
+               rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+               return;
+       }
+
+       BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+       /* No need to bother updating task demand for exiting tasks
+        * or the idle task. */
+       if (exiting_task(p) || is_idle_task(p))
+               return 0;
+
+       /* When a task is waking up it is completing a segment of non-busy
+        * time. Likewise, if wait time is not treated as busy time, then
+        * when a task begins to run or is migrated, it is not running and
+        * is completing a segment of non-busy time. */
+       if (event == TASK_WAKE || (!walt_account_wait_time &&
+                        (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+                        u32 runtime, int samples, int event)
+{
+       u32 *hist = &p->ravg.sum_history[0];
+       int ridx, widx;
+       u32 max = 0, avg, demand;
+       u64 sum = 0;
+
+       /* Ignore windows where task had no activity */
+       if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+               goto done;
+
+       /* Push new 'runtime' value onto stack */
+       widx = walt_ravg_hist_size - 1;
+       ridx = widx - samples;
+       for (; ridx >= 0; --widx, --ridx) {
+               hist[widx] = hist[ridx];
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+               hist[widx] = runtime;
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       p->ravg.sum = 0;
+
+       if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+               demand = runtime;
+       } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+               demand = max;
+       } else {
+               avg = div64_u64(sum, walt_ravg_hist_size);
+               if (walt_window_stats_policy == WINDOW_STATS_AVG)
+                       demand = avg;
+               else
+                       demand = max(avg, runtime);
+       }
+
+       /*
+        * A throttled deadline sched class task gets dequeued without
+        * changing p->on_rq. Since the dequeue decrements hmp stats,
+        * avoid decrementing them here again.
+        *
+        * When the window is rolled over, the cumulative window demand
+        * is reset to the cumulative runnable average (contribution from
+        * the tasks on the runqueue). If the current task is already
+        * dequeued, its demand is not included in the cumulative runnable
+        * average. So add the task demand separately to the cumulative
+        * window demand.
+        */
+       if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+               if (task_on_rq_queued(p))
+                       fixup_cumulative_runnable_avg(rq, p, demand);
+               else if (rq->curr == p)
+                       fixup_cum_window_demand(rq, demand);
+       }
+
+       p->ravg.demand = demand;
+
+done:
+       trace_walt_update_history(rq, p, runtime, samples, event);
+       return;
+}
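
An illustration (invented numbers) of how the window-stats policies used by update_history() above map one five-entry demand history to a task demand. Editorial sketch, hist[0] being the newest sample after the push:

#include <stdio.h>

int main(void)
{
	unsigned int hist[5] = { 8000, 12000, 6000, 9000, 7000 };	/* hist[0] = newest */
	unsigned int runtime = hist[0], max = 0, avg, i;
	unsigned long long sum = 0;

	for (i = 0; i < 5; i++) {
		sum += hist[i];
		if (hist[i] > max)
			max = hist[i];
	}
	avg = sum / 5;

	printf("RECENT         = %u\n", runtime);			/* 8000 */
	printf("MAX            = %u\n", max);				/* 12000 */
	printf("AVG            = %u\n", avg);				/* 8400 */
	printf("MAX_RECENT_AVG = %u\n", avg > runtime ? avg : runtime);	/* 8400 */
	return 0;
}
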
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+                               u64 delta)
+{
+       delta = scale_exec_time(delta, rq);
+       p->ravg.sum += delta;
+       if (unlikely(p->ravg.sum > walt_ravg_window))
+               p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ *     a) Task event is contained within one window.
+ *             window_start < mark_start < wallclock
+ *
+ *             ws   ms  wc
+ *             |    |   |
+ *             V    V   V
+ *             |---------------|
+ *
+ *     In this case, p->ravg.sum is updated *iff* event is appropriate
+ *     (ex: event == PUT_PREV_TASK)
+ *
+ *     b) Task event spans two windows.
+ *             mark_start < window_start < wallclock
+ *
+ *             ms   ws   wc
+ *             |    |    |
+ *             V    V    V
+ *             -----|-------------------
+ *
+ *     In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ *     is appropriate, then a new window sample is recorded followed
+ *     by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ *     c) Task event spans more than two windows.
+ *
+ *             ms ws_tmp                          ws  wc
+ *             |  |                               |   |
+ *             V  V                               V   V
+ *             ---|-------|-------|-------|-------|------
+ *                |                               |
+ *                |<------ nr_full_windows ------>|
+ *
+ *     In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ *     event is appropriate, window sample of p->ravg.sum is recorded,
+ *     'nr_full_window' samples of window_size is also recorded *iff*
+ *     event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ *     *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock)
+{
+       u64 mark_start = p->ravg.mark_start;
+       u64 delta, window_start = rq->window_start;
+       int new_window, nr_full_windows;
+       u32 window_size = walt_ravg_window;
+
+       new_window = mark_start < window_start;
+       if (!account_busy_for_task_demand(p, event)) {
+               if (new_window)
+                       /* If the time isn't being accounted as
+                        * busy time, and a new window started, only the
+                        * previous window need be closed out with the
+                        * pre-existing demand. Multiple windows may have
+                        * elapsed, but since empty windows are dropped,
+                        * it is not necessary to account those. */
+                       update_history(rq, p, p->ravg.sum, 1, event);
+               return;
+       }
+
+       if (!new_window) {
+               /* The simple case - busy time contained within the existing
+                * window. */
+               add_to_task_demand(rq, p, wallclock - mark_start);
+               return;
+       }
+
+       /* Busy time spans at least two windows. Temporarily rewind
+        * window_start to first window boundary after mark_start. */
+       delta = window_start - mark_start;
+       nr_full_windows = div64_u64(delta, window_size);
+       window_start -= (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (window_start - mark_start) first */
+       add_to_task_demand(rq, p, window_start - mark_start);
+
+       /* Push new sample(s) into task's demand history */
+       update_history(rq, p, p->ravg.sum, 1, event);
+       if (nr_full_windows)
+               update_history(rq, p, scale_exec_time(window_size, rq),
+                              nr_full_windows, event);
+
+       /* Roll window_start back to current to process any remainder
+        * in current window. */
+       window_start += (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (wallclock - window_start) next */
+       mark_start = window_start;
+       add_to_task_demand(rq, p, wallclock - mark_start);
+}
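
A worked example (invented, unscaled numbers) of case (c) from the comment above update_task_demand(): with a 20 ms window, rq->window_start = 100 ms, mark_start = 35 ms and wallclock = 107 ms, the busy span splits into a 5 ms head, three full 20 ms windows and a 7 ms tail. Editorial sketch only:

#include <stdio.h>

int main(void)
{
	unsigned long long window_size = 20, ws = 100, ms = 35, wc = 107;	/* in ms */
	unsigned long long nr_full_windows, ws_tmp;

	nr_full_windows = (ws - ms) / window_size;	/* 65 / 20 = 3 */
	ws_tmp = ws - nr_full_windows * window_size;	/* 100 - 60 = 40 */

	printf("partial head: %llu\n", ws_tmp - ms);	/* 5: added to p->ravg.sum, then recorded */
	printf("full windows: %llu x %llu\n", nr_full_windows, window_size); /* 3 samples of 20 */
	printf("partial tail: %llu\n", wc - ws);	/* 7: becomes the new p->ravg.sum */
	return 0;
}
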
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       if (walt_disabled || !rq->window_start)
+               return;
+
+       /* There's a bug here: there are many cases where we enter
+        * without holding this lock, coming from walt_fixup_busy_time.
+        * It looks like in 4.14 we don't hold the dest_rq lock at the
+        * time of migration, but I haven't yet worked out if it is
+        * safe to always lock dest_rq there.
+        *
+        * Temporarily disable this assert to continue checking the
+        * rest of the locking here.
+        */
+       //lockdep_assert_held(&rq->lock);
+
+       update_window_start(rq, wallclock);
+
+       if (!p->ravg.mark_start)
+               goto done;
+
+       update_task_demand(p, rq, event, wallclock);
+       update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+       trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+       p->ravg.mark_start = wallclock;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+       u32 sum = 0;
+
+       if (exiting_task(p))
+               sum = EXITING_TASK_MARKER;
+
+       memset(&p->ravg, 0, sizeof(struct ravg));
+       /* Retain EXITING_TASK marker */
+       p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+       u64 wallclock;
+       struct rq *rq = task_rq(p);
+
+       if (!rq->window_start) {
+               reset_task_stats(p);
+               return;
+       }
+
+       wallclock = walt_ktime_clock();
+       p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf)
+{
+       if (likely(rq->window_start))
+               return;
+
+       if (cpu_of(rq) == sync_cpu) {
+               rq->window_start = 1;
+       } else {
+               struct rq *sync_rq = cpu_rq(sync_cpu);
+               rq_unpin_lock(rq, rf);
+               double_lock_balance(rq, sync_rq);
+               rq->window_start = sync_rq->window_start;
+               rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+               raw_spin_unlock(&sync_rq->lock);
+               rq_repin_lock(rq, rf);
+       }
+
+       rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
+       if (cpu == sync_cpu)
+               sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+       struct rq *src_rq = task_rq(p);
+       struct rq *dest_rq = cpu_rq(new_cpu);
+       u64 wallclock;
+
+       if (!p->on_rq && p->state != TASK_WAKING)
+               return;
+
+       if (exiting_task(p)) {
+               return;
+       }
+
+       if (p->state == TASK_WAKING)
+               double_rq_lock(src_rq, dest_rq);
+
+       wallclock = walt_ktime_clock();
+
+//#define LOCK_CONDITION(rq) (debug_locks && !lockdep_is_held(&rq->lock))
+//     WARN(LOCK_CONDITION(task_rq(p)), "task_rq(p) not held. p->state=%08lx new_cpu=%d task_cpu=%d", p->state, new_cpu, p->cpu);
+//     WARN(LOCK_CONDITION(dest_rq), "dest_rq not held. p->state=%08lx new_cpu=%d task_cpu=%d", p->state, new_cpu, p->cpu);
+
+       /*
+        * It seems that in lots of cases we don't have
+        * dest_rq locked when we get here, which means
+        * we can't be sure about the WALT stats - someone
+        * needs to fix this.
+        */
+       walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+                       TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(dest_rq->curr, dest_rq,
+                       TASK_UPDATE, wallclock, 0);
+
+//     WARN(LOCK_CONDITION(task_rq(p)), "task_rq(p) not held after rq update. p->state=%08lx new_cpu=%d task_cpu=%d", p->state, new_cpu, p->cpu);
+       walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+       /*
+        * When a task is migrating during the wakeup, adjust
+        * the task's contribution towards cumulative window
+        * demand.
+        */
+       if (p->state == TASK_WAKING &&
+           p->last_sleep_ts >= src_rq->window_start) {
+               fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+               fixup_cum_window_demand(dest_rq, p->ravg.demand);
+       }
+
+       if (p->ravg.curr_window) {
+               src_rq->curr_runnable_sum -= p->ravg.curr_window;
+               dest_rq->curr_runnable_sum += p->ravg.curr_window;
+       }
+
+       if (p->ravg.prev_window) {
+               src_rq->prev_runnable_sum -= p->ravg.prev_window;
+               dest_rq->prev_runnable_sum += p->ravg.prev_window;
+       }
+
+       if ((s64)src_rq->prev_runnable_sum < 0) {
+               src_rq->prev_runnable_sum = 0;
+               WARN_ON(1);
+       }
+       if ((s64)src_rq->curr_runnable_sum < 0) {
+               src_rq->curr_runnable_sum = 0;
+               WARN_ON(1);
+       }
+
+       trace_walt_migration_update_sum(src_rq, p);
+       trace_walt_migration_update_sum(dest_rq, p);
+
+       if (p->state == TASK_WAKING)
+               double_rq_unlock(src_rq, dest_rq);
+}
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+       int i;
+       u32 init_load_windows =
+                       div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+                          (u64)walt_ravg_window, 100);
+       u32 init_load_pct = current->init_load_pct;
+
+       p->init_load_pct = 0;
+       memset(&p->ravg, 0, sizeof(struct ravg));
+
+       if (init_load_pct) {
+               init_load_windows = div64_u64((u64)init_load_pct *
+                         (u64)walt_ravg_window, 100);
+       }
+
+       p->ravg.demand = init_load_windows;
+       for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+               p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644 (file)
index 0000000..c7a4ef9
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+               u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf);
+void walt_migrate_sync_cpu(int cpu);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+                                  u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+               int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq, struct rq_flags *rf) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern bool walt_disabled;
+
+#endif
index 524a4cb9bbe25d02d39d82e145c78d0584569d60..745953a1a736ad444b9de43ffde5ab4b84d677de 100644 (file)
@@ -42,6 +42,8 @@
 #include <linux/syscore_ops.h>
 #include <linux/version.h>
 #include <linux/ctype.h>
+#include <linux/mm.h>
+#include <linux/mempolicy.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
@@ -2184,6 +2186,153 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data)
        return 1;
 }
 
+#ifdef CONFIG_MMU
+static int prctl_update_vma_anon_name(struct vm_area_struct *vma,
+               struct vm_area_struct **prev,
+               unsigned long start, unsigned long end,
+               const char __user *name_addr)
+{
+       struct mm_struct *mm = vma->vm_mm;
+       int error = 0;
+       pgoff_t pgoff;
+
+       if (name_addr == vma_get_anon_name(vma)) {
+               *prev = vma;
+               goto out;
+       }
+
+       pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+       *prev = vma_merge(mm, *prev, start, end, vma->vm_flags, vma->anon_vma,
+                               vma->vm_file, pgoff, vma_policy(vma),
+                               vma->vm_userfaultfd_ctx, name_addr);
+       if (*prev) {
+               vma = *prev;
+               goto success;
+       }
+
+       *prev = vma;
+
+       if (start != vma->vm_start) {
+               error = split_vma(mm, vma, start, 1);
+               if (error)
+                       goto out;
+       }
+
+       if (end != vma->vm_end) {
+               error = split_vma(mm, vma, end, 0);
+               if (error)
+                       goto out;
+       }
+
+success:
+       if (!vma->vm_file)
+               vma->anon_name = name_addr;
+
+out:
+       if (error == -ENOMEM)
+               error = -EAGAIN;
+       return error;
+}
+
+static int prctl_set_vma_anon_name(unsigned long start, unsigned long end,
+                       unsigned long arg)
+{
+       unsigned long tmp;
+       struct vm_area_struct *vma, *prev;
+       int unmapped_error = 0;
+       int error = -EINVAL;
+
+       /*
+        * If the interval [start,end) covers some unmapped address
+        * ranges, just ignore them, but return -ENOMEM at the end.
+        * - this matches the handling in madvise.
+        */
+       vma = find_vma_prev(current->mm, start, &prev);
+       if (vma && start > vma->vm_start)
+               prev = vma;
+
+       for (;;) {
+               /* Still start < end. */
+               error = -ENOMEM;
+               if (!vma)
+                       return error;
+
+               /* Here start < (end|vma->vm_end). */
+               if (start < vma->vm_start) {
+                       unmapped_error = -ENOMEM;
+                       start = vma->vm_start;
+                       if (start >= end)
+                               return error;
+               }
+
+               /* Here vma->vm_start <= start < (end|vma->vm_end) */
+               tmp = vma->vm_end;
+               if (end < tmp)
+                       tmp = end;
+
+               /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */
+               error = prctl_update_vma_anon_name(vma, &prev, start, tmp,
+                               (const char __user *)arg);
+               if (error)
+                       return error;
+               start = tmp;
+               if (prev && start < prev->vm_end)
+                       start = prev->vm_end;
+               error = unmapped_error;
+               if (start >= end)
+                       return error;
+               if (prev)
+                       vma = prev->vm_next;
+               else    /* madvise_remove dropped mmap_sem */
+                       vma = find_vma(current->mm, start);
+       }
+}
+
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+               unsigned long len_in, unsigned long arg)
+{
+       struct mm_struct *mm = current->mm;
+       int error;
+       unsigned long len;
+       unsigned long end;
+
+       if (start & ~PAGE_MASK)
+               return -EINVAL;
+       len = (len_in + ~PAGE_MASK) & PAGE_MASK;
+
+       /* Check to see whether len was rounded up from small -ve to zero */
+       if (len_in && !len)
+               return -EINVAL;
+
+       end = start + len;
+       if (end < start)
+               return -EINVAL;
+
+       if (end == start)
+               return 0;
+
+       down_write(&mm->mmap_sem);
+
+       switch (opt) {
+       case PR_SET_VMA_ANON_NAME:
+               error = prctl_set_vma_anon_name(start, end, arg);
+               break;
+       default:
+               error = -EINVAL;
+       }
+
+       up_write(&mm->mmap_sem);
+
+       return error;
+}
+#else /* CONFIG_MMU */
+static int prctl_set_vma(unsigned long opt, unsigned long start,
+               unsigned long len_in, unsigned long arg)
+{
+       return -EINVAL;
+}
+#endif
+
 SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
                unsigned long, arg4, unsigned long, arg5)
 {
@@ -2386,6 +2535,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
        case PR_GET_FP_MODE:
                error = GET_FP_MODE(me);
                break;
+       case PR_SET_VMA:
+               error = prctl_set_vma(arg2, arg3, arg4, arg5);
+               break;
        default:
                error = -EINVAL;
                break;
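
For context, the new PR_SET_VMA / PR_SET_VMA_ANON_NAME option is driven from userspace roughly as sketched below. The constants mirror the uapi additions that accompany this patch and are defined locally in case the libc headers in use do not carry them yet; note that the kernel stores the raw userspace pointer, so the name string must stay valid for the lifetime of the mapping.

#include <stdio.h>
#include <sys/mman.h>
#include <sys/prctl.h>

#ifndef PR_SET_VMA
#define PR_SET_VMA              0x53564d41
#define PR_SET_VMA_ANON_NAME    0
#endif

int main(void)
{
        size_t len = 4096;
        void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
                       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
        if (p == MAP_FAILED)
                return 1;

        /* Name the anonymous mapping; the string literal has static storage. */
        if (prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME,
                  (unsigned long)p, len, (unsigned long)"example buffer"))
                perror("prctl(PR_SET_VMA)");

        return 0;
}

With the corresponding /proc reporting changes from the Android patch set, the region then typically shows up as [anon:example buffer] in /proc/<pid>/maps.
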
index 56aca862c4f584f59cf29a932daf6491d8abdbc0..a536801c957b2c3037d1830d015c98efe394e935 100644 (file)
@@ -329,6 +329,50 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
+#ifdef CONFIG_SCHED_WALT
+       {
+               .procname       = "sched_use_walt_cpu_util",
+               .data           = &sysctl_sched_use_walt_cpu_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_use_walt_task_util",
+               .data           = &sysctl_sched_use_walt_task_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_init_task_load_pct",
+               .data           = &sysctl_sched_walt_init_task_load_pct,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_cpu_high_irqload",
+               .data           = &sysctl_sched_walt_cpu_high_irqload,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif
+       {
+               .procname       = "sched_sync_hint_enable",
+               .data           = &sysctl_sched_sync_hint_enable,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_cstate_aware",
+               .data           = &sysctl_sched_cstate_aware,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        {
                .procname       = "sched_wakeup_granularity_ns",
                .data           = &sysctl_sched_wakeup_granularity,
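
The new WALT and scheduler knobs above appear under /proc/sys/kernel/ as plain integer sysctls (names taken from the .procname fields). A minimal sketch of adjusting one of them from userspace:

#include <stdio.h>

int main(void)
{
        FILE *f = fopen("/proc/sys/kernel/sched_walt_init_task_load_pct", "w");

        if (!f) {
                perror("fopen");
                return 1;
        }
        /* New tasks will start with a demand of 20% of one WALT window. */
        fprintf(f, "%d\n", 20);
        fclose(f);
        return 0;
}
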
index 434c840e2d82f64c127a5c13f1f6aa244544ed43..a37b87be4aa552098e9f4b478feb56eb2c771bbc 100644 (file)
@@ -73,6 +73,9 @@ config EVENT_TRACING
         select GLOB
        bool
 
+config GPU_TRACEPOINTS
+       bool
+
 config CONTEXT_SWITCH_TRACER
        bool
 
@@ -160,6 +163,17 @@ config FUNCTION_GRAPH_TRACER
          address on the current task structure into a stack of calls.
 
 
+config PREEMPTIRQ_EVENTS
+       bool "Enable trace events for preempt and irq disable/enable"
+       select TRACE_IRQFLAGS
+       depends on DEBUG_PREEMPT || !PROVE_LOCKING
+       default n
+       help
+         Enable tracing of disable and enable events for preemption and irqs.
+         For tracing preempt disable/enable events, DEBUG_PREEMPT must be
+         enabled. For tracing irq disable/enable events, PROVE_LOCKING must
+         be disabled.
+
 config IRQSOFF_TRACER
        bool "Interrupts-off Latency Tracer"
        default n
index 19a15b2f119010f5dd38d2dfe99ff7d2eb81a5f0..803e38a33bbc11e02b86b1d5b6d49f64a4bacdca 100644 (file)
@@ -35,6 +35,7 @@ obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_TRACING_MAP) += tracing_map.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
+obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o
 obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o
 obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o
@@ -68,6 +69,7 @@ obj-$(CONFIG_KGDB_KDB) += trace_kdb.o
 endif
 obj-$(CONFIG_PROBE_EVENTS) += trace_probe.o
 obj-$(CONFIG_UPROBE_EVENTS) += trace_uprobe.o
+obj-$(CONFIG_GPU_TRACEPOINTS) += gpu-traces.o
 
 obj-$(CONFIG_TRACEPOINT_BENCHMARK) += trace_benchmark.o
 
diff --git a/kernel/trace/gpu-traces.c b/kernel/trace/gpu-traces.c
new file mode 100644 (file)
index 0000000..a4b3f00
--- /dev/null
@@ -0,0 +1,23 @@
+/*
+ * GPU tracepoints
+ *
+ * Copyright (C) 2013 Google, Inc.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/gpu.h>
+
+EXPORT_TRACEPOINT_SYMBOL(gpu_sched_switch);
+EXPORT_TRACEPOINT_SYMBOL(gpu_job_enqueue);
index 23c0b0cb5fb95c9875fb35cbd0d22f027430343c..c2492bdf749bb59ad713e0f98fdab253f7ba3767 100644 (file)
@@ -66,6 +66,9 @@ struct fgraph_data {
 
 #define TRACE_GRAPH_INDENT     2
 
+/* Flag options */
+#define TRACE_GRAPH_PRINT_FLAT         0x80
+
 unsigned int fgraph_max_depth;
 
 static struct tracer_opt trace_opts[] = {
@@ -89,6 +92,8 @@ static struct tracer_opt trace_opts[] = {
        { TRACER_OPT(sleep-time, TRACE_GRAPH_SLEEP_TIME) },
        /* Include time within nested functions */
        { TRACER_OPT(graph-time, TRACE_GRAPH_GRAPH_TIME) },
+       /* Use standard trace formatting rather than hierarchical */
+       { TRACER_OPT(funcgraph-flat, TRACE_GRAPH_PRINT_FLAT) },
        { } /* Empty entry */
 };
 
@@ -1244,6 +1249,9 @@ print_graph_function_flags(struct trace_iterator *iter, u32 flags)
        int cpu = iter->cpu;
        int ret;
 
+       if (flags & TRACE_GRAPH_PRINT_FLAT)
+               return TRACE_TYPE_UNHANDLED;
+
        if (data && per_cpu_ptr(data->cpu_data, cpu)->ignore) {
                per_cpu_ptr(data->cpu_data, cpu)->ignore = 0;
                return TRACE_TYPE_HANDLED;
@@ -1301,13 +1309,6 @@ print_graph_function(struct trace_iterator *iter)
        return print_graph_function_flags(iter, tracer_flags.val);
 }
 
-static enum print_line_t
-print_graph_function_event(struct trace_iterator *iter, int flags,
-                          struct trace_event *event)
-{
-       return print_graph_function(iter);
-}
-
 static void print_lat_header(struct seq_file *s, u32 flags)
 {
        static const char spaces[] = "                " /* 16 spaces */
@@ -1376,6 +1377,11 @@ void print_graph_headers_flags(struct seq_file *s, u32 flags)
        struct trace_iterator *iter = s->private;
        struct trace_array *tr = iter->tr;
 
+       if (flags & TRACE_GRAPH_PRINT_FLAT) {
+               trace_default_header(s);
+               return;
+       }
+
        if (!(tr->trace_flags & TRACE_ITER_CONTEXT_INFO))
                return;
 
@@ -1457,19 +1463,6 @@ func_graph_set_flag(struct trace_array *tr, u32 old_flags, u32 bit, int set)
        return 0;
 }
 
-static struct trace_event_functions graph_functions = {
-       .trace          = print_graph_function_event,
-};
-
-static struct trace_event graph_trace_entry_event = {
-       .type           = TRACE_GRAPH_ENT,
-       .funcs          = &graph_functions,
-};
-
-static struct trace_event graph_trace_ret_event = {
-       .type           = TRACE_GRAPH_RET,
-       .funcs          = &graph_functions
-};
 
 static struct tracer graph_trace __tracer_data = {
        .name           = "function_graph",
@@ -1546,16 +1539,6 @@ static __init int init_graph_trace(void)
 {
        max_bytes_for_cpu = snprintf(NULL, 0, "%u", nr_cpu_ids - 1);
 
-       if (!register_trace_event(&graph_trace_entry_event)) {
-               pr_warn("Warning: could not register graph trace events\n");
-               return 1;
-       }
-
-       if (!register_trace_event(&graph_trace_ret_event)) {
-               pr_warn("Warning: could not register graph trace events\n");
-               return 1;
-       }
-
        return register_tracer(&graph_trace);
 }
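
The net effect of these changes is a new funcgraph-flat option for the function_graph tracer: when set, print_graph_function_flags() returns TRACE_TYPE_UNHANDLED and the records fall through to the generic trace_graph_ent/trace_graph_ret events now registered in trace_output.c. A rough userspace sketch of toggling it through tracefs (paths assume the usual debugfs mount point):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd;

        /* Select the function_graph tracer ... */
        fd = open("/sys/kernel/debug/tracing/current_tracer", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "function_graph", 14) < 0)
                        return 1;
                close(fd);
        }

        /* ... then switch its output to the standard (flat) event format. */
        fd = open("/sys/kernel/debug/tracing/trace_options", O_WRONLY);
        if (fd >= 0) {
                if (write(fd, "funcgraph-flat", 14) < 0)
                        return 1;
                close(fd);
        }
        return 0;
}
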
 
index 7758bc0617cb15d8731defbc67912b5eb46246c2..03ecb4465ee4587290e0474143f425f892771140 100644 (file)
 
 #include "trace.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/preemptirq.h>
+
+#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER)
 static struct trace_array              *irqsoff_trace __read_mostly;
 static int                             tracer_enabled __read_mostly;
 
@@ -462,64 +466,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1)
 
 #else /* !CONFIG_PROVE_LOCKING */
 
-/*
- * Stubs:
- */
-
-void trace_softirqs_on(unsigned long ip)
-{
-}
-
-void trace_softirqs_off(unsigned long ip)
-{
-}
-
-inline void print_irqtrace_events(struct task_struct *curr)
-{
-}
-
 /*
  * We are only interested in hardirq on/off events:
  */
-void trace_hardirqs_on(void)
+static inline void tracer_hardirqs_on(void)
 {
        if (!preempt_trace() && irq_trace())
                stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_on);
 
-void trace_hardirqs_off(void)
+static inline void tracer_hardirqs_off(void)
 {
        if (!preempt_trace() && irq_trace())
                start_critical_timing(CALLER_ADDR0, CALLER_ADDR1);
 }
-EXPORT_SYMBOL(trace_hardirqs_off);
 
-__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr)
 {
        if (!preempt_trace() && irq_trace())
                stop_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_on_caller);
 
-__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr)
 {
        if (!preempt_trace() && irq_trace())
                start_critical_timing(CALLER_ADDR0, caller_addr);
 }
-EXPORT_SYMBOL(trace_hardirqs_off_caller);
 
 #endif /* CONFIG_PROVE_LOCKING */
 #endif /*  CONFIG_IRQSOFF_TRACER */
 
 #ifdef CONFIG_PREEMPT_TRACER
-void trace_preempt_on(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1)
 {
        if (preempt_trace() && !irq_trace())
                stop_critical_timing(a0, a1);
 }
 
-void trace_preempt_off(unsigned long a0, unsigned long a1)
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1)
 {
        if (preempt_trace() && !irq_trace())
                start_critical_timing(a0, a1);
@@ -781,3 +765,100 @@ __init static int init_irqsoff_tracer(void)
        return 0;
 }
 core_initcall(init_irqsoff_tracer);
+#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */
+
+#ifndef CONFIG_IRQSOFF_TRACER
+static inline void tracer_hardirqs_on(void) { }
+static inline void tracer_hardirqs_off(void) { }
+static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { }
+static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { }
+#endif
+
+#ifndef CONFIG_PREEMPT_TRACER
+static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { }
+static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { }
+#endif
+
+#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING)
+/* Per-cpu variable to prevent redundant calls when IRQs already off */
+static DEFINE_PER_CPU(int, tracing_irq_cpu);
+
+void trace_hardirqs_on(void)
+{
+       if (!this_cpu_read(tracing_irq_cpu))
+               return;
+
+       trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+       tracer_hardirqs_on();
+
+       this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on);
+
+void trace_hardirqs_off(void)
+{
+       if (this_cpu_read(tracing_irq_cpu))
+               return;
+
+       this_cpu_write(tracing_irq_cpu, 1);
+
+       trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1);
+       tracer_hardirqs_off();
+}
+EXPORT_SYMBOL(trace_hardirqs_off);
+
+__visible void trace_hardirqs_on_caller(unsigned long caller_addr)
+{
+       if (!this_cpu_read(tracing_irq_cpu))
+               return;
+
+       trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr);
+       tracer_hardirqs_on_caller(caller_addr);
+
+       this_cpu_write(tracing_irq_cpu, 0);
+}
+EXPORT_SYMBOL(trace_hardirqs_on_caller);
+
+__visible void trace_hardirqs_off_caller(unsigned long caller_addr)
+{
+       if (this_cpu_read(tracing_irq_cpu))
+               return;
+
+       this_cpu_write(tracing_irq_cpu, 1);
+
+       trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr);
+       tracer_hardirqs_off_caller(caller_addr);
+}
+EXPORT_SYMBOL(trace_hardirqs_off_caller);
+
+/*
+ * Stubs:
+ */
+
+void trace_softirqs_on(unsigned long ip)
+{
+}
+
+void trace_softirqs_off(unsigned long ip)
+{
+}
+
+inline void print_irqtrace_events(struct task_struct *curr)
+{
+}
+#endif
+
+#if defined(CONFIG_PREEMPT_TRACER) || \
+       (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS))
+void trace_preempt_on(unsigned long a0, unsigned long a1)
+{
+       trace_preempt_enable_rcuidle(a0, a1);
+       tracer_preempt_on(a0, a1);
+}
+
+void trace_preempt_off(unsigned long a0, unsigned long a1)
+{
+       trace_preempt_disable_rcuidle(a0, a1);
+       tracer_preempt_off(a0, a1);
+}
+#endif
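
The per-CPU tracing_irq_cpu flag above acts as an edge detector: an irq_disable event is emitted only on the first disable, and the matching irq_enable only when a disable was actually recorded, so nested or redundant calls do not produce duplicate events. A small userspace analogy of the same pattern:

#include <stdbool.h>
#include <stdio.h>

static bool irqs_marked_off;    /* stands in for the per-CPU tracing_irq_cpu */

static void my_hardirqs_off(void)
{
        if (irqs_marked_off)
                return;                 /* already marked off: no duplicate */
        irqs_marked_off = true;
        printf("irq_disable event\n");
}

static void my_hardirqs_on(void)
{
        if (!irqs_marked_off)
                return;                 /* never marked off: nothing to pair */
        printf("irq_enable event\n");
        irqs_marked_off = false;
}

int main(void)
{
        my_hardirqs_off();
        my_hardirqs_off();              /* redundant: silently ignored */
        my_hardirqs_on();
        return 0;
}
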
index c738e764e2a55cfd3303a3262d748c94a917a86d..9fbaa809d7476284ca9d189d323df4d45a3d9168 100644 (file)
@@ -910,6 +910,174 @@ static struct trace_event trace_fn_event = {
        .funcs          = &trace_fn_funcs,
 };
 
+/* TRACE_GRAPH_ENT */
+static enum print_line_t trace_graph_ent_trace(struct trace_iterator *iter, int flags,
+                                       struct trace_event *event)
+{
+       struct trace_seq *s = &iter->seq;
+       struct ftrace_graph_ent_entry *field;
+
+       trace_assign_type(field, iter->ent);
+
+       trace_seq_puts(s, "graph_ent: func=");
+       if (trace_seq_has_overflowed(s))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!seq_print_ip_sym(s, field->graph_ent.func, flags))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       trace_seq_puts(s, "\n");
+       if (trace_seq_has_overflowed(s))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_raw(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ent_entry *field;
+
+       trace_assign_type(field, iter->ent);
+
+       trace_seq_printf(&iter->seq, "%lx %d\n",
+                             field->graph_ent.func,
+                             field->graph_ent.depth);
+       if (trace_seq_has_overflowed(&iter->seq))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_hex(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ent_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       SEQ_PUT_HEX_FIELD(s, field->graph_ent.func);
+       SEQ_PUT_HEX_FIELD(s, field->graph_ent.depth);
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ent_bin(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ent_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       SEQ_PUT_FIELD(s, field->graph_ent.func);
+       SEQ_PUT_FIELD(s, field->graph_ent.depth);
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ent_funcs = {
+       .trace          = trace_graph_ent_trace,
+       .raw            = trace_graph_ent_raw,
+       .hex            = trace_graph_ent_hex,
+       .binary         = trace_graph_ent_bin,
+};
+
+static struct trace_event trace_graph_ent_event = {
+       .type           = TRACE_GRAPH_ENT,
+       .funcs          = &trace_graph_ent_funcs,
+};
+
+/* TRACE_GRAPH_RET */
+static enum print_line_t trace_graph_ret_trace(struct trace_iterator *iter, int flags,
+                                       struct trace_event *event)
+{
+       struct trace_seq *s = &iter->seq;
+       struct trace_entry *entry = iter->ent;
+       struct ftrace_graph_ret_entry *field;
+
+       trace_assign_type(field, entry);
+
+       trace_seq_puts(s, "graph_ret: func=");
+       if (trace_seq_has_overflowed(s))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       if (!seq_print_ip_sym(s, field->ret.func, flags))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       trace_seq_puts(s, "\n");
+       if (trace_seq_has_overflowed(s))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_raw(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ret_entry *field;
+
+       trace_assign_type(field, iter->ent);
+
+       trace_seq_printf(&iter->seq, "%lx %lld %lld %ld %d\n",
+                             field->ret.func,
+                             field->ret.calltime,
+                             field->ret.rettime,
+                             field->ret.overrun,
+                             field->ret.depth);
+       if (trace_seq_has_overflowed(&iter->seq))
+               return TRACE_TYPE_PARTIAL_LINE;
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_hex(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ret_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       SEQ_PUT_HEX_FIELD(s, field->ret.func);
+       SEQ_PUT_HEX_FIELD(s, field->ret.calltime);
+       SEQ_PUT_HEX_FIELD(s, field->ret.rettime);
+       SEQ_PUT_HEX_FIELD(s, field->ret.overrun);
+       SEQ_PUT_HEX_FIELD(s, field->ret.depth);
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t trace_graph_ret_bin(struct trace_iterator *iter, int flags,
+                                     struct trace_event *event)
+{
+       struct ftrace_graph_ret_entry *field;
+       struct trace_seq *s = &iter->seq;
+
+       trace_assign_type(field, iter->ent);
+
+       SEQ_PUT_FIELD(s, field->ret.func);
+       SEQ_PUT_FIELD(s, field->ret.calltime);
+       SEQ_PUT_FIELD(s, field->ret.rettime);
+       SEQ_PUT_FIELD(s, field->ret.overrun);
+       SEQ_PUT_FIELD(s, field->ret.depth);
+
+       return TRACE_TYPE_HANDLED;
+}
+
+static struct trace_event_functions trace_graph_ret_funcs = {
+       .trace          = trace_graph_ret_trace,
+       .raw            = trace_graph_ret_raw,
+       .hex            = trace_graph_ret_hex,
+       .binary         = trace_graph_ret_bin,
+};
+
+static struct trace_event trace_graph_ret_event = {
+       .type           = TRACE_GRAPH_RET,
+       .funcs          = &trace_graph_ret_funcs,
+};
+
 /* TRACE_CTX an TRACE_WAKE */
 static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
                                             char *delim)
@@ -1381,6 +1549,8 @@ static struct trace_event trace_raw_data_event = {
 
 static struct trace_event *events[] __initdata = {
        &trace_fn_event,
+       &trace_graph_ent_event,
+       &trace_graph_ret_event,
        &trace_ctx_event,
        &trace_wake_event,
        &trace_stack_event,
index 751e97aa22106f9be73919033271ad9f98498fca..86e514bace7e06303cef30df92e91ab920a5d31e 100644 (file)
@@ -138,7 +138,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx);
+                         vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
index a2af6d58a68fc087feddf1181ecb511bde86cecd..638f84884627b74f5ead0f3b8d4c71aacd05b05d 100644 (file)
@@ -729,7 +729,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
                        ((vmstart - vma->vm_start) >> PAGE_SHIFT);
                prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
                                 vma->anon_vma, vma->vm_file, pgoff,
-                                new_pol, vma->vm_userfaultfd_ctx);
+                                new_pol, vma->vm_userfaultfd_ctx,
+                                vma_get_anon_name(vma));
                if (prev) {
                        vma = prev;
                        next = vma->vm_next;
index 46af369c13e5de4928c912fec2b5fd6aa87b48d2..658ad5562dcaf5c3a632e49b4975f50753500937 100644 (file)
@@ -528,7 +528,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
                          vma->vm_file, pgoff, vma_policy(vma),
-                         vma->vm_userfaultfd_ctx);
+                         vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*prev) {
                vma = *prev;
                goto success;
index 0de87a376aaa18b4f00ecefd214a912b10e3a432..389c929424f5dde9330e9731a4faa6a737d48960 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -973,7 +973,8 @@ again:
  */
 static inline int is_mergeable_vma(struct vm_area_struct *vma,
                                struct file *file, unsigned long vm_flags,
-                               struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                               struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                               const char __user *anon_name)
 {
        /*
         * VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -991,6 +992,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
                return 0;
        if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
                return 0;
+       if (vma_get_anon_name(vma) != anon_name)
+               return 0;
        return 1;
 }
 
@@ -1023,9 +1026,10 @@ static int
 can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
                     struct anon_vma *anon_vma, struct file *file,
                     pgoff_t vm_pgoff,
-                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                    struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                    const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                if (vma->vm_pgoff == vm_pgoff)
                        return 1;
@@ -1044,9 +1048,10 @@ static int
 can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
                    struct anon_vma *anon_vma, struct file *file,
                    pgoff_t vm_pgoff,
-                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                   struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                   const char __user *anon_name)
 {
-       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
+       if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
            is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
                pgoff_t vm_pglen;
                vm_pglen = vma_pages(vma);
@@ -1057,9 +1062,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
 }
 
 /*
- * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
- * whether that can be merged with its predecessor or its successor.
- * Or both (it neatly fills a hole).
+ * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
+ * figure out whether that can be merged with its predecessor or its
+ * successor.  Or both (it neatly fills a hole).
  *
  * In most cases - when called for mmap, brk or mremap - [addr,end) is
  * certain not to be mapped by the time vma_merge is called; but when
@@ -1101,7 +1106,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        unsigned long end, unsigned long vm_flags,
                        struct anon_vma *anon_vma, struct file *file,
                        pgoff_t pgoff, struct mempolicy *policy,
-                       struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
+                       struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
+                       const char __user *anon_name)
 {
        pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
        struct vm_area_struct *area, *next;
@@ -1134,7 +1140,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        mpol_equal(vma_policy(prev), policy) &&
                        can_vma_merge_after(prev, vm_flags,
                                            anon_vma, file, pgoff,
-                                           vm_userfaultfd_ctx)) {
+                                           vm_userfaultfd_ctx,
+                                           anon_name)) {
                /*
                 * OK, it can.  Can we now merge in the successor as well?
                 */
@@ -1143,7 +1150,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                                can_vma_merge_before(next, vm_flags,
                                                     anon_vma, file,
                                                     pgoff+pglen,
-                                                    vm_userfaultfd_ctx) &&
+                                                    vm_userfaultfd_ctx,
+                                                    anon_name) &&
                                is_mergeable_anon_vma(prev->anon_vma,
                                                      next->anon_vma, NULL)) {
                                                        /* cases 1, 6 */
@@ -1166,7 +1174,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
                        mpol_equal(policy, vma_policy(next)) &&
                        can_vma_merge_before(next, vm_flags,
                                             anon_vma, file, pgoff+pglen,
-                                            vm_userfaultfd_ctx)) {
+                                            vm_userfaultfd_ctx,
+                                            anon_name)) {
                if (prev && addr < prev->vm_end)        /* case 4 */
                        err = __vma_adjust(prev, prev->vm_start,
                                         addr, prev->vm_pgoff, NULL, next);
@@ -1646,7 +1655,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
         * Can we just expand an old mapping?
         */
        vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
-                       NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
+                       NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
        if (vma)
                goto out;
 
@@ -2717,6 +2726,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
 
        return 0;
 }
+EXPORT_SYMBOL(do_munmap);
 
 int vm_munmap(unsigned long start, size_t len)
 {
@@ -2910,7 +2920,7 @@ static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long
 
        /* Can we just expand an old private anonymous mapping? */
        vma = vma_merge(mm, prev, addr, addr + len, flags,
-                       NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
+                       NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
        if (vma)
                goto out;
 
@@ -3101,7 +3111,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
                return NULL;    /* should never get here */
        new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
                            vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                           vma->vm_userfaultfd_ctx);
+                           vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (new_vma) {
                /*
                 * Source vma may have been merged into new_vma
index 58b629bb70de3024aba118000f83f52dd92e6d95..fc969367ef604ae59ac928ade450efaac6335561 100644 (file)
@@ -335,7 +335,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
        pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
        *pprev = vma_merge(mm, *pprev, start, end, newflags,
                           vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
-                          vma->vm_userfaultfd_ctx);
+                          vma->vm_userfaultfd_ctx, vma_get_anon_name(vma));
        if (*pprev) {
                vma = *pprev;
                VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);
index 07a1d22807beb7cfb5f85c3343a5eacdc28fb523..4aace97f8a25dc0beafacb6d12dfdde620307516 100644 (file)
@@ -4266,6 +4266,14 @@ struct file *shmem_file_setup(const char *name, loff_t size, unsigned long flags
 }
 EXPORT_SYMBOL_GPL(shmem_file_setup);
 
+void shmem_set_file(struct vm_area_struct *vma, struct file *file)
+{
+       if (vma->vm_file)
+               fput(vma->vm_file);
+       vma->vm_file = file;
+       vma->vm_ops = &shmem_vm_ops;
+}
+
 /**
  * shmem_zero_setup - setup a shared anonymous mapping
  * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
@@ -4285,10 +4293,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
        if (IS_ERR(file))
                return PTR_ERR(file);
 
-       if (vma->vm_file)
-               fput(vma->vm_file);
-       vma->vm_file = file;
-       vma->vm_ops = &shmem_vm_ops;
+       shmem_set_file(vma, file);
 
        if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) &&
                        ((vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK) <
index 9dba2715919d7668af369a34ed2d67089b112cd4..454a26178f6d697302f86602f0946091cb611ff7 100644 (file)
@@ -91,6 +91,12 @@ source "net/netlabel/Kconfig"
 
 endif # if INET
 
+config ANDROID_PARANOID_NETWORK
+       bool "Only allow certain groups to create sockets"
+       default y
+       help
+               none
+
 config NETWORK_SECMARK
        bool "Security Marking"
        help
index 91e3ba28070647bc93960c1d729aa200b666b249..91d52a80745d83350b1c6d43620ecb8ecb0deb63 100644 (file)
@@ -108,11 +108,40 @@ void bt_sock_unregister(int proto)
 }
 EXPORT_SYMBOL(bt_sock_unregister);
 
+#ifdef CONFIG_PARANOID_NETWORK
+static inline int current_has_bt_admin(void)
+{
+       return !current_euid();
+}
+
+static inline int current_has_bt(void)
+{
+       return current_has_bt_admin();
+}
+# else
+static inline int current_has_bt_admin(void)
+{
+       return 1;
+}
+
+static inline int current_has_bt(void)
+{
+       return 1;
+}
+#endif
+
 static int bt_sock_create(struct net *net, struct socket *sock, int proto,
                          int kern)
 {
        int err;
 
+       if (proto == BTPROTO_RFCOMM || proto == BTPROTO_SCO ||
+                       proto == BTPROTO_L2CAP) {
+               if (!current_has_bt())
+                       return -EPERM;
+       } else if (!current_has_bt_admin())
+               return -EPERM;
+
        if (net != &init_net)
                return -EAFNOSUPPORT;
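
With the paranoid-network check compiled in, bt_sock_create() now returns -EPERM for RFCOMM, SCO and L2CAP sockets unless the caller is euid 0 (and for every other Bluetooth protocol unless it passes the same euid-0 "bt admin" test). A small userspace probe of that behaviour; BTPROTO_RFCOMM is 3 in the Bluetooth uapi headers:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

#ifndef AF_BLUETOOTH
#define AF_BLUETOOTH    31
#endif
#define BTPROTO_RFCOMM  3

int main(void)
{
        int fd = socket(AF_BLUETOOTH, SOCK_STREAM, BTPROTO_RFCOMM);

        if (fd < 0 && errno == EPERM)
                printf("blocked: caller is not euid 0\n");
        else if (fd < 0)
                perror("socket");
        else
                printf("allowed\n");
        return 0;
}
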
 
index c6c8ad1d4b6d65eded5571d875fedae4af2bf47b..d206121c82ff7e30e787f2c07bff2dbfdc405818 100644 (file)
@@ -17,6 +17,7 @@ obj-y     := route.o inetpeer.o protocol.o \
 
 obj-$(CONFIG_NET_IP_TUNNEL) += ip_tunnel.o
 obj-$(CONFIG_SYSCTL) += sysctl_net_ipv4.o
+obj-$(CONFIG_SYSFS) += sysfs_net_ipv4.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o
 obj-$(CONFIG_IP_MROUTE) += ipmr.o
index b9d9a2b8792c7a9aa6744f80b55f7b5a727a5cef..c7e47f40c2ffb7ca9bbe92726536118dffd22231 100644 (file)
@@ -89,6 +89,7 @@
 #include <linux/netfilter_ipv4.h>
 #include <linux/random.h>
 #include <linux/slab.h>
+#include <linux/netfilter/xt_qtaguid.h>
 
 #include <linux/uaccess.h>
 
 #endif
 #include <net/l3mdev.h>
 
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+       return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+       return 1;
+}
+#endif
 
 /* The inetsw table contains everything that inet_create needs to
  * build a new socket.
@@ -255,6 +269,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;
 
+       if (!current_has_network())
+               return -EACCES;
+
        sock->state = SS_UNCONNECTED;
 
        /* Look for the requested type/protocol pair. */
@@ -303,8 +320,7 @@ lookup_protocol:
        }
 
        err = -EPERM;
-       if (sock->type == SOCK_RAW && !kern &&
-           !ns_capable(net->user_ns, CAP_NET_RAW))
+       if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
                goto out_rcu_unlock;
 
        sock->ops = answer->ops;
@@ -407,6 +423,9 @@ int inet_release(struct socket *sock)
        if (sk) {
                long timeout;
 
+#ifdef CONFIG_NETFILTER_XT_MATCH_QTAGUID
+               qtaguid_untag(sock, true);
+#endif
                /* Applications forget to leave groups before exiting */
                ip_mc_drop_socket(sk);
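
With ANDROID_PARANOID_NETWORK enabled, inet_create() now refuses socket creation for callers that are neither in the AID_INET group nor CAP_NET_RAW-capable, failing with -EACCES before the protocol lookup even runs. A small userspace probe of that behaviour:

#include <errno.h>
#include <stdio.h>
#include <sys/socket.h>

int main(void)
{
        int fd = socket(AF_INET, SOCK_DGRAM, 0);

        if (fd < 0 && errno == EACCES)
                printf("blocked: caller lacks AID_INET membership\n");
        else if (fd < 0)
                perror("socket");
        else
                printf("allowed\n");
        return 0;
}
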
 
index e9293bdebba04f5b329cd4fd4a6b5b676ade5988..2880dddb5431eda0c547126dbfc73f6d61a308b4 100644 (file)
@@ -100,6 +100,7 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
        __be16 uninitialized_var(dport), uninitialized_var(sport);
        const struct iphdr *iph = ip_hdr(skb);
        struct sk_buff *data_skb = NULL;
+       struct sock *sk = skb->sk;
        u8 uninitialized_var(protocol);
 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
        enum ip_conntrack_info ctinfo;
@@ -153,8 +154,14 @@ struct sock *nf_sk_lookup_slow_v4(struct net *net, const struct sk_buff *skb,
        }
 #endif
 
-       return nf_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
-                                    daddr, sport, dport, indev);
+       if (sk)
+               refcount_inc(&sk->sk_refcnt);
+       else
+               sk = nf_socket_get_sock_v4(dev_net(skb->dev), data_skb, doff,
+                                          protocol, saddr, daddr, sport,
+                                          dport, indev);
+
+       return sk;
 }
 EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v4);
 
index 0989e739d09809ee68db3de48b5c852591f1c31c..9a137ca52e23dec660d0e7456a4f10e9d56a515f 100644 (file)
@@ -197,6 +197,21 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
        return ret;
 }
 
+/* Validate changes from /proc interface. */
+static int proc_tcp_default_init_rwnd(struct ctl_table *ctl, int write,
+                                     void __user *buffer,
+                                     size_t *lenp, loff_t *ppos)
+{
+       int old_value = *(int *)ctl->data;
+       int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);
+       int new_value = *(int *)ctl->data;
+
+       if (write && ret == 0 && (new_value < 3 || new_value > 100))
+               *(int *)ctl->data = old_value;
+
+       return ret;
+}
+
 static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
                                       void __user *buffer, size_t *lenp, loff_t *ppos)
 {
@@ -714,6 +729,13 @@ static struct ctl_table ipv4_table[] = {
                .mode           = 0444,
                .proc_handler   = proc_tcp_available_ulp,
        },
+       {
+               .procname       = "tcp_default_init_rwnd",
+               .data           = &sysctl_tcp_default_init_rwnd,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_tcp_default_init_rwnd
+       },
        {
                .procname       = "icmp_msgs_per_sec",
                .data           = &sysctl_icmp_msgs_per_sec,
diff --git a/net/ipv4/sysfs_net_ipv4.c b/net/ipv4/sysfs_net_ipv4.c
new file mode 100644 (file)
index 0000000..0cbbf10
--- /dev/null
@@ -0,0 +1,88 @@
+/*
+ * net/ipv4/sysfs_net_ipv4.c
+ *
+ * sysfs-based networking knobs (so we can, unlike with sysctl, control perms)
+ *
+ * Copyright (C) 2008 Google, Inc.
+ *
+ * Robert Love <rlove@google.com>
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include <linux/kobject.h>
+#include <linux/string.h>
+#include <linux/sysfs.h>
+#include <linux/init.h>
+#include <net/tcp.h>
+
+#define CREATE_IPV4_FILE(_name, _var) \
+static ssize_t _name##_show(struct kobject *kobj, \
+                           struct kobj_attribute *attr, char *buf) \
+{ \
+       return sprintf(buf, "%d\n", _var); \
+} \
+static ssize_t _name##_store(struct kobject *kobj, \
+                            struct kobj_attribute *attr, \
+                            const char *buf, size_t count) \
+{ \
+       int val, ret; \
+       ret = sscanf(buf, "%d", &val); \
+       if (ret != 1) \
+               return -EINVAL; \
+       if (val < 0) \
+               return -EINVAL; \
+       _var = val; \
+       return count; \
+} \
+static struct kobj_attribute _name##_attr = \
+       __ATTR(_name, 0644, _name##_show, _name##_store)
+
+CREATE_IPV4_FILE(tcp_wmem_min, sysctl_tcp_wmem[0]);
+CREATE_IPV4_FILE(tcp_wmem_def, sysctl_tcp_wmem[1]);
+CREATE_IPV4_FILE(tcp_wmem_max, sysctl_tcp_wmem[2]);
+
+CREATE_IPV4_FILE(tcp_rmem_min, sysctl_tcp_rmem[0]);
+CREATE_IPV4_FILE(tcp_rmem_def, sysctl_tcp_rmem[1]);
+CREATE_IPV4_FILE(tcp_rmem_max, sysctl_tcp_rmem[2]);
+
+static struct attribute *ipv4_attrs[] = {
+       &tcp_wmem_min_attr.attr,
+       &tcp_wmem_def_attr.attr,
+       &tcp_wmem_max_attr.attr,
+       &tcp_rmem_min_attr.attr,
+       &tcp_rmem_def_attr.attr,
+       &tcp_rmem_max_attr.attr,
+       NULL
+};
+
+static struct attribute_group ipv4_attr_group = {
+       .attrs = ipv4_attrs,
+};
+
+static __init int sysfs_ipv4_init(void)
+{
+       struct kobject *ipv4_kobject;
+       int ret;
+
+       ipv4_kobject = kobject_create_and_add("ipv4", kernel_kobj);
+       if (!ipv4_kobject)
+               return -ENOMEM;
+
+       ret = sysfs_create_group(ipv4_kobject, &ipv4_attr_group);
+       if (ret) {
+               kobject_put(ipv4_kobject);
+               return ret;
+       }
+
+       return 0;
+}
+
+subsys_initcall(sysfs_ipv4_init);
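
Because the kobject is created under kernel_kobj, the six knobs defined by CREATE_IPV4_FILE() appear as /sys/kernel/ipv4/tcp_{r,w}mem_{min,def,max}, readable by everyone and writable by root (mode 0644). A quick userspace read of one of them:

#include <stdio.h>

int main(void)
{
        char buf[64];
        FILE *f = fopen("/sys/kernel/ipv4/tcp_wmem_max", "r");

        if (!f) {
                perror("fopen");
                return 1;
        }
        if (fgets(buf, sizeof(buf), f))
                printf("tcp_wmem_max = %s", buf);
        fclose(f);
        return 0;
}
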
index ff48ac654e5ae110cd34e20c9407868e25f2a201..c54d7a2c7fef00a1245ab4fce3d9d1a9dc4ef8e2 100644 (file)
@@ -95,6 +95,7 @@ int sysctl_tcp_min_rtt_wlen __read_mostly = 300;
 int sysctl_tcp_moderate_rcvbuf __read_mostly = 1;
 int sysctl_tcp_early_retrans __read_mostly = 3;
 int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
+int sysctl_tcp_default_init_rwnd __read_mostly = TCP_INIT_CWND * 2;
 
 #define FLAG_DATA              0x01 /* Incoming frame contained data.          */
 #define FLAG_WIN_UPDATE                0x02 /* Incoming ACK was a window update.       */
index cd3d60bb7cc8ace78f0c85a94a8582c9144612d0..a241d51c80d300081fdbfaafc132579d839a035b 100644 (file)
@@ -189,7 +189,7 @@ u32 tcp_default_init_rwnd(u32 mss)
         * (RFC 3517, Section 4, NextSeg() rule (2)). Further place a
         * limit when mss is larger than 1460.
         */
-       u32 init_rwnd = TCP_INIT_CWND * 2;
+       u32 init_rwnd = sysctl_tcp_default_init_rwnd;
 
        if (mss > 1460)
                init_rwnd = max((1460 * init_rwnd) / mss, 2U);
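
Together with the proc handler added earlier, which quietly reverts any write outside the 3..100 range, this makes the initial advertised receive window tunable at runtime while keeping the existing large-MSS scaling. A worked example of the arithmetic, using the compiled-in default of TCP_INIT_CWND * 2 = 20 segments:

#include <stdio.h>

static unsigned int default_init_rwnd(unsigned int sysctl_val, unsigned int mss)
{
        unsigned int init_rwnd = sysctl_val;
        unsigned int scaled;

        if (mss > 1460) {
                scaled = (1460 * init_rwnd) / mss;
                init_rwnd = scaled > 2 ? scaled : 2;
        }
        return init_rwnd;
}

int main(void)
{
        printf("mss 1460 -> %u segments\n", default_init_rwnd(20, 1460)); /* 20 */
        printf("mss 9000 -> %u segments\n", default_init_rwnd(20, 9000)); /*  3 */
        return 0;
}
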
index 6a76e41e6d51695553d281900f73b28e9af4f814..e773245b867b9a53632f624bb977b0333bb893fa 100644 (file)
@@ -229,6 +229,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
        .accept_ra_rt_info_max_plen = 0,
 #endif
 #endif
+       .accept_ra_rt_table     = 0,
        .proxy_ndp              = 0,
        .accept_source_route    = 0,    /* we do not accept RH0 by default. */
        .disable_ipv6           = 0,
@@ -283,6 +284,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
        .accept_ra_rt_info_max_plen = 0,
 #endif
 #endif
+       .accept_ra_rt_table     = 0,
        .proxy_ndp              = 0,
        .accept_source_route    = 0,    /* we do not accept RH0 by default. */
        .disable_ipv6           = 0,
@@ -2274,6 +2276,31 @@ static void  ipv6_try_regen_rndid(struct inet6_dev *idev, struct in6_addr *tmpad
                ipv6_regen_rndid(idev);
 }
 
+u32 addrconf_rt_table(const struct net_device *dev, u32 default_table) {
+       /* Determines into what table to put autoconf PIO/RIO/default routes
+        * learned on this device.
+        *
+        * - If 0, use the same table for every device. This puts routes into
+        *   one of RT_TABLE_{PREFIX,INFO,DFLT} depending on the type of route
+        *   (but note that these three are currently all equal to
+        *   RT6_TABLE_MAIN).
+        * - If > 0, use the specified table.
+        * - If < 0, put routes into table dev->ifindex + (-rt_table).
+        */
+       struct inet6_dev *idev = in6_dev_get(dev);
+       u32 table;
+       int sysctl = idev->cnf.accept_ra_rt_table;
+       if (sysctl == 0) {
+               table = default_table;
+       } else if (sysctl > 0) {
+               table = (u32) sysctl;
+       } else {
+               table = (unsigned) dev->ifindex + (-sysctl);
+       }
+       in6_dev_put(idev);
+       return table;
+}
+
 /*
  *     Add prefix route.
  */
@@ -2283,7 +2310,7 @@ addrconf_prefix_route(struct in6_addr *pfx, int plen, struct net_device *dev,
                      unsigned long expires, u32 flags)
 {
        struct fib6_config cfg = {
-               .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX,
+               .fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX),
                .fc_metric = IP6_RT_PRIO_ADDRCONF,
                .fc_ifindex = dev->ifindex,
                .fc_expires = expires,
@@ -2316,7 +2343,7 @@ static struct rt6_info *addrconf_get_prefix_route(const struct in6_addr *pfx,
        struct fib6_node *fn;
        struct rt6_info *rt = NULL;
        struct fib6_table *table;
-       u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_PREFIX;
+       u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_PREFIX);
 
        table = fib6_get_table(dev_net(dev), tb_id);
        if (!table)
@@ -5043,6 +5070,7 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
        array[DEVCONF_ACCEPT_RA_RT_INFO_MAX_PLEN] = cnf->accept_ra_rt_info_max_plen;
 #endif
 #endif
+       array[DEVCONF_ACCEPT_RA_RT_TABLE] = cnf->accept_ra_rt_table;
        array[DEVCONF_PROXY_NDP] = cnf->proxy_ndp;
        array[DEVCONF_ACCEPT_SOURCE_ROUTE] = cnf->accept_source_route;
 #ifdef CONFIG_IPV6_OPTIMISTIC_DAD
@@ -6202,6 +6230,13 @@ static const struct ctl_table addrconf_sysctl[] = {
        },
 #endif
 #endif
+       {
+               .procname       = "accept_ra_rt_table",
+               .data           = &ipv6_devconf.accept_ra_rt_table,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        {
                .procname       = "proxy_ndp",
                .data           = &ipv6_devconf.proxy_ndp,
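
The accept_ra_rt_table sysctl added above selects the routing table for RA-learned prefix, route-information and default routes: 0 keeps the legacy shared tables, a positive value names a table directly, and a negative value spreads routes across per-interface tables at ifindex + (-value). A worked illustration of the mapping, assuming RT6_TABLE_MAIN (254) as the default:

#include <stdio.h>

static unsigned int ra_route_table(int sysctl, int ifindex,
                                   unsigned int default_table)
{
        if (sysctl == 0)
                return default_table;           /* legacy shared table */
        if (sysctl > 0)
                return (unsigned int)sysctl;    /* one fixed table */
        return (unsigned int)ifindex + (unsigned int)(-sysctl);
}

int main(void)
{
        printf("%u\n", ra_route_table(0, 5, 254));      /* 254: shared table */
        printf("%u\n", ra_route_table(97, 5, 254));     /* 97: fixed table   */
        printf("%u\n", ra_route_table(-1000, 5, 254));  /* 1005: per-device  */
        return 0;
}
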
index 9ccbf74deb99c2af94a954bef605548f0b5866cc..e9fa881753e3131f9bce5db7c4ded5192a88b62f 100644 (file)
 #include <linux/uaccess.h>
 #include <linux/mroute6.h>
 
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+
+static inline int current_has_network(void)
+{
+       return in_egroup_p(AID_INET) || capable(CAP_NET_RAW);
+}
+#else
+static inline int current_has_network(void)
+{
+       return 1;
+}
+#endif
+
 #include "ip6_offload.h"
 
 MODULE_AUTHOR("Cast of dozens");
@@ -122,6 +136,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
        if (protocol < 0 || protocol >= IPPROTO_MAX)
                return -EINVAL;
 
+       if (!current_has_network())
+               return -EACCES;
+
        /* Look for the requested type/protocol pair. */
 lookup_protocol:
        err = -ESOCKTNOSUPPORT;
@@ -168,8 +185,7 @@ lookup_protocol:
        }
 
        err = -EPERM;
-       if (sock->type == SOCK_RAW && !kern &&
-           !ns_capable(net->user_ns, CAP_NET_RAW))
+       if (sock->type == SOCK_RAW && !kern && !capable(CAP_NET_RAW))
                goto out_rcu_unlock;
 
        sock->ops = answer->ops;
index 305e2ed730bf42438190ad2fd7ae9ac9b4eed98f..477692f80f0da4db55bcf8faa8cd0792ec5c20aa 100644 (file)
@@ -166,15 +166,15 @@ EXPORT_SYMBOL_GPL(ipv6_find_tlv);
  * to explore inner IPv6 header, eg. ICMPv6 error messages.
  *
  * If target header is found, its offset is set in *offset and return protocol
- * number. Otherwise, return -1.
+ * number. Otherwise, return -ENOENT or -EBADMSG.
  *
  * If the first fragment doesn't contain the final protocol header or
  * NEXTHDR_NONE it is considered invalid.
  *
  * Note that non-1st fragment is special case that "the protocol number
  * of last header" is "next header" field in Fragment header. In this case,
- * *offset is meaningless and fragment offset is stored in *fragoff if fragoff
- * isn't NULL.
+ * *offset is meaningless. If fragoff is not NULL, the fragment offset is
+ * stored in *fragoff; if it is NULL, return -EINVAL.
  *
  * if flags is not NULL and it's a fragment, then the frag flag
  * IP6_FH_F_FRAG will be set. If it's an AH header, the
@@ -253,9 +253,12 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
                                if (target < 0 &&
                                    ((!ipv6_ext_hdr(hp->nexthdr)) ||
                                     hp->nexthdr == NEXTHDR_NONE)) {
-                                       if (fragoff)
+                                       if (fragoff) {
                                                *fragoff = _frag_off;
-                                       return hp->nexthdr;
+                                               return hp->nexthdr;
+                                       } else {
+                                               return -EINVAL;
+                                       }
                                }
                                if (!found)
                                        return -ENOENT;
index ebb2bf84232acd1fac088dbc4d3151701cc52a40..42ce8496f44a0b350acf0d6cdae5130c9986385f 100644 (file)
@@ -102,6 +102,7 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
 struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
                                  const struct net_device *indev)
 {
+       struct sock *sk = skb->sk;
        __be16 uninitialized_var(dport), uninitialized_var(sport);
        const struct in6_addr *daddr = NULL, *saddr = NULL;
        struct ipv6hdr *iph = ipv6_hdr(skb);
@@ -141,8 +142,14 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb,
                return NULL;
        }
 
-       return nf_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
-                                    sport, dport, indev);
+       if (sk)
+               refcount_inc(&sk->sk_refcnt);
+       else
+               sk = nf_socket_get_sock_v6(dev_net(skb->dev), data_skb, doff,
+                                          tproto, saddr, daddr, sport, dport,
+                                          indev);
+
+       return sk;
 }
 EXPORT_SYMBOL_GPL(nf_sk_lookup_slow_v6);
 
index ca8d3266e92e0d9bdd429c1e52b4825a51437cbf..696ed09a08b18cab07005ee5700fc74949a10c90 100644 (file)
@@ -2501,8 +2501,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
                                           const struct in6_addr *gwaddr,
                                           struct net_device *dev)
 {
-       u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
-       int ifindex = dev->ifindex;
+       u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO);
        struct fib6_node *fn;
        struct rt6_info *rt = NULL;
        struct fib6_table *table;
@@ -2517,7 +2516,7 @@ static struct rt6_info *rt6_get_route_info(struct net *net,
                goto out;
 
        for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
-               if (rt->dst.dev->ifindex != ifindex)
+               if (rt->dst.dev->ifindex != dev->ifindex)
                        continue;
                if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
                        continue;
@@ -2549,7 +2548,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
                .fc_nlinfo.nl_net = net,
        };
 
-       cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
+       cfg.fc_table = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_INFO),
        cfg.fc_dst = *prefix;
        cfg.fc_gateway = *gwaddr;
 
@@ -2565,7 +2564,7 @@ static struct rt6_info *rt6_add_route_info(struct net *net,
 
 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
 {
-       u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
+       u32 tb_id = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_MAIN);
        struct rt6_info *rt;
        struct fib6_table *table;
 
@@ -2591,7 +2590,7 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
                                     unsigned int pref)
 {
        struct fib6_config cfg = {
-               .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
+               .fc_table       = l3mdev_fib_table(dev) ? : addrconf_rt_table(dev, RT6_TABLE_DFLT),
                .fc_metric      = IP6_RT_PRIO_USER,
                .fc_ifindex     = dev->ifindex,
                .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
@@ -2615,43 +2614,16 @@ struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
        return rt6_get_dflt_router(gwaddr, dev);
 }
 
-static void __rt6_purge_dflt_routers(struct fib6_table *table)
-{
-       struct rt6_info *rt;
-
-restart:
-       read_lock_bh(&table->tb6_lock);
-       for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
-               if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
-                   (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
-                       dst_hold(&rt->dst);
-                       read_unlock_bh(&table->tb6_lock);
-                       ip6_del_rt(rt);
-                       goto restart;
-               }
-       }
-       read_unlock_bh(&table->tb6_lock);
-
-       table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
+int rt6_addrconf_purge(struct rt6_info *rt, void *arg)
+{
+       if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
+           (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2))
+               return -1;
+       return 0;
 }
 
 void rt6_purge_dflt_routers(struct net *net)
 {
-       struct fib6_table *table;
-       struct hlist_head *head;
-       unsigned int h;
-
-       rcu_read_lock();
-
-       for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
-               head = &net->ipv6.fib_table_hash[h];
-               hlist_for_each_entry_rcu(table, head, tb6_hlist) {
-                       if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
-                               __rt6_purge_dflt_routers(table);
-               }
-       }
-
-       rcu_read_unlock();
+       fib6_clean_all(net, rt6_addrconf_purge, NULL);
 }
 
 static void rtmsg_to_fib6_config(struct net *net,
index e4a13cc8a2e7635841a29be9901412ab2904c3ae..1a7a43f8b287bd3dd161676f75968a814e6effac 100644 (file)
@@ -1360,6 +1360,8 @@ config NETFILTER_XT_MATCH_OWNER
        based on who created the socket: the user or group. It is also
        possible to check whether a socket actually exists.
 
+       Conflicts with '"quota, tag, uid" match'
+
 config NETFILTER_XT_MATCH_POLICY
        tristate 'IPsec "policy" match support'
        depends on XFRM
@@ -1393,6 +1395,22 @@ config NETFILTER_XT_MATCH_PKTTYPE
 
          To compile it as a module, choose M here.  If unsure, say N.
 
+config NETFILTER_XT_MATCH_QTAGUID
+       bool '"quota, tag, owner" match and stats support'
+        depends on NETFILTER_XT_MATCH_SOCKET
+       depends on NETFILTER_XT_MATCH_OWNER=n
+       help
+         This option replaces the `owner' match. In addition to matching
+         on uid, it keeps stats based on a tag assigned to a socket.
+         The full tag is made up of a UID and an accounting tag.
+         The tags are assignable to sockets from user space (e.g. a download
+         manager can assign the socket to another UID for accounting).
+         Stats and control are done via /proc/net/xt_qtaguid/.
+         It replaces owner as it takes the same arguments, but should
+         really be recognized by the iptables tool.
+
+         If unsure, say `N'.
+
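The "full tag" mentioned in the help text above packs an accounting tag and a UID into a single 64-bit value. A minimal user-space sketch of that layout (the masks and helper names here are assumptions made for the illustration; the module's own helpers such as get_uid_from_tag() and get_atag_from_tag(), declared in xt_qtaguid_internal.h, are authoritative):

#include <stdint.h>
#include <stdio.h>

typedef uint64_t tag_t;

/* Assumed layout: accounting tag in the upper 32 bits, UID in the lower 32. */
static tag_t make_tag(uint32_t acct_tag, uint32_t uid)
{
        return ((tag_t)acct_tag << 32) | uid;
}

static uint32_t uid_from_tag(tag_t tag)
{
        return (uint32_t)(tag & 0xFFFFFFFFULL);
}

static tag_t atag_from_tag(tag_t tag)
{
        return tag & ~0xFFFFFFFFULL;
}

int main(void)
{
        tag_t t = make_tag(42, 10007);  /* illustrative acct tag 42, app UID 10007 */

        printf("tag=0x%llx uid=%u atag=0x%llx\n",
               (unsigned long long)t, uid_from_tag(t),
               (unsigned long long)atag_from_tag(t));
        return 0;
}

Under such a layout, a download manager can re-attribute a socket to the requesting app's UID while still separating the traffic by accounting tag, which is what the "assign the socket to another UID for accounting" sentence above refers to.
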
 config NETFILTER_XT_MATCH_QUOTA
        tristate '"quota" match support'
        depends on NETFILTER_ADVANCED
@@ -1403,6 +1421,29 @@ config NETFILTER_XT_MATCH_QUOTA
          If you want to compile it as a module, say M here and read
          <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
 
+config NETFILTER_XT_MATCH_QUOTA2
+       tristate '"quota2" match support'
+       depends on NETFILTER_ADVANCED
+       help
+         This option adds a `quota2' match, which allows matching on a
+         byte counter correctly and not per CPU.
+         It allows naming the quotas.
+         This is based on http://xtables-addons.git.sourceforge.net
+
+         If you want to compile it as a module, say M here and read
+         <file:Documentation/kbuild/modules.txt>.  If unsure, say `N'.
+
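The "correctly and not per CPU" wording above refers to charging every packet against one shared byte counter under a lock, so the configured limit is enforced exactly instead of each CPU draining its own private counter. A rough user-space sketch of that idea, with illustrative names rather than the module's actual data structures:

#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

/* One shared quota: every byte is charged against the same counter under a
 * lock, so the limit cannot be overshot by several partially spent per-CPU
 * counters. */
struct quota {
        pthread_mutex_t lock;
        uint64_t remaining;
};

static bool quota_charge(struct quota *q, uint64_t bytes)
{
        bool allowed = false;

        pthread_mutex_lock(&q->lock);
        if (q->remaining >= bytes) {
                q->remaining -= bytes;
                allowed = true;
        }
        pthread_mutex_unlock(&q->lock);
        return allowed;
}

int main(void)
{
        struct quota q = { PTHREAD_MUTEX_INITIALIZER, 1500 };

        /* Two 1000-byte packets against a 1500-byte quota: only the first passes. */
        return quota_charge(&q, 1000) && !quota_charge(&q, 1000) ? 0 : 1;
}
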
+config NETFILTER_XT_MATCH_QUOTA2_LOG
+       bool '"quota2" Netfilter LOG support'
+       depends on NETFILTER_XT_MATCH_QUOTA2
+       default n
+       help
+         This option allows `quota2' to log ONCE when a quota limit
+         is passed. It logs via NETLINK using the NETLINK_NFLOG family.
+         It logs similarly to how ipt_ULOG would, but without the packet data.
+
+         If unsure, say `N'.
+
 config NETFILTER_XT_MATCH_RATEEST
        tristate '"rateest" match support'
        depends on NETFILTER_ADVANCED
index f78ed2470831d54262852aba19a9d98d80cf7c72..bcd56034a94bfd133d8b0e420720fbe7692b7163 100644 (file)
@@ -173,7 +173,9 @@ obj-$(CONFIG_NETFILTER_XT_MATCH_CGROUP) += xt_cgroup.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PHYSDEV) += xt_physdev.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_PKTTYPE) += xt_pkttype.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_POLICY) += xt_policy.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QTAGUID) += xt_qtaguid_print.o xt_qtaguid.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA) += xt_quota.o
+obj-$(CONFIG_NETFILTER_XT_MATCH_QUOTA2) += xt_quota2.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RATEEST) += xt_rateest.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_REALM) += xt_realm.o
 obj-$(CONFIG_NETFILTER_XT_MATCH_RECENT) += xt_recent.o
index daf45da448fab4406cf4b5727404c88c1f0759be..f11aa28b96ce610d099c5399e717d5a07d31be74 100644 (file)
@@ -5,6 +5,7 @@
  * After timer expires a kevent will be sent.
  *
  * Copyright (C) 2004, 2010 Nokia Corporation
+ *
  * Written by Timo Teras <ext-timo.teras@nokia.com>
  *
  * Converted to x_tables and reworked for upstream inclusion
 #include <linux/netfilter/xt_IDLETIMER.h>
 #include <linux/kdev_t.h>
 #include <linux/kobject.h>
+#include <linux/skbuff.h>
 #include <linux/workqueue.h>
 #include <linux/sysfs.h>
+#include <linux/rtc.h>
+#include <linux/time.h>
+#include <linux/math64.h>
+#include <linux/suspend.h>
+#include <linux/notifier.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/inet_sock.h>
 
 struct idletimer_tg_attr {
        struct attribute attr;
@@ -55,14 +65,110 @@ struct idletimer_tg {
        struct kobject *kobj;
        struct idletimer_tg_attr attr;
 
+       struct timespec delayed_timer_trigger;
+       struct timespec last_modified_timer;
+       struct timespec last_suspend_time;
+       struct notifier_block pm_nb;
+
+       int timeout;
        unsigned int refcnt;
+       bool work_pending;
+       bool send_nl_msg;
+       bool active;
+       uid_t uid;
 };
 
 static LIST_HEAD(idletimer_tg_list);
 static DEFINE_MUTEX(list_mutex);
+static DEFINE_SPINLOCK(timestamp_lock);
 
 static struct kobject *idletimer_tg_kobj;
 
+static bool check_for_delayed_trigger(struct idletimer_tg *timer,
+               struct timespec *ts)
+{
+       bool state;
+       struct timespec temp;
+       spin_lock_bh(&timestamp_lock);
+       timer->work_pending = false;
+       if ((ts->tv_sec - timer->last_modified_timer.tv_sec) > timer->timeout ||
+                       timer->delayed_timer_trigger.tv_sec != 0) {
+               state = false;
+               temp.tv_sec = timer->timeout;
+               temp.tv_nsec = 0;
+               if (timer->delayed_timer_trigger.tv_sec != 0) {
+                       temp = timespec_add(timer->delayed_timer_trigger, temp);
+                       ts->tv_sec = temp.tv_sec;
+                       ts->tv_nsec = temp.tv_nsec;
+                       timer->delayed_timer_trigger.tv_sec = 0;
+                       timer->work_pending = true;
+                       schedule_work(&timer->work);
+               } else {
+                       temp = timespec_add(timer->last_modified_timer, temp);
+                       ts->tv_sec = temp.tv_sec;
+                       ts->tv_nsec = temp.tv_nsec;
+               }
+       } else {
+               state = timer->active;
+       }
+       spin_unlock_bh(&timestamp_lock);
+       return state;
+}
+
+static void notify_netlink_uevent(const char *iface, struct idletimer_tg *timer)
+{
+       char iface_msg[NLMSG_MAX_SIZE];
+       char state_msg[NLMSG_MAX_SIZE];
+       char timestamp_msg[NLMSG_MAX_SIZE];
+       char uid_msg[NLMSG_MAX_SIZE];
+       char *envp[] = { iface_msg, state_msg, timestamp_msg, uid_msg, NULL };
+       int res;
+       struct timespec ts;
+       uint64_t time_ns;
+       bool state;
+
+       res = snprintf(iface_msg, NLMSG_MAX_SIZE, "INTERFACE=%s",
+                      iface);
+       if (NLMSG_MAX_SIZE <= res) {
+               pr_err("message too long (%d)", res);
+               return;
+       }
+
+       get_monotonic_boottime(&ts);
+       state = check_for_delayed_trigger(timer, &ts);
+       res = snprintf(state_msg, NLMSG_MAX_SIZE, "STATE=%s",
+                       state ? "active" : "inactive");
+
+       if (NLMSG_MAX_SIZE <= res) {
+               pr_err("message too long (%d)", res);
+               return;
+       }
+
+       if (state) {
+               res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=%u", timer->uid);
+               if (NLMSG_MAX_SIZE <= res)
+                       pr_err("message too long (%d)", res);
+       } else {
+               res = snprintf(uid_msg, NLMSG_MAX_SIZE, "UID=");
+               if (NLMSG_MAX_SIZE <= res)
+                       pr_err("message too long (%d)", res);
+       }
+
+       time_ns = timespec_to_ns(&ts);
+       res = snprintf(timestamp_msg, NLMSG_MAX_SIZE, "TIME_NS=%llu", time_ns);
+       if (NLMSG_MAX_SIZE <= res) {
+               timestamp_msg[0] = '\0';
+               pr_err("message too long (%d)", res);
+       }
+
+       pr_debug("putting nlmsg: <%s> <%s> <%s> <%s>\n", iface_msg, state_msg,
+                timestamp_msg, uid_msg);
+       kobject_uevent_env(idletimer_tg_kobj, KOBJ_CHANGE, envp);
+}
+
 static
 struct idletimer_tg *__idletimer_tg_find_by_label(const char *label)
 {
@@ -83,6 +189,7 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
 {
        struct idletimer_tg *timer;
        unsigned long expires = 0;
+       unsigned long now = jiffies;
 
        mutex_lock(&list_mutex);
 
@@ -92,11 +199,15 @@ static ssize_t idletimer_tg_show(struct kobject *kobj, struct attribute *attr,
 
        mutex_unlock(&list_mutex);
 
-       if (time_after(expires, jiffies))
+       if (time_after(expires, now))
                return sprintf(buf, "%u\n",
-                              jiffies_to_msecs(expires - jiffies) / 1000);
+                              jiffies_to_msecs(expires - now) / 1000);
 
-       return sprintf(buf, "0\n");
+       if (timer->send_nl_msg)
+               return sprintf(buf, "0 %d\n",
+                       jiffies_to_msecs(now - expires) / 1000);
+       else
+               return sprintf(buf, "0\n");
 }
 
 static void idletimer_tg_work(struct work_struct *work)
@@ -105,6 +216,9 @@ static void idletimer_tg_work(struct work_struct *work)
                                                  work);
 
        sysfs_notify(idletimer_tg_kobj, NULL, timer->attr.attr.name);
+
+       if (timer->send_nl_msg)
+               notify_netlink_uevent(timer->attr.attr.name, timer);
 }
 
 static void idletimer_tg_expired(unsigned long data)
@@ -112,8 +226,55 @@ static void idletimer_tg_expired(unsigned long data)
        struct idletimer_tg *timer = (struct idletimer_tg *) data;
 
        pr_debug("timer %s expired\n", timer->attr.attr.name);
-
+       spin_lock_bh(&timestamp_lock);
+       timer->active = false;
+       timer->work_pending = true;
        schedule_work(&timer->work);
+       spin_unlock_bh(&timestamp_lock);
+}
+
+static int idletimer_resume(struct notifier_block *notifier,
+               unsigned long pm_event, void *unused)
+{
+       struct timespec ts;
+       unsigned long time_diff, now = jiffies;
+       struct idletimer_tg *timer = container_of(notifier,
+                       struct idletimer_tg, pm_nb);
+       if (!timer)
+               return NOTIFY_DONE;
+       switch (pm_event) {
+       case PM_SUSPEND_PREPARE:
+               get_monotonic_boottime(&timer->last_suspend_time);
+               break;
+       case PM_POST_SUSPEND:
+               spin_lock_bh(&timestamp_lock);
+               if (!timer->active) {
+                       spin_unlock_bh(&timestamp_lock);
+                       break;
+               }
+               /* Since jiffies are not updated while suspended, 'now' still
+                * represents the time at which the system suspended. */
+               if (time_after(timer->timer.expires, now)) {
+                       get_monotonic_boottime(&ts);
+                       ts = timespec_sub(ts, timer->last_suspend_time);
+                       time_diff = timespec_to_jiffies(&ts);
+                       if (timer->timer.expires > (time_diff + now)) {
+                               mod_timer_pending(&timer->timer,
+                                               (timer->timer.expires - time_diff));
+                       } else {
+                               del_timer(&timer->timer);
+                               timer->timer.expires = 0;
+                               timer->active = false;
+                               timer->work_pending = true;
+                               schedule_work(&timer->work);
+                       }
+               }
+               spin_unlock_bh(&timestamp_lock);
+               break;
+       default:
+               break;
+       }
+       return NOTIFY_DONE;
 }
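To make the PM_POST_SUSPEND arithmetic above concrete: the time spent suspended is subtracted from the pending expiry, and the timer only fires immediately when the suspend consumed the whole remaining idle budget. A small user-space sketch with illustrative numbers (the HZ value, timeout and suspend duration are assumptions for the example):

#include <stdint.h>
#include <stdio.h>

#define HZ 100  /* illustrative tick rate */

int main(void)
{
        uint64_t now = 1000000;                 /* jiffies at resume time */
        uint64_t expires = now + 100 * HZ;      /* 100 s of idle budget left */
        uint64_t suspended = 30 * HZ;           /* device was asleep for 30 s */

        if (expires > now + suspended)
                /* 70 s of idle budget remains once the suspend time is charged */
                printf("rearm for %llu jiffies from now\n",
                       (unsigned long long)(expires - suspended - now));
        else
                /* the suspend used up the whole budget: report idle immediately */
                printf("expire immediately\n");
        return 0;
}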
 
 static int idletimer_tg_create(struct idletimer_tg_info *info)
@@ -146,6 +307,21 @@ static int idletimer_tg_create(struct idletimer_tg_info *info)
        setup_timer(&info->timer->timer, idletimer_tg_expired,
                    (unsigned long) info->timer);
        info->timer->refcnt = 1;
+       info->timer->send_nl_msg = (info->send_nl_msg == 0) ? false : true;
+       info->timer->active = true;
+       info->timer->timeout = info->timeout;
+
+       info->timer->delayed_timer_trigger.tv_sec = 0;
+       info->timer->delayed_timer_trigger.tv_nsec = 0;
+       info->timer->work_pending = false;
+       info->timer->uid = 0;
+       get_monotonic_boottime(&info->timer->last_modified_timer);
+
+       info->timer->pm_nb.notifier_call = idletimer_resume;
+       ret = register_pm_notifier(&info->timer->pm_nb);
+       if (ret)
+               printk(KERN_WARNING "[%s] Failed to register pm notifier %d\n",
+                               __func__, ret);
 
        mod_timer(&info->timer->timer,
                  msecs_to_jiffies(info->timeout * 1000) + jiffies);
@@ -162,6 +338,42 @@ out:
        return ret;
 }
 
+static void reset_timer(const struct idletimer_tg_info *info,
+                       struct sk_buff *skb)
+{
+       unsigned long now = jiffies;
+       struct idletimer_tg *timer = info->timer;
+       bool timer_prev;
+
+       spin_lock_bh(&timestamp_lock);
+       timer_prev = timer->active;
+       timer->active = true;
+       /* timer_prev is used to guard against the overflow problem in time_before() */
+       if (!timer_prev || time_before(timer->timer.expires, now)) {
+               pr_debug("Starting Checkentry timer (Expired, Jiffies): %lu, %lu\n",
+                               timer->timer.expires, now);
+
+               /* Stores the uid responsible for waking up the radio */
+               if (skb && (skb->sk)) {
+                       timer->uid = from_kuid_munged(current_user_ns(),
+                                       sock_i_uid(skb_to_full_sk(skb)));
+               }
+
+               /* checks if there is a pending inactive notification */
+               if (timer->work_pending)
+                       timer->delayed_timer_trigger = timer->last_modified_timer;
+               else {
+                       timer->work_pending = true;
+                       schedule_work(&timer->work);
+               }
+       }
+
+       get_monotonic_boottime(&timer->last_modified_timer);
+       mod_timer(&timer->timer,
+                       msecs_to_jiffies(info->timeout * 1000) + now);
+       spin_unlock_bh(&timestamp_lock);
+}
+
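The timer_prev check in reset_timer() above exists because time_before() only gives a meaningful answer when the two jiffies values are within half the counter range of each other; an expires value left over from a long-inactive timer can wrap and appear to lie in the future. A small user-space sketch of the signed-difference comparison that time_before() is built on (an illustration of the wraparound hazard, not kernel code):

#include <stdio.h>

/* Same idea as the kernel's time_before(a, b): true if a is earlier than b,
 * valid only while a and b are less than half the unsigned range apart. */
#define time_before(a, b)       ((long)((a) - (b)) < 0)

int main(void)
{
        unsigned long now = 1000UL;
        unsigned long recent_expiry = 900UL;                     /* just expired */
        unsigned long stale_expiry = now - (~0UL / 2) - 1000UL;  /* very stale   */

        printf("recent: %d\n", time_before(recent_expiry, now)); /* 1: in the past     */
        printf("stale:  %d\n", time_before(stale_expiry, now));  /* 0: "in the future" */
        return 0;
}

Checking timer_prev first (was the timer already running?) means the code does not rely on such a potentially stale comparison when the rule has just been re-armed.
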
 /*
  * The actual xt_tables plugin.
  */
@@ -169,15 +381,23 @@ static unsigned int idletimer_tg_target(struct sk_buff *skb,
                                         const struct xt_action_param *par)
 {
        const struct idletimer_tg_info *info = par->targinfo;
+       unsigned long now = jiffies;
 
        pr_debug("resetting timer %s, timeout period %u\n",
                 info->label, info->timeout);
 
        BUG_ON(!info->timer);
 
-       mod_timer(&info->timer->timer,
-                 msecs_to_jiffies(info->timeout * 1000) + jiffies);
+       info->timer->active = true;
+
+       if (time_before(info->timer->timer.expires, now)) {
+               schedule_work(&info->timer->work);
+               pr_debug("Starting timer %s (Expired, Jiffies): %lu, %lu\n",
+                        info->label, info->timer->timer.expires, now);
+       }
 
+       /* TODO: Avoid modifying timers on each packet */
+       reset_timer(info, skb);
        return XT_CONTINUE;
 }
 
@@ -186,7 +406,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
        struct idletimer_tg_info *info = par->targinfo;
        int ret;
 
-       pr_debug("checkentry targinfo%s\n", info->label);
+       pr_debug("checkentry targinfo %s\n", info->label);
 
        if (info->timeout == 0) {
                pr_debug("timeout value is zero\n");
@@ -205,9 +425,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
        info->timer = __idletimer_tg_find_by_label(info->label);
        if (info->timer) {
                info->timer->refcnt++;
-               mod_timer(&info->timer->timer,
-                         msecs_to_jiffies(info->timeout * 1000) + jiffies);
-
+               reset_timer(info, NULL);
                pr_debug("increased refcnt of timer %s to %u\n",
                         info->label, info->timer->refcnt);
        } else {
@@ -220,6 +438,7 @@ static int idletimer_tg_checkentry(const struct xt_tgchk_param *par)
        }
 
        mutex_unlock(&list_mutex);
+
        return 0;
 }
 
@@ -236,13 +455,14 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
 
                list_del(&info->timer->entry);
                del_timer_sync(&info->timer->timer);
-               cancel_work_sync(&info->timer->work);
                sysfs_remove_file(idletimer_tg_kobj, &info->timer->attr.attr);
+               unregister_pm_notifier(&info->timer->pm_nb);
+               cancel_work_sync(&info->timer->work);
                kfree(info->timer->attr.attr.name);
                kfree(info->timer);
        } else {
                pr_debug("decreased refcnt of timer %s to %u\n",
-                        info->label, info->timer->refcnt);
+               info->label, info->timer->refcnt);
        }
 
        mutex_unlock(&list_mutex);
@@ -250,6 +470,7 @@ static void idletimer_tg_destroy(const struct xt_tgdtor_param *par)
 
 static struct xt_target idletimer_tg __read_mostly = {
        .name           = "IDLETIMER",
+       .revision       = 1,
        .family         = NFPROTO_UNSPEC,
        .target         = idletimer_tg_target,
        .targetsize     = sizeof(struct idletimer_tg_info),
@@ -315,3 +536,4 @@ MODULE_DESCRIPTION("Xtables: idle time monitor");
 MODULE_LICENSE("GPL v2");
 MODULE_ALIAS("ipt_IDLETIMER");
 MODULE_ALIAS("ip6t_IDLETIMER");
+MODULE_ALIAS("arpt_IDLETIMER");
index 29123934887bbfe5081178f9ce2425c5bb618a9c..041da0d9c06f2b1c2ecb31851932ac5a350122a9 100644 (file)
@@ -56,7 +56,7 @@ static int __bpf_mt_check_path(const char *path, struct bpf_prog **ret)
        int retval, fd;
 
        set_fs(KERNEL_DS);
-       fd = bpf_obj_get_user(path);
+       fd = bpf_obj_get_user(path, 0);
        set_fs(oldfs);
        if (fd < 0)
                return fd;
diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c
new file mode 100644 (file)
index 0000000..cd7c34b
--- /dev/null
@@ -0,0 +1,3032 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * There are run-time debug flags enabled via the debug_mask module param, or
+ * via the DEFAULT_DEBUG_MASK. See xt_qtaguid_internal.h.
+ */
+#define DEBUG
+
+#include <linux/file.h>
+#include <linux/inetdevice.h>
+#include <linux/module.h>
+#include <linux/miscdevice.h>
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_qtaguid.h>
+#include <linux/ratelimit.h>
+#include <linux/seq_file.h>
+#include <linux/skbuff.h>
+#include <linux/workqueue.h>
+#include <net/addrconf.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <net/udp.h>
+#include <net/netfilter/nf_socket.h>
+
+#if defined(CONFIG_IP6_NF_IPTABLES) || defined(CONFIG_IP6_NF_IPTABLES_MODULE)
+#include <linux/netfilter_ipv6/ip6_tables.h>
+#endif
+
+#include <linux/netfilter/xt_socket.h>
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+#include "../../fs/proc/internal.h"
+
+/*
+ * We only use the xt_socket funcs within a similar context to avoid unexpected
+ * return values.
+ */
+#define XT_SOCKET_SUPPORTED_HOOKS \
+       ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN))
+
+
+static const char *module_procdirname = "xt_qtaguid";
+static struct proc_dir_entry *xt_qtaguid_procdir;
+
+static unsigned int proc_iface_perms = S_IRUGO;
+module_param_named(iface_perms, proc_iface_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_stats_file;
+static unsigned int proc_stats_perms = S_IRUGO;
+module_param_named(stats_perms, proc_stats_perms, uint, S_IRUGO | S_IWUSR);
+
+static struct proc_dir_entry *xt_qtaguid_ctrl_file;
+
+/* Everybody can write. But proc_ctrl_write_limited is true by default which
+ * limits what can be controlled. See the can_*() functions.
+ */
+static unsigned int proc_ctrl_perms = S_IRUGO | S_IWUGO;
+module_param_named(ctrl_perms, proc_ctrl_perms, uint, S_IRUGO | S_IWUSR);
+
+/* Limited by default, so the gid of the ctrl and stats proc entries
+ * will limit what can be done. See the can_*() functions.
+ */
+static bool proc_stats_readall_limited = true;
+static bool proc_ctrl_write_limited = true;
+
+module_param_named(stats_readall_limited, proc_stats_readall_limited, bool,
+                  S_IRUGO | S_IWUSR);
+module_param_named(ctrl_write_limited, proc_ctrl_write_limited, bool,
+                  S_IRUGO | S_IWUSR);
+
+/*
+ * Limit the number of active tags (via socket tags) for a given UID.
+ * Multiple processes could share the UID.
+ */
+static int max_sock_tags = DEFAULT_MAX_SOCK_TAGS;
+module_param(max_sock_tags, int, S_IRUGO | S_IWUSR);
+
+/*
+ * After the kernel has initialized this module, it is still possible
+ * to make it passive.
+ * Setting passive to Y:
+ *  - the iface stats handling will not act on notifications.
+ *  - iptables matches will never match.
+ *  - ctrl commands silently succeed.
+ *  - stats are always empty.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool module_passive;
+module_param_named(passive, module_passive, bool, S_IRUGO | S_IWUSR);
+
+/*
+ * Control how qtaguid data is tracked per proc/uid.
+ * Setting tag_tracking_passive to Y:
+ *  - don't create proc specific structs to track tags
+ *  - don't check that active tag stats exceed some limits.
+ *  - don't clean up socket tags on process exits.
+ * This is mostly useful when a bug is suspected.
+ */
+static bool qtu_proc_handling_passive;
+module_param_named(tag_tracking_passive, qtu_proc_handling_passive, bool,
+                  S_IRUGO | S_IWUSR);
+
+#define QTU_DEV_NAME "xt_qtaguid"
+
+uint qtaguid_debug_mask = DEFAULT_DEBUG_MASK;
+module_param_named(debug_mask, qtaguid_debug_mask, uint, S_IRUGO | S_IWUSR);
+
+/*---------------------------------------------------------------------------*/
+static const char *iface_stat_procdirname = "iface_stat";
+static struct proc_dir_entry *iface_stat_procdir;
+/*
+ * The iface_stat_all* will go away once userspace gets used to the new fields
+ * that have a format line.
+ */
+static const char *iface_stat_all_procfilename = "iface_stat_all";
+static struct proc_dir_entry *iface_stat_all_procfile;
+static const char *iface_stat_fmt_procfilename = "iface_stat_fmt";
+static struct proc_dir_entry *iface_stat_fmt_procfile;
+
+
+static LIST_HEAD(iface_stat_list);
+static DEFINE_SPINLOCK(iface_stat_list_lock);
+
+static struct rb_root sock_tag_tree = RB_ROOT;
+static DEFINE_SPINLOCK(sock_tag_list_lock);
+
+static struct rb_root tag_counter_set_tree = RB_ROOT;
+static DEFINE_SPINLOCK(tag_counter_set_list_lock);
+
+static struct rb_root uid_tag_data_tree = RB_ROOT;
+static DEFINE_SPINLOCK(uid_tag_data_tree_lock);
+
+static struct rb_root proc_qtu_data_tree = RB_ROOT;
+/* No proc_qtu_data_tree_lock; use uid_tag_data_tree_lock */
+
+static struct qtaguid_event_counts qtu_events;
+/*----------------------------------------------*/
+static bool can_manipulate_uids(void)
+{
+       /* root pwnd */
+       return in_egroup_p(xt_qtaguid_ctrl_file->gid)
+               || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || unlikely(!proc_ctrl_write_limited)
+               || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static bool can_impersonate_uid(kuid_t uid)
+{
+       return uid_eq(uid, current_fsuid()) || can_manipulate_uids();
+}
+
+static bool can_read_other_uid_stats(kuid_t uid)
+{
+       /* root pwnd */
+       return in_egroup_p(xt_qtaguid_stats_file->gid)
+               || unlikely(!from_kuid(&init_user_ns, current_fsuid())) || uid_eq(uid, current_fsuid())
+               || unlikely(!proc_stats_readall_limited)
+               || unlikely(uid_eq(current_fsuid(), xt_qtaguid_ctrl_file->uid));
+}
+
+static inline void dc_add_byte_packets(struct data_counters *counters, int set,
+                                 enum ifs_tx_rx direction,
+                                 enum ifs_proto ifs_proto,
+                                 int bytes,
+                                 int packets)
+{
+       counters->bpc[set][direction][ifs_proto].bytes += bytes;
+       counters->bpc[set][direction][ifs_proto].packets += packets;
+}
+
+static struct tag_node *tag_node_tree_search(struct rb_root *root, tag_t tag)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct tag_node *data = rb_entry(node, struct tag_node, node);
+               int result;
+               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+                        " node=%p data=%p\n", tag, node, data);
+               result = tag_compare(tag, data->tag);
+               RB_DEBUG("qtaguid: tag_node_tree_search(0x%llx): "
+                        " data.tag=0x%llx (uid=%u) res=%d\n",
+                        tag, data->tag, get_uid_from_tag(data->tag), result);
+               if (result < 0)
+                       node = node->rb_left;
+               else if (result > 0)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static void tag_node_tree_insert(struct tag_node *data, struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct tag_node *this = rb_entry(*new, struct tag_node,
+                                                node);
+               int result = tag_compare(data->tag, this->tag);
+               RB_DEBUG("qtaguid: %s(): tag=0x%llx"
+                        " (uid=%u)\n", __func__,
+                        this->tag,
+                        get_uid_from_tag(this->tag));
+               parent = *new;
+               if (result < 0)
+                       new = &((*new)->rb_left);
+               else if (result > 0)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static void tag_stat_tree_insert(struct tag_stat *data, struct rb_root *root)
+{
+       tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_stat *tag_stat_tree_search(struct rb_root *root, tag_t tag)
+{
+       struct tag_node *node = tag_node_tree_search(root, tag);
+       if (!node)
+               return NULL;
+       return rb_entry(&node->node, struct tag_stat, tn.node);
+}
+
+static void tag_counter_set_tree_insert(struct tag_counter_set *data,
+                                       struct rb_root *root)
+{
+       tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_counter_set *tag_counter_set_tree_search(struct rb_root *root,
+                                                          tag_t tag)
+{
+       struct tag_node *node = tag_node_tree_search(root, tag);
+       if (!node)
+               return NULL;
+       return rb_entry(&node->node, struct tag_counter_set, tn.node);
+
+}
+
+static void tag_ref_tree_insert(struct tag_ref *data, struct rb_root *root)
+{
+       tag_node_tree_insert(&data->tn, root);
+}
+
+static struct tag_ref *tag_ref_tree_search(struct rb_root *root, tag_t tag)
+{
+       struct tag_node *node = tag_node_tree_search(root, tag);
+       if (!node)
+               return NULL;
+       return rb_entry(&node->node, struct tag_ref, tn.node);
+}
+
+static struct sock_tag *sock_tag_tree_search(struct rb_root *root,
+                                            const struct sock *sk)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct sock_tag *data = rb_entry(node, struct sock_tag,
+                                                sock_node);
+               if (sk < data->sk)
+                       node = node->rb_left;
+               else if (sk > data->sk)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static void sock_tag_tree_insert(struct sock_tag *data, struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct sock_tag *this = rb_entry(*new, struct sock_tag,
+                                                sock_node);
+               parent = *new;
+               if (data->sk < this->sk)
+                       new = &((*new)->rb_left);
+               else if (data->sk > this->sk)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->sock_node, parent, new);
+       rb_insert_color(&data->sock_node, root);
+}
+
+static void sock_tag_tree_erase(struct rb_root *st_to_free_tree)
+{
+       struct rb_node *node;
+       struct sock_tag *st_entry;
+
+       node = rb_first(st_to_free_tree);
+       while (node) {
+               st_entry = rb_entry(node, struct sock_tag, sock_node);
+               node = rb_next(node);
+               CT_DEBUG("qtaguid: %s(): "
+                        "erase st: sk=%p tag=0x%llx (uid=%u)\n", __func__,
+                        st_entry->sk,
+                        st_entry->tag,
+                        get_uid_from_tag(st_entry->tag));
+               rb_erase(&st_entry->sock_node, st_to_free_tree);
+               sock_put(st_entry->sk);
+               kfree(st_entry);
+       }
+}
+
+static struct proc_qtu_data *proc_qtu_data_tree_search(struct rb_root *root,
+                                                      const pid_t pid)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct proc_qtu_data *data = rb_entry(node,
+                                                     struct proc_qtu_data,
+                                                     node);
+               if (pid < data->pid)
+                       node = node->rb_left;
+               else if (pid > data->pid)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+static void proc_qtu_data_tree_insert(struct proc_qtu_data *data,
+                                     struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct proc_qtu_data *this = rb_entry(*new,
+                                                     struct proc_qtu_data,
+                                                     node);
+               parent = *new;
+               if (data->pid < this->pid)
+                       new = &((*new)->rb_left);
+               else if (data->pid > this->pid)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static void uid_tag_data_tree_insert(struct uid_tag_data *data,
+                                    struct rb_root *root)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+       /* Figure out where to put new node */
+       while (*new) {
+               struct uid_tag_data *this = rb_entry(*new,
+                                                    struct uid_tag_data,
+                                                    node);
+               parent = *new;
+               if (data->uid < this->uid)
+                       new = &((*new)->rb_left);
+               else if (data->uid > this->uid)
+                       new = &((*new)->rb_right);
+               else
+                       BUG();
+       }
+
+       /* Add new node and rebalance tree. */
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+}
+
+static struct uid_tag_data *uid_tag_data_tree_search(struct rb_root *root,
+                                                    uid_t uid)
+{
+       struct rb_node *node = root->rb_node;
+
+       while (node) {
+               struct uid_tag_data *data = rb_entry(node,
+                                                    struct uid_tag_data,
+                                                    node);
+               if (uid < data->uid)
+                       node = node->rb_left;
+               else if (uid > data->uid)
+                       node = node->rb_right;
+               else
+                       return data;
+       }
+       return NULL;
+}
+
+/*
+ * Allocates a new uid_tag_data struct if needed.
+ * Returns a pointer to the found or allocated uid_tag_data.
+ * Returns a PTR_ERR on failures, and lock is not held.
+ * If found is not NULL:
+ *   sets *found to true if not allocated.
+ *   sets *found to false if allocated.
+ */
+struct uid_tag_data *get_uid_data(uid_t uid, bool *found_res)
+{
+       struct uid_tag_data *utd_entry;
+
+       /* Look for top level uid_tag_data for the UID */
+       utd_entry = uid_tag_data_tree_search(&uid_tag_data_tree, uid);
+       DR_DEBUG("qtaguid: get_uid_data(%u) utd=%p\n", uid, utd_entry);
+
+       if (found_res)
+               *found_res = utd_entry;
+       if (utd_entry)
+               return utd_entry;
+
+       utd_entry = kzalloc(sizeof(*utd_entry), GFP_ATOMIC);
+       if (!utd_entry) {
+               pr_err("qtaguid: get_uid_data(%u): "
+                      "tag data alloc failed\n", uid);
+               return ERR_PTR(-ENOMEM);
+       }
+
+       utd_entry->uid = uid;
+       utd_entry->tag_ref_tree = RB_ROOT;
+       uid_tag_data_tree_insert(utd_entry, &uid_tag_data_tree);
+       DR_DEBUG("qtaguid: get_uid_data(%u) new utd=%p\n", uid, utd_entry);
+       return utd_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *new_tag_ref(tag_t new_tag,
+                                  struct uid_tag_data *utd_entry)
+{
+       struct tag_ref *tr_entry;
+       int res;
+
+       if (utd_entry->num_active_tags + 1 > max_sock_tags) {
+               pr_info("qtaguid: new_tag_ref(0x%llx): "
+                       "tag ref alloc quota exceeded. max=%d\n",
+                       new_tag, max_sock_tags);
+               res = -EMFILE;
+               goto err_res;
+
+       }
+
+       tr_entry = kzalloc(sizeof(*tr_entry), GFP_ATOMIC);
+       if (!tr_entry) {
+               pr_err("qtaguid: new_tag_ref(0x%llx): "
+                      "tag ref alloc failed\n",
+                      new_tag);
+               res = -ENOMEM;
+               goto err_res;
+       }
+       tr_entry->tn.tag = new_tag;
+       /* tr_entry->num_sock_tags  handled by caller */
+       utd_entry->num_active_tags++;
+       tag_ref_tree_insert(tr_entry, &utd_entry->tag_ref_tree);
+       DR_DEBUG("qtaguid: new_tag_ref(0x%llx): "
+                " inserted new tag ref %p\n",
+                new_tag, tr_entry);
+       return tr_entry;
+
+err_res:
+       return ERR_PTR(res);
+}
+
+static struct tag_ref *lookup_tag_ref(tag_t full_tag,
+                                     struct uid_tag_data **utd_res)
+{
+       struct uid_tag_data *utd_entry;
+       struct tag_ref *tr_entry;
+       bool found_utd;
+       uid_t uid = get_uid_from_tag(full_tag);
+
+       DR_DEBUG("qtaguid: lookup_tag_ref(tag=0x%llx (uid=%u))\n",
+                full_tag, uid);
+
+       utd_entry = get_uid_data(uid, &found_utd);
+       if (IS_ERR_OR_NULL(utd_entry)) {
+               if (utd_res)
+                       *utd_res = utd_entry;
+               return NULL;
+       }
+
+       tr_entry = tag_ref_tree_search(&utd_entry->tag_ref_tree, full_tag);
+       if (utd_res)
+               *utd_res = utd_entry;
+       DR_DEBUG("qtaguid: lookup_tag_ref(0x%llx) utd_entry=%p tr_entry=%p\n",
+                full_tag, utd_entry, tr_entry);
+       return tr_entry;
+}
+
+/* Never returns NULL. Either PTR_ERR or a valid ptr. */
+static struct tag_ref *get_tag_ref(tag_t full_tag,
+                                  struct uid_tag_data **utd_res)
+{
+       struct uid_tag_data *utd_entry;
+       struct tag_ref *tr_entry;
+
+       DR_DEBUG("qtaguid: get_tag_ref(0x%llx)\n",
+                full_tag);
+       tr_entry = lookup_tag_ref(full_tag, &utd_entry);
+       BUG_ON(IS_ERR_OR_NULL(utd_entry));
+       if (!tr_entry)
+               tr_entry = new_tag_ref(full_tag, utd_entry);
+
+       if (utd_res)
+               *utd_res = utd_entry;
+       DR_DEBUG("qtaguid: get_tag_ref(0x%llx) utd=%p tr=%p\n",
+                full_tag, utd_entry, tr_entry);
+       return tr_entry;
+}
+
+/* Checks and maybe frees the UID Tag Data entry */
+static void put_utd_entry(struct uid_tag_data *utd_entry)
+{
+       /* Are we done with the UID tag data entry? */
+       if (RB_EMPTY_ROOT(&utd_entry->tag_ref_tree) &&
+               !utd_entry->num_pqd) {
+               DR_DEBUG("qtaguid: %s(): "
+                        "erase utd_entry=%p uid=%u "
+                        "by pid=%u tgid=%u uid=%u\n", __func__,
+                        utd_entry, utd_entry->uid,
+                        current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+               BUG_ON(utd_entry->num_active_tags);
+               rb_erase(&utd_entry->node, &uid_tag_data_tree);
+               kfree(utd_entry);
+       } else {
+               DR_DEBUG("qtaguid: %s(): "
+                        "utd_entry=%p still has %d tags %d proc_qtu_data\n",
+                        __func__, utd_entry, utd_entry->num_active_tags,
+                        utd_entry->num_pqd);
+               BUG_ON(!(utd_entry->num_active_tags ||
+                        utd_entry->num_pqd));
+       }
+}
+
+/*
+ * If no sock_tags are using this tag_ref,
+ * decrements refcount of utd_entry, removes tr_entry
+ * from utd_entry->tag_ref_tree and frees.
+ */
+static void free_tag_ref_from_utd_entry(struct tag_ref *tr_entry,
+                                       struct uid_tag_data *utd_entry)
+{
+       DR_DEBUG("qtaguid: %s(): %p tag=0x%llx (uid=%u)\n", __func__,
+                tr_entry, tr_entry->tn.tag,
+                get_uid_from_tag(tr_entry->tn.tag));
+       if (!tr_entry->num_sock_tags) {
+               BUG_ON(!utd_entry->num_active_tags);
+               utd_entry->num_active_tags--;
+               rb_erase(&tr_entry->tn.node, &utd_entry->tag_ref_tree);
+               DR_DEBUG("qtaguid: %s(): erased %p\n", __func__, tr_entry);
+               kfree(tr_entry);
+       }
+}
+
+static void put_tag_ref_tree(tag_t full_tag, struct uid_tag_data *utd_entry)
+{
+       struct rb_node *node;
+       struct tag_ref *tr_entry;
+       tag_t acct_tag;
+
+       DR_DEBUG("qtaguid: %s(tag=0x%llx (uid=%u))\n", __func__,
+                full_tag, get_uid_from_tag(full_tag));
+       acct_tag = get_atag_from_tag(full_tag);
+       node = rb_first(&utd_entry->tag_ref_tree);
+       while (node) {
+               tr_entry = rb_entry(node, struct tag_ref, tn.node);
+               node = rb_next(node);
+               if (!acct_tag || tr_entry->tn.tag == full_tag)
+                       free_tag_ref_from_utd_entry(tr_entry, utd_entry);
+       }
+}
+
+static ssize_t read_proc_u64(struct file *file, char __user *buf,
+                        size_t size, loff_t *ppos)
+{
+       uint64_t *valuep = PDE_DATA(file_inode(file));
+       char tmp[24];
+       size_t tmp_size;
+
+       tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", *valuep);
+       return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t read_proc_bool(struct file *file, char __user *buf,
+                         size_t size, loff_t *ppos)
+{
+       bool *valuep = PDE_DATA(file_inode(file));
+       char tmp[24];
+       size_t tmp_size;
+
+       tmp_size = scnprintf(tmp, sizeof(tmp), "%u\n", *valuep);
+       return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static int get_active_counter_set(tag_t tag)
+{
+       int active_set = 0;
+       struct tag_counter_set *tcs;
+
+       MT_DEBUG("qtaguid: get_active_counter_set(tag=0x%llx)"
+                " (uid=%u)\n",
+                tag, get_uid_from_tag(tag));
+       /* For now we only handle UID tags for active sets */
+       tag = get_utag_from_tag(tag);
+       spin_lock_bh(&tag_counter_set_list_lock);
+       tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+       if (tcs)
+               active_set = tcs->active_set;
+       spin_unlock_bh(&tag_counter_set_list_lock);
+       return active_set;
+}
+
+/*
+ * Find the entry for tracking the specified interface.
+ * Caller must hold iface_stat_list_lock
+ */
+static struct iface_stat *get_iface_entry(const char *ifname)
+{
+       struct iface_stat *iface_entry;
+
+       /* Find the entry for tracking the specified tag within the interface */
+       if (ifname == NULL) {
+               pr_info("qtaguid: iface_stat: get() NULL device name\n");
+               return NULL;
+       }
+
+       /* Iterate over interfaces */
+       list_for_each_entry(iface_entry, &iface_stat_list, list) {
+               if (!strcmp(ifname, iface_entry->ifname))
+                       goto done;
+       }
+       iface_entry = NULL;
+done:
+       return iface_entry;
+}
+
+/* This is for fmt2 only */
+static void pp_iface_stat_header(struct seq_file *m)
+{
+       seq_puts(m,
+                "ifname "
+                "total_skb_rx_bytes total_skb_rx_packets "
+                "total_skb_tx_bytes total_skb_tx_packets "
+                "rx_tcp_bytes rx_tcp_packets "
+                "rx_udp_bytes rx_udp_packets "
+                "rx_other_bytes rx_other_packets "
+                "tx_tcp_bytes tx_tcp_packets "
+                "tx_udp_bytes tx_udp_packets "
+                "tx_other_bytes tx_other_packets\n"
+       );
+}
+
+static void pp_iface_stat_line(struct seq_file *m,
+                              struct iface_stat *iface_entry)
+{
+       struct data_counters *cnts;
+       int cnt_set = 0;   /* We only use one set for the device */
+       cnts = &iface_entry->totals_via_skb;
+       seq_printf(m, "%s %llu %llu %llu %llu %llu %llu %llu %llu "
+                  "%llu %llu %llu %llu %llu %llu %llu %llu\n",
+                  iface_entry->ifname,
+                  dc_sum_bytes(cnts, cnt_set, IFS_RX),
+                  dc_sum_packets(cnts, cnt_set, IFS_RX),
+                  dc_sum_bytes(cnts, cnt_set, IFS_TX),
+                  dc_sum_packets(cnts, cnt_set, IFS_TX),
+                  cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+                  cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+                  cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+                  cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+                  cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+                  cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+                  cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+}
+
+struct proc_iface_stat_fmt_info {
+       int fmt;
+};
+
+static void *iface_stat_fmt_proc_start(struct seq_file *m, loff_t *pos)
+{
+       struct proc_iface_stat_fmt_info *p = m->private;
+       loff_t n = *pos;
+
+       /*
+        * This lock will prevent iface_stat_update() from changing active,
+        * and in turn prevent an interface from unregistering itself.
+        */
+       spin_lock_bh(&iface_stat_list_lock);
+
+       if (unlikely(module_passive))
+               return NULL;
+
+       if (!n && p->fmt == 2)
+               pp_iface_stat_header(m);
+
+       return seq_list_start(&iface_stat_list, n);
+}
+
+static void *iface_stat_fmt_proc_next(struct seq_file *m, void *p, loff_t *pos)
+{
+       return seq_list_next(p, &iface_stat_list, pos);
+}
+
+static void iface_stat_fmt_proc_stop(struct seq_file *m, void *p)
+{
+       spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_stat_fmt_proc_show(struct seq_file *m, void *v)
+{
+       struct proc_iface_stat_fmt_info *p = m->private;
+       struct iface_stat *iface_entry;
+       struct rtnl_link_stats64 dev_stats, *stats;
+       struct rtnl_link_stats64 no_dev_stats = {0};
+
+
+       CT_DEBUG("qtaguid:proc iface_stat_fmt pid=%u tgid=%u uid=%u\n",
+                current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+       iface_entry = list_entry(v, struct iface_stat, list);
+
+       if (iface_entry->active) {
+               stats = dev_get_stats(iface_entry->net_dev,
+                                     &dev_stats);
+       } else {
+               stats = &no_dev_stats;
+       }
+       /*
+        * If the meaning of the data changes, then update the fmtX
+        * string.
+        */
+       if (p->fmt == 1) {
+               seq_printf(m, "%s %d %llu %llu %llu %llu %llu %llu %llu %llu\n",
+                          iface_entry->ifname,
+                          iface_entry->active,
+                          iface_entry->totals_via_dev[IFS_RX].bytes,
+                          iface_entry->totals_via_dev[IFS_RX].packets,
+                          iface_entry->totals_via_dev[IFS_TX].bytes,
+                          iface_entry->totals_via_dev[IFS_TX].packets,
+                          stats->rx_bytes, stats->rx_packets,
+                          stats->tx_bytes, stats->tx_packets
+                          );
+       } else {
+               pp_iface_stat_line(m, iface_entry);
+       }
+       return 0;
+}
+
+static const struct file_operations read_u64_fops = {
+       .read           = read_proc_u64,
+       .llseek         = default_llseek,
+};
+
+static const struct file_operations read_bool_fops = {
+       .read           = read_proc_bool,
+       .llseek         = default_llseek,
+};
+
+static void iface_create_proc_worker(struct work_struct *work)
+{
+       struct proc_dir_entry *proc_entry;
+       struct iface_stat_work *isw = container_of(work, struct iface_stat_work,
+                                                  iface_work);
+       struct iface_stat *new_iface  = isw->iface_entry;
+
+       /* iface_entries are not deleted, so safe to manipulate. */
+       proc_entry = proc_mkdir(new_iface->ifname, iface_stat_procdir);
+       if (IS_ERR_OR_NULL(proc_entry)) {
+               pr_err("qtaguid: iface_stat: create_proc(): alloc failed.\n");
+               kfree(isw);
+               return;
+       }
+
+       new_iface->proc_ptr = proc_entry;
+
+       proc_create_data("tx_bytes", proc_iface_perms, proc_entry,
+                        &read_u64_fops,
+                        &new_iface->totals_via_dev[IFS_TX].bytes);
+       proc_create_data("rx_bytes", proc_iface_perms, proc_entry,
+                        &read_u64_fops,
+                        &new_iface->totals_via_dev[IFS_RX].bytes);
+       proc_create_data("tx_packets", proc_iface_perms, proc_entry,
+                        &read_u64_fops,
+                        &new_iface->totals_via_dev[IFS_TX].packets);
+       proc_create_data("rx_packets", proc_iface_perms, proc_entry,
+                        &read_u64_fops,
+                        &new_iface->totals_via_dev[IFS_RX].packets);
+       proc_create_data("active", proc_iface_perms, proc_entry,
+                        &read_bool_fops, &new_iface->active);
+
+       IF_DEBUG("qtaguid: iface_stat: create_proc(): done "
+                "entry=%p dev=%s\n", new_iface, new_iface->ifname);
+       kfree(isw);
+}
+
+/*
+ * Will set the entry's active state, and
+ * update the net_dev accordingly also.
+ */
+static void _iface_stat_set_active(struct iface_stat *entry,
+                                  struct net_device *net_dev,
+                                  bool activate)
+{
+       if (activate) {
+               entry->net_dev = net_dev;
+               entry->active = true;
+               IF_DEBUG("qtaguid: %s(%s): "
+                        "enable tracking. rfcnt=%d\n", __func__,
+                        entry->ifname,
+                        __this_cpu_read(*net_dev->pcpu_refcnt));
+       } else {
+               entry->active = false;
+               entry->net_dev = NULL;
+               IF_DEBUG("qtaguid: %s(%s): "
+                        "disable tracking. rfcnt=%d\n", __func__,
+                        entry->ifname,
+                        __this_cpu_read(*net_dev->pcpu_refcnt));
+
+       }
+}
+
+/* Caller must hold iface_stat_list_lock */
+static struct iface_stat *iface_alloc(struct net_device *net_dev)
+{
+       struct iface_stat *new_iface;
+       struct iface_stat_work *isw;
+
+       new_iface = kzalloc(sizeof(*new_iface), GFP_ATOMIC);
+       if (new_iface == NULL) {
+               pr_err("qtaguid: iface_stat: create(%s): "
+                      "iface_stat alloc failed\n", net_dev->name);
+               return NULL;
+       }
+       new_iface->ifname = kstrdup(net_dev->name, GFP_ATOMIC);
+       if (new_iface->ifname == NULL) {
+               pr_err("qtaguid: iface_stat: create(%s): "
+                      "ifname alloc failed\n", net_dev->name);
+               kfree(new_iface);
+               return NULL;
+       }
+       spin_lock_init(&new_iface->tag_stat_list_lock);
+       new_iface->tag_stat_tree = RB_ROOT;
+       _iface_stat_set_active(new_iface, net_dev, true);
+
+       /*
+        * ipv6 notifier chains are atomic :(
+        * No create_proc_read_entry() for you!
+        */
+       isw = kmalloc(sizeof(*isw), GFP_ATOMIC);
+       if (!isw) {
+               pr_err("qtaguid: iface_stat: create(%s): "
+                      "work alloc failed\n", new_iface->ifname);
+               _iface_stat_set_active(new_iface, net_dev, false);
+               kfree(new_iface->ifname);
+               kfree(new_iface);
+               return NULL;
+       }
+       isw->iface_entry = new_iface;
+       INIT_WORK(&isw->iface_work, iface_create_proc_worker);
+       schedule_work(&isw->iface_work);
+       list_add(&new_iface->list, &iface_stat_list);
+       return new_iface;
+}
+
+static void iface_check_stats_reset_and_adjust(struct net_device *net_dev,
+                                              struct iface_stat *iface)
+{
+       struct rtnl_link_stats64 dev_stats, *stats;
+       bool stats_rewound;
+
+       stats = dev_get_stats(net_dev, &dev_stats);
+       /* No empty packets */
+       stats_rewound =
+               (stats->rx_bytes < iface->last_known[IFS_RX].bytes)
+               || (stats->tx_bytes < iface->last_known[IFS_TX].bytes);
+
+       IF_DEBUG("qtaguid: %s(%s): iface=%p netdev=%p "
+                "bytes rx/tx=%llu/%llu "
+                "active=%d last_known=%d "
+                "stats_rewound=%d\n", __func__,
+                net_dev ? net_dev->name : "?",
+                iface, net_dev,
+                stats->rx_bytes, stats->tx_bytes,
+                iface->active, iface->last_known_valid, stats_rewound);
+
+       if (iface->active && iface->last_known_valid && stats_rewound) {
+               pr_warn_once("qtaguid: iface_stat: %s(%s): "
+                            "iface reset its stats unexpectedly\n", __func__,
+                            net_dev->name);
+
+               iface->totals_via_dev[IFS_TX].bytes +=
+                       iface->last_known[IFS_TX].bytes;
+               iface->totals_via_dev[IFS_TX].packets +=
+                       iface->last_known[IFS_TX].packets;
+               iface->totals_via_dev[IFS_RX].bytes +=
+                       iface->last_known[IFS_RX].bytes;
+               iface->totals_via_dev[IFS_RX].packets +=
+                       iface->last_known[IFS_RX].packets;
+               iface->last_known_valid = false;
+               IF_DEBUG("qtaguid: %s(%s): iface=%p "
+                        "used last known bytes rx/tx=%llu/%llu\n", __func__,
+                        iface->ifname, iface, iface->last_known[IFS_RX].bytes,
+                        iface->last_known[IFS_TX].bytes);
+       }
+}
+
+/*
+ * Create a new entry for tracking the specified interface.
+ * Do nothing if the entry already exists.
+ * Called when an interface is configured with a valid IP address.
+ */
+static void iface_stat_create(struct net_device *net_dev,
+                             struct in_ifaddr *ifa)
+{
+       struct in_device *in_dev = NULL;
+       const char *ifname;
+       struct iface_stat *entry;
+       __be32 ipaddr = 0;
+       struct iface_stat *new_iface;
+
+       IF_DEBUG("qtaguid: iface_stat: create(%s): ifa=%p netdev=%p\n",
+                net_dev ? net_dev->name : "?",
+                ifa, net_dev);
+       if (!net_dev) {
+               pr_err("qtaguid: iface_stat: create(): no net dev\n");
+               return;
+       }
+
+       ifname = net_dev->name;
+       if (!ifa) {
+               in_dev = in_dev_get(net_dev);
+               if (!in_dev) {
+                       pr_err("qtaguid: iface_stat: create(%s): no inet dev\n",
+                              ifname);
+                       return;
+               }
+               IF_DEBUG("qtaguid: iface_stat: create(%s): in_dev=%p\n",
+                        ifname, in_dev);
+               for (ifa = in_dev->ifa_list; ifa; ifa = ifa->ifa_next) {
+                       IF_DEBUG("qtaguid: iface_stat: create(%s): "
+                                "ifa=%p ifa_label=%s\n",
+                                ifname, ifa, ifa->ifa_label);
+                       if (!strcmp(ifname, ifa->ifa_label))
+                               break;
+               }
+       }
+
+       if (!ifa) {
+               IF_DEBUG("qtaguid: iface_stat: create(%s): no matching IP\n",
+                        ifname);
+               goto done_put;
+       }
+       ipaddr = ifa->ifa_local;
+
+       spin_lock_bh(&iface_stat_list_lock);
+       entry = get_iface_entry(ifname);
+       if (entry != NULL) {
+               IF_DEBUG("qtaguid: iface_stat: create(%s): entry=%p\n",
+                        ifname, entry);
+               iface_check_stats_reset_and_adjust(net_dev, entry);
+               _iface_stat_set_active(entry, net_dev, true);
+               IF_DEBUG("qtaguid: %s(%s): "
+                        "tracking now %d on ip=%pI4\n", __func__,
+                        entry->ifname, true, &ipaddr);
+               goto done_unlock_put;
+       }
+
+       new_iface = iface_alloc(net_dev);
+       IF_DEBUG("qtaguid: iface_stat: create(%s): done "
+                "entry=%p ip=%pI4\n", ifname, new_iface, &ipaddr);
+done_unlock_put:
+       spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+       if (in_dev)
+               in_dev_put(in_dev);
+}
+
+static void iface_stat_create_ipv6(struct net_device *net_dev,
+                                  struct inet6_ifaddr *ifa)
+{
+       struct in_device *in_dev;
+       const char *ifname;
+       struct iface_stat *entry;
+       struct iface_stat *new_iface;
+       int addr_type;
+
+       IF_DEBUG("qtaguid: iface_stat: create6(): ifa=%p netdev=%p->name=%s\n",
+                ifa, net_dev, net_dev ? net_dev->name : "");
+       if (!net_dev) {
+               pr_err("qtaguid: iface_stat: create6(): no net dev!\n");
+               return;
+       }
+       ifname = net_dev->name;
+
+       in_dev = in_dev_get(net_dev);
+       if (!in_dev) {
+               pr_err("qtaguid: iface_stat: create6(%s): no inet dev\n",
+                      ifname);
+               return;
+       }
+
+       IF_DEBUG("qtaguid: iface_stat: create6(%s): in_dev=%p\n",
+                ifname, in_dev);
+
+       if (!ifa) {
+               IF_DEBUG("qtaguid: iface_stat: create6(%s): no matching IP\n",
+                        ifname);
+               goto done_put;
+       }
+       addr_type = ipv6_addr_type(&ifa->addr);
+
+       spin_lock_bh(&iface_stat_list_lock);
+       entry = get_iface_entry(ifname);
+       if (entry != NULL) {
+               IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+                        ifname, entry);
+               iface_check_stats_reset_and_adjust(net_dev, entry);
+               _iface_stat_set_active(entry, net_dev, true);
+               IF_DEBUG("qtaguid: %s(%s): "
+                        "tracking now %d on ip=%pI6c\n", __func__,
+                        entry->ifname, true, &ifa->addr);
+               goto done_unlock_put;
+       }
+
+       new_iface = iface_alloc(net_dev);
+       IF_DEBUG("qtaguid: iface_stat: create6(%s): done "
+                "entry=%p ip=%pI6c\n", ifname, new_iface, &ifa->addr);
+
+done_unlock_put:
+       spin_unlock_bh(&iface_stat_list_lock);
+done_put:
+       in_dev_put(in_dev);
+}
+
+static struct sock_tag *get_sock_stat_nl(const struct sock *sk)
+{
+       MT_DEBUG("qtaguid: get_sock_stat_nl(sk=%p)\n", sk);
+       return sock_tag_tree_search(&sock_tag_tree, sk);
+}
+
+static struct sock_tag *get_sock_stat(const struct sock *sk)
+{
+       struct sock_tag *sock_tag_entry;
+       MT_DEBUG("qtaguid: get_sock_stat(sk=%p)\n", sk);
+       if (!sk)
+               return NULL;
+       spin_lock_bh(&sock_tag_list_lock);
+       sock_tag_entry = get_sock_stat_nl(sk);
+       spin_unlock_bh(&sock_tag_list_lock);
+       return sock_tag_entry;
+}
+
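+/*
+ * Return the transport-layer protocol of the skb: the protocol field for
+ * IPv4, the header located by ipv6_find_hdr() for IPv6, and IPPROTO_RAW
+ * for any other family.
+ */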
+static int ipx_proto(const struct sk_buff *skb,
+                    struct xt_action_param *par)
+{
+       int thoff = 0, tproto;
+
+       switch (par->state->pf) {
+       case NFPROTO_IPV6:
+               tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
+               if (tproto < 0)
+                       MT_DEBUG("%s(): transport header not found in ipv6"
+                                " skb=%p\n", __func__, skb);
+               break;
+       case NFPROTO_IPV4:
+               tproto = ip_hdr(skb)->protocol;
+               break;
+       default:
+               tproto = IPPROTO_RAW;
+       }
+       return tproto;
+}
+
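+/*
+ * Bucket the byte/packet count into the TCP, UDP or "other" counters of
+ * the given counter set and direction.
+ */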
+static void
+data_counters_update(struct data_counters *dc, int set,
+                    enum ifs_tx_rx direction, int proto, int bytes)
+{
+       switch (proto) {
+       case IPPROTO_TCP:
+               dc_add_byte_packets(dc, set, direction, IFS_TCP, bytes, 1);
+               break;
+       case IPPROTO_UDP:
+               dc_add_byte_packets(dc, set, direction, IFS_UDP, bytes, 1);
+               break;
+       case IPPROTO_IP:
+       default:
+               dc_add_byte_packets(dc, set, direction, IFS_PROTO_OTHER, bytes,
+                                   1);
+               break;
+       }
+}
+
+/*
+ * Update stats for the specified interface. Do nothing if the entry
+ * does not exist (when a device was never configured with an IP address).
+ * Called when a device is being unregistered.
+ */
+static void iface_stat_update(struct net_device *net_dev, bool stash_only)
+{
+       struct rtnl_link_stats64 dev_stats, *stats;
+       struct iface_stat *entry;
+
+       stats = dev_get_stats(net_dev, &dev_stats);
+       spin_lock_bh(&iface_stat_list_lock);
+       entry = get_iface_entry(net_dev->name);
+       if (entry == NULL) {
+               IF_DEBUG("qtaguid: iface_stat: update(%s): not tracked\n",
+                        net_dev->name);
+               spin_unlock_bh(&iface_stat_list_lock);
+               return;
+       }
+
+       IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__,
+                net_dev->name, entry);
+       if (!entry->active) {
+               IF_DEBUG("qtaguid: %s(%s): already disabled\n", __func__,
+                        net_dev->name);
+               spin_unlock_bh(&iface_stat_list_lock);
+               return;
+       }
+
+       if (stash_only) {
+               entry->last_known[IFS_TX].bytes = stats->tx_bytes;
+               entry->last_known[IFS_TX].packets = stats->tx_packets;
+               entry->last_known[IFS_RX].bytes = stats->rx_bytes;
+               entry->last_known[IFS_RX].packets = stats->rx_packets;
+               entry->last_known_valid = true;
+               IF_DEBUG("qtaguid: %s(%s): "
+                        "dev stats stashed rx/tx=%llu/%llu\n", __func__,
+                        net_dev->name, stats->rx_bytes, stats->tx_bytes);
+               spin_unlock_bh(&iface_stat_list_lock);
+               return;
+       }
+       entry->totals_via_dev[IFS_TX].bytes += stats->tx_bytes;
+       entry->totals_via_dev[IFS_TX].packets += stats->tx_packets;
+       entry->totals_via_dev[IFS_RX].bytes += stats->rx_bytes;
+       entry->totals_via_dev[IFS_RX].packets += stats->rx_packets;
+       /* We don't need the last_known[] anymore */
+       entry->last_known_valid = false;
+       _iface_stat_set_active(entry, net_dev, false);
+       IF_DEBUG("qtaguid: %s(%s): "
+                "disable tracking. rx/tx=%llu/%llu\n", __func__,
+                net_dev->name, stats->rx_bytes, stats->tx_bytes);
+       spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/* Guaranteed to return a net_device that has a name */
+static void get_dev_and_dir(const struct sk_buff *skb,
+                           struct xt_action_param *par,
+                           enum ifs_tx_rx *direction,
+                           const struct net_device **el_dev)
+{
+       const struct nf_hook_state *parst = par->state;
+
+       BUG_ON(!direction || !el_dev);
+
+       if (parst->in) {
+               *el_dev = parst->in;
+               *direction = IFS_RX;
+       } else if (parst->out) {
+               *el_dev = parst->out;
+               *direction = IFS_TX;
+       } else {
+               pr_err("qtaguid[%d]: %s(): no par->state->in/out?!!\n",
+                      parst->hook, __func__);
+               BUG();
+       }
+       if (unlikely(!(*el_dev)->name)) {
+               pr_err("qtaguid[%d]: %s(): no dev->name?!!\n",
+                      parst->hook, __func__);
+               BUG();
+       }
+       if (skb->dev && *el_dev != skb->dev) {
+               MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n",
+                        parst->hook, skb->dev, skb->dev->name,
+                        *direction == IFS_RX ? "in" : "out",  *el_dev,
+                        (*el_dev)->name);
+       }
+}
+
+/*
+ * Update stats for the specified interface from the skb.
+ * Do nothing if the entry does not exist (when a device was never
+ * configured with an IP address).
+ * Called on each sk.
+ */
+static void iface_stat_update_from_skb(const struct sk_buff *skb,
+                                      struct xt_action_param *par)
+{
+       const struct nf_hook_state *parst = par->state;
+       struct iface_stat *entry;
+       const struct net_device *el_dev;
+       enum ifs_tx_rx direction;
+       int bytes = skb->len;
+       int proto;
+
+       get_dev_and_dir(skb, par, &direction, &el_dev);
+       proto = ipx_proto(skb, par);
+       MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): "
+                "type=%d fam=%d proto=%d dir=%d\n",
+                parst->hook, __func__, el_dev->name, el_dev->type,
+                parst->pf, proto, direction);
+
+       spin_lock_bh(&iface_stat_list_lock);
+       entry = get_iface_entry(el_dev->name);
+       if (entry == NULL) {
+               IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n",
+                        parst->hook, __func__, el_dev->name);
+               spin_unlock_bh(&iface_stat_list_lock);
+               return;
+       }
+
+       IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", parst->hook,  __func__,
+                el_dev->name, entry);
+
+       data_counters_update(&entry->totals_via_skb, 0, direction, proto,
+                            bytes);
+       spin_unlock_bh(&iface_stat_list_lock);
+}
+
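+/*
+ * Update the counters of a tag_stat entry in its currently active counter
+ * set, and mirror the update into the parent {0, uid_tag} counters when
+ * the entry has one.
+ */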
+static void tag_stat_update(struct tag_stat *tag_entry,
+                       enum ifs_tx_rx direction, int proto, int bytes)
+{
+       int active_set;
+       active_set = get_active_counter_set(tag_entry->tn.tag);
+       MT_DEBUG("qtaguid: tag_stat_update(tag=0x%llx (uid=%u) set=%d "
+                "dir=%d proto=%d bytes=%d)\n",
+                tag_entry->tn.tag, get_uid_from_tag(tag_entry->tn.tag),
+                active_set, direction, proto, bytes);
+       data_counters_update(&tag_entry->counters, active_set, direction,
+                            proto, bytes);
+       if (tag_entry->parent_counters)
+               data_counters_update(tag_entry->parent_counters, active_set,
+                                    direction, proto, bytes);
+}
+
+/*
+ * Create a new entry for tracking the specified {acct_tag,uid_tag} within
+ * the interface.
+ * iface_entry->tag_stat_list_lock should be held.
+ */
+static struct tag_stat *create_if_tag_stat(struct iface_stat *iface_entry,
+                                          tag_t tag)
+{
+       struct tag_stat *new_tag_stat_entry = NULL;
+       IF_DEBUG("qtaguid: iface_stat: %s(): ife=%p tag=0x%llx"
+                " (uid=%u)\n", __func__,
+                iface_entry, tag, get_uid_from_tag(tag));
+       new_tag_stat_entry = kzalloc(sizeof(*new_tag_stat_entry), GFP_ATOMIC);
+       if (!new_tag_stat_entry) {
+               pr_err("qtaguid: iface_stat: tag stat alloc failed\n");
+               goto done;
+       }
+       new_tag_stat_entry->tn.tag = tag;
+       tag_stat_tree_insert(new_tag_stat_entry, &iface_entry->tag_stat_tree);
+done:
+       return new_tag_stat_entry;
+}
+
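+/*
+ * Account bytes/packets against the {acct_tag, uid_tag} and {0, uid_tag}
+ * entries of the given interface, creating the tag_stat entries on demand.
+ */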
+static void if_tag_stat_update(const char *ifname, uid_t uid,
+                              const struct sock *sk, enum ifs_tx_rx direction,
+                              int proto, int bytes)
+{
+       struct tag_stat *tag_stat_entry;
+       tag_t tag, acct_tag;
+       tag_t uid_tag;
+       struct data_counters *uid_tag_counters;
+       struct sock_tag *sock_tag_entry;
+       struct iface_stat *iface_entry;
+       struct tag_stat *new_tag_stat = NULL;
+       MT_DEBUG("qtaguid: if_tag_stat_update(ifname=%s "
+               "uid=%u sk=%p dir=%d proto=%d bytes=%d)\n",
+                ifname, uid, sk, direction, proto, bytes);
+
+       spin_lock_bh(&iface_stat_list_lock);
+       iface_entry = get_iface_entry(ifname);
+       if (!iface_entry) {
+               pr_err_ratelimited("qtaguid: tag_stat: stat_update() "
+                                  "%s not found\n", ifname);
+               spin_unlock_bh(&iface_stat_list_lock);
+               return;
+       }
+       /* It is ok to process data when an iface_entry is inactive */
+
+       MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n",
+                ifname, iface_entry);
+
+       /*
+        * Look for a tagged sock.
+        * It will have an acct_uid.
+        */
+       sock_tag_entry = get_sock_stat(sk);
+       if (sock_tag_entry) {
+               tag = sock_tag_entry->tag;
+               acct_tag = get_atag_from_tag(tag);
+               uid_tag = get_utag_from_tag(tag);
+       } else {
+               acct_tag = make_atag_from_value(0);
+               tag = combine_atag_with_uid(acct_tag, uid);
+               uid_tag = make_tag_from_uid(uid);
+       }
+       MT_DEBUG("qtaguid: tag_stat: stat_update(): "
+                " looking for tag=0x%llx (uid=%u) in ife=%p\n",
+                tag, get_uid_from_tag(tag), iface_entry);
+       /* Loop over tag list under this interface for {acct_tag,uid_tag} */
+       spin_lock_bh(&iface_entry->tag_stat_list_lock);
+
+       tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+                                             tag);
+       if (tag_stat_entry) {
+               /*
+                * Updating the {acct_tag, uid_tag} entry handles both stats:
+                * {0, uid_tag} will also get updated.
+                */
+               tag_stat_update(tag_stat_entry, direction, proto, bytes);
+               goto unlock;
+       }
+
+       /* Loop over tag list under this interface for {0,uid_tag} */
+       tag_stat_entry = tag_stat_tree_search(&iface_entry->tag_stat_tree,
+                                             uid_tag);
+       if (!tag_stat_entry) {
+               /* Here: the base uid_tag did not exist */
+               /*
+                * No parent counters. So
+                *  - No {0, uid_tag} stats and no {acct_tag, uid_tag} stats.
+                */
+               new_tag_stat = create_if_tag_stat(iface_entry, uid_tag);
+               if (!new_tag_stat)
+                       goto unlock;
+               uid_tag_counters = &new_tag_stat->counters;
+       } else {
+               uid_tag_counters = &tag_stat_entry->counters;
+       }
+
+       if (acct_tag) {
+               /* Create the child {acct_tag, uid_tag} and hook up parent. */
+               new_tag_stat = create_if_tag_stat(iface_entry, tag);
+               if (!new_tag_stat)
+                       goto unlock;
+               new_tag_stat->parent_counters = uid_tag_counters;
+       } else {
+               /*
+                * For new_tag_stat to be still NULL here would require:
+                *  {0, uid_tag} exists
+                *  and {acct_tag, uid_tag} doesn't exist
+                *  AND acct_tag == 0.
+                * Impossible. This reassures us that new_tag_stat
+                * below will always be assigned.
+                */
+               BUG_ON(!new_tag_stat);
+       }
+       tag_stat_update(new_tag_stat, direction, proto, bytes);
+unlock:
+       spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+       spin_unlock_bh(&iface_stat_list_lock);
+}
+
+static int iface_netdev_event_handler(struct notifier_block *nb,
+                                     unsigned long event, void *ptr)
+{
+       struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+       if (unlikely(module_passive))
+               return NOTIFY_DONE;
+
+       IF_DEBUG("qtaguid: iface_stat: netdev_event(): "
+                "ev=0x%lx/%s netdev=%p->name=%s\n",
+                event, netdev_evt_str(event), dev, dev ? dev->name : "");
+
+       switch (event) {
+       case NETDEV_UP:
+               iface_stat_create(dev, NULL);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       case NETDEV_DOWN:
+       case NETDEV_UNREGISTER:
+               iface_stat_update(dev, event == NETDEV_DOWN);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static int iface_inet6addr_event_handler(struct notifier_block *nb,
+                                        unsigned long event, void *ptr)
+{
+       struct inet6_ifaddr *ifa = ptr;
+       struct net_device *dev;
+
+       if (unlikely(module_passive))
+               return NOTIFY_DONE;
+
+       IF_DEBUG("qtaguid: iface_stat: inet6addr_event(): "
+                "ev=0x%lx/%s ifa=%p\n",
+                event, netdev_evt_str(event), ifa);
+
+       switch (event) {
+       case NETDEV_UP:
+               BUG_ON(!ifa || !ifa->idev);
+               dev = (struct net_device *)ifa->idev->dev;
+               iface_stat_create_ipv6(dev, ifa);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       case NETDEV_DOWN:
+       case NETDEV_UNREGISTER:
+               BUG_ON(!ifa || !ifa->idev);
+               dev = (struct net_device *)ifa->idev->dev;
+               iface_stat_update(dev, event == NETDEV_DOWN);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static int iface_inetaddr_event_handler(struct notifier_block *nb,
+                                       unsigned long event, void *ptr)
+{
+       struct in_ifaddr *ifa = ptr;
+       struct net_device *dev;
+
+       if (unlikely(module_passive))
+               return NOTIFY_DONE;
+
+       IF_DEBUG("qtaguid: iface_stat: inetaddr_event(): "
+                "ev=0x%lx/%s ifa=%p\n",
+                event, netdev_evt_str(event), ifa);
+
+       switch (event) {
+       case NETDEV_UP:
+               BUG_ON(!ifa || !ifa->ifa_dev);
+               dev = ifa->ifa_dev->dev;
+               iface_stat_create(dev, ifa);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       case NETDEV_DOWN:
+       case NETDEV_UNREGISTER:
+               BUG_ON(!ifa || !ifa->ifa_dev);
+               dev = ifa->ifa_dev->dev;
+               iface_stat_update(dev, event == NETDEV_DOWN);
+               atomic64_inc(&qtu_events.iface_events);
+               break;
+       }
+       return NOTIFY_DONE;
+}
+
+static struct notifier_block iface_netdev_notifier_blk = {
+       .notifier_call = iface_netdev_event_handler,
+};
+
+static struct notifier_block iface_inetaddr_notifier_blk = {
+       .notifier_call = iface_inetaddr_event_handler,
+};
+
+static struct notifier_block iface_inet6addr_notifier_blk = {
+       .notifier_call = iface_inet6addr_event_handler,
+};
+
+static const struct seq_operations iface_stat_fmt_proc_seq_ops = {
+       .start  = iface_stat_fmt_proc_start,
+       .next   = iface_stat_fmt_proc_next,
+       .stop   = iface_stat_fmt_proc_stop,
+       .show   = iface_stat_fmt_proc_show,
+};
+
+static int proc_iface_stat_fmt_open(struct inode *inode, struct file *file)
+{
+       struct proc_iface_stat_fmt_info *s;
+
+       s = __seq_open_private(file, &iface_stat_fmt_proc_seq_ops,
+                       sizeof(struct proc_iface_stat_fmt_info));
+       if (!s)
+               return -ENOMEM;
+
+       s->fmt = (uintptr_t)PDE_DATA(inode);
+       return 0;
+}
+
+static const struct file_operations proc_iface_stat_fmt_fops = {
+       .open           = proc_iface_stat_fmt_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
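+/*
+ * Create the iface_stat proc entries and register the netdev, IPv4 and
+ * IPv6 address notifiers that keep the per-interface stats up to date.
+ */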
+static int __init iface_stat_init(struct proc_dir_entry *parent_procdir)
+{
+       int err;
+
+       iface_stat_procdir = proc_mkdir(iface_stat_procdirname, parent_procdir);
+       if (!iface_stat_procdir) {
+               pr_err("qtaguid: iface_stat: init failed to create proc entry\n");
+               err = -1;
+               goto err;
+       }
+
+       iface_stat_all_procfile = proc_create_data(iface_stat_all_procfilename,
+                                                  proc_iface_perms,
+                                                  parent_procdir,
+                                                  &proc_iface_stat_fmt_fops,
+                                                  (void *)1 /* fmt1 */);
+       if (!iface_stat_all_procfile) {
+               pr_err("qtaguid: iface_stat: init "
+                      " failed to create stat_old proc entry\n");
+               err = -1;
+               goto err_zap_entry;
+       }
+
+       iface_stat_fmt_procfile = proc_create_data(iface_stat_fmt_procfilename,
+                                                  proc_iface_perms,
+                                                  parent_procdir,
+                                                  &proc_iface_stat_fmt_fops,
+                                                  (void *)2 /* fmt2 */);
+       if (!iface_stat_fmt_procfile) {
+               pr_err("qtaguid: iface_stat: init "
+                      " failed to create stat_all proc entry\n");
+               err = -1;
+               goto err_zap_all_stats_entry;
+       }
+
+
+       err = register_netdevice_notifier(&iface_netdev_notifier_blk);
+       if (err) {
+               pr_err("qtaguid: iface_stat: init "
+                      "failed to register dev event handler\n");
+               goto err_zap_all_stats_entries;
+       }
+       err = register_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+       if (err) {
+               pr_err("qtaguid: iface_stat: init "
+                      "failed to register ipv4 dev event handler\n");
+               goto err_unreg_nd;
+       }
+
+       err = register_inet6addr_notifier(&iface_inet6addr_notifier_blk);
+       if (err) {
+               pr_err("qtaguid: iface_stat: init "
+                      "failed to register ipv6 dev event handler\n");
+               goto err_unreg_ip4_addr;
+       }
+       return 0;
+
+err_unreg_ip4_addr:
+       unregister_inetaddr_notifier(&iface_inetaddr_notifier_blk);
+err_unreg_nd:
+       unregister_netdevice_notifier(&iface_netdev_notifier_blk);
+err_zap_all_stats_entries:
+       remove_proc_entry(iface_stat_fmt_procfilename, parent_procdir);
+err_zap_all_stats_entry:
+       remove_proc_entry(iface_stat_all_procfilename, parent_procdir);
+err_zap_entry:
+       remove_proc_entry(iface_stat_procdirname, parent_procdir);
+err:
+       return err;
+}
+
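+/*
+ * Find the socket that owns the skb when skb->sk is not available, using
+ * the netfilter socket lookup helpers. Only valid on the hooks supported
+ * by xt_socket; returns a referenced sock or NULL.
+ */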
+static struct sock *qtaguid_find_sk(const struct sk_buff *skb,
+                                   struct xt_action_param *par)
+{
+       const struct nf_hook_state *parst = par->state;
+       struct sock *sk;
+       unsigned int hook_mask = (1 << parst->hook);
+
+       MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n",
+                parst->hook, skb, parst->pf);
+
+       /*
+        * Let's not abuse the xt_socket_get*_sk(), or else it will
+        * return garbage SKs.
+        */
+       if (!(hook_mask & XT_SOCKET_SUPPORTED_HOOKS))
+               return NULL;
+
+       switch (parst->pf) {
+       case NFPROTO_IPV6:
+               sk = nf_sk_lookup_slow_v6(dev_net(skb->dev), skb, parst->in);
+               break;
+       case NFPROTO_IPV4:
+               sk = nf_sk_lookup_slow_v4(dev_net(skb->dev), skb, parst->in);
+               break;
+       default:
+               return NULL;
+       }
+
+       if (sk) {
+               MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n",
+                        parst->hook, sk, sk->sk_protocol, sk->sk_state);
+       }
+       return sk;
+}
+
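+/*
+ * Charge the skb against the given uid on the device it arrived on or is
+ * leaving through.
+ */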
+static void account_for_uid(const struct sk_buff *skb,
+                           const struct sock *alternate_sk, uid_t uid,
+                           struct xt_action_param *par)
+{
+       const struct net_device *el_dev;
+       enum ifs_tx_rx direction;
+       int proto;
+
+       get_dev_and_dir(skb, par, &direction, &el_dev);
+       proto = ipx_proto(skb, par);
+       MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n",
+                par->state->hook, el_dev->name, el_dev->type,
+                par->state->pf, proto, direction);
+
+       if_tag_stat_update(el_dev->name, uid,
+                          skb->sk ? skb->sk : alternate_sk,
+                          direction,
+                          proto, skb->len);
+}
+
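+/*
+ * Main xt match entry point: account the skb per interface and per
+ * uid/tag, then evaluate the uid/gid/socket match criteria requested by
+ * the rule.
+ */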
+static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par)
+{
+       const struct xt_qtaguid_match_info *info = par->matchinfo;
+       const struct nf_hook_state *parst = par->state;
+       const struct file *filp;
+       bool got_sock = false;
+       struct sock *sk;
+       kuid_t sock_uid;
+       bool res;
+       bool set_sk_callback_lock = false;
+       /*
+        * TODO: unhack how to force just accounting.
+        * For now we only do tag stats when the uid-owner is not requested
+        */
+       bool do_tag_stat = !(info->match & XT_QTAGUID_UID);
+
+       if (unlikely(module_passive))
+               return (info->match ^ info->invert) == 0;
+
+       MT_DEBUG("qtaguid[%d]: entered skb=%p par->in=%p/out=%p fam=%d\n",
+                parst->hook, skb, parst->in, parst->out, parst->pf);
+
+       atomic64_inc(&qtu_events.match_calls);
+       if (skb == NULL) {
+               res = (info->match ^ info->invert) == 0;
+               goto ret_res;
+       }
+
+       switch (parst->hook) {
+       case NF_INET_PRE_ROUTING:
+       case NF_INET_POST_ROUTING:
+               atomic64_inc(&qtu_events.match_calls_prepost);
+               iface_stat_update_from_skb(skb, par);
+               /*
+                * We are done in pre/post. The skb will get processed
+                * further later on.
+                */
+               res = (info->match ^ info->invert);
+               goto ret_res;
+               break;
+       /* default: Fall through and do UID related work */
+       }
+
+       sk = skb_to_full_sk(skb);
+       /*
+        * When in TCP_TIME_WAIT the sk is not a "struct sock" but
+        * "struct inet_timewait_sock" which is missing fields.
+        * So we ignore it.
+        */
+       if (sk && sk->sk_state == TCP_TIME_WAIT)
+               sk = NULL;
+       if (sk == NULL) {
+               /*
+                * A missing sk->sk_socket happens when packets are in-flight
+                * and the matching socket is already closed and gone.
+                */
+               sk = qtaguid_find_sk(skb, par);
+               /*
+                * Sockets in TCP_NEW_SYN_RECV state are not "struct sock" but
+                * "struct request_sock", from which we can get a pointer to a
+                * full socket to retrieve uid/gid.
+                * When in TCP_TIME_WAIT, sk is a struct inet_timewait_sock
+                * which is missing fields and does not contain any reference
+                * to a full socket, so just ignore the socket.
+                */
+               if (sk && sk->sk_state == TCP_NEW_SYN_RECV) {
+                       sock_gen_put(sk);
+                       sk = sk_to_full_sk(sk);
+               } else if (sk && (!sk_fullsock(sk) || sk->sk_state == TCP_TIME_WAIT)) {
+                       sock_gen_put(sk);
+                       sk = NULL;
+               } else {
+                       /*
+                        * If we got the socket from find_sk(), we will need to
+                        * put it back, as nf_sk_lookup_slow_v4/v6() took a
+                        * reference on it.
+                        */
+                       got_sock = sk;
+               }
+               if (sk)
+                       atomic64_inc(&qtu_events.match_found_sk_in_ct);
+               else
+                       atomic64_inc(&qtu_events.match_found_no_sk_in_ct);
+       } else {
+               atomic64_inc(&qtu_events.match_found_sk);
+       }
+       MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n",
+                parst->hook, sk, got_sock, parst->pf, ipx_proto(skb, par));
+
+       if (!sk) {
+               /*
+                * Here, the qtaguid_find_sk() using connection tracking
+                * couldn't find the owner, so for now we just count them
+                * against the system.
+                */
+               if (do_tag_stat)
+                       account_for_uid(skb, sk, 0, par);
+               MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", parst->hook);
+               res = (info->match ^ info->invert) == 0;
+               atomic64_inc(&qtu_events.match_no_sk);
+               goto put_sock_ret_res;
+       } else if (info->match & info->invert & XT_QTAGUID_SOCKET) {
+               res = false;
+               goto put_sock_ret_res;
+       }
+       sock_uid = sk->sk_uid;
+       if (do_tag_stat)
+               account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid),
+                               par);
+
+       /*
+        * The following two tests fail the match when:
+        *    id not in range AND no inverted condition requested
+        * or id     in range AND    inverted condition requested
+        * Thus (!a && b) || (a && !b) == a ^ b
+        */
+       if (info->match & XT_QTAGUID_UID) {
+               kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min);
+               kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max);
+
+               if ((uid_gte(sock_uid, uid_min) &&
+                    uid_lte(sock_uid, uid_max)) ^
+                   !(info->invert & XT_QTAGUID_UID)) {
+                       MT_DEBUG("qtaguid[%d]: leaving uid not matching\n",
+                                parst->hook);
+                       res = false;
+                       goto put_sock_ret_res;
+               }
+       }
+       if (info->match & XT_QTAGUID_GID) {
+               kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min);
+               kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max);
+               set_sk_callback_lock = true;
+               read_lock_bh(&sk->sk_callback_lock);
+               MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n",
+                        parst->hook, sk, sk->sk_socket,
+                        sk->sk_socket ? sk->sk_socket->file : (void *)-1LL);
+               filp = sk->sk_socket ? sk->sk_socket->file : NULL;
+               if (!filp) {
+                       res = ((info->match ^ info->invert) &
+                              XT_QTAGUID_GID) == 0;
+                       atomic64_inc(&qtu_events.match_no_sk_gid);
+                       goto put_sock_ret_res;
+               }
+               MT_DEBUG("qtaguid[%d]: filp...uid=%u\n",
+                        parst->hook, filp ?
+                        from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1);
+               if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
+                               gid_lte(filp->f_cred->fsgid, gid_max)) ^
+                       !(info->invert & XT_QTAGUID_GID)) {
+                       MT_DEBUG("qtaguid[%d]: leaving gid not matching\n",
+                               parst->hook);
+                       res = false;
+                       goto put_sock_ret_res;
+               }
+       }
+       MT_DEBUG("qtaguid[%d]: leaving matched\n", parst->hook);
+       res = true;
+
+put_sock_ret_res:
+       if (got_sock)
+               sock_gen_put(sk);
+       if (set_sk_callback_lock)
+               read_unlock_bh(&sk->sk_callback_lock);
+ret_res:
+       MT_DEBUG("qtaguid[%d]: left %d\n", parst->hook, res);
+       return res;
+}
+
+#ifdef DDEBUG
+/*
+ * This function is not in xt_qtaguid_print.c because of lock visibility.
+ * The lock of sock_tag_list must be acquired before calling this function.
+ */
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...)
+{
+       va_list args;
+       char *fmt_buff;
+       char *buff;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       fmt_buff = kasprintf(GFP_ATOMIC,
+                            "qtaguid: %s(): %s {\n", __func__, fmt);
+       BUG_ON(!fmt_buff);
+       va_start(args, fmt);
+       buff = kvasprintf(GFP_ATOMIC,
+                         fmt_buff, args);
+       BUG_ON(!buff);
+       pr_debug("%s", buff);
+       kfree(fmt_buff);
+       kfree(buff);
+       va_end(args);
+
+       prdebug_sock_tag_tree(indent_level, &sock_tag_tree);
+
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       prdebug_uid_tag_data_tree(indent_level, &uid_tag_data_tree);
+       prdebug_proc_qtu_data_tree(indent_level, &proc_qtu_data_tree);
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+
+       spin_lock_bh(&iface_stat_list_lock);
+       prdebug_iface_stat_list(indent_level, &iface_stat_list);
+       spin_unlock_bh(&iface_stat_list_lock);
+
+       pr_debug("qtaguid: %s(): }\n", __func__);
+}
+#else
+static void prdebug_full_state_locked(int indent_level, const char *fmt, ...) {}
+#endif
+
+struct proc_ctrl_print_info {
+       struct sock *sk; /* socket found by reading to sk_pos */
+       loff_t sk_pos;
+};
+
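+/*
+ * seq_file iteration over the sock_tag tree. pcpi->sk/sk_pos remember
+ * where the previous read stopped so ->start() can resume without
+ * rescanning from the beginning.
+ */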
+static void *qtaguid_ctrl_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct proc_ctrl_print_info *pcpi = m->private;
+       struct sock_tag *sock_tag_entry = v;
+       struct rb_node *node;
+
+       (*pos)++;
+
+       if (!v || v == SEQ_START_TOKEN)
+               return NULL;
+
+       node = rb_next(&sock_tag_entry->sock_node);
+       if (!node) {
+               pcpi->sk = NULL;
+               sock_tag_entry = SEQ_START_TOKEN;
+       } else {
+               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+               pcpi->sk = sock_tag_entry->sk;
+       }
+       pcpi->sk_pos = *pos;
+       return sock_tag_entry;
+}
+
+static void *qtaguid_ctrl_proc_start(struct seq_file *m, loff_t *pos)
+{
+       struct proc_ctrl_print_info *pcpi = m->private;
+       struct sock_tag *sock_tag_entry;
+       struct rb_node *node;
+
+       spin_lock_bh(&sock_tag_list_lock);
+
+       if (unlikely(module_passive))
+               return NULL;
+
+       if (*pos == 0) {
+               pcpi->sk_pos = 0;
+               node = rb_first(&sock_tag_tree);
+               if (!node) {
+                       pcpi->sk = NULL;
+                       return SEQ_START_TOKEN;
+               }
+               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+               pcpi->sk = sock_tag_entry->sk;
+       } else {
+               sock_tag_entry = (pcpi->sk ? get_sock_stat_nl(pcpi->sk) :
+                                               NULL) ?: SEQ_START_TOKEN;
+               if (*pos != pcpi->sk_pos) {
+                       /* seq_read skipped a next call */
+                       *pos = pcpi->sk_pos;
+                       return qtaguid_ctrl_proc_next(m, sock_tag_entry, pos);
+               }
+       }
+       return sock_tag_entry;
+}
+
+static void qtaguid_ctrl_proc_stop(struct seq_file *m, void *v)
+{
+       spin_unlock_bh(&sock_tag_list_lock);
+}
+
+/*
+ * Procfs reader to get all active socket tags using style "1)" as described in
+ * fs/proc/generic.c
+ */
+static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v)
+{
+       struct sock_tag *sock_tag_entry = v;
+       uid_t uid;
+
+       CT_DEBUG("qtaguid: proc ctrl pid=%u tgid=%u uid=%u\n",
+                current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+       if (sock_tag_entry != SEQ_START_TOKEN) {
+               int sk_ref_count;
+               uid = get_uid_from_tag(sock_tag_entry->tag);
+               CT_DEBUG("qtaguid: proc_read(): sk=%p tag=0x%llx (uid=%u) "
+                        "pid=%u\n",
+                        sock_tag_entry->sk,
+                        sock_tag_entry->tag,
+                        uid,
+                        sock_tag_entry->pid
+                       );
+               sk_ref_count = refcount_read(
+                       &sock_tag_entry->sk->sk_refcnt);
+               seq_printf(m, "sock=%pK tag=0x%llx (uid=%u) pid=%u "
+                          "f_count=%d\n",
+                          sock_tag_entry->sk,
+                          sock_tag_entry->tag, uid,
+                          sock_tag_entry->pid, sk_ref_count);
+       } else {
+               seq_printf(m, "events: sockets_tagged=%llu "
+                          "sockets_untagged=%llu "
+                          "counter_set_changes=%llu "
+                          "delete_cmds=%llu "
+                          "iface_events=%llu "
+                          "match_calls=%llu "
+                          "match_calls_prepost=%llu "
+                          "match_found_sk=%llu "
+                          "match_found_sk_in_ct=%llu "
+                          "match_found_no_sk_in_ct=%llu "
+                          "match_no_sk=%llu "
+                          "match_no_sk_gid=%llu\n",
+                          (u64)atomic64_read(&qtu_events.sockets_tagged),
+                          (u64)atomic64_read(&qtu_events.sockets_untagged),
+                          (u64)atomic64_read(&qtu_events.counter_set_changes),
+                          (u64)atomic64_read(&qtu_events.delete_cmds),
+                          (u64)atomic64_read(&qtu_events.iface_events),
+                          (u64)atomic64_read(&qtu_events.match_calls),
+                          (u64)atomic64_read(&qtu_events.match_calls_prepost),
+                          (u64)atomic64_read(&qtu_events.match_found_sk),
+                          (u64)atomic64_read(&qtu_events.match_found_sk_in_ct),
+                          (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct),
+                          (u64)atomic64_read(&qtu_events.match_no_sk),
+                          (u64)atomic64_read(&qtu_events.match_no_sk_gid));
+
+               /* Count the following as part of the last item_index. No need
+                * to lock the sock_tag_list here since it is already locked when
+                * starting the seq_file operation
+                */
+               prdebug_full_state_locked(0, "proc ctrl");
+       }
+
+       return 0;
+}
+
+/*
+ * Delete socket tags, and stat tags associated with a given
+ * accounting tag and uid.
+ */
+static int ctrl_cmd_delete(const char *input)
+{
+       char cmd;
+       int uid_int;
+       kuid_t uid;
+       uid_t entry_uid;
+       tag_t acct_tag;
+       tag_t tag;
+       int res, argc;
+       struct iface_stat *iface_entry;
+       struct rb_node *node;
+       struct sock_tag *st_entry;
+       struct rb_root st_to_free_tree = RB_ROOT;
+       struct tag_stat *ts_entry;
+       struct tag_counter_set *tcs_entry;
+       struct tag_ref *tr_entry;
+       struct uid_tag_data *utd_entry;
+
+       argc = sscanf(input, "%c %llu %u", &cmd, &acct_tag, &uid_int);
+       uid = make_kuid(&init_user_ns, uid_int);
+       CT_DEBUG("qtaguid: ctrl_delete(%s): argc=%d cmd=%c "
+                "user_tag=0x%llx uid=%u\n", input, argc, cmd,
+                acct_tag, uid_int);
+       if (argc < 2) {
+               res = -EINVAL;
+               goto err;
+       }
+       if (!valid_atag(acct_tag)) {
+               pr_info("qtaguid: ctrl_delete(%s): invalid tag\n", input);
+               res = -EINVAL;
+               goto err;
+       }
+       if (argc < 3) {
+               uid = current_fsuid();
+               uid_int = from_kuid(&init_user_ns, uid);
+       } else if (!can_impersonate_uid(uid)) {
+               pr_info("qtaguid: ctrl_delete(%s): "
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+               res = -EPERM;
+               goto err;
+       }
+
+       tag = combine_atag_with_uid(acct_tag, uid_int);
+       CT_DEBUG("qtaguid: ctrl_delete(%s): "
+                "looking for tag=0x%llx (uid=%u)\n",
+                input, tag, uid_int);
+
+       /* Delete socket tags */
+       spin_lock_bh(&sock_tag_list_lock);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       node = rb_first(&sock_tag_tree);
+       while (node) {
+               st_entry = rb_entry(node, struct sock_tag, sock_node);
+               entry_uid = get_uid_from_tag(st_entry->tag);
+               node = rb_next(node);
+               if (entry_uid != uid_int)
+                       continue;
+
+               CT_DEBUG("qtaguid: ctrl_delete(%s): st tag=0x%llx (uid=%u)\n",
+                        input, st_entry->tag, entry_uid);
+
+               if (!acct_tag || st_entry->tag == tag) {
+                       rb_erase(&st_entry->sock_node, &sock_tag_tree);
+                       /* Can't sockfd_put() within spinlock, do it later. */
+                       sock_tag_tree_insert(st_entry, &st_to_free_tree);
+                       tr_entry = lookup_tag_ref(st_entry->tag, NULL);
+                       BUG_ON(tr_entry->num_sock_tags <= 0);
+                       tr_entry->num_sock_tags--;
+                       /*
+                        * TODO: remove if, and start failing.
+                        * This is a hack to work around the fact that in some
+                        * places we have "if (IS_ERR_OR_NULL(pqd_entry))"
+                        * and are trying to work around apps
+                        * that didn't open the /dev/xt_qtaguid.
+                        */
+                       if (st_entry->list.next && st_entry->list.prev)
+                               list_del(&st_entry->list);
+               }
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       spin_unlock_bh(&sock_tag_list_lock);
+
+       sock_tag_tree_erase(&st_to_free_tree);
+
+       /* Delete tag counter-sets */
+       spin_lock_bh(&tag_counter_set_list_lock);
+       /* Counter sets are only on the uid tag, not full tag */
+       tcs_entry = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+       if (tcs_entry) {
+               CT_DEBUG("qtaguid: ctrl_delete(%s): "
+                        "erase tcs: tag=0x%llx (uid=%u) set=%d\n",
+                        input,
+                        tcs_entry->tn.tag,
+                        get_uid_from_tag(tcs_entry->tn.tag),
+                        tcs_entry->active_set);
+               rb_erase(&tcs_entry->tn.node, &tag_counter_set_tree);
+               kfree(tcs_entry);
+       }
+       spin_unlock_bh(&tag_counter_set_list_lock);
+
+       /*
+        * If acct_tag is 0, then all entries belonging to uid are
+        * erased.
+        */
+       spin_lock_bh(&iface_stat_list_lock);
+       list_for_each_entry(iface_entry, &iface_stat_list, list) {
+               spin_lock_bh(&iface_entry->tag_stat_list_lock);
+               node = rb_first(&iface_entry->tag_stat_tree);
+               while (node) {
+                       ts_entry = rb_entry(node, struct tag_stat, tn.node);
+                       entry_uid = get_uid_from_tag(ts_entry->tn.tag);
+                       node = rb_next(node);
+
+                       CT_DEBUG("qtaguid: ctrl_delete(%s): "
+                                "ts tag=0x%llx (uid=%u)\n",
+                                input, ts_entry->tn.tag, entry_uid);
+
+                       if (entry_uid != uid_int)
+                               continue;
+                       if (!acct_tag || ts_entry->tn.tag == tag) {
+                               CT_DEBUG("qtaguid: ctrl_delete(%s): "
+                                        "erase ts: %s 0x%llx %u\n",
+                                        input, iface_entry->ifname,
+                                        get_atag_from_tag(ts_entry->tn.tag),
+                                        entry_uid);
+                               rb_erase(&ts_entry->tn.node,
+                                        &iface_entry->tag_stat_tree);
+                               kfree(ts_entry);
+                       }
+               }
+               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+       }
+       spin_unlock_bh(&iface_stat_list_lock);
+
+       /* Cleanup the uid_tag_data */
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       node = rb_first(&uid_tag_data_tree);
+       while (node) {
+               utd_entry = rb_entry(node, struct uid_tag_data, node);
+               entry_uid = utd_entry->uid;
+               node = rb_next(node);
+
+               CT_DEBUG("qtaguid: ctrl_delete(%s): "
+                        "utd uid=%u\n",
+                        input, entry_uid);
+
+               if (entry_uid != uid_int)
+                       continue;
+               /*
+                * Go over the tag_refs, and those that don't have
+                * sock_tags using them are freed.
+                */
+               put_tag_ref_tree(tag, utd_entry);
+               put_utd_entry(utd_entry);
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+
+       atomic64_inc(&qtu_events.delete_cmds);
+       res = 0;
+
+err:
+       return res;
+}
+
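+/*
+ * Switch the active counter set for a uid.
+ * Input format (as parsed below): "s <counter_set> <uid>".
+ */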
+static int ctrl_cmd_counter_set(const char *input)
+{
+       char cmd;
+       uid_t uid = 0;
+       tag_t tag;
+       int res, argc;
+       struct tag_counter_set *tcs;
+       int counter_set;
+
+       argc = sscanf(input, "%c %d %u", &cmd, &counter_set, &uid);
+       CT_DEBUG("qtaguid: ctrl_counterset(%s): argc=%d cmd=%c "
+                "set=%d uid=%u\n", input, argc, cmd,
+                counter_set, uid);
+       if (argc != 3) {
+               res = -EINVAL;
+               goto err;
+       }
+       if (counter_set < 0 || counter_set >= IFS_MAX_COUNTER_SETS) {
+               pr_info("qtaguid: ctrl_counterset(%s): invalid counter_set range\n",
+                       input);
+               res = -EINVAL;
+               goto err;
+       }
+       if (!can_manipulate_uids()) {
+               pr_info("qtaguid: ctrl_counterset(%s): "
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+               res = -EPERM;
+               goto err;
+       }
+
+       tag = make_tag_from_uid(uid);
+       spin_lock_bh(&tag_counter_set_list_lock);
+       tcs = tag_counter_set_tree_search(&tag_counter_set_tree, tag);
+       if (!tcs) {
+               tcs = kzalloc(sizeof(*tcs), GFP_ATOMIC);
+               if (!tcs) {
+                       spin_unlock_bh(&tag_counter_set_list_lock);
+                       pr_err("qtaguid: ctrl_counterset(%s): "
+                              "failed to alloc counter set\n",
+                              input);
+                       res = -ENOMEM;
+                       goto err;
+               }
+               tcs->tn.tag = tag;
+               tag_counter_set_tree_insert(tcs, &tag_counter_set_tree);
+               CT_DEBUG("qtaguid: ctrl_counterset(%s): added tcs tag=0x%llx "
+                        "(uid=%u) set=%d\n",
+                        input, tag, get_uid_from_tag(tag), counter_set);
+       }
+       tcs->active_set = counter_set;
+       spin_unlock_bh(&tag_counter_set_list_lock);
+       atomic64_inc(&qtu_events.counter_set_changes);
+       res = 0;
+
+err:
+       return res;
+}
+
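+/*
+ * Tag a socket with an {acct_tag, uid} pair.
+ * Input format (as parsed below): "t <sock_fd> [<acct_tag> [<uid>]]";
+ * missing args default to acct_tag 0 and the caller's fsuid.
+ */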
+static int ctrl_cmd_tag(const char *input)
+{
+       char cmd;
+       int sock_fd = 0;
+       kuid_t uid;
+       unsigned int uid_int = 0;
+       tag_t acct_tag = make_atag_from_value(0);
+       tag_t full_tag;
+       struct socket *el_socket;
+       int res, argc;
+       struct sock_tag *sock_tag_entry;
+       struct tag_ref *tag_ref_entry;
+       struct uid_tag_data *uid_tag_data_entry;
+       struct proc_qtu_data *pqd_entry;
+
+       /* Unassigned args will get defaulted later. */
+       argc = sscanf(input, "%c %d %llu %u", &cmd, &sock_fd, &acct_tag, &uid_int);
+       uid = make_kuid(&init_user_ns, uid_int);
+       CT_DEBUG("qtaguid: ctrl_tag(%s): argc=%d cmd=%c sock_fd=%d "
+                "acct_tag=0x%llx uid=%u\n", input, argc, cmd, sock_fd,
+                acct_tag, uid_int);
+       if (argc < 2) {
+               res = -EINVAL;
+               goto err;
+       }
+       el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
+       if (!el_socket) {
+               pr_info("qtaguid: ctrl_tag(%s): failed to lookup"
+                       " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+                       input, sock_fd, res, current->pid, current->tgid,
+                       from_kuid(&init_user_ns, current_fsuid()));
+               goto err;
+       }
+       CT_DEBUG("qtaguid: ctrl_tag(%s): socket->...->sk_refcnt=%d ->sk=%p\n",
+                input, refcount_read(&el_socket->sk->sk_refcnt),
+                el_socket->sk);
+       if (argc < 3) {
+               acct_tag = make_atag_from_value(0);
+       } else if (!valid_atag(acct_tag)) {
+               pr_info("qtaguid: ctrl_tag(%s): invalid tag\n", input);
+               res = -EINVAL;
+               goto err_put;
+       }
+       CT_DEBUG("qtaguid: ctrl_tag(%s): "
+                "pid=%u tgid=%u uid=%u euid=%u fsuid=%u "
+                "ctrl.gid=%u in_group()=%d in_egroup()=%d\n",
+                input, current->pid, current->tgid,
+                from_kuid(&init_user_ns, current_uid()),
+                from_kuid(&init_user_ns, current_euid()),
+                from_kuid(&init_user_ns, current_fsuid()),
+                from_kgid(&init_user_ns, xt_qtaguid_ctrl_file->gid),
+                in_group_p(xt_qtaguid_ctrl_file->gid),
+                in_egroup_p(xt_qtaguid_ctrl_file->gid));
+       if (argc < 4) {
+               uid = current_fsuid();
+               uid_int = from_kuid(&init_user_ns, uid);
+       } else if (!can_impersonate_uid(uid)) {
+               pr_info("qtaguid: ctrl_tag(%s): "
+                       "insufficient priv from pid=%u tgid=%u uid=%u\n",
+                       input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+               res = -EPERM;
+               goto err_put;
+       }
+       full_tag = combine_atag_with_uid(acct_tag, uid_int);
+
+       spin_lock_bh(&sock_tag_list_lock);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+       tag_ref_entry = get_tag_ref(full_tag, &uid_tag_data_entry);
+       if (IS_ERR(tag_ref_entry)) {
+               res = PTR_ERR(tag_ref_entry);
+               spin_unlock_bh(&uid_tag_data_tree_lock);
+               spin_unlock_bh(&sock_tag_list_lock);
+               goto err_put;
+       }
+       tag_ref_entry->num_sock_tags++;
+       if (sock_tag_entry) {
+               struct tag_ref *prev_tag_ref_entry;
+
+               CT_DEBUG("qtaguid: ctrl_tag(%s): retag for sk=%p "
+                        "st@%p ...->sk_refcnt=%d\n",
+                        input, el_socket->sk, sock_tag_entry,
+                        refcount_read(&el_socket->sk->sk_refcnt));
+               prev_tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag,
+                                                   &uid_tag_data_entry);
+               BUG_ON(IS_ERR_OR_NULL(prev_tag_ref_entry));
+               BUG_ON(prev_tag_ref_entry->num_sock_tags <= 0);
+               prev_tag_ref_entry->num_sock_tags--;
+               sock_tag_entry->tag = full_tag;
+       } else {
+               CT_DEBUG("qtaguid: ctrl_tag(%s): newtag for sk=%p\n",
+                        input, el_socket->sk);
+               sock_tag_entry = kzalloc(sizeof(*sock_tag_entry),
+                                        GFP_ATOMIC);
+               if (!sock_tag_entry) {
+                       pr_err("qtaguid: ctrl_tag(%s): "
+                              "socket tag alloc failed\n",
+                              input);
+                       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+                       tag_ref_entry->num_sock_tags--;
+                       free_tag_ref_from_utd_entry(tag_ref_entry,
+                                                   uid_tag_data_entry);
+                       spin_unlock_bh(&uid_tag_data_tree_lock);
+                       spin_unlock_bh(&sock_tag_list_lock);
+                       res = -ENOMEM;
+                       goto err_put;
+               }
+               /*
+                * Hold the sk refcount here to make sure the sk pointer cannot
+                * be freed and reused
+                */
+               sock_hold(el_socket->sk);
+               sock_tag_entry->sk = el_socket->sk;
+               sock_tag_entry->pid = current->tgid;
+               sock_tag_entry->tag = combine_atag_with_uid(acct_tag, uid_int);
+               pqd_entry = proc_qtu_data_tree_search(
+                       &proc_qtu_data_tree, current->tgid);
+               /*
+                * TODO: remove if, and start failing.
+                * At first, we want to catch user-space code that is not
+                * opening the /dev/xt_qtaguid.
+                */
+               if (IS_ERR_OR_NULL(pqd_entry))
+                       pr_warn_once(
+                               "qtaguid: %s(): "
+                               "User space forgot to open /dev/xt_qtaguid? "
+                               "pid=%u tgid=%u uid=%u\n", __func__,
+                               current->pid, current->tgid,
+                               from_kuid(&init_user_ns, current_fsuid()));
+               else
+                       list_add(&sock_tag_entry->list,
+                                &pqd_entry->sock_tag_list);
+
+               sock_tag_tree_insert(sock_tag_entry, &sock_tag_tree);
+               atomic64_inc(&qtu_events.sockets_tagged);
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       spin_unlock_bh(&sock_tag_list_lock);
+       /* We keep the ref to the sk until it is untagged */
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done st@%p ...->sk_refcnt=%d\n",
+                input, sock_tag_entry,
+                refcount_read(&el_socket->sk->sk_refcnt));
+       sockfd_put(el_socket);
+       return 0;
+
+err_put:
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done. ...->sk_refcnt=%d\n",
+                input, refcount_read(&el_socket->sk->sk_refcnt) - 1);
+       /* Release the sock_fd that was grabbed by sockfd_lookup(). */
+       sockfd_put(el_socket);
+       return res;
+
+err:
+       CT_DEBUG("qtaguid: ctrl_tag(%s): done.\n", input);
+       return res;
+}
+
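+/*
+ * Untag a socket.
+ * Input format (as parsed below): "u <sock_fd>".
+ */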
+static int ctrl_cmd_untag(const char *input)
+{
+       char cmd;
+       int sock_fd = 0;
+       struct socket *el_socket;
+       int res, argc;
+
+       argc = sscanf(input, "%c %d", &cmd, &sock_fd);
+       CT_DEBUG("qtaguid: ctrl_untag(%s): argc=%d cmd=%c sock_fd=%d\n",
+                input, argc, cmd, sock_fd);
+       if (argc < 2) {
+               res = -EINVAL;
+               return res;
+       }
+       el_socket = sockfd_lookup(sock_fd, &res);  /* This locks the file */
+       if (!el_socket) {
+               pr_info("qtaguid: ctrl_untag(%s): failed to lookup"
+                       " sock_fd=%d err=%d pid=%u tgid=%u uid=%u\n",
+                       input, sock_fd, res, current->pid, current->tgid,
+                       from_kuid(&init_user_ns, current_fsuid()));
+               return res;
+       }
+       CT_DEBUG("qtaguid: ctrl_untag(%s): socket->...->f_count=%ld ->sk=%p\n",
+                input, atomic_long_read(&el_socket->file->f_count),
+                el_socket->sk);
+       res = qtaguid_untag(el_socket, false);
+       sockfd_put(el_socket);
+       return res;
+}
+
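+/*
+ * Remove the tag from a socket and drop the sk reference taken at tag
+ * time. When called from the kernel (kernel == true) the pid stored in
+ * the sock_tag entry is used instead of current->tgid to find the
+ * proc_qtu_data entry.
+ */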
+int qtaguid_untag(struct socket *el_socket, bool kernel)
+{
+       int res;
+       pid_t pid;
+       struct sock_tag *sock_tag_entry;
+       struct tag_ref *tag_ref_entry;
+       struct uid_tag_data *utd_entry;
+       struct proc_qtu_data *pqd_entry;
+
+       spin_lock_bh(&sock_tag_list_lock);
+       sock_tag_entry = get_sock_stat_nl(el_socket->sk);
+       if (!sock_tag_entry) {
+               spin_unlock_bh(&sock_tag_list_lock);
+               res = -EINVAL;
+               return res;
+       }
+       /*
+        * The socket already belongs to the current process
+        * so it can do whatever it wants to it.
+        */
+       rb_erase(&sock_tag_entry->sock_node, &sock_tag_tree);
+
+       tag_ref_entry = lookup_tag_ref(sock_tag_entry->tag, &utd_entry);
+       BUG_ON(!tag_ref_entry);
+       BUG_ON(tag_ref_entry->num_sock_tags <= 0);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+       if (kernel)
+               pid = sock_tag_entry->pid;
+       else
+               pid = current->tgid;
+       pqd_entry = proc_qtu_data_tree_search(
+               &proc_qtu_data_tree, pid);
+       /*
+        * TODO: remove if, and start failing.
+        * At first, we want to catch user-space code that is not
+        * opening the /dev/xt_qtaguid.
+        */
+       if (IS_ERR_OR_NULL(pqd_entry) || !sock_tag_entry->list.next) {
+               pr_warn_once("qtaguid: %s(): "
+                            "User space forgot to open /dev/xt_qtaguid? "
+                            "pid=%u tgid=%u sk_pid=%u, uid=%u\n", __func__,
+                            current->pid, current->tgid, sock_tag_entry->pid,
+                            from_kuid(&init_user_ns, current_fsuid()));
+       } else {
+               list_del(&sock_tag_entry->list);
+       }
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       /*
+        * We don't free tag_ref from the utd_entry here,
+        * only during a cmd_delete().
+        */
+       tag_ref_entry->num_sock_tags--;
+       spin_unlock_bh(&sock_tag_list_lock);
+       /*
+        * Release the sk reference (sock_hold()) that was taken at tag time.
+        */
+       sock_put(sock_tag_entry->sk);
+       CT_DEBUG("qtaguid: done. st@%p ...->sk_refcnt=%d\n",
+                sock_tag_entry,
+                refcount_read(&el_socket->sk->sk_refcnt));
+
+       kfree(sock_tag_entry);
+       atomic64_inc(&qtu_events.sockets_untagged);
+
+       return 0;
+}
+
+static ssize_t qtaguid_ctrl_parse(const char *input, size_t count)
+{
+       char cmd;
+       ssize_t res;
+
+       CT_DEBUG("qtaguid: ctrl(%s): pid=%u tgid=%u uid=%u\n",
+                input, current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+       cmd = input[0];
+       /* Collect params for commands */
+       switch (cmd) {
+       case 'd':
+               res = ctrl_cmd_delete(input);
+               break;
+
+       case 's':
+               res = ctrl_cmd_counter_set(input);
+               break;
+
+       case 't':
+               res = ctrl_cmd_tag(input);
+               break;
+
+       case 'u':
+               res = ctrl_cmd_untag(input);
+               break;
+
+       default:
+               res = -EINVAL;
+               goto err;
+       }
+       if (!res)
+               res = count;
+err:
+       CT_DEBUG("qtaguid: ctrl(%s): res=%zd\n", input, res);
+       return res;
+}
+
+#define MAX_QTAGUID_CTRL_INPUT_LEN 255
+static ssize_t qtaguid_ctrl_proc_write(struct file *file, const char __user *buffer,
+                                  size_t count, loff_t *offp)
+{
+       char input_buf[MAX_QTAGUID_CTRL_INPUT_LEN];
+
+       if (unlikely(module_passive))
+               return count;
+
+       if (count >= MAX_QTAGUID_CTRL_INPUT_LEN)
+               return -EINVAL;
+
+       if (copy_from_user(input_buf, buffer, count))
+               return -EFAULT;
+
+       input_buf[count] = '\0';
+       return qtaguid_ctrl_parse(input_buf, count);
+}
+
+struct proc_print_info {
+       struct iface_stat *iface_entry;
+       int item_index;
+       tag_t tag; /* tag found by reading to tag_pos */
+       off_t tag_pos;
+       int tag_item_index;
+};
+
+static void pp_stats_header(struct seq_file *m)
+{
+       seq_puts(m,
+                "idx iface acct_tag_hex uid_tag_int cnt_set "
+                "rx_bytes rx_packets "
+                "tx_bytes tx_packets "
+                "rx_tcp_bytes rx_tcp_packets "
+                "rx_udp_bytes rx_udp_packets "
+                "rx_other_bytes rx_other_packets "
+                "tx_tcp_bytes tx_tcp_packets "
+                "tx_udp_bytes tx_udp_packets "
+                "tx_other_bytes tx_other_packets\n");
+}
+
+static int pp_stats_line(struct seq_file *m, struct tag_stat *ts_entry,
+                        int cnt_set)
+{
+       struct data_counters *cnts;
+       tag_t tag = ts_entry->tn.tag;
+       uid_t stat_uid = get_uid_from_tag(tag);
+       struct proc_print_info *ppi = m->private;
+       /* Detailed tags are not available to everybody */
+       if (!can_read_other_uid_stats(make_kuid(&init_user_ns, stat_uid))) {
+               CT_DEBUG("qtaguid: stats line: "
+                        "%s 0x%llx %u: insufficient priv "
+                        "from pid=%u tgid=%u uid=%u stats.gid=%u\n",
+                        ppi->iface_entry->ifname,
+                        get_atag_from_tag(tag), stat_uid,
+                        current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+                        from_kgid(&init_user_ns, xt_qtaguid_stats_file->gid));
+               return 0;
+       }
+       ppi->item_index++;
+       cnts = &ts_entry->counters;
+       seq_printf(m, "%d %s 0x%llx %u %u "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu "
+               "%llu %llu\n",
+               ppi->item_index,
+               ppi->iface_entry->ifname,
+               get_atag_from_tag(tag),
+               stat_uid,
+               cnt_set,
+               dc_sum_bytes(cnts, cnt_set, IFS_RX),
+               dc_sum_packets(cnts, cnt_set, IFS_RX),
+               dc_sum_bytes(cnts, cnt_set, IFS_TX),
+               dc_sum_packets(cnts, cnt_set, IFS_TX),
+               cnts->bpc[cnt_set][IFS_RX][IFS_TCP].bytes,
+               cnts->bpc[cnt_set][IFS_RX][IFS_TCP].packets,
+               cnts->bpc[cnt_set][IFS_RX][IFS_UDP].bytes,
+               cnts->bpc[cnt_set][IFS_RX][IFS_UDP].packets,
+               cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].bytes,
+               cnts->bpc[cnt_set][IFS_RX][IFS_PROTO_OTHER].packets,
+               cnts->bpc[cnt_set][IFS_TX][IFS_TCP].bytes,
+               cnts->bpc[cnt_set][IFS_TX][IFS_TCP].packets,
+               cnts->bpc[cnt_set][IFS_TX][IFS_UDP].bytes,
+               cnts->bpc[cnt_set][IFS_TX][IFS_UDP].packets,
+               cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].bytes,
+               cnts->bpc[cnt_set][IFS_TX][IFS_PROTO_OTHER].packets);
+       return seq_has_overflowed(m) ? -ENOSPC : 1;
+}
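+/*
+ * Illustrative sample row (hypothetical values, not from the original
+ * source) matching the header printed by pp_stats_header():
+ *   "2 wlan0 0x200000000 10003 0 1000 10 500 5 800 8 200 2 0 0 400 4 100 1 0 0"
+ * i.e. the rx/tx totals are the sums of the per-protocol counters that
+ * follow them on the same line.
+ */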
+
+static bool pp_sets(struct seq_file *m, struct tag_stat *ts_entry)
+{
+       int ret;
+       int counter_set;
+       for (counter_set = 0; counter_set < IFS_MAX_COUNTER_SETS;
+            counter_set++) {
+               ret = pp_stats_line(m, ts_entry, counter_set);
+               if (ret < 0)
+                       return false;
+       }
+       return true;
+}
+
+static int qtaguid_stats_proc_iface_stat_ptr_valid(struct iface_stat *ptr)
+{
+       struct iface_stat *iface_entry;
+
+       if (!ptr)
+               return false;
+
+       list_for_each_entry(iface_entry, &iface_stat_list, list)
+               if (iface_entry == ptr)
+                       return true;
+       return false;
+}
+
+static void qtaguid_stats_proc_next_iface_entry(struct proc_print_info *ppi)
+{
+       spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+       list_for_each_entry_continue(ppi->iface_entry, &iface_stat_list, list) {
+               spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+               return;
+       }
+       ppi->iface_entry = NULL;
+}
+
+static void *qtaguid_stats_proc_next(struct seq_file *m, void *v, loff_t *pos)
+{
+       struct proc_print_info *ppi = m->private;
+       struct tag_stat *ts_entry;
+       struct rb_node *node;
+
+       if (!v) {
+               pr_err("qtaguid: %s(): unexpected v: NULL\n", __func__);
+               return NULL;
+       }
+
+       (*pos)++;
+
+       if (!ppi->iface_entry || unlikely(module_passive))
+               return NULL;
+
+       if (v == SEQ_START_TOKEN)
+               node = rb_first(&ppi->iface_entry->tag_stat_tree);
+       else
+               node = rb_next(&((struct tag_stat *)v)->tn.node);
+
+       while (!node) {
+               qtaguid_stats_proc_next_iface_entry(ppi);
+               if (!ppi->iface_entry)
+                       return NULL;
+               node = rb_first(&ppi->iface_entry->tag_stat_tree);
+       }
+
+       ts_entry = rb_entry(node, struct tag_stat, tn.node);
+       ppi->tag = ts_entry->tn.tag;
+       ppi->tag_pos = *pos;
+       ppi->tag_item_index = ppi->item_index;
+       return ts_entry;
+}
+
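+/*
+ * Editorial note: ->start() must cope with seq_read() restarting at an
+ * arbitrary *pos.  proc_print_info remembers the tag and position of the
+ * last entry handed out (tag_pos/tag_item_index) so the walk can resume at
+ * that tag_stat, or advance via qtaguid_stats_proc_next() when *pos has
+ * moved past the remembered position.
+ */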
+static void *qtaguid_stats_proc_start(struct seq_file *m, loff_t *pos)
+{
+       struct proc_print_info *ppi = m->private;
+       struct tag_stat *ts_entry = NULL;
+
+       spin_lock_bh(&iface_stat_list_lock);
+
+       if (*pos == 0) {
+               ppi->item_index = 1;
+               ppi->tag_pos = 0;
+               if (list_empty(&iface_stat_list)) {
+                       ppi->iface_entry = NULL;
+               } else {
+                       ppi->iface_entry = list_first_entry(&iface_stat_list,
+                                                           struct iface_stat,
+                                                           list);
+                       spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+               }
+               return SEQ_START_TOKEN;
+       }
+       if (!qtaguid_stats_proc_iface_stat_ptr_valid(ppi->iface_entry)) {
+               if (ppi->iface_entry) {
+                       pr_err("qtaguid: %s(): iface_entry %p not found\n",
+                              __func__, ppi->iface_entry);
+                       ppi->iface_entry = NULL;
+               }
+               return NULL;
+       }
+
+       spin_lock_bh(&ppi->iface_entry->tag_stat_list_lock);
+
+       if (!ppi->tag_pos) {
+               /* seq_read skipped first next call */
+               ts_entry = SEQ_START_TOKEN;
+       } else {
+               ts_entry = tag_stat_tree_search(
+                               &ppi->iface_entry->tag_stat_tree, ppi->tag);
+               if (!ts_entry) {
+                       pr_info("qtaguid: %s(): tag_stat.tag 0x%llx not found. Abort.\n",
+                               __func__, ppi->tag);
+                       return NULL;
+               }
+       }
+
+       if (*pos == ppi->tag_pos) { /* normal resume */
+               ppi->item_index = ppi->tag_item_index;
+       } else {
+               /* seq_read skipped a next call */
+               *pos = ppi->tag_pos;
+               ts_entry = qtaguid_stats_proc_next(m, ts_entry, pos);
+       }
+
+       return ts_entry;
+}
+
+static void qtaguid_stats_proc_stop(struct seq_file *m, void *v)
+{
+       struct proc_print_info *ppi = m->private;
+       if (ppi->iface_entry)
+               spin_unlock_bh(&ppi->iface_entry->tag_stat_list_lock);
+       spin_unlock_bh(&iface_stat_list_lock);
+}
+
+/*
+ * Procfs reader to get all tag stats using style "1)" as described in
+ * fs/proc/generic.c
+ * Groups all protocols' tx/rx bytes.
+ */
+static int qtaguid_stats_proc_show(struct seq_file *m, void *v)
+{
+       struct tag_stat *ts_entry = v;
+
+       if (v == SEQ_START_TOKEN)
+               pp_stats_header(m);
+       else
+               pp_sets(m, ts_entry);
+
+       return 0;
+}
+
+/*------------------------------------------*/
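+/*
+ * Editorial note: a process that tags sockets is expected to keep
+ * /dev/xt_qtaguid open; the proc_qtu_data created in qtudev_open() is what
+ * lets qtudev_release() untag and free whatever the process left behind
+ * when it exits.
+ */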
+static int qtudev_open(struct inode *inode, struct file *file)
+{
+       struct uid_tag_data *utd_entry;
+       struct proc_qtu_data  *pqd_entry;
+       struct proc_qtu_data  *new_pqd_entry;
+       int res;
+       bool utd_entry_found;
+
+       if (unlikely(qtu_proc_handling_passive))
+               return 0;
+
+       DR_DEBUG("qtaguid: qtudev_open(): pid=%u tgid=%u uid=%u\n",
+                current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+
+       spin_lock_bh(&uid_tag_data_tree_lock);
+
+       /* Look for existing uid data, or alloc one. */
+       utd_entry = get_uid_data(from_kuid(&init_user_ns, current_fsuid()), &utd_entry_found);
+       if (IS_ERR_OR_NULL(utd_entry)) {
+               res = PTR_ERR(utd_entry);
+               goto err_unlock;
+       }
+
+       /* Look for existing PID based proc_data */
+       pqd_entry = proc_qtu_data_tree_search(&proc_qtu_data_tree,
+                                             current->tgid);
+       if (pqd_entry) {
+               pr_err("qtaguid: qtudev_open(): %u/%u %u "
+                      "%s already opened\n",
+                      current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()),
+                      QTU_DEV_NAME);
+               res = -EBUSY;
+               goto err_unlock_free_utd;
+       }
+
+       new_pqd_entry = kzalloc(sizeof(*new_pqd_entry), GFP_ATOMIC);
+       if (!new_pqd_entry) {
+               pr_err("qtaguid: qtudev_open(): %u/%u %u: "
+                      "proc data alloc failed\n",
+                      current->pid, current->tgid, from_kuid(&init_user_ns, current_fsuid()));
+               res = -ENOMEM;
+               goto err_unlock_free_utd;
+       }
+       new_pqd_entry->pid = current->tgid;
+       INIT_LIST_HEAD(&new_pqd_entry->sock_tag_list);
+       new_pqd_entry->parent_tag_data = utd_entry;
+       utd_entry->num_pqd++;
+
+       proc_qtu_data_tree_insert(new_pqd_entry,
+                                 &proc_qtu_data_tree);
+
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       DR_DEBUG("qtaguid: tracking data for uid=%u in pqd=%p\n",
+                from_kuid(&init_user_ns, current_fsuid()), new_pqd_entry);
+       file->private_data = new_pqd_entry;
+       return 0;
+
+err_unlock_free_utd:
+       if (!utd_entry_found) {
+               rb_erase(&utd_entry->node, &uid_tag_data_tree);
+               kfree(utd_entry);
+       }
+err_unlock:
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       return res;
+}
+
+static int qtudev_release(struct inode *inode, struct file *file)
+{
+       struct proc_qtu_data  *pqd_entry = file->private_data;
+       struct uid_tag_data  *utd_entry = pqd_entry->parent_tag_data;
+       struct sock_tag *st_entry;
+       struct rb_root st_to_free_tree = RB_ROOT;
+       struct list_head *entry, *next;
+       struct tag_ref *tr;
+
+       if (unlikely(qtu_proc_handling_passive))
+               return 0;
+
+       /*
+        * Do not trust the current->pid, it might just be a kworker cleaning
+        * up after a dead proc.
+        */
+       DR_DEBUG("qtaguid: qtudev_release(): "
+                "pid=%u tgid=%u uid=%u "
+                "pqd_entry=%p->pid=%u utd_entry=%p->active_tags=%d\n",
+                current->pid, current->tgid, pqd_entry->parent_tag_data->uid,
+                pqd_entry, pqd_entry->pid, utd_entry,
+                utd_entry->num_active_tags);
+
+       spin_lock_bh(&sock_tag_list_lock);
+       spin_lock_bh(&uid_tag_data_tree_lock);
+
+       list_for_each_safe(entry, next, &pqd_entry->sock_tag_list) {
+               st_entry = list_entry(entry, struct sock_tag, list);
+               DR_DEBUG("qtaguid: %s(): "
+                        "erase sock_tag=%p->sk=%p pid=%u tgid=%u uid=%u\n",
+                        __func__,
+                        st_entry, st_entry->sk,
+                        current->pid, current->tgid,
+                        pqd_entry->parent_tag_data->uid);
+
+               utd_entry = uid_tag_data_tree_search(
+                       &uid_tag_data_tree,
+                       get_uid_from_tag(st_entry->tag));
+               BUG_ON(IS_ERR_OR_NULL(utd_entry));
+               DR_DEBUG("qtaguid: %s(): "
+                        "looking for tag=0x%llx in utd_entry=%p\n", __func__,
+                        st_entry->tag, utd_entry);
+               tr = tag_ref_tree_search(&utd_entry->tag_ref_tree,
+                                        st_entry->tag);
+               BUG_ON(!tr);
+               BUG_ON(tr->num_sock_tags <= 0);
+               tr->num_sock_tags--;
+               free_tag_ref_from_utd_entry(tr, utd_entry);
+
+               rb_erase(&st_entry->sock_node, &sock_tag_tree);
+               list_del(&st_entry->list);
+               /* Can't sockfd_put() within spinlock, do it later. */
+               sock_tag_tree_insert(st_entry, &st_to_free_tree);
+
+               /*
+                * Try to free the utd_entry if no other proc_qtu_data is
+                * using it (num_pqd is 0) and it doesn't have active tags
+                * (num_active_tags is 0).
+                */
+               put_utd_entry(utd_entry);
+       }
+
+       rb_erase(&pqd_entry->node, &proc_qtu_data_tree);
+       BUG_ON(pqd_entry->parent_tag_data->num_pqd < 1);
+       pqd_entry->parent_tag_data->num_pqd--;
+       put_utd_entry(pqd_entry->parent_tag_data);
+       kfree(pqd_entry);
+       file->private_data = NULL;
+
+       spin_unlock_bh(&uid_tag_data_tree_lock);
+       spin_unlock_bh(&sock_tag_list_lock);
+
+
+       sock_tag_tree_erase(&st_to_free_tree);
+
+       spin_lock_bh(&sock_tag_list_lock);
+       prdebug_full_state_locked(0, "%s(): pid=%u tgid=%u", __func__,
+                          current->pid, current->tgid);
+       spin_unlock_bh(&sock_tag_list_lock);
+       return 0;
+}
+
+/*------------------------------------------*/
+static const struct file_operations qtudev_fops = {
+       .owner = THIS_MODULE,
+       .open = qtudev_open,
+       .release = qtudev_release,
+};
+
+static struct miscdevice qtu_device = {
+       .minor = MISC_DYNAMIC_MINOR,
+       .name = QTU_DEV_NAME,
+       .fops = &qtudev_fops,
+       /* How sad it doesn't allow for defaults: .mode = S_IRUGO | S_IWUSR */
+};
+
+static const struct seq_operations proc_qtaguid_ctrl_seqops = {
+       .start = qtaguid_ctrl_proc_start,
+       .next = qtaguid_ctrl_proc_next,
+       .stop = qtaguid_ctrl_proc_stop,
+       .show = qtaguid_ctrl_proc_show,
+};
+
+static int proc_qtaguid_ctrl_open(struct inode *inode, struct file *file)
+{
+       return seq_open_private(file, &proc_qtaguid_ctrl_seqops,
+                               sizeof(struct proc_ctrl_print_info));
+}
+
+static const struct file_operations proc_qtaguid_ctrl_fops = {
+       .open           = proc_qtaguid_ctrl_open,
+       .read           = seq_read,
+       .write          = qtaguid_ctrl_proc_write,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+static const struct seq_operations proc_qtaguid_stats_seqops = {
+       .start = qtaguid_stats_proc_start,
+       .next = qtaguid_stats_proc_next,
+       .stop = qtaguid_stats_proc_stop,
+       .show = qtaguid_stats_proc_show,
+};
+
+static int proc_qtaguid_stats_open(struct inode *inode, struct file *file)
+{
+       return seq_open_private(file, &proc_qtaguid_stats_seqops,
+                               sizeof(struct proc_print_info));
+}
+
+static const struct file_operations proc_qtaguid_stats_fops = {
+       .open           = proc_qtaguid_stats_open,
+       .read           = seq_read,
+       .llseek         = seq_lseek,
+       .release        = seq_release_private,
+};
+
+/*------------------------------------------*/
+static int __init qtaguid_proc_register(struct proc_dir_entry **res_procdir)
+{
+       int ret;
+       *res_procdir = proc_mkdir(module_procdirname, init_net.proc_net);
+       if (!*res_procdir) {
+               pr_err("qtaguid: failed to create proc/.../xt_qtaguid\n");
+               ret = -ENOMEM;
+               goto no_dir;
+       }
+
+       xt_qtaguid_ctrl_file = proc_create_data("ctrl", proc_ctrl_perms,
+                                               *res_procdir,
+                                               &proc_qtaguid_ctrl_fops,
+                                               NULL);
+       if (!xt_qtaguid_ctrl_file) {
+               pr_err("qtaguid: failed to create xt_qtaguid/ctrl "
+                       "file\n");
+               ret = -ENOMEM;
+               goto no_ctrl_entry;
+       }
+
+       xt_qtaguid_stats_file = proc_create_data("stats", proc_stats_perms,
+                                                *res_procdir,
+                                                &proc_qtaguid_stats_fops,
+                                                NULL);
+       if (!xt_qtaguid_stats_file) {
+               pr_err("qtaguid: failed to create xt_qtaguid/stats "
+                       "file\n");
+               ret = -ENOMEM;
+               goto no_stats_entry;
+       }
+       /*
+        * TODO: add support for counter hacking
+        * xt_qtaguid_stats_file->write_proc = qtaguid_stats_proc_write;
+        */
+       return 0;
+
+no_stats_entry:
+       remove_proc_entry("ctrl", *res_procdir);
+no_ctrl_entry:
+       remove_proc_entry("xt_qtaguid", NULL);
+no_dir:
+       return ret;
+}
+
+static struct xt_match qtaguid_mt_reg __read_mostly = {
+       /*
+        * This module masquerades as the "owner" module so that iptables
+        * tools can deal with it.
+        */
+       .name       = "owner",
+       .revision   = 1,
+       .family     = NFPROTO_UNSPEC,
+       .match      = qtaguid_mt,
+       .matchsize  = sizeof(struct xt_qtaguid_match_info),
+       .me         = THIS_MODULE,
+};
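+/*
+ * Usage sketch (illustrative, not from the original source): because the
+ * match registers as revision 1 of "owner", stock iptables tools can install
+ * rules such as:
+ *   iptables -A OUTPUT -m owner --uid-owner 10003 -j REJECT
+ * while the per-tag byte/packet counters are read from
+ * /proc/net/xt_qtaguid/stats.
+ */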
+
+static int __init qtaguid_mt_init(void)
+{
+       if (qtaguid_proc_register(&xt_qtaguid_procdir)
+           || iface_stat_init(xt_qtaguid_procdir)
+           || xt_register_match(&qtaguid_mt_reg)
+           || misc_register(&qtu_device))
+               return -1;
+       return 0;
+}
+
+/*
+ * TODO: allow unloading of the module.
+ * For now stats are permanent.
+ * Kconfig forces 'y/n' and never an 'm'.
+ */
+
+module_init(qtaguid_mt_init);
+MODULE_AUTHOR("jpa <jpa@google.com>");
+MODULE_DESCRIPTION("Xtables: socket owner+tag matching and associated stats");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_owner");
+MODULE_ALIAS("ip6t_owner");
+MODULE_ALIAS("ipt_qtaguid");
+MODULE_ALIAS("ip6t_qtaguid");
diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h
new file mode 100644 (file)
index 0000000..c705270
--- /dev/null
@@ -0,0 +1,350 @@
+/*
+ * Kernel iptables module to track stats for packets based on user tags.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_INTERNAL_H__
+#define __XT_QTAGUID_INTERNAL_H__
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+#include <linux/spinlock_types.h>
+#include <linux/workqueue.h>
+
+/* Iface handling */
+#define IDEBUG_MASK (1<<0)
+/* Iptable Matching. Per packet. */
+#define MDEBUG_MASK (1<<1)
+/* Red-black tree handling. Per packet. */
+#define RDEBUG_MASK (1<<2)
+/* procfs ctrl/stats handling */
+#define CDEBUG_MASK (1<<3)
+/* dev and resource tracking */
+#define DDEBUG_MASK (1<<4)
+
+/* E.g (IDEBUG_MASK | CDEBUG_MASK | DDEBUG_MASK) */
+#define DEFAULT_DEBUG_MASK 0
+
+/*
+ * (Un)Define these *DEBUG to compile out/in the pr_debug calls.
+ * All undef: text size ~ 0x3030; all def: ~ 0x4404.
+ */
+#define IDEBUG
+#define MDEBUG
+#define RDEBUG
+#define CDEBUG
+#define DDEBUG
+
+#define MSK_DEBUG(mask, ...) do {                           \
+               if (unlikely(qtaguid_debug_mask & (mask)))  \
+                       pr_debug(__VA_ARGS__);              \
+       } while (0)
+#ifdef IDEBUG
+#define IF_DEBUG(...) MSK_DEBUG(IDEBUG_MASK, __VA_ARGS__)
+#else
+#define IF_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef MDEBUG
+#define MT_DEBUG(...) MSK_DEBUG(MDEBUG_MASK, __VA_ARGS__)
+#else
+#define MT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef RDEBUG
+#define RB_DEBUG(...) MSK_DEBUG(RDEBUG_MASK, __VA_ARGS__)
+#else
+#define RB_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef CDEBUG
+#define CT_DEBUG(...) MSK_DEBUG(CDEBUG_MASK, __VA_ARGS__)
+#else
+#define CT_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+#ifdef DDEBUG
+#define DR_DEBUG(...) MSK_DEBUG(DDEBUG_MASK, __VA_ARGS__)
+#else
+#define DR_DEBUG(...) no_printk(__VA_ARGS__)
+#endif
+
+extern uint qtaguid_debug_mask;
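+/*
+ * Editorial example: setting qtaguid_debug_mask to
+ * (CDEBUG_MASK | DDEBUG_MASK) enables the CT_DEBUG (procfs ctrl/stats) and
+ * DR_DEBUG (dev/resource tracking) prints while leaving the per-packet
+ * debug categories off.
+ */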
+
+/*---------------------------------------------------------------------------*/
+/*
+ * Tags:
+ *
+ * They represent what the data usage counters will be tracked against.
+ * By default a tag is just based on the UID.
+ * The UID is used as the base for policing, and can not be ignored.
+ * So a tag will always at least represent a UID (uid_tag).
+ *
+ * A tag can be augmented with an "accounting tag" which is associated
+ * with a UID.
+ * User space can set the acct_tag portion of the tag which is then used
+ * with sockets: all data belonging to that socket will be counted against the
+ * tag. The policing is then based on the tag's uid_tag portion,
+ * and stats are collected for the acct_tag portion separately.
+ *
+ * There could be
+ * a:  {acct_tag=1, uid_tag=10003}
+ * b:  {acct_tag=2, uid_tag=10003}
+ * c:  {acct_tag=3, uid_tag=10003}
+ * d:  {acct_tag=0, uid_tag=10003}
+ * a, b, and c represent tags associated with specific sockets.
+ * d is for the totals for that uid, including all untagged traffic.
+ * Typically d is used with policing/quota rules.
+ *
+ * We want tag_t big enough to distinguish uid_t and acct_tag.
+ * It might become a struct if needed.
+ * Nothing should be using it as an int.
+ */
+typedef uint64_t tag_t;  /* Only used via accessors */
+
+#define TAG_UID_MASK 0xFFFFFFFFULL
+#define TAG_ACCT_MASK (~0xFFFFFFFFULL)
+
+static inline int tag_compare(tag_t t1, tag_t t2)
+{
+       return t1 < t2 ? -1 : t1 == t2 ? 0 : 1;
+}
+
+static inline tag_t combine_atag_with_uid(tag_t acct_tag, uid_t uid)
+{
+       return acct_tag | uid;
+}
+static inline tag_t make_tag_from_uid(uid_t uid)
+{
+       return uid;
+}
+static inline uid_t get_uid_from_tag(tag_t tag)
+{
+       return tag & TAG_UID_MASK;
+}
+static inline tag_t get_utag_from_tag(tag_t tag)
+{
+       return tag & TAG_UID_MASK;
+}
+static inline tag_t get_atag_from_tag(tag_t tag)
+{
+       return tag & TAG_ACCT_MASK;
+}
+
+static inline bool valid_atag(tag_t tag)
+{
+       return !(tag & TAG_UID_MASK);
+}
+static inline tag_t make_atag_from_value(uint32_t value)
+{
+       return (uint64_t)value << 32;
+}
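+/*
+ * Worked example (illustrative values, not from the original source):
+ * for uid 10003 (0x2713) and accounting value 2,
+ *   make_atag_from_value(2)            == 0x0000000200000000
+ *   combine_atag_with_uid(atag, 10003) == 0x0000000200002713
+ * and get_uid_from_tag(tag) == 10003 while get_atag_from_tag(tag) == atag.
+ */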
+/*---------------------------------------------------------------------------*/
+
+/*
+ * Maximum number of socket tags that a UID is allowed to have active.
+ * Multiple processes belonging to the same UID contribute towards this limit.
+ * Special UIDs that can impersonate a UID also contribute (e.g. download
+ * manager, ...)
+ */
+#define DEFAULT_MAX_SOCK_TAGS 1024
+
+/*
+ * For now we only track 2 sets of counters.
+ * The default set is 0.
+ * Userspace can activate another set for a given uid being tracked.
+ */
+#define IFS_MAX_COUNTER_SETS 2
+
+enum ifs_tx_rx {
+       IFS_TX,
+       IFS_RX,
+       IFS_MAX_DIRECTIONS
+};
+
+/* For now, TCP, UDP, the rest */
+enum ifs_proto {
+       IFS_TCP,
+       IFS_UDP,
+       IFS_PROTO_OTHER,
+       IFS_MAX_PROTOS
+};
+
+struct byte_packet_counters {
+       uint64_t bytes;
+       uint64_t packets;
+};
+
+struct data_counters {
+       struct byte_packet_counters bpc[IFS_MAX_COUNTER_SETS][IFS_MAX_DIRECTIONS][IFS_MAX_PROTOS];
+};
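+/*
+ * Editorial note: bpc is indexed as [counter_set][direction][protocol],
+ * e.g. bpc[0][IFS_RX][IFS_TCP] holds the default-set received-TCP counters.
+ */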
+
+static inline uint64_t dc_sum_bytes(struct data_counters *counters,
+                                   int set,
+                                   enum ifs_tx_rx direction)
+{
+       return counters->bpc[set][direction][IFS_TCP].bytes
+               + counters->bpc[set][direction][IFS_UDP].bytes
+               + counters->bpc[set][direction][IFS_PROTO_OTHER].bytes;
+}
+
+static inline uint64_t dc_sum_packets(struct data_counters *counters,
+                                     int set,
+                                     enum ifs_tx_rx direction)
+{
+       return counters->bpc[set][direction][IFS_TCP].packets
+               + counters->bpc[set][direction][IFS_UDP].packets
+               + counters->bpc[set][direction][IFS_PROTO_OTHER].packets;
+}
+
+
+/* Generic X based nodes used as a base for rb_tree ops */
+struct tag_node {
+       struct rb_node node;
+       tag_t tag;
+};
+
+struct tag_stat {
+       struct tag_node tn;
+       struct data_counters counters;
+       /*
+        * If this tag is acct_tag based, we need to count against the
+        * matching parent uid_tag.
+        */
+       struct data_counters *parent_counters;
+};
+
+struct iface_stat {
+       struct list_head list;  /* in iface_stat_list */
+       char *ifname;
+       bool active;
+       /* net_dev is only valid for active iface_stat */
+       struct net_device *net_dev;
+
+       struct byte_packet_counters totals_via_dev[IFS_MAX_DIRECTIONS];
+       struct data_counters totals_via_skb;
+       /*
+        * We keep the last_known, because some devices reset their counters
+        * just before NETDEV_UP, while some will reset just before
+        * NETDEV_REGISTER (which is more normal).
+        * So now, if the device didn't do a NETDEV_UNREGISTER and we see
+        * its current dev stats smaller than what was previously known, we
+        * assume an UNREGISTER and just use the last_known.
+        */
+       struct byte_packet_counters last_known[IFS_MAX_DIRECTIONS];
+       /* last_known is usable when last_known_valid is true */
+       bool last_known_valid;
+
+       struct proc_dir_entry *proc_ptr;
+
+       struct rb_root tag_stat_tree;
+       spinlock_t tag_stat_list_lock;
+};
+
+/* This is needed to create proc_dir_entries from atomic context. */
+struct iface_stat_work {
+       struct work_struct iface_work;
+       struct iface_stat *iface_entry;
+};
+
+/*
+ * Track the tag that this socket is transferring data for, not necessarily
+ * the uid that owns the socket.
+ * This is the tag against which tag_stat.counters will be billed.
+ * These structs need to be looked up by sock and pid.
+ */
+struct sock_tag {
+       struct rb_node sock_node;
+       struct sock *sk;  /* Only used as a number, never dereferenced */
+       /* Used to associate with a given pid */
+       struct list_head list;   /* in proc_qtu_data.sock_tag_list */
+       pid_t pid;
+
+       tag_t tag;
+};
+
+struct qtaguid_event_counts {
+       /* Various successful events */
+       atomic64_t sockets_tagged;
+       atomic64_t sockets_untagged;
+       atomic64_t counter_set_changes;
+       atomic64_t delete_cmds;
+       atomic64_t iface_events;  /* Number of NETDEV_* events handled */
+
+       atomic64_t match_calls;   /* Number of times iptables called mt */
+       /* Number of times iptables called mt from pre or post routing hooks */
+       atomic64_t match_calls_prepost;
+       /*
+        * match_found_sk_*: numbers related to the netfilter matching
+        * function finding a sock for the sk_buff.
+        * Total skbs processed is sum(match_found*).
+        */
+       atomic64_t match_found_sk;   /* An sk was already in the sk_buff. */
+       /* The connection tracker had or didn't have the sk. */
+       atomic64_t match_found_sk_in_ct;
+       atomic64_t match_found_no_sk_in_ct;
+       /*
+        * No sk could be found. No apparent owner. Could happen with
+        * unsolicited traffic.
+        */
+       atomic64_t match_no_sk;
+       /*
+        * The file ptr in the sk_socket wasn't there and we couldn't get GID.
+        * This might happen for traffic while the socket is being closed.
+        */
+       atomic64_t match_no_sk_gid;
+};
+
+/* Track the set active_set for the given tag. */
+struct tag_counter_set {
+       struct tag_node tn;
+       int active_set;
+};
+
+/*----------------------------------------------*/
+/*
+ * The qtu uid data is used to track resources that are created directly or
+ * indirectly by processes (uid tracked).
+ * It is shared by the processes with the same uid.
+ * Some of the resources are counted to prevent further rogue allocations,
+ * some will need freeing once the owner process (uid) exits.
+ */
+struct uid_tag_data {
+       struct rb_node node;
+       uid_t uid;
+
+       /*
+        * For the uid, how many accounting tags have been set.
+        */
+       int num_active_tags;
+       /* Track the number of proc_qtu_data that reference it */
+       int num_pqd;
+       struct rb_root tag_ref_tree;
+       /* No tag_node_tree_lock; use uid_tag_data_tree_lock */
+};
+
+struct tag_ref {
+       struct tag_node tn;
+
+       /*
+        * This tracks the number of active sockets that have a tag on them
+        * which matches this tag_ref.tn.tag.
+        * A tag ref can live on after the sockets are untagged.
+        * A tag ref can only be removed during a tag delete command.
+        */
+       int num_sock_tags;
+};
+
+struct proc_qtu_data {
+       struct rb_node node;
+       pid_t pid;
+
+       struct uid_tag_data *parent_tag_data;
+
+       /* Tracks the sock_tags that need freeing upon this proc's death */
+       struct list_head sock_tag_list;
+       /* No spinlock_t sock_tag_list_lock; use the global one. */
+};
+
+/*----------------------------------------------*/
+#endif  /* ifndef __XT_QTAGUID_INTERNAL_H__ */
diff --git a/net/netfilter/xt_qtaguid_print.c b/net/netfilter/xt_qtaguid_print.c
new file mode 100644 (file)
index 0000000..cab478e
--- /dev/null
@@ -0,0 +1,565 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+/*
+ * Most of the functions in this file just waste time if DEBUG is not defined.
+ * The matching xt_qtaguid_print.h will static inline empty funcs if the needed
+ * debug flags are not defined.
+ * Those funcs that fail to allocate memory will panic as there is no need to
+ * hobble along just pretending to do the requested work.
+ */
+
+#define DEBUG
+
+#include <linux/fs.h>
+#include <linux/gfp.h>
+#include <linux/net.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/spinlock_types.h>
+#include <net/sock.h>
+
+#include "xt_qtaguid_internal.h"
+#include "xt_qtaguid_print.h"
+
+#ifdef DDEBUG
+
+static void _bug_on_err_or_null(void *ptr)
+{
+       if (IS_ERR_OR_NULL(ptr)) {
+               pr_err("qtaguid: kmalloc failed\n");
+               BUG();
+       }
+}
+
+char *pp_tag_t(tag_t *tag)
+{
+       char *res;
+
+       if (!tag)
+               res = kasprintf(GFP_ATOMIC, "tag_t@null{}");
+       else
+               res = kasprintf(GFP_ATOMIC,
+                               "tag_t@%p{tag=0x%llx, uid=%u}",
+                               tag, *tag, get_uid_from_tag(*tag));
+       _bug_on_err_or_null(res);
+       return res;
+}
+
+char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+       char *res;
+
+       if (!dc)
+               res = kasprintf(GFP_ATOMIC, "data_counters@null{}");
+       else if (showValues)
+               res = kasprintf(
+                       GFP_ATOMIC, "data_counters@%p{"
+                       "set0{"
+                       "rx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}, "
+                       "tx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}}, "
+                       "set1{"
+                       "rx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}, "
+                       "tx{"
+                       "tcp{b=%llu, p=%llu}, "
+                       "udp{b=%llu, p=%llu},"
+                       "other{b=%llu, p=%llu}}}}",
+                       dc,
+                       dc->bpc[0][IFS_RX][IFS_TCP].bytes,
+                       dc->bpc[0][IFS_RX][IFS_TCP].packets,
+                       dc->bpc[0][IFS_RX][IFS_UDP].bytes,
+                       dc->bpc[0][IFS_RX][IFS_UDP].packets,
+                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[0][IFS_RX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[0][IFS_TX][IFS_TCP].bytes,
+                       dc->bpc[0][IFS_TX][IFS_TCP].packets,
+                       dc->bpc[0][IFS_TX][IFS_UDP].bytes,
+                       dc->bpc[0][IFS_TX][IFS_UDP].packets,
+                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[0][IFS_TX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[1][IFS_RX][IFS_TCP].bytes,
+                       dc->bpc[1][IFS_RX][IFS_TCP].packets,
+                       dc->bpc[1][IFS_RX][IFS_UDP].bytes,
+                       dc->bpc[1][IFS_RX][IFS_UDP].packets,
+                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[1][IFS_RX][IFS_PROTO_OTHER].packets,
+                       dc->bpc[1][IFS_TX][IFS_TCP].bytes,
+                       dc->bpc[1][IFS_TX][IFS_TCP].packets,
+                       dc->bpc[1][IFS_TX][IFS_UDP].bytes,
+                       dc->bpc[1][IFS_TX][IFS_UDP].packets,
+                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].bytes,
+                       dc->bpc[1][IFS_TX][IFS_PROTO_OTHER].packets);
+       else
+               res = kasprintf(GFP_ATOMIC, "data_counters@%p{...}", dc);
+       _bug_on_err_or_null(res);
+       return res;
+}
+
+char *pp_tag_node(struct tag_node *tn)
+{
+       char *tag_str;
+       char *res;
+
+       if (!tn) {
+               res = kasprintf(GFP_ATOMIC, "tag_node@null{}");
+               _bug_on_err_or_null(res);
+               return res;
+       }
+       tag_str = pp_tag_t(&tn->tag);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_node@%p{tag=%s}",
+                       tn, tag_str);
+       _bug_on_err_or_null(res);
+       kfree(tag_str);
+       return res;
+}
+
+char *pp_tag_ref(struct tag_ref *tr)
+{
+       char *tn_str;
+       char *res;
+
+       if (!tr) {
+               res = kasprintf(GFP_ATOMIC, "tag_ref@null{}");
+               _bug_on_err_or_null(res);
+               return res;
+       }
+       tn_str = pp_tag_node(&tr->tn);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_ref@%p{%s, num_sock_tags=%d}",
+                       tr, tn_str, tr->num_sock_tags);
+       _bug_on_err_or_null(res);
+       kfree(tn_str);
+       return res;
+}
+
+char *pp_tag_stat(struct tag_stat *ts)
+{
+       char *tn_str;
+       char *counters_str;
+       char *parent_counters_str;
+       char *res;
+
+       if (!ts) {
+               res = kasprintf(GFP_ATOMIC, "tag_stat@null{}");
+               _bug_on_err_or_null(res);
+               return res;
+       }
+       tn_str = pp_tag_node(&ts->tn);
+       counters_str = pp_data_counters(&ts->counters, true);
+       parent_counters_str = pp_data_counters(ts->parent_counters, false);
+       res = kasprintf(GFP_ATOMIC,
+                       "tag_stat@%p{%s, counters=%s, parent_counters=%s}",
+                       ts, tn_str, counters_str, parent_counters_str);
+       _bug_on_err_or_null(res);
+       kfree(tn_str);
+       kfree(counters_str);
+       kfree(parent_counters_str);
+       return res;
+}
+
+char *pp_iface_stat(struct iface_stat *is)
+{
+       char *res;
+       if (!is) {
+               res = kasprintf(GFP_ATOMIC, "iface_stat@null{}");
+       } else {
+               struct data_counters *cnts = &is->totals_via_skb;
+               res = kasprintf(GFP_ATOMIC, "iface_stat@%p{"
+                               "list=list_head{...}, "
+                               "ifname=%s, "
+                               "total_dev={rx={bytes=%llu, "
+                               "packets=%llu}, "
+                               "tx={bytes=%llu, "
+                               "packets=%llu}}, "
+                               "total_skb={rx={bytes=%llu, "
+                               "packets=%llu}, "
+                               "tx={bytes=%llu, "
+                               "packets=%llu}}, "
+                               "last_known_valid=%d, "
+                               "last_known={rx={bytes=%llu, "
+                               "packets=%llu}, "
+                               "tx={bytes=%llu, "
+                               "packets=%llu}}, "
+                               "active=%d, "
+                               "net_dev=%p, "
+                               "proc_ptr=%p, "
+                               "tag_stat_tree=rb_root{...}}",
+                               is,
+                               is->ifname,
+                               is->totals_via_dev[IFS_RX].bytes,
+                               is->totals_via_dev[IFS_RX].packets,
+                               is->totals_via_dev[IFS_TX].bytes,
+                               is->totals_via_dev[IFS_TX].packets,
+                               dc_sum_bytes(cnts, 0, IFS_RX),
+                               dc_sum_packets(cnts, 0, IFS_RX),
+                               dc_sum_bytes(cnts, 0, IFS_TX),
+                               dc_sum_packets(cnts, 0, IFS_TX),
+                               is->last_known_valid,
+                               is->last_known[IFS_RX].bytes,
+                               is->last_known[IFS_RX].packets,
+                               is->last_known[IFS_TX].bytes,
+                               is->last_known[IFS_TX].packets,
+                               is->active,
+                               is->net_dev,
+                               is->proc_ptr);
+       }
+       _bug_on_err_or_null(res);
+       return res;
+}
+
+char *pp_sock_tag(struct sock_tag *st)
+{
+       char *tag_str;
+       char *res;
+
+       if (!st) {
+               res = kasprintf(GFP_ATOMIC, "sock_tag@null{}");
+               _bug_on_err_or_null(res);
+               return res;
+       }
+       tag_str = pp_tag_t(&st->tag);
+       res = kasprintf(GFP_ATOMIC, "sock_tag@%p{"
+                       "sock_node=rb_node{...}, "
+                       "sk=%p (f_count=%d), list=list_head{...}, "
+                       "pid=%u, tag=%s}",
+                       st, st->sk, refcount_read(&st->sk->sk_refcnt),
+                       st->pid, tag_str);
+       _bug_on_err_or_null(res);
+       kfree(tag_str);
+       return res;
+}
+
+char *pp_uid_tag_data(struct uid_tag_data *utd)
+{
+       char *res;
+
+       if (!utd)
+               res = kasprintf(GFP_ATOMIC, "uid_tag_data@null{}");
+       else
+               res = kasprintf(GFP_ATOMIC, "uid_tag_data@%p{"
+                               "uid=%u, num_active_acct_tags=%d, "
+                               "num_pqd=%d, "
+                               "tag_node_tree=rb_root{...}, "
+                               "proc_qtu_data_tree=rb_root{...}}",
+                               utd, utd->uid,
+                               utd->num_active_tags, utd->num_pqd);
+       _bug_on_err_or_null(res);
+       return res;
+}
+
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+       char *parent_tag_data_str;
+       char *res;
+
+       if (!pqd) {
+               res = kasprintf(GFP_ATOMIC, "proc_qtu_data@null{}");
+               _bug_on_err_or_null(res);
+               return res;
+       }
+       parent_tag_data_str = pp_uid_tag_data(pqd->parent_tag_data);
+       res = kasprintf(GFP_ATOMIC, "proc_qtu_data@%p{"
+                       "node=rb_node{...}, pid=%u, "
+                       "parent_tag_data=%s, "
+                       "sock_tag_list=list_head{...}}",
+                       pqd, pqd->pid, parent_tag_data_str
+               );
+       _bug_on_err_or_null(res);
+       kfree(parent_tag_data_str);
+       return res;
+}
+
+/*------------------------------------------*/
+void prdebug_sock_tag_tree(int indent_level,
+                          struct rb_root *sock_tag_tree)
+{
+       struct rb_node *node;
+       struct sock_tag *sock_tag_entry;
+       char *str;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (RB_EMPTY_ROOT(sock_tag_tree)) {
+               str = "sock_tag_tree=rb_root{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "sock_tag_tree=rb_root{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(sock_tag_tree);
+            node;
+            node = rb_next(node)) {
+               sock_tag_entry = rb_entry(node, struct sock_tag, sock_node);
+               str = pp_sock_tag(sock_tag_entry);
+               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_sock_tag_list(int indent_level,
+                          struct list_head *sock_tag_list)
+{
+       struct sock_tag *sock_tag_entry;
+       char *str;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (list_empty(sock_tag_list)) {
+               str = "sock_tag_list=list_head{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "sock_tag_list=list_head{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       list_for_each_entry(sock_tag_entry, sock_tag_list, list) {
+               str = pp_sock_tag(sock_tag_entry);
+               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_proc_qtu_data_tree(int indent_level,
+                               struct rb_root *proc_qtu_data_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct proc_qtu_data *proc_qtu_data_entry;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (RB_EMPTY_ROOT(proc_qtu_data_tree)) {
+               str = "proc_qtu_data_tree=rb_root{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "proc_qtu_data_tree=rb_root{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(proc_qtu_data_tree);
+            node;
+            node = rb_next(node)) {
+               proc_qtu_data_entry = rb_entry(node,
+                                              struct proc_qtu_data,
+                                              node);
+               str = pp_proc_qtu_data(proc_qtu_data_entry);
+               pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+               indent_level++;
+               prdebug_sock_tag_list(indent_level,
+                                     &proc_qtu_data_entry->sock_tag_list);
+               indent_level--;
+
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct tag_ref *tag_ref_entry;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (RB_EMPTY_ROOT(tag_ref_tree)) {
+               str = "tag_ref_tree{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "tag_ref_tree{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(tag_ref_tree);
+            node;
+            node = rb_next(node)) {
+               tag_ref_entry = rb_entry(node,
+                                        struct tag_ref,
+                                        tn.node);
+               str = pp_tag_ref(tag_ref_entry);
+               pr_debug("%*d: %s,\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_uid_tag_data_tree(int indent_level,
+                              struct rb_root *uid_tag_data_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct uid_tag_data *uid_tag_data_entry;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (RB_EMPTY_ROOT(uid_tag_data_tree)) {
+               str = "uid_tag_data_tree=rb_root{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "uid_tag_data_tree=rb_root{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(uid_tag_data_tree);
+            node;
+            node = rb_next(node)) {
+               uid_tag_data_entry = rb_entry(node, struct uid_tag_data,
+                                             node);
+               str = pp_uid_tag_data(uid_tag_data_entry);
+               pr_debug("%*d: %s,\n", indent_level*2, indent_level, str);
+               kfree(str);
+               if (!RB_EMPTY_ROOT(&uid_tag_data_entry->tag_ref_tree)) {
+                       indent_level++;
+                       prdebug_tag_ref_tree(indent_level,
+                                            &uid_tag_data_entry->tag_ref_tree);
+                       indent_level--;
+               }
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_tag_stat_tree(int indent_level,
+                                 struct rb_root *tag_stat_tree)
+{
+       char *str;
+       struct rb_node *node;
+       struct tag_stat *ts_entry;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (RB_EMPTY_ROOT(tag_stat_tree)) {
+               str = "tag_stat_tree{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "tag_stat_tree{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       for (node = rb_first(tag_stat_tree);
+            node;
+            node = rb_next(node)) {
+               ts_entry = rb_entry(node, struct tag_stat, tn.node);
+               str = pp_tag_stat(ts_entry);
+               pr_debug("%*d: %s\n", indent_level*2, indent_level,
+                        str);
+               kfree(str);
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+void prdebug_iface_stat_list(int indent_level,
+                            struct list_head *iface_stat_list)
+{
+       char *str;
+       struct iface_stat *iface_entry;
+
+       if (!unlikely(qtaguid_debug_mask & DDEBUG_MASK))
+               return;
+
+       if (list_empty(iface_stat_list)) {
+               str = "iface_stat_list=list_head{}";
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               return;
+       }
+
+       str = "iface_stat_list=list_head{";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+       indent_level++;
+       list_for_each_entry(iface_entry, iface_stat_list, list) {
+               str = pp_iface_stat(iface_entry);
+               pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+               kfree(str);
+
+               spin_lock_bh(&iface_entry->tag_stat_list_lock);
+               if (!RB_EMPTY_ROOT(&iface_entry->tag_stat_tree)) {
+                       indent_level++;
+                       prdebug_tag_stat_tree(indent_level,
+                                             &iface_entry->tag_stat_tree);
+                       indent_level--;
+               }
+               spin_unlock_bh(&iface_entry->tag_stat_list_lock);
+       }
+       indent_level--;
+       str = "}";
+       pr_debug("%*d: %s\n", indent_level*2, indent_level, str);
+}
+
+#endif  /* ifdef DDEBUG */
+/*------------------------------------------*/
+static const char * const netdev_event_strings[] = {
+       "netdev_unknown",
+       "NETDEV_UP",
+       "NETDEV_DOWN",
+       "NETDEV_REBOOT",
+       "NETDEV_CHANGE",
+       "NETDEV_REGISTER",
+       "NETDEV_UNREGISTER",
+       "NETDEV_CHANGEMTU",
+       "NETDEV_CHANGEADDR",
+       "NETDEV_GOING_DOWN",
+       "NETDEV_CHANGENAME",
+       "NETDEV_FEAT_CHANGE",
+       "NETDEV_BONDING_FAILOVER",
+       "NETDEV_PRE_UP",
+       "NETDEV_PRE_TYPE_CHANGE",
+       "NETDEV_POST_TYPE_CHANGE",
+       "NETDEV_POST_INIT",
+       "NETDEV_UNREGISTER_BATCH",
+       "NETDEV_RELEASE",
+       "NETDEV_NOTIFY_PEERS",
+       "NETDEV_JOIN",
+};
+
+const char *netdev_evt_str(int netdev_event)
+{
+       if (netdev_event < 0
+           || netdev_event >= ARRAY_SIZE(netdev_event_strings))
+               return "bad event num";
+       return netdev_event_strings[netdev_event];
+}
diff --git a/net/netfilter/xt_qtaguid_print.h b/net/netfilter/xt_qtaguid_print.h
new file mode 100644 (file)
index 0000000..b63871a
--- /dev/null
@@ -0,0 +1,120 @@
+/*
+ * Pretty printing Support for iptables xt_qtaguid module.
+ *
+ * (C) 2011 Google, Inc
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+#ifndef __XT_QTAGUID_PRINT_H__
+#define __XT_QTAGUID_PRINT_H__
+
+#include "xt_qtaguid_internal.h"
+
+#ifdef DDEBUG
+
+char *pp_tag_t(tag_t *tag);
+char *pp_data_counters(struct data_counters *dc, bool showValues);
+char *pp_tag_node(struct tag_node *tn);
+char *pp_tag_ref(struct tag_ref *tr);
+char *pp_tag_stat(struct tag_stat *ts);
+char *pp_iface_stat(struct iface_stat *is);
+char *pp_sock_tag(struct sock_tag *st);
+char *pp_uid_tag_data(struct uid_tag_data *qtd);
+char *pp_proc_qtu_data(struct proc_qtu_data *pqd);
+
+/*------------------------------------------*/
+void prdebug_sock_tag_list(int indent_level,
+                          struct list_head *sock_tag_list);
+void prdebug_sock_tag_tree(int indent_level,
+                          struct rb_root *sock_tag_tree);
+void prdebug_proc_qtu_data_tree(int indent_level,
+                               struct rb_root *proc_qtu_data_tree);
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree);
+void prdebug_uid_tag_data_tree(int indent_level,
+                              struct rb_root *uid_tag_data_tree);
+void prdebug_tag_stat_tree(int indent_level,
+                          struct rb_root *tag_stat_tree);
+void prdebug_iface_stat_list(int indent_level,
+                            struct list_head *iface_stat_list);
+
+#else
+
+/*------------------------------------------*/
+static inline char *pp_tag_t(tag_t *tag)
+{
+       return NULL;
+}
+static inline char *pp_data_counters(struct data_counters *dc, bool showValues)
+{
+       return NULL;
+}
+static inline char *pp_tag_node(struct tag_node *tn)
+{
+       return NULL;
+}
+static inline char *pp_tag_ref(struct tag_ref *tr)
+{
+       return NULL;
+}
+static inline char *pp_tag_stat(struct tag_stat *ts)
+{
+       return NULL;
+}
+static inline char *pp_iface_stat(struct iface_stat *is)
+{
+       return NULL;
+}
+static inline char *pp_sock_tag(struct sock_tag *st)
+{
+       return NULL;
+}
+static inline char *pp_uid_tag_data(struct uid_tag_data *qtd)
+{
+       return NULL;
+}
+static inline char *pp_proc_qtu_data(struct proc_qtu_data *pqd)
+{
+       return NULL;
+}
+
+/*------------------------------------------*/
+static inline
+void prdebug_sock_tag_list(int indent_level,
+                          struct list_head *sock_tag_list)
+{
+}
+static inline
+void prdebug_sock_tag_tree(int indent_level,
+                          struct rb_root *sock_tag_tree)
+{
+}
+static inline
+void prdebug_proc_qtu_data_tree(int indent_level,
+                               struct rb_root *proc_qtu_data_tree)
+{
+}
+static inline
+void prdebug_tag_ref_tree(int indent_level, struct rb_root *tag_ref_tree)
+{
+}
+static inline
+void prdebug_uid_tag_data_tree(int indent_level,
+                              struct rb_root *uid_tag_data_tree)
+{
+}
+static inline
+void prdebug_tag_stat_tree(int indent_level,
+                          struct rb_root *tag_stat_tree)
+{
+}
+static inline
+void prdebug_iface_stat_list(int indent_level,
+                            struct list_head *iface_stat_list)
+{
+}
+#endif
+/*------------------------------------------*/
+const char *netdev_evt_str(int netdev_event);
+#endif  /* ifndef __XT_QTAGUID_PRINT_H__ */
diff --git a/net/netfilter/xt_quota2.c b/net/netfilter/xt_quota2.c
new file mode 100644 (file)
index 0000000..24b7742
--- /dev/null
@@ -0,0 +1,401 @@
+/*
+ * xt_quota2 - enhanced xt_quota that can count upwards and in packets
+ * as a minimal accounting match.
+ * by Jan Engelhardt <jengelh@medozas.de>, 2008
+ *
+ * Originally based on xt_quota.c:
+ *     netfilter module to enforce network quotas
+ *     Sam Johnston <samj@samj.net>
+ *
+ *     This program is free software; you can redistribute it and/or modify
+ *     it under the terms of the GNU General Public License; either
+ *     version 2 of the License, as published by the Free Software Foundation.
+ */
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/proc_fs.h>
+#include <linux/skbuff.h>
+#include <linux/spinlock.h>
+#include <asm/atomic.h>
+#include <net/netlink.h>
+
+#include <linux/netfilter/x_tables.h>
+#include <linux/netfilter/xt_quota2.h>
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* For compatibility, these definitions are copied from the
+ * deprecated header file <linux/netfilter_ipv4/ipt_ULOG.h> */
+#define ULOG_MAC_LEN   80
+#define ULOG_PREFIX_LEN        32
+
+/* Format of the ULOG packets passed through netlink */
+typedef struct ulog_packet_msg {
+       unsigned long mark;
+       long timestamp_sec;
+       long timestamp_usec;
+       unsigned int hook;
+       char indev_name[IFNAMSIZ];
+       char outdev_name[IFNAMSIZ];
+       size_t data_len;
+       char prefix[ULOG_PREFIX_LEN];
+       unsigned char mac_len;
+       unsigned char mac[ULOG_MAC_LEN];
+       unsigned char payload[0];
+} ulog_packet_msg_t;
+#endif
+
+/**
+ * struct xt_quota_counter - state for one named quota counter
+ * @lock:      lock to protect quota writers from each other
+ */
+struct xt_quota_counter {
+       u_int64_t quota;
+       spinlock_t lock;
+       struct list_head list;
+       atomic_t ref;
+       char name[sizeof(((struct xt_quota_mtinfo2 *)NULL)->name)];
+       struct proc_dir_entry *procfs_entry;
+};
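+/* Anonymous (unnamed) counters are allocated by q2_new_counter() with only
+ * offsetof(typeof(*e), list) bytes, i.e. just the quota value and its lock;
+ * the list linkage, refcount, name and procfs entry below are used only by
+ * named counters, which are shared between rules via counter_list. */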
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+/* Harald's favorite number +1 :D From ipt_ULOG.C */
+static unsigned int qlog_nl_event = 112;
+module_param_named(event_num, qlog_nl_event, uint, S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(event_num,
+                "Event number for NETLINK_NFLOG message. 0 disables log. "
+                "111 is what ipt_ULOG uses.");
+static struct sock *nflognl;
+#endif
+
+static LIST_HEAD(counter_list);
+static DEFINE_SPINLOCK(counter_list_lock);
+
+static struct proc_dir_entry *proc_xt_quota;
+static unsigned int quota_list_perms = S_IRUGO | S_IWUSR;
+static kuid_t quota_list_uid = KUIDT_INIT(0);
+static kgid_t quota_list_gid = KGIDT_INIT(0);
+module_param_named(perms, quota_list_perms, uint, S_IRUGO | S_IWUSR);
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+static void quota2_log(unsigned int hooknum,
+                      const struct sk_buff *skb,
+                      const struct net_device *in,
+                      const struct net_device *out,
+                      const char *prefix)
+{
+       ulog_packet_msg_t *pm;
+       struct sk_buff *log_skb;
+       size_t size;
+       struct nlmsghdr *nlh;
+
+       if (!qlog_nl_event)
+               return;
+
+       size = NLMSG_SPACE(sizeof(*pm));
+       size = max(size, (size_t)NLMSG_GOODSIZE);
+       log_skb = alloc_skb(size, GFP_ATOMIC);
+       if (!log_skb) {
+               pr_err("xt_quota2: cannot alloc skb for logging\n");
+               return;
+       }
+
+       nlh = nlmsg_put(log_skb, /*pid*/0, /*seq*/0, qlog_nl_event,
+                       sizeof(*pm), 0);
+       if (!nlh) {
+               pr_err("xt_quota2: nlmsg_put failed\n");
+               kfree_skb(log_skb);
+               return;
+       }
+       pm = nlmsg_data(nlh);
+       /* Zero the whole message so that the fields which are not filled in
+        * below (mark, timestamps, MAC header) do not leak kernel memory. */
+       memset(pm, 0, sizeof(*pm));
+       if (skb->tstamp == 0)
+               __net_timestamp((struct sk_buff *)skb);
+       pm->data_len = 0;
+       pm->hook = hooknum;
+       if (prefix != NULL)
+               strlcpy(pm->prefix, prefix, sizeof(pm->prefix));
+       else
+               *(pm->prefix) = '\0';
+       if (in)
+               strlcpy(pm->indev_name, in->name, sizeof(pm->indev_name));
+       else
+               pm->indev_name[0] = '\0';
+
+       if (out)
+               strlcpy(pm->outdev_name, out->name, sizeof(pm->outdev_name));
+       else
+               pm->outdev_name[0] = '\0';
+
+       NETLINK_CB(log_skb).dst_group = 1;
+       pr_debug("throwing 1 packet to netlink group 1\n");
+       netlink_broadcast(nflognl, log_skb, 0, 1, GFP_ATOMIC);
+}
+#else
+static void quota2_log(unsigned int hooknum,
+                      const struct sk_buff *skb,
+                      const struct net_device *in,
+                      const struct net_device *out,
+                      const char *prefix)
+{
+}
+#endif  /* if+else CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG */
+
+static ssize_t quota_proc_read(struct file *file, char __user *buf,
+                          size_t size, loff_t *ppos)
+{
+       struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+       char tmp[24];
+       size_t tmp_size;
+
+       spin_lock_bh(&e->lock);
+       tmp_size = scnprintf(tmp, sizeof(tmp), "%llu\n", e->quota);
+       spin_unlock_bh(&e->lock);
+       return simple_read_from_buffer(buf, size, ppos, tmp, tmp_size);
+}
+
+static ssize_t quota_proc_write(struct file *file, const char __user *input,
+                            size_t size, loff_t *ppos)
+{
+       struct xt_quota_counter *e = PDE_DATA(file_inode(file));
+       char buf[sizeof("18446744073709551616")];
+
+       if (size > sizeof(buf) - 1)
+               size = sizeof(buf) - 1;
+       if (copy_from_user(buf, input, size) != 0)
+               return -EFAULT;
+       /* Terminate right after the user data so simple_strtoull() never
+        * parses uninitialised stack bytes. */
+       buf[size] = '\0';
+
+       spin_lock_bh(&e->lock);
+       e->quota = simple_strtoull(buf, NULL, 0);
+       spin_unlock_bh(&e->lock);
+       return size;
+}
+
+static const struct file_operations q2_counter_fops = {
+       .read           = quota_proc_read,
+       .write          = quota_proc_write,
+       .llseek         = default_llseek,
+};
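+/* Named counters are exposed through procfs: quota_mt2_init() creates the
+ * "xt_quota" directory under /proc/net and q2_get_counter() adds one file
+ * per counter name. Reading such a file returns the current quota value and
+ * writing a number resets it, e.g. (counter name here is illustrative only):
+ *
+ *   cat /proc/net/xt_quota/mycounter
+ *   echo 123456 > /proc/net/xt_quota/mycounter
+ */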
+
+static struct xt_quota_counter *
+q2_new_counter(const struct xt_quota_mtinfo2 *q, bool anon)
+{
+       struct xt_quota_counter *e;
+       unsigned int size;
+
+       /* Do not need all the procfs things for anonymous counters. */
+       size = anon ? offsetof(typeof(*e), list) : sizeof(*e);
+       e = kmalloc(size, GFP_KERNEL);
+       if (e == NULL)
+               return NULL;
+
+       e->quota = q->quota;
+       spin_lock_init(&e->lock);
+       if (!anon) {
+               INIT_LIST_HEAD(&e->list);
+               atomic_set(&e->ref, 1);
+               strlcpy(e->name, q->name, sizeof(e->name));
+       }
+       return e;
+}
+
+/**
+ * q2_get_counter - take a reference on an existing counter or create a new one
+ * @q: match info carrying the counter name
+ */
+static struct xt_quota_counter *
+q2_get_counter(const struct xt_quota_mtinfo2 *q)
+{
+       struct proc_dir_entry *p;
+       struct xt_quota_counter *e = NULL;
+       struct xt_quota_counter *new_e;
+
+       if (*q->name == '\0')
+               return q2_new_counter(q, true);
+
+       /* No need to hold a lock while getting a new counter */
+       new_e = q2_new_counter(q, false);
+       if (new_e == NULL)
+               goto out;
+
+       spin_lock_bh(&counter_list_lock);
+       list_for_each_entry(e, &counter_list, list)
+               if (strcmp(e->name, q->name) == 0) {
+                       atomic_inc(&e->ref);
+                       spin_unlock_bh(&counter_list_lock);
+                       kfree(new_e);
+                       pr_debug("xt_quota2: old counter name=%s", e->name);
+                       return e;
+               }
+       e = new_e;
+       pr_debug("xt_quota2: new_counter name=%s", e->name);
+       list_add_tail(&e->list, &counter_list);
+       /* An entry with a refcount of 1 is not directly destructible:
+        * this function has not yet returned the new entry, so iptables
+        * holds no reference it could use to destroy it. Before another
+        * rule could destroy it, this function would first have to be
+        * re-invoked and take a new reference on the same named quota.
+        * Nobody will access e->procfs_entry yet either, so it is safe
+        * to release the lock here. */
+       spin_unlock_bh(&counter_list_lock);
+
+       /* proc_create_data() may sleep, so it must not be called under the spinlock */
+       p = e->procfs_entry = proc_create_data(e->name, quota_list_perms,
+                             proc_xt_quota, &q2_counter_fops, e);
+
+       if (IS_ERR_OR_NULL(p)) {
+               spin_lock_bh(&counter_list_lock);
+               list_del(&e->list);
+               spin_unlock_bh(&counter_list_lock);
+               goto out;
+       }
+       proc_set_user(p, quota_list_uid, quota_list_gid);
+       return e;
+
+ out:
+       kfree(e);
+       return NULL;
+}
+
+static int quota_mt2_check(const struct xt_mtchk_param *par)
+{
+       struct xt_quota_mtinfo2 *q = par->matchinfo;
+
+       pr_debug("xt_quota2: check() flags=0x%04x", q->flags);
+
+       if (q->flags & ~XT_QUOTA_MASK)
+               return -EINVAL;
+
+       q->name[sizeof(q->name)-1] = '\0';
+       if (*q->name == '.' || strchr(q->name, '/') != NULL) {
+               printk(KERN_ERR "xt_quota.3: illegal name\n");
+               return -EINVAL;
+       }
+
+       q->master = q2_get_counter(q);
+       if (q->master == NULL) {
+               printk(KERN_ERR "xt_quota.3: memory alloc failure\n");
+               return -ENOMEM;
+       }
+
+       return 0;
+}
+
+static void quota_mt2_destroy(const struct xt_mtdtor_param *par)
+{
+       struct xt_quota_mtinfo2 *q = par->matchinfo;
+       struct xt_quota_counter *e = q->master;
+
+       if (*q->name == '\0') {
+               kfree(e);
+               return;
+       }
+
+       spin_lock_bh(&counter_list_lock);
+       if (!atomic_dec_and_test(&e->ref)) {
+               spin_unlock_bh(&counter_list_lock);
+               return;
+       }
+
+       list_del(&e->list);
+       spin_unlock_bh(&counter_list_lock);
+       /* remove_proc_entry() can sleep, so call it only after dropping the
+        * spinlock; the entry is already unlinked, so no new reference can
+        * be taken on it. */
+       remove_proc_entry(e->name, proc_xt_quota);
+       kfree(e);
+}
+
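+/* Match callback. In XT_QUOTA_GROW mode the counter counts upwards (bytes,
+ * or packets with XT_QUOTA_PACKET) and the rule always matches; in the
+ * default countdown mode the rule matches while quota remains and the
+ * counter is decremented, with XT_QUOTA_INVERT flipping that result.
+ * XT_QUOTA_NO_CHANGE suppresses the counter update in either mode. Once
+ * the quota is exhausted the transition is logged via quota2_log() and the
+ * counter is pinned at zero. */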
+static bool
+quota_mt2(const struct sk_buff *skb, struct xt_action_param *par)
+{
+       struct xt_quota_mtinfo2 *q = (void *)par->matchinfo;
+       struct xt_quota_counter *e = q->master;
+       bool ret = q->flags & XT_QUOTA_INVERT;
+
+       spin_lock_bh(&e->lock);
+       if (q->flags & XT_QUOTA_GROW) {
+               /*
+                * While no_change is pointless in "grow" mode, we will
+                * implement it here simply to have a consistent behavior.
+                */
+               if (!(q->flags & XT_QUOTA_NO_CHANGE)) {
+                       e->quota += (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+               }
+               ret = true;
+       } else {
+               if (e->quota >= skb->len) {
+                       if (!(q->flags & XT_QUOTA_NO_CHANGE))
+                               e->quota -= (q->flags & XT_QUOTA_PACKET) ? 1 : skb->len;
+                       ret = !ret;
+               } else {
+                       /* We are transitioning, log that fact. */
+                       if (e->quota) {
+                               quota2_log(xt_hooknum(par),
+                                          skb,
+                                          xt_in(par),
+                                          xt_out(par),
+                                          q->name);
+                       }
+                       /* we do not allow even small packets from now on */
+                       e->quota = 0;
+               }
+       }
+       spin_unlock_bh(&e->lock);
+       return ret;
+}
+
+static struct xt_match quota_mt2_reg[] __read_mostly = {
+       {
+               .name       = "quota2",
+               .revision   = 3,
+               .family     = NFPROTO_IPV4,
+               .checkentry = quota_mt2_check,
+               .match      = quota_mt2,
+               .destroy    = quota_mt2_destroy,
+               .matchsize  = sizeof(struct xt_quota_mtinfo2),
+               .me         = THIS_MODULE,
+       },
+       {
+               .name       = "quota2",
+               .revision   = 3,
+               .family     = NFPROTO_IPV6,
+               .checkentry = quota_mt2_check,
+               .match      = quota_mt2,
+               .destroy    = quota_mt2_destroy,
+               .matchsize  = sizeof(struct xt_quota_mtinfo2),
+               .me         = THIS_MODULE,
+       },
+};
+
+static int __init quota_mt2_init(void)
+{
+       int ret;
+       pr_debug("xt_quota2: init()");
+
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+       nflognl = netlink_kernel_create(&init_net, NETLINK_NFLOG, NULL);
+       if (!nflognl)
+               return -ENOMEM;
+#endif
+
+       proc_xt_quota = proc_mkdir("xt_quota", init_net.proc_net);
+       if (proc_xt_quota == NULL) {
+               ret = -EACCES;
+               goto err;
+       }
+
+       ret = xt_register_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+       if (ret < 0) {
+               remove_proc_entry("xt_quota", init_net.proc_net);
+               goto err;
+       }
+       pr_debug("xt_quota2: init() %d", ret);
+       return ret;
+
+ err:
+       /* Do not leak the NETLINK_NFLOG socket on the error paths. */
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+       netlink_kernel_release(nflognl);
+#endif
+       return ret;
+}
+
+static void __exit quota_mt2_exit(void)
+{
+       xt_unregister_matches(quota_mt2_reg, ARRAY_SIZE(quota_mt2_reg));
+       remove_proc_entry("xt_quota", init_net.proc_net);
+#ifdef CONFIG_NETFILTER_XT_MATCH_QUOTA2_LOG
+       netlink_kernel_release(nflognl);
+#endif
+}
+
+module_init(quota_mt2_init);
+module_exit(quota_mt2_exit);
+MODULE_DESCRIPTION("Xtables: countdown quota match; up counter");
+MODULE_AUTHOR("Sam Johnston <samj@samj.net>");
+MODULE_AUTHOR("Jan Engelhardt <jengelh@medozas.de>");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS("ipt_quota2");
+MODULE_ALIAS("ip6t_quota2");
index 575d2153e3b819f32e9a262abddca95a108eee02..cddd5cb810133128d47af44c96137da84012ec26 100644 (file)
@@ -79,8 +79,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par,
                    transparent && sk_fullsock(sk))
                        pskb->mark = sk->sk_mark;
 
-               if (sk != skb->sk)
-                       sock_gen_put(sk);
+               sock_gen_put(sk);
 
                if (wildcard || !transparent)
                        sk = NULL;
index b740fef0acc5ef5b6f81c633da4693675d2be417..6bf14f4f4b428e8283d19460331a7b9f26334b72 100644 (file)
@@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
                }
                create_info = (struct hci_create_pipe_resp *)skb->data;
 
+               if (create_info->pipe >= NFC_HCI_MAX_PIPES) {
+                       status = NFC_HCI_ANY_E_NOK;
+                       goto exit;
+               }
+
                /* Save the new created pipe and bind with local gate,
                 * the description for skb->data[3] is destination gate id
                 * but since we received this cmd from host controller, we
@@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd,
                }
                delete_info = (struct hci_delete_pipe_noti *)skb->data;
 
+               if (delete_info->pipe >= NFC_HCI_MAX_PIPES) {
+                       status = NFC_HCI_ANY_E_NOK;
+                       goto exit;
+               }
+
                hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE;
                hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST;
                break;
index 060600b03fad14bad59b83a25e9b28aee4bc7da8..7c33c8bb2cd90068d12954b5efca8cf6b8d74480 100644 (file)
@@ -10,6 +10,11 @@ menuconfig RFKILL
          To compile this driver as a module, choose M here: the
          module will be called rfkill.
 
+config RFKILL_PM
+       bool "Power off on suspend"
+       depends on RFKILL && PM
+       default y
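+       help
+         If enabled, the rfkill class registers suspend/resume power
+         management callbacks (rfkill_pm_ops) so that radio state is
+         handled across suspend. If unsure, say Y.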
+
 # LED trigger support
 config RFKILL_LEDS
        bool
index 2064c3a35ef84d4c54c065e8b54d524cdb407736..d223a86e9778c698fdee1fc8e3551022d7237d45 100644 (file)
@@ -854,8 +854,7 @@ void rfkill_resume_polling(struct rfkill *rfkill)
 }
 EXPORT_SYMBOL(rfkill_resume_polling);
 
-#ifdef CONFIG_PM_SLEEP
-static int rfkill_suspend(struct device *dev)
+static __maybe_unused int rfkill_suspend(struct device *dev)
 {
        struct rfkill *rfkill = to_rfkill(dev);
 
@@ -865,7 +864,7 @@ static int rfkill_suspend(struct device *dev)
        return 0;
 }
 
-static int rfkill_resume(struct device *dev)
+static __maybe_unused int rfkill_resume(struct device *dev)
 {
        struct rfkill *rfkill = to_rfkill(dev);
        bool cur;
@@ -885,17 +884,13 @@ static int rfkill_resume(struct device *dev)
 }
 
 static SIMPLE_DEV_PM_OPS(rfkill_pm_ops, rfkill_suspend, rfkill_resume);
-#define RFKILL_PM_OPS (&rfkill_pm_ops)
-#else
-#define RFKILL_PM_OPS NULL
-#endif
 
 static struct class rfkill_class = {
        .name           = "rfkill",
        .dev_release    = rfkill_release,
        .dev_groups     = rfkill_dev_groups,
        .dev_uevent     = rfkill_dev_uevent,
-       .pm             = RFKILL_PM_OPS,
+       .pm             = IS_ENABLED(CONFIG_RFKILL_PM) ? &rfkill_pm_ops : NULL,
 };
 
 bool rfkill_blocked(struct rfkill *rfkill)
index f6c5fe4825065ddb0bc99f882d36aa390129d540..1289cc14e9f922dd0de1ac27ff86745adc468117 100644 (file)
@@ -71,7 +71,7 @@ module_param(bss_entries_limit, int, 0644);
 MODULE_PARM_DESC(bss_entries_limit,
                  "limit to number of scan BSS entries (per wiphy, default 1000)");
 
-#define IEEE80211_SCAN_RESULT_EXPIRE   (30 * HZ)
+#define IEEE80211_SCAN_RESULT_EXPIRE   (7 * HZ)
 
 static void bss_free(struct cfg80211_internal_bss *bss)
 {
index 44ac85fe2bc9bfe72ca1e1e55399428c41a581fa..d0ca0dbf494e4ce98675235c0f8af698e6d6e602 100644 (file)
@@ -241,7 +241,7 @@ static struct xfrm_algo_desc aalg_list[] = {
 
        .uinfo = {
                .auth = {
-                       .icv_truncbits = 96,
+                       .icv_truncbits = 128,
                        .icv_fullbits = 256,
                }
        },
index 688ed34f0671f5f524331a207bf28bb758aa2c7d..524d2195069ff983317be54fefaaf660c824bbc1 100644 (file)
@@ -1252,7 +1252,7 @@ EXPORT_SYMBOL(xfrm_policy_delete);
 
 int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
 {
-       struct net *net = xp_net(pol);
+       struct net *net = sock_net(sk);
        struct xfrm_policy *old_pol;
 
 #ifdef CONFIG_XFRM_SUB_POLICY
index 1f5cee2269af4296bd41745adec063a4a04faa9f..ec0c738c4dfe98484f274158331041580e6f2459 100644 (file)
@@ -2049,6 +2049,13 @@ int xfrm_user_policy(struct sock *sk, int optname, u8 __user *optval, int optlen
        struct xfrm_mgr *km;
        struct xfrm_policy *pol = NULL;
 
+       if (!optval && !optlen) {
+               xfrm_sk_policy_insert(sk, XFRM_POLICY_IN, NULL);
+               xfrm_sk_policy_insert(sk, XFRM_POLICY_OUT, NULL);
+               __sk_dst_reset(sk);
+               return 0;
+       }
+
        if (optlen <= 0 || optlen > PAGE_SIZE)
                return -EMSGSIZE;
 
index e44a0fed48dd088ac95a0726a5f2b58b0f07c5bb..1429960cba214b1e38ac2bf7ca2f244b1c510651 100644 (file)
@@ -1734,6 +1734,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb,
        struct sk_buff *skb;
        int err;
 
+       err = verify_policy_dir(dir);
+       if (err)
+               return ERR_PTR(err);
+
        skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
        if (!skb)
                return ERR_PTR(-ENOMEM);
@@ -2255,6 +2259,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh,
        struct net *net = sock_net(skb->sk);
        struct xfrm_encap_tmpl  *encap = NULL;
 
+       err = verify_policy_dir(pi->dir);
+       if (err)
+               return err;
+
        if (attrs[XFRMA_MIGRATE] == NULL)
                return -EINVAL;
 
@@ -2388,6 +2396,11 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type,
 {
        struct net *net = &init_net;
        struct sk_buff *skb;
+       int err;
+
+       err = verify_policy_dir(dir);
+       if (err)
+               return err;
 
        skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k, !!encap),
                        GFP_ATOMIC);
@@ -3057,6 +3070,11 @@ out_free_skb:
 
 static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c)
 {
+       int err;
+
+       err = verify_policy_dir(dir);
+       if (err)
+               return err;
 
        switch (c->event) {
        case XFRM_MSG_NEWPOLICY:
index 808d09f27ad4063424211a8264297a0b3945d3c5..11f4067f03e8bd861628dc28932d478c055bc39d 100644 (file)
@@ -12,7 +12,7 @@ include scripts/Kbuild.include
 
 # The filename Kbuild has precedence over Makefile
 kbuild-dir := $(if $(filter /%,$(src)),$(src),$(srctree)/$(src))
-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
+-include $(if $(wildcard $(kbuild-dir)/Kbuild), $(kbuild-dir)/Kbuild, $(kbuild-dir)/Makefile)
 
 # Figure out what we need to build from the various variables
 # ==========================================================================
index 0b46136a91a80b51901f68bca1587b35a6f5f962..097849121963b90e6b0346013a0661dee959bea5 100644 (file)
@@ -318,6 +318,12 @@ $(obj)/%.dtb: $(src)/%.dts FORCE
 
 dtc-tmp = $(subst $(comma),_,$(dot-target).dts.tmp)
 
+# cat
+# ---------------------------------------------------------------------------
+# Concatenate multiple files together
+quiet_cmd_cat = CAT     $@
+cmd_cat = (cat $(filter-out FORCE,$^) > $@) || (rm -f $@; false)
+
 # Bzip2
 # ---------------------------------------------------------------------------
 
index 51ca0244fc8ac4f8e2981fb9dbc9df3efe507861..96524c6fa7a5b444cb9004426ae8870e704d1853 100644 (file)
@@ -30,7 +30,7 @@ quiet_cmd_modules_install = INSTALL $@
 INSTALL_MOD_DIR ?= extra
 ext-mod-dir = $(INSTALL_MOD_DIR)$(subst $(patsubst %/,%,$(KBUILD_EXTMOD)),,$(@D))
 
-modinst_dir = $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
+modinst_dir ?= $(if $(KBUILD_EXTMOD),$(ext-mod-dir),kernel/$(@D))
 
 $(modules):
        $(call cmd,modules_install,$(MODLIB)/$(modinst_dir))
index b5c2b5d0c6c0ebd1523ee3aa34fbf14dce97a448..1110a430bbe83cc0ae4d9202050a00106be8dd9f 100644 (file)
@@ -18,6 +18,15 @@ config SECURITY_DMESG_RESTRICT
 
          If you are unsure how to answer this question, answer N.
 
+config SECURITY_PERF_EVENTS_RESTRICT
+       bool "Restrict unprivileged use of performance events"
+       depends on PERF_EVENTS
+       help
+         If you say Y here, the kernel.perf_event_paranoid sysctl
+         will be set to 3 by default, and no unprivileged use of the
+         perf_event_open syscall will be permitted unless it is
+         changed.
+
 config SECURITY
        bool "Enable different security models"
        depends on SYSFS
index 7b01431d1e19752cf5ec3770c3338b973e104eaf..163e701f8d050fe66d206dda2d6e548452d484c1 100644 (file)
 #include <linux/binfmts.h>
 #include <linux/personality.h>
 
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+#include <linux/android_aid.h>
+#endif
+
 /*
  * If a non-root user executes a setuid-root binary in
  * !secure(SECURE_NOROOT) mode, then we raise capabilities.
@@ -54,7 +58,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
 }
 
 /**
- * cap_capable - Determine whether a task has a particular effective capability
+ * __cap_capable - Determine whether a task has a particular effective capability
  * @cred: The credentials to use
  * @ns:  The user namespace in which we need the capability
  * @cap: The capability to check for
@@ -68,7 +72,7 @@ static void warn_setuid_and_fcaps_mixed(const char *fname)
  * cap_has_capability() returns 0 when a task has a capability, but the
  * kernel's capable() and has_capability() returns 1 for this case.
  */
-int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
+int __cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
                int cap, int audit)
 {
        struct user_namespace *ns = targ_ns;
@@ -106,6 +110,27 @@ int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
        /* We never get here */
 }
 
+int cap_capable(const struct cred *cred, struct user_namespace *targ_ns,
+               int cap, int audit)
+{
+       int ret = __cap_capable(cred, targ_ns, cap, audit);
+
+#ifdef CONFIG_ANDROID_PARANOID_NETWORK
+       if (ret != 0 && cap == CAP_NET_RAW && in_egroup_p(AID_NET_RAW)) {
+               printk("Process %s granted CAP_NET_RAW from Android group net_raw.\n", current->comm);
+               printk("  Please update the .rc file to explicitly set 'capabilities NET_RAW'\n");
+               printk("  Implicit grants are deprecated and will be removed in the future.\n");
+               return 0;
+       }
+       if (ret != 0 && cap == CAP_NET_ADMIN && in_egroup_p(AID_NET_ADMIN)) {
+               printk("Process %s granted CAP_NET_ADMIN from Android group net_admin.\n", current->comm);
+               printk("  Please update the .rc file to explicitly set 'capabilities NET_ADMIN'\n");
+               printk("  Implicit grants are deprecated and will be removed in the future.\n");
+               return 0;
+       }
+#endif
+       return ret;
+}
 /**
  * cap_settime - Determine whether the current process may set the system clock
  * @ts: The time to set
index 8dd9ca8848e4361fae0e3c1d15593a41c53ff18a..bf2810936dfba670eaaced8c4b686e12767dc5a1 100644 (file)
@@ -122,7 +122,7 @@ static struct dentry *securityfs_create_dentry(const char *name, umode_t mode,
        dir = d_inode(parent);
 
        inode_lock(dir);
-       dentry = lookup_one_len(name, parent, strlen(name));
+       dentry = lookup_one_len2(name, mount, parent, strlen(name));
        if (IS_ERR(dentry))
                goto out;
 
index 4bf0f571b4ef94df1d3c44b7fed6b7b651c1924f..264a5e5a0595eda46e846ce1430b79f43eec008f 100644 (file)
@@ -12,6 +12,7 @@
  *     (at your option) any later version.
  */
 
+#include <linux/bpf.h>
 #include <linux/capability.h>
 #include <linux/dcache.h>
 #include <linux/module.h>
@@ -595,6 +596,7 @@ int security_path_chown(const struct path *path, kuid_t uid, kgid_t gid)
                return 0;
        return call_int_hook(path_chown, 0, path, uid, gid);
 }
+EXPORT_SYMBOL(security_path_chown);
 
 int security_path_chroot(const struct path *path)
 {
@@ -1703,3 +1705,34 @@ int security_audit_rule_match(u32 secid, u32 field, u32 op, void *lsmrule,
                                actx);
 }
 #endif /* CONFIG_AUDIT */
+
+#ifdef CONFIG_BPF_SYSCALL
+int security_bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+       return call_int_hook(bpf, 0, cmd, attr, size);
+}
+int security_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+       return call_int_hook(bpf_map, 0, map, fmode);
+}
+int security_bpf_prog(struct bpf_prog *prog)
+{
+       return call_int_hook(bpf_prog, 0, prog);
+}
+int security_bpf_map_alloc(struct bpf_map *map)
+{
+       return call_int_hook(bpf_map_alloc_security, 0, map);
+}
+int security_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+       return call_int_hook(bpf_prog_alloc_security, 0, aux);
+}
+void security_bpf_map_free(struct bpf_map *map)
+{
+       call_void_hook(bpf_map_free_security, map);
+}
+void security_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+       call_void_hook(bpf_prog_free_security, aux);
+}
+#endif /* CONFIG_BPF_SYSCALL */
index f5d304736852f98508144c23e7325068c6ae2ee8..2e3a627fc0b1f034a68469ccab6232cae6e41f5a 100644 (file)
@@ -85,6 +85,7 @@
 #include <linux/export.h>
 #include <linux/msg.h>
 #include <linux/shm.h>
+#include <linux/bpf.h>
 
 #include "avc.h"
 #include "objsec.h"
@@ -1814,6 +1815,10 @@ static inline int file_path_has_perm(const struct cred *cred,
        return inode_has_perm(cred, file_inode(file), av, &ad);
 }
 
+#ifdef CONFIG_BPF_SYSCALL
+static int bpf_fd_pass(struct file *file, u32 sid);
+#endif
+
 /* Check whether a task can use an open file descriptor to
    access an inode in a given way.  Check access to the
    descriptor itself, and then use dentry_has_perm to
@@ -1844,6 +1849,12 @@ static int file_has_perm(const struct cred *cred,
                        goto out;
        }
 
+#ifdef CONFIG_BPF_SYSCALL
+       rc = bpf_fd_pass(file, cred_sid(cred));
+       if (rc)
+               return rc;
+#endif
+
        /* av is zero if only checking access to the descriptor. */
        rc = 0;
        if (av)
@@ -2164,6 +2175,12 @@ static int selinux_binder_transfer_file(struct task_struct *from,
                        return rc;
        }
 
+#ifdef CONFIG_BPF_SYSCALL
+       rc = bpf_fd_pass(file, sid);
+       if (rc)
+               return rc;
+#endif
+
        if (unlikely(IS_PRIVATE(d_backing_inode(dentry))))
                return 0;
 
@@ -6252,6 +6269,139 @@ static void selinux_ib_free_security(void *ib_sec)
 }
 #endif
 
+#ifdef CONFIG_BPF_SYSCALL
+static int selinux_bpf(int cmd, union bpf_attr *attr,
+                                    unsigned int size)
+{
+       u32 sid = current_sid();
+       int ret;
+
+       switch (cmd) {
+       case BPF_MAP_CREATE:
+               ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__MAP_CREATE,
+                                  NULL);
+               break;
+       case BPF_PROG_LOAD:
+               ret = avc_has_perm(sid, sid, SECCLASS_BPF, BPF__PROG_LOAD,
+                                  NULL);
+               break;
+       default:
+               ret = 0;
+               break;
+       }
+
+       return ret;
+}
+
+static u32 bpf_map_fmode_to_av(fmode_t fmode)
+{
+       u32 av = 0;
+
+       if (fmode & FMODE_READ)
+               av |= BPF__MAP_READ;
+       if (fmode & FMODE_WRITE)
+               av |= BPF__MAP_WRITE;
+       return av;
+}
+
+/* This function checks a file passed through a unix socket or binder to see
+ * whether it is a bpf-related object, and applies the corresponding checks
+ * on the bpf object based on its type. Unlike other files and sockets, bpf
+ * maps and programs share an anonymous inode inside the kernel, so checking
+ * that inode cannot tell whether the process has the privilege to access the
+ * bpf object. That is why this additional check is needed in
+ * selinux_file_receive and selinux_binder_transfer_files.
+ */
+static int bpf_fd_pass(struct file *file, u32 sid)
+{
+       struct bpf_security_struct *bpfsec;
+       struct bpf_prog *prog;
+       struct bpf_map *map;
+       int ret;
+
+       if (file->f_op == &bpf_map_fops) {
+               map = file->private_data;
+               bpfsec = map->security;
+               ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+                                  bpf_map_fmode_to_av(file->f_mode), NULL);
+               if (ret)
+                       return ret;
+       } else if (file->f_op == &bpf_prog_fops) {
+               prog = file->private_data;
+               bpfsec = prog->aux->security;
+               ret = avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+                                  BPF__PROG_RUN, NULL);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int selinux_bpf_map(struct bpf_map *map, fmode_t fmode)
+{
+       u32 sid = current_sid();
+       struct bpf_security_struct *bpfsec;
+
+       bpfsec = map->security;
+       return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+                           bpf_map_fmode_to_av(fmode), NULL);
+}
+
+static int selinux_bpf_prog(struct bpf_prog *prog)
+{
+       u32 sid = current_sid();
+       struct bpf_security_struct *bpfsec;
+
+       bpfsec = prog->aux->security;
+       return avc_has_perm(sid, bpfsec->sid, SECCLASS_BPF,
+                           BPF__PROG_RUN, NULL);
+}
+
+static int selinux_bpf_map_alloc(struct bpf_map *map)
+{
+       struct bpf_security_struct *bpfsec;
+
+       bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+       if (!bpfsec)
+               return -ENOMEM;
+
+       bpfsec->sid = current_sid();
+       map->security = bpfsec;
+
+       return 0;
+}
+
+static void selinux_bpf_map_free(struct bpf_map *map)
+{
+       struct bpf_security_struct *bpfsec = map->security;
+
+       map->security = NULL;
+       kfree(bpfsec);
+}
+
+static int selinux_bpf_prog_alloc(struct bpf_prog_aux *aux)
+{
+       struct bpf_security_struct *bpfsec;
+
+       bpfsec = kzalloc(sizeof(*bpfsec), GFP_KERNEL);
+       if (!bpfsec)
+               return -ENOMEM;
+
+       bpfsec->sid = current_sid();
+       aux->security = bpfsec;
+
+       return 0;
+}
+
+static void selinux_bpf_prog_free(struct bpf_prog_aux *aux)
+{
+       struct bpf_security_struct *bpfsec = aux->security;
+
+       aux->security = NULL;
+       kfree(bpfsec);
+}
+#endif
+
 static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(binder_set_context_mgr, selinux_binder_set_context_mgr),
        LSM_HOOK_INIT(binder_transaction, selinux_binder_transaction),
@@ -6471,6 +6621,16 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = {
        LSM_HOOK_INIT(audit_rule_match, selinux_audit_rule_match),
        LSM_HOOK_INIT(audit_rule_free, selinux_audit_rule_free),
 #endif
+
+#ifdef CONFIG_BPF_SYSCALL
+       LSM_HOOK_INIT(bpf, selinux_bpf),
+       LSM_HOOK_INIT(bpf_map, selinux_bpf_map),
+       LSM_HOOK_INIT(bpf_prog, selinux_bpf_prog),
+       LSM_HOOK_INIT(bpf_map_alloc_security, selinux_bpf_map_alloc),
+       LSM_HOOK_INIT(bpf_prog_alloc_security, selinux_bpf_prog_alloc),
+       LSM_HOOK_INIT(bpf_map_free_security, selinux_bpf_map_free),
+       LSM_HOOK_INIT(bpf_prog_free_security, selinux_bpf_prog_free),
+#endif
 };
 
 static __init int selinux_init(void)
index cc35695d97b4a6682702a2405f1ca38a258d9f50..acdee7795297f3b8e5c8a53403ceea3a1124ee4e 100644 (file)
@@ -238,6 +238,8 @@ struct security_class_mapping secclass_map[] = {
          { "access", NULL } },
        { "infiniband_endport",
          { "manage_subnet", NULL } },
+       { "bpf",
+         {"map_create", "map_read", "map_write", "prog_load", "prog_run"} },
        { NULL }
   };
 
index 1649cd18eb0bed125bb38466506309d26c647f90..3d54468ce3342851169396813c48fc3a0428a813 100644 (file)
@@ -150,6 +150,10 @@ struct pkey_security_struct {
        u32     sid;    /* SID of pkey */
 };
 
+struct bpf_security_struct {
+       u32 sid;  /* SID of the bpf object creator */
+};
+
 extern unsigned int selinux_checkreqprot;
 
 #endif /* _SELINUX_OBJSEC_H_ */