Merge tag 'stable/for-linus-3.6-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel...

author Linus Torvalds <torvalds@linux-foundation.org>

Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)

committer Linus Torvalds <torvalds@linux-foundation.org>

Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)
author Linus Torvalds <torvalds@linux-foundation.org>
Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)
committer Linus Torvalds <torvalds@linux-foundation.org>
Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)
diff --combined arch/x86/kernel/cpu/mcheck/mce.c

index 9473e8772fd19bba99ac568a2d7afda3af86aa6f,aa7548799af4a36e71bc08f135ea7d3a32df9be8..5e095f873e3eb731a42012c5e10310fc238ed9c7
--- 1/arch/x86/kernel/cpu/mcheck/mce.c
--- 2/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@@ -7,9 -7,6 +7,9 @@@
    * Copyright 2008 Intel Corporation
    * Author: Andi Kleen
    */
+ +
+ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+ +
   #include <linux/thread_info.h>
   #include <linux/capability.h>
   #include <linux/miscdevice.h>
@@@ -60,8 -57,6 +60,6 @@@ static DEFINE_MUTEX(mce_chrdev_read_mut
   
   int mce_disabled __read_mostly;
   
- #define MISC_MCELOG_MINOR     227
- 
   #define SPINUNIT 100  /* 100ns */
   
   atomic_t mce_entry;
@@@ -213,7 -208,7 +211,7 @@@ static void drain_mcelog_buffer(void
                                 cpu_relax();
   
                                 if (!m->finished && retries >= 4) {
- -                                      pr_err("MCE: skipping error being logged currently!\n");
+ +                                      pr_err("skipping error being logged currently!\n");
                                         break;
                                 }
                         }
@@@ -1170,9 -1165,8 +1168,9 @@@ int memory_failure(unsigned long pfn, i
   {
         /* mce_severity() should not hand us an ACTION_REQUIRED error */
         BUG_ON(flags & MF_ACTION_REQUIRED);
- -      printk(KERN_ERR "Uncorrected memory error in page 0x%lx ignored\n"
- -              "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n", pfn);
+ +      pr_err("Uncorrected memory error in page 0x%lx ignored\n"
+ +             "Rebuild kernel with CONFIG_MEMORY_FAILURE=y for smarter handling\n",
+ +             pfn);
   
         return 0;
   }
@@@ -1190,7 -1184,6 +1188,7 @@@ void mce_notify_process(void
   {
         unsigned long pfn;
         struct mce_info *mi = mce_find_info();
+ +      int flags = MF_ACTION_REQUIRED;
   
         if (!mi)
                 mce_panic("Lost physical address for unconsumed uncorrectable error", NULL, NULL);
@@@ -1205,9 -1198,8 +1203,9 @@@
          * doomed. We still need to mark the page as poisoned and alert any
          * other users of the page.
          */
- -      if (memory_failure(pfn, MCE_VECTOR, MF_ACTION_REQUIRED) < 0 ||
- -                         mi->restartable == 0) {
+ +      if (!mi->restartable)
+ +              flags |= MF_MUST_KILL;
+ +      if (memory_failure(pfn, MCE_VECTOR, flags) < 0) {
                 pr_err("Memory error not recovered");
                 force_sig(SIGBUS, current);
         }
@@@ -1364,10 -1356,11 +1362,10 @@@ static int __cpuinit __mcheck_cpu_cap_i
   
         b = cap & MCG_BANKCNT_MASK;
         if (!banks)
- -              printk(KERN_INFO "mce: CPU supports %d MCE banks\n", b);
+ +              pr_info("CPU supports %d MCE banks\n", b);
   
         if (b > MAX_NR_BANKS) {
- -              printk(KERN_WARNING
- -                     "MCE: Using only %u machine check banks out of %u\n",
+ +              pr_warn("Using only %u machine check banks out of %u\n",
                         MAX_NR_BANKS, b);
                 b = MAX_NR_BANKS;
         }
@@@ -1424,7 -1417,7 +1422,7 @@@ static void __mcheck_cpu_init_generic(v
   static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
   {
         if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
- -              pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
+ +              pr_info("unknown CPU type - not enabling MCE support\n");
                 return -EOPNOTSUPP;
         }
   
@@@ -1579,7 -1572,7 +1577,7 @@@ static void __mcheck_cpu_init_timer(voi
   /* Handle unconfigured int18 (should never happen) */
   static void unexpected_machine_check(struct pt_regs *regs, long error_code)
   {
- -      printk(KERN_ERR "CPU#%d: Unexpected int18 (Machine Check).\n",
+ +      pr_err("CPU#%d: Unexpected int18 (Machine Check)\n",
                smp_processor_id());
   }
   
@@@ -1898,7 -1891,8 +1896,7 @@@ static int __init mcheck_enable(char *s
                         get_option(&str, &monarch_timeout);
                 }
         } else {
- -              printk(KERN_INFO "mce argument %s ignored. Please use /sys\n",
- -                     str);
+ +              pr_info("mce argument %s ignored. Please use /sys\n", str);
                 return 0;
         }
         return 1;
@@@ -2346,7 -2340,7 +2344,7 @@@ static __init int mcheck_init_device(vo
   
         return err;
   }
- device_initcall(mcheck_init_device);
+ device_initcall_sync(mcheck_init_device);
   
   /*
    * Old style boot options parsing. Only for compatibility.
diff --combined arch/x86/kernel/cpu/mcheck/mce_amd.c

index 671b95a2ffb5fd45381e59f1a3e1198f5a46e834,be52744904284c3360d55b2d1555497c14008fd1..c4e916d773780f4239ac831478c15aa2325c7382
--- 1/arch/x86/kernel/cpu/mcheck/mce_amd.c
--- 2/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@@ -1,17 -1,15 +1,17 @@@
   /*
- - *  (c) 2005, 2006 Advanced Micro Devices, Inc.
+ + *  (c) 2005-2012 Advanced Micro Devices, Inc.
    *  Your use of this code is subject to the terms and conditions of the
    *  GNU general public license version 2. See "COPYING" or
    *  http://www.gnu.org/licenses/gpl.html
    *
    *  Written by Jacob Shin - AMD, Inc.
    *
- - *  Support : jacob.shin@amd.com
+ + *  Support: borislav.petkov@amd.com
    *
    *  April 2006
    *     - added support for AMD Family 0x10 processors
+ + *  May 2012
+ + *     - major scrubbing
    *
    *  All MC4_MISCi registers are shared between multi-cores
    */
@@@ -27,7 -25,6 +27,7 @@@
   #include <linux/cpu.h>
   #include <linux/smp.h>
   
+ +#include <asm/amd_nb.h>
   #include <asm/apic.h>
   #include <asm/idle.h>
   #include <asm/mce.h>
@@@ -48,15 -45,23 +48,15 @@@
   #define MASK_BLKPTR_LO    0xFF000000
   #define MCG_XBLK_ADDR     0xC0000400
   
- -struct threshold_block {
- -      unsigned int            block;
- -      unsigned int            bank;
- -      unsigned int            cpu;
- -      u32                     address;
- -      u16                     interrupt_enable;
- -      bool                    interrupt_capable;
- -      u16                     threshold_limit;
- -      struct kobject          kobj;
- -      struct list_head        miscj;
+ +static const char * const th_names[] = {
+ +      "load_store",
+ +      "insn_fetch",
+ +      "combined_unit",
+ +      "",
+ +      "northbridge",
+ +      "execution_unit",
   };
   
- -struct threshold_bank {
- -      struct kobject          *kobj;
- -      struct threshold_block  *blocks;
- -      cpumask_var_t           cpus;
- -};
   static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks);
   
   static unsigned char shared_bank[NR_BANKS] = {
@@@ -79,26 -84,6 +79,26 @@@ struct thresh_restart 
         u16                     old_limit;
   };
   
+ +static const char * const bank4_names(struct threshold_block *b)
+ +{
+ +      switch (b->address) {
+ +      /* MSR4_MISC0 */
+ +      case 0x00000413:
+ +              return "dram";
+ +
+ +      case 0xc0000408:
+ +              return "ht_links";
+ +
+ +      case 0xc0000409:
+ +              return "l3_cache";
+ +
+ +      default:
+ +              WARN(1, "Funny MSR: 0x%08x\n", b->address);
+ +              return "";
+ +      }
+ +};
+ +
+ +
   static bool lvt_interrupt_supported(unsigned int bank, u32 msr_high_bits)
   {
         /*
@@@ -239,6 -224,8 +239,6 @@@ void mce_amd_feature_init(struct cpuinf
   
                         if (!block)
                                 per_cpu(bank_map, cpu) |= (1 << bank);
- -                      if (shared_bank[bank] && c->cpu_core_id)
- -                              break;
   
                         memset(&b, 0, sizeof(b));
                         b.cpu                   = cpu;
@@@ -339,7 -326,7 +339,7 @@@ struct threshold_attr 
   #define SHOW_FIELDS(name)                                             \
   static ssize_t show_ ## name(struct threshold_block *b, char *buf)    \
   {                                                                     \
- -      return sprintf(buf, "%lx\n", (unsigned long) b->name);          \
+ +      return sprintf(buf, "%lu\n", (unsigned long) b->name);          \
   }
   SHOW_FIELDS(interrupt_enable)
   SHOW_FIELDS(threshold_limit)
@@@ -390,21 -377,38 +390,21 @@@ store_threshold_limit(struct threshold_
         return size;
   }
   
- -struct threshold_block_cross_cpu {
- -      struct threshold_block  *tb;
- -      long                    retval;
- -};
- -
- -static void local_error_count_handler(void *_tbcc)
- -{
- -      struct threshold_block_cross_cpu *tbcc = _tbcc;
- -      struct threshold_block *b = tbcc->tb;
- -      u32 low, high;
- -
- -      rdmsr(b->address, low, high);
- -      tbcc->retval = (high & 0xFFF) - (THRESHOLD_MAX - b->threshold_limit);
- -}
- -
   static ssize_t show_error_count(struct threshold_block *b, char *buf)
   {
- -      struct threshold_block_cross_cpu tbcc = { .tb = b, };
+ +      u32 lo, hi;
   
- -      smp_call_function_single(b->cpu, local_error_count_handler, &tbcc, 1);
- -      return sprintf(buf, "%lx\n", tbcc.retval);
- -}
+ +      rdmsr_on_cpu(b->cpu, b->address, &lo, &hi);
   
- -static ssize_t store_error_count(struct threshold_block *b,
- -                               const char *buf, size_t count)
- -{
- -      struct thresh_restart tr = { .b = b, .reset = 1, .old_limit = 0 };
- -
- -      smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
- -      return 1;
+ +      return sprintf(buf, "%u\n", ((hi & THRESHOLD_MAX) -
+ +                                   (THRESHOLD_MAX - b->threshold_limit)));
   }
   
+ +static struct threshold_attr error_count = {
+ +      .attr = {.name = __stringify(error_count), .mode = 0444 },
+ +      .show = show_error_count,
+ +};
+ +
   #define RW_ATTR(val)                                                  \
   static struct threshold_attr val = {                                  \
         .attr   = {.name = __stringify(val), .mode = 0644 },            \
@@@ -414,6 -418,7 +414,6 @@@
   
   RW_ATTR(interrupt_enable);
   RW_ATTR(threshold_limit);
- -RW_ATTR(error_count);
   
   static struct attribute *default_attrs[] = {
         &threshold_limit.attr,
@@@ -512,7 -517,7 +512,7 @@@ static __cpuinit int allocate_threshold
   
         err = kobject_init_and_add(&b->kobj, &threshold_ktype,
                                    per_cpu(threshold_banks, cpu)[bank]->kobj,
- -                                 "misc%i", block);
+ +                                 (bank == 4 ? bank4_names(b) : th_names[bank]));
         if (err)
                 goto out_free;
   recurse:
@@@ -543,91 -548,98 +543,91 @@@ out_free
         return err;
   }
   
- -static __cpuinit long
- -local_allocate_threshold_blocks(int cpu, unsigned int bank)
+ +static __cpuinit int __threshold_add_blocks(struct threshold_bank *b)
   {
- -      return allocate_threshold_blocks(cpu, bank, 0,
- -                                       MSR_IA32_MC0_MISC + bank * 4);
+ +      struct list_head *head = &b->blocks->miscj;
+ +      struct threshold_block *pos = NULL;
+ +      struct threshold_block *tmp = NULL;
+ +      int err = 0;
+ +
+ +      err = kobject_add(&b->blocks->kobj, b->kobj, b->blocks->kobj.name);
+ +      if (err)
+ +              return err;
+ +
+ +      list_for_each_entry_safe(pos, tmp, head, miscj) {
+ +
+ +              err = kobject_add(&pos->kobj, b->kobj, pos->kobj.name);
+ +              if (err) {
+ +                      list_for_each_entry_safe_reverse(pos, tmp, head, miscj)
+ +                              kobject_del(&pos->kobj);
+ +
+ +                      return err;
+ +              }
+ +      }
+ +      return err;
   }
   
- -/* symlinks sibling shared banks to first core.  first core owns dir/files. */
   static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
   {
- -      int i, err = 0;
- -      struct threshold_bank *b = NULL;
         struct device *dev = per_cpu(mce_device, cpu);
- -      char name[32];
- -
- -      sprintf(name, "threshold_bank%i", bank);
+ +      struct amd_northbridge *nb = NULL;
+ +      struct threshold_bank *b = NULL;
+ +      const char *name = th_names[bank];
+ +      int err = 0;
   
- -#ifdef CONFIG_SMP
- -      if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) {   /* symlink */
- -              i = cpumask_first(cpu_llc_shared_mask(cpu));
+ +      if (shared_bank[bank]) {
   
- -              /* first core not up yet */
- -              if (cpu_data(i).cpu_core_id)
- -                      goto out;
+ +              nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ +              WARN_ON(!nb);
   
- -              /* already linked */
- -              if (per_cpu(threshold_banks, cpu)[bank])
- -                      goto out;
+ +              /* threshold descriptor already initialized on this node? */
+ +              if (nb->bank4) {
+ +                      /* yes, use it */
+ +                      b = nb->bank4;
+ +                      err = kobject_add(b->kobj, &dev->kobj, name);
+ +                      if (err)
+ +                              goto out;
   
- -              b = per_cpu(threshold_banks, i)[bank];
+ +                      per_cpu(threshold_banks, cpu)[bank] = b;
+ +                      atomic_inc(&b->cpus);
   
- -              if (!b)
- -                      goto out;
+ +                      err = __threshold_add_blocks(b);
   
- -              err = sysfs_create_link(&dev->kobj, b->kobj, name);
- -              if (err)
                         goto out;
- -
- -              cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
- -              per_cpu(threshold_banks, cpu)[bank] = b;
- -
- -              goto out;
+ +              }
         }
- -#endif
   
         b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL);
         if (!b) {
                 err = -ENOMEM;
                 goto out;
         }
- -      if (!zalloc_cpumask_var(&b->cpus, GFP_KERNEL)) {
- -              kfree(b);
- -              err = -ENOMEM;
- -              goto out;
- -      }
   
         b->kobj = kobject_create_and_add(name, &dev->kobj);
- -      if (!b->kobj)
+ +      if (!b->kobj) {
+ +              err = -EINVAL;
                 goto out_free;
- -
- -#ifndef CONFIG_SMP
- -      cpumask_setall(b->cpus);
- -#else
- -      cpumask_set_cpu(cpu, b->cpus);
- -#endif
+ +      }
   
         per_cpu(threshold_banks, cpu)[bank] = b;
   
- -      err = local_allocate_threshold_blocks(cpu, bank);
- -      if (err)
- -              goto out_free;
- -
- -      for_each_cpu(i, b->cpus) {
- -              if (i == cpu)
- -                      continue;
+ +      if (shared_bank[bank]) {
+ +              atomic_set(&b->cpus, 1);
   
- -              dev = per_cpu(mce_device, i);
- -              if (dev)
- -                      err = sysfs_create_link(&dev->kobj,b->kobj, name);
- -              if (err)
- -                      goto out;
- -
- -              per_cpu(threshold_banks, i)[bank] = b;
+ +              /* nb is already initialized, see above */
+ +              WARN_ON(nb->bank4);
+ +              nb->bank4 = b;
         }
   
- -      goto out;
+ +      err = allocate_threshold_blocks(cpu, bank, 0,
+ +                                      MSR_IA32_MC0_MISC + bank * 4);
+ +      if (!err)
+ +              goto out;
   
- -out_free:
- -      per_cpu(threshold_banks, cpu)[bank] = NULL;
- -      free_cpumask_var(b->cpus);
+ + out_free:
         kfree(b);
- -out:
+ +
+ + out:
         return err;
   }
   
@@@ -648,6 -660,12 +648,6 @@@ static __cpuinit int threshold_create_d
         return err;
   }
   
- -/*
- - * let's be hotplug friendly.
- - * in case of multiple core processors, the first core always takes ownership
- - *   of shared sysfs dir/files, and rest of the cores will be symlinked to it.
- - */
- -
   static void deallocate_threshold_block(unsigned int cpu,
                                                  unsigned int bank)
   {
@@@ -668,42 -686,41 +668,42 @@@
         per_cpu(threshold_banks, cpu)[bank]->blocks = NULL;
   }
   
+ +static void __threshold_remove_blocks(struct threshold_bank *b)
+ +{
+ +      struct threshold_block *pos = NULL;
+ +      struct threshold_block *tmp = NULL;
+ +
+ +      kobject_del(b->kobj);
+ +
+ +      list_for_each_entry_safe(pos, tmp, &b->blocks->miscj, miscj)
+ +              kobject_del(&pos->kobj);
+ +}
+ +
   static void threshold_remove_bank(unsigned int cpu, int bank)
   {
+ +      struct amd_northbridge *nb;
         struct threshold_bank *b;
- -      struct device *dev;
- -      char name[32];
- -      int i = 0;
   
         b = per_cpu(threshold_banks, cpu)[bank];
         if (!b)
                 return;
+ +
         if (!b->blocks)
                 goto free_out;
   
- -      sprintf(name, "threshold_bank%i", bank);
- -
- -#ifdef CONFIG_SMP
- -      /* sibling symlink */
- -      if (shared_bank[bank] && b->blocks->cpu != cpu) {
- -              dev = per_cpu(mce_device, cpu);
- -              sysfs_remove_link(&dev->kobj, name);
- -              per_cpu(threshold_banks, cpu)[bank] = NULL;
- -
- -              return;
- -      }
- -#endif
- -
- -      /* remove all sibling symlinks before unregistering */
- -      for_each_cpu(i, b->cpus) {
- -              if (i == cpu)
- -                      continue;
- -
- -              dev = per_cpu(mce_device, i);
- -              if (dev)
- -                      sysfs_remove_link(&dev->kobj, name);
- -              per_cpu(threshold_banks, i)[bank] = NULL;
+ +      if (shared_bank[bank]) {
+ +              if (!atomic_dec_and_test(&b->cpus)) {
+ +                      __threshold_remove_blocks(b);
+ +                      per_cpu(threshold_banks, cpu)[bank] = NULL;
+ +                      return;
+ +              } else {
+ +                      /*
+ +                       * the last CPU on this node using the shared bank is
+ +                       * going away, remove that bank now.
+ +                       */
+ +                      nb = node_to_amd_nb(amd_get_nb_id(cpu));
+ +                      nb->bank4 = NULL;
+ +              }
         }
   
         deallocate_threshold_block(cpu, bank);
@@@ -711,6 -728,7 +711,6 @@@
   free_out:
         kobject_del(b->kobj);
         kobject_put(b->kobj);
- -      free_cpumask_var(b->cpus);
         kfree(b);
         per_cpu(threshold_banks, cpu)[bank] = NULL;
   }
@@@ -759,4 -777,24 +759,24 @@@ static __init int threshold_init_device
   
         return 0;
   }
- device_initcall(threshold_init_device);
+ /*
+  * there are 3 funcs which need to be _initcalled in a logical sequence:
+  * 1. xen_late_init_mcelog
+  * 2. mcheck_init_device
+  * 3. threshold_init_device
+  *
+  * xen_late_init_mcelog must register xen_mce_chrdev_device before
+  * native mce_chrdev_device registration if running under xen platform;
+  *
+  * mcheck_init_device should be inited before threshold_init_device to
+  * initialize mce_device, otherwise a NULL ptr dereference will cause panic.
+  *
+  * so we use the following _initcalls
+  * 1. device_initcall(xen_late_init_mcelog);
+  * 2. device_initcall_sync(mcheck_init_device);
+  * 3. late_initcall(threshold_init_device);
+  *
+  * when running under xen, the initcall order is 1,2,3;
+  * on baremetal, we skip 1 and we do only 2 and 3.
+  */
+ late_initcall(threshold_init_device);
diff --combined arch/x86/xen/enlighten.c

index ed7d54985d0cb558b6e4177d2891a9609fa3e1e6,a6f8acbdfc9ac43b6c01296c0651abd612f84ed3..bf4bda6d3e9ad66f19af6e4669063a12739c78db
--- 1/arch/x86/xen/enlighten.c
--- 2/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@@ -31,6 -31,7 +31,7 @@@
   #include <linux/pci.h>
   #include <linux/gfp.h>
   #include <linux/memblock.h>
+ #include <linux/syscore_ops.h>
   
   #include <xen/xen.h>
   #include <xen/interface/xen.h>
@@@ -38,6 -39,7 +39,7 @@@
   #include <xen/interface/physdev.h>
   #include <xen/interface/vcpu.h>
   #include <xen/interface/memory.h>
+ #include <xen/interface/xen-mca.h>
   #include <xen/features.h>
   #include <xen/page.h>
   #include <xen/hvm.h>
@@@ -107,7 -109,7 +109,7 @@@ EXPORT_SYMBOL_GPL(xen_have_vector_callb
    * Point at some empty memory to start with. We map the real shared_info
    * page as soon as fixmap is up and running.
    */
- struct shared_info *HYPERVISOR_shared_info = (void *)&xen_dummy_shared_info;
+ struct shared_info *HYPERVISOR_shared_info = &xen_dummy_shared_info;
   
   /*
    * Flag to determine whether vcpu info placement is available on all
@@@ -124,6 -126,19 +126,19 @@@
    */
   static int have_vcpu_info_placement = 1;
   
+ struct tls_descs {
+       struct desc_struct desc[3];
+ };
+ 
+ /*
+  * Updating the 3 TLS descriptors in the GDT on every task switch is
+  * surprisingly expensive so we avoid updating them if they haven't
+  * changed.  Since Xen writes different descriptors than the one
+  * passed in the update_descriptor hypercall we keep shadow copies to
+  * compare against.
+  */
+ static DEFINE_PER_CPU(struct tls_descs, shadow_tls_desc);
+ 
   static void clamp_max_cpus(void)
   {
   #ifdef CONFIG_SMP
@@@ -341,9 -356,7 +356,7 @@@ static void __init xen_init_cpuid_mask(
         unsigned int xsave_mask;
   
         cpuid_leaf1_edx_mask =
-               ~((1 << X86_FEATURE_MCE)  |  /* disable MCE */
-                 (1 << X86_FEATURE_MCA)  |  /* disable MCA */
-                 (1 << X86_FEATURE_MTRR) |  /* disable MTRR */
+               ~((1 << X86_FEATURE_MTRR) |  /* disable MTRR */
                   (1 << X86_FEATURE_ACC));   /* thermal monitoring */
   
         if (!xen_initial_domain())
@@@ -540,12 -553,28 +553,28 @@@ static void __init xen_load_gdt_boot(co
                 BUG();
   }
   
+ static inline bool desc_equal(const struct desc_struct *d1,
+                             const struct desc_struct *d2)
+ {
+       return d1->a == d2->a && d1->b == d2->b;
+ }
+ 
   static void load_TLS_descriptor(struct thread_struct *t,
                                 unsigned int cpu, unsigned int i)
   {
-       struct desc_struct *gdt = get_cpu_gdt_table(cpu);
-       xmaddr_t maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
-       struct multicall_space mc = __xen_mc_entry(0);
+       struct desc_struct *shadow = &per_cpu(shadow_tls_desc, cpu).desc[i];
+       struct desc_struct *gdt;
+       xmaddr_t maddr;
+       struct multicall_space mc;
+ 
+       if (desc_equal(shadow, &t->tls_array[i]))
+               return;
+ 
+       *shadow = t->tls_array[i];
+ 
+       gdt = get_cpu_gdt_table(cpu);
+       maddr = arbitrary_virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN+i]);
+       mc = __xen_mc_entry(0);
   
         MULTI_update_descriptor(mc.mc, maddr.maddr, t->tls_array[i]);
   }
@@@ -627,8 -656,8 +656,8 @@@ static int cvt_gate_to_trap(int vector
         /*
          * Look for known traps using IST, and substitute them
          * appropriately.  The debugger ones are the only ones we care
-        * about.  Xen will handle faults like double_fault and
-        * machine_check, so we should never see them.  Warn if
+        * about.  Xen will handle faults like double_fault,
+        * so we should never see them.  Warn if
          * there's an unexpected IST-using fault handler.
          */
         if (addr == (unsigned long)debug)
@@@ -643,7 -672,11 +672,11 @@@
                 return 0;
   #ifdef CONFIG_X86_MCE
         } else if (addr == (unsigned long)machine_check) {
-               return 0;
+               /*
+                * when the Xen hypervisor injects a vMCE into the guest,
+                * use the native MCE handler to handle it
+                */
+               ;
   #endif
         } else {
                 /* Some other trap using IST? */
@@@ -1124,7 -1157,9 +1157,7 @@@ static const struct pv_cpu_ops xen_cpu_
         .wbinvd = native_wbinvd,
   
         .read_msr = native_read_msr_safe,
- -      .rdmsr_regs = native_rdmsr_safe_regs,
         .write_msr = xen_write_msr_safe,
- -      .wrmsr_regs = native_wrmsr_safe_regs,
   
         .read_tsc = native_read_tsc,
         .read_pmc = native_read_pmc,
@@@ -1437,64 -1472,155 +1470,155 @@@ asmlinkage void __init xen_start_kernel
   #endif
   }
   
- static int init_hvm_pv_info(int *major, int *minor)
- {
-       uint32_t eax, ebx, ecx, edx, pages, msr, base;
-       u64 pfn;
- 
-       base = xen_cpuid_base();
-       cpuid(base + 1, &eax, &ebx, &ecx, &edx);
- 
-       *major = eax >> 16;
-       *minor = eax & 0xffff;
-       printk(KERN_INFO "Xen version %d.%d.\n", *major, *minor);
- 
-       cpuid(base + 2, &pages, &msr, &ecx, &edx);
- 
-       pfn = __pa(hypercall_page);
-       wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
- 
-       xen_setup_features();
- 
-       pv_info.name = "Xen HVM";
- 
-       xen_domain_type = XEN_HVM_DOMAIN;
+ #ifdef CONFIG_XEN_PVHVM
+ /*
+  * The pfn containing the shared_info is located somewhere in RAM. This
+  * will cause trouble if the current kernel is doing a kexec boot into a
+  * new kernel. The new kernel (and its startup code) can not know where
+  * the pfn is, so it can not reserve the page. The hypervisor will
+  * continue to update the pfn, and as a result memory corruption occurs
+  * in the new kernel.
+  *
+  * One way to work around this issue is to allocate a page in the
+  * xen-platform pci device's BAR memory range. But pci init is done very
+  * late and the shared_info page is already in use very early to read
+  * the pvclock. So moving the pfn from RAM to MMIO is racy because some
+  * code paths on other vcpus could access the pfn during the small
+  * window when the old pfn is moved to the new pfn. There is even a
+  * small window where the old pfn is not backed by a mfn, and during that
+  * time all reads return -1.
+  *
+  * Because it is not known upfront where the MMIO region is located it
+  * can not be used right from the start in xen_hvm_init_shared_info.
+  *
+  * To minimise trouble the move of the pfn is done shortly before kexec.
+  * This does not eliminate the race because all vcpus are still online
+  * when the syscore_ops will be called. But hopefully there is no work
+  * pending at this point in time. Also the syscore_op is run last which
+  * reduces the risk further.
+  */
   
-       return 0;
- }
+ static struct shared_info *xen_hvm_shared_info;
   
- void __ref xen_hvm_init_shared_info(void)
+ static void xen_hvm_connect_shared_info(unsigned long pfn)
   {
-       int cpu;
         struct xen_add_to_physmap xatp;
-       static struct shared_info *shared_info_page = 0;
   
-       if (!shared_info_page)
-               shared_info_page = (struct shared_info *)
-                       extend_brk(PAGE_SIZE, PAGE_SIZE);
         xatp.domid = DOMID_SELF;
         xatp.idx = 0;
         xatp.space = XENMAPSPACE_shared_info;
-       xatp.gpfn = __pa(shared_info_page) >> PAGE_SHIFT;
+       xatp.gpfn = pfn;
         if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
                 BUG();
   
-       HYPERVISOR_shared_info = (struct shared_info *)shared_info_page;
+ }
+ static void xen_hvm_set_shared_info(struct shared_info *sip)
+ {
+       int cpu;
+ 
+       HYPERVISOR_shared_info = sip;
   
         /* xen_vcpu is a pointer to the vcpu_info struct in the shared_info
          * page, we use it in the event channel upcall and in some pvclock
          * related functions. We don't need the vcpu_info placement
          * optimizations because we don't use any pv_mmu or pv_irq op on
          * HVM.
-        * When xen_hvm_init_shared_info is run at boot time only vcpu 0 is
-        * online but xen_hvm_init_shared_info is run at resume time too and
+        * When xen_hvm_set_shared_info is run at boot time only vcpu 0 is
+        * online but xen_hvm_set_shared_info is run at resume time too and
          * in that case multiple vcpus might be online. */
         for_each_online_cpu(cpu) {
                 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
         }
   }
   
- #ifdef CONFIG_XEN_PVHVM
+ /* Reconnect the shared_info pfn to a mfn */
+ void xen_hvm_resume_shared_info(void)
+ {
+       xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+ }
+ 
+ #ifdef CONFIG_KEXEC
+ static struct shared_info *xen_hvm_shared_info_kexec;
+ static unsigned long xen_hvm_shared_info_pfn_kexec;
+ 
+ /* Remember a pfn in MMIO space for kexec reboot */
+ void __devinit xen_hvm_prepare_kexec(struct shared_info *sip, unsigned long pfn)
+ {
+       xen_hvm_shared_info_kexec = sip;
+       xen_hvm_shared_info_pfn_kexec = pfn;
+ }
+ 
+ static void xen_hvm_syscore_shutdown(void)
+ {
+       struct xen_memory_reservation reservation = {
+               .domid = DOMID_SELF,
+               .nr_extents = 1,
+       };
+       unsigned long prev_pfn;
+       int rc;
+ 
+       if (!xen_hvm_shared_info_kexec)
+               return;
+ 
+       prev_pfn = __pa(xen_hvm_shared_info) >> PAGE_SHIFT;
+       set_xen_guest_handle(reservation.extent_start, &prev_pfn);
+ 
+       /* Move pfn to MMIO, disconnects previous pfn from mfn */
+       xen_hvm_connect_shared_info(xen_hvm_shared_info_pfn_kexec);
+ 
+       /* Update pointers, following hypercall is also a memory barrier */
+       xen_hvm_set_shared_info(xen_hvm_shared_info_kexec);
+ 
+       /* Allocate new mfn for previous pfn */
+       do {
+               rc = HYPERVISOR_memory_op(XENMEM_populate_physmap, &reservation);
+               if (rc == 0)
+                       msleep(123);
+       } while (rc == 0);
+ 
+       /* Make sure the previous pfn is really connected to a (new) mfn */
+       BUG_ON(rc != 1);
+ }
+ 
+ static struct syscore_ops xen_hvm_syscore_ops = {
+       .shutdown = xen_hvm_syscore_shutdown,
+ };
+ #endif
+ 
+ /* Use a pfn in RAM, may move to MMIO before kexec. */
+ static void __init xen_hvm_init_shared_info(void)
+ {
+       /* Remember pointer for resume */
+       xen_hvm_shared_info = extend_brk(PAGE_SIZE, PAGE_SIZE);
+       xen_hvm_connect_shared_info(__pa(xen_hvm_shared_info) >> PAGE_SHIFT);
+       xen_hvm_set_shared_info(xen_hvm_shared_info);
+ }
+ 
+ static void __init init_hvm_pv_info(void)
+ {
+       int major, minor;
+       uint32_t eax, ebx, ecx, edx, pages, msr, base;
+       u64 pfn;
+ 
+       base = xen_cpuid_base();
+       cpuid(base + 1, &eax, &ebx, &ecx, &edx);
+ 
+       major = eax >> 16;
+       minor = eax & 0xffff;
+       printk(KERN_INFO "Xen version %d.%d.\n", major, minor);
+ 
+       cpuid(base + 2, &pages, &msr, &ecx, &edx);
+ 
+       pfn = __pa(hypercall_page);
+       wrmsr_safe(msr, (u32)pfn, (u32)(pfn >> 32));
+ 
+       xen_setup_features();
+ 
+       pv_info.name = "Xen HVM";
+ 
+       xen_domain_type = XEN_HVM_DOMAIN;
+ }
+ 
   static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
                                     unsigned long action, void *hcpu)
   {
@@@ -1517,14 -1643,12 +1641,12 @@@ static struct notifier_block xen_hvm_cp
   
   static void __init xen_hvm_guest_init(void)
   {
-       int r;
-       int major, minor;
- 
-       r = init_hvm_pv_info(&major, &minor);
-       if (r < 0)
-               return;
+       init_hvm_pv_info();
   
         xen_hvm_init_shared_info();
+ #ifdef CONFIG_KEXEC
+       register_syscore_ops(&xen_hvm_syscore_ops);
+ #endif
   
         if (xen_feature(XENFEAT_hvm_callback_vector))
                 xen_have_vector_callback = 1;
author	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Tue, 24 Jul 2012 20:14:03 +0000 (13:14 -0700)
		1	2
arch/x86/kernel/cpu/mcheck/mce.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/kernel/cpu/mcheck/mce_amd.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/enlighten.c	patch \|	diff1 \|	diff2 \|	blob \| history