Merge branch 'x86/urgent' into x86/xen

author Ingo Molnar <mingo@elte.hu>

Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)

committer Ingo Molnar <mingo@elte.hu>

Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)
author Ingo Molnar <mingo@elte.hu>
Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)
committer Ingo Molnar <mingo@elte.hu>
Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)
diff --combined arch/x86/include/asm/paravirt.h

index bc384be6aa44d404ca12ce7ec350bee248dd6df4,378e3691c08c54dd76e060eb468a00e8b61c59f9..1fe583783792800da28de628b5bc9dea69382b28
--- 1/arch/x86/include/asm/paravirt.h
--- 2/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@@ -56,7 -56,6 +56,7 @@@ struct desc_ptr
   struct tss_struct;
   struct mm_struct;
   struct desc_struct;
+ +struct task_struct;
   
   /*
    * Wrapper type for pointers to code which uses the non-standard
@@@ -204,8 -203,7 +204,8 @@@ struct pv_cpu_ops 
   
         void (*swapgs)(void);
   
- -      struct pv_lazy_ops lazy_mode;
+ +      void (*start_context_switch)(struct task_struct *prev);
+ +      void (*end_context_switch)(struct task_struct *next);
   };
   
   struct pv_irq_ops {
@@@ -349,7 -347,7 +349,7 @@@ struct pv_mmu_ops 
         /* Sometimes the physical address is a pfn, and sometimes it's
            an mfn.  We can tell which is which from the index. */
         void (*set_fixmap)(unsigned /* enum fixed_addresses */ idx,
-                          unsigned long phys, pgprot_t flags);
+                          phys_addr_t phys, pgprot_t flags);
   };
   
   struct raw_spinlock;
@@@ -1401,23 -1399,25 +1401,23 @@@ enum paravirt_lazy_mode 
   };
   
   enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
- -void paravirt_enter_lazy_cpu(void);
- -void paravirt_leave_lazy_cpu(void);
+ +void paravirt_start_context_switch(struct task_struct *prev);
+ +void paravirt_end_context_switch(struct task_struct *next);
+ +
   void paravirt_enter_lazy_mmu(void);
   void paravirt_leave_lazy_mmu(void);
- -void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
   
- -#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
- -static inline void arch_enter_lazy_cpu_mode(void)
+ +#define  __HAVE_ARCH_START_CONTEXT_SWITCH
+ +static inline void arch_start_context_switch(struct task_struct *prev)
   {
- -      PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+ +      PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
   }
   
- -static inline void arch_leave_lazy_cpu_mode(void)
+ +static inline void arch_end_context_switch(struct task_struct *next)
   {
- -      PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+ +      PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
   }
   
- -void arch_flush_lazy_cpu_mode(void);
- -
   #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
   static inline void arch_enter_lazy_mmu_mode(void)
   {
@@@ -1432,7 -1432,7 +1432,7 @@@ static inline void arch_leave_lazy_mmu_
   void arch_flush_lazy_mmu_mode(void);
   
   static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
-                               unsigned long phys, pgprot_t flags)
+                               phys_addr_t phys, pgprot_t flags)
   {
         pv_mmu_ops.set_fixmap(idx, phys, flags);
   }
diff --combined arch/x86/lguest/boot.c

index cfb2d68dc7959a6767696f509418f115a1307a07,ca7ec44bafc3b313aa1e3919339042f8ed53cf20..8f935c6d5512232dd44faf794da816313691f2c4
--- 1/arch/x86/lguest/boot.c
--- 2/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@@ -166,16 -166,10 +166,16 @@@ static void lazy_hcall3(unsigned long c
   
   /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
    * issue the do-nothing hypercall to flush any stored calls. */
- -static void lguest_leave_lazy_mode(void)
+ +static void lguest_leave_lazy_mmu_mode(void)
   {
- -      paravirt_leave_lazy(paravirt_get_lazy_mode());
         kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ +      paravirt_leave_lazy_mmu();
+ +}
+ +
+ +static void lguest_end_context_switch(struct task_struct *next)
+ +{
+ +      kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+ +      paravirt_end_context_switch(next);
   }
   
   /*G:033
@@@ -279,15 -273,15 +279,15 @@@ static void lguest_load_idt(const struc
    * controls the entire thing and the Guest asks it to make changes using the
    * LOAD_GDT hypercall.
    *
-  * This is the opposite of the IDT code where we have a LOAD_IDT_ENTRY
-  * hypercall and use that repeatedly to load a new IDT.  I don't think it
-  * really matters, but wouldn't it be nice if they were the same?  Wouldn't
-  * it be even better if you were the one to send the patch to fix it?
+  * This is exactly like the IDT code.
    */
   static void lguest_load_gdt(const struct desc_ptr *desc)
   {
-       BUG_ON((desc->size + 1) / 8 != GDT_ENTRIES);
-       kvm_hypercall2(LHCALL_LOAD_GDT, __pa(desc->address), GDT_ENTRIES);
+       unsigned int i;
+       struct desc_struct *gdt = (void *)desc->address;
+ 
+       for (i = 0; i < (desc->size+1)/8; i++)
+               kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b);
   }
   
   /* For a single GDT entry which changes, we do the lazy thing: alter our GDT,
@@@ -297,7 -291,9 +297,9 @@@ static void lguest_write_gdt_entry(stru
                                    const void *desc, int type)
   {
         native_write_gdt_entry(dt, entrynum, desc, type);
-       kvm_hypercall2(LHCALL_LOAD_GDT, __pa(dt), GDT_ENTRIES);
+       /* Tell Host about this new entry. */
+       kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum,
+                      dt[entrynum].a, dt[entrynum].b);
   }
   
   /* OK, I lied.  There are three "thread local storage" GDT entries which change
@@@ -667,7 -663,7 +669,7 @@@ static unsigned long lguest_tsc_khz(voi
   
   /* If we can't use the TSC, the kernel falls back to our lower-priority
    * "lguest_clock", where we read the time value given to us by the Host. */
- static cycle_t lguest_clock_read(void)
+ static cycle_t lguest_clock_read(struct clocksource *cs)
   {
         unsigned long sec, nsec;
   
@@@ -1057,8 -1053,8 +1059,8 @@@ __init void lguest_init(void
         pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
         pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
         pv_cpu_ops.wbinvd = lguest_wbinvd;
- -      pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
- -      pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ +      pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+ +      pv_cpu_ops.end_context_switch = lguest_end_context_switch;
   
         /* pagetable management */
         pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@@ -1071,7 -1067,7 +1073,7 @@@
         pv_mmu_ops.read_cr2 = lguest_read_cr2;
         pv_mmu_ops.read_cr3 = lguest_read_cr3;
         pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
- -      pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+ +      pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
         pv_mmu_ops.pte_update = lguest_pte_update;
         pv_mmu_ops.pte_update_defer = lguest_pte_update;
   
diff --combined arch/x86/mm/pageattr.c

index 660cac75ae11e489e2d443bee8d9a78d37eb8178,797f9f107cb6871a3797680c7dbed9430cb0f354..b81b41a0481f8a42924c78b036ab2630722fefeb
--- 1/arch/x86/mm/pageattr.c
--- 2/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@@ -844,6 -844,13 +844,6 @@@ static int change_page_attr_set_clr(uns
   
         vm_unmap_aliases();
   
- -      /*
- -       * If we're called with lazy mmu updates enabled, the
- -       * in-memory pte state may be stale.  Flush pending updates to
- -       * bring them up to date.
- -       */
- -      arch_flush_lazy_mmu_mode();
- -
         cpa.vaddr = addr;
         cpa.pages = pages;
         cpa.numpages = numpages;
@@@ -888,6 -895,13 +888,6 @@@
         } else
                 cpa_flush_all(cache);
   
- -      /*
- -       * If we've been called with lazy mmu updates enabled, then
- -       * make sure that everything gets flushed out before we
- -       * return.
- -       */
- -      arch_flush_lazy_mmu_mode();
- -
   out:
         return ret;
   }
@@@ -931,71 -945,94 +931,94 @@@ int _set_memory_uc(unsigned long addr, 
   
   int set_memory_uc(unsigned long addr, int numpages)
   {
+       int ret;
+ 
         /*
          * for now UC MINUS. see comments in ioremap_nocache()
          */
-       if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
-                           _PAGE_CACHE_UC_MINUS, NULL))
-               return -EINVAL;
+       ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+                           _PAGE_CACHE_UC_MINUS, NULL);
+       if (ret)
+               goto out_err;
+ 
+       ret = _set_memory_uc(addr, numpages);
+       if (ret)
+               goto out_free;
   
-       return _set_memory_uc(addr, numpages);
+       return 0;
+ 
+ out_free:
+       free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ out_err:
+       return ret;
   }
   EXPORT_SYMBOL(set_memory_uc);
   
   int set_memory_array_uc(unsigned long *addr, int addrinarray)
   {
-       unsigned long start;
-       unsigned long end;
-       int i;
+       int i, j;
+       int ret;
+ 
         /*
          * for now UC MINUS. see comments in ioremap_nocache()
          */
         for (i = 0; i < addrinarray; i++) {
-               start = __pa(addr[i]);
-               for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-                       if (end != __pa(addr[i + 1]))
-                               break;
-                       i++;
-               }
-               if (reserve_memtype(start, end, _PAGE_CACHE_UC_MINUS, NULL))
-                       goto out;
+               ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
+                                       _PAGE_CACHE_UC_MINUS, NULL);
+               if (ret)
+                       goto out_free;
         }
   
-       return change_page_attr_set(addr, addrinarray,
+       ret = change_page_attr_set(addr, addrinarray,
                                     __pgprot(_PAGE_CACHE_UC_MINUS), 1);
- out:
-       for (i = 0; i < addrinarray; i++) {
-               unsigned long tmp = __pa(addr[i]);
- 
-               if (tmp == start)
-                       break;
-               for (end = tmp + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-                       if (end != __pa(addr[i + 1]))
-                               break;
-                       i++;
-               }
-               free_memtype(tmp, end);
-       }
-       return -EINVAL;
+       if (ret)
+               goto out_free;
+ 
+       return 0;
+ 
+ out_free:
+       for (j = 0; j < i; j++)
+               free_memtype(__pa(addr[j]), __pa(addr[j]) + PAGE_SIZE);
+ 
+       return ret;
   }
   EXPORT_SYMBOL(set_memory_array_uc);
   
   int _set_memory_wc(unsigned long addr, int numpages)
   {
-       return change_page_attr_set(&addr, numpages,
+       int ret;
+       ret = change_page_attr_set(&addr, numpages,
+                                   __pgprot(_PAGE_CACHE_UC_MINUS), 0);
+ 
+       if (!ret) {
+               ret = change_page_attr_set(&addr, numpages,
                                     __pgprot(_PAGE_CACHE_WC), 0);
+       }
+       return ret;
   }
   
   int set_memory_wc(unsigned long addr, int numpages)
   {
+       int ret;
+ 
         if (!pat_enabled)
                 return set_memory_uc(addr, numpages);
   
-       if (reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
-               _PAGE_CACHE_WC, NULL))
-               return -EINVAL;
+       ret = reserve_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE,
+               _PAGE_CACHE_WC, NULL);
+       if (ret)
+               goto out_err;
+ 
+       ret = _set_memory_wc(addr, numpages);
+       if (ret)
+               goto out_free;
+ 
+       return 0;
   
-       return _set_memory_wc(addr, numpages);
+ out_free:
+       free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+ out_err:
+       return ret;
   }
   EXPORT_SYMBOL(set_memory_wc);
   
@@@ -1007,29 -1044,31 +1030,31 @@@ int _set_memory_wb(unsigned long addr, 
   
   int set_memory_wb(unsigned long addr, int numpages)
   {
-       free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+       int ret;
+ 
+       ret = _set_memory_wb(addr, numpages);
+       if (ret)
+               return ret;
   
-       return _set_memory_wb(addr, numpages);
+       free_memtype(__pa(addr), __pa(addr) + numpages * PAGE_SIZE);
+       return 0;
   }
   EXPORT_SYMBOL(set_memory_wb);
   
   int set_memory_array_wb(unsigned long *addr, int addrinarray)
   {
         int i;
+       int ret;
   
-       for (i = 0; i < addrinarray; i++) {
-               unsigned long start = __pa(addr[i]);
-               unsigned long end;
- 
-               for (end = start + PAGE_SIZE; i < addrinarray - 1; end += PAGE_SIZE) {
-                       if (end != __pa(addr[i + 1]))
-                               break;
-                       i++;
-               }
-               free_memtype(start, end);
-       }
-       return change_page_attr_clear(addr, addrinarray,
+       ret = change_page_attr_clear(addr, addrinarray,
                                       __pgprot(_PAGE_CACHE_MASK), 1);
+       if (ret)
+               return ret;
+ 
+       for (i = 0; i < addrinarray; i++)
+               free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
+ 
+       return 0;
   }
   EXPORT_SYMBOL(set_memory_array_wb);
   
@@@ -1122,6 -1161,8 +1147,8 @@@ int set_pages_array_wb(struct page **pa
   
         retval = cpa_clear_pages_array(pages, addrinarray,
                         __pgprot(_PAGE_CACHE_MASK));
+       if (retval)
+               return retval;
   
         for (i = 0; i < addrinarray; i++) {
                 start = (unsigned long)page_address(pages[i]);
@@@ -1129,7 -1170,7 +1156,7 @@@
                 free_memtype(start, end);
         }
   
-       return retval;
+       return 0;
   }
   EXPORT_SYMBOL(set_pages_array_wb);
   
diff --combined arch/x86/xen/mmu.c

index a96f5b9393ea9b8a0540660e9993723b5ff662be,e25a78e1113a11f8b2c057508697d48b3d99b2af..760e3a512059053041db5a9a0216ad5d4e203ca8
--- 1/arch/x86/xen/mmu.c
--- 2/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@@ -451,6 -451,10 +451,6 @@@ void set_pte_mfn(unsigned long vaddr, u
   void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                     pte_t *ptep, pte_t pteval)
   {
- -      /* updates to init_mm may be done without lock */
- -      if (mm == &init_mm)
- -              preempt_disable();
- -
         ADD_STATS(set_pte_at, 1);
   //    ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
         ADD_STATS(set_pte_at_current, mm == current->mm);
@@@ -471,7 -475,9 +471,7 @@@
         }
         xen_set_pte(ptep, pteval);
   
- -out:
- -      if (mm == &init_mm)
- -              preempt_enable();
+ +out:  return;
   }
   
   pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@@ -1145,8 -1151,10 +1145,8 @@@ static void drop_other_mm_ref(void *inf
   
         /* If this cpu still has a stale cr3 reference, then make sure
            it has been flushed. */
- -      if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+ +      if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
                 load_cr3(swapper_pg_dir);
- -              arch_flush_lazy_cpu_mode();
- -      }
   }
   
   static void xen_drop_mm_ref(struct mm_struct *mm)
@@@ -1159,6 -1167,7 +1159,6 @@@
                         load_cr3(swapper_pg_dir);
                 else
                         leave_mm(smp_processor_id());
- -              arch_flush_lazy_cpu_mode();
         }
   
         /* Get the "official" set of cpus referring to our pagetable. */
@@@ -1785,11 -1794,16 +1785,16 @@@ __init pgd_t *xen_setup_kernel_pagetabl
   
         pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir)));
   
+       reserve_early(__pa(xen_start_info->pt_base),
+                     __pa(xen_start_info->pt_base +
+                          xen_start_info->nr_pt_frames * PAGE_SIZE),
+                     "XEN PAGETABLES");
+ 
         return swapper_pg_dir;
   }
   #endif        /* CONFIG_X86_64 */
   
- static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot)
+ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
   {
         pte_t pte;
   
@@@ -1861,14 -1875,6 +1866,14 @@@ __init void xen_post_allocator_init(voi
         xen_mark_init_mm_pinned();
   }
   
+ +static void xen_leave_lazy_mmu(void)
+ +{
+ +      preempt_disable();
+ +      xen_mc_flush();
+ +      paravirt_leave_lazy_mmu();
+ +      preempt_enable();
+ +}
+ +
   const struct pv_mmu_ops xen_mmu_ops __initdata = {
         .pagetable_setup_start = xen_pagetable_setup_start,
         .pagetable_setup_done = xen_pagetable_setup_done,
@@@ -1942,7 -1948,7 +1947,7 @@@
   
         .lazy_mode = {
                 .enter = paravirt_enter_lazy_mmu,
- -              .leave = xen_leave_lazy,
+ +              .leave = xen_leave_lazy_mmu,
         },
   
         .set_fixmap = xen_set_fixmap,
diff --combined kernel/sched.c

index b38bd96098f62da8154a9edf048ca15f0c95d6b6,26efa475bdc143f6e4459067c18ce57e71608764..9e0fd1ef1a47425b1388d19af05b8d4b2bab1b7f
--- 1/kernel/sched.c
--- 2/kernel/sched.c
+++ b/kernel/sched.c
@@@ -1418,10 -1418,22 +1418,22 @@@ iter_move_one_task(struct rq *this_rq, 
                    struct rq_iterator *iterator);
   #endif
   
+ /* Time spent by the tasks of the cpu accounting group executing in ... */
+ enum cpuacct_stat_index {
+       CPUACCT_STAT_USER,      /* ... user mode */
+       CPUACCT_STAT_SYSTEM,    /* ... kernel mode */
+ 
+       CPUACCT_STAT_NSTATS,
+ };
+ 
   #ifdef CONFIG_CGROUP_CPUACCT
   static void cpuacct_charge(struct task_struct *tsk, u64 cputime);
+ static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val);
   #else
   static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
+ static inline void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val) {}
   #endif
   
   static inline void inc_cpu_load(struct rq *rq, unsigned long load)
@@@ -2754,7 -2766,7 +2766,7 @@@ context_switch(struct rq *rq, struct ta
          * combine the page table reload and the switch backend into
          * one hypercall.
          */
- -      arch_enter_lazy_cpu_mode();
+ +      arch_start_context_switch(prev);
   
         if (unlikely(!mm)) {
                 next->active_mm = oldmm;
@@@ -4511,9 -4523,25 +4523,25 @@@ DEFINE_PER_CPU(struct kernel_stat, ksta
   EXPORT_PER_CPU_SYMBOL(kstat);
   
   /*
-  * Return any ns on the sched_clock that have not yet been banked in
+  * Return any ns on the sched_clock that have not yet been accounted in
    * @p in case that task is currently running.
+  *
+  * Called with task_rq_lock() held on @rq.
    */
+ static u64 do_task_delta_exec(struct task_struct *p, struct rq *rq)
+ {
+       u64 ns = 0;
+ 
+       if (task_current(rq, p)) {
+               update_rq_clock(rq);
+               ns = rq->clock - p->se.exec_start;
+               if ((s64)ns < 0)
+                       ns = 0;
+       }
+ 
+       return ns;
+ }
+ 
   unsigned long long task_delta_exec(struct task_struct *p)
   {
         unsigned long flags;
@@@ -4521,16 -4549,49 +4549,49 @@@
         u64 ns = 0;
   
         rq = task_rq_lock(p, &flags);
+       ns = do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
   
-       if (task_current(rq, p)) {
-               u64 delta_exec;
+       return ns;
+ }
   
-               update_rq_clock(rq);
-               delta_exec = rq->clock - p->se.exec_start;
-               if ((s64)delta_exec > 0)
-                       ns = delta_exec;
-       }
+ /*
+  * Return accounted runtime for the task.
+  * In case the task is currently running, return the runtime plus current's
+  * pending runtime that has not been accounted yet.
+  */
+ unsigned long long task_sched_runtime(struct task_struct *p)
+ {
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns = 0;
+ 
+       rq = task_rq_lock(p, &flags);
+       ns = p->se.sum_exec_runtime + do_task_delta_exec(p, rq);
+       task_rq_unlock(rq, &flags);
+ 
+       return ns;
+ }
+ 
+ /*
+  * Return sum_exec_runtime for the thread group.
+  * In case the task is currently running, return the sum plus current's
+  * pending runtime that has not been accounted yet.
+  *
+  * Note that the thread group might have other running tasks as well,
+  * so the return value does not include other pending runtime that other
+  * running tasks might have.
+  */
+ unsigned long long thread_group_sched_runtime(struct task_struct *p)
+ {
+       struct task_cputime totals;
+       unsigned long flags;
+       struct rq *rq;
+       u64 ns;
   
+       rq = task_rq_lock(p, &flags);
+       thread_group_cputime(p, &totals);
+       ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
         task_rq_unlock(rq, &flags);
   
         return ns;
@@@ -4559,6 -4620,8 +4620,8 @@@ void account_user_time(struct task_stru
                 cpustat->nice = cputime64_add(cpustat->nice, tmp);
         else
                 cpustat->user = cputime64_add(cpustat->user, tmp);
+ 
+       cpuacct_update_stats(p, CPUACCT_STAT_USER, cputime);
         /* Account for user time used */
         acct_update_integrals(p);
   }
@@@ -4620,6 -4683,8 +4683,8 @@@ void account_system_time(struct task_st
         else
                 cpustat->system = cputime64_add(cpustat->system, tmp);
   
+       cpuacct_update_stats(p, CPUACCT_STAT_SYSTEM, cputime);
+ 
         /* Account for system time used */
         acct_update_integrals(p);
   }
@@@ -4667,7 -4732,7 +4732,7 @@@ void account_process_tick(struct task_s
   
         if (user_tick)
                 account_user_time(p, one_jiffy, one_jiffy_scaled);
-       else if (p != rq->idle)
+       else if ((p != rq->idle) || (irq_count() != HARDIRQ_OFFSET))
                 account_system_time(p, HARDIRQ_OFFSET, one_jiffy,
                                     one_jiffy_scaled);
         else
@@@ -4781,7 -4846,7 +4846,7 @@@ void scheduler_tick(void
   #endif
   }
   
- unsigned long get_parent_ip(unsigned long addr)
+ notrace unsigned long get_parent_ip(unsigned long addr)
   {
         if (in_lock_functions(addr)) {
                 addr = CALLER_ADDR2;
@@@ -7302,7 -7367,12 +7367,12 @@@ static int sched_domain_debug_one(struc
                 cpumask_or(groupmask, groupmask, sched_group_cpus(group));
   
                 cpulist_scnprintf(str, sizeof(str), sched_group_cpus(group));
+ 
                 printk(KERN_CONT " %s", str);
+               if (group->__cpu_power != SCHED_LOAD_SCALE) {
+                       printk(KERN_CONT " (__cpu_power = %d)",
+                               group->__cpu_power);
+               }
   
                 group = group->next;
         } while (group != sd->groups);
@@@ -9925,6 -9995,7 +9995,7 @@@ struct cpuacct 
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
         u64 *cpuusage;
+       struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
   };
   
@@@ -9949,20 -10020,32 +10020,32 @@@ static struct cgroup_subsys_state *cpua
         struct cgroup_subsys *ss, struct cgroup *cgrp)
   {
         struct cpuacct *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
+       int i;
   
         if (!ca)
-               return ERR_PTR(-ENOMEM);
+               goto out;
   
         ca->cpuusage = alloc_percpu(u64);
-       if (!ca->cpuusage) {
-               kfree(ca);
-               return ERR_PTR(-ENOMEM);
-       }
+       if (!ca->cpuusage)
+               goto out_free_ca;
+ 
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               if (percpu_counter_init(&ca->cpustat[i], 0))
+                       goto out_free_counters;
   
         if (cgrp->parent)
                 ca->parent = cgroup_ca(cgrp->parent);
   
         return &ca->css;
+ 
+ out_free_counters:
+       while (--i >= 0)
+               percpu_counter_destroy(&ca->cpustat[i]);
+       free_percpu(ca->cpuusage);
+ out_free_ca:
+       kfree(ca);
+ out:
+       return ERR_PTR(-ENOMEM);
   }
   
   /* destroy an existing cpu accounting group */
@@@ -9970,7 -10053,10 +10053,10 @@@ static voi
   cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
   {
         struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
   
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++)
+               percpu_counter_destroy(&ca->cpustat[i]);
         free_percpu(ca->cpuusage);
         kfree(ca);
   }
@@@ -10057,6 -10143,25 +10143,25 @@@ static int cpuacct_percpu_seq_read(stru
         return 0;
   }
   
+ static const char *cpuacct_stat_desc[] = {
+       [CPUACCT_STAT_USER] = "user",
+       [CPUACCT_STAT_SYSTEM] = "system",
+ };
+ 
+ static int cpuacct_stats_show(struct cgroup *cgrp, struct cftype *cft,
+               struct cgroup_map_cb *cb)
+ {
+       struct cpuacct *ca = cgroup_ca(cgrp);
+       int i;
+ 
+       for (i = 0; i < CPUACCT_STAT_NSTATS; i++) {
+               s64 val = percpu_counter_read(&ca->cpustat[i]);
+               val = cputime64_to_clock_t(val);
+               cb->fill(cb, cpuacct_stat_desc[i], val);
+       }
+       return 0;
+ }
+ 
   static struct cftype files[] = {
         {
                 .name = "usage",
@@@ -10067,7 -10172,10 +10172,10 @@@
                 .name = "usage_percpu",
                 .read_seq_string = cpuacct_percpu_seq_read,
         },
- 
+       {
+               .name = "stat",
+               .read_map = cpuacct_stats_show,
+       },
   };
   
   static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
@@@ -10089,12 -10197,38 +10197,38 @@@ static void cpuacct_charge(struct task_
                 return;
   
         cpu = task_cpu(tsk);
+ 
+       rcu_read_lock();
+ 
         ca = task_ca(tsk);
   
         for (; ca; ca = ca->parent) {
                 u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
                 *cpuusage += cputime;
         }
+ 
+       rcu_read_unlock();
+ }
+ 
+ /*
+  * Charge the system/user time to the task's accounting group.
+  */
+ static void cpuacct_update_stats(struct task_struct *tsk,
+               enum cpuacct_stat_index idx, cputime_t val)
+ {
+       struct cpuacct *ca;
+ 
+       if (unlikely(!cpuacct_subsys.active))
+               return;
+ 
+       rcu_read_lock();
+       ca = task_ca(tsk);
+ 
+       do {
+               percpu_counter_add(&ca->cpustat[idx], val);
+               ca = ca->parent;
+       } while (ca);
+       rcu_read_unlock();
   }
   
   struct cgroup_subsys cpuacct_subsys = {
author	Ingo Molnar <mingo@elte.hu>
	Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 8 May 2009 08:50:00 +0000 (10:50 +0200)
		1	2
arch/x86/include/asm/paravirt.h	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/lguest/boot.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/mm/pageattr.c	patch \|	diff1 \|	diff2 \|	blob \| history
arch/x86/xen/mmu.c	patch \|	diff1 \|	diff2 \|	blob \| history
kernel/sched.c	patch \|	diff1 \|	diff2 \|	blob \| history