Merge branch 'kvm-updates/2.6.31' of git://git.kernel.org/pub/scm/virt/kvm/kvm
authorLinus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jun 2009 17:03:30 +0000 (10:03 -0700)
committerLinus Torvalds <torvalds@linux-foundation.org>
Thu, 11 Jun 2009 17:03:30 +0000 (10:03 -0700)
* 'kvm-updates/2.6.31' of git://git.kernel.org/pub/scm/virt/kvm/kvm: (138 commits)
  KVM: Prevent overflow in largepages calculation
  KVM: Disable large pages on misaligned memory slots
  KVM: Add VT-x machine check support
  KVM: VMX: Rename rmode.active to rmode.vm86_active
  KVM: Move "exit due to NMI" handling into vmx_complete_interrupts()
  KVM: Disable CR8 intercept if tpr patching is active
  KVM: Do not migrate pending software interrupts.
  KVM: inject NMI after IRET from a previous NMI, not before.
  KVM: Always request IRQ/NMI window if an interrupt is pending
  KVM: Do not re-execute INTn instruction.
  KVM: skip_emulated_instruction() decode instruction if size is not known
  KVM: Remove irq_pending bitmap
  KVM: Do not allow interrupt injection from userspace if there is a pending event.
  KVM: Unprotect a page if #PF happens during NMI injection.
  KVM: s390: Verify memory in kvm run
  KVM: s390: Sanity check on validity intercept
  KVM: s390: Unlink vcpu on destroy - v2
  KVM: s390: optimize float int lock: spin_lock_bh --> spin_lock
  KVM: s390: use hrtimer for clock wakeup from idle - v2
  KVM: s390: Fix memory slot versus run - v3
  ...

609 files changed:
Documentation/ABI/testing/sysfs-devices-cache_disable [new file with mode: 0644]
Documentation/DMA-API.txt
Documentation/DocBook/Makefile
Documentation/DocBook/tracepoint.tmpl [new file with mode: 0644]
Documentation/RCU/trace.txt
Documentation/Smack.txt
Documentation/futex-requeue-pi.txt [new file with mode: 0644]
Documentation/kernel-parameters.txt
Documentation/memory-barriers.txt
Documentation/scheduler/sched-rt-group.txt
Documentation/sysctl/kernel.txt
Documentation/trace/events.txt [new file with mode: 0644]
Documentation/trace/ftrace.txt
Documentation/trace/power.txt [new file with mode: 0644]
Documentation/x86/boot.txt
Documentation/x86/x86_64/boot-options.txt
Documentation/x86/x86_64/mm.txt
MAINTAINERS
arch/alpha/kernel/sys_dp264.c
arch/alpha/kernel/sys_titan.c
arch/arm/common/gic.c
arch/arm/plat-mxc/include/mach/imx-uart.h
arch/cris/arch-v32/kernel/irq.c
arch/frv/Kconfig
arch/frv/include/asm/bitops.h
arch/frv/include/asm/elf.h
arch/frv/include/asm/pci.h
arch/frv/include/asm/ptrace.h
arch/frv/include/asm/syscall.h [new file with mode: 0644]
arch/frv/include/asm/thread_info.h
arch/frv/kernel/entry.S
arch/frv/kernel/ptrace.c
arch/frv/kernel/signal.c
arch/frv/kernel/uaccess.c
arch/frv/mb93090-mb00/pci-dma-nommu.c
arch/frv/mb93090-mb00/pci-dma.c
arch/ia64/hp/sim/hpsim_irq.c
arch/ia64/kernel/acpi.c
arch/ia64/kernel/iosapic.c
arch/ia64/kernel/msi_ia64.c
arch/ia64/sn/kernel/irq.c
arch/ia64/sn/kernel/msi_sn.c
arch/mips/cavium-octeon/octeon-irq.c
arch/mips/include/asm/irq.h
arch/mips/kernel/irq-gic.c
arch/mips/mti-malta/malta-smtc.c
arch/mips/sibyte/bcm1480/irq.c
arch/mips/sibyte/sb1250/irq.c
arch/mn10300/Kconfig
arch/mn10300/include/asm/elf.h
arch/mn10300/include/asm/processor.h
arch/mn10300/include/asm/ptrace.h
arch/mn10300/kernel/entry.S
arch/mn10300/kernel/ptrace.c
arch/mn10300/kernel/signal.c
arch/mn10300/mm/tlb-mn10300.S
arch/parisc/kernel/irq.c
arch/powerpc/platforms/pseries/xics.c
arch/powerpc/sysdev/mpic.c
arch/powerpc/sysdev/mpic.h
arch/sparc/include/asm/thread_info_64.h
arch/sparc/kernel/irq_64.c
arch/x86/Kbuild [new file with mode: 0644]
arch/x86/Kconfig
arch/x86/Kconfig.debug
arch/x86/Makefile
arch/x86/boot/.gitignore
arch/x86/boot/Makefile
arch/x86/boot/a20.c
arch/x86/boot/apm.c
arch/x86/boot/bioscall.S [new file with mode: 0644]
arch/x86/boot/boot.h
arch/x86/boot/compressed/.gitignore
arch/x86/boot/compressed/Makefile
arch/x86/boot/compressed/head_32.S
arch/x86/boot/compressed/head_64.S
arch/x86/boot/compressed/misc.c
arch/x86/boot/compressed/mkpiggy.c [new file with mode: 0644]
arch/x86/boot/compressed/vmlinux.lds.S [new file with mode: 0644]
arch/x86/boot/compressed/vmlinux.scr [deleted file]
arch/x86/boot/compressed/vmlinux_32.lds [deleted file]
arch/x86/boot/compressed/vmlinux_64.lds [deleted file]
arch/x86/boot/edd.c
arch/x86/boot/header.S
arch/x86/boot/main.c
arch/x86/boot/mca.c
arch/x86/boot/memory.c
arch/x86/boot/regs.c [new file with mode: 0644]
arch/x86/boot/setup.ld
arch/x86/boot/tty.c
arch/x86/boot/video-bios.c
arch/x86/boot/video-vesa.c
arch/x86/boot/video-vga.c
arch/x86/boot/video.c
arch/x86/boot/video.h
arch/x86/configs/i386_defconfig
arch/x86/configs/x86_64_defconfig
arch/x86/ia32/ia32entry.S
arch/x86/include/asm/alternative.h
arch/x86/include/asm/amd_iommu.h
arch/x86/include/asm/amd_iommu_types.h
arch/x86/include/asm/apic.h
arch/x86/include/asm/apicdef.h
arch/x86/include/asm/boot.h
arch/x86/include/asm/bootparam.h
arch/x86/include/asm/cpu_debug.h
arch/x86/include/asm/cpufeature.h
arch/x86/include/asm/ds.h
arch/x86/include/asm/hw_irq.h
arch/x86/include/asm/i387.h
arch/x86/include/asm/i8259.h
arch/x86/include/asm/io_apic.h
arch/x86/include/asm/iomap.h
arch/x86/include/asm/irq_remapping.h
arch/x86/include/asm/irq_vectors.h
arch/x86/include/asm/k8.h
arch/x86/include/asm/microcode.h
arch/x86/include/asm/mpspec.h
arch/x86/include/asm/msr-index.h
arch/x86/include/asm/nmi.h
arch/x86/include/asm/numa_64.h
arch/x86/include/asm/page_32_types.h
arch/x86/include/asm/page_64_types.h
arch/x86/include/asm/page_types.h
arch/x86/include/asm/paravirt.h
arch/x86/include/asm/pgtable.h
arch/x86/include/asm/pgtable_64.h
arch/x86/include/asm/pgtable_64_types.h
arch/x86/include/asm/pgtable_types.h
arch/x86/include/asm/processor.h
arch/x86/include/asm/ptrace.h
arch/x86/include/asm/required-features.h
arch/x86/include/asm/setup.h
arch/x86/include/asm/smp.h
arch/x86/include/asm/sparsemem.h
arch/x86/include/asm/syscalls.h
arch/x86/include/asm/termios.h
arch/x86/include/asm/thread_info.h
arch/x86/include/asm/tlbflush.h
arch/x86/include/asm/topology.h
arch/x86/include/asm/traps.h
arch/x86/include/asm/unistd_32.h
arch/x86/include/asm/unistd_64.h
arch/x86/include/asm/uv/uv_bau.h
arch/x86/include/asm/uv/uv_hub.h
arch/x86/kernel/Makefile
arch/x86/kernel/acpi/boot.c
arch/x86/kernel/acpi/realmode/Makefile
arch/x86/kernel/acpi/realmode/bioscall.S [new file with mode: 0644]
arch/x86/kernel/acpi/realmode/regs.c [new file with mode: 0644]
arch/x86/kernel/amd_iommu.c
arch/x86/kernel/amd_iommu_init.c
arch/x86/kernel/apic/apic.c
arch/x86/kernel/apic/apic_flat_64.c
arch/x86/kernel/apic/es7000_32.c
arch/x86/kernel/apic/io_apic.c
arch/x86/kernel/apic/nmi.c
arch/x86/kernel/apic/probe_32.c
arch/x86/kernel/apic/probe_64.c
arch/x86/kernel/apic/summit_32.c
arch/x86/kernel/apic/x2apic_cluster.c
arch/x86/kernel/apic/x2apic_uv_x.c
arch/x86/kernel/asm-offsets_32.c
arch/x86/kernel/asm-offsets_64.c
arch/x86/kernel/cpu/amd.c
arch/x86/kernel/cpu/common.c
arch/x86/kernel/cpu/cpu_debug.c
arch/x86/kernel/cpu/cpufreq/Kconfig
arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
arch/x86/kernel/cpu/intel.c
arch/x86/kernel/cpu/intel_cacheinfo.c
arch/x86/kernel/cpu/mcheck/mce_intel_64.c
arch/x86/kernel/cpu/mtrr/cleanup.c
arch/x86/kernel/cpu/mtrr/generic.c
arch/x86/kernel/cpu/mtrr/main.c
arch/x86/kernel/cpu/mtrr/mtrr.h
arch/x86/kernel/cpu/mtrr/state.c
arch/x86/kernel/ds.c
arch/x86/kernel/ds_selftest.c [new file with mode: 0644]
arch/x86/kernel/ds_selftest.h [new file with mode: 0644]
arch/x86/kernel/dumpstack.h
arch/x86/kernel/e820.c
arch/x86/kernel/early-quirks.c
arch/x86/kernel/entry_64.S
arch/x86/kernel/head_32.S
arch/x86/kernel/irq.c
arch/x86/kernel/irqinit.c [new file with mode: 0644]
arch/x86/kernel/irqinit_32.c [deleted file]
arch/x86/kernel/irqinit_64.c [deleted file]
arch/x86/kernel/kgdb.c
arch/x86/kernel/kvm.c
arch/x86/kernel/microcode_amd.c
arch/x86/kernel/microcode_core.c
arch/x86/kernel/microcode_intel.c
arch/x86/kernel/mpparse.c
arch/x86/kernel/paravirt.c
arch/x86/kernel/pci-calgary_64.c
arch/x86/kernel/pci-gart_64.c
arch/x86/kernel/pci-swiotlb.c
arch/x86/kernel/process.c
arch/x86/kernel/process_32.c
arch/x86/kernel/process_64.c
arch/x86/kernel/ptrace.c
arch/x86/kernel/quirks.c
arch/x86/kernel/reboot.c
arch/x86/kernel/setup.c
arch/x86/kernel/setup_percpu.c
arch/x86/kernel/smp.c
arch/x86/kernel/smpboot.c
arch/x86/kernel/stacktrace.c
arch/x86/kernel/syscall_table_32.S
arch/x86/kernel/tlb_uv.c
arch/x86/kernel/traps.c
arch/x86/kernel/tsc.c
arch/x86/kernel/tsc_sync.c
arch/x86/kernel/vm86_32.c
arch/x86/kernel/vmi_32.c
arch/x86/kernel/vmlinux.lds.S
arch/x86/kernel/vmlinux_32.lds.S [deleted file]
arch/x86/kernel/vmlinux_64.lds.S [deleted file]
arch/x86/kernel/vsyscall_64.c
arch/x86/lguest/boot.c
arch/x86/mm/dump_pagetables.c
arch/x86/mm/fault.c
arch/x86/mm/highmem_32.c
arch/x86/mm/init.c
arch/x86/mm/init_32.c
arch/x86/mm/init_64.c
arch/x86/mm/iomap_32.c
arch/x86/mm/kmmio.c
arch/x86/mm/memtest.c
arch/x86/mm/mmio-mod.c
arch/x86/mm/numa_64.c
arch/x86/mm/pageattr.c
arch/x86/mm/srat_64.c
arch/x86/oprofile/nmi_int.c
arch/x86/pci/irq.c
arch/x86/vdso/vma.c
arch/x86/xen/enlighten.c
arch/x86/xen/mmu.c
arch/x86/xen/setup.c
arch/x86/xen/xen-ops.h
block/blk-core.c
block/blk-sysfs.c
block/compat_ioctl.c
block/elevator.c
drivers/acpi/pci_irq.c
drivers/bluetooth/hci_ldisc.c
drivers/char/Kconfig
drivers/char/Makefile
drivers/char/bfin_jtag_comm.c [new file with mode: 0644]
drivers/char/cyclades.c
drivers/char/epca.c
drivers/char/hpet.c
drivers/char/ip2/i2lib.c
drivers/char/ip2/ip2main.c
drivers/char/isicom.c
drivers/char/istallion.c
drivers/char/mem.c
drivers/char/moxa.c
drivers/char/mxser.c
drivers/char/n_hdlc.c
drivers/char/n_tty.c
drivers/char/pcmcia/synclink_cs.c
drivers/char/pty.c
drivers/char/rocket.c
drivers/char/selection.c
drivers/char/stallion.c
drivers/char/synclink.c
drivers/char/synclink_gt.c
drivers/char/synclinkmp.c
drivers/char/tty_audit.c
drivers/char/tty_io.c
drivers/char/tty_ioctl.c
drivers/char/tty_ldisc.c
drivers/char/tty_port.c
drivers/ide/alim15x3.c
drivers/ide/ide-atapi.c
drivers/ide/ide-cd.c
drivers/ide/ide-cd.h
drivers/ide/ide-disk.c
drivers/ide/ide-dma.c
drivers/ide/ide-floppy.c
drivers/ide/ide-io.c
drivers/ide/ide-ioctls.c
drivers/ide/ide-park.c
drivers/ide/ide-pm.c
drivers/ide/ide-tape.c
drivers/ide/ide-taskfile.c
drivers/md/dm.c
drivers/parisc/iosapic.c
drivers/parport/parport_pc.c
drivers/pci/hotplug/ibmphp_core.c
drivers/pci/htirq.c
drivers/pci/intel-iommu.c
drivers/pci/intr_remapping.c
drivers/pnp/pnpacpi/rsparser.c
drivers/scsi/sg.c
drivers/serial/8250.c
drivers/serial/8250_pci.c
drivers/serial/Kconfig
drivers/serial/Makefile
drivers/serial/bfin_5xx.c
drivers/serial/bfin_sport_uart.c
drivers/serial/icom.c
drivers/serial/imx.c
drivers/serial/jsm/jsm.h
drivers/serial/jsm/jsm_tty.c
drivers/serial/timbuart.c [new file with mode: 0644]
drivers/serial/timbuart.h [new file with mode: 0644]
drivers/usb/class/cdc-acm.c
drivers/usb/class/cdc-acm.h
drivers/usb/serial/belkin_sa.c
drivers/usb/serial/ch341.c
drivers/usb/serial/console.c
drivers/usb/serial/cp210x.c
drivers/usb/serial/cyberjack.c
drivers/usb/serial/cypress_m8.c
drivers/usb/serial/digi_acceleport.c
drivers/usb/serial/empeg.c
drivers/usb/serial/ftdi_sio.c
drivers/usb/serial/garmin_gps.c
drivers/usb/serial/generic.c
drivers/usb/serial/io_edgeport.c
drivers/usb/serial/io_ti.c
drivers/usb/serial/ipaq.c
drivers/usb/serial/ipw.c
drivers/usb/serial/ir-usb.c
drivers/usb/serial/iuu_phoenix.c
drivers/usb/serial/keyspan.c
drivers/usb/serial/keyspan.h
drivers/usb/serial/keyspan_pda.c
drivers/usb/serial/kl5kusb105.c
drivers/usb/serial/kobil_sct.c
drivers/usb/serial/mct_u232.c
drivers/usb/serial/mos7720.c
drivers/usb/serial/mos7840.c
drivers/usb/serial/navman.c
drivers/usb/serial/omninet.c
drivers/usb/serial/opticon.c
drivers/usb/serial/option.c
drivers/usb/serial/oti6858.c
drivers/usb/serial/pl2303.c
drivers/usb/serial/sierra.c
drivers/usb/serial/spcp8x5.c
drivers/usb/serial/symbolserial.c
drivers/usb/serial/ti_usb_3410_5052.c
drivers/usb/serial/usb-serial.c
drivers/usb/serial/visor.c
drivers/usb/serial/whiteheat.c
drivers/xen/Kconfig
drivers/xen/Makefile
drivers/xen/events.c
drivers/xen/evtchn.c [new file with mode: 0644]
drivers/xen/manage.c
drivers/xen/sys-hypervisor.c [new file with mode: 0644]
drivers/xen/xenbus/xenbus_probe.c
drivers/xen/xenbus/xenbus_xs.c
drivers/xen/xenfs/super.c
fs/bio.c
fs/buffer.c
fs/cifs/CHANGES
fs/cifs/README
fs/cifs/cifs_spnego.c
fs/cifs/cifsacl.c
fs/cifs/cifsfs.c
fs/cifs/cifsfs.h
fs/cifs/cifsproto.h
fs/cifs/cifssmb.c
fs/cifs/connect.c
fs/cifs/file.c
fs/cifs/inode.c
fs/cifs/netmisc.c
fs/cifs/readdir.c
fs/compat.c
fs/devpts/inode.c
fs/exec.c
fs/ext2/super.c
fs/ext3/super.c
fs/ext4/Makefile
fs/ext4/balloc.c
fs/ext4/block_validity.c [new file with mode: 0644]
fs/ext4/dir.c
fs/ext4/ext4.h
fs/ext4/ext4_i.h [deleted file]
fs/ext4/ext4_sb.h [deleted file]
fs/ext4/extents.c
fs/ext4/group.h [deleted file]
fs/ext4/ialloc.c
fs/ext4/inode.c
fs/ext4/mballoc.c
fs/ext4/mballoc.h
fs/ext4/namei.c
fs/ext4/namei.h [deleted file]
fs/ext4/resize.c
fs/ext4/super.c
fs/hugetlbfs/inode.c
fs/ioctl.c
fs/jbd2/journal.c
fs/mpage.c
fs/namei.c
fs/nfsd/vfs.c
fs/proc/base.c
fs/proc/loadavg.c
include/Kbuild
include/asm-generic/pgtable.h
include/asm-generic/vmlinux.lds.h
include/linux/acpi.h
include/linux/blktrace_api.h
include/linux/compat.h
include/linux/cyclades.h
include/linux/dma-debug.h
include/linux/dmar.h
include/linux/ftrace.h
include/linux/ftrace_event.h [new file with mode: 0644]
include/linux/futex.h
include/linux/ide.h
include/linux/ima.h
include/linux/init_task.h
include/linux/interrupt.h
include/linux/irq.h
include/linux/kmemtrace.h [new file with mode: 0644]
include/linux/lsm_audit.h [new file with mode: 0644]
include/linux/magic.h
include/linux/mm.h
include/linux/mmiotrace.h
include/linux/module.h
include/linux/mutex.h
include/linux/pci_ids.h
include/linux/ptrace.h
include/linux/rational.h [new file with mode: 0644]
include/linux/rculist.h
include/linux/rcutree.h
include/linux/ring_buffer.h
include/linux/sched.h
include/linux/security.h
include/linux/serial.h
include/linux/serial_core.h
include/linux/signal.h
include/linux/slab_def.h
include/linux/slub_def.h
include/linux/spinlock_up.h
include/linux/swiotlb.h
include/linux/thread_info.h
include/linux/trace_seq.h [new file with mode: 0644]
include/linux/tracepoint.h
include/linux/tty.h
include/linux/tty_driver.h
include/linux/usb/serial.h
include/linux/wait.h
include/trace/block.h [deleted file]
include/trace/define_trace.h [new file with mode: 0644]
include/trace/events/block.h [new file with mode: 0644]
include/trace/events/irq.h [new file with mode: 0644]
include/trace/events/kmem.h [new file with mode: 0644]
include/trace/events/lockdep.h [new file with mode: 0644]
include/trace/events/sched.h [new file with mode: 0644]
include/trace/events/skb.h [new file with mode: 0644]
include/trace/events/workqueue.h [new file with mode: 0644]
include/trace/ftrace.h [new file with mode: 0644]
include/trace/irq.h [deleted file]
include/trace/irq_event_types.h [deleted file]
include/trace/kmemtrace.h [deleted file]
include/trace/lockdep.h [deleted file]
include/trace/lockdep_event_types.h [deleted file]
include/trace/sched.h [deleted file]
include/trace/sched_event_types.h [deleted file]
include/trace/skb.h [deleted file]
include/trace/trace_event_types.h [deleted file]
include/trace/trace_events.h [deleted file]
include/trace/workqueue.h [deleted file]
include/xen/Kbuild [new file with mode: 0644]
include/xen/events.h
include/xen/evtchn.h [new file with mode: 0644]
include/xen/interface/version.h
include/xen/xenbus.h
init/Kconfig
init/main.c
ipc/sem.c
ipc/shm.c
kernel/Makefile
kernel/compat.c
kernel/cred.c
kernel/exit.c
kernel/fork.c
kernel/futex.c
kernel/irq/Makefile
kernel/irq/chip.c
kernel/irq/handle.c
kernel/irq/internals.h
kernel/irq/manage.c
kernel/irq/migration.c
kernel/irq/numa_migrate.c
kernel/kthread.c
kernel/lockdep.c
kernel/module.c
kernel/mutex.c
kernel/ptrace.c
kernel/rcupreempt.c
kernel/rcutree.c
kernel/rcutree_trace.c
kernel/rtmutex.c
kernel/rtmutex_common.h
kernel/sched.c
kernel/sched_fair.c
kernel/sched_idletask.c
kernel/signal.c
kernel/softirq.c
kernel/sysctl.c
kernel/time/timekeeping.c
kernel/timer.c
kernel/trace/Kconfig
kernel/trace/Makefile
kernel/trace/blktrace.c
kernel/trace/events.c [deleted file]
kernel/trace/ftrace.c
kernel/trace/kmemtrace.c
kernel/trace/ring_buffer.c
kernel/trace/ring_buffer_benchmark.c [new file with mode: 0644]
kernel/trace/trace.c
kernel/trace/trace.h
kernel/trace/trace_boot.c
kernel/trace/trace_branch.c
kernel/trace/trace_event_profile.c
kernel/trace/trace_event_types.h
kernel/trace/trace_events.c
kernel/trace/trace_events_filter.c
kernel/trace/trace_events_stage_1.h [deleted file]
kernel/trace/trace_events_stage_2.h [deleted file]
kernel/trace/trace_events_stage_3.h [deleted file]
kernel/trace/trace_export.c
kernel/trace/trace_functions_graph.c
kernel/trace/trace_hw_branches.c
kernel/trace/trace_mmiotrace.c
kernel/trace/trace_output.c
kernel/trace/trace_output.h
kernel/trace/trace_power.c
kernel/trace/trace_printk.c
kernel/trace/trace_sched_switch.c
kernel/trace/trace_sched_wakeup.c
kernel/trace/trace_selftest.c
kernel/trace/trace_stack.c
kernel/trace/trace_stat.c
kernel/trace/trace_stat.h
kernel/trace/trace_sysprof.c
kernel/trace/trace_workqueue.c
kernel/wait.c
kernel/workqueue.c
lib/Kconfig
lib/Makefile
lib/dma-debug.c
lib/rational.c [new file with mode: 0644]
lib/swiotlb.c
lib/vsprintf.c
mm/Kconfig
mm/bounce.c
mm/mlock.c
mm/mmap.c
mm/nommu.c
mm/page_alloc.c
mm/percpu.c
mm/shmem.c
mm/slab.c
mm/slob.c
mm/slub.c
mm/util.c
net/core/drop_monitor.c
net/core/net-traces.c
net/core/skbuff.c
samples/Kconfig
samples/Makefile
samples/trace_events/Makefile [new file with mode: 0644]
samples/trace_events/trace-events-sample.c [new file with mode: 0644]
samples/trace_events/trace-events-sample.h [new file with mode: 0644]
scripts/Makefile.lib
scripts/bin_size [deleted file]
scripts/kernel-doc
scripts/recordmcount.pl
security/Kconfig
security/Makefile
security/commoncap.c
security/inode.c
security/integrity/ima/ima_audit.c
security/integrity/ima/ima_crypto.c
security/integrity/ima/ima_fs.c
security/integrity/ima/ima_iint.c
security/integrity/ima/ima_init.c
security/integrity/ima/ima_main.c
security/integrity/ima/ima_policy.c
security/lsm_audit.c [new file with mode: 0644]
security/root_plug.c
security/security.c
security/selinux/avc.c
security/selinux/hooks.c
security/selinux/include/security.h
security/selinux/nlmsgtab.c
security/selinux/selinuxfs.c
security/selinux/ss/services.c
security/smack/smack.h
security/smack/smack_access.c
security/smack/smack_lsm.c
security/smack/smackfs.c
security/tomoyo/common.c
security/tomoyo/common.h
security/tomoyo/domain.c
security/tomoyo/file.c
security/tomoyo/realpath.c
security/tomoyo/tomoyo.c
security/tomoyo/tomoyo.h

diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable
new file mode 100644 (file)
index 0000000..175bb4f
--- /dev/null
@@ -0,0 +1,18 @@
+What:      /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X
+Date:      August 2008
+KernelVersion: 2.6.27
+Contact:       mark.langsdorf@amd.com
+Description:   These files exist in every cpu's cache index directories.
+               There are currently 2 cache_disable_# files in each
+               directory.  Reading from these files on a supported
+               processor will return that cache disable index value
+               for that processor and node.  Writing to one of these
+               files will cause the specified cache index to be disabled.
+
+               Currently, only AMD Family 10h Processors support cache index
+               disable, and only for their L3 caches.  See the BIOS and
+               Kernel Developer's Guide at
+               http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf
+               for formatting information and other details on the
+               cache index disable.
+Users:    joachim.deguara@amd.com
index d9aa43d78bcc9fc92e582a62cadb18cd9b27f4f0..25fb8bcf32a276f280d44aa53a4f2712a14257f4 100644 (file)
@@ -704,12 +704,24 @@ this directory the following files can currently be found:
                                The current number of free dma_debug_entries
                                in the allocator.
 
+       dma-api/driver-filter
+                               You can write a name of a driver into this file
+                               to limit the debug output to requests from that
+                               particular driver. Write an empty string to
+                               that file to disable the filter and see
+                               all errors again.
+
 If you have this code compiled into your kernel it will be enabled by default.
 If you want to boot without the bookkeeping anyway you can provide
 'dma_debug=off' as a boot parameter. This will disable DMA-API debugging.
 Notice that you can not enable it again at runtime. You have to reboot to do
 so.
 
+If you want to see debug messages only for a special device driver you can
+specify the dma_debug_driver=<drivername> parameter. This will enable the
+driver filter at boot time. The debug code will only print errors for that
+driver afterwards. This filter can be disabled or changed later using debugfs.
+
 When the code disables itself at runtime this is most likely because it ran
 out of dma_debug_entries. These entries are preallocated at boot. The number
 of preallocated entries is defined per architecture. If it is too low for you
index b1eb661e6302a3e5aeae0a58d3cda169f46f66a2..9632444f6c6270b87894e2381f95985ed3126d7a 100644 (file)
@@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \
            gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \
            genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \
            mac80211.xml debugobjects.xml sh.xml regulator.xml \
-           alsa-driver-api.xml writing-an-alsa-driver.xml
+           alsa-driver-api.xml writing-an-alsa-driver.xml \
+           tracepoint.xml
 
 ###
 # The build process is as follows (targets):
diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl
new file mode 100644 (file)
index 0000000..b0756d0
--- /dev/null
@@ -0,0 +1,89 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"
+       "http://www.oasis-open.org/docbook/xml/4.1.2/docbookx.dtd" []>
+
+<book id="Tracepoints">
+ <bookinfo>
+  <title>The Linux Kernel Tracepoint API</title>
+
+  <authorgroup>
+   <author>
+    <firstname>Jason</firstname>
+    <surname>Baron</surname>
+    <affiliation>
+     <address>
+      <email>jbaron@redhat.com</email>
+     </address>
+    </affiliation>
+   </author>
+  </authorgroup>
+
+  <legalnotice>
+   <para>
+     This documentation is free software; you can redistribute
+     it and/or modify it under the terms of the GNU General Public
+     License as published by the Free Software Foundation; either
+     version 2 of the License, or (at your option) any later
+     version.
+   </para>
+
+   <para>
+     This program is distributed in the hope that it will be
+     useful, but WITHOUT ANY WARRANTY; without even the implied
+     warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+     See the GNU General Public License for more details.
+   </para>
+
+   <para>
+     You should have received a copy of the GNU General Public
+     License along with this program; if not, write to the Free
+     Software Foundation, Inc., 59 Temple Place, Suite 330, Boston,
+     MA 02111-1307 USA
+   </para>
+
+   <para>
+     For more details see the file COPYING in the source
+     distribution of Linux.
+   </para>
+  </legalnotice>
+ </bookinfo>
+
+ <toc></toc>
+  <chapter id="intro">
+   <title>Introduction</title>
+   <para>
+     Tracepoints are static probe points that are located in strategic points
+     throughout the kernel. 'Probes' register/unregister with tracepoints
+     via a callback mechanism. The 'probes' are strictly typed functions that
+     are passed a unique set of parameters defined by each tracepoint.
+   </para>
+
+   <para>
+     From this simple callback mechanism, 'probes' can be used to profile, debug,
+     and understand kernel behavior. There are a number of tools that provide a
+     framework for using 'probes'. These tools include Systemtap, ftrace, and
+     LTTng.
+   </para>
+
+   <para>
+     Tracepoints are defined in a number of header files via various macros. Thus,
+     the purpose of this document is to provide a clear accounting of the available
+     tracepoints. The intention is to understand not only what tracepoints are
+     available but also to understand where future tracepoints might be added.
+   </para>
+
+   <para>
+     The API presented has functions of the form:
+     <function>trace_tracepointname(function parameters)</function>. These are the
+     tracepoint callbacks that are found throughout the code. Registering and
+     unregistering probes with these callback sites is covered in the
+     <filename>Documentation/trace/*</filename> directory.
+   </para>
+  </chapter>
+
+  <chapter id="irq">
+   <title>IRQ</title>
+!Iinclude/trace/events/irq.h
+  </chapter>
+
+</book>
index 068848240a8bdf135ba706da5c3d01a43609397a..02cced183b2d63f81428b3d6a21064397c7a7533 100644 (file)
@@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy).
 The output of "cat rcu/rcudata" looks as follows:
 
 rcu:
-  0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10
-  1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10
-  2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10
-  3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10
-  4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10
-  5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10
-  6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10
-  7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10
+rcu:
+  0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10
+  1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10
+  2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10
+  3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10
+  4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10
+  5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10
+  6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10
+  7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10
 rcu_bh:
-  0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10
-  1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10
-  2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10
-  3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10
-  4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
-  6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10
-  7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10
+  0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10
+  1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10
+  2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10
+  4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
+  7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10
 
 The first section lists the rcu_data structures for rcu, the second for
 rcu_bh.  Each section has one line per CPU, or eight for this 8-CPU system.
@@ -253,12 +254,6 @@ o  "pqc" indicates which grace period the last-observed quiescent
 o      "qp" indicates that RCU still expects a quiescent state from
        this CPU.
 
-o      "rpfq" is the number of rcu_pending() calls on this CPU required
-       to induce this CPU to invoke force_quiescent_state().
-
-o      "rp" is low-order four hex digits of the count of how many times
-       rcu_pending() has been invoked on this CPU.
-
 o      "dt" is the current value of the dyntick counter that is incremented
        when entering or leaving dynticks idle state, either by the
        scheduler or by irq.  The number after the "/" is the interrupt
@@ -305,6 +300,9 @@ o   "b" is the batch limit for this CPU.  If more than this number
        of RCU callbacks is ready to invoke, then the remainder will
        be deferred.
 
+There is also an rcu/rcudata.csv file with the same information in
+comma-separated-variable spreadsheet format.
+
 
 The output of "cat rcu/rcugp" looks as follows:
 
@@ -411,3 +409,63 @@ o  Each element of the form "1/1 0:127 ^0" represents one struct
                For example, the first entry at the lowest level shows
                "^0", indicating that it corresponds to bit zero in
                the first entry at the middle level.
+
+
+The output of "cat rcu/rcu_pending" looks as follows:
+
+rcu:
+  0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741
+  1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792
+  2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629
+  3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723
+  4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110
+  5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456
+  6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834
+  7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888
+rcu_bh:
+  0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314
+  1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180
+  2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936
+  3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863
+  4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671
+  5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235
+  6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921
+  7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542
+
+As always, this is once again split into "rcu" and "rcu_bh" portions.
+The fields are as follows:
+
+o      "np" is the number of times that __rcu_pending() has been invoked
+       for the corresponding flavor of RCU.
+
+o      "qsp" is the number of times that the RCU was waiting for a
+       quiescent state from this CPU.
+
+o      "cbr" is the number of times that this CPU had RCU callbacks
+       that had passed through a grace period, and were thus ready
+       to be invoked.
+
+o      "cng" is the number of times that this CPU needed another
+       grace period while RCU was idle.
+
+o      "gpc" is the number of times that an old grace period had
+       completed, but this CPU was not yet aware of it.
+
+o      "gps" is the number of times that a new grace period had started,
+       but this CPU was not yet aware of it.
+
+o      "nf" is the number of times that this CPU suspected that the
+       current grace period had run for too long, and thus needed to
+       be forced.
+
+       Please note that "forcing" consists of sending resched IPIs
+       to holdout CPUs.  If that CPU really still is in an old RCU
+       read-side critical section, then we really do have to wait for it.
+       The assumption behind "forcing" is that the CPU is not still in
+       an old RCU read-side critical section, but has not yet responded
+       for some other reason.
+
+o      "nn" is the number of times that this CPU needed nothing.  Alert
+       readers will note that the rcu "nn" number for a given CPU very
+       closely matches the rcu_bh "np" number for that same CPU.  This
+       is due to short-circuit evaluation in rcu_pending().
index 629c92e99783ecb7934ee00cf42c4a9f8556004f..34614b4c708eba850acad3ac15f7e59add026f3e 100644 (file)
@@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything
 other than a letter or digit, are reserved for use by the Smack development
 team. Smack labels are unstructured, case sensitive, and the only operation
 ever performed on them is comparison for equality. Smack labels cannot
-contain unprintable characters or the "/" (slash) character. Smack labels
-cannot begin with a '-', which is reserved for special options.
+contain unprintable characters, the "/" (slash), the "\" (backslash), the "'"
+(quote) and '"' (double-quote) characters.
+Smack labels cannot begin with a '-', which is reserved for special options.
 
 There are some predefined labels:
 
@@ -523,3 +524,18 @@ Smack supports some mount options:
 
 These mount options apply to all file system types.
 
+Smack auditing
+
+If you want Smack auditing of security events, you need to set CONFIG_AUDIT
+in your kernel configuration.
+By default, all denied events will be audited. You can change this behavior by
+writing a single character to the /smack/logging file :
+0 : no logging
+1 : log denied (default)
+2 : log accepted
+3 : log denied & accepted
+
+Events are logged as 'key=value' pairs. For each event you will get at least
+the subject, the object, the rights requested, the action, the kernel function
+that triggered the event, plus other pairs depending on the type of event
+audited.
diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt
new file mode 100644 (file)
index 0000000..9dc1ff4
--- /dev/null
@@ -0,0 +1,131 @@
+Futex Requeue PI
+----------------
+
+Requeueing of tasks from a non-PI futex to a PI futex requires
+special handling in order to ensure the underlying rt_mutex is never
+left without an owner if it has waiters; doing so would break the PI
+boosting logic [see rt-mutex-design.txt].  For the purposes of
+brevity, this action will be referred to as "requeue_pi" throughout
+this document.  Priority inheritance is abbreviated throughout as
+"PI".
+
+Motivation
+----------
+
+Without requeue_pi, the glibc implementation of
+pthread_cond_broadcast() must resort to waking all the tasks waiting
+on a pthread_condvar and letting them try to sort out which task
+gets to run first in classic thundering-herd formation.  An ideal
+implementation would wake the highest-priority waiter, and leave the
+rest to the natural wakeup inherent in unlocking the mutex
+associated with the condvar.
+
+Consider the simplified glibc calls:
+
+/* caller must lock mutex */
+pthread_cond_wait(cond, mutex)
+{
+       lock(cond->__data.__lock);
+       unlock(mutex);
+       do {
+          unlock(cond->__data.__lock);
+          futex_wait(cond->__data.__futex);
+          lock(cond->__data.__lock);
+       } while(...)
+       unlock(cond->__data.__lock);
+       lock(mutex);
+}
+
+pthread_cond_broadcast(cond)
+{
+       lock(cond->__data.__lock);
+       unlock(cond->__data.__lock);
+       futex_requeue(cond->data.__futex, cond->mutex);
+}
+
+Once pthread_cond_broadcast() requeues the tasks, the cond->mutex
+has waiters. Note that pthread_cond_wait() attempts to lock the
+mutex only after it has returned to user space.  This will leave the
+underlying rt_mutex with waiters, and no owner, breaking the
+previously mentioned PI-boosting algorithms.
+
+In order to support PI-aware pthread_condvar's, the kernel needs to
+be able to requeue tasks to PI futexes.  This support implies that
+upon a successful futex_wait system call, the caller would return to
+user space already holding the PI futex.  The glibc implementation
+would be modified as follows:
+
+
+/* caller must lock mutex */
+pthread_cond_wait_pi(cond, mutex)
+{
+       lock(cond->__data.__lock);
+       unlock(mutex);
+       do {
+          unlock(cond->__data.__lock);
+          futex_wait_requeue_pi(cond->__data.__futex);
+          lock(cond->__data.__lock);
+       } while(...)
+       unlock(cond->__data.__lock);
+        /* the kernel acquired the mutex for us */
+}
+
+pthread_cond_broadcast_pi(cond)
+{
+       lock(cond->__data.__lock);
+       unlock(cond->__data.__lock);
+       futex_requeue_pi(cond->data.__futex, cond->mutex);
+}
+
+The actual glibc implementation will likely test for PI and make the
+necessary changes inside the existing calls rather than creating new
+calls for the PI cases.  Similar changes are needed for
+pthread_cond_timedwait() and pthread_cond_signal().
+
+Implementation
+--------------
+
+In order to ensure the rt_mutex has an owner if it has waiters, it
+is necessary for both the requeue code, as well as the waiting code,
+to be able to acquire the rt_mutex before returning to user space.
+The requeue code cannot simply wake the waiter and leave it to
+acquire the rt_mutex as it would open a race window between the
+requeue call returning to user space and the waiter waking and
+starting to run.  This is especially true in the uncontended case.
+
+The solution involves two new rt_mutex helper routines,
+rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which
+allow the requeue code to acquire an uncontended rt_mutex on behalf
+of the waiter and to enqueue the waiter on a contended rt_mutex.
+Two new system calls provide the kernel<->user interface to
+requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI.
+
+FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait()
+and pthread_cond_timedwait()) to block on the initial futex and wait
+to be requeued to a PI-aware futex.  The implementation is the
+result of a high-speed collision between futex_wait() and
+futex_lock_pi(), with some extra logic to check for the additional
+wake-up scenarios.
+
+FUTEX_CMP_REQUEUE_PI is called by the waker
+(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and
+possibly wake the waiting tasks. Internally, this system call is
+still handled by futex_requeue (by passing requeue_pi=1).  Before
+requeueing, futex_requeue() attempts to acquire the requeue target
+PI futex on behalf of the top waiter.  If it can, this waiter is
+woken.  futex_requeue() then proceeds to requeue the remaining
+nr_wake+nr_requeue tasks to the PI futex, calling
+rt_mutex_start_proxy_lock() prior to each requeue to prepare the
+task as a waiter on the underlying rt_mutex.  It is possible that
+the lock can be acquired at this stage as well; if so, the next
+waiter is woken to finish the acquisition of the lock.
+
+FUTEX_CMP_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but
+their sum is all that really matters.  futex_requeue() will wake or
+requeue up to nr_wake + nr_requeue tasks.  It will wake only as many
+tasks as it can acquire the lock for, which in the majority of cases
+should be 0 as good programming practice dictates that the caller of
+either pthread_cond_broadcast() or pthread_cond_signal() acquire the
+mutex prior to making the call. FUTEX_CMP_REQUEUE_PI requires that
+nr_wake=1.  nr_requeue should be INT_MAX for broadcast and 0 for
+signal.
index fd5cac013037defc68c3ef118ed70b476debee71..72d3bf08d79b769ec909ed8d37977bea41b6679f 100644 (file)
@@ -56,7 +56,6 @@ parameter is applicable:
        ISAPNP  ISA PnP code is enabled.
        ISDN    Appropriate ISDN support is enabled.
        JOY     Appropriate joystick support is enabled.
-       KMEMTRACE kmemtrace is enabled.
        LIBATA  Libata driver is enabled
        LP      Printer support is enabled.
        LOOP    Loopback device support is enabled.
@@ -329,11 +328,6 @@ and is between 256 and 4096 characters. It is defined in the file
                                    flushed before they will be reused, which
                                    is a lot of faster
 
-       amd_iommu_size= [HW,X86-64]
-                       Define the size of the aperture for the AMD IOMMU
-                       driver. Possible values are:
-                       '32M', '64M' (default), '128M', '256M', '512M', '1G'
-
        amijoy.map=     [HW,JOY] Amiga joystick support
                        Map of devices attached to JOY0DAT and JOY1DAT
                        Format: <a>,<b>
@@ -646,6 +640,13 @@ and is between 256 and 4096 characters. It is defined in the file
                        DMA-API debugging code disables itself because the
                        architectural default is too low.
 
+       dma_debug_driver=<driver_name>
+                       With this option the DMA-API debugging driver
+                       filter feature can be enabled at boot time. Just
+                       pass the driver to filter for as the parameter.
+                       The filter can be disabled or changed to another
+                       driver later using sysfs.
+
        dscc4.setup=    [NET]
 
        dtc3181e=       [HW,SCSI]
@@ -752,12 +753,25 @@ and is between 256 and 4096 characters. It is defined in the file
                        ia64_pal_cache_flush instead of SAL_CACHE_FLUSH.
 
        ftrace=[tracer]
-                       [ftrace] will set and start the specified tracer
+                       [FTRACE] will set and start the specified tracer
                        as early as possible in order to facilitate early
                        boot debugging.
 
        ftrace_dump_on_oops
-                       [ftrace] will dump the trace buffers on oops.
+                       [FTRACE] will dump the trace buffers on oops.
+
+       ftrace_filter=[function-list]
+                       [FTRACE] Limit the functions traced by the function
+                       tracer at boot up. function-list is a comma separated
+                       list of functions. This list can be changed at run
+                       time by the set_ftrace_filter file in the debugfs
+                       tracing directory. 
+
+       ftrace_notrace=[function-list]
+                       [FTRACE] Do not trace the functions specified in
+                       function-list. This list can be changed at run time
+                       by the set_ftrace_notrace file in the debugfs
+                       tracing directory.
 
        gamecon.map[2|3]=
                        [HW,JOY] Multisystem joystick and NES/SNES/PSX pad
@@ -914,6 +928,12 @@ and is between 256 and 4096 characters. It is defined in the file
                        Formt: { "sha1" | "md5" }
                        default: "sha1"
 
+       ima_tcb         [IMA]
+                       Load a policy which meets the needs of the Trusted
+                       Computing Base.  This means IMA will measure all
+                       programs exec'd, files mmap'd for exec, and all files
+                       opened for read by uid=0.
+
        in2000=         [HW,SCSI]
                        See header of drivers/scsi/in2000.c.
 
@@ -1054,15 +1074,6 @@ and is between 256 and 4096 characters. It is defined in the file
                        use the HighMem zone if it exists, and the Normal
                        zone if it does not.
 
-       kmemtrace.enable=       [KNL,KMEMTRACE] Format: { yes | no }
-                               Controls whether kmemtrace is enabled
-                               at boot-time.
-
-       kmemtrace.subbufs=n     [KNL,KMEMTRACE] Overrides the number of
-                       subbufs kmemtrace's relay channel has. Set this
-                       higher than default (KMEMTRACE_N_SUBBUFS in code) if
-                       you experience buffer overruns.
-
        kgdboc=         [HW] kgdb over consoles.
                        Requires a tty driver that supports console polling.
                        (only serial suported for now)
@@ -1575,6 +1586,9 @@ and is between 256 and 4096 characters. It is defined in the file
        noinitrd        [RAM] Tells the kernel not to load any configured
                        initial RAM disk.
 
+       nointremap      [X86-64, Intel-IOMMU] Do not enable interrupt
+                       remapping.
+
        nointroute      [IA-64]
 
        nojitter        [IA64] Disables jitter checking for ITC timers.
@@ -1660,6 +1674,14 @@ and is between 256 and 4096 characters. It is defined in the file
        oprofile.timer= [HW]
                        Use timer interrupt instead of performance counters
 
+       oprofile.cpu_type=      Force an oprofile cpu type
+                       This might be useful if you have an older oprofile
+                       userland or if you want common events.
+                       Format: { archperfmon }
+                       archperfmon: [X86] Force use of architectural
+                               perfmon on Intel CPUs instead of the
+                               CPU specific event set.
+
        osst=           [HW,SCSI] SCSI Tape Driver
                        Format: <buffer_size>,<write_threshold>
                        See also Documentation/scsi/st.txt.
index f5b7127f54acb6af1d9f997a40e099b60c0b7571..7f5809eddee62eaf524103eeda485f5e553d5a01 100644 (file)
@@ -31,6 +31,7 @@ Contents:
 
      - Locking functions.
      - Interrupt disabling functions.
+     - Sleep and wake-up functions.
      - Miscellaneous functions.
 
  (*) Inter-CPU locking barrier effects.
@@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some
 other means.
 
 
+SLEEP AND WAKE-UP FUNCTIONS
+---------------------------
+
+Sleeping and waking on an event flagged in global data can be viewed as an
+interaction between two pieces of data: the task state of the task waiting for
+the event and the global data used to indicate the event.  To make sure that
+these appear to happen in the right order, the primitives to begin the process
+of going to sleep, and the primitives to initiate a wake up imply certain
+barriers.
+
+Firstly, the sleeper normally follows something like this sequence of events:
+
+       for (;;) {
+               set_current_state(TASK_UNINTERRUPTIBLE);
+               if (event_indicated)
+                       break;
+               schedule();
+       }
+
+A general memory barrier is interpolated automatically by set_current_state()
+after it has altered the task state:
+
+       CPU 1
+       ===============================
+       set_current_state();
+         set_mb();
+           STORE current->state
+           <general barrier>
+       LOAD event_indicated
+
+set_current_state() may be wrapped by:
+
+       prepare_to_wait();
+       prepare_to_wait_exclusive();
+
+which therefore also imply a general memory barrier after setting the state.
+The whole sequence above is available in various canned forms, all of which
+interpolate the memory barrier in the right place:
+
+       wait_event();
+       wait_event_interruptible();
+       wait_event_interruptible_exclusive();
+       wait_event_interruptible_timeout();
+       wait_event_killable();
+       wait_event_timeout();
+       wait_on_bit();
+       wait_on_bit_lock();
+
+
+Secondly, code that performs a wake up normally follows something like this:
+
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+or:
+
+       event_indicated = 1;
+       wake_up_process(event_daemon);
+
+A write memory barrier is implied by wake_up() and co. if and only if they wake
+something up.  The barrier occurs before the task state is cleared, and so sits
+between the STORE to indicate the event and the STORE to set TASK_RUNNING:
+
+       CPU 1                           CPU 2
+       =============================== ===============================
+       set_current_state();            STORE event_indicated
+         set_mb();                     wake_up();
+           STORE current->state          <write barrier>
+           <general barrier>             STORE current->state
+       LOAD event_indicated
+
+The available waker functions include:
+
+       complete();
+       wake_up();
+       wake_up_all();
+       wake_up_bit();
+       wake_up_interruptible();
+       wake_up_interruptible_all();
+       wake_up_interruptible_nr();
+       wake_up_interruptible_poll();
+       wake_up_interruptible_sync();
+       wake_up_interruptible_sync_poll();
+       wake_up_locked();
+       wake_up_locked_poll();
+       wake_up_nr();
+       wake_up_poll();
+       wake_up_process();
+
+
+[!] Note that the memory barriers implied by the sleeper and the waker do _not_
+order multiple stores before the wake-up with respect to loads of those stored
+values after the sleeper has called set_current_state().  For instance, if the
+sleeper does:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated)
+               break;
+       __set_current_state(TASK_RUNNING);
+       do_something(my_data);
+
+and the waker does:
+
+       my_data = value;
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+there's no guarantee that the change to event_indicated will be perceived by
+the sleeper as coming after the change to my_data.  In such a circumstance, the
+code on both sides must interpolate its own memory barriers between the
+separate data accesses.  Thus the above sleeper ought to do:
+
+       set_current_state(TASK_INTERRUPTIBLE);
+       if (event_indicated) {
+               smp_rmb();
+               do_something(my_data);
+       }
+
+and the waker should do:
+
+       my_data = value;
+       smp_wmb();
+       event_indicated = 1;
+       wake_up(&event_wait_queue);
+
+
 MISCELLANEOUS FUNCTIONS
 -----------------------
 
@@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED?
 
 Under normal operation, memory operation reordering is generally not going to
 be a problem as a single-threaded linear piece of code will still appear to
-work correctly, even if it's in an SMP kernel.  There are, however, three
+work correctly, even if it's in an SMP kernel.  There are, however, four
 circumstances in which reordering definitely _could_ be a problem:
 
  (*) Interprocessor interaction.
index 5ba4d3fc625a424b341bfa5b5853473eaf80fe6a..1df7f9cdab0576227671703c6fec1fc780ec596b 100644 (file)
@@ -4,6 +4,7 @@
 CONTENTS
 ========
 
+0. WARNING
 1. Overview
   1.1 The problem
   1.2 The solution
@@ -14,6 +15,23 @@ CONTENTS
 3. Future plans
 
 
+0. WARNING
+==========
+
+ Fiddling with these settings can result in an unstable system; the knobs are
+ root only and assume root knows what he is doing.
+
+Most notable:
+
+ * very small values in sched_rt_period_us can result in an unstable
+   system when the period is smaller than either the available hrtimer
+   resolution, or the time it takes to handle the budget refresh itself.
+
+ * very small values in sched_rt_runtime_us can result in an unstable
+   system when the runtime is so small the system has difficulty making
+   forward progress (NOTE: the migration thread and kstopmachine both
+   are real-time processes).
+
 1. Overview
 ===========
 
@@ -169,7 +187,7 @@ get their allocated time.
 
 Implementing SCHED_EDF might take a while to complete. Priority Inheritance is
 the biggest challenge as the current linux PI infrastructure is geared towards
-the limited static priority levels 0-139. With deadline scheduling you need to
+the limited static priority levels 0-99. With deadline scheduling you need to
 do deadline inheritance (since priority is inversely proportional to the
 deadline delta (deadline - now).
 
index f11ca7979fa67b5bbf08339cf473d110ecf92f7d..322a00bb99d97f703130de7f349f2dbc9b6fa24e 100644 (file)
@@ -32,6 +32,7 @@ show up in /proc/sys/kernel:
 - kstack_depth_to_print       [ X86 only ]
 - l2cr                        [ PPC only ]
 - modprobe                    ==> Documentation/debugging-modules.txt
+- modules_disabled
 - msgmax
 - msgmnb
 - msgmni
@@ -184,6 +185,16 @@ kernel stack.
 
 ==============================================================
 
+modules_disabled:
+
+A toggle value indicating if modules are allowed to be loaded
+in an otherwise modular kernel.  This toggle defaults to off
+(0), but can be set true (1).  Once true, modules can be
+neither loaded nor unloaded, and the toggle cannot be set back
+to false.
+
+==============================================================
+
 osrelease, ostype & version:
 
 # cat osrelease
diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt
new file mode 100644 (file)
index 0000000..f157d75
--- /dev/null
@@ -0,0 +1,90 @@
+                            Event Tracing
+
+               Documentation written by Theodore Ts'o
+                       Updated by Li Zefan
+
+1. Introduction
+===============
+
+Tracepoints (see Documentation/trace/tracepoints.txt) can be used
+without creating custom kernel modules to register probe functions
+using the event tracing infrastructure.
+
+Not all tracepoints can be traced using the event tracing system;
+the kernel developer must provide code snippets which define how the
+tracing information is saved into the tracing buffer, and how the
+tracing information should be printed.
+
+2. Using Event Tracing
+======================
+
+2.1 Via the 'set_event' interface
+---------------------------------
+
+The events which are available for tracing can be found in the file
+/debug/tracing/available_events.
+
+To enable a particular event, such as 'sched_wakeup', simply echo it
+to /debug/tracing/set_event. For example:
+
+       # echo sched_wakeup >> /debug/tracing/set_event
+
+[ Note: '>>' is necessary, otherwise it will firstly disable
+  all the events. ]
+
+To disable an event, echo the event name to the set_event file prefixed
+with an exclamation point:
+
+       # echo '!sched_wakeup' >> /debug/tracing/set_event
+
+To disable all events, echo an empty line to the set_event file:
+
+       # echo > /debug/tracing/set_event
+
+To enable all events, echo '*:*' or '*:' to the set_event file:
+
+       # echo *:* > /debug/tracing/set_event
+
+The events are organized into subsystems, such as ext4, irq, sched,
+etc., and a full event name looks like this: <subsystem>:<event>.  The
+subsystem name is optional, but it is displayed in the available_events
+file.  All of the events in a subsystem can be specified via the syntax
+"<subsystem>:*"; for example, to enable all irq events, you can use the
+command:
+
+       # echo 'irq:*' > /debug/tracing/set_event
+
+2.2 Via the 'enable' toggle
+---------------------------
+
+The events available are also listed in the /debug/tracing/events/ hierarchy
+of directories.
+
+To enable event 'sched_wakeup':
+
+       # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To disable it:
+
+       # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable
+
+To enable all events in sched subsystem:
+
+       # echo 1 > /debug/tracing/events/sched/enable
+
+To enable all events:
+
+       # echo 1 > /debug/tracing/events/enable
+
+When reading one of these enable files, there are four results:
+
+ 0 - all events this file affects are disabled
+ 1 - all events this file affects are enabled
+ X - there is a mixture of events enabled and disabled
+ ? - this file does not affect any event
+
+3. Defining an event-enabled tracepoint
+=======================================
+
+See the example provided in samples/trace_events
+
index fd9a3e69381351aeeaa6712d07c5170aa7dca1a4..2a82d8602944abca930635cff44f37c52238529d 100644 (file)
@@ -179,7 +179,7 @@ Here is the list of current tracers that may be configured.
 
        Function call tracer to trace all kernel functions.
 
-  "function_graph_tracer"
+  "function_graph"
 
        Similar to the function tracer except that the
        function tracer probes the functions on their entry
@@ -518,9 +518,18 @@ priority with zero (0) being the highest priority and the nice
 values starting at 100 (nice -20). Below is a quick chart to map
 the kernel priority to user land priorities.
 
-  Kernel priority: 0 to 99    ==> user RT priority 99 to 0
-  Kernel priority: 100 to 139 ==> user nice -20 to 19
-  Kernel priority: 140        ==> idle task priority
+   Kernel Space                     User Space
+ ===============================================================
+   0(high) to  98(low)     user RT priority 99(high) to 1(low)
+                           with SCHED_RR or SCHED_FIFO
+ ---------------------------------------------------------------
+  99                       sched_priority is not used in scheduling
+                           decisions(it must be specified as 0)
+ ---------------------------------------------------------------
+ 100(high) to 139(low)     user nice -20(high) to 19(low)
+ ---------------------------------------------------------------
+ 140                       idle task priority
+ ---------------------------------------------------------------
 
 The task states are:
 
diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
new file mode 100644 (file)
index 0000000..cd805e1
--- /dev/null
@@ -0,0 +1,17 @@
+The power tracer collects detailed information about C-state and P-state
+transitions, instead of just looking at the high-level "average"
+information.
+
+There is a helper script found in scripts/tracing/power.pl in the kernel
+sources which can be used to parse this information and create a
+Scalable Vector Graphics (SVG) picture from the trace data.
+
+To use this tracer:
+
+       echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+       echo power > /sys/kernel/debug/tracing/current_tracer
+       echo 1 > /sys/kernel/debug/tracing/tracing_enabled
+       sleep 1
+       echo 0 > /sys/kernel/debug/tracing/tracing_enabled
+       cat /sys/kernel/debug/tracing/trace | \
+               perl scripts/tracing/power.pl > out.svg
index e0203662f9e9f8a1a907060437f3fcdf907e2667..8da3a795083fec448cb7e13831b0c10180ff0b6e 100644 (file)
@@ -50,6 +50,10 @@ Protocol 2.08:       (Kernel 2.6.26) Added crc32 checksum and ELF format
 Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical
                pointer to single linked list of struct setup_data.
 
+Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment
+               beyond the kernel_alignment field; added new init_size and
+               pref_address fields.  Added extended boot loader IDs.
+
 **** MEMORY LAYOUT
 
 The traditional memory map for the kernel loader, used for Image or
@@ -168,12 +172,13 @@ Offset    Proto   Name            Meaning
 021C/4 2.00+   ramdisk_size    initrd size (set by boot loader)
 0220/4 2.00+   bootsect_kludge DO NOT USE - for bootsect.S use only
 0224/2 2.01+   heap_end_ptr    Free memory after setup end
-0226/2 N/A     pad1            Unused
+0226/1 2.02+(3 ext_loader_ver  Extended boot loader version
+0227/1 2.02+(3 ext_loader_type Extended boot loader ID
 0228/4 2.02+   cmd_line_ptr    32-bit pointer to the kernel command line
 022C/4 2.03+   ramdisk_max     Highest legal initrd address
 0230/4 2.05+   kernel_alignment Physical addr alignment required for kernel
 0234/1 2.05+   relocatable_kernel Whether kernel is relocatable or not
-0235/1 N/A     pad2            Unused
+0235/1 2.10+   min_alignment   Minimum alignment, as a power of two
 0236/2 N/A     pad3            Unused
 0238/4 2.06+   cmdline_size    Maximum size of the kernel command line
 023C/4 2.07+   hardware_subarch Hardware subarchitecture
@@ -182,6 +187,8 @@ Offset      Proto   Name            Meaning
 024C/4 2.08+   payload_length  Length of kernel payload
 0250/8 2.09+   setup_data      64-bit physical pointer to linked list
                                of struct setup_data
+0258/8 2.10+   pref_address    Preferred loading address
+0260/4 2.10+   init_size       Linear memory required during initialization
 
 (1) For backwards compatibility, if the setup_sects field contains 0, the
     real value is 4.
@@ -190,6 +197,8 @@ Offset      Proto   Name            Meaning
     field are unusable, which means the size of a bzImage kernel
     cannot be determined.
 
+(3) Ignored, but safe to set, for boot protocols 2.02-2.09.
+
 If the "HdrS" (0x53726448) magic number is not found at offset 0x202,
 the boot protocol version is "old".  Loading an old kernel, the
 following parameters should be assumed:
@@ -343,18 +352,32 @@ Protocol: 2.00+
   0xTV here, where T is an identifier for the boot loader and V is
   a version number.  Otherwise, enter 0xFF here.
 
+  For boot loader IDs above T = 0xD, write T = 0xE to this field and
+  write the extended ID minus 0x10 to the ext_loader_type field.
+  Similarly, the ext_loader_ver field can be used to provide more than
+  four bits for the bootloader version.
+
+  For example, for T = 0x15, V = 0x234, write:
+
+  type_of_loader  <- 0xE4
+  ext_loader_type <- 0x05
+  ext_loader_ver  <- 0x23
+
   Assigned boot loader ids:
        0  LILO                 (0x00 reserved for pre-2.00 bootloader)
        1  Loadlin
        2  bootsect-loader      (0x20, all other values reserved)
-       3  SYSLINUX
-       4  EtherBoot
+       3  Syslinux
+       4  Etherboot/gPXE
        5  ELILO
        7  GRUB
-       8  U-BOOT
+       8  U-Boot
        9  Xen
        A  Gujin
        B  Qemu
+       C  Arcturus Networks uCbootloader
+       E  Extended             (see ext_loader_type)
+       F  Special              (0xFF = undefined)
 
   Please contact <hpa@zytor.com> if you need a bootloader ID
   value assigned.
@@ -453,6 +476,35 @@ Protocol:  2.01+
   Set this field to the offset (from the beginning of the real-mode
   code) of the end of the setup stack/heap, minus 0x0200.
 
+Field name:    ext_loader_ver
+Type:          write (optional)
+Offset/size:   0x226/1
+Protocol:      2.02+
+
+  This field is used as an extension of the version number in the
+  type_of_loader field.  The total version number is considered to be
+  (type_of_loader & 0x0f) + (ext_loader_ver << 4).
+
+  The use of this field is boot loader specific.  If not written, it
+  is zero.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
+Field name:    ext_loader_type
+Type:          write (obligatory if (type_of_loader & 0xf0) == 0xe0)
+Offset/size:   0x227/1
+Protocol:      2.02+
+
+  This field is used as an extension of the type number in the
+  type_of_loader field.  If the type in type_of_loader is 0xE, then
+  the actual type is (ext_loader_type + 0x10).
+
+  This field is ignored if the type in type_of_loader is not 0xE.
+
+  Kernels prior to 2.6.31 did not recognize this field, but it is safe
+  to write for protocol version 2.02 or higher.
+
 Field name:    cmd_line_ptr
 Type:          write (obligatory)
 Offset/size:   0x228/4
@@ -482,11 +534,19 @@ Protocol: 2.03+
   0x37FFFFFF, you can start your ramdisk at 0x37FE0000.)
 
 Field name:    kernel_alignment
-Type:          read (reloc)
+Type:          read/modify (reloc)
 Offset/size:   0x230/4
-Protocol:      2.05+
+Protocol:      2.05+ (read), 2.10+ (modify)
+
+  Alignment unit required by the kernel (if relocatable_kernel is
+  true.)  A relocatable kernel that is loaded at an alignment
+  incompatible with the value in this field will be realigned during
+  kernel initialization.
 
-  Alignment unit required by the kernel (if relocatable_kernel is true.)
+  Starting with protocol version 2.10, this reflects the kernel
+  alignment preferred for optimal performance; it is possible for the
+  loader to modify this field to permit a lesser alignment.  See the
+  min_alignment and pref_address fields below.
 
 Field name:    relocatable_kernel
 Type:          read (reloc)
@@ -498,6 +558,22 @@ Protocol:  2.05+
   After loading, the boot loader must set the code32_start field to
   point to the loaded code, or to a boot loader hook.
 
+Field name:    min_alignment
+Type:          read (reloc)
+Offset/size:   0x235/1
+Protocol:      2.10+
+
+  This field, if nonzero, indicates as a power of two the minimum
+  alignment required, as opposed to preferred, by the kernel to boot.
+  If a boot loader makes use of this field, it should update the
+  kernel_alignment field with the alignment unit desired; typically:
+
+       kernel_alignment = 1 << min_alignment
+
+  There may be a considerable performance cost with an excessively
+  misaligned kernel.  Therefore, a loader should typically try each
+  power-of-two alignment from kernel_alignment down to this alignment.
+
 Field name:    cmdline_size
 Type:          read
 Offset/size:   0x238/4
@@ -582,6 +658,36 @@ Protocol:  2.09+
   sure to consider the case where the linked list already contains
   entries.
 
+Field name:    pref_address
+Type:          read (reloc)
+Offset/size:   0x258/8
+Protocol:      2.10+
+
+  This field, if nonzero, represents a preferred load address for the
+  kernel.  A relocating bootloader should attempt to load at this
+  address if possible.
+
+  A non-relocatable kernel will unconditionally move itself to, and
+  run at, this address.
+
+Field name:    init_size
+Type:          read
+Offset/size:   0x260/4
+
+  This field indicates the amount of linear contiguous memory starting
+  at the kernel runtime start address that the kernel needs before it
+  is capable of examining its memory map.  This is not the same thing
+  as the total amount of memory the kernel needs to boot, but it can
+  be used by a relocating boot loader to help select a safe load
+  address for the kernel.
+
+  The kernel runtime start address is determined by the following algorithm:
+
+  if (relocatable_kernel)
+       runtime_start = align_up(load_address, kernel_alignment)
+  else
+       runtime_start = pref_address
+
 
 **** THE IMAGE CHECKSUM
 
index 34c13040a718f6febbccadc1f32bc24f4a060664..2db5893d6c97234ded88dbc8b52daaa8defc3af4 100644 (file)
@@ -150,11 +150,6 @@ NUMA
                Otherwise, the remaining system RAM is allocated to an
                additional node.
 
-  numa=hotadd=percent
-               Only allow hotadd memory to preallocate page structures upto
-               percent of already available memory.
-               numa=hotadd=0 will disable hotadd memory.
-
 ACPI
 
   acpi=off     Don't enable ACPI
index 29b52b14d0b43950fa79f3948b1fbe4789396fbf..d6498e3cd7133c7d88279c26d8618aa5fc3d10ad 100644 (file)
@@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables:
 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm
 hole caused by [48:63] sign extension
 ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole
-ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory
-ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole
-ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space
-ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB)
+ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory
+ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole
+ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space
+ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole
+ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB)
 ... unused hole ...
 ffffffff80000000 - ffffffffa0000000 (=512 MB)  kernel text mapping, from phys 0
 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space
index cf4abddfc8a40dcc161e851c26294480fff3076f..84285b5ba359c26cb53811937ab5f4422e94ff6f 100644 (file)
@@ -71,7 +71,7 @@ P: Person
 M: Mail patches to
 L: Mailing list that is relevant to this area
 W: Web-page with status/info
-T: SCM tree type and location.  Type is one of: git, hg, quilt.
+T: SCM tree type and location.  Type is one of: git, hg, quilt, stgit.
 S: Status, one of the following:
 
        Supported:      Someone is actually paid to look after this.
@@ -159,7 +159,8 @@ F:  drivers/net/r8169.c
 8250/16?50 (AND CLONE UARTS) SERIAL DRIVER
 L:     linux-serial@vger.kernel.org
 W:     http://serial.sourceforge.net
-S:     Orphan
+M:     alan@lxorguk.ukuu.org.uk
+S:     Odd Fixes
 F:     drivers/serial/8250*
 F:     include/linux/serial_8250.h
 
@@ -5629,6 +5630,7 @@ P:        Alan Cox
 M:     alan@lxorguk.ukuu.org.uk
 L:     linux-kernel@vger.kernel.org
 S:     Maintained
+T:     stgit http://zeniv.linux.org.uk/~alan/ttydev/
 
 TULIP NETWORK DRIVERS
 P:     Grant Grundler
index 9c9d1fd4155fc5e736c98d89e68d034cd8f2b9ef..5bd5259324b7c8827adb237facd7bf120edd4ca2 100644 (file)
@@ -176,22 +176,26 @@ cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
        }
 }
 
-static void
+static int
 dp264_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
        cpu_set_irq_affinity(irq, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
+
+       return 0;
 }
 
-static void
+static int
 clipper_set_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&dp264_irq_lock);
        cpu_set_irq_affinity(irq - 16, *affinity);
        tsunami_update_irq_hw(cached_irq_mask);
        spin_unlock(&dp264_irq_lock);
+
+       return 0;
 }
 
 static struct hw_interrupt_type dp264_irq_type = {
index 27f840a4ad3d7ec0949ac92a55ef5c7c1a3b9280..8dd239ebdb9e2cc3489c3d128402a9da37033a29 100644 (file)
@@ -157,13 +157,15 @@ titan_cpu_set_irq_affinity(unsigned int irq, cpumask_t affinity)
 
 }
 
-static void
+static int
 titan_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 { 
        spin_lock(&titan_irq_lock);
        titan_cpu_set_irq_affinity(irq - 16, *affinity);
        titan_update_irq_hw(titan_cached_irq_mask);
        spin_unlock(&titan_irq_lock);
+
+       return 0;
 }
 
 static void
index 3e1714c6523f06f4ab0572886edba151f21ffb8c..664c7b8b1ba87d7d5268c661808636a27b4f610c 100644 (file)
@@ -109,7 +109,7 @@ static void gic_unmask_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
+static int gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
 {
        void __iomem *reg = gic_dist_base(irq) + GIC_DIST_TARGET + (gic_irq(irq) & ~3);
        unsigned int shift = (irq % 4) * 8;
@@ -122,6 +122,8 @@ static void gic_set_cpu(unsigned int irq, const struct cpumask *mask_val)
        val |= 1 << (cpu + shift);
        writel(val, reg);
        spin_unlock(&irq_controller_lock);
+
+       return 0;
 }
 #endif
 
index 599217b2e13f94cea800c55653e6497c9d2ff3ad..f9bd17dd8dd71af1fd6d6c05f7c7781b51daea26 100644 (file)
 #define ASMARM_ARCH_UART_H
 
 #define IMXUART_HAVE_RTSCTS (1<<0)
+#define IMXUART_IRDA        (1<<1)
 
 struct imxuart_platform_data {
        int (*init)(struct platform_device *pdev);
        int (*exit)(struct platform_device *pdev);
        unsigned int flags;
+       void (*irda_enable)(int enable);
+       unsigned int irda_inv_rx:1;
+       unsigned int irda_inv_tx:1;
+       unsigned short transceiver_delay;
 };
 
 #endif
index df3925cb1c7fbbcab86f8445491d0e77ebf684a9..d70b445f4a8f0fc82922e848ae4f0bbbae6a1fb5 100644 (file)
@@ -325,12 +325,14 @@ static void end_crisv32_irq(unsigned int irq)
 {
 }
 
-void set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
+int set_affinity_crisv32_irq(unsigned int irq, const struct cpumask *dest)
 {
        unsigned long flags;
        spin_lock_irqsave(&irq_lock, flags);
        irq_allocations[irq - FIRST_IRQ].mask = *dest;
        spin_unlock_irqrestore(&irq_lock, flags);
+
+       return 0;
 }
 
 static struct irq_chip crisv32_irq_type = {
index 9d1552a9ee2c88ddb40bc7d70dea316f725d2843..8a5bd7a9c6f533b452b21de81127c476b1826765 100644 (file)
@@ -6,6 +6,7 @@ config FRV
        bool
        default y
        select HAVE_IDE
+       select HAVE_ARCH_TRACEHOOK
 
 config ZONE_DMA
        bool
index 287f6f697ce276d55b498cea168a67e402e6b5bf..50ae91b29674584743e5935aa49e0717fb260b34 100644 (file)
@@ -112,7 +112,7 @@ extern unsigned long atomic_test_and_XOR_mask(unsigned long mask, volatile unsig
 #define atomic_clear_mask(mask, v)     atomic_test_and_ANDNOT_mask((mask), (v))
 #define atomic_set_mask(mask, v)       atomic_test_and_OR_mask((mask), (v))
 
-static inline int test_and_clear_bit(int nr, volatile void *addr)
+static inline int test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *ptr = addr;
        unsigned long mask = 1UL << (nr & 31);
@@ -120,7 +120,7 @@ static inline int test_and_clear_bit(int nr, volatile void *addr)
        return (atomic_test_and_ANDNOT_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_set_bit(int nr, volatile void *addr)
+static inline int test_and_set_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *ptr = addr;
        unsigned long mask = 1UL << (nr & 31);
@@ -128,7 +128,7 @@ static inline int test_and_set_bit(int nr, volatile void *addr)
        return (atomic_test_and_OR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline int test_and_change_bit(int nr, volatile void *addr)
+static inline int test_and_change_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *ptr = addr;
        unsigned long mask = 1UL << (nr & 31);
@@ -136,22 +136,22 @@ static inline int test_and_change_bit(int nr, volatile void *addr)
        return (atomic_test_and_XOR_mask(mask, ptr) & mask) != 0;
 }
 
-static inline void clear_bit(int nr, volatile void *addr)
+static inline void clear_bit(unsigned long nr, volatile void *addr)
 {
        test_and_clear_bit(nr, addr);
 }
 
-static inline void set_bit(int nr, volatile void *addr)
+static inline void set_bit(unsigned long nr, volatile void *addr)
 {
        test_and_set_bit(nr, addr);
 }
 
-static inline void change_bit(int nr, volatile void * addr)
+static inline void change_bit(unsigned long nr, volatile void *addr)
 {
        test_and_change_bit(nr, addr);
 }
 
-static inline void __clear_bit(int nr, volatile void * addr)
+static inline void __clear_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask;
@@ -161,7 +161,7 @@ static inline void __clear_bit(int nr, volatile void * addr)
        *a &= ~mask;
 }
 
-static inline void __set_bit(int nr, volatile void * addr)
+static inline void __set_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask;
@@ -171,7 +171,7 @@ static inline void __set_bit(int nr, volatile void * addr)
        *a |= mask;
 }
 
-static inline void __change_bit(int nr, volatile void *addr)
+static inline void __change_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask;
@@ -181,7 +181,7 @@ static inline void __change_bit(int nr, volatile void *addr)
        *a ^= mask;
 }
 
-static inline int __test_and_clear_bit(int nr, volatile void * addr)
+static inline int __test_and_clear_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask, retval;
@@ -193,7 +193,7 @@ static inline int __test_and_clear_bit(int nr, volatile void * addr)
        return retval;
 }
 
-static inline int __test_and_set_bit(int nr, volatile void * addr)
+static inline int __test_and_set_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask, retval;
@@ -205,7 +205,7 @@ static inline int __test_and_set_bit(int nr, volatile void * addr)
        return retval;
 }
 
-static inline int __test_and_change_bit(int nr, volatile void * addr)
+static inline int __test_and_change_bit(unsigned long nr, volatile void *addr)
 {
        volatile unsigned long *a = addr;
        int mask, retval;
@@ -220,12 +220,13 @@ static inline int __test_and_change_bit(int nr, volatile void * addr)
 /*
  * This routine doesn't need to be atomic.
  */
-static inline int __constant_test_bit(int nr, const volatile void * addr)
+static inline int
+__constant_test_bit(unsigned long nr, const volatile void *addr)
 {
        return ((1UL << (nr & 31)) & (((const volatile unsigned int *) addr)[nr >> 5])) != 0;
 }
 
-static inline int __test_bit(int nr, const volatile void * addr)
+static inline int __test_bit(unsigned long nr, const volatile void *addr)
 {
        int     * a = (int *) addr;
        int     mask;
index 7279ec07d62e32007f91a01bfe58a20014283a4d..7bbf6e47f8c8a5cb216c9b8f5e0b8be3b902c5bb 100644 (file)
@@ -116,6 +116,7 @@ do {                                                                                        \
 } while(0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_FDPIC_CORE_EFLAGS  EF_FRV_FDPIC
 #define ELF_EXEC_PAGESIZE      16384
 
index 585d9b49949a7537d9d1f89221f93c047cfedacf..cc685e60b0f9ce92c06372cd695346a3b3ba6b17 100644 (file)
@@ -87,8 +87,7 @@ static inline void pci_dma_sync_single(struct pci_dev *hwdev,
                                       dma_addr_t dma_handle,
                                       size_t size, int direction)
 {
-       if (direction == PCI_DMA_NONE)
-                BUG();
+       BUG_ON(direction == PCI_DMA_NONE);
 
        frv_cache_wback_inv((unsigned long)bus_to_virt(dma_handle),
                            (unsigned long)bus_to_virt(dma_handle) + size);
@@ -105,9 +104,7 @@ static inline void pci_dma_sync_sg(struct pci_dev *hwdev,
                                   int nelems, int direction)
 {
        int i;
-
-       if (direction == PCI_DMA_NONE)
-                BUG();
+       BUG_ON(direction == PCI_DMA_NONE);
 
        for (i = 0; i < nelems; i++)
                frv_cache_wback_inv(sg_dma_address(&sg[i]),
index cf6934012b64dc70736e26073470e143b2ed11ab..a54b535c9e493f2ba18fa0974993d5859e18bf0a 100644 (file)
@@ -65,6 +65,8 @@
 #ifdef __KERNEL__
 #ifndef __ASSEMBLY__
 
+struct task_struct;
+
 /*
  * we dedicate GR28 to keeping a pointer to the current exception frame
  * - gr28 is destroyed on entry to the kernel from userspace
@@ -73,11 +75,18 @@ register struct pt_regs *__frame asm("gr28");
 
 #define user_mode(regs)                        (!((regs)->psr & PSR_S))
 #define instruction_pointer(regs)      ((regs)->pc)
+#define user_stack_pointer(regs)       ((regs)->sp)
 
 extern unsigned long user_stack(const struct pt_regs *);
 extern void show_regs(struct pt_regs *);
 #define profile_pc(regs) ((regs)->pc)
-#endif
+
+#define task_pt_regs(task) ((task)->thread.frame0)
+
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
 
 #endif /* !__ASSEMBLY__ */
+#endif /* __KERNEL__ */
 #endif /* _ASM_PTRACE_H */
diff --git a/arch/frv/include/asm/syscall.h b/arch/frv/include/asm/syscall.h
new file mode 100644 (file)
index 0000000..70689eb
--- /dev/null
@@ -0,0 +1,123 @@
+/* syscall parameter access functions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _ASM_SYSCALL_H
+#define _ASM_SYSCALL_H
+
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+/*
+ * Get the system call number or -1
+ */
+static inline long syscall_get_nr(struct task_struct *task,
+                                 struct pt_regs *regs)
+{
+       return regs->syscallno;
+}
+
+/*
+ * Restore the clobbered GR8 register
+ * (1st syscall arg was overwritten with syscall return or error)
+ */
+static inline void syscall_rollback(struct task_struct *task,
+                                   struct pt_regs *regs)
+{
+       regs->gr8 = regs->orig_gr8;
+}
+
+/*
+ * See if the syscall return value is an error, returning it if it is and 0 if
+ * not
+ */
+static inline long syscall_get_error(struct task_struct *task,
+                                    struct pt_regs *regs)
+{
+       return IS_ERR_VALUE(regs->gr8) ? regs->gr8 : 0;
+}
+
+/*
+ * Get the syscall return value
+ */
+static inline long syscall_get_return_value(struct task_struct *task,
+                                           struct pt_regs *regs)
+{
+       return regs->gr8;
+}
+
+/*
+ * Set the syscall return value
+ */
+static inline void syscall_set_return_value(struct task_struct *task,
+                                           struct pt_regs *regs,
+                                           int error, long val)
+{
+       if (error)
+               regs->gr8 = -error;
+       else
+               regs->gr8 = val;
+}
+
+/*
+ * Retrieve the system call arguments
+ */
+static inline void syscall_get_arguments(struct task_struct *task,
+                                        struct pt_regs *regs,
+                                        unsigned int i, unsigned int n,
+                                        unsigned long *args)
+{
+       /*
+        * Do this simply for now. If we need to start supporting
+        * fetching arguments from arbitrary indices, this will need some
+        * extra logic. Presently there are no in-tree users that depend
+        * on this behaviour.
+        */
+       BUG_ON(i);
+
+       /* Argument pattern is: GR8, GR9, GR10, GR11, GR12, GR13 */
+       switch (n) {
+       case 6: args[5] = regs->gr13;
+       case 5: args[4] = regs->gr12;
+       case 4: args[3] = regs->gr11;
+       case 3: args[2] = regs->gr10;
+       case 2: args[1] = regs->gr9;
+       case 1: args[0] = regs->gr8;
+               break;
+       default:
+               BUG();
+       }
+}
+
+/*
+ * Alter the system call arguments
+ */
+static inline void syscall_set_arguments(struct task_struct *task,
+                                        struct pt_regs *regs,
+                                        unsigned int i, unsigned int n,
+                                        const unsigned long *args)
+{
+       /* Same note as above applies */
+       BUG_ON(i);
+
+       switch (n) {
+       case 6: regs->gr13 = args[5];
+       case 5: regs->gr12 = args[4];
+       case 4: regs->gr11 = args[3];
+       case 3: regs->gr10 = args[2];
+       case 2: regs->gr9  = args[1];
+       case 1: regs->gr8  = args[0];
+               break;
+       default:
+               BUG();
+       }
+}
+
+#endif /* _ASM_SYSCALL_H */
index bb53ab753ffbf7b3a62e9e105583b4339d9768ce..e8a5ed7be0212791674996fd16afeb4f38a6b751 100644 (file)
@@ -109,20 +109,20 @@ register struct thread_info *__current_thread_info asm("gr15");
  * - other flags in MSW
  */
 #define TIF_SYSCALL_TRACE      0       /* syscall trace active */
-#define TIF_SIGPENDING         1       /* signal pending */
-#define TIF_NEED_RESCHED       2       /* rescheduling necessary */
-#define TIF_SINGLESTEP         3       /* restore singlestep on return to user mode */
-#define TIF_IRET               4       /* return with iret */
+#define TIF_NOTIFY_RESUME      1       /* callback before returning to user */
+#define TIF_SIGPENDING         2       /* signal pending */
+#define TIF_NEED_RESCHED       3       /* rescheduling necessary */
+#define TIF_SINGLESTEP         4       /* restore singlestep on return to user mode */
 #define TIF_RESTORE_SIGMASK    5       /* restore signal mask in do_signal() */
 #define TIF_POLLING_NRFLAG     16      /* true if poll_idle() is polling TIF_NEED_RESCHED */
 #define TIF_MEMDIE             17      /* OOM killer killed process */
 #define TIF_FREEZE             18      /* freezing for suspend */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
+#define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
 #define _TIF_SIGPENDING                (1 << TIF_SIGPENDING)
 #define _TIF_NEED_RESCHED      (1 << TIF_NEED_RESCHED)
 #define _TIF_SINGLESTEP                (1 << TIF_SINGLESTEP)
-#define _TIF_IRET              (1 << TIF_IRET)
 #define _TIF_RESTORE_SIGMASK   (1 << TIF_RESTORE_SIGMASK)
 #define _TIF_POLLING_NRFLAG    (1 << TIF_POLLING_NRFLAG)
 #define _TIF_FREEZE            (1 << TIF_FREEZE)
index 1da523b3298e03a36c8e27e23bf9d20f10615e67..356e0e327a8923f87d366776f06e5b916852bf56 100644 (file)
@@ -886,7 +886,6 @@ system_call:
        bnc             icc0,#0,__syscall_badsys
 
        ldi             @(gr15,#TI_FLAGS),gr4
-       ori             gr4,#_TIF_SYSCALL_TRACE,gr4
        andicc          gr4,#_TIF_SYSCALL_TRACE,gr0,icc0
        bne             icc0,#0,__syscall_trace_entry
 
@@ -1150,11 +1149,10 @@ __entry_work_notifysig:
        # perform syscall entry tracing
 __syscall_trace_entry:
        LEDS            0x6320
-       setlos.p        #0,gr8
-       call            do_syscall_trace
+       call            syscall_trace_entry
 
-       ldi             @(gr28,#REG_SYSCALLNO),gr7
-       lddi            @(gr28,#REG_GR(8)) ,gr8
+       lddi.p          @(gr28,#REG_GR(8)) ,gr8
+       ori             gr8,#0,gr7              ; syscall_trace_entry() returned new syscallno
        lddi            @(gr28,#REG_GR(10)),gr10
        lddi.p          @(gr28,#REG_GR(12)),gr12
 
@@ -1169,11 +1167,10 @@ __syscall_exit_work:
        beq             icc0,#1,__entry_work_pending
 
        movsg           psr,gr23
-       andi            gr23,#~PSR_PIL,gr23     ; could let do_syscall_trace() call schedule()
+       andi            gr23,#~PSR_PIL,gr23     ; could let syscall_trace_exit() call schedule()
        movgs           gr23,psr
 
-       setlos.p        #1,gr8
-       call            do_syscall_trace
+       call            syscall_trace_exit
        bra             __entry_resume_userspace
 
 __syscall_badsys:
index 5e7d401d21e7d6455883f118213a040e97754fbf..60eeed3694c0764d2d0b9d7d3a0c2cffe14b9a89 100644 (file)
@@ -19,6 +19,9 @@
 #include <linux/user.h>
 #include <linux/security.h>
 #include <linux/signal.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
  * in exit.c or in signal.c.
  */
 
+/*
+ * retrieve the contents of FRV userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      void *kbuf, void __user *ubuf)
+{
+       const struct user_int_regs *iregs = &target->thread.user->i;
+       int ret;
+
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 iregs, 0, sizeof(*iregs));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       sizeof(*iregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      const void *kbuf, const void __user *ubuf)
+{
+       struct user_int_regs *iregs = &target->thread.user->i;
+       unsigned int offs_gr0, offs_gr1;
+       int ret;
+
+       /* not allowed to set PSR or __status */
+       if (pos < offsetof(struct user_int_regs, psr) + sizeof(long) &&
+           pos + count > offsetof(struct user_int_regs, psr))
+               return -EIO;
+
+       if (pos < offsetof(struct user_int_regs, __status) + sizeof(long) &&
+           pos + count > offsetof(struct user_int_regs, __status))
+               return -EIO;
+
+       /* set the control regs */
+       offs_gr0 = offsetof(struct user_int_regs, gr[0]);
+       offs_gr1 = offsetof(struct user_int_regs, gr[1]);
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                iregs, 0, offs_gr0);
+       if (ret < 0)
+               return ret;
+
+       /* skip GR0/TBR */
+       ret = user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                       offs_gr0, offs_gr1);
+       if (ret < 0)
+               return ret;
+
+       /* set the general regs */
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &iregs->gr[1], offs_gr1, sizeof(*iregs));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                       sizeof(*iregs), -1);
+}
+
+/*
+ * retrieve the contents of FRV userspace FP/Media registers
+ */
+static int fpmregs_get(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      void *kbuf, void __user *ubuf)
+{
+       const struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+       int ret;
+
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 fpregs, 0, sizeof(*fpregs));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       sizeof(*fpregs), -1);
+}
+
+/*
+ * update the contents of the FRV userspace FP/Media registers
+ */
+static int fpmregs_set(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      const void *kbuf, const void __user *ubuf)
+{
+       struct user_fpmedia_regs *fpregs = &target->thread.user->f;
+       int ret;
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                fpregs, 0, sizeof(*fpregs));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                       sizeof(*fpregs), -1);
+}
+
+/*
+ * determine if the FP/Media registers have actually been used
+ */
+static int fpmregs_active(struct task_struct *target,
+                         const struct user_regset *regset)
+{
+       return tsk_used_math(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the FRV under Linux
+ */
+enum frv_regset {
+       REGSET_GENERAL,
+       REGSET_FPMEDIA,
+};
+
+static const struct user_regset frv_regsets[] = {
+       /*
+        * General register format is:
+        *      PSR, ISR, CCR, CCCR, LR, LCR, PC, (STATUS), SYSCALLNO, ORIG_GR8
+        *      GNER0-1, IACC0, TBR, GR1-63
+        */
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS,
+               .n              = ELF_NGREG,
+               .size           = sizeof(long),
+               .align          = sizeof(long),
+               .get            = genregs_get,
+               .set            = genregs_set,
+       },
+       /*
+        * FPU/Media register format is:
+        *      FR0-63, FNER0-1, MSR0-1, ACC0-7, ACCG0-8, FSR
+        */
+       [REGSET_FPMEDIA] = {
+               .core_note_type = NT_PRFPREG,
+               .n              = sizeof(struct user_fpmedia_regs) / sizeof(long),
+               .size           = sizeof(long),
+               .align          = sizeof(long),
+               .get            = fpmregs_get,
+               .set            = fpmregs_set,
+               .active         = fpmregs_active,
+       },
+};
+
+static const struct user_regset_view user_frv_native_view = {
+       .name           = "frv",
+       .e_machine      = EM_FRV,
+       .regsets        = frv_regsets,
+       .n              = ARRAY_SIZE(frv_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+       return &user_frv_native_view;
+}
+
 /*
  * Get contents of register REGNO in task TASK.
  */
@@ -68,41 +234,24 @@ static inline int put_reg(struct task_struct *task, int regno,
        }
 }
 
-/*
- * check that an address falls within the bounds of the target process's memory
- * mappings
- */
-static inline int is_user_addr_valid(struct task_struct *child,
-                                    unsigned long start, unsigned long len)
-{
-#ifdef CONFIG_MMU
-       if (start >= PAGE_OFFSET || len > PAGE_OFFSET - start)
-               return -EIO;
-       return 0;
-#else
-       struct vm_area_struct *vma;
-
-       vma = find_vma(child->mm, start);
-       if (vma && start >= vma->vm_start && start + len <= vma->vm_end)
-               return 0;
-
-       return -EIO;
-#endif
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
  * Control h/w single stepping
  */
-void ptrace_disable(struct task_struct *child)
+void user_enable_single_step(struct task_struct *child)
+{
+       child->thread.frame0->__status |= REG__STATUS_STEP;
+}
+
+void user_disable_single_step(struct task_struct *child)
 {
        child->thread.frame0->__status &= ~REG__STATUS_STEP;
 }
 
-void ptrace_enable(struct task_struct *child)
+void ptrace_disable(struct task_struct *child)
 {
-       child->thread.frame0->__status |= REG__STATUS_STEP;
+       user_disable_single_step(child);
 }
 
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
@@ -111,15 +260,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
        int ret;
 
        switch (request) {
-               /* when I and D space are separate, these will need to be fixed. */
-       case PTRACE_PEEKTEXT: /* read word at location addr. */
-       case PTRACE_PEEKDATA:
-               ret = -EIO;
-               if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-                       break;
-               ret = generic_ptrace_peekdata(child, addr, data);
-               break;
-
                /* read the word at location addr in the USER area. */
        case PTRACE_PEEKUSR: {
                tmp = 0;
@@ -163,15 +303,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
        }
 
-               /* when I and D space are separate, this will have to be fixed. */
-       case PTRACE_POKETEXT: /* write the word at location addr. */
-       case PTRACE_POKEDATA:
-               ret = -EIO;
-               if (is_user_addr_valid(child, addr, sizeof(tmp)) < 0)
-                       break;
-               ret = generic_ptrace_pokedata(child, addr, data);
-               break;
-
        case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
                ret = -EIO;
                if ((addr & 3) || addr < 0)
@@ -179,7 +310,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 
                ret = 0;
                switch (addr >> 2) {
-               case 0 ... PT__END-1:
+               case 0 ... PT__END - 1:
                        ret = put_reg(child, addr >> 2, data);
                        break;
 
@@ -189,95 +320,29 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                }
                break;
 
-       case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
-       case PTRACE_CONT: /* restart after signal. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               if (request == PTRACE_SYSCALL)
-                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               else
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               child->exit_code = data;
-               ptrace_disable(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-               /* make the child exit.  Best I can do is send it a sigkill.
-                * perhaps it should be put in the status that it wants to
-                * exit.
-                */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               child->exit_code = SIGKILL;
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               ptrace_disable(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SINGLESTEP:  /* set the trap flag. */
-               ret = -EIO;
-               if (!valid_signal(data))
-                       break;
-               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               ptrace_enable(child);
-               child->exit_code = data;
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-       case PTRACE_DETACH:     /* detach a process that was attached. */
-               ret = ptrace_detach(child, data);
-               break;
-
-       case PTRACE_GETREGS: { /* Get all integer regs from the child. */
-               int i;
-               for (i = 0; i < PT__GPEND; i++) {
-                       tmp = get_reg(child, i);
-                       if (put_user(tmp, (unsigned long *) data)) {
-                               ret = -EFAULT;
-                               break;
-                       }
-                       data += sizeof(long);
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all integer regs in the child. */
-               int i;
-               for (i = 0; i < PT__GPEND; i++) {
-                       if (get_user(tmp, (unsigned long *) data)) {
-                               ret = -EFAULT;
-                               break;
-                       }
-                       put_reg(child, i, tmp);
-                       data += sizeof(long);
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child FP/Media state. */
-               ret = 0;
-               if (copy_to_user((void *) data,
-                                &child->thread.user->f,
-                                sizeof(child->thread.user->f)))
-                       ret = -EFAULT;
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child FP/Media state. */
-               ret = 0;
-               if (copy_from_user(&child->thread.user->f,
-                                  (void *) data,
-                                  sizeof(child->thread.user->f)))
-                       ret = -EFAULT;
-               break;
-       }
+       case PTRACE_GETREGS:    /* Get all integer regs from the child. */
+               return copy_regset_to_user(child, &user_frv_native_view,
+                                          REGSET_GENERAL,
+                                          0, sizeof(child->thread.user->i),
+                                          (void __user *)data);
+
+       case PTRACE_SETREGS:    /* Set all integer regs in the child. */
+               return copy_regset_from_user(child, &user_frv_native_view,
+                                            REGSET_GENERAL,
+                                            0, sizeof(child->thread.user->i),
+                                            (const void __user *)data);
+
+       case PTRACE_GETFPREGS:  /* Get the child FP/Media state. */
+               return copy_regset_to_user(child, &user_frv_native_view,
+                                          REGSET_FPMEDIA,
+                                          0, sizeof(child->thread.user->f),
+                                          (void __user *)data);
+
+       case PTRACE_SETFPREGS:  /* Set the child FP/Media state. */
+               return copy_regset_from_user(child, &user_frv_native_view,
+                                            REGSET_FPMEDIA,
+                                            0, sizeof(child->thread.user->f),
+                                            (const void __user *)data);
 
        case PTRACE_GETFDPIC:
                tmp = 0;
@@ -300,414 +365,36 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                break;
 
        default:
-               ret = -EIO;
+               ret = ptrace_request(child, request, addr, data);
                break;
        }
        return ret;
 }
 
-int __nongprelbss kstrace;
-
-static const struct {
-       const char      *name;
-       unsigned        argmask;
-} __syscall_name_table[NR_syscalls] = {
-       [0]     = { "restart_syscall"                   },
-       [1]     = { "exit",             0x000001        },
-       [2]     = { "fork",             0xffffff        },
-       [3]     = { "read",             0x000141        },
-       [4]     = { "write",            0x000141        },
-       [5]     = { "open",             0x000235        },
-       [6]     = { "close",            0x000001        },
-       [7]     = { "waitpid",          0x000141        },
-       [8]     = { "creat",            0x000025        },
-       [9]     = { "link",             0x000055        },
-       [10]    = { "unlink",           0x000005        },
-       [11]    = { "execve",           0x000445        },
-       [12]    = { "chdir",            0x000005        },
-       [13]    = { "time",             0x000004        },
-       [14]    = { "mknod",            0x000325        },
-       [15]    = { "chmod",            0x000025        },
-       [16]    = { "lchown",           0x000025        },
-       [17]    = { "break" },
-       [18]    = { "oldstat",          0x000045        },
-       [19]    = { "lseek",            0x000131        },
-       [20]    = { "getpid",           0xffffff        },
-       [21]    = { "mount",            0x043555        },
-       [22]    = { "umount",           0x000005        },
-       [23]    = { "setuid",           0x000001        },
-       [24]    = { "getuid",           0xffffff        },
-       [25]    = { "stime",            0x000004        },
-       [26]    = { "ptrace",           0x004413        },
-       [27]    = { "alarm",            0x000001        },
-       [28]    = { "oldfstat",         0x000041        },
-       [29]    = { "pause",            0xffffff        },
-       [30]    = { "utime",            0x000045        },
-       [31]    = { "stty" },
-       [32]    = { "gtty" },
-       [33]    = { "access",           0x000025        },
-       [34]    = { "nice",             0x000001        },
-       [35]    = { "ftime" },
-       [36]    = { "sync",             0xffffff        },
-       [37]    = { "kill",             0x000011        },
-       [38]    = { "rename",           0x000055        },
-       [39]    = { "mkdir",            0x000025        },
-       [40]    = { "rmdir",            0x000005        },
-       [41]    = { "dup",              0x000001        },
-       [42]    = { "pipe",             0x000004        },
-       [43]    = { "times",            0x000004        },
-       [44]    = { "prof" },
-       [45]    = { "brk",              0x000004        },
-       [46]    = { "setgid",           0x000001        },
-       [47]    = { "getgid",           0xffffff        },
-       [48]    = { "signal",           0x000041        },
-       [49]    = { "geteuid",          0xffffff        },
-       [50]    = { "getegid",          0xffffff        },
-       [51]    = { "acct",             0x000005        },
-       [52]    = { "umount2",          0x000035        },
-       [53]    = { "lock" },
-       [54]    = { "ioctl",            0x000331        },
-       [55]    = { "fcntl",            0x000331        },
-       [56]    = { "mpx" },
-       [57]    = { "setpgid",          0x000011        },
-       [58]    = { "ulimit" },
-       [60]    = { "umask",            0x000002        },
-       [61]    = { "chroot",           0x000005        },
-       [62]    = { "ustat",            0x000043        },
-       [63]    = { "dup2",             0x000011        },
-       [64]    = { "getppid",          0xffffff        },
-       [65]    = { "getpgrp",          0xffffff        },
-       [66]    = { "setsid",           0xffffff        },
-       [67]    = { "sigaction" },
-       [68]    = { "sgetmask" },
-       [69]    = { "ssetmask" },
-       [70]    = { "setreuid" },
-       [71]    = { "setregid" },
-       [72]    = { "sigsuspend" },
-       [73]    = { "sigpending" },
-       [74]    = { "sethostname" },
-       [75]    = { "setrlimit" },
-       [76]    = { "getrlimit" },
-       [77]    = { "getrusage" },
-       [78]    = { "gettimeofday" },
-       [79]    = { "settimeofday" },
-       [80]    = { "getgroups" },
-       [81]    = { "setgroups" },
-       [82]    = { "select" },
-       [83]    = { "symlink" },
-       [84]    = { "oldlstat" },
-       [85]    = { "readlink" },
-       [86]    = { "uselib" },
-       [87]    = { "swapon" },
-       [88]    = { "reboot" },
-       [89]    = { "readdir" },
-       [91]    = { "munmap",           0x000034        },
-       [92]    = { "truncate" },
-       [93]    = { "ftruncate" },
-       [94]    = { "fchmod" },
-       [95]    = { "fchown" },
-       [96]    = { "getpriority" },
-       [97]    = { "setpriority" },
-       [99]    = { "statfs" },
-       [100]   = { "fstatfs" },
-       [102]   = { "socketcall" },
-       [103]   = { "syslog" },
-       [104]   = { "setitimer" },
-       [105]   = { "getitimer" },
-       [106]   = { "stat" },
-       [107]   = { "lstat" },
-       [108]   = { "fstat" },
-       [111]   = { "vhangup" },
-       [114]   = { "wait4" },
-       [115]   = { "swapoff" },
-       [116]   = { "sysinfo" },
-       [117]   = { "ipc" },
-       [118]   = { "fsync" },
-       [119]   = { "sigreturn" },
-       [120]   = { "clone" },
-       [121]   = { "setdomainname" },
-       [122]   = { "uname" },
-       [123]   = { "modify_ldt" },
-       [123]   = { "cacheflush" },
-       [124]   = { "adjtimex" },
-       [125]   = { "mprotect" },
-       [126]   = { "sigprocmask" },
-       [127]   = { "create_module" },
-       [128]   = { "init_module" },
-       [129]   = { "delete_module" },
-       [130]   = { "get_kernel_syms" },
-       [131]   = { "quotactl" },
-       [132]   = { "getpgid" },
-       [133]   = { "fchdir" },
-       [134]   = { "bdflush" },
-       [135]   = { "sysfs" },
-       [136]   = { "personality" },
-       [137]   = { "afs_syscall" },
-       [138]   = { "setfsuid" },
-       [139]   = { "setfsgid" },
-       [140]   = { "_llseek",                  0x014331        },
-       [141]   = { "getdents" },
-       [142]   = { "_newselect",               0x000141        },
-       [143]   = { "flock" },
-       [144]   = { "msync" },
-       [145]   = { "readv" },
-       [146]   = { "writev" },
-       [147]   = { "getsid",                   0x000001        },
-       [148]   = { "fdatasync",                0x000001        },
-       [149]   = { "_sysctl",                  0x000004        },
-       [150]   = { "mlock" },
-       [151]   = { "munlock" },
-       [152]   = { "mlockall" },
-       [153]   = { "munlockall" },
-       [154]   = { "sched_setparam" },
-       [155]   = { "sched_getparam" },
-       [156]   = { "sched_setscheduler" },
-       [157]   = { "sched_getscheduler" },
-       [158]   = { "sched_yield" },
-       [159]   = { "sched_get_priority_max" },
-       [160]   = { "sched_get_priority_min" },
-       [161]   = { "sched_rr_get_interval" },
-       [162]   = { "nanosleep",                0x000044        },
-       [163]   = { "mremap" },
-       [164]   = { "setresuid" },
-       [165]   = { "getresuid" },
-       [166]   = { "vm86" },
-       [167]   = { "query_module" },
-       [168]   = { "poll" },
-       [169]   = { "nfsservctl" },
-       [170]   = { "setresgid" },
-       [171]   = { "getresgid" },
-       [172]   = { "prctl",                    0x333331        },
-       [173]   = { "rt_sigreturn",             0xffffff        },
-       [174]   = { "rt_sigaction",             0x001441        },
-       [175]   = { "rt_sigprocmask",           0x001441        },
-       [176]   = { "rt_sigpending",            0x000014        },
-       [177]   = { "rt_sigtimedwait",          0x001444        },
-       [178]   = { "rt_sigqueueinfo",          0x000411        },
-       [179]   = { "rt_sigsuspend",            0x000014        },
-       [180]   = { "pread",                    0x003341        },
-       [181]   = { "pwrite",                   0x003341        },
-       [182]   = { "chown",                    0x000115        },
-       [183]   = { "getcwd" },
-       [184]   = { "capget" },
-       [185]   = { "capset" },
-       [186]   = { "sigaltstack" },
-       [187]   = { "sendfile" },
-       [188]   = { "getpmsg" },
-       [189]   = { "putpmsg" },
-       [190]   = { "vfork",                    0xffffff        },
-       [191]   = { "ugetrlimit" },
-       [192]   = { "mmap2",                    0x313314        },
-       [193]   = { "truncate64" },
-       [194]   = { "ftruncate64" },
-       [195]   = { "stat64",                   0x000045        },
-       [196]   = { "lstat64",                  0x000045        },
-       [197]   = { "fstat64",                  0x000041        },
-       [198]   = { "lchown32" },
-       [199]   = { "getuid32",                 0xffffff        },
-       [200]   = { "getgid32",                 0xffffff        },
-       [201]   = { "geteuid32",                0xffffff        },
-       [202]   = { "getegid32",                0xffffff        },
-       [203]   = { "setreuid32" },
-       [204]   = { "setregid32" },
-       [205]   = { "getgroups32" },
-       [206]   = { "setgroups32" },
-       [207]   = { "fchown32" },
-       [208]   = { "setresuid32" },
-       [209]   = { "getresuid32" },
-       [210]   = { "setresgid32" },
-       [211]   = { "getresgid32" },
-       [212]   = { "chown32" },
-       [213]   = { "setuid32" },
-       [214]   = { "setgid32" },
-       [215]   = { "setfsuid32" },
-       [216]   = { "setfsgid32" },
-       [217]   = { "pivot_root" },
-       [218]   = { "mincore" },
-       [219]   = { "madvise" },
-       [220]   = { "getdents64" },
-       [221]   = { "fcntl64" },
-       [223]   = { "security" },
-       [224]   = { "gettid" },
-       [225]   = { "readahead" },
-       [226]   = { "setxattr" },
-       [227]   = { "lsetxattr" },
-       [228]   = { "fsetxattr" },
-       [229]   = { "getxattr" },
-       [230]   = { "lgetxattr" },
-       [231]   = { "fgetxattr" },
-       [232]   = { "listxattr" },
-       [233]   = { "llistxattr" },
-       [234]   = { "flistxattr" },
-       [235]   = { "removexattr" },
-       [236]   = { "lremovexattr" },
-       [237]   = { "fremovexattr" },
-       [238]   = { "tkill" },
-       [239]   = { "sendfile64" },
-       [240]   = { "futex" },
-       [241]   = { "sched_setaffinity" },
-       [242]   = { "sched_getaffinity" },
-       [243]   = { "set_thread_area" },
-       [244]   = { "get_thread_area" },
-       [245]   = { "io_setup" },
-       [246]   = { "io_destroy" },
-       [247]   = { "io_getevents" },
-       [248]   = { "io_submit" },
-       [249]   = { "io_cancel" },
-       [250]   = { "fadvise64" },
-       [252]   = { "exit_group",               0x000001        },
-       [253]   = { "lookup_dcookie" },
-       [254]   = { "epoll_create" },
-       [255]   = { "epoll_ctl" },
-       [256]   = { "epoll_wait" },
-       [257]   = { "remap_file_pages" },
-       [258]   = { "set_tid_address" },
-       [259]   = { "timer_create" },
-       [260]   = { "timer_settime" },
-       [261]   = { "timer_gettime" },
-       [262]   = { "timer_getoverrun" },
-       [263]   = { "timer_delete" },
-       [264]   = { "clock_settime" },
-       [265]   = { "clock_gettime" },
-       [266]   = { "clock_getres" },
-       [267]   = { "clock_nanosleep" },
-       [268]   = { "statfs64" },
-       [269]   = { "fstatfs64" },
-       [270]   = { "tgkill" },
-       [271]   = { "utimes" },
-       [272]   = { "fadvise64_64" },
-       [273]   = { "vserver" },
-       [274]   = { "mbind" },
-       [275]   = { "get_mempolicy" },
-       [276]   = { "set_mempolicy" },
-       [277]   = { "mq_open" },
-       [278]   = { "mq_unlink" },
-       [279]   = { "mq_timedsend" },
-       [280]   = { "mq_timedreceive" },
-       [281]   = { "mq_notify" },
-       [282]   = { "mq_getsetattr" },
-       [283]   = { "sys_kexec_load" },
-};
-
-asmlinkage void do_syscall_trace(int leaving)
+/*
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
+ */
+asmlinkage unsigned long syscall_trace_entry(void)
 {
-#if 0
-       unsigned long *argp;
-       const char *name;
-       unsigned argmask;
-       char buffer[16];
-
-       if (!kstrace)
-               return;
-
-       if (!current->mm)
-               return;
-
-       if (__frame->gr7 == __NR_close)
-               return;
-
-#if 0
-       if (__frame->gr7 != __NR_mmap2 &&
-           __frame->gr7 != __NR_vfork &&
-           __frame->gr7 != __NR_execve &&
-           __frame->gr7 != __NR_exit)
-               return;
-#endif
-
-       argmask = 0;
-       name = NULL;
-       if (__frame->gr7 < NR_syscalls) {
-               name = __syscall_name_table[__frame->gr7].name;
-               argmask = __syscall_name_table[__frame->gr7].argmask;
-       }
-       if (!name) {
-               sprintf(buffer, "sys_%lx", __frame->gr7);
-               name = buffer;
-       }
-
-       if (!leaving) {
-               if (!argmask) {
-                       printk(KERN_CRIT "[%d] %s(%lx,%lx,%lx,%lx,%lx,%lx)\n",
-                              current->pid,
-                              name,
-                              __frame->gr8,
-                              __frame->gr9,
-                              __frame->gr10,
-                              __frame->gr11,
-                              __frame->gr12,
-                              __frame->gr13);
-               }
-               else if (argmask == 0xffffff) {
-                       printk(KERN_CRIT "[%d] %s()\n",
-                              current->pid,
-                              name);
-               }
-               else {
-                       printk(KERN_CRIT "[%d] %s(",
-                              current->pid,
-                              name);
-
-                       argp = &__frame->gr8;
-
-                       do {
-                               switch (argmask & 0xf) {
-                               case 1:
-                                       printk("%ld", (long) *argp);
-                                       break;
-                               case 2:
-                                       printk("%lo", *argp);
-                                       break;
-                               case 3:
-                                       printk("%lx", *argp);
-                                       break;
-                               case 4:
-                                       printk("%p", (void *) *argp);
-                                       break;
-                               case 5:
-                                       printk("\"%s\"", (char *) *argp);
-                                       break;
-                               }
-
-                               argp++;
-                               argmask >>= 4;
-                               if (argmask)
-                                       printk(",");
-
-                       } while (argmask);
-
-                       printk(")\n");
-               }
-       }
-       else {
-               if ((int)__frame->gr8 > -4096 && (int)__frame->gr8 < 4096)
-                       printk(KERN_CRIT "[%d] %s() = %ld\n", current->pid, name, __frame->gr8);
-               else
-                       printk(KERN_CRIT "[%d] %s() = %lx\n", current->pid, name, __frame->gr8);
+       __frame->__status |= REG__STATUS_SYSC_ENTRY;
+       if (tracehook_report_syscall_entry(__frame)) {
+               /* tracing decided this syscall should not happen, so
+                * we'll return a bogus call number to get an ENOSYS
+                * error, but leave the original number in
+                * __frame->syscallno
+                */
+               return ULONG_MAX;
        }
-       return;
-#endif
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE))
-               return;
-
-       if (!(current->ptrace & PT_PTRACED))
-               return;
 
-       /* we need to indicate entry or exit to strace */
-       if (leaving)
-               __frame->__status |= REG__STATUS_SYSC_EXIT;
-       else
-               __frame->__status |= REG__STATUS_SYSC_ENTRY;
-
-       ptrace_notify(SIGTRAP);
+       return __frame->syscallno;
+}
 
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(void)
+{
+       __frame->__status |= REG__STATUS_SYSC_EXIT;
+       tracehook_report_syscall_exit(__frame, 0);
 }
index 3bdb368292a8c8a53f9c00b3aa39dc169420b095..4a7a62c6e7833ed91a9087b3762cb3d0ba24df5f 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/unistd.h>
 #include <linux/personality.h>
 #include <linux/freezer.h>
+#include <linux/tracehook.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
 #include <asm/cacheflush.h>
@@ -516,6 +517,9 @@ static void do_signal(void)
                         * clear the TIF_RESTORE_SIGMASK flag */
                        if (test_thread_flag(TIF_RESTORE_SIGMASK))
                                clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+                       tracehook_signal_handler(signr, &info, &ka, __frame,
+                                                test_thread_flag(TIF_SINGLESTEP));
                }
 
                return;
@@ -564,4 +568,10 @@ asmlinkage void do_notify_resume(__u32 thread_info_flags)
        if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
                do_signal();
 
+       /* deal with notification when about to resume userspace execution */
+       if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+               clear_thread_flag(TIF_NOTIFY_RESUME);
+               tracehook_notify_resume(__frame);
+       }
+
 } /* end do_notify_resume() */
index 9fb771a20df3cf831f4779aba7859e4c865928de..374f88d6cc00d0c32fffd03f5c7b3a8b1af395ee 100644 (file)
@@ -23,8 +23,7 @@ long strncpy_from_user(char *dst, const char __user *src, long count)
        char *p, ch;
        long err = -EFAULT;
 
-       if (count < 0)
-               BUG();
+       BUG_ON(count < 0);
 
        p = dst;
 
@@ -76,8 +75,7 @@ long strnlen_user(const char __user *src, long count)
        long err = 0;
        char ch;
 
-       if (count < 0)
-               BUG();
+       BUG_ON(count < 0);
 
 #ifndef CONFIG_MMU
        if ((unsigned long) src < memory_start)
index 52ff9aec799d29c84a649aa128ccb985b60cd68c..4e1ba0b15443a86ac4f5fd12bb34b552c392a768 100644 (file)
@@ -116,8 +116,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
                          enum dma_data_direction direction)
 {
-       if (direction == DMA_NONE)
-                BUG();
+       BUG_ON(direction == DMA_NONE);
 
        frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -151,8 +150,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
                frv_cache_wback_inv(sg_dma_address(&sg[i]),
                                    sg_dma_address(&sg[i]) + sg_dma_len(&sg[i]));
 
-       if (direction == DMA_NONE)
-                BUG();
+       BUG_ON(direction == DMA_NONE);
 
        return nents;
 }
index 3ddedebc4eb3db1a1bbc25a7a10dd2e1fd1e66e5..45954f0813dc9e75a5e15f06e5cdb13776170c30 100644 (file)
@@ -48,8 +48,7 @@ EXPORT_SYMBOL(dma_free_coherent);
 dma_addr_t dma_map_single(struct device *dev, void *ptr, size_t size,
                          enum dma_data_direction direction)
 {
-       if (direction == DMA_NONE)
-                BUG();
+       BUG_ON(direction == DMA_NONE);
 
        frv_cache_wback_inv((unsigned long) ptr, (unsigned long) ptr + size);
 
@@ -81,8 +80,7 @@ int dma_map_sg(struct device *dev, struct scatterlist *sg, int nents,
        void *vaddr;
        int i;
 
-       if (direction == DMA_NONE)
-                BUG();
+       BUG_ON(direction == DMA_NONE);
 
        dampr2 = __get_DAMPR(2);
 
index cc0a3182db3c0158d246f658065a0f0f5d426a5f..acb5047ab57341d9818160f361db47c7e6e59313 100644 (file)
@@ -21,9 +21,10 @@ hpsim_irq_noop (unsigned int irq)
 {
 }
 
-static void
+static int
 hpsim_set_affinity_noop(unsigned int a, const struct cpumask *b)
 {
+       return 0;
 }
 
 static struct hw_interrupt_type irq_type_hp_sim = {
index 5510317db37b2a439711bb3fa97d6af0c7cd00bf..baec6f00f7f3feee71c5110b01f12b5c8b3f4fda 100644 (file)
@@ -636,7 +636,7 @@ void __init acpi_numa_arch_fixup(void)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int triggering, int polarity)
 {
        if (acpi_irq_model == ACPI_IRQ_MODEL_PLATFORM)
                return gsi;
@@ -678,7 +678,8 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
 
        fadt = (struct acpi_table_fadt *)fadt_header;
 
-       acpi_register_gsi(fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE, ACPI_ACTIVE_LOW);
+       acpi_register_gsi(NULL, fadt->sci_interrupt, ACPI_LEVEL_SENSITIVE,
+                                ACPI_ACTIVE_LOW);
        return 0;
 }
 
index 166e0d839fa04d753fa3709bfe62544eee32e9ec..f92cef47bf862ed8cd3943875585535e0c412763 100644 (file)
@@ -329,7 +329,7 @@ unmask_irq (unsigned int irq)
 }
 
 
-static void
+static int
 iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
 #ifdef CONFIG_SMP
@@ -343,15 +343,15 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        cpu = cpumask_first_and(cpu_online_mask, mask);
        if (cpu >= nr_cpu_ids)
-               return;
+               return -1;
 
        if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
 
        dest = cpu_physical_id(cpu);
 
        if (!iosapic_intr_info[irq].count)
-               return;                 /* not an IOSAPIC interrupt */
+               return -1;                      /* not an IOSAPIC interrupt */
 
        set_irq_affinity_info(irq, dest, redir);
 
@@ -376,7 +376,9 @@ iosapic_set_affinity(unsigned int irq, const struct cpumask *mask)
                iosapic_write(iosapic, IOSAPIC_RTE_HIGH(rte_index), high32);
                iosapic_write(iosapic, IOSAPIC_RTE_LOW(rte_index), low32);
        }
+
 #endif
+       return 0;
 }
 
 /*
index 2b15e233f7fef6b016f50367c90f9c91ba096b63..0f8ade9331badf4acaf29818dcad0dc67b31fd5e 100644 (file)
@@ -12,7 +12,7 @@
 static struct irq_chip ia64_msi_chip;
 
 #ifdef CONFIG_SMP
-static void ia64_set_msi_irq_affinity(unsigned int irq,
+static int ia64_set_msi_irq_affinity(unsigned int irq,
                                      const cpumask_t *cpu_mask)
 {
        struct msi_msg msg;
@@ -20,10 +20,10 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
        int cpu = first_cpu(*cpu_mask);
 
        if (!cpu_online(cpu))
-               return;
+               return -1;
 
        if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
 
        read_msi_msg(irq, &msg);
 
@@ -39,6 +39,8 @@ static void ia64_set_msi_irq_affinity(unsigned int irq,
 
        write_msi_msg(irq, &msg);
        cpumask_copy(irq_desc[irq].affinity, cpumask_of(cpu));
+
+       return 0;
 }
 #endif /* CONFIG_SMP */
 
@@ -130,17 +132,17 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #ifdef CONFIG_DMAR
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_cfg *cfg = irq_cfg + irq;
        struct msi_msg msg;
        int cpu = cpumask_first(mask);
 
        if (!cpu_online(cpu))
-               return;
+               return -1;
 
        if (irq_prepare_move(irq, cpu))
-               return;
+               return -1;
 
        dmar_msi_read(irq, &msg);
 
@@ -151,6 +153,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        dmar_msi_write(irq, &msg);
        cpumask_copy(irq_desc[irq].affinity, mask);
+
+       return 0;
 }
 #endif /* CONFIG_SMP */
 
index 66fd705e82c09ee1e1905356cf447526461c086a..764f26abac05df5e9d7659ff7254da6fde398301 100644 (file)
@@ -227,7 +227,7 @@ finish_up:
        return new_irq_info;
 }
 
-static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
+static int sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
 {
        struct sn_irq_info *sn_irq_info, *sn_irq_info_safe;
        nasid_t nasid;
@@ -239,6 +239,8 @@ static void sn_set_affinity_irq(unsigned int irq, const struct cpumask *mask)
        list_for_each_entry_safe(sn_irq_info, sn_irq_info_safe,
                                 sn_irq_lh[irq], list)
                (void)sn_retarget_vector(sn_irq_info, nasid, slice);
+
+       return 0;
 }
 
 #ifdef CONFIG_SMP
index 81e428943d7374d01e0aef8530e39c7ecd892b30..fbbfb970120128a29df68c549895187ea3e3013e 100644 (file)
@@ -151,7 +151,7 @@ int sn_setup_msi_irq(struct pci_dev *pdev, struct msi_desc *entry)
 }
 
 #ifdef CONFIG_SMP
-static void sn_set_msi_irq_affinity(unsigned int irq,
+static int sn_set_msi_irq_affinity(unsigned int irq,
                                    const struct cpumask *cpu_mask)
 {
        struct msi_msg msg;
@@ -168,7 +168,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
        cpu = cpumask_first(cpu_mask);
        sn_irq_info = sn_msi_info[irq].sn_irq_info;
        if (sn_irq_info == NULL || sn_irq_info->irq_int_bit >= 0)
-               return;
+               return -1;
 
        /*
         * Release XIO resources for the old MSI PCI address
@@ -189,7 +189,7 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
        new_irq_info = sn_retarget_vector(sn_irq_info, nasid, slice);
        sn_msi_info[irq].sn_irq_info = new_irq_info;
        if (new_irq_info == NULL)
-               return;
+               return -1;
 
        /*
         * Map the xio address into bus space
@@ -206,6 +206,8 @@ static void sn_set_msi_irq_affinity(unsigned int irq,
 
        write_msi_msg(irq, &msg);
        cpumask_copy(irq_desc[irq].affinity, cpu_mask);
+
+       return 0;
 }
 #endif /* CONFIG_SMP */
 
index 1c19af8daa62a3f992f20ecb1a281a527bfec3cd..d3a0c8154beca80cd7cb40bc3d783fa247891941 100644 (file)
@@ -177,7 +177,7 @@ static void octeon_irq_ciu0_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
        int cpu;
        int bit = irq - OCTEON_IRQ_WORKQ0;      /* Bit 0-63 of EN0 */
@@ -199,6 +199,8 @@ static void octeon_irq_ciu0_set_affinity(unsigned int irq, const struct cpumask
         */
        cvmx_read_csr(CVMX_CIU_INTX_EN0(cvmx_get_core_num() * 2));
        write_unlock(&octeon_irq_ciu0_rwlock);
+
+       return 0;
 }
 #endif
 
@@ -292,7 +294,7 @@ static void octeon_irq_ciu1_disable(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
+static int octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask *dest)
 {
        int cpu;
        int bit = irq - OCTEON_IRQ_WDOG0;       /* Bit 0-63 of EN1 */
@@ -315,6 +317,8 @@ static void octeon_irq_ciu1_set_affinity(unsigned int irq, const struct cpumask
         */
        cvmx_read_csr(CVMX_CIU_INTX_EN1(cvmx_get_core_num() * 2 + 1));
        write_unlock(&octeon_irq_ciu1_rwlock);
+
+       return 0;
 }
 #endif
 
index 3214ade02d105988937576c8425f444b08783b0e..4f1eed107b08217f8f2991ed17f92c424b6c437f 100644 (file)
@@ -49,7 +49,7 @@ static inline void smtc_im_ack_irq(unsigned int irq)
 #ifdef CONFIG_MIPS_MT_SMTC_IRQAFF
 #include <linux/cpumask.h>
 
-extern void plat_set_irq_affinity(unsigned int irq,
+extern int plat_set_irq_affinity(unsigned int irq,
                                  const struct cpumask *affinity);
 extern void smtc_forward_irq(unsigned int irq);
 
index 87deb8f6c45885e5c356df3ac4ddf1e58a6ac81c..3f43c2e3aa5a59ede8eab7ecdaee7762ea6afb68 100644 (file)
@@ -155,7 +155,7 @@ static void gic_unmask_irq(unsigned int irq)
 
 static DEFINE_SPINLOCK(gic_lock);
 
-static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+static int gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        cpumask_t       tmp = CPU_MASK_NONE;
        unsigned long   flags;
@@ -166,7 +166,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
        cpumask_and(&tmp, cpumask, cpu_online_mask);
        if (cpus_empty(tmp))
-               return;
+               return -1;
 
        /* Assumption : cpumask refers to a single CPU */
        spin_lock_irqsave(&gic_lock, flags);
@@ -190,6 +190,7 @@ static void gic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
        cpumask_copy(irq_desc[irq].affinity, cpumask);
        spin_unlock_irqrestore(&gic_lock, flags);
 
+       return 0;
 }
 #endif
 
index 5ba31888fefbbecacd123e7033ef8ddcb6c30944..499ffe5475dff4fe8254990499d13d6f5ff4a266 100644 (file)
@@ -114,7 +114,7 @@ struct plat_smp_ops msmtc_smp_ops = {
  */
 
 
-void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
+int plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 {
        cpumask_t tmask;
        int cpu = 0;
@@ -156,5 +156,7 @@ void plat_set_irq_affinity(unsigned int irq, const struct cpumask *affinity)
 
        /* Do any generic SMTC IRQ affinity setup */
        smtc_set_irq_affinity(irq, tmask);
+
+       return 0;
 }
 #endif /* CONFIG_MIPS_MT_SMTC_IRQAFF */
index c147c4b35d3fc6f9e1f48b81866eb36d7f882594..690de06bde902f38a49b8d1e1765ae3fd2251b48 100644 (file)
@@ -50,7 +50,7 @@ static void enable_bcm1480_irq(unsigned int irq);
 static void disable_bcm1480_irq(unsigned int irq);
 static void ack_bcm1480_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_PCI
@@ -109,7 +109,7 @@ void bcm1480_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on, k;
        u64 cur_ints;
@@ -118,7 +118,7 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        if (cpumask_weight(mask) != 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-               return;
+               return -1;
        }
        i = cpumask_first(mask);
 
@@ -152,6 +152,8 @@ static void bcm1480_set_affinity(unsigned int irq, const struct cpumask *mask)
                }
        }
        spin_unlock_irqrestore(&bcm1480_imr_lock, flags);
+
+       return 0;
 }
 #endif
 
index 38cb998ade22053b8963377b70d6f7dad7fb6b94..409dec798863194f364eb378666c9e7471fd0227 100644 (file)
@@ -50,7 +50,7 @@ static void enable_sb1250_irq(unsigned int irq);
 static void disable_sb1250_irq(unsigned int irq);
 static void ack_sb1250_irq(unsigned int irq);
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask);
 #endif
 
 #ifdef CONFIG_SIBYTE_HAS_LDT
@@ -103,7 +103,7 @@ void sb1250_unmask_irq(int cpu, int irq)
 }
 
 #ifdef CONFIG_SMP
-static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        int i = 0, old_cpu, cpu, int_on;
        u64 cur_ints;
@@ -113,7 +113,7 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        if (cpumask_weight(mask) > 1) {
                printk("attempted to set irq affinity for irq %d to multiple CPUs\n", irq);
-               return;
+               return -1;
        }
 
        /* Convert logical CPU to physical CPU */
@@ -143,6 +143,8 @@ static void sb1250_set_affinity(unsigned int irq, const struct cpumask *mask)
                                        R_IMR_INTERRUPT_MASK));
        }
        spin_unlock_irqrestore(&sb1250_imr_lock, flags);
+
+       return 0;
 }
 #endif
 
index 355926730e8d9f49387138fb2cf1cc5b9b3b8898..89faacad5d17d71871d75453fdf55d107f15985e 100644 (file)
@@ -8,6 +8,7 @@ mainmenu "Linux Kernel Configuration"
 config MN10300
        def_bool y
        select HAVE_OPROFILE
+       select HAVE_ARCH_TRACEHOOK
 
 config AM33
        def_bool y
index bf09f8bb392eef17bf41441050b5adb6e9920d0b..49105462e6fc9c5fb095fa35ad25b55493ba15a6 100644 (file)
@@ -34,7 +34,7 @@
  */
 typedef unsigned long elf_greg_t;
 
-#define ELF_NGREG (sizeof (struct pt_regs) / sizeof(elf_greg_t))
+#define ELF_NGREG ((sizeof(struct pt_regs) / sizeof(elf_greg_t)) - 1)
 typedef elf_greg_t elf_gregset_t[ELF_NGREG];
 
 #define ELF_NFPREG 32
@@ -76,6 +76,7 @@ do {                                                                  \
 } while (0)
 
 #define USE_ELF_CORE_DUMP
+#define CORE_DUMP_USE_REGSET
 #define ELF_EXEC_PAGESIZE      4096
 
 /*
index 73239271873da304c88fe396eda7074686c3185a..f7d4b0d285e8dddb835781ac071898ff5d34dfe9 100644 (file)
@@ -143,13 +143,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
 
 unsigned long get_wchan(struct task_struct *p);
 
-#define task_pt_regs(task)                                             \
-({                                                                     \
-       struct pt_regs *__regs__;                                       \
-       __regs__ = (struct pt_regs *) (KSTK_TOP(task_stack_page(task)) - 8); \
-       __regs__ - 1;                                                   \
-})
-
+#define task_pt_regs(task) ((task)->thread.uregs)
 #define KSTK_EIP(task) (task_pt_regs(task)->pc)
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
index 7b06cc623d8b074a3a23511673e2c170bb033840..921942ed1b03508a582e6e226ed33acf50c8fa5b 100644 (file)
@@ -91,9 +91,17 @@ extern struct pt_regs *__frame; /* current frame pointer */
 #if defined(__KERNEL__)
 
 #if !defined(__ASSEMBLY__)
+struct task_struct;
+
 #define user_mode(regs)                        (((regs)->epsw & EPSW_nSL) == EPSW_nSL)
 #define instruction_pointer(regs)      ((regs)->pc)
+#define user_stack_pointer(regs)       ((regs)->sp)
 extern void show_regs(struct pt_regs *);
+
+#define arch_has_single_step() (1)
+extern void user_enable_single_step(struct task_struct *);
+extern void user_disable_single_step(struct task_struct *);
+
 #endif  /*  !__ASSEMBLY  */
 
 #define profile_pc(regs) ((regs)->pc)
index 3dc3e462f92a4e39b496f4d78f7bc1adc6e64e85..7408a27199f342fd26679db7a326aa0b07885f53 100644 (file)
@@ -76,7 +76,7 @@ ENTRY(system_call)
        cmp     nr_syscalls,d0
        bcc     syscall_badsys
        btst    _TIF_SYSCALL_TRACE,(TI_flags,a2)
-       bne     syscall_trace_entry
+       bne     syscall_entry_trace
 syscall_call:
        add     d0,d0,a1
        add     a1,a1
@@ -104,11 +104,10 @@ restore_all:
 syscall_exit_work:
        btst    _TIF_SYSCALL_TRACE,d2
        beq     work_pending
-       __sti                           # could let do_syscall_trace() call
+       __sti                           # could let syscall_trace_exit() call
                                        # schedule() instead
        mov     fp,d0
-       mov     1,d1
-       call    do_syscall_trace[],0    # do_syscall_trace(regs,entryexit)
+       call    syscall_trace_exit[],0  # syscall_trace_exit(regs)
        jmp     resume_userspace
 
        ALIGN
@@ -138,13 +137,11 @@ work_notifysig:
        jmp     resume_userspace
 
        # perform syscall entry tracing
-syscall_trace_entry:
+syscall_entry_trace:
        mov     -ENOSYS,d0
        mov     d0,(REG_D0,fp)
        mov     fp,d0
-       clr     d1
-       call    do_syscall_trace[],0
-       mov     (REG_ORIG_D0,fp),d0
+       call    syscall_trace_entry[],0 # returns the syscall number to actually use
        mov     (REG_D1,fp),d1
        cmp     nr_syscalls,d0
        bcs     syscall_call
index d6d6cdc75c523b66e6218d493311aac2afc5f017..e143339ad28e00869d9b23c1d882d994cdd89d62 100644 (file)
@@ -17,6 +17,9 @@
 #include <linux/errno.h>
 #include <linux/ptrace.h>
 #include <linux/user.h>
+#include <linux/regset.h>
+#include <linux/elf.h>
+#include <linux/tracehook.h>
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
 #include <asm/system.h>
@@ -64,12 +67,6 @@ static inline int get_stack_long(struct task_struct *task, int offset)
                ((unsigned long) task->thread.uregs + offset);
 }
 
-/*
- * this routine will put a word on the processes privileged stack.
- * the offset is how far from the base addr as stored in the TSS.
- * this routine assumes that all the privileged stacks are in our
- * data space.
- */
 static inline
 int put_stack_long(struct task_struct *task, int offset, unsigned long data)
 {
@@ -80,94 +77,233 @@ int put_stack_long(struct task_struct *task, int offset, unsigned long data)
        return 0;
 }
 
-static inline unsigned long get_fpregs(struct fpu_state_struct *buf,
-                                      struct task_struct *tsk)
+/*
+ * retrieve the contents of MN10300 userspace general registers
+ */
+static int genregs_get(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      void *kbuf, void __user *ubuf)
 {
-       return __copy_to_user(buf, &tsk->thread.fpu_state,
-                             sizeof(struct fpu_state_struct));
+       const struct pt_regs *regs = task_pt_regs(target);
+       int ret;
+
+       /* we need to skip regs->next */
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 regs, 0, PT_ORIG_D0 * sizeof(long));
+       if (ret < 0)
+               return ret;
+
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+                                 NR_PTREGS * sizeof(long));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       NR_PTREGS * sizeof(long), -1);
 }
 
-static inline unsigned long set_fpregs(struct task_struct *tsk,
-                                      struct fpu_state_struct *buf)
+/*
+ * update the contents of the MN10300 userspace general registers
+ */
+static int genregs_set(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      const void *kbuf, const void __user *ubuf)
 {
-       return __copy_from_user(&tsk->thread.fpu_state, buf,
-                               sizeof(struct fpu_state_struct));
+       struct pt_regs *regs = task_pt_regs(target);
+       unsigned long tmp;
+       int ret;
+
+       /* we need to skip regs->next */
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                regs, 0, PT_ORIG_D0 * sizeof(long));
+       if (ret < 0)
+               return ret;
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &regs->orig_d0, PT_ORIG_D0 * sizeof(long),
+                                PT_EPSW * sizeof(long));
+       if (ret < 0)
+               return ret;
+
+       /* we need to mask off changes to EPSW */
+       tmp = regs->epsw;
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &tmp, PT_EPSW * sizeof(long),
+                                PT_PC * sizeof(long));
+       tmp &= EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N | EPSW_FLAG_Z;
+       tmp |= regs->epsw & ~(EPSW_FLAG_V | EPSW_FLAG_C | EPSW_FLAG_N |
+                             EPSW_FLAG_Z);
+       regs->epsw = tmp;
+
+       if (ret < 0)
+               return ret;
+
+       /* and finally load the PC */
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &regs->pc, PT_PC * sizeof(long),
+                                NR_PTREGS * sizeof(long));
+
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                        NR_PTREGS * sizeof(long), -1);
 }
 
-static inline void fpsave_init(struct task_struct *task)
+/*
+ * retrieve the contents of MN10300 userspace FPU registers
+ */
+static int fpuregs_get(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      void *kbuf, void __user *ubuf)
 {
-       memset(&task->thread.fpu_state, 0, sizeof(struct fpu_state_struct));
+       const struct fpu_state_struct *fpregs = &target->thread.fpu_state;
+       int ret;
+
+       unlazy_fpu(target);
+
+       ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
+                                 fpregs, 0, sizeof(*fpregs));
+       if (ret < 0)
+               return ret;
+
+       return user_regset_copyout_zero(&pos, &count, &kbuf, &ubuf,
+                                       sizeof(*fpregs), -1);
 }
 
 /*
- * make sure the single step bit is not set
+ * update the contents of the MN10300 userspace FPU registers
  */
-void ptrace_disable(struct task_struct *child)
+static int fpuregs_set(struct task_struct *target,
+                      const struct user_regset *regset,
+                      unsigned int pos, unsigned int count,
+                      const void *kbuf, const void __user *ubuf)
+{
+       struct fpu_state_struct fpu_state = target->thread.fpu_state;
+       int ret;
+
+       ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
+                                &fpu_state, 0, sizeof(fpu_state));
+       if (ret < 0)
+               return ret;
+
+       fpu_kill_state(target);
+       target->thread.fpu_state = fpu_state;
+       set_using_fpu(target);
+
+       return user_regset_copyin_ignore(&pos, &count, &kbuf, &ubuf,
+                                        sizeof(fpu_state), -1);
+}
+
+/*
+ * determine if the FPU registers have actually been used
+ */
+static int fpuregs_active(struct task_struct *target,
+                         const struct user_regset *regset)
+{
+       return is_using_fpu(target) ? regset->n : 0;
+}
+
+/*
+ * Define the register sets available on the MN10300 under Linux
+ */
+enum mn10300_regset {
+       REGSET_GENERAL,
+       REGSET_FPU,
+};
+
+static const struct user_regset mn10300_regsets[] = {
+       /*
+        * General register format is:
+        *      A3, A2, D3, D2, MCVF, MCRL, MCRH, MDRQ
+        *      E1, E0, E7...E2, SP, LAR, LIR, MDR
+        *      A1, A0, D1, D0, ORIG_D0, EPSW, PC
+        */
+       [REGSET_GENERAL] = {
+               .core_note_type = NT_PRSTATUS,
+               .n              = ELF_NGREG,
+               .size           = sizeof(long),
+               .align          = sizeof(long),
+               .get            = genregs_get,
+               .set            = genregs_set,
+       },
+       /*
+        * FPU register format is:
+        *      FS0-31, FPCR
+        */
+       [REGSET_FPU] = {
+               .core_note_type = NT_PRFPREG,
+               .n              = sizeof(struct fpu_state_struct) / sizeof(long),
+               .size           = sizeof(long),
+               .align          = sizeof(long),
+               .get            = fpuregs_get,
+               .set            = fpuregs_set,
+               .active         = fpuregs_active,
+       },
+};
+
+static const struct user_regset_view user_mn10300_native_view = {
+       .name           = "mn10300",
+       .e_machine      = EM_MN10300,
+       .regsets        = mn10300_regsets,
+       .n              = ARRAY_SIZE(mn10300_regsets),
+};
+
+const struct user_regset_view *task_user_regset_view(struct task_struct *task)
+{
+       return &user_mn10300_native_view;
+}
+
+/*
+ * set the single-step bit
+ */
+void user_enable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
        struct user *dummy = NULL;
        long tmp;
 
        tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-       tmp &= ~EPSW_T;
+       tmp |= EPSW_T;
        put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
 /*
- * set the single step bit
+ * make sure the single-step bit is not set
  */
-void ptrace_enable(struct task_struct *child)
+void user_disable_single_step(struct task_struct *child)
 {
 #ifndef CONFIG_MN10300_USING_JTAG
        struct user *dummy = NULL;
        long tmp;
 
        tmp = get_stack_long(child, (unsigned long) &dummy->regs.epsw);
-       tmp |= EPSW_T;
+       tmp &= ~EPSW_T;
        put_stack_long(child, (unsigned long) &dummy->regs.epsw, tmp);
 #endif
 }
 
+void ptrace_disable(struct task_struct *child)
+{
+       user_disable_single_step(child);
+}
+
 /*
  * handle the arch-specific side of process tracing
  */
 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 {
-       struct fpu_state_struct fpu_state;
-       int i, ret;
+       unsigned long tmp;
+       int ret;
 
        switch (request) {
-       /* read the word at location addr. */
-       case PTRACE_PEEKTEXT: {
-               unsigned long tmp;
-               int copied;
-
-               copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-               ret = -EIO;
-               if (copied != sizeof(tmp))
-                       break;
-               ret = put_user(tmp, (unsigned long *) data);
-               break;
-       }
-
-       /* read the word at location addr. */
-       case PTRACE_PEEKDATA: {
-               unsigned long tmp;
-               int copied;
-
-               copied = access_process_vm(child, addr, &tmp, sizeof(tmp), 0);
-               ret = -EIO;
-               if (copied != sizeof(tmp))
-                       break;
-               ret = put_user(tmp, (unsigned long *) data);
-               break;
-       }
-
        /* read the word at location addr in the USER area. */
-       case PTRACE_PEEKUSR: {
-               unsigned long tmp;
-
+       case PTRACE_PEEKUSR:
                ret = -EIO;
                if ((addr & 3) || addr < 0 ||
                    addr > sizeof(struct user) - 3)
@@ -179,17 +315,6 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                                             ptrace_regid_to_frame[addr]);
                ret = put_user(tmp, (unsigned long *) data);
                break;
-       }
-
-       /* write the word at location addr. */
-       case PTRACE_POKETEXT:
-       case PTRACE_POKEDATA:
-               if (access_process_vm(child, addr, &data, sizeof(data), 1) ==
-                   sizeof(data))
-                       ret = 0;
-               else
-                       ret = -EIO;
-               break;
 
                /* write the word at location addr in the USER area */
        case PTRACE_POKEUSR:
@@ -204,132 +329,32 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
                                             data);
                break;
 
-               /* continue and stop at next (return from) syscall */
-       case PTRACE_SYSCALL:
-               /* restart after signal. */
-       case PTRACE_CONT:
-               ret = -EIO;
-               if ((unsigned long) data > _NSIG)
-                       break;
-               if (request == PTRACE_SYSCALL)
-                       set_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               else
-                       clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               child->exit_code = data;
-               ptrace_disable(child);
-               wake_up_process(child);
-               ret = 0;
-               break;
-
-               /*
-                * make the child exit
-                * - the best I can do is send it a sigkill
-                * - perhaps it should be put in the status that it wants to
-                *   exit
-                */
-       case PTRACE_KILL:
-               ret = 0;
-               if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
-                       break;
-               child->exit_code = SIGKILL;
-               clear_tsk_thread_flag(child, TIF_SINGLESTEP);
-               ptrace_disable(child);
-               wake_up_process(child);
-               break;
-
-       case PTRACE_SINGLESTEP: /* set the trap flag. */
-#ifndef CONFIG_MN10300_USING_JTAG
-               ret = -EIO;
-               if ((unsigned long) data > _NSIG)
-                       break;
-               clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
-               ptrace_enable(child);
-               child->exit_code = data;
-               wake_up_process(child);
-               ret = 0;
-#else
-               ret = -EINVAL;
-#endif
-               break;
-
-       case PTRACE_DETACH:     /* detach a process that was attached. */
-               ret = ptrace_detach(child, data);
-               break;
-
-               /* Get all gp regs from the child. */
-       case PTRACE_GETREGS: {
-               unsigned long tmp;
-
-               if (!access_ok(VERIFY_WRITE, (unsigned *) data, NR_PTREGS << 2)) {
-                       ret = -EIO;
-                       break;
-               }
-
-               for (i = 0; i < NR_PTREGS << 2; i += 4) {
-                       tmp = get_stack_long(child, ptrace_regid_to_frame[i]);
-                       __put_user(tmp, (unsigned long *) data);
-                       data += sizeof(tmp);
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_SETREGS: { /* Set all gp regs in the child. */
-               unsigned long tmp;
-
-               if (!access_ok(VERIFY_READ, (unsigned long *)data,
-                              sizeof(struct pt_regs))) {
-                       ret = -EIO;
-                       break;
-               }
-
-               for (i = 0; i < NR_PTREGS << 2; i += 4) {
-                       __get_user(tmp, (unsigned long *) data);
-                       put_stack_long(child, ptrace_regid_to_frame[i], tmp);
-                       data += sizeof(tmp);
-               }
-               ret = 0;
-               break;
-       }
-
-       case PTRACE_GETFPREGS: { /* Get the child FPU state. */
-               if (is_using_fpu(child)) {
-                       unlazy_fpu(child);
-                       fpu_state = child->thread.fpu_state;
-               } else {
-                       memset(&fpu_state, 0, sizeof(fpu_state));
-               }
-
-               ret = -EIO;
-               if (copy_to_user((void *) data, &fpu_state,
-                                sizeof(fpu_state)) == 0)
-                       ret = 0;
-               break;
-       }
-
-       case PTRACE_SETFPREGS: { /* Set the child FPU state. */
-               ret = -EFAULT;
-               if (copy_from_user(&fpu_state, (const void *) data,
-                                  sizeof(fpu_state)) == 0) {
-                       fpu_kill_state(child);
-                       child->thread.fpu_state = fpu_state;
-                       set_using_fpu(child);
-                       ret = 0;
-               }
-               break;
-       }
-
-       case PTRACE_SETOPTIONS: {
-               if (data & PTRACE_O_TRACESYSGOOD)
-                       child->ptrace |= PT_TRACESYSGOOD;
-               else
-                       child->ptrace &= ~PT_TRACESYSGOOD;
-               ret = 0;
-               break;
-       }
+       case PTRACE_GETREGS:    /* Get all integer regs from the child. */
+               return copy_regset_to_user(child, &user_mn10300_native_view,
+                                          REGSET_GENERAL,
+                                          0, NR_PTREGS * sizeof(long),
+                                          (void __user *)data);
+
+       case PTRACE_SETREGS:    /* Set all integer regs in the child. */
+               return copy_regset_from_user(child, &user_mn10300_native_view,
+                                            REGSET_GENERAL,
+                                            0, NR_PTREGS * sizeof(long),
+                                            (const void __user *)data);
+
+       case PTRACE_GETFPREGS:  /* Get the child FPU state. */
+               return copy_regset_to_user(child, &user_mn10300_native_view,
+                                          REGSET_FPU,
+                                          0, sizeof(struct fpu_state_struct),
+                                          (void __user *)data);
+
+       case PTRACE_SETFPREGS:  /* Set the child FPU state. */
+               return copy_regset_from_user(child, &user_mn10300_native_view,
+                                            REGSET_FPU,
+                                            0, sizeof(struct fpu_state_struct),
+                                            (const void __user *)data);
 
        default:
-               ret = -EIO;
+               ret = ptrace_request(child, request, addr, data);
                break;
        }
 
@@ -337,43 +362,26 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
 }
 
 /*
- * notification of system call entry/exit
- * - triggered by current->work.syscall_trace
+ * handle tracing of system call entry
+ * - return the revised system call number or ULONG_MAX to cause ENOSYS
  */
-asmlinkage void do_syscall_trace(struct pt_regs *regs, int entryexit)
+asmlinkage unsigned long syscall_trace_entry(struct pt_regs *regs)
 {
-#if 0
-       /* just in case... */
-       printk(KERN_DEBUG "[%d] syscall_%lu(%lx,%lx,%lx,%lx) = %lx\n",
-              current->pid,
-              regs->orig_d0,
-              regs->a0,
-              regs->d1,
-              regs->a3,
-              regs->a2,
-              regs->d0);
-       return;
-#endif
-
-       if (!test_thread_flag(TIF_SYSCALL_TRACE) &&
-           !test_thread_flag(TIF_SINGLESTEP))
-               return;
-       if (!(current->ptrace & PT_PTRACED))
-               return;
+       if (tracehook_report_syscall_entry(regs))
+               /* tracing decided this syscall should not happen, so
+                * we'll return a bogus call number to get an ENOSYS
+                * error, but leave the original number in
+                * regs->orig_d0
+                */
+               return ULONG_MAX;
 
-       /* the 0x80 provides a way for the tracing parent to distinguish
-          between a syscall stop and SIGTRAP delivery */
-       ptrace_notify(SIGTRAP |
-                     ((current->ptrace & PT_TRACESYSGOOD) &&
-                      !test_thread_flag(TIF_SINGLESTEP) ? 0x80 : 0));
+       return regs->orig_d0;
+}
 
-       /*
-        * this isn't the same as continuing with a signal, but it will do
-        * for normal use.  strace only continues with a signal if the
-        * stopping signal is not SIGTRAP.  -brl
-        */
-       if (current->exit_code) {
-               send_sig(current->exit_code, current, 1);
-               current->exit_code = 0;
-       }
+/*
+ * handle tracing of system call exit
+ */
+asmlinkage void syscall_trace_exit(struct pt_regs *regs)
+{
+       tracehook_report_syscall_exit(regs, 0);
 }
index 841ca9955a1813836ef2d07d0b7dfbc03e335347..9f7572a0f5784a30d9ff16bbcfa407d10fc8a066 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/tty.h>
 #include <linux/personality.h>
 #include <linux/suspend.h>
+#include <linux/tracehook.h>
 #include <asm/cacheflush.h>
 #include <asm/ucontext.h>
 #include <asm/uaccess.h>
@@ -511,6 +512,9 @@ static void do_signal(struct pt_regs *regs)
                         * clear the TIF_RESTORE_SIGMASK flag */
                        if (test_thread_flag(TIF_RESTORE_SIGMASK))
                                clear_thread_flag(TIF_RESTORE_SIGMASK);
+
+                       tracehook_signal_handler(signr, &info, &ka, regs,
+                                                test_thread_flag(TIF_SINGLESTEP));
                }
 
                return;
@@ -561,4 +565,9 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, u32 thread_info_flags)
        /* deal with pending signal delivery */
        if (thread_info_flags & (_TIF_SIGPENDING | _TIF_RESTORE_SIGMASK))
                do_signal(regs);
+
+       if (thread_info_flags & _TIF_NOTIFY_RESUME) {
+               clear_thread_flag(TIF_NOTIFY_RESUME);
+               tracehook_notify_resume(__frame);
+       }
 }
index 789208094e985ec6a02f25c666492b5735c58bb9..7095147dcb8ba2f83bbad869b7b2e4366dd4a6a5 100644 (file)
@@ -165,24 +165,6 @@ ENTRY(itlb_aerror)
 ENTRY(dtlb_aerror)
        and     ~EPSW_NMID,epsw
        add     -4,sp
-       mov     d1,(sp)
-
-       movhu   (MMUFCR_DFC),d1                 # is it the initial valid write
-                                               # to this page?
-       and     MMUFCR_xFC_INITWR,d1
-       beq     dtlb_pagefault                  # jump if not
-
-       mov     (DPTEL),d1                      # set the dirty bit
-                                               # (don't replace with BSET!)
-       or      _PAGE_DIRTY,d1
-       mov     d1,(DPTEL)
-       mov     (sp),d1
-       add     4,sp
-       rti
-
-       ALIGN
-dtlb_pagefault:
-       mov     (sp),d1
        SAVE_ALL
        add     -4,sp                           # need to pass three params
 
index 4ea4229d765ccc0657148d4f2268931ea3d6b8f9..8007f1e6572986559789a2520603a77336c72214 100644 (file)
@@ -130,15 +130,17 @@ int cpu_check_affinity(unsigned int irq, const struct cpumask *dest)
        return cpu_dest;
 }
 
-static void cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
+static int cpu_set_affinity_irq(unsigned int irq, const struct cpumask *dest)
 {
        int cpu_dest;
 
        cpu_dest = cpu_check_affinity(irq, dest);
        if (cpu_dest < 0)
-               return;
+               return -1;
 
        cpumask_copy(&irq_desc[irq].affinity, dest);
+
+       return 0;
 }
 #endif
 
index 80b513449f4c44d3c98915efe233c01caa382f4d..be3581a8c294c8de4d083b1b2c5710381a35c2d9 100644 (file)
@@ -333,7 +333,7 @@ static void xics_eoi_lpar(unsigned int virq)
        lpar_xirr_info_set((0xff << 24) | irq);
 }
 
-static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
+static int xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 {
        unsigned int irq;
        int status;
@@ -342,14 +342,14 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
 
        irq = (unsigned int)irq_map[virq].hwirq;
        if (irq == XICS_IPI || irq == XICS_IRQ_SPURIOUS)
-               return;
+               return -1;
 
        status = rtas_call(ibm_get_xive, 1, 3, xics_status, irq);
 
        if (status) {
                printk(KERN_ERR "%s: ibm,get-xive irq=%u returns %d\n",
                        __func__, irq, status);
-               return;
+               return -1;
        }
 
        /*
@@ -363,7 +363,7 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
                printk(KERN_WARNING
                        "%s: No online cpus in the mask %s for irq %d\n",
                        __func__, cpulist, virq);
-               return;
+               return -1;
        }
 
        status = rtas_call(ibm_set_xive, 3, 1, NULL,
@@ -372,8 +372,10 @@ static void xics_set_affinity(unsigned int virq, const struct cpumask *cpumask)
        if (status) {
                printk(KERN_ERR "%s: ibm,set-xive irq=%u returns %d\n",
                        __func__, irq, status);
-               return;
+               return -1;
        }
+
+       return 0;
 }
 
 static struct irq_chip xics_pic_direct = {
index 0efc12d1a3d77f98716f500c5ca12f9f923e3e9d..352d8c3ef5269c97e548789edccc49c420214a71 100644 (file)
@@ -807,7 +807,7 @@ static void mpic_end_ipi(unsigned int irq)
 
 #endif /* CONFIG_SMP */
 
-void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
+int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 {
        struct mpic *mpic = mpic_from_irq(irq);
        unsigned int src = mpic_irq_to_hw(irq);
@@ -824,6 +824,8 @@ void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask)
                mpic_irq_write(src, MPIC_INFO(IRQ_DESTINATION),
                               mpic_physmask(cpus_addr(tmp)[0]));
        }
+
+       return 0;
 }
 
 static unsigned int mpic_type_to_vecpri(struct mpic *mpic, unsigned int type)
index 3cef2af10f4254daea1ff69769117ffb1f8a23b3..eff433c322a0c55632b7d413acfddc18ed5cbaa8 100644 (file)
@@ -36,6 +36,6 @@ static inline int mpic_pasemi_msi_init(struct mpic *mpic)
 
 extern int mpic_set_irq_type(unsigned int virq, unsigned int flow_type);
 extern void mpic_set_vector(unsigned int virq, unsigned int vector);
-extern void mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
+extern int mpic_set_affinity(unsigned int irq, const struct cpumask *cpumask);
 
 #endif /* _POWERPC_SYSDEV_MPIC_H */
index 639ac805448ab500497270013b9176510849062a..65865726b28319238b16f109a032510a14b624e5 100644 (file)
@@ -102,8 +102,8 @@ struct thread_info {
 #define TI_KERN_CNTD1  0x00000488
 #define TI_PCR         0x00000490
 #define TI_RESTART_BLOCK 0x00000498
-#define TI_KUNA_REGS   0x000004c0
-#define TI_KUNA_INSN   0x000004c8
+#define TI_KUNA_REGS   0x000004c8
+#define TI_KUNA_INSN   0x000004d0
 #define TI_FPREGS      0x00000500
 
 /* We embed this in the uppermost byte of thread_info->flags */
index 5deabe921a47b6ebdcd5f6371faf7eaab3c135e4..e5e78f9cfc95d7104ede2ab767fdd1e5401f623c 100644 (file)
@@ -318,10 +318,12 @@ static void sun4u_irq_enable(unsigned int virt_irq)
        }
 }
 
-static void sun4u_set_affinity(unsigned int virt_irq,
+static int sun4u_set_affinity(unsigned int virt_irq,
                               const struct cpumask *mask)
 {
        sun4u_irq_enable(virt_irq);
+
+       return 0;
 }
 
 /* Don't do anything.  The desc->status check for IRQ_DISABLED in
@@ -377,7 +379,7 @@ static void sun4v_irq_enable(unsigned int virt_irq)
                       ino, err);
 }
 
-static void sun4v_set_affinity(unsigned int virt_irq,
+static int sun4v_set_affinity(unsigned int virt_irq,
                               const struct cpumask *mask)
 {
        unsigned int ino = virt_irq_table[virt_irq].dev_ino;
@@ -388,6 +390,8 @@ static void sun4v_set_affinity(unsigned int virt_irq,
        if (err != HV_EOK)
                printk(KERN_ERR "sun4v_intr_settarget(%x,%lu): "
                       "err(%d)\n", ino, cpuid, err);
+
+       return 0;
 }
 
 static void sun4v_irq_disable(unsigned int virt_irq)
@@ -445,7 +449,7 @@ static void sun4v_virq_enable(unsigned int virt_irq)
                       dev_handle, dev_ino, err);
 }
 
-static void sun4v_virt_set_affinity(unsigned int virt_irq,
+static int sun4v_virt_set_affinity(unsigned int virt_irq,
                                    const struct cpumask *mask)
 {
        unsigned long cpuid, dev_handle, dev_ino;
@@ -461,6 +465,8 @@ static void sun4v_virt_set_affinity(unsigned int virt_irq,
                printk(KERN_ERR "sun4v_vintr_set_target(%lx,%lx,%lu): "
                       "err(%d)\n",
                       dev_handle, dev_ino, cpuid, err);
+
+       return 0;
 }
 
 static void sun4v_virq_disable(unsigned int virt_irq)
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
new file mode 100644 (file)
index 0000000..ad8ec35
--- /dev/null
@@ -0,0 +1,16 @@
+
+obj-$(CONFIG_KVM) += kvm/
+
+# Xen paravirtualization support
+obj-$(CONFIG_XEN) += xen/
+
+# lguest paravirtualization support
+obj-$(CONFIG_LGUEST_GUEST) += lguest/
+
+obj-y += kernel/
+obj-y += mm/
+
+obj-y += crypto/
+obj-y += vdso/
+obj-$(CONFIG_IA32_EMULATION) += ia32/
+
index a6efe0a2e9ae613a81bedec5e4772698d16541cc..aafae3b140de40269abd0d9e02791853dc5cd1e6 100644 (file)
@@ -47,6 +47,11 @@ config X86
        select HAVE_KERNEL_BZIP2
        select HAVE_KERNEL_LZMA
 
+config OUTPUT_FORMAT
+       string
+       default "elf32-i386" if X86_32
+       default "elf64-x86-64" if X86_64
+
 config ARCH_DEFCONFIG
        string
        default "arch/x86/configs/i386_defconfig" if X86_32
@@ -274,15 +279,9 @@ config SPARSE_IRQ
 
          If you don't know what to do here, say N.
 
-config NUMA_MIGRATE_IRQ_DESC
-       bool "Move irq desc when changing irq smp_affinity"
+config NUMA_IRQ_DESC
+       def_bool y
        depends on SPARSE_IRQ && NUMA
-       depends on BROKEN
-       default n
-       ---help---
-         This enables moving irq_desc to cpu/node that irq will use handled.
-
-         If you don't know what to do here, say N.
 
 config X86_MPPARSE
        bool "Enable MPS table" if ACPI
@@ -355,7 +354,7 @@ config X86_UV
        depends on X86_64
        depends on X86_EXTENDED_PLATFORM
        depends on NUMA
-       select X86_X2APIC
+       depends on X86_X2APIC
        ---help---
          This option is needed in order to support SGI Ultraviolet systems.
          If you don't have one of these, you should say N here.
@@ -1466,9 +1465,7 @@ config KEXEC_JUMP
 
 config PHYSICAL_START
        hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
-       default "0x1000000" if X86_NUMAQ
-       default "0x200000" if X86_64
-       default "0x100000"
+       default "0x1000000"
        ---help---
          This gives the physical address where the kernel is loaded.
 
@@ -1487,15 +1484,15 @@ config PHYSICAL_START
          to be specifically compiled to run from a specific memory area
          (normally a reserved region) and this option comes handy.
 
-         So if you are using bzImage for capturing the crash dump, leave
-         the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y.
-         Otherwise if you plan to use vmlinux for capturing the crash dump
-         change this value to start of the reserved region (Typically 16MB
-         0x1000000). In other words, it can be set based on the "X" value as
-         specified in the "crashkernel=YM@XM" command line boot parameter
-         passed to the panic-ed kernel. Typically this parameter is set as
-         crashkernel=64M@16M. Please take a look at
-         Documentation/kdump/kdump.txt for more details about crash dumps.
+         So if you are using bzImage for capturing the crash dump,
+         leave the value here unchanged to 0x1000000 and set
+         CONFIG_RELOCATABLE=y.  Otherwise if you plan to use vmlinux
+         for capturing the crash dump change this value to start of
+         the reserved region.  In other words, it can be set based on
+         the "X" value as specified in the "crashkernel=YM@XM"
+         command line boot parameter passed to the panic-ed
+         kernel. Please take a look at Documentation/kdump/kdump.txt
+         for more details about crash dumps.
 
          Usage of bzImage for capturing the crash dump is recommended as
          one does not have to build two kernels. Same kernel can be used
@@ -1508,8 +1505,8 @@ config PHYSICAL_START
          Don't change this unless you know what you are doing.
 
 config RELOCATABLE
-       bool "Build a relocatable kernel (EXPERIMENTAL)"
-       depends on EXPERIMENTAL
+       bool "Build a relocatable kernel"
+       default y
        ---help---
          This builds a kernel image that retains relocation information
          so it can be loaded someplace besides the default 1MB.
@@ -1524,12 +1521,16 @@ config RELOCATABLE
          it has been loaded at and the compile time physical address
          (CONFIG_PHYSICAL_START) is ignored.
 
+# Relocation on x86-32 needs some additional build support
+config X86_NEED_RELOCS
+       def_bool y
+       depends on X86_32 && RELOCATABLE
+
 config PHYSICAL_ALIGN
        hex
        prompt "Alignment value to which kernel should be aligned" if X86_32
-       default "0x100000" if X86_32
-       default "0x200000" if X86_64
-       range 0x2000 0x400000
+       default "0x1000000"
+       range 0x2000 0x1000000
        ---help---
          This value puts the alignment restrictions on physical address
          where kernel is loaded and run from. Kernel is compiled for an
index d8359e73317f8dfb8b929f15f555f5593469e7fd..d105f29bb6bb7c9b75fe3369d66f68f2f3ada5ba 100644 (file)
@@ -159,14 +159,30 @@ config IOMMU_DEBUG
          options. See Documentation/x86_64/boot-options.txt for more
          details.
 
+config IOMMU_STRESS
+       bool "Enable IOMMU stress-test mode"
+       ---help---
+         This option disables various optimizations in IOMMU related
+         code to do real stress testing of the IOMMU code. This option
+         will cause a performance drop and should only be enabled for
+         testing.
+
 config IOMMU_LEAK
        bool "IOMMU leak tracing"
-       depends on DEBUG_KERNEL
-       depends on IOMMU_DEBUG
+       depends on IOMMU_DEBUG && DMA_API_DEBUG
        ---help---
          Add a simple leak tracer to the IOMMU code. This is useful when you
          are debugging a buggy device driver that leaks IOMMU mappings.
 
+config X86_DS_SELFTEST
+    bool "DS selftest"
+    default y
+    depends on DEBUG_KERNEL
+    depends on X86_DS
+       ---help---
+         Perform Debug Store selftests at boot time.
+         If in doubt, say "N".
+
 config HAVE_MMIOTRACE_SUPPORT
        def_bool y
 
index 8c86b72afdc2d39bc8957692e35335ebabae82bc..edbd0ca620678fd6627c60d4849a07852a823b11 100644 (file)
@@ -7,8 +7,6 @@ else
         KBUILD_DEFCONFIG := $(ARCH)_defconfig
 endif
 
-core-$(CONFIG_KVM) += arch/x86/kvm/
-
 # BITS is used as extension for files which are available in a 32 bit
 # and a 64 bit version to simplify shared Makefiles.
 # e.g.: obj-y += foo_$(BITS).o
@@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o
 
 libs-y  += arch/x86/lib/
 
-# Sub architecture files that needs linking first
-core-y += $(fcore-y)
-
-# Xen paravirtualization support
-core-$(CONFIG_XEN) += arch/x86/xen/
-
-# lguest paravirtualization support
-core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/
-
-core-y += arch/x86/kernel/
-core-y += arch/x86/mm/
-
-core-y += arch/x86/crypto/
-core-y += arch/x86/vdso/
-core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/
+# See arch/x86/Kbuild for content of core part of the kernel
+core-y += arch/x86/
 
 # drivers-y are linked after core-y
 drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/
index 172cf8a98bdd2541e0d930e2a92903f5f8f9d448..851fe936d2421a14d7a558e2946321503d73a5b6 100644 (file)
@@ -3,6 +3,8 @@ bzImage
 cpustr.h
 mkcpustr
 offsets.h
+voffset.h
+zoffset.h
 setup
 setup.bin
 setup.elf
index 6633b6e7505a68cc32ebc198f9ec6cd46cd0d9c3..8d16ada250480cb57168ea8b600c920f7c1f5192 100644 (file)
@@ -26,9 +26,10 @@ targets              := vmlinux.bin setup.bin setup.elf bzImage
 targets                += fdimage fdimage144 fdimage288 image.iso mtools.conf
 subdir-                := compressed
 
-setup-y                += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o
+setup-y                += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o
 setup-y                += header.o main.o mca.o memory.o pm.o pmjump.o
-setup-y                += printf.o string.o tty.o video.o video-mode.o version.o
+setup-y                += printf.o regs.o string.o tty.o video.o video-mode.o
+setup-y                += version.o
 setup-$(CONFIG_X86_APM_BOOT) += apm.o
 
 # The link order of the video-*.o modules can matter.  In particular,
@@ -86,19 +87,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE
 
 SETUP_OBJS = $(addprefix $(obj)/,$(setup-y))
 
-sed-offsets := -e 's/^00*/0/' \
-        -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p'
+sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p'
 
-quiet_cmd_offsets = OFFSETS $@
-      cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@
+quiet_cmd_voffset = VOFFSET $@
+      cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@
 
-$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE
-       $(call if_changed,offsets)
+targets += voffset.h
+$(obj)/voffset.h: vmlinux FORCE
+       $(call if_changed,voffset)
+
+sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p'
+
+quiet_cmd_zoffset = ZOFFSET $@
+      cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@
+
+targets += zoffset.h
+$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE
+       $(call if_changed,zoffset)
 
-targets += offsets.h
 
 AFLAGS_header.o += -I$(obj)
-$(obj)/header.o: $(obj)/offsets.h
+$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h
 
 LDFLAGS_setup.elf      := -T
 $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE
index 7c19ce8c2442c219a1e85ee22798a8abaa0bdf2c..64a31a6d751a0132b9b09ea71da1b31b5ec955d8 100644 (file)
@@ -2,7 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007-2008 rPath, Inc. - All Rights Reserved
- *   Copyright 2009 Intel Corporation
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -90,8 +90,11 @@ static int a20_test_long(void)
 
 static void enable_a20_bios(void)
 {
-       asm volatile("pushfl; int $0x15; popfl"
-                    : : "a" ((u16)0x2401));
+       struct biosregs ireg;
+
+       initregs(&ireg);
+       ireg.ax = 0x2401;
+       intcall(0x15, &ireg, NULL);
 }
 
 static void enable_a20_kbc(void)
index 7aa6033001f9ace30ef1a4ffdb52bfd629a98f26..ee274834ea8ba35cf0a804d8be7cf07fa0077993 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   Original APM BIOS checking by Stephen Rothwell, May 1994
  *   (sfr@canb.auug.org.au)
 
 int query_apm_bios(void)
 {
-       u16 ax, bx, cx, dx, di;
-       u32 ebx, esi;
-       u8 err;
+       struct biosregs ireg, oreg;
 
        /* APM BIOS installation check */
-       ax = 0x5300;
-       bx = cx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-                    : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-                    : : "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x53;
+       intcall(0x15, &ireg, &oreg);
 
-       if (err)
+       if (oreg.flags & X86_EFLAGS_CF)
                return -1;              /* No APM BIOS */
 
-       if (bx != 0x504d)       /* "PM" signature */
+       if (oreg.bx != 0x504d)          /* "PM" signature */
                return -1;
 
-       if (!(cx & 0x02))               /* 32 bits supported? */
+       if (!(oreg.cx & 0x02))          /* 32 bits supported? */
                return -1;
 
        /* Disconnect first, just in case */
-       ax = 0x5304;
-       bx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-                    : "+a" (ax), "+b" (bx)
-                    : : "ecx", "edx", "esi", "edi");
-
-       /* Paranoia */
-       ebx = esi = 0;
-       cx = dx = di = 0;
+       ireg.al = 0x04;
+       intcall(0x15, &ireg, NULL);
 
        /* 32-bit connect */
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6"
-                    : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx),
-                      "+S" (esi), "+D" (di), "=m" (err)
-                    : "a" (0x5303));
-
-       boot_params.apm_bios_info.cseg = ax;
-       boot_params.apm_bios_info.offset = ebx;
-       boot_params.apm_bios_info.cseg_16 = cx;
-       boot_params.apm_bios_info.dseg = dx;
-       boot_params.apm_bios_info.cseg_len = (u16)esi;
-       boot_params.apm_bios_info.cseg_16_len = esi >> 16;
-       boot_params.apm_bios_info.dseg_len = di;
-
-       if (err)
+       ireg.al = 0x03;
+       intcall(0x15, &ireg, &oreg);
+
+       boot_params.apm_bios_info.cseg        = oreg.ax;
+       boot_params.apm_bios_info.offset      = oreg.ebx;
+       boot_params.apm_bios_info.cseg_16     = oreg.cx;
+       boot_params.apm_bios_info.dseg        = oreg.dx;
+       boot_params.apm_bios_info.cseg_len    = oreg.si;
+       boot_params.apm_bios_info.cseg_16_len = oreg.hsi;
+       boot_params.apm_bios_info.dseg_len    = oreg.di;
+
+       if (oreg.flags & X86_EFLAGS_CF)
                return -1;
 
        /* Redo the installation check as the 32-bit connect;
           some BIOSes return different flags this way... */
 
-       ax = 0x5300;
-       bx = cx = 0;
-       asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0"
-                    : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx)
-                    : : "esi", "edi");
+       ireg.al = 0x00;
+       intcall(0x15, &ireg, &oreg);
 
-       if (err || bx != 0x504d) {
+       if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) {
                /* Failure with 32-bit connect, try to disconect and ignore */
-               ax = 0x5304;
-               bx = 0;
-               asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp"
-                            : "+a" (ax), "+b" (bx)
-                            : : "ecx", "edx", "esi", "edi");
+               ireg.al = 0x04;
+               intcall(0x15, &ireg, NULL);
                return -1;
        }
 
-       boot_params.apm_bios_info.version = ax;
-       boot_params.apm_bios_info.flags = cx;
+       boot_params.apm_bios_info.version = oreg.ax;
+       boot_params.apm_bios_info.flags   = oreg.cx;
        return 0;
 }
 
diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S
new file mode 100644 (file)
index 0000000..5077937
--- /dev/null
@@ -0,0 +1,82 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * "Glove box" for BIOS calls.  Avoids the constant problems with BIOSes
+ * touching registers they shouldn't be.
+ */
+
+       .code16
+       .text
+       .globl  intcall
+       .type   intcall, @function
+intcall:
+       /* Self-modify the INT instruction.  Ugly, but works. */
+       cmpb    %al, 3f
+       je      1f
+       movb    %al, 3f
+       jmp     1f              /* Synchronize pipeline */
+1:
+       /* Save state */
+       pushfl
+       pushw   %fs
+       pushw   %gs
+       pushal
+
+       /* Copy input state to stack frame */
+       subw    $44, %sp
+       movw    %dx, %si
+       movw    %sp, %di
+       movw    $11, %cx
+       rep; movsd
+
+       /* Pop full state from the stack */
+       popal
+       popw    %gs
+       popw    %fs
+       popw    %es
+       popw    %ds
+       popfl
+
+       /* Actual INT */
+       .byte   0xcd            /* INT opcode */
+3:     .byte   0
+
+       /* Push full state to the stack */
+       pushfl
+       pushw   %ds
+       pushw   %es
+       pushw   %fs
+       pushw   %gs
+       pushal
+
+       /* Re-establish C environment invariants */
+       cld
+       movzwl  %sp, %esp
+       movw    %cs, %ax
+       movw    %ax, %ds
+       movw    %ax, %es
+
+       /* Copy output state from stack frame */
+       movw    68(%esp), %di   /* Original %cx == 3rd argument */
+       andw    %di, %di
+       jz      4f
+       movw    %sp, %si
+       movw    $11, %cx
+       rep; movsd
+4:     addw    $44, %sp
+
+       /* Restore state and return */
+       popal
+       popw    %gs
+       popw    %fs
+       popfl
+       retl
+       .size   intcall, .-intcall
index 7b2692e897e5fcd75f53073dcf543665d838eb6b..98239d2658f27b3ed3494ea1b0ad3d6cd0a01624 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -26,6 +27,7 @@
 #include <asm/setup.h>
 #include "bitops.h"
 #include <asm/cpufeature.h>
+#include <asm/processor-flags.h>
 
 /* Useful macros */
 #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)]))
@@ -241,6 +243,49 @@ int enable_a20(void);
 /* apm.c */
 int query_apm_bios(void);
 
+/* bioscall.c */
+struct biosregs {
+       union {
+               struct {
+                       u32 edi;
+                       u32 esi;
+                       u32 ebp;
+                       u32 _esp;
+                       u32 ebx;
+                       u32 edx;
+                       u32 ecx;
+                       u32 eax;
+                       u32 _fsgs;
+                       u32 _dses;
+                       u32 eflags;
+               };
+               struct {
+                       u16 di, hdi;
+                       u16 si, hsi;
+                       u16 bp, hbp;
+                       u16 _sp, _hsp;
+                       u16 bx, hbx;
+                       u16 dx, hdx;
+                       u16 cx, hcx;
+                       u16 ax, hax;
+                       u16 gs, fs;
+                       u16 es, ds;
+                       u16 flags, hflags;
+               };
+               struct {
+                       u8 dil, dih, edi2, edi3;
+                       u8 sil, sih, esi2, esi3;
+                       u8 bpl, bph, ebp2, ebp3;
+                       u8 _spl, _sph, _esp2, _esp3;
+                       u8 bl, bh, ebx2, ebx3;
+                       u8 dl, dh, edx2, edx3;
+                       u8 cl, ch, ecx2, ecx3;
+                       u8 al, ah, eax2, eax3;
+               };
+       };
+};
+void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg);
+
 /* cmdline.c */
 int cmdline_find_option(const char *option, char *buffer, int bufsize);
 int cmdline_find_option_bool(const char *option);
@@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...);
 int vsprintf(char *buf, const char *fmt, va_list args);
 int printf(const char *fmt, ...);
 
+/* regs.c */
+void initregs(struct biosregs *regs);
+
 /* string.c */
 int strcmp(const char *str1, const char *str2);
 size_t strnlen(const char *s, size_t maxlen);
index 63eff3b04d0181be4eb6163522989760d82e23b7..4a46fab7162e3b753d143eaedbbf1dcaabf8822d 100644 (file)
@@ -1,3 +1,6 @@
 relocs
 vmlinux.bin.all
 vmlinux.relocs
+vmlinux.lds
+mkpiggy
+piggy.S
index 65551c9f85718baf762d203329b6be42082a2b9c..49c8a4c37d7c3b29bc7f5fb4d0ea660aec6fdf11 100644 (file)
@@ -19,7 +19,9 @@ KBUILD_AFLAGS  := $(KBUILD_CFLAGS) -D__ASSEMBLY__
 LDFLAGS := -m elf_$(UTS_MACHINE)
 LDFLAGS_vmlinux := -T
 
-$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
+hostprogs-y    := mkpiggy
+
+$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE
        $(call if_changed,ld)
        @:
 
@@ -29,7 +31,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE
 
 
 targets += vmlinux.bin.all vmlinux.relocs relocs
-hostprogs-$(CONFIG_X86_32) += relocs
+hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs
 
 quiet_cmd_relocs = RELOCS  $@
       cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $<
@@ -37,46 +39,22 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE
        $(call if_changed,relocs)
 
 vmlinux.bin.all-y := $(obj)/vmlinux.bin
-vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs
-quiet_cmd_relocbin = BUILD   $@
-      cmd_relocbin = cat $(filter-out FORCE,$^) > $@
-$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE
-       $(call if_changed,relocbin)
-
-ifeq ($(CONFIG_X86_32),y)
+vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs
 
-ifdef CONFIG_RELOCATABLE
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE
-       $(call if_changed,lzma)
-else
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
+$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
        $(call if_changed,lzma)
-endif
-LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T
 
-else
+suffix-$(CONFIG_KERNEL_GZIP)   := gz
+suffix-$(CONFIG_KERNEL_BZIP2)  := bz2
+suffix-$(CONFIG_KERNEL_LZMA)   := lzma
 
-$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,gzip)
-$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,bzip2)
-$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE
-       $(call if_changed,lzma)
-
-LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T
-endif
+quiet_cmd_mkpiggy = MKPIGGY $@
+      cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
 
-suffix_$(CONFIG_KERNEL_GZIP)  = gz
-suffix_$(CONFIG_KERNEL_BZIP2) = bz2
-suffix_$(CONFIG_KERNEL_LZMA)  = lzma
-
-$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE
-       $(call if_changed,ld)
+targets += piggy.S
+$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE
+       $(call if_changed,mkpiggy)
index 3a8a866fb2e291e958478cb741a56d16b1016e5f..75e4f001e7061e5d3036197ac202dbee8e3d07a2 100644 (file)
  * the page directory. [According to comments etc elsewhere on a compressed
  * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC]
  *
- * Page 0 is deliberately kept safe, since System Management Mode code in 
+ * Page 0 is deliberately kept safe, since System Management Mode code in
  * laptops may need to access the BIOS data stored there.  This is also
- * useful for future device drivers that either access the BIOS via VM86 
+ * useful for future device drivers that either access the BIOS via VM86
  * mode.
  */
 
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.text
+       .text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/boot.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head","ax",@progbits
+       .section ".text.head","ax",@progbits
 ENTRY(startup_32)
        cld
-       /* test KEEP_SEGMENTS flag to see if the bootloader is asking
-        * us to not reload segments */
-       testb $(1<<6), BP_loadflags(%esi)
-       jnz 1f
+       /*
+        * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+        * us to not reload segments
+        */
+       testb   $(1<<6), BP_loadflags(%esi)
+       jnz     1f
 
        cli
-       movl $(__BOOT_DS),%eax
-       movl %eax,%ds
-       movl %eax,%es
-       movl %eax,%fs
-       movl %eax,%gs
-       movl %eax,%ss
+       movl    $__BOOT_DS, %eax
+       movl    %eax, %ds
+       movl    %eax, %es
+       movl    %eax, %fs
+       movl    %eax, %gs
+       movl    %eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-       leal (0x1e4+4)(%esi), %esp
-       call 1f
-1:     popl %ebp
-       subl $1b, %ebp
+       leal    (BP_scratch+4)(%esi), %esp
+       call    1f
+1:     popl    %ebp
+       subl    $1b, %ebp
 
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+/*
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
 
 #ifdef CONFIG_RELOCATABLE
-       movl    %ebp, %ebx
-       addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebx
-       andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx
+       movl    %ebp, %ebx
+       movl    BP_kernel_alignment(%esi), %eax
+       decl    %eax
+       addl    %eax, %ebx
+       notl    %eax
+       andl    %eax, %ebx
 #else
-       movl $LOAD_PHYSICAL_ADDR, %ebx
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
-       /* Replace the compressed data size with the uncompressed size */
-       subl input_len(%ebp), %ebx
-       movl output_len(%ebp), %eax
-       addl %eax, %ebx
-       /* Add 8 bytes for every 32K input block */
-       shrl $12, %eax
-       addl %eax, %ebx
-       /* Add 32K + 18 bytes of extra slack */
-       addl $(32768 + 18), %ebx
-       /* Align on a 4K boundary */
-       addl $4095, %ebx
-       andl $~4095, %ebx
-
-/* Copy the compressed kernel to the end of our buffer
+       /* Target address to relocate to for decompression */
+       addl    $z_extract_offset, %ebx
+
+       /* Set up the stack */
+       leal    boot_stack_end(%ebx), %esp
+
+       /* Zero EFLAGS */
+       pushl   $0
+       popfl
+
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-       pushl %esi
-       leal _end(%ebp), %esi
-       leal _end(%ebx), %edi
-       movl $(_end - startup_32), %ecx
+       pushl   %esi
+       leal    (_bss-4)(%ebp), %esi
+       leal    (_bss-4)(%ebx), %edi
+       movl    $(_bss - startup_32), %ecx
+       shrl    $2, %ecx
        std
-       rep
-       movsb
+       rep     movsl
        cld
-       popl %esi
-
-/* Compute the kernel start address.
- */
-#ifdef CONFIG_RELOCATABLE
-       addl    $(CONFIG_PHYSICAL_ALIGN - 1), %ebp
-       andl    $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp
-#else
-       movl    $LOAD_PHYSICAL_ADDR, %ebp
-#endif
+       popl    %esi
 
 /*
  * Jump to the relocated address.
  */
-       leal relocated(%ebx), %eax
-       jmp *%eax
+       leal    relocated(%ebx), %eax
+       jmp     *%eax
 ENDPROC(startup_32)
 
-.section ".text"
+       .text
 relocated:
 
 /*
- * Clear BSS
- */
-       xorl %eax,%eax
-       leal _edata(%ebx),%edi
-       leal _end(%ebx), %ecx
-       subl %edi,%ecx
-       cld
-       rep
-       stosb
-
-/*
- * Setup the stack for the decompressor
+ * Clear BSS (stack is currently empty)
  */
-       leal boot_stack_end(%ebx), %esp
+       xorl    %eax, %eax
+       leal    _bss(%ebx), %edi
+       leal    _ebss(%ebx), %ecx
+       subl    %edi, %ecx
+       shrl    $2, %ecx
+       rep     stosl
 
 /*
  * Do the decompression, and jump to the new kernel..
  */
-       movl output_len(%ebx), %eax
-       pushl %eax
-                       # push arguments for decompress_kernel:
-       pushl %ebp      # output address
-       movl input_len(%ebx), %eax
-       pushl %eax      # input_len
-       leal input_data(%ebx), %eax
-       pushl %eax      # input_data
-       leal boot_heap(%ebx), %eax
-       pushl %eax      # heap area
-       pushl %esi      # real mode pointer
-       call decompress_kernel
-       addl $20, %esp
-       popl %ecx
+       leal    z_extract_offset_negative(%ebx), %ebp
+                               /* push arguments for decompress_kernel: */
+       pushl   %ebp            /* output address */
+       pushl   $z_input_len    /* input_len */
+       leal    input_data(%ebx), %eax
+       pushl   %eax            /* input_data */
+       leal    boot_heap(%ebx), %eax
+       pushl   %eax            /* heap area */
+       pushl   %esi            /* real mode pointer */
+       call    decompress_kernel
+       addl    $20, %esp
 
 #if CONFIG_RELOCATABLE
-/* Find the address of the relocations.
+/*
+ * Find the address of the relocations.
  */
-       movl %ebp, %edi
-       addl %ecx, %edi
+       leal    z_output_len(%ebp), %edi
 
-/* Calculate the delta between where vmlinux was compiled to run
+/*
+ * Calculate the delta between where vmlinux was compiled to run
  * and where it was actually loaded.
  */
-       movl %ebp, %ebx
-       subl $LOAD_PHYSICAL_ADDR, %ebx
-       jz   2f         /* Nothing to be done if loaded at compiled addr. */
+       movl    %ebp, %ebx
+       subl    $LOAD_PHYSICAL_ADDR, %ebx
+       jz      2f      /* Nothing to be done if loaded at compiled addr. */
 /*
  * Process relocations.
  */
 
-1:     subl $4, %edi
-       movl 0(%edi), %ecx
-       testl %ecx, %ecx
-       jz 2f
-       addl %ebx, -__PAGE_OFFSET(%ebx, %ecx)
-       jmp 1b
+1:     subl    $4, %edi
+       movl    (%edi), %ecx
+       testl   %ecx, %ecx
+       jz      2f
+       addl    %ebx, -__PAGE_OFFSET(%ebx, %ecx)
+       jmp     1b
 2:
 #endif
 
 /*
  * Jump to the decompressed kernel.
  */
-       xorl %ebx,%ebx
-       jmp *%ebp
+       xorl    %ebx, %ebx
+       jmp     *%ebp
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+       .bss
+       .balign 4
 boot_heap:
        .fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
index ed4a8294800268c25667b5ede9bc42e2cdd229c9..f62c284db9eb5c94edf1df794b26e13ace0b4113 100644 (file)
@@ -21,8 +21,8 @@
 /*
  * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996
  */
-.code32
-.text
+       .code32
+       .text
 
 #include <linux/linkage.h>
 #include <asm/segment.h>
 #include <asm/processor-flags.h>
 #include <asm/asm-offsets.h>
 
-.section ".text.head"
+       .section ".text.head"
        .code32
 ENTRY(startup_32)
        cld
-       /* test KEEP_SEGMENTS flag to see if the bootloader is asking
-        * us to not reload segments */
+       /*
+        * Test KEEP_SEGMENTS flag to see if the bootloader is asking
+        * us to not reload segments
+        */
        testb $(1<<6), BP_loadflags(%esi)
        jnz 1f
 
@@ -49,14 +51,15 @@ ENTRY(startup_32)
        movl    %eax, %ss
 1:
 
-/* Calculate the delta between where we were compiled to run
+/*
+ * Calculate the delta between where we were compiled to run
  * at and where we were actually loaded at.  This can only be done
  * with a short local call on x86.  Nothing  else will tell us what
  * address we are running at.  The reserved chunk of the real-mode
  * data at 0x1e4 (defined as a scratch field) are used as the stack
  * for this calculation. Only 4 bytes are needed.
  */
-       leal    (0x1e4+4)(%esi), %esp
+       leal    (BP_scratch+4)(%esi), %esp
        call    1f
 1:     popl    %ebp
        subl    $1b, %ebp
@@ -70,32 +73,28 @@ ENTRY(startup_32)
        testl   %eax, %eax
        jnz     no_longmode
 
-/* Compute the delta between where we were compiled to run at
+/*
+ * Compute the delta between where we were compiled to run at
  * and where the code will actually run at.
- */
-/* %ebp contains the address we are loaded at by the boot loader and %ebx
+ *
+ * %ebp contains the address we are loaded at by the boot loader and %ebx
  * contains the address where we should move the kernel image temporarily
  * for safe in-place decompression.
  */
 
 #ifdef CONFIG_RELOCATABLE
        movl    %ebp, %ebx
-       addl    $(PMD_PAGE_SIZE -1), %ebx
-       andl    $PMD_PAGE_MASK, %ebx
+       movl    BP_kernel_alignment(%esi), %eax
+       decl    %eax
+       addl    %eax, %ebx
+       notl    %eax
+       andl    %eax, %ebx
 #else
-       movl    $CONFIG_PHYSICAL_START, %ebx
+       movl    $LOAD_PHYSICAL_ADDR, %ebx
 #endif
 
-       /* Replace the compressed data size with the uncompressed size */
-       subl    input_len(%ebp), %ebx
-       movl    output_len(%ebp), %eax
-       addl    %eax, %ebx
-       /* Add 8 bytes for every 32K input block */
-       shrl    $12, %eax
-       addl    %eax, %ebx
-       /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-       addl    $(32768 + 18 + 4095), %ebx
-       andl    $~4095, %ebx
+       /* Target address to relocate to for decompression */
+       addl    $z_extract_offset, %ebx
 
 /*
  * Prepare for entering 64 bit mode
@@ -114,7 +113,7 @@ ENTRY(startup_32)
  /*
   * Build early 4G boot pagetable
   */
-       /* Initialize Page tables to 0*/
+       /* Initialize Page tables to 0 */
        leal    pgtable(%ebx), %edi
        xorl    %eax, %eax
        movl    $((4096*6)/4), %ecx
@@ -155,7 +154,8 @@ ENTRY(startup_32)
        btsl    $_EFER_LME, %eax
        wrmsr
 
-       /* Setup for the jump to 64bit mode
+       /*
+        * Setup for the jump to 64bit mode
         *
         * When the jump is performend we will be in long mode but
         * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1
@@ -184,7 +184,8 @@ no_longmode:
 
 #include "../../kernel/verify_cpu_64.S"
 
-       /* Be careful here startup_64 needs to be at a predictable
+       /*
+        * Be careful here startup_64 needs to be at a predictable
         * address so I can export it in an ELF header.  Bootloaders
         * should look at the ELF header to find this address, as
         * it may change in the future.
@@ -192,7 +193,8 @@ no_longmode:
        .code64
        .org 0x200
 ENTRY(startup_64)
-       /* We come here either from startup_32 or directly from a
+       /*
+        * We come here either from startup_32 or directly from a
         * 64bit bootloader.  If we come here from a bootloader we depend on
         * an identity mapped page table being provied that maps our
         * entire text+data+bss and hopefully all of memory.
@@ -209,50 +211,54 @@ ENTRY(startup_64)
        movl    $0x20, %eax
        ltr     %ax
 
-       /* Compute the decompressed kernel start address.  It is where
+       /*
+        * Compute the decompressed kernel start address.  It is where
         * we were loaded at aligned to a 2M boundary. %rbp contains the
         * decompressed kernel start address.
         *
         * If it is a relocatable kernel then decompress and run the kernel
         * from load address aligned to 2MB addr, otherwise decompress and
-        * run the kernel from CONFIG_PHYSICAL_START
+        * run the kernel from LOAD_PHYSICAL_ADDR
+        *
+        * We cannot rely on the calculation done in 32-bit mode, since we
+        * may have been invoked via the 64-bit entry point.
         */
 
        /* Start with the delta to where the kernel will run at. */
 #ifdef CONFIG_RELOCATABLE
        leaq    startup_32(%rip) /* - $startup_32 */, %rbp
-       addq    $(PMD_PAGE_SIZE - 1), %rbp
-       andq    $PMD_PAGE_MASK, %rbp
-       movq    %rbp, %rbx
+       movl    BP_kernel_alignment(%rsi), %eax
+       decl    %eax
+       addq    %rax, %rbp
+       notq    %rax
+       andq    %rax, %rbp
 #else
-       movq    $CONFIG_PHYSICAL_START, %rbp
-       movq    %rbp, %rbx
+       movq    $LOAD_PHYSICAL_ADDR, %rbp
 #endif
 
-       /* Replace the compressed data size with the uncompressed size */
-       movl    input_len(%rip), %eax
-       subq    %rax, %rbx
-       movl    output_len(%rip), %eax
-       addq    %rax, %rbx
-       /* Add 8 bytes for every 32K input block */
-       shrq    $12, %rax
-       addq    %rax, %rbx
-       /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */
-       addq    $(32768 + 18 + 4095), %rbx
-       andq    $~4095, %rbx
-
-/* Copy the compressed kernel to the end of our buffer
+       /* Target address to relocate to for decompression */
+       leaq    z_extract_offset(%rbp), %rbx
+
+       /* Set up the stack */
+       leaq    boot_stack_end(%rbx), %rsp
+
+       /* Zero EFLAGS */
+       pushq   $0
+       popfq
+
+/*
+ * Copy the compressed kernel to the end of our buffer
  * where decompression in place becomes safe.
  */
-       leaq    _end_before_pgt(%rip), %r8
-       leaq    _end_before_pgt(%rbx), %r9
-       movq    $_end_before_pgt /* - $startup_32 */, %rcx
-1:     subq    $8, %r8
-       subq    $8, %r9
-       movq    0(%r8), %rax
-       movq    %rax, 0(%r9)
-       subq    $8, %rcx
-       jnz     1b
+       pushq   %rsi
+       leaq    (_bss-8)(%rip), %rsi
+       leaq    (_bss-8)(%rbx), %rdi
+       movq    $_bss /* - $startup_32 */, %rcx
+       shrq    $3, %rcx
+       std
+       rep     movsq
+       cld
+       popq    %rsi
 
 /*
  * Jump to the relocated address.
@@ -260,37 +266,28 @@ ENTRY(startup_64)
        leaq    relocated(%rbx), %rax
        jmp     *%rax
 
-.section ".text"
+       .text
 relocated:
 
 /*
- * Clear BSS
+ * Clear BSS (stack is currently empty)
  */
-       xorq    %rax, %rax
-       leaq    _edata(%rbx), %rdi
-       leaq    _end_before_pgt(%rbx), %rcx
+       xorl    %eax, %eax
+       leaq    _bss(%rip), %rdi
+       leaq    _ebss(%rip), %rcx
        subq    %rdi, %rcx
-       cld
-       rep
-       stosb
-
-       /* Setup the stack */
-       leaq    boot_stack_end(%rip), %rsp
-
-       /* zero EFLAGS after setting rsp */
-       pushq   $0
-       popfq
+       shrq    $3, %rcx
+       rep     stosq
 
 /*
  * Do the decompression, and jump to the new kernel..
  */
-       pushq   %rsi                    # Save the real mode argument
-       movq    %rsi, %rdi              # real mode address
-       leaq    boot_heap(%rip), %rsi   # malloc area for uncompression
-       leaq    input_data(%rip), %rdx  # input_data
-       movl    input_len(%rip), %eax
-       movq    %rax, %rcx              # input_len
-       movq    %rbp, %r8               # output
+       pushq   %rsi                    /* Save the real mode argument */
+       movq    %rsi, %rdi              /* real mode address */
+       leaq    boot_heap(%rip), %rsi   /* malloc area for uncompression */
+       leaq    input_data(%rip), %rdx  /* input_data */
+       movl    $z_input_len, %ecx      /* input_len */
+       movq    %rbp, %r8               /* output target address */
        call    decompress_kernel
        popq    %rsi
 
@@ -311,11 +308,21 @@ gdt:
        .quad   0x0000000000000000      /* TS continued */
 gdt_end:
 
-.bss
-/* Stack and heap for uncompression */
-.balign 4
+/*
+ * Stack and heap for uncompression
+ */
+       .bss
+       .balign 4
 boot_heap:
        .fill BOOT_HEAP_SIZE, 1, 0
 boot_stack:
        .fill BOOT_STACK_SIZE, 1, 0
 boot_stack_end:
+
+/*
+ * Space for page tables (not in .bss so not zeroed)
+ */
+       .section ".pgtable","a",@nobits
+       .balign 4096
+pgtable:
+       .fill 6*4096, 1, 0
index e45be73684ffe3bd5559029eb86cf66ec00ff250..842b2a36174a2a2b6347851d7f1069878fb5cb4b 100644 (file)
@@ -325,20 +325,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
        free_mem_ptr     = heap;        /* Heap */
        free_mem_end_ptr = heap + BOOT_HEAP_SIZE;
 
+       if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1))
+               error("Destination address inappropriately aligned");
 #ifdef CONFIG_X86_64
-       if ((unsigned long)output & (__KERNEL_ALIGN - 1))
-               error("Destination address not 2M aligned");
-       if ((unsigned long)output >= 0xffffffffffUL)
+       if (heap > 0x3fffffffffffUL)
                error("Destination address too large");
 #else
-       if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1))
-               error("Destination address not CONFIG_PHYSICAL_ALIGN aligned");
        if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff))
                error("Destination address too large");
+#endif
 #ifndef CONFIG_RELOCATABLE
-       if ((u32)output != LOAD_PHYSICAL_ADDR)
+       if ((unsigned long)output != LOAD_PHYSICAL_ADDR)
                error("Wrong destination address");
-#endif
 #endif
 
        if (!quiet)
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
new file mode 100644 (file)
index 0000000..bcbd36c
--- /dev/null
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------- *
+ *
+ *  Copyright (C) 2009 Intel Corporation. All rights reserved.
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License version
+ *  2 as published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
+ *  02110-1301, USA.
+ *
+ *  H. Peter Anvin <hpa@linux.intel.com>
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Compute the desired load offset from a compressed program; outputs
+ * a small assembly wrapper with the appropriate symbols defined.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <inttypes.h>
+
+static uint32_t getle32(const void *p)
+{
+       const uint8_t *cp = p;  /* bytewise access: host-endian independent */
+
+       return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) +       /* LSB first */
+               ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24);
+}
+
+/*
+ * Print to stdout an assembly wrapper for the compressed file argv[1].
+ * The output length is taken from the little-endian word in the file's
+ * last 4 bytes.  Returns 0 on success, 1 on a usage or I/O error.
+ */
+int main(int argc, char *argv[])
+{
+       uint32_t olen;
+       long ilen;
+       unsigned long offs;
+       FILE *f;
+
+       if (argc < 2) {
+               fprintf(stderr, "Usage: %s compressed_file\n", argv[0]);
+               return 1;
+       }
+
+       /* Get the information for the compressed kernel image first */
+       f = fopen(argv[1], "rb");
+       if (!f) {
+               perror(argv[1]);
+               return 1;
+       }
+
+       /* Reading the 4-byte trailer leaves us at EOF: ftell() is the size */
+       if (fseek(f, -4L, SEEK_END) || fread(&olen, sizeof olen, 1, f) != 1 ||
+           (ilen = ftell(f)) < 0) {
+               fprintf(stderr, "%s: cannot read output length\n", argv[1]);
+               return 1;
+       }
+       fclose(f);
+       olen = getle32(&olen);
+
+       /* Both sizes are known: compute the necessary decompression offset */
+       offs = (olen > ilen) ? olen - ilen : 0;
+       offs += olen >> 12;     /* Add 8 bytes for each 32K block */
+       offs += 32*1024 + 18;   /* Add 32K + 18 bytes slack */
+       offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
+
+       printf(".section \".rodata.compressed\",\"a\",@progbits\n");
+       printf(".globl z_input_len\n");
+       printf("z_input_len = %ld\n", ilen);
+       printf(".globl z_output_len\n");
+       printf("z_output_len = %lu\n", (unsigned long)olen);
+       printf(".globl z_extract_offset\n");
+       printf("z_extract_offset = 0x%lx\n", offs);
+       /* z_extract_offset_negative allows simplification of head_32.S */
+       printf(".globl z_extract_offset_negative\n");
+       printf("z_extract_offset_negative = -0x%lx\n", offs);
+
+       printf(".globl input_data, input_data_end\n");
+       printf("input_data:\n");
+       printf(".incbin \"%s\"\n", argv[1]);
+       printf("input_data_end:\n");
+
+       return 0;
+}
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
new file mode 100644 (file)
index 0000000..cc353e1
--- /dev/null
@@ -0,0 +1,65 @@
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#undef i386
+
+#include <asm/page_types.h>
+
+#ifdef CONFIG_X86_64
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(startup_64)
+#else
+OUTPUT_ARCH(i386)
+ENTRY(startup_32)
+#endif
+
+SECTIONS
+{
+       /* Be careful: parts of head_32.S and head_64.S assume startup_32
+        * is at address 0.
+        */
+       . = 0;
+       .text.head : {
+               _head = . ;
+               *(.text.head)
+               _ehead = . ;
+       }
+       .rodata.compressed : {
+               *(.rodata.compressed)
+       }
+       .text : {
+               _text = .;      /* Text */
+               *(.text)
+               *(.text.*)
+               _etext = . ;
+       }
+       .rodata : {
+               _rodata = . ;
+               *(.rodata)       /* read-only data */
+               *(.rodata.*)
+               _erodata = . ;
+       }
+       .data : {
+               _data = . ;
+               *(.data)
+               *(.data.*)
+               _edata = . ;
+       }
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .bss : {
+               _bss = . ;
+               *(.bss)
+               *(.bss.*)
+               *(COMMON)
+               . = ALIGN(8);   /* For convenience during zeroing */
+               _ebss = .;
+       }
+#ifdef CONFIG_X86_64
+       . = ALIGN(PAGE_SIZE);
+       .pgtable : {
+               _pgtable = . ;
+               *(.pgtable)
+               _epgtable = . ;
+       }
+#endif
+       _end = .;
+}
diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr
deleted file mode 100644 (file)
index f02382a..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-SECTIONS
-{
-  .rodata.compressed : {
-       input_len = .;
-       LONG(input_data_end - input_data) input_data = .;
-       *(.data)
-       output_len = . - 4;
-       input_data_end = .;
-       }
-}
diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds
deleted file mode 100644 (file)
index bb3c483..0000000
+++ /dev/null
@@ -1,43 +0,0 @@
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(startup_32)
-SECTIONS
-{
-       /* Be careful parts of head_32.S assume startup_32 is at
-        * address 0.
-        */
-       . = 0;
-       .text.head : {
-               _head = . ;
-               *(.text.head)
-               _ehead = . ;
-       }
-       .rodata.compressed : {
-               *(.rodata.compressed)
-       }
-       .text : {
-               _text = .;      /* Text */
-               *(.text)
-               *(.text.*)
-               _etext = . ;
-       }
-       .rodata : {
-               _rodata = . ;
-               *(.rodata)       /* read-only data */
-               *(.rodata.*)
-               _erodata = . ;
-       }
-       .data : {
-               _data = . ;
-               *(.data)
-               *(.data.*)
-               _edata = . ;
-       }
-       .bss : {
-               _bss = . ;
-               *(.bss)
-               *(.bss.*)
-               *(COMMON)
-               _end = . ;
-       }
-}
diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux_64.lds
deleted file mode 100644 (file)
index bef1ac8..0000000
+++ /dev/null
@@ -1,48 +0,0 @@
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(startup_64)
-SECTIONS
-{
-       /* Be careful parts of head_64.S assume startup_32 is at
-        * address 0.
-        */
-       . = 0;
-       .text.head : {
-               _head = . ;
-               *(.text.head)
-               _ehead = . ;
-       }
-       .rodata.compressed : {
-               *(.rodata.compressed)
-       }
-       .text : {
-               _text = .;      /* Text */
-               *(.text)
-               *(.text.*)
-               _etext = . ;
-       }
-       .rodata : {
-               _rodata = . ;
-               *(.rodata)       /* read-only data */
-               *(.rodata.*)
-               _erodata = . ;
-       }
-       .data : {
-               _data = . ;
-               *(.data)
-               *(.data.*)
-               _edata = . ;
-       }
-       .bss : {
-               _bss = . ;
-               *(.bss)
-               *(.bss.*)
-               *(COMMON)
-               . = ALIGN(8);
-               _end_before_pgt = . ;
-               . = ALIGN(4096);
-               pgtable = . ;
-               . = . + 4096 * 6;
-               _ebss = .;
-       }
-}
index 1aae8f3e5ca1912b6eb60ad75e3c120af1110bed..c501a5b466f8495c26d3eff4de1cddb97613fca4 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
  */
 static int read_mbr(u8 devno, void *buf)
 {
-       u16 ax, bx, cx, dx;
+       struct biosregs ireg, oreg;
 
-       ax = 0x0201;            /* Legacy Read, one sector */
-       cx = 0x0001;            /* Sector 0-0-1 */
-       dx = devno;
-       bx = (size_t)buf;
-       asm volatile("pushfl; stc; int $0x13; setc %%al; popfl"
-                    : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx)
-                    : : "esi", "edi", "memory");
+       initregs(&ireg);
+       ireg.ax = 0x0201;               /* Legacy Read, one sector */
+       ireg.cx = 0x0001;               /* Sector 0-0-1 */
+       ireg.dl = devno;
+       ireg.bx = (size_t)buf;
 
-       return -(u8)ax;         /* 0 or -1 */
+       intcall(0x13, &ireg, &oreg);
+
+       return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
@@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig)
 
 static int get_edd_info(u8 devno, struct edd_info *ei)
 {
-       u16 ax, bx, cx, dx, di;
+       struct biosregs ireg, oreg;
 
        memset(ei, 0, sizeof *ei);
 
        /* Check Extensions Present */
 
-       ax = 0x4100;
-       bx = EDDMAGIC1;
-       dx = devno;
-       asm("pushfl; stc; int $0x13; setc %%al; popfl"
-           : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx)
-           : : "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x41;
+       ireg.bx = EDDMAGIC1;
+       ireg.dl = devno;
+       intcall(0x13, &ireg, &oreg);
 
-       if ((u8)ax)
+       if (oreg.eflags & X86_EFLAGS_CF)
                return -1;      /* No extended information */
 
-       if (bx != EDDMAGIC2)
+       if (oreg.bx != EDDMAGIC2)
                return -1;
 
        ei->device  = devno;
-       ei->version = ax >> 8;  /* EDD version number */
-       ei->interface_support = cx; /* EDD functionality subsets */
+       ei->version = oreg.ah;           /* EDD version number */
+       ei->interface_support = oreg.cx; /* EDD functionality subsets */
 
        /* Extended Get Device Parameters */
 
        ei->params.length = sizeof(ei->params);
-       ax = 0x4800;
-       dx = devno;
-       asm("pushfl; int $0x13; popfl"
-           : "+a" (ax), "+d" (dx), "=m" (ei->params)
-           : "S" (&ei->params)
-           : "ebx", "ecx", "edi");
+       ireg.ah = 0x48;
+       ireg.si = (size_t)&ei->params;
+       intcall(0x13, &ireg, &oreg);
 
        /* Get legacy CHS parameters */
 
        /* Ralf Brown recommends setting ES:DI to 0:0 */
-       ax = 0x0800;
-       dx = devno;
-       di = 0;
-       asm("pushw %%es; "
-           "movw %%di,%%es; "
-           "pushfl; stc; int $0x13; setc %%al; popfl; "
-           "popw %%es"
-           : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di)
-           : : "esi");
-
-       if ((u8)ax == 0) {
-               ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2);
-               ei->legacy_max_head = dx >> 8;
-               ei->legacy_sectors_per_track = cx & 0x3f;
+       ireg.ah = 0x08;
+       ireg.es = 0;
+       intcall(0x13, &ireg, &oreg);
+
+       if (!(oreg.eflags & X86_EFLAGS_CF)) {
+               ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2);
+               ei->legacy_max_head = oreg.dh;
+               ei->legacy_sectors_per_track = oreg.cl & 0x3f;
        }
 
        return 0;
index 5d84d1c74e4c6d2a84666c7b7125641b00a1dcad..b31cc54b46410f89b4120702b1d1b018a79e204d 100644 (file)
@@ -22,7 +22,8 @@
 #include <asm/page_types.h>
 #include <asm/setup.h>
 #include "boot.h"
-#include "offsets.h"
+#include "voffset.h"
+#include "zoffset.h"
 
 BOOTSEG                = 0x07C0                /* original address of boot-sector */
 SYSSEG         = 0x1000                /* historical load address >> 4 */
@@ -115,7 +116,7 @@ _start:
        # Part 2 of the header, from the old setup.S
 
                .ascii  "HdrS"          # header signature
-               .word   0x0209          # header version number (>= 0x0105)
+               .word   0x020a          # header version number (>= 0x0105)
                                        # or else old loadlin-1.5 will fail)
                .globl realmode_swtch
 realmode_swtch:        .word   0, 0            # default_switch, SETUPSEG
@@ -168,7 +169,11 @@ heap_end_ptr:      .word   _end+STACK_SIZE-512
                                        # end of setup code can be used by setup
                                        # for local heap purposes.
 
-pad1:          .word   0
+ext_loader_ver:
+               .byte   0               # Extended boot loader version
+ext_loader_type:
+               .byte   0               # Extended boot loader type
+
 cmd_line_ptr:  .long   0               # (Header version 0x0202 or later)
                                        # If nonzero, a 32-bit pointer
                                        # to the kernel command line.
@@ -200,7 +205,7 @@ relocatable_kernel:    .byte 1
 #else
 relocatable_kernel:    .byte 0
 #endif
-pad2:                  .byte 0
+min_alignment:         .byte MIN_KERNEL_ALIGN_LG2      # minimum alignment
 pad3:                  .word 0
 
 cmdline_size:   .long   COMMAND_LINE_SIZE-1     #length of the command line,
@@ -212,16 +217,27 @@ hardware_subarch: .long 0                 # subarchitecture, added with 2.07
 
 hardware_subarch_data: .quad 0
 
-payload_offset:                .long input_data
-payload_length:                .long input_data_end-input_data
+payload_offset:                .long ZO_input_data
+payload_length:                .long ZO_z_input_len
 
 setup_data:            .quad 0                 # 64-bit physical pointer to
                                                # single linked list of
                                                # struct setup_data
 
+pref_address:          .quad LOAD_PHYSICAL_ADDR        # preferred load addr
+
+#define ZO_INIT_SIZE   (ZO__end - ZO_startup_32 + ZO_z_extract_offset)
+#define VO_INIT_SIZE   (VO__end - VO__text)
+#if ZO_INIT_SIZE > VO_INIT_SIZE
+#define INIT_SIZE ZO_INIT_SIZE
+#else
+#define INIT_SIZE VO_INIT_SIZE
+#endif
+init_size:             .long INIT_SIZE         # kernel initialization size
+
 # End of setup header #####################################################
 
-       .section ".inittext", "ax"
+       .section ".entrytext", "ax"
 start_of_setup:
 #ifdef SAFE_RESET_DISK_CONTROLLER
 # Reset the disk controller.
index 58f0415d3ae09261d55c11d12855e3bc75fc4bdf..140172b895bd32c32ca2819ef776718f77df3103 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -61,11 +62,10 @@ static void copy_boot_params(void)
  */
 static void keyboard_set_repeat(void)
 {
-       u16 ax = 0x0305;
-       u16 bx = 0;
-       asm volatile("int $0x16"
-                    : "+a" (ax), "+b" (bx)
-                    : : "ecx", "edx", "esi", "edi");
+       struct biosregs ireg;
+       initregs(&ireg);
+       ireg.ax = 0x0305;
+       intcall(0x16, &ireg, NULL);
 }
 
 /*
@@ -73,18 +73,22 @@ static void keyboard_set_repeat(void)
  */
 static void query_ist(void)
 {
+       struct biosregs ireg, oreg;
+
        /* Some older BIOSes apparently crash on this call, so filter
           it from machines too old to have SpeedStep at all. */
        if (cpu.level < 6)
                return;
 
-       asm("int $0x15"
-           : "=a" (boot_params.ist_info.signature),
-             "=b" (boot_params.ist_info.command),
-             "=c" (boot_params.ist_info.event),
-             "=d" (boot_params.ist_info.perf_level)
-           : "a" (0x0000e980),  /* IST Support */
-             "d" (0x47534943)); /* Request value */
+       initregs(&ireg);
+       ireg.ax  = 0xe980;       /* IST Support */
+       ireg.edx = 0x47534943;   /* Request value */
+       intcall(0x15, &ireg, &oreg);
+
+       boot_params.ist_info.signature  = oreg.eax;
+       boot_params.ist_info.command    = oreg.ebx;
+       boot_params.ist_info.event      = oreg.ecx;
+       boot_params.ist_info.perf_level = oreg.edx;
 }
 
 /*
@@ -93,13 +97,12 @@ static void query_ist(void)
 static void set_bios_mode(void)
 {
 #ifdef CONFIG_X86_64
-       u32 eax, ebx;
+       struct biosregs ireg;
 
-       eax = 0xec00;
-       ebx = 2;
-       asm volatile("int $0x15"
-                    : "+a" (eax), "+b" (ebx)
-                    : : "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ax = 0xec00;
+       ireg.bx = 2;
+       intcall(0x15, &ireg, NULL);
 #endif
 }
 
index 911eaae5d696427c98fdad7f344a14bdc0340efc..a95a531148ef9562ca408a9f451914748159b71e 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
 
 int query_mca(void)
 {
-       u8 err;
-       u16 es, bx, len;
-
-       asm("pushw %%es ; "
-           "int $0x15 ; "
-           "setc %0 ; "
-           "movw %%es, %1 ; "
-           "popw %%es"
-           : "=acd" (err), "=acdSD" (es), "=b" (bx)
-           : "a" (0xc000));
-
-       if (err)
+       struct biosregs ireg, oreg;
+       u16 len;
+
+       initregs(&ireg);
+       ireg.ah = 0xc0;
+       intcall(0x15, &ireg, &oreg);
+
+       if (oreg.eflags & X86_EFLAGS_CF)
                return -1;      /* No MCA present */
 
-       set_fs(es);
-       len = rdfs16(bx);
+       set_fs(oreg.es);
+       len = rdfs16(oreg.bx);
 
        if (len > sizeof(boot_params.sys_desc_table))
                len = sizeof(boot_params.sys_desc_table);
 
-       copy_from_fs(&boot_params.sys_desc_table, bx, len);
+       copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len);
        return 0;
 }
index 74b3d2ba84e90a39bb29f166278ce47bcf56437c..cae3feb1035e3da73826b518de4d47c15ad4e58d 100644 (file)
 static int detect_memory_e820(void)
 {
        int count = 0;
-       u32 next = 0;
-       u32 size, id, edi;
-       u8 err;
+       struct biosregs ireg, oreg;
        struct e820entry *desc = boot_params.e820_map;
        static struct e820entry buf; /* static so it is zeroed */
 
+       initregs(&ireg);
+       ireg.ax  = 0xe820;
+       ireg.cx  = sizeof buf;
+       ireg.edx = SMAP;
+       ireg.di  = (size_t)&buf;
+
        /*
         * Note: at least one BIOS is known which assumes that the
         * buffer pointed to by one e820 call is the same one as
@@ -41,22 +45,13 @@ static int detect_memory_e820(void)
         */
 
        do {
-               size = sizeof buf;
-
-               /* Important: %edx and %esi are clobbered by some BIOSes,
-                  so they must be either used for the error output
-                  or explicitly marked clobbered.  Given that, assume there
-                  is something out there clobbering %ebp and %edi, too. */
-               asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
-                   : "=d" (err), "+b" (next), "=a" (id), "+c" (size),
-                     "=D" (edi), "+m" (buf)
-                   : "D" (&buf), "d" (SMAP), "a" (0xe820)
-                   : "esi");
+               intcall(0x15, &ireg, &oreg);
+               ireg.ebx = oreg.ebx; /* for next iteration... */
 
                /* BIOSes which terminate the chain with CF = 1 as opposed
                   to %ebx = 0 don't always report the SMAP signature on
                   the final, failing, probe. */
-               if (err)
+               if (oreg.eflags & X86_EFLAGS_CF)
                        break;
 
                /* Some BIOSes stop returning SMAP in the middle of
@@ -64,60 +59,64 @@ static int detect_memory_e820(void)
                   screwed up the map at that point, we might have a
                   partial map, the full map, or complete garbage, so
                   just return failure. */
-               if (id != SMAP) {
+               if (oreg.eax != SMAP) {
                        count = 0;
                        break;
                }
 
                *desc++ = buf;
                count++;
-       } while (next && count < ARRAY_SIZE(boot_params.e820_map));
+       } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map));
 
        return boot_params.e820_entries = count;
 }
 
 static int detect_memory_e801(void)
 {
-       u16 ax, bx, cx, dx;
-       u8 err;
+       struct biosregs ireg, oreg;
 
-       bx = cx = dx = 0;
-       ax = 0xe801;
-       asm("stc; int $0x15; setc %0"
-           : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx));
+       initregs(&ireg);
+       ireg.ax = 0xe801;
+       intcall(0x15, &ireg, &oreg);
 
-       if (err)
+       if (oreg.eflags & X86_EFLAGS_CF)
                return -1;
 
        /* Do we really need to do this? */
-       if (cx || dx) {
-               ax = cx;
-               bx = dx;
+       if (oreg.cx || oreg.dx) {
+               oreg.ax = oreg.cx;
+               oreg.bx = oreg.dx;
        }
 
-       if (ax > 15*1024)
+       if (oreg.ax > 15*1024) {
                return -1;      /* Bogus! */
-
-       /* This ignores memory above 16MB if we have a memory hole
-          there.  If someone actually finds a machine with a memory
-          hole at 16MB and no support for 0E820h they should probably
-          generate a fake e820 map. */
-       boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax;
+       } else if (oreg.ax == 15*1024) {
+               boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax;
+       } else {
+               /*
+                * This ignores memory above 16MB if we have a memory
+                * hole there.  If someone actually finds a machine
+                * with a memory hole at 16MB and no support for
+                * 0E820h they should probably generate a fake e820
+                * map.
+                */
+               boot_params.alt_mem_k = oreg.ax;
+       }
 
        return 0;
 }
 
 static int detect_memory_88(void)
 {
-       u16 ax;
-       u8 err;
+       struct biosregs ireg, oreg;
 
-       ax = 0x8800;
-       asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax));
+       initregs(&ireg);
+       ireg.ah = 0x88;
+       intcall(0x15, &ireg, &oreg);
 
-       boot_params.screen_info.ext_mem_k = ax;
+       boot_params.screen_info.ext_mem_k = oreg.ax;
 
-       return -err;
+       return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */
 }
 
 int detect_memory(void)
diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c
new file mode 100644 (file)
index 0000000..958019b
--- /dev/null
@@ -0,0 +1,29 @@
+/* -----------------------------------------------------------------------
+ *
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
+ *
+ *   This file is part of the Linux kernel, and is made available under
+ *   the terms of the GNU General Public License version 2 or (at your
+ *   option) any later version; incorporated herein by reference.
+ *
+ * ----------------------------------------------------------------------- */
+
+/*
+ * Simple helper function for initializing a register set.
+ *
+ * Note that this sets EFLAGS_CF in the input register set; this
+ * makes it easier to catch functions which do nothing but don't
+ * explicitly set CF.
+ */
+
+#include "boot.h"
+
+void initregs(struct biosregs *reg)
+{
+       memset(reg, 0, sizeof *reg);
+       reg->eflags |= X86_EFLAGS_CF;
+       reg->ds = ds();
+       reg->es = ds();
+       reg->fs = fs();
+       reg->gs = gs();
+}
index bb8dc2de796936c6a21d0b1bfbd934ca8a3e5877..0f6ec455a2b13a58151861b2e8d307776d866742 100644 (file)
@@ -15,8 +15,11 @@ SECTIONS
 
        . = 497;
        .header         : { *(.header) }
+       .entrytext      : { *(.entrytext) }
        .inittext       : { *(.inittext) }
        .initdata       : { *(.initdata) }
+       __end_init = .;
+
        .text           : { *(.text) }
        .text32         : { *(.text32) }
 
@@ -52,4 +55,7 @@ SECTIONS
 
        . = ASSERT(_end <= 0x8000, "Setup too big!");
        . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!");
+       /* Necessary for the very-old-loader check to work... */
+       . = ASSERT(__end_init <= 5*512, "init sections too big!");
+
 }
index 7e8e8b25f5f6c89defd0df4064a61e3e16a4ea06..01ec69c901c7a595e88748f24880a5104af553fe 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
 
 void __attribute__((section(".inittext"))) putchar(int ch)
 {
-       unsigned char c = ch;
+       struct biosregs ireg;
 
-       if (c == '\n')
+       if (ch == '\n')
                putchar('\r');  /* \n -> \r\n */
 
-       /* int $0x10 is known to have bugs involving touching registers
-          it shouldn't.  Be extra conservative... */
-       asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal"
-                    : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch));
+       initregs(&ireg);
+       ireg.bx = 0x0007;
+       ireg.cx = 0x0001;
+       ireg.ah = 0x0e;
+       ireg.al = ch;
+       intcall(0x10, &ireg, NULL);
 }
 
 void __attribute__((section(".inittext"))) puts(const char *str)
 {
-       int n = 0;
-       while (*str) {
+       while (*str)
                putchar(*str++);
-               n++;
-       }
 }
 
 /*
@@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str)
 
 static u8 gettime(void)
 {
-       u16 ax = 0x0200;
-       u16 cx, dx;
+       struct biosregs ireg, oreg;
 
-       asm volatile("int $0x1a"
-                    : "+a" (ax), "=c" (cx), "=d" (dx)
-                    : : "ebx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x02;
+       intcall(0x1a, &ireg, &oreg);
 
-       return dx >> 8;
+       return oreg.dh;
 }
 
 /*
@@ -64,19 +63,24 @@ static u8 gettime(void)
  */
 int getchar(void)
 {
-       u16 ax = 0;
-       asm volatile("int $0x16" : "+a" (ax));
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+       /* ireg.ah = 0x00; */
+       intcall(0x16, &ireg, &oreg);
 
-       return ax & 0xff;
+       return oreg.al;
 }
 
 static int kbd_pending(void)
 {
-       u8 pending;
-       asm volatile("int $0x16; setnz %0"
-                    : "=qm" (pending)
-                    : "a" (0x0100));
-       return pending;
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+       ireg.ah = 0x01;
+       intcall(0x16, &ireg, &oreg);
+
+       return !(oreg.eflags & X86_EFLAGS_ZF);
 }
 
 void kbd_flush(void)
index 3fa979c9c363a5dbe6f9ce6d754636ed301d01b1..d660be4923634ebfc0160b468a569289c02bce90 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi)
 
 static int set_bios_mode(u8 mode)
 {
-       u16 ax;
+       struct biosregs ireg, oreg;
        u8 new_mode;
 
-       ax = mode;              /* AH=0x00 Set Video Mode */
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.al = mode;         /* AH=0x00 Set Video Mode */
+       intcall(0x10, &ireg, NULL);
 
-       ax = 0x0f00;            /* Get Current Video Mode */
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+
+       ireg.ah = 0x0f;         /* Get Current Video Mode */
+       intcall(0x10, &ireg, &oreg);
 
        do_restore = 1;         /* Assume video contents were lost */
-       new_mode = ax & 0x7f;   /* Not all BIOSes are clean with the top bit */
+
+       /* Not all BIOSes are clean with the top bit */
+       new_mode = oreg.al & 0x7f;
 
        if (new_mode == mode)
                return 0;       /* Mode change OK */
@@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode)
                /* Mode setting failed, but we didn't end up where we
                   started.  That's bad.  Try to revert to the original
                   video mode. */
-               ax = boot_params.screen_info.orig_video_mode;
-               asm volatile(INT10
-                            : "+a" (ax)
-                            : : "ebx", "ecx", "edx", "esi", "edi");
+               ireg.ax = boot_params.screen_info.orig_video_mode;
+               intcall(0x10, &ireg, NULL);
        }
 #endif
        return -1;
index 4a58c8ce3f6960534931409905103aa7465aa9d6..c700147d6ffb24b72fbbf43ff954d4cfd268f137 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {}
 static int vesa_probe(void)
 {
 #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID)
-       u16 ax, cx, di;
+       struct biosregs ireg, oreg;
        u16 mode;
        addr_t mode_ptr;
        struct mode_info *mi;
@@ -39,13 +40,12 @@ static int vesa_probe(void)
 
        video_vesa.modes = GET_HEAP(struct mode_info, 0);
 
-       ax = 0x4f00;
-       di = (size_t)&vginfo;
-       asm(INT10
-           : "+a" (ax), "+D" (di), "=m" (vginfo)
-           : : "ebx", "ecx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f00;
+       ireg.di = (size_t)&vginfo;
+       intcall(0x10, &ireg, &oreg);
 
-       if (ax != 0x004f ||
+       if (oreg.ax != 0x004f ||
            vginfo.signature != VESA_MAGIC ||
            vginfo.version < 0x0102)
                return 0;       /* Not present */
@@ -65,14 +65,12 @@ static int vesa_probe(void)
 
                memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-               ax = 0x4f01;
-               cx = mode;
-               di = (size_t)&vminfo;
-               asm(INT10
-                   : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-                   : : "ebx", "edx", "esi");
+               ireg.ax = 0x4f01;
+               ireg.cx = mode;
+               ireg.di = (size_t)&vminfo;
+               intcall(0x10, &ireg, &oreg);
 
-               if (ax != 0x004f)
+               if (oreg.ax != 0x004f)
                        continue;
 
                if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -111,20 +109,19 @@ static int vesa_probe(void)
 
 static int vesa_set_mode(struct mode_info *mode)
 {
-       u16 ax, bx, cx, di;
+       struct biosregs ireg, oreg;
        int is_graphic;
        u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA;
 
        memset(&vminfo, 0, sizeof vminfo); /* Just in case... */
 
-       ax = 0x4f01;
-       cx = vesa_mode;
-       di = (size_t)&vminfo;
-       asm(INT10
-           : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo)
-           : : "ebx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f01;
+       ireg.cx = vesa_mode;
+       ireg.di = (size_t)&vminfo;
+       intcall(0x10, &ireg, &oreg);
 
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                return -1;
 
        if ((vminfo.mode_attr & 0x15) == 0x05) {
@@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode)
        }
 
 
-       ax = 0x4f02;
-       bx = vesa_mode;
-       di = 0;
-       asm volatile(INT10
-                    : "+a" (ax), "+b" (bx), "+D" (di)
-                    : : "ecx", "edx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f02;
+       ireg.bx = vesa_mode;
+       intcall(0x10, &ireg, &oreg);
 
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                return -1;
 
        graphic_mode = is_graphic;
@@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode)
 /* Switch DAC to 8-bit mode */
 static void vesa_dac_set_8bits(void)
 {
+       struct biosregs ireg, oreg;
        u8 dac_size = 6;
 
        /* If possible, switch the DAC to 8-bit mode */
        if (vginfo.capabilities & 1) {
-               u16 ax, bx;
-
-               ax = 0x4f08;
-               bx = 0x0800;
-               asm volatile(INT10
-                            : "+a" (ax), "+b" (bx)
-                            : : "ecx", "edx", "esi", "edi");
-
-               if (ax == 0x004f)
-                       dac_size = bx >> 8;
+               initregs(&ireg);
+               ireg.ax = 0x4f08;
+               ireg.bh = 0x08;
+               intcall(0x10, &ireg, &oreg);
+               if (oreg.ax == 0x004f)
+                       dac_size = oreg.bh;
        }
 
        /* Set the color sizes to the DAC size, and offsets to 0 */
-       boot_params.screen_info.red_size = dac_size;
+       boot_params.screen_info.red_size   = dac_size;
        boot_params.screen_info.green_size = dac_size;
-       boot_params.screen_info.blue_size = dac_size;
-       boot_params.screen_info.rsvd_size = dac_size;
+       boot_params.screen_info.blue_size  = dac_size;
+       boot_params.screen_info.rsvd_size  = dac_size;
 
-       boot_params.screen_info.red_pos = 0;
-       boot_params.screen_info.green_pos = 0;
-       boot_params.screen_info.blue_pos = 0;
-       boot_params.screen_info.rsvd_pos = 0;
+       boot_params.screen_info.red_pos    = 0;
+       boot_params.screen_info.green_pos  = 0;
+       boot_params.screen_info.blue_pos   = 0;
+       boot_params.screen_info.rsvd_pos   = 0;
 }
 
 /* Save the VESA protected mode info */
 static void vesa_store_pm_info(void)
 {
-       u16 ax, bx, di, es;
+       struct biosregs ireg, oreg;
 
-       ax = 0x4f0a;
-       bx = di = 0;
-       asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es"
-           : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di)
-           : : "ecx", "esi");
+       initregs(&ireg);
+       ireg.ax = 0x4f0a;
+       intcall(0x10, &ireg, &oreg);
 
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                return;
 
-       boot_params.screen_info.vesapm_seg = es;
-       boot_params.screen_info.vesapm_off = di;
+       boot_params.screen_info.vesapm_seg = oreg.es;
+       boot_params.screen_info.vesapm_off = oreg.di;
 }
 
 /*
@@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void)
 void vesa_store_edid(void)
 {
 #ifdef CONFIG_FIRMWARE_EDID
-       u16 ax, bx, cx, dx, di;
+       struct biosregs ireg, oreg;
 
        /* Apparently used as a nonsense token... */
        memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info);
@@ -260,33 +250,26 @@ void vesa_store_edid(void)
        if (vginfo.version < 0x0200)
                return;         /* EDID requires VBE 2.0+ */
 
-       ax = 0x4f15;            /* VBE DDC */
-       bx = 0x0000;            /* Report DDC capabilities */
-       cx = 0;                 /* Controller 0 */
-       di = 0;                 /* ES:DI must be 0 by spec */
-
-       /* Note: The VBE DDC spec is different from the main VESA spec;
-          we genuinely have to assume all registers are destroyed here. */
-
-       asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es"
-           : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di)
-           : : "esi", "edx");
+       initregs(&ireg);
+       ireg.ax = 0x4f15;               /* VBE DDC */
+       /* ireg.bx = 0x0000; */         /* Report DDC capabilities */
+       /* ireg.cx = 0; */              /* Controller 0 */
+       ireg.es = 0;                    /* ES:DI must be 0 by spec */
+       intcall(0x10, &ireg, &oreg);
 
-       if (ax != 0x004f)
+       if (oreg.ax != 0x004f)
                return;         /* No EDID */
 
        /* BH = time in seconds to transfer EDID information */
        /* BL = DDC level supported */
 
-       ax = 0x4f15;            /* VBE DDC */
-       bx = 0x0001;            /* Read EDID */
-       cx = 0;                 /* Controller 0 */
-       dx = 0;                 /* EDID block number */
-       di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */
-       asm(INT10
-           : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info),
-             "+c" (cx), "+D" (di)
-           : : "esi");
+       ireg.ax = 0x4f15;               /* VBE DDC */
+       ireg.bx = 0x0001;               /* Read EDID */
+       /* ireg.cx = 0; */              /* Controller 0 */
+       /* ireg.dx = 0; */              /* EDID block number */
+       ireg.es = ds();
+       ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */
+       intcall(0x10, &ireg, &oreg);
 #endif /* CONFIG_FIRMWARE_EDID */
 }
 
index 9e0587a3776868e73b62b7def725997b6ffb7699..8f8d827e254d0b8cb392bf7fe95103059ac5ca89 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
@@ -39,30 +40,30 @@ static __videocard video_vga;
 /* Set basic 80x25 mode */
 static u8 vga_set_basic_mode(void)
 {
+       struct biosregs ireg, oreg;
        u16 ax;
        u8 rows;
        u8 mode;
 
+       initregs(&ireg);
+
 #ifdef CONFIG_VIDEO_400_HACK
        if (adapter >= ADAPTER_VGA) {
-               asm volatile(INT10
-                            : : "a" (0x1202), "b" (0x0030)
-                            : "ecx", "edx", "esi", "edi");
+               ireg.ax = 0x1202;
+               ireg.bx = 0x0030;
+               intcall(0x10, &ireg, NULL);
        }
 #endif
 
-       ax = 0x0f00;
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
-
-       mode = (u8)ax;
+       ireg.ax = 0x0f00;       /* AH=0x0f: get current video mode */
+       intcall(0x10, &ireg, &oreg);
+       mode = oreg.al;
 
        set_fs(0);
        rows = rdfs8(0x484);    /* rows minus one */
 
 #ifndef CONFIG_VIDEO_400_HACK
-       if ((ax == 0x5003 || ax == 0x5007) &&
+       if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
            (rows == 0 || rows == 24))
                return mode;
 #endif
@@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void)
                mode = 3;
 
        /* Set the mode */
-       ax = mode;
-       asm volatile(INT10
-                    : "+a" (ax)
-                    : : "ebx", "ecx", "edx", "esi", "edi");
+       ireg.ax = mode;         /* AH=0: set mode */
+       intcall(0x10, &ireg, NULL);
        do_restore = 1;
        return mode;
 }
@@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void)
 static void vga_set_8font(void)
 {
        /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */
+       struct biosregs ireg;
+
+       initregs(&ireg);
 
        /* Set 8x8 font */
-       asm volatile(INT10 : : "a" (0x1112), "b" (0));
+       ireg.ax = 0x1112;
+       /* ireg.bl = 0; */
+       intcall(0x10, &ireg, NULL);
 
        /* Use alternate print screen */
-       asm volatile(INT10 : : "a" (0x1200), "b" (0x20));
+       ireg.ax = 0x1200;
+       ireg.bl = 0x20;
+       intcall(0x10, &ireg, NULL);
 
        /* Turn off cursor emulation */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x34;
+       intcall(0x10, &ireg, NULL);
 
        /* Cursor is scan lines 6-7 */
-       asm volatile(INT10 : : "a" (0x0100), "c" (0x0607));
+       ireg.ax = 0x0100;
+       ireg.cx = 0x0607;
+       intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_14font(void)
 {
        /* Set 9x14 font - 80x28 on VGA */
+       struct biosregs ireg;
+
+       initregs(&ireg);
 
        /* Set 9x14 font */
-       asm volatile(INT10 : : "a" (0x1111), "b" (0));
+       ireg.ax = 0x1111;
+       /* ireg.bl = 0; */
+       intcall(0x10, &ireg, NULL);
 
        /* Turn off cursor emulation */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x34));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x34;
+       intcall(0x10, &ireg, NULL);
 
        /* Cursor is scan lines 11-12 */
-       asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c));
+       ireg.ax = 0x0100;
+       ireg.cx = 0x0b0c;
+       intcall(0x10, &ireg, NULL);
 }
 
 static void vga_set_80x43(void)
 {
        /* Set 80x43 mode on VGA (not EGA) */
+       struct biosregs ireg;
+
+       initregs(&ireg);
 
        /* Set 350 scans */
-       asm volatile(INT10 : : "a" (0x1201), "b" (0x30));
+       ireg.ax = 0x1201;
+       ireg.bl = 0x30;
+       intcall(0x10, &ireg, NULL);
 
        /* Reset video mode */
-       asm volatile(INT10 : : "a" (0x0003));
+       ireg.ax = 0x0003;
+       intcall(0x10, &ireg, NULL);
 
        vga_set_8font();
 }
@@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode)
  */
 static int vga_probe(void)
 {
-       u16 ega_bx;
-
        static const char *card_name[] = {
                "CGA/MDA/HGC", "EGA", "VGA"
        };
@@ -240,26 +263,26 @@ static int vga_probe(void)
                sizeof(ega_modes)/sizeof(struct mode_info),
                sizeof(vga_modes)/sizeof(struct mode_info),
        };
-       u8 vga_flag;
 
-       asm(INT10
-           : "=b" (ega_bx)
-           : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */
-           : "ecx", "edx", "esi", "edi");
+       struct biosregs ireg, oreg;
+
+       initregs(&ireg);
+
+       ireg.ax = 0x1200;
+       ireg.bl = 0x10;         /* Check EGA/VGA */
+       intcall(0x10, &ireg, &oreg);
 
 #ifndef _WAKEUP
-       boot_params.screen_info.orig_video_ega_bx = ega_bx;
+       boot_params.screen_info.orig_video_ega_bx = oreg.bx;
 #endif
 
        /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */
-       if ((u8)ega_bx != 0x10) {
+       if (oreg.bl != 0x10) {
                /* EGA/VGA */
-               asm(INT10
-                   : "=a" (vga_flag)
-                   : "a" (0x1a00)
-                   : "ebx", "ecx", "edx", "esi", "edi");
+               ireg.ax = 0x1a00;
+               intcall(0x10, &ireg, &oreg);
 
-               if (vga_flag == 0x1a) {
+               if (oreg.al == 0x1a) {
                        adapter = ADAPTER_VGA;
 #ifndef _WAKEUP
                        boot_params.screen_info.orig_video_isVGA = 1;
index 3bef2c1febe936bd01f1b8d7d6b7d1eb5b1f9bb6..bad728b76fc2acff40b1e5052ae20b0e08ea5dc1 100644 (file)
@@ -2,6 +2,7 @@
  *
  *   Copyright (C) 1991, 1992 Linus Torvalds
  *   Copyright 2007 rPath, Inc. - All Rights Reserved
+ *   Copyright 2009 Intel Corporation; author H. Peter Anvin
  *
  *   This file is part of the Linux kernel, and is made available under
  *   the terms of the GNU General Public License version 2.
 
 static void store_cursor_position(void)
 {
-       u16 curpos;
-       u16 ax, bx;
+       struct biosregs ireg, oreg;
 
-       ax = 0x0300;
-       bx = 0;
-       asm(INT10
-           : "=d" (curpos), "+a" (ax), "+b" (bx)
-           : : "ecx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x03;
+       intcall(0x10, &ireg, &oreg);
 
-       boot_params.screen_info.orig_x = curpos;
-       boot_params.screen_info.orig_y = curpos >> 8;
+       boot_params.screen_info.orig_x = oreg.dl;
+       boot_params.screen_info.orig_y = oreg.dh;
 }
 
 static void store_video_mode(void)
 {
-       u16 ax, page;
+       struct biosregs ireg, oreg;
 
        /* N.B.: the saving of the video page here is a bit silly,
           since we pretty much assume page 0 everywhere. */
-       ax = 0x0f00;
-       asm(INT10
-           : "+a" (ax), "=b" (page)
-           : : "ecx", "edx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x0f;
+       intcall(0x10, &ireg, &oreg);
 
        /* Not all BIOSes are clean with respect to the top bit */
-       boot_params.screen_info.orig_video_mode = ax & 0x7f;
-       boot_params.screen_info.orig_video_page = page >> 8;
+       boot_params.screen_info.orig_video_mode = oreg.al & 0x7f;
+       boot_params.screen_info.orig_video_page = oreg.bh;
 }
 
 /*
@@ -257,7 +254,7 @@ static void restore_screen(void)
        int y;
        addr_t dst = 0;
        u16 *src = saved.data;
-       u16 ax, bx, dx;
+       struct biosregs ireg;
 
        if (graphic_mode)
                return;         /* Can't restore onto a graphic mode */
@@ -296,12 +293,11 @@ static void restore_screen(void)
        }
 
        /* Restore cursor position */
-       ax = 0x0200;            /* Set cursor position */
-       bx = 0;                 /* Page number (<< 8) */
-       dx = (saved.cury << 8)+saved.curx;
-       asm volatile(INT10
-                    : "+a" (ax), "+b" (bx), "+d" (dx)
-                    : : "ecx", "esi", "edi");
+       initregs(&ireg);
+       ireg.ah = 0x02;         /* Set cursor position */
+       ireg.dh = saved.cury;
+       ireg.dl = saved.curx;
+       intcall(0x10, &ireg, NULL);
 }
 #else
 #define save_screen()          ((void)0)
index ee63f5d14461517ef96b89a6c644ac6c3a60a2f6..5bb174a997fc61a3609a0a41500192f4475e676a 100644 (file)
@@ -112,20 +112,6 @@ extern int force_x, force_y;       /* Don't query the BIOS for cols/rows */
 extern int do_restore;         /* Restore screen contents */
 extern int graphic_mode;       /* Graphics mode with linear frame buffer */
 
-/*
- * int $0x10 is notorious for touching registers it shouldn't.
- * gcc doesn't like %ebp being clobbered, so define it as a push/pop
- * sequence here.
- *
- * A number of systems, including the original PC can clobber %bp in
- * certain circumstances, like when scrolling.  There exists at least
- * one Trident video card which could clobber DS under a set of
- * circumstances that we are unlikely to encounter (scrolling when
- * using an extended graphics mode of more than 800x600 pixels), but
- * it's cheap insurance to deal with that here.
- */
-#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp"
-
 /* Accessing VGA indexed registers */
 static inline u8 in_idx(u16 port, u8 index)
 {
index 235b81d0f6f2b1083c06cc46210df33141eb2b4b..edb992ebef92e2d95a8ce5d52d6f4ff33de91529 100644 (file)
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:50:58 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:21:55 2009
 #
 # CONFIG_64BIT is not set
 CONFIG_X86_32=y
 # CONFIG_X86_64 is not set
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf32-i386"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 # CONFIG_AUDIT_ARCH is not set
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_32_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
+CONFIG_X86_32_LAZY_GS=y
 CONFIG_KTIME_SCALAR=y
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
 
@@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -113,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -139,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 CONFIG_HAVE_GENERIC_DMA_COHERENT=y
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
 # CONFIG_LBD is not set
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 
@@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
+# CONFIG_X86_BIGSMP is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
-# CONFIG_X86_VSMP is not set
 # CONFIG_X86_RDC321X is not set
+# CONFIG_X86_32_NON_STANDARD is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,8 +245,10 @@ CONFIG_M686=y
 # CONFIG_GENERIC_CPU is not set
 CONFIG_X86_GENERIC=y
 CONFIG_X86_CPU=y
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=5
 CONFIG_X86_XADD=y
 # CONFIG_X86_PPRO_FENCE is not set
 CONFIG_X86_WP_WORKS_OK=y
@@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_CYRIX_32=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_32=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_CPU_SUP_TRANSMETA_32=y
 CONFIG_CPU_SUP_UMC_32=y
 CONFIG_X86_DS=y
@@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 # CONFIG_NOHIGHMEM is not set
 CONFIG_HIGHMEM4G=y
 # CONFIG_HIGHMEM64G is not set
@@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_HIGHPTE=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
@@ -312,6 +332,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -322,8 +343,9 @@ CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
 # CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_X86_NEED_RELOCS=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
@@ -363,7 +385,6 @@ CONFIG_ACPI_THERMAL=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -425,6 +446,7 @@ CONFIG_PCI_BIOS=y
 CONFIG_PCI_DIRECT=y
 CONFIG_PCI_MMCONFIG=y
 CONFIG_PCI_DOMAINS=y
+# CONFIG_DMAR is not set
 CONFIG_PCIEPORTBUS=y
 # CONFIG_HOTPLUG_PCI_PCIE is not set
 CONFIG_PCIEAER=y
@@ -435,6 +457,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 # CONFIG_ISA is not set
 # CONFIG_MCA is not set
@@ -481,7 +504,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +661,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +719,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +730,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -789,6 +811,7 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
 # CONFIG_HP_ILO is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -842,6 +865,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +964,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +1002,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1053,7 @@ CONFIG_E1000=y
 CONFIG_E1000E=y
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1068,7 @@ CONFIG_BNX2=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1078,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1088,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_IBMLS is not set
@@ -1073,8 +1104,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1084,21 +1115,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1209,6 +1240,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1303,6 +1336,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 CONFIG_HW_RANDOM_INTEL=y
 CONFIG_HW_RANDOM_AMD=y
 CONFIG_HW_RANDOM_GEODE=y
@@ -1390,7 +1424,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1424,6 +1457,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1433,6 +1467,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1448,11 +1483,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1643,7 +1681,6 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_3DFX is not set
 # CONFIG_FB_VOODOO1 is not set
 # CONFIG_FB_VT8623 is not set
-# CONFIG_FB_CYBLA is not set
 # CONFIG_FB_TRIDENT is not set
 # CONFIG_FB_ARK is not set
 # CONFIG_FB_PM3 is not set
@@ -1652,6 +1689,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1738,6 +1776,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1811,15 +1851,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1885,11 +1927,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1931,7 +1973,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1947,6 +1988,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1958,8 +2000,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1969,6 +2013,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2037,6 +2085,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2071,6 +2120,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2100,6 +2150,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -2151,6 +2206,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2164,7 +2220,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2251,6 +2306,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2266,6 +2322,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 # CONFIG_DEBUG_HIGHMEM is not set
 CONFIG_DEBUG_BUGVERBOSE=y
@@ -2289,13 +2346,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2305,13 +2368,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2321,7 +2392,6 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
@@ -2329,7 +2399,7 @@ CONFIG_DEBUG_RODATA=y
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_4KSTACKS is not set
 CONFIG_DOUBLEFAULT=y
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2365,6 +2435,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2380,10 +2452,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2456,6 +2530,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2467,11 +2542,13 @@ CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_GEODE is not set
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_LGUEST is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2489,7 +2566,10 @@ CONFIG_CRC32=y
 # CONFIG_LIBCRC32C is not set
 CONFIG_AUDIT_GENERIC=y
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
index 9fe5d212ab4cc8291e0c55ebdc619a57c85f2d64..cee1dd2e69b2e173c5218365312e73332652337f 100644 (file)
@@ -1,12 +1,13 @@
 #
 # Automatically generated make config: don't edit
-# Linux kernel version: 2.6.29-rc4
-# Tue Feb 24 15:44:16 2009
+# Linux kernel version: 2.6.30-rc2
+# Mon May 11 16:22:00 2009
 #
 CONFIG_64BIT=y
 # CONFIG_X86_32 is not set
 CONFIG_X86_64=y
 CONFIG_X86=y
+CONFIG_OUTPUT_FORMAT="elf64-x86-64"
 CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig"
 CONFIG_GENERIC_TIME=y
 CONFIG_GENERIC_CMOS_UPDATE=y
@@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y
 CONFIG_ARCH_HAS_DEFAULT_IDLE=y
 CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
 CONFIG_HAVE_SETUP_PER_CPU_AREA=y
+CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y
 CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y
 CONFIG_ARCH_HIBERNATION_POSSIBLE=y
 CONFIG_ARCH_SUSPEND_POSSIBLE=y
@@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y
 CONFIG_ARCH_POPULATES_NODE_MAP=y
 CONFIG_AUDIT_ARCH=y
 CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
+CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
 CONFIG_GENERIC_HARDIRQS=y
+CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y
 CONFIG_GENERIC_IRQ_PROBE=y
 CONFIG_GENERIC_PENDING_IRQ=y
-CONFIG_X86_SMP=y
 CONFIG_USE_GENERIC_SMP_HELPERS=y
 CONFIG_X86_64_SMP=y
 CONFIG_X86_HT=y
-CONFIG_X86_BIOS_REBOOT=y
 CONFIG_X86_TRAMPOLINE=y
 # CONFIG_KTIME_SCALAR is not set
 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
@@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y
 CONFIG_INIT_ENV_ARG_LIMIT=32
 CONFIG_LOCALVERSION=""
 # CONFIG_LOCALVERSION_AUTO is not set
+CONFIG_HAVE_KERNEL_GZIP=y
+CONFIG_HAVE_KERNEL_BZIP2=y
+CONFIG_HAVE_KERNEL_LZMA=y
+CONFIG_KERNEL_GZIP=y
+# CONFIG_KERNEL_BZIP2 is not set
+# CONFIG_KERNEL_LZMA is not set
 CONFIG_SWAP=y
 CONFIG_SYSVIPC=y
 CONFIG_SYSVIPC_SYSCTL=y
 CONFIG_POSIX_MQUEUE=y
+CONFIG_POSIX_MQUEUE_SYSCTL=y
 CONFIG_BSD_PROCESS_ACCT=y
 # CONFIG_BSD_PROCESS_ACCT_V3 is not set
 CONFIG_TASKSTATS=y
@@ -114,23 +123,26 @@ CONFIG_PID_NS=y
 CONFIG_NET_NS=y
 CONFIG_BLK_DEV_INITRD=y
 CONFIG_INITRAMFS_SOURCE=""
+CONFIG_RD_GZIP=y
+CONFIG_RD_BZIP2=y
+CONFIG_RD_LZMA=y
 CONFIG_CC_OPTIMIZE_FOR_SIZE=y
 CONFIG_SYSCTL=y
+CONFIG_ANON_INODES=y
 # CONFIG_EMBEDDED is not set
 CONFIG_UID16=y
 CONFIG_SYSCTL_SYSCALL=y
 CONFIG_KALLSYMS=y
 CONFIG_KALLSYMS_ALL=y
 CONFIG_KALLSYMS_EXTRA_PASS=y
+# CONFIG_STRIP_ASM_SYMS is not set
 CONFIG_HOTPLUG=y
 CONFIG_PRINTK=y
 CONFIG_BUG=y
 CONFIG_ELF_CORE=y
 CONFIG_PCSPKR_PLATFORM=y
-# CONFIG_COMPAT_BRK is not set
 CONFIG_BASE_FULL=y
 CONFIG_FUTEX=y
-CONFIG_ANON_INODES=y
 CONFIG_EPOLL=y
 CONFIG_SIGNALFD=y
 CONFIG_TIMERFD=y
@@ -140,6 +152,7 @@ CONFIG_AIO=y
 CONFIG_VM_EVENT_COUNTERS=y
 CONFIG_PCI_QUIRKS=y
 CONFIG_SLUB_DEBUG=y
+# CONFIG_COMPAT_BRK is not set
 # CONFIG_SLAB is not set
 CONFIG_SLUB=y
 # CONFIG_SLOB is not set
@@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y
 CONFIG_HAVE_KPROBES=y
 CONFIG_HAVE_KRETPROBES=y
 CONFIG_HAVE_ARCH_TRACEHOOK=y
+CONFIG_HAVE_DMA_API_DEBUG=y
+# CONFIG_SLOW_WORK is not set
 # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set
 CONFIG_SLABINFO=y
 CONFIG_RT_MUTEXES=y
@@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y
 # CONFIG_MODULE_SRCVERSION_ALL is not set
 CONFIG_STOP_MACHINE=y
 CONFIG_BLOCK=y
-CONFIG_BLK_DEV_IO_TRACE=y
 CONFIG_BLK_DEV_BSG=y
 # CONFIG_BLK_DEV_INTEGRITY is not set
 CONFIG_BLOCK_COMPAT=y
@@ -195,12 +209,10 @@ CONFIG_HIGH_RES_TIMERS=y
 CONFIG_GENERIC_CLOCKEVENTS_BUILD=y
 CONFIG_SMP=y
 CONFIG_SPARSE_IRQ=y
-# CONFIG_NUMA_MIGRATE_IRQ_DESC is not set
-CONFIG_X86_FIND_SMP_CONFIG=y
 CONFIG_X86_MPPARSE=y
-# CONFIG_X86_ELAN is not set
-# CONFIG_X86_GENERICARCH is not set
+CONFIG_X86_EXTENDED_PLATFORM=y
 # CONFIG_X86_VSMP is not set
+# CONFIG_X86_UV is not set
 CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_PARAVIRT_GUEST is not set
 # CONFIG_MEMTEST is not set
@@ -230,10 +242,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y
 # CONFIG_MCORE2 is not set
 CONFIG_GENERIC_CPU=y
 CONFIG_X86_CPU=y
-CONFIG_X86_L1_CACHE_BYTES=128
-CONFIG_X86_INTERNODE_CACHE_BYTES=128
+CONFIG_X86_L1_CACHE_BYTES=64
+CONFIG_X86_INTERNODE_CACHE_BYTES=64
 CONFIG_X86_CMPXCHG=y
-CONFIG_X86_L1_CACHE_SHIFT=7
+CONFIG_X86_L1_CACHE_SHIFT=6
 CONFIG_X86_WP_WORKS_OK=y
 CONFIG_X86_TSC=y
 CONFIG_X86_CMPXCHG64=y
@@ -242,7 +254,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64
 CONFIG_X86_DEBUGCTLMSR=y
 CONFIG_CPU_SUP_INTEL=y
 CONFIG_CPU_SUP_AMD=y
-CONFIG_CPU_SUP_CENTAUR_64=y
+CONFIG_CPU_SUP_CENTAUR=y
 CONFIG_X86_DS=y
 CONFIG_X86_PTRACE_BTS=y
 CONFIG_HPET_TIMER=y
@@ -269,6 +281,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y
 CONFIG_X86_MCE=y
 CONFIG_X86_MCE_INTEL=y
 CONFIG_X86_MCE_AMD=y
+CONFIG_X86_MCE_THRESHOLD=y
 # CONFIG_I8K is not set
 CONFIG_MICROCODE=y
 CONFIG_MICROCODE_INTEL=y
@@ -276,6 +289,7 @@ CONFIG_MICROCODE_AMD=y
 CONFIG_MICROCODE_OLD_INTERFACE=y
 CONFIG_X86_MSR=y
 CONFIG_X86_CPUID=y
+# CONFIG_X86_CPU_DEBUG is not set
 CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
 CONFIG_DIRECT_GBPAGES=y
 CONFIG_NUMA=y
@@ -309,6 +323,8 @@ CONFIG_ZONE_DMA_FLAG=1
 CONFIG_BOUNCE=y
 CONFIG_VIRT_TO_BUS=y
 CONFIG_UNEVICTABLE_LRU=y
+CONFIG_HAVE_MLOCK=y
+CONFIG_HAVE_MLOCKED_PAGE_BIT=y
 CONFIG_X86_CHECK_BIOS_CORRUPTION=y
 CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y
 CONFIG_X86_RESERVE_LOW_64K=y
@@ -317,6 +333,7 @@ CONFIG_MTRR=y
 CONFIG_X86_PAT=y
 CONFIG_EFI=y
 CONFIG_SECCOMP=y
+# CONFIG_CC_STACKPROTECTOR is not set
 # CONFIG_HZ_100 is not set
 # CONFIG_HZ_250 is not set
 # CONFIG_HZ_300 is not set
@@ -325,9 +342,10 @@ CONFIG_HZ=1000
 CONFIG_SCHED_HRTICK=y
 CONFIG_KEXEC=y
 CONFIG_CRASH_DUMP=y
+# CONFIG_KEXEC_JUMP is not set
 CONFIG_PHYSICAL_START=0x1000000
-# CONFIG_RELOCATABLE is not set
-CONFIG_PHYSICAL_ALIGN=0x200000
+CONFIG_RELOCATABLE=y
+CONFIG_PHYSICAL_ALIGN=0x1000000
 CONFIG_HOTPLUG_CPU=y
 # CONFIG_COMPAT_VDSO is not set
 # CONFIG_CMDLINE_BOOL is not set
@@ -370,7 +388,6 @@ CONFIG_ACPI_NUMA=y
 CONFIG_ACPI_BLACKLIST_YEAR=0
 # CONFIG_ACPI_DEBUG is not set
 # CONFIG_ACPI_PCI_SLOT is not set
-CONFIG_ACPI_SYSTEM=y
 CONFIG_X86_PM_TIMER=y
 CONFIG_ACPI_CONTAINER=y
 # CONFIG_ACPI_SBS is not set
@@ -436,6 +453,7 @@ CONFIG_PCI_MSI=y
 # CONFIG_PCI_DEBUG is not set
 # CONFIG_PCI_STUB is not set
 CONFIG_HT_IRQ=y
+# CONFIG_PCI_IOV is not set
 CONFIG_ISA_DMA_API=y
 CONFIG_K8_NB=y
 CONFIG_PCCARD=y
@@ -481,7 +499,6 @@ CONFIG_NET=y
 #
 # Networking options
 #
-CONFIG_COMPAT_NET_DEV_OPS=y
 CONFIG_PACKET=y
 CONFIG_PACKET_MMAP=y
 CONFIG_UNIX=y
@@ -639,6 +656,7 @@ CONFIG_LLC=y
 # CONFIG_LAPB is not set
 # CONFIG_ECONET is not set
 # CONFIG_WAN_ROUTER is not set
+# CONFIG_PHONET is not set
 CONFIG_NET_SCHED=y
 
 #
@@ -696,6 +714,7 @@ CONFIG_NET_SCH_FIFO=y
 #
 # CONFIG_NET_PKTGEN is not set
 # CONFIG_NET_TCPPROBE is not set
+# CONFIG_NET_DROP_MONITOR is not set
 CONFIG_HAMRADIO=y
 
 #
@@ -706,12 +725,10 @@ CONFIG_HAMRADIO=y
 # CONFIG_IRDA is not set
 # CONFIG_BT is not set
 # CONFIG_AF_RXRPC is not set
-# CONFIG_PHONET is not set
 CONFIG_FIB_RULES=y
 CONFIG_WIRELESS=y
 CONFIG_CFG80211=y
 # CONFIG_CFG80211_REG_DEBUG is not set
-CONFIG_NL80211=y
 CONFIG_WIRELESS_OLD_REGULATORY=y
 CONFIG_WIRELESS_EXT=y
 CONFIG_WIRELESS_EXT_SYSFS=y
@@ -788,9 +805,8 @@ CONFIG_MISC_DEVICES=y
 # CONFIG_TIFM_CORE is not set
 # CONFIG_ICS932S401 is not set
 # CONFIG_ENCLOSURE_SERVICES is not set
-# CONFIG_SGI_XP is not set
 # CONFIG_HP_ILO is not set
-# CONFIG_SGI_GRU is not set
+# CONFIG_ISL29003 is not set
 # CONFIG_C2PORT is not set
 
 #
@@ -844,6 +860,7 @@ CONFIG_SCSI_SPI_ATTRS=y
 # CONFIG_SCSI_LOWLEVEL is not set
 # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set
 # CONFIG_SCSI_DH is not set
+# CONFIG_SCSI_OSD_INITIATOR is not set
 CONFIG_ATA=y
 # CONFIG_ATA_NONSTANDARD is not set
 CONFIG_ATA_ACPI=y
@@ -940,6 +957,7 @@ CONFIG_DM_ZERO=y
 CONFIG_MACINTOSH_DRIVERS=y
 CONFIG_MAC_EMUMOUSEBTN=y
 CONFIG_NETDEVICES=y
+CONFIG_COMPAT_NET_DEV_OPS=y
 # CONFIG_IFB is not set
 # CONFIG_DUMMY is not set
 # CONFIG_BONDING is not set
@@ -977,6 +995,8 @@ CONFIG_MII=y
 CONFIG_NET_VENDOR_3COM=y
 # CONFIG_VORTEX is not set
 # CONFIG_TYPHOON is not set
+# CONFIG_ETHOC is not set
+# CONFIG_DNET is not set
 CONFIG_NET_TULIP=y
 # CONFIG_DE2104X is not set
 # CONFIG_TULIP is not set
@@ -1026,6 +1046,7 @@ CONFIG_E1000=y
 # CONFIG_E1000E is not set
 # CONFIG_IP1000 is not set
 # CONFIG_IGB is not set
+# CONFIG_IGBVF is not set
 # CONFIG_NS83820 is not set
 # CONFIG_HAMACHI is not set
 # CONFIG_YELLOWFIN is not set
@@ -1040,6 +1061,7 @@ CONFIG_TIGON3=y
 # CONFIG_QLA3XXX is not set
 # CONFIG_ATL1 is not set
 # CONFIG_ATL1E is not set
+# CONFIG_ATL1C is not set
 # CONFIG_JME is not set
 CONFIG_NETDEV_10000=y
 # CONFIG_CHELSIO_T1 is not set
@@ -1049,6 +1071,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_IXGBE is not set
 # CONFIG_IXGB is not set
 # CONFIG_S2IO is not set
+# CONFIG_VXGE is not set
 # CONFIG_MYRI10GE is not set
 # CONFIG_NETXEN_NIC is not set
 # CONFIG_NIU is not set
@@ -1058,6 +1081,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y
 # CONFIG_BNX2X is not set
 # CONFIG_QLGE is not set
 # CONFIG_SFC is not set
+# CONFIG_BE2NET is not set
 CONFIG_TR=y
 # CONFIG_IBMOL is not set
 # CONFIG_3C359 is not set
@@ -1072,8 +1096,8 @@ CONFIG_WLAN_80211=y
 # CONFIG_LIBERTAS is not set
 # CONFIG_LIBERTAS_THINFIRM is not set
 # CONFIG_AIRO is not set
-# CONFIG_HERMES is not set
 # CONFIG_ATMEL is not set
+# CONFIG_AT76C50X_USB is not set
 # CONFIG_AIRO_CS is not set
 # CONFIG_PCMCIA_WL3501 is not set
 # CONFIG_PRISM54 is not set
@@ -1083,21 +1107,21 @@ CONFIG_WLAN_80211=y
 # CONFIG_RTL8187 is not set
 # CONFIG_ADM8211 is not set
 # CONFIG_MAC80211_HWSIM is not set
+# CONFIG_MWL8K is not set
 # CONFIG_P54_COMMON is not set
 CONFIG_ATH5K=y
 # CONFIG_ATH5K_DEBUG is not set
 # CONFIG_ATH9K is not set
+# CONFIG_AR9170_USB is not set
 # CONFIG_IPW2100 is not set
 # CONFIG_IPW2200 is not set
-# CONFIG_IWLCORE is not set
-# CONFIG_IWLWIFI_LEDS is not set
-# CONFIG_IWLAGN is not set
-# CONFIG_IWL3945 is not set
+# CONFIG_IWLWIFI is not set
 # CONFIG_HOSTAP is not set
 # CONFIG_B43 is not set
 # CONFIG_B43LEGACY is not set
 # CONFIG_ZD1211RW is not set
 # CONFIG_RT2X00 is not set
+# CONFIG_HERMES is not set
 
 #
 # Enable WiMAX (Networking options) to see the WiMAX drivers
@@ -1208,6 +1232,8 @@ CONFIG_INPUT_TABLET=y
 # CONFIG_TABLET_USB_KBTAB is not set
 # CONFIG_TABLET_USB_WACOM is not set
 CONFIG_INPUT_TOUCHSCREEN=y
+# CONFIG_TOUCHSCREEN_AD7879_I2C is not set
+# CONFIG_TOUCHSCREEN_AD7879 is not set
 # CONFIG_TOUCHSCREEN_FUJITSU is not set
 # CONFIG_TOUCHSCREEN_GUNZE is not set
 # CONFIG_TOUCHSCREEN_ELO is not set
@@ -1301,6 +1327,7 @@ CONFIG_UNIX98_PTYS=y
 # CONFIG_LEGACY_PTYS is not set
 # CONFIG_IPMI_HANDLER is not set
 CONFIG_HW_RANDOM=y
+# CONFIG_HW_RANDOM_TIMERIOMEM is not set
 # CONFIG_HW_RANDOM_INTEL is not set
 # CONFIG_HW_RANDOM_AMD is not set
 CONFIG_NVRAM=y
@@ -1382,7 +1409,6 @@ CONFIG_I2C_I801=y
 # CONFIG_SENSORS_PCF8574 is not set
 # CONFIG_PCF8575 is not set
 # CONFIG_SENSORS_PCA9539 is not set
-# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_MAX6875 is not set
 # CONFIG_SENSORS_TSL2550 is not set
 # CONFIG_I2C_DEBUG_CORE is not set
@@ -1416,6 +1442,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_ADT7475 is not set
 # CONFIG_SENSORS_K8TEMP is not set
 # CONFIG_SENSORS_ASB100 is not set
+# CONFIG_SENSORS_ATK0110 is not set
 # CONFIG_SENSORS_ATXP1 is not set
 # CONFIG_SENSORS_DS1621 is not set
 # CONFIG_SENSORS_I5K_AMB is not set
@@ -1425,6 +1452,7 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_FSCHER is not set
 # CONFIG_SENSORS_FSCPOS is not set
 # CONFIG_SENSORS_FSCHMD is not set
+# CONFIG_SENSORS_G760A is not set
 # CONFIG_SENSORS_GL518SM is not set
 # CONFIG_SENSORS_GL520SM is not set
 # CONFIG_SENSORS_CORETEMP is not set
@@ -1440,11 +1468,14 @@ CONFIG_HWMON=y
 # CONFIG_SENSORS_LM90 is not set
 # CONFIG_SENSORS_LM92 is not set
 # CONFIG_SENSORS_LM93 is not set
+# CONFIG_SENSORS_LTC4215 is not set
 # CONFIG_SENSORS_LTC4245 is not set
+# CONFIG_SENSORS_LM95241 is not set
 # CONFIG_SENSORS_MAX1619 is not set
 # CONFIG_SENSORS_MAX6650 is not set
 # CONFIG_SENSORS_PC87360 is not set
 # CONFIG_SENSORS_PC87427 is not set
+# CONFIG_SENSORS_PCF8591 is not set
 # CONFIG_SENSORS_SIS5595 is not set
 # CONFIG_SENSORS_DME1737 is not set
 # CONFIG_SENSORS_SMSC47M1 is not set
@@ -1635,6 +1666,7 @@ CONFIG_FB_EFI=y
 # CONFIG_FB_VIRTUAL is not set
 # CONFIG_FB_METRONOME is not set
 # CONFIG_FB_MB862XX is not set
+# CONFIG_FB_BROADSHEET is not set
 CONFIG_BACKLIGHT_LCD_SUPPORT=y
 # CONFIG_LCD_CLASS_DEVICE is not set
 CONFIG_BACKLIGHT_CLASS_DEVICE=y
@@ -1720,6 +1752,8 @@ CONFIG_SND_PCI=y
 # CONFIG_SND_INDIGO is not set
 # CONFIG_SND_INDIGOIO is not set
 # CONFIG_SND_INDIGODJ is not set
+# CONFIG_SND_INDIGOIOX is not set
+# CONFIG_SND_INDIGODJX is not set
 # CONFIG_SND_EMU10K1 is not set
 # CONFIG_SND_EMU10K1X is not set
 # CONFIG_SND_ENS1370 is not set
@@ -1792,15 +1826,17 @@ CONFIG_USB_HIDDEV=y
 #
 # Special HID drivers
 #
-CONFIG_HID_COMPAT=y
 CONFIG_HID_A4TECH=y
 CONFIG_HID_APPLE=y
 CONFIG_HID_BELKIN=y
 CONFIG_HID_CHERRY=y
 CONFIG_HID_CHICONY=y
 CONFIG_HID_CYPRESS=y
+# CONFIG_DRAGONRISE_FF is not set
 CONFIG_HID_EZKEY=y
+CONFIG_HID_KYE=y
 CONFIG_HID_GYRATION=y
+CONFIG_HID_KENSINGTON=y
 CONFIG_HID_LOGITECH=y
 CONFIG_LOGITECH_FF=y
 # CONFIG_LOGIRUMBLEPAD2_FF is not set
@@ -1866,11 +1902,11 @@ CONFIG_USB_PRINTER=y
 # CONFIG_USB_TMC is not set
 
 #
-# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed;
+# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may
 #
 
 #
-# see USB_STORAGE Help for more information
+# also be needed; see USB_STORAGE Help for more info
 #
 CONFIG_USB_STORAGE=y
 # CONFIG_USB_STORAGE_DEBUG is not set
@@ -1912,7 +1948,6 @@ CONFIG_USB_LIBUSUAL=y
 # CONFIG_USB_LED is not set
 # CONFIG_USB_CYPRESS_CY7C63 is not set
 # CONFIG_USB_CYTHERM is not set
-# CONFIG_USB_PHIDGET is not set
 # CONFIG_USB_IDMOUSE is not set
 # CONFIG_USB_FTDI_ELAN is not set
 # CONFIG_USB_APPLEDISPLAY is not set
@@ -1928,6 +1963,7 @@ CONFIG_USB_LIBUSUAL=y
 #
 # OTG and related infrastructure
 #
+# CONFIG_NOP_USB_XCEIV is not set
 # CONFIG_UWB is not set
 # CONFIG_MMC is not set
 # CONFIG_MEMSTICK is not set
@@ -1939,8 +1975,10 @@ CONFIG_LEDS_CLASS=y
 #
 # CONFIG_LEDS_ALIX2 is not set
 # CONFIG_LEDS_PCA9532 is not set
+# CONFIG_LEDS_LP5521 is not set
 # CONFIG_LEDS_CLEVO_MAIL is not set
 # CONFIG_LEDS_PCA955X is not set
+# CONFIG_LEDS_BD2802 is not set
 
 #
 # LED Triggers
@@ -1950,6 +1988,10 @@ CONFIG_LEDS_TRIGGERS=y
 # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set
 # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set
 # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set
+
+#
+# iptables trigger is under Netfilter config (LED target)
+#
 # CONFIG_ACCESSIBILITY is not set
 # CONFIG_INFINIBAND is not set
 CONFIG_EDAC=y
@@ -2018,6 +2060,7 @@ CONFIG_DMADEVICES=y
 # DMA Devices
 #
 # CONFIG_INTEL_IOATDMA is not set
+# CONFIG_AUXDISPLAY is not set
 # CONFIG_UIO is not set
 # CONFIG_STAGING is not set
 CONFIG_X86_PLATFORM_DEVICES=y
@@ -2051,6 +2094,7 @@ CONFIG_DMIID=y
 #
 # CONFIG_EXT2_FS is not set
 CONFIG_EXT3_FS=y
+# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set
 CONFIG_EXT3_FS_XATTR=y
 CONFIG_EXT3_FS_POSIX_ACL=y
 CONFIG_EXT3_FS_SECURITY=y
@@ -2081,6 +2125,11 @@ CONFIG_AUTOFS4_FS=y
 # CONFIG_FUSE_FS is not set
 CONFIG_GENERIC_ACL=y
 
+#
+# Caches
+#
+# CONFIG_FSCACHE is not set
+
 #
 # CD-ROM/DVD Filesystems
 #
@@ -2132,6 +2181,7 @@ CONFIG_MISC_FILESYSTEMS=y
 # CONFIG_ROMFS_FS is not set
 # CONFIG_SYSV_FS is not set
 # CONFIG_UFS_FS is not set
+# CONFIG_NILFS2_FS is not set
 CONFIG_NETWORK_FILESYSTEMS=y
 CONFIG_NFS_FS=y
 CONFIG_NFS_V3=y
@@ -2145,7 +2195,6 @@ CONFIG_NFS_ACL_SUPPORT=y
 CONFIG_NFS_COMMON=y
 CONFIG_SUNRPC=y
 CONFIG_SUNRPC_GSS=y
-# CONFIG_SUNRPC_REGISTER_V4 is not set
 CONFIG_RPCSEC_GSS_KRB5=y
 # CONFIG_RPCSEC_GSS_SPKM3 is not set
 # CONFIG_SMB_FS is not set
@@ -2232,6 +2281,7 @@ CONFIG_DEBUG_FS=y
 CONFIG_DEBUG_KERNEL=y
 # CONFIG_DEBUG_SHIRQ is not set
 # CONFIG_DETECT_SOFTLOCKUP is not set
+# CONFIG_DETECT_HUNG_TASK is not set
 # CONFIG_SCHED_DEBUG is not set
 CONFIG_SCHEDSTATS=y
 CONFIG_TIMER_STATS=y
@@ -2247,6 +2297,7 @@ CONFIG_TIMER_STATS=y
 # CONFIG_LOCK_STAT is not set
 # CONFIG_DEBUG_SPINLOCK_SLEEP is not set
 # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set
+CONFIG_STACKTRACE=y
 # CONFIG_DEBUG_KOBJECT is not set
 CONFIG_DEBUG_BUGVERBOSE=y
 # CONFIG_DEBUG_INFO is not set
@@ -2269,13 +2320,19 @@ CONFIG_FRAME_POINTER=y
 # CONFIG_FAULT_INJECTION is not set
 # CONFIG_LATENCYTOP is not set
 CONFIG_SYSCTL_SYSCALL_CHECK=y
+# CONFIG_DEBUG_PAGEALLOC is not set
 CONFIG_USER_STACKTRACE_SUPPORT=y
+CONFIG_NOP_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACER=y
 CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y
 CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y
 CONFIG_HAVE_DYNAMIC_FTRACE=y
 CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y
 CONFIG_HAVE_HW_BRANCH_TRACER=y
+CONFIG_HAVE_FTRACE_SYSCALLS=y
+CONFIG_RING_BUFFER=y
+CONFIG_TRACING=y
+CONFIG_TRACING_SUPPORT=y
 
 #
 # Tracers
@@ -2285,13 +2342,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y
 # CONFIG_SYSPROF_TRACER is not set
 # CONFIG_SCHED_TRACER is not set
 # CONFIG_CONTEXT_SWITCH_TRACER is not set
+# CONFIG_EVENT_TRACER is not set
+# CONFIG_FTRACE_SYSCALLS is not set
 # CONFIG_BOOT_TRACER is not set
 # CONFIG_TRACE_BRANCH_PROFILING is not set
 # CONFIG_POWER_TRACER is not set
 # CONFIG_STACK_TRACER is not set
 # CONFIG_HW_BRANCH_TRACER is not set
+# CONFIG_KMEMTRACE is not set
+# CONFIG_WORKQUEUE_TRACER is not set
+CONFIG_BLK_DEV_IO_TRACE=y
+# CONFIG_FTRACE_STARTUP_TEST is not set
+# CONFIG_MMIOTRACE is not set
 CONFIG_PROVIDE_OHCI1394_DMA_INIT=y
-# CONFIG_DYNAMIC_PRINTK_DEBUG is not set
+# CONFIG_DYNAMIC_DEBUG is not set
+# CONFIG_DMA_API_DEBUG is not set
 # CONFIG_SAMPLES is not set
 CONFIG_HAVE_ARCH_KGDB=y
 # CONFIG_KGDB is not set
@@ -2301,14 +2366,13 @@ CONFIG_EARLY_PRINTK=y
 CONFIG_EARLY_PRINTK_DBGP=y
 CONFIG_DEBUG_STACKOVERFLOW=y
 CONFIG_DEBUG_STACK_USAGE=y
-# CONFIG_DEBUG_PAGEALLOC is not set
 # CONFIG_DEBUG_PER_CPU_MAPS is not set
 # CONFIG_X86_PTDUMP is not set
 CONFIG_DEBUG_RODATA=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_NX_TEST=m
 # CONFIG_IOMMU_DEBUG is not set
-# CONFIG_MMIOTRACE is not set
+CONFIG_HAVE_MMIOTRACE_SUPPORT=y
 CONFIG_IO_DELAY_TYPE_0X80=0
 CONFIG_IO_DELAY_TYPE_0XED=1
 CONFIG_IO_DELAY_TYPE_UDELAY=2
@@ -2344,6 +2408,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y
 CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1
 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set
 # CONFIG_SECURITY_SMACK is not set
+# CONFIG_SECURITY_TOMOYO is not set
+# CONFIG_IMA is not set
 CONFIG_CRYPTO=y
 
 #
@@ -2359,10 +2425,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y
 CONFIG_CRYPTO_HASH=y
 CONFIG_CRYPTO_HASH2=y
 CONFIG_CRYPTO_RNG2=y
+CONFIG_CRYPTO_PCOMP=y
 CONFIG_CRYPTO_MANAGER=y
 CONFIG_CRYPTO_MANAGER2=y
 # CONFIG_CRYPTO_GF128MUL is not set
 # CONFIG_CRYPTO_NULL is not set
+CONFIG_CRYPTO_WORKQUEUE=y
 # CONFIG_CRYPTO_CRYPTD is not set
 CONFIG_CRYPTO_AUTHENC=y
 # CONFIG_CRYPTO_TEST is not set
@@ -2414,6 +2482,7 @@ CONFIG_CRYPTO_SHA1=y
 #
 CONFIG_CRYPTO_AES=y
 # CONFIG_CRYPTO_AES_X86_64 is not set
+# CONFIG_CRYPTO_AES_NI_INTEL is not set
 # CONFIG_CRYPTO_ANUBIS is not set
 CONFIG_CRYPTO_ARC4=y
 # CONFIG_CRYPTO_BLOWFISH is not set
@@ -2435,6 +2504,7 @@ CONFIG_CRYPTO_DES=y
 # Compression
 #
 # CONFIG_CRYPTO_DEFLATE is not set
+# CONFIG_CRYPTO_ZLIB is not set
 # CONFIG_CRYPTO_LZO is not set
 
 #
@@ -2444,10 +2514,12 @@ CONFIG_CRYPTO_DES=y
 CONFIG_CRYPTO_HW=y
 # CONFIG_CRYPTO_DEV_HIFN_795X is not set
 CONFIG_HAVE_KVM=y
+CONFIG_HAVE_KVM_IRQCHIP=y
 CONFIG_VIRTUALIZATION=y
 # CONFIG_KVM is not set
 # CONFIG_VIRTIO_PCI is not set
 # CONFIG_VIRTIO_BALLOON is not set
+CONFIG_BINARY_PRINTF=y
 
 #
 # Library routines
@@ -2464,7 +2536,10 @@ CONFIG_CRC32=y
 # CONFIG_CRC7 is not set
 # CONFIG_LIBCRC32C is not set
 CONFIG_ZLIB_INFLATE=y
-CONFIG_PLIST=y
+CONFIG_DECOMPRESS_GZIP=y
+CONFIG_DECOMPRESS_BZIP2=y
+CONFIG_DECOMPRESS_LZMA=y
 CONFIG_HAS_IOMEM=y
 CONFIG_HAS_IOPORT=y
 CONFIG_HAS_DMA=y
+CONFIG_NLATTR=y
index a505202086e8741a916e696ccfb664d3f3e0f8ef..dcef387ddc36a87cb78554784d02e91808e70fb0 100644 (file)
@@ -830,4 +830,5 @@ ia32_sys_call_table:
        .quad sys_inotify_init1
        .quad compat_sys_preadv
        .quad compat_sys_pwritev
+       .quad compat_sys_rt_tgsigqueueinfo      /* 335 */
 ia32_syscall_end:
index f6aa18eadf71717d9e86c53ac3719776fa035969..1a37bcdc8606c0ccefaa73026c2188a8235b1a24 100644 (file)
@@ -3,6 +3,7 @@
 
 #include <linux/types.h>
 #include <linux/stddef.h>
+#include <linux/stringify.h>
 #include <asm/asm.h>
 
 /*
@@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {}
 
 const unsigned char *const *find_nop_table(void);
 
+/* alternative assembly primitive: */
+#define ALTERNATIVE(oldinstr, newinstr, feature)                       \
+                                                                       \
+      "661:\n\t" oldinstr "\n662:\n"                                   \
+      ".section .altinstructions,\"a\"\n"                              \
+      _ASM_ALIGN "\n"                                                  \
+      _ASM_PTR "661b\n"                                /* label           */   \
+      _ASM_PTR "663f\n"                                /* new instruction */   \
+      "         .byte " __stringify(feature) "\n"      /* feature bit     */   \
+      "         .byte 662b-661b\n"                     /* sourcelen       */   \
+      "         .byte 664f-663f\n"                     /* replacementlen  */   \
+      ".previous\n"                                                    \
+      ".section .altinstr_replacement, \"ax\"\n"                       \
+      "663:\n\t" newinstr "\n664:\n"           /* replacement     */   \
+      ".previous"
+
 /*
  * Alternative instructions for different CPU types or capabilities.
  *
@@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void);
  * without volatile and memory clobber.
  */
 #define alternative(oldinstr, newinstr, feature)                       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c0\n"           /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" :: "i" (feature) : "memory")
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
 
 /*
  * Alternative inline assembly with input.
@@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void);
  * Best is to use constraints that are fixed size (like (%1) ... "r")
  * If you use variable sized constraints like "m" or "g" in the
  * replacement make sure to pad to the worst case length.
+ * Leaving an unused argument 0 to keep API compatibility.
  */
 #define alternative_input(oldinstr, newinstr, feature, input...)       \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c0\n"           /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" :: "i" (feature), ##input)
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
+               : : "i" (0), ## input)
 
 /* Like alternative_input, but with a single output argument */
 #define alternative_io(oldinstr, newinstr, feature, output, input...)  \
-       asm volatile ("661:\n\t" oldinstr "\n662:\n"                    \
-                     ".section .altinstructions,\"a\"\n"               \
-                     _ASM_ALIGN "\n"                                   \
-                     _ASM_PTR "661b\n"         /* label */             \
-                     _ASM_PTR "663f\n"         /* new instruction */   \
-                     "  .byte %c[feat]\n"      /* feature bit */       \
-                     "  .byte 662b-661b\n"     /* sourcelen */         \
-                     "  .byte 664f-663f\n"     /* replacementlen */    \
-                     ".previous\n"                                     \
-                     ".section .altinstr_replacement,\"ax\"\n"         \
-                     "663:\n\t" newinstr "\n664:\n"  /* replacement */ \
-                     ".previous" : output : [feat] "i" (feature), ##input)
+       asm volatile (ALTERNATIVE(oldinstr, newinstr, feature)          \
+               : output : "i" (0), ## input)
 
 /*
  * use this macro(s) if you need more than one output parameter
index f712344329bc7a6929a38f8e0e968bd1166e4c9e..262e02820049aa8cc92dca18e83afdcbb829a71c 100644 (file)
@@ -27,6 +27,8 @@ extern int amd_iommu_init(void);
 extern int amd_iommu_init_dma_ops(void);
 extern void amd_iommu_detect(void);
 extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
+extern void amd_iommu_flush_all_domains(void);
+extern void amd_iommu_flush_all_devices(void);
 #else
 static inline int amd_iommu_init(void) { return -ENODEV; }
 static inline void amd_iommu_detect(void) { }
index 95c8cd9d22b54401fc8de9cf734512126b34a05f..0c878caaa0a23cda2a83a0ab9bb9ef3620bf8fad 100644 (file)
 #define PD_DMA_OPS_MASK                (1UL << 0) /* domain used for dma_ops */
 #define PD_DEFAULT_MASK                (1UL << 1) /* domain is a default dma_ops
                                              domain for an IOMMU */
+extern bool amd_iommu_dump;    /* DUMP_printk() only prints when this is set */
+#define DUMP_printk(format, arg...)                                    \
+       do {                                                            \
+               if (amd_iommu_dump)                                     \
+                       printk(KERN_INFO "AMD IOMMU: " format, ## arg); \
+       } while (0)
+
+/*
+ * Make iterating over all IOMMUs easier
+ */
+#define for_each_iommu(iommu) \
+       list_for_each_entry((iommu), &amd_iommu_list, list)
+#define for_each_iommu_safe(iommu, next) \
+       list_for_each_entry_safe((iommu), (next), &amd_iommu_list, list)
+
+#define APERTURE_RANGE_SHIFT   27      /* 128 MB */
+#define APERTURE_RANGE_SIZE    (1ULL << APERTURE_RANGE_SHIFT)
+#define APERTURE_RANGE_PAGES   (APERTURE_RANGE_SIZE >> PAGE_SHIFT)
+#define APERTURE_MAX_RANGES    32      /* allows 4GB of DMA address space */
+#define APERTURE_RANGE_INDEX(a)        ((a) >> APERTURE_RANGE_SHIFT)
+#define APERTURE_PAGE_INDEX(a) (((a) >> 21) & 0x3fULL)
 
 /*
  * This structure contains generic data for  IOMMU protection domains
@@ -209,6 +230,26 @@ struct protection_domain {
        void *priv;             /* private data */
 };
 
+/*
+ * For dynamic growth the aperture size is split into ranges of 128MB of
+ * DMA address space each. This struct represents one such range.
+ */
+struct aperture_range {
+
+       /* address allocation bitmap */
+       unsigned long *bitmap;
+
+       /*
+        * Array of PTE pages for the aperture. In this array we save all the
+        * leaf pages of the domain page table used for the aperture. This way
+        * we don't need to walk the page table to find a specific PTE. We can
+        * just calculate its address in constant time.
+        */
+       u64 *pte_pages[64];
+
+       unsigned long offset;
+};
+
 /*
  * Data container for a dma_ops specific protection domain
  */
@@ -222,18 +263,10 @@ struct dma_ops_domain {
        unsigned long aperture_size;
 
        /* address we start to search for free addresses */
-       unsigned long next_bit;
-
-       /* address allocation bitmap */
-       unsigned long *bitmap;
+       unsigned long next_address;
 
-       /*
-        * Array of PTE pages for the aperture. In this array we save all the
-        * leaf pages of the domain page table used for the aperture. This way
-        * we don't need to walk the page table to find a specific PTE. We can
-        * just calculate its address in constant time.
-        */
-       u64 **pte_pages;
+       /* address space relevant data */
+       struct aperture_range *aperture[APERTURE_MAX_RANGES];
 
        /* This will be set to true when TLB needs to be flushed */
        bool need_flush;
index 42f2f83774224fb7897bf848fc6b84c4bce87380..bb7d47925847c585ee7e9e66f6f2e8652b5ab753 100644 (file)
@@ -107,8 +107,7 @@ extern u32 native_safe_apic_wait_icr_idle(void);
 extern void native_apic_icr_write(u32 low, u32 id);
 extern u64 native_apic_icr_read(void);
 
-#define EIM_8BIT_APIC_ID       0
-#define EIM_32BIT_APIC_ID      1
+extern int x2apic_mode;
 
 #ifdef CONFIG_X86_X2APIC
 /*
@@ -166,10 +165,9 @@ static inline u64 native_x2apic_icr_read(void)
        return val;
 }
 
-extern int x2apic, x2apic_phys;
+extern int x2apic_phys;
 extern void check_x2apic(void);
 extern void enable_x2apic(void);
-extern void enable_IR_x2apic(void);
 extern void x2apic_icr_write(u32 low, u32 id);
 static inline int x2apic_enabled(void)
 {
@@ -183,6 +181,8 @@ static inline int x2apic_enabled(void)
                return 1;
        return 0;
 }
+
+#define x2apic_supported()     (cpu_has_x2apic)
 #else
 static inline void check_x2apic(void)
 {
@@ -190,28 +190,20 @@ static inline void check_x2apic(void)
 static inline void enable_x2apic(void)
 {
 }
-static inline void enable_IR_x2apic(void)
-{
-}
 static inline int x2apic_enabled(void)
 {
        return 0;
 }
 
-#define        x2apic  0
-
+#define        x2apic_preenabled 0
+#define        x2apic_supported()      0
 #endif
 
-extern int get_physical_broadcast(void);
+extern void enable_IR_x2apic(void);
 
-#ifdef CONFIG_X86_X2APIC
-static inline void ack_x2APIC_irq(void)
-{
-       /* Docs say use 0 for future compatibility */
-       native_apic_msr_write(APIC_EOI, 0);
-}
-#endif
+extern int get_physical_broadcast(void);
 
+extern void apic_disable(void);
 extern int lapic_get_maxlvt(void);
 extern void clear_local_APIC(void);
 extern void connect_bsp_APIC(void);
@@ -252,7 +244,7 @@ static inline void lapic_shutdown(void) { }
 #define local_apic_timer_c2_ok         1
 static inline void init_apic_mappings(void) { }
 static inline void disable_local_APIC(void) { }
-
+static inline void apic_disable(void) { }
 #endif /* !CONFIG_X86_LOCAL_APIC */
 
 #ifdef CONFIG_X86_64
@@ -410,7 +402,7 @@ static inline unsigned default_get_apic_id(unsigned long x)
 {
        unsigned int ver = GET_APIC_VERSION(apic_read(APIC_LVR));
 
-       if (APIC_XAPIC(ver))
+       if (APIC_XAPIC(ver) || boot_cpu_has(X86_FEATURE_EXTD_APICID))
                return (x >> 24) & 0xFF;
        else
                return (x >> 24) & 0x0F;
@@ -478,6 +470,9 @@ static inline unsigned int read_apic_id(void)
 extern void default_setup_apic_routing(void);
 
 #ifdef CONFIG_X86_32
+
+extern struct apic apic_default;
+
 /*
  * Set up the logical destination ID.
  *
index bc9514fb3b13f70d75b11f037546eca8e43d07fd..7ddb36ab933b97b7757c0df30732d31d40404c19 100644 (file)
@@ -22,6 +22,7 @@
 #  define      APIC_INTEGRATED(x)      (1)
 #endif
 #define                APIC_XAPIC(x)           ((x) >= 0x14)
+#define                APIC_EXT_SPACE(x)       ((x) & 0x80000000)
 #define        APIC_TASKPRI    0x80
 #define                APIC_TPRI_MASK          0xFFu
 #define        APIC_ARBPRI     0x90
 #define                APIC_TDR_DIV_32         0x8
 #define                APIC_TDR_DIV_64         0x9
 #define                APIC_TDR_DIV_128        0xA
-#define        APIC_EILVT0     0x500
+#define        APIC_EFEAT      0x400
+#define        APIC_ECTRL      0x410
+#define APIC_EILVTn(n) (0x500 + 0x10 * (n))    /* n = 0 gives 0x500, n = 1 gives 0x510, ... */
 #define                APIC_EILVT_NR_AMD_K8    1       /* # of extended interrupts */
 #define                APIC_EILVT_NR_AMD_10H   4
 #define                APIC_EILVT_LVTOFF(x)    (((x) >> 4) & 0xF)
 #define                APIC_EILVT_MSG_NMI      0x4
 #define                APIC_EILVT_MSG_EXT      0x7
 #define                APIC_EILVT_MASKED       (1 << 16)
-#define        APIC_EILVT1     0x510
-#define        APIC_EILVT2     0x520
-#define        APIC_EILVT3     0x530
 
 #define APIC_BASE (fix_to_virt(FIX_APIC_BASE))
 #define APIC_BASE_MSR  0x800
index 6ba23dd9fc9216de96b9cb52d1954b9bb7ae2b99..418e632d4a801aa8e86f565afeae59659d6500a0 100644 (file)
@@ -8,11 +8,26 @@
 
 #ifdef __KERNEL__
 
+#include <asm/page_types.h>
+
 /* Physical address where kernel should be loaded. */
 #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \
                                + (CONFIG_PHYSICAL_ALIGN - 1)) \
                                & ~(CONFIG_PHYSICAL_ALIGN - 1))
 
+/* Minimum kernel alignment, as a power of two (log2 of MIN_KERNEL_ALIGN) */
+#ifdef CONFIG_X86_64
+#define MIN_KERNEL_ALIGN_LG2   PMD_SHIFT
+#else
+#define MIN_KERNEL_ALIGN_LG2   (PAGE_SHIFT+1)
+#endif
+#define MIN_KERNEL_ALIGN       (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)
+
+#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \
+       (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2))
+#error "Invalid value for CONFIG_PHYSICAL_ALIGN"
+#endif
+
 #ifdef CONFIG_KERNEL_BZIP2
 #define BOOT_HEAP_SIZE             0x400000
 #else /* !CONFIG_KERNEL_BZIP2 */
index 433adaebf9b631f108399a3d27f660ff61a0369b..1724e8de317c59518daebdb305f75b23bb4fcaf6 100644 (file)
@@ -50,7 +50,8 @@ struct setup_header {
        __u32   ramdisk_size;
        __u32   bootsect_kludge;
        __u16   heap_end_ptr;
-       __u16   _pad1;
+       __u8    ext_loader_ver;
+       __u8    ext_loader_type;
        __u32   cmd_line_ptr;
        __u32   initrd_addr_max;
        __u32   kernel_alignment;
index 222802029fa6c43b504959401236d559991deba0..d96c1ee3a95cce1ec7fdf48a1fe5269bba58d32f 100644 (file)
@@ -86,105 +86,7 @@ enum cpu_file_bit {
        CPU_VALUE_BIT,                          /* value                */
 };
 
-#define        CPU_FILE_VALUE                  (1 << CPU_VALUE_BIT)
-
-/*
- * DisplayFamily_DisplayModel  Processor Families/Processor Number Series
- * --------------------------  ------------------------------------------
- * 05_01, 05_02, 05_04         Pentium, Pentium with MMX
- *
- * 06_01                       Pentium Pro
- * 06_03, 06_05                        Pentium II Xeon, Pentium II
- * 06_07, 06_08, 06_0A, 06_0B  Pentium III Xeon, Pentum III
- *
- * 06_09, 060D                 Pentium M
- *
- * 06_0E                       Core Duo, Core Solo
- *
- * 06_0F                       Xeon 3000, 3200, 5100, 5300, 7300 series,
- *                             Core 2 Quad, Core 2 Extreme, Core 2 Duo,
- *                             Pentium dual-core
- * 06_17                       Xeon 5200, 5400 series, Core 2 Quad Q9650
- *
- * 06_1C                       Atom
- *
- * 0F_00, 0F_01, 0F_02         Xeon, Xeon MP, Pentium 4
- * 0F_03, 0F_04                        Xeon, Xeon MP, Pentium 4, Pentium D
- *
- * 0F_06                       Xeon 7100, 5000 Series, Xeon MP,
- *                             Pentium 4, Pentium D
- */
-
-/* Register processors bits */
-enum cpu_processor_bit {
-       CPU_NONE,
-/* Intel */
-       CPU_INTEL_PENTIUM_BIT,
-       CPU_INTEL_P6_BIT,
-       CPU_INTEL_PENTIUM_M_BIT,
-       CPU_INTEL_CORE_BIT,
-       CPU_INTEL_CORE2_BIT,
-       CPU_INTEL_ATOM_BIT,
-       CPU_INTEL_XEON_P4_BIT,
-       CPU_INTEL_XEON_MP_BIT,
-/* AMD */
-       CPU_AMD_K6_BIT,
-       CPU_AMD_K7_BIT,
-       CPU_AMD_K8_BIT,
-       CPU_AMD_0F_BIT,
-       CPU_AMD_10_BIT,
-       CPU_AMD_11_BIT,
-};
-
-#define        CPU_INTEL_PENTIUM       (1 << CPU_INTEL_PENTIUM_BIT)
-#define        CPU_INTEL_P6            (1 << CPU_INTEL_P6_BIT)
-#define        CPU_INTEL_PENTIUM_M     (1 << CPU_INTEL_PENTIUM_M_BIT)
-#define        CPU_INTEL_CORE          (1 << CPU_INTEL_CORE_BIT)
-#define        CPU_INTEL_CORE2         (1 << CPU_INTEL_CORE2_BIT)
-#define        CPU_INTEL_ATOM          (1 << CPU_INTEL_ATOM_BIT)
-#define        CPU_INTEL_XEON_P4       (1 << CPU_INTEL_XEON_P4_BIT)
-#define        CPU_INTEL_XEON_MP       (1 << CPU_INTEL_XEON_MP_BIT)
-
-#define        CPU_INTEL_PX            (CPU_INTEL_P6 | CPU_INTEL_PENTIUM_M)
-#define        CPU_INTEL_COREX         (CPU_INTEL_CORE | CPU_INTEL_CORE2)
-#define        CPU_INTEL_XEON          (CPU_INTEL_XEON_P4 | CPU_INTEL_XEON_MP)
-#define        CPU_CO_AT               (CPU_INTEL_CORE | CPU_INTEL_ATOM)
-#define        CPU_C2_AT               (CPU_INTEL_CORE2 | CPU_INTEL_ATOM)
-#define        CPU_CX_AT               (CPU_INTEL_COREX | CPU_INTEL_ATOM)
-#define        CPU_CX_XE               (CPU_INTEL_COREX | CPU_INTEL_XEON)
-#define        CPU_P6_XE               (CPU_INTEL_P6 | CPU_INTEL_XEON)
-#define        CPU_PM_CO_AT            (CPU_INTEL_PENTIUM_M | CPU_CO_AT)
-#define        CPU_C2_AT_XE            (CPU_C2_AT | CPU_INTEL_XEON)
-#define        CPU_CX_AT_XE            (CPU_CX_AT | CPU_INTEL_XEON)
-#define        CPU_P6_CX_AT            (CPU_INTEL_P6 | CPU_CX_AT)
-#define        CPU_P6_CX_XE            (CPU_P6_XE | CPU_INTEL_COREX)
-#define        CPU_P6_CX_AT_XE         (CPU_INTEL_P6 | CPU_CX_AT_XE)
-#define        CPU_PM_CX_AT_XE         (CPU_INTEL_PENTIUM_M | CPU_CX_AT_XE)
-#define        CPU_PM_CX_AT            (CPU_INTEL_PENTIUM_M | CPU_CX_AT)
-#define        CPU_PM_CX_XE            (CPU_INTEL_PENTIUM_M | CPU_CX_XE)
-#define        CPU_PX_CX_AT            (CPU_INTEL_PX | CPU_CX_AT)
-#define        CPU_PX_CX_AT_XE         (CPU_INTEL_PX | CPU_CX_AT_XE)
-
-/* Select all supported Intel CPUs */
-#define        CPU_INTEL_ALL           (CPU_INTEL_PENTIUM | CPU_PX_CX_AT_XE)
-
-#define        CPU_AMD_K6              (1 << CPU_AMD_K6_BIT)
-#define        CPU_AMD_K7              (1 << CPU_AMD_K7_BIT)
-#define        CPU_AMD_K8              (1 << CPU_AMD_K8_BIT)
-#define        CPU_AMD_0F              (1 << CPU_AMD_0F_BIT)
-#define        CPU_AMD_10              (1 << CPU_AMD_10_BIT)
-#define        CPU_AMD_11              (1 << CPU_AMD_11_BIT)
-
-#define        CPU_K10_PLUS            (CPU_AMD_10 | CPU_AMD_11)
-#define        CPU_K0F_PLUS            (CPU_AMD_0F | CPU_K10_PLUS)
-#define        CPU_K8_PLUS             (CPU_AMD_K8 | CPU_K0F_PLUS)
-#define        CPU_K7_PLUS             (CPU_AMD_K7 | CPU_K8_PLUS)
-
-/* Select all supported AMD CPUs */
-#define        CPU_AMD_ALL             (CPU_AMD_K6 | CPU_K7_PLUS)
-
-/* Select all supported CPUs */
-#define        CPU_ALL                 (CPU_INTEL_ALL | CPU_AMD_ALL)
+#define        CPU_FILE_VALUE          (1 << CPU_VALUE_BIT)
 
 #define MAX_CPU_FILES          512
 
@@ -220,7 +122,6 @@ struct cpu_debug_range {
        unsigned                min;            /* Register range min   */
        unsigned                max;            /* Register range max   */
        unsigned                flag;           /* Supported flags      */
-       unsigned                model;          /* Supported models     */
 };
 
 #endif /* _ASM_X86_CPU_DEBUG_H */
index 9c63bf37ad533f431de2c48db85a963fe1725844..4a28d22d479362ccf3a5bedbc8edb6499558e1ef 100644 (file)
@@ -22,7 +22,7 @@
 #define X86_FEATURE_TSC                (0*32+ 4) /* Time Stamp Counter */
 #define X86_FEATURE_MSR                (0*32+ 5) /* Model-Specific Registers */
 #define X86_FEATURE_PAE                (0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Architecture */
+#define X86_FEATURE_MCE                (0*32+ 7) /* Machine Check Exception */
 #define X86_FEATURE_CX8                (0*32+ 8) /* CMPXCHG8 instruction */
 #define X86_FEATURE_APIC       (0*32+ 9) /* Onboard APIC */
 #define X86_FEATURE_SEP                (0*32+11) /* SYSENTER/SYSEXIT */
@@ -94,6 +94,7 @@
 #define X86_FEATURE_TSC_RELIABLE (3*32+23) /* TSC is known to be reliable */
 #define X86_FEATURE_NONSTOP_TSC        (3*32+24) /* TSC does not stop in C states */
 #define X86_FEATURE_CLFLUSH_MONITOR (3*32+25) /* "" clflush reqd with monitor */
+#define X86_FEATURE_EXTD_APICID        (3*32+26) /* has extended APICID (8 bits) */
 
 /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
 #define X86_FEATURE_XMM3       (4*32+ 0) /* "pni" SSE-3 */
@@ -194,11 +195,11 @@ extern const char * const x86_power_flags[32];
 #define clear_cpu_cap(c, bit)  clear_bit(bit, (unsigned long *)((c)->x86_capability))
 #define setup_clear_cpu_cap(bit) do { \
        clear_cpu_cap(&boot_cpu_data, bit);     \
-       set_bit(bit, (unsigned long *)cleared_cpu_caps); \
+       set_bit(bit, (unsigned long *)cpu_caps_cleared); \
 } while (0)
 #define setup_force_cpu_cap(bit) do { \
        set_cpu_cap(&boot_cpu_data, bit);       \
-       clear_bit(bit, (unsigned long *)cleared_cpu_caps);      \
+       set_bit(bit, (unsigned long *)cpu_caps_set);    \
 } while (0)
 
 #define cpu_has_fpu            boot_cpu_has(X86_FEATURE_FPU)
index a8f672ba100c541fa893c4167dffabfab1c97561..70dac199b093d6727db43ac652aa92c26e41eb20 100644 (file)
@@ -15,8 +15,8 @@
  * - buffer allocation (memory accounting)
  *
  *
- * Copyright (C) 2007-2008 Intel Corporation.
- * Markus Metzger <markus.t.metzger@intel.com>, 2007-2008
+ * Copyright (C) 2007-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
 #ifndef _ASM_X86_DS_H
@@ -83,8 +83,10 @@ enum ds_feature {
  * The interrupt threshold is independent from the overflow callback
  * to allow users to use their own overflow interrupt handling mechanism.
  *
- * task: the task to request recording for;
- *       NULL for per-cpu recording on the current cpu
+ * The function might sleep.
+ *
+ * task: the task to request recording for
+ * cpu:  the cpu to request recording for
  * base: the base pointer for the (non-pageable) buffer;
  * size: the size of the provided buffer in bytes
  * ovfl: pointer to a function to be called on buffer overflow;
@@ -93,19 +95,28 @@ enum ds_feature {
  *     -1 if no interrupt threshold is requested.
  * flags: a bit-mask of the above flags
  */
-extern struct bts_tracer *ds_request_bts(struct task_struct *task,
-                                        void *base, size_t size,
-                                        bts_ovfl_callback_t ovfl,
-                                        size_t th, unsigned int flags);
-extern struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-                                          void *base, size_t size,
-                                          pebs_ovfl_callback_t ovfl,
-                                          size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+                                             void *base, size_t size,
+                                             bts_ovfl_callback_t ovfl,
+                                             size_t th, unsigned int flags);
+extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+                                            bts_ovfl_callback_t ovfl,
+                                            size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+                                               void *base, size_t size,
+                                               pebs_ovfl_callback_t ovfl,
+                                               size_t th, unsigned int flags);
+extern struct pebs_tracer *ds_request_pebs_cpu(int cpu,
+                                              void *base, size_t size,
+                                              pebs_ovfl_callback_t ovfl,
+                                              size_t th, unsigned int flags);
 
 /*
  * Release BTS or PEBS resources
  * Suspend and resume BTS or PEBS tracing
  *
+ * Must be called with irq's enabled.
+ *
  * tracer: the tracer handle returned from ds_request_~()
  */
 extern void ds_release_bts(struct bts_tracer *tracer);
@@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer);
 extern void ds_suspend_pebs(struct pebs_tracer *tracer);
 extern void ds_resume_pebs(struct pebs_tracer *tracer);
 
+/*
+ * Release BTS or PEBS resources
+ * Suspend and resume BTS or PEBS tracing
+ *
+ * Cpu tracers must call this on the traced cpu.
+ * Task tracers must call ds_release_~_noirq() for themselves.
+ *
+ * May be called with irq's disabled.
+ *
+ * Returns 0 if successful;
+ * -EPERM if the cpu tracer does not trace the current cpu.
+ * -EPERM if the task tracer does not trace itself.
+ *
+ * tracer: the tracer handle returned from ds_request_~()
+ */
+extern int ds_release_bts_noirq(struct bts_tracer *tracer);
+extern int ds_suspend_bts_noirq(struct bts_tracer *tracer);
+extern int ds_resume_bts_noirq(struct bts_tracer *tracer);
+extern int ds_release_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer);
+extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer);
+
 
 /*
  * The raw DS buffer state as it is used for BTS and PEBS recording.
@@ -170,9 +203,9 @@ struct bts_struct {
                } lbr;
                /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */
                struct {
-                       __u64 jiffies;
+                       __u64 clock;
                        pid_t pid;
-               } timestamp;
+               } event;
        } variant;
 };
 
@@ -201,8 +234,12 @@ struct bts_trace {
 struct pebs_trace {
        struct ds_trace ds;
 
-       /* the PEBS reset value */
-       unsigned long long reset_value;
+       /* the number of valid counters in the below array */
+       unsigned int counters;
+
+#define MAX_PEBS_COUNTERS 4
+       /* the counter reset value */
+       unsigned long long counter_reset[MAX_PEBS_COUNTERS];
 };
 
 
@@ -237,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer);
  * Returns 0 on success; -Eerrno on error
  *
  * tracer: the tracer handle returned from ds_request_pebs()
+ * counter: the index of the counter
  * value: the new counter reset value
  */
-extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value);
+extern int ds_set_pebs_reset(struct pebs_tracer *tracer,
+                            unsigned int counter, u64 value);
 
 /*
  * Initialization
@@ -252,21 +291,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *);
  */
 extern void ds_switch_to(struct task_struct *prev, struct task_struct *next);
 
-/*
- * Task clone/init and cleanup work
- */
-extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father);
-extern void ds_exit_thread(struct task_struct *tsk);
-
 #else /* CONFIG_X86_DS */
 
 struct cpuinfo_x86;
 static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {}
 static inline void ds_switch_to(struct task_struct *prev,
                                struct task_struct *next) {}
-static inline void ds_copy_thread(struct task_struct *tsk,
-                                 struct task_struct *father) {}
-static inline void ds_exit_thread(struct task_struct *tsk) {}
 
 #endif /* CONFIG_X86_DS */
 #endif /* _ASM_X86_DS_H */
index b762ea49bd703ab3b28958cf83b6908e825e57a4..3bd1777a4c8b3c2e647659c6c1ad6e8eb4bbbcf0 100644 (file)
@@ -63,7 +63,26 @@ extern unsigned long io_apic_irqs;
 extern void init_VISWS_APIC_irqs(void);
 extern void setup_IO_APIC(void);
 extern void disable_IO_APIC(void);
-extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn);
+
+struct io_apic_irq_attr {
+       int ioapic;
+       int ioapic_pin;
+       int trigger;
+       int polarity;
+};
+
+static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
+                                       int ioapic, int ioapic_pin,
+                                       int trigger, int polarity)
+{
+       irq_attr->ioapic     = ioapic;
+       irq_attr->ioapic_pin = ioapic_pin;
+       irq_attr->trigger    = trigger;
+       irq_attr->polarity   = polarity;
+}
+
+extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin,
+                                       struct io_apic_irq_attr *irq_attr);
 extern void setup_ioapic_dest(void);
 
 extern void enable_IO_APIC(void);
@@ -78,7 +97,11 @@ extern void eisa_set_level_irq(unsigned int irq);
 /* SMP */
 extern void smp_apic_timer_interrupt(struct pt_regs *);
 extern void smp_spurious_interrupt(struct pt_regs *);
+extern void smp_generic_interrupt(struct pt_regs *);
 extern void smp_error_interrupt(struct pt_regs *);
+#ifdef CONFIG_X86_IO_APIC
+extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
+#endif
 #ifdef CONFIG_SMP
 extern void smp_reschedule_interrupt(struct pt_regs *);
 extern void smp_call_function_interrupt(struct pt_regs *);
index 71c9e51839827dfc14ec28858467c2ea4815297c..175adf58dd4f8e3cec35d69123d4640af2ff37c8 100644 (file)
@@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
                     ".previous\n"
                     _ASM_EXTABLE(1b, 3b)
                     : [err] "=r" (err)
-#if 0 /* See comment in __save_init_fpu() below. */
+#if 0 /* See comment in fxsave() below. */
                     : [fx] "r" (fx), "m" (*fx), "0" (0));
 #else
                     : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0));
@@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
        return err;
 }
 
-static inline int restore_fpu_checking(struct task_struct *tsk)
-{
-       if (task_thread_info(tsk)->status & TS_XSAVE)
-               return xrstor_checking(&tsk->thread.xstate->xsave);
-       else
-               return fxrstor_checking(&tsk->thread.xstate->fxsave);
-}
-
 /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
    is pending. Clear the x87 state here by setting it to fixed
    values. The kernel data segment can be sometimes 0 and sometimes
@@ -120,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
                     ".previous\n"
                     _ASM_EXTABLE(1b, 3b)
                     : [err] "=r" (err), "=m" (*fx)
-#if 0 /* See comment in __fxsave_clear() below. */
+#if 0 /* See comment in fxsave() below. */
                     : [fx] "r" (fx), "0" (0));
 #else
                     : [fx] "cdaSDb" (fx), "0" (0));
@@ -185,12 +177,9 @@ static inline void tolerant_fwait(void)
        asm volatile("fnclex ; fwait");
 }
 
-static inline void restore_fpu(struct task_struct *tsk)
+/* perform fxrstor iff the processor has extended states, otherwise frstor */
+static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
 {
-       if (task_thread_info(tsk)->status & TS_XSAVE) {
-               xrstor_checking(&tsk->thread.xstate->xsave);
-               return;
-       }
        /*
         * The "nop" is needed to make the instructions the same
         * length.
@@ -199,7 +188,9 @@ static inline void restore_fpu(struct task_struct *tsk)
                "nop ; frstor %1",
                "fxrstor %1",
                X86_FEATURE_FXSR,
-               "m" (tsk->thread.xstate->fxsave));
+               "m" (*fx));
+
+       return 0;
 }
 
 /* We need a safe address that is cheap to find and that is already
@@ -262,6 +253,14 @@ end:
 
 #endif /* CONFIG_X86_64 */
 
+static inline int restore_fpu_checking(struct task_struct *tsk)
+{
+       if (task_thread_info(tsk)->status & TS_XSAVE)
+               return xrstor_checking(&tsk->thread.xstate->xsave);
+       else
+               return fxrstor_checking(&tsk->thread.xstate->fxsave);
+}
+
 /*
  * Signal frame handlers...
  */
@@ -305,18 +304,18 @@ static inline void kernel_fpu_end(void)
 /*
  * Some instructions like VIA's padlock instructions generate a spurious
  * DNA fault but don't modify SSE registers. And these instructions
- * get used from interrupt context aswell. To prevent these kernel instructions
- * in interrupt context interact wrongly with other user/kernel fpu usage, we
+ * get used from interrupt context as well. To prevent these kernel instructions
+ * in interrupt context interacting wrongly with other user/kernel fpu usage, we
  * should use them only in the context of irq_ts_save/restore()
  */
 static inline int irq_ts_save(void)
 {
        /*
-        * If we are in process context, we are ok to take a spurious DNA fault.
-        * Otherwise, doing clts() in process context require pre-emption to
-        * be disabled or some heavy lifting like kernel_fpu_begin()
+        * If in process context and not atomic, we can take a spurious DNA fault.
+        * Otherwise, doing clts() in process context requires disabling preemption
+        * or some heavy lifting like kernel_fpu_begin()
         */
-       if (!in_interrupt())
+       if (!in_atomic())
                return 0;
 
        if (read_cr0() & X86_CR0_TS) {
index 1a99e6c092afcfaaad1a73b95ee4ecc7249f231b..58d7091eeb1fc1e8955c87246121060fbb851d31 100644 (file)
@@ -60,8 +60,4 @@ extern struct irq_chip i8259A_chip;
 extern void mask_8259A(void);
 extern void unmask_8259A(void);
 
-#ifdef CONFIG_X86_32
-extern void init_ISA_irqs(void);
-#endif
-
 #endif /* _ASM_X86_I8259_H */
index 9d826e436010005254bb04e13bac1ee45d802b75..daf866ed0612413f3781cdf6f863e8038daebe85 100644 (file)
@@ -154,22 +154,19 @@ extern int timer_through_8259;
 extern int io_apic_get_unique_id(int ioapic, int apic_id);
 extern int io_apic_get_version(int ioapic);
 extern int io_apic_get_redir_entries(int ioapic);
-extern int io_apic_set_pci_routing(int ioapic, int pin, int irq,
-                                  int edge_level, int active_high_low);
 #endif /* CONFIG_ACPI */
 
+struct io_apic_irq_attr;
+extern int io_apic_set_pci_routing(struct device *dev, int irq,
+                struct io_apic_irq_attr *irq_attr);
 extern int (*ioapic_renumber_irq)(int ioapic, int irq);
 extern void ioapic_init_mappings(void);
 
-#ifdef CONFIG_X86_64
 extern struct IO_APIC_route_entry **alloc_ioapic_entries(void);
 extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
 extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
 extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
-extern void reinit_intr_remapped_IO_APIC(int intr_remapping,
-       struct IO_APIC_route_entry **ioapic_entries);
-#endif
 
 extern void probe_nr_irqs_gsi(void);
 
index 86af26091d6c3c2146da4e5f6f3abbfd6cb165ba..0e9fe1d9d9715db6a2fb952eff4b917465c3d64b 100644 (file)
@@ -1,3 +1,6 @@
+#ifndef _ASM_X86_IOMAP_H
+#define _ASM_X86_IOMAP_H
+
 /*
  * Copyright © 2008 Ingo Molnar
  *
@@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
 
 void
 iounmap_atomic(void *kvaddr, enum km_type type);
+
+#endif /* _ASM_X86_IOMAP_H */
index 0396760fccb85be05de0d5666c6019316d386d55..f275e2244505b98308ca72e66e26c250d29c61a8 100644 (file)
@@ -1,6 +1,6 @@
 #ifndef _ASM_X86_IRQ_REMAPPING_H
 #define _ASM_X86_IRQ_REMAPPING_H
 
-#define IRTE_DEST(dest) ((x2apic) ? dest : dest << 8)
+#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
 
 #endif /* _ASM_X86_IRQ_REMAPPING_H */
index 3cbd79bbb47c82613341fca01ee6e174319342fe..910b5a3d6751fea966c71553cbda17bd5469805a 100644 (file)
@@ -34,6 +34,7 @@
 
 #ifdef CONFIG_X86_32
 # define SYSCALL_VECTOR                        0x80
+# define IA32_SYSCALL_VECTOR           0x80
 #else
 # define IA32_SYSCALL_VECTOR           0x80
 #endif
index 54c8cc53b24dd732829da23805c96f64b92fa168..c2d1f3b58e5f1342607280be6a71d434796482dd 100644 (file)
@@ -12,4 +12,17 @@ extern int cache_k8_northbridges(void);
 extern void k8_flush_garts(void);
 extern int k8_scan_nodes(unsigned long start, unsigned long end);
 
+#ifdef CONFIG_K8_NB
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
+}
+#else
+static inline struct pci_dev *node_to_k8_nb_misc(int node)
+{
+       return NULL;
+}
+#endif
+
+
 #endif /* _ASM_X86_K8_H */
index c882664716c1d13e5678aa188fb0ccf75d50acca..ef51b501e22a6e53bf4ae7e2d9e2566760f72ee1 100644 (file)
@@ -9,20 +9,31 @@ struct cpu_signature {
 
 struct device;
 
+enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND };
+
 struct microcode_ops {
-       int  (*request_microcode_user) (int cpu, const void __user *buf, size_t size);
-       int  (*request_microcode_fw) (int cpu, struct device *device);
+       enum ucode_state (*request_microcode_user) (int cpu,
+                               const void __user *buf, size_t size);
 
-       void (*apply_microcode) (int cpu);
+       enum ucode_state (*request_microcode_fw) (int cpu,
+                               struct device *device);
 
-       int  (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
        void (*microcode_fini_cpu) (int cpu);
+
+       /*
+        * The generic 'microcode_core' part guarantees that
+        * the callbacks below run on a target cpu when they
+        * are being called.
+        * See also the "Synchronization" section in microcode_core.c.
+        */
+       int (*apply_microcode) (int cpu);
+       int (*collect_cpu_info) (int cpu, struct cpu_signature *csig);
 };
 
 struct ucode_cpu_info {
-       struct cpu_signature cpu_sig;
-       int valid;
-       void *mc;
+       struct cpu_signature    cpu_sig;
+       int                     valid;
+       void                    *mc;
 };
 extern struct ucode_cpu_info ucode_cpu_info[];
 
index 642fc7fc8cdc3fe8aedea4ddb60cacabc7925983..e2a1bb6d71ea832f6a6dbb3fc9ad7b58c7c9e6c4 100644 (file)
@@ -61,9 +61,11 @@ extern void get_smp_config(void);
 #ifdef CONFIG_X86_MPPARSE
 extern void find_smp_config(void);
 extern void early_reserve_e820_mpc_new(void);
+extern int enable_update_mptable;
 #else
 static inline void find_smp_config(void) { }
 static inline void early_reserve_e820_mpc_new(void) { }
+#define enable_update_mptable 0
 #endif
 
 void __cpuinit generic_processor_info(int apicid, int version);
@@ -72,20 +74,13 @@ extern void mp_register_ioapic(int id, u32 address, u32 gsi_base);
 extern void mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger,
                                   u32 gsi);
 extern void mp_config_acpi_legacy_irqs(void);
-extern int mp_register_gsi(u32 gsi, int edge_level, int active_high_low);
+struct device;
+extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
+                                int active_high_low);
 extern int acpi_probe_gsi(void);
 #ifdef CONFIG_X86_IO_APIC
-extern int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                               u32 gsi, int triggering, int polarity);
 extern int mp_find_ioapic(int gsi);
 extern int mp_find_ioapic_pin(int ioapic, int gsi);
-#else
-static inline int
-mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                  u32 gsi, int triggering, int polarity)
-{
-       return 0;
-}
 #endif
 #else /* !CONFIG_ACPI: */
 static inline int acpi_probe_gsi(void)
index ec41fc16c167cb19ef617f9a5e7ec2dda6f88bd1..4d58d04fca830bc56ab8bf62009661ad77a85f3e 100644 (file)
 #define MSR_K8_TOP_MEM1                        0xc001001a
 #define MSR_K8_TOP_MEM2                        0xc001001d
 #define MSR_K8_SYSCFG                  0xc0010010
-#define MSR_K8_HWCR                    0xc0010015
 #define MSR_K8_INT_PENDING_MSG         0xc0010055
 /* C1E active bits in int pending message */
 #define K8_INTP_C1E_ACTIVE_MASK                0x18000000
index c45a0a568dff092227b1436a48e7dab5434dc4fa..c97264409934be915d24d35395c1c0af5956aa7b 100644 (file)
@@ -64,7 +64,7 @@ static inline int nmi_watchdog_active(void)
         * but since they are power of two we could use a
         * cheaper way --cvg
         */
-       return nmi_watchdog & 0x3;
+       return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
 }
 #endif
 
index 064ed6df4cbec292758631a4a5a36aa3b77260c0..c4ae822e415f907093eabd2531d97506a891b691 100644 (file)
@@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks,
 extern void numa_init_array(void);
 extern int numa_off;
 
-extern void srat_reserve_add_area(int nodeid);
-extern int hotadd_percent;
-
 extern s16 apicid_to_node[MAX_LOCAL_APIC];
 
 extern unsigned long numa_free_all_bootmem(void);
@@ -27,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start,
                               unsigned long end);
 
 #ifdef CONFIG_NUMA
+/*
+ * Too small node sizes may confuse the VM badly. Usually they
+ * result from BIOS bugs. So dont recognize nodes as standalone
+ * NUMA entities that have less than this amount of RAM listed:
+ */
+#define NODE_MIN_SIZE (4*1024*1024)
+
 extern void __init init_cpu_to_node(void);
 extern void __cpuinit numa_set_node(int cpu, int node);
 extern void __cpuinit numa_clear_node(int cpu);
index 0f915ae649a717e99ebdfc079652b25fda315e72..6f1b7331313f1af2744c50accfecd26e069a5bed 100644 (file)
@@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE;
 extern int sysctl_legacy_va_layout;
 
 extern void find_low_pfn_range(void);
-extern unsigned long init_memory_mapping(unsigned long start,
-                                        unsigned long end);
-extern void initmem_init(unsigned long, unsigned long);
-extern void free_initmem(void);
 extern void setup_bootmem_allocator(void);
 
 #endif /* !__ASSEMBLY__ */
index d38c91b7024834a56be59db9c9b1bf411c698c50..8d382d3abf38cf9ec0be9c6164589d431017fc41 100644 (file)
  */
 #define __PAGE_OFFSET           _AC(0xffff880000000000, UL)
 
-#define __PHYSICAL_START       CONFIG_PHYSICAL_START
-#define __KERNEL_ALIGN         0x200000
-
-/*
- * Make sure kernel is aligned to 2MB address. Catching it at compile
- * time is better. Change your config file and compile the kernel
- * for a 2MB aligned address (CONFIG_PHYSICAL_START)
- */
-#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0
-#error "CONFIG_PHYSICAL_START must be a multiple of 2MB"
-#endif
+#define __PHYSICAL_START       ((CONFIG_PHYSICAL_START +               \
+                                 (CONFIG_PHYSICAL_ALIGN - 1)) &        \
+                                ~(CONFIG_PHYSICAL_ALIGN - 1))
 
 #define __START_KERNEL         (__START_KERNEL_map + __PHYSICAL_START)
 #define __START_KERNEL_map     _AC(0xffffffff80000000, UL)
 
-/* See Documentation/x86_64/mm.txt for a description of the memory map. */
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define __PHYSICAL_MASK_SHIFT  46
 #define __VIRTUAL_MASK_SHIFT   48
 
@@ -71,12 +63,6 @@ extern unsigned long __phys_addr(unsigned long);
 
 #define vmemmap ((struct page *)VMEMMAP_START)
 
-extern unsigned long init_memory_mapping(unsigned long start,
-                                        unsigned long end);
-
-extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
-extern void free_initmem(void);
-
 extern void init_extra_mapping_uc(unsigned long phys, unsigned long size);
 extern void init_extra_mapping_wb(unsigned long phys, unsigned long size);
 
index 826ad37006ab1a075993ddbe69bd3281412b7f09..6473f5ccff859baf2c897ba073f629d14ba6d0ee 100644 (file)
@@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr);
 extern unsigned long max_low_pfn_mapped;
 extern unsigned long max_pfn_mapped;
 
+extern unsigned long init_memory_mapping(unsigned long start,
+                                        unsigned long end);
+
+extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn);
+extern void free_initmem(void);
+
 #endif /* !__ASSEMBLY__ */
 
 #endif /* _ASM_X86_PAGE_DEFS_H */
index a53da004e08ed8903dbf135fead7e3e09664089a..4fb37c8a0832da8cf51f7a2c771c54fa97af2708 100644 (file)
@@ -56,6 +56,7 @@ struct desc_ptr;
 struct tss_struct;
 struct mm_struct;
 struct desc_struct;
+struct task_struct;
 
 /*
  * Wrapper type for pointers to code which uses the non-standard
@@ -203,7 +204,8 @@ struct pv_cpu_ops {
 
        void (*swapgs)(void);
 
-       struct pv_lazy_ops lazy_mode;
+       void (*start_context_switch)(struct task_struct *prev);
+       void (*end_context_switch)(struct task_struct *next);
 };
 
 struct pv_irq_ops {
@@ -1399,25 +1401,23 @@ enum paravirt_lazy_mode {
 };
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void);
-void paravirt_enter_lazy_cpu(void);
-void paravirt_leave_lazy_cpu(void);
+void paravirt_start_context_switch(struct task_struct *prev);
+void paravirt_end_context_switch(struct task_struct *next);
+
 void paravirt_enter_lazy_mmu(void);
 void paravirt_leave_lazy_mmu(void);
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode);
 
-#define  __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-static inline void arch_enter_lazy_cpu_mode(void)
+#define  __HAVE_ARCH_START_CONTEXT_SWITCH
+static inline void arch_start_context_switch(struct task_struct *prev)
 {
-       PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter);
+       PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev);
 }
 
-static inline void arch_leave_lazy_cpu_mode(void)
+static inline void arch_end_context_switch(struct task_struct *next)
 {
-       PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave);
+       PVOP_VCALL1(pv_cpu_ops.end_context_switch, next);
 }
 
-void arch_flush_lazy_cpu_mode(void);
-
 #define  __HAVE_ARCH_ENTER_LAZY_MMU_MODE
 static inline void arch_enter_lazy_mmu_mode(void)
 {
index 29d96d168bc097195e9cbe66ab723c90929e1116..18ef7ebf2631709dad4f26a681fdbf5dff5a8b2b 100644 (file)
@@ -81,6 +81,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base)
 #define pte_val(x)     native_pte_val(x)
 #define __pte(x)       native_make_pte(x)
 
+#define arch_end_context_switch(prev)  do {} while(0)
+
 #endif /* CONFIG_PARAVIRT */
 
 /*
@@ -503,6 +505,8 @@ static inline int pgd_none(pgd_t pgd)
 
 #ifndef __ASSEMBLY__
 
+extern int direct_gbpages;
+
 /* local pte updates need not use xchg for locking */
 static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
 {
index 6b87bc6d50189263691c10819618bc97deb8c086..abde308fdb0f54e07587e599998c31c4cc45dc66 100644 (file)
@@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[];
 
 extern void paging_init(void);
 
-#endif /* !__ASSEMBLY__ */
-
-#ifndef __ASSEMBLY__
-
 #define pte_ERROR(e)                                   \
        printk("%s:%d: bad pte %p(%016lx).\n",          \
               __FILE__, __LINE__, &(e), pte_val(e))
@@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
 
 #define update_mmu_cache(vma, address, pte) do { } while (0)
 
-extern int direct_gbpages;
-
 /* Encode and de-code a swap entry */
 #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
 #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1)
index fbf42b8e03833d0537b5e8594a4a3f06a154e180..766ea16fbbbda730a9952a67d85c8cda954f62ef 100644 (file)
@@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t;
 #define PGDIR_SIZE     (_AC(1, UL) << PGDIR_SHIFT)
 #define PGDIR_MASK     (~(PGDIR_SIZE - 1))
 
-
+/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */
 #define MAXMEM          _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL)
-#define VMALLOC_START    _AC(0xffffc20000000000, UL)
-#define VMALLOC_END      _AC(0xffffe1ffffffffff, UL)
-#define VMEMMAP_START   _AC(0xffffe20000000000, UL)
+#define VMALLOC_START    _AC(0xffffc90000000000, UL)
+#define VMALLOC_END      _AC(0xffffe8ffffffffff, UL)
+#define VMEMMAP_START   _AC(0xffffea0000000000, UL)
 #define MODULES_VADDR    _AC(0xffffffffa0000000, UL)
 #define MODULES_END      _AC(0xffffffffff000000, UL)
 #define MODULES_LEN   (MODULES_END - MODULES_VADDR)
index b8238dc8786d9f6b4291c2d7f912abbc0ea04e18..4d258ad76a0fc04925ac484c1fa9b0bfc47a1cb9 100644 (file)
@@ -273,7 +273,6 @@ typedef struct page *pgtable_t;
 
 extern pteval_t __supported_pte_mask;
 extern int nx_enabled;
-extern void set_nx(void);
 
 #define pgprot_writecombine    pgprot_writecombine
 extern pgprot_t pgprot_writecombine(pgprot_t prot);
index c2cceae709c8655338894c15b84c361e65caf727..c7768269b1cf1364b76c948d886deee10de2cf70 100644 (file)
@@ -135,7 +135,8 @@ extern struct cpuinfo_x86   boot_cpu_data;
 extern struct cpuinfo_x86      new_cpu_data;
 
 extern struct tss_struct       doublefault_tss;
-extern __u32                   cleared_cpu_caps[NCAPINTS];
+extern __u32                   cpu_caps_cleared[NCAPINTS];
+extern __u32                   cpu_caps_set[NCAPINTS];
 
 #ifdef CONFIG_SMP
 DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
@@ -409,9 +410,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary);
 extern unsigned int xstate_size;
 extern void free_thread_xstate(struct task_struct *);
 extern struct kmem_cache *task_xstate_cachep;
-extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
-extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
-extern unsigned short num_cache_leaves;
 
 struct thread_struct {
        /* Cached TLS descriptors: */
@@ -427,8 +425,12 @@ struct thread_struct {
        unsigned short          fsindex;
        unsigned short          gsindex;
 #endif
+#ifdef CONFIG_X86_32
        unsigned long           ip;
+#endif
+#ifdef CONFIG_X86_64
        unsigned long           fs;
+#endif
        unsigned long           gs;
        /* Hardware debugging registers: */
        unsigned long           debugreg0;
@@ -460,14 +462,8 @@ struct thread_struct {
        unsigned                io_bitmap_max;
 /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set.  */
        unsigned long   debugctlmsr;
-#ifdef CONFIG_X86_DS
-/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */
+       /* Debug Store context; see asm/ds.h */
        struct ds_context       *ds_ctx;
-#endif /* CONFIG_X86_DS */
-#ifdef CONFIG_X86_PTRACE_BTS
-/* the signal to send on a bts buffer overflow */
-       unsigned int    bts_ovfl_signal;
-#endif /* CONFIG_X86_PTRACE_BTS */
 };
 
 static inline unsigned long native_get_debugreg(int regno)
@@ -795,6 +791,21 @@ static inline unsigned long get_debugctlmsr(void)
     return debugctlmsr;
 }
 
+static inline unsigned long get_debugctlmsr_on_cpu(int cpu)
+{
+       u64 debugctlmsr = 0;
+       u32 val1, val2;
+
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return 0;
+#endif
+       rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2);
+       debugctlmsr = val1 | ((u64)val2 << 32);
+
+       return debugctlmsr;
+}
+
 static inline void update_debugctlmsr(unsigned long debugctlmsr)
 {
 #ifndef CONFIG_X86_DEBUGCTLMSR
@@ -804,6 +815,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr)
        wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr);
 }
 
+static inline void update_debugctlmsr_on_cpu(int cpu,
+                                            unsigned long debugctlmsr)
+{
+#ifndef CONFIG_X86_DEBUGCTLMSR
+       if (boot_cpu_data.x86 < 6)
+               return;
+#endif
+       wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR,
+                    (u32)((u64)debugctlmsr),
+                    (u32)((u64)debugctlmsr >> 32));
+}
+
 /*
  * from system description table in BIOS. Mostly for MCA use, but
  * others may find it useful:
@@ -814,6 +837,7 @@ extern unsigned int         BIOS_revision;
 
 /* Boot loader type from the setup header: */
 extern int                     bootloader_type;
+extern int                     bootloader_version;
 
 extern char                    ignore_fpu_irq;
 
@@ -874,7 +898,6 @@ static inline void spin_lock_prefetch(const void *x)
        .vm86_info              = NULL,                                   \
        .sysenter_cs            = __KERNEL_CS,                            \
        .io_bitmap_ptr          = NULL,                                   \
-       .fs                     = __KERNEL_PERCPU,                        \
 }
 
 /*
index 624f133943ed71293aaba1477b9ca20ad0836da8..0f0d908349aa3f375e87f68802cbcb2e7753f725 100644 (file)
@@ -236,12 +236,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx,
 extern int do_set_thread_area(struct task_struct *p, int idx,
                              struct user_desc __user *info, int can_allocate);
 
-extern void x86_ptrace_untrace(struct task_struct *);
-extern void x86_ptrace_fork(struct task_struct *child,
-                           unsigned long clone_flags);
+#ifdef CONFIG_X86_PTRACE_BTS
+extern void ptrace_bts_untrace(struct task_struct *tsk);
 
-#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk)
-#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags)
+#define arch_ptrace_untrace(tsk)       ptrace_bts_untrace(tsk)
+#endif /* CONFIG_X86_PTRACE_BTS */
 
 #endif /* __KERNEL__ */
 
index a4737dddfd5878da05d20fb819592fc58a117a47..64cf2d24fad1c2605c16b528cc10e8e37c9bc77c 100644 (file)
 #endif
 
 #ifdef CONFIG_X86_64
+#ifdef CONFIG_PARAVIRT
+/* Paravirtualized systems may not have PSE or PGE available */
 #define NEED_PSE       0
-#define NEED_MSR       (1<<(X86_FEATURE_MSR & 31))
 #define NEED_PGE       0
+#else
+#define NEED_PSE       (1<<(X86_FEATURE_PSE) & 31)
+#define NEED_PGE       (1<<(X86_FEATURE_PGE) & 31)
+#endif
+#define NEED_MSR       (1<<(X86_FEATURE_MSR & 31))
 #define NEED_FXSR      (1<<(X86_FEATURE_FXSR & 31))
 #define NEED_XMM       (1<<(X86_FEATURE_XMM & 31))
 #define NEED_XMM2      (1<<(X86_FEATURE_XMM2 & 31))
index bdc2ada05ae06056ad95e79bd8d6434d73510f5a..4093d1ed6db2a0b3eebc2983337a1592b673d5bc 100644 (file)
@@ -33,7 +33,6 @@ struct x86_quirks {
        int (*setup_ioapic_ids)(void);
 };
 
-extern void x86_quirk_pre_intr_init(void);
 extern void x86_quirk_intr_init(void);
 
 extern void x86_quirk_trap_init(void);
index 19e0d88b966d7b9154f46d9426450765dcf3d389..6a84ed166aec136334a644cc4fbaa676368ae983 100644 (file)
@@ -180,7 +180,7 @@ extern int safe_smp_processor_id(void);
 static inline int logical_smp_processor_id(void)
 {
        /* we don't want to mark this access volatile - bad code generation */
-       return GET_APIC_LOGICAL_ID(*(u32 *)(APIC_BASE + APIC_LDR));
+       return GET_APIC_LOGICAL_ID(apic_read(APIC_LDR));
 }
 
 #endif
index e3cc3c063ec5e77683c088fd80906de90fbf9bd2..4517d6b93188aa55a775be95561b9f4bee045d42 100644 (file)
@@ -27,7 +27,7 @@
 #else /* CONFIG_X86_32 */
 # define SECTION_SIZE_BITS     27 /* matt - 128 is convenient right now */
 # define MAX_PHYSADDR_BITS     44
-# define MAX_PHYSMEM_BITS      44 /* Can be max 45 bits */
+# define MAX_PHYSMEM_BITS      46
 #endif
 
 #endif /* CONFIG_SPARSEMEM */
index 7043408f6904a9f2d94c817b3488f80fbb4976df..372b76edd63f69053c76bcf7501eb71c35bb8f93 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * syscalls.h - Linux syscall interfaces (arch-specific)
  *
- * Copyright (c) 2008 Jaswinder Singh
+ * Copyright (c) 2008 Jaswinder Singh Rajput
  *
  * This file is released under the GPLv2.
  * See the file COPYING for more details.
 
 #include <linux/compiler.h>
 #include <linux/linkage.h>
-#include <linux/types.h>
 #include <linux/signal.h>
+#include <linux/types.h>
 
 /* Common in X86_32 and X86_64 */
 /* kernel/ioport.c */
 asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 
+/* kernel/process.c */
+int sys_fork(struct pt_regs *);
+int sys_vfork(struct pt_regs *);
+
 /* kernel/ldt.c */
 asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
 
+/* kernel/signal.c */
+long sys_rt_sigreturn(struct pt_regs *);
+
 /* kernel/tls.c */
 asmlinkage int sys_set_thread_area(struct user_desc __user *);
 asmlinkage int sys_get_thread_area(struct user_desc __user *);
 
 /* X86_32 only */
 #ifdef CONFIG_X86_32
+/* kernel/ioport.c */
+long sys_iopl(struct pt_regs *);
+
 /* kernel/process_32.c */
-int sys_fork(struct pt_regs *);
 int sys_clone(struct pt_regs *);
-int sys_vfork(struct pt_regs *);
 int sys_execve(struct pt_regs *);
 
-/* kernel/signal_32.c */
+/* kernel/signal.c */
 asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
 asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
                             struct old_sigaction __user *);
 int sys_sigaltstack(struct pt_regs *);
 unsigned long sys_sigreturn(struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
-
-/* kernel/ioport.c */
-long sys_iopl(struct pt_regs *);
 
 /* kernel/sys_i386_32.c */
+struct mmap_arg_struct;
+struct sel_arg_struct;
+struct oldold_utsname;
+struct old_utsname;
+
 asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
                          unsigned long, unsigned long, unsigned long);
-struct mmap_arg_struct;
 asmlinkage int old_mmap(struct mmap_arg_struct __user *);
-struct sel_arg_struct;
 asmlinkage int old_select(struct sel_arg_struct __user *);
 asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
-struct old_utsname;
 asmlinkage int sys_uname(struct old_utsname __user *);
-struct oldold_utsname;
 asmlinkage int sys_olduname(struct oldold_utsname __user *);
 
 /* kernel/vm86_32.c */
@@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *);
 #else /* CONFIG_X86_32 */
 
 /* X86_64 only */
+/* kernel/ioport.c */
+asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
+
 /* kernel/process_64.c */
-asmlinkage long sys_fork(struct pt_regs *);
 asmlinkage long sys_clone(unsigned long, unsigned long,
                          void __user *, void __user *,
                          struct pt_regs *);
-asmlinkage long sys_vfork(struct pt_regs *);
 asmlinkage long sys_execve(char __user *, char __user * __user *,
                           char __user * __user *,
                           struct pt_regs *);
 long sys_arch_prctl(int, unsigned long);
 
-/* kernel/ioport.c */
-asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
-
-/* kernel/signal_64.c */
+/* kernel/signal.c */
 asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
                                struct pt_regs *);
-long sys_rt_sigreturn(struct pt_regs *);
 
 /* kernel/sys_x86_64.c */
+struct new_utsname;
+
 asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
                         unsigned long, unsigned long, unsigned long);
-struct new_utsname;
 asmlinkage long sys_uname(struct new_utsname __user *);
 
 #endif /* CONFIG_X86_32 */
index f72956331c49349623014bffc4c241f01c361eee..c4ee8056bacaf9534c1fead0e7cee6b3e578a57b 100644 (file)
@@ -67,6 +67,7 @@ static inline int user_termio_to_kernel_termios(struct ktermios *termios,
        SET_LOW_TERMIOS_BITS(termios, termio, c_oflag);
        SET_LOW_TERMIOS_BITS(termios, termio, c_cflag);
        SET_LOW_TERMIOS_BITS(termios, termio, c_lflag);
+       get_user(termios->c_line, &termio->c_line);
        return copy_from_user(termios->c_cc, termio->c_cc, NCC);
 }
 
index 8820a73ae090aae29aa3454d6f3241ffcbef5440..602c769fc98ca7340f5388fe21e57ce5431b376e 100644 (file)
@@ -94,7 +94,8 @@ struct thread_info {
 #define TIF_FORCED_TF          24      /* true if TF in eflags artificially */
 #define TIF_DEBUGCTLMSR                25      /* uses thread_struct.debugctlmsr */
 #define TIF_DS_AREA_MSR                26      /* uses thread_struct.ds_area_msr */
-#define TIF_SYSCALL_FTRACE     27      /* for ftrace syscall instrumentation */
+#define TIF_LAZY_MMU_UPDATES   27      /* task is updating the mmu lazily */
+#define TIF_SYSCALL_FTRACE     28      /* for ftrace syscall instrumentation */
 
 #define _TIF_SYSCALL_TRACE     (1 << TIF_SYSCALL_TRACE)
 #define _TIF_NOTIFY_RESUME     (1 << TIF_NOTIFY_RESUME)
@@ -116,6 +117,7 @@ struct thread_info {
 #define _TIF_FORCED_TF         (1 << TIF_FORCED_TF)
 #define _TIF_DEBUGCTLMSR       (1 << TIF_DEBUGCTLMSR)
 #define _TIF_DS_AREA_MSR       (1 << TIF_DS_AREA_MSR)
+#define _TIF_LAZY_MMU_UPDATES  (1 << TIF_LAZY_MMU_UPDATES)
 #define _TIF_SYSCALL_FTRACE    (1 << TIF_SYSCALL_FTRACE)
 
 /* work to do in syscall_trace_enter() */
index 16a5c84b032997f264f704ac68537dc65ba1dbe6..a5ecc9c33e920eda5e50bcfe0313b333b1b2cd31 100644 (file)
@@ -17,7 +17,7 @@
 
 static inline void __native_flush_tlb(void)
 {
-       write_cr3(read_cr3());
+       native_write_cr3(native_read_cr3());
 }
 
 static inline void __native_flush_tlb_global(void)
@@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void)
         */
        raw_local_irq_save(flags);
 
-       cr4 = read_cr4();
+       cr4 = native_read_cr4();
        /* clear PGE */
-       write_cr4(cr4 & ~X86_CR4_PGE);
+       native_write_cr4(cr4 & ~X86_CR4_PGE);
        /* write old PGE again and flush TLBs */
-       write_cr4(cr4);
+       native_write_cr4(cr4);
 
        raw_local_irq_restore(flags);
 }
index f44b49abca49b93c1f9386f65e0b2b5df118a354..066ef590d7e054b7ac7c55c99d235a0b4bb73896 100644 (file)
@@ -203,7 +203,8 @@ struct pci_bus;
 void x86_pci_root_bus_res_quirks(struct pci_bus *b);
 
 #ifdef CONFIG_SMP
-#define mc_capable()   (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)
+#define mc_capable()   ((boot_cpu_data.x86_max_cores > 1) && \
+                       (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids))
 #define smt_capable()                  (smp_num_siblings > 1)
 #endif
 
index 0d5342515b8696970ec533572055df8b98ef74a9..bfd74c032fcaa33b7c50b066fbc5f9333ca6387d 100644 (file)
@@ -2,6 +2,7 @@
 #define _ASM_X86_TRAPS_H
 
 #include <asm/debugreg.h>
+#include <asm/siginfo.h>                       /* TRAP_TRACE, ... */
 
 #ifdef CONFIG_X86_32
 #define dotraplinkage
@@ -13,6 +14,9 @@ asmlinkage void divide_error(void);
 asmlinkage void debug(void);
 asmlinkage void nmi(void);
 asmlinkage void int3(void);
+asmlinkage void xen_debug(void);
+asmlinkage void xen_int3(void);
+asmlinkage void xen_stack_segment(void);
 asmlinkage void overflow(void);
 asmlinkage void bounds(void);
 asmlinkage void invalid_op(void);
@@ -74,7 +78,6 @@ static inline int get_si_code(unsigned long condition)
 }
 
 extern int panic_on_unrecovered_nmi;
-extern int kstack_depth_to_print;
 
 void math_error(void __user *);
 void math_emulate(struct math_emu_info *);
index 6e72d74cf8dc74b7720f5cb79ba355a926e7fa6e..708dae61262df4926eaef9d7b4c2aa5a049c0e74 100644 (file)
 #define __NR_inotify_init1     332
 #define __NR_preadv            333
 #define __NR_pwritev           334
+#define __NR_rt_tgsigqueueinfo 335
 
 #ifdef __KERNEL__
 
index f81829462325f6328a6e6d9c3667da02e9f616d0..4e2b054044000be53a86c2123170969f2851141e 100644 (file)
@@ -657,6 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1)
 __SYSCALL(__NR_preadv, sys_preadv)
 #define __NR_pwritev                           296
 __SYSCALL(__NR_pwritev, sys_pwritev)
+#define __NR_rt_tgsigqueueinfo                 297
+__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
 
 
 #ifndef __NO_STUBS
index 9b0e61bf7a88dc2a68521384ea68950cd900478e..bddd44f2f0ab5b3dc7dc0e8ad19fe6df52a0525a 100644 (file)
@@ -37,7 +37,7 @@
 #define UV_CPUS_PER_ACT_STATUS         32
 #define UV_ACT_STATUS_MASK             0x3
 #define UV_ACT_STATUS_SIZE             2
-#define UV_ACTIVATION_DESCRIPTOR_SIZE  32
+#define UV_ADP_SIZE                    32
 #define UV_DISTRIBUTION_SIZE           256
 #define UV_SW_ACK_NPENDING             8
 #define UV_NET_ENDPOINT_INTD           0x38
index d3a98ea1062ef936c7825f1387a4d2f4b83c65b9..341070f7ad5cb62679f22f7b3fff7316c549b5d9 100644 (file)
@@ -133,6 +133,7 @@ struct uv_scir_s {
 struct uv_hub_info_s {
        unsigned long           global_mmr_base;
        unsigned long           gpa_mask;
+       unsigned int            gnode_extra;
        unsigned long           gnode_upper;
        unsigned long           lowmem_remap_top;
        unsigned long           lowmem_remap_base;
@@ -159,7 +160,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
  *             p -  PNODE (local part of nsids, right shifted 1)
  */
 #define UV_NASID_TO_PNODE(n)           (((n) >> 1) & uv_hub_info->pnode_mask)
-#define UV_PNODE_TO_NASID(p)           (((p) << 1) | uv_hub_info->gnode_upper)
+#define UV_PNODE_TO_GNODE(p)           ((p) |uv_hub_info->gnode_extra)
+#define UV_PNODE_TO_NASID(p)           (UV_PNODE_TO_GNODE(p) << 1)
 
 #define UV_LOCAL_MMR_BASE              0xf4000000UL
 #define UV_GLOBAL_MMR32_BASE           0xf8000000UL
@@ -173,7 +175,7 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
 #define UV_GLOBAL_MMR32_PNODE_BITS(p)  ((p) << (UV_GLOBAL_MMR32_PNODE_SHIFT))
 
 #define UV_GLOBAL_MMR64_PNODE_BITS(p)                                  \
-       ((unsigned long)(p) << UV_GLOBAL_MMR64_PNODE_SHIFT)
+       ((unsigned long)(UV_PNODE_TO_GNODE(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
 
 #define UV_APIC_PNODE_SHIFT    6
 
index 88d1bfc847d30fc6b87007648fedc9970856e1af..4f78bd682125067c07fbf9c11a1b1bc1c40a2dd6 100644 (file)
@@ -28,7 +28,7 @@ CFLAGS_paravirt.o     := $(nostackp)
 obj-y                  := process_$(BITS).o signal.o entry_$(BITS).o
 obj-y                  += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
 obj-y                  += time_$(BITS).o ioport.o ldt.o dumpstack.o
-obj-y                  += setup.o i8259.o irqinit_$(BITS).o
+obj-y                  += setup.o i8259.o irqinit.o
 obj-$(CONFIG_X86_VISWS)        += visws_quirks.o
 obj-$(CONFIG_X86_32)   += probe_roms_32.o
 obj-$(CONFIG_X86_32)   += sys_i386_32.o i386_ksyms_32.o
@@ -44,6 +44,7 @@ obj-y                         += process.o
 obj-y                          += i387.o xsave.o
 obj-y                          += ptrace.o
 obj-$(CONFIG_X86_DS)           += ds.o
+obj-$(CONFIG_X86_DS_SELFTEST)          += ds_selftest.o
 obj-$(CONFIG_X86_32)           += tls.o
 obj-$(CONFIG_IA32_EMULATION)   += tls.o
 obj-y                          += step.o
index 723989d7f8029711c0a6fda92e8016101ac2ede7..631086159c53b0be5f28df92d3e2ed976cffa7a8 100644 (file)
@@ -33,6 +33,7 @@
 #include <linux/irq.h>
 #include <linux/bootmem.h>
 #include <linux/ioport.h>
+#include <linux/pci.h>
 
 #include <asm/pgtable.h>
 #include <asm/io_apic.h>
@@ -522,7 +523,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
  * success: return IRQ number (>=0)
  * failure: return < 0
  */
-int acpi_register_gsi(u32 gsi, int triggering, int polarity)
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 {
        unsigned int irq;
        unsigned int plat_gsi = gsi;
@@ -532,14 +533,14 @@ int acpi_register_gsi(u32 gsi, int triggering, int polarity)
         * Make sure all (legacy) PCI IRQs are set as level-triggered.
         */
        if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-               if (triggering == ACPI_LEVEL_SENSITIVE)
+               if (trigger == ACPI_LEVEL_SENSITIVE)
                        eisa_set_level_irq(gsi);
        }
 #endif
 
 #ifdef CONFIG_X86_IO_APIC
        if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-               plat_gsi = mp_register_gsi(gsi, triggering, polarity);
+               plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
        }
 #endif
        acpi_gsi_to_irq(plat_gsi, &irq);
@@ -903,10 +904,8 @@ extern int es7000_plat;
 #endif
 
 static struct {
-       int apic_id;
        int gsi_base;
        int gsi_end;
-       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
 } mp_ioapic_routing[MAX_IO_APICS];
 
 int mp_find_ioapic(int gsi)
@@ -986,16 +985,12 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
 
        set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
        mp_ioapics[idx].apicid = uniq_ioapic_id(id);
-#ifdef CONFIG_X86_32
        mp_ioapics[idx].apicver = io_apic_get_version(idx);
-#else
-       mp_ioapics[idx].apicver = 0;
-#endif
+
        /*
         * Build basic GSI lookup table to facilitate gsi->io_apic lookups
         * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
         */
-       mp_ioapic_routing[idx].apic_id = mp_ioapics[idx].apicid;
        mp_ioapic_routing[idx].gsi_base = gsi_base;
        mp_ioapic_routing[idx].gsi_end = gsi_base +
            io_apic_get_redir_entries(idx);
@@ -1158,26 +1153,52 @@ void __init mp_config_acpi_legacy_irqs(void)
        }
 }
 
-int mp_register_gsi(u32 gsi, int triggering, int polarity)
+static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
+                       int polarity)
 {
+#ifdef CONFIG_X86_MPPARSE
+       struct mpc_intsrc mp_irq;
+       struct pci_dev *pdev;
+       unsigned char number;
+       unsigned int devfn;
        int ioapic;
-       int ioapic_pin;
-#ifdef CONFIG_X86_32
-#define MAX_GSI_NUM    4096
-#define IRQ_COMPRESSION_START  64
+       u8 pin;
 
-       static int pci_irq = IRQ_COMPRESSION_START;
-       /*
-        * Mapping between Global System Interrupts, which
-        * represent all possible interrupts, and IRQs
-        * assigned to actual devices.
-        */
-       static int gsi_to_irq[MAX_GSI_NUM];
-#else
+       if (!acpi_ioapic)
+               return 0;
+       if (!dev)
+               return 0;
+       if (dev->bus != &pci_bus_type)
+               return 0;
+
+       pdev = to_pci_dev(dev);
+       number = pdev->bus->number;
+       devfn = pdev->devfn;
+       pin = pdev->pin;
+       /* print the entry should happen on mptable identically */
+       mp_irq.type = MP_INTSRC;
+       mp_irq.irqtype = mp_INT;
+       mp_irq.irqflag = (trigger == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
+                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
+       mp_irq.srcbus = number;
+       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
+       ioapic = mp_find_ioapic(gsi);
+       mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
+
+       save_mp_irq(&mp_irq);
+#endif
+       return 0;
+}
+
+int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+       int ioapic;
+       int ioapic_pin;
+       struct io_apic_irq_attr irq_attr;
 
        if (acpi_irq_model != ACPI_IRQ_MODEL_IOAPIC)
                return gsi;
-#endif
 
        /* Don't set up the ACPI SCI because it's already set up */
        if (acpi_gbl_FADT.sci_interrupt == gsi)
@@ -1196,93 +1217,22 @@ int mp_register_gsi(u32 gsi, int triggering, int polarity)
                gsi = ioapic_renumber_irq(ioapic, gsi);
 #endif
 
-       /*
-        * Avoid pin reprogramming.  PRTs typically include entries
-        * with redundant pin->gsi mappings (but unique PCI devices);
-        * we only program the IOAPIC on the first.
-        */
        if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
                printk(KERN_ERR "Invalid reference to IOAPIC pin "
-                      "%d-%d\n", mp_ioapic_routing[ioapic].apic_id,
+                      "%d-%d\n", mp_ioapics[ioapic].apicid,
                       ioapic_pin);
                return gsi;
        }
-       if (test_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed)) {
-               pr_debug("Pin %d-%d already programmed\n",
-                        mp_ioapic_routing[ioapic].apic_id, ioapic_pin);
-#ifdef CONFIG_X86_32
-               return (gsi < IRQ_COMPRESSION_START ? gsi : gsi_to_irq[gsi]);
-#else
-               return gsi;
-#endif
-       }
-
-       set_bit(ioapic_pin, mp_ioapic_routing[ioapic].pin_programmed);
-#ifdef CONFIG_X86_32
-       /*
-        * For GSI >= 64, use IRQ compression
-        */
-       if ((gsi >= IRQ_COMPRESSION_START)
-           && (triggering == ACPI_LEVEL_SENSITIVE)) {
-               /*
-                * For PCI devices assign IRQs in order, avoiding gaps
-                * due to unused I/O APIC pins.
-                */
-               int irq = gsi;
-               if (gsi < MAX_GSI_NUM) {
-                       /*
-                        * Retain the VIA chipset work-around (gsi > 15), but
-                        * avoid a problem where the 8254 timer (IRQ0) is setup
-                        * via an override (so it's not on pin 0 of the ioapic),
-                        * and at the same time, the pin 0 interrupt is a PCI
-                        * type.  The gsi > 15 test could cause these two pins
-                        * to be shared as IRQ0, and they are not shareable.
-                        * So test for this condition, and if necessary, avoid
-                        * the pin collision.
-                        */
-                       gsi = pci_irq++;
-                       /*
-                        * Don't assign IRQ used by ACPI SCI
-                        */
-                       if (gsi == acpi_gbl_FADT.sci_interrupt)
-                               gsi = pci_irq++;
-                       gsi_to_irq[irq] = gsi;
-               } else {
-                       printk(KERN_ERR "GSI %u is too high\n", gsi);
-                       return gsi;
-               }
-       }
-#endif
-       io_apic_set_pci_routing(ioapic, ioapic_pin, gsi,
-                               triggering == ACPI_EDGE_SENSITIVE ? 0 : 1,
-                               polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
-       return gsi;
-}
 
-int mp_config_acpi_gsi(unsigned char number, unsigned int devfn, u8 pin,
-                       u32 gsi, int triggering, int polarity)
-{
-#ifdef CONFIG_X86_MPPARSE
-       struct mpc_intsrc mp_irq;
-       int ioapic;
+       if (enable_update_mptable)
+               mp_config_acpi_gsi(dev, gsi, trigger, polarity);
 
-       if (!acpi_ioapic)
-               return 0;
+       set_io_apic_irq_attr(&irq_attr, ioapic, ioapic_pin,
+                            trigger == ACPI_EDGE_SENSITIVE ? 0 : 1,
+                            polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
+       io_apic_set_pci_routing(dev, gsi, &irq_attr);
 
-       /* print the entry should happen on mptable identically */
-       mp_irq.type = MP_INTSRC;
-       mp_irq.irqtype = mp_INT;
-       mp_irq.irqflag = (triggering == ACPI_EDGE_SENSITIVE ? 4 : 0x0c) |
-                               (polarity == ACPI_ACTIVE_HIGH ? 1 : 3);
-       mp_irq.srcbus = number;
-       mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
-       ioapic = mp_find_ioapic(gsi);
-       mp_irq.dstapic = mp_ioapic_routing[ioapic].apic_id;
-       mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
-
-       save_mp_irq(&mp_irq);
-#endif
-       return 0;
+       return gsi;
 }
 
 /*
index 1c31cc0e9def070e26288672801396d042fa604a..167bc16ce0e59ca080112db6d232f270df7ceea0 100644 (file)
@@ -9,7 +9,7 @@
 always         := wakeup.bin
 targets                := wakeup.elf wakeup.lds
 
-wakeup-y       += wakeup.o wakemain.o video-mode.o copy.o
+wakeup-y       += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o
 
 # The link order of the video-*.o modules can matter.  In particular,
 # video-vga.o *must* be listed first, followed by video-vesa.o.
diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S
new file mode 100644 (file)
index 0000000..f51eb0b
--- /dev/null
@@ -0,0 +1 @@
+#include "../../../boot/bioscall.S"
diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c
new file mode 100644 (file)
index 0000000..6206033
--- /dev/null
@@ -0,0 +1 @@
+#include "../../../boot/regs.c"
index a97db99dad52fedd4bbcf6fe9d46bed8784f5e1f..1c60554537c358ef37fa9352e5bfd624dadf3e1a 100644 (file)
@@ -55,7 +55,16 @@ struct iommu_cmd {
 static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                             struct unity_map_entry *e);
 static struct dma_ops_domain *find_protection_domain(u16 devid);
+static u64* alloc_pte(struct protection_domain *dom,
+                     unsigned long address, u64
+                     **pte_page, gfp_t gfp);
+static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
+                                     unsigned long start_page,
+                                     unsigned int pages);
 
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
 
 #ifdef CONFIG_AMD_IOMMU_STATS
 
@@ -213,7 +222,7 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
 {
        struct amd_iommu *iommu;
 
-       list_for_each_entry(iommu, &amd_iommu_list, list)
+       for_each_iommu(iommu)
                iommu_poll_events(iommu);
 
        return IRQ_HANDLED;
@@ -440,7 +449,7 @@ static void iommu_flush_domain(u16 domid)
        __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
                                      domid, 1, 1);
 
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
                spin_lock_irqsave(&iommu->lock, flags);
                __iommu_queue_command(iommu, &cmd);
                __iommu_completion_wait(iommu);
@@ -449,6 +458,35 @@ static void iommu_flush_domain(u16 domid)
        }
 }
 
+/*
+ * Call iommu_flush_domain() for every domain ID whose bit is set in
+ * amd_iommu_pd_alloc_bitmap.
+ */
+void amd_iommu_flush_all_domains(void)
+{
+       int i;
+
+       /* The scan starts at domain ID 1; ID 0 is never examined here */
+       for (i = 1; i < MAX_DOMAIN_ID; ++i) {
+               if (!test_bit(i, amd_iommu_pd_alloc_bitmap))
+                       continue;
+               iommu_flush_domain(i);
+       }
+}
+
+/*
+ * For every device ID up to and including amd_iommu_last_bdf that has a
+ * non-NULL entry in amd_iommu_pd_table, queue a device-entry invalidation
+ * on the IOMMU found in amd_iommu_rlookup_table and wait for completion.
+ */
+void amd_iommu_flush_all_devices(void)
+{
+       struct amd_iommu *iommu;
+       int i;
+
+       for (i = 0; i <= amd_iommu_last_bdf; ++i) {
+               /* No amd_iommu_pd_table entry for this ID: nothing to do */
+               if (amd_iommu_pd_table[i] == NULL)
+                       continue;
+
+               /* Skip IDs for which no IOMMU is recorded */
+               iommu = amd_iommu_rlookup_table[i];
+               if (!iommu)
+                       continue;
+
+               /* One completion wait is issued per device, not per IOMMU */
+               iommu_queue_inv_dev_entry(iommu, i);
+               iommu_completion_wait(iommu);
+       }
+}
+
 /****************************************************************************
  *
  * The functions below are used the create the page table mappings for
@@ -468,7 +506,7 @@ static int iommu_map_page(struct protection_domain *dom,
                          unsigned long phys_addr,
                          int prot)
 {
-       u64 __pte, *pte, *page;
+       u64 __pte, *pte;
 
        bus_addr  = PAGE_ALIGN(bus_addr);
        phys_addr = PAGE_ALIGN(phys_addr);
@@ -477,27 +515,7 @@ static int iommu_map_page(struct protection_domain *dom,
        if (bus_addr > IOMMU_MAP_SIZE_L3 || !(prot & IOMMU_PROT_MASK))
                return -EINVAL;
 
-       pte = &dom->pt_root[IOMMU_PTE_L2_INDEX(bus_addr)];
-
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               *pte = IOMMU_L2_PDE(virt_to_phys(page));
-       }
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L1_INDEX(bus_addr)];
-
-       if (!IOMMU_PTE_PRESENT(*pte)) {
-               page = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!page)
-                       return -ENOMEM;
-               *pte = IOMMU_L1_PDE(virt_to_phys(page));
-       }
-
-       pte = IOMMU_PTE_PAGE(*pte);
-       pte = &pte[IOMMU_PTE_L0_INDEX(bus_addr)];
+       pte = alloc_pte(dom, bus_addr, NULL, GFP_KERNEL);
 
        if (IOMMU_PTE_PRESENT(*pte))
                return -EBUSY;
@@ -595,7 +613,8 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
                 * as allocated in the aperture
                 */
                if (addr < dma_dom->aperture_size)
-                       __set_bit(addr >> PAGE_SHIFT, dma_dom->bitmap);
+                       __set_bit(addr >> PAGE_SHIFT,
+                                 dma_dom->aperture[0]->bitmap);
        }
 
        return 0;
@@ -632,42 +651,191 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
  ****************************************************************************/
 
 /*
- * The address allocator core function.
+ * The address allocator core functions.
  *
  * called with domain->lock held
  */
+
+/*
+ * This function checks if there is a PTE for a given dma address. If
+ * there is one, it returns the pointer to it.
+ *
+ * Only the L2 and L1 entries are tested for presence; NULL is returned
+ * when either of them is missing. The returned L0 entry itself is not
+ * tested, so callers still have to check IOMMU_PTE_PRESENT() on it.
+ * Nothing is allocated or modified by this lookup.
+ */
+static u64* fetch_pte(struct protection_domain *domain,
+                     unsigned long address)
+{
+       u64 *pte;
+
+       /* Top level: entry in the domain's root table */
+       pte = &domain->pt_root[IOMMU_PTE_L2_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte))
+               return NULL;
+
+       /* Middle level: follow the L2 entry to its table */
+       pte = IOMMU_PTE_PAGE(*pte);
+       pte = &pte[IOMMU_PTE_L1_INDEX(address)];
+
+       if (!IOMMU_PTE_PRESENT(*pte))
+               return NULL;
+
+       /* Last level: slot for this address, returned without a check */
+       pte = IOMMU_PTE_PAGE(*pte);
+       pte = &pte[IOMMU_PTE_L0_INDEX(address)];
+
+       return pte;
+}
+
+/*
+ * This function is used to add a new aperture range to an existing
+ * aperture in case of dma_ops domain allocation or address allocation
+ * failure.
+ *
+ * The new range is appended at the current end of the aperture
+ * (dma_dom->aperture_size). When @populate is true the page tables
+ * for the range are allocated up front using @gfp; with
+ * CONFIG_IOMMU_STRESS enabled that pre-population is always skipped.
+ *
+ * Returns 0 on success, or -ENOMEM when the aperture already consists
+ * of APERTURE_MAX_RANGES ranges or when an allocation fails.
+ */
+static int alloc_new_range(struct amd_iommu *iommu,
+                          struct dma_ops_domain *dma_dom,
+                          bool populate, gfp_t gfp)
+{
+       int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
+       unsigned long addr;
+
+#ifdef CONFIG_IOMMU_STRESS
+       populate = false;
+#endif
+
+       if (index >= APERTURE_MAX_RANGES)
+               return -ENOMEM;
+
+       dma_dom->aperture[index] = kzalloc(sizeof(struct aperture_range), gfp);
+       if (!dma_dom->aperture[index])
+               return -ENOMEM;
+
+       dma_dom->aperture[index]->bitmap = (void *)get_zeroed_page(gfp);
+       if (!dma_dom->aperture[index]->bitmap)
+               goto out_free;
+
+       dma_dom->aperture[index]->offset = dma_dom->aperture_size;
+
+       if (populate) {
+               unsigned long address = dma_dom->aperture_size;
+               int i, num_ptes = APERTURE_RANGE_PAGES / 512;
+               u64 *pte, *pte_page;
+
+               /* Remember each last-level page so lookups can skip the walk */
+               for (i = 0; i < num_ptes; ++i) {
+                       pte = alloc_pte(&dma_dom->domain, address,
+                                       &pte_page, gfp);
+                       if (!pte)
+                               goto out_free;
+
+                       dma_dom->aperture[index]->pte_pages[i] = pte_page;
+
+                       address += APERTURE_RANGE_SIZE / 64;
+               }
+       }
+
+       /* From here on the new range is part of the aperture */
+       dma_dom->aperture_size += APERTURE_RANGE_SIZE;
+
+       /* Initialize the exclusion range if necessary */
+       if (iommu->exclusion_start &&
+           iommu->exclusion_start >= dma_dom->aperture[index]->offset &&
+           iommu->exclusion_start < dma_dom->aperture_size) {
+               unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
+               int pages = iommu_num_pages(iommu->exclusion_start,
+                                           iommu->exclusion_length,
+                                           PAGE_SIZE);
+               dma_ops_reserve_addresses(dma_dom, startpage, pages);
+       }
+
+       /*
+        * Check for areas already mapped as present in the new aperture
+        * range and mark those pages as reserved in the allocator. Such
+        * mappings may already exist as a result of requested unity
+        * mappings for devices.
+        *
+        * addr is a byte address and therefore unsigned long, matching
+        * the type of aperture_size it is compared against.
+        */
+       for (addr = dma_dom->aperture[index]->offset;
+            addr < dma_dom->aperture_size;
+            addr += PAGE_SIZE) {
+               u64 *pte = fetch_pte(&dma_dom->domain, addr);
+               if (!pte || !IOMMU_PTE_PRESENT(*pte))
+                       continue;
+
+               /* dma_ops_reserve_addresses() expects a page index */
+               dma_ops_reserve_addresses(dma_dom, addr >> PAGE_SHIFT, 1);
+       }
+
+       return 0;
+
+out_free:
+       /* bitmap may still be NULL here if its allocation failed */
+       free_page((unsigned long)dma_dom->aperture[index]->bitmap);
+
+       kfree(dma_dom->aperture[index]);
+       dma_dom->aperture[index] = NULL;
+
+       return -ENOMEM;
+}
+
+/*
+ * Search the aperture ranges of @dom, beginning with the range that
+ * contains @start, for @pages free contiguous pages.
+ *
+ * @start only selects the first range to search; the starting bit
+ * inside that range is derived from dom->next_address. Ranges whose
+ * offset is not below @dma_mask end the search.
+ *
+ * Returns the allocated address (range offset plus page index shifted
+ * by PAGE_SHIFT) and advances dom->next_address past it, or returns -1
+ * if no range could satisfy the request.
+ */
+static unsigned long dma_ops_area_alloc(struct device *dev,
+                                       struct dma_ops_domain *dom,
+                                       unsigned int pages,
+                                       unsigned long align_mask,
+                                       u64 dma_mask,
+                                       unsigned long start)
+{
+       unsigned long next_bit = dom->next_address % APERTURE_RANGE_SIZE;
+       int max_index = dom->aperture_size >> APERTURE_RANGE_SHIFT;
+       int i = start >> APERTURE_RANGE_SHIFT;
+       unsigned long boundary_size;
+       unsigned long address = -1;
+       unsigned long limit;
+
+       /* Convert the byte offset within the range into a bitmap index */
+       next_bit >>= PAGE_SHIFT;
+
+       /* Segment boundary of the device, expressed in pages */
+       boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
+                       PAGE_SIZE) >> PAGE_SHIFT;
+
+       for (;i < max_index; ++i) {
+               unsigned long offset = dom->aperture[i]->offset >> PAGE_SHIFT;
+
+               /* This range starts at or above the device's DMA mask */
+               if (dom->aperture[i]->offset >= dma_mask)
+                       break;
+
+               limit = iommu_device_max_index(APERTURE_RANGE_PAGES, offset,
+                                              dma_mask >> PAGE_SHIFT);
+
+               address = iommu_area_alloc(dom->aperture[i]->bitmap,
+                                          limit, next_bit, pages, 0,
+                                           boundary_size, align_mask);
+               if (address != -1) {
+                       /* Turn the per-range page index into an address */
+                       address = dom->aperture[i]->offset +
+                                 (address << PAGE_SHIFT);
+                       dom->next_address = address + (pages << PAGE_SHIFT);
+                       break;
+               }
+
+               /* Every following range is searched from its first bit */
+               next_bit = 0;
+       }
+
+       return address;
+}
+
 static unsigned long dma_ops_alloc_addresses(struct device *dev,
                                             struct dma_ops_domain *dom,
                                             unsigned int pages,
                                             unsigned long align_mask,
                                             u64 dma_mask)
 {
-       unsigned long limit;
        unsigned long address;
-       unsigned long boundary_size;
 
-       boundary_size = ALIGN(dma_get_seg_boundary(dev) + 1,
-                       PAGE_SIZE) >> PAGE_SHIFT;
-       limit = iommu_device_max_index(dom->aperture_size >> PAGE_SHIFT, 0,
-                                      dma_mask >> PAGE_SHIFT);
+#ifdef CONFIG_IOMMU_STRESS
+       dom->next_address = 0;
+       dom->need_flush = true;
+#endif
 
-       if (dom->next_bit >= limit) {
-               dom->next_bit = 0;
-               dom->need_flush = true;
-       }
+       address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+                                    dma_mask, dom->next_address);
 
-       address = iommu_area_alloc(dom->bitmap, limit, dom->next_bit, pages,
-                                  0 , boundary_size, align_mask);
        if (address == -1) {
-               address = iommu_area_alloc(dom->bitmap, limit, 0, pages,
-                               0, boundary_size, align_mask);
+               dom->next_address = 0;
+               address = dma_ops_area_alloc(dev, dom, pages, align_mask,
+                                            dma_mask, 0);
                dom->need_flush = true;
        }
 
-       if (likely(address != -1)) {
-               dom->next_bit = address + pages;
-               address <<= PAGE_SHIFT;
-       } else
+       if (unlikely(address == -1))
                address = bad_dma_address;
 
        WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
@@ -684,11 +852,23 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
                                   unsigned long address,
                                   unsigned int pages)
 {
-       address >>= PAGE_SHIFT;
-       iommu_area_free(dom->bitmap, address, pages);
+       unsigned i = address >> APERTURE_RANGE_SHIFT;
+       struct aperture_range *range = dom->aperture[i];
 
-       if (address >= dom->next_bit)
+       BUG_ON(i >= APERTURE_MAX_RANGES || range == NULL);
+
+#ifdef CONFIG_IOMMU_STRESS
+       if (i < 4)
+               return;
+#endif
+
+       if (address >= dom->next_address)
                dom->need_flush = true;
+
+       address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
+
+       iommu_area_free(range->bitmap, address, pages);
+
 }
 
 /****************************************************************************
@@ -736,12 +916,16 @@ static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
                                      unsigned long start_page,
                                      unsigned int pages)
 {
-       unsigned int last_page = dom->aperture_size >> PAGE_SHIFT;
+       unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
 
        if (start_page + pages > last_page)
                pages = last_page - start_page;
 
-       iommu_area_reserve(dom->bitmap, start_page, pages);
+       for (i = start_page; i < start_page + pages; ++i) {
+               int index = i / APERTURE_RANGE_PAGES;
+               int page  = i % APERTURE_RANGE_PAGES;
+               __set_bit(page, dom->aperture[index]->bitmap);
+       }
 }
 
 static void free_pagetable(struct protection_domain *domain)
@@ -780,14 +964,19 @@ static void free_pagetable(struct protection_domain *domain)
  */
 static void dma_ops_domain_free(struct dma_ops_domain *dom)
 {
+       int i;
+
        if (!dom)
                return;
 
        free_pagetable(&dom->domain);
 
-       kfree(dom->pte_pages);
-
-       kfree(dom->bitmap);
+       for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
+               if (!dom->aperture[i])
+                       continue;
+               free_page((unsigned long)dom->aperture[i]->bitmap);
+               kfree(dom->aperture[i]);
+       }
 
        kfree(dom);
 }
@@ -797,19 +986,9 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
  * It also intializes the page table and the address allocator data
  * structures required for the dma_ops interface
  */
-static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
-                                                  unsigned order)
+static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
 {
        struct dma_ops_domain *dma_dom;
-       unsigned i, num_pte_pages;
-       u64 *l2_pde;
-       u64 address;
-
-       /*
-        * Currently the DMA aperture must be between 32 MB and 1GB in size
-        */
-       if ((order < 25) || (order > 30))
-               return NULL;
 
        dma_dom = kzalloc(sizeof(struct dma_ops_domain), GFP_KERNEL);
        if (!dma_dom)
@@ -826,55 +1005,20 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu,
        dma_dom->domain.priv = dma_dom;
        if (!dma_dom->domain.pt_root)
                goto free_dma_dom;
-       dma_dom->aperture_size = (1ULL << order);
-       dma_dom->bitmap = kzalloc(dma_dom->aperture_size / (PAGE_SIZE * 8),
-                                 GFP_KERNEL);
-       if (!dma_dom->bitmap)
-               goto free_dma_dom;
-       /*
-        * mark the first page as allocated so we never return 0 as
-        * a valid dma-address. So we can use 0 as error value
-        */
-       dma_dom->bitmap[0] = 1;
-       dma_dom->next_bit = 0;
 
        dma_dom->need_flush = false;
        dma_dom->target_dev = 0xffff;
 
-       /* Intialize the exclusion range if necessary */
-       if (iommu->exclusion_start &&
-           iommu->exclusion_start < dma_dom->aperture_size) {
-               unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT;
-               int pages = iommu_num_pages(iommu->exclusion_start,
-                                           iommu->exclusion_length,
-                                           PAGE_SIZE);
-               dma_ops_reserve_addresses(dma_dom, startpage, pages);
-       }
+       if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL))
+               goto free_dma_dom;
 
        /*
-        * At the last step, build the page tables so we don't need to
-        * allocate page table pages in the dma_ops mapping/unmapping
-        * path.
+        * mark the first page as allocated so we never return 0 as
+        * a valid dma-address. So we can use 0 as error value
         */
-       num_pte_pages = dma_dom->aperture_size / (PAGE_SIZE * 512);
-       dma_dom->pte_pages = kzalloc(num_pte_pages * sizeof(void *),
-                       GFP_KERNEL);
-       if (!dma_dom->pte_pages)
-               goto free_dma_dom;
-
-       l2_pde = (u64 *)get_zeroed_page(GFP_KERNEL);
-       if (l2_pde == NULL)
-               goto free_dma_dom;
+       dma_dom->aperture[0]->bitmap[0] = 1;
+       dma_dom->next_address = 0;
 
-       dma_dom->domain.pt_root[0] = IOMMU_L2_PDE(virt_to_phys(l2_pde));
-
-       for (i = 0; i < num_pte_pages; ++i) {
-               dma_dom->pte_pages[i] = (u64 *)get_zeroed_page(GFP_KERNEL);
-               if (!dma_dom->pte_pages[i])
-                       goto free_dma_dom;
-               address = virt_to_phys(dma_dom->pte_pages[i]);
-               l2_pde[i] = IOMMU_L1_PDE(address);
-       }
 
        return dma_dom;
 
@@ -983,7 +1127,6 @@ static int device_change_notifier(struct notifier_block *nb,
        struct protection_domain *domain;
        struct dma_ops_domain *dma_domain;
        struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
        unsigned long flags;
 
        if (devid > amd_iommu_last_bdf)
@@ -1002,17 +1145,7 @@ static int device_change_notifier(struct notifier_block *nb,
                          "to a non-dma-ops domain\n", dev_name(dev));
 
        switch (action) {
-       case BUS_NOTIFY_BOUND_DRIVER:
-               if (domain)
-                       goto out;
-               dma_domain = find_protection_domain(devid);
-               if (!dma_domain)
-                       dma_domain = iommu->default_dom;
-               attach_device(iommu, &dma_domain->domain, devid);
-               printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-                      "device %s\n", dma_domain->domain.id, dev_name(dev));
-               break;
-       case BUS_NOTIFY_UNBIND_DRIVER:
+       case BUS_NOTIFY_UNBOUND_DRIVER:
                if (!domain)
                        goto out;
                detach_device(domain, devid);
@@ -1022,7 +1155,7 @@ static int device_change_notifier(struct notifier_block *nb,
                dma_domain = find_protection_domain(devid);
                if (dma_domain)
                        goto out;
-               dma_domain = dma_ops_domain_alloc(iommu, order);
+               dma_domain = dma_ops_domain_alloc(iommu);
                if (!dma_domain)
                        goto out;
                dma_domain->target_dev = devid;
@@ -1133,8 +1266,8 @@ static int get_device_resources(struct device *dev,
                        dma_dom = (*iommu)->default_dom;
                *domain = &dma_dom->domain;
                attach_device(*iommu, *domain, *bdf);
-               printk(KERN_INFO "AMD IOMMU: Using protection domain %d for "
-                               "device %s\n", (*domain)->id, dev_name(dev));
+               DUMP_printk("Using protection domain %d for device %s\n",
+                           (*domain)->id, dev_name(dev));
        }
 
        if (domain_for_device(_bdf) == NULL)
@@ -1143,6 +1276,66 @@ static int get_device_resources(struct device *dev,
        return 1;
 }
 
+/*
+ * Walk the page table of @dom down to the level-0 table that covers
+ * @address. A level-2 or level-1 entry that is not present gets a page
+ * from get_zeroed_page(@gfp) installed first.
+ *
+ * If @pte_page is non-NULL, the start of the level-0 table is stored
+ * there on success; on failure it is left untouched.
+ *
+ * Returns a pointer to the level-0 entry for @address, or NULL if a
+ * page allocation failed.
+ */
+static u64 *alloc_pte(struct protection_domain *dom,
+                     unsigned long address, u64 **pte_page, gfp_t gfp)
+{
+       u64 *entry, *table;
+
+       /* level 2: entry in the root table */
+       entry = &dom->pt_root[IOMMU_PTE_L2_INDEX(address)];
+       if (!IOMMU_PTE_PRESENT(*entry)) {
+               table = (u64 *)get_zeroed_page(gfp);
+               if (table == NULL)
+                       return NULL;
+               *entry = IOMMU_L2_PDE(virt_to_phys(table));
+       }
+
+       /* level 1 */
+       table = IOMMU_PTE_PAGE(*entry);
+       entry = &table[IOMMU_PTE_L1_INDEX(address)];
+       if (!IOMMU_PTE_PRESENT(*entry)) {
+               table = (u64 *)get_zeroed_page(gfp);
+               if (table == NULL)
+                       return NULL;
+               *entry = IOMMU_L1_PDE(virt_to_phys(table));
+       }
+
+       /* level 0: report the table itself, then index into it */
+       table = IOMMU_PTE_PAGE(*entry);
+       if (pte_page != NULL)
+               *pte_page = table;
+
+       return &table[IOMMU_PTE_L0_INDEX(address)];
+}
+
+/*
+ * This function fetches the PTE for a given address in the aperture.
+ *
+ * The level-0 table for @address is looked up in the pte_pages[] cache of
+ * the aperture range covering it. On a cache miss the table is obtained
+ * through alloc_pte() with GFP_ATOMIC and remembered in the cache.
+ *
+ * Returns a pointer to the PTE, or NULL if @address lies in a range that
+ * has no aperture_range or if alloc_pte() failed.
+ */
+static u64 *dma_ops_get_pte(struct dma_ops_domain *dom,
+                           unsigned long address)
+{
+       struct aperture_range *aperture;
+       u64 *pte, *pte_page;
+
+       aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+       if (!aperture)
+               return NULL;
+
+       pte = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+       if (pte) {
+               /* cached entry is the table start; step to the PTE */
+               pte += IOMMU_PTE_L0_INDEX(address);
+               return pte;
+       }
+
+       pte = alloc_pte(&dom->domain, address, &pte_page, GFP_ATOMIC);
+       if (!pte)
+               /*
+                * alloc_pte() does not write pte_page when it fails, so
+                * the cache slot must not be touched here.
+                */
+               return NULL;
+
+       aperture->pte_pages[APERTURE_PAGE_INDEX(address)] = pte_page;
+
+       return pte;
+}
+
 /*
  * This is the generic map function. It maps one 4kb page at paddr to
  * the given address in the DMA address space for the domain.
@@ -1159,8 +1352,9 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
 
        paddr &= PAGE_MASK;
 
-       pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
-       pte += IOMMU_PTE_L0_INDEX(address);
+       pte  = dma_ops_get_pte(dom, address);
+       if (!pte)
+               return bad_dma_address;
 
        __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
 
@@ -1185,14 +1379,20 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
                                 struct dma_ops_domain *dom,
                                 unsigned long address)
 {
+       struct aperture_range *aperture;
        u64 *pte;
 
        if (address >= dom->aperture_size)
                return;
 
-       WARN_ON(address & ~PAGE_MASK || address >= dom->aperture_size);
+       aperture = dom->aperture[APERTURE_RANGE_INDEX(address)];
+       if (!aperture)
+               return;
+
+       pte  = aperture->pte_pages[APERTURE_PAGE_INDEX(address)];
+       if (!pte)
+               return;
 
-       pte  = dom->pte_pages[IOMMU_PTE_L1_INDEX(address)];
        pte += IOMMU_PTE_L0_INDEX(address);
 
        WARN_ON(!*pte);
@@ -1216,7 +1416,7 @@ static dma_addr_t __map_single(struct device *dev,
                               u64 dma_mask)
 {
        dma_addr_t offset = paddr & ~PAGE_MASK;
-       dma_addr_t address, start;
+       dma_addr_t address, start, ret;
        unsigned int pages;
        unsigned long align_mask = 0;
        int i;
@@ -1232,14 +1432,33 @@ static dma_addr_t __map_single(struct device *dev,
        if (align)
                align_mask = (1UL << get_order(size)) - 1;
 
+retry:
        address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
                                          dma_mask);
-       if (unlikely(address == bad_dma_address))
-               goto out;
+       if (unlikely(address == bad_dma_address)) {
+               /*
+                * setting next_address here will let the address
+                * allocator only scan the new allocated range in the
+                * first run. This is a small optimization.
+                */
+               dma_dom->next_address = dma_dom->aperture_size;
+
+               if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC))
+                       goto out;
+
+               /*
+                * aperture was successfully enlarged by 128 MB, try
+                * allocation again
+                */
+               goto retry;
+       }
 
        start = address;
        for (i = 0; i < pages; ++i) {
-               dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+               ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir);
+               if (ret == bad_dma_address)
+                       goto out_unmap;
+
                paddr += PAGE_SIZE;
                start += PAGE_SIZE;
        }
@@ -1255,6 +1474,17 @@ static dma_addr_t __map_single(struct device *dev,
 
 out:
        return address;
+
+out_unmap:
+
+       for (--i; i >= 0; --i) {
+               start -= PAGE_SIZE;
+               dma_ops_domain_unmap(iommu, dma_dom, start);
+       }
+
+       dma_ops_free_addresses(dma_dom, address, pages);
+
+       return bad_dma_address;
 }
 
 /*
@@ -1537,8 +1767,10 @@ static void *alloc_coherent(struct device *dev, size_t size,
        *dma_addr = __map_single(dev, iommu, domain->priv, paddr,
                                 size, DMA_BIDIRECTIONAL, true, dma_mask);
 
-       if (*dma_addr == bad_dma_address)
+       if (*dma_addr == bad_dma_address) {
+               spin_unlock_irqrestore(&domain->lock, flags);
                goto out_free;
+       }
 
        iommu_completion_wait(iommu);
 
@@ -1625,7 +1857,6 @@ static void prealloc_protection_domains(void)
        struct pci_dev *dev = NULL;
        struct dma_ops_domain *dma_dom;
        struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
        u16 devid;
 
        while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
@@ -1638,7 +1869,7 @@ static void prealloc_protection_domains(void)
                iommu = amd_iommu_rlookup_table[devid];
                if (!iommu)
                        continue;
-               dma_dom = dma_ops_domain_alloc(iommu, order);
+               dma_dom = dma_ops_domain_alloc(iommu);
                if (!dma_dom)
                        continue;
                init_unity_mappings_for_device(dma_dom, devid);
@@ -1664,7 +1895,6 @@ static struct dma_map_ops amd_iommu_dma_ops = {
 int __init amd_iommu_init_dma_ops(void)
 {
        struct amd_iommu *iommu;
-       int order = amd_iommu_aperture_order;
        int ret;
 
        /*
@@ -1672,8 +1902,8 @@ int __init amd_iommu_init_dma_ops(void)
         * found in the system. Devices not assigned to any other
         * protection domain will be assigned to the default one.
         */
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
-               iommu->default_dom = dma_ops_domain_alloc(iommu, order);
+       for_each_iommu(iommu) {
+               iommu->default_dom = dma_ops_domain_alloc(iommu);
                if (iommu->default_dom == NULL)
                        return -ENOMEM;
                iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -1710,7 +1940,7 @@ int __init amd_iommu_init_dma_ops(void)
 
 free_domains:
 
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
                if (iommu->default_dom)
                        dma_ops_domain_free(iommu->default_dom);
        }
@@ -1842,7 +2072,7 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
 
        old_domain = domain_for_device(devid);
        if (old_domain)
-               return -EBUSY;
+               detach_device(old_domain, devid);
 
        attach_device(iommu, domain, devid);
 
index 8c0be0902dacb2cc4e766cbe851891bd889f08e7..238989ec077df9e0b669c2f2a65d8b820a833510 100644 (file)
@@ -115,15 +115,21 @@ struct ivmd_header {
        u64 range_length;
 } __attribute__((packed));
 
+bool amd_iommu_dump;
+
 static int __initdata amd_iommu_detected;
 
 u16 amd_iommu_last_bdf;                        /* largest PCI device id we have
                                           to handle */
 LIST_HEAD(amd_iommu_unity_map);                /* a list of required unity mappings
                                           we find in ACPI */
-unsigned amd_iommu_aperture_order = 26; /* size of aperture in power of 2 */
+#ifdef CONFIG_IOMMU_STRESS
+bool amd_iommu_isolate = false;
+#else
 bool amd_iommu_isolate = true;         /* if true, device isolation is
                                           enabled */
+#endif
+
 bool amd_iommu_unmap_flush;            /* if true, flush on every unmap */
 
 LIST_HEAD(amd_iommu_list);             /* list of all AMD IOMMUs in the
@@ -175,7 +181,7 @@ static inline void update_last_devid(u16 devid)
 static inline unsigned long tbl_size(int entry_size)
 {
        unsigned shift = PAGE_SHIFT +
-                        get_order(amd_iommu_last_bdf * entry_size);
+                        get_order(((int)amd_iommu_last_bdf + 1) * entry_size);
 
        return 1UL << shift;
 }
@@ -193,7 +199,7 @@ static inline unsigned long tbl_size(int entry_size)
  * This function set the exclusion range in the IOMMU. DMA accesses to the
  * exclusion range are passed through untranslated
  */
-static void __init iommu_set_exclusion_range(struct amd_iommu *iommu)
+static void iommu_set_exclusion_range(struct amd_iommu *iommu)
 {
        u64 start = iommu->exclusion_start & PAGE_MASK;
        u64 limit = (start + iommu->exclusion_length) & PAGE_MASK;
@@ -225,7 +231,7 @@ static void __init iommu_set_device_table(struct amd_iommu *iommu)
 }
 
 /* Generic functions to enable/disable certain features of the IOMMU. */
-static void __init iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
+static void iommu_feature_enable(struct amd_iommu *iommu, u8 bit)
 {
        u32 ctrl;
 
@@ -244,7 +250,7 @@ static void __init iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
 }
 
 /* Function to enable the hardware */
-static void __init iommu_enable(struct amd_iommu *iommu)
+static void iommu_enable(struct amd_iommu *iommu)
 {
        printk(KERN_INFO "AMD IOMMU: Enabling IOMMU at %s cap 0x%hx\n",
               dev_name(&iommu->dev->dev), iommu->cap_ptr);
@@ -252,11 +258,9 @@ static void __init iommu_enable(struct amd_iommu *iommu)
        iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
 }
 
-/* Function to enable IOMMU event logging and event interrupts */
-static void __init iommu_enable_event_logging(struct amd_iommu *iommu)
+static void iommu_disable(struct amd_iommu *iommu)
 {
-       iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
-       iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+       iommu_feature_disable(iommu, CONTROL_IOMMU_EN);
 }
 
 /*
@@ -413,25 +417,36 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
 {
        u8 *cmd_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                        get_order(CMD_BUFFER_SIZE));
-       u64 entry;
 
        if (cmd_buf == NULL)
                return NULL;
 
        iommu->cmd_buf_size = CMD_BUFFER_SIZE;
 
-       entry = (u64)virt_to_phys(cmd_buf);
+       return cmd_buf;
+}
+
+/*
+ * This function writes the command buffer address to the hardware and
+ * enables it.
+ */
+static void iommu_enable_command_buffer(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       BUG_ON(iommu->cmd_buf == NULL);
+
+       entry = (u64)virt_to_phys(iommu->cmd_buf);
        entry |= MMIO_CMD_SIZE_512;
+
        memcpy_toio(iommu->mmio_base + MMIO_CMD_BUF_OFFSET,
-                       &entry, sizeof(entry));
+                   &entry, sizeof(entry));
 
        /* set head and tail to zero manually */
        writel(0x00, iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
        writel(0x00, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
 
        iommu_feature_enable(iommu, CONTROL_CMDBUF_EN);
-
-       return cmd_buf;
 }
 
 static void __init free_command_buffer(struct amd_iommu *iommu)
@@ -443,20 +458,27 @@ static void __init free_command_buffer(struct amd_iommu *iommu)
 /* allocates the memory where the IOMMU will log its events to */
 static u8 * __init alloc_event_buffer(struct amd_iommu *iommu)
 {
-       u64 entry;
        iommu->evt_buf = (u8 *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
                                                get_order(EVT_BUFFER_SIZE));
 
        if (iommu->evt_buf == NULL)
                return NULL;
 
+       return iommu->evt_buf;
+}
+
+static void iommu_enable_event_buffer(struct amd_iommu *iommu)
+{
+       u64 entry;
+
+       BUG_ON(iommu->evt_buf == NULL);
+
        entry = (u64)virt_to_phys(iommu->evt_buf) | EVT_LEN_MASK;
+
        memcpy_toio(iommu->mmio_base + MMIO_EVT_BUF_OFFSET,
                    &entry, sizeof(entry));
 
-       iommu->evt_buf_size = EVT_BUFFER_SIZE;
-
-       return iommu->evt_buf;
+       iommu_feature_enable(iommu, CONTROL_EVT_LOG_EN);
 }
 
 static void __init free_event_buffer(struct amd_iommu *iommu)
@@ -596,32 +618,83 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
        p += sizeof(struct ivhd_header);
        end += h->length;
 
+
        while (p < end) {
                e = (struct ivhd_entry *)p;
                switch (e->type) {
                case IVHD_DEV_ALL:
+
+                       DUMP_printk("  DEV_ALL\t\t\t first devid: %02x:%02x.%x"
+                                   " last device %02x:%02x.%x flags: %02x\n",
+                                   PCI_BUS(iommu->first_device),
+                                   PCI_SLOT(iommu->first_device),
+                                   PCI_FUNC(iommu->first_device),
+                                   PCI_BUS(iommu->last_device),
+                                   PCI_SLOT(iommu->last_device),
+                                   PCI_FUNC(iommu->last_device),
+                                   e->flags);
+
                        for (dev_i = iommu->first_device;
                                        dev_i <= iommu->last_device; ++dev_i)
                                set_dev_entry_from_acpi(iommu, dev_i,
                                                        e->flags, 0);
                        break;
                case IVHD_DEV_SELECT:
+
+                       DUMP_printk("  DEV_SELECT\t\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags);
+
                        devid = e->devid;
                        set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
                        break;
                case IVHD_DEV_SELECT_RANGE_START:
+
+                       DUMP_printk("  DEV_SELECT_RANGE_START\t "
+                                   "devid: %02x:%02x.%x flags: %02x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags);
+
                        devid_start = e->devid;
                        flags = e->flags;
                        ext_flags = 0;
                        alias = false;
                        break;
                case IVHD_DEV_ALIAS:
+
+                       DUMP_printk("  DEV_ALIAS\t\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x devid_to: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags,
+                                   PCI_BUS(e->ext >> 8),
+                                   PCI_SLOT(e->ext >> 8),
+                                   PCI_FUNC(e->ext >> 8));
+
                        devid = e->devid;
                        devid_to = e->ext >> 8;
-                       set_dev_entry_from_acpi(iommu, devid, e->flags, 0);
+                       set_dev_entry_from_acpi(iommu, devid_to, e->flags, 0);
                        amd_iommu_alias_table[devid] = devid_to;
                        break;
                case IVHD_DEV_ALIAS_RANGE:
+
+                       DUMP_printk("  DEV_ALIAS_RANGE\t\t "
+                                   "devid: %02x:%02x.%x flags: %02x "
+                                   "devid_to: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags,
+                                   PCI_BUS(e->ext >> 8),
+                                   PCI_SLOT(e->ext >> 8),
+                                   PCI_FUNC(e->ext >> 8));
+
                        devid_start = e->devid;
                        flags = e->flags;
                        devid_to = e->ext >> 8;
@@ -629,17 +702,39 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
                        alias = true;
                        break;
                case IVHD_DEV_EXT_SELECT:
+
+                       DUMP_printk("  DEV_EXT_SELECT\t\t devid: %02x:%02x.%x "
+                                   "flags: %02x ext: %08x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags, e->ext);
+
                        devid = e->devid;
                        set_dev_entry_from_acpi(iommu, devid, e->flags,
                                                e->ext);
                        break;
                case IVHD_DEV_EXT_SELECT_RANGE:
+
+                       DUMP_printk("  DEV_EXT_SELECT_RANGE\t devid: "
+                                   "%02x:%02x.%x flags: %02x ext: %08x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid),
+                                   e->flags, e->ext);
+
                        devid_start = e->devid;
                        flags = e->flags;
                        ext_flags = e->ext;
                        alias = false;
                        break;
                case IVHD_DEV_RANGE_END:
+
+                       DUMP_printk("  DEV_RANGE_END\t\t devid: %02x:%02x.%x\n",
+                                   PCI_BUS(e->devid),
+                                   PCI_SLOT(e->devid),
+                                   PCI_FUNC(e->devid));
+
                        devid = e->devid;
                        for (dev_i = devid_start; dev_i <= devid; ++dev_i) {
                                if (alias)
@@ -679,7 +774,7 @@ static void __init free_iommu_all(void)
 {
        struct amd_iommu *iommu, *next;
 
-       list_for_each_entry_safe(iommu, next, &amd_iommu_list, list) {
+       for_each_iommu_safe(iommu, next) {
                list_del(&iommu->list);
                free_iommu_one(iommu);
                kfree(iommu);
@@ -710,7 +805,6 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
        if (!iommu->mmio_base)
                return -ENOMEM;
 
-       iommu_set_device_table(iommu);
        iommu->cmd_buf = alloc_command_buffer(iommu);
        if (!iommu->cmd_buf)
                return -ENOMEM;
@@ -746,6 +840,15 @@ static int __init init_iommu_all(struct acpi_table_header *table)
                h = (struct ivhd_header *)p;
                switch (*p) {
                case ACPI_IVHD_TYPE:
+
+                       DUMP_printk("IOMMU: device: %02x:%02x.%01x cap: %04x "
+                                   "seg: %d flags: %01x info %04x\n",
+                                   PCI_BUS(h->devid), PCI_SLOT(h->devid),
+                                   PCI_FUNC(h->devid), h->cap_ptr,
+                                   h->pci_seg, h->flags, h->info);
+                       DUMP_printk("       mmio-addr: %016llx\n",
+                                   h->mmio_phys);
+
                        iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
                        if (iommu == NULL)
                                return -ENOMEM;
@@ -773,56 +876,9 @@ static int __init init_iommu_all(struct acpi_table_header *table)
  *
  ****************************************************************************/
 
-static int __init iommu_setup_msix(struct amd_iommu *iommu)
-{
-       struct amd_iommu *curr;
-       struct msix_entry entries[32]; /* only 32 supported by AMD IOMMU */
-       int nvec = 0, i;
-
-       list_for_each_entry(curr, &amd_iommu_list, list) {
-               if (curr->dev == iommu->dev) {
-                       entries[nvec].entry = curr->evt_msi_num;
-                       entries[nvec].vector = 0;
-                       curr->int_enabled = true;
-                       nvec++;
-               }
-       }
-
-       if (pci_enable_msix(iommu->dev, entries, nvec)) {
-               pci_disable_msix(iommu->dev);
-               return 1;
-       }
-
-       for (i = 0; i < nvec; ++i) {
-               int r = request_irq(entries->vector, amd_iommu_int_handler,
-                                   IRQF_SAMPLE_RANDOM,
-                                   "AMD IOMMU",
-                                   NULL);
-               if (r)
-                       goto out_free;
-       }
-
-       return 0;
-
-out_free:
-       for (i -= 1; i >= 0; --i)
-               free_irq(entries->vector, NULL);
-
-       pci_disable_msix(iommu->dev);
-
-       return 1;
-}
-
 static int __init iommu_setup_msi(struct amd_iommu *iommu)
 {
        int r;
-       struct amd_iommu *curr;
-
-       list_for_each_entry(curr, &amd_iommu_list, list) {
-               if (curr->dev == iommu->dev)
-                       curr->int_enabled = true;
-       }
-
 
        if (pci_enable_msi(iommu->dev))
                return 1;
@@ -837,17 +893,18 @@ static int __init iommu_setup_msi(struct amd_iommu *iommu)
                return 1;
        }
 
+       iommu->int_enabled = true;
+       iommu_feature_enable(iommu, CONTROL_EVT_INT_EN);
+
        return 0;
 }
 
-static int __init iommu_init_msi(struct amd_iommu *iommu)
+static int iommu_init_msi(struct amd_iommu *iommu)
 {
        if (iommu->int_enabled)
                return 0;
 
-       if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSIX))
-               return iommu_setup_msix(iommu);
-       else if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
+       if (pci_find_capability(iommu->dev, PCI_CAP_ID_MSI))
                return iommu_setup_msi(iommu);
 
        return 1;
@@ -899,6 +956,7 @@ static int __init init_exclusion_range(struct ivmd_header *m)
 static int __init init_unity_map_range(struct ivmd_header *m)
 {
        struct unity_map_entry *e = 0;
+       char *s;
 
        e = kzalloc(sizeof(*e), GFP_KERNEL);
        if (e == NULL)
@@ -906,14 +964,19 @@ static int __init init_unity_map_range(struct ivmd_header *m)
 
        switch (m->type) {
        default:
+               kfree(e);
+               return 0;
        case ACPI_IVMD_TYPE:
+               s = "IVMD_TYPEi\t\t\t";
                e->devid_start = e->devid_end = m->devid;
                break;
        case ACPI_IVMD_TYPE_ALL:
+               s = "IVMD_TYPE_ALL\t\t";
                e->devid_start = 0;
                e->devid_end = amd_iommu_last_bdf;
                break;
        case ACPI_IVMD_TYPE_RANGE:
+               s = "IVMD_TYPE_RANGE\t\t";
                e->devid_start = m->devid;
                e->devid_end = m->aux;
                break;
@@ -922,6 +985,13 @@ static int __init init_unity_map_range(struct ivmd_header *m)
        e->address_end = e->address_start + PAGE_ALIGN(m->range_length);
        e->prot = m->flags >> 1;
 
+       DUMP_printk("%s devid_start: %02x:%02x.%x devid_end: %02x:%02x.%x"
+                   " range_start: %016llx range_end: %016llx flags: %x\n", s,
+                   PCI_BUS(e->devid_start), PCI_SLOT(e->devid_start),
+                   PCI_FUNC(e->devid_start), PCI_BUS(e->devid_end),
+                   PCI_SLOT(e->devid_end), PCI_FUNC(e->devid_end),
+                   e->address_start, e->address_end, m->flags);
+
        list_add_tail(&e->list, &amd_iommu_unity_map);
 
        return 0;
@@ -967,18 +1037,28 @@ static void init_device_table(void)
  * This function finally enables all IOMMUs found in the system after
  * they have been initialized
  */
-static void __init enable_iommus(void)
+static void enable_iommus(void)
 {
        struct amd_iommu *iommu;
 
-       list_for_each_entry(iommu, &amd_iommu_list, list) {
+       for_each_iommu(iommu) {
+               iommu_set_device_table(iommu);
+               iommu_enable_command_buffer(iommu);
+               iommu_enable_event_buffer(iommu);
                iommu_set_exclusion_range(iommu);
                iommu_init_msi(iommu);
-               iommu_enable_event_logging(iommu);
                iommu_enable(iommu);
        }
 }
 
+/* Call iommu_disable() on every IOMMU visited by for_each_iommu(). */
+static void disable_iommus(void)
+{
+       struct amd_iommu *iommu;
+
+       for_each_iommu(iommu)
+               iommu_disable(iommu);
+}
+
 /*
  * Suspend/Resume support
  * disable suspend until real resume implemented
@@ -986,12 +1066,31 @@ static void __init enable_iommus(void)
 
 static int amd_iommu_resume(struct sys_device *dev)
 {
+       /*
+        * Disable IOMMUs before reprogramming the hardware registers.
+        * IOMMU is still enabled from the resume kernel.
+        */
+       disable_iommus();
+
+       /* re-load the hardware */
+       enable_iommus();
+
+       /*
+        * we have to flush after the IOMMUs are enabled because a
+        * disabled IOMMU will never execute the commands we send
+        */
+       amd_iommu_flush_all_domains();
+       amd_iommu_flush_all_devices();
+
        return 0;
 }
 
 static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
 {
-       return -EINVAL;
+       /* disable IOMMUs to go out of the way for BIOS */
+       disable_iommus();
+
+       return 0;
 }
 
 static struct sysdev_class amd_iommu_sysdev_class = {
@@ -1137,9 +1236,6 @@ int __init amd_iommu_init(void)
 
        enable_iommus();
 
-       printk(KERN_INFO "AMD IOMMU: aperture size is %d MB\n",
-                       (1 << (amd_iommu_aperture_order-20)));
-
        printk(KERN_INFO "AMD IOMMU: device isolation ");
        if (amd_iommu_isolate)
                printk("enabled\n");
@@ -1211,6 +1307,13 @@ void __init amd_iommu_detect(void)
  *
  ****************************************************************************/
 
+/*
+ * Handler registered for the "amd_iommu_dump" kernel parameter: sets
+ * amd_iommu_dump to true. @str is not looked at.
+ */
+static int __init parse_amd_iommu_dump(char *str)
+{
+       amd_iommu_dump = true;
+
+       return 1;
+}
+
 static int __init parse_amd_iommu_options(char *str)
 {
        for (; *str; ++str) {
@@ -1225,15 +1328,5 @@ static int __init parse_amd_iommu_options(char *str)
        return 1;
 }
 
-static int __init parse_amd_iommu_size_options(char *str)
-{
-       unsigned order = PAGE_SHIFT + get_order(memparse(str, &str));
-
-       if ((order > 24) && (order < 31))
-               amd_iommu_aperture_order = order;
-
-       return 1;
-}
-
+__setup("amd_iommu_dump", parse_amd_iommu_dump);
 __setup("amd_iommu=", parse_amd_iommu_options);
-__setup("amd_iommu_size=", parse_amd_iommu_size_options);
index f2870920f246a9f1d7e2075c37b65b05de27dbf1..a4c9cf0bf70bfdef62c638ed10d64c2a33a0a74b 100644 (file)
@@ -98,6 +98,29 @@ early_param("lapic", parse_lapic);
 /* Local APIC was disabled by the BIOS and enabled by the kernel */
 static int enabled_via_apicbase;
 
+/*
+ * Handle interrupt mode configuration register (IMCR).
+ * This register controls whether the interrupt signals
+ * that reach the BSP come from the master PIC or from the
+ * local APIC. Before entering Symmetric I/O Mode, either
+ * the BIOS or the operating system must switch out of
+ * PIC Mode by changing the IMCR.
+ *
+ * The switch is a two-step sequence: an outb() of 0x70 to
+ * 0x22 selects the IMCR, then an outb() to 0x23 sets the
+ * new mode.
+ */
+static inline void imcr_pic_to_apic(void)
+{
+       /* select IMCR register */
+       outb(0x70, 0x22);
+       /* NMI and 8259 INTR go through APIC */
+       outb(0x01, 0x23);
+}
+
+static inline void imcr_apic_to_pic(void)
+{
+       /* select IMCR register (same selector write as imcr_pic_to_apic()) */
+       outb(0x70, 0x22);
+       /* NMI and 8259 INTR go directly to BSP */
+       outb(0x00, 0x23);
+}
 #endif
 
 #ifdef CONFIG_X86_64
@@ -111,13 +134,19 @@ static __init int setup_apicpmtimer(char *s)
 __setup("apicpmtimer", setup_apicpmtimer);
 #endif
 
+int x2apic_mode;
 #ifdef CONFIG_X86_X2APIC
-int x2apic;
 /* x2apic enabled before OS handover */
 static int x2apic_preenabled;
 static int disable_x2apic;
 static __init int setup_nox2apic(char *str)
 {
+       if (x2apic_enabled()) {
+               pr_warning("Bios already enabled x2apic, "
+                          "can't enforce nox2apic");
+               return 0;
+       }
+
        disable_x2apic = 1;
        setup_clear_cpu_cap(X86_FEATURE_X2APIC);
        return 0;
@@ -209,6 +238,31 @@ static int modern_apic(void)
        return lapic_get_version() >= 0x14;
 }
 
+/*
+ * Dummy replacements for the APIC register accessors, installed by
+ * apic_disable(). The write is dropped and the read yields 0.
+ *
+ * Ending up here is only worth a warning when the CPU has an APIC and
+ * the APIC has not been disabled; both helpers use that same condition
+ * so that a write on an APIC-less CPU does not warn while a read of the
+ * same register stays silent.
+ */
+static void native_apic_write_dummy(u32 reg, u32 v)
+{
+       WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+}
+
+static u32 native_apic_read_dummy(u32 reg)
+{
+       WARN_ON_ONCE((cpu_has_apic && !disable_apic));
+       return 0;
+}
+
+/*
+ * After this call apic->read and apic->write point at
+ * native_apic_read_dummy() and native_apic_write_dummy() and no longer
+ * do anything. This works one way only: there is no restore operation.
+ */
+void apic_disable(void)
+{
+       apic->read = native_apic_read_dummy;
+       apic->write = native_apic_write_dummy;
+}
+
 void native_apic_wait_icr_idle(void)
 {
        while (apic_read(APIC_ICR) & APIC_ICR_BUSY)
@@ -348,7 +402,7 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
 
 static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask)
 {
-       unsigned long reg = (lvt_off << 4) + APIC_EILVT0;
+       unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0);
        unsigned int  v   = (mask << 16) | (msg_type << 8) | vector;
 
        apic_write(reg, v);
@@ -815,7 +869,7 @@ void clear_local_APIC(void)
        u32 v;
 
        /* APIC hasn't been mapped yet */
-       if (!x2apic && !apic_phys)
+       if (!x2apic_mode && !apic_phys)
                return;
 
        maxlvt = lapic_get_maxlvt();
@@ -1287,7 +1341,7 @@ void check_x2apic(void)
 {
        if (x2apic_enabled()) {
                pr_info("x2apic enabled by BIOS, switching to x2apic ops\n");
-               x2apic_preenabled = x2apic = 1;
+               x2apic_preenabled = x2apic_mode = 1;
        }
 }
 
@@ -1295,7 +1349,7 @@ void enable_x2apic(void)
 {
        int msr, msr2;
 
-       if (!x2apic)
+       if (!x2apic_mode)
                return;
 
        rdmsr(MSR_IA32_APICBASE, msr, msr2);
@@ -1304,6 +1358,7 @@ void enable_x2apic(void)
                wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
        }
 }
+#endif /* CONFIG_X86_X2APIC */
 
 void __init enable_IR_x2apic(void)
 {
@@ -1312,32 +1367,21 @@ void __init enable_IR_x2apic(void)
        unsigned long flags;
        struct IO_APIC_route_entry **ioapic_entries = NULL;
 
-       if (!cpu_has_x2apic)
-               return;
-
-       if (!x2apic_preenabled && disable_x2apic) {
-               pr_info("Skipped enabling x2apic and Interrupt-remapping "
-                       "because of nox2apic\n");
-               return;
+       ret = dmar_table_init();
+       if (ret) {
+               pr_debug("dmar_table_init() failed with %d:\n", ret);
+               goto ir_failed;
        }
 
-       if (x2apic_preenabled && disable_x2apic)
-               panic("Bios already enabled x2apic, can't enforce nox2apic");
-
-       if (!x2apic_preenabled && skip_ioapic_setup) {
-               pr_info("Skipped enabling x2apic and Interrupt-remapping "
-                       "because of skipping io-apic setup\n");
-               return;
+       if (!intr_remapping_supported()) {
+               pr_debug("intr-remapping not supported\n");
+               goto ir_failed;
        }
 
-       ret = dmar_table_init();
-       if (ret) {
-               pr_info("dmar_table_init() failed with %d:\n", ret);
 
-               if (x2apic_preenabled)
-                       panic("x2apic enabled by bios. But IR enabling failed");
-               else
-                       pr_info("Not enabling x2apic,Intr-remapping\n");
+       if (!x2apic_preenabled && skip_ioapic_setup) {
+               pr_info("Skipped enabling intr-remap because of skipping "
+                       "io-apic setup\n");
                return;
        }
 
@@ -1357,19 +1401,16 @@ void __init enable_IR_x2apic(void)
        mask_IO_APIC_setup(ioapic_entries);
        mask_8259A();
 
-       ret = enable_intr_remapping(EIM_32BIT_APIC_ID);
-
-       if (ret && x2apic_preenabled) {
-               local_irq_restore(flags);
-               panic("x2apic enabled by bios. But IR enabling failed");
-       }
-
+       ret = enable_intr_remapping(x2apic_supported());
        if (ret)
                goto end_restore;
 
-       if (!x2apic) {
-               x2apic = 1;
+       pr_info("Enabled Interrupt-remapping\n");
+
+       if (x2apic_supported() && !x2apic_mode) {
+               x2apic_mode = 1;
                enable_x2apic();
+               pr_info("Enabled x2apic\n");
        }
 
 end_restore:
@@ -1378,37 +1419,34 @@ end_restore:
                 * IR enabling failed
                 */
                restore_IO_APIC_setup(ioapic_entries);
-       else
-               reinit_intr_remapped_IO_APIC(x2apic_preenabled, ioapic_entries);
 
        unmask_8259A();
        local_irq_restore(flags);
 
 end:
-       if (!ret) {
-               if (!x2apic_preenabled)
-                       pr_info("Enabled x2apic and interrupt-remapping\n");
-               else
-                       pr_info("Enabled Interrupt-remapping\n");
-       } else
-               pr_err("Failed to enable Interrupt-remapping and x2apic\n");
        if (ioapic_entries)
                free_ioapic_entries(ioapic_entries);
+
+       if (!ret)
+               return;
+
+ir_failed:
+       if (x2apic_preenabled)
+               panic("x2apic enabled by bios. But IR enabling failed");
+       else if (cpu_has_x2apic)
+               pr_info("Not enabling x2apic,Intr-remapping\n");
 #else
        if (!cpu_has_x2apic)
                return;
 
        if (x2apic_preenabled)
                panic("x2apic enabled prior OS handover,"
-                     " enable CONFIG_INTR_REMAP");
-
-       pr_info("Enable CONFIG_INTR_REMAP for enabling intr-remapping "
-               " and x2apic\n");
+                     " enable CONFIG_X86_X2APIC, CONFIG_INTR_REMAP");
 #endif
 
        return;
 }
-#endif /* CONFIG_X86_X2APIC */
+
 
 #ifdef CONFIG_X86_64
 /*
@@ -1425,7 +1463,6 @@ static int __init detect_init_APIC(void)
        }
 
        mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
-       boot_cpu_physical_apicid = 0;
        return 0;
 }
 #else
@@ -1539,32 +1576,49 @@ void __init early_init_lapic_mapping(void)
  */
 void __init init_apic_mappings(void)
 {
-       if (x2apic) {
+       unsigned int new_apicid;
+
+       if (x2apic_mode) {
                boot_cpu_physical_apicid = read_apic_id();
                return;
        }
 
-       /*
-        * If no local APIC can be found then set up a fake all
-        * zeroes page to simulate the local APIC and another
-        * one for the IO-APIC.
-        */
+       /* If no local APIC can be found, disable the APIC facility */
        if (!smp_found_config && detect_init_APIC()) {
-               apic_phys = (unsigned long) alloc_bootmem_pages(PAGE_SIZE);
-               apic_phys = __pa(apic_phys);
-       } else
+               /* let's NOP'ify apic operations */
+               pr_info("APIC: disable apic facility\n");
+               apic_disable();
+       } else {
                apic_phys = mp_lapic_addr;
 
-       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
-       apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
-                               APIC_BASE, apic_phys);
+               /*
+                * acpi lapic path already maps that address in
+                * acpi_register_lapic_address()
+                */
+               if (!acpi_lapic)
+                       set_fixmap_nocache(FIX_APIC_BASE, apic_phys);
+
+               apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
+                                       APIC_BASE, apic_phys);
+       }
 
        /*
         * Fetch the APIC ID of the BSP in case we have a
         * default configuration (or the MP table is broken).
         */
-       if (boot_cpu_physical_apicid == -1U)
-               boot_cpu_physical_apicid = read_apic_id();
+       new_apicid = read_apic_id();
+       if (boot_cpu_physical_apicid != new_apicid) {
+               boot_cpu_physical_apicid = new_apicid;
+               /*
+                * Yes, we lie about apic_version here
+                * if the APIC was disabled via a boot option,
+                * but that is not a problem for an SMP-compiled kernel,
+                * since smp_sanity_check() is prepared for such a case
+                * and disables SMP mode.
+                */
+               apic_version[new_apicid] =
+                        GET_APIC_VERSION(apic_read(APIC_LVR));
+       }
 }
 
 /*
@@ -1733,8 +1787,7 @@ void __init connect_bsp_APIC(void)
                 */
                apic_printk(APIC_VERBOSE, "leaving PIC mode, "
                                "enabling APIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x01, 0x23);
+               imcr_pic_to_apic();
        }
 #endif
        if (apic->enable_apic_mode)
@@ -1762,8 +1815,7 @@ void disconnect_bsp_APIC(int virt_wire_setup)
                 */
                apic_printk(APIC_VERBOSE, "disabling APIC mode, "
                                "entering PIC mode.\n");
-               outb(0x70, 0x22);
-               outb(0x00, 0x23);
+               imcr_apic_to_pic();
                return;
        }
 #endif
@@ -1969,10 +2021,10 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
 
        local_irq_save(flags);
        disable_local_APIC();
-#ifdef CONFIG_INTR_REMAP
+
        if (intr_remapping_enabled)
                disable_intr_remapping();
-#endif
+
        local_irq_restore(flags);
        return 0;
 }
@@ -1982,42 +2034,34 @@ static int lapic_resume(struct sys_device *dev)
        unsigned int l, h;
        unsigned long flags;
        int maxlvt;
-
-#ifdef CONFIG_INTR_REMAP
-       int ret;
+       int ret = 0;
        struct IO_APIC_route_entry **ioapic_entries = NULL;
 
        if (!apic_pm_state.active)
                return 0;
 
        local_irq_save(flags);
-       if (x2apic) {
+       if (intr_remapping_enabled) {
                ioapic_entries = alloc_ioapic_entries();
                if (!ioapic_entries) {
                        WARN(1, "Alloc ioapic_entries in lapic resume failed.");
-                       return -ENOMEM;
+                       ret = -ENOMEM;
+                       goto restore;
                }
 
                ret = save_IO_APIC_setup(ioapic_entries);
                if (ret) {
                        WARN(1, "Saving IO-APIC state failed: %d\n", ret);
                        free_ioapic_entries(ioapic_entries);
-                       return ret;
+                       goto restore;
                }
 
                mask_IO_APIC_setup(ioapic_entries);
                mask_8259A();
-               enable_x2apic();
        }
-#else
-       if (!apic_pm_state.active)
-               return 0;
 
-       local_irq_save(flags);
-       if (x2apic)
+       if (x2apic_mode)
                enable_x2apic();
-#endif
-
        else {
                /*
                 * Make sure the APICBASE points to the right address
@@ -2055,21 +2099,16 @@ static int lapic_resume(struct sys_device *dev)
        apic_write(APIC_ESR, 0);
        apic_read(APIC_ESR);
 
-#ifdef CONFIG_INTR_REMAP
-       if (intr_remapping_enabled)
-               reenable_intr_remapping(EIM_32BIT_APIC_ID);
-
-       if (x2apic) {
+       if (intr_remapping_enabled) {
+               reenable_intr_remapping(x2apic_mode);
                unmask_8259A();
                restore_IO_APIC_setup(ioapic_entries);
                free_ioapic_entries(ioapic_entries);
        }
-#endif
-
+restore:
        local_irq_restore(flags);
 
-
-       return 0;
+       return ret;
 }
 
 /*
@@ -2117,31 +2156,14 @@ static void apic_pm_activate(void) { }
 #endif /* CONFIG_PM */
 
 #ifdef CONFIG_X86_64
-/*
- * apic_is_clustered_box() -- Check if we can expect good TSC
- *
- * Thus far, the major user of this is IBM's Summit2 series:
- *
- * Clustered boxes may have unsynced TSC problems if they are
- * multi-chassis. Use available data to take a good guess.
- * If in doubt, go HPET.
- */
-__cpuinit int apic_is_clustered_box(void)
+
+static int __cpuinit apic_cluster_num(void)
 {
        int i, clusters, zeros;
        unsigned id;
        u16 *bios_cpu_apicid;
        DECLARE_BITMAP(clustermap, NUM_APIC_CLUSTERS);
 
-       /*
-        * there is not this kind of box with AMD CPU yet.
-        * Some AMD box with quadcore cpu and 8 sockets apicid
-        * will be [4, 0x23] or [8, 0x27] could be thought to
-        * vsmp box still need checking...
-        */
-       if ((boot_cpu_data.x86_vendor == X86_VENDOR_AMD) && !is_vsmp_box())
-               return 0;
-
        bios_cpu_apicid = early_per_cpu_ptr(x86_bios_cpu_apicid);
        bitmap_zero(clustermap, NUM_APIC_CLUSTERS);
 
@@ -2177,18 +2199,67 @@ __cpuinit int apic_is_clustered_box(void)
                        ++zeros;
        }
 
-       /* ScaleMP vSMPowered boxes have one cluster per board and TSCs are
-        * not guaranteed to be synced between boards
-        */
-       if (is_vsmp_box() && clusters > 1)
+       return clusters;
+}
+
+static int __cpuinitdata multi_checked;
+static int __cpuinitdata multi;
+
+static int __cpuinit set_multi(const struct dmi_system_id *d)
+{
+       if (multi)
+               return 0;
+       pr_info("APIC: %s detected, Multi Chassis\n", d->ident);
+       multi = 1;
+       return 0;
+}
+
+static const __cpuinitconst struct dmi_system_id multi_dmi_table[] = {
+       {
+               .callback = set_multi,
+               .ident = "IBM System Summit2",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "Summit2"),
+               },
+       },
+       {}
+};
+
+static void __cpuinit dmi_check_multi(void)
+{
+       if (multi_checked)
+               return;
+
+       dmi_check_system(multi_dmi_table);
+       multi_checked = 1;
+}
+
+/*
+ * apic_is_clustered_box() -- Check if we can expect good TSC
+ *
+ * Thus far, the major user of this is IBM's Summit2 series:
+ * Clustered boxes may have unsynced TSC problems if they are
+ * multi-chassis.
+ * Use DMI to detect them; vSMP boxes are checked by APIC cluster count.
+ */
+__cpuinit int apic_is_clustered_box(void)
+{
+       dmi_check_multi();
+       if (multi)
                return 1;
 
+       if (!is_vsmp_box())
+               return 0;
+
        /*
-        * If clusters > 2, then should be multi-chassis.
-        * May have to revisit this when multi-core + hyperthreaded CPUs come
-        * out, but AFAIK this will work even for them.
+        * ScaleMP vSMPowered boxes have one cluster per board and TSCs are
+        * not guaranteed to be synced between boards
         */
-       return (clusters > 2);
+       if (apic_cluster_num() > 1)
+               return 1;
+
+       return 0;
 }
 #endif
 
index 306e5e88fb6f4abc9a53a3ce8638ac904a67067e..d0c99abc26c32a8f0df48d61485b1f36acb9b4b2 100644 (file)
@@ -161,7 +161,7 @@ static int flat_apic_id_registered(void)
 
 static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
 {
-       return hard_smp_processor_id() >> index_msb;
+       return initial_apic_id >> index_msb;
 }
 
 struct apic apic_flat =  {
@@ -235,7 +235,7 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
         * regardless of how many processors are present (x86_64 ES7000
         * is an example).
         */
-       if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID &&
+       if (acpi_gbl_FADT.header.revision >= FADT2_REVISION_ID &&
                (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) {
                printk(KERN_DEBUG "system APIC only can use physical flat");
                return 1;
index 302947775575ebb69a670a4ef10f0184b2b82ac9..69328ac8de9c86e106d30af1337a80ab0d4488b1 100644 (file)
@@ -145,7 +145,7 @@ es7000_rename_gsi(int ioapic, int gsi)
        return gsi;
 }
 
-static int wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
+static int __cpuinit wakeup_secondary_cpu_via_mip(int cpu, unsigned long eip)
 {
        unsigned long vect = 0, psaival = 0;
 
index 30da617d18e4735de234904fdc005cf527930c89..1946fac42ab3cb8a406d46509a983163cd347008 100644 (file)
@@ -59,6 +59,7 @@
 #include <asm/setup.h>
 #include <asm/irq_remapping.h>
 #include <asm/hpet.h>
+#include <asm/hw_irq.h>
 #include <asm/uv/uv_hub.h>
 #include <asm/uv/uv_irq.h>
 
@@ -129,12 +130,9 @@ struct irq_pin_list {
        struct irq_pin_list *next;
 };
 
-static struct irq_pin_list *get_one_free_irq_2_pin(int cpu)
+static struct irq_pin_list *get_one_free_irq_2_pin(int node)
 {
        struct irq_pin_list *pin;
-       int node;
-
-       node = cpu_to_node(cpu);
 
        pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
 
@@ -148,9 +146,6 @@ struct irq_cfg {
        unsigned move_cleanup_count;
        u8 vector;
        u8 move_in_progress : 1;
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-       u8 move_desc_pending : 1;
-#endif
 };
 
 /* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
@@ -212,12 +207,9 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
        return cfg;
 }
 
-static struct irq_cfg *get_one_free_irq_cfg(int cpu)
+static struct irq_cfg *get_one_free_irq_cfg(int node)
 {
        struct irq_cfg *cfg;
-       int node;
-
-       node = cpu_to_node(cpu);
 
        cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node);
        if (cfg) {
@@ -238,13 +230,13 @@ static struct irq_cfg *get_one_free_irq_cfg(int cpu)
        return cfg;
 }
 
-int arch_init_chip_data(struct irq_desc *desc, int cpu)
+int arch_init_chip_data(struct irq_desc *desc, int node)
 {
        struct irq_cfg *cfg;
 
        cfg = desc->chip_data;
        if (!cfg) {
-               desc->chip_data = get_one_free_irq_cfg(cpu);
+               desc->chip_data = get_one_free_irq_cfg(node);
                if (!desc->chip_data) {
                        printk(KERN_ERR "can not alloc irq_cfg\n");
                        BUG_ON(1);
@@ -254,10 +246,9 @@ int arch_init_chip_data(struct irq_desc *desc, int cpu)
        return 0;
 }
 
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-
+/* for move_irq_desc */
 static void
-init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
+init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
 {
        struct irq_pin_list *old_entry, *head, *tail, *entry;
 
@@ -266,7 +257,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
        if (!old_entry)
                return;
 
-       entry = get_one_free_irq_2_pin(cpu);
+       entry = get_one_free_irq_2_pin(node);
        if (!entry)
                return;
 
@@ -276,7 +267,7 @@ init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int cpu)
        tail            = entry;
        old_entry       = old_entry->next;
        while (old_entry) {
-               entry = get_one_free_irq_2_pin(cpu);
+               entry = get_one_free_irq_2_pin(node);
                if (!entry) {
                        entry = head;
                        while (entry) {
@@ -316,12 +307,12 @@ static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg)
 }
 
 void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                struct irq_desc *desc, int cpu)
+                                struct irq_desc *desc, int node)
 {
        struct irq_cfg *cfg;
        struct irq_cfg *old_cfg;
 
-       cfg = get_one_free_irq_cfg(cpu);
+       cfg = get_one_free_irq_cfg(node);
 
        if (!cfg)
                return;
@@ -332,7 +323,7 @@ void arch_init_copy_chip_data(struct irq_desc *old_desc,
 
        memcpy(cfg, old_cfg, sizeof(struct irq_cfg));
 
-       init_copy_irq_2_pin(old_cfg, cfg, cpu);
+       init_copy_irq_2_pin(old_cfg, cfg, node);
 }
 
 static void free_irq_cfg(struct irq_cfg *old_cfg)
@@ -356,19 +347,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
                old_desc->chip_data = NULL;
        }
 }
-
-static void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg = desc->chip_data;
-
-       if (!cfg->move_in_progress) {
-               /* it means that domain is not changed */
-               if (!cpumask_intersects(desc->affinity, mask))
-                       cfg->move_desc_pending = 1;
-       }
-}
-#endif
+/* end for move_irq_desc */
 
 #else
 static struct irq_cfg *irq_cfg(unsigned int irq)
@@ -378,13 +357,6 @@ static struct irq_cfg *irq_cfg(unsigned int irq)
 
 #endif
 
-#ifndef CONFIG_NUMA_MIGRATE_IRQ_DESC
-static inline void
-set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-}
-#endif
-
 struct io_apic {
        unsigned int index;
        unsigned int unused[3];
@@ -518,132 +490,18 @@ static void ioapic_mask_entry(int apic, int pin)
        spin_unlock_irqrestore(&ioapic_lock, flags);
 }
 
-#ifdef CONFIG_SMP
-static void send_cleanup_vector(struct irq_cfg *cfg)
-{
-       cpumask_var_t cleanup_mask;
-
-       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
-               unsigned int i;
-               cfg->move_cleanup_count = 0;
-               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-                       cfg->move_cleanup_count++;
-               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
-                       apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
-       } else {
-               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
-               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
-               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
-               free_cpumask_var(cleanup_mask);
-       }
-       cfg->move_in_progress = 0;
-}
-
-static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
-{
-       int apic, pin;
-       struct irq_pin_list *entry;
-       u8 vector = cfg->vector;
-
-       entry = cfg->irq_2_pin;
-       for (;;) {
-               unsigned int reg;
-
-               if (!entry)
-                       break;
-
-               apic = entry->apic;
-               pin = entry->pin;
-               /*
-                * With interrupt-remapping, destination information comes
-                * from interrupt-remapping table entry.
-                */
-               if (!irq_remapped(irq))
-                       io_apic_write(apic, 0x11 + pin*2, dest);
-               reg = io_apic_read(apic, 0x10 + pin*2);
-               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
-               reg |= vector;
-               io_apic_modify(apic, 0x10 + pin*2, reg);
-               if (!entry->next)
-                       break;
-               entry = entry->next;
-       }
-}
-
-static int
-assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
-
-/*
- * Either sets desc->affinity to a valid value, and returns
- * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
- * leaves desc->affinity untouched.
- */
-static unsigned int
-set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg;
-       unsigned int irq;
-
-       if (!cpumask_intersects(mask, cpu_online_mask))
-               return BAD_APICID;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-       if (assign_irq_vector(irq, cfg, mask))
-               return BAD_APICID;
-
-       /* check that before desc->addinity get updated */
-       set_extra_move_desc(desc, mask);
-
-       cpumask_copy(desc->affinity, mask);
-
-       return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
-}
-
-static void
-set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
-{
-       struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int dest;
-       unsigned int irq;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       dest = set_desc_affinity(desc, mask);
-       if (dest != BAD_APICID) {
-               /* Only the high 8 bits are valid. */
-               dest = SET_APIC_LOGICAL_ID(dest);
-               __target_IO_APIC_irq(irq, dest, cfg);
-       }
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-static void
-set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
-{
-       struct irq_desc *desc;
-
-       desc = irq_to_desc(irq);
-
-       set_ioapic_affinity_irq_desc(desc, mask);
-}
-#endif /* CONFIG_SMP */
-
 /*
  * The common case is 1:1 IRQ<->pin mappings. Sometimes there are
  * shared ISA-space IRQs, so we have to support them. We are super
  * fast in the common case, and fast for shared ISA-space IRQs.
  */
-static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
+static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
 {
        struct irq_pin_list *entry;
 
        entry = cfg->irq_2_pin;
        if (!entry) {
-               entry = get_one_free_irq_2_pin(cpu);
+               entry = get_one_free_irq_2_pin(node);
                if (!entry) {
                        printk(KERN_ERR "can not alloc irq_2_pin to add %d - %d\n",
                                        apic, pin);
@@ -663,7 +521,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
                entry = entry->next;
        }
 
-       entry->next = get_one_free_irq_2_pin(cpu);
+       entry->next = get_one_free_irq_2_pin(node);
        entry = entry->next;
        entry->apic = apic;
        entry->pin = pin;
@@ -672,7 +530,7 @@ static void add_pin_to_irq_cpu(struct irq_cfg *cfg, int cpu, int apic, int pin)
 /*
  * Reroute an IRQ to a different pin.
  */
-static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
+static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
                                      int oldapic, int oldpin,
                                      int newapic, int newpin)
 {
@@ -692,7 +550,7 @@ static void __init replace_pin_at_irq_cpu(struct irq_cfg *cfg, int cpu,
 
        /* why? call replace before add? */
        if (!replaced)
-               add_pin_to_irq_cpu(cfg, cpu, newapic, newpin);
+               add_pin_to_irq_node(cfg, node, newapic, newpin);
 }
 
 static inline void io_apic_modify_irq(struct irq_cfg *cfg,
@@ -850,7 +708,6 @@ static int __init ioapic_pirq_setup(char *str)
 __setup("pirq=", ioapic_pirq_setup);
 #endif /* CONFIG_X86_32 */
 
-#ifdef CONFIG_INTR_REMAP
 struct IO_APIC_route_entry **alloc_ioapic_entries(void)
 {
        int apic;
@@ -948,20 +805,6 @@ int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
        return 0;
 }
 
-void reinit_intr_remapped_IO_APIC(int intr_remapping,
-       struct IO_APIC_route_entry **ioapic_entries)
-
-{
-       /*
-        * for now plain restore of previous settings.
-        * TBD: In the case of OS enabling interrupt-remapping,
-        * IO-APIC RTE's need to be setup to point to interrupt-remapping
-        * table entries. for now, do a plain restore, and wait for
-        * the setup_IO_APIC_irqs() to do proper initialization.
-        */
-       restore_IO_APIC_setup(ioapic_entries);
-}
-
 void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 {
        int apic;
@@ -971,7 +814,6 @@ void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
 
        kfree(ioapic_entries);
 }
-#endif
 
 /*
  * Find the IRQ entry number of a certain pin.
@@ -1032,54 +874,6 @@ static int __init find_isa_irq_apic(int irq, int type)
        return -1;
 }
 
-/*
- * Find a specific PCI IRQ entry.
- * Not an __init, possibly needed by modules
- */
-static int pin_2_irq(int idx, int apic, int pin);
-
-int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin)
-{
-       int apic, i, best_guess = -1;
-
-       apic_printk(APIC_DEBUG, "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
-               bus, slot, pin);
-       if (test_bit(bus, mp_bus_not_pci)) {
-               apic_printk(APIC_VERBOSE, "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
-               return -1;
-       }
-       for (i = 0; i < mp_irq_entries; i++) {
-               int lbus = mp_irqs[i].srcbus;
-
-               for (apic = 0; apic < nr_ioapics; apic++)
-                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
-                           mp_irqs[i].dstapic == MP_APIC_ALL)
-                               break;
-
-               if (!test_bit(lbus, mp_bus_not_pci) &&
-                   !mp_irqs[i].irqtype &&
-                   (bus == lbus) &&
-                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
-                       int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
-
-                       if (!(apic || IO_APIC_IRQ(irq)))
-                               continue;
-
-                       if (pin == (mp_irqs[i].srcbusirq & 3))
-                               return irq;
-                       /*
-                        * Use the first all-but-pin matching entry as a
-                        * best-guess fuzzy result for broken mptables.
-                        */
-                       if (best_guess < 0)
-                               best_guess = irq;
-               }
-       }
-       return best_guess;
-}
-
-EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
-
 #if defined(CONFIG_EISA) || defined(CONFIG_MCA)
 /*
  * EISA Edge/Level control register, ELCR
@@ -1298,6 +1092,64 @@ static int pin_2_irq(int idx, int apic, int pin)
        return irq;
 }
 
+/*
+ * Find a specific PCI IRQ entry.
+ * Not an __init, possibly needed by modules
+ */
+int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       int apic, i, best_guess = -1;
+
+       apic_printk(APIC_DEBUG,
+                   "querying PCI -> IRQ mapping bus:%d, slot:%d, pin:%d.\n",
+                   bus, slot, pin);
+       if (test_bit(bus, mp_bus_not_pci)) {
+               apic_printk(APIC_VERBOSE,
+                           "PCI BIOS passed nonexistent PCI bus %d!\n", bus);
+               return -1;
+       }
+       for (i = 0; i < mp_irq_entries; i++) {
+               int lbus = mp_irqs[i].srcbus;
+
+               for (apic = 0; apic < nr_ioapics; apic++)
+                       if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic ||
+                           mp_irqs[i].dstapic == MP_APIC_ALL)
+                               break;
+
+               if (!test_bit(lbus, mp_bus_not_pci) &&
+                   !mp_irqs[i].irqtype &&
+                   (bus == lbus) &&
+                   (slot == ((mp_irqs[i].srcbusirq >> 2) & 0x1f))) {
+                       int irq = pin_2_irq(i, apic, mp_irqs[i].dstirq);
+
+                       if (!(apic || IO_APIC_IRQ(irq)))
+                               continue;
+
+                       if (pin == (mp_irqs[i].srcbusirq & 3)) {
+                               set_io_apic_irq_attr(irq_attr, apic,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               return irq;
+                       }
+                       /*
+                        * Use the first all-but-pin matching entry as a
+                        * best-guess fuzzy result for broken mptables.
+                        */
+                       if (best_guess < 0) {
+                               set_io_apic_irq_attr(irq_attr, apic,
+                                                    mp_irqs[i].dstirq,
+                                                    irq_trigger(i),
+                                                    irq_polarity(i));
+                               best_guess = irq;
+                       }
+               }
+       }
+       return best_guess;
+}
+EXPORT_SYMBOL(IO_APIC_get_PCI_irq_vector);
+
 void lock_vector_lock(void)
 {
        /* Used to the online set of cpus does not change
@@ -1628,58 +1480,70 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
        ioapic_write_entry(apic_id, pin, entry);
 }
 
+static struct {
+       DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
+} mp_ioapic_routing[MAX_IO_APICS];
+
 static void __init setup_IO_APIC_irqs(void)
 {
-       int apic_id, pin, idx, irq;
+       int apic_id = 0, pin, idx, irq;
        int notcon = 0;
        struct irq_desc *desc;
        struct irq_cfg *cfg;
-       int cpu = boot_cpu_id;
+       int node = cpu_to_node(boot_cpu_id);
 
        apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
 
-       for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
-               for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
-
-                       idx = find_irq_entry(apic_id, pin, mp_INT);
-                       if (idx == -1) {
-                               if (!notcon) {
-                                       notcon = 1;
-                                       apic_printk(APIC_VERBOSE,
-                                               KERN_DEBUG " %d-%d",
-                                               mp_ioapics[apic_id].apicid, pin);
-                               } else
-                                       apic_printk(APIC_VERBOSE, " %d-%d",
-                                               mp_ioapics[apic_id].apicid, pin);
-                               continue;
-                       }
-                       if (notcon) {
-                               apic_printk(APIC_VERBOSE,
-                                       " (apicid-pin) not connected\n");
-                               notcon = 0;
-                       }
+#ifdef CONFIG_ACPI
+       if (!acpi_disabled && acpi_ioapic) {
+               apic_id = mp_find_ioapic(0);
+               if (apic_id < 0)
+                       apic_id = 0;
+       }
+#endif
 
-                       irq = pin_2_irq(idx, apic_id, pin);
+       for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
+               idx = find_irq_entry(apic_id, pin, mp_INT);
+               if (idx == -1) {
+                       if (!notcon) {
+                               notcon = 1;
+                               apic_printk(APIC_VERBOSE,
+                                       KERN_DEBUG " %d-%d",
+                                       mp_ioapics[apic_id].apicid, pin);
+                       } else
+                               apic_printk(APIC_VERBOSE, " %d-%d",
+                                       mp_ioapics[apic_id].apicid, pin);
+                       continue;
+               }
+               if (notcon) {
+                       apic_printk(APIC_VERBOSE,
+                               " (apicid-pin) not connected\n");
+                       notcon = 0;
+               }
 
-                       /*
-                        * Skip the timer IRQ if there's a quirk handler
-                        * installed and if it returns 1:
-                        */
-                       if (apic->multi_timer_check &&
-                                       apic->multi_timer_check(apic_id, irq))
-                               continue;
+               irq = pin_2_irq(idx, apic_id, pin);
 
-                       desc = irq_to_desc_alloc_cpu(irq, cpu);
-                       if (!desc) {
-                               printk(KERN_INFO "can not get irq_desc for %d\n", irq);
-                               continue;
-                       }
-                       cfg = desc->chip_data;
-                       add_pin_to_irq_cpu(cfg, cpu, apic_id, pin);
+               /*
+                * Skip the timer IRQ if there's a quirk handler
+                * installed and if it returns 1:
+                */
+               if (apic->multi_timer_check &&
+                               apic->multi_timer_check(apic_id, irq))
+                       continue;
 
-                       setup_IO_APIC_irq(apic_id, pin, irq, desc,
-                                       irq_trigger(idx), irq_polarity(idx));
+               desc = irq_to_desc_alloc_node(irq, node);
+               if (!desc) {
+                       printk(KERN_INFO "can not get irq_desc for %d\n", irq);
+                       continue;
                }
+               cfg = desc->chip_data;
+               add_pin_to_irq_node(cfg, node, apic_id, pin);
+               /*
+                * don't mark it in pin_programmed, so later acpi could
+                * set it correctly when irq < 16
+                */
+               setup_IO_APIC_irq(apic_id, pin, irq, desc,
+                               irq_trigger(idx), irq_polarity(idx));
        }
 
        if (notcon)
@@ -1869,7 +1733,7 @@ __apicdebuginit(void) print_APIC_bitfield(int base)
 
 __apicdebuginit(void) print_local_APIC(void *dummy)
 {
-       unsigned int v, ver, maxlvt;
+       unsigned int i, v, ver, maxlvt;
        u64 icr;
 
        if (apic_verbosity == APIC_QUIET)
@@ -1957,6 +1821,18 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
        printk(KERN_DEBUG "... APIC TMCCT: %08x\n", v);
        v = apic_read(APIC_TDCR);
        printk(KERN_DEBUG "... APIC TDCR: %08x\n", v);
+
+       if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+               v = apic_read(APIC_EFEAT);
+               maxlvt = (v >> 16) & 0xff;
+               printk(KERN_DEBUG "... APIC EFEAT: %08x\n", v);
+               v = apic_read(APIC_ECTRL);
+               printk(KERN_DEBUG "... APIC ECTRL: %08x\n", v);
+               for (i = 0; i < maxlvt; i++) {
+                       v = apic_read(APIC_EILVTn(i));
+                       printk(KERN_DEBUG "... APIC EILVT%d: %08x\n", i, v);
+               }
+       }
        printk("\n");
 }
 
@@ -2005,6 +1881,11 @@ __apicdebuginit(void) print_PIC(void)
 __apicdebuginit(int) print_all_ICs(void)
 {
        print_PIC();
+
+       /* don't print out if apic is not there */
+       if (!cpu_has_apic || disable_apic)
+               return 0;
+
        print_all_local_APICs();
        print_IO_APIC();
 
@@ -2360,9 +2241,121 @@ static int ioapic_retrigger_irq(unsigned int irq)
  */
 
 #ifdef CONFIG_SMP
+static void send_cleanup_vector(struct irq_cfg *cfg)
+{
+       cpumask_var_t cleanup_mask;
+
+       if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
+               unsigned int i;
+               cfg->move_cleanup_count = 0;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       cfg->move_cleanup_count++;
+               for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
+                       apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
+       } else {
+               cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
+               cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
+               apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
+               free_cpumask_var(cleanup_mask);
+       }
+       cfg->move_in_progress = 0;
+}
+
+static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq_cfg *cfg)
+{
+       int apic, pin;
+       struct irq_pin_list *entry;
+       u8 vector = cfg->vector;
+
+       entry = cfg->irq_2_pin;
+       for (;;) {
+               unsigned int reg;
+
+               if (!entry)
+                       break;
+
+               apic = entry->apic;
+               pin = entry->pin;
+               /*
+                * With interrupt-remapping, destination information comes
+                * from interrupt-remapping table entry.
+                */
+               if (!irq_remapped(irq))
+                       io_apic_write(apic, 0x11 + pin*2, dest);
+               reg = io_apic_read(apic, 0x10 + pin*2);
+               reg &= ~IO_APIC_REDIR_VECTOR_MASK;
+               reg |= vector;
+               io_apic_modify(apic, 0x10 + pin*2, reg);
+               if (!entry->next)
+                       break;
+               entry = entry->next;
+       }
+}
+
+static int
+assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
+
+/*
+ * Either sets desc->affinity to a valid value, and returns
+ * ->cpu_mask_to_apicid of that, or returns BAD_APICID and
+ * leaves desc->affinity untouched.
+ */
+static unsigned int
+set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned int irq;
+
+       if (!cpumask_intersects(mask, cpu_online_mask))
+               return BAD_APICID;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+       if (assign_irq_vector(irq, cfg, mask))
+               return BAD_APICID;
+
+       cpumask_copy(desc->affinity, mask);
+
+       return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
+}
+
+static int
+set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int dest;
+       unsigned int irq;
+       int ret = -1;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       dest = set_desc_affinity(desc, mask);
+       if (dest != BAD_APICID) {
+               /* Only the high 8 bits are valid. */
+               dest = SET_APIC_LOGICAL_ID(dest);
+               __target_IO_APIC_irq(irq, dest, cfg);
+               ret = 0;
+       }
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+
+       return ret;
+}
+
+static int
+set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
+{
+       struct irq_desc *desc;
+
+       desc = irq_to_desc(irq);
+
+       return set_ioapic_affinity_irq_desc(desc, mask);
+}
+
+#ifdef CONFIG_INTR_REMAP
 
-#ifdef CONFIG_INTR_REMAP
-
 /*
  * Migrate the IO-APIC irq in the presence of intr-remapping.
  *
@@ -2374,26 +2367,25 @@ static int ioapic_retrigger_irq(unsigned int irq)
  * Real vector that is used for interrupting cpu will be coming from
  * the interrupt-remapping table entry.
  */
-static void
+static int
 migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 {
        struct irq_cfg *cfg;
        struct irte irte;
        unsigned int dest;
        unsigned int irq;
+       int ret = -1;
 
        if (!cpumask_intersects(mask, cpu_online_mask))
-               return;
+               return ret;
 
        irq = desc->irq;
        if (get_irte(irq, &irte))
-               return;
+               return ret;
 
        cfg = desc->chip_data;
        if (assign_irq_vector(irq, cfg, mask))
-               return;
-
-       set_extra_move_desc(desc, mask);
+               return ret;
 
        dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
 
@@ -2409,27 +2401,30 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
                send_cleanup_vector(cfg);
 
        cpumask_copy(desc->affinity, mask);
+
+       return 0;
 }
 
 /*
  * Migrates the IRQ destination in the process context.
  */
-static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                            const struct cpumask *mask)
 {
-       migrate_ioapic_irq_desc(desc, mask);
+       return migrate_ioapic_irq_desc(desc, mask);
 }
-static void set_ir_ioapic_affinity_irq(unsigned int irq,
+static int set_ir_ioapic_affinity_irq(unsigned int irq,
                                       const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
 
-       set_ir_ioapic_affinity_irq_desc(desc, mask);
+       return set_ir_ioapic_affinity_irq_desc(desc, mask);
 }
 #else
-static inline void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
+static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
                                                   const struct cpumask *mask)
 {
+       return 0;
 }
 #endif
 
@@ -2491,86 +2486,19 @@ static void irq_complete_move(struct irq_desc **descp)
        struct irq_cfg *cfg = desc->chip_data;
        unsigned vector, me;
 
-       if (likely(!cfg->move_in_progress)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-               if (likely(!cfg->move_desc_pending))
-                       return;
-
-               /* domain has not changed, but affinity did */
-               me = smp_processor_id();
-               if (cpumask_test_cpu(me, desc->affinity)) {
-                       *descp = desc = move_irq_desc(desc, me);
-                       /* get the new one */
-                       cfg = desc->chip_data;
-                       cfg->move_desc_pending = 0;
-               }
-#endif
+       if (likely(!cfg->move_in_progress))
                return;
-       }
 
        vector = ~get_irq_regs()->orig_ax;
        me = smp_processor_id();
 
-       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) {
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-               *descp = desc = move_irq_desc(desc, me);
-               /* get the new one */
-               cfg = desc->chip_data;
-#endif
+       if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
                send_cleanup_vector(cfg);
-       }
 }
 #else
 static inline void irq_complete_move(struct irq_desc **descp) {}
 #endif
 
-static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
-{
-       int apic, pin;
-       struct irq_pin_list *entry;
-
-       entry = cfg->irq_2_pin;
-       for (;;) {
-
-               if (!entry)
-                       break;
-
-               apic = entry->apic;
-               pin = entry->pin;
-               io_apic_eoi(apic, pin);
-               entry = entry->next;
-       }
-}
-
-static void
-eoi_ioapic_irq(struct irq_desc *desc)
-{
-       struct irq_cfg *cfg;
-       unsigned long flags;
-       unsigned int irq;
-
-       irq = desc->irq;
-       cfg = desc->chip_data;
-
-       spin_lock_irqsave(&ioapic_lock, flags);
-       __eoi_ioapic_irq(irq, cfg);
-       spin_unlock_irqrestore(&ioapic_lock, flags);
-}
-
-#ifdef CONFIG_X86_X2APIC
-static void ack_x2apic_level(unsigned int irq)
-{
-       struct irq_desc *desc = irq_to_desc(irq);
-       ack_x2APIC_irq();
-       eoi_ioapic_irq(desc);
-}
-
-static void ack_x2apic_edge(unsigned int irq)
-{
-       ack_x2APIC_irq();
-}
-#endif
-
 static void ack_apic_edge(unsigned int irq)
 {
        struct irq_desc *desc = irq_to_desc(irq);
@@ -2634,9 +2562,6 @@ static void ack_apic_level(unsigned int irq)
         */
        ack_APIC_irq();
 
-       if (irq_remapped(irq))
-               eoi_ioapic_irq(desc);
-
        /* Now we can move and renable the irq */
        if (unlikely(do_unmask_irq)) {
                /* Only migrate the irq if the ack has been received.
@@ -2683,22 +2608,50 @@ static void ack_apic_level(unsigned int irq)
 }
 
 #ifdef CONFIG_INTR_REMAP
+static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
+{
+       int apic, pin;
+       struct irq_pin_list *entry;
+
+       entry = cfg->irq_2_pin;
+       for (;;) {
+
+               if (!entry)
+                       break;
+
+               apic = entry->apic;
+               pin = entry->pin;
+               io_apic_eoi(apic, pin);
+               entry = entry->next;
+       }
+}
+
+static void
+eoi_ioapic_irq(struct irq_desc *desc)
+{
+       struct irq_cfg *cfg;
+       unsigned long flags;
+       unsigned int irq;
+
+       irq = desc->irq;
+       cfg = desc->chip_data;
+
+       spin_lock_irqsave(&ioapic_lock, flags);
+       __eoi_ioapic_irq(irq, cfg);
+       spin_unlock_irqrestore(&ioapic_lock, flags);
+}
+
 static void ir_ack_apic_edge(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_edge(irq);
-#endif
-       return ack_apic_edge(irq);
+       ack_APIC_irq();
 }
 
 static void ir_ack_apic_level(unsigned int irq)
 {
-#ifdef CONFIG_X86_X2APIC
-       if (x2apic_enabled())
-               return ack_x2apic_level(irq);
-#endif
-       return ack_apic_level(irq);
+       struct irq_desc *desc = irq_to_desc(irq);
+
+       ack_APIC_irq();
+       eoi_ioapic_irq(desc);
 }
 #endif /* CONFIG_INTR_REMAP */
 
@@ -2903,7 +2856,7 @@ static inline void __init check_timer(void)
 {
        struct irq_desc *desc = irq_to_desc(0);
        struct irq_cfg *cfg = desc->chip_data;
-       int cpu = boot_cpu_id;
+       int node = cpu_to_node(boot_cpu_id);
        int apic1, pin1, apic2, pin2;
        unsigned long flags;
        int no_pin1 = 0;
@@ -2969,7 +2922,7 @@ static inline void __init check_timer(void)
                 * Ok, does IRQ0 through the IOAPIC work?
                 */
                if (no_pin1) {
-                       add_pin_to_irq_cpu(cfg, cpu, apic1, pin1);
+                       add_pin_to_irq_node(cfg, node, apic1, pin1);
                        setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
                } else {
                        /* for edge trigger, setup_IO_APIC_irq already
@@ -3006,7 +2959,7 @@ static inline void __init check_timer(void)
                /*
                 * legacy devices should be connected to IO APIC #0
                 */
-               replace_pin_at_irq_cpu(cfg, cpu, apic1, pin1, apic2, pin2);
+               replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
                setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
                enable_8259A_irq(0);
                if (timer_irq_works()) {
@@ -3218,14 +3171,13 @@ static int nr_irqs_gsi = NR_IRQS_LEGACY;
 /*
  * Dynamic irq allocate and deallocation
  */
-unsigned int create_irq_nr(unsigned int irq_want)
+unsigned int create_irq_nr(unsigned int irq_want, int node)
 {
        /* Allocate an unused irq */
        unsigned int irq;
        unsigned int new;
        unsigned long flags;
        struct irq_cfg *cfg_new = NULL;
-       int cpu = boot_cpu_id;
        struct irq_desc *desc_new = NULL;
 
        irq = 0;
@@ -3234,7 +3186,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
        spin_lock_irqsave(&vector_lock, flags);
        for (new = irq_want; new < nr_irqs; new++) {
-               desc_new = irq_to_desc_alloc_cpu(new, cpu);
+               desc_new = irq_to_desc_alloc_node(new, node);
                if (!desc_new) {
                        printk(KERN_INFO "can not get irq_desc for %d\n", new);
                        continue;
@@ -3243,6 +3195,9 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
                if (cfg_new->vector != 0)
                        continue;
+
+               desc_new = move_irq_desc(desc_new, node);
+
                if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
                        irq = new;
                break;
@@ -3260,11 +3215,12 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 int create_irq(void)
 {
+       int node = cpu_to_node(boot_cpu_id);
        unsigned int irq_want;
        int irq;
 
        irq_want = nr_irqs_gsi;
-       irq = create_irq_nr(irq_want);
+       irq = create_irq_nr(irq_want, node);
 
        if (irq == 0)
                irq = -1;
@@ -3366,7 +3322,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
 }
 
 #ifdef CONFIG_SMP
-static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
@@ -3375,7 +3331,7 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
        dest = set_desc_affinity(desc, mask);
        if (dest == BAD_APICID)
-               return;
+               return -1;
 
        cfg = desc->chip_data;
 
@@ -3387,13 +3343,15 @@ static void set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        write_msi_msg_desc(desc, &msg);
+
+       return 0;
 }
 #ifdef CONFIG_INTR_REMAP
 /*
  * Migrate the MSI irq to another cpumask. This migration is
  * done in the process context using interrupt-remapping hardware.
  */
-static void
+static int
 ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
@@ -3402,11 +3360,11 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
        struct irte irte;
 
        if (get_irte(irq, &irte))
-               return;
+               return -1;
 
        dest = set_desc_affinity(desc, mask);
        if (dest == BAD_APICID)
-               return;
+               return -1;
 
        irte.vector = cfg->vector;
        irte.dest_id = IRTE_DEST(dest);
@@ -3423,6 +3381,8 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
         */
        if (cfg->move_in_progress)
                send_cleanup_vector(cfg);
+
+       return 0;
 }
 
 #endif
@@ -3518,15 +3478,17 @@ int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
        unsigned int irq_want;
        struct intel_iommu *iommu = NULL;
        int index = 0;
+       int node;
 
        /* x86 doesn't support multiple MSI yet */
        if (type == PCI_CAP_ID_MSI && nvec > 1)
                return 1;
 
+       node = dev_to_node(&dev->dev);
        irq_want = nr_irqs_gsi;
        sub_handle = 0;
        list_for_each_entry(msidesc, &dev->msi_list, list) {
-               irq = create_irq_nr(irq_want);
+               irq = create_irq_nr(irq_want, node);
                if (irq == 0)
                        return -1;
                irq_want = irq + 1;
@@ -3576,7 +3538,7 @@ void arch_teardown_msi_irq(unsigned int irq)
 
 #if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
 #ifdef CONFIG_SMP
-static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
@@ -3585,7 +3547,7 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        dest = set_desc_affinity(desc, mask);
        if (dest == BAD_APICID)
-               return;
+               return -1;
 
        cfg = desc->chip_data;
 
@@ -3597,6 +3559,8 @@ static void dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        dmar_msi_write(irq, &msg);
+
+       return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3630,7 +3594,7 @@ int arch_setup_dmar_msi(unsigned int irq)
 #ifdef CONFIG_HPET_TIMER
 
 #ifdef CONFIG_SMP
-static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
+static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
@@ -3639,7 +3603,7 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
 
        dest = set_desc_affinity(desc, mask);
        if (dest == BAD_APICID)
-               return;
+               return -1;
 
        cfg = desc->chip_data;
 
@@ -3651,6 +3615,8 @@ static void hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
        msg.address_lo |= MSI_ADDR_DEST_ID(dest);
 
        hpet_msi_write(irq, &msg);
+
+       return 0;
 }
 
 #endif /* CONFIG_SMP */
@@ -3707,7 +3673,7 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
        write_ht_irq_msg(irq, &msg);
 }
 
-static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
+static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 {
        struct irq_desc *desc = irq_to_desc(irq);
        struct irq_cfg *cfg;
@@ -3715,11 +3681,13 @@ static void set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
 
        dest = set_desc_affinity(desc, mask);
        if (dest == BAD_APICID)
-               return;
+               return -1;
 
        cfg = desc->chip_data;
 
        target_ht_irq(irq, dest, cfg->vector);
+
+       return 0;
 }
 
 #endif
@@ -3794,6 +3762,8 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
        unsigned long flags;
        int err;
 
+       BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
        cfg = irq_cfg(irq);
 
        err = assign_irq_vector(irq, cfg, eligible_cpu);
@@ -3807,15 +3777,13 @@ int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
 
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-       BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
-       entry->vector = cfg->vector;
-       entry->delivery_mode = apic->irq_delivery_mode;
-       entry->dest_mode = apic->irq_dest_mode;
-       entry->polarity = 0;
-       entry->trigger = 0;
-       entry->mask = 0;
-       entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
+       entry->vector           = cfg->vector;
+       entry->delivery_mode    = apic->irq_delivery_mode;
+       entry->dest_mode        = apic->irq_dest_mode;
+       entry->polarity         = 0;
+       entry->trigger          = 0;
+       entry->mask             = 0;
+       entry->dest             = apic->cpu_mask_to_apicid(eligible_cpu);
 
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
        uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
@@ -3833,10 +3801,10 @@ void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
        struct uv_IO_APIC_route_entry *entry;
        int mmr_pnode;
 
+       BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
+
        mmr_value = 0;
        entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
-       BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
-
        entry->mask = 1;
 
        mmr_pnode = uv_blade_to_pnode(mmr_blade);
@@ -3900,6 +3868,71 @@ int __init arch_probe_nr_irqs(void)
 }
 #endif
 
+static int __io_apic_set_pci_routing(struct device *dev, int irq,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       struct irq_desc *desc;
+       struct irq_cfg *cfg;
+       int node;
+       int ioapic, pin;
+       int trigger, polarity;
+
+       ioapic = irq_attr->ioapic;
+       if (!IO_APIC_IRQ(irq)) {
+               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
+                       ioapic);
+               return -EINVAL;
+       }
+
+       if (dev)
+               node = dev_to_node(dev);
+       else
+               node = cpu_to_node(boot_cpu_id);
+
+       desc = irq_to_desc_alloc_node(irq, node);
+       if (!desc) {
+               printk(KERN_INFO "can not get irq_desc %d\n", irq);
+               return 0;
+       }
+
+       pin = irq_attr->ioapic_pin;
+       trigger = irq_attr->trigger;
+       polarity = irq_attr->polarity;
+
+       /*
+        * IRQs < 16 are already in the irq_2_pin[] map
+        */
+       if (irq >= NR_IRQS_LEGACY) {
+               cfg = desc->chip_data;
+               add_pin_to_irq_node(cfg, node, ioapic, pin);
+       }
+
+       setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
+
+       return 0;
+}
+
+int io_apic_set_pci_routing(struct device *dev, int irq,
+                               struct io_apic_irq_attr *irq_attr)
+{
+       int ioapic, pin;
+       /*
+        * Avoid pin reprogramming.  PRTs typically include entries
+        * with redundant pin->gsi mappings (but unique PCI devices);
+        * we only program the IOAPIC on the first.
+        */
+       ioapic = irq_attr->ioapic;
+       pin = irq_attr->ioapic_pin;
+       if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
+               pr_debug("Pin %d-%d already programmed\n",
+                        mp_ioapics[ioapic].apicid, pin);
+               return 0;
+       }
+       set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
+
+       return __io_apic_set_pci_routing(dev, irq, irq_attr);
+}
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
@@ -3980,6 +4013,7 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
 
        return apic_id;
 }
+#endif
 
 int __init io_apic_get_version(int ioapic)
 {
@@ -3992,39 +4026,6 @@ int __init io_apic_get_version(int ioapic)
 
        return reg_01.bits.version;
 }
-#endif
-
-int io_apic_set_pci_routing (int ioapic, int pin, int irq, int triggering, int polarity)
-{
-       struct irq_desc *desc;
-       struct irq_cfg *cfg;
-       int cpu = boot_cpu_id;
-
-       if (!IO_APIC_IRQ(irq)) {
-               apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
-                       ioapic);
-               return -EINVAL;
-       }
-
-       desc = irq_to_desc_alloc_cpu(irq, cpu);
-       if (!desc) {
-               printk(KERN_INFO "can not get irq_desc %d\n", irq);
-               return 0;
-       }
-
-       /*
-        * IRQs < 16 are already in the irq_2_pin[] map
-        */
-       if (irq >= NR_IRQS_LEGACY) {
-               cfg = desc->chip_data;
-               add_pin_to_irq_cpu(cfg, cpu, ioapic, pin);
-       }
-
-       setup_IO_APIC_irq(ioapic, pin, irq, desc, triggering, polarity);
-
-       return 0;
-}
-
 
 int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 {
@@ -4055,51 +4056,44 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
 #ifdef CONFIG_SMP
 void __init setup_ioapic_dest(void)
 {
-       int pin, ioapic, irq, irq_entry;
+       int pin, ioapic = 0, irq, irq_entry;
        struct irq_desc *desc;
-       struct irq_cfg *cfg;
        const struct cpumask *mask;
 
        if (skip_ioapic_setup == 1)
                return;
 
-       for (ioapic = 0; ioapic < nr_ioapics; ioapic++) {
-               for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
-                       irq_entry = find_irq_entry(ioapic, pin, mp_INT);
-                       if (irq_entry == -1)
-                               continue;
-                       irq = pin_2_irq(irq_entry, ioapic, pin);
-
-                       /* setup_IO_APIC_irqs could fail to get vector for some device
-                        * when you have too many devices, because at that time only boot
-                        * cpu is online.
-                        */
-                       desc = irq_to_desc(irq);
-                       cfg = desc->chip_data;
-                       if (!cfg->vector) {
-                               setup_IO_APIC_irq(ioapic, pin, irq, desc,
-                                                 irq_trigger(irq_entry),
-                                                 irq_polarity(irq_entry));
-                               continue;
+#ifdef CONFIG_ACPI
+       if (!acpi_disabled && acpi_ioapic) {
+               ioapic = mp_find_ioapic(0);
+               if (ioapic < 0)
+                       ioapic = 0;
+       }
+#endif
 
-                       }
+       for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
+               irq_entry = find_irq_entry(ioapic, pin, mp_INT);
+               if (irq_entry == -1)
+                       continue;
+               irq = pin_2_irq(irq_entry, ioapic, pin);
 
-                       /*
-                        * Honour affinities which have been set in early boot
-                        */
-                       if (desc->status &
-                           (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-                               mask = desc->affinity;
-                       else
-                               mask = apic->target_cpus();
+               desc = irq_to_desc(irq);
 
-                       if (intr_remapping_enabled)
-                               set_ir_ioapic_affinity_irq_desc(desc, mask);
-                       else
-                               set_ioapic_affinity_irq_desc(desc, mask);
-               }
+               /*
+                * Honour affinities which have been set in early boot
+                */
+               if (desc->status &
+                   (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
+                       mask = desc->affinity;
+               else
+                       mask = apic->target_cpus();
 
+               if (intr_remapping_enabled)
+                       set_ir_ioapic_affinity_irq_desc(desc, mask);
+               else
+                       set_ioapic_affinity_irq_desc(desc, mask);
        }
+
 }
 #endif
 
index ce4fbfa315a16ac1f4e45f34281121577f5477a6..a691302dc3ffa4e5e4771df1fedf17919d7133c6 100644 (file)
@@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data)
 }
 #endif
 
-static void report_broken_nmi(int cpu, int *prev_nmi_count)
+static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
 {
        printk(KERN_CONT "\n");
 
index 01eda2ac65e4d9620b19979a0a24ef4636cae889..440a8bccd91ad7ac8ae326f48fb30cab6e007b20 100644 (file)
@@ -160,7 +160,6 @@ extern struct apic apic_summit;
 extern struct apic apic_bigsmp;
 extern struct apic apic_es7000;
 extern struct apic apic_es7000_cluster;
-extern struct apic apic_default;
 
 struct apic *apic = &apic_default;
 EXPORT_SYMBOL_GPL(apic);
index 1783652bb0e56c8140e411a4ba5643a64ab4d0fc..bc3e880f9b82e76902b305d10920b70b05c440ac 100644 (file)
@@ -50,7 +50,7 @@ static struct apic *apic_probe[] __initdata = {
 void __init default_setup_apic_routing(void)
 {
 #ifdef CONFIG_X86_X2APIC
-       if (x2apic && (apic != &apic_x2apic_phys &&
+       if (x2apic_mode && (apic != &apic_x2apic_phys &&
 #ifdef CONFIG_X86_UV
                       apic != &apic_x2apic_uv_x &&
 #endif
index 9cfe1f415d81f3659f1f68b8f88dcb4365be3ff7..344eee4ac0a48242d64db5676ec38fbe92d42c73 100644 (file)
@@ -173,13 +173,6 @@ static inline int is_WPEG(struct rio_detail *rio){
                rio->type == LookOutAWPEG || rio->type == LookOutBWPEG);
 }
 
-
-/* In clustered mode, the high nibble of APIC ID is a cluster number.
- * The low nibble is a 4-bit bitmap. */
-#define XAPIC_DEST_CPUS_SHIFT  4
-#define XAPIC_DEST_CPUS_MASK   ((1u << XAPIC_DEST_CPUS_SHIFT) - 1)
-#define XAPIC_DEST_CLUSTER_MASK        (XAPIC_DEST_CPUS_MASK << XAPIC_DEST_CPUS_SHIFT)
-
 #define SUMMIT_APIC_DFR_VALUE  (APIC_DFR_CLUSTER)
 
 static const struct cpumask *summit_target_cpus(void)
index 4a903e2f0d179d68d6a30f5afe7c1b0543a45870..8e4cbb255c38d289f78083828845da5ae43b06ac 100644 (file)
@@ -10,7 +10,7 @@
 #include <asm/apic.h>
 #include <asm/ipi.h>
 
-DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
+static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
 
 static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
 {
index 2bda693529762b71f30ae2c26fa08a7692ca5cb5..ef0ae207a7c82cb64f7a0a7758a91dc22b0f8fee 100644 (file)
@@ -105,7 +105,7 @@ static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
        cpumask_set_cpu(cpu, retmask);
 }
 
-static int uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
+static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_rip)
 {
 #ifdef CONFIG_SMP
        unsigned long val;
@@ -562,7 +562,7 @@ void __init uv_system_init(void)
        union uvh_node_id_u node_id;
        unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
        int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val;
-       int max_pnode = 0;
+       int gnode_extra, max_pnode = 0;
        unsigned long mmr_base, present, paddr;
        unsigned short pnode_mask;
 
@@ -574,6 +574,13 @@ void __init uv_system_init(void)
        mmr_base =
            uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
            ~UV_MMR_ENABLE;
+       pnode_mask = (1 << n_val) - 1;
+       node_id.v = uv_read_local_mmr(UVH_NODE_ID);
+       gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
+       gnode_upper = ((unsigned long)gnode_extra  << m_val);
+       printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n",
+                       n_val, m_val, gnode_upper, gnode_extra);
+
        printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
 
        for(i = 0; i < UVH_NODE_PRESENT_TABLE_DEPTH; i++)
@@ -583,15 +590,18 @@ void __init uv_system_init(void)
 
        bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
        uv_blade_info = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_blade_info);
 
        get_lowmem_redirect(&lowmem_redir_base, &lowmem_redir_size);
 
        bytes = sizeof(uv_node_to_blade[0]) * num_possible_nodes();
        uv_node_to_blade = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_node_to_blade);
        memset(uv_node_to_blade, 255, bytes);
 
        bytes = sizeof(uv_cpu_to_blade[0]) * num_possible_cpus();
        uv_cpu_to_blade = kmalloc(bytes, GFP_KERNEL);
+       BUG_ON(!uv_cpu_to_blade);
        memset(uv_cpu_to_blade, 255, bytes);
 
        blade = 0;
@@ -607,11 +617,6 @@ void __init uv_system_init(void)
                }
        }
 
-       pnode_mask = (1 << n_val) - 1;
-       node_id.v = uv_read_local_mmr(UVH_NODE_ID);
-       gnode_upper = (((unsigned long)node_id.s.node_id) &
-                      ~((1 << n_val) - 1)) << m_val;
-
        uv_bios_init();
        uv_bios_get_sn_info(0, &uv_type, &sn_partition_id,
                            &sn_coherency_id, &sn_region_size);
@@ -634,6 +639,7 @@ void __init uv_system_init(void)
                uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
                uv_cpu_hub_info(cpu)->gpa_mask = (1 << (m_val + n_val)) - 1;
                uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
+               uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
                uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
                uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
                uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu;
index 5a6aa1c1162f06492a27cee3ac754b16e7fa57e0..1a830cbd70153b8662eddf16a87a5336c6cb0742 100644 (file)
@@ -146,4 +146,5 @@ void foo(void)
        OFFSET(BP_loadflags, boot_params, hdr.loadflags);
        OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
        OFFSET(BP_version, boot_params, hdr.version);
+       OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 }
index e72f062fb4b5fd841617a3e7f7c766e64bf4428f..898ecc47e129e8f8e0848287a518f2af137f022b 100644 (file)
@@ -125,6 +125,7 @@ int main(void)
        OFFSET(BP_loadflags, boot_params, hdr.loadflags);
        OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
        OFFSET(BP_version, boot_params, hdr.version);
+       OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
 
        BLANK();
        DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
index 7e4a459daa644f30309f50e9ca65ece4be85fad7..e5b27d8f1b47395fda285d867b776e82454ae61a 100644 (file)
@@ -6,6 +6,7 @@
 #include <asm/processor.h>
 #include <asm/apic.h>
 #include <asm/cpu.h>
+#include <asm/pci-direct.h>
 
 #ifdef CONFIG_X86_64
 # include <asm/numa_64.h>
@@ -272,7 +273,7 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
        int cpu = smp_processor_id();
        int node;
-       unsigned apicid = hard_smp_processor_id();
+       unsigned apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
        node = c->phys_proc_id;
        if (apicid_to_node[apicid] != NUMA_NO_NODE)
@@ -351,6 +352,15 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
                    (c->x86_model == 8 && c->x86_mask >= 8))
                        set_cpu_cap(c, X86_FEATURE_K6_MTRR);
 #endif
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_PCI)
+       /* check CPU config space for extended APIC ID */
+       if (c->x86 >= 0xf) {
+               unsigned int val;
+               val = read_pci_config(0, 24, 0, 0x68);
+               if ((val & ((1 << 17) | (1 << 18))) == ((1 << 17) | (1 << 18)))
+                       set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
+       }
+#endif
 }
 
 static void __cpuinit init_amd(struct cpuinfo_x86 *c)
index 77848d9fca6833fa88488adb6e22cae027a7d6c1..b0517aa2bd3b37455c265bb415517f2a38eef20d 100644 (file)
@@ -299,7 +299,8 @@ static const char *__cpuinit table_lookup_model(struct cpuinfo_x86 *c)
        return NULL;            /* Not found */
 }
 
-__u32 cleared_cpu_caps[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_cleared[NCAPINTS] __cpuinitdata;
+__u32 cpu_caps_set[NCAPINTS] __cpuinitdata;
 
 void load_percpu_segment(int cpu)
 {
@@ -768,6 +769,12 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
        if (this_cpu->c_identify)
                this_cpu->c_identify(c);
 
+       /* Clear/Set all flags overridden by options, after probe */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
 #ifdef CONFIG_X86_64
        c->apicid = apic->phys_pkg_id(c->initial_apicid, 0);
 #endif
@@ -813,6 +820,16 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
 #endif
 
        init_hypervisor(c);
+
+       /*
+        * Clear/Set all flags overridden by options; this needs to be done
+        * before the following SMP AND of all CPUs' capabilities.
+        */
+       for (i = 0; i < NCAPINTS; i++) {
+               c->x86_capability[i] &= ~cpu_caps_cleared[i];
+               c->x86_capability[i] |= cpu_caps_set[i];
+       }
+
        /*
         * On SMP, boot_cpu_data holds the common feature set between
         * all CPUs; so make sure that we indicate which features are
@@ -825,10 +842,6 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
                        boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
        }
 
-       /* Clear all flags overriden by options */
-       for (i = 0; i < NCAPINTS; i++)
-               c->x86_capability[i] &= ~cleared_cpu_caps[i];
-
 #ifdef CONFIG_X86_MCE
        /* Init Machine Check Exception if available. */
        mcheck_init(c);
index 46e29ab96c6ac9d6765f0f1a89e7c043ed2f5cbd..6b2a52dd040398f36374926956875078c3e48f95 100644 (file)
@@ -32,9 +32,7 @@
 
 static DEFINE_PER_CPU(struct cpu_cpuX_base, cpu_arr[CPU_REG_ALL_BIT]);
 static DEFINE_PER_CPU(struct cpu_private *, priv_arr[MAX_CPU_FILES]);
-static DEFINE_PER_CPU(unsigned, cpu_modelflag);
 static DEFINE_PER_CPU(int, cpu_priv_count);
-static DEFINE_PER_CPU(unsigned, cpu_model);
 
 static DEFINE_MUTEX(cpu_debug_lock);
 
@@ -80,302 +78,102 @@ static struct cpu_file_base cpu_file[] = {
        { "value",      CPU_REG_ALL,    1       },
 };
 
-/* Intel Registers Range */
-static struct cpu_debug_range cpu_intel_range[] = {
-       { 0x00000000, 0x00000001, CPU_MC,       CPU_INTEL_ALL           },
-       { 0x00000006, 0x00000007, CPU_MONITOR,  CPU_CX_AT_XE            },
-       { 0x00000010, 0x00000010, CPU_TIME,     CPU_INTEL_ALL           },
-       { 0x00000011, 0x00000013, CPU_PMC,      CPU_INTEL_PENTIUM       },
-       { 0x00000017, 0x00000017, CPU_PLATFORM, CPU_PX_CX_AT_XE         },
-       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_P6_CX_AT_XE         },
-
-       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_PX_CX_AT_XE         },
-       { 0x0000002B, 0x0000002B, CPU_POWERON,  CPU_INTEL_XEON          },
-       { 0x0000002C, 0x0000002C, CPU_FREQ,     CPU_INTEL_XEON          },
-       { 0x0000003A, 0x0000003A, CPU_CONTROL,  CPU_CX_AT_XE            },
-
-       { 0x00000040, 0x00000043, CPU_LBRANCH,  CPU_PM_CX_AT_XE         },
-       { 0x00000044, 0x00000047, CPU_LBRANCH,  CPU_PM_CO_AT            },
-       { 0x00000060, 0x00000063, CPU_LBRANCH,  CPU_C2_AT               },
-       { 0x00000064, 0x00000067, CPU_LBRANCH,  CPU_INTEL_ATOM          },
-
-       { 0x00000079, 0x00000079, CPU_BIOS,     CPU_P6_CX_AT_XE         },
-       { 0x00000088, 0x0000008A, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x0000008B, 0x0000008B, CPU_BIOS,     CPU_P6_CX_AT_XE         },
-       { 0x0000009B, 0x0000009B, CPU_MONITOR,  CPU_INTEL_XEON          },
-
-       { 0x000000C1, 0x000000C2, CPU_PMC,      CPU_P6_CX_AT            },
-       { 0x000000CD, 0x000000CD, CPU_FREQ,     CPU_CX_AT               },
-       { 0x000000E7, 0x000000E8, CPU_PERF,     CPU_CX_AT               },
-       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_P6_CX_XE            },
-
-       { 0x00000116, 0x00000116, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x00000118, 0x00000118, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x00000119, 0x00000119, CPU_CACHE,    CPU_INTEL_PX            },
-       { 0x0000011A, 0x0000011B, CPU_CACHE,    CPU_INTEL_P6            },
-       { 0x0000011E, 0x0000011E, CPU_CACHE,    CPU_PX_CX_AT            },
-
-       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_P6_CX_AT_XE         },
-       { 0x00000179, 0x0000017A, CPU_MC,       CPU_PX_CX_AT_XE         },
-       { 0x0000017B, 0x0000017B, CPU_MC,       CPU_P6_XE               },
-       { 0x00000186, 0x00000187, CPU_PMC,      CPU_P6_CX_AT            },
-       { 0x00000198, 0x00000199, CPU_PERF,     CPU_PM_CX_AT_XE         },
-       { 0x0000019A, 0x0000019A, CPU_TIME,     CPU_PM_CX_AT_XE         },
-       { 0x0000019B, 0x0000019D, CPU_THERM,    CPU_PM_CX_AT_XE         },
-       { 0x000001A0, 0x000001A0, CPU_MISC,     CPU_PM_CX_AT_XE         },
-
-       { 0x000001C9, 0x000001C9, CPU_LBRANCH,  CPU_PM_CX_AT            },
-       { 0x000001D7, 0x000001D8, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_CX_AT_XE            },
-       { 0x000001DA, 0x000001DA, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000001DB, 0x000001DB, CPU_LBRANCH,  CPU_P6_XE               },
-       { 0x000001DC, 0x000001DC, CPU_LBRANCH,  CPU_INTEL_P6            },
-       { 0x000001DD, 0x000001DE, CPU_LBRANCH,  CPU_PX_CX_AT_XE         },
-       { 0x000001E0, 0x000001E0, CPU_LBRANCH,  CPU_INTEL_P6            },
-
-       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_P6_CX_XE            },
-       { 0x00000277, 0x00000277, CPU_PAT,      CPU_C2_AT_XE            },
-       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_P6_CX_XE            },
-
-       { 0x00000300, 0x00000308, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x00000309, 0x0000030B, CPU_PMC,      CPU_C2_AT_XE            },
-       { 0x0000030C, 0x00000311, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x00000345, 0x00000345, CPU_PMC,      CPU_C2_AT               },
-       { 0x00000360, 0x00000371, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x0000038D, 0x00000390, CPU_PMC,      CPU_C2_AT               },
-       { 0x000003A0, 0x000003BE, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003C0, 0x000003CD, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003E0, 0x000003E1, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003F0, 0x000003F0, CPU_PMC,      CPU_INTEL_XEON          },
-       { 0x000003F1, 0x000003F1, CPU_PMC,      CPU_C2_AT_XE            },
-       { 0x000003F2, 0x000003F2, CPU_PMC,      CPU_INTEL_XEON          },
-
-       { 0x00000400, 0x00000402, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000403, 0x00000403, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000404, 0x00000406, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000407, 0x00000407, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000408, 0x0000040A, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x0000040B, 0x0000040B, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x0000040C, 0x0000040E, CPU_MC,       CPU_PM_CX_XE            },
-       { 0x0000040F, 0x0000040F, CPU_MC,       CPU_INTEL_XEON          },
-       { 0x00000410, 0x00000412, CPU_MC,       CPU_PM_CX_AT_XE         },
-       { 0x00000413, 0x00000417, CPU_MC,       CPU_CX_AT_XE            },
-       { 0x00000480, 0x0000048B, CPU_VMX,      CPU_CX_AT_XE            },
-
-       { 0x00000600, 0x00000600, CPU_DEBUG,    CPU_PM_CX_AT_XE         },
-       { 0x00000680, 0x0000068F, CPU_LBRANCH,  CPU_INTEL_XEON          },
-       { 0x000006C0, 0x000006CF, CPU_LBRANCH,  CPU_INTEL_XEON          },
-
-       { 0x000107CC, 0x000107D3, CPU_PMC,      CPU_INTEL_XEON_MP       },
-
-       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_INTEL_XEON          },
-       { 0xC0000081, 0xC0000082, CPU_CALL,     CPU_INTEL_XEON          },
-       { 0xC0000084, 0xC0000084, CPU_CALL,     CPU_INTEL_XEON          },
-       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_INTEL_XEON          },
+/* CPU Registers Range */
+static struct cpu_debug_range cpu_reg_range[] = {
+       { 0x00000000, 0x00000001, CPU_MC,       },
+       { 0x00000006, 0x00000007, CPU_MONITOR,  },
+       { 0x00000010, 0x00000010, CPU_TIME,     },
+       { 0x00000011, 0x00000013, CPU_PMC,      },
+       { 0x00000017, 0x00000017, CPU_PLATFORM, },
+       { 0x0000001B, 0x0000001B, CPU_APIC,     },
+       { 0x0000002A, 0x0000002B, CPU_POWERON,  },
+       { 0x0000002C, 0x0000002C, CPU_FREQ,     },
+       { 0x0000003A, 0x0000003A, CPU_CONTROL,  },
+       { 0x00000040, 0x00000047, CPU_LBRANCH,  },
+       { 0x00000060, 0x00000067, CPU_LBRANCH,  },
+       { 0x00000079, 0x00000079, CPU_BIOS,     },
+       { 0x00000088, 0x0000008A, CPU_CACHE,    },
+       { 0x0000008B, 0x0000008B, CPU_BIOS,     },
+       { 0x0000009B, 0x0000009B, CPU_MONITOR,  },
+       { 0x000000C1, 0x000000C4, CPU_PMC,      },
+       { 0x000000CD, 0x000000CD, CPU_FREQ,     },
+       { 0x000000E7, 0x000000E8, CPU_PERF,     },
+       { 0x000000FE, 0x000000FE, CPU_MTRR,     },
+
+       { 0x00000116, 0x0000011E, CPU_CACHE,    },
+       { 0x00000174, 0x00000176, CPU_SYSENTER, },
+       { 0x00000179, 0x0000017B, CPU_MC,       },
+       { 0x00000186, 0x00000189, CPU_PMC,      },
+       { 0x00000198, 0x00000199, CPU_PERF,     },
+       { 0x0000019A, 0x0000019A, CPU_TIME,     },
+       { 0x0000019B, 0x0000019D, CPU_THERM,    },
+       { 0x000001A0, 0x000001A0, CPU_MISC,     },
+       { 0x000001C9, 0x000001C9, CPU_LBRANCH,  },
+       { 0x000001D7, 0x000001D8, CPU_LBRANCH,  },
+       { 0x000001D9, 0x000001D9, CPU_DEBUG,    },
+       { 0x000001DA, 0x000001E0, CPU_LBRANCH,  },
+
+       { 0x00000200, 0x0000020F, CPU_MTRR,     },
+       { 0x00000250, 0x00000250, CPU_MTRR,     },
+       { 0x00000258, 0x00000259, CPU_MTRR,     },
+       { 0x00000268, 0x0000026F, CPU_MTRR,     },
+       { 0x00000277, 0x00000277, CPU_PAT,      },
+       { 0x000002FF, 0x000002FF, CPU_MTRR,     },
+
+       { 0x00000300, 0x00000311, CPU_PMC,      },
+       { 0x00000345, 0x00000345, CPU_PMC,      },
+       { 0x00000360, 0x00000371, CPU_PMC,      },
+       { 0x0000038D, 0x00000390, CPU_PMC,      },
+       { 0x000003A0, 0x000003BE, CPU_PMC,      },
+       { 0x000003C0, 0x000003CD, CPU_PMC,      },
+       { 0x000003E0, 0x000003E1, CPU_PMC,      },
+       { 0x000003F0, 0x000003F2, CPU_PMC,      },
+
+       { 0x00000400, 0x00000417, CPU_MC,       },
+       { 0x00000480, 0x0000048B, CPU_VMX,      },
+
+       { 0x00000600, 0x00000600, CPU_DEBUG,    },
+       { 0x00000680, 0x0000068F, CPU_LBRANCH,  },
+       { 0x000006C0, 0x000006CF, CPU_LBRANCH,  },
+
+       { 0x000107CC, 0x000107D3, CPU_PMC,      },
+
+       { 0xC0000080, 0xC0000080, CPU_FEATURES, },
+       { 0xC0000081, 0xC0000084, CPU_CALL,     },
+       { 0xC0000100, 0xC0000102, CPU_BASE,     },
+       { 0xC0000103, 0xC0000103, CPU_TIME,     },
+
+       { 0xC0010000, 0xC0010007, CPU_PMC,      },
+       { 0xC0010010, 0xC0010010, CPU_CONF,     },
+       { 0xC0010015, 0xC0010015, CPU_CONF,     },
+       { 0xC0010016, 0xC001001A, CPU_MTRR,     },
+       { 0xC001001D, 0xC001001D, CPU_MTRR,     },
+       { 0xC001001F, 0xC001001F, CPU_CONF,     },
+       { 0xC0010030, 0xC0010035, CPU_BIOS,     },
+       { 0xC0010044, 0xC0010048, CPU_MC,       },
+       { 0xC0010050, 0xC0010056, CPU_SMM,      },
+       { 0xC0010058, 0xC0010058, CPU_CONF,     },
+       { 0xC0010060, 0xC0010060, CPU_CACHE,    },
+       { 0xC0010061, 0xC0010068, CPU_SMM,      },
+       { 0xC0010069, 0xC001006B, CPU_SMM,      },
+       { 0xC0010070, 0xC0010071, CPU_SMM,      },
+       { 0xC0010111, 0xC0010113, CPU_SMM,      },
+       { 0xC0010114, 0xC0010118, CPU_SVM,      },
+       { 0xC0010140, 0xC0010141, CPU_OSVM,     },
+       { 0xC0011022, 0xC0011023, CPU_CONF,     },
 };
 
-/* AMD Registers Range */
-static struct cpu_debug_range cpu_amd_range[] = {
-       { 0x00000000, 0x00000001, CPU_MC,       CPU_K10_PLUS,           },
-       { 0x00000010, 0x00000010, CPU_TIME,     CPU_K8_PLUS,            },
-       { 0x0000001B, 0x0000001B, CPU_APIC,     CPU_K8_PLUS,            },
-       { 0x0000002A, 0x0000002A, CPU_POWERON,  CPU_K7_PLUS             },
-       { 0x0000008B, 0x0000008B, CPU_VER,      CPU_K8_PLUS             },
-       { 0x000000FE, 0x000000FE, CPU_MTRR,     CPU_K8_PLUS,            },
-
-       { 0x00000174, 0x00000176, CPU_SYSENTER, CPU_K8_PLUS,            },
-       { 0x00000179, 0x0000017B, CPU_MC,       CPU_K8_PLUS,            },
-       { 0x000001D9, 0x000001D9, CPU_DEBUG,    CPU_K8_PLUS,            },
-       { 0x000001DB, 0x000001DE, CPU_LBRANCH,  CPU_K8_PLUS,            },
-
-       { 0x00000200, 0x0000020F, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000250, 0x00000250, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000258, 0x00000259, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000268, 0x0000026F, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0x00000277, 0x00000277, CPU_PAT,      CPU_K8_PLUS,            },
-       { 0x000002FF, 0x000002FF, CPU_MTRR,     CPU_K8_PLUS,            },
-
-       { 0x00000400, 0x00000413, CPU_MC,       CPU_K8_PLUS,            },
-
-       { 0xC0000080, 0xC0000080, CPU_FEATURES, CPU_AMD_ALL,            },
-       { 0xC0000081, 0xC0000084, CPU_CALL,     CPU_K8_PLUS,            },
-       { 0xC0000100, 0xC0000102, CPU_BASE,     CPU_K8_PLUS,            },
-       { 0xC0000103, 0xC0000103, CPU_TIME,     CPU_K10_PLUS,           },
-
-       { 0xC0010000, 0xC0010007, CPU_PMC,      CPU_K8_PLUS,            },
-       { 0xC0010010, 0xC0010010, CPU_CONF,     CPU_K7_PLUS,            },
-       { 0xC0010015, 0xC0010015, CPU_CONF,     CPU_K7_PLUS,            },
-       { 0xC0010016, 0xC001001A, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0xC001001D, 0xC001001D, CPU_MTRR,     CPU_K8_PLUS,            },
-       { 0xC001001F, 0xC001001F, CPU_CONF,     CPU_K8_PLUS,            },
-       { 0xC0010030, 0xC0010035, CPU_BIOS,     CPU_K8_PLUS,            },
-       { 0xC0010044, 0xC0010048, CPU_MC,       CPU_K8_PLUS,            },
-       { 0xC0010050, 0xC0010056, CPU_SMM,      CPU_K0F_PLUS,           },
-       { 0xC0010058, 0xC0010058, CPU_CONF,     CPU_K10_PLUS,           },
-       { 0xC0010060, 0xC0010060, CPU_CACHE,    CPU_AMD_11,             },
-       { 0xC0010061, 0xC0010068, CPU_SMM,      CPU_K10_PLUS,           },
-       { 0xC0010069, 0xC001006B, CPU_SMM,      CPU_AMD_11,             },
-       { 0xC0010070, 0xC0010071, CPU_SMM,      CPU_K10_PLUS,           },
-       { 0xC0010111, 0xC0010113, CPU_SMM,      CPU_K8_PLUS,            },
-       { 0xC0010114, 0xC0010118, CPU_SVM,      CPU_K10_PLUS,           },
-       { 0xC0010140, 0xC0010141, CPU_OSVM,     CPU_K10_PLUS,           },
-       { 0xC0011022, 0xC0011023, CPU_CONF,     CPU_K10_PLUS,           },
-};
-
-
-/* Intel */
-static int get_intel_modelflag(unsigned model)
-{
-       int flag;
-
-       switch (model) {
-       case 0x0501:
-       case 0x0502:
-       case 0x0504:
-               flag = CPU_INTEL_PENTIUM;
-               break;
-       case 0x0601:
-       case 0x0603:
-       case 0x0605:
-       case 0x0607:
-       case 0x0608:
-       case 0x060A:
-       case 0x060B:
-               flag = CPU_INTEL_P6;
-               break;
-       case 0x0609:
-       case 0x060D:
-               flag = CPU_INTEL_PENTIUM_M;
-               break;
-       case 0x060E:
-               flag = CPU_INTEL_CORE;
-               break;
-       case 0x060F:
-       case 0x0617:
-               flag = CPU_INTEL_CORE2;
-               break;
-       case 0x061C:
-               flag = CPU_INTEL_ATOM;
-               break;
-       case 0x0F00:
-       case 0x0F01:
-       case 0x0F02:
-       case 0x0F03:
-       case 0x0F04:
-               flag = CPU_INTEL_XEON_P4;
-               break;
-       case 0x0F06:
-               flag = CPU_INTEL_XEON_MP;
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-/* AMD */
-static int get_amd_modelflag(unsigned model)
-{
-       int flag;
-
-       switch (model >> 8) {
-       case 0x6:
-               flag = CPU_AMD_K6;
-               break;
-       case 0x7:
-               flag = CPU_AMD_K7;
-               break;
-       case 0x8:
-               flag = CPU_AMD_K8;
-               break;
-       case 0xf:
-               flag = CPU_AMD_0F;
-               break;
-       case 0x10:
-               flag = CPU_AMD_10;
-               break;
-       case 0x11:
-               flag = CPU_AMD_11;
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-static int get_cpu_modelflag(unsigned cpu)
-{
-       int flag;
-
-       flag = per_cpu(cpu_model, cpu);
-
-       switch (flag >> 16) {
-       case X86_VENDOR_INTEL:
-               flag = get_intel_modelflag(flag);
-               break;
-       case X86_VENDOR_AMD:
-               flag = get_amd_modelflag(flag & 0xffff);
-               break;
-       default:
-               flag = CPU_NONE;
-               break;
-       }
-
-       return flag;
-}
-
-static int get_cpu_range_count(unsigned cpu)
-{
-       int index;
-
-       switch (per_cpu(cpu_model, cpu) >> 16) {
-       case X86_VENDOR_INTEL:
-               index = ARRAY_SIZE(cpu_intel_range);
-               break;
-       case X86_VENDOR_AMD:
-               index = ARRAY_SIZE(cpu_amd_range);
-               break;
-       default:
-               index = 0;
-               break;
-       }
-
-       return index;
-}
-
 static int is_typeflag_valid(unsigned cpu, unsigned flag)
 {
-       unsigned vendor, modelflag;
-       int i, index;
+       int i;
 
        /* Standard Registers should be always valid */
        if (flag >= CPU_TSS)
                return 1;
 
-       modelflag = per_cpu(cpu_modelflag, cpu);
-       vendor = per_cpu(cpu_model, cpu) >> 16;
-       index = get_cpu_range_count(cpu);
-
-       for (i = 0; i < index; i++) {
-               switch (vendor) {
-               case X86_VENDOR_INTEL:
-                       if ((cpu_intel_range[i].model & modelflag) &&
-                           (cpu_intel_range[i].flag & flag))
-                               return 1;
-                       break;
-               case X86_VENDOR_AMD:
-                       if ((cpu_amd_range[i].model & modelflag) &&
-                           (cpu_amd_range[i].flag & flag))
-                               return 1;
-                       break;
-               }
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
+               if (cpu_reg_range[i].flag == flag)
+                       return 1;
        }
 
        /* Invalid */
@@ -385,26 +183,11 @@ static int is_typeflag_valid(unsigned cpu, unsigned flag)
 static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
                              int index, unsigned flag)
 {
-       unsigned modelflag;
-
-       modelflag = per_cpu(cpu_modelflag, cpu);
-       *max = 0;
-       switch (per_cpu(cpu_model, cpu) >> 16) {
-       case X86_VENDOR_INTEL:
-               if ((cpu_intel_range[index].model & modelflag) &&
-                   (cpu_intel_range[index].flag & flag)) {
-                       *min = cpu_intel_range[index].min;
-                       *max = cpu_intel_range[index].max;
-               }
-               break;
-       case X86_VENDOR_AMD:
-               if ((cpu_amd_range[index].model & modelflag) &&
-                   (cpu_amd_range[index].flag & flag)) {
-                       *min = cpu_amd_range[index].min;
-                       *max = cpu_amd_range[index].max;
-               }
-               break;
-       }
+       if (cpu_reg_range[index].flag == flag) {
+               *min = cpu_reg_range[index].min;
+               *max = cpu_reg_range[index].max;
+       } else
+               *max = 0;
 
        return *max;
 }
@@ -434,7 +217,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
        unsigned msr, msr_min, msr_max;
        struct cpu_private *priv;
        u32 low, high;
-       int i, range;
+       int i;
 
        if (seq) {
                priv = seq->private;
@@ -446,9 +229,7 @@ static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
                }
        }
 
-       range = get_cpu_range_count(cpu);
-
-       for (i = 0; i < range; i++) {
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
                if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
                        continue;
 
@@ -588,8 +369,20 @@ static void print_apic(void *arg)
        seq_printf(seq, " TMICT\t\t: %08x\n",  apic_read(APIC_TMICT));
        seq_printf(seq, " TMCCT\t\t: %08x\n",  apic_read(APIC_TMCCT));
        seq_printf(seq, " TDCR\t\t: %08x\n",  apic_read(APIC_TDCR));
-#endif /* CONFIG_X86_LOCAL_APIC */
+       if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
+               unsigned int i, v, maxeilvt;
+
+               v = apic_read(APIC_EFEAT);
+               maxeilvt = (v >> 16) & 0xff;
+               seq_printf(seq, " EFEAT\t\t: %08x\n", v);
+               seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
 
+               for (i = 0; i < maxeilvt; i++) {
+                       v = apic_read(APIC_EILVTn(i));
+                       seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
+               }
+       }
+#endif /* CONFIG_X86_LOCAL_APIC */
        seq_printf(seq, "\n MSR\t:\n");
 }
 
@@ -788,13 +581,11 @@ static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
 {
        struct dentry *cpu_dentry = NULL;
        unsigned reg, reg_min, reg_max;
-       int i, range, err = 0;
+       int i, err = 0;
        char reg_dir[12];
        u32 low, high;
 
-       range = get_cpu_range_count(cpu);
-
-       for (i = 0; i < range; i++) {
+       for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
                if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
                                   cpu_base[type].flag))
                        continue;
@@ -850,10 +641,6 @@ static int cpu_init_cpu(void)
                cpui = &cpu_data(cpu);
                if (!cpu_has(cpui, X86_FEATURE_MSR))
                        continue;
-               per_cpu(cpu_model, cpu) = ((cpui->x86_vendor << 16) |
-                                          (cpui->x86 << 8) |
-                                          (cpui->x86_model));
-               per_cpu(cpu_modelflag, cpu) = get_cpu_modelflag(cpu);
 
                sprintf(cpu_dir, "cpu%d", cpu);
                cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
index 52c839875478571711df3d861ffc1a3d4c5eb413..f138c6c389b921628575575f6a2b0c115441f6c0 100644 (file)
@@ -220,11 +220,14 @@ config X86_LONGHAUL
          If in doubt, say N.
 
 config X86_E_POWERSAVER
-       tristate "VIA C7 Enhanced PowerSaver"
+       tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
        select CPU_FREQ_TABLE
-       depends on X86_32
+       depends on X86_32 && EXPERIMENTAL
        help
-         This adds the CPUFreq driver for VIA C7 processors.
+         This adds the CPUFreq driver for VIA C7 processors.  However, this driver
+         does not have any safeguards to prevent operating the CPU out of spec
+         and is thus considered dangerous.  Please use the regular ACPI cpufreq
+         driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
 
          If in doubt, say N.
 
index 752e8c6b2c7e29e903895a4abe541b6e186b7add..ae9b503220cafa62e818afd4bfcf3b849a86c4b3 100644 (file)
@@ -90,11 +90,7 @@ static int check_est_cpu(unsigned int cpuid)
 {
        struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
 
-       if (cpu->x86_vendor != X86_VENDOR_INTEL ||
-           !cpu_has(cpu, X86_FEATURE_EST))
-               return 0;
-
-       return 1;
+       return cpu_has(cpu, X86_FEATURE_EST);
 }
 
 static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
index 7437fa133c02dfdde137ca7201fb9734888a85bc..daed39ba2614dbcad31e4725b343d56d0912f05d 100644 (file)
@@ -229,12 +229,12 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
 }
 #endif
 
-static void __cpuinit srat_detect_node(void)
+static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
 {
 #if defined(CONFIG_NUMA) && defined(CONFIG_X86_64)
        unsigned node;
        int cpu = smp_processor_id();
-       int apicid = hard_smp_processor_id();
+       int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
 
        /* Don't do the funky fallback heuristics the AMD version employs
           for now. */
@@ -400,7 +400,7 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
        }
 
        /* Work around errata */
-       srat_detect_node();
+       srat_detect_node(c);
 
        if (cpu_has(c, X86_FEATURE_VMX))
                detect_vmx_virtcap(c);
index 483eda96e102062b23f3e29820d911a9c7d6ab59..789efe217e1ab89a8862df2387a980d2cca9a60a 100644 (file)
@@ -17,6 +17,7 @@
 
 #include <asm/processor.h>
 #include <asm/smp.h>
+#include <asm/k8.h>
 
 #define LVL_1_INST     1
 #define LVL_1_DATA     2
@@ -159,14 +160,6 @@ struct _cpuid4_info_regs {
        unsigned long can_disable;
 };
 
-#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS)
-static struct pci_device_id k8_nb_id[] = {
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) },
-       { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) },
-       {}
-};
-#endif
-
 unsigned short                 num_cache_leaves;
 
 /* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -207,10 +200,17 @@ union l3_cache {
 };
 
 static const unsigned short __cpuinitconst assocs[] = {
-       [1] = 1, [2] = 2, [4] = 4, [6] = 8,
-       [8] = 16, [0xa] = 32, [0xb] = 48,
+       [1] = 1,
+       [2] = 2,
+       [4] = 4,
+       [6] = 8,
+       [8] = 16,
+       [0xa] = 32,
+       [0xb] = 48,
        [0xc] = 64,
-       [0xf] = 0xffff // ??
+       [0xd] = 96,
+       [0xe] = 128,
+       [0xf] = 0xffff /* fully associative - no way to show this currently */
 };
 
 static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 };
@@ -271,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
        eax->split.type = types[leaf];
        eax->split.level = levels[leaf];
        if (leaf == 3)
-               eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1;
+               eax->split.num_threads_sharing =
+                       current_cpu_data.x86_max_cores - 1;
        else
                eax->split.num_threads_sharing = 0;
        eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1;
@@ -291,6 +292,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
 {
        if (index < 3)
                return;
+
+       if (boot_cpu_data.x86 == 0x11)
+               return;
+
+       /* see erratum #382 */
+       if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8))
+               return;
+
        this_leaf->can_disable = 1;
 }
 
@@ -696,97 +705,75 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
 #define to_object(k)   container_of(k, struct _index_kobject, kobj)
 #define to_attr(a)     container_of(a, struct _cache_attr, attr)
 
-#ifdef CONFIG_PCI
-static struct pci_dev *get_k8_northbridge(int node)
-{
-       struct pci_dev *dev = NULL;
-       int i;
-
-       for (i = 0; i <= node; i++) {
-               do {
-                       dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
-                       if (!dev)
-                               break;
-               } while (!pci_match_id(&k8_nb_id[0], dev));
-               if (!dev)
-                       break;
-       }
-       return dev;
-}
-#else
-static struct pci_dev *get_k8_northbridge(int node)
-{
-       return NULL;
-}
-#endif
-
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf)
+static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+                                 unsigned int index)
 {
-       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-       int node = cpu_to_node(cpumask_first(mask));
-       struct pci_dev *dev = NULL;
-       ssize_t ret = 0;
-       int i;
+       int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+       int node = cpu_to_node(cpu);
+       struct pci_dev *dev = node_to_k8_nb_misc(node);
+       unsigned int reg = 0;
 
        if (!this_leaf->can_disable)
-               return sprintf(buf, "Feature not enabled\n");
-
-       dev = get_k8_northbridge(node);
-       if (!dev) {
-               printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
                return -EINVAL;
-       }
 
-       for (i = 0; i < 2; i++) {
-               unsigned int reg;
+       if (!dev)
+               return -EINVAL;
 
-               pci_read_config_dword(dev, 0x1BC + i * 4, &reg);
+       pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
+       return sprintf(buf, "%x\n", reg);
+}
 
-               ret += sprintf(buf, "%sEntry: %d\n", buf, i);
-               ret += sprintf(buf, "%sReads:  %s\tNew Entries: %s\n",  
-                       buf,
-                       reg & 0x80000000 ? "Disabled" : "Allowed",
-                       reg & 0x40000000 ? "Disabled" : "Allowed");
-               ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n",
-                       buf, (reg & 0x30000) >> 16, reg & 0xfff);
-       }
-       return ret;
+#define SHOW_CACHE_DISABLE(index)                                      \
+static ssize_t                                                         \
+show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf)          \
+{                                                                      \
+       return show_cache_disable(this_leaf, buf, index);               \
 }
+SHOW_CACHE_DISABLE(0)
+SHOW_CACHE_DISABLE(1)
 
-static ssize_t
-store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf,
-                   size_t count)
+static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
+       const char *buf, size_t count, unsigned int index)
 {
-       const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
-       int node = cpu_to_node(cpumask_first(mask));
-       struct pci_dev *dev = NULL;
-       unsigned int ret, index, val;
+       int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+       int node = cpu_to_node(cpu);
+       struct pci_dev *dev = node_to_k8_nb_misc(node);
+       unsigned long val = 0;
+       unsigned int scrubber = 0;
 
        if (!this_leaf->can_disable)
-               return 0;
-
-       if (strlen(buf) > 15)
                return -EINVAL;
 
-       ret = sscanf(buf, "%x %x", &index, &val);
-       if (ret != 2)
+       if (!capable(CAP_SYS_ADMIN))
+               return -EPERM;
+
+       if (!dev)
                return -EINVAL;
-       if (index > 1)
+
+       if (strict_strtoul(buf, 10, &val) < 0)
                return -EINVAL;
 
        val |= 0xc0000000;
-       dev = get_k8_northbridge(node);
-       if (!dev) {
-               printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n");
-               return -EINVAL;
-       }
+
+       pci_read_config_dword(dev, 0x58, &scrubber);
+       scrubber &= ~0x1f000000;
+       pci_write_config_dword(dev, 0x58, scrubber);
 
        pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
        wbinvd();
        pci_write_config_dword(dev, 0x1BC + index * 4, val);
+       return count;
+}
 
-       return 1;
+#define STORE_CACHE_DISABLE(index)                                     \
+static ssize_t                                                         \
+store_cache_disable_##index(struct _cpuid4_info *this_leaf,            \
+                           const char *buf, size_t count)              \
+{                                                                      \
+       return store_cache_disable(this_leaf, buf, count, index);       \
 }
+STORE_CACHE_DISABLE(0)
+STORE_CACHE_DISABLE(1)
 
 struct _cache_attr {
        struct attribute attr;
@@ -808,7 +795,10 @@ define_one_ro(size);
 define_one_ro(shared_cpu_map);
 define_one_ro(shared_cpu_list);
 
-static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable);
+static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
+               show_cache_disable_0, store_cache_disable_0);
+static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
+               show_cache_disable_1, store_cache_disable_1);
 
 static struct attribute * default_attrs[] = {
        &type.attr,
@@ -820,7 +810,8 @@ static struct attribute * default_attrs[] = {
        &size.attr,
        &shared_cpu_map.attr,
        &shared_cpu_list.attr,
-       &cache_disable.attr,
+       &cache_disable_0.attr,
+       &cache_disable_1.attr,
        NULL
 };
 
index cef3ee30744b9964c9503f1ffd137c7d5c9aaf69..65a0fceedcd77ce7ed0088a5b24b0432caebd990 100644 (file)
@@ -15,7 +15,6 @@
 #include <asm/hw_irq.h>
 #include <asm/idle.h>
 #include <asm/therm_throt.h>
-#include <asm/apic.h>
 
 asmlinkage void smp_thermal_interrupt(void)
 {
index ce0fe4b5c04f6cfef4c14109124d55bd13f90cbe..1d584a18a50dab20fbe35e21e10859db05f2fe39 100644 (file)
@@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits)
 
        if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1)
                return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
+       rdmsr(MSR_MTRRdefType, def, dummy);
        def &= 0xff;
        if (def != MTRR_TYPE_UNCACHABLE)
                return 0;
@@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
         */
        if (!is_cpu(INTEL) || disable_mtrr_trim)
                return 0;
-       rdmsr(MTRRdefType_MSR, def, dummy);
+       rdmsr(MSR_MTRRdefType, def, dummy);
        def &= 0xff;
        if (def != MTRR_TYPE_UNCACHABLE)
                return 0;
index d21d4fb161f70f43e91d229e94706567fbf3954e..0543f69f0b270c4ec6a4edf9f603df9f8228b0ea 100644 (file)
@@ -20,9 +20,9 @@ struct fixed_range_block {
 };
 
 static struct fixed_range_block fixed_range_blocks[] = {
-       { MTRRfix64K_00000_MSR, 1 }, /* one  64k MTRR  */
-       { MTRRfix16K_80000_MSR, 2 }, /* two  16k MTRRs */
-       { MTRRfix4K_C0000_MSR,  8 }, /* eight 4k MTRRs */
+       { MSR_MTRRfix64K_00000, 1 }, /* one  64k MTRR  */
+       { MSR_MTRRfix16K_80000, 2 }, /* two  16k MTRRs */
+       { MSR_MTRRfix4K_C0000,  8 }, /* eight 4k MTRRs */
        {}
 };
 
@@ -194,12 +194,12 @@ get_fixed_ranges(mtrr_type * frs)
 
        k8_check_syscfg_dram_mod_en();
 
-       rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]);
+       rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]);
 
        for (i = 0; i < 2; i++)
-               rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]);
+               rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]);
        for (i = 0; i < 8; i++)
-               rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]);
+               rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]);
 }
 
 void mtrr_save_fixed_ranges(void *info)
@@ -310,7 +310,7 @@ void __init get_mtrr_state(void)
 
        vrs = mtrr_state.var_ranges;
 
-       rdmsr(MTRRcap_MSR, lo, dummy);
+       rdmsr(MSR_MTRRcap, lo, dummy);
        mtrr_state.have_fixed = (lo >> 8) & 1;
 
        for (i = 0; i < num_var_ranges; i++)
@@ -318,7 +318,7 @@ void __init get_mtrr_state(void)
        if (mtrr_state.have_fixed)
                get_fixed_ranges(mtrr_state.fixed_ranges);
 
-       rdmsr(MTRRdefType_MSR, lo, dummy);
+       rdmsr(MSR_MTRRdefType, lo, dummy);
        mtrr_state.def_type = (lo & 0xff);
        mtrr_state.enabled = (lo & 0xc00) >> 10;
 
@@ -583,10 +583,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
        __flush_tlb();
 
        /*  Save MTRR state */
-       rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+       rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
 
        /*  Disable MTRRs, and set the default type to uncached  */
-       mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi);
+       mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi);
 }
 
 static void post_set(void) __releases(set_atomicity_lock)
@@ -595,7 +595,7 @@ static void post_set(void) __releases(set_atomicity_lock)
        __flush_tlb();
 
        /* Intel (P6) standard MTRRs */
-       mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi);
+       mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi);
                
        /*  Enable caches  */
        write_cr0(read_cr0() & 0xbfffffff);
@@ -707,7 +707,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i
 static int generic_have_wrcomb(void)
 {
        unsigned long config, dummy;
-       rdmsr(MTRRcap_MSR, config, dummy);
+       rdmsr(MSR_MTRRcap, config, dummy);
        return (config & (1 << 10));
 }
 
index 03cda01f57c7125173a8b348d92105d5bcd5dd28..8fc248b5aeafe26a0f822350bf9488685b9eab4e 100644 (file)
@@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void)
        unsigned long config = 0, dummy;
 
        if (use_intel()) {
-               rdmsr(MTRRcap_MSR, config, dummy);
+               rdmsr(MSR_MTRRcap, config, dummy);
        } else if (is_cpu(AMD))
                config = 2;
        else if (is_cpu(CYRIX) || is_cpu(CENTAUR))
index 77f67f7b347a3a97a093b1094c5da86a2ca9ad35..7538b767f2060ed4727bb992f665b87e629b1bc5 100644 (file)
@@ -5,21 +5,6 @@
 #include <linux/types.h>
 #include <linux/stddef.h>
 
-#define MTRRcap_MSR     0x0fe
-#define MTRRdefType_MSR 0x2ff
-
-#define MTRRfix64K_00000_MSR 0x250
-#define MTRRfix16K_80000_MSR 0x258
-#define MTRRfix16K_A0000_MSR 0x259
-#define MTRRfix4K_C0000_MSR 0x268
-#define MTRRfix4K_C8000_MSR 0x269
-#define MTRRfix4K_D0000_MSR 0x26a
-#define MTRRfix4K_D8000_MSR 0x26b
-#define MTRRfix4K_E0000_MSR 0x26c
-#define MTRRfix4K_E8000_MSR 0x26d
-#define MTRRfix4K_F0000_MSR 0x26e
-#define MTRRfix4K_F8000_MSR 0x26f
-
 #define MTRR_CHANGE_MASK_FIXED     0x01
 #define MTRR_CHANGE_MASK_VARIABLE  0x02
 #define MTRR_CHANGE_MASK_DEFTYPE   0x04
index 7f7e2753685bce875ccd6dca8903474e366456cc..1f5fb1588d1fb4df0bbaefc4196b23a6056140aa 100644 (file)
@@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
 
                if (use_intel())
                        /*  Save MTRR state */
-                       rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+                       rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
                else
                        /* Cyrix ARRs - everything else were excluded at the top */
                        ctxt->ccr3 = getCx86(CX86_CCR3);
@@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
 {
        if (use_intel())
                /*  Disable MTRRs, and set the default type to uncached  */
-               mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL,
+               mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
                      ctxt->deftype_hi);
        else if (is_cpu(CYRIX))
                /* Cyrix ARRs - everything else were excluded at the top */
@@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt)
                /*  Restore MTRRdefType  */
                if (use_intel())
                        /* Intel (P6) standard MTRRs */
-                       mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi);
+                       mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
                else
                        /* Cyrix ARRs - everything else was excluded at the top */
                        setCx86(CX86_CCR3, ctxt->ccr3);
index 87b67e3a765ac212c2c954de3566ec487fdac8da..48bfe1386038c91fc9ee5b99473e67df5a93a432 100644 (file)
  * Markus Metzger <markus.t.metzger@intel.com>, 2007-2009
  */
 
-
-#include <asm/ds.h>
-
-#include <linux/errno.h>
+#include <linux/kernel.h>
 #include <linux/string.h>
-#include <linux/slab.h>
+#include <linux/errno.h>
 #include <linux/sched.h>
+#include <linux/slab.h>
 #include <linux/mm.h>
-#include <linux/kernel.h>
+#include <linux/trace_clock.h>
+
+#include <asm/ds.h>
 
+#include "ds_selftest.h"
 
 /*
- * The configuration for a particular DS hardware implementation.
+ * The configuration for a particular DS hardware implementation:
  */
 struct ds_configuration {
-       /* the name of the configuration */
-       const char *name;
-       /* the size of one pointer-typed field in the DS structure and
-          in the BTS and PEBS buffers in bytes;
-          this covers the first 8 DS fields related to buffer management. */
-       unsigned char  sizeof_field;
-       /* the size of a BTS/PEBS record in bytes */
-       unsigned char  sizeof_rec[2];
-       /* a series of bit-masks to control various features indexed
-        * by enum ds_feature */
-       unsigned long ctl[dsf_ctl_max];
+       /* The name of the configuration: */
+       const char              *name;
+
+       /* The size of pointer-typed fields in DS, BTS, and PEBS: */
+       unsigned char           sizeof_ptr_field;
+
+       /* The size of a BTS/PEBS record in bytes: */
+       unsigned char           sizeof_rec[2];
+
+       /* The number of pebs counter reset values in the DS structure. */
+       unsigned char           nr_counter_reset;
+
+       /* Control bit-masks indexed by enum ds_feature: */
+       unsigned long           ctl[dsf_ctl_max];
 };
-static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
+static struct ds_configuration ds_cfg __read_mostly;
+
+
+/* Maximal size of a DS configuration: */
+#define MAX_SIZEOF_DS          0x80
 
-#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id())
+/* Maximal size of a BTS record: */
+#define MAX_SIZEOF_BTS         (3 * 8)
 
-#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */
-#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */
-#define DS_ALIGNMENT (1 << 3)  /* BTS and PEBS buffer alignment */
+/* BTS and PEBS buffer alignment: */
+#define DS_ALIGNMENT           (1 << 3)
 
-#define BTS_CONTROL \
- (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\
-  ds_cfg.ctl[dsf_bts_overflow])
+/* Number of buffer pointers in DS: */
+#define NUM_DS_PTR_FIELDS      8
 
+/* Size of a pebs reset value in DS: */
+#define PEBS_RESET_FIELD_SIZE  8
+
+/* Mask of control bits in the DS MSR register: */
+#define BTS_CONTROL                              \
+       ( ds_cfg.ctl[dsf_bts]                   | \
+         ds_cfg.ctl[dsf_bts_kernel]            | \
+         ds_cfg.ctl[dsf_bts_user]              | \
+         ds_cfg.ctl[dsf_bts_overflow] )
 
 /*
  * A BTS or PEBS tracer.
@@ -66,29 +82,36 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array);
  * to identify tracers.
  */
 struct ds_tracer {
-       /* the DS context (partially) owned by this tracer */
-       struct ds_context *context;
-       /* the buffer provided on ds_request() and its size in bytes */
-       void *buffer;
-       size_t size;
+       /* The DS context (partially) owned by this tracer. */
+       struct ds_context       *context;
+       /* The buffer provided on ds_request() and its size in bytes. */
+       void                    *buffer;
+       size_t                  size;
 };
 
 struct bts_tracer {
-       /* the common DS part */
-       struct ds_tracer ds;
-       /* the trace including the DS configuration */
-       struct bts_trace trace;
-       /* buffer overflow notification function */
-       bts_ovfl_callback_t ovfl;
+       /* The common DS part: */
+       struct ds_tracer        ds;
+
+       /* The trace including the DS configuration: */
+       struct bts_trace        trace;
+
+       /* Buffer overflow notification function: */
+       bts_ovfl_callback_t     ovfl;
+
+       /* Active flags affecting trace collection. */
+       unsigned int            flags;
 };
 
 struct pebs_tracer {
-       /* the common DS part */
-       struct ds_tracer ds;
-       /* the trace including the DS configuration */
-       struct pebs_trace trace;
-       /* buffer overflow notification function */
-       pebs_ovfl_callback_t ovfl;
+       /* The common DS part: */
+       struct ds_tracer        ds;
+
+       /* The trace including the DS configuration: */
+       struct pebs_trace       trace;
+
+       /* Buffer overflow notification function: */
+       pebs_ovfl_callback_t    ovfl;
 };
 
 /*
@@ -97,6 +120,7 @@ struct pebs_tracer {
  *
  * The DS configuration consists of the following fields; different
  * architetures vary in the size of those fields.
+ *
  * - double-word aligned base linear address of the BTS buffer
  * - write pointer into the BTS buffer
  * - end linear address of the BTS buffer (one byte beyond the end of
@@ -135,21 +159,22 @@ enum ds_field {
 };
 
 enum ds_qualifier {
-       ds_bts  = 0,
+       ds_bts = 0,
        ds_pebs
 };
 
-static inline unsigned long ds_get(const unsigned char *base,
-                                  enum ds_qualifier qual, enum ds_field field)
+static inline unsigned long
+ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field)
 {
-       base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+       base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
        return *(unsigned long *)base;
 }
 
-static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
-                         enum ds_field field, unsigned long value)
+static inline void
+ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field,
+       unsigned long value)
 {
-       base += (ds_cfg.sizeof_field * (field + (4 * qual)));
+       base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual)));
        (*(unsigned long *)base) = value;
 }
 
@@ -159,7 +184,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual,
  */
 static DEFINE_SPINLOCK(ds_lock);
 
-
 /*
  * We either support (system-wide) per-cpu or per-thread allocation.
  * We distinguish the two based on the task_struct pointer, where a
@@ -178,12 +202,28 @@ static DEFINE_SPINLOCK(ds_lock);
  */
 static atomic_t tracers = ATOMIC_INIT(0);
 
-static inline void get_tracer(struct task_struct *task)
+static inline int get_tracer(struct task_struct *task)
 {
-       if (task)
+       int error;
+
+       spin_lock_irq(&ds_lock);
+
+       if (task) {
+               error = -EPERM;
+               if (atomic_read(&tracers) < 0)
+                       goto out;
                atomic_inc(&tracers);
-       else
+       } else {
+               error = -EPERM;
+               if (atomic_read(&tracers) > 0)
+                       goto out;
                atomic_dec(&tracers);
+       }
+
+       error = 0;
+out:
+       spin_unlock_irq(&ds_lock);
+       return error;
 }
 
 static inline void put_tracer(struct task_struct *task)
@@ -194,14 +234,6 @@ static inline void put_tracer(struct task_struct *task)
                atomic_inc(&tracers);
 }
 
-static inline int check_tracer(struct task_struct *task)
-{
-       return task ?
-               (atomic_read(&tracers) >= 0) :
-               (atomic_read(&tracers) <= 0);
-}
-
-
 /*
  * The DS context is either attached to a thread or to a cpu:
  * - in the former case, the thread_struct contains a pointer to the
@@ -213,61 +245,58 @@ static inline int check_tracer(struct task_struct *task)
  * deallocated when the last user puts the context.
  */
 struct ds_context {
-       /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */
-       unsigned char ds[MAX_SIZEOF_DS];
-       /* the owner of the BTS and PEBS configuration, respectively */
-       struct bts_tracer *bts_master;
-       struct pebs_tracer *pebs_master;
-       /* use count */
-       unsigned long count;
-       /* a pointer to the context location inside the thread_struct
-        * or the per_cpu context array */
-       struct ds_context **this;
-       /* a pointer to the task owning this context, or NULL, if the
-        * context is owned by a cpu */
-       struct task_struct *task;
-};
+       /* The DS configuration; goes into MSR_IA32_DS_AREA: */
+       unsigned char           ds[MAX_SIZEOF_DS];
+
+       /* The owner of the BTS and PEBS configuration, respectively: */
+       struct bts_tracer       *bts_master;
+       struct pebs_tracer      *pebs_master;
 
-static DEFINE_PER_CPU(struct ds_context *, system_context_array);
+       /* Use count: */
+       unsigned long           count;
 
-#define system_context per_cpu(system_context_array, smp_processor_id())
+       /* Pointer to the context pointer field: */
+       struct ds_context       **this;
+
+       /* The traced task; NULL for cpu tracing: */
+       struct task_struct      *task;
+
+       /* The traced cpu; only valid if task is NULL: */
+       int                     cpu;
+};
 
+static DEFINE_PER_CPU(struct ds_context *, cpu_context);
 
-static inline struct ds_context *ds_get_context(struct task_struct *task)
+
+static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
 {
        struct ds_context **p_context =
-               (task ? &task->thread.ds_ctx : &system_context);
+               (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu));
        struct ds_context *context = NULL;
        struct ds_context *new_context = NULL;
-       unsigned long irq;
 
        /* Chances are small that we already have a context. */
        new_context = kzalloc(sizeof(*new_context), GFP_KERNEL);
        if (!new_context)
                return NULL;
 
-       spin_lock_irqsave(&ds_lock, irq);
+       spin_lock_irq(&ds_lock);
 
        context = *p_context;
-       if (!context) {
+       if (likely(!context)) {
                context = new_context;
 
                context->this = p_context;
                context->task = task;
+               context->cpu = cpu;
                context->count = 0;
 
-               if (task)
-                       set_tsk_thread_flag(task, TIF_DS_AREA_MSR);
-
-               if (!task || (task == current))
-                       wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds);
-
                *p_context = context;
        }
 
        context->count++;
 
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
 
        if (context != new_context)
                kfree(new_context);
@@ -275,8 +304,9 @@ static inline struct ds_context *ds_get_context(struct task_struct *task)
        return context;
 }
 
-static inline void ds_put_context(struct ds_context *context)
+static void ds_put_context(struct ds_context *context)
 {
+       struct task_struct *task;
        unsigned long irq;
 
        if (!context)
@@ -291,17 +321,55 @@ static inline void ds_put_context(struct ds_context *context)
 
        *(context->this) = NULL;
 
-       if (context->task)
-               clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+       task = context->task;
+
+       if (task)
+               clear_tsk_thread_flag(task, TIF_DS_AREA_MSR);
 
-       if (!context->task || (context->task == current))
-               wrmsrl(MSR_IA32_DS_AREA, 0);
+       /*
+        * We leave the (now dangling) pointer to the DS configuration in
+        * the DS_AREA msr. This is as good or as bad as replacing it with
+        * NULL - the hardware would crash if we enabled tracing.
+        *
+        * This saves us some problems with having to write an msr on a
+        * different cpu while preventing others from doing the same for the
+        * next context for that same cpu.
+        */
 
        spin_unlock_irqrestore(&ds_lock, irq);
 
+       /* The context might still be in use for context switching. */
+       if (task && (task != current))
+               wait_task_context_switch(task);
+
        kfree(context);
 }
 
+static void ds_install_ds_area(struct ds_context *context)
+{
+       unsigned long ds;
+
+       ds = (unsigned long)context->ds;
+
+       /*
+        * There is a race between the bts master and the pebs master.
+        *
+        * The thread/cpu access is synchronized via get/put_cpu() for
+        * task tracing and via wrmsr_on_cpu for cpu tracing.
+        *
+        * If bts and pebs are collected for the same task or same cpu,
+        * the same configuration is written twice.
+        */
+       if (context->task) {
+               get_cpu();
+               if (context->task == current)
+                       wrmsrl(MSR_IA32_DS_AREA, ds);
+               set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR);
+               put_cpu();
+       } else
+               wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA,
+                            (u32)((u64)ds), (u32)((u64)ds >> 32));
+}
 
 /*
  * Call the tracer's callback on a buffer overflow.
@@ -332,9 +400,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual)
  * The remainder of any partially written record is zeroed out.
  *
  * context: the DS context
- * qual: the buffer type
- * record: the data to write
- * size: the size of the data
+ * qual:    the buffer type
+ * record:  the data to write
+ * size:    the size of the data
  */
 static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                    const void *record, size_t size)
@@ -349,14 +417,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                unsigned long write_size, adj_write_size;
 
                /*
-                * write as much as possible without producing an
+                * Write as much as possible without producing an
                 * overflow interrupt.
                 *
-                * interrupt_threshold must either be
+                * Interrupt_threshold must either be
                 * - bigger than absolute_maximum or
                 * - point to a record between buffer_base and absolute_maximum
                 *
-                * index points to a valid record.
+                * Index points to a valid record.
                 */
                base   = ds_get(context->ds, qual, ds_buffer_base);
                index  = ds_get(context->ds, qual, ds_index);
@@ -365,8 +433,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
 
                write_end = min(end, int_th);
 
-               /* if we are already beyond the interrupt threshold,
-                * we fill the entire buffer */
+               /*
+                * If we are already beyond the interrupt threshold,
+                * we fill the entire buffer.
+                */
                if (write_end <= index)
                        write_end = end;
 
@@ -383,7 +453,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
                adj_write_size = write_size / ds_cfg.sizeof_rec[qual];
                adj_write_size *= ds_cfg.sizeof_rec[qual];
 
-               /* zero out trailing bytes */
+               /* Zero out trailing bytes. */
                memset((char *)index + write_size, 0,
                       adj_write_size - write_size);
                index += adj_write_size;
@@ -410,7 +480,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual,
  * Later architectures use 64bit pointers throughout, whereas earlier
  * architectures use 32bit pointers in 32bit mode.
  *
- * We compute the base address for the first 8 fields based on:
+ * We compute the base address for the fields based on:
  * - the field size stored in the DS configuration
  * - the relative field position
  *
@@ -431,23 +501,23 @@ enum bts_field {
        bts_to,
        bts_flags,
 
-       bts_qual = bts_from,
-       bts_jiffies = bts_to,
-       bts_pid = bts_flags,
+       bts_qual                = bts_from,
+       bts_clock               = bts_to,
+       bts_pid                 = bts_flags,
 
-       bts_qual_mask = (bts_qual_max - 1),
-       bts_escape = ((unsigned long)-1 & ~bts_qual_mask)
+       bts_qual_mask           = (bts_qual_max - 1),
+       bts_escape              = ((unsigned long)-1 & ~bts_qual_mask)
 };
 
 static inline unsigned long bts_get(const char *base, enum bts_field field)
 {
-       base += (ds_cfg.sizeof_field * field);
+       base += (ds_cfg.sizeof_ptr_field * field);
        return *(unsigned long *)base;
 }
 
 static inline void bts_set(char *base, enum bts_field field, unsigned long val)
 {
-       base += (ds_cfg.sizeof_field * field);;
+       base += (ds_cfg.sizeof_ptr_field * field);;
        (*(unsigned long *)base) = val;
 }
 
@@ -463,8 +533,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val)
  *
  * return: bytes read/written on success; -Eerrno, otherwise
  */
-static int bts_read(struct bts_tracer *tracer, const void *at,
-                   struct bts_struct *out)
+static int
+bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out)
 {
        if (!tracer)
                return -EINVAL;
@@ -478,8 +548,8 @@ static int bts_read(struct bts_tracer *tracer, const void *at,
        memset(out, 0, sizeof(*out));
        if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) {
                out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask);
-               out->variant.timestamp.jiffies = bts_get(at, bts_jiffies);
-               out->variant.timestamp.pid = bts_get(at, bts_pid);
+               out->variant.event.clock = bts_get(at, bts_clock);
+               out->variant.event.pid = bts_get(at, bts_pid);
        } else {
                out->qualifier = bts_branch;
                out->variant.lbr.from = bts_get(at, bts_from);
@@ -516,8 +586,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in)
        case bts_task_arrives:
        case bts_task_departs:
                bts_set(raw, bts_qual, (bts_escape | in->qualifier));
-               bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies);
-               bts_set(raw, bts_pid, in->variant.timestamp.pid);
+               bts_set(raw, bts_clock, in->variant.event.clock);
+               bts_set(raw, bts_pid, in->variant.event.pid);
                break;
        default:
                return -EINVAL;
@@ -555,7 +625,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
                             unsigned int flags) {
        unsigned long buffer, adj;
 
-       /* adjust the buffer address and size to meet alignment
+       /*
+        * Adjust the buffer address and size to meet alignment
         * constraints:
         * - buffer is double-word aligned
         * - size is multiple of record size
@@ -577,9 +648,11 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
        trace->begin = (void *)buffer;
        trace->top = trace->begin;
        trace->end = (void *)(buffer + size);
-       /* The value for 'no threshold' is -1, which will set the
+       /*
+        * The value for 'no threshold' is -1, which will set the
         * threshold outside of the buffer, just like we want it.
         */
+       ith *= ds_cfg.sizeof_rec[qual];
        trace->ith = (void *)(buffer + size - ith);
 
        trace->flags = flags;
@@ -588,18 +661,27 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual,
 
 static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
                      enum ds_qualifier qual, struct task_struct *task,
-                     void *base, size_t size, size_t th, unsigned int flags)
+                     int cpu, void *base, size_t size, size_t th)
 {
        struct ds_context *context;
        int error;
+       size_t req_size;
+
+       error = -EOPNOTSUPP;
+       if (!ds_cfg.sizeof_rec[qual])
+               goto out;
 
        error = -EINVAL;
        if (!base)
                goto out;
 
-       /* we require some space to do alignment adjustments below */
+       req_size = ds_cfg.sizeof_rec[qual];
+       /* We might need space for alignment adjustments. */
+       if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT))
+               req_size += DS_ALIGNMENT;
+
        error = -EINVAL;
-       if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual]))
+       if (size < req_size)
                goto out;
 
        if (th != (size_t)-1) {
@@ -614,182 +696,318 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace,
        tracer->size = size;
 
        error = -ENOMEM;
-       context = ds_get_context(task);
+       context = ds_get_context(task, cpu);
        if (!context)
                goto out;
        tracer->context = context;
 
-       ds_init_ds_trace(trace, qual, base, size, th, flags);
+       /*
+        * Defer any tracer-specific initialization work for the context until
+        * context ownership has been clarified.
+        */
 
        error = 0;
  out:
        return error;
 }
 
-struct bts_tracer *ds_request_bts(struct task_struct *task,
-                                 void *base, size_t size,
-                                 bts_ovfl_callback_t ovfl, size_t th,
-                                 unsigned int flags)
+static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu,
+                                        void *base, size_t size,
+                                        bts_ovfl_callback_t ovfl, size_t th,
+                                        unsigned int flags)
 {
        struct bts_tracer *tracer;
-       unsigned long irq;
        int error;
 
+       /* Buffer overflow notification is not yet implemented. */
        error = -EOPNOTSUPP;
-       if (!ds_cfg.ctl[dsf_bts])
+       if (ovfl)
                goto out;
 
-       /* buffer overflow notification is not yet implemented */
-       error = -EOPNOTSUPP;
-       if (ovfl)
+       error = get_tracer(task);
+       if (error < 0)
                goto out;
 
        error = -ENOMEM;
        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
        if (!tracer)
-               goto out;
+               goto out_put_tracer;
        tracer->ovfl = ovfl;
 
+       /* Do some more error checking and acquire a tracing context. */
        error = ds_request(&tracer->ds, &tracer->trace.ds,
-                          ds_bts, task, base, size, th, flags);
+                          ds_bts, task, cpu, base, size, th);
        if (error < 0)
                goto out_tracer;
 
-
-       spin_lock_irqsave(&ds_lock, irq);
-
-       error = -EPERM;
-       if (!check_tracer(task))
-               goto out_unlock;
-       get_tracer(task);
+       /* Claim the bts part of the tracing context we acquired above. */
+       spin_lock_irq(&ds_lock);
 
        error = -EPERM;
        if (tracer->ds.context->bts_master)
-               goto out_put_tracer;
+               goto out_unlock;
        tracer->ds.context->bts_master = tracer;
 
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
 
+       /*
+        * Now that we own the bts part of the context, let's complete the
+        * initialization for that part.
+        */
+       ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags);
+       ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+       ds_install_ds_area(tracer->ds.context);
 
        tracer->trace.read  = bts_read;
        tracer->trace.write = bts_write;
 
-       ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts);
+       /* Start tracing. */
        ds_resume_bts(tracer);
 
        return tracer;
 
- out_put_tracer:
-       put_tracer(task);
  out_unlock:
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
        ds_put_context(tracer->ds.context);
  out_tracer:
        kfree(tracer);
+ out_put_tracer:
+       put_tracer(task);
  out:
        return ERR_PTR(error);
 }
 
-struct pebs_tracer *ds_request_pebs(struct task_struct *task,
-                                   void *base, size_t size,
-                                   pebs_ovfl_callback_t ovfl, size_t th,
-                                   unsigned int flags)
+struct bts_tracer *ds_request_bts_task(struct task_struct *task,
+                                      void *base, size_t size,
+                                      bts_ovfl_callback_t ovfl,
+                                      size_t th, unsigned int flags)
+{
+       return ds_request_bts(task, 0, base, size, ovfl, th, flags);
+}
+
+struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size,
+                                     bts_ovfl_callback_t ovfl,
+                                     size_t th, unsigned int flags)
+{
+       return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu,
+                                          void *base, size_t size,
+                                          pebs_ovfl_callback_t ovfl, size_t th,
+                                          unsigned int flags)
 {
        struct pebs_tracer *tracer;
-       unsigned long irq;
        int error;
 
-       /* buffer overflow notification is not yet implemented */
+       /* Buffer overflow notification is not yet implemented. */
        error = -EOPNOTSUPP;
        if (ovfl)
                goto out;
 
+       error = get_tracer(task);
+       if (error < 0)
+               goto out;
+
        error = -ENOMEM;
        tracer = kzalloc(sizeof(*tracer), GFP_KERNEL);
        if (!tracer)
-               goto out;
+               goto out_put_tracer;
        tracer->ovfl = ovfl;
 
+       /* Do some more error checking and acquire a tracing context. */
        error = ds_request(&tracer->ds, &tracer->trace.ds,
-                          ds_pebs, task, base, size, th, flags);
+                          ds_pebs, task, cpu, base, size, th);
        if (error < 0)
                goto out_tracer;
 
-       spin_lock_irqsave(&ds_lock, irq);
-
-       error = -EPERM;
-       if (!check_tracer(task))
-               goto out_unlock;
-       get_tracer(task);
+       /* Claim the pebs part of the tracing context we acquired above. */
+       spin_lock_irq(&ds_lock);
 
        error = -EPERM;
        if (tracer->ds.context->pebs_master)
-               goto out_put_tracer;
+               goto out_unlock;
        tracer->ds.context->pebs_master = tracer;
 
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
 
+       /*
+        * Now that we own the pebs part of the context, let's complete the
+        * initialization for that part.
+        */
+       ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags);
        ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
+       ds_install_ds_area(tracer->ds.context);
+
+       /* Start tracing. */
        ds_resume_pebs(tracer);
 
        return tracer;
 
- out_put_tracer:
-       put_tracer(task);
  out_unlock:
-       spin_unlock_irqrestore(&ds_lock, irq);
+       spin_unlock_irq(&ds_lock);
        ds_put_context(tracer->ds.context);
  out_tracer:
        kfree(tracer);
+ out_put_tracer:
+       put_tracer(task);
  out:
        return ERR_PTR(error);
 }
 
-void ds_release_bts(struct bts_tracer *tracer)
+struct pebs_tracer *ds_request_pebs_task(struct task_struct *task,
+                                        void *base, size_t size,
+                                        pebs_ovfl_callback_t ovfl,
+                                        size_t th, unsigned int flags)
 {
-       if (!tracer)
-               return;
+       return ds_request_pebs(task, 0, base, size, ovfl, th, flags);
+}
 
-       ds_suspend_bts(tracer);
+struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size,
+                                       pebs_ovfl_callback_t ovfl,
+                                       size_t th, unsigned int flags)
+{
+       return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags);
+}
+
+static void ds_free_bts(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+
+       task = tracer->ds.context->task;
 
        WARN_ON_ONCE(tracer->ds.context->bts_master != tracer);
        tracer->ds.context->bts_master = NULL;
 
-       put_tracer(tracer->ds.context->task);
+       /* Make sure tracing stopped and the tracer is not in use. */
+       if (task && (task != current))
+               wait_task_context_switch(task);
+
        ds_put_context(tracer->ds.context);
+       put_tracer(task);
 
        kfree(tracer);
 }
 
+void ds_release_bts(struct bts_tracer *tracer)
+{
+       might_sleep();
+
+       if (!tracer)
+               return;
+
+       ds_suspend_bts(tracer);
+       ds_free_bts(tracer);
+}
+
+int ds_release_bts_noirq(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long irq;
+       int error;
+
+       if (!tracer)
+               return 0;
+
+       task = tracer->ds.context->task;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task &&
+           (tracer->ds.context->cpu != smp_processor_id()))
+               goto out;
+
+       error = -EPERM;
+       if (task && (task != current))
+               goto out;
+
+       ds_suspend_bts_noirq(tracer);
+       ds_free_bts(tracer);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static void update_task_debugctlmsr(struct task_struct *task,
+                                   unsigned long debugctlmsr)
+{
+       task->thread.debugctlmsr = debugctlmsr;
+
+       get_cpu();
+       if (task == current)
+               update_debugctlmsr(debugctlmsr);
+       put_cpu();
+}
+
 void ds_suspend_bts(struct bts_tracer *tracer)
 {
        struct task_struct *task;
+       unsigned long debugctlmsr;
+       int cpu;
 
        if (!tracer)
                return;
 
+       tracer->flags = 0;
+
        task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
 
-       if (!task || (task == current))
-               update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL);
+       WARN_ON(!task && irqs_disabled());
 
-       if (task) {
-               task->thread.debugctlmsr &= ~BTS_CONTROL;
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr_on_cpu(cpu));
+       debugctlmsr &= ~BTS_CONTROL;
 
-               if (!task->thread.debugctlmsr)
-                       clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-       }
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr_on_cpu(cpu, debugctlmsr);
 }
 
-void ds_resume_bts(struct bts_tracer *tracer)
+int ds_suspend_bts_noirq(struct bts_tracer *tracer)
 {
        struct task_struct *task;
-       unsigned long control;
+       unsigned long debugctlmsr, irq;
+       int cpu, error = 0;
 
        if (!tracer)
-               return;
+               return 0;
+
+       tracer->flags = 0;
 
        task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task && (cpu != smp_processor_id()))
+               goto out;
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr());
+       debugctlmsr &= ~BTS_CONTROL;
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr(debugctlmsr);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static unsigned long ds_bts_control(struct bts_tracer *tracer)
+{
+       unsigned long control;
 
        control = ds_cfg.ctl[dsf_bts];
        if (!(tracer->trace.ds.flags & BTS_KERNEL))
@@ -797,41 +1015,149 @@ void ds_resume_bts(struct bts_tracer *tracer)
        if (!(tracer->trace.ds.flags & BTS_USER))
                control |= ds_cfg.ctl[dsf_bts_user];
 
-       if (task) {
-               task->thread.debugctlmsr |= control;
-               set_tsk_thread_flag(task, TIF_DEBUGCTLMSR);
-       }
-
-       if (!task || (task == current))
-               update_debugctlmsr(get_debugctlmsr() | control);
+       return control;
 }
 
-void ds_release_pebs(struct pebs_tracer *tracer)
+void ds_resume_bts(struct bts_tracer *tracer)
 {
+       struct task_struct *task;
+       unsigned long debugctlmsr;
+       int cpu;
+
        if (!tracer)
                return;
 
-       ds_suspend_pebs(tracer);
+       tracer->flags = tracer->trace.ds.flags;
+
+       task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       WARN_ON(!task && irqs_disabled());
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr_on_cpu(cpu));
+       debugctlmsr |= ds_bts_control(tracer);
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr_on_cpu(cpu, debugctlmsr);
+}
+
+int ds_resume_bts_noirq(struct bts_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long debugctlmsr, irq;
+       int cpu, error = 0;
+
+       if (!tracer)
+               return 0;
+
+       tracer->flags = tracer->trace.ds.flags;
+
+       task = tracer->ds.context->task;
+       cpu  = tracer->ds.context->cpu;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task && (cpu != smp_processor_id()))
+               goto out;
+
+       debugctlmsr = (task ?
+                      task->thread.debugctlmsr :
+                      get_debugctlmsr());
+       debugctlmsr |= ds_bts_control(tracer);
+
+       if (task)
+               update_task_debugctlmsr(task, debugctlmsr);
+       else
+               update_debugctlmsr(debugctlmsr);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
+static void ds_free_pebs(struct pebs_tracer *tracer)
+{
+       struct task_struct *task;
+
+       task = tracer->ds.context->task;
 
        WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer);
        tracer->ds.context->pebs_master = NULL;
 
-       put_tracer(tracer->ds.context->task);
        ds_put_context(tracer->ds.context);
+       put_tracer(task);
 
        kfree(tracer);
 }
 
+void ds_release_pebs(struct pebs_tracer *tracer)
+{
+       might_sleep();
+
+       if (!tracer)
+               return;
+
+       ds_suspend_pebs(tracer);
+       ds_free_pebs(tracer);
+}
+
+int ds_release_pebs_noirq(struct pebs_tracer *tracer)
+{
+       struct task_struct *task;
+       unsigned long irq;
+       int error;
+
+       if (!tracer)
+               return 0;
+
+       task = tracer->ds.context->task;
+
+       local_irq_save(irq);
+
+       error = -EPERM;
+       if (!task &&
+           (tracer->ds.context->cpu != smp_processor_id()))
+               goto out;
+
+       error = -EPERM;
+       if (task && (task != current))
+               goto out;
+
+       ds_suspend_pebs_noirq(tracer);
+       ds_free_pebs(tracer);
+
+       error = 0;
+ out:
+       local_irq_restore(irq);
+       return error;
+}
+
 void ds_suspend_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_suspend_pebs_noirq(struct pebs_tracer *tracer)
+{
+       return 0;
+}
+
 void ds_resume_pebs(struct pebs_tracer *tracer)
 {
 
 }
 
+int ds_resume_pebs_noirq(struct pebs_tracer *tracer)
+{
+       return 0;
+}
+
 const struct bts_trace *ds_read_bts(struct bts_tracer *tracer)
 {
        if (!tracer)
@@ -847,8 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer)
                return NULL;
 
        ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs);
-       tracer->trace.reset_value =
-               *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8));
+
+       tracer->trace.counters = ds_cfg.nr_counter_reset;
+       memcpy(tracer->trace.counter_reset,
+              tracer->ds.context->ds +
+              (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field),
+              ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE);
 
        return &tracer->trace;
 }
@@ -873,18 +1203,24 @@ int ds_reset_pebs(struct pebs_tracer *tracer)
 
        tracer->trace.ds.top = tracer->trace.ds.begin;
 
-       ds_set(tracer->ds.context->ds, ds_bts, ds_index,
+       ds_set(tracer->ds.context->ds, ds_pebs, ds_index,
               (unsigned long)tracer->trace.ds.top);
 
        return 0;
 }
 
-int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value)
+int ds_set_pebs_reset(struct pebs_tracer *tracer,
+                     unsigned int counter, u64 value)
 {
        if (!tracer)
                return -EINVAL;
 
-       *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value;
+       if (ds_cfg.nr_counter_reset <= counter) /* counter is 0-based */
+               return -EINVAL;
+
+       *(u64 *)(tracer->ds.context->ds +
+                (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) +
+                (counter * PEBS_RESET_FIELD_SIZE)) = value;
 
        return 0;
 }
@@ -894,73 +1230,117 @@ static const struct ds_configuration ds_cfg_netburst = {
        .ctl[dsf_bts]           = (1 << 2) | (1 << 3),
        .ctl[dsf_bts_kernel]    = (1 << 5),
        .ctl[dsf_bts_user]      = (1 << 6),
-
-       .sizeof_field           = sizeof(long),
-       .sizeof_rec[ds_bts]     = sizeof(long) * 3,
-#ifdef __i386__
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 10,
-#else
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 18,
-#endif
+       .nr_counter_reset       = 1,
 };
 static const struct ds_configuration ds_cfg_pentium_m = {
        .name = "Pentium M",
        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
-
-       .sizeof_field           = sizeof(long),
-       .sizeof_rec[ds_bts]     = sizeof(long) * 3,
-#ifdef __i386__
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 10,
-#else
-       .sizeof_rec[ds_pebs]    = sizeof(long) * 18,
-#endif
+       .nr_counter_reset       = 1,
 };
 static const struct ds_configuration ds_cfg_core2_atom = {
        .name = "Core 2/Atom",
        .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
        .ctl[dsf_bts_kernel]    = (1 << 9),
        .ctl[dsf_bts_user]      = (1 << 10),
-
-       .sizeof_field           = 8,
-       .sizeof_rec[ds_bts]     = 8 * 3,
-       .sizeof_rec[ds_pebs]    = 8 * 18,
+       .nr_counter_reset       = 1,
+};
+static const struct ds_configuration ds_cfg_core_i7 = {
+       .name = "Core i7",
+       .ctl[dsf_bts]           = (1 << 6) | (1 << 7),
+       .ctl[dsf_bts_kernel]    = (1 << 9),
+       .ctl[dsf_bts_user]      = (1 << 10),
+       .nr_counter_reset       = 4,
 };
 
 static void
-ds_configure(const struct ds_configuration *cfg)
+ds_configure(const struct ds_configuration *cfg,
+            struct cpuinfo_x86 *cpu)
 {
+       unsigned long nr_pebs_fields = 0;
+
+       printk(KERN_INFO "[ds] using %s configuration\n", cfg->name);
+
+#ifdef __i386__
+       nr_pebs_fields = 10;
+#else
+       nr_pebs_fields = 18;
+#endif
+
+       /*
+        * Starting with version 2, architectural performance
+        * monitoring supports a format specifier.
+        */
+       if ((cpuid_eax(0xa) & 0xff) > 1) {
+               unsigned long perf_capabilities, format;
+
+               rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities);
+
+               format = (perf_capabilities >> 8) & 0xf;
+
+               switch (format) {
+               case 0:
+                       nr_pebs_fields = 18;
+                       break;
+               case 1:
+                       nr_pebs_fields = 22;
+                       break;
+               default:
+                       printk(KERN_INFO
+                              "[ds] unknown PEBS format: %lu\n", format);
+                       nr_pebs_fields = 0;
+                       break;
+               }
+       }
+
        memset(&ds_cfg, 0, sizeof(ds_cfg));
        ds_cfg = *cfg;
 
-       printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name);
+       ds_cfg.sizeof_ptr_field =
+               (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4);
+
+       ds_cfg.sizeof_rec[ds_bts]  = ds_cfg.sizeof_ptr_field * 3;
+       ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields;
 
-       if (!cpu_has_bts) {
-               ds_cfg.ctl[dsf_bts] = 0;
+       if (!cpu_has(cpu, X86_FEATURE_BTS)) {
+               ds_cfg.sizeof_rec[ds_bts] = 0;
                printk(KERN_INFO "[ds] bts not available\n");
        }
-       if (!cpu_has_pebs)
+       if (!cpu_has(cpu, X86_FEATURE_PEBS)) {
+               ds_cfg.sizeof_rec[ds_pebs] = 0;
                printk(KERN_INFO "[ds] pebs not available\n");
+       }
+
+       printk(KERN_INFO "[ds] sizes: address: %u bit, ",
+              8 * ds_cfg.sizeof_ptr_field);
+       printk("bts/pebs record: %u/%u bytes\n",
+              ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]);
 
-       WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field));
+       WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset);
 }
 
 void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
 {
+       /* Only configure the first cpu. Others are identical. */
+       if (ds_cfg.name)
+               return;
+
        switch (c->x86) {
        case 0x6:
                switch (c->x86_model) {
                case 0x9:
                case 0xd: /* Pentium M */
-                       ds_configure(&ds_cfg_pentium_m);
+                       ds_configure(&ds_cfg_pentium_m, c);
                        break;
                case 0xf:
                case 0x17: /* Core2 */
                case 0x1c: /* Atom */
-                       ds_configure(&ds_cfg_core2_atom);
+                       ds_configure(&ds_cfg_core2_atom, c);
+                       break;
+               case 0x1a: /* Core i7 */
+                       ds_configure(&ds_cfg_core_i7, c);
                        break;
-               case 0x1a: /* i7 */
                default:
-                       /* sorry, don't know about them */
+                       /* Sorry, don't know about them. */
                        break;
                }
                break;
@@ -969,64 +1349,89 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c)
                case 0x0:
                case 0x1:
                case 0x2: /* Netburst */
-                       ds_configure(&ds_cfg_netburst);
+                       ds_configure(&ds_cfg_netburst, c);
                        break;
                default:
-                       /* sorry, don't know about them */
+                       /* Sorry, don't know about them. */
                        break;
                }
                break;
        default:
-               /* sorry, don't know about them */
+               /* Sorry, don't know about them. */
                break;
        }
 }
 
+static inline void ds_take_timestamp(struct ds_context *context,
+                                    enum bts_qualifier qualifier,
+                                    struct task_struct *task)
+{
+       struct bts_tracer *tracer = context->bts_master;
+       struct bts_struct ts;
+
+       /* Prevent compilers from reading the tracer pointer twice. */
+       barrier();
+
+       if (!tracer || !(tracer->flags & BTS_TIMESTAMPS))
+               return;
+
+       memset(&ts, 0, sizeof(ts));
+       ts.qualifier            = qualifier;
+       ts.variant.event.clock  = trace_clock_global();
+       ts.variant.event.pid    = task->pid;
+
+       bts_write(tracer, &ts);
+}
+
 /*
  * Change the DS configuration from tracing prev to tracing next.
  */
 void ds_switch_to(struct task_struct *prev, struct task_struct *next)
 {
-       struct ds_context *prev_ctx = prev->thread.ds_ctx;
-       struct ds_context *next_ctx = next->thread.ds_ctx;
+       struct ds_context *prev_ctx     = prev->thread.ds_ctx;
+       struct ds_context *next_ctx     = next->thread.ds_ctx;
+       unsigned long debugctlmsr       = next->thread.debugctlmsr;
+
+       /* Make sure all data is read before we start. */
+       barrier();
 
        if (prev_ctx) {
                update_debugctlmsr(0);
 
-               if (prev_ctx->bts_master &&
-                   (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-                       struct bts_struct ts = {
-                               .qualifier = bts_task_departs,
-                               .variant.timestamp.jiffies = jiffies_64,
-                               .variant.timestamp.pid = prev->pid
-                       };
-                       bts_write(prev_ctx->bts_master, &ts);
-               }
+               ds_take_timestamp(prev_ctx, bts_task_departs, prev);
        }
 
        if (next_ctx) {
-               if (next_ctx->bts_master &&
-                   (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) {
-                       struct bts_struct ts = {
-                               .qualifier = bts_task_arrives,
-                               .variant.timestamp.jiffies = jiffies_64,
-                               .variant.timestamp.pid = next->pid
-                       };
-                       bts_write(next_ctx->bts_master, &ts);
-               }
+               ds_take_timestamp(next_ctx, bts_task_arrives, next);
 
                wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds);
        }
 
-       update_debugctlmsr(next->thread.debugctlmsr);
+       update_debugctlmsr(debugctlmsr);
 }
 
-void ds_copy_thread(struct task_struct *tsk, struct task_struct *father)
+static __init int ds_selftest(void)
 {
-       clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR);
-       tsk->thread.ds_ctx = NULL;
-}
+       if (ds_cfg.sizeof_rec[ds_bts]) {
+               int error;
 
-void ds_exit_thread(struct task_struct *tsk)
-{
+               error = ds_selftest_bts();
+               if (error) {
+                       WARN(1, "[ds] selftest failed. disabling bts.\n");
+                       ds_cfg.sizeof_rec[ds_bts] = 0;
+               }
+       }
+
+       if (ds_cfg.sizeof_rec[ds_pebs]) {
+               int error;
+
+               error = ds_selftest_pebs();
+               if (error) {
+                       WARN(1, "[ds] selftest failed. disabling pebs.\n");
+                       ds_cfg.sizeof_rec[ds_pebs] = 0;
+               }
+       }
+
+       return 0;
 }
+device_initcall(ds_selftest);
diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c
new file mode 100644 (file)
index 0000000..6bc7c19
--- /dev/null
@@ -0,0 +1,408 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#include "ds_selftest.h"
+
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/smp.h>
+#include <linux/cpu.h>
+
+#include <asm/ds.h>
+
+
+#define BUFFER_SIZE            521     /* Intentionally chose an odd size. */
+#define SMALL_BUFFER_SIZE      24      /* A single bts entry. */
+
+struct ds_selftest_bts_conf {
+       struct bts_tracer *tracer;      /* tracer under test; may be an ERR_PTR() */
+       int error;                      /* result of the last step, < 0 on failure */
+       int (*suspend)(struct bts_tracer *);    /* suspend variant to exercise */
+       int (*resume)(struct bts_tracer *);     /* resume variant to exercise */
+};
+
+static int ds_selftest_bts_consistency(const struct bts_trace *trace)
+{
+       int error = 0;
+
+       if (!trace) {
+               printk(KERN_CONT "failed to access trace...");
+               /*
+                * Bail out. Every check below dereferences trace, so
+                * the other tests are pointless.
+                */
+               return -1;
+       }
+
+       if (!trace->read) {
+               printk(KERN_CONT "bts read not available...");
+               error = -1;
+       }
+
+       /*
+        * Do some sanity checks on the trace configuration.
+        * A failing check is logged and recorded in error, but the
+        * remaining checks still run so that every problem is reported.
+        */
+       if (!trace->ds.n) {
+               printk(KERN_CONT "empty bts buffer...");
+               error = -1;
+       }
+       if (!trace->ds.size) {
+               printk(KERN_CONT "bad bts trace setup...");
+               error = -1;
+       }
+       if (trace->ds.end !=
+           (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) {
+               printk(KERN_CONT "bad bts buffer setup...");
+               error = -1;
+       }
+       /*
+        * We allow top in [begin; end], since it is not clear when the
+        * overflow adjustment happens: after the increment or before the
+        * write.
+        */
+       if ((trace->ds.top < trace->ds.begin) ||
+           (trace->ds.end < trace->ds.top)) {
+               printk(KERN_CONT "bts top out of bounds...");
+               error = -1;
+       }
+
+       return error;
+}
+
+static int ds_selftest_bts_read(struct bts_tracer *tracer,
+                               const struct bts_trace *trace,
+                               const void *from, const void *to)
+{
+       const unsigned char *at;
+
+       /*
+        * Check a few things which do not belong to this test.
+        * They should be covered by other tests, so a violation makes
+        * us return -1 without printing anything.
+        */
+       if (!trace)
+               return -1;
+
+       if (!trace->read)
+               return -1;
+
+       if (to < from)
+               return -1;
+
+       if (from < trace->ds.begin)
+               return -1;
+
+       if (trace->ds.end < to)
+               return -1;
+
+       if (!trace->ds.size)
+               return -1;
+
+       /*
+        * Now to the test itself: step through [from; to) one record
+        * (trace->ds.size bytes) at a time, decode each record via
+        * trace->read() and accept nothing but BTS_BRANCH entries.
+        */
+       for (at = from; (void *)at < to; at += trace->ds.size) {
+               struct bts_struct bts;
+               unsigned long index;
+               int error;
+
+               if (((void *)at - trace->ds.begin) % trace->ds.size) {
+                       printk(KERN_CONT
+                              "read from non-integer index...");
+                       return -1;
+               }
+               index = ((void *)at - trace->ds.begin) / trace->ds.size;
+
+               memset(&bts, 0, sizeof(bts));
+               error = trace->read(tracer, at, &bts);
+               if (error < 0) {
+                       printk(KERN_CONT
+                              "error reading bts trace at [%lu] (0x%p)...",
+                              index, at);
+                       return error;
+               }
+
+               switch (bts.qualifier) {
+               case BTS_BRANCH:
+                       break;
+               default:
+                       printk(KERN_CONT
+                              "unexpected bts entry %llu at [%lu] (0x%p)...",
+                              bts.qualifier, index, at);
+                       return -1;
+               }
+       }
+
+       return 0;
+}
+
+static void ds_selftest_bts_cpu(void *arg)
+{
+       struct ds_selftest_bts_conf *conf = arg;        /* void * arg: also run via smp_call_function_single() */
+       const struct bts_trace *trace;
+       void *top;
+
+       if (IS_ERR(conf->tracer)) {
+               conf->error = PTR_ERR(conf->tracer);
+               conf->tracer = NULL;
+
+               printk(KERN_CONT
+                      "initialization failed (err: %d)...", conf->error);
+               return;
+       }
+
+       /*
+        * We should meanwhile have enough trace.
+        * Suspend tracing before looking at the buffer; the outcome of
+        * every step is left in conf->error for the caller.
+        */
+       conf->error = conf->suspend(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       /* Let's see if we can access the trace. */
+       trace = ds_read_bts(conf->tracer);
+
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       /* If everything went well, we should have a few trace entries. */
+       if (trace->ds.top == trace->ds.begin) {
+               /*
+                * It is possible but highly unlikely that we got a
+                * buffer overflow and end up at exactly the same
+                * position we started from.
+                * Let's issue a warning, but continue.
+                */
+               printk(KERN_CONT "no trace/overflow...");
+       }
+
+       /* Let's try to read the trace we collected. */
+       conf->error =
+               ds_selftest_bts_read(conf->tracer, trace,
+                                    trace->ds.begin, trace->ds.top);
+       if (conf->error < 0)
+               return;
+
+       /*
+        * Let's read the trace again.
+        * Since we suspended tracing, we should get the same result.
+        */
+       top = trace->ds.top;
+
+       trace = ds_read_bts(conf->tracer);
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       if (top != trace->ds.top) {
+               printk(KERN_CONT "suspend not working...");
+               conf->error = -1;
+               return;
+       }
+
+       /* Let's collect some more trace - see if resume is working. */
+       conf->error = conf->resume(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       conf->error = conf->suspend(conf->tracer);
+       if (conf->error < 0)
+               return;
+
+       trace = ds_read_bts(conf->tracer);
+
+       conf->error = ds_selftest_bts_consistency(trace);
+       if (conf->error < 0)
+               return;
+
+       if (trace->ds.top == top) {
+               /*
+                * It is possible but highly unlikely that we got a
+                * buffer overflow and end up at exactly the same
+                * position we started from.
+                * Let's issue a warning and check the full trace.
+                */
+               printk(KERN_CONT
+                      "no resume progress/overflow...");
+
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace,
+                                            trace->ds.begin, trace->ds.end);
+       } else if (trace->ds.top < top) {
+               /*
+                * We had a buffer overflow - the entire buffer should
+                * contain trace records.
+                */
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace,
+                                            trace->ds.begin, trace->ds.end);
+       } else {
+               /*
+                * It is quite likely that the buffer did not overflow.
+                * Let's just check the delta trace, i.e. the records
+                * between the previous top and the current one.
+                */
+               conf->error =
+                       ds_selftest_bts_read(conf->tracer, trace, top,
+                                            trace->ds.top);
+       }
+       if (conf->error < 0)
+               return;
+
+       conf->error = 0;
+}
+
+static int ds_suspend_bts_wrap(struct bts_tracer *tracer)
+{
+       ds_suspend_bts(tracer);
+       return 0;       /* fits the int-returning conf->suspend slot */
+}
+
+static int ds_resume_bts_wrap(struct bts_tracer *tracer)
+{
+       ds_resume_bts(tracer);
+       return 0;       /* fits the int-returning conf->resume slot */
+}
+
+static void ds_release_bts_noirq_wrap(void *tracer)
+{
+       (void)ds_release_bts_noirq(tracer);     /* result dropped: used as a void smp_call_function_single() callback */
+}
+
+static int ds_selftest_bts_bad_release_noirq(int cpu,
+                                            struct bts_tracer *tracer)
+{
+       int error = -EPERM;
+
+       /*
+        * Try to release the tracer on the wrong cpu.
+        * When we already run on @cpu the attempt is skipped; error
+        * then keeps its initial -EPERM and the check counts as passed.
+        * Only a release that went through (error == 0) yields -1;
+        * other unexpected error codes are logged but still return 0.
+        */
+       get_cpu();
+       if (cpu != smp_processor_id()) {
+               error = ds_release_bts_noirq(tracer);
+               if (error != -EPERM)
+                       printk(KERN_CONT "release on wrong cpu...");
+       }
+       put_cpu();
+
+       return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer)
+{
+       struct bts_tracer *tracer;
+       int error;
+
+       /*
+        * Try to request cpu tracing while task tracing is active.
+        * The request is expected to fail with -EPERM. If it succeeds
+        * instead, the tracer is released again and -1 is returned;
+        * any failed request, whatever its error code, returns 0.
+        */
+       tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL,
+                                   (size_t)-1, BTS_KERNEL);
+       error = PTR_ERR(tracer);
+       if (!IS_ERR(tracer)) {
+               ds_release_bts(tracer);
+               error = 0;
+       }
+
+       if (error != -EPERM)
+               printk(KERN_CONT "cpu/task tracing overlap...");
+
+       return error ? 0 : -1;
+}
+
+static int ds_selftest_bts_bad_request_task(void *buffer)
+{
+       struct bts_tracer *tracer;
+       int error;
+
+       /*
+        * Try to request task tracing while cpu tracing is active.
+        * The request is expected to fail with -EPERM. If it succeeds
+        * instead, the tracer is released again and -1 is returned;
+        * any failed request, whatever its error code, returns 0.
+        */
+       tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL,
+                                   (size_t)-1, BTS_KERNEL);
+       error = PTR_ERR(tracer);
+       if (!IS_ERR(tracer)) {
+               error = 0;
+               ds_release_bts(tracer);
+       }
+
+       if (error != -EPERM)
+               printk(KERN_CONT "task/cpu tracing overlap...");
+
+       return error ? 0 : -1;
+}
+
+int ds_selftest_bts(void)
+{
+       struct ds_selftest_bts_conf conf;
+       unsigned char buffer[BUFFER_SIZE], *small_buffer;
+       unsigned long irq;
+       int cpu;
+
+       printk(KERN_INFO "[ds] bts selftest...");
+       conf.error = 0;
+
+       small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               conf.suspend = ds_suspend_bts_wrap;
+               conf.resume = ds_resume_bts_wrap;
+               conf.tracer =
+                       ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+                                          NULL, (size_t)-1, BTS_KERNEL);
+               ds_selftest_bts_cpu(&conf);
+               if (conf.error >= 0)
+                       conf.error = ds_selftest_bts_bad_request_task(buffer);
+               ds_release_bts(conf.tracer);
+               if (conf.error < 0)
+                       goto out;
+
+               conf.suspend = ds_suspend_bts_noirq;
+               conf.resume = ds_resume_bts_noirq;
+               conf.tracer =
+                       ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE,
+                                          NULL, (size_t)-1, BTS_KERNEL);
+               smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1);
+               if (conf.error >= 0) {
+                       conf.error =
+                               ds_selftest_bts_bad_release_noirq(cpu,
+                                                                 conf.tracer);
+                       /*
+                        * We must not release the tracer twice.
+                        * A negative result means the release from
+                        * this cpu went through, so forget the tracer
+                        * before the regular release below.
+                        */
+                       if (conf.error < 0)
+                               conf.tracer = NULL;
+               }
+               if (conf.error >= 0)
+                       conf.error = ds_selftest_bts_bad_request_task(buffer);
+               smp_call_function_single(cpu, ds_release_bts_noirq_wrap,
+                                        conf.tracer, 1);
+               if (conf.error < 0)
+                       goto out;
+       }
+
+       conf.suspend = ds_suspend_bts_wrap;
+       conf.resume = ds_resume_bts_wrap;
+       conf.tracer =
+               ds_request_bts_task(current, buffer, BUFFER_SIZE,
+                                   NULL, (size_t)-1, BTS_KERNEL);
+       ds_selftest_bts_cpu(&conf);
+       if (conf.error >= 0)
+               conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+       ds_release_bts(conf.tracer);
+       if (conf.error < 0)
+               goto out;
+
+       conf.suspend = ds_suspend_bts_noirq;
+       conf.resume = ds_resume_bts_noirq;
+       conf.tracer =
+               ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE,
+                                  NULL, (size_t)-1, BTS_KERNEL);
+       local_irq_save(irq);
+       ds_selftest_bts_cpu(&conf);
+       if (conf.error >= 0)
+               conf.error = ds_selftest_bts_bad_request_cpu(0, buffer);
+       ds_release_bts_noirq(conf.tracer);
+       local_irq_restore(irq);
+       if (conf.error < 0)
+               goto out;
+
+       conf.error = 0;
+ out:
+       put_online_cpus();
+       printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed"));
+
+       return conf.error;
+}
+
+int ds_selftest_pebs(void)
+{
+       return 0;       /* no pebs checks here; always reports success */
+}
diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h
new file mode 100644 (file)
index 0000000..2ba8745
--- /dev/null
@@ -0,0 +1,15 @@
+/*
+ * Debug Store support - selftest
+ *
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@intel.com>, 2009
+ */
+
+#ifdef CONFIG_X86_DS_SELFTEST
+extern int ds_selftest_bts(void);
+extern int ds_selftest_pebs(void);
+#else
+static inline int ds_selftest_bts(void) { return 0; }
+static inline int ds_selftest_pebs(void) { return 0; }
+#endif
index da87590b8698a7c4642ad8e46febe2bf91972599..81086c227ab7cafe28c0c608f1b3eb2bce72b1c0 100644 (file)
@@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
                unsigned long *sp, unsigned long bp, char *log_lvl);
 
 extern unsigned int code_bytes;
-extern int kstack_depth_to_print;
 
 /* The form of the top of the frame on the stack */
 struct stack_frame {
index 0062813029256a2379d4dc66c42741439227210e..7271fa33d79135edd790f854d7c0c91d05d0c20b 100644 (file)
@@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
  */
 __init void e820_setup_gap(void)
 {
-       unsigned long gapstart, gapsize, round;
+       unsigned long gapstart, gapsize;
        int found;
 
        gapstart = 0x10000000;
@@ -635,14 +635,9 @@ __init void e820_setup_gap(void)
 #endif
 
        /*
-        * See how much we want to round up: start off with
-        * rounding to the next 1MB area.
+        * e820_reserve_resources_late protect stolen RAM already
         */
-       round = 0x100000;
-       while ((gapsize >> 4) > round)
-               round += round;
-       /* Fun with two's complement */
-       pci_mem_start = (gapstart + round) & -round;
+       pci_mem_start = gapstart;
 
        printk(KERN_INFO
               "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n",
@@ -1371,6 +1366,23 @@ void __init e820_reserve_resources(void)
        }
 }
 
+/*
+ * How much should we pad RAM ending depending on where it is?
+ *
+ * @pos is a byte address; only its megabyte index (pos >> 20) is
+ * looked at. The return value is an alignment in bytes, which the
+ * caller uses to round the end of a RAM region up.
+ */
+static unsigned long ram_alignment(resource_size_t pos)
+{
+       unsigned long mb = pos >> 20;
+
+       /* To 64kB in the first megabyte */
+       if (!mb)
+               return 64*1024;
+
+       /* To 1MB in the first 16MB */
+       if (mb < 16)
+               return 1024*1024;
+
+       /* To 32MB for anything above that */
+       return 32*1024*1024;
+}
+
 void __init e820_reserve_resources_late(void)
 {
        int i;
@@ -1382,6 +1394,24 @@ void __init e820_reserve_resources_late(void)
                        insert_resource_expand_to_fit(&iomem_resource, res);
                res++;
        }
+
+       /*
+        * Try to bump up RAM regions to reasonable boundaries to
+        * avoid stolen RAM:
+        */
+       for (i = 0; i < e820.nr_map; i++) {
+               struct e820entry *entry = &e820_saved.map[i];
+               resource_size_t start, end;
+
+               if (entry->type != E820_RAM)
+                       continue;
+               start = entry->addr + entry->size;
+               end = round_up(start, ram_alignment(start));
+               if (start == end)
+                       continue;
+               reserve_region_with_split(&iomem_resource, start,
+                                                 end - 1, "RAM buffer");
+       }
 }
 
 char *__init default_machine_specific_memory_setup(void)
index 76b8cd953deed9f8a50d572cdc52b5edb68bc3b7..ebdb85cf2686fa36702cd4d50b657f22de85b3bd 100644 (file)
@@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func)
 
 }
 
+#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
 static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
 {
@@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
        d &= 0xff;
        return d;
 }
+#endif
 
 static void __init ati_bugs(int num, int slot, int func)
 {
index 38946c6e843388b33fc165779bd7c0aec8e260d7..1c17d7c751a43762717f9e63eeff73b80843ca92 100644 (file)
@@ -147,27 +147,14 @@ END(ftrace_graph_caller)
 GLOBAL(return_to_handler)
        subq  $80, %rsp
 
+       /* Save the return values */
        movq %rax, (%rsp)
-       movq %rcx, 8(%rsp)
-       movq %rdx, 16(%rsp)
-       movq %rsi, 24(%rsp)
-       movq %rdi, 32(%rsp)
-       movq %r8, 40(%rsp)
-       movq %r9, 48(%rsp)
-       movq %r10, 56(%rsp)
-       movq %r11, 64(%rsp)
+       movq %rdx, 8(%rsp)
 
        call ftrace_return_to_handler
 
        movq %rax, 72(%rsp)
-       movq 64(%rsp), %r11
-       movq 56(%rsp), %r10
-       movq 48(%rsp), %r9
-       movq 40(%rsp), %r8
-       movq 32(%rsp), %rdi
-       movq 24(%rsp), %rsi
-       movq 16(%rsp), %rdx
-       movq 8(%rsp), %rcx
+       movq 8(%rsp), %rdx
        movq (%rsp), %rax
        addq $72, %rsp
        retq
@@ -1379,6 +1366,11 @@ END(xen_failsafe_callback)
 paranoidzeroentry_ist debug do_debug DEBUG_STACK
 paranoidzeroentry_ist int3 do_int3 DEBUG_STACK
 paranoiderrorentry stack_segment do_stack_segment
+#ifdef CONFIG_XEN
+zeroentry xen_debug do_debug
+zeroentry xen_int3 do_int3
+errorentry xen_stack_segment do_stack_segment
+#endif
 errorentry general_protection do_general_protection
 errorentry page_fault do_page_fault
 #ifdef CONFIG_X86_MCE
index 30683883e0cdcf0bd669cb439206fefaf74301f2..dc5ed4bdd88d36317ba5fc87554d10156a7f9132 100644 (file)
@@ -608,13 +608,6 @@ ignore_int:
 ENTRY(initial_code)
        .long i386_start_kernel
 
-.section .text
-/*
- * Real beginning of normal "text" segment
- */
-ENTRY(stext)
-ENTRY(_stext)
-
 /*
  * BSS section
  */
index c3fe010d74c8e8823793cd9ff816ee02adc8ebde..9a391bbb8ba83caffb48073831d3f1462cbae08e 100644 (file)
@@ -12,6 +12,7 @@
 #include <asm/io_apic.h>
 #include <asm/irq.h>
 #include <asm/idle.h>
+#include <asm/hw_irq.h>
 
 atomic_t irq_err_count;
 
@@ -24,9 +25,9 @@ void (*generic_interrupt_extension)(void) = NULL;
  */
 void ack_bad_irq(unsigned int irq)
 {
-       printk(KERN_ERR "unexpected IRQ trap at vector %02x\n", irq);
+       if (printk_ratelimit())
+               pr_err("unexpected IRQ trap at vector %02x\n", irq);
 
-#ifdef CONFIG_X86_LOCAL_APIC
        /*
         * Currently unexpected vectors happen only on SMP and APIC.
         * We _must_ ack these because every local APIC has only N
@@ -36,9 +37,7 @@ void ack_bad_irq(unsigned int irq)
         * completely.
         * But only ack when the APIC is enabled -AK
         */
-       if (cpu_has_apic)
-               ack_APIC_irq();
-#endif
+       ack_APIC_irq();
 }
 
 #define irq_stats(x)           (&per_cpu(irq_stat, x))
@@ -178,7 +177,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
        sum += irq_stats(cpu)->irq_thermal_count;
 # ifdef CONFIG_X86_64
        sum += irq_stats(cpu)->irq_threshold_count;
-#endif
+# endif
 #endif
        return sum;
 }
@@ -213,14 +212,11 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
        irq = __get_cpu_var(vector_irq)[vector];
 
        if (!handle_irq(irq, regs)) {
-#ifdef CONFIG_X86_64
-               if (!disable_apic)
-                       ack_APIC_irq();
-#endif
+               ack_APIC_irq();
 
                if (printk_ratelimit())
-                       printk(KERN_EMERG "%s: %d.%d No irq handler for vector (irq %d)\n",
-                              __func__, smp_processor_id(), vector, irq);
+                       pr_emerg("%s: %d.%d No irq handler for vector (irq %d)\n",
+                               __func__, smp_processor_id(), vector, irq);
        }
 
        irq_exit();
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
new file mode 100644 (file)
index 0000000..2e08b10
--- /dev/null
@@ -0,0 +1,275 @@
+#include <linux/linkage.h>
+#include <linux/errno.h>
+#include <linux/signal.h>
+#include <linux/sched.h>
+#include <linux/ioport.h>
+#include <linux/interrupt.h>
+#include <linux/timex.h>
+#include <linux/slab.h>
+#include <linux/random.h>
+#include <linux/kprobes.h>
+#include <linux/init.h>
+#include <linux/kernel_stat.h>
+#include <linux/sysdev.h>
+#include <linux/bitops.h>
+#include <linux/acpi.h>
+#include <linux/io.h>
+#include <linux/delay.h>
+
+#include <asm/atomic.h>
+#include <asm/system.h>
+#include <asm/timer.h>
+#include <asm/hw_irq.h>
+#include <asm/pgtable.h>
+#include <asm/desc.h>
+#include <asm/apic.h>
+#include <asm/setup.h>
+#include <asm/i8259.h>
+#include <asm/traps.h>
+
+/*
+ * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
+ * (these are usually mapped to vectors 0x30-0x3f)
+ */
+
+/*
+ * The IO-APIC gives us many more interrupt sources. Most of these
+ * are unused but an SMP system is supposed to have enough memory ...
+ * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
+ * across the spectrum, so we really want to be prepared to get all
+ * of these. Plus, more powerful systems might have more than 64
+ * IO-APIC registers.
+ *
+ * (these are usually mapped into the 0x30-0xff vector range)
+ */
+
+#ifdef CONFIG_X86_32
+/*
+ * Note that on a 486, we don't want to do a SIGFPE on an irq13
+ * as the irq is unreliable, and exception 16 works correctly
+ * (ie as explained in the intel literature). On a 386, you
+ * can't use exception 16 due to bad IBM design, so we have to
+ * rely on the less exact irq13.
+ *
+ * Careful.. Not only is IRQ13 unreliable, but it also
+ * leads to races. IBM designers who came up with it should
+ * be shot.
+ */
+
+static irqreturn_t math_error_irq(int cpl, void *dev_id)
+{
+       outb(0, 0xF0);  /* NOTE(review): presumably clears the pending FPU error at port 0xF0 - confirm */
+       if (ignore_fpu_irq || !boot_cpu_data.hard_math)
+               return IRQ_NONE;
+       math_error((void __user *)get_irq_regs()->ip);  /* report at the interrupted instruction pointer */
+       return IRQ_HANDLED;
+}
+
+/*
+ * New motherboards sometimes make IRQ 13 be a PCI interrupt,
+ * so allow interrupt sharing.
+ */
+static struct irqaction fpu_irq = {
+       .handler = math_error_irq,
+       .name = "fpu",
+};
+#endif
+
+/*
+ * IRQ2 is cascade interrupt to second interrupt controller
+ */
+static struct irqaction irq2 = {
+       .handler = no_action,
+       .name = "cascade",
+};
+
+DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
+       [0 ... IRQ0_VECTOR - 1] = -1,   /* -1: no irq behind this vector */
+       [IRQ0_VECTOR] = 0,              /* IRQn_VECTOR starts out mapped to irq n */
+       [IRQ1_VECTOR] = 1,
+       [IRQ2_VECTOR] = 2,
+       [IRQ3_VECTOR] = 3,
+       [IRQ4_VECTOR] = 4,
+       [IRQ5_VECTOR] = 5,
+       [IRQ6_VECTOR] = 6,
+       [IRQ7_VECTOR] = 7,
+       [IRQ8_VECTOR] = 8,
+       [IRQ9_VECTOR] = 9,
+       [IRQ10_VECTOR] = 10,
+       [IRQ11_VECTOR] = 11,
+       [IRQ12_VECTOR] = 12,
+       [IRQ13_VECTOR] = 13,
+       [IRQ14_VECTOR] = 14,
+       [IRQ15_VECTOR] = 15,
+       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
+};
+
+int vector_used_by_percpu_irq(unsigned int vector)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               if (per_cpu(vector_irq, cpu)[vector] != -1)
+                       return 1;       /* some online cpu maps this vector to an irq */
+       }
+
+       return 0;       /* unmapped (-1) on every online cpu */
+}
+
+static void __init init_ISA_irqs(void)
+{
+       int i;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+       init_bsp_APIC();
+#endif
+       init_8259A(0);
+
+       /*
+        * 16 old-style INTA-cycle interrupts:
+        * every legacy irq starts out disabled, without an action and
+        * with a depth of 1, and is wired to the i8259A chip with the
+        * level-type flow handler.
+        */
+       for (i = 0; i < NR_IRQS_LEGACY; i++) {
+               struct irq_desc *desc = irq_to_desc(i);
+
+               desc->status = IRQ_DISABLED;
+               desc->action = NULL;
+               desc->depth = 1;
+
+               set_irq_chip_and_handler_name(i, &i8259A_chip,
+                                             handle_level_irq, "XT");
+       }
+}
+
+/* Overridden in paravirt.c */
+void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
+
+static void __init smp_intr_init(void)
+{
+#ifdef CONFIG_SMP
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+       /*
+        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
+        * IPI, driven by wakeup.
+        */
+       alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
+
+       /* IPIs for invalidation: eight consecutive vectors, one handler each */
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
+       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
+
+       /* IPI for generic function call */
+       alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
+
+       /* IPI for generic single function call */
+       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
+                       call_function_single_interrupt);
+
+       /*
+        * Low priority IPI to cleanup after moving an irq.
+        * Installed with set_intr_gate(), so the vector is marked in
+        * used_vectors by hand.
+        */
+       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
+       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
+#endif
+#endif /* CONFIG_SMP */
+}
+
+static void __init apic_intr_init(void)
+{
+       smp_intr_init();
+
+#ifdef CONFIG_X86_64
+       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+       alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
+#endif
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
+       /* self generated IPI for local APIC timer */
+       alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
+
+       /* generic IPI for platform specific use */
+       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
+
+       /* IPI vectors for APIC spurious and error interrupts */
+       alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
+       alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
+
+       /* Performance monitoring interrupts: */
+# ifdef CONFIG_PERF_COUNTERS
+       alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt);
+       alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt);
+# endif
+
+#endif
+
+#ifdef CONFIG_X86_32
+#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
+       /*
+        * thermal monitor LVT interrupt
+        * (64-bit kernels install this gate in the CONFIG_X86_64
+        * section at the top of this function instead)
+        */
+       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
+#endif
+#endif
+}
+
+/**
+ * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
+ *
+ * Description:
+ *     Perform any necessary interrupt initialisation prior to setting up
+ *     the "ordinary" interrupt call gates.  For legacy reasons, the ISA
+ *     interrupts should be initialised here if the machine emulates a PC
+ *     in any way.
+ *
+ *     On 32-bit kernels an x86_quirks->arch_pre_intr_init hook, if set,
+ *     runs first; a non-zero return from it skips the default ISA
+ *     interrupt initialisation.
+ **/
+static void __init x86_quirk_pre_intr_init(void)
+{
+#ifdef CONFIG_X86_32
+       if (x86_quirks->arch_pre_intr_init) {
+               if (x86_quirks->arch_pre_intr_init())
+                       return;
+       }
+#endif
+       init_ISA_irqs();
+}
+
+void __init native_init_IRQ(void)
+{
+       int i;
+
+       /* Execute any quirks before the call gates are initialised: */
+       x86_quirk_pre_intr_init();
+
+       apic_intr_init();
+
+       /*
+        * Cover the whole vector space, no vector can escape
+        * us. (some of these will be overridden and become
+        * 'special' SMP interrupts)
+        */
+       for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
+               /*
+                * IA32_SYSCALL_VECTOR could be used in trap_init already.
+                * Vectors already marked in used_vectors keep their gate.
+                */
+               if (!test_bit(i, used_vectors))
+                       set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
+       }
+
+       if (!acpi_ioapic)
+               setup_irq(2, &irq2);    /* irq2 is the "cascade" action defined above */
+
+#ifdef CONFIG_X86_32
+       /*
+        * Call quirks after call gates are initialised (usually add in
+        * the architecture specific gates):
+        */
+       x86_quirk_intr_init();
+
+       /*
+        * External FPU? Set up irq13 if so, for
+        * original braindamaged IBM FERR coupling.
+        */
+       if (boot_cpu_data.hard_math && !cpu_has_fpu)
+               setup_irq(FPU_IRQ, &fpu_irq);
+
+       irq_ctx_init(smp_processor_id());
+#endif
+}
diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c
deleted file mode 100644 (file)
index 368b0a8..0000000
+++ /dev/null
@@ -1,206 +0,0 @@
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/timer.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/setup.h>
-#include <asm/i8259.h>
-#include <asm/traps.h>
-
-
-/*
- * Note that on a 486, we don't want to do a SIGFPE on an irq13
- * as the irq is unreliable, and exception 16 works correctly
- * (ie as explained in the intel literature). On a 386, you
- * can't use exception 16 due to bad IBM design, so we have to
- * rely on the less exact irq13.
- *
- * Careful.. Not only is IRQ13 unreliable, but it is also
- * leads to races. IBM designers who came up with it should
- * be shot.
- */
-
-static irqreturn_t math_error_irq(int cpl, void *dev_id)
-{
-       outb(0, 0xF0);
-       if (ignore_fpu_irq || !boot_cpu_data.hard_math)
-               return IRQ_NONE;
-       math_error((void __user *)get_irq_regs()->ip);
-       return IRQ_HANDLED;
-}
-
-/*
- * New motherboards sometimes make IRQ 13 be a PCI interrupt,
- * so allow interrupt sharing.
- */
-static struct irqaction fpu_irq = {
-       .handler = math_error_irq,
-       .name = "fpu",
-};
-
-void __init init_ISA_irqs(void)
-{
-       int i;
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       init_bsp_APIC();
-#endif
-       init_8259A(0);
-
-       /*
-        * 16 old-style INTA-cycle interrupts:
-        */
-       for (i = 0; i < NR_IRQS_LEGACY; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = NULL;
-               desc->depth = 1;
-
-               set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                             handle_level_irq, "XT");
-       }
-}
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-static struct irqaction irq2 = {
-       .handler = no_action,
-       .name = "cascade",
-};
-
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-       [0 ... IRQ0_VECTOR - 1] = -1,
-       [IRQ0_VECTOR] = 0,
-       [IRQ1_VECTOR] = 1,
-       [IRQ2_VECTOR] = 2,
-       [IRQ3_VECTOR] = 3,
-       [IRQ4_VECTOR] = 4,
-       [IRQ5_VECTOR] = 5,
-       [IRQ6_VECTOR] = 6,
-       [IRQ7_VECTOR] = 7,
-       [IRQ8_VECTOR] = 8,
-       [IRQ9_VECTOR] = 9,
-       [IRQ10_VECTOR] = 10,
-       [IRQ11_VECTOR] = 11,
-       [IRQ12_VECTOR] = 12,
-       [IRQ13_VECTOR] = 13,
-       [IRQ14_VECTOR] = 14,
-       [IRQ15_VECTOR] = 15,
-       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu) {
-               if (per_cpu(vector_irq, cpu)[vector] != -1)
-                       return 1;
-       }
-
-       return 0;
-}
-
-/* Overridden in paravirt.c */
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-void __init native_init_IRQ(void)
-{
-       int i;
-
-       /* Execute any quirks before the call gates are initialised: */
-       x86_quirk_pre_intr_init();
-
-       /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
-        */
-       for (i =  FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
-               /* SYSCALL_VECTOR was reserved in trap_init. */
-               if (i != SYSCALL_VECTOR)
-                       set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
-       }
-
-
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP)
-       /*
-        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-        * IPI, driven by wakeup.
-        */
-       alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-       /* IPIs for invalidation */
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-       /* IPI for generic function call */
-       alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-       /* IPI for single call function */
-       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-                                call_function_single_interrupt);
-
-       /* Low priority IPI to cleanup after moving an irq */
-       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-
-#ifdef CONFIG_X86_LOCAL_APIC
-       /* self generated IPI for local APIC timer */
-       alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-       /* generic IPI for platform specific use */
-       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-       /* IPI vectors for APIC spurious and error interrupts */
-       alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-       alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-#endif
-
-#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL)
-       /* thermal monitor LVT interrupt */
-       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-#endif
-
-       if (!acpi_ioapic)
-               setup_irq(2, &irq2);
-
-       /*
-        * Call quirks after call gates are initialised (usually add in
-        * the architecture specific gates):
-        */
-       x86_quirk_intr_init();
-
-       /*
-        * External FPU? Set up irq13 if so, for
-        * original braindamaged IBM FERR coupling.
-        */
-       if (boot_cpu_data.hard_math && !cpu_has_fpu)
-               setup_irq(FPU_IRQ, &fpu_irq);
-
-       irq_ctx_init(smp_processor_id());
-}
diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c
deleted file mode 100644 (file)
index 8cd1053..0000000
+++ /dev/null
@@ -1,177 +0,0 @@
-#include <linux/linkage.h>
-#include <linux/errno.h>
-#include <linux/signal.h>
-#include <linux/sched.h>
-#include <linux/ioport.h>
-#include <linux/interrupt.h>
-#include <linux/timex.h>
-#include <linux/slab.h>
-#include <linux/random.h>
-#include <linux/init.h>
-#include <linux/kernel_stat.h>
-#include <linux/sysdev.h>
-#include <linux/bitops.h>
-#include <linux/acpi.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-
-#include <asm/atomic.h>
-#include <asm/system.h>
-#include <asm/hw_irq.h>
-#include <asm/pgtable.h>
-#include <asm/desc.h>
-#include <asm/apic.h>
-#include <asm/i8259.h>
-
-/*
- * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
- * (these are usually mapped to vectors 0x30-0x3f)
- */
-
-/*
- * The IO-APIC gives us many more interrupt sources. Most of these
- * are unused but an SMP system is supposed to have enough memory ...
- * sometimes (mostly wrt. hw bugs) we get corrupted vectors all
- * across the spectrum, so we really want to be prepared to get all
- * of these. Plus, more powerful systems might have more than 64
- * IO-APIC registers.
- *
- * (these are usually mapped into the 0x30-0xff vector range)
- */
-
-/*
- * IRQ2 is cascade interrupt to second interrupt controller
- */
-
-static struct irqaction irq2 = {
-       .handler = no_action,
-       .name = "cascade",
-};
-DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
-       [0 ... IRQ0_VECTOR - 1] = -1,
-       [IRQ0_VECTOR] = 0,
-       [IRQ1_VECTOR] = 1,
-       [IRQ2_VECTOR] = 2,
-       [IRQ3_VECTOR] = 3,
-       [IRQ4_VECTOR] = 4,
-       [IRQ5_VECTOR] = 5,
-       [IRQ6_VECTOR] = 6,
-       [IRQ7_VECTOR] = 7,
-       [IRQ8_VECTOR] = 8,
-       [IRQ9_VECTOR] = 9,
-       [IRQ10_VECTOR] = 10,
-       [IRQ11_VECTOR] = 11,
-       [IRQ12_VECTOR] = 12,
-       [IRQ13_VECTOR] = 13,
-       [IRQ14_VECTOR] = 14,
-       [IRQ15_VECTOR] = 15,
-       [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
-};
-
-int vector_used_by_percpu_irq(unsigned int vector)
-{
-       int cpu;
-
-       for_each_online_cpu(cpu) {
-               if (per_cpu(vector_irq, cpu)[vector] != -1)
-                       return 1;
-       }
-
-       return 0;
-}
-
-static void __init init_ISA_irqs(void)
-{
-       int i;
-
-       init_bsp_APIC();
-       init_8259A(0);
-
-       for (i = 0; i < NR_IRQS_LEGACY; i++) {
-               struct irq_desc *desc = irq_to_desc(i);
-
-               desc->status = IRQ_DISABLED;
-               desc->action = NULL;
-               desc->depth = 1;
-
-               /*
-                * 16 old-style INTA-cycle interrupts:
-                */
-               set_irq_chip_and_handler_name(i, &i8259A_chip,
-                                                     handle_level_irq, "XT");
-       }
-}
-
-void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ")));
-
-static void __init smp_intr_init(void)
-{
-#ifdef CONFIG_SMP
-       /*
-        * The reschedule interrupt is a CPU-to-CPU reschedule-helper
-        * IPI, driven by wakeup.
-        */
-       alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
-
-       /* IPIs for invalidation */
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6);
-       alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7);
-
-       /* IPI for generic function call */
-       alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
-
-       /* IPI for generic single function call */
-       alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
-                       call_function_single_interrupt);
-
-       /* Low priority IPI to cleanup after moving an irq */
-       set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
-       set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
-#endif
-}
-
-static void __init apic_intr_init(void)
-{
-       smp_intr_init();
-
-       alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt);
-       alloc_intr_gate(THRESHOLD_APIC_VECTOR, threshold_interrupt);
-
-       /* self generated IPI for local APIC timer */
-       alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
-
-       /* generic IPI for platform specific use */
-       alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt);
-
-       /* IPI vectors for APIC spurious and error interrupts */
-       alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
-       alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
-}
-
-void __init native_init_IRQ(void)
-{
-       int i;
-
-       init_ISA_irqs();
-       /*
-        * Cover the whole vector space, no vector can escape
-        * us. (some of these will be overridden and become
-        * 'special' SMP interrupts)
-        */
-       for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) {
-               int vector = FIRST_EXTERNAL_VECTOR + i;
-               if (vector != IA32_SYSCALL_VECTOR)
-                       set_intr_gate(vector, interrupt[i]);
-       }
-
-       apic_intr_init();
-
-       if (!acpi_ioapic)
-               setup_irq(2, &irq2);
-}
index b1f4dffb919e8c708421cb8c29af80ad50d6a7c9..8d82a77a3f3b96ea3c0dc37e91551dddc7e10b51 100644 (file)
@@ -142,7 +142,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
        gdb_regs32[GDB_PS]      = *(unsigned long *)(p->thread.sp + 8);
        gdb_regs32[GDB_CS]      = __KERNEL_CS;
        gdb_regs32[GDB_SS]      = __KERNEL_DS;
-       gdb_regs[GDB_PC]        = p->thread.ip;
+       gdb_regs[GDB_PC]        = 0;
        gdb_regs[GDB_R8]        = 0;
        gdb_regs[GDB_R9]        = 0;
        gdb_regs[GDB_R10]       = 0;
index 057173db6adcc099e508a5d6db9a98bdc865c607..a78ecad0c900ff646b8c826c08cd54b094815b5a 100644 (file)
@@ -196,7 +196,7 @@ static void kvm_leave_lazy_mmu(void)
        struct kvm_para_state *state = kvm_para_state();
 
        mmu_queue_flush(state);
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
+       paravirt_leave_lazy_mmu();
        state->mode = paravirt_get_lazy_mode();
 }
 
index 453b5795a5c6a23817f8e833f722d6140c6be644..366baa179913dfc3e3e865480e82008a9ab5071a 100644 (file)
  *  Licensed under the terms of the GNU General Public
  *  License version 2. See file COPYING for details.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/pci_ids.h>
 #include <linux/uaccess.h>
 #include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
 #include <linux/pci.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -79,9 +67,6 @@ struct microcode_amd {
 #define UCODE_CONTAINER_SECTION_HDR    8
 #define UCODE_CONTAINER_HEADER_SIZE    12
 
-/* serialize access to the physical write */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static struct equiv_cpu_entry *equiv_cpu_table;
 
 static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
@@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
        return 1;
 }
 
-static void apply_microcode_amd(int cpu)
+static int apply_microcode_amd(int cpu)
 {
-       unsigned long flags;
        u32 rev, dummy;
        int cpu_num = raw_smp_processor_id();
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num;
@@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu)
        BUG_ON(cpu_num != cpu);
 
        if (mc_amd == NULL)
-               return;
+               return 0;
 
-       spin_lock_irqsave(&microcode_update_lock, flags);
        wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code);
        /* get patch id after patching */
        rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy);
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
 
        /* check current patch id and patch's id for match */
        if (rev != mc_amd->hdr.patch_id) {
                printk(KERN_ERR "microcode: CPU%d: update failed "
                       "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id);
-               return;
+               return -1;
        }
 
        printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n",
               cpu, rev);
 
        uci->cpu_sig.rev = rev;
+
+       return 0;
 }
 
 static int get_ucode_data(void *to, const u8 *from, size_t n)
@@ -257,13 +241,12 @@ static int install_equiv_cpu_table(const u8 *buf)
 
 static void free_equiv_cpu_table(void)
 {
-       if (equiv_cpu_table) {
-               vfree(equiv_cpu_table);
-               equiv_cpu_table = NULL;
-       }
+       vfree(equiv_cpu_table);
+       equiv_cpu_table = NULL;
 }
 
-static int generic_load_microcode(int cpu, const u8 *data, size_t size)
+static enum ucode_state
+generic_load_microcode(int cpu, const u8 *data, size_t size)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
        const u8 *ucode_ptr = data;
@@ -272,12 +255,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
        int new_rev = uci->cpu_sig.rev;
        unsigned int leftover;
        unsigned long offset;
+       enum ucode_state state = UCODE_OK;
 
        offset = install_equiv_cpu_table(ucode_ptr);
        if (!offset) {
                printk(KERN_ERR "microcode: failed to create "
                       "equivalent cpu table\n");
-               return -EINVAL;
+               return UCODE_ERROR;
        }
 
        ucode_ptr += offset;
@@ -293,8 +277,7 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 
                mc_header = (struct microcode_header_amd *)mc;
                if (get_matching_microcode(cpu, mc, new_rev)) {
-                       if (new_mc)
-                               vfree(new_mc);
+                       vfree(new_mc);
                        new_rev = mc_header->patch_id;
                        new_mc  = mc;
                } else
@@ -306,34 +289,32 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size)
 
        if (new_mc) {
                if (!leftover) {
-                       if (uci->mc)
-                               vfree(uci->mc);
+                       vfree(uci->mc);
                        uci->mc = new_mc;
                        pr_debug("microcode: CPU%d found a matching microcode "
                                 "update with version 0x%x (current=0x%x)\n",
                                 cpu, new_rev, uci->cpu_sig.rev);
-               } else
+               } else {
                        vfree(new_mc);
-       }
+                       state = UCODE_ERROR;
+               }
+       } else
+               state = UCODE_NFOUND;
 
        free_equiv_cpu_table();
 
-       return (int)leftover;
+       return state;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
        const char *fw_name = "amd-ucode/microcode_amd.bin";
        const struct firmware *firmware;
-       int ret;
-
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
+       enum ucode_state ret;
 
-       ret = request_firmware(&firmware, fw_name, device);
-       if (ret) {
+       if (request_firmware(&firmware, fw_name, device)) {
                printk(KERN_ERR "microcode: failed to load file %s\n", fw_name);
-               return ret;
+               return UCODE_NFOUND;
        }
 
        ret = generic_load_microcode(cpu, firmware->data, firmware->size);
@@ -343,11 +324,12 @@ static int request_microcode_fw(int cpu, struct device *device)
        return ret;
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
        printk(KERN_INFO "microcode: AMD microcode update via "
               "/dev/cpu/microcode not supported\n");
-       return -1;
+       return UCODE_ERROR;
 }
 
 static void microcode_fini_cpu_amd(int cpu)
index 98c470c069d150f26af3125a7690a25689614362..9c4461501fcbb9618ac3ce12fef93f990b8f3955 100644 (file)
  *             Thanks to Stuart Swales for pointing out this bug.
  */
 #include <linux/platform_device.h>
-#include <linux/capability.h>
 #include <linux/miscdevice.h>
-#include <linux/firmware.h>
+#include <linux/capability.h>
 #include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/fs.h>
 #include <linux/mm.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
-#include <asm/msr.h>
 
 MODULE_DESCRIPTION("Microcode Update Driver");
 MODULE_AUTHOR("Tigran Aivazian <tigran@aivazian.fsnet.co.uk>");
@@ -101,36 +92,110 @@ MODULE_LICENSE("GPL");
 
 static struct microcode_ops    *microcode_ops;
 
-/* no concurrent ->write()s are allowed on /dev/cpu/microcode */
+/*
+ * Synchronization.
+ *
+ * All non cpu-hotplug-callback call sites use:
+ *
+ * - microcode_mutex to synchronize with each other;
+ * - get/put_online_cpus() to synchronize with
+ *   the cpu-hotplug-callback call sites.
+ *
+ * We guarantee that only a single cpu is being
+ * updated at any particular moment of time.
+ */
 static DEFINE_MUTEX(microcode_mutex);
 
 struct ucode_cpu_info          ucode_cpu_info[NR_CPUS];
 EXPORT_SYMBOL_GPL(ucode_cpu_info);
 
+/*
+ * Operations that are run on a target cpu:
+ */
+
+struct cpu_info_ctx {
+       struct cpu_signature    *cpu_sig;
+       int                     err;
+};
+
+static void collect_cpu_info_local(void *arg)
+{
+       struct cpu_info_ctx *ctx = arg;
+
+       ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(),
+                                                  ctx->cpu_sig);
+}
+
+static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig)
+{
+       struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 };
+       int ret;
+
+       ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1);
+       if (!ret)
+               ret = ctx.err;
+
+       return ret;
+}
+
+static int collect_cpu_info(int cpu)
+{
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       int ret;
+
+       memset(uci, 0, sizeof(*uci));
+
+       ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig);
+       if (!ret)
+               uci->valid = 1;
+
+       return ret;
+}
+
+struct apply_microcode_ctx {
+       int err;
+};
+
+static void apply_microcode_local(void *arg)
+{
+       struct apply_microcode_ctx *ctx = arg;
+
+       ctx->err = microcode_ops->apply_microcode(smp_processor_id());
+}
+
+static int apply_microcode_on_target(int cpu)
+{
+       struct apply_microcode_ctx ctx = { .err = 0 };
+       int ret;
+
+       ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1);
+       if (!ret)
+               ret = ctx.err;
+
+       return ret;
+}
+
 #ifdef CONFIG_MICROCODE_OLD_INTERFACE
 static int do_microcode_update(const void __user *buf, size_t size)
 {
-       cpumask_t old;
        int error = 0;
        int cpu;
 
-       old = current->cpus_allowed;
-
        for_each_online_cpu(cpu) {
                struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+               enum ucode_state ustate;
 
                if (!uci->valid)
                        continue;
 
-               set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu));
-               error = microcode_ops->request_microcode_user(cpu, buf, size);
-               if (error < 0)
-                       goto out;
-               if (!error)
-                       microcode_ops->apply_microcode(cpu);
+               ustate = microcode_ops->request_microcode_user(cpu, buf, size);
+               if (ustate == UCODE_ERROR) {
+                       error = -1;
+                       break;
+               } else if (ustate == UCODE_OK)
+                       apply_microcode_on_target(cpu);
        }
-out:
-       set_cpus_allowed_ptr(current, &old);
+
        return error;
 }
 
@@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2)
 static ssize_t microcode_write(struct file *file, const char __user *buf,
                               size_t len, loff_t *ppos)
 {
-       ssize_t ret;
+       ssize_t ret = -EINVAL;
 
        if ((len >> PAGE_SHIFT) > num_physpages) {
-               printk(KERN_ERR "microcode: too much data (max %ld pages)\n",
-                      num_physpages);
-               return -EINVAL;
+               pr_err("microcode: too much data (max %ld pages)\n", num_physpages);
+               return ret;
        }
 
        get_online_cpus();
        mutex_lock(&microcode_mutex);
 
-       ret = do_microcode_update(buf, len);
-       if (!ret)
+       if (do_microcode_update(buf, len) == 0)
                ret = (ssize_t)len;
 
        mutex_unlock(&microcode_mutex);
@@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
 }
 
 static const struct file_operations microcode_fops = {
-       .owner          = THIS_MODULE,
-       .write          = microcode_write,
-       .open           = microcode_open,
+       .owner                  = THIS_MODULE,
+       .write                  = microcode_write,
+       .open                   = microcode_open,
 };
 
 static struct miscdevice microcode_dev = {
-       .minor          = MICROCODE_MINOR,
-       .name           = "microcode",
-       .fops           = &microcode_fops,
+       .minor                  = MICROCODE_MINOR,
+       .name                   = "microcode",
+       .fops                   = &microcode_fops,
 };
 
 static int __init microcode_dev_init(void)
@@ -182,9 +245,7 @@ static int __init microcode_dev_init(void)
 
        error = misc_register(&microcode_dev);
        if (error) {
-               printk(KERN_ERR
-                       "microcode: can't misc_register on minor=%d\n",
-                       MICROCODE_MINOR);
+               pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR);
                return error;
        }
 
@@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR);
 /* fake device for request_firmware */
 static struct platform_device  *microcode_pdev;
 
-static long reload_for_cpu(void *unused)
+static int reload_for_cpu(int cpu)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
        int err = 0;
 
        mutex_lock(&microcode_mutex);
        if (uci->valid) {
-               err = microcode_ops->request_microcode_fw(smp_processor_id(),
-                                                         &microcode_pdev->dev);
-               if (!err)
-                       microcode_ops->apply_microcode(smp_processor_id());
+               enum ucode_state ustate;
+
+               ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
+               if (ustate == UCODE_OK)
+                       apply_microcode_on_target(cpu);
+               else
+                       if (ustate == UCODE_ERROR)
+                               err = -EINVAL;
        }
        mutex_unlock(&microcode_mutex);
+
        return err;
 }
 
 static ssize_t reload_store(struct sys_device *dev,
                            struct sysdev_attribute *attr,
-                           const char *buf, size_t sz)
+                           const char *buf, size_t size)
 {
-       char *end;
-       unsigned long val = simple_strtoul(buf, &end, 0);
-       int err = 0;
+       unsigned long val;
        int cpu = dev->id;
+       int ret = 0;
+       char *end;
 
+       val = simple_strtoul(buf, &end, 0);
        if (end == buf)
                return -EINVAL;
+
        if (val == 1) {
                get_online_cpus();
                if (cpu_online(cpu))
-                       err = work_on_cpu(cpu, reload_for_cpu, NULL);
+                       ret = reload_for_cpu(cpu);
                put_online_cpus();
        }
-       if (err)
-               return err;
-       return sz;
+
+       if (!ret)
+               ret = size;
+
+       return ret;
 }
 
 static ssize_t version_show(struct sys_device *dev,
@@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = {
 };
 
 static struct attribute_group mc_attr_group = {
-       .attrs          = mc_default_attrs,
-       .name           = "microcode",
+       .attrs                  = mc_default_attrs,
+       .name                   = "microcode",
 };
 
-static void __microcode_fini_cpu(int cpu)
+static void microcode_fini_cpu(int cpu)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
@@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu)
        uci->valid = 0;
 }
 
-static void microcode_fini_cpu(int cpu)
-{
-       mutex_lock(&microcode_mutex);
-       __microcode_fini_cpu(cpu);
-       mutex_unlock(&microcode_mutex);
-}
-
-static void collect_cpu_info(int cpu)
+static enum ucode_state microcode_resume_cpu(int cpu)
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
-       memset(uci, 0, sizeof(*uci));
-       if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig))
-               uci->valid = 1;
+       if (!uci->mc)
+               return UCODE_NFOUND;
+
+       pr_debug("microcode: CPU%d updated upon resume\n", cpu);
+       apply_microcode_on_target(cpu);
+
+       return UCODE_OK;
 }
 
-static int microcode_resume_cpu(int cpu)
+static enum ucode_state microcode_init_cpu(int cpu)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
-       struct cpu_signature nsig;
+       enum ucode_state ustate;
 
-       pr_debug("microcode: CPU%d resumed\n", cpu);
+       if (collect_cpu_info(cpu))
+               return UCODE_ERROR;
 
-       if (!uci->mc)
-               return 1;
+       /* --dimm. Trigger a delayed update? */
+       if (system_state != SYSTEM_RUNNING)
+               return UCODE_NFOUND;
 
-       /*
-        * Let's verify that the 'cached' ucode does belong
-        * to this cpu (a bit of paranoia):
-        */
-       if (microcode_ops->collect_cpu_info(cpu, &nsig)) {
-               __microcode_fini_cpu(cpu);
-               printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n",
-                               cpu);
-               return -1;
-       }
+       ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
 
-       if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) {
-               __microcode_fini_cpu(cpu);
-               printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n",
-                               cpu);
-               /* Should we look for a new ucode here? */
-               return 1;
+       if (ustate == UCODE_OK) {
+               pr_debug("microcode: CPU%d updated upon init\n", cpu);
+               apply_microcode_on_target(cpu);
        }
 
-       return 0;
+       return ustate;
 }
 
-static long microcode_update_cpu(void *unused)
+static enum ucode_state microcode_update_cpu(int cpu)
 {
-       struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id();
-       int err = 0;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
+       enum ucode_state ustate;
 
-       /*
-        * Check if the system resume is in progress (uci->valid != NULL),
-        * otherwise just request a firmware:
-        */
-       if (uci->valid) {
-               err = microcode_resume_cpu(smp_processor_id());
-       } else {
-               collect_cpu_info(smp_processor_id());
-               if (uci->valid && system_state == SYSTEM_RUNNING)
-                       err = microcode_ops->request_microcode_fw(
-                                       smp_processor_id(),
-                                       &microcode_pdev->dev);
-       }
-       if (!err)
-               microcode_ops->apply_microcode(smp_processor_id());
-       return err;
-}
+       if (uci->valid)
+               ustate = microcode_resume_cpu(cpu);
+       else
+               ustate = microcode_init_cpu(cpu);
 
-static int microcode_init_cpu(int cpu)
-{
-       int err;
-       mutex_lock(&microcode_mutex);
-       err = work_on_cpu(cpu, microcode_update_cpu, NULL);
-       mutex_unlock(&microcode_mutex);
-
-       return err;
+       return ustate;
 }
 
 static int mc_sysdev_add(struct sys_device *sys_dev)
 {
        int err, cpu = sys_dev->id;
-       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
        if (!cpu_online(cpu))
                return 0;
 
        pr_debug("microcode: CPU%d added\n", cpu);
-       memset(uci, 0, sizeof(*uci));
 
        err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
        if (err)
                return err;
 
-       err = microcode_init_cpu(cpu);
+       if (microcode_init_cpu(cpu) == UCODE_ERROR)
+               err = -EINVAL;
 
        return err;
 }
@@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
 static int mc_sysdev_resume(struct sys_device *dev)
 {
        int cpu = dev->id;
+       struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
 
        if (!cpu_online(cpu))
                return 0;
 
-       /* only CPU 0 will apply ucode here */
-       microcode_update_cpu(NULL);
+       /*
+        * All non-bootup cpus are still disabled,
+        * so only CPU 0 will apply ucode here.
+        *
+        * Moreover, there can be no concurrent
+        * updates from any other places at this point.
+        */
+       WARN_ON(cpu != 0);
+
+       if (uci->valid && uci->mc)
+               microcode_ops->apply_microcode(cpu);
+
        return 0;
 }
 
 static struct sysdev_driver mc_sysdev_driver = {
-       .add            = mc_sysdev_add,
-       .remove         = mc_sysdev_remove,
-       .resume         = mc_sysdev_resume,
+       .add                    = mc_sysdev_add,
+       .remove                 = mc_sysdev_remove,
+       .resume                 = mc_sysdev_resume,
 };
 
 static __cpuinit int
@@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
        switch (action) {
        case CPU_ONLINE:
        case CPU_ONLINE_FROZEN:
-               if (microcode_init_cpu(cpu))
-                       printk(KERN_ERR "microcode: failed to init CPU%d\n",
-                              cpu);
+               microcode_update_cpu(cpu);
        case CPU_DOWN_FAILED:
        case CPU_DOWN_FAILED_FROZEN:
                pr_debug("microcode: CPU%d added\n", cpu);
                if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
-                       printk(KERN_ERR "microcode: Failed to create the sysfs "
-                               "group for CPU%d\n", cpu);
+                       pr_err("microcode: Failed to create group for CPU%d\n", cpu);
                break;
        case CPU_DOWN_PREPARE:
        case CPU_DOWN_PREPARE_FROZEN:
@@ -465,13 +508,10 @@ static int __init microcode_init(void)
                microcode_ops = init_amd_microcode();
 
        if (!microcode_ops) {
-               printk(KERN_ERR "microcode: no support for this CPU vendor\n");
+               pr_err("microcode: no support for this CPU vendor\n");
                return -ENODEV;
        }
 
-       error = microcode_dev_init();
-       if (error)
-               return error;
        microcode_pdev = platform_device_register_simple("microcode", -1,
                                                         NULL, 0);
        if (IS_ERR(microcode_pdev)) {
@@ -480,23 +520,31 @@ static int __init microcode_init(void)
        }
 
        get_online_cpus();
+       mutex_lock(&microcode_mutex);
+
        error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver);
+
+       mutex_unlock(&microcode_mutex);
        put_online_cpus();
+
        if (error) {
-               microcode_dev_exit();
                platform_device_unregister(microcode_pdev);
                return error;
        }
 
+       error = microcode_dev_init();
+       if (error)
+               return error;
+
        register_hotcpu_notifier(&mc_cpu_notifier);
 
-       printk(KERN_INFO
-              "Microcode Update Driver: v" MICROCODE_VERSION
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION
               " <tigran@aivazian.fsnet.co.uk>,"
               " Peter Oruba\n");
 
        return 0;
 }
+module_init(microcode_init);
 
 static void __exit microcode_exit(void)
 {
@@ -505,16 +553,17 @@ static void __exit microcode_exit(void)
        unregister_hotcpu_notifier(&mc_cpu_notifier);
 
        get_online_cpus();
+       mutex_lock(&microcode_mutex);
+
        sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver);
+
+       mutex_unlock(&microcode_mutex);
        put_online_cpus();
 
        platform_device_unregister(microcode_pdev);
 
        microcode_ops = NULL;
 
-       printk(KERN_INFO
-              "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
+       pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n");
 }
-
-module_init(microcode_init);
 module_exit(microcode_exit);
index 149b9ec7c1ab72fcd20c2f2ab84cc511b3d707e7..0d334ddd0a9642a2c05485e9e31394f8b1e98a3b 100644 (file)
  *             Fix sigmatch() macro to handle old CPUs with pf == 0.
  *             Thanks to Stuart Swales for pointing out this bug.
  */
-#include <linux/platform_device.h>
-#include <linux/capability.h>
-#include <linux/miscdevice.h>
 #include <linux/firmware.h>
-#include <linux/smp_lock.h>
-#include <linux/spinlock.h>
-#include <linux/cpumask.h>
 #include <linux/uaccess.h>
-#include <linux/vmalloc.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/mutex.h>
-#include <linux/sched.h>
-#include <linux/init.h>
-#include <linux/slab.h>
-#include <linux/cpu.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
+#include <linux/vmalloc.h>
 
 #include <asm/microcode.h>
 #include <asm/processor.h>
@@ -150,13 +137,9 @@ struct extended_sigtable {
 
 #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE)
 
-/* serialize access to the physical write to MSR 0x79 */
-static DEFINE_SPINLOCK(microcode_update_lock);
-
 static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
 {
        struct cpuinfo_x86 *c = &cpu_data(cpu_num);
-       unsigned long flags;
        unsigned int val[2];
 
        memset(csig, 0, sizeof(*csig));
@@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
                csig->pf = 1 << ((val[1] >> 18) & 7);
        }
 
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);
-
        wrmsr(MSR_IA32_UCODE_REV, 0, 0);
        /* see notes above for revision 1.07.  Apparent chip bug */
        sync_core();
        /* get the current revision from MSR 0x8B */
        rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
 
-       pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n",
-                       csig->sig, csig->pf, csig->rev);
+       printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
+                       cpu_num, csig->sig, csig->pf, csig->rev);
 
        return 0;
 }
@@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev)
        return 0;
 }
 
-static void apply_microcode(int cpu)
+static int apply_microcode(int cpu)
 {
        struct microcode_intel *mc_intel;
        struct ucode_cpu_info *uci;
-       unsigned long flags;
        unsigned int val[2];
        int cpu_num;
 
@@ -334,10 +312,7 @@ static void apply_microcode(int cpu)
        BUG_ON(cpu_num != cpu);
 
        if (mc_intel == NULL)
-               return;
-
-       /* serialize access to the physical write to MSR 0x79 */
-       spin_lock_irqsave(&microcode_update_lock, flags);
+               return 0;
 
        /* write microcode via MSR 0x79 */
        wrmsr(MSR_IA32_UCODE_WRITE,
@@ -351,30 +326,32 @@ static void apply_microcode(int cpu)
        /* get the current revision from MSR 0x8B */
        rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
 
-       spin_unlock_irqrestore(&microcode_update_lock, flags);
        if (val[1] != mc_intel->hdr.rev) {
-               printk(KERN_ERR "microcode: CPU%d update from revision "
-                               "0x%x to 0x%x failed\n",
-                       cpu_num, uci->cpu_sig.rev, val[1]);
-               return;
+               printk(KERN_ERR "microcode: CPU%d update "
+                               "to revision 0x%x failed\n",
+                       cpu_num, mc_intel->hdr.rev);
+               return -1;
        }
-       printk(KERN_INFO "microcode: CPU%d updated from revision "
-                        "0x%x to 0x%x, date = %04x-%02x-%02x \n",
-               cpu_num, uci->cpu_sig.rev, val[1],
+       printk(KERN_INFO "microcode: CPU%d updated to revision "
+                        "0x%x, date = %04x-%02x-%02x \n",
+               cpu_num, val[1],
                mc_intel->hdr.date & 0xffff,
                mc_intel->hdr.date >> 24,
                (mc_intel->hdr.date >> 16) & 0xff);
 
        uci->cpu_sig.rev = val[1];
+
+       return 0;
 }
 
-static int generic_load_microcode(int cpu, void *data, size_t size,
-               int (*get_ucode_data)(void *, const void *, size_t))
+static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
+                               int (*get_ucode_data)(void *, const void *, size_t))
 {
        struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
        u8 *ucode_ptr = data, *new_mc = NULL, *mc;
        int new_rev = uci->cpu_sig.rev;
        unsigned int leftover = size;
+       enum ucode_state state = UCODE_OK;
 
        while (leftover) {
                struct microcode_header_intel mc_header;
@@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
                leftover  -= mc_size;
        }
 
-       if (!new_mc)
+       if (leftover) {
+               if (new_mc)
+                       vfree(new_mc);
+               state = UCODE_ERROR;
                goto out;
+       }
 
-       if (leftover) {
-               vfree(new_mc);
+       if (!new_mc) {
+               state = UCODE_NFOUND;
                goto out;
        }
 
@@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size,
        pr_debug("microcode: CPU%d found a matching microcode update with"
                 " version 0x%x (current=0x%x)\n",
                        cpu, new_rev, uci->cpu_sig.rev);
-
- out:
-       return (int)leftover;
+out:
+       return state;
 }
 
 static int get_ucode_fw(void *to, const void *from, size_t n)
@@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n)
        return 0;
 }
 
-static int request_microcode_fw(int cpu, struct device *device)
+static enum ucode_state request_microcode_fw(int cpu, struct device *device)
 {
        char name[30];
        struct cpuinfo_x86 *c = &cpu_data(cpu);
        const struct firmware *firmware;
-       int ret;
+       enum ucode_state ret;
 
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
        sprintf(name, "intel-ucode/%02x-%02x-%02x",
                c->x86, c->x86_model, c->x86_mask);
-       ret = request_firmware(&firmware, name, device);
-       if (ret) {
+
+       if (request_firmware(&firmware, name, device)) {
                pr_debug("microcode: data file %s load failed\n", name);
-               return ret;
+               return UCODE_NFOUND;
        }
 
        ret = generic_load_microcode(cpu, (void *)firmware->data,
@@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n)
        return copy_from_user(to, from, n);
 }
 
-static int request_microcode_user(int cpu, const void __user *buf, size_t size)
+static enum ucode_state
+request_microcode_user(int cpu, const void __user *buf, size_t size)
 {
-       /* We should bind the task to the CPU */
-       BUG_ON(cpu != raw_smp_processor_id());
-
        return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user);
 }
 
index 70fd7e414c1544aadf5de5573e67294e88cac16c..651c93b28862a5ea04f3fdfc883533ff843b2583 100644 (file)
@@ -17,6 +17,7 @@
 #include <linux/acpi.h>
 #include <linux/module.h>
 #include <linux/smp.h>
+#include <linux/pci.h>
 
 #include <asm/mtrr.h>
 #include <asm/mpspec.h>
@@ -870,24 +871,17 @@ static
 inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
 #endif /* CONFIG_X86_IO_APIC */
 
-static int check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length,
-                     int count)
+static int
+check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
 {
-       if (!mpc_new_phys) {
-               pr_info("No spare slots, try to append...take your risk, "
-                       "new mpc_length %x\n", count);
-       } else {
-               if (count <= mpc_new_length)
-                       pr_info("No spare slots, try to append..., "
-                               "new mpc_length %x\n", count);
-               else {
-                       pr_err("mpc_new_length %lx is too small\n",
-                               mpc_new_length);
-                       return -1;
-               }
+       int ret = 0;
+
+       if (!mpc_new_phys || count <= mpc_new_length) {
+               WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
+               return -1;
        }
 
-       return 0;
+       return ret;
 }
 
 static int  __init replace_intsrc_all(struct mpc_table *mpc,
@@ -946,7 +940,7 @@ static int  __init replace_intsrc_all(struct mpc_table *mpc,
                } else {
                        struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
                        count += sizeof(struct mpc_intsrc);
-                       if (!check_slot(mpc_new_phys, mpc_new_length, count))
+                       if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
                                goto out;
                        assign_to_mpc_intsrc(&mp_irqs[i], m);
                        mpc->length = count;
@@ -963,11 +957,14 @@ out:
        return 0;
 }
 
-static int __initdata enable_update_mptable;
+int enable_update_mptable;
 
 static int __init update_mptable_setup(char *str)
 {
        enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
        return 0;
 }
 early_param("update_mptable", update_mptable_setup);
@@ -980,6 +977,9 @@ static int __initdata alloc_mptable;
 static int __init parse_alloc_mptable_opt(char *p)
 {
        enable_update_mptable = 1;
+#ifdef CONFIG_PCI
+       pci_routeirq = 1;
+#endif
        alloc_mptable = 1;
        if (!p)
                return 0;
index 9faf43bea3361cf178f53942e8f37a8842737afc..70ec9b951d76e0eb21a79f124b9adebe8a4b60be 100644 (file)
@@ -248,18 +248,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA
 
 static inline void enter_lazy(enum paravirt_lazy_mode mode)
 {
-       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
-       BUG_ON(preemptible());
+       BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE);
 
-       __get_cpu_var(paravirt_lazy_mode) = mode;
+       percpu_write(paravirt_lazy_mode, mode);
 }
 
-void paravirt_leave_lazy(enum paravirt_lazy_mode mode)
+static void leave_lazy(enum paravirt_lazy_mode mode)
 {
-       BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode);
-       BUG_ON(preemptible());
+       BUG_ON(percpu_read(paravirt_lazy_mode) != mode);
 
-       __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE;
+       percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE);
 }
 
 void paravirt_enter_lazy_mmu(void)
@@ -269,22 +267,36 @@ void paravirt_enter_lazy_mmu(void)
 
 void paravirt_leave_lazy_mmu(void)
 {
-       paravirt_leave_lazy(PARAVIRT_LAZY_MMU);
+       leave_lazy(PARAVIRT_LAZY_MMU);
 }
 
-void paravirt_enter_lazy_cpu(void)
+void paravirt_start_context_switch(struct task_struct *prev)
 {
+       BUG_ON(preemptible());
+
+       if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) {
+               arch_leave_lazy_mmu_mode();
+               set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES);
+       }
        enter_lazy(PARAVIRT_LAZY_CPU);
 }
 
-void paravirt_leave_lazy_cpu(void)
+void paravirt_end_context_switch(struct task_struct *next)
 {
-       paravirt_leave_lazy(PARAVIRT_LAZY_CPU);
+       BUG_ON(preemptible());
+
+       leave_lazy(PARAVIRT_LAZY_CPU);
+
+       if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES))
+               arch_enter_lazy_mmu_mode();
 }
 
 enum paravirt_lazy_mode paravirt_get_lazy_mode(void)
 {
-       return __get_cpu_var(paravirt_lazy_mode);
+       if (in_interrupt())
+               return PARAVIRT_LAZY_NONE;
+
+       return percpu_read(paravirt_lazy_mode);
 }
 
 void arch_flush_lazy_mmu_mode(void)
@@ -292,7 +304,6 @@ void arch_flush_lazy_mmu_mode(void)
        preempt_disable();
 
        if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
-               WARN_ON(preempt_count() == 1);
                arch_leave_lazy_mmu_mode();
                arch_enter_lazy_mmu_mode();
        }
@@ -300,19 +311,6 @@ void arch_flush_lazy_mmu_mode(void)
        preempt_enable();
 }
 
-void arch_flush_lazy_cpu_mode(void)
-{
-       preempt_disable();
-
-       if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) {
-               WARN_ON(preempt_count() == 1);
-               arch_leave_lazy_cpu_mode();
-               arch_enter_lazy_cpu_mode();
-       }
-
-       preempt_enable();
-}
-
 struct pv_info pv_info = {
        .name = "bare hardware",
        .paravirt_enabled = 0,
@@ -404,10 +402,8 @@ struct pv_cpu_ops pv_cpu_ops = {
        .set_iopl_mask = native_set_iopl_mask,
        .io_delay = native_io_delay,
 
-       .lazy_mode = {
-               .enter = paravirt_nop,
-               .leave = paravirt_nop,
-       },
+       .start_context_switch = paravirt_nop,
+       .end_context_switch = paravirt_nop,
 };
 
 struct pv_apic_ops pv_apic_ops = {
index 755c21e906f3f04821755c703bb99e26b118e0f7..971a3bec47a8644fdd3cbbcebb833a0ad2cc3044 100644 (file)
@@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = {
 
 static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, };
 
-/* enable this to stress test the chip's TCE cache */
-#ifdef CONFIG_IOMMU_DEBUG
-static int debugging = 1;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       unsigned long idx = start;
-
-       BUG_ON(start >= end);
-
-       while (idx < end) {
-               if (!!test_bit(idx, bitmap) != expected)
-                       return idx;
-               ++idx;
-       }
-
-       /* all bits have the expected value */
-       return ~0UL;
-}
-#else /* debugging is disabled */
-static int debugging;
-
-static inline unsigned long verify_bit_range(unsigned long* bitmap,
-       int expected, unsigned long start, unsigned long end)
-{
-       return ~0UL;
-}
-
-#endif /* CONFIG_IOMMU_DEBUG */
-
 static inline int translation_enabled(struct iommu_table *tbl)
 {
        /* only PHBs with translation enabled have an IOMMU table */
@@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 {
        unsigned long index;
        unsigned long end;
-       unsigned long badbit;
        unsigned long flags;
 
        index = start_addr >> PAGE_SHIFT;
@@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl,
 
        spin_lock_irqsave(&tbl->it_lock, flags);
 
-       badbit = verify_bit_range(tbl->it_map, 0, index, end);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: entry already allocated at "
-                              "0x%lx tbl %p dma 0x%lx npages %u\n",
-                              badbit, tbl, start_addr, npages);
-       }
-
        iommu_area_reserve(tbl->it_map, index, npages);
 
        spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
        unsigned int npages)
 {
        unsigned long entry;
-       unsigned long badbit;
        unsigned long badend;
        unsigned long flags;
 
@@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 
        spin_lock_irqsave(&tbl->it_lock, flags);
 
-       badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages);
-       if (badbit != ~0UL) {
-               if (printk_ratelimit())
-                       printk(KERN_ERR "Calgary: bit is off at 0x%lx "
-                              "tbl %p dma 0x%Lx entry 0x%lx npages %u\n",
-                              badbit, tbl, dma_addr, entry, npages);
-       }
-
        iommu_area_free(tbl->it_map, entry, npages);
 
        spin_unlock_irqrestore(&tbl->it_lock, flags);
@@ -1488,9 +1439,8 @@ void __init detect_calgary(void)
                iommu_detected = 1;
                calgary_detected = 1;
                printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n");
-               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, "
-                      "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size,
-                      debugging ? "enabled" : "disabled");
+               printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
+                      specified_table_size);
 
                /* swiotlb for devices that aren't behind the Calgary. */
                if (max_pfn > MAX_DMA32_PFN)
index b284b58c035ccdd8fc850604cced3557ae7de2c5..cfd9f90638967e03277ad510625905213322bfd4 100644 (file)
@@ -144,48 +144,21 @@ static void flush_gart(void)
 }
 
 #ifdef CONFIG_IOMMU_LEAK
-
-#define SET_LEAK(x)                                                    \
-       do {                                                            \
-               if (iommu_leak_tab)                                     \
-                       iommu_leak_tab[x] = __builtin_return_address(0);\
-       } while (0)
-
-#define CLEAR_LEAK(x)                                                  \
-       do {                                                            \
-               if (iommu_leak_tab)                                     \
-                       iommu_leak_tab[x] = NULL;                       \
-       } while (0)
-
 /* Debugging aid for drivers that don't free their IOMMU tables */
-static void **iommu_leak_tab;
 static int leak_trace;
 static int iommu_leak_pages = 20;
 
 static void dump_leak(void)
 {
-       int i;
        static int dump;
 
-       if (dump || !iommu_leak_tab)
+       if (dump)
                return;
        dump = 1;
-       show_stack(NULL, NULL);
 
-       /* Very crude. dump some from the end of the table too */
-       printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n",
-              iommu_leak_pages);
-       for (i = 0; i < iommu_leak_pages; i += 2) {
-               printk(KERN_DEBUG "%lu: ", iommu_pages-i);
-               printk_address((unsigned long) iommu_leak_tab[iommu_pages-i],
-                               0);
-               printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' ');
-       }
-       printk(KERN_DEBUG "\n");
+       show_stack(NULL, NULL);
+       debug_dma_dump_mappings(NULL);
 }
-#else
-# define SET_LEAK(x)
-# define CLEAR_LEAK(x)
 #endif
 
 static void iommu_full(struct device *dev, size_t size, int dir)
@@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
 
        for (i = 0; i < npages; i++) {
                iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem);
-               SET_LEAK(iommu_page + i);
                phys_mem += PAGE_SIZE;
        }
        return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK);
@@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr,
        npages = iommu_num_pages(dma_addr, size, PAGE_SIZE);
        for (i = 0; i < npages; i++) {
                iommu_gatt_base[iommu_page + i] = gart_unmapped_entry;
-               CLEAR_LEAK(iommu_page + i);
        }
        free_iommu(iommu_page, npages);
 }
@@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start,
                pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE);
                while (pages--) {
                        iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr);
-                       SET_LEAK(iommu_page);
                        addr += PAGE_SIZE;
                        iommu_page++;
                }
@@ -688,8 +658,6 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
 
        agp_gatt_table = gatt;
 
-       enable_gart_translations();
-
        error = sysdev_class_register(&gart_sysdev_class);
        if (!error)
                error = sysdev_register(&device_gart);
@@ -801,11 +769,12 @@ void __init gart_iommu_init(void)
 
 #ifdef CONFIG_IOMMU_LEAK
        if (leak_trace) {
-               iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO,
-                                 get_order(iommu_pages*sizeof(void *)));
-               if (!iommu_leak_tab)
+               int ret;
+
+               ret = dma_debug_resize_entries(iommu_pages);
+               if (ret)
                        printk(KERN_DEBUG
-                              "PCI-DMA: Cannot allocate leak trace area\n");
+                              "PCI-DMA: Cannot trace all the entries\n");
        }
 #endif
 
@@ -845,6 +814,14 @@ void __init gart_iommu_init(void)
         * the pages as Not-Present:
         */
        wbinvd();
+       
+       /*
+        * Now all caches are flushed and we can safely enable
+        * GART hardware.  Doing it early leaves the possibility
+        * of stale cache entries that can lead to GART PTE
+        * errors.
+        */
+       enable_gart_translations();
 
        /*
         * Try to workaround a bug (thanks to BenH):
index 221a3853e2684b111c36669ee688f62ba51cdb10..a1712f2b50f1b974a1a346bd4d8db1ec21f02068 100644 (file)
@@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
        return paddr;
 }
 
-phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
        return baddr;
 }
index ca989158e847936e893c7110b4a328ea1549d452..3bb2be1649bddb5b3ba9870553b72ce73e41b460 100644 (file)
@@ -8,12 +8,15 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
+#include <linux/random.h>
 #include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
+#include <asm/syscalls.h>
 #include <asm/idle.h>
 #include <asm/uaccess.h>
 #include <asm/i387.h>
+#include <asm/ds.h>
 
 unsigned long idle_halt;
 EXPORT_SYMBOL(idle_halt);
@@ -45,6 +48,8 @@ void free_thread_xstate(struct task_struct *tsk)
                kmem_cache_free(task_xstate_cachep, tsk->thread.xstate);
                tsk->thread.xstate = NULL;
        }
+
+       WARN(tsk->thread.ds_ctx, "leaking DS context\n");
 }
 
 void free_thread_info(struct thread_info *ti)
@@ -83,8 +88,6 @@ void exit_thread(void)
                put_cpu();
                kfree(bp);
        }
-
-       ds_exit_thread(current);
 }
 
 void flush_thread(void)
@@ -613,3 +616,16 @@ static int __init idle_setup(char *str)
 }
 early_param("idle", idle_setup);
 
+unsigned long arch_align_stack(unsigned long sp)
+{
+       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
+               sp -= get_random_int() % 8192;
+       return sp & ~0xf;
+}
+
+unsigned long arch_randomize_brk(struct mm_struct *mm)
+{
+       unsigned long range_end = mm->brk + 0x02000000;
+       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
+}
+
index 76f8f84043a2a4693123648d92d08c22ca7f5f25..59f4524984afacd10b456dcdca4abbc2237d7ecb 100644 (file)
@@ -9,8 +9,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
@@ -33,7 +31,6 @@
 #include <linux/module.h>
 #include <linux/kallsyms.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/personality.h>
 #include <linux/tick.h>
 #include <linux/percpu.h>
@@ -290,7 +287,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                p->thread.io_bitmap_max = 0;
        }
 
-       ds_copy_thread(p, current);
+       clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+       p->thread.ds_ctx = NULL;
 
        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;
@@ -407,7 +405,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
-       arch_leave_lazy_cpu_mode();
+       arch_end_context_switch(next_p);
 
        /* If the task has used fpu the last 5 timeslices, just do a full
         * restore of the math state immediately to avoid the trap; the
@@ -497,15 +495,3 @@ unsigned long get_wchan(struct task_struct *p)
        return 0;
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-       unsigned long range_end = mm->brk + 0x02000000;
-       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
index b751a41392b1b997d3c9a3235ba85b2372bdf7a3..ebefb5407b9d36f5ccbb1c458708fb8a647ff19b 100644 (file)
@@ -14,8 +14,6 @@
  * This file handles the architecture-dependent parts of process handling..
  */
 
-#include <stdarg.h>
-
 #include <linux/stackprotector.h>
 #include <linux/cpu.h>
 #include <linux/errno.h>
@@ -32,7 +30,6 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
-#include <linux/random.h>
 #include <linux/notifier.h>
 #include <linux/kprobes.h>
 #include <linux/kdebug.h>
@@ -335,7 +332,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
                        goto out;
        }
 
-       ds_copy_thread(p, me);
+       clear_tsk_thread_flag(p, TIF_DS_AREA_MSR);
+       p->thread.ds_ctx = NULL;
 
        clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR);
        p->thread.debugctlmsr = 0;
@@ -428,7 +426,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
         * done before math_state_restore, so the TS bit is up
         * to date.
         */
-       arch_leave_lazy_cpu_mode();
+       arch_end_context_switch(next_p);
 
        /*
         * Switch FS and GS.
@@ -660,15 +658,3 @@ long sys_arch_prctl(int code, unsigned long addr)
        return do_arch_prctl(current, code, addr);
 }
 
-unsigned long arch_align_stack(unsigned long sp)
-{
-       if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
-               sp -= get_random_int() % 8192;
-       return sp & ~0xf;
-}
-
-unsigned long arch_randomize_brk(struct mm_struct *mm)
-{
-       unsigned long range_end = mm->brk + 0x02000000;
-       return randomize_range(mm->brk, range_end, 0) ? : mm->brk;
-}
index 23b7c8f017e2afa74194c81c7720724a2604751b..09ecbde91c1354e036751f71e2d5612057fe3626 100644 (file)
@@ -21,6 +21,7 @@
 #include <linux/audit.h>
 #include <linux/seccomp.h>
 #include <linux/signal.h>
+#include <linux/workqueue.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -578,17 +579,130 @@ static int ioperm_get(struct task_struct *target,
 }
 
 #ifdef CONFIG_X86_PTRACE_BTS
+/*
+ * A branch trace store context.
+ *
+ * Contexts may only be installed by ptrace_bts_config() and only for
+ * ptraced tasks.
+ *
+ * Contexts are destroyed when the tracee is detached from the tracer.
+ * The actual destruction work requires interrupts enabled, so the
+ * work is deferred and will be scheduled during __ptrace_unlink().
+ *
+ * Contexts hold an additional task_struct reference on the traced
+ * task, as well as a reference on the tracer's mm.
+ *
+ * Ptrace already holds a task_struct reference for the duration of ptrace
+ * operations, but since destruction is deferred, it may be executed after
+ * both tracer and tracee have exited.
+ */
+struct bts_context {
+       /* The branch trace handle. */
+       struct bts_tracer       *tracer;
+
+       /* The buffer used to store the branch trace and its size. */
+       void                    *buffer;
+       unsigned int            size;
+
+       /* The mm that paid for the above buffer (a reference is held). */
+       struct mm_struct        *mm;
+
+       /* The task this context belongs to (a reference is held). */
+       struct task_struct      *task;
+
+       /* The signal to send on a bts buffer overflow. */
+       unsigned int            bts_ovfl_signal;
+
+       /* The work struct used to defer destruction of the context. */
+       struct work_struct      work;
+};
+
+static int alloc_bts_buffer(struct bts_context *context, unsigned int size)
+{
+       void *buffer;
+       int err;
+
+       /* Account the buffer to current's mm before allocating it. */
+       err = account_locked_memory(current->mm, current->signal->rlim, size);
+       if (err < 0)
+               return err;
+
+       buffer = kzalloc(size, GFP_KERNEL);
+       if (!buffer)
+               goto out_refund;
+
+       context->buffer = buffer;
+       context->size = size;
+       context->mm = get_task_mm(current);     /* put by free_bts_buffer() */
+       return 0;
+
+ out_refund:
+       refund_locked_memory(current->mm, size);
+       return -ENOMEM;         /* err is 0 here: the accounting had succeeded */
+}
+
+static inline void free_bts_buffer(struct bts_context *context)
+{
+       /* Nothing to undo unless a buffer is currently attached. */
+       if (context->buffer) {
+               kfree(context->buffer);
+               context->buffer = NULL;
+
+               /* Refund the mm that paid for the buffer, then drop it. */
+               refund_locked_memory(context->mm, context->size);
+               context->size = 0;
+               mmput(context->mm);
+               context->mm = NULL;
+       }
+}
+
+static void free_bts_context_work(struct work_struct *w)
+{
+       struct bts_context *ctx = container_of(w, struct bts_context, work);
+
+       /* Deferred teardown: tracer first, then the task ref and buffer. */
+       ds_release_bts(ctx->tracer);
+       put_task_struct(ctx->task);
+       free_bts_buffer(ctx);
+       /* The work item is embedded in the context, so free it last. */
+       kfree(ctx);
+}
+
+static inline void free_bts_context(struct bts_context *context)
+{
+       INIT_WORK(&context->work, free_bts_context_work); /* deferred teardown */
+       schedule_work(&context->work);  /* runs free_bts_context_work() later */
+}
+
+static inline struct bts_context *alloc_bts_context(struct task_struct *task)
+{
+       struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL);
+
+       if (!context)
+               return NULL;
+       /* The context pins the task until free_bts_context_work() runs. */
+       get_task_struct(task);
+       context->task = task;
+       task->bts = context;
+       return context;
+}
+
 static int ptrace_bts_read_record(struct task_struct *child, size_t index,
                                  struct bts_struct __user *out)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        struct bts_struct bts;
        const unsigned char *at;
        int error;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        at = trace->ds.top - ((index + 1) * trace->ds.size);
        if ((void *)at < trace->ds.begin)
@@ -597,7 +711,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index,
        if (!trace->read)
                return -EOPNOTSUPP;
 
-       error = trace->read(child->bts, at, &bts);
+       error = trace->read(context->tracer, at, &bts);
        if (error < 0)
                return error;
 
@@ -611,13 +725,18 @@ static int ptrace_bts_drain(struct task_struct *child,
                            long size,
                            struct bts_struct __user *out)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        const unsigned char *at;
        int error, drained = 0;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        if (!trace->read)
                return -EOPNOTSUPP;
@@ -628,9 +747,8 @@ static int ptrace_bts_drain(struct task_struct *child,
        for (at = trace->ds.begin; (void *)at < trace->ds.top;
             out++, drained++, at += trace->ds.size) {
                struct bts_struct bts;
-               int error;
 
-               error = trace->read(child->bts, at, &bts);
+               error = trace->read(context->tracer, at, &bts);
                if (error < 0)
                        return error;
 
@@ -640,35 +758,18 @@ static int ptrace_bts_drain(struct task_struct *child,
 
        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-       error = ds_reset_bts(child->bts);
+       error = ds_reset_bts(context->tracer);
        if (error < 0)
                return error;
 
        return drained;
 }
 
-static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size)
-{
-       child->bts_buffer = alloc_locked_buffer(size);
-       if (!child->bts_buffer)
-               return -ENOMEM;
-
-       child->bts_size = size;
-
-       return 0;
-}
-
-static void ptrace_bts_free_buffer(struct task_struct *child)
-{
-       free_locked_buffer(child->bts_buffer, child->bts_size);
-       child->bts_buffer = NULL;
-       child->bts_size = 0;
-}
-
 static int ptrace_bts_config(struct task_struct *child,
                             long cfg_size,
                             const struct ptrace_bts_config __user *ucfg)
 {
+       struct bts_context *context;
        struct ptrace_bts_config cfg;
        unsigned int flags = 0;
 
@@ -678,28 +779,33 @@ static int ptrace_bts_config(struct task_struct *child,
        if (copy_from_user(&cfg, ucfg, sizeof(cfg)))
                return -EFAULT;
 
-       if (child->bts) {
-               ds_release_bts(child->bts);
-               child->bts = NULL;
-       }
+       context = child->bts;
+       if (!context)
+               context = alloc_bts_context(child);
+       if (!context)
+               return -ENOMEM;
 
        if (cfg.flags & PTRACE_BTS_O_SIGNAL) {
                if (!cfg.signal)
                        return -EINVAL;
 
-               child->thread.bts_ovfl_signal = cfg.signal;
                return -EOPNOTSUPP;
+               context->bts_ovfl_signal = cfg.signal;
        }
 
-       if ((cfg.flags & PTRACE_BTS_O_ALLOC) &&
-           (cfg.size != child->bts_size)) {
-               int error;
+       ds_release_bts(context->tracer);
+       context->tracer = NULL;
 
-               ptrace_bts_free_buffer(child);
+       if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) {
+               int err;
 
-               error = ptrace_bts_allocate_buffer(child, cfg.size);
-               if (error < 0)
-                       return error;
+               free_bts_buffer(context);
+               if (!cfg.size)
+                       return 0;
+
+               err = alloc_bts_buffer(context, cfg.size);
+               if (err < 0)
+                       return err;
        }
 
        if (cfg.flags & PTRACE_BTS_O_TRACE)
@@ -708,15 +814,14 @@ static int ptrace_bts_config(struct task_struct *child,
        if (cfg.flags & PTRACE_BTS_O_SCHED)
                flags |= BTS_TIMESTAMPS;
 
-       child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size,
-                                   /* ovfl = */ NULL, /* th = */ (size_t)-1,
-                                   flags);
-       if (IS_ERR(child->bts)) {
-               int error = PTR_ERR(child->bts);
-
-               ptrace_bts_free_buffer(child);
-               child->bts = NULL;
+       context->tracer =
+               ds_request_bts_task(child, context->buffer, context->size,
+                                   NULL, (size_t)-1, flags);
+       if (unlikely(IS_ERR(context->tracer))) {
+               int error = PTR_ERR(context->tracer);
 
+               free_bts_buffer(context);
+               context->tracer = NULL;
                return error;
        }
 
@@ -727,20 +832,25 @@ static int ptrace_bts_status(struct task_struct *child,
                             long cfg_size,
                             struct ptrace_bts_config __user *ucfg)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
        struct ptrace_bts_config cfg;
 
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
        if (cfg_size < sizeof(cfg))
                return -EIO;
 
-       trace = ds_read_bts(child->bts);
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        memset(&cfg, 0, sizeof(cfg));
-       cfg.size = trace->ds.end - trace->ds.begin;
-       cfg.signal = child->thread.bts_ovfl_signal;
-       cfg.bts_size = sizeof(struct bts_struct);
+       cfg.size        = trace->ds.end - trace->ds.begin;
+       cfg.signal      = context->bts_ovfl_signal;
+       cfg.bts_size    = sizeof(struct bts_struct);
 
        if (cfg.signal)
                cfg.flags |= PTRACE_BTS_O_SIGNAL;
@@ -759,80 +869,51 @@ static int ptrace_bts_status(struct task_struct *child,
 
 static int ptrace_bts_clear(struct task_struct *child)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size);
 
-       return ds_reset_bts(child->bts);
+       return ds_reset_bts(context->tracer);
 }
 
 static int ptrace_bts_size(struct task_struct *child)
 {
+       struct bts_context *context;
        const struct bts_trace *trace;
 
-       trace = ds_read_bts(child->bts);
+       context = child->bts;
+       if (!context)
+               return -ESRCH;
+
+       trace = ds_read_bts(context->tracer);
        if (!trace)
-               return -EPERM;
+               return -ESRCH;
 
        return (trace->ds.top - trace->ds.begin) / trace->ds.size;
 }
 
-static void ptrace_bts_fork(struct task_struct *tsk)
-{
-       tsk->bts = NULL;
-       tsk->bts_buffer = NULL;
-       tsk->bts_size = 0;
-       tsk->thread.bts_ovfl_signal = 0;
-}
-
-static void ptrace_bts_untrace(struct task_struct *child)
+/*
+ * Called from __ptrace_unlink() after the child has been moved back
+ * to its original parent.
+ */
+void ptrace_bts_untrace(struct task_struct *child)
 {
        if (unlikely(child->bts)) {
-               ds_release_bts(child->bts);
+               free_bts_context(child->bts);
                child->bts = NULL;
-
-               /* We cannot update total_vm and locked_vm since
-                  child's mm is already gone. But we can reclaim the
-                  memory. */
-               kfree(child->bts_buffer);
-               child->bts_buffer = NULL;
-               child->bts_size = 0;
        }
 }
-
-static void ptrace_bts_detach(struct task_struct *child)
-{
-       /*
-        * Ptrace_detach() races with ptrace_untrace() in case
-        * the child dies and is reaped by another thread.
-        *
-        * We only do the memory accounting at this point and
-        * leave the buffer deallocation and the bts tracer
-        * release to ptrace_bts_untrace() which will be called
-        * later on with tasklist_lock held.
-        */
-       release_locked_buffer(child->bts_buffer, child->bts_size);
-}
-#else
-static inline void ptrace_bts_fork(struct task_struct *tsk) {}
-static inline void ptrace_bts_detach(struct task_struct *child) {}
-static inline void ptrace_bts_untrace(struct task_struct *child) {}
 #endif /* CONFIG_X86_PTRACE_BTS */
 
-void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-       ptrace_bts_fork(child);
-}
-
-void x86_ptrace_untrace(struct task_struct *child)
-{
-       ptrace_bts_untrace(child);
-}
-
 /*
  * Called by kernel/ptrace.c when detaching..
  *
@@ -844,7 +925,6 @@ void ptrace_disable(struct task_struct *child)
 #ifdef TIF_SYSCALL_EMU
        clear_tsk_thread_flag(child, TIF_SYSCALL_EMU);
 #endif
-       ptrace_bts_detach(child);
 }
 
 #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
index 7563b31b4f0349f45bf324053a67ecb506a4fced..af71d06624bf5b70b5173ba29238c40b6c21c801 100644 (file)
@@ -491,5 +491,42 @@ void force_hpet_resume(void)
                break;
        }
 }
+#endif
+
+#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
+/* Set correct numa_node information for AMD NB functions */
+static void __init quirk_amd_nb_node(struct pci_dev *dev)
+{
+       struct pci_dev *nb_ht;
+       unsigned int devfn;
+       u32 val;
+
+       devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);     /* function 0, same slot */
+       nb_ht = pci_get_slot(dev->bus, devfn);
+       if (!nb_ht)
+               return;
+
+       pci_read_config_dword(nb_ht, 0x60, &val);
+       set_dev_node(&dev->dev, val & 7);       /* node id: low 3 bits of 0x60 */
+       pci_dev_put(nb_ht);     /* drop the reference pci_get_slot() took */
+}
 
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC,
+                       quirk_amd_nb_node);
+DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK,
+                       quirk_amd_nb_node);
 #endif
index 667188e0b5a0bfac320d69a0f734dda03b5bee88..d2d1ce8170f06c8573ac022f4e49dfc59a371c9b 100644 (file)
@@ -192,6 +192,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
                        DMI_MATCH(DMI_BOARD_NAME, "0KP561"),
                },
        },
+       {   /* Handle problems with rebooting on Dell Optiplex 360 with 0T656F */
+               .callback = set_bios_reboot,
+               .ident = "Dell OptiPlex 360",
+               .matches = {
+                       DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
+                       DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 360"),
+                       DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
+               },
+       },
        {       /* Handle problems with rebooting on Dell 2400's */
                .callback = set_bios_reboot,
                .ident = "Dell PowerEdge 2400",
index b4158439bf634d254852cceab2c30d26f943f7ea..d1c636bf31a71018d73f7e36616d2c7d533707ea 100644 (file)
 #define ARCH_SETUP
 #endif
 
+/*
+ * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
+ * The direct mapping extends to max_pfn_mapped, so that we can directly access
+ * apertures, ACPI and other tables without having to play with fixmaps.
+ */
+unsigned long max_low_pfn_mapped;
+unsigned long max_pfn_mapped;
+
 RESERVE_BRK(dmi_alloc, 65536);
 
 unsigned int boot_cpu_id __read_mostly;
@@ -214,8 +222,8 @@ unsigned long mmu_cr4_features;
 unsigned long mmu_cr4_features = X86_CR4_PAE;
 #endif
 
-/* Boot loader ID as an integer, for the benefit of proc_dointvec */
-int bootloader_type;
+/* Boot loader ID and version as integers, for the benefit of proc_dointvec */
+int bootloader_type, bootloader_version;
 
 /*
  * Setup options
@@ -706,6 +714,12 @@ void __init setup_arch(char **cmdline_p)
 #endif
        saved_video_mode = boot_params.hdr.vid_mode;
        bootloader_type = boot_params.hdr.type_of_loader;
+       if ((bootloader_type >> 4) == 0xe) {
+               bootloader_type &= 0xf;
+               bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4;
+       }
+       bootloader_version  = bootloader_type & 0xf;
+       bootloader_version |= boot_params.hdr.ext_loader_ver << 4;
 
 #ifdef CONFIG_BLK_DEV_RAM
        rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK;
@@ -854,12 +868,16 @@ void __init setup_arch(char **cmdline_p)
                max_low_pfn = max_pfn;
 
        high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
+       max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
 #endif
 
 #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
        setup_bios_corruption_check();
 #endif
 
+       printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
+                       max_pfn_mapped<<PAGE_SHIFT);
+
        reserve_brk();
 
        /* max_pfn_mapped is updated here */
@@ -996,24 +1014,6 @@ void __init setup_arch(char **cmdline_p)
 
 #ifdef CONFIG_X86_32
 
-/**
- * x86_quirk_pre_intr_init - initialisation prior to setting up interrupt vectors
- *
- * Description:
- *     Perform any necessary interrupt initialisation prior to setting up
- *     the "ordinary" interrupt call gates.  For legacy reasons, the ISA
- *     interrupts should be initialised here if the machine emulates a PC
- *     in any way.
- **/
-void __init x86_quirk_pre_intr_init(void)
-{
-       if (x86_quirks->arch_pre_intr_init) {
-               if (x86_quirks->arch_pre_intr_init())
-                       return;
-       }
-       init_ISA_irqs();
-}
-
 /**
  * x86_quirk_intr_init - post gate setup interrupt initialisation
  *
index 8f0e13be36b31d1a80c94dec2692504d2a9a86db..9c3f0823e6aa00ea93bec0babdfd2c0b19971d13 100644 (file)
@@ -425,6 +425,14 @@ void __init setup_per_cpu_areas(void)
        early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
 #endif
 
+#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA)
+       /*
+        * make sure boot cpu node_number is right, when boot cpu is on the
+        * node that doesn't have mem installed
+        */
+       per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id);
+#endif
+
        /* Setup node to cpumask map */
        setup_node_to_cpumask_map();
 
index 3b2e55e8ad2b8c2906e01aed063d0db9faee2ef5..28f5fb495a669e9b6843daf33a37baaac512fad7 100644 (file)
@@ -196,19 +196,19 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
 }
 
 struct smp_ops smp_ops = {
-       .smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
-       .smp_prepare_cpus = native_smp_prepare_cpus,
-       .smp_cpus_done = native_smp_cpus_done,
+       .smp_prepare_boot_cpu   = native_smp_prepare_boot_cpu,
+       .smp_prepare_cpus       = native_smp_prepare_cpus,
+       .smp_cpus_done          = native_smp_cpus_done,
 
-       .smp_send_stop = native_smp_send_stop,
-       .smp_send_reschedule = native_smp_send_reschedule,
+       .smp_send_stop          = native_smp_send_stop,
+       .smp_send_reschedule    = native_smp_send_reschedule,
 
-       .cpu_up = native_cpu_up,
-       .cpu_die = native_cpu_die,
-       .cpu_disable = native_cpu_disable,
-       .play_dead = native_play_dead,
+       .cpu_up                 = native_cpu_up,
+       .cpu_die                = native_cpu_die,
+       .cpu_disable            = native_cpu_disable,
+       .play_dead              = native_play_dead,
 
-       .send_call_func_ipi = native_send_call_func_ipi,
+       .send_call_func_ipi     = native_send_call_func_ipi,
        .send_call_func_single_ipi = native_send_call_func_single_ipi,
 };
 EXPORT_SYMBOL_GPL(smp_ops);
index 58d24ef917d8b46e7c7faedc4dd3fc79266ad0cb..7c80007ea5f7fb28ce9e61c4569f3c3432981b1d 100644 (file)
@@ -504,7 +504,7 @@ void __inquire_remote_apic(int apicid)
  * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this
  * won't ... remember to clear down the APIC, etc later.
  */
-int __devinit
+int __cpuinit
 wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
 {
        unsigned long send_status, accept_status = 0;
@@ -538,7 +538,7 @@ wakeup_secondary_cpu_via_nmi(int logical_apicid, unsigned long start_eip)
        return (send_status | accept_status);
 }
 
-int __devinit
+static int __cpuinit
 wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
 {
        unsigned long send_status, accept_status = 0;
@@ -822,10 +822,12 @@ do_rest:
        /* mark "stuck" area as not stuck */
        *((volatile unsigned long *)trampoline_base) = 0;
 
-       /*
-        * Cleanup possible dangling ends...
-        */
-       smpboot_restore_warm_reset_vector();
+       if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
+               /*
+                * Cleanup possible dangling ends...
+                */
+               smpboot_restore_warm_reset_vector();
+       }
 
        return boot_error;
 }
@@ -990,10 +992,12 @@ static int __init smp_sanity_check(unsigned max_cpus)
         */
        if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) &&
            !cpu_has_apic) {
-               printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n",
-                       boot_cpu_physical_apicid);
-               printk(KERN_ERR "... forcing use of dummy APIC emulation."
+               if (!disable_apic) {
+                       pr_err("BIOS bug, local APIC #%d not detected!...\n",
+                               boot_cpu_physical_apicid);
+                       pr_err("... forcing use of dummy APIC emulation."
                                "(tell your hw vendor)\n");
+               }
                smpboot_clear_io_apic();
                arch_disable_smp_support();
                return -1;
index f7bddc2e37d1bbf19a86f03b9e3a3a0da70ab199..4aaf7e48394fb562343f27d811e7a9f329be2b76 100644 (file)
@@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
 
 static int save_stack_stack(void *data, char *name)
 {
-       return -1;
+       return 0;
 }
 
 static void save_stack_address(void *data, unsigned long addr, int reliable)
index ff5c8736b491b8ff2c4835c5a19a51ad63633de1..734f92c02dde0f7dbf4564907dfb920512ee908d 100644 (file)
@@ -334,3 +334,4 @@ ENTRY(sys_call_table)
        .long sys_inotify_init1
        .long sys_preadv
        .long sys_pwritev
+       .long sys_rt_tgsigqueueinfo     /* 335 */
index 8c7b03b0cfcb47b1585187e95a67890137218c5d..124d40c575df376e989d5b00229fda865f0bb682 100644 (file)
@@ -715,7 +715,12 @@ uv_activation_descriptor_init(int node, int pnode)
        struct bau_desc *adp;
        struct bau_desc *ad2;
 
-       adp = (struct bau_desc *)kmalloc_node(16384, GFP_KERNEL, node);
+       /*
+        * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
+        * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per blade
+        */
+       adp = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
+               UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
        BUG_ON(!adp);
 
        pa = uv_gpa(adp); /* need the real nasid*/
@@ -729,7 +734,13 @@ uv_activation_descriptor_init(int node, int pnode)
                                      (n << UV_DESC_BASE_PNODE_SHIFT | m));
        }
 
-       for (i = 0, ad2 = adp; i < UV_ACTIVATION_DESCRIPTOR_SIZE; i++, ad2++) {
+       /*
+        * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
+        * cpu even though we only use the first one; one descriptor can
+        * describe a broadcast to 256 nodes.
+        */
+       for (i = 0, ad2 = adp; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
+               i++, ad2++) {
                memset(ad2, 0, sizeof(struct bau_desc));
                ad2->header.sw_ack_flag = 1;
                /*
index a1d288327ff0ff0f152241bf296c742778f61e00..ede024531f8facbc53fdfd4c82b687ee2050ac72 100644 (file)
@@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void)
        }
 
        clts();                         /* Allow maths ops (or we recurse) */
-#ifdef CONFIG_X86_32
-       restore_fpu(tsk);
-#else
        /*
         * Paranoid restore. send a SIGSEGV if we fail to restore the state.
         */
@@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void)
                force_sig(SIGSEGV, tsk);
                return;
        }
-#endif
+
        thread->status |= TS_USEDFPU;   /* So we fnsave on switch_to() */
        tsk->fpu_counter++;
 }
@@ -969,11 +966,8 @@ void __init trap_init(void)
        for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++)
                set_bit(i, used_vectors);
 
-#ifdef CONFIG_X86_64
        set_bit(IA32_SYSCALL_VECTOR, used_vectors);
-#else
-       set_bit(SYSCALL_VECTOR, used_vectors);
-#endif
+
        /*
         * Should be a barrier for any external CPU state:
         */
index d57de05dc43093e0831ebb544a2baddf3d72f184..3e1c057e98fe039325eed2056d8960fa52f023c4 100644 (file)
@@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void)
 {
        u64 tsc1, tsc2, delta, ref1, ref2;
        unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX;
-       unsigned long flags, latch, ms, fast_calibrate, tsc_khz;
+       unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz;
        int hpet = is_hpet_enabled(), i, loopmin;
 
-       tsc_khz = get_hypervisor_tsc_freq();
-       if (tsc_khz) {
+       hv_tsc_khz = get_hypervisor_tsc_freq();
+       if (hv_tsc_khz) {
                printk(KERN_INFO "TSC: Frequency read from the hypervisor\n");
-               return tsc_khz;
+               return hv_tsc_khz;
        }
 
        local_irq_save(flags);
@@ -710,7 +710,16 @@ static cycle_t read_tsc(struct clocksource *cs)
 #ifdef CONFIG_X86_64
 static cycle_t __vsyscall_fn vread_tsc(void)
 {
-       cycle_t ret = (cycle_t)vget_cycles();
+       cycle_t ret;
+
+       /*
+        * Surround the RDTSC by barriers, to make sure it's not
+        * speculated to outside the seqlock critical section and
+        * does not cause time warps:
+        */
+       rdtsc_barrier();
+       ret = (cycle_t)vget_cycles();
+       rdtsc_barrier();
 
        return ret >= __vsyscall_gtod_data.clock.cycle_last ?
                ret : __vsyscall_gtod_data.clock.cycle_last;
index bf36328f6ef9289c4b1a1a7d0ffd9c67878f3f32..027b5b498993b6f0606367e2803f6fc9f15dca2c 100644 (file)
@@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count;
  * of a critical section, to be able to prove TSC time-warps:
  */
 static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED;
+
 static __cpuinitdata cycles_t last_tsc;
 static __cpuinitdata cycles_t max_warp;
 static __cpuinitdata int nr_warps;
@@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
                return;
 
        if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
-               printk(KERN_INFO
-                      "Skipping synchronization checks as TSC is reliable.\n");
+               pr_info("Skipping synchronization checks as TSC is reliable.\n");
                return;
        }
 
-       printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:",
-                         smp_processor_id(), cpu);
+       pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
+               smp_processor_id(), cpu);
 
        /*
         * Reset it - in case this is a second bootup:
@@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu)
 
        if (nr_warps) {
                printk("\n");
-               printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs,"
-                                   " turning off TSC clock.\n", max_warp);
+               pr_warning("Measured %Ld cycles TSC warp between CPUs, "
+                          "turning off TSC clock.\n", max_warp);
                mark_tsc_unstable("check_tsc_sync_source failed");
        } else {
                printk(" passed.\n");
@@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void)
        while (atomic_read(&stop_count) != cpus)
                cpu_relax();
 }
-#undef NR_LOOPS
-
index d7ac84e7fc1c73f7cfcfd8ca6b66c5abed2219ce..9c4e625390589ac69e984edded55d13d6200a031 100644 (file)
@@ -287,10 +287,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        info->regs.pt.ds = 0;
        info->regs.pt.es = 0;
        info->regs.pt.fs = 0;
-
-/* we are clearing gs later just before "jmp resume_userspace",
- * because it is not saved/restored.
- */
+#ifndef CONFIG_X86_32_LAZY_GS
+       info->regs.pt.gs = 0;
+#endif
 
 /*
  * The flags register is also special: we cannot trust that the user
@@ -318,9 +317,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        }
 
 /*
- * Save old state, set default return value (%ax) to 0
+ * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL)
  */
-       info->regs32->ax = 0;
+       info->regs32->ax = VM86_SIGNAL;
        tsk->thread.saved_sp0 = tsk->thread.sp0;
        tsk->thread.saved_fs = info->regs32->fs;
        tsk->thread.saved_gs = get_user_gs(info->regs32);
@@ -343,7 +342,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
        __asm__ __volatile__(
                "movl %0,%%esp\n\t"
                "movl %1,%%ebp\n\t"
+#ifdef CONFIG_X86_32_LAZY_GS
                "mov  %2, %%gs\n\t"
+#endif
                "jmp resume_userspace"
                : /* no outputs */
                :"r" (&info->regs), "r" (task_thread_info(tsk)), "r" (0));
index 95deb9f2211e98decb5572e4757bc2cc48f3da18..b263423fbe2ae971424c5bd99c112ac984413383 100644 (file)
@@ -462,22 +462,28 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
 }
 #endif
 
-static void vmi_enter_lazy_cpu(void)
+static void vmi_start_context_switch(struct task_struct *prev)
 {
-       paravirt_enter_lazy_cpu();
+       paravirt_start_context_switch(prev);
        vmi_ops.set_lazy_mode(2);
 }
 
+static void vmi_end_context_switch(struct task_struct *next)
+{
+       vmi_ops.set_lazy_mode(0);
+       paravirt_end_context_switch(next);
+}
+
 static void vmi_enter_lazy_mmu(void)
 {
        paravirt_enter_lazy_mmu();
        vmi_ops.set_lazy_mode(1);
 }
 
-static void vmi_leave_lazy(void)
+static void vmi_leave_lazy_mmu(void)
 {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
        vmi_ops.set_lazy_mode(0);
+       paravirt_leave_lazy_mmu();
 }
 
 static inline int __init check_vmi_rom(struct vrom_header *rom)
@@ -711,14 +717,14 @@ static inline int __init activate_vmi(void)
        para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
        para_fill(pv_cpu_ops.io_delay, IODelay);
 
-       para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu,
+       para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
                  set_lazy_mode, SetLazyMode);
-       para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy,
+       para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
                  set_lazy_mode, SetLazyMode);
 
        para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
                  set_lazy_mode, SetLazyMode);
-       para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy,
+       para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
                  set_lazy_mode, SetLazyMode);
 
        /* user and kernel flush are just handled with different flags to FlushTLB */
index 849ee611f01388684ff9102838bef8a4fe38464b..4c85b2e2bb652873da1ee5367549e070b363bf8a 100644 (file)
@@ -1,5 +1,431 @@
+/*
+ * ld script for the x86 kernel
+ *
+ * Historic 32-bit version written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>
+ *
+ * Modernisation, unification and other changes and fixes:
+ *   Copyright (C) 2007-2009  Sam Ravnborg <sam@ravnborg.org>
+ *
+ *
+ * Don't define absolute symbols until and unless you know that symbol
+ * value should remain constant even if kernel image is relocated
+ * at run time. Absolute symbols are not relocated. If symbol value should
+ * change if kernel is relocated, make the symbol section relative and
+ * put it inside the section definition.
+ */
+
 #ifdef CONFIG_X86_32
-# include "vmlinux_32.lds.S"
+#define LOAD_OFFSET __PAGE_OFFSET
 #else
-# include "vmlinux_64.lds.S"
+#define LOAD_OFFSET __START_KERNEL_map
 #endif
+
+#include <asm-generic/vmlinux.lds.h>
+#include <asm/asm-offsets.h>
+#include <asm/thread_info.h>
+#include <asm/page_types.h>
+#include <asm/cache.h>
+#include <asm/boot.h>
+
+#undef i386     /* in case the preprocessor is a 32bit one */
+
+OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
+
+#ifdef CONFIG_X86_32
+OUTPUT_ARCH(i386)
+ENTRY(phys_startup_32)
+jiffies = jiffies_64;
+#else
+OUTPUT_ARCH(i386:x86-64)
+ENTRY(phys_startup_64)
+jiffies_64 = jiffies;
+#endif
+
+PHDRS {
+       text PT_LOAD FLAGS(5);          /* R_E */
+       data PT_LOAD FLAGS(7);          /* RWE */
+#ifdef CONFIG_X86_64
+       user PT_LOAD FLAGS(7);          /* RWE */
+       data.init PT_LOAD FLAGS(7);     /* RWE */
+#ifdef CONFIG_SMP
+       percpu PT_LOAD FLAGS(7);        /* RWE */
+#endif
+       data.init2 PT_LOAD FLAGS(7);    /* RWE */
+#endif
+       note PT_NOTE FLAGS(0);          /* ___ */
+}
+
+SECTIONS
+{
+#ifdef CONFIG_X86_32
+        . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
+        phys_startup_32 = startup_32 - LOAD_OFFSET;
+#else
+        . = __START_KERNEL;
+        phys_startup_64 = startup_64 - LOAD_OFFSET;
+#endif
+
+       /* Text and read-only data */
+
+       /* bootstrapping code */
+       .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
+               _text = .;
+               *(.text.head)
+       } :text = 0x9090
+
+       /* The rest of the text */
+       .text :  AT(ADDR(.text) - LOAD_OFFSET) {
+#ifdef CONFIG_X86_32
+               /* not really needed, already page aligned */
+               . = ALIGN(PAGE_SIZE);
+               *(.text.page_aligned)
+#endif
+               . = ALIGN(8);
+               _stext = .;
+               TEXT_TEXT
+               SCHED_TEXT
+               LOCK_TEXT
+               KPROBES_TEXT
+               IRQENTRY_TEXT
+               *(.fixup)
+               *(.gnu.warning)
+               /* End of text section */
+               _etext = .;
+       } :text = 0x9090
+
+       NOTES :text :note
+
+       /* Exception table */
+       . = ALIGN(16);
+       __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
+               __start___ex_table = .;
+               *(__ex_table)
+               __stop___ex_table = .;
+       } :text = 0x9090
+
+       RODATA
+
+       /* Data */
+       . = ALIGN(PAGE_SIZE);
+       .data : AT(ADDR(.data) - LOAD_OFFSET) {
+               DATA_DATA
+               CONSTRUCTORS
+
+#ifdef CONFIG_X86_64
+               /* End of data section */
+               _edata = .;
+#endif
+       } :data
+
+#ifdef CONFIG_X86_32
+       /* 32 bit has nosave before _edata */
+       . = ALIGN(PAGE_SIZE);
+       .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+               __nosave_begin = .;
+               *(.data.nosave)
+               . = ALIGN(PAGE_SIZE);
+               __nosave_end = .;
+       }
+#endif
+
+       . = ALIGN(PAGE_SIZE);
+       .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
+               *(.data.page_aligned)
+               *(.data.idt)
+       }
+
+#ifdef CONFIG_X86_32
+       . = ALIGN(32);
+#else
+       . = ALIGN(PAGE_SIZE);
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+#endif
+       .data.cacheline_aligned :
+               AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
+               *(.data.cacheline_aligned)
+       }
+
+       /* rarely changed data like cpu maps */
+#ifdef CONFIG_X86_32
+       . = ALIGN(32);
+#else
+       . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
+#endif
+       .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
+               *(.data.read_mostly)
+
+#ifdef CONFIG_X86_32
+               /* End of data section */
+               _edata = .;
+#endif
+       }
+
+#ifdef CONFIG_X86_64
+
+#define VSYSCALL_ADDR (-10*1024*1024)
+#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \
+                            SIZEOF(.data.read_mostly) + 4095) & ~(4095))
+
+#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
+#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
+
+#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
+#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
+
+       . = VSYSCALL_ADDR;
+       .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) {
+               *(.vsyscall_0)
+       } :user
+
+       __vsyscall_0 = VSYSCALL_VIRT_ADDR;
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
+               *(.vsyscall_fn)
+       }
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
+               *(.vsyscall_gtod_data)
+       }
+
+       vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
+       .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
+               *(.vsyscall_clock)
+       }
+       vsyscall_clock = VVIRT(.vsyscall_clock);
+
+
+       .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
+               *(.vsyscall_1)
+       }
+       .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) {
+               *(.vsyscall_2)
+       }
+
+       .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
+               *(.vgetcpu_mode)
+       }
+       vgetcpu_mode = VVIRT(.vgetcpu_mode);
+
+       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
+       .jiffies : AT(VLOAD(.jiffies)) {
+               *(.jiffies)
+       }
+       jiffies = VVIRT(.jiffies);
+
+       .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
+               *(.vsyscall_3)
+       }
+
+       . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
+
+#undef VSYSCALL_ADDR
+#undef VSYSCALL_PHYS_ADDR
+#undef VSYSCALL_VIRT_ADDR
+#undef VLOAD_OFFSET
+#undef VLOAD
+#undef VVIRT_OFFSET
+#undef VVIRT
+
+#endif /* CONFIG_X86_64 */
+
+       /* init_task */
+       . = ALIGN(THREAD_SIZE);
+       .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
+               *(.data.init_task)
+       }
+#ifdef CONFIG_X86_64
+        :data.init
+#endif
+
+       /*
+        * smp_locks might be freed after init
+        * start/end must be page aligned
+        */
+       . = ALIGN(PAGE_SIZE);
+       .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
+               __smp_locks = .;
+               *(.smp_locks)
+               __smp_locks_end = .;
+               . = ALIGN(PAGE_SIZE);
+       }
+
+       /* Init code and data - will be freed after init */
+       . = ALIGN(PAGE_SIZE);
+       .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
+               __init_begin = .; /* paired with __init_end */
+               _sinittext = .;
+               INIT_TEXT
+               _einittext = .;
+       }
+
+       .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
+               INIT_DATA
+       }
+
+       . = ALIGN(16);
+       .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
+               __setup_start = .;
+               *(.init.setup)
+               __setup_end = .;
+       }
+       .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
+               __initcall_start = .;
+               INITCALLS
+               __initcall_end = .;
+       }
+
+       .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
+               __con_initcall_start = .;
+               *(.con_initcall.init)
+               __con_initcall_end = .;
+       }
+
+       .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
+               __x86_cpu_dev_start = .;
+               *(.x86_cpu_dev.init)
+               __x86_cpu_dev_end = .;
+       }
+
+       SECURITY_INIT
+
+       . = ALIGN(8);
+       .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
+               __parainstructions = .;
+               *(.parainstructions)
+               __parainstructions_end = .;
+       }
+
+       . = ALIGN(8);
+       .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
+               __alt_instructions = .;
+               *(.altinstructions)
+               __alt_instructions_end = .;
+       }
+
+       .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
+               *(.altinstr_replacement)
+       }
+
+       /*
+        * .exit.text is discarded at runtime, not link time, to deal with
+        *  references from .altinstructions and .eh_frame
+        */
+       .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
+               EXIT_TEXT
+       }
+
+       .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
+               EXIT_DATA
+       }
+
+#ifdef CONFIG_BLK_DEV_INITRD
+       . = ALIGN(PAGE_SIZE);
+       .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
+               __initramfs_start = .;
+               *(.init.ramfs)
+               __initramfs_end = .;
+       }
+#endif
+
+#if defined(CONFIG_X86_64) && defined(CONFIG_SMP)
+       /*
+        * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
+        * output PHDR, so the next output section - .data_nosave - should
+        * start another segment - data.init2.  Also, pda should be at the head of
+        * percpu area.  Preallocate it and define the percpu offset symbol
+        * so that it can be accessed as a percpu variable.
+        */
+       . = ALIGN(PAGE_SIZE);
+       PERCPU_VADDR(0, :percpu)
+#else
+       PERCPU(PAGE_SIZE)
+#endif
+
+       . = ALIGN(PAGE_SIZE);
+
+       /* freed after init ends here */
+       .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) {
+               __init_end = .;
+       }
+
+#ifdef CONFIG_X86_64
+       .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
+               . = ALIGN(PAGE_SIZE);
+               __nosave_begin = .;
+               *(.data.nosave)
+               . = ALIGN(PAGE_SIZE);
+               __nosave_end = .;
+       } :data.init2
+       /* use another segment data.init2, see PERCPU_VADDR() above */
+#endif
+
+       /* BSS */
+       . = ALIGN(PAGE_SIZE);
+       .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
+               __bss_start = .;
+               *(.bss.page_aligned)
+               *(.bss)
+               . = ALIGN(4);
+               __bss_stop = .;
+       }
+
+       . = ALIGN(PAGE_SIZE);
+       .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
+               __brk_base = .;
+               . += 64 * 1024;         /* 64k alignment slop space */
+               *(.brk_reservation)     /* areas brk users have reserved */
+               __brk_limit = .;
+       }
+
+       .end : AT(ADDR(.end) - LOAD_OFFSET) {
+               _end = .;
+       }
+
+       /* Sections to be discarded */
+       /DISCARD/ : {
+               *(.exitcall.exit)
+               *(.eh_frame)
+               *(.discard)
+       }
+
+        STABS_DEBUG
+        DWARF_DEBUG
+}
+
+
+#ifdef CONFIG_X86_32
+ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
+        "kernel image bigger than KERNEL_IMAGE_SIZE")
+#else
+/*
+ * Per-cpu symbols which need to be offset from __per_cpu_load
+ * for the boot processor.
+ */
+#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
+INIT_PER_CPU(gdt_page);
+INIT_PER_CPU(irq_stack_union);
+
+/*
+ * Build-time check on the image size:
+ */
+ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
+       "kernel image bigger than KERNEL_IMAGE_SIZE")
+
+#ifdef CONFIG_SMP
+ASSERT((per_cpu__irq_stack_union == 0),
+        "irq_stack_union is not at start of per-cpu area");
+#endif
+
+#endif /* CONFIG_X86_32 */
+
+#ifdef CONFIG_KEXEC
+#include <asm/kexec.h>
+
+ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
+       "kexec control code size is too big")
+#endif
+
diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S
deleted file mode 100644 (file)
index 62ad500..0000000
+++ /dev/null
@@ -1,229 +0,0 @@
-/* ld script to make i386 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- *
- * Don't define absolute symbols until and unless you know that symbol
- * value is should remain constant even if kernel image is relocated
- * at run time. Absolute symbols are not relocated. If symbol value should
- * change if kernel is relocated, make the symbol section relative and
- * put it inside the section definition.
- */
-
-#define LOAD_OFFSET __PAGE_OFFSET
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/thread_info.h>
-#include <asm/page_types.h>
-#include <asm/cache.h>
-#include <asm/boot.h>
-
-OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386")
-OUTPUT_ARCH(i386)
-ENTRY(phys_startup_32)
-jiffies = jiffies_64;
-
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       note PT_NOTE FLAGS(0);  /* ___ */
-}
-SECTIONS
-{
-  . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR;
-  phys_startup_32 = startup_32 - LOAD_OFFSET;
-
-  .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) {
-       _text = .;                      /* Text and read-only data */
-       *(.text.head)
-  } :text = 0x9090
-
-  /* read-only */
-  .text : AT(ADDR(.text) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */
-       *(.text.page_aligned)
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       IRQENTRY_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       _etext = .;                     /* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);               /* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-       __start___ex_table = .;
-        *(__ex_table)
-       __stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  /* writeable */
-  . = ALIGN(PAGE_SIZE);
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {      /* Data */
-       DATA_DATA
-       CONSTRUCTORS
-       } :data
-
-  . = ALIGN(PAGE_SIZE);
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-       __nosave_begin = .;
-       *(.data.nosave)
-       . = ALIGN(PAGE_SIZE);
-       __nosave_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       *(.data.page_aligned)
-       *(.data.idt)
-  }
-
-  . = ALIGN(32);
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       *(.data.cacheline_aligned)
-  }
-
-  /* rarely changed data like cpu maps */
-  . = ALIGN(32);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-       _edata = .;             /* End of data section */
-  }
-
-  . = ALIGN(THREAD_SIZE);      /* init_task */
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       *(.data.init_task)
-  }
-
-  /* might get freed after init */
-  . = ALIGN(PAGE_SIZE);
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       __smp_locks = .;
-       *(.smp_locks)
-       __smp_locks_end = .;
-  }
-  /* will be freed after init
-   * Following ALIGN() is required to make sure no other data falls on the
-   * same page where __smp_alt_end is pointing as that page might be freed
-   * after boot. Always make sure that ALIGN() directive is present after
-   * the section which contains __smp_alt_end.
-   */
-  . = ALIGN(PAGE_SIZE);
-
-  /* will be freed after init */
-  . = ALIGN(PAGE_SIZE);                /* Init code and data */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       __init_begin = .;
-       _sinittext = .;
-       INIT_TEXT
-       _einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-       INIT_DATA
-  }
-  . = ALIGN(16);
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-       __setup_start = .;
-       *(.init.setup)
-       __setup_end = .;
-   }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       __initcall_start = .;
-       INITCALLS
-       __initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       __con_initcall_start = .;
-       *(.con_initcall.init)
-       __con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-       __x86_cpu_dev_start = .;
-       *(.x86_cpu_dev.init)
-       __x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-  . = ALIGN(4);
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       __alt_instructions = .;
-       *(.altinstructions)
-       __alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  . = ALIGN(4);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-       __parainstructions = .;
-       *(.parainstructions)
-       __parainstructions_end = .;
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-       EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-       EXIT_DATA
-  }
-#if defined(CONFIG_BLK_DEV_INITRD)
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-       __initramfs_start = .;
-       *(.init.ramfs)
-       __initramfs_end = .;
-  }
-#endif
-  PERCPU(PAGE_SIZE)
-  . = ALIGN(PAGE_SIZE);
-  /* freed after init ends here */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       __init_end = .;
-       __bss_start = .;                /* BSS */
-       *(.bss.page_aligned)
-       *(.bss)
-       . = ALIGN(4);
-       __bss_stop = .;
-  }
-
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __brk_base = . ;
-       . += 64 * 1024 ;        /* 64k alignment slop space */
-       *(.brk_reservation)     /* areas brk users have reserved */
-       __brk_limit = . ;
-  }
-
-  .end : AT(ADDR(.end) - LOAD_OFFSET) {
-       _end = . ;
-  }
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       *(.discard)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_KEXEC
-/* Link time checks */
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S
deleted file mode 100644 (file)
index c874250..0000000
+++ /dev/null
@@ -1,298 +0,0 @@
-/* ld script to make x86-64 Linux kernel
- * Written by Martin Mares <mj@atrey.karlin.mff.cuni.cz>;
- */
-
-#define LOAD_OFFSET __START_KERNEL_map
-
-#include <asm-generic/vmlinux.lds.h>
-#include <asm/asm-offsets.h>
-#include <asm/page_types.h>
-
-#undef i386    /* in case the preprocessor is a 32bit one */
-
-OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64")
-OUTPUT_ARCH(i386:x86-64)
-ENTRY(phys_startup_64)
-jiffies_64 = jiffies;
-PHDRS {
-       text PT_LOAD FLAGS(5);  /* R_E */
-       data PT_LOAD FLAGS(7);  /* RWE */
-       user PT_LOAD FLAGS(7);  /* RWE */
-       data.init PT_LOAD FLAGS(7);     /* RWE */
-#ifdef CONFIG_SMP
-       percpu PT_LOAD FLAGS(7);        /* RWE */
-#endif
-       data.init2 PT_LOAD FLAGS(7);    /* RWE */
-       note PT_NOTE FLAGS(0);  /* ___ */
-}
-SECTIONS
-{
-  . = __START_KERNEL;
-  phys_startup_64 = startup_64 - LOAD_OFFSET;
-  .text :  AT(ADDR(.text) - LOAD_OFFSET) {
-       _text = .;                      /* Text and read-only data */
-       /* First the code that has to be first for bootstrapping */
-       *(.text.head)
-       _stext = .;
-       /* Then the rest */
-       TEXT_TEXT
-       SCHED_TEXT
-       LOCK_TEXT
-       KPROBES_TEXT
-       IRQENTRY_TEXT
-       *(.fixup)
-       *(.gnu.warning)
-       _etext = .;             /* End of text section */
-  } :text = 0x9090
-
-  NOTES :text :note
-
-  . = ALIGN(16);               /* Exception table */
-  __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) {
-       __start___ex_table = .;
-        *(__ex_table)
-       __stop___ex_table = .;
-  } :text = 0x9090
-
-  RODATA
-
-  . = ALIGN(PAGE_SIZE);                /* Align data segment to page size boundary */
-                               /* Data */
-  .data : AT(ADDR(.data) - LOAD_OFFSET) {
-       DATA_DATA
-       CONSTRUCTORS
-       _edata = .;                     /* End of data section */
-       } :data
-
-
-  .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-       *(.data.cacheline_aligned)
-  }
-  . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES);
-  .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) {
-       *(.data.read_mostly)
-  }
-
-#define VSYSCALL_ADDR (-10*1024*1024)
-#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095))
-
-#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR)
-#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET)
-
-#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR)
-#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
-
-  . = VSYSCALL_ADDR;
-  .vsyscall_0 :         AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user
-  __vsyscall_0 = VSYSCALL_VIRT_ADDR;
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) }
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data))
-               { *(.vsyscall_gtod_data) }
-  vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
-  .vsyscall_clock : AT(VLOAD(.vsyscall_clock))
-               { *(.vsyscall_clock) }
-  vsyscall_clock = VVIRT(.vsyscall_clock);
-
-
-  .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1))
-               { *(.vsyscall_1) }
-  .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2))
-               { *(.vsyscall_2) }
-
-  .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) }
-  vgetcpu_mode = VVIRT(.vgetcpu_mode);
-
-  . = ALIGN(CONFIG_X86_L1_CACHE_BYTES);
-  .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) }
-  jiffies = VVIRT(.jiffies);
-
-  .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3))
-               { *(.vsyscall_3) }
-
-  . = VSYSCALL_VIRT_ADDR + PAGE_SIZE;
-
-#undef VSYSCALL_ADDR
-#undef VSYSCALL_PHYS_ADDR
-#undef VSYSCALL_VIRT_ADDR
-#undef VLOAD_OFFSET
-#undef VLOAD
-#undef VVIRT_OFFSET
-#undef VVIRT
-
-  .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) {
-       . = ALIGN(THREAD_SIZE); /* init_task */
-       *(.data.init_task)
-  }:data.init
-
-  .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       *(.data.page_aligned)
-  }
-
-  .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
-       /* might get freed after init */
-       . = ALIGN(PAGE_SIZE);
-       __smp_alt_begin = .;
-       __smp_locks = .;
-       *(.smp_locks)
-       __smp_locks_end = .;
-       . = ALIGN(PAGE_SIZE);
-       __smp_alt_end = .;
-  }
-
-  . = ALIGN(PAGE_SIZE);                /* Init code and data */
-  __init_begin = .;    /* paired with __init_end */
-  .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) {
-       _sinittext = .;
-       INIT_TEXT
-       _einittext = .;
-  }
-  .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) {
-       __initdata_begin = .;
-       INIT_DATA
-       __initdata_end = .;
-   }
-
-  .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) {
-       . = ALIGN(16);
-       __setup_start = .;
-       *(.init.setup)
-       __setup_end = .;
-  }
-  .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) {
-       __initcall_start = .;
-       INITCALLS
-       __initcall_end = .;
-  }
-  .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) {
-       __con_initcall_start = .;
-       *(.con_initcall.init)
-       __con_initcall_end = .;
-  }
-  .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
-       __x86_cpu_dev_start = .;
-       *(.x86_cpu_dev.init)
-       __x86_cpu_dev_end = .;
-  }
-  SECURITY_INIT
-
-  . = ALIGN(8);
-  .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
-       __parainstructions = .;
-       *(.parainstructions)
-       __parainstructions_end = .;
-  }
-
-  .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
-       . = ALIGN(8);
-       __alt_instructions = .;
-       *(.altinstructions)
-       __alt_instructions_end = .;
-  }
-  .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
-       *(.altinstr_replacement)
-  }
-  /* .exit.text is discard at runtime, not link time, to deal with references
-     from .altinstructions and .eh_frame */
-  .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) {
-       EXIT_TEXT
-  }
-  .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) {
-       EXIT_DATA
-  }
-
-#ifdef CONFIG_BLK_DEV_INITRD
-  . = ALIGN(PAGE_SIZE);
-  .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) {
-       __initramfs_start = .;
-       *(.init.ramfs)
-       __initramfs_end = .;
-  }
-#endif
-
-#ifdef CONFIG_SMP
-  /*
-   * percpu offsets are zero-based on SMP.  PERCPU_VADDR() changes the
-   * output PHDR, so the next output section - __data_nosave - should
-   * start another section data.init2.  Also, pda should be at the head of
-   * percpu area.  Preallocate it and define the percpu offset symbol
-   * so that it can be accessed as a percpu variable.
-   */
-  . = ALIGN(PAGE_SIZE);
-  PERCPU_VADDR(0, :percpu)
-#else
-  PERCPU(PAGE_SIZE)
-#endif
-
-  . = ALIGN(PAGE_SIZE);
-  __init_end = .;
-
-  .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __nosave_begin = .;
-       *(.data.nosave)
-       . = ALIGN(PAGE_SIZE);
-       __nosave_end = .;
-  } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */
-
-  .bss : AT(ADDR(.bss) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __bss_start = .;                /* BSS */
-       *(.bss.page_aligned)
-       *(.bss)
-       __bss_stop = .;
-  }
-
-  .brk : AT(ADDR(.brk) - LOAD_OFFSET) {
-       . = ALIGN(PAGE_SIZE);
-       __brk_base = . ;
-       . += 64 * 1024 ;        /* 64k alignment slop space */
-       *(.brk_reservation)     /* areas brk users have reserved */
-       __brk_limit = . ;
-  }
-
-  _end = . ;
-
-  /* Sections to be discarded */
-  /DISCARD/ : {
-       *(.exitcall.exit)
-       *(.eh_frame)
-       *(.discard)
-       }
-
-  STABS_DEBUG
-
-  DWARF_DEBUG
-}
-
- /*
-  * Per-cpu symbols which need to be offset from __per_cpu_load
-  * for the boot processor.
-  */
-#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load
-INIT_PER_CPU(gdt_page);
-INIT_PER_CPU(irq_stack_union);
-
-/*
- * Build-time check on the image size:
- */
-ASSERT((_end - _text <= KERNEL_IMAGE_SIZE),
-       "kernel image bigger than KERNEL_IMAGE_SIZE")
-
-#ifdef CONFIG_SMP
-ASSERT((per_cpu__irq_stack_union == 0),
-        "irq_stack_union is not at start of per-cpu area");
-#endif
-
-#ifdef CONFIG_KEXEC
-#include <asm/kexec.h>
-
-ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE,
-       "kexec control code size is too big")
-#endif
index 44153afc9067558cef387ba3237ffcf439ca3800..25ee06a80aad3cd116292227826f9a32fa4b3f7a 100644 (file)
@@ -132,15 +132,7 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
                        return;
                }
 
-               /*
-                * Surround the RDTSC by barriers, to make sure it's not
-                * speculated to outside the seqlock critical section and
-                * does not cause time warps:
-                */
-               rdtsc_barrier();
                now = vread();
-               rdtsc_barrier();
-
                base = __vsyscall_gtod_data.clock.cycle_last;
                mask = __vsyscall_gtod_data.clock.mask;
                mult = __vsyscall_gtod_data.clock.mult;
index 33a93b41739612cafe1fe3d918c39a3915f86a78..4e0c265593958fcb70a8588d942a27a7d0ac3da6 100644 (file)
@@ -167,10 +167,16 @@ static void lazy_hcall3(unsigned long call,
 
 /* When lazy mode is turned off reset the per-cpu lazy mode variable and then
  * issue the do-nothing hypercall to flush any stored calls. */
-static void lguest_leave_lazy_mode(void)
+static void lguest_leave_lazy_mmu_mode(void)
 {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
        kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+       paravirt_leave_lazy_mmu();
+}
+
+static void lguest_end_context_switch(struct task_struct *next)
+{
+       kvm_hypercall0(LHCALL_FLUSH_ASYNC);
+       paravirt_end_context_switch(next);
 }
 
 /*G:033
@@ -637,7 +643,7 @@ static void __init lguest_init_IRQ(void)
 
 void lguest_setup_irq(unsigned int irq)
 {
-       irq_to_desc_alloc_cpu(irq, 0);
+       irq_to_desc_alloc_node(irq, 0);
        set_irq_chip_and_handler_name(irq, &lguest_irq_controller,
                                      handle_level_irq, "level");
 }
@@ -1054,8 +1060,8 @@ __init void lguest_init(void)
        pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry;
        pv_cpu_ops.write_idt_entry = lguest_write_idt_entry;
        pv_cpu_ops.wbinvd = lguest_wbinvd;
-       pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu;
-       pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+       pv_cpu_ops.start_context_switch = paravirt_start_context_switch;
+       pv_cpu_ops.end_context_switch = lguest_end_context_switch;
 
        /* pagetable management */
        pv_mmu_ops.write_cr3 = lguest_write_cr3;
@@ -1068,7 +1074,7 @@ __init void lguest_init(void)
        pv_mmu_ops.read_cr2 = lguest_read_cr2;
        pv_mmu_ops.read_cr3 = lguest_read_cr3;
        pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu;
-       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode;
+       pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode;
        pv_mmu_ops.pte_update = lguest_pte_update;
        pv_mmu_ops.pte_update_defer = lguest_pte_update;
 
index e7277cbcfb40ee1ea455fb63c4c6665656724013..a725b7f760ae4ba860e93b9af99b40931621430b 100644 (file)
@@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st,
                   st->current_address >= st->marker[1].start_address) {
                const char *unit = units;
                unsigned long delta;
+               int width = sizeof(unsigned long) * 2;
 
                /*
                 * Now print the actual finished series
                 */
-               seq_printf(m, "0x%p-0x%p   ",
-                          (void *)st->start_address,
-                          (void *)st->current_address);
+               seq_printf(m, "0x%0*lx-0x%0*lx   ",
+                          width, st->start_address,
+                          width, st->current_address);
 
                delta = (st->current_address - st->start_address) >> 10;
                while (!(delta & 1023) && unit[1]) {
index a03b7279efa018850def9042339fec4af4249042..5ec7ae366615458eb18306a8cbd5987c6c06f17c 100644 (file)
@@ -3,40 +3,16 @@
  *  Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs.
  *  Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar
  */
-#include <linux/interrupt.h>
-#include <linux/mmiotrace.h>
-#include <linux/bootmem.h>
-#include <linux/compiler.h>
-#include <linux/highmem.h>
-#include <linux/kprobes.h>
-#include <linux/uaccess.h>
-#include <linux/vmalloc.h>
-#include <linux/vt_kern.h>
-#include <linux/signal.h>
-#include <linux/kernel.h>
-#include <linux/ptrace.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/kdebug.h>
-#include <linux/errno.h>
-#include <linux/magic.h>
-#include <linux/sched.h>
-#include <linux/types.h>
-#include <linux/init.h>
-#include <linux/mman.h>
-#include <linux/tty.h>
-#include <linux/smp.h>
-#include <linux/mm.h>
-
-#include <asm-generic/sections.h>
-
-#include <asm/tlbflush.h>
-#include <asm/pgalloc.h>
-#include <asm/segment.h>
-#include <asm/system.h>
-#include <asm/proto.h>
-#include <asm/traps.h>
-#include <asm/desc.h>
+#include <linux/magic.h>               /* STACK_END_MAGIC              */
+#include <linux/sched.h>               /* test_thread_flag(), ...      */
+#include <linux/kdebug.h>              /* oops_begin/end, ...          */
+#include <linux/module.h>              /* search_exception_table       */
+#include <linux/bootmem.h>             /* max_low_pfn                  */
+#include <linux/kprobes.h>             /* __kprobes, ...               */
+#include <linux/mmiotrace.h>           /* kmmio_handler, ...           */
+
+#include <asm/traps.h>                 /* dotraplinkage, ...           */
+#include <asm/pgalloc.h>               /* pgd_*(), ...                 */
 
 /*
  * Page fault error code bits:
@@ -225,12 +201,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address)
        if (!pmd_present(*pmd_k))
                return NULL;
 
-       if (!pmd_present(*pmd)) {
+       if (!pmd_present(*pmd))
                set_pmd(pmd, *pmd_k);
-               arch_flush_lazy_mmu_mode();
-       } else {
+       else
                BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k));
-       }
 
        return pmd_k;
 }
@@ -538,8 +512,6 @@ bad:
 static int is_errata93(struct pt_regs *regs, unsigned long address)
 {
 #ifdef CONFIG_X86_64
-       static int once;
-
        if (address != regs->ip)
                return 0;
 
@@ -549,10 +521,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address)
        address |= 0xffffffffUL << 32;
        if ((address >= (u64)_stext && address <= (u64)_etext) ||
            (address >= MODULES_VADDR && address <= MODULES_END)) {
-               if (!once) {
-                       printk(errata93_warning);
-                       once = 1;
-               }
+               printk_once(errata93_warning);
                regs->ip = address;
                return 1;
        }
index 8126e8d1a2a4a789509cb49af563b6cbb76395ae..58f621e8191955c2e02016df1d8e99bec8e1ed8f 100644 (file)
@@ -44,7 +44,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
        vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
        BUG_ON(!pte_none(*(kmap_pte-idx)));
        set_pte(kmap_pte-idx, mk_pte(page, prot));
-       arch_flush_lazy_mmu_mode();
 
        return (void *)vaddr;
 }
@@ -74,7 +73,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type)
 #endif
        }
 
-       arch_flush_lazy_mmu_mode();
        pagefault_enable();
 }
 
index ae4f7b5d71040566f7b30af16e6c00f20c82c098..34c1bfb64f1ca07d80838f363b514caf07acf4db 100644 (file)
@@ -1,3 +1,4 @@
+#include <linux/initrd.h>
 #include <linux/ioport.h>
 #include <linux/swap.h>
 
@@ -10,6 +11,9 @@
 #include <asm/setup.h>
 #include <asm/system.h>
 #include <asm/tlbflush.h>
+#include <asm/tlb.h>
+
+DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 
 unsigned long __initdata e820_table_start;
 unsigned long __meminitdata e820_table_end;
@@ -23,6 +27,69 @@ int direct_gbpages
 #endif
 ;
 
+int nx_enabled;
+
+#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
+static int disable_nx __cpuinitdata;
+
+/*
+ * noexec = on|off
+ *
+ * Control non-executable mappings for processes.
+ *
+ * on      Enable
+ * off     Disable
+ */
+static int __init noexec_setup(char *str)
+{
+       if (!str)
+               return -EINVAL;
+       if (!strncmp(str, "on", 2)) {
+               __supported_pte_mask |= _PAGE_NX;
+               disable_nx = 0;
+       } else if (!strncmp(str, "off", 3)) {
+               disable_nx = 1;
+               __supported_pte_mask &= ~_PAGE_NX;
+       }
+       return 0;
+}
+early_param("noexec", noexec_setup);
+#endif
+
+#ifdef CONFIG_X86_PAE
+static void __init set_nx(void)
+{
+       unsigned int v[4], l, h;
+
+       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
+               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
+
+               if ((v[3] & (1 << 20)) && !disable_nx) {
+                       rdmsr(MSR_EFER, l, h);
+                       l |= EFER_NX;
+                       wrmsr(MSR_EFER, l, h);
+                       nx_enabled = 1;
+                       __supported_pte_mask |= _PAGE_NX;
+               }
+       }
+}
+#else
+static inline void set_nx(void)
+{
+}
+#endif
+
+#ifdef CONFIG_X86_64
+void __cpuinit check_efer(void)
+{
+       unsigned long efer;
+
+       rdmsrl(MSR_EFER, efer);
+       if (!(efer & EFER_NX) || disable_nx)
+               __supported_pte_mask &= ~_PAGE_NX;
+}
+#endif
+
 static void __init find_early_table_space(unsigned long end, int use_pse,
                                          int use_gbpages)
 {
@@ -66,12 +133,11 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
         */
 #ifdef CONFIG_X86_32
        start = 0x7000;
-       e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
-                                       tables, PAGE_SIZE);
-#else /* CONFIG_X86_64 */
+#else
        start = 0x8000;
-       e820_table_start = find_e820_area(start, end, tables, PAGE_SIZE);
 #endif
+       e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT,
+                                       tables, PAGE_SIZE);
        if (e820_table_start == -1UL)
                panic("Cannot find space for the kernel page tables");
 
@@ -159,12 +225,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
        use_gbpages = direct_gbpages;
 #endif
 
-#ifdef CONFIG_X86_32
-#ifdef CONFIG_X86_PAE
        set_nx();
        if (nx_enabled)
                printk(KERN_INFO "NX (Execute Disable) protection: active\n");
-#endif
 
        /* Enable PSE if available */
        if (cpu_has_pse)
@@ -175,7 +238,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
                set_in_cr4(X86_CR4_PGE);
                __supported_pte_mask |= _PAGE_GLOBAL;
        }
-#endif
 
        if (use_gbpages)
                page_size_mask |= 1 << PG_LEVEL_1G;
index 749559ed80f5d99e1771826155ac8b27e1f2a3f2..949708d7a481ec10e9591faba7466b3e1a75f57d 100644 (file)
 #include <asm/paravirt.h>
 #include <asm/setup.h>
 #include <asm/cacheflush.h>
+#include <asm/page_types.h>
 #include <asm/init.h>
 
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
 unsigned long highstart_pfn, highend_pfn;
 
 static noinline int do_test_wp_bit(void);
@@ -587,61 +584,9 @@ void zap_low_mappings(void)
        flush_tlb_all();
 }
 
-int nx_enabled;
-
 pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-#ifdef CONFIG_X86_PAE
-
-static int disable_nx __initdata;
-
-/*
- * noexec = on|off
- *
- * Control non executable mappings.
- *
- * on      Enable
- * off     Disable
- */
-static int __init noexec_setup(char *str)
-{
-       if (!str || !strcmp(str, "on")) {
-               if (cpu_has_nx) {
-                       __supported_pte_mask |= _PAGE_NX;
-                       disable_nx = 0;
-               }
-       } else {
-               if (!strcmp(str, "off")) {
-                       disable_nx = 1;
-                       __supported_pte_mask &= ~_PAGE_NX;
-               } else {
-                       return -EINVAL;
-               }
-       }
-
-       return 0;
-}
-early_param("noexec", noexec_setup);
-
-void __init set_nx(void)
-{
-       unsigned int v[4], l, h;
-
-       if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) {
-               cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]);
-
-               if ((v[3] & (1 << 20)) && !disable_nx) {
-                       rdmsr(MSR_EFER, l, h);
-                       l |= EFER_NX;
-                       wrmsr(MSR_EFER, l, h);
-                       nx_enabled = 1;
-                       __supported_pte_mask |= _PAGE_NX;
-               }
-       }
-}
-#endif
-
 /* user-defined highmem size */
 static unsigned int highmem_pages = -1;
 
@@ -761,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn,
        highstart_pfn = highend_pfn = max_pfn;
        if (max_pfn > max_low_pfn)
                highstart_pfn = max_low_pfn;
-       memory_present(0, 0, highend_pfn);
        e820_register_active_regions(0, 0, highend_pfn);
+       sparse_memory_present_with_active_regions(0);
        printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
                pages_to_mb(highend_pfn - highstart_pfn));
        num_physpages = highend_pfn;
        high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
 #else
-       memory_present(0, 0, max_low_pfn);
        e820_register_active_regions(0, 0, max_low_pfn);
+       sparse_memory_present_with_active_regions(0);
        num_physpages = max_low_pfn;
        high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
 #endif
index 1753e8020df6ec8aa3eefea7342386224e595f2c..52bb9519bb86b4ec778d613939ea658adb1052a6 100644 (file)
 #include <asm/cacheflush.h>
 #include <asm/init.h>
 
-/*
- * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
- * The direct mapping extends to max_pfn_mapped, so that we can directly access
- * apertures, ACPI and other tables without having to play with fixmaps.
- */
-unsigned long max_low_pfn_mapped;
-unsigned long max_pfn_mapped;
-
 static unsigned long dma_reserve __initdata;
 
-DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
-
 static int __init parse_direct_gbpages_off(char *arg)
 {
        direct_gbpages = 0;
@@ -85,39 +75,6 @@ early_param("gbpages", parse_direct_gbpages_on);
 pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
-static int disable_nx __cpuinitdata;
-
-/*
- * noexec=on|off
- * Control non-executable mappings for 64-bit processes.
- *
- * on  Enable (default)
- * off Disable
- */
-static int __init nonx_setup(char *str)
-{
-       if (!str)
-               return -EINVAL;
-       if (!strncmp(str, "on", 2)) {
-               __supported_pte_mask |= _PAGE_NX;
-               disable_nx = 0;
-       } else if (!strncmp(str, "off", 3)) {
-               disable_nx = 1;
-               __supported_pte_mask &= ~_PAGE_NX;
-       }
-       return 0;
-}
-early_param("noexec", nonx_setup);
-
-void __cpuinit check_efer(void)
-{
-       unsigned long efer;
-
-       rdmsrl(MSR_EFER, efer);
-       if (!(efer & EFER_NX) || disable_nx)
-               __supported_pte_mask &= ~_PAGE_NX;
-}
-
 int force_personality32;
 
 /*
@@ -628,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
        early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT);
        reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT);
 }
+#endif
 
 void __init paging_init(void)
 {
@@ -638,11 +596,10 @@ void __init paging_init(void)
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = max_pfn;
 
-       memory_present(0, 0, max_pfn);
+       sparse_memory_present_with_active_regions(MAX_NUMNODES);
        sparse_init();
        free_area_init_nodes(max_zone_pfns);
 }
-#endif
 
 /*
  * Memory hotplug specific functions
index 8056545e2d39f9b53ba56349db4717809d6b9f04..fe6f84ca121ee072d2be8014e2b9b7e959e5abbd 100644 (file)
@@ -82,7 +82,6 @@ iounmap_atomic(void *kvaddr, enum km_type type)
        if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx))
                kpte_clear_flush(kmap_pte-idx, vaddr);
 
-       arch_flush_lazy_mmu_mode();
        pagefault_enable();
 }
 EXPORT_SYMBOL_GPL(iounmap_atomic);
index 50dc802a1c469b904154be3901f5fc87ebd7da93..16ccbd77917f22c1693b9b41fcb8dc7485acee39 100644 (file)
@@ -32,7 +32,7 @@ struct kmmio_fault_page {
        struct list_head list;
        struct kmmio_fault_page *release_next;
        unsigned long page; /* location of the fault page */
-       bool old_presence; /* page presence prior to arming */
+       pteval_t old_presence; /* page presence prior to arming */
        bool armed;
 
        /*
@@ -97,60 +97,62 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr)
 static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page)
 {
        struct list_head *head;
-       struct kmmio_fault_page *p;
+       struct kmmio_fault_page *f;
 
        page &= PAGE_MASK;
        head = kmmio_page_list(page);
-       list_for_each_entry_rcu(p, head, list) {
-               if (p->page == page)
-                       return p;
+       list_for_each_entry_rcu(f, head, list) {
+               if (f->page == page)
+                       return f;
        }
        return NULL;
 }
 
-static void set_pmd_presence(pmd_t *pmd, bool present, bool *old)
+static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old)
 {
        pmdval_t v = pmd_val(*pmd);
-       *old = !!(v & _PAGE_PRESENT);
-       v &= ~_PAGE_PRESENT;
-       if (present)
-               v |= _PAGE_PRESENT;
+       if (clear) {
+               *old = v & _PAGE_PRESENT;
+               v &= ~_PAGE_PRESENT;
+       } else  /* presume this has been called with clear==true previously */
+               v |= *old;
        set_pmd(pmd, __pmd(v));
 }
 
-static void set_pte_presence(pte_t *pte, bool present, bool *old)
+static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old)
 {
        pteval_t v = pte_val(*pte);
-       *old = !!(v & _PAGE_PRESENT);
-       v &= ~_PAGE_PRESENT;
-       if (present)
-               v |= _PAGE_PRESENT;
+       if (clear) {
+               *old = v & _PAGE_PRESENT;
+               v &= ~_PAGE_PRESENT;
+       } else  /* presume this has been called with clear==true previously */
+               v |= *old;
        set_pte_atomic(pte, __pte(v));
 }
 
-static int set_page_presence(unsigned long addr, bool present, bool *old)
+static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
 {
        unsigned int level;
-       pte_t *pte = lookup_address(addr, &level);
+       pte_t *pte = lookup_address(f->page, &level);
 
        if (!pte) {
-               pr_err("kmmio: no pte for page 0x%08lx\n", addr);
+               pr_err("kmmio: no pte for page 0x%08lx\n", f->page);
                return -1;
        }
 
        switch (level) {
        case PG_LEVEL_2M:
-               set_pmd_presence((pmd_t *)pte, present, old);
+               clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence);
                break;
        case PG_LEVEL_4K:
-               set_pte_presence(pte, present, old);
+               clear_pte_presence(pte, clear, &f->old_presence);
                break;
        default:
                pr_err("kmmio: unexpected page level 0x%x.\n", level);
                return -1;
        }
 
-       __flush_tlb_one(addr);
+       __flush_tlb_one(f->page);
        return 0;
 }
 
@@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
        WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n");
        if (f->armed) {
                pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n",
-                                       f->page, f->count, f->old_presence);
+                                       f->page, f->count, !!f->old_presence);
        }
-       ret = set_page_presence(f->page, false, &f->old_presence);
+       ret = clear_page_presence(f, true);
        WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page);
        f->armed = true;
        return ret;
@@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
 /** Restore the given page to saved presence state. */
 static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
 {
-       bool tmp;
-       int ret = set_page_presence(f->page, f->old_presence, &tmp);
+       int ret = clear_page_presence(f, false);
        WARN_ONCE(ret < 0,
                        KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page);
        f->armed = false;
@@ -310,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
        struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx);
 
        if (!ctx->active) {
-               pr_debug("kmmio: spurious debug trap on CPU %d.\n",
+               /*
+                * debug traps without an active context are due to either
+                * something external causing them (f.e. using a debugger while
+                * mmio tracing enabled), or erroneous behaviour
+                */
+               pr_warning("kmmio: unexpected debug trap on CPU %d.\n",
                                                        smp_processor_id());
                goto out;
        }
@@ -439,12 +445,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head)
                                                head,
                                                struct kmmio_delayed_release,
                                                rcu);
-       struct kmmio_fault_page *p = dr->release_list;
-       while (p) {
-               struct kmmio_fault_page *next = p->release_next;
-               BUG_ON(p->count);
-               kfree(p);
-               p = next;
+       struct kmmio_fault_page *f = dr->release_list;
+       while (f) {
+               struct kmmio_fault_page *next = f->release_next;
+               BUG_ON(f->count);
+               kfree(f);
+               f = next;
        }
        kfree(dr);
 }
@@ -453,19 +459,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
 {
        struct kmmio_delayed_release *dr =
                container_of(head, struct kmmio_delayed_release, rcu);
-       struct kmmio_fault_page *p = dr->release_list;
+       struct kmmio_fault_page *f = dr->release_list;
        struct kmmio_fault_page **prevp = &dr->release_list;
        unsigned long flags;
 
        spin_lock_irqsave(&kmmio_lock, flags);
-       while (p) {
-               if (!p->count) {
-                       list_del_rcu(&p->list);
-                       prevp = &p->release_next;
+       while (f) {
+               if (!f->count) {
+                       list_del_rcu(&f->list);
+                       prevp = &f->release_next;
                } else {
-                       *prevp = p->release_next;
+                       *prevp = f->release_next;
                }
-               p = p->release_next;
+               f = f->release_next;
        }
        spin_unlock_irqrestore(&kmmio_lock, flags);
 
@@ -528,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
 }
 EXPORT_SYMBOL(unregister_kmmio_probe);
 
-static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val,
-                                                               void *args)
+static int
+kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
 {
        struct die_args *arg = args;
 
@@ -544,11 +550,23 @@ static struct notifier_block nb_die = {
        .notifier_call = kmmio_die_notifier
 };
 
-static int __init init_kmmio(void)
+int kmmio_init(void)
 {
        int i;
+
        for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++)
                INIT_LIST_HEAD(&kmmio_page_table[i]);
+
        return register_die_notifier(&nb_die);
 }
-fs_initcall(init_kmmio); /* should be before device_initcall() */
+
+void kmmio_cleanup(void)
+{
+       int i;
+
+       unregister_die_notifier(&nb_die);
+       for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) {
+               WARN_ONCE(!list_empty(&kmmio_page_table[i]),
+                       KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n");
+       }
+}
index 605c8be06217b0da36344abd5f23d06326bcd87a..c0bedcd10f9733a3a9a21873b9043ead3a87546e 100644 (file)
@@ -40,23 +40,23 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
 
 static void __init memtest(u64 pattern, u64 start_phys, u64 size)
 {
-       u64 i, count;
-       u64 *start;
+       u64 *p;
+       void *start, *end;
        u64 start_bad, last_bad;
        u64 start_phys_aligned;
        size_t incr;
 
        incr = sizeof(pattern);
        start_phys_aligned = ALIGN(start_phys, incr);
-       count = (size - (start_phys_aligned - start_phys))/incr;
        start = __va(start_phys_aligned);
+       end = start + size - (start_phys_aligned - start_phys);
        start_bad = 0;
        last_bad = 0;
 
-       for (i = 0; i < count; i++)
-               start[i] = pattern;
-       for (i = 0; i < count; i++, start++, start_phys_aligned += incr) {
-               if (*start == pattern)
+       for (p = start; p < end; p++)
+               *p = pattern;
+       for (p = start; p < end; p++, start_phys_aligned += incr) {
+               if (*p == pattern)
                        continue;
                if (start_phys_aligned == last_bad + incr) {
                        last_bad += incr;
index c9342ed8b402dd93cc2e991ac1f44f67a0261c8d..132772a8ec57a66c629ad5719431fa6c08d95d98 100644 (file)
@@ -451,6 +451,7 @@ void enable_mmiotrace(void)
 
        if (nommiotrace)
                pr_info(NAME "MMIO tracing disabled.\n");
+       kmmio_init();
        enter_uniprocessor();
        spin_lock_irq(&trace_lock);
        atomic_inc(&mmiotrace_enabled);
@@ -473,6 +474,7 @@ void disable_mmiotrace(void)
 
        clear_trace_list(); /* guarantees: no more kmmio callbacks */
        leave_uniprocessor();
+       kmmio_cleanup();
        pr_info(NAME "disabled.\n");
 out:
        mutex_unlock(&mmiotrace_mutex);
index 2d05a12029dc3d216814eb2a9e914b389014407a..459913beac71dc0539a4f0bd11a299df869e5711 100644 (file)
@@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
 }
 
 /* Initialize bootmem allocator for a node */
-void __init setup_node_bootmem(int nodeid, unsigned long start,
-                              unsigned long end)
+void __init
+setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
 {
        unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size;
+       const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        unsigned long bootmap_start, nodedata_phys;
        void *bootmap;
-       const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
        int nid;
 
        if (!end)
                return;
 
+       /*
+        * Don't confuse VM with a node that doesn't have the
+        * minimum amount of memory:
+        */
+       if (end && (end - start) < NODE_MIN_SIZE)
+               return;
+
        start = roundup(start, ZONE_ALIGN);
 
        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid,
@@ -272,9 +279,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start,
                reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                                 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT);
 
-#ifdef CONFIG_ACPI_NUMA
-       srat_reserve_add_area(nodeid);
-#endif
        node_set_online(nodeid);
 }
 
@@ -578,21 +582,6 @@ unsigned long __init numa_free_all_bootmem(void)
        return pages;
 }
 
-void __init paging_init(void)
-{
-       unsigned long max_zone_pfns[MAX_NR_ZONES];
-
-       memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
-       max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
-       max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
-       max_zone_pfns[ZONE_NORMAL] = max_pfn;
-
-       sparse_memory_present_with_active_regions(MAX_NUMNODES);
-       sparse_init();
-
-       free_area_init_nodes(max_zone_pfns);
-}
-
 static __init int numa_setup(char *opt)
 {
        if (!opt)
@@ -606,8 +595,6 @@ static __init int numa_setup(char *opt)
 #ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
-       if (!strncmp(opt, "hotadd=", 7))
-               hotadd_percent = simple_strtoul(opt+7, NULL, 10);
 #endif
        return 0;
 }
index e17efed088c54a7b546b7b76c073ab55231d425e..6ce9518fe2acb6457db7d6c19c1739e6f48f2c9b 100644 (file)
@@ -839,13 +839,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 
        vm_unmap_aliases();
 
-       /*
-        * If we're called with lazy mmu updates enabled, the
-        * in-memory pte state may be stale.  Flush pending updates to
-        * bring them up to date.
-        */
-       arch_flush_lazy_mmu_mode();
-
        cpa.vaddr = addr;
        cpa.pages = pages;
        cpa.numpages = numpages;
@@ -890,13 +883,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
        } else
                cpa_flush_all(cache);
 
-       /*
-        * If we've been called with lazy mmu updates enabled, then
-        * make sure that everything gets flushed out before we
-        * return.
-        */
-       arch_flush_lazy_mmu_mode();
-
 out:
        return ret;
 }
index 01765955baaf66922ad70a959c9875f60fcad56e..2dfcbf9df2ae8410228e25fb4f559cd75ed1982e 100644 (file)
@@ -31,17 +31,11 @@ static nodemask_t nodes_parsed __initdata;
 static nodemask_t cpu_nodes_parsed __initdata;
 static struct bootnode nodes[MAX_NUMNODES] __initdata;
 static struct bootnode nodes_add[MAX_NUMNODES];
-static int found_add_area __initdata;
-int hotadd_percent __initdata = 0;
 
 static int num_node_memblks __initdata;
 static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
 static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
 
-/* Too small nodes confuse the VM badly. Usually they result
-   from BIOS bugs. */
-#define NODE_MIN_SIZE (4*1024*1024)
-
 static __init int setup_node(int pxm)
 {
        return acpi_map_pxm_to_node(pxm);
@@ -66,9 +60,6 @@ static __init void cutoff_node(int i, unsigned long start, unsigned long end)
 {
        struct bootnode *nd = &nodes[i];
 
-       if (found_add_area)
-               return;
-
        if (nd->start < start) {
                nd->start = start;
                if (nd->end < nd->start)
@@ -86,7 +77,6 @@ static __init void bad_srat(void)
        int i;
        printk(KERN_ERR "SRAT: SRAT not used.\n");
        acpi_numa = -1;
-       found_add_area = 0;
        for (i = 0; i < MAX_LOCAL_APIC; i++)
                apicid_to_node[i] = NUMA_NO_NODE;
        for (i = 0; i < MAX_NUMNODES; i++)
@@ -182,24 +172,21 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
               pxm, apic_id, node);
 }
 
-static int update_end_of_memory(unsigned long end) {return -1;}
-static int hotadd_enough_memory(struct bootnode *nd) {return 1;}
 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
 static inline int save_add_info(void) {return 1;}
 #else
 static inline int save_add_info(void) {return 0;}
 #endif
 /*
- * Update nodes_add and decide if to include add are in the zone.
- * Both SPARSE and RESERVE need nodes_add information.
- * This code supports one contiguous hot add area per node.
+ * Update nodes_add[]
+ * This code supports one contiguous hot add area per node
  */
-static int __init
-reserve_hotadd(int node, unsigned long start, unsigned long end)
+static void __init
+update_nodes_add(int node, unsigned long start, unsigned long end)
 {
        unsigned long s_pfn = start >> PAGE_SHIFT;
        unsigned long e_pfn = end >> PAGE_SHIFT;
-       int ret = 0, changed = 0;
+       int changed = 0;
        struct bootnode *nd = &nodes_add[node];
 
        /* I had some trouble with strange memory hotadd regions breaking
@@ -210,7 +197,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
           mistakes */
        if ((signed long)(end - start) < NODE_MIN_SIZE) {
                printk(KERN_ERR "SRAT: Hotplug area too small\n");
-               return -1;
+               return;
        }
 
        /* This check might be a bit too strict, but I'm keeping it for now. */
@@ -218,12 +205,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
                printk(KERN_ERR
                        "SRAT: Hotplug area %lu -> %lu has existing memory\n",
                        s_pfn, e_pfn);
-               return -1;
-       }
-
-       if (!hotadd_enough_memory(&nodes_add[node]))  {
-               printk(KERN_ERR "SRAT: Hotplug area too large\n");
-               return -1;
+               return;
        }
 
        /* Looks good */
@@ -245,11 +227,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end)
                        printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
        }
 
-       ret = update_end_of_memory(nd->end);
-
        if (changed)
-               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end);
-       return ret;
+               printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
+                                nd->start, nd->end);
 }
 
 /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -310,13 +290,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
               start, end);
        e820_register_active_regions(node, start >> PAGE_SHIFT,
                                     end >> PAGE_SHIFT);
-       push_node_boundaries(node, nd->start >> PAGE_SHIFT,
-                                               nd->end >> PAGE_SHIFT);
 
-       if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) &&
-           (reserve_hotadd(node, start, end) < 0)) {
-               /* Ignore hotadd region. Undo damage */
-               printk(KERN_NOTICE "SRAT: Hotplug region ignored\n");
+       if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
+               update_nodes_add(node, start, end);
+               /* restore nodes[node] */
                *nd = oldnode;
                if ((nd->start | nd->end) == 0)
                        node_clear(node, nodes_parsed);
@@ -345,9 +322,9 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
                        pxmram = 0;
        }
 
-       e820ram = max_pfn - absent_pages_in_range(0, max_pfn);
-       /* We seem to lose 3 pages somewhere. Allow a bit of slack. */
-       if ((long)(e820ram - pxmram) >= 1*1024*1024) {
+       e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
+       /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
+       if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
                printk(KERN_ERR
        "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
                        (pxmram << PAGE_SHIFT) >> 20,
@@ -357,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
        return 1;
 }
 
-static void __init unparse_node(int node)
-{
-       int i;
-       node_clear(node, nodes_parsed);
-       node_clear(node, cpu_nodes_parsed);
-       for (i = 0; i < MAX_LOCAL_APIC; i++) {
-               if (apicid_to_node[i] == node)
-                       apicid_to_node[i] = NUMA_NO_NODE;
-       }
-}
-
 void __init acpi_numa_arch_fixup(void) {}
 
 /* Use the information discovered above to actually set up the nodes. */
@@ -379,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
                return -1;
 
        /* First clean up the node list */
-       for (i = 0; i < MAX_NUMNODES; i++) {
+       for (i = 0; i < MAX_NUMNODES; i++)
                cutoff_node(i, start, end);
-               /*
-                * don't confuse VM with a node that doesn't have the
-                * minimum memory.
-                */
-               if (nodes[i].end &&
-                       (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) {
-                       unparse_node(i);
-                       node_set_offline(i);
-               }
-       }
 
        if (!nodes_cover_memory(nodes)) {
                bad_srat();
@@ -423,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
 
                if (node == NUMA_NO_NODE)
                        continue;
-               if (!node_isset(node, node_possible_map))
+               if (!node_online(node))
                        numa_clear_node(i);
        }
        numa_init_array();
@@ -510,26 +466,6 @@ static int null_slit_node_compare(int a, int b)
 }
 #endif /* CONFIG_NUMA_EMU */
 
-void __init srat_reserve_add_area(int nodeid)
-{
-       if (found_add_area && nodes_add[nodeid].end) {
-               u64 total_mb;
-
-               printk(KERN_INFO "SRAT: Reserving hot-add memory space "
-                               "for node %d at %Lx-%Lx\n",
-                       nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end);
-               total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start)
-                                       >> PAGE_SHIFT;
-               total_mb *= sizeof(struct page);
-               total_mb >>= 20;
-               printk(KERN_INFO "SRAT: This will cost you %Lu MB of "
-                               "pre-allocated memory.\n", (unsigned long long)total_mb);
-               reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start,
-                              nodes_add[nodeid].end - nodes_add[nodeid].start,
-                              BOOTMEM_DEFAULT);
-       }
-}
-
 int __node_distance(int a, int b)
 {
        int index;
index 202864ad49a7e5bf2a6574922d7cd938dd023fd9..3b285e656e27e57dbf4ae6dd35b715d7c9f2d1c7 100644 (file)
@@ -356,14 +356,11 @@ static void exit_sysfs(void)
 #define exit_sysfs() do { } while (0)
 #endif /* CONFIG_PM */
 
-static int p4force;
-module_param(p4force, int, 0);
-
 static int __init p4_init(char **cpu_type)
 {
        __u8 cpu_model = boot_cpu_data.x86_model;
 
-       if (!p4force && (cpu_model > 6 || cpu_model == 5))
+       if (cpu_model > 6 || cpu_model == 5)
                return 0;
 
 #ifndef CONFIG_SMP
@@ -389,10 +386,25 @@ static int __init p4_init(char **cpu_type)
        return 0;
 }
 
+static int force_arch_perfmon;
+/*
+ * Setter for the "cpu_type" module parameter (registered with
+ * module_param_call() directly below this function).
+ *
+ * @str: value supplied for the parameter; may be NULL when the
+ *       parameter is given without "=value".
+ * @kp:  parameter descriptor, not used here.
+ *
+ * The value "archperfmon" sets force_arch_perfmon and logs that fact.
+ * Any other string is accepted and silently ignored.
+ *
+ * Returns 0, or -EINVAL when no value was supplied.
+ */
+static int force_cpu_type(const char *str, struct kernel_param *kp)
+{
+       /* strcmp() must never be handed a NULL pointer */
+       if (!str)
+               return -EINVAL;
+
+       if (!strcmp(str, "archperfmon")) {
+               force_arch_perfmon = 1;
+               printk(KERN_INFO "oprofile: forcing architectural perfmon\n");
+       }
+
+       return 0;
+}
+module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0);
+
 static int __init ppro_init(char **cpu_type)
 {
        __u8 cpu_model = boot_cpu_data.x86_model;
 
+       if (force_arch_perfmon && cpu_has_arch_perfmon)
+               return 0;
+
        switch (cpu_model) {
        case 0 ... 2:
                *cpu_type = "i386/ppro";
@@ -414,6 +426,13 @@ static int __init ppro_init(char **cpu_type)
        case 15: case 23:
                *cpu_type = "i386/core_2";
                break;
+       case 26:
+               arch_perfmon_setup_counters();
+               *cpu_type = "i386/core_i7";
+               break;
+       case 28:
+               *cpu_type = "i386/atom";
+               break;
        default:
                /* Unknown */
                return 0;
index fecbce6e7d7c20d1f0af750e2305ef4e9a37a92c..0696d506c4ade99b0d43232128a77cea99d65ec8 100644 (file)
@@ -889,6 +889,9 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
                return 0;
        }
 
+       if (io_apic_assign_pci_irqs)
+               return 0;
+
        /* Find IRQ routing entry */
 
        if (!pirq_table)
@@ -1039,56 +1042,15 @@ static void __init pcibios_fixup_irqs(void)
                pirq_penalty[dev->irq]++;
        }
 
+       if (io_apic_assign_pci_irqs)
+               return;
+
        dev = NULL;
        while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
                pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
                if (!pin)
                        continue;
 
-#ifdef CONFIG_X86_IO_APIC
-               /*
-                * Recalculate IRQ numbers if we use the I/O APIC.
-                */
-               if (io_apic_assign_pci_irqs) {
-                       int irq;
-
-                       /*
-                        * interrupt pins are numbered starting from 1
-                        */
-                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
-                               PCI_SLOT(dev->devfn), pin - 1);
-                       /*
-                        * Busses behind bridges are typically not listed in the
-                        * MP-table.  In this case we have to look up the IRQ
-                        * based on the parent bus, parent slot, and pin number.
-                        * The SMP code detects such bridged busses itself so we
-                        * should get into this branch reliably.
-                        */
-                       if (irq < 0 && dev->bus->parent) {
-                               /* go back to the bridge */
-                               struct pci_dev *bridge = dev->bus->self;
-                               int bus;
-
-                               pin = pci_swizzle_interrupt_pin(dev, pin);
-                               bus = bridge->bus->number;
-                               irq = IO_APIC_get_PCI_irq_vector(bus,
-                                               PCI_SLOT(bridge->devfn), pin - 1);
-                               if (irq >= 0)
-                                       dev_warn(&dev->dev,
-                                               "using bridge %s INT %c to "
-                                                       "get IRQ %d\n",
-                                                pci_name(bridge),
-                                                'A' + pin - 1, irq);
-                       }
-                       if (irq >= 0) {
-                               dev_info(&dev->dev,
-                                       "PCI->APIC IRQ transform: INT %c "
-                                               "-> IRQ %d\n",
-                                       'A' + pin - 1, irq);
-                               dev->irq = irq;
-                       }
-               }
-#endif
                /*
                 * Still no IRQ? Try to lookup one...
                 */
@@ -1183,6 +1145,19 @@ int __init pcibios_irq_init(void)
        pcibios_enable_irq = pirq_enable_irq;
 
        pcibios_fixup_irqs();
+
+       if (io_apic_assign_pci_irqs && pci_routeirq) {
+               struct pci_dev *dev = NULL;
+               /*
+                * PCI IRQ routing is set up by pci_enable_device(), but we
+                * also do it here in case there are still broken drivers that
+                * don't use pci_enable_device().
+                */
+               printk(KERN_INFO "PCI: Routing PCI interrupts for all devices because \"pci=routeirq\" specified\n");
+               for_each_pci_dev(dev)
+                       pirq_enable_irq(dev);
+       }
+
        return 0;
 }
 
@@ -1213,16 +1188,23 @@ void pcibios_penalize_isa_irq(int irq, int active)
 static int pirq_enable_irq(struct pci_dev *dev)
 {
        u8 pin;
-       struct pci_dev *temp_dev;
 
        pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
-       if (pin && !pcibios_lookup_irq(dev, 1) && !dev->irq) {
+       if (pin && !pcibios_lookup_irq(dev, 1)) {
                char *msg = "";
 
+               if (!io_apic_assign_pci_irqs && dev->irq)
+                       return 0;
+
                if (io_apic_assign_pci_irqs) {
+#ifdef CONFIG_X86_IO_APIC
+                       struct pci_dev *temp_dev;
                        int irq;
+                       struct io_apic_irq_attr irq_attr;
 
-                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number, PCI_SLOT(dev->devfn), pin - 1);
+                       irq = IO_APIC_get_PCI_irq_vector(dev->bus->number,
+                                               PCI_SLOT(dev->devfn),
+                                               pin - 1, &irq_attr);
                        /*
                         * Busses behind bridges are typically not listed in the MP-table.
                         * In this case we have to look up the IRQ based on the parent bus,
@@ -1235,7 +1217,8 @@ static int pirq_enable_irq(struct pci_dev *dev)
 
                                pin = pci_swizzle_interrupt_pin(dev, pin);
                                irq = IO_APIC_get_PCI_irq_vector(bridge->bus->number,
-                                               PCI_SLOT(bridge->devfn), pin - 1);
+                                               PCI_SLOT(bridge->devfn),
+                                               pin - 1, &irq_attr);
                                if (irq >= 0)
                                        dev_warn(&dev->dev, "using bridge %s "
                                                 "INT %c to get IRQ %d\n",
@@ -1245,12 +1228,15 @@ static int pirq_enable_irq(struct pci_dev *dev)
                        }
                        dev = temp_dev;
                        if (irq >= 0) {
+                               io_apic_set_pci_routing(&dev->dev, irq,
+                                                        &irq_attr);
+                               dev->irq = irq;
                                dev_info(&dev->dev, "PCI->APIC IRQ transform: "
                                         "INT %c -> IRQ %d\n", 'A' + pin - 1, irq);
-                               dev->irq = irq;
                                return 0;
                        } else
                                msg = "; probably buggy MP table";
+#endif
                } else if (pci_probe & PCI_BIOS_IRQ_SCAN)
                        msg = "";
                else
index 7133cdf9098b7ef0366871a92c29ef8aa5445209..cac083386e0339801940daf8463a91df40f35721 100644 (file)
@@ -8,6 +8,7 @@
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/random.h>
+#include <linux/elf.h>
 #include <asm/vsyscall.h>
 #include <asm/vgtod.h>
 #include <asm/proto.h>
index f09e8c36ee805d58ba0d6580397305744446cc42..0a1700a2be9c8c9a548822ca4deef1bb475b4059 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/delay.h>
 #include <linux/start_kernel.h>
 #include <linux/sched.h>
+#include <linux/kprobes.h>
 #include <linux/bootmem.h>
 #include <linux/module.h>
 #include <linux/mm.h>
@@ -44,6 +45,7 @@
 #include <asm/processor.h>
 #include <asm/proto.h>
 #include <asm/msr-index.h>
+#include <asm/traps.h>
 #include <asm/setup.h>
 #include <asm/desc.h>
 #include <asm/pgtable.h>
@@ -240,10 +242,10 @@ static unsigned long xen_get_debugreg(int reg)
        return HYPERVISOR_get_debugreg(reg);
 }
 
-void xen_leave_lazy(void)
+static void xen_end_context_switch(struct task_struct *next)
 {
-       paravirt_leave_lazy(paravirt_get_lazy_mode());
        xen_mc_flush();
+       paravirt_end_context_switch(next);
 }
 
 static unsigned long xen_store_tr(void)
@@ -428,11 +430,44 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
 static int cvt_gate_to_trap(int vector, const gate_desc *val,
                            struct trap_info *info)
 {
+       unsigned long addr;
+
        if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT)
                return 0;
 
        info->vector = vector;
-       info->address = gate_offset(*val);
+
+       addr = gate_offset(*val);
+#ifdef CONFIG_X86_64
+       /*
+        * Look for known traps using IST, and substitute them
+        * appropriately.  The debugger ones are the only ones we care
+        * about.  Xen will handle faults like double_fault and
+        * machine_check, so we should never see them.  Warn if
+        * there's an unexpected IST-using fault handler.
+        */
+       if (addr == (unsigned long)debug)
+               addr = (unsigned long)xen_debug;
+       else if (addr == (unsigned long)int3)
+               addr = (unsigned long)xen_int3;
+       else if (addr == (unsigned long)stack_segment)
+               addr = (unsigned long)xen_stack_segment;
+       else if (addr == (unsigned long)double_fault ||
+                addr == (unsigned long)nmi) {
+               /* Don't need to handle these */
+               return 0;
+#ifdef CONFIG_X86_MCE
+       } else if (addr == (unsigned long)machine_check) {
+               return 0;
+#endif
+       } else {
+               /* Some other trap using IST? */
+               if (WARN_ON(val->ist != 0))
+                       return 0;
+       }
+#endif /* CONFIG_X86_64 */
+       info->address = addr;
+
        info->cs = gate_segment(*val);
        info->flags = val->dpl;
        /* interrupt gates clear IF */
@@ -623,10 +658,26 @@ static void xen_clts(void)
        xen_mc_issue(PARAVIRT_LAZY_CPU);
 }
 
+static DEFINE_PER_CPU(unsigned long, xen_cr0_value);
+
+/*
+ * Return CR0 for the current CPU from the per-cpu xen_cr0_value cache.
+ *
+ * A cached value of zero is treated as "nothing cached yet": in that
+ * case the register is read with native_read_cr0() and the result is
+ * stored in the cache before being returned.
+ */
+static unsigned long xen_read_cr0(void)
+{
+       unsigned long cr0 = percpu_read(xen_cr0_value);
+
+       /* zero doubles as the "not read yet" marker */
+       if (unlikely(cr0 == 0)) {
+               cr0 = native_read_cr0();
+               percpu_write(xen_cr0_value, cr0);
+       }
+
+       return cr0;
+}
+
 static void xen_write_cr0(unsigned long cr0)
 {
        struct multicall_space mcs;
 
+       percpu_write(xen_cr0_value, cr0);
+
        /* Only pay attention to cr0.TS; everything else is
           ignored. */
        mcs = xen_mc_entry(0);
@@ -812,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
 
        .clts = xen_clts,
 
-       .read_cr0 = native_read_cr0,
+       .read_cr0 = xen_read_cr0,
        .write_cr0 = xen_write_cr0,
 
        .read_cr4 = native_read_cr4,
@@ -860,10 +911,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
        /* Xen takes care of %gs when switching to usermode for us */
        .swapgs = paravirt_nop,
 
-       .lazy_mode = {
-               .enter = paravirt_enter_lazy_cpu,
-               .leave = xen_leave_lazy,
-       },
+       .start_context_switch = paravirt_start_context_switch,
+       .end_context_switch = xen_end_context_switch,
 };
 
 static const struct pv_apic_ops xen_apic_ops __initdata = {
index fba55b1a40217f93dad87242d3d7584a5d9665b2..4ceb28581652ef0ab7ff7bcc5426dd914954d100 100644 (file)
@@ -452,10 +452,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
 void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
                    pte_t *ptep, pte_t pteval)
 {
-       /* updates to init_mm may be done without lock */
-       if (mm == &init_mm)
-               preempt_disable();
-
        ADD_STATS(set_pte_at, 1);
 //     ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
        ADD_STATS(set_pte_at_current, mm == current->mm);
@@ -476,9 +472,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
        }
        xen_set_pte(ptep, pteval);
 
-out:
-       if (mm == &init_mm)
-               preempt_enable();
+out:   return;
 }
 
 pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -1152,10 +1146,8 @@ static void drop_other_mm_ref(void *info)
 
        /* If this cpu still has a stale cr3 reference, then make sure
           it has been flushed. */
-       if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) {
+       if (percpu_read(xen_current_cr3) == __pa(mm->pgd))
                load_cr3(swapper_pg_dir);
-               arch_flush_lazy_cpu_mode();
-       }
 }
 
 static void xen_drop_mm_ref(struct mm_struct *mm)
@@ -1168,7 +1160,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
                        load_cr3(swapper_pg_dir);
                else
                        leave_mm(smp_processor_id());
-               arch_flush_lazy_cpu_mode();
        }
 
        /* Get the "official" set of cpus referring to our pagetable. */
@@ -1876,6 +1867,14 @@ __init void xen_post_allocator_init(void)
        xen_mark_init_mm_pinned();
 }
 
+/*
+ * Leave lazy MMU mode: call xen_mc_flush() and then
+ * paravirt_leave_lazy_mmu(), in that order.
+ *
+ * Both calls run inside a preempt_disable()/preempt_enable() pair.
+ * NOTE(review): presumably this keeps the flush and the mode change on
+ * the same CPU for callers that do not already hold off preemption --
+ * confirm against the callers.
+ */
+static void xen_leave_lazy_mmu(void)
+{
+       preempt_disable();
+       xen_mc_flush();
+       paravirt_leave_lazy_mmu();
+       preempt_enable();
+}
+
 const struct pv_mmu_ops xen_mmu_ops __initdata = {
        .pagetable_setup_start = xen_pagetable_setup_start,
        .pagetable_setup_done = xen_pagetable_setup_done,
@@ -1949,7 +1948,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = {
 
        .lazy_mode = {
                .enter = paravirt_enter_lazy_mmu,
-               .leave = xen_leave_lazy,
+               .leave = xen_leave_lazy_mmu,
        },
 
        .set_fixmap = xen_set_fixmap,
index 15c6c68db6a25f4941d25e8fc35857a3d518dc5d..ad0047f47cd476004c99485877f6d62a7db20d86 100644 (file)
@@ -61,9 +61,9 @@ char * __init xen_memory_setup(void)
         *  - xen_start_info
         * See comment above "struct start_info" in <xen/interface/xen.h>
         */
-       e820_add_region(__pa(xen_start_info->mfn_list),
-                       xen_start_info->pt_base - xen_start_info->mfn_list,
-                       E820_RESERVED);
+       reserve_early(__pa(xen_start_info->mfn_list),
+                     __pa(xen_start_info->pt_base),
+                       "XEN START INFO");
 
        sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
 
index ca6596b05d533c25f56e242409e88471a816ba9c..22494fd4c9b5cf49f8b3af7b2999626dc750b80c 100644 (file)
@@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
 
-void xen_leave_lazy(void);
 void xen_post_allocator_init(void);
 
 char * __init xen_memory_setup(void);
index c89883be87379d9454ab1af7cd68319e92795597..648f15cb41f1df789a8381802114a2c6bd1033d6 100644 (file)
 #include <linux/task_io_accounting_ops.h>
 #include <linux/blktrace_api.h>
 #include <linux/fault-inject.h>
-#include <trace/block.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/block.h>
 
 #include "blk.h"
 
-DEFINE_TRACE(block_plug);
-DEFINE_TRACE(block_unplug_io);
-DEFINE_TRACE(block_unplug_timer);
-DEFINE_TRACE(block_getrq);
-DEFINE_TRACE(block_sleeprq);
-DEFINE_TRACE(block_rq_requeue);
-DEFINE_TRACE(block_bio_backmerge);
-DEFINE_TRACE(block_bio_frontmerge);
-DEFINE_TRACE(block_bio_queue);
-DEFINE_TRACE(block_rq_complete);
-DEFINE_TRACE(block_remap);     /* Also used in drivers/md/dm.c */
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
 
@@ -1277,7 +1269,7 @@ static inline void blk_partition_remap(struct bio *bio)
                bio->bi_bdev = bdev->bd_contains;
 
                trace_block_remap(bdev_get_queue(bio->bi_bdev), bio,
-                                   bdev->bd_dev, bio->bi_sector,
+                                   bdev->bd_dev,
                                    bio->bi_sector - p->start_sect);
        }
 }
@@ -1446,8 +1438,7 @@ static inline void __generic_make_request(struct bio *bio)
                        goto end_io;
 
                if (old_sector != -1)
-                       trace_block_remap(q, bio, old_dev, bio->bi_sector,
-                                           old_sector);
+                       trace_block_remap(q, bio, old_dev, old_sector);
 
                trace_block_bio_queue(q, bio);
 
@@ -1741,10 +1732,14 @@ static int __end_that_request_first(struct request *req, int error,
        trace_block_rq_complete(req->q, req);
 
        /*
-        * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual
-        * sense key with us all the way through
+        * For fs requests, rq is just carrier of independent bio's
+        * and each partial completion should be handled separately.
+        * Reset per-request error on each partial completion.
+        *
+        * TODO: tj: This is too subtle.  It would be better to let
+        * low level drivers do what they see fit.
         */
-       if (!blk_pc_request(req))
+       if (blk_fs_request(req))
                req->errors = 0;
 
        if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) {
index 3ff9bba3379a84891ddbc97450fcdbddf6e42a1a..26f9ec28f56c7f3b6d87a3ed207bf0722d9af572 100644 (file)
@@ -383,16 +383,21 @@ struct kobj_type blk_queue_ktype = {
 int blk_register_queue(struct gendisk *disk)
 {
        int ret;
+       struct device *dev = disk_to_dev(disk);
 
        struct request_queue *q = disk->queue;
 
        if (WARN_ON(!q))
                return -ENXIO;
 
+       ret = blk_trace_init_sysfs(dev);
+       if (ret)
+               return ret;
+
        if (!q->request_fn)
                return 0;
 
-       ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj),
+       ret = kobject_add(&q->kobj, kobject_get(&dev->kobj),
                          "%s", "queue");
        if (ret < 0)
                return ret;
index f87615dea46bbd9dfcdf0789f170c2b9b8512f6a..f8c218cd08e193d206c8b40dc18750ac1bdaa804 100644 (file)
@@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg)
        memcpy(&buts.name, &cbuts.name, 32);
 
        mutex_lock(&bdev->bd_mutex);
-       ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts);
+       ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts);
        mutex_unlock(&bdev->bd_mutex);
        if (ret)
                return ret;
index 7073a9072577cdf3a0ae6e63c5ca247c2f493a5d..e220f0c543e3c94bcd8907df4e9ac676ce57c41b 100644 (file)
 #include <linux/compiler.h>
 #include <linux/delay.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <linux/hash.h>
 #include <linux/uaccess.h>
 
+#include <trace/events/block.h>
+
 #include "blk.h"
 
 static DEFINE_SPINLOCK(elv_list_lock);
 static LIST_HEAD(elv_list);
 
-DEFINE_TRACE(block_rq_abort);
-
 /*
  * Merge hash stuff.
  */
@@ -55,9 +54,6 @@ static const int elv_hash_shift = 6;
 #define rq_hash_key(rq)                ((rq)->sector + (rq)->nr_sectors)
 #define ELV_ON_HASH(rq)                (!hlist_unhashed(&(rq)->hash))
 
-DEFINE_TRACE(block_rq_insert);
-DEFINE_TRACE(block_rq_issue);
-
 /*
  * Query io scheduler to see if the current process issuing bio may be
  * merged with rq.
index 51b9f8280f887cfd6649a39446e7e081d52fe3b6..2faa9e2ac89331b9c46c1de0de560a0225ce88e2 100644 (file)
@@ -401,7 +401,8 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                /* Interrupt Line values above 0xF are forbidden */
                if (dev->irq > 0 && (dev->irq <= 0xF)) {
                        printk(" - using IRQ %d\n", dev->irq);
-                       acpi_register_gsi(dev->irq, ACPI_LEVEL_SENSITIVE,
+                       acpi_register_gsi(&dev->dev, dev->irq,
+                                         ACPI_LEVEL_SENSITIVE,
                                          ACPI_ACTIVE_LOW);
                        return 0;
                } else {
@@ -410,7 +411,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev)
                }
        }
 
-       rc = acpi_register_gsi(gsi, triggering, polarity);
+       rc = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
        if (rc < 0) {
                dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n",
                         pin_name(pin));
index af761dc434f638e9ab869a8e8d8508a98adbd175..4895f0e053229bc8c04b568912340ffdbc02a076 100644 (file)
@@ -277,8 +277,8 @@ static int hci_uart_tty_open(struct tty_struct *tty)
        /* FIXME: why is this needed. Note don't use ldisc_ref here as the
           open path is before the ldisc is referencable */
 
-       if (tty->ldisc.ops->flush_buffer)
-               tty->ldisc.ops->flush_buffer(tty);
+       if (tty->ldisc->ops->flush_buffer)
+               tty->ldisc->ops->flush_buffer(tty);
        tty_driver_flush_buffer(tty);
 
        return 0;
@@ -463,7 +463,6 @@ static int hci_uart_tty_ioctl(struct tty_struct *tty, struct file * file,
                                clear_bit(HCI_UART_PROTO_SET, &hu->flags);
                                return err;
                        }
-                       tty->low_latency = 1;
                } else
                        return -EBUSY;
                break;
index 735bbe2be51aaf73819078c543320c5fc9111bbc..02ecfd5fa61c56f17ed658e3759e525794f53d81 100644 (file)
@@ -97,6 +97,19 @@ config DEVKMEM
          kind of kernel debugging operations.
          When in doubt, say "N".
 
+config BFIN_JTAG_COMM
+       tristate "Blackfin JTAG Communication"
+       depends on BLACKFIN
+       help
+         Add support for emulating a TTY device over the Blackfin JTAG.
+
+         To compile this driver as a module, choose M here: the
+         module will be called bfin_jtag_comm.
+
+config BFIN_JTAG_COMM_CONSOLE
+       bool "Console on Blackfin JTAG"
+       depends on BFIN_JTAG_COMM=y
+
 config SERIAL_NONSTANDARD
        bool "Non-standard serial port support"
        depends on HAS_IOMEM
index 9caf5b5ad1c05bfbe9ec4f7683a7f1c8c852005e..189efcff08ce246a58900ab15a7cacd487c37724 100644 (file)
@@ -13,6 +13,7 @@ obj-$(CONFIG_LEGACY_PTYS)     += pty.o
 obj-$(CONFIG_UNIX98_PTYS)      += pty.o
 obj-y                          += misc.o
 obj-$(CONFIG_VT)               += vt_ioctl.o vc_screen.o selection.o keyboard.o
+obj-$(CONFIG_BFIN_JTAG_COMM)   += bfin_jtag_comm.o
 obj-$(CONFIG_CONSOLE_TRANSLATIONS) += consolemap.o consolemap_deftbl.o
 obj-$(CONFIG_HW_CONSOLE)       += vt.o defkeymap.o
 obj-$(CONFIG_AUDIT)            += tty_audit.o
diff --git a/drivers/char/bfin_jtag_comm.c b/drivers/char/bfin_jtag_comm.c
new file mode 100644 (file)
index 0000000..44c113d
--- /dev/null
@@ -0,0 +1,365 @@
+/*
+ * TTY over Blackfin JTAG Communication
+ *
+ * Copyright 2008-2009 Analog Devices Inc.
+ *
+ * Enter bugs at http://blackfin.uclinux.org/
+ *
+ * Licensed under the GPL-2 or later.
+ */
+
+#include <linux/circ_buf.h>
+#include <linux/console.h>
+#include <linux/delay.h>
+#include <linux/err.h>
+#include <linux/kernel.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/sched.h>
+#include <linux/tty.h>
+#include <linux/tty_driver.h>
+#include <linux/tty_flip.h>
+#include <asm/atomic.h>
+
+/* See the Debug/Emulation chapter in the HRM */
+#define EMUDOF   0x00000001    /* EMUDAT_OUT full & valid */
+#define EMUDIF   0x00000002    /* EMUDAT_IN full & valid */
+#define EMUDOOVF 0x00000004    /* EMUDAT_OUT overflow */
+#define EMUDIOVF 0x00000008    /* EMUDAT_IN overflow */
+
+#define DRV_NAME "bfin-jtag-comm"
+#define DEV_NAME "ttyBFJC"
+
+#define pr_init(fmt, args...) ({ static const __initdata char __fmt[] = fmt; printk(__fmt, ## args); })
+#define debug(fmt, args...) pr_debug(DRV_NAME ": " fmt, ## args)
+
+/*
+ * Write one 32-bit word to the EMUDAT register.
+ *
+ * The value is returned unchanged, which lets callers log what was sent.
+ */
+static inline uint32_t bfin_write_emudat(uint32_t emudat)
+{
+       __asm__ __volatile__("emudat = %0;" : : "d"(emudat));
+       return emudat;
+}
+
+/* Read the EMUDAT register and return its 32-bit contents. */
+static inline uint32_t bfin_read_emudat(void)
+{
+       uint32_t emudat;
+       __asm__ __volatile__("%0 = emudat;" : "=d"(emudat));
+       return emudat;
+}
+
+/*
+ * Pack four characters into one EMUDAT word and write it out: @a lands in
+ * bits 0-7, @b in bits 8-15, @c in bits 16-23 and @d in bits 24-31.
+ *
+ * Each character is narrowed to an unsigned byte and widened to 32 bits
+ * before it is shifted.  Where plain char is signed, a byte >= 0x80 would
+ * otherwise be sign-extended by integer promotion and its high bits would
+ * overwrite the other lanes of the word.
+ *
+ * Returns the word that was written.
+ */
+static inline uint32_t bfin_write_emudat_chars(char a, char b, char c, char d)
+{
+       return bfin_write_emudat(((uint32_t)(unsigned char)a << 0) |
+                                ((uint32_t)(unsigned char)b << 8) |
+                                ((uint32_t)(unsigned char)c << 16) |
+                                ((uint32_t)(unsigned char)d << 24));
+}
+
+#define CIRC_SIZE 2048 /* see comment in tty_io.c:do_tty_write() */
+#define CIRC_MASK (CIRC_SIZE - 1)
+#define circ_empty(circ)     ((circ)->head == (circ)->tail)
+#define circ_free(circ)      CIRC_SPACE((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_cnt(circ)       CIRC_CNT((circ)->head, (circ)->tail, CIRC_SIZE)
+#define circ_byte(circ, idx) ((circ)->buf[(idx) & CIRC_MASK])
+
+static struct tty_driver *bfin_jc_driver;
+static struct task_struct *bfin_jc_kthread;
+static struct tty_struct * volatile bfin_jc_tty;
+static unsigned long bfin_jc_count;
+static DEFINE_MUTEX(bfin_jc_tty_mutex);
+static volatile struct circ_buf bfin_jc_write_buf;
+
+/*
+ * Kernel thread that moves data between the tty layer and the EMUDAT
+ * register until kthread_stop() is called.
+ *
+ * In both directions a transfer starts with one 32-bit word holding the
+ * byte count, followed by the payload packed four bytes to a word.
+ * inbound_len and outbound_len hold the number of payload bytes of the
+ * current transfer that are still outstanding; zero means the next word is
+ * a length word.
+ *
+ * @arg is unused.  Always returns 0.
+ */
+static int
+bfin_jc_emudat_manager(void *arg)
+{
+       uint32_t inbound_len = 0, outbound_len = 0;
+
+       while (!kthread_should_stop()) {
+               /* no one left to give data to, so sleep */
+               if (bfin_jc_tty == NULL && circ_empty(&bfin_jc_write_buf)) {
+                       debug("waiting for readers\n");
+                       /*
+                        * Change the task state first and only then re-test
+                        * the condition.  A wake_up_process() from
+                        * open/close/write or from kthread_stop() that slips
+                        * in after the test now simply makes schedule()
+                        * return at once instead of being lost.
+                        */
+                       set_current_state(TASK_UNINTERRUPTIBLE);
+                       if (bfin_jc_tty == NULL &&
+                           circ_empty(&bfin_jc_write_buf) &&
+                           !kthread_should_stop())
+                               schedule();
+                       __set_current_state(TASK_RUNNING);
+               }
+
+               /* no data available, so just chill */
+               if (!(bfin_read_DBGSTAT() & EMUDIF) && circ_empty(&bfin_jc_write_buf)) {
+                       debug("waiting for data (in_len = %i) (circ: %i %i)\n",
+                               inbound_len, bfin_jc_write_buf.tail, bfin_jc_write_buf.head);
+                       /* mid-transfer: only yield; otherwise nap for up to HZ */
+                       if (inbound_len)
+                               schedule();
+                       else
+                               schedule_timeout_interruptible(HZ);
+                       continue;
+               }
+
+               /* if incoming data is ready, eat it */
+               if (bfin_read_DBGSTAT() & EMUDIF) {
+                       struct tty_struct *tty;
+                       mutex_lock(&bfin_jc_tty_mutex);
+                       tty = (struct tty_struct *)bfin_jc_tty;
+                       if (tty != NULL) {
+                               uint32_t emudat = bfin_read_emudat();
+                               if (inbound_len == 0) {
+                                       debug("incoming length: 0x%08x\n", emudat);
+                                       inbound_len = emudat;
+                               } else {
+                                       /* the last word of a transfer may carry fewer than 4 bytes */
+                                       size_t num_chars = (4 <= inbound_len ? 4 : inbound_len);
+                                       debug("  incoming data: 0x%08x (pushing %zu)\n", emudat, num_chars);
+                                       inbound_len -= num_chars;
+                                       tty_insert_flip_string(tty, (unsigned char *)&emudat, num_chars);
+                                       tty_flip_buffer_push(tty);
+                               }
+                       }
+                       mutex_unlock(&bfin_jc_tty_mutex);
+               }
+
+               /* if outgoing data is ready, post it */
+               if (!(bfin_read_DBGSTAT() & EMUDOF) && !circ_empty(&bfin_jc_write_buf)) {
+                       if (outbound_len == 0) {
+                               outbound_len = circ_cnt(&bfin_jc_write_buf);
+                               bfin_write_emudat(outbound_len);
+                               debug("outgoing length: 0x%08x\n", outbound_len);
+                       } else {
+                               struct tty_struct *tty;
+                               int tail = bfin_jc_write_buf.tail;
+                               size_t ate = (4 <= outbound_len ? 4 : outbound_len);
+                               /*
+                                * Four buffer slots are always read (circ_byte()
+                                * masks the index), but the tail only advances
+                                * by the 'ate' bytes that belong to this transfer.
+                                */
+                               uint32_t emudat =
+                               bfin_write_emudat_chars(
+                                       circ_byte(&bfin_jc_write_buf, tail + 0),
+                                       circ_byte(&bfin_jc_write_buf, tail + 1),
+                                       circ_byte(&bfin_jc_write_buf, tail + 2),
+                                       circ_byte(&bfin_jc_write_buf, tail + 3)
+                               );
+                               bfin_jc_write_buf.tail += ate;
+                               outbound_len -= ate;
+                               mutex_lock(&bfin_jc_tty_mutex);
+                               tty = (struct tty_struct *)bfin_jc_tty;
+                               if (tty)
+                                       tty_wakeup(tty);
+                               mutex_unlock(&bfin_jc_tty_mutex);
+                               debug("  outgoing data: 0x%08x (pushing %zu)\n", emudat, ate);
+                       }
+               }
+       }
+
+       __set_current_state(TASK_RUNNING);
+       return 0;
+}
+
+/*
+ * tty open hook: count the new opener, make @tty the tty the transfer
+ * thread talks to, and wake that thread.  Always returns 0.
+ */
+static int
+bfin_jc_open(struct tty_struct *tty, struct file *filp)
+{
+       mutex_lock(&bfin_jc_tty_mutex);
+
+       debug("open %lu\n", bfin_jc_count);
+       bfin_jc_tty = tty;
+       bfin_jc_count++;
+       wake_up_process(bfin_jc_kthread);
+
+       mutex_unlock(&bfin_jc_tty_mutex);
+
+       return 0;
+}
+
+/*
+ * tty close hook: drop one opener and, once the count reaches zero, clear
+ * the tty pointer used by the transfer thread.  The thread is woken in
+ * either case.
+ */
+static void
+bfin_jc_close(struct tty_struct *tty, struct file *filp)
+{
+       mutex_lock(&bfin_jc_tty_mutex);
+
+       debug("close %lu\n", bfin_jc_count);
+       bfin_jc_count -= 1;
+       if (!bfin_jc_count)
+               bfin_jc_tty = NULL;
+       wake_up_process(bfin_jc_kthread);
+
+       mutex_unlock(&bfin_jc_tty_mutex);
+}
+
+/*
+ * Copy as much of @buf as currently fits into the outgoing circular buffer
+ * and advance its head.  Returns the number of bytes actually queued, which
+ * may be fewer than @count.
+ *
+ * XXX: we don't handle the put_char() case where we must handle count = 1
+ */
+static int
+bfin_jc_circ_write(const unsigned char *buf, int count)
+{
+       int queued = 0;
+
+       count = min(count, circ_free(&bfin_jc_write_buf));
+       debug("going to write chunk of %i bytes\n", count);
+       while (queued < count) {
+               circ_byte(&bfin_jc_write_buf, bfin_jc_write_buf.head + queued) = buf[queued];
+               queued++;
+       }
+       bfin_jc_write_buf.head += queued;
+
+       return queued;
+}
+
+#ifndef CONFIG_BFIN_JTAG_COMM_CONSOLE
+# define acquire_console_sem()
+# define release_console_sem()
+#endif
+/*
+ * tty write hook: queue up to @count bytes from @buf in the circular
+ * buffer, then wake the transfer thread.  The queueing is bracketed by
+ * acquire_console_sem()/release_console_sem(), which are defined to nothing
+ * when CONFIG_BFIN_JTAG_COMM_CONSOLE is not set.
+ *
+ * Returns the number of bytes queued.
+ */
+static int
+bfin_jc_write(struct tty_struct *tty, const unsigned char *buf, int count)
+{
+       int queued;
+
+       acquire_console_sem();
+       queued = bfin_jc_circ_write(buf, count);
+       release_console_sem();
+
+       wake_up_process(bfin_jc_kthread);
+
+       return queued;
+}
+
+/* tty flush_chars hook: nothing to flush here, just wake the transfer thread. */
+static void
+bfin_jc_flush_chars(struct tty_struct *tty)
+{
+       wake_up_process(bfin_jc_kthread);
+}
+
+/* tty write_room hook: free space left in the outgoing circular buffer. */
+static int
+bfin_jc_write_room(struct tty_struct *tty)
+{
+       return circ_free(&bfin_jc_write_buf);
+}
+
+/* tty chars_in_buffer hook: bytes queued in the outgoing circular buffer. */
+static int
+bfin_jc_chars_in_buffer(struct tty_struct *tty)
+{
+       return circ_cnt(&bfin_jc_write_buf);
+}
+
+/*
+ * tty wait_until_sent hook: wait for the outgoing circular buffer to drain.
+ *
+ * Gives up early when a signal is pending or once @timeout jiffies have
+ * passed.  The buffer is emptied by the transfer kthread, so the wait
+ * sleeps for a tick between checks instead of spinning: a tight loop here
+ * would compete with (and, without preemption, starve) the very thread it
+ * is waiting for.
+ */
+static void
+bfin_jc_wait_until_sent(struct tty_struct *tty, int timeout)
+{
+       unsigned long expire = jiffies + timeout;
+       while (!circ_empty(&bfin_jc_write_buf)) {
+               if (signal_pending(current))
+                       break;
+               if (time_after(jiffies, expire))
+                       break;
+               schedule_timeout_interruptible(1);
+       }
+}
+
+/*
+ * tty hooks implemented by this driver.  The table is never modified after
+ * build time, so it is const.  put_char is intentionally not wired up (see
+ * the XXX note on bfin_jc_circ_write()).
+ */
+static const struct tty_operations bfin_jc_ops = {
+       .open            = bfin_jc_open,
+       .close           = bfin_jc_close,
+       .write           = bfin_jc_write,
+       /*.put_char        = bfin_jc_put_char,*/
+       .flush_chars     = bfin_jc_flush_chars,
+       .write_room      = bfin_jc_write_room,
+       .chars_in_buffer = bfin_jc_chars_in_buffer,
+       .wait_until_sent = bfin_jc_wait_until_sent,
+};
+
+/*
+ * Driver initialisation: allocate the outgoing circular buffer, create the
+ * transfer kthread and register the tty driver.
+ *
+ * Returns 0 on success or a negative errno.  On failure only the resources
+ * that were actually acquired are released, and the file-scope pointers
+ * are reset so nothing is left pointing at freed or invalid objects.
+ */
+static int __init bfin_jc_init(void)
+{
+       struct task_struct *kthread;
+       int ret;
+
+       /*
+        * Set the buffer up before bfin_jc_kthread becomes non-NULL:
+        * bfin_jc_console_write() starts queueing into the circular buffer
+        * as soon as it sees a kthread pointer.
+        */
+       bfin_jc_write_buf.head = bfin_jc_write_buf.tail = 0;
+       bfin_jc_write_buf.buf = kmalloc(CIRC_SIZE, GFP_KERNEL);
+       if (!bfin_jc_write_buf.buf)
+               return -ENOMEM;
+
+       /* keep an ERR_PTR out of the global that other code tests for NULL */
+       kthread = kthread_create(bfin_jc_emudat_manager, NULL, DRV_NAME);
+       if (IS_ERR(kthread)) {
+               ret = PTR_ERR(kthread);
+               goto err_free_buf;
+       }
+       bfin_jc_kthread = kthread;
+
+       bfin_jc_driver = alloc_tty_driver(1);
+       if (!bfin_jc_driver) {
+               ret = -ENOMEM;
+               goto err_stop_kthread;
+       }
+
+       bfin_jc_driver->owner        = THIS_MODULE;
+       bfin_jc_driver->driver_name  = DRV_NAME;
+       bfin_jc_driver->name         = DEV_NAME;
+       bfin_jc_driver->type         = TTY_DRIVER_TYPE_SERIAL;
+       bfin_jc_driver->subtype      = SERIAL_TYPE_NORMAL;
+       bfin_jc_driver->init_termios = tty_std_termios;
+       tty_set_operations(bfin_jc_driver, &bfin_jc_ops);
+
+       ret = tty_register_driver(bfin_jc_driver);
+       if (ret)
+               goto err_put_driver;
+
+       pr_init(KERN_INFO DRV_NAME ": initialized\n");
+
+       return 0;
+
+ err_put_driver:
+       put_tty_driver(bfin_jc_driver);
+       bfin_jc_driver = NULL;
+ err_stop_kthread:
+       /* unpublish the thread before stopping it and freeing its buffer */
+       bfin_jc_kthread = NULL;
+       kthread_stop(kthread);
+ err_free_buf:
+       kfree(bfin_jc_write_buf.buf);
+       bfin_jc_write_buf.buf = NULL;
+       return ret;
+}
+module_init(bfin_jc_init);
+
+/*
+ * Driver teardown.  The tty driver goes first because its hooks wake the
+ * transfer thread and write into the circular buffer; the thread is
+ * stopped next because it reads that buffer; the buffer is freed last.
+ */
+static void __exit bfin_jc_exit(void)
+{
+       tty_unregister_driver(bfin_jc_driver);
+       put_tty_driver(bfin_jc_driver);
+       kthread_stop(bfin_jc_kthread);
+       kfree(bfin_jc_write_buf.buf);
+}
+module_exit(bfin_jc_exit);
+
+#if defined(CONFIG_BFIN_JTAG_COMM_CONSOLE) || defined(CONFIG_EARLY_PRINTK)
+/*
+ * Push @count bytes from @buf straight out through EMUDAT, busy-waiting
+ * for the EMUDOF bit in DBGSTAT to clear before every word: first one word
+ * holding @count, then the data packed four bytes per word.
+ *
+ * When @count is not a multiple of four the final word is padded with zero
+ * bytes; @buf is never read at or beyond index @count.
+ */
+static void
+bfin_jc_straight_buffer_write(const char *buf, unsigned count)
+{
+       unsigned ate = 0;
+       while (bfin_read_DBGSTAT() & EMUDOF)
+               continue;
+       bfin_write_emudat(count);
+       while (ate < count) {
+               /* bytes of @buf still unsent, including buf[ate] */
+               unsigned left = count - ate;
+               while (bfin_read_DBGSTAT() & EMUDOF)
+                       continue;
+               bfin_write_emudat_chars(buf[ate],
+                                       left > 1 ? buf[ate + 1] : 0,
+                                       left > 2 ? buf[ate + 2] : 0,
+                                       left > 3 ? buf[ate + 3] : 0);
+               ate += 4;
+       }
+}
+#endif
+
+#ifdef CONFIG_BFIN_JTAG_COMM_CONSOLE
+/*
+ * Console write hook.  Once the transfer kthread exists the text is queued
+ * in the circular buffer; until then it is pushed out synchronously.
+ */
+static void
+bfin_jc_console_write(struct console *co, const char *buf, unsigned count)
+{
+       if (bfin_jc_kthread != NULL) {
+               bfin_jc_circ_write(buf, count);
+               return;
+       }
+
+       bfin_jc_straight_buffer_write(buf, count);
+}
+
+/*
+ * Console device hook: report the console's own index through @index and
+ * return the tty driver backing this console.
+ */
+static struct tty_driver *
+bfin_jc_console_device(struct console *co, int *index)
+{
+       *index = co->index;
+       return bfin_jc_driver;
+}
+
+/* Console descriptor registered by bfin_jc_console_init(). */
+static struct console bfin_jc_console = {
+       .name    = DEV_NAME,
+       .write   = bfin_jc_console_write,
+       .device  = bfin_jc_console_device,
+       .flags   = CON_ANYTIME | CON_PRINTBUFFER,
+       .index   = -1,
+};
+
+/* Register the JTAG console; run as a console_initcall.  Always returns 0. */
+static int __init bfin_jc_console_init(void)
+{
+       register_console(&bfin_jc_console);
+       return 0;
+}
+console_initcall(bfin_jc_console_init);
+#endif
+
+#ifdef CONFIG_EARLY_PRINTK
+/* Early-console write hook: always uses the synchronous EMUDAT path. */
+static void __init
+bfin_jc_early_write(struct console *co, const char *buf, unsigned int count)
+{
+       bfin_jc_straight_buffer_write(buf, count);
+}
+
+/* Early console descriptor handed out by bfin_jc_early_init(). */
+static struct __initdata console bfin_jc_early_console = {
+       .name   = "early_BFJC",
+       .write   = bfin_jc_early_write,
+       .flags   = CON_ANYTIME | CON_PRINTBUFFER,
+       .index   = -1,
+};
+
+/*
+ * Return the early JTAG console descriptor.  @port and @cflag are accepted
+ * but not used.
+ */
+struct console * __init
+bfin_jc_early_init(unsigned int port, unsigned int cflag)
+{
+       return &bfin_jc_early_console;
+}
+#endif
+
+MODULE_AUTHOR("Mike Frysinger <vapier@gentoo.org>");
+MODULE_DESCRIPTION("TTY over Blackfin JTAG Communication");
+MODULE_LICENSE("GPL");
index 1fdb9f657d8f8650745ab136b7292d887568be7e..f3366d3f06cfe404cdfe94b6d3393c55b7b05886 100644 (file)
 
 #define NR_PORTS       256
 
-#define ZE_V1_NPORTS   64
 #define ZO_V1  0
 #define ZO_V2  1
 #define ZE_V1  2
 static void cy_throttle(struct tty_struct *tty);
 static void cy_send_xchar(struct tty_struct *tty, char ch);
 
-#define IS_CYC_Z(card) ((card).num_chips == (unsigned int)-1)
-
-#define Z_FPGA_CHECK(card) \
-       ((readl(&((struct RUNTIME_9060 __iomem *) \
-               ((card).ctl_addr))->init_ctrl) & (1<<17)) != 0)
-
-#define ISZLOADED(card)        (((ZO_V1 == readl(&((struct RUNTIME_9060 __iomem *) \
-                       ((card).ctl_addr))->mail_box_0)) || \
-                       Z_FPGA_CHECK(card)) && \
-                       (ZFIRM_ID == readl(&((struct FIRM_ID __iomem *) \
-                       ((card).base_addr+ID_ADDRESS))->signature)))
-
 #ifndef SERIAL_XMIT_SIZE
 #define        SERIAL_XMIT_SIZE        (min(PAGE_SIZE, 4096))
 #endif
@@ -687,8 +674,6 @@ static void cy_send_xchar(struct tty_struct *tty, char ch);
 #define DRIVER_VERSION 0x02010203
 #define RAM_SIZE 0x80000
 
-#define Z_FPGA_LOADED(X)       ((readl(&(X)->init_ctrl) & (1<<17)) != 0)
-
 enum zblock_type {
        ZBLOCK_PRG = 0,
        ZBLOCK_FPGA = 1
@@ -883,6 +868,29 @@ static void cyz_rx_restart(unsigned long);
 static struct timer_list cyz_rx_full_timer[NR_PORTS];
 #endif                         /* CONFIG_CYZ_INTR */
 
+/*
+ * True when @card->num_chips holds the (unsigned int)-1 marker; this is the
+ * same test the IS_CYC_Z() macro used to perform.
+ */
+static inline bool cy_is_Z(struct cyclades_card *card)
+{
+       return card->num_chips == (unsigned int)-1;
+}
+
+/* True when bit 17 of the init_ctrl register behind @ctl_addr is set. */
+static inline bool __cyz_fpga_loaded(struct RUNTIME_9060 __iomem *ctl_addr)
+{
+       return readl(&ctl_addr->init_ctrl) & (1 << 17);
+}
+
+/* Per-card wrapper: run the FPGA-loaded test on @card's p9060 registers. */
+static inline bool cyz_fpga_loaded(struct cyclades_card *card)
+{
+       return __cyz_fpga_loaded(card->ctl_addr.p9060);
+}
+
+/*
+ * True when @card is ready for use: its hw_ver is ZO_V1 or the FPGA-loaded
+ * bit is set, and the signature read from the FIRM_ID block at
+ * base_addr + ID_ADDRESS equals ZFIRM_ID.
+ */
+static inline bool cyz_is_loaded(struct cyclades_card *card)
+{
+       struct FIRM_ID __iomem *fw_id = card->base_addr + ID_ADDRESS;
+
+       return (card->hw_ver == ZO_V1 || cyz_fpga_loaded(card)) &&
+                       readl(&fw_id->signature) == ZFIRM_ID;
+}
+
 static inline int serial_paranoia_check(struct cyclades_port *info,
                char *name, const char *routine)
 {
@@ -1395,19 +1403,15 @@ cyz_fetch_msg(struct cyclades_card *cinfo,
        unsigned long loc_doorbell;
 
        firm_id = cinfo->base_addr + ID_ADDRESS;
-       if (!ISZLOADED(*cinfo))
-               return -1;
        zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
        board_ctrl = &zfw_ctrl->board_ctrl;
 
-       loc_doorbell = readl(&((struct RUNTIME_9060 __iomem *)
-                                 (cinfo->ctl_addr))->loc_doorbell);
+       loc_doorbell = readl(&cinfo->ctl_addr.p9060->loc_doorbell);
        if (loc_doorbell) {
                *cmd = (char)(0xff & loc_doorbell);
                *channel = readl(&board_ctrl->fwcmd_channel);
                *param = (__u32) readl(&board_ctrl->fwcmd_param);
-               cy_writel(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-                         loc_doorbell, 0xffffffff);
+               cy_writel(&cinfo->ctl_addr.p9060->loc_doorbell, 0xffffffff);
                return 1;
        }
        return 0;
@@ -1424,15 +1428,14 @@ cyz_issue_cmd(struct cyclades_card *cinfo,
        unsigned int index;
 
        firm_id = cinfo->base_addr + ID_ADDRESS;
-       if (!ISZLOADED(*cinfo))
+       if (!cyz_is_loaded(cinfo))
                return -1;
 
        zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
        board_ctrl = &zfw_ctrl->board_ctrl;
 
        index = 0;
-       pci_doorbell =
-           &((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->pci_doorbell;
+       pci_doorbell = &cinfo->ctl_addr.p9060->pci_doorbell;
        while ((readl(pci_doorbell) & 0xff) != 0) {
                if (index++ == 1000)
                        return (int)(readl(pci_doorbell) & 0xff);
@@ -1624,10 +1627,8 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
        static struct BOARD_CTRL __iomem *board_ctrl;
        static struct CH_CTRL __iomem *ch_ctrl;
        static struct BUF_CTRL __iomem *buf_ctrl;
-       __u32 channel;
+       __u32 channel, param, fw_ver;
        __u8 cmd;
-       __u32 param;
-       __u32 hw_ver, fw_ver;
        int special_count;
        int delta_count;
 
@@ -1635,8 +1636,6 @@ static void cyz_handle_cmd(struct cyclades_card *cinfo)
        zfw_ctrl = cinfo->base_addr + (readl(&firm_id->zfwctrl_addr) & 0xfffff);
        board_ctrl = &zfw_ctrl->board_ctrl;
        fw_ver = readl(&board_ctrl->fw_version);
-       hw_ver = readl(&((struct RUNTIME_9060 __iomem *)(cinfo->ctl_addr))->
-                       mail_box_0);
 
        while (cyz_fetch_msg(cinfo, &channel, &cmd, &param) == 1) {
                special_count = 0;
@@ -1737,15 +1736,7 @@ static irqreturn_t cyz_interrupt(int irq, void *dev_id)
 {
        struct cyclades_card *cinfo = dev_id;
 
-       if (unlikely(cinfo == NULL)) {
-#ifdef CY_DEBUG_INTERRUPTS
-               printk(KERN_DEBUG "cyz_interrupt: spurious interrupt %d\n",
-                                                                       irq);
-#endif
-               return IRQ_NONE;        /* spurious interrupt */
-       }
-
-       if (unlikely(!ISZLOADED(*cinfo))) {
+       if (unlikely(!cyz_is_loaded(cinfo))) {
 #ifdef CY_DEBUG_INTERRUPTS
                printk(KERN_DEBUG "cyz_interrupt: board not yet loaded "
                                "(IRQ%d).\n", irq);
@@ -1785,7 +1776,6 @@ static void cyz_poll(unsigned long arg)
        struct tty_struct *tty;
        struct FIRM_ID __iomem *firm_id;
        struct ZFW_CTRL __iomem *zfw_ctrl;
-       struct BOARD_CTRL __iomem *board_ctrl;
        struct BUF_CTRL __iomem *buf_ctrl;
        unsigned long expires = jiffies + HZ;
        unsigned int port, card;
@@ -1793,19 +1783,17 @@ static void cyz_poll(unsigned long arg)
        for (card = 0; card < NR_CARDS; card++) {
                cinfo = &cy_card[card];
 
-               if (!IS_CYC_Z(*cinfo))
+               if (!cy_is_Z(cinfo))
                        continue;
-               if (!ISZLOADED(*cinfo))
+               if (!cyz_is_loaded(cinfo))
                        continue;
 
                firm_id = cinfo->base_addr + ID_ADDRESS;
                zfw_ctrl = cinfo->base_addr +
                                (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-               board_ctrl = &(zfw_ctrl->board_ctrl);
 
        /* Skip first polling cycle to avoid racing conditions with the FW */
                if (!cinfo->intr_enabled) {
-                       cinfo->nports = (int)readl(&board_ctrl->n_channel);
                        cinfo->intr_enabled = 1;
                        continue;
                }
@@ -1874,7 +1862,7 @@ static int startup(struct cyclades_port *info)
 
        set_line_char(info);
 
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -1931,7 +1919,7 @@ static int startup(struct cyclades_port *info)
                base_addr = card->base_addr;
 
                firm_id = base_addr + ID_ADDRESS;
-               if (!ISZLOADED(*card))
+               if (!cyz_is_loaded(card))
                        return -ENODEV;
 
                zfw_ctrl = card->base_addr +
@@ -2026,7 +2014,7 @@ static void start_xmit(struct cyclades_port *info)
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -2070,7 +2058,7 @@ static void shutdown(struct cyclades_port *info)
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -2126,7 +2114,7 @@ static void shutdown(struct cyclades_port *info)
 #endif
 
                firm_id = base_addr + ID_ADDRESS;
-               if (!ISZLOADED(*card))
+               if (!cyz_is_loaded(card))
                        return;
 
                zfw_ctrl = card->base_addr +
@@ -2233,7 +2221,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 #endif
        info->port.blocked_open++;
 
-       if (!IS_CYC_Z(*cinfo)) {
+       if (!cy_is_Z(cinfo)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = cinfo->bus_index;
@@ -2296,7 +2284,7 @@ block_til_ready(struct tty_struct *tty, struct file *filp,
 
                base_addr = cinfo->base_addr;
                firm_id = base_addr + ID_ADDRESS;
-               if (!ISZLOADED(*cinfo)) {
+               if (!cyz_is_loaded(cinfo)) {
                        __set_current_state(TASK_RUNNING);
                        remove_wait_queue(&info->port.open_wait, &wait);
                        return -EINVAL;
@@ -2397,16 +2385,14 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
           treat it as absent from the system.  This
           will make the user pay attention.
         */
-       if (IS_CYC_Z(*info->card)) {
+       if (cy_is_Z(info->card)) {
                struct cyclades_card *cinfo = info->card;
                struct FIRM_ID __iomem *firm_id = cinfo->base_addr + ID_ADDRESS;
 
-               if (!ISZLOADED(*cinfo)) {
-                       if (((ZE_V1 == readl(&((struct RUNTIME_9060 __iomem *)
-                                        (cinfo->ctl_addr))->mail_box_0)) &&
-                                       Z_FPGA_CHECK(*cinfo)) &&
-                                       (ZFIRM_HLT == readl(
-                                               &firm_id->signature))) {
+               if (!cyz_is_loaded(cinfo)) {
+                       if (cinfo->hw_ver == ZE_V1 && cyz_fpga_loaded(cinfo) &&
+                                       readl(&firm_id->signature) ==
+                                       ZFIRM_HLT) {
                                printk(KERN_ERR "cyc:Cyclades-Z Error: you "
                                        "need an external power supply for "
                                        "this number of ports.\nFirmware "
@@ -2423,18 +2409,13 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
                   interrupts should be enabled as soon as the first open
                   happens to one of its ports. */
                        if (!cinfo->intr_enabled) {
-                               struct ZFW_CTRL __iomem *zfw_ctrl;
-                               struct BOARD_CTRL __iomem *board_ctrl;
-
-                               zfw_ctrl = cinfo->base_addr +
-                                       (readl(&firm_id->zfwctrl_addr) &
-                                        0xfffff);
-
-                               board_ctrl = &zfw_ctrl->board_ctrl;
+                               u16 intr;
 
                                /* Enable interrupts on the PLX chip */
-                               cy_writew(cinfo->ctl_addr + 0x68,
-                                       readw(cinfo->ctl_addr + 0x68) | 0x0900);
+                               intr = readw(&cinfo->ctl_addr.p9060->
+                                               intr_ctrl_stat) | 0x0900;
+                               cy_writew(&cinfo->ctl_addr.p9060->
+                                               intr_ctrl_stat, intr);
                                /* Enable interrupts on the FW */
                                retval = cyz_issue_cmd(cinfo, 0,
                                                C_CM_IRQ_ENBL, 0L);
@@ -2442,8 +2423,6 @@ static int cy_open(struct tty_struct *tty, struct file *filp)
                                        printk(KERN_ERR "cyc:IRQ enable retval "
                                                "was %x\n", retval);
                                }
-                               cinfo->nports =
-                                       (int)readl(&board_ctrl->n_channel);
                                cinfo->intr_enabled = 1;
                        }
                }
@@ -2556,7 +2535,7 @@ static void cy_wait_until_sent(struct tty_struct *tty, int timeout)
 #endif
        card = info->card;
        channel = (info->line) - (card->first_line);
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -2601,7 +2580,7 @@ static void cy_flush_buffer(struct tty_struct *tty)
        info->xmit_cnt = info->xmit_head = info->xmit_tail = 0;
        spin_unlock_irqrestore(&card->card_lock, flags);
 
-       if (IS_CYC_Z(*card)) {  /* If it is a Z card, flush the on-board
+       if (cy_is_Z(card)) {    /* If it is a Z card, flush the on-board
                                           buffers as well */
                spin_lock_irqsave(&card->card_lock, flags);
                retval = cyz_issue_cmd(card, channel, C_CM_FLUSH_TX, 0L);
@@ -2682,7 +2661,7 @@ static void cy_close(struct tty_struct *tty, struct file *filp)
 
        spin_lock_irqsave(&card->card_lock, flags);
 
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                int channel = info->line - card->first_line;
                int index = card->bus_index;
                void __iomem *base_addr = card->base_addr +
@@ -2902,7 +2881,7 @@ static int cy_chars_in_buffer(struct tty_struct *tty)
        channel = (info->line) - (card->first_line);
 
 #ifdef Z_EXT_CHARS_IN_BUFFER
-       if (!IS_CYC_Z(cy_card[card])) {
+       if (!cy_is_Z(card)) {
 #endif                         /* Z_EXT_CHARS_IN_BUFFER */
 #ifdef CY_DEBUG_IO
                printk(KERN_DEBUG "cyc:cy_chars_in_buffer ttyC%d %d\n",
@@ -2984,7 +2963,6 @@ static void set_line_char(struct cyclades_port *info)
        void __iomem *base_addr;
        int chip, channel, index;
        unsigned cflag, iflag;
-       unsigned short chip_number;
        int baud, baud_rate = 0;
        int i;
 
@@ -3013,9 +2991,8 @@ static void set_line_char(struct cyclades_port *info)
 
        card = info->card;
        channel = info->line - card->first_line;
-       chip_number = channel / 4;
 
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
 
                index = card->bus_index;
 
@@ -3233,21 +3210,17 @@ static void set_line_char(struct cyclades_port *info)
        } else {
                struct FIRM_ID __iomem *firm_id;
                struct ZFW_CTRL __iomem *zfw_ctrl;
-               struct BOARD_CTRL __iomem *board_ctrl;
                struct CH_CTRL __iomem *ch_ctrl;
-               struct BUF_CTRL __iomem *buf_ctrl;
                __u32 sw_flow;
                int retval;
 
                firm_id = card->base_addr + ID_ADDRESS;
-               if (!ISZLOADED(*card))
+               if (!cyz_is_loaded(card))
                        return;
 
                zfw_ctrl = card->base_addr +
                        (readl(&firm_id->zfwctrl_addr) & 0xfffff);
-               board_ctrl = &zfw_ctrl->board_ctrl;
                ch_ctrl = &(zfw_ctrl->ch_ctrl[channel]);
-               buf_ctrl = &zfw_ctrl->buf_ctrl[channel];
 
                /* baud rate */
                baud = tty_get_baud_rate(info->port.tty);
@@ -3457,7 +3430,7 @@ static int get_lsr_info(struct cyclades_port *info, unsigned int __user *value)
 
        card = info->card;
        channel = (info->line) - (card->first_line);
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3497,7 +3470,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3523,7 +3496,7 @@ static int cy_tiocmget(struct tty_struct *tty, struct file *file)
        } else {
                base_addr = card->base_addr;
                firm_id = card->base_addr + ID_ADDRESS;
-               if (ISZLOADED(*card)) {
+               if (cyz_is_loaded(card)) {
                        zfw_ctrl = card->base_addr +
                                (readl(&firm_id->zfwctrl_addr) & 0xfffff);
                        board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3566,7 +3539,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
 
        card = info->card;
        channel = (info->line) - (card->first_line);
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3641,7 +3614,7 @@ cy_tiocmset(struct tty_struct *tty, struct file *file,
                base_addr = card->base_addr;
 
                firm_id = card->base_addr + ID_ADDRESS;
-               if (ISZLOADED(*card)) {
+               if (cyz_is_loaded(card)) {
                        zfw_ctrl = card->base_addr +
                                (readl(&firm_id->zfwctrl_addr) & 0xfffff);
                        board_ctrl = &zfw_ctrl->board_ctrl;
@@ -3713,7 +3686,7 @@ static int cy_break(struct tty_struct *tty, int break_state)
        card = info->card;
 
        spin_lock_irqsave(&card->card_lock, flags);
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                /* Let the transmit ISR take care of this (since it
                   requires stuffing characters into the output stream).
                 */
@@ -3782,7 +3755,7 @@ static int set_threshold(struct cyclades_port *info, unsigned long value)
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3810,7 +3783,7 @@ static int get_threshold(struct cyclades_port *info,
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3844,7 +3817,7 @@ static int set_timeout(struct cyclades_port *info, unsigned long value)
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -3867,7 +3840,7 @@ static int get_timeout(struct cyclades_port *info,
 
        card = info->card;
        channel = info->line - card->first_line;
-       if (!IS_CYC_Z(*card)) {
+       if (!cy_is_Z(card)) {
                chip = channel >> 2;
                channel &= 0x03;
                index = card->bus_index;
@@ -4121,7 +4094,7 @@ static void cy_send_xchar(struct tty_struct *tty, char ch)
        card = info->card;
        channel = info->line - card->first_line;
 
-       if (IS_CYC_Z(*card)) {
+       if (cy_is_Z(card)) {
                if (ch == STOP_CHAR(tty))
                        cyz_issue_cmd(card, channel, C_CM_SENDXOFF, 0L);
                else if (ch == START_CHAR(tty))
@@ -4154,7 +4127,7 @@ static void cy_throttle(struct tty_struct *tty)
        card = info->card;
 
        if (I_IXOFF(tty)) {
-               if (!IS_CYC_Z(*card))
+               if (!cy_is_Z(card))
                        cy_send_xchar(tty, STOP_CHAR(tty));
                else
                        info->throttle = 1;
@@ -4162,7 +4135,7 @@ static void cy_throttle(struct tty_struct *tty)
 
        if (tty->termios->c_cflag & CRTSCTS) {
                channel = info->line - card->first_line;
-               if (!IS_CYC_Z(*card)) {
+               if (!cy_is_Z(card)) {
                        chip = channel >> 2;
                        channel &= 0x03;
                        index = card->bus_index;
@@ -4219,7 +4192,7 @@ static void cy_unthrottle(struct tty_struct *tty)
        if (tty->termios->c_cflag & CRTSCTS) {
                card = info->card;
                channel = info->line - card->first_line;
-               if (!IS_CYC_Z(*card)) {
+               if (!cy_is_Z(card)) {
                        chip = channel >> 2;
                        channel &= 0x03;
                        index = card->bus_index;
@@ -4263,7 +4236,7 @@ static void cy_stop(struct tty_struct *tty)
 
        cinfo = info->card;
        channel = info->line - cinfo->first_line;
-       if (!IS_CYC_Z(*cinfo)) {
+       if (!cy_is_Z(cinfo)) {
                index = cinfo->bus_index;
                chip = channel >> 2;
                channel &= 0x03;
@@ -4296,7 +4269,7 @@ static void cy_start(struct tty_struct *tty)
        cinfo = info->card;
        channel = info->line - cinfo->first_line;
        index = cinfo->bus_index;
-       if (!IS_CYC_Z(*cinfo)) {
+       if (!cy_is_Z(cinfo)) {
                chip = channel >> 2;
                channel &= 0x03;
                base_addr = cinfo->base_addr + (cy_chip_offset[chip] << index);
@@ -4347,33 +4320,20 @@ static void cy_hangup(struct tty_struct *tty)
 static int __devinit cy_init_card(struct cyclades_card *cinfo)
 {
        struct cyclades_port *info;
-       u32 uninitialized_var(mailbox);
-       unsigned int nports, port;
+       unsigned int port;
        unsigned short chip_number;
-       int uninitialized_var(index);
 
        spin_lock_init(&cinfo->card_lock);
+       cinfo->intr_enabled = 0;
 
-       if (IS_CYC_Z(*cinfo)) { /* Cyclades-Z */
-               mailbox = readl(&((struct RUNTIME_9060 __iomem *)
-                                    cinfo->ctl_addr)->mail_box_0);
-               nports = (mailbox == ZE_V1) ? ZE_V1_NPORTS : 8;
-               cinfo->intr_enabled = 0;
-               cinfo->nports = 0;      /* Will be correctly set later, after
-                                          Z FW is loaded */
-       } else {
-               index = cinfo->bus_index;
-               nports = cinfo->nports = CyPORTS_PER_CHIP * cinfo->num_chips;
-       }
-
-       cinfo->ports = kzalloc(sizeof(*cinfo->ports) * nports, GFP_KERNEL);
+       cinfo->ports = kcalloc(cinfo->nports, sizeof(*cinfo->ports),
+                       GFP_KERNEL);
        if (cinfo->ports == NULL) {
                printk(KERN_ERR "Cyclades: cannot allocate ports\n");
-               cinfo->nports = 0;
                return -ENOMEM;
        }
 
-       for (port = cinfo->first_line; port < cinfo->first_line + nports;
+       for (port = cinfo->first_line; port < cinfo->first_line + cinfo->nports;
                        port++) {
                info = &cinfo->ports[port - cinfo->first_line];
                tty_port_init(&info->port);
@@ -4387,9 +4347,9 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
                init_completion(&info->shutdown_wait);
                init_waitqueue_head(&info->delta_msr_wait);
 
-               if (IS_CYC_Z(*cinfo)) {
+               if (cy_is_Z(cinfo)) {
                        info->type = PORT_STARTECH;
-                       if (mailbox == ZO_V1)
+                       if (cinfo->hw_ver == ZO_V1)
                                info->xmit_fifo_size = CYZ_FIFO_SIZE;
                        else
                                info->xmit_fifo_size = 4 * CYZ_FIFO_SIZE;
@@ -4398,6 +4358,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
                                cyz_rx_restart, (unsigned long)info);
 #endif
                } else {
+                       int index = cinfo->bus_index;
                        info->type = PORT_CIRRUS;
                        info->xmit_fifo_size = CyMAX_CHAR_FIFO;
                        info->cor1 = CyPARITY_NONE | Cy_1_STOP | Cy_8_BITS;
@@ -4430,7 +4391,7 @@ static int __devinit cy_init_card(struct cyclades_card *cinfo)
        }
 
 #ifndef CONFIG_CYZ_INTR
-       if (IS_CYC_Z(*cinfo) && !timer_pending(&cyz_timerlist)) {
+       if (cy_is_Z(cinfo) && !timer_pending(&cyz_timerlist)) {
                mod_timer(&cyz_timerlist, jiffies + 1);
 #ifdef CY_PCI_DEBUG
                printk(KERN_DEBUG "Cyclades-Z polling initialized\n");
@@ -4621,11 +4582,12 @@ static int __init cy_detect_isa(void)
 
                /* set cy_card */
                cy_card[j].base_addr = cy_isa_address;
-               cy_card[j].ctl_addr = NULL;
+               cy_card[j].ctl_addr.p9050 = NULL;
                cy_card[j].irq = (int)cy_isa_irq;
                cy_card[j].bus_index = 0;
                cy_card[j].first_line = cy_next_channel;
-               cy_card[j].num_chips = cy_isa_nchan / 4;
+               cy_card[j].num_chips = cy_isa_nchan / CyPORTS_PER_CHIP;
+               cy_card[j].nports = cy_isa_nchan;
                if (cy_init_card(&cy_card[j])) {
                        cy_card[j].base_addr = NULL;
                        free_irq(cy_isa_irq, &cy_card[j]);
@@ -4781,7 +4743,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
        struct CUSTOM_REG __iomem *cust = base_addr;
        struct ZFW_CTRL __iomem *pt_zfwctrl;
        void __iomem *tmp;
-       u32 mailbox, status;
+       u32 mailbox, status, nchan;
        unsigned int i;
        int retval;
 
@@ -4793,7 +4755,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
        /* Check whether the firmware is already loaded and running. If
           positive, skip this board */
-       if (Z_FPGA_LOADED(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
+       if (__cyz_fpga_loaded(ctl_addr) && readl(&fid->signature) == ZFIRM_ID) {
                u32 cntval = readl(base_addr + 0x190);
 
                udelay(100);
@@ -4812,7 +4774,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
 
        mailbox = readl(&ctl_addr->mail_box_0);
 
-       if (mailbox == 0 || Z_FPGA_LOADED(ctl_addr)) {
+       if (mailbox == 0 || __cyz_fpga_loaded(ctl_addr)) {
                /* stops CPU and set window to beginning of RAM */
                cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
                cy_writel(&cust->cpu_stop, 0);
@@ -4828,7 +4790,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
                                base_addr);
                if (retval)
                        goto err_rel;
-               if (!Z_FPGA_LOADED(ctl_addr)) {
+               if (!__cyz_fpga_loaded(ctl_addr)) {
                        dev_err(&pdev->dev, "fw upload successful, but fw is "
                                        "not loaded\n");
                        goto err_rel;
@@ -4887,7 +4849,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
                                "system before loading the new FW to the "
                                "Cyclades-Z.\n");
 
-                       if (Z_FPGA_LOADED(ctl_addr))
+                       if (__cyz_fpga_loaded(ctl_addr))
                                plx_init(pdev, irq, ctl_addr);
 
                        retval = -EIO;
@@ -4902,16 +4864,16 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
                        base_addr + ID_ADDRESS, readl(&fid->zfwctrl_addr),
                        base_addr + readl(&fid->zfwctrl_addr));
 
+       nchan = readl(&pt_zfwctrl->board_ctrl.n_channel);
        dev_info(&pdev->dev, "Cyclades-Z FW loaded: version = %x, ports = %u\n",
-               readl(&pt_zfwctrl->board_ctrl.fw_version),
-               readl(&pt_zfwctrl->board_ctrl.n_channel));
+               readl(&pt_zfwctrl->board_ctrl.fw_version), nchan);
 
-       if (readl(&pt_zfwctrl->board_ctrl.n_channel) == 0) {
+       if (nchan == 0) {
                dev_warn(&pdev->dev, "no Cyclades-Z ports were found. Please "
                        "check the connection between the Z host card and the "
                        "serial expanders.\n");
 
-               if (Z_FPGA_LOADED(ctl_addr))
+               if (__cyz_fpga_loaded(ctl_addr))
                        plx_init(pdev, irq, ctl_addr);
 
                dev_info(&pdev->dev, "Null number of ports detected. Board "
@@ -4932,9 +4894,7 @@ static int __devinit cyz_load_fw(struct pci_dev *pdev, void __iomem *base_addr,
        cy_writel(&ctl_addr->intr_ctrl_stat, readl(&ctl_addr->intr_ctrl_stat) |
                        0x00030800UL);
 
-       plx_init(pdev, irq, ctl_addr);
-
-       return 0;
+       return nchan;
 err_rel:
        release_firmware(fw);
 err:
@@ -4946,7 +4906,7 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 {
        void __iomem *addr0 = NULL, *addr2 = NULL;
        char *card_name = NULL;
-       u32 mailbox;
+       u32 uninitialized_var(mailbox);
        unsigned int device_id, nchan = 0, card_no, i;
        unsigned char plx_ver;
        int retval, irq;
@@ -5023,11 +4983,12 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
                }
 
                /* Disable interrupts on the PLX before resetting it */
-               cy_writew(addr0 + 0x68, readw(addr0 + 0x68) & ~0x0900);
+               cy_writew(&ctl_addr->intr_ctrl_stat,
+                               readw(&ctl_addr->intr_ctrl_stat) & ~0x0900);
 
                plx_init(pdev, irq, addr0);
 
-               mailbox = (u32)readl(&ctl_addr->mail_box_0);
+               mailbox = readl(&ctl_addr->mail_box_0);
 
                addr2 = ioremap_nocache(pci_resource_start(pdev, 2),
                                mailbox == ZE_V1 ? CyPCI_Ze_win : CyPCI_Zwin);
@@ -5038,12 +4999,8 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
 
                if (mailbox == ZE_V1) {
                        card_name = "Cyclades-Ze";
-
-                       readl(&ctl_addr->mail_box_0);
-                       nchan = ZE_V1_NPORTS;
                } else {
                        card_name = "Cyclades-8Zo";
-
 #ifdef CY_PCI_DEBUG
                        if (mailbox == ZO_V1) {
                                cy_writel(&ctl_addr->loc_addr_base, WIN_CREG);
@@ -5065,15 +5022,12 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
                         */
                        if ((mailbox == ZO_V1) || (mailbox == ZO_V2))
                                cy_writel(addr2 + ID_ADDRESS, 0L);
-
-                       retval = cyz_load_fw(pdev, addr2, addr0, irq);
-                       if (retval)
-                               goto err_unmap;
-                       /* This must be a Cyclades-8Zo/PCI.  The extendable
-                          version will have a different device_id and will
-                          be allocated its maximum number of ports. */
-                       nchan = 8;
                }
+
+               retval = cyz_load_fw(pdev, addr2, addr0, irq);
+               if (retval <= 0)
+                       goto err_unmap;
+               nchan = retval;
        }
 
        if ((cy_next_channel + nchan) > NR_PORTS) {
@@ -5103,8 +5057,10 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
                        dev_err(&pdev->dev, "could not allocate IRQ\n");
                        goto err_unmap;
                }
-               cy_card[card_no].num_chips = nchan / 4;
+               cy_card[card_no].num_chips = nchan / CyPORTS_PER_CHIP;
        } else {
+               cy_card[card_no].hw_ver = mailbox;
+               cy_card[card_no].num_chips = (unsigned int)-1;
 #ifdef CONFIG_CYZ_INTR
                /* allocate IRQ only if board has an IRQ */
                if (irq != 0 && irq != 255) {
@@ -5117,15 +5073,15 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
                        }
                }
 #endif                         /* CONFIG_CYZ_INTR */
-               cy_card[card_no].num_chips = (unsigned int)-1;
        }
 
        /* set cy_card */
        cy_card[card_no].base_addr = addr2;
-       cy_card[card_no].ctl_addr = addr0;
+       cy_card[card_no].ctl_addr.p9050 = addr0;
        cy_card[card_no].irq = irq;
        cy_card[card_no].bus_index = 1;
        cy_card[card_no].first_line = cy_next_channel;
+       cy_card[card_no].nports = nchan;
        retval = cy_init_card(&cy_card[card_no]);
        if (retval)
                goto err_null;
@@ -5138,17 +5094,20 @@ static int __devinit cy_pci_probe(struct pci_dev *pdev,
                plx_ver = readb(addr2 + CyPLX_VER) & 0x0f;
                switch (plx_ver) {
                case PLX_9050:
-
                        cy_writeb(addr0 + 0x4c, 0x43);
                        break;
 
                case PLX_9060:
                case PLX_9080:
                default:        /* Old boards, use PLX_9060 */
-                       plx_init(pdev, irq, addr0);
-                       cy_writew(addr0 + 0x68, readw(addr0 + 0x68) | 0x0900);
+               {
+                       struct RUNTIME_9060 __iomem *ctl_addr = addr0;
+                       plx_init(pdev, irq, ctl_addr);
+                       cy_writew(&ctl_addr->intr_ctrl_stat,
+                               readw(&ctl_addr->intr_ctrl_stat) | 0x0900);
                        break;
                }
+               }
        }
 
        dev_info(&pdev->dev, "%s/PCI #%d found: %d channels starting from "
@@ -5179,22 +5138,23 @@ static void __devexit cy_pci_remove(struct pci_dev *pdev)
        unsigned int i;
 
        /* non-Z with old PLX */
-       if (!IS_CYC_Z(*cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
+       if (!cy_is_Z(cinfo) && (readb(cinfo->base_addr + CyPLX_VER) & 0x0f) ==
                        PLX_9050)
-               cy_writeb(cinfo->ctl_addr + 0x4c, 0);
+               cy_writeb(cinfo->ctl_addr.p9050 + 0x4c, 0);
        else
 #ifndef CONFIG_CYZ_INTR
-               if (!IS_CYC_Z(*cinfo))
+               if (!cy_is_Z(cinfo))
 #endif
-               cy_writew(cinfo->ctl_addr + 0x68,
-                               readw(cinfo->ctl_addr + 0x68) & ~0x0900);
+               cy_writew(&cinfo->ctl_addr.p9060->intr_ctrl_stat,
+                       readw(&cinfo->ctl_addr.p9060->intr_ctrl_stat) &
+                       ~0x0900);
 
        iounmap(cinfo->base_addr);
-       if (cinfo->ctl_addr)
-               iounmap(cinfo->ctl_addr);
+       if (cinfo->ctl_addr.p9050)
+               iounmap(cinfo->ctl_addr.p9050);
        if (cinfo->irq
 #ifndef CONFIG_CYZ_INTR
-               && !IS_CYC_Z(*cinfo)
+               && !cy_is_Z(cinfo)
 #endif /* CONFIG_CYZ_INTR */
                )
                free_irq(cinfo->irq, cinfo);
@@ -5240,7 +5200,7 @@ static int cyclades_proc_show(struct seq_file *m, void *v)
                                        (cur_jifs - info->idle_stats.recv_idle)/
                                        HZ, info->idle_stats.overruns,
                                        /* FIXME: double check locking */
-                                       (long)info->port.tty->ldisc.ops->num);
+                                       (long)info->port.tty->ldisc->ops->num);
                        else
                                seq_printf(m, "%3d %8lu %10lu %8lu "
                                        "%10lu %8lu %9lu %6ld\n",
@@ -5386,11 +5346,11 @@ static void __exit cy_cleanup_module(void)
                        /* clear interrupt */
                        cy_writeb(card->base_addr + Cy_ClrIntr, 0);
                        iounmap(card->base_addr);
-                       if (card->ctl_addr)
-                               iounmap(card->ctl_addr);
+                       if (card->ctl_addr.p9050)
+                               iounmap(card->ctl_addr.p9050);
                        if (card->irq
 #ifndef CONFIG_CYZ_INTR
-                               && !IS_CYC_Z(*card)
+                               && !cy_is_Z(card)
 #endif /* CONFIG_CYZ_INTR */
                                )
                                free_irq(card->irq, card);
index af7c13ca949377da39751cb78df840b21de7ff32..abef1f7d84fefb03ef23ffc90417d173aea7ceef 100644 (file)
@@ -745,7 +745,7 @@ static int epca_carrier_raised(struct tty_port *port)
        return 0;
 }
 
-static void epca_raise_dtr_rts(struct tty_port *port)
+static void epca_dtr_rts(struct tty_port *port, int onoff)
 {
 }
 
@@ -925,7 +925,7 @@ static const struct tty_operations pc_ops = {
 
 static const struct tty_port_operations epca_port_ops = {
        .carrier_raised = epca_carrier_raised,
-       .raise_dtr_rts = epca_raise_dtr_rts,
+       .dtr_rts = epca_dtr_rts,
 };
 
 static int info_open(struct tty_struct *tty, struct file *filp)
@@ -1518,7 +1518,7 @@ static void doevent(int crd)
                if (event & MODEMCHG_IND) {
                        /* A modem signal change has been indicated */
                        ch->imodem = mstat;
-                       if (test_bit(ASYNC_CHECK_CD, &ch->port.flags)) {
+                       if (test_bit(ASYNCB_CHECK_CD, &ch->port.flags)) {
                                /* We are now receiving dcd */
                                if (mstat & ch->dcd)
                                        wake_up_interruptible(&ch->port.open_wait);
@@ -1765,9 +1765,9 @@ static void epcaparam(struct tty_struct *tty, struct channel *ch)
                 * that the driver will wait on carrier detect.
                 */
                if (ts->c_cflag & CLOCAL)
-                       clear_bit(ASYNC_CHECK_CD, &ch->port.flags);
+                       clear_bit(ASYNCB_CHECK_CD, &ch->port.flags);
                else
-                       set_bit(ASYNC_CHECK_CD, &ch->port.flags);
+                       set_bit(ASYNCB_CHECK_CD, &ch->port.flags);
                mval = ch->m_dtr | ch->m_rts;
        } /* End CBAUD not detected */
        iflag = termios2digi_i(ch, ts->c_iflag);
@@ -2114,8 +2114,8 @@ static int pc_ioctl(struct tty_struct *tty, struct file *file,
                        tty_wait_until_sent(tty, 0);
                } else {
                        /* ldisc lock already held in ioctl */
-                       if (tty->ldisc.ops->flush_buffer)
-                               tty->ldisc.ops->flush_buffer(tty);
+                       if (tty->ldisc->ops->flush_buffer)
+                               tty->ldisc->ops->flush_buffer(tty);
                }
                unlock_kernel();
                /* Fall Thru */
@@ -2244,7 +2244,8 @@ static void do_softint(struct work_struct *work)
                        if (test_and_clear_bit(EPCA_EVENT_HANGUP, &ch->event)) {
                                tty_hangup(tty);
                                wake_up_interruptible(&ch->port.open_wait);
-                               clear_bit(ASYNC_NORMAL_ACTIVE, &ch->port.flags);
+                               clear_bit(ASYNCB_NORMAL_ACTIVE,
+                                               &ch->port.flags);
                        }
                }
                tty_kref_put(tty);
index 340ba4f9dc54880bd6a3f7661ebba93d9c81733b..4a9f3492b9216142333f3fa2013542b4638b9c1e 100644 (file)
@@ -224,7 +224,7 @@ static void hpet_timer_set_irq(struct hpet_dev *devp)
                        break;
                }
 
-               gsi = acpi_register_gsi(irq, ACPI_LEVEL_SENSITIVE,
+               gsi = acpi_register_gsi(NULL, irq, ACPI_LEVEL_SENSITIVE,
                                        ACPI_ACTIVE_LOW);
                if (gsi > 0)
                        break;
@@ -939,7 +939,7 @@ static acpi_status hpet_resources(struct acpi_resource *res, void *data)
                irqp = &res->data.extended_irq;
 
                for (i = 0; i < irqp->interrupt_count; i++) {
-                       irq = acpi_register_gsi(irqp->interrupts[i],
+                       irq = acpi_register_gsi(NULL, irqp->interrupts[i],
                                      irqp->triggering, irqp->polarity);
                        if (irq < 0)
                                return AE_ERROR;
index 0061e18aff6045624e6fad0eaaa804f8f9d951e2..0d10b89218ed1da1818f47c643705f99d9dd3b55 100644 (file)
@@ -868,11 +868,11 @@ i2Input(i2ChanStrPtr pCh)
                amountToMove = count;
        }
        // Move the first block
-       pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+       pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
                 &(pCh->Ibuf[stripIndex]), NULL, amountToMove );
        // If we needed to wrap, do the second data move
        if (count > amountToMove) {
-               pCh->pTTY->ldisc.ops->receive_buf( pCh->pTTY,
+               pCh->pTTY->ldisc->ops->receive_buf( pCh->pTTY,
                 pCh->Ibuf, NULL, count - amountToMove );
        }
        // Bump and wrap the stripIndex all at once by the amount of data read. This
index afd9247cf082d15e2cb6f1e8273c72fba9317ba5..517271c762e6b620b1323da302c5cd837985c626 100644 (file)
@@ -1315,8 +1315,8 @@ static inline void  isig(int sig, struct tty_struct *tty, int flush)
        if (tty->pgrp)
                kill_pgrp(tty->pgrp, sig, 1);
        if (flush || !L_NOFLSH(tty)) {
-               if ( tty->ldisc.ops->flush_buffer )  
-                       tty->ldisc.ops->flush_buffer(tty);
+               if ( tty->ldisc->ops->flush_buffer )  
+                       tty->ldisc->ops->flush_buffer(tty);
                i2InputFlush( tty->driver_data );
        }
 }
index a59eac584d1621b3e4824ca3e644d944a31266a6..4d745a89504f196cc545145ca4bd653c4927ad0b 100644 (file)
@@ -329,7 +329,7 @@ static inline void drop_rts(struct isi_port *port)
 
 /* card->lock MUST NOT be held */
 
-static void isicom_raise_dtr_rts(struct tty_port *port)
+static void isicom_dtr_rts(struct tty_port *port, int on)
 {
        struct isi_port *ip = container_of(port, struct isi_port, port);
        struct isi_board *card = ip->card;
@@ -339,10 +339,17 @@ static void isicom_raise_dtr_rts(struct tty_port *port)
        if (!lock_card(card))
                return;
 
-       outw(0x8000 | (channel << card->shift_count) | 0x02, base);
-       outw(0x0f04, base);
-       InterruptTheCard(base);
-       ip->status |= (ISI_DTR | ISI_RTS);
+       if (on) {
+               outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+               outw(0x0f04, base);
+               InterruptTheCard(base);
+               ip->status |= (ISI_DTR | ISI_RTS);
+       } else {
+               outw(0x8000 | (channel << card->shift_count) | 0x02, base);
+               outw(0x0C04, base);
+               InterruptTheCard(base);
+               ip->status &= ~(ISI_DTR | ISI_RTS);
+       }
        unlock_card(card);
 }
 
@@ -1339,7 +1346,7 @@ static const struct tty_operations isicom_ops = {
 
 static const struct tty_port_operations isicom_port_ops = {
        .carrier_raised         = isicom_carrier_raised,
-       .raise_dtr_rts          = isicom_raise_dtr_rts,
+       .dtr_rts                = isicom_dtr_rts,
 };
 
 static int __devinit reset_card(struct pci_dev *pdev,
index fff19f7e29d25eac8c7cfbb4903638107dd8561c..e18800c400b10b60578b6fb9bb1d86a16dcbfd04 100644 (file)
@@ -1140,14 +1140,14 @@ static int stli_carrier_raised(struct tty_port *port)
        return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stli_raise_dtr_rts(struct tty_port *port)
+static void stli_dtr_rts(struct tty_port *port, int on)
 {
        struct stliport *portp = container_of(port, struct stliport, port);
        struct stlibrd *brdp = stli_brds[portp->brdnr];
-       stli_mkasysigs(&portp->asig, 1, 1);
+       stli_mkasysigs(&portp->asig, on, on);
        if (stli_cmdwait(brdp, portp, A_SETSIGNALS, &portp->asig,
                sizeof(asysigs_t), 0) < 0)
-                       printk(KERN_WARNING "istallion: dtr raise failed.\n");
+                       printk(KERN_WARNING "istallion: dtr set failed.\n");
 }
 
 
@@ -4417,7 +4417,7 @@ static const struct tty_operations stli_ops = {
 
 static const struct tty_port_operations stli_port_ops = {
        .carrier_raised = stli_carrier_raised,
-       .raise_dtr_rts = stli_raise_dtr_rts,
+       .dtr_rts = stli_dtr_rts,
 };
 
 /*****************************************************************************/
index 65e12bca657cfa0ba14f842ce6b442411f68bca6..f96d0bef855e3cda2507c698eb5cfdef85567c10 100644 (file)
@@ -694,9 +694,8 @@ static ssize_t read_zero(struct file * file, char __user * buf,
                written += chunk - unwritten;
                if (unwritten)
                        break;
-               /* Consider changing this to just 'signal_pending()' with lots of testing */
-               if (fatal_signal_pending(current))
-                       return written ? written : -EINTR;
+               if (signal_pending(current))
+                       return written ? written : -ERESTARTSYS;
                buf += chunk;
                count -= chunk;
                cond_resched();
index 4a4cab73d0be176aa640c9717bc7582231b39495..65b6ff2442c6df4edfd839b37752ded8340df9c9 100644 (file)
@@ -1184,6 +1184,11 @@ static int moxa_open(struct tty_struct *tty, struct file *filp)
                return -ENODEV;
        }
 
+       if (port % MAX_PORTS_PER_BOARD >= brd->numPorts) {
+               mutex_unlock(&moxa_openlock);
+               return -ENODEV;
+       }
+
        ch = &brd->ports[port % MAX_PORTS_PER_BOARD];
        ch->port.count++;
        tty->driver_data = ch;
index 13f8871e5b2177bb844018f6543ca2e690e973b6..9533f43a30bb0e7bad9873acb85d25cf39e44625 100644 (file)
@@ -547,14 +547,18 @@ static int mxser_carrier_raised(struct tty_port *port)
        return (inb(mp->ioaddr + UART_MSR) & UART_MSR_DCD)?1:0;
 }
 
-static void mxser_raise_dtr_rts(struct tty_port *port)
+static void mxser_dtr_rts(struct tty_port *port, int on)
 {
        struct mxser_port *mp = container_of(port, struct mxser_port, port);
        unsigned long flags;
 
        spin_lock_irqsave(&mp->slock, flags);
-       outb(inb(mp->ioaddr + UART_MCR) |
-               UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+       if (on)
+               outb(inb(mp->ioaddr + UART_MCR) |
+                       UART_MCR_DTR | UART_MCR_RTS, mp->ioaddr + UART_MCR);
+       else
+               outb(inb(mp->ioaddr + UART_MCR)&~(UART_MCR_DTR | UART_MCR_RTS),
+                       mp->ioaddr + UART_MCR);
        spin_unlock_irqrestore(&mp->slock, flags);
 }
 
@@ -2356,7 +2360,7 @@ static const struct tty_operations mxser_ops = {
 
 struct tty_port_operations mxser_port_ops = {
        .carrier_raised = mxser_carrier_raised,
-       .raise_dtr_rts = mxser_raise_dtr_rts,
+       .dtr_rts = mxser_dtr_rts,
 };
 
 /*
index bacb3e2872ae49cbc5668e6a402cb50fbf75a30b..461ece591a5bf98fbc0f979e03d75ac8f9cc6234 100644 (file)
@@ -342,8 +342,8 @@ static int n_hdlc_tty_open (struct tty_struct *tty)
 #endif
        
        /* Flush any pending characters in the driver and discipline. */
-       if (tty->ldisc.ops->flush_buffer)
-               tty->ldisc.ops->flush_buffer(tty);
+       if (tty->ldisc->ops->flush_buffer)
+               tty->ldisc->ops->flush_buffer(tty);
 
        tty_driver_flush_buffer(tty);
                
index f6f0e4ec2b510dde6ca5079f6def074e34f6fdce..94a5d5020abcec935e08d6bc74180a249ea58423 100644 (file)
 #define ECHO_OP_SET_CANON_COL 0x81
 #define ECHO_OP_ERASE_TAB 0x82
 
-static inline unsigned char *alloc_buf(void)
-{
-       gfp_t prio = in_interrupt() ? GFP_ATOMIC : GFP_KERNEL;
-
-       if (PAGE_SIZE != N_TTY_BUF_SIZE)
-               return kmalloc(N_TTY_BUF_SIZE, prio);
-       else
-               return (unsigned char *)__get_free_page(prio);
-}
-
-static inline void free_buf(unsigned char *buf)
-{
-       if (PAGE_SIZE != N_TTY_BUF_SIZE)
-               kfree(buf);
-       else
-               free_page((unsigned long) buf);
-}
-
 static inline int tty_put_user(struct tty_struct *tty, unsigned char x,
                               unsigned char __user *ptr)
 {
@@ -1558,11 +1540,11 @@ static void n_tty_close(struct tty_struct *tty)
 {
        n_tty_flush_buffer(tty);
        if (tty->read_buf) {
-               free_buf(tty->read_buf);
+               kfree(tty->read_buf);
                tty->read_buf = NULL;
        }
        if (tty->echo_buf) {
-               free_buf(tty->echo_buf);
+               kfree(tty->echo_buf);
                tty->echo_buf = NULL;
        }
 }
@@ -1584,17 +1566,16 @@ static int n_tty_open(struct tty_struct *tty)
 
        /* These are ugly. Currently a malloc failure here can panic */
        if (!tty->read_buf) {
-               tty->read_buf = alloc_buf();
+               tty->read_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
                if (!tty->read_buf)
                        return -ENOMEM;
        }
        if (!tty->echo_buf) {
-               tty->echo_buf = alloc_buf();
+               tty->echo_buf = kzalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
+
                if (!tty->echo_buf)
                        return -ENOMEM;
        }
-       memset(tty->read_buf, 0, N_TTY_BUF_SIZE);
-       memset(tty->echo_buf, 0, N_TTY_BUF_SIZE);
        reset_buffer_flags(tty);
        tty->column = 0;
        n_tty_set_termios(tty, NULL);
index 19d79fc54461c592111fbce5d0e68e395e10dcd4..77b3648892249f494724bf2a6302bdfc9158c4c5 100644 (file)
@@ -383,7 +383,7 @@ static void async_mode(MGSLPC_INFO *info);
 static void tx_timeout(unsigned long context);
 
 static int carrier_raised(struct tty_port *port);
-static void raise_dtr_rts(struct tty_port *port);
+static void dtr_rts(struct tty_port *port, int onoff);
 
 #if SYNCLINK_GENERIC_HDLC
 #define dev_to_port(D) (dev_to_hdlc(D)->priv)
@@ -513,7 +513,7 @@ static void ldisc_receive_buf(struct tty_struct *tty,
 
 static const struct tty_port_operations mgslpc_port_ops = {
        .carrier_raised = carrier_raised,
-       .raise_dtr_rts = raise_dtr_rts
+       .dtr_rts = dtr_rts
 };
 
 static int mgslpc_probe(struct pcmcia_device *link)
@@ -2528,13 +2528,16 @@ static int carrier_raised(struct tty_port *port)
        return 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int onoff)
 {
        MGSLPC_INFO *info = container_of(port, MGSLPC_INFO, port);
        unsigned long flags;
 
        spin_lock_irqsave(&info->lock,flags);
-       info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       if (onoff)
+               info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       else    /* complement the whole RTS+DTR mask; '~' binds tighter than '+' */
+               info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
        set_signals(info);
        spin_unlock_irqrestore(&info->lock,flags);
 }
index 31038a0052a2c3e007a4e792f2945ff015a7856e..5acd29e6e0430ee6bc16f3b4757a4fc76eb5cc06 100644 (file)
@@ -30,7 +30,6 @@
 
 #include <asm/system.h>
 
-/* These are global because they are accessed in tty_io.c */
 #ifdef CONFIG_UNIX98_PTYS
 static struct tty_driver *ptm_driver;
 static struct tty_driver *pts_driver;
@@ -111,7 +110,7 @@ static int pty_write(struct tty_struct *tty, const unsigned char *buf,
        c = to->receive_room;
        if (c > count)
                c = count;
-       to->ldisc.ops->receive_buf(to, buf, NULL, c);
+       to->ldisc->ops->receive_buf(to, buf, NULL, c);
 
        return c;
 }
@@ -149,11 +148,11 @@ static int pty_chars_in_buffer(struct tty_struct *tty)
        int count;
 
        /* We should get the line discipline lock for "tty->link" */
-       if (!to || !to->ldisc.ops->chars_in_buffer)
+       if (!to || !to->ldisc->ops->chars_in_buffer)
                return 0;
 
        /* The ldisc must report 0 if no characters available to be read */
-       count = to->ldisc.ops->chars_in_buffer(to);
+       count = to->ldisc->ops->chars_in_buffer(to);
 
        if (tty->driver->subtype == PTY_TYPE_SLAVE)
                return count;
@@ -187,8 +186,8 @@ static void pty_flush_buffer(struct tty_struct *tty)
        if (!to)
                return;
 
-       if (to->ldisc.ops->flush_buffer)
-               to->ldisc.ops->flush_buffer(to);
+       if (to->ldisc->ops->flush_buffer)
+               to->ldisc->ops->flush_buffer(to);
 
        if (to->packet) {
                spin_lock_irqsave(&tty->ctrl_lock, flags);
index f59fc5cea0673546fa03b0395a8d12b6f7872ff4..63d5b628477a72bbffb6cb8696a0a4b615ef5725 100644 (file)
@@ -872,11 +872,16 @@ static int carrier_raised(struct tty_port *port)
        return (sGetChanStatusLo(&info->channel) & CD_ACT) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
        struct r_port *info = container_of(port, struct r_port, port);
-       sSetDTR(&info->channel);
-       sSetRTS(&info->channel);
+       if (on) {
+               sSetDTR(&info->channel);
+               sSetRTS(&info->channel);
+       } else {
+               sClrDTR(&info->channel);
+               sClrRTS(&info->channel);
+       }
 }
 
 /*
@@ -934,7 +939,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
        /*
         * Info->count is now 1; so it's safe to sleep now.
         */
-       if (!test_bit(ASYNC_INITIALIZED, &port->flags)) {
+       if (!test_bit(ASYNCB_INITIALIZED, &port->flags)) {
                cp = &info->channel;
                sSetRxTrigger(cp, TRIG_1);
                if (sGetChanStatus(cp) & CD_ACT)
@@ -958,7 +963,7 @@ static int rp_open(struct tty_struct *tty, struct file *filp)
                sEnRxFIFO(cp);
                sEnTransmit(cp);
 
-               set_bit(ASYNC_INITIALIZED, &info->port.flags);
+               set_bit(ASYNCB_INITIALIZED, &info->port.flags);
 
                /*
                 * Set up the tty->alt_speed kludge
@@ -1641,7 +1646,7 @@ static int rp_write(struct tty_struct *tty,
        /*  Write remaining data into the port's xmit_buf */
        while (1) {
                /* Hung up ? */
-               if (!test_bit(ASYNC_NORMAL_ACTIVE, &info->port.flags))
+               if (!test_bit(ASYNCB_NORMAL_ACTIVE, &info->port.flags))
                        goto end;
                c = min(count, XMIT_BUF_SIZE - info->xmit_cnt - 1);
                c = min(c, XMIT_BUF_SIZE - info->xmit_head);
@@ -2250,7 +2255,7 @@ static const struct tty_operations rocket_ops = {
 
 static const struct tty_port_operations rocket_port_ops = {
        .carrier_raised = carrier_raised,
-       .raise_dtr_rts = raise_dtr_rts,
+       .dtr_rts = dtr_rts,
 };
 
 /*
index cb8ca5698963017d03ff6ed625ec41f0d0a6c062..f97b9e8480645f85c34efb7bcbb41b3474e34d6c 100644 (file)
@@ -327,7 +327,7 @@ int paste_selection(struct tty_struct *tty)
                }
                count = sel_buffer_lth - pasted;
                count = min(count, tty->receive_room);
-               tty->ldisc.ops->receive_buf(tty, sel_buffer + pasted,
+               tty->ldisc->ops->receive_buf(tty, sel_buffer + pasted,
                                                                NULL, count);
                pasted += count;
        }
index 2ad813a801dc391cc8bca62a4cc847b16ea0d4d1..53e504f41b2043fa89b4b9a83637fe31251d38ab 100644 (file)
@@ -772,11 +772,11 @@ static int stl_carrier_raised(struct tty_port *port)
        return (portp->sigs & TIOCM_CD) ? 1 : 0;
 }
 
-static void stl_raise_dtr_rts(struct tty_port *port)
+static void stl_dtr_rts(struct tty_port *port, int on)
 {
        struct stlport *portp = container_of(port, struct stlport, port);
        /* Takes brd_lock internally */
-       stl_setsignals(portp, 1, 1);
+       stl_setsignals(portp, on, on);
 }
 
 /*****************************************************************************/
@@ -2547,7 +2547,7 @@ static const struct tty_operations stl_ops = {
 
 static const struct tty_port_operations stl_port_ops = {
        .carrier_raised = stl_carrier_raised,
-       .raise_dtr_rts = stl_raise_dtr_rts,
+       .dtr_rts = stl_dtr_rts,
 };
 
 /*****************************************************************************/
index afd0b26ca05681210edd0cc0f22b01497eea3e37..afded3a2379c592082e85e373236addce5d9fcd3 100644 (file)
@@ -3247,13 +3247,16 @@ static int carrier_raised(struct tty_port *port)
        return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
        struct mgsl_struct *info = container_of(port, struct mgsl_struct, port);
        unsigned long flags;
 
        spin_lock_irqsave(&info->irq_spinlock,flags);
-       info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       if (on)
+               info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       else
+               info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
        usc_set_serial_signals(info);
        spin_unlock_irqrestore(&info->irq_spinlock,flags);
 }
@@ -4258,7 +4261,7 @@ static void mgsl_add_device( struct mgsl_struct *info )
 
 static const struct tty_port_operations mgsl_port_ops = {
        .carrier_raised = carrier_raised,
-       .raise_dtr_rts = raise_dtr_rts,
+       .dtr_rts = dtr_rts,
 };
 
 
index 5e256494686a8413734db27db5b9826069e36847..1386625fc4caae4bdf6a39e6835d0026b0e27ca7 100644 (file)
@@ -214,6 +214,7 @@ struct slgt_desc
 #define set_desc_next(a,b) (a).next   = cpu_to_le32((unsigned int)(b))
 #define set_desc_count(a,b)(a).count  = cpu_to_le16((unsigned short)(b))
 #define set_desc_eof(a,b)  (a).status = cpu_to_le16((b) ? (le16_to_cpu((a).status) | BIT0) : (le16_to_cpu((a).status) & ~BIT0))
+#define set_desc_status(a, b) (a).status = cpu_to_le16((unsigned short)(b))
 #define desc_count(a)      (le16_to_cpu((a).count))
 #define desc_status(a)     (le16_to_cpu((a).status))
 #define desc_complete(a)   (le16_to_cpu((a).status) & BIT15)
@@ -297,6 +298,7 @@ struct slgt_info {
        u32 max_frame_size;       /* as set by device config */
 
        unsigned int rbuf_fill_level;
+       unsigned int rx_pio;
        unsigned int if_mode;
        unsigned int base_clock;
 
@@ -331,6 +333,8 @@ struct slgt_info {
        struct slgt_desc *rbufs;
        unsigned int rbuf_current;
        unsigned int rbuf_index;
+       unsigned int rbuf_fill_index;
+       unsigned short rbuf_fill_count;
 
        unsigned int tbuf_count;
        struct slgt_desc *tbufs;
@@ -2110,6 +2114,40 @@ static void ri_change(struct slgt_info *info, unsigned short status)
        info->pending_bh |= BH_STATUS;
 }
 
+static void isr_rxdata(struct slgt_info *info)
+{
+       unsigned int count = info->rbuf_fill_count;
+       unsigned int i = info->rbuf_fill_index;
+       unsigned short reg;
+
+       while (rd_reg16(info, SSR) & IRQ_RXDATA) {
+               reg = rd_reg16(info, RDR);
+               DBGISR(("isr_rxdata %s RDR=%04X\n", info->device_name, reg));
+               if (desc_complete(info->rbufs[i])) {
+                       /* all buffers full */
+                       rx_stop(info);
+                       info->rx_restart = 1;
+                       continue;
+               }
+               info->rbufs[i].buf[count++] = (unsigned char)reg;
+               /* async mode saves status byte to buffer for each data byte */
+               if (info->params.mode == MGSL_MODE_ASYNC)
+                       info->rbufs[i].buf[count++] = (unsigned char)(reg >> 8);
+               if (count == info->rbuf_fill_level || (reg & BIT10)) {
+                       /* buffer full or end of frame */
+                       set_desc_count(info->rbufs[i], count);
+                       set_desc_status(info->rbufs[i], BIT15 | (reg >> 8));
+                       info->rbuf_fill_count = count = 0;
+                       if (++i == info->rbuf_count)
+                               i = 0;
+                       info->pending_bh |= BH_RECEIVE;
+               }
+       }
+
+       info->rbuf_fill_index = i;
+       info->rbuf_fill_count = count;
+}
+
 static void isr_serial(struct slgt_info *info)
 {
        unsigned short status = rd_reg16(info, SSR);
@@ -2125,6 +2163,8 @@ static void isr_serial(struct slgt_info *info)
                        if (info->tx_count)
                                isr_txeom(info, status);
                }
+               if (info->rx_pio && (status & IRQ_RXDATA))
+                       isr_rxdata(info);
                if ((status & IRQ_RXBREAK) && (status & RXBREAK)) {
                        info->icount.brk++;
                        /* process break detection if tty control allows */
@@ -2141,7 +2181,8 @@ static void isr_serial(struct slgt_info *info)
        } else {
                if (status & (IRQ_TXIDLE + IRQ_TXUNDER))
                        isr_txeom(info, status);
-
+               if (info->rx_pio && (status & IRQ_RXDATA))
+                       isr_rxdata(info);
                if (status & IRQ_RXIDLE) {
                        if (status & RXIDLE)
                                info->icount.rxidle++;
@@ -2642,6 +2683,10 @@ static int rx_enable(struct slgt_info *info, int enable)
                        return -EINVAL;
                }
                info->rbuf_fill_level = rbuf_fill_level;
+               if (rbuf_fill_level < 128)
+                       info->rx_pio = 1; /* PIO mode */
+               else
+                       info->rx_pio = 0; /* DMA mode */
                rx_stop(info); /* restart receiver to use new fill level */
        }
 
@@ -3099,13 +3144,16 @@ static int carrier_raised(struct tty_port *port)
        return (info->signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
        unsigned long flags;
        struct slgt_info *info = container_of(port, struct slgt_info, port);
 
        spin_lock_irqsave(&info->lock,flags);
-       info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+       if (on)
+               info->signals |= SerialSignal_RTS + SerialSignal_DTR;
+       else
+               info->signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
        set_signals(info);
        spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3419,7 +3467,7 @@ static void add_device(struct slgt_info *info)
 
 static const struct tty_port_operations slgt_port_ops = {
        .carrier_raised = carrier_raised,
-       .raise_dtr_rts = raise_dtr_rts,
+       .dtr_rts = dtr_rts,
 };
 
 /*
@@ -3841,15 +3889,27 @@ static void rx_start(struct slgt_info *info)
        rdma_reset(info);
        reset_rbufs(info);
 
-       /* set 1st descriptor address */
-       wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
-
-       if (info->params.mode != MGSL_MODE_ASYNC) {
-               /* enable rx DMA and DMA interrupt */
-               wr_reg32(info, RDCSR, (BIT2 + BIT0));
+       if (info->rx_pio) {
+               /* rx request when rx FIFO not empty */
+               wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) & ~BIT14));
+               slgt_irq_on(info, IRQ_RXDATA);
+               if (info->params.mode == MGSL_MODE_ASYNC) {
+                       /* enable saving of rx status */
+                       wr_reg32(info, RDCSR, BIT6);
+               }
        } else {
-               /* enable saving of rx status, rx DMA and DMA interrupt */
-               wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+               /* rx request when rx FIFO half full */
+               wr_reg16(info, SCR, (unsigned short)(rd_reg16(info, SCR) | BIT14));
+               /* set 1st descriptor address */
+               wr_reg32(info, RDDAR, info->rbufs[0].pdesc);
+
+               if (info->params.mode != MGSL_MODE_ASYNC) {
+                       /* enable rx DMA and DMA interrupt */
+                       wr_reg32(info, RDCSR, (BIT2 + BIT0));
+               } else {
+                       /* enable saving of rx status, rx DMA and DMA interrupt */
+                       wr_reg32(info, RDCSR, (BIT6 + BIT2 + BIT0));
+               }
        }
 
        slgt_irq_on(info, IRQ_RXOVER);
@@ -4467,6 +4527,8 @@ static void free_rbufs(struct slgt_info *info, unsigned int i, unsigned int last
 static void reset_rbufs(struct slgt_info *info)
 {
        free_rbufs(info, 0, info->rbuf_count - 1);
+       info->rbuf_fill_index = 0;
+       info->rbuf_fill_count = 0;
 }
 
 /*
index 26de60efe4b247162c3be863bb3a3c66d86d6c49..6f727e3c53ade1028d21cf11f364bdcc2fafbed9 100644 (file)
@@ -3277,13 +3277,16 @@ static int carrier_raised(struct tty_port *port)
        return (info->serial_signals & SerialSignal_DCD) ? 1 : 0;
 }
 
-static void raise_dtr_rts(struct tty_port *port)
+static void dtr_rts(struct tty_port *port, int on)
 {
        SLMP_INFO *info = container_of(port, SLMP_INFO, port);
        unsigned long flags;
 
        spin_lock_irqsave(&info->lock,flags);
-       info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       if (on)
+               info->serial_signals |= SerialSignal_RTS + SerialSignal_DTR;
+       else
+               info->serial_signals &= ~(SerialSignal_RTS + SerialSignal_DTR);
        set_signals(info);
        spin_unlock_irqrestore(&info->lock,flags);
 }
@@ -3746,7 +3749,7 @@ static void add_device(SLMP_INFO *info)
 
 static const struct tty_port_operations port_ops = {
        .carrier_raised = carrier_raised,
-       .raise_dtr_rts = raise_dtr_rts,
+       .dtr_rts = dtr_rts,
 };
 
 /* Allocate and initialize a device instance structure
index 55ba6f142883f6de16cc0460b540bbc64b932738..ac16fbec72d03018998b63fea0dd517800c2929f 100644 (file)
@@ -29,10 +29,7 @@ static struct tty_audit_buf *tty_audit_buf_alloc(int major, int minor,
        buf = kmalloc(sizeof(*buf), GFP_KERNEL);
        if (!buf)
                goto err;
-       if (PAGE_SIZE != N_TTY_BUF_SIZE)
-               buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
-       else
-               buf->data = (unsigned char *)__get_free_page(GFP_KERNEL);
+       buf->data = kmalloc(N_TTY_BUF_SIZE, GFP_KERNEL);
        if (!buf->data)
                goto err_buf;
        atomic_set(&buf->count, 1);
@@ -52,10 +49,7 @@ err:
 static void tty_audit_buf_free(struct tty_audit_buf *buf)
 {
        WARN_ON(buf->valid != 0);
-       if (PAGE_SIZE != N_TTY_BUF_SIZE)
-               kfree(buf->data);
-       else
-               free_page((unsigned long)buf->data);
+       kfree(buf->data);
        kfree(buf);
 }
 
index 66b99a2049e373cacca62d54d1d689daed2417e2..939e198d7670adfdad2068b4036a8d9348779444 100644 (file)
@@ -295,7 +295,7 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
        struct tty_driver *p, *res = NULL;
        int tty_line = 0;
        int len;
-       char *str;
+       char *str, *stp;
 
        for (str = name; *str; str++)
                if ((*str >= '0' && *str <= '9') || *str == ',')
@@ -311,13 +311,14 @@ struct tty_driver *tty_find_polling_driver(char *name, int *line)
        list_for_each_entry(p, &tty_drivers, tty_drivers) {
                if (strncmp(name, p->name, len) != 0)
                        continue;
-               if (*str == ',')
-                       str++;
-               if (*str == '\0')
-                       str = NULL;
+               stp = str;
+               if (*stp == ',')
+                       stp++;
+               if (*stp == '\0')
+                       stp = NULL;
 
                if (tty_line >= 0 && tty_line <= p->num && p->ops &&
-                   p->ops->poll_init && !p->ops->poll_init(p, tty_line, str)) {
+                   p->ops->poll_init && !p->ops->poll_init(p, tty_line, stp)) {
                        res = tty_driver_kref_get(p);
                        *line = tty_line;
                        break;
@@ -469,43 +470,6 @@ void tty_wakeup(struct tty_struct *tty)
 
 EXPORT_SYMBOL_GPL(tty_wakeup);
 
-/**
- *     tty_ldisc_flush -       flush line discipline queue
- *     @tty: tty
- *
- *     Flush the line discipline queue (if any) for this tty. If there
- *     is no line discipline active this is a no-op.
- */
-
-void tty_ldisc_flush(struct tty_struct *tty)
-{
-       struct tty_ldisc *ld = tty_ldisc_ref(tty);
-       if (ld) {
-               if (ld->ops->flush_buffer)
-                       ld->ops->flush_buffer(tty);
-               tty_ldisc_deref(ld);
-       }
-       tty_buffer_flush(tty);
-}
-
-EXPORT_SYMBOL_GPL(tty_ldisc_flush);
-
-/**
- *     tty_reset_termios       -       reset terminal state
- *     @tty: tty to reset
- *
- *     Restore a terminal to the driver default state
- */
-
-static void tty_reset_termios(struct tty_struct *tty)
-{
-       mutex_lock(&tty->termios_mutex);
-       *tty->termios = tty->driver->init_termios;
-       tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
-       tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
-       mutex_unlock(&tty->termios_mutex);
-}
-
 /**
  *     do_tty_hangup           -       actual handler for hangup events
  *     @work: tty device
@@ -535,7 +499,6 @@ static void do_tty_hangup(struct work_struct *work)
        struct file *cons_filp = NULL;
        struct file *filp, *f = NULL;
        struct task_struct *p;
-       struct tty_ldisc *ld;
        int    closecount = 0, n;
        unsigned long flags;
        int refs = 0;
@@ -566,40 +529,8 @@ static void do_tty_hangup(struct work_struct *work)
                filp->f_op = &hung_up_tty_fops;
        }
        file_list_unlock();
-       /*
-        * FIXME! What are the locking issues here? This may me overdoing
-        * things... This question is especially important now that we've
-        * removed the irqlock.
-        */
-       ld = tty_ldisc_ref(tty);
-       if (ld != NULL) {
-               /* We may have no line discipline at this point */
-               if (ld->ops->flush_buffer)
-                       ld->ops->flush_buffer(tty);
-               tty_driver_flush_buffer(tty);
-               if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
-                   ld->ops->write_wakeup)
-                       ld->ops->write_wakeup(tty);
-               if (ld->ops->hangup)
-                       ld->ops->hangup(tty);
-       }
-       /*
-        * FIXME: Once we trust the LDISC code better we can wait here for
-        * ldisc completion and fix the driver call race
-        */
-       wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
-       wake_up_interruptible_poll(&tty->read_wait, POLLIN);
-       /*
-        * Shutdown the current line discipline, and reset it to
-        * N_TTY.
-        */
-       if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS)
-               tty_reset_termios(tty);
-       /* Defer ldisc switch */
-       /* tty_deferred_ldisc_switch(N_TTY);
 
-         This should get done automatically when the port closes and
-         tty_release is called */
+       tty_ldisc_hangup(tty);
 
        read_lock(&tasklist_lock);
        if (tty->session) {
@@ -628,12 +559,15 @@ static void do_tty_hangup(struct work_struct *work)
        read_unlock(&tasklist_lock);
 
        spin_lock_irqsave(&tty->ctrl_lock, flags);
-       tty->flags = 0;
+       clear_bit(TTY_THROTTLED, &tty->flags);
+       clear_bit(TTY_PUSH, &tty->flags);
+       clear_bit(TTY_DO_WRITE_WAKEUP, &tty->flags);
        put_pid(tty->session);
        put_pid(tty->pgrp);
        tty->session = NULL;
        tty->pgrp = NULL;
        tty->ctrl_status = 0;
+       set_bit(TTY_HUPPED, &tty->flags);
        spin_unlock_irqrestore(&tty->ctrl_lock, flags);
 
        /* Account for the p->signal references we killed */
@@ -659,10 +593,7 @@ static void do_tty_hangup(struct work_struct *work)
         * can't yet guarantee all that.
         */
        set_bit(TTY_HUPPED, &tty->flags);
-       if (ld) {
-               tty_ldisc_enable(tty);
-               tty_ldisc_deref(ld);
-       }
+       tty_ldisc_enable(tty);
        unlock_kernel();
        if (f)
                fput(f);
@@ -2480,6 +2411,24 @@ static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int
        return tty->ops->tiocmset(tty, file, set, clear);
 }
 
+struct tty_struct *tty_pair_get_tty(struct tty_struct *tty)
+{
+       if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+           tty->driver->subtype == PTY_TYPE_MASTER)
+               tty = tty->link;
+       return tty;
+}
+EXPORT_SYMBOL(tty_pair_get_tty);
+
+struct tty_struct *tty_pair_get_pty(struct tty_struct *tty)
+{
+       if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
+           tty->driver->subtype == PTY_TYPE_MASTER)
+           return tty;
+       return tty->link;
+}
+EXPORT_SYMBOL(tty_pair_get_pty);
+
 /*
  * Split this up, as gcc can choke on it otherwise..
  */
@@ -2495,11 +2444,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        if (tty_paranoia_check(tty, inode, "tty_ioctl"))
                return -EINVAL;
 
-       real_tty = tty;
-       if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
-           tty->driver->subtype == PTY_TYPE_MASTER)
-               real_tty = tty->link;
-
+       real_tty = tty_pair_get_tty(tty);
 
        /*
         * Factor out some common prep work
@@ -2555,7 +2500,7 @@ long tty_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        case TIOCGSID:
                return tiocgsid(tty, real_tty, p);
        case TIOCGETD:
-               return put_user(tty->ldisc.ops->num, (int __user *)p);
+               return put_user(tty->ldisc->ops->num, (int __user *)p);
        case TIOCSETD:
                return tiocsetd(tty, p);
        /*
@@ -2770,6 +2715,7 @@ void initialize_tty_struct(struct tty_struct *tty,
        tty->buf.head = tty->buf.tail = NULL;
        tty_buffer_init(tty);
        mutex_init(&tty->termios_mutex);
+       mutex_init(&tty->ldisc_mutex);
        init_waitqueue_head(&tty->write_wait);
        init_waitqueue_head(&tty->read_wait);
        INIT_WORK(&tty->hangup_work, do_tty_hangup);
index 6f4c7d0a53bf36bc6ff28afeadca4720704e459f..8116bb1c8f801c505816b0eec531bed441ff0842 100644 (file)
@@ -97,14 +97,19 @@ EXPORT_SYMBOL(tty_driver_flush_buffer);
  *     @tty: terminal
  *
  *     Indicate that a tty should stop transmitting data down the stack.
+ *     Takes the termios mutex to protect against parallel throttle/unthrottle
+ *     and also to ensure the driver can consistently reference its own
+ *     termios data at this point when implementing software flow control.
  */
 
 void tty_throttle(struct tty_struct *tty)
 {
+       mutex_lock(&tty->termios_mutex);
        /* check TTY_THROTTLED first so it indicates our state */
        if (!test_and_set_bit(TTY_THROTTLED, &tty->flags) &&
            tty->ops->throttle)
                tty->ops->throttle(tty);
+       mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_throttle);
 
@@ -113,13 +118,21 @@ EXPORT_SYMBOL(tty_throttle);
  *     @tty: terminal
  *
  *     Indicate that a tty may continue transmitting data down the stack.
+ *     Takes the termios mutex to protect against parallel throttle/unthrottle
+ *     and also to ensure the driver can consistently reference its own
+ *     termios data at this point when implementing software flow control.
+ *
+ *     Drivers should however remember that the stack can issue a throttle,
+ *     then change flow control method, then unthrottle.
  */
 
 void tty_unthrottle(struct tty_struct *tty)
 {
+       mutex_lock(&tty->termios_mutex);
        if (test_and_clear_bit(TTY_THROTTLED, &tty->flags) &&
            tty->ops->unthrottle)
                tty->ops->unthrottle(tty);
+       mutex_unlock(&tty->termios_mutex);
 }
 EXPORT_SYMBOL(tty_unthrottle);
 
@@ -613,9 +626,25 @@ static int set_termios(struct tty_struct *tty, void __user *arg, int opt)
        return 0;
 }
 
+static void copy_termios(struct tty_struct *tty, struct ktermios *kterm)
+{
+       mutex_lock(&tty->termios_mutex);
+       memcpy(kterm, tty->termios, sizeof(struct ktermios));
+       mutex_unlock(&tty->termios_mutex);
+}
+
+static void copy_termios_locked(struct tty_struct *tty, struct ktermios *kterm)
+{
+       mutex_lock(&tty->termios_mutex);
+       memcpy(kterm, tty->termios_locked, sizeof(struct ktermios));
+       mutex_unlock(&tty->termios_mutex);
+}
+
 static int get_termio(struct tty_struct *tty, struct termio __user *termio)
 {
-       if (kernel_termios_to_user_termio(termio, tty->termios))
+       struct ktermios kterm;
+       copy_termios(tty, &kterm);
+       if (kernel_termios_to_user_termio(termio, &kterm))
                return -EFAULT;
        return 0;
 }
@@ -917,6 +946,8 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
        struct tty_struct *real_tty;
        void __user *p = (void __user *)arg;
        int ret = 0;
+       struct ktermios kterm;
+       struct termiox ktermx;
 
        if (tty->driver->type == TTY_DRIVER_TYPE_PTY &&
            tty->driver->subtype == PTY_TYPE_MASTER)
@@ -952,23 +983,20 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
                return set_termios(real_tty, p, TERMIOS_OLD);
 #ifndef TCGETS2
        case TCGETS:
-               mutex_lock(&real_tty->termios_mutex);
-               if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios))
+               copy_termios(real_tty, &kterm);
+               if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
                        ret = -EFAULT;
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
 #else
        case TCGETS:
-               mutex_lock(&real_tty->termios_mutex);
-               if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios))
+               copy_termios(real_tty, &kterm);
+               if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
                        ret = -EFAULT;
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
        case TCGETS2:
-               mutex_lock(&real_tty->termios_mutex);
-               if (kernel_termios_to_user_termios((struct termios2 __user *)arg, real_tty->termios))
+               copy_termios(real_tty, &kterm);
+               if (kernel_termios_to_user_termios((struct termios2 __user *)arg, &kterm))
                        ret = -EFAULT;
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
        case TCSETSF2:
                return set_termios(real_tty, p,  TERMIOS_FLUSH | TERMIOS_WAIT);
@@ -987,34 +1015,36 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
                return set_termios(real_tty, p, TERMIOS_TERMIO);
 #ifndef TCGETS2
        case TIOCGLCKTRMIOS:
-               mutex_lock(&real_tty->termios_mutex);
-               if (kernel_termios_to_user_termios((struct termios __user *)arg, real_tty->termios_locked))
+               copy_termios_locked(real_tty, &kterm);
+               if (kernel_termios_to_user_termios((struct termios __user *)arg, &kterm))
                        ret = -EFAULT;
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
        case TIOCSLCKTRMIOS:
                if (!capable(CAP_SYS_ADMIN))
                        return -EPERM;
-               mutex_lock(&real_tty->termios_mutex);
-               if (user_termios_to_kernel_termios(real_tty->termios_locked,
+               copy_termios_locked(real_tty, &kterm);
+               if (user_termios_to_kernel_termios(&kterm,
                                               (struct termios __user *) arg))
-                       ret = -EFAULT;
+                       return -EFAULT;
+               mutex_lock(&real_tty->termios_mutex);
+               memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
                mutex_unlock(&real_tty->termios_mutex);
-               return ret;
+               return 0;
 #else
        case TIOCGLCKTRMIOS:
-               mutex_lock(&real_tty->termios_mutex);
-               if (kernel_termios_to_user_termios_1((struct termios __user *)arg, real_tty->termios_locked))
+               copy_termios_locked(real_tty, &kterm);
+               if (kernel_termios_to_user_termios_1((struct termios __user *)arg, &kterm))
                        ret = -EFAULT;
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
        case TIOCSLCKTRMIOS:
                if (!capable(CAP_SYS_ADMIN))
-                       ret = -EPERM;
-               mutex_lock(&real_tty->termios_mutex);
-               if (user_termios_to_kernel_termios_1(real_tty->termios_locked,
+                       return -EPERM;
+               copy_termios_locked(real_tty, &kterm);
+               if (user_termios_to_kernel_termios_1(&kterm,
                                               (struct termios __user *) arg))
-                       ret = -EFAULT;
+                       return -EFAULT;
+               mutex_lock(&real_tty->termios_mutex);
+               memcpy(real_tty->termios_locked, &kterm, sizeof(struct ktermios));
                mutex_unlock(&real_tty->termios_mutex);
                return ret;
 #endif
@@ -1023,9 +1053,10 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
                if (real_tty->termiox == NULL)
                        return -EINVAL;
                mutex_lock(&real_tty->termios_mutex);
-               if (copy_to_user(p, real_tty->termiox, sizeof(struct termiox)))
-                       ret = -EFAULT;
+               memcpy(&ktermx, real_tty->termiox, sizeof(struct termiox));
                mutex_unlock(&real_tty->termios_mutex);
+               if (copy_to_user(p, &ktermx, sizeof(struct termiox)))
+                       ret = -EFAULT;
                return ret;
        case TCSETX:
                return set_termiox(real_tty, p, 0);
@@ -1035,10 +1066,9 @@ int tty_mode_ioctl(struct tty_struct *tty, struct file *file,
                return set_termiox(real_tty, p, TERMIOS_FLUSH);
 #endif         
        case TIOCGSOFTCAR:
-               mutex_lock(&real_tty->termios_mutex);
-               ret = put_user(C_CLOCAL(real_tty) ? 1 : 0,
+               copy_termios(real_tty, &kterm);
+               ret = put_user((kterm.c_cflag & CLOCAL) ? 1 : 0,
                                                (int __user *)arg);
-               mutex_unlock(&real_tty->termios_mutex);
                return ret;
        case TIOCSSOFTCAR:
                if (get_user(arg, (unsigned int __user *) arg))
index f78f5b0127a88501ec0b21cf9d42c517209ad933..39c8f86dedd49c9e60ebe1e4e218225416bf0375 100644 (file)
@@ -115,19 +115,22 @@ EXPORT_SYMBOL(tty_unregister_ldisc);
 /**
  *     tty_ldisc_try_get       -       try and reference an ldisc
  *     @disc: ldisc number
- *     @ld: tty ldisc structure to complete
  *
  *     Attempt to open and lock a line discipline into place. Return
- *     the line discipline refcounted and assigned in ld. On an error
- *     report the error code back
+ *     the line discipline refcounted or an error.
  */
 
-static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_try_get(int disc)
 {
        unsigned long flags;
+       struct tty_ldisc *ld;
        struct tty_ldisc_ops *ldops;
        int err = -EINVAL;
-       
+
+       ld = kmalloc(sizeof(struct tty_ldisc), GFP_KERNEL);
+       if (ld == NULL)
+               return ERR_PTR(-ENOMEM);
+
        spin_lock_irqsave(&tty_ldisc_lock, flags);
        ld->ops = NULL;
        ldops = tty_ldiscs[disc];
@@ -140,17 +143,19 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
                        /* lock it */
                        ldops->refcount++;
                        ld->ops = ldops;
+                       ld->refcount = 0;
                        err = 0;
                }
        }
        spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-       return err;
+       if (err)
+               return ERR_PTR(err);
+       return ld;
 }
 
 /**
  *     tty_ldisc_get           -       take a reference to an ldisc
  *     @disc: ldisc number
- *     @ld: tty line discipline structure to use
  *
  *     Takes a reference to a line discipline. Deals with refcounts and
  *     module locking counts. Returns NULL if the discipline is not available.
@@ -161,52 +166,54 @@ static int tty_ldisc_try_get(int disc, struct tty_ldisc *ld)
  *             takes tty_ldisc_lock to guard against ldisc races
  */
 
-static int tty_ldisc_get(int disc, struct tty_ldisc *ld)
+static struct tty_ldisc *tty_ldisc_get(int disc)
 {
-       int err;
+       struct tty_ldisc *ld;
 
        if (disc < N_TTY || disc >= NR_LDISCS)
-               return -EINVAL;
-       err = tty_ldisc_try_get(disc, ld);
-       if (err < 0) {
+               return ERR_PTR(-EINVAL);
+       ld = tty_ldisc_try_get(disc);
+       if (IS_ERR(ld)) {
                request_module("tty-ldisc-%d", disc);
-               err = tty_ldisc_try_get(disc, ld);
+               ld = tty_ldisc_try_get(disc);
        }
-       return err;
+       return ld;
 }
 
 /**
  *     tty_ldisc_put           -       drop ldisc reference
- *     @disc: ldisc number
+ *     @ld: ldisc
  *
  *     Drop a reference to a line discipline. Manage refcounts and
- *     module usage counts
+ *     module usage counts. Free the ldisc once the refcount hits zero.
  *
  *     Locking:
  *             takes tty_ldisc_lock to guard against ldisc races
  */
 
-static void tty_ldisc_put(struct tty_ldisc_ops *ld)
+static void tty_ldisc_put(struct tty_ldisc *ld)
 {
        unsigned long flags;
-       int disc = ld->num;
+       int disc = ld->ops->num;
+       struct tty_ldisc_ops *ldo;
 
        BUG_ON(disc < N_TTY || disc >= NR_LDISCS);
 
        spin_lock_irqsave(&tty_ldisc_lock, flags);
-       ld = tty_ldiscs[disc];
-       BUG_ON(ld->refcount == 0);
-       ld->refcount--;
-       module_put(ld->owner);
+       ldo = tty_ldiscs[disc];
+       BUG_ON(ldo->refcount == 0);
+       ldo->refcount--;
+       module_put(ldo->owner);
        spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+       kfree(ld);
 }
 
-static void * tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
+static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos)
 {
        return (*pos < NR_LDISCS) ? pos : NULL;
 }
 
-static void * tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
+static void *tty_ldiscs_seq_next(struct seq_file *m, void *v, loff_t *pos)
 {
        (*pos)++;
        return (*pos < NR_LDISCS) ? pos : NULL;
@@ -219,12 +226,13 @@ static void tty_ldiscs_seq_stop(struct seq_file *m, void *v)
 static int tty_ldiscs_seq_show(struct seq_file *m, void *v)
 {
        int i = *(loff_t *)v;
-       struct tty_ldisc ld;
-       
-       if (tty_ldisc_get(i, &ld) < 0)
+       struct tty_ldisc *ld;
+
+       ld = tty_ldisc_try_get(i);
+       if (IS_ERR(ld))
                return 0;
-       seq_printf(m, "%-10s %2d\n", ld.ops->name ? ld.ops->name : "???", i);
-       tty_ldisc_put(ld.ops);
+       seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i);
+       tty_ldisc_put(ld);
        return 0;
 }
 
@@ -263,8 +271,7 @@ const struct file_operations tty_ldiscs_proc_fops = {
 
 static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld)
 {
-       ld->refcount = 0;
-       tty->ldisc = *ld;
+       tty->ldisc = ld;
 }
 
 /**
@@ -286,7 +293,7 @@ static int tty_ldisc_try(struct tty_struct *tty)
        int ret = 0;
 
        spin_lock_irqsave(&tty_ldisc_lock, flags);
-       ld = &tty->ldisc;
+       ld = tty->ldisc;
        if (test_bit(TTY_LDISC, &tty->flags)) {
                ld->refcount++;
                ret = 1;
@@ -315,10 +322,9 @@ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty)
 {
        /* wait_event is a macro */
        wait_event(tty_ldisc_wait, tty_ldisc_try(tty));
-       WARN_ON(tty->ldisc.refcount == 0);
-       return &tty->ldisc;
+       WARN_ON(tty->ldisc->refcount == 0);
+       return tty->ldisc;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 
 /**
@@ -335,10 +341,9 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait);
 struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty)
 {
        if (tty_ldisc_try(tty))
-               return &tty->ldisc;
+               return tty->ldisc;
        return NULL;
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_ref);
 
 /**
@@ -366,7 +371,6 @@ void tty_ldisc_deref(struct tty_ldisc *ld)
                wake_up(&tty_ldisc_wait);
        spin_unlock_irqrestore(&tty_ldisc_lock, flags);
 }
-
 EXPORT_SYMBOL_GPL(tty_ldisc_deref);
 
 /**
@@ -388,6 +392,26 @@ void tty_ldisc_enable(struct tty_struct *tty)
        wake_up(&tty_ldisc_wait);
 }
 
+/**
+ *     tty_ldisc_flush -       flush line discipline queue
+ *     @tty: tty
+ *
+ *     Flush the line discipline queue (if any) for this tty. If there
+ *     is no line discipline active this is a no-op.
+ */
+
+void tty_ldisc_flush(struct tty_struct *tty)
+{
+       struct tty_ldisc *ld = tty_ldisc_ref(tty);
+       if (ld) {
+               if (ld->ops->flush_buffer)
+                       ld->ops->flush_buffer(tty);
+               tty_ldisc_deref(ld);
+       }
+       tty_buffer_flush(tty);
+}
+EXPORT_SYMBOL_GPL(tty_ldisc_flush);
+
 /**
  *     tty_set_termios_ldisc           -       set ldisc field
  *     @tty: tty structure
@@ -407,6 +431,39 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
        mutex_unlock(&tty->termios_mutex);
 }
 
+/**
+ *     tty_ldisc_open          -       open a line discipline
+ *     @tty: tty we are opening the ldisc on
+ *     @ld: discipline to open
+ *
+ *     A helper opening method. Also a convenient debugging and check
+ *     point.
+ */
+
+static int tty_ldisc_open(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+       WARN_ON(test_and_set_bit(TTY_LDISC_OPEN, &tty->flags));
+       if (ld->ops->open)
+               return ld->ops->open(tty);
+       return 0;
+}
+
+/**
+ *     tty_ldisc_close         -       close a line discipline
+ *     @tty: tty we are closing the ldisc on
+ *     @ld: discipline to close
+ *
+ *     A helper close method. Also a convenient debugging and check
+ *     point.
+ */
+
+static void tty_ldisc_close(struct tty_struct *tty, struct tty_ldisc *ld)
+{
+       WARN_ON(!test_bit(TTY_LDISC_OPEN, &tty->flags));
+       clear_bit(TTY_LDISC_OPEN, &tty->flags);
+       if (ld->ops->close)
+               ld->ops->close(tty);
+}
 
 /**
  *     tty_ldisc_restore       -       helper for tty ldisc change
@@ -420,66 +477,136 @@ static void tty_set_termios_ldisc(struct tty_struct *tty, int num)
 static void tty_ldisc_restore(struct tty_struct *tty, struct tty_ldisc *old)
 {
        char buf[64];
-       struct tty_ldisc new_ldisc;
+       struct tty_ldisc *new_ldisc;
+       int r;
 
        /* There is an outstanding reference here so this is safe */
-       tty_ldisc_get(old->ops->num, old);
+       old = tty_ldisc_get(old->ops->num);
+       WARN_ON(IS_ERR(old));
        tty_ldisc_assign(tty, old);
        tty_set_termios_ldisc(tty, old->ops->num);
-       if (old->ops->open && (old->ops->open(tty) < 0)) {
-               tty_ldisc_put(old->ops);
+       if (tty_ldisc_open(tty, old) < 0) {
+               tty_ldisc_put(old);
                /* This driver is always present */
-               if (tty_ldisc_get(N_TTY, &new_ldisc) < 0)
+               new_ldisc = tty_ldisc_get(N_TTY);
+               if (IS_ERR(new_ldisc))
                        panic("n_tty: get");
-               tty_ldisc_assign(tty, &new_ldisc);
+               tty_ldisc_assign(tty, new_ldisc);
                tty_set_termios_ldisc(tty, N_TTY);
-               if (new_ldisc.ops->open) {
-                       int r = new_ldisc.ops->open(tty);
-                               if (r < 0)
-                               panic("Couldn't open N_TTY ldisc for "
-                                     "%s --- error %d.",
-                                     tty_name(tty, buf), r);
-               }
+               r = tty_ldisc_open(tty, new_ldisc);
+               if (r < 0)
+                       panic("Couldn't open N_TTY ldisc for "
+                             "%s --- error %d.",
+                             tty_name(tty, buf), r);
        }
 }
 
+/**
+ *     tty_ldisc_halt          -       shut down the line discipline
+ *     @tty: tty device
+ *
+ *     Shut down the line discipline and work queue for this tty device.
+ *     The TTY_LDISC flag being cleared ensures no further references can
+ *     be obtained while the delayed work queue halt ensures that no more
+ *     data is fed to the ldisc.
+ *
+ *     In order to wait for any existing references to complete see
+ *     tty_ldisc_wait_idle.
+ */
+
+static int tty_ldisc_halt(struct tty_struct *tty)
+{
+       clear_bit(TTY_LDISC, &tty->flags);
+       return cancel_delayed_work(&tty->buf.work);
+}
+
+/**
+ *     tty_ldisc_wait_idle     -       wait for the ldisc to become idle
+ *     @tty: tty to wait for
+ *
+ *     Wait for the line discipline to become idle. The discipline must
+ *     have been halted for this to guarantee it remains idle.
+ *
+ *     tty_ldisc_lock protects the ref counts currently.
+ */
+
+static int tty_ldisc_wait_idle(struct tty_struct *tty)
+{
+       unsigned long flags;
+       spin_lock_irqsave(&tty_ldisc_lock, flags);
+       while (tty->ldisc->refcount) {
+               spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+               if (wait_event_timeout(tty_ldisc_wait,
+                               tty->ldisc->refcount == 0, 5 * HZ) == 0)
+                       return -EBUSY;
+               spin_lock_irqsave(&tty_ldisc_lock, flags);
+       }
+       spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+       return 0;
+}
+
 /**
  *     tty_set_ldisc           -       set line discipline
  *     @tty: the terminal to set
  *     @ldisc: the line discipline
  *
  *     Set the discipline of a tty line. Must be called from a process
- *     context.
+ *     context. The ldisc change logic has to protect itself against any
+ *     overlapping ldisc change (including on the other end of pty pairs),
+ *     the close of one side of a tty/pty pair, and eventually hangup.
  *
- *     Locking: takes tty_ldisc_lock.
- *              called functions take termios_mutex
+ *     Locking: takes tty_ldisc_lock, termios_mutex
  */
 
 int tty_set_ldisc(struct tty_struct *tty, int ldisc)
 {
        int retval;
-       struct tty_ldisc o_ldisc, new_ldisc;
-       int work;
-       unsigned long flags;
+       struct tty_ldisc *o_ldisc, *new_ldisc;
+       int work, o_work = 0;
        struct tty_struct *o_tty;
 
-restart:
-       /* This is a bit ugly for now but means we can break the 'ldisc
-          is part of the tty struct' assumption later */
-       retval = tty_ldisc_get(ldisc, &new_ldisc);
-       if (retval)
-               return retval;
+       new_ldisc = tty_ldisc_get(ldisc);
+       if (IS_ERR(new_ldisc))
+               return PTR_ERR(new_ldisc);
+
+       /*
+        *      We need to look at the tty locking here for pty/tty pairs
+        *      when both sides try to change in parallel.
+        */
+
+       o_tty = tty->link;      /* o_tty is the pty side or NULL */
+
+
+       /*
+        *      Check the no-op case
+        */
+
+       if (tty->ldisc->ops->num == ldisc) {
+               tty_ldisc_put(new_ldisc);
+               return 0;
+       }
 
        /*
         *      Problem: What do we do if this blocks ?
+        *      We could deadlock here
         */
 
        tty_wait_until_sent(tty, 0);
 
-       if (tty->ldisc.ops->num == ldisc) {
-               tty_ldisc_put(new_ldisc.ops);
-               return 0;
+       mutex_lock(&tty->ldisc_mutex);
+
+       /*
+        *      We could be midstream of another ldisc change which has
+        *      dropped the lock during processing. If so we need to wait.
+        */
+
+       while (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
+               mutex_unlock(&tty->ldisc_mutex);
+               wait_event(tty_ldisc_wait,
+                       test_bit(TTY_LDISC_CHANGING, &tty->flags) == 0);
+               mutex_lock(&tty->ldisc_mutex);
        }
+       set_bit(TTY_LDISC_CHANGING, &tty->flags);
 
        /*
         *      No more input please, we are switching. The new ldisc
@@ -489,8 +616,6 @@ restart:
        tty->receive_room = 0;
 
        o_ldisc = tty->ldisc;
-       o_tty = tty->link;
-
        /*
         *      Make sure we don't change while someone holds a
         *      reference to the line discipline. The TTY_LDISC bit
@@ -501,108 +626,181 @@ restart:
         *      with a userspace app continually trying to use the tty in
         *      parallel to the change and re-referencing the tty.
         */
-       clear_bit(TTY_LDISC, &tty->flags);
-       if (o_tty)
-               clear_bit(TTY_LDISC, &o_tty->flags);
 
-       spin_lock_irqsave(&tty_ldisc_lock, flags);
-       if (tty->ldisc.refcount || (o_tty && o_tty->ldisc.refcount)) {
-               if (tty->ldisc.refcount) {
-                       /* Free the new ldisc we grabbed. Must drop the lock
-                          first. */
-                       spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-                       tty_ldisc_put(o_ldisc.ops);
-                       /*
-                        * There are several reasons we may be busy, including
-                        * random momentary I/O traffic. We must therefore
-                        * retry. We could distinguish between blocking ops
-                        * and retries if we made tty_ldisc_wait() smarter.
-                        * That is up for discussion.
-                        */
-                       if (wait_event_interruptible(tty_ldisc_wait, tty->ldisc.refcount == 0) < 0)
-                               return -ERESTARTSYS;
-                       goto restart;
-               }
-               if (o_tty && o_tty->ldisc.refcount) {
-                       spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-                       tty_ldisc_put(o_tty->ldisc.ops);
-                       if (wait_event_interruptible(tty_ldisc_wait, o_tty->ldisc.refcount == 0) < 0)
-                               return -ERESTARTSYS;
-                       goto restart;
-               }
-       }
-       /*
-        *      If the TTY_LDISC bit is set, then we are racing against
-        *      another ldisc change
-        */
-       if (test_bit(TTY_LDISC_CHANGING, &tty->flags)) {
-               struct tty_ldisc *ld;
-               spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-               tty_ldisc_put(new_ldisc.ops);
-               ld = tty_ldisc_ref_wait(tty);
-               tty_ldisc_deref(ld);
-               goto restart;
-       }
-       /*
-        *      This flag is used to avoid two parallel ldisc changes. Once
-        *      open and close are fine grained locked this may work better
-        *      as a mutex shared with the open/close/hup paths
-        */
-       set_bit(TTY_LDISC_CHANGING, &tty->flags);
+       work = tty_ldisc_halt(tty);
        if (o_tty)
-               set_bit(TTY_LDISC_CHANGING, &o_tty->flags);
-       spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-       
-       /*
-        *      From this point on we know nobody has an ldisc
-        *      usage reference, nor can they obtain one until
-        *      we say so later on.
-        */
+               o_work = tty_ldisc_halt(o_tty);
 
-       work = cancel_delayed_work(&tty->buf.work);
        /*
-        * Wait for ->hangup_work and ->buf.work handlers to terminate
-        * MUST NOT hold locks here.
+        * Wait for ->hangup_work and ->buf.work handlers to terminate.
+        * We must drop the mutex here in case a hangup is also in process.
         */
+
+       mutex_unlock(&tty->ldisc_mutex);
+
        flush_scheduled_work();
+
+       /* Let any existing reference holders finish */
+       retval = tty_ldisc_wait_idle(tty);
+       if (retval < 0) {
+               clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+               tty_ldisc_put(new_ldisc);
+               return retval;
+       }
+
+       mutex_lock(&tty->ldisc_mutex);
+       if (test_bit(TTY_HUPPED, &tty->flags)) {
+               /* We were raced by the hangup method. It will have stomped
+                  the ldisc data and closed the ldisc down */
+               clear_bit(TTY_LDISC_CHANGING, &tty->flags);
+               mutex_unlock(&tty->ldisc_mutex);
+               tty_ldisc_put(new_ldisc);
+               return -EIO;
+       }
+
        /* Shutdown the current discipline. */
-       if (o_ldisc.ops->close)
-               (o_ldisc.ops->close)(tty);
+       tty_ldisc_close(tty, o_ldisc);
 
        /* Now set up the new line discipline. */
-       tty_ldisc_assign(tty, &new_ldisc);
+       tty_ldisc_assign(tty, new_ldisc);
        tty_set_termios_ldisc(tty, ldisc);
-       if (new_ldisc.ops->open)
-               retval = (new_ldisc.ops->open)(tty);
+
+       retval = tty_ldisc_open(tty, new_ldisc);
        if (retval < 0) {
-               tty_ldisc_put(new_ldisc.ops);
-               tty_ldisc_restore(tty, &o_ldisc);
+               /* Back to the old one or N_TTY if we can't */
+               tty_ldisc_put(new_ldisc);
+               tty_ldisc_restore(tty, o_ldisc);
        }
+
        /* At this point we hold a reference to the new ldisc and a
           a reference to the old ldisc. If we ended up flipping back
           to the existing ldisc we have two references to it */
 
-       if (tty->ldisc.ops->num != o_ldisc.ops->num && tty->ops->set_ldisc)
+       if (tty->ldisc->ops->num != o_ldisc->ops->num && tty->ops->set_ldisc)
                tty->ops->set_ldisc(tty);
 
-       tty_ldisc_put(o_ldisc.ops);
+       tty_ldisc_put(o_ldisc);
 
        /*
-        *      Allow ldisc referencing to occur as soon as the driver
-        *      ldisc callback completes.
+        *      Allow ldisc referencing to occur again
         */
 
        tty_ldisc_enable(tty);
        if (o_tty)
                tty_ldisc_enable(o_tty);
 
-       /* Restart it in case no characters kick it off. Safe if
+       /* Restart the work queue in case no characters kick it off. Safe if
           already running */
        if (work)
                schedule_delayed_work(&tty->buf.work, 1);
+       if (o_work)
+               schedule_delayed_work(&o_tty->buf.work, 1);
+       mutex_unlock(&tty->ldisc_mutex);
        return retval;
 }
 
+/**
+ *     tty_reset_termios       -       reset terminal state
+ *     @tty: tty to reset
+ *
+ *     Restore a terminal to the driver default state.
+ */
+
+static void tty_reset_termios(struct tty_struct *tty)
+{
+       mutex_lock(&tty->termios_mutex);
+       *tty->termios = tty->driver->init_termios;
+       tty->termios->c_ispeed = tty_termios_input_baud_rate(tty->termios);
+       tty->termios->c_ospeed = tty_termios_baud_rate(tty->termios);
+       mutex_unlock(&tty->termios_mutex);
+}
+
+
+/**
+ *     tty_ldisc_reinit        -       reinitialise the tty ldisc
+ *     @tty: tty to reinit
+ *
+ *     Switch the tty back to N_TTY line discipline and leave the
+ *     ldisc state closed
+ */
+
+static void tty_ldisc_reinit(struct tty_struct *tty)
+{
+       struct tty_ldisc *ld;
+
+       tty_ldisc_close(tty, tty->ldisc);
+       tty_ldisc_put(tty->ldisc);
+       tty->ldisc = NULL;
+       /*
+        *      Switch the line discipline back
+        */
+       ld = tty_ldisc_get(N_TTY);
+       BUG_ON(IS_ERR(ld));
+       tty_ldisc_assign(tty, ld);
+       tty_set_termios_ldisc(tty, N_TTY);
+}
+
+/**
+ *     tty_ldisc_hangup                -       hangup ldisc reset
+ *     @tty: tty being hung up
+ *
+ *     Some tty devices reset their termios when they receive a hangup
+ *     event. In that situation we must also switch back to N_TTY properly
+ *     before we reset the termios data.
+ *
+ *     Locking: We can take the ldisc mutex as the rest of the code is
+ *     careful to allow for this.
+ *
+ *     In the pty pair case this occurs in the close() path of the
+ *     tty itself so we must be careful about locking rules.
+ */
+
+void tty_ldisc_hangup(struct tty_struct *tty)
+{
+       struct tty_ldisc *ld;
+
+       /*
+        * FIXME! What are the locking issues here? This may be overdoing
+        * things... This question is especially important now that we've
+        * removed the irqlock.
+        */
+       ld = tty_ldisc_ref(tty);
+       if (ld != NULL) {
+               /* We may have no line discipline at this point */
+               if (ld->ops->flush_buffer)
+                       ld->ops->flush_buffer(tty);
+               tty_driver_flush_buffer(tty);
+               if ((test_bit(TTY_DO_WRITE_WAKEUP, &tty->flags)) &&
+                   ld->ops->write_wakeup)
+                       ld->ops->write_wakeup(tty);
+               if (ld->ops->hangup)
+                       ld->ops->hangup(tty);
+               tty_ldisc_deref(ld);
+       }
+       /*
+        * FIXME: Once we trust the LDISC code better we can wait here for
+        * ldisc completion and fix the driver call race
+        */
+       wake_up_interruptible_poll(&tty->write_wait, POLLOUT);
+       wake_up_interruptible_poll(&tty->read_wait, POLLIN);
+       /*
+        * Shutdown the current line discipline, and reset it to
+        * N_TTY.
+        */
+       if (tty->driver->flags & TTY_DRIVER_RESET_TERMIOS) {
+               /* Avoid racing set_ldisc */
+               mutex_lock(&tty->ldisc_mutex);
+               /* Switch back to N_TTY */
+               tty_ldisc_reinit(tty);
+               /* At this point we have a closed ldisc and we want to
+                  reopen it. We could defer this to the next open but
+                  it means auditing a lot of other paths so this is a FIXME */
+               WARN_ON(tty_ldisc_open(tty, tty->ldisc));
+               tty_ldisc_enable(tty);
+               mutex_unlock(&tty->ldisc_mutex);
+               tty_reset_termios(tty);
+       }
+}
 
 /**
  *     tty_ldisc_setup                 -       open line discipline
@@ -610,24 +808,23 @@ restart:
  *     @o_tty: pair tty for pty/tty pairs
  *
  *     Called during the initial open of a tty/pty pair in order to set up the
- *     line discplines and bind them to the tty.
+ *     line disciplines and bind them to the tty. This has no locking issues
+ *     as the device isn't yet active.
  */
 
 int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-       struct tty_ldisc *ld = &tty->ldisc;
+       struct tty_ldisc *ld = tty->ldisc;
        int retval;
 
-       if (ld->ops->open) {
-               retval = (ld->ops->open)(tty);
-               if (retval)
-                       return retval;
-       }
-       if (o_tty && o_tty->ldisc.ops->open) {
-               retval = (o_tty->ldisc.ops->open)(o_tty);
+       retval = tty_ldisc_open(tty, ld);
+       if (retval)
+               return retval;
+
+       if (o_tty) {
+               retval = tty_ldisc_open(o_tty, o_tty->ldisc);
                if (retval) {
-                       if (ld->ops->close)
-                               (ld->ops->close)(tty);
+                       tty_ldisc_close(tty, ld);
                        return retval;
                }
                tty_ldisc_enable(o_tty);
@@ -635,32 +832,25 @@ int tty_ldisc_setup(struct tty_struct *tty, struct tty_struct *o_tty)
        tty_ldisc_enable(tty);
        return 0;
 }
-
 /**
  *     tty_ldisc_release               -       release line discipline
  *     @tty: tty being shut down
  *     @o_tty: pair tty for pty/tty pairs
  *
- *     Called during the final close of a tty/pty pair in order to shut down the
- *     line discpline layer.
+ *     Called during the final close of a tty/pty pair in order to shut down
+ *     the line discipline layer. On exit the ldisc assigned is N_TTY and the
+ *     ldisc has not been opened.
  */
 
 void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 {
-       unsigned long flags;
-       struct tty_ldisc ld;
        /*
         * Prevent flush_to_ldisc() from rescheduling the work for later.  Then
         * kill any delayed work. As this is the final close it does not
         * race with the set_ldisc code path.
         */
-       clear_bit(TTY_LDISC, &tty->flags);
-       cancel_delayed_work(&tty->buf.work);
-
-       /*
-        * Wait for ->hangup_work and ->buf.work handlers to terminate
-        */
 
+       tty_ldisc_halt(tty);
        flush_scheduled_work();
 
        /*
@@ -668,38 +858,19 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
         * side waiters as the file is closing so user count on the file
         * side is zero.
         */
-       spin_lock_irqsave(&tty_ldisc_lock, flags);
-       while (tty->ldisc.refcount) {
-               spin_unlock_irqrestore(&tty_ldisc_lock, flags);
-               wait_event(tty_ldisc_wait, tty->ldisc.refcount == 0);
-               spin_lock_irqsave(&tty_ldisc_lock, flags);
-       }
-       spin_unlock_irqrestore(&tty_ldisc_lock, flags);
+
+       tty_ldisc_wait_idle(tty);
+
        /*
         * Shutdown the current line discipline, and reset it to N_TTY.
         *
         * FIXME: this MUST get fixed for the new reflocking
         */
-       if (tty->ldisc.ops->close)
-               (tty->ldisc.ops->close)(tty);
-       tty_ldisc_put(tty->ldisc.ops);
 
-       /*
-        *      Switch the line discipline back
-        */
-       WARN_ON(tty_ldisc_get(N_TTY, &ld));
-       tty_ldisc_assign(tty, &ld);
-       tty_set_termios_ldisc(tty, N_TTY);
-       if (o_tty) {
-               /* FIXME: could o_tty be in setldisc here ? */
-               clear_bit(TTY_LDISC, &o_tty->flags);
-               if (o_tty->ldisc.ops->close)
-                       (o_tty->ldisc.ops->close)(o_tty);
-               tty_ldisc_put(o_tty->ldisc.ops);
-               WARN_ON(tty_ldisc_get(N_TTY, &ld));
-               tty_ldisc_assign(o_tty, &ld);
-               tty_set_termios_ldisc(o_tty, N_TTY);
-       }
+       tty_ldisc_reinit(tty);
+       /* This will need doing differently if we need to lock */
+       if (o_tty)
+               tty_ldisc_release(o_tty, NULL);
 }
 
 /**
@@ -712,10 +883,10 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty)
 
 void tty_ldisc_init(struct tty_struct *tty)
 {
-       struct tty_ldisc ld;
-       if (tty_ldisc_get(N_TTY, &ld) < 0)
+       struct tty_ldisc *ld = tty_ldisc_get(N_TTY);
+       if (IS_ERR(ld))
                panic("n_tty: init_tty");
-       tty_ldisc_assign(tty, &ld);
+       tty_ldisc_assign(tty, ld);
 }
 
 void tty_ldisc_begin(void)
index 9b8004c72686a45f6ea577f1cd342e001dbbfa49..62dadfc95e341078ef42a284366cef37dbac6432 100644 (file)
@@ -137,7 +137,7 @@ int tty_port_carrier_raised(struct tty_port *port)
 EXPORT_SYMBOL(tty_port_carrier_raised);
 
 /**
- *     tty_port_raise_dtr_rts  -       Riase DTR/RTS
+ *     tty_port_raise_dtr_rts  -       Raise DTR/RTS
  *     @port: tty port
  *
  *     Wrapper for the DTR/RTS raise logic. For the moment this is used
@@ -147,11 +147,27 @@ EXPORT_SYMBOL(tty_port_carrier_raised);
 
 void tty_port_raise_dtr_rts(struct tty_port *port)
 {
-       if (port->ops->raise_dtr_rts)
-               port->ops->raise_dtr_rts(port);
+       if (port->ops->dtr_rts)
+               port->ops->dtr_rts(port, 1);
 }
 EXPORT_SYMBOL(tty_port_raise_dtr_rts);
 
+/**
+ *     tty_port_lower_dtr_rts  -       Lower DTR/RTS
+ *     @port: tty port
+ *
+ *     Wrapper for the DTR/RTS lower logic. For the moment this is used
+ *     to hide some internal details. This will eventually become entirely
+ *     internal to the tty port.
+ */
+
+void tty_port_lower_dtr_rts(struct tty_port *port)
+{
+       if (port->ops->dtr_rts)
+               port->ops->dtr_rts(port, 0);
+}
+EXPORT_SYMBOL(tty_port_lower_dtr_rts);
+
 /**
  *     tty_port_block_til_ready        -       Waiting logic for tty open
  *     @port: the tty port being opened
@@ -167,7 +183,7 @@ EXPORT_SYMBOL(tty_port_raise_dtr_rts);
  *             - port flags and counts
  *
  *     The passed tty_port must implement the carrier_raised method if it can
- *     do carrier detect and the raise_dtr_rts method if it supports software
+ *     do carrier detect and the dtr_rts method if it supports software
  *     management of these lines. Note that the dtr/rts raise is done each
  *     iteration as a hangup may have previously dropped them while we wait.
  */
@@ -182,7 +198,8 @@ int tty_port_block_til_ready(struct tty_port *port,
 
        /* block if port is in the process of being closed */
        if (tty_hung_up_p(filp) || port->flags & ASYNC_CLOSING) {
-               interruptible_sleep_on(&port->close_wait);
+               wait_event_interruptible(port->close_wait,
+                               !(port->flags & ASYNC_CLOSING));
                if (port->flags & ASYNC_HUP_NOTIFY)
                        return -EAGAIN;
                else
@@ -205,7 +222,6 @@ int tty_port_block_til_ready(struct tty_port *port,
           before the next open may complete */
 
        retval = 0;
-       add_wait_queue(&port->open_wait, &wait);
 
        /* The port lock protects the port counts */
        spin_lock_irqsave(&port->lock, flags);
@@ -219,7 +235,7 @@ int tty_port_block_til_ready(struct tty_port *port,
                if (tty->termios->c_cflag & CBAUD)
                        tty_port_raise_dtr_rts(port);
 
-               set_current_state(TASK_INTERRUPTIBLE);
+               prepare_to_wait(&port->open_wait, &wait, TASK_INTERRUPTIBLE);
                /* Check for a hangup or uninitialised port. Return accordingly */
                if (tty_hung_up_p(filp) || !(port->flags & ASYNC_INITIALIZED)) {
                        if (port->flags & ASYNC_HUP_NOTIFY)
@@ -240,8 +256,7 @@ int tty_port_block_til_ready(struct tty_port *port,
                }
                schedule();
        }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&port->open_wait, &wait);
+       finish_wait(&port->open_wait, &wait);
 
        /* Update counts. A parallel hangup will have set count to zero and
           we must not mess that up further */
@@ -292,6 +307,17 @@ int tty_port_close_start(struct tty_port *port, struct tty_struct *tty, struct f
        if (port->flags & ASYNC_INITIALIZED &&
                        port->closing_wait != ASYNC_CLOSING_WAIT_NONE)
                tty_wait_until_sent(tty, port->closing_wait);
+       if (port->drain_delay) {
+               unsigned int bps = tty_get_baud_rate(tty);
+               long timeout;
+
+               if (bps > 1200)
+                       timeout = max_t(long, (HZ * 10 * port->drain_delay) / bps,
+                                                               HZ / 10);
+               else
+                       timeout = 2 * HZ;
+               schedule_timeout_interruptible(timeout);
+       }
        return 1;
 }
 EXPORT_SYMBOL(tty_port_close_start);
@@ -302,6 +328,9 @@ void tty_port_close_end(struct tty_port *port, struct tty_struct *tty)
 
        tty_ldisc_flush(tty);
 
+       if (tty->termios->c_cflag & HUPCL)
+               tty_port_lower_dtr_rts(port);
+
        spin_lock_irqsave(&port->lock, flags);
        tty->closing = 0;
 
index 537da1cde16d6aa00c21d5b51d9c73aa6a7c71fa..e59b6dee9ae24e79196309a6c2031216b1924448 100644 (file)
@@ -402,27 +402,23 @@ static u8 ali_cable_detect(ide_hwif_t *hwif)
        return cbl;
 }
 
-#if !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC)
+#ifndef CONFIG_SPARC64
 /**
  *     init_hwif_ali15x3       -       Initialize the ALI IDE x86 stuff
  *     @hwif: interface to configure
  *
  *     Obtain the IRQ tables for an ALi based IDE solution on the PC
  *     class platforms. This part of the code isn't applicable to the
- *     Sparc and PowerPC systems.
+ *     Sparc systems.
  */
 
 static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 {
-       struct pci_dev *dev = to_pci_dev(hwif->dev);
        u8 ideic, inmir;
        s8 irq_routing_table[] = { -1,  9, 3, 10, 4,  5, 7,  6,
                                      1, 11, 0, 12, 0, 14, 0, 15 };
        int irq = -1;
 
-       if (dev->device == PCI_DEVICE_ID_AL_M5229)
-               hwif->irq = hwif->channel ? 15 : 14;
-
        if (isa_dev) {
                /*
                 * read IDE interface control
@@ -455,7 +451,7 @@ static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif)
 }
 #else
 #define init_hwif_ali15x3 NULL
-#endif /* !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC) */
+#endif /* CONFIG_SPARC64 */
 
 /**
  *     init_dma_ali15x3        -       set up DMA on ALi15x3
index 7201b176d75b05814665096ab45fe372900fbe99..afe5a4323879314b91198f5133fb19c230cea21f 100644 (file)
@@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_init_pc);
 
-/*
- * Generate a new packet command request in front of the request queue, before
- * the current request, so that it will be processed immediately, on the next
- * pass through the driver.
- */
-static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk,
-                             struct ide_atapi_pc *pc, struct request *rq)
-{
-       blk_rq_init(NULL, rq);
-       rq->cmd_type = REQ_TYPE_SPECIAL;
-       rq->cmd_flags |= REQ_PREEMPT;
-       rq->buffer = (char *)pc;
-       rq->rq_disk = disk;
-
-       if (pc->req_xfer) {
-               rq->data = pc->buf;
-               rq->data_len = pc->req_xfer;
-       }
-
-       memcpy(rq->cmd, pc->c, 12);
-       if (drive->media == ide_tape)
-               rq->cmd[13] = REQ_IDETAPE_PC1;
-
-       drive->hwif->rq = NULL;
-
-       elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 /*
  * Add a special packet command request to the tail of the request queue,
  * and wait for it to be serviced.
@@ -119,19 +91,21 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk,
 
        rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
        rq->cmd_type = REQ_TYPE_SPECIAL;
-       rq->buffer = (char *)pc;
+       rq->special = (char *)pc;
 
        if (pc->req_xfer) {
-               rq->data = pc->buf;
-               rq->data_len = pc->req_xfer;
+               error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer,
+                                       GFP_NOIO);
+               if (error)
+                       goto put_req;
        }
 
        memcpy(rq->cmd, pc->c, 12);
        if (drive->media == ide_tape)
                rq->cmd[13] = REQ_IDETAPE_PC1;
        error = blk_execute_rq(drive->queue, disk, rq, 0);
+put_req:
        blk_put_request(rq);
-
        return error;
 }
 EXPORT_SYMBOL_GPL(ide_queue_pc_tail);
@@ -191,20 +165,103 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc)
 }
 EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd);
 
+void ide_prep_sense(ide_drive_t *drive, struct request *rq)
+{
+       struct request_sense *sense = &drive->sense_data;
+       struct request *sense_rq = &drive->sense_rq;
+       unsigned int cmd_len, sense_len;
+       int err;
+
+       debug_log("%s: enter\n", __func__);
+
+       switch (drive->media) {
+       case ide_floppy:
+               cmd_len = 255;
+               sense_len = 18;
+               break;
+       case ide_tape:
+               cmd_len = 20;
+               sense_len = 20;
+               break;
+       default:
+               cmd_len = 18;
+               sense_len = 18;
+       }
+
+       BUG_ON(sense_len > sizeof(*sense));
+
+       if (blk_sense_request(rq) || drive->sense_rq_armed)
+               return;
+
+       memset(sense, 0, sizeof(*sense));
+
+       blk_rq_init(rq->q, sense_rq);
+
+       err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len,
+                             GFP_NOIO);
+       if (unlikely(err)) {
+               if (printk_ratelimit())
+                       printk(KERN_WARNING "%s: failed to map sense buffer\n",
+                              drive->name);
+               return;
+       }
+
+       sense_rq->rq_disk = rq->rq_disk;
+       sense_rq->cmd[0] = GPCMD_REQUEST_SENSE;
+       sense_rq->cmd[4] = cmd_len;
+       sense_rq->cmd_type = REQ_TYPE_SENSE;
+       sense_rq->cmd_flags |= REQ_PREEMPT;
+
+       if (drive->media == ide_tape)
+               sense_rq->cmd[13] = REQ_IDETAPE_PC1;
+
+       drive->sense_rq_armed = true;
+}
+EXPORT_SYMBOL_GPL(ide_prep_sense);
+
+int ide_queue_sense_rq(ide_drive_t *drive, void *special)
+{
+       /* deferred failure from ide_prep_sense() */
+       if (!drive->sense_rq_armed) {
+               printk(KERN_WARNING "%s: failed queue sense request\n",
+                      drive->name);
+               return -ENOMEM;
+       }
+
+       drive->sense_rq.special = special;
+       drive->sense_rq_armed = false;
+
+       drive->hwif->rq = NULL;
+
+       elv_add_request(drive->queue, &drive->sense_rq,
+                       ELEVATOR_INSERT_FRONT, 0);
+       return 0;
+}
+EXPORT_SYMBOL_GPL(ide_queue_sense_rq);
+
 /*
  * Called when an error was detected during the last packet command.
- * We queue a request sense packet command in the head of the request list.
+ * We queue a request sense packet command at the head of the request
+ * queue.
  */
-void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk)
+void ide_retry_pc(ide_drive_t *drive)
 {
-       struct request *rq = &drive->request_sense_rq;
+       struct request *sense_rq = &drive->sense_rq;
        struct ide_atapi_pc *pc = &drive->request_sense_pc;
 
        (void)ide_read_error(drive);
-       ide_create_request_sense_cmd(drive, pc);
+
+       /* init pc from sense_rq */
+       ide_init_pc(pc);
+       memcpy(pc->c, sense_rq->cmd, 12);
+       pc->buf = bio_data(sense_rq->bio);      /* pointer to mapped address */
+       pc->req_xfer = sense_rq->data_len;
+
        if (drive->media == ide_tape)
                set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags);
-       ide_queue_pc_head(drive, disk, pc, rq);
+
+       if (ide_queue_sense_rq(drive, pc))
+               ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq));
 }
 EXPORT_SYMBOL_GPL(ide_retry_pc);
 
@@ -276,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
        struct ide_cmd *cmd = &hwif->cmd;
        struct request *rq = hwif->rq;
        const struct ide_tp_ops *tp_ops = hwif->tp_ops;
-       xfer_func_t *xferfunc;
        unsigned int timeout, done;
        u16 bcount;
        u8 stat, ireason, dsc = 0;
@@ -303,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                                        drive->name, rq_data_dir(pc->rq)
                                                     ? "write" : "read");
                        pc->flags |= PC_FLAG_DMA_ERROR;
-               } else {
+               } else
                        pc->xferred = pc->req_xfer;
-                       if (drive->pc_update_buffers)
-                               drive->pc_update_buffers(drive, pc);
-               }
                debug_log("%s: DMA finished\n", drive->name);
        }
 
@@ -343,7 +396,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                        debug_log("[cmd %x]: check condition\n", rq->cmd[0]);
 
                        /* Retry operation */
-                       ide_retry_pc(drive, rq->rq_disk);
+                       ide_retry_pc(drive);
 
                        /* queued, but not started */
                        return ide_stopped;
@@ -353,6 +406,12 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0)
                        dsc = 1;
 
+               /*
+                * ->pc_callback() might change rq->data_len for
+                * residual count, cache total length.
+                */
+               done = blk_rq_bytes(rq);
+
                /* Command finished - Call the callback function */
                uptodate = drive->pc_callback(drive, dsc);
 
@@ -361,7 +420,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
 
                if (blk_special_request(rq)) {
                        rq->errors = 0;
-                       done = blk_rq_bytes(rq);
                        error = 0;
                } else {
 
@@ -370,11 +428,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                                        rq->errors = -EIO;
                        }
 
-                       if (drive->media == ide_tape)
-                               done = ide_rq_bytes(rq); /* FIXME */
-                       else
-                               done = blk_rq_bytes(rq);
-
                        error = uptodate ? 0 : -EIO;
                }
 
@@ -407,21 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive)
                return ide_do_reset(drive);
        }
 
-       xferfunc = write ? tp_ops->output_data : tp_ops->input_data;
-
-       if (drive->media == ide_floppy && pc->buf == NULL) {
-               done = min_t(unsigned int, bcount, cmd->nleft);
-               ide_pio_bytes(drive, cmd, write, done);
-       } else if (drive->media == ide_tape && pc->bh) {
-               done = drive->pc_io_buffers(drive, pc, bcount, write);
-       } else {
-               done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred);
-               xferfunc(drive, NULL, pc->cur_pos, done);
-       }
+       done = min_t(unsigned int, bcount, cmd->nleft);
+       ide_pio_bytes(drive, cmd, write, done);
 
-       /* Update the current position */
+       /* Update transferred byte count */
        pc->xferred += done;
-       pc->cur_pos += done;
 
        bcount -= done;
 
@@ -599,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd)
 
                /* We haven't transferred any data yet */
                pc->xferred = 0;
-               pc->cur_pos = pc->buf;
 
                valid_tf = IDE_VALID_DEVICE;
                bcount = ((drive->media == ide_tape) ?
index 925eb9e245d1e7f0028dbc3cd0e698bce6b98962..a75e4ee1cd17ec932e6c0d9ade9666fca2eecdb6 100644 (file)
@@ -206,54 +206,25 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive,
        ide_cd_log_error(drive->name, failed_command, sense);
 }
 
-static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense,
-                                     struct request *failed_command)
-{
-       struct cdrom_info *info         = drive->driver_data;
-       struct request *rq              = &drive->request_sense_rq;
-
-       ide_debug_log(IDE_DBG_SENSE, "enter");
-
-       if (sense == NULL)
-               sense = &info->sense_data;
-
-       /* stuff the sense request in front of our current request */
-       blk_rq_init(NULL, rq);
-       rq->cmd_type = REQ_TYPE_ATA_PC;
-       rq->rq_disk = info->disk;
-
-       rq->data = sense;
-       rq->cmd[0] = GPCMD_REQUEST_SENSE;
-       rq->cmd[4] = 18;
-       rq->data_len = 18;
-
-       rq->cmd_type = REQ_TYPE_SENSE;
-       rq->cmd_flags |= REQ_PREEMPT;
-
-       /* NOTE! Save the failed command in "rq->buffer" */
-       rq->buffer = (void *) failed_command;
-
-       if (failed_command)
-               ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x",
-                                            failed_command->cmd[0]);
-
-       drive->hwif->rq = NULL;
-
-       elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0);
-}
-
 static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq)
 {
        /*
-        * For REQ_TYPE_SENSE, "rq->buffer" points to the original
-        * failed request
+        * For REQ_TYPE_SENSE, "rq->special" points to the original
+        * failed request.  Also, the sense data should be read
+        * directly from rq which might be different from the original
+        * sense buffer if it got copied during mapping.
         */
-       struct request *failed = (struct request *)rq->buffer;
-       struct cdrom_info *info = drive->driver_data;
-       void *sense = &info->sense_data;
+       struct request *failed = (struct request *)rq->special;
+       void *sense = bio_data(rq->bio);
 
        if (failed) {
                if (failed->sense) {
+                       /*
+                        * Sense is always read into drive->sense_data.
+                        * Copy back if the failed request has its
+                        * sense pointer set.
+                        */
+                       memcpy(failed->sense, sense, 18);
                        sense = failed->sense;
                        failed->sense_len = rq->sense_len;
                }
@@ -428,7 +399,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat)
 
        /* if we got a CHECK_CONDITION status, queue a request sense command */
        if (stat & ATA_ERR)
-               cdrom_queue_request_sense(drive, NULL, NULL);
+               return ide_queue_sense_rq(drive, NULL) ? 2 : 1;
        return 1;
 
 end_request:
@@ -442,8 +413,7 @@ end_request:
 
                hwif->rq = NULL;
 
-               cdrom_queue_request_sense(drive, rq->sense, rq);
-               return 1;
+               return ide_queue_sense_rq(drive, rq) ? 2 : 1;
        } else
                return 2;
 }
@@ -503,14 +473,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd)
         * and some drives don't send them.  Sigh.
         */
        if (rq->cmd[0] == GPCMD_REQUEST_SENSE &&
-           cmd->nleft > 0 && cmd->nleft <= 5) {
-               unsigned int ofs = cmd->nbytes - cmd->nleft;
-
-               while (cmd->nleft > 0) {
-                       *((u8 *)rq->data + ofs++) = 0;
-                       cmd->nleft--;
-               }
-       }
+           cmd->nleft > 0 && cmd->nleft <= 5)
+               cmd->nleft = 0;
 }
 
 int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
@@ -543,8 +507,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd,
                rq->cmd_flags |= cmd_flags;
                rq->timeout = timeout;
                if (buffer) {
-                       rq->data = buffer;
-                       rq->data_len = *bufflen;
+                       error = blk_rq_map_kern(drive->queue, rq, buffer,
+                                               *bufflen, GFP_NOIO);
+                       if (error) {
+                               blk_put_request(rq);
+                               return error;
+                       }
                }
 
                error = blk_execute_rq(drive->queue, info->disk, rq, 0);
@@ -838,15 +806,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq)
        drive->dma = 0;
 
        /* sg request */
-       if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) {
+       if (rq->bio) {
                struct request_queue *q = drive->queue;
+               char *buf = bio_data(rq->bio);
                unsigned int alignment;
-               char *buf;
-
-               if (rq->bio)
-                       buf = bio_data(rq->bio);
-               else
-                       buf = rq->data;
 
                drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA);
 
@@ -896,6 +859,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq,
                goto out_end;
        }
 
+       /* prepare sense request for this command */
+       ide_prep_sense(drive, rq);
+
        memset(&cmd, 0, sizeof(cmd));
 
        if (rq_data_dir(rq))
index 1d97101099ce20bab8e0b319f6da278a09bd9afa..93a3cf1b0f3f8c61b974b9b16a190502f5d4d038 100644 (file)
@@ -87,10 +87,6 @@ struct cdrom_info {
 
        struct atapi_toc *toc;
 
-       /* The result of the last successful request sense command
-          on this device. */
-       struct request_sense sense_data;
-
        u8 max_speed;           /* Max speed of the drive. */
        u8 current_speed;       /* Current speed of the drive. */
 
index a9fbe2c31210cc1400dc2ce132c74211dbd40642..c2438804d3c4a074ae1ea6d95a39a2df325a0869 100644 (file)
@@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq)
        cmd->protocol = ATA_PROT_NODATA;
 
        rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-       rq->cmd_flags |= REQ_SOFTBARRIER;
        rq->special = cmd;
 }
 
index a0b8cab1d9a682249200fce35bc5ea5c8223079f..d9123ecae4a9829bcbee1e1d0f889bf5b20a6fa2 100644 (file)
@@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error)
        /*
         * un-busy drive etc and make sure request is sane
         */
-
        rq = hwif->rq;
-       if (!rq)
-               goto out;
-
-       hwif->rq = NULL;
-
-       rq->errors = 0;
-
-       if (!rq->bio)
-               goto out;
-
-       rq->sector = rq->bio->bi_sector;
-       rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9;
-       rq->hard_cur_sectors = rq->current_nr_sectors;
-       rq->buffer = bio_data(rq->bio);
-out:
+       if (rq) {
+               hwif->rq = NULL;
+               rq->errors = 0;
+       }
        return ret;
 }
 
index 2b4868d95f8b0597de827c36df575cec1375d8f7..537b7c5580339faea334f59f68c425895ed0263e 100644 (file)
@@ -134,13 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive,
        drive->pc = pc;
 
        if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) {
+               unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
                if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR))
                        ide_floppy_report_error(floppy, pc);
+
                /* Giving up */
                pc->error = IDE_DRV_ERROR_GENERAL;
 
                drive->failed_pc = NULL;
                drive->pc_callback(drive, 0);
+               ide_complete_rq(drive, -EIO, done);
                return ide_stopped;
        }
 
@@ -216,15 +220,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy,
        ide_init_pc(pc);
        memcpy(pc->c, rq->cmd, sizeof(pc->c));
        pc->rq = rq;
-       if (rq->data_len && rq_data_dir(rq) == WRITE)
-               pc->flags |= PC_FLAG_WRITING;
-       pc->buf = rq->data;
-       if (rq->bio)
+       if (rq->data_len) {
                pc->flags |= PC_FLAG_DMA_OK;
-       /*
-        * possibly problematic, doesn't look like ide-floppy correctly
-        * handled scattered requests if dma fails...
-        */
+               if (rq_data_dir(rq) == WRITE)
+                       pc->flags |= PC_FLAG_WRITING;
+       }
+       /* pio will be performed by ide_pio_bytes() which handles sg fine */
+       pc->buf = NULL;
        pc->req_xfer = pc->buf_size = rq->data_len;
 }
 
@@ -265,8 +267,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
                }
                pc = &floppy->queued_pc;
                idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block);
-       } else if (blk_special_request(rq)) {
-               pc = (struct ide_atapi_pc *) rq->buffer;
+       } else if (blk_special_request(rq) || blk_sense_request(rq)) {
+               pc = (struct ide_atapi_pc *)rq->special;
        } else if (blk_pc_request(rq)) {
                pc = &floppy->queued_pc;
                idefloppy_blockpc_cmd(floppy, pc, rq);
@@ -275,6 +277,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive,
                goto out_end;
        }
 
+       ide_prep_sense(drive, rq);
+
        memset(&cmd, 0, sizeof(cmd));
 
        if (rq_data_dir(rq))
index 6415a2e2ba87febae0cfa6e9b8342f2dbd190748..41d804065d3868daade9a7af34d6f480e71d3097 100644 (file)
@@ -248,14 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd)
        struct scatterlist *sg = hwif->sg_table;
        struct request *rq = cmd->rq;
 
-       if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) {
-               sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE);
-               cmd->sg_nents = 1;
-       } else if (!rq->bio) {
-               sg_init_one(sg, rq->data, rq->data_len);
-               cmd->sg_nents = 1;
-       } else
-               cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
+       cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg);
 }
 EXPORT_SYMBOL_GPL(ide_map_sg);
 
@@ -371,7 +364,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq)
                if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE)
                        return execute_drive_cmd(drive, rq);
                else if (blk_pm_request(rq)) {
-                       struct request_pm_state *pm = rq->data;
+                       struct request_pm_state *pm = rq->special;
 #ifdef DEBUG_PM
                        printk("%s: start_power_step(step: %d)\n",
                                drive->name, pm->pm_step);
@@ -484,6 +477,9 @@ void do_ide_request(struct request_queue *q)
 
        spin_unlock_irq(q->queue_lock);
 
+       /* HLD do_request() callback might sleep, make sure it's okay */
+       might_sleep();
+
        if (ide_lock_host(host, hwif))
                goto plug_device_2;
 
index c1c25ebbaa1fb3ef6abac4bb720946bfd6af90c6..5991b23793f20ee33f164a2dea752804f7580310 100644 (file)
@@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive)
        rq->cmd_type = REQ_TYPE_SPECIAL;
        rq->cmd_len = 1;
        rq->cmd[0] = REQ_DRIVE_RESET;
-       rq->cmd_flags |= REQ_SOFTBARRIER;
        if (blk_execute_rq(drive->queue, NULL, rq, 1))
                ret = rq->errors;
        blk_put_request(rq);
index 310d03f2b5b793e456305547ffd3d86e036c66ca..a914023d6d035d9e530178ff2422a7bf386cc2cd 100644 (file)
@@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout)
                        start_queue = 1;
                spin_unlock_irq(&hwif->lock);
 
-               if (start_queue) {
-                       spin_lock_irq(q->queue_lock);
-                       blk_start_queueing(q);
-                       spin_unlock_irq(q->queue_lock);
-               }
+               if (start_queue)
+                       blk_run_queue(q);
                return;
        }
        spin_unlock_irq(&hwif->lock);
index 0d8a151c0a01da799d0c68f96b20dc2dbe1928bc..ba1488bd84307bb90850977573d2e80c9778af69 100644 (file)
@@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
        ide_hwif_t *hwif = drive->hwif;
        struct request *rq;
        struct request_pm_state rqpm;
-       struct ide_cmd cmd;
        int ret;
 
        /* call ACPI _GTM only once */
@@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg)
                ide_acpi_get_timing(hwif);
 
        memset(&rqpm, 0, sizeof(rqpm));
-       memset(&cmd, 0, sizeof(cmd));
        rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
        rq->cmd_type = REQ_TYPE_PM_SUSPEND;
-       rq->special = &cmd;
-       rq->data = &rqpm;
+       rq->special = &rqpm;
        rqpm.pm_step = IDE_PM_START_SUSPEND;
        if (mesg.event == PM_EVENT_PRETHAW)
                mesg.event = PM_EVENT_FREEZE;
@@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev)
        ide_hwif_t *hwif = drive->hwif;
        struct request *rq;
        struct request_pm_state rqpm;
-       struct ide_cmd cmd;
        int err;
 
        /* call ACPI _PS0 / _STM only once */
@@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev)
        ide_acpi_exec_tfs(drive);
 
        memset(&rqpm, 0, sizeof(rqpm));
-       memset(&cmd, 0, sizeof(cmd));
        rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
        rq->cmd_type = REQ_TYPE_PM_RESUME;
        rq->cmd_flags |= REQ_PREEMPT;
-       rq->special = &cmd;
-       rq->data = &rqpm;
+       rq->special = &rqpm;
        rqpm.pm_step = IDE_PM_START_RESUME;
        rqpm.pm_state = PM_EVENT_ON;
 
@@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev)
 
 void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 {
-       struct request_pm_state *pm = rq->data;
+       struct request_pm_state *pm = rq->special;
 
 #ifdef DEBUG_PM
        printk(KERN_INFO "%s: complete_power_step(step: %d)\n",
@@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq)
 
 ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
 {
-       struct request_pm_state *pm = rq->data;
-       struct ide_cmd *cmd = rq->special;
-
-       memset(cmd, 0, sizeof(*cmd));
+       struct request_pm_state *pm = rq->special;
+       struct ide_cmd cmd = { };
 
        switch (pm->pm_step) {
        case IDE_PM_FLUSH_CACHE:        /* Suspend step 1 (flush cache) */
@@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
                        return ide_stopped;
                }
                if (ata_id_flush_ext_enabled(drive->id))
-                       cmd->tf.command = ATA_CMD_FLUSH_EXT;
+                       cmd.tf.command = ATA_CMD_FLUSH_EXT;
                else
-                       cmd->tf.command = ATA_CMD_FLUSH;
+                       cmd.tf.command = ATA_CMD_FLUSH;
                goto out_do_tf;
        case IDE_PM_STANDBY:            /* Suspend step 2 (standby) */
-               cmd->tf.command = ATA_CMD_STANDBYNOW1;
+               cmd.tf.command = ATA_CMD_STANDBYNOW1;
                goto out_do_tf;
        case IDE_PM_RESTORE_PIO:        /* Resume step 1 (restore PIO) */
                ide_set_max_pio(drive);
@@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
                        ide_complete_power_step(drive, rq);
                return ide_stopped;
        case IDE_PM_IDLE:               /* Resume step 2 (idle) */
-               cmd->tf.command = ATA_CMD_IDLEIMMEDIATE;
+               cmd.tf.command = ATA_CMD_IDLEIMMEDIATE;
                goto out_do_tf;
        case IDE_PM_RESTORE_DMA:        /* Resume step 3 (restore DMA) */
                /*
@@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq)
        return ide_stopped;
 
 out_do_tf:
-       cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
-       cmd->valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
-       cmd->protocol = ATA_PROT_NODATA;
+       cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE;
+       cmd.valid.in.tf  = IDE_VALID_IN_TF  | IDE_VALID_DEVICE;
+       cmd.protocol = ATA_PROT_NODATA;
 
-       return do_rw_taskfile(drive, cmd);
+       return do_rw_taskfile(drive, &cmd);
 }
 
 /**
@@ -181,7 +173,7 @@ out_do_tf:
 void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 {
        struct request_queue *q = drive->queue;
-       struct request_pm_state *pm = rq->data;
+       struct request_pm_state *pm = rq->special;
        unsigned long flags;
 
        ide_complete_power_step(drive, rq);
@@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq)
 
 void ide_check_pm_state(ide_drive_t *drive, struct request *rq)
 {
-       struct request_pm_state *pm = rq->data;
+       struct request_pm_state *pm = rq->special;
 
        if (blk_pm_suspend_request(rq) &&
            pm->pm_step == IDE_PM_START_SUSPEND)
index 3a53e0834cf798d6bbc7c5fca982d192d1941fd3..203bbeac182f52a52647f9f324efabfe8e004cd3 100644 (file)
@@ -131,13 +131,6 @@ enum {
        IDETAPE_DIR_WRITE = (1 << 2),
 };
 
-struct idetape_bh {
-       u32 b_size;
-       atomic_t b_count;
-       struct idetape_bh *b_reqnext;
-       char *b_data;
-};
-
 /* Tape door status */
 #define DOOR_UNLOCKED                  0
 #define DOOR_LOCKED                    1
@@ -219,18 +212,12 @@ typedef struct ide_tape_obj {
 
        /* Data buffer size chosen based on the tape's recommendation */
        int buffer_size;
-       /* merge buffer */
-       struct idetape_bh *merge_bh;
-       /* size of the merge buffer */
-       int merge_bh_size;
-       /* pointer to current buffer head within the merge buffer */
-       struct idetape_bh *bh;
-       char *b_data;
-       int b_count;
-
-       int pages_per_buffer;
-       /* Wasted space in each stage */
-       int excess_bh_size;
+       /* Staging buffer of buffer_size bytes */
+       void *buf;
+       /* The read/write cursor */
+       void *cur;
+       /* The number of valid bytes in buf */
+       size_t valid;
 
        /* Measures average tape speed */
        unsigned long avg_time;
@@ -297,84 +284,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i)
        return tape;
 }
 
-static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-                                 unsigned int bcount)
-{
-       struct idetape_bh *bh = pc->bh;
-       int count;
-
-       while (bcount) {
-               if (bh == NULL)
-                       break;
-               count = min(
-                       (unsigned int)(bh->b_size - atomic_read(&bh->b_count)),
-                       bcount);
-               drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data +
-                                       atomic_read(&bh->b_count), count);
-               bcount -= count;
-               atomic_add(count, &bh->b_count);
-               if (atomic_read(&bh->b_count) == bh->b_size) {
-                       bh = bh->b_reqnext;
-                       if (bh)
-                               atomic_set(&bh->b_count, 0);
-               }
-       }
-
-       pc->bh = bh;
-
-       return bcount;
-}
-
-static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-                                  unsigned int bcount)
-{
-       struct idetape_bh *bh = pc->bh;
-       int count;
-
-       while (bcount) {
-               if (bh == NULL)
-                       break;
-               count = min((unsigned int)pc->b_count, (unsigned int)bcount);
-               drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count);
-               bcount -= count;
-               pc->b_data += count;
-               pc->b_count -= count;
-               if (!pc->b_count) {
-                       bh = bh->b_reqnext;
-                       pc->bh = bh;
-                       if (bh) {
-                               pc->b_data = bh->b_data;
-                               pc->b_count = atomic_read(&bh->b_count);
-                       }
-               }
-       }
-
-       return bcount;
-}
-
-static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc)
-{
-       struct idetape_bh *bh = pc->bh;
-       int count;
-       unsigned int bcount = pc->xferred;
-
-       if (pc->flags & PC_FLAG_WRITING)
-               return;
-       while (bcount) {
-               if (bh == NULL) {
-                       printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-                                       __func__);
-                       return;
-               }
-               count = min((unsigned int)bh->b_size, (unsigned int)bcount);
-               atomic_set(&bh->b_count, count);
-               if (atomic_read(&bh->b_count) == bh->b_size)
-                       bh = bh->b_reqnext;
-               bcount -= count;
-       }
-       pc->bh = bh;
-}
-
 /*
  * called on each failed packet command retry to analyze the request sense. We
  * currently do not utilize this information.
@@ -392,12 +301,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
                 pc->c[0], tape->sense_key, tape->asc, tape->ascq);
 
        /* Correct pc->xferred by asking the tape.       */
-       if (pc->flags & PC_FLAG_DMA_ERROR) {
+       if (pc->flags & PC_FLAG_DMA_ERROR)
                pc->xferred = pc->req_xfer -
                        tape->blk_size *
                        get_unaligned_be32(&sense[3]);
-               idetape_update_buffers(drive, pc);
-       }
 
        /*
         * If error was the result of a zero-length read or write command,
@@ -436,29 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense)
        }
 }
 
-/* Free data buffers completely. */
-static void ide_tape_kfree_buffer(idetape_tape_t *tape)
-{
-       struct idetape_bh *prev_bh, *bh = tape->merge_bh;
-
-       while (bh) {
-               u32 size = bh->b_size;
-
-               while (size) {
-                       unsigned int order = fls(size >> PAGE_SHIFT)-1;
-
-                       if (bh->b_data)
-                               free_pages((unsigned long)bh->b_data, order);
-
-                       size &= (order-1);
-                       bh->b_data += (1 << order) * PAGE_SIZE;
-               }
-               prev_bh = bh;
-               bh = bh->b_reqnext;
-               kfree(prev_bh);
-       }
-}
-
 static void ide_tape_handle_dsc(ide_drive_t *);
 
 static int ide_tape_callback(ide_drive_t *drive, int dsc)
@@ -496,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc)
                }
 
                tape->first_frame += blocks;
-               rq->current_nr_sectors -= blocks;
+               rq->data_len -= blocks * tape->blk_size;
 
                if (pc->error) {
                        uptodate = 0;
@@ -558,19 +442,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive)
        idetape_postpone_request(drive);
 }
 
-static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc,
-                               unsigned int bcount, int write)
-{
-       unsigned int bleft;
-
-       if (write)
-               bleft = idetape_output_buffers(drive, pc, bcount);
-       else
-               bleft = idetape_input_buffers(drive, pc, bcount);
-
-       return bcount - bleft;
-}
-
 /*
  * Packet Command Interface
  *
@@ -622,6 +493,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
 
        if (pc->retries > IDETAPE_MAX_PC_RETRIES ||
                (pc->flags & PC_FLAG_ABORT)) {
+               unsigned int done = blk_rq_bytes(drive->hwif->rq);
+
                /*
                 * We will "abort" retrying a packet command in case legitimate
                 * error code was received (crossing a filemark, or end of the
@@ -641,8 +514,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive,
                        /* Giving up */
                        pc->error = IDE_DRV_ERROR_GENERAL;
                }
+
                drive->failed_pc = NULL;
                drive->pc_callback(drive, 0);
+               ide_complete_rq(drive, -EIO, done);
                return ide_stopped;
        }
        debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]);
@@ -695,7 +570,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive)
                                printk(KERN_ERR "ide-tape: %s: I/O error, ",
                                                tape->name);
                        /* Retry operation */
-                       ide_retry_pc(drive, tape->disk);
+                       ide_retry_pc(drive);
                        return ide_stopped;
                }
                pc->error = 0;
@@ -711,27 +586,22 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape,
                                   struct ide_atapi_pc *pc, struct request *rq,
                                   u8 opcode)
 {
-       struct idetape_bh *bh = (struct idetape_bh *)rq->special;
-       unsigned int length = rq->current_nr_sectors;
+       unsigned int length = rq->nr_sectors;
 
        ide_init_pc(pc);
        put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]);
        pc->c[1] = 1;
-       pc->bh = bh;
        pc->buf = NULL;
        pc->buf_size = length * tape->blk_size;
        pc->req_xfer = pc->buf_size;
        if (pc->req_xfer == tape->buffer_size)
                pc->flags |= PC_FLAG_DMA_OK;
 
-       if (opcode == READ_6) {
+       if (opcode == READ_6)
                pc->c[0] = READ_6;
-               atomic_set(&bh->b_count, 0);
-       } else if (opcode == WRITE_6) {
+       else if (opcode == WRITE_6) {
                pc->c[0] = WRITE_6;
                pc->flags |= PC_FLAG_WRITING;
-               pc->b_data = bh->b_data;
-               pc->b_count = atomic_read(&bh->b_count);
        }
 
        memcpy(rq->cmd, pc->c, 12);
@@ -747,12 +617,10 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
        struct ide_cmd cmd;
        u8 stat;
 
-       debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu,"
-                       " current_nr_sectors: %u\n",
-                       (unsigned long long)rq->sector, rq->nr_sectors,
-                       rq->current_nr_sectors);
+       debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n",
+                 (unsigned long long)rq->sector, rq->nr_sectors);
 
-       if (!blk_special_request(rq)) {
+       if (!(blk_special_request(rq) || blk_sense_request(rq))) {
                /* We do not support buffer cache originated requests. */
                printk(KERN_NOTICE "ide-tape: %s: Unsupported request in "
                        "request queue (%d)\n", drive->name, rq->cmd_type);
@@ -828,7 +696,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
                goto out;
        }
        if (rq->cmd[13] & REQ_IDETAPE_PC1) {
-               pc = (struct ide_atapi_pc *) rq->buffer;
+               pc = (struct ide_atapi_pc *)rq->special;
                rq->cmd[13] &= ~(REQ_IDETAPE_PC1);
                rq->cmd[13] |= REQ_IDETAPE_PC2;
                goto out;
@@ -840,6 +708,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive,
        BUG();
 
 out:
+       /* prepare sense request for this command */
+       ide_prep_sense(drive, rq);
+
        memset(&cmd, 0, sizeof(cmd));
 
        if (rq_data_dir(rq))
@@ -847,167 +718,10 @@ out:
 
        cmd.rq = rq;
 
-       return ide_tape_issue_pc(drive, &cmd, pc);
-}
-
-/*
- * The function below uses __get_free_pages to allocate a data buffer of size
- * tape->buffer_size (or a bit more). We attempt to combine sequential pages as
- * much as possible.
- *
- * It returns a pointer to the newly allocated buffer, or NULL in case of
- * failure.
- */
-static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape,
-                                                 int full, int clear)
-{
-       struct idetape_bh *prev_bh, *bh, *merge_bh;
-       int pages = tape->pages_per_buffer;
-       unsigned int order, b_allocd;
-       char *b_data = NULL;
-
-       merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-       bh = merge_bh;
-       if (bh == NULL)
-               goto abort;
-
-       order = fls(pages) - 1;
-       bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-       if (!bh->b_data)
-               goto abort;
-       b_allocd = (1 << order) * PAGE_SIZE;
-       pages &= (order-1);
-
-       if (clear)
-               memset(bh->b_data, 0, b_allocd);
-       bh->b_reqnext = NULL;
-       bh->b_size = b_allocd;
-       atomic_set(&bh->b_count, full ? bh->b_size : 0);
-
-       while (pages) {
-               order = fls(pages) - 1;
-               b_data = (char *) __get_free_pages(GFP_KERNEL, order);
-               if (!b_data)
-                       goto abort;
-               b_allocd = (1 << order) * PAGE_SIZE;
-
-               if (clear)
-                       memset(b_data, 0, b_allocd);
-
-               /* newly allocated page frames below buffer header or ...*/
-               if (bh->b_data == b_data + b_allocd) {
-                       bh->b_size += b_allocd;
-                       bh->b_data -= b_allocd;
-                       if (full)
-                               atomic_add(b_allocd, &bh->b_count);
-                       continue;
-               }
-               /* they are above the header */
-               if (b_data == bh->b_data + bh->b_size) {
-                       bh->b_size += b_allocd;
-                       if (full)
-                               atomic_add(b_allocd, &bh->b_count);
-                       continue;
-               }
-               prev_bh = bh;
-               bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL);
-               if (!bh) {
-                       free_pages((unsigned long) b_data, order);
-                       goto abort;
-               }
-               bh->b_reqnext = NULL;
-               bh->b_data = b_data;
-               bh->b_size = b_allocd;
-               atomic_set(&bh->b_count, full ? bh->b_size : 0);
-               prev_bh->b_reqnext = bh;
-
-               pages &= (order-1);
-       }
-
-       bh->b_size -= tape->excess_bh_size;
-       if (full)
-               atomic_sub(tape->excess_bh_size, &bh->b_count);
-       return merge_bh;
-abort:
-       ide_tape_kfree_buffer(tape);
-       return NULL;
-}
+       ide_init_sg_cmd(&cmd, pc->req_xfer);
+       ide_map_sg(drive, &cmd);
 
-static int idetape_copy_stage_from_user(idetape_tape_t *tape,
-                                       const char __user *buf, int n)
-{
-       struct idetape_bh *bh = tape->bh;
-       int count;
-       int ret = 0;
-
-       while (n) {
-               if (bh == NULL) {
-                       printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-                                       __func__);
-                       return 1;
-               }
-               count = min((unsigned int)
-                               (bh->b_size - atomic_read(&bh->b_count)),
-                               (unsigned int)n);
-               if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf,
-                               count))
-                       ret = 1;
-               n -= count;
-               atomic_add(count, &bh->b_count);
-               buf += count;
-               if (atomic_read(&bh->b_count) == bh->b_size) {
-                       bh = bh->b_reqnext;
-                       if (bh)
-                               atomic_set(&bh->b_count, 0);
-               }
-       }
-       tape->bh = bh;
-       return ret;
-}
-
-static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf,
-                                     int n)
-{
-       struct idetape_bh *bh = tape->bh;
-       int count;
-       int ret = 0;
-
-       while (n) {
-               if (bh == NULL) {
-                       printk(KERN_ERR "ide-tape: bh == NULL in %s\n",
-                                       __func__);
-                       return 1;
-               }
-               count = min(tape->b_count, n);
-               if  (copy_to_user(buf, tape->b_data, count))
-                       ret = 1;
-               n -= count;
-               tape->b_data += count;
-               tape->b_count -= count;
-               buf += count;
-               if (!tape->b_count) {
-                       bh = bh->b_reqnext;
-                       tape->bh = bh;
-                       if (bh) {
-                               tape->b_data = bh->b_data;
-                               tape->b_count = atomic_read(&bh->b_count);
-                       }
-               }
-       }
-       return ret;
-}
-
-static void idetape_init_merge_buffer(idetape_tape_t *tape)
-{
-       struct idetape_bh *bh = tape->merge_bh;
-       tape->bh = tape->merge_bh;
-
-       if (tape->chrdev_dir == IDETAPE_DIR_WRITE)
-               atomic_set(&bh->b_count, 0);
-       else {
-               tape->b_data = bh->b_data;
-               tape->b_count = atomic_read(&bh->b_count);
-       }
+       return ide_tape_issue_pc(drive, &cmd, pc);
 }
 
 /*
@@ -1107,10 +821,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive)
                return;
 
        clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags);
-       tape->merge_bh_size = 0;
-       if (tape->merge_bh != NULL) {
-               ide_tape_kfree_buffer(tape);
-               tape->merge_bh = NULL;
+       tape->valid = 0;
+       if (tape->buf != NULL) {
+               kfree(tape->buf);
+               tape->buf = NULL;
        }
 
        tape->chrdev_dir = IDETAPE_DIR_NONE;
@@ -1164,36 +878,44 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive,
  * Generate a read/write request for the block device interface and wait for it
  * to be serviced.
  */
-static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks,
-                                struct idetape_bh *bh)
+static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size)
 {
        idetape_tape_t *tape = drive->driver_data;
        struct request *rq;
-       int ret, errors;
+       int ret;
 
        debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd);
+       BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE);
+       BUG_ON(size < 0 || size % tape->blk_size);
 
        rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
        rq->cmd_type = REQ_TYPE_SPECIAL;
        rq->cmd[13] = cmd;
        rq->rq_disk = tape->disk;
-       rq->special = (void *)bh;
        rq->sector = tape->first_frame;
-       rq->nr_sectors = blocks;
-       rq->current_nr_sectors = blocks;
-       blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-       errors = rq->errors;
-       ret = tape->blk_size * (blocks - rq->current_nr_sectors);
-       blk_put_request(rq);
+       if (size) {
+               ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size,
+                                     __GFP_WAIT);
+               if (ret)
+                       goto out_put;
+       }
 
-       if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0)
-               return 0;
+       blk_execute_rq(drive->queue, tape->disk, rq, 0);
 
-       if (tape->merge_bh)
-               idetape_init_merge_buffer(tape);
-       if (errors == IDE_DRV_ERROR_GENERAL)
-               return -EIO;
+       /* calculate the number of transferred bytes and update buffer state */
+       size -= rq->data_len;
+       tape->cur = tape->buf;
+       if (cmd == REQ_IDETAPE_READ)
+               tape->valid = size;
+       else
+               tape->valid = 0;
+
+       ret = size;
+       if (rq->errors == IDE_DRV_ERROR_GENERAL)
+               ret = -EIO;
+out_put:
+       blk_put_request(rq);
        return ret;
 }
 
@@ -1230,153 +952,87 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd)
        pc->flags |= PC_FLAG_WAIT_FOR_DSC;
 }
 
-/* Queue up a character device originated write request. */
-static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks)
-{
-       idetape_tape_t *tape = drive->driver_data;
-
-       debug_log(DBG_CHRDEV, "Enter %s\n", __func__);
-
-       return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
-                                    blocks, tape->merge_bh);
-}
-
 static void ide_tape_flush_merge_buffer(ide_drive_t *drive)
 {
        idetape_tape_t *tape = drive->driver_data;
-       int blocks, min;
-       struct idetape_bh *bh;
 
        if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
                printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer"
                                " but we are not writing.\n");
                return;
        }
-       if (tape->merge_bh_size > tape->buffer_size) {
-               printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n");
-               tape->merge_bh_size = tape->buffer_size;
-       }
-       if (tape->merge_bh_size) {
-               blocks = tape->merge_bh_size / tape->blk_size;
-               if (tape->merge_bh_size % tape->blk_size) {
-                       unsigned int i;
-
-                       blocks++;
-                       i = tape->blk_size - tape->merge_bh_size %
-                               tape->blk_size;
-                       bh = tape->bh->b_reqnext;
-                       while (bh) {
-                               atomic_set(&bh->b_count, 0);
-                               bh = bh->b_reqnext;
-                       }
-                       bh = tape->bh;
-                       while (i) {
-                               if (bh == NULL) {
-                                       printk(KERN_INFO "ide-tape: bug,"
-                                                        " bh NULL\n");
-                                       break;
-                               }
-                               min = min(i, (unsigned int)(bh->b_size -
-                                               atomic_read(&bh->b_count)));
-                               memset(bh->b_data + atomic_read(&bh->b_count),
-                                               0, min);
-                               atomic_add(min, &bh->b_count);
-                               i -= min;
-                               bh = bh->b_reqnext;
-                       }
-               }
-               (void) idetape_add_chrdev_write_request(drive, blocks);
-               tape->merge_bh_size = 0;
-       }
-       if (tape->merge_bh != NULL) {
-               ide_tape_kfree_buffer(tape);
-               tape->merge_bh = NULL;
+       if (tape->buf) {
+               size_t aligned = roundup(tape->valid, tape->blk_size);
+
+               memset(tape->cur, 0, aligned - tape->valid);
+               idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned);
+               kfree(tape->buf);
+               tape->buf = NULL;
        }
        tape->chrdev_dir = IDETAPE_DIR_NONE;
 }
 
-static int idetape_init_read(ide_drive_t *drive)
+static int idetape_init_rw(ide_drive_t *drive, int dir)
 {
        idetape_tape_t *tape = drive->driver_data;
-       int bytes_read;
+       int rc;
 
-       /* Initialize read operation */
-       if (tape->chrdev_dir != IDETAPE_DIR_READ) {
-               if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
-                       ide_tape_flush_merge_buffer(drive);
-                       idetape_flush_tape_buffers(drive);
-               }
-               if (tape->merge_bh || tape->merge_bh_size) {
-                       printk(KERN_ERR "ide-tape: merge_bh_size should be"
-                                        " 0 now\n");
-                       tape->merge_bh_size = 0;
-               }
-               tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
-               if (!tape->merge_bh)
-                       return -ENOMEM;
-               tape->chrdev_dir = IDETAPE_DIR_READ;
+       BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE);
 
-               /*
-                * Issue a read 0 command to ensure that DSC handshake is
-                * switched from completion mode to buffer available mode.
-                * No point in issuing this if DSC overlap isn't supported, some
-                * drives (Seagate STT3401A) will return an error.
-                */
-               if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-                       bytes_read = idetape_queue_rw_tail(drive,
-                                                       REQ_IDETAPE_READ, 0,
-                                                       tape->merge_bh);
-                       if (bytes_read < 0) {
-                               ide_tape_kfree_buffer(tape);
-                               tape->merge_bh = NULL;
-                               tape->chrdev_dir = IDETAPE_DIR_NONE;
-                               return bytes_read;
-                       }
-               }
-       }
+       if (tape->chrdev_dir == dir)
+               return 0;
 
-       return 0;
-}
+       if (tape->chrdev_dir == IDETAPE_DIR_READ)
+               ide_tape_discard_merge_buffer(drive, 1);
+       else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) {
+               ide_tape_flush_merge_buffer(drive);
+               idetape_flush_tape_buffers(drive);
+       }
 
-/* called from idetape_chrdev_read() to service a chrdev read request. */
-static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks)
-{
-       idetape_tape_t *tape = drive->driver_data;
+       if (tape->buf || tape->valid) {
+               printk(KERN_ERR "ide-tape: valid should be 0 now\n");
+               tape->valid = 0;
+       }
 
-       debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks);
+       tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+       if (!tape->buf)
+               return -ENOMEM;
+       tape->chrdev_dir = dir;
+       tape->cur = tape->buf;
 
-       /* If we are at a filemark, return a read length of 0 */
-       if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
-               return 0;
-
-       idetape_init_read(drive);
+       /*
+        * Issue a 0 rw command to ensure that DSC handshake is
+        * switched from completion mode to buffer available mode.  No
+        * point in issuing this if DSC overlap isn't supported, some
+        * drives (Seagate STT3401A) will return an error.
+        */
+       if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
+               int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ
+                                                 : REQ_IDETAPE_WRITE;
+
+               rc = idetape_queue_rw_tail(drive, cmd, 0);
+               if (rc < 0) {
+                       kfree(tape->buf);
+                       tape->buf = NULL;
+                       tape->chrdev_dir = IDETAPE_DIR_NONE;
+                       return rc;
+               }
+       }
 
-       return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks,
-                                    tape->merge_bh);
+       return 0;
 }
 
 static void idetape_pad_zeros(ide_drive_t *drive, int bcount)
 {
        idetape_tape_t *tape = drive->driver_data;
-       struct idetape_bh *bh;
-       int blocks;
+
+       memset(tape->buf, 0, tape->buffer_size);
 
        while (bcount) {
-               unsigned int count;
+               unsigned int count = min(tape->buffer_size, bcount);
 
-               bh = tape->merge_bh;
-               count = min(tape->buffer_size, bcount);
+               idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count);
                bcount -= count;
-               blocks = count / tape->blk_size;
-               while (count) {
-                       atomic_set(&bh->b_count,
-                                  min(count, (unsigned int)bh->b_size));
-                       memset(bh->b_data, 0, atomic_read(&bh->b_count));
-                       count -= atomic_read(&bh->b_count);
-                       bh = bh->b_reqnext;
-               }
-               idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks,
-                                     tape->merge_bh);
        }
 }
 
@@ -1456,7 +1112,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op,
        }
 
        if (tape->chrdev_dir == IDETAPE_DIR_READ) {
-               tape->merge_bh_size = 0;
+               tape->valid = 0;
                if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
                        ++count;
                ide_tape_discard_merge_buffer(drive, 0);
@@ -1505,9 +1161,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
 {
        struct ide_tape_obj *tape = file->private_data;
        ide_drive_t *drive = tape->drive;
-       ssize_t bytes_read, temp, actually_read = 0, rc;
+       size_t done = 0;
        ssize_t ret = 0;
-       u16 ctl = *(u16 *)&tape->caps[12];
+       int rc;
 
        debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
@@ -1517,49 +1173,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf,
                            (count % tape->blk_size) == 0)
                                tape->user_bs_factor = count / tape->blk_size;
        }
-       rc = idetape_init_read(drive);
+
+       rc = idetape_init_rw(drive, IDETAPE_DIR_READ);
        if (rc < 0)
                return rc;
-       if (count == 0)
-               return (0);
-       if (tape->merge_bh_size) {
-               actually_read = min((unsigned int)(tape->merge_bh_size),
-                                   (unsigned int)count);
-               if (idetape_copy_stage_to_user(tape, buf, actually_read))
-                       ret = -EFAULT;
-               buf += actually_read;
-               tape->merge_bh_size -= actually_read;
-               count -= actually_read;
-       }
-       while (count >= tape->buffer_size) {
-               bytes_read = idetape_add_chrdev_read_request(drive, ctl);
-               if (bytes_read <= 0)
-                       goto finish;
-               if (idetape_copy_stage_to_user(tape, buf, bytes_read))
-                       ret = -EFAULT;
-               buf += bytes_read;
-               count -= bytes_read;
-               actually_read += bytes_read;
-       }
-       if (count) {
-               bytes_read = idetape_add_chrdev_read_request(drive, ctl);
-               if (bytes_read <= 0)
-                       goto finish;
-               temp = min((unsigned long)count, (unsigned long)bytes_read);
-               if (idetape_copy_stage_to_user(tape, buf, temp))
+
+       while (done < count) {
+               size_t todo;
+
+               /* refill if staging buffer is empty */
+               if (!tape->valid) {
+                       /* If we are at a filemark, nothing more to read */
+                       if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags))
+                               break;
+                       /* read */
+                       if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ,
+                                                 tape->buffer_size) <= 0)
+                               break;
+               }
+
+               /* copy out */
+               todo = min_t(size_t, count - done, tape->valid);
+               if (copy_to_user(buf + done, tape->cur, todo))
                        ret = -EFAULT;
-               actually_read += temp;
-               tape->merge_bh_size = bytes_read-temp;
+
+               tape->cur += todo;
+               tape->valid -= todo;
+               done += todo;
        }
-finish:
-       if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
+
+       if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) {
                debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name);
 
                idetape_space_over_filemarks(drive, MTFSF, 1);
                return 0;
        }
 
-       return ret ? ret : actually_read;
+       return ret ? ret : done;
 }
 
 static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
@@ -1567,9 +1217,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
 {
        struct ide_tape_obj *tape = file->private_data;
        ide_drive_t *drive = tape->drive;
-       ssize_t actually_written = 0;
+       size_t done = 0;
        ssize_t ret = 0;
-       u16 ctl = *(u16 *)&tape->caps[12];
+       int rc;
 
        /* The drive is write protected. */
        if (tape->write_prot)
@@ -1578,80 +1228,31 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf,
        debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count);
 
        /* Initialize write operation */
-       if (tape->chrdev_dir != IDETAPE_DIR_WRITE) {
-               if (tape->chrdev_dir == IDETAPE_DIR_READ)
-                       ide_tape_discard_merge_buffer(drive, 1);
-               if (tape->merge_bh || tape->merge_bh_size) {
-                       printk(KERN_ERR "ide-tape: merge_bh_size "
-                               "should be 0 now\n");
-                       tape->merge_bh_size = 0;
-               }
-               tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0);
-               if (!tape->merge_bh)
-                       return -ENOMEM;
-               tape->chrdev_dir = IDETAPE_DIR_WRITE;
-               idetape_init_merge_buffer(tape);
+       rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE);
+       if (rc < 0)
+               return rc;
 
-               /*
-                * Issue a write 0 command to ensure that DSC handshake is
-                * switched from completion mode to buffer available mode. No
-                * point in issuing this if DSC overlap isn't supported, some
-                * drives (Seagate STT3401A) will return an error.
-                */
-               if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) {
-                       ssize_t retval = idetape_queue_rw_tail(drive,
-                                                       REQ_IDETAPE_WRITE, 0,
-                                                       tape->merge_bh);
-                       if (retval < 0) {
-                               ide_tape_kfree_buffer(tape);
-                               tape->merge_bh = NULL;
-                               tape->chrdev_dir = IDETAPE_DIR_NONE;
-                               return retval;
-                       }
-               }
-       }
-       if (count == 0)
-               return (0);
-       if (tape->merge_bh_size) {
-               if (tape->merge_bh_size >= tape->buffer_size) {
-                       printk(KERN_ERR "ide-tape: bug: merge buf too big\n");
-                       tape->merge_bh_size = 0;
-               }
-               actually_written = min((unsigned int)
-                               (tape->buffer_size - tape->merge_bh_size),
-                               (unsigned int)count);
-               if (idetape_copy_stage_from_user(tape, buf, actually_written))
-                               ret = -EFAULT;
-               buf += actually_written;
-               tape->merge_bh_size += actually_written;
-               count -= actually_written;
-
-               if (tape->merge_bh_size == tape->buffer_size) {
-                       ssize_t retval;
-                       tape->merge_bh_size = 0;
-                       retval = idetape_add_chrdev_write_request(drive, ctl);
-                       if (retval <= 0)
-                               return (retval);
-               }
-       }
-       while (count >= tape->buffer_size) {
-               ssize_t retval;
-               if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size))
-                       ret = -EFAULT;
-               buf += tape->buffer_size;
-               count -= tape->buffer_size;
-               retval = idetape_add_chrdev_write_request(drive, ctl);
-               actually_written += tape->buffer_size;
-               if (retval <= 0)
-                       return (retval);
-       }
-       if (count) {
-               actually_written += count;
-               if (idetape_copy_stage_from_user(tape, buf, count))
+       while (done < count) {
+               size_t todo;
+
+               /* flush if staging buffer is full */
+               if (tape->valid == tape->buffer_size &&
+                   idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE,
+                                         tape->buffer_size) <= 0)
+                       return rc;
+
+               /* copy in */
+               todo = min_t(size_t, count - done,
+                            tape->buffer_size - tape->valid);
+               if (copy_from_user(tape->cur, buf + done, todo))
                        ret = -EFAULT;
-               tape->merge_bh_size += count;
+
+               tape->cur += todo;
+               tape->valid += todo;
+               done += todo;
        }
-       return ret ? ret : actually_written;
+
+       return ret ? ret : done;
 }
 
 static int idetape_write_filemark(ide_drive_t *drive)
@@ -1812,7 +1413,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file,
                idetape_flush_tape_buffers(drive);
        }
        if (cmd == MTIOCGET || cmd == MTIOCPOS) {
-               block_offset = tape->merge_bh_size /
+               block_offset = tape->valid /
                        (tape->blk_size * tape->user_bs_factor);
                position = idetape_read_position(drive);
                if (position < 0)
@@ -1960,12 +1561,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor)
        idetape_tape_t *tape = drive->driver_data;
 
        ide_tape_flush_merge_buffer(drive);
-       tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0);
-       if (tape->merge_bh != NULL) {
+       tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL);
+       if (tape->buf != NULL) {
                idetape_pad_zeros(drive, tape->blk_size *
                                (tape->user_bs_factor - 1));
-               ide_tape_kfree_buffer(tape);
-               tape->merge_bh = NULL;
+               kfree(tape->buf);
+               tape->buf = NULL;
        }
        idetape_write_filemark(drive);
        idetape_flush_tape_buffers(drive);
@@ -2159,8 +1760,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
        u16 *ctl = (u16 *)&tape->caps[12];
 
        drive->pc_callback       = ide_tape_callback;
-       drive->pc_update_buffers = idetape_update_buffers;
-       drive->pc_io_buffers     = ide_tape_io_buffers;
 
        drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP;
 
@@ -2191,11 +1790,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor)
                tape->buffer_size = *ctl * tape->blk_size;
        }
        buffer_size = tape->buffer_size;
-       tape->pages_per_buffer = buffer_size / PAGE_SIZE;
-       if (buffer_size % PAGE_SIZE) {
-               tape->pages_per_buffer++;
-               tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE;
-       }
 
        /* select the "best" DSC read/write polling freq */
        speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]);
@@ -2238,7 +1832,7 @@ static void ide_tape_release(struct device *dev)
        ide_drive_t *drive = tape->drive;
        struct gendisk *g = tape->disk;
 
-       BUG_ON(tape->merge_bh_size);
+       BUG_ON(tape->valid);
 
        drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP;
        drive->driver_data = NULL;
index 4aa6223c11bea0ee138fec32dfd29fbf7facd30e..f400eb4d4aff73414a2e7b2d0e9739ce0969de46 100644 (file)
@@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
 
        rq = blk_get_request(drive->queue, READ, __GFP_WAIT);
        rq->cmd_type = REQ_TYPE_ATA_TASKFILE;
-       rq->buffer = buf;
+
+       if (cmd->tf_flags & IDE_TFLAG_WRITE)
+               rq->cmd_flags |= REQ_RW;
 
        /*
         * (ks) We transfer currently only whole sectors.
@@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf,
         * if we would find a solution to transfer any size.
         * To support special commands like READ LONG.
         */
-       rq->hard_nr_sectors = rq->nr_sectors = nsect;
-       rq->hard_cur_sectors = rq->current_nr_sectors = nsect;
-
-       if (cmd->tf_flags & IDE_TFLAG_WRITE)
-               rq->cmd_flags |= REQ_RW;
+       if (nsect) {
+               error = blk_rq_map_kern(drive->queue, rq, buf,
+                                       nsect * SECTOR_SIZE, __GFP_WAIT);
+               if (error)
+                       goto put_req;
+       }
 
        rq->special = cmd;
        cmd->rq = rq;
 
        error = blk_execute_rq(drive->queue, NULL, rq, 0);
-       blk_put_request(rq);
 
+put_req:
+       blk_put_request(rq);
        return error;
 }
 
index 424f7b048c304e8f2a4c7da7e325abbbd1859b53..3fd8b1e65483da53070f0fe633e82843ac1a5ba0 100644 (file)
@@ -20,7 +20,8 @@
 #include <linux/idr.h>
 #include <linux/hdreg.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
+
+#include <trace/events/block.h>
 
 #define DM_MSG_PREFIX "core"
 
@@ -53,8 +54,6 @@ struct dm_target_io {
        union map_info info;
 };
 
-DEFINE_TRACE(block_bio_complete);
-
 /*
  * For request-based dm.
  * One of these is allocated per request.
@@ -656,8 +655,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone,
                /* the bio has been remapped so dispatch it */
 
                trace_block_remap(bdev_get_queue(clone->bi_bdev), clone,
-                                   tio->io->bio->bi_bdev->bd_dev,
-                                   clone->bi_sector, sector);
+                                   tio->io->bio->bi_bdev->bd_dev, sector);
 
                generic_make_request(clone);
        } else if (r < 0 || r == DM_MAPIO_REQUEUE) {
index 73348c4047e98d249a9a576b87f2e8d44edaf623..4a9cc92d4d1867469dcae3efed4c468b251ff7f8 100644 (file)
@@ -702,7 +702,7 @@ static unsigned int iosapic_startup_irq(unsigned int irq)
 }
 
 #ifdef CONFIG_SMP
-static void iosapic_set_affinity_irq(unsigned int irq,
+static int iosapic_set_affinity_irq(unsigned int irq,
                                     const struct cpumask *dest)
 {
        struct vector_info *vi = iosapic_get_vector(irq);
@@ -712,7 +712,7 @@ static void iosapic_set_affinity_irq(unsigned int irq,
 
        dest_cpu = cpu_check_affinity(irq, dest);
        if (dest_cpu < 0)
-               return;
+               return -1;
 
        cpumask_copy(irq_desc[irq].affinity, cpumask_of(dest_cpu));
        vi->txn_addr = txn_affinity_addr(irq, dest_cpu);
@@ -724,6 +724,8 @@ static void iosapic_set_affinity_irq(unsigned int irq,
        iosapic_set_irt_data(vi, &dummy_d0, &d1);
        iosapic_wr_irt_entry(vi, d0, d1);
        spin_unlock_irqrestore(&iosapic_lock, flags);
+
+       return 0;
 }
 #endif
 
index 4e63cc9e277827bfe74db6a778471dc92196cc25..151bf5bc8afe968d196702bad526c38b2797a4f8 100644 (file)
@@ -1,5 +1,5 @@
 /* Low-level parallel-port routines for 8255-based PC-style hardware.
- * 
+ *
  * Authors: Phil Blundell <philb@gnu.org>
  *          Tim Waugh <tim@cyberelk.demon.co.uk>
  *         Jose Renau <renau@acm.org>
@@ -11,7 +11,7 @@
  * Cleaned up include files - Russell King <linux@arm.uk.linux.org>
  * DMA support - Bert De Jonghe <bert@sophis.be>
  * Many ECP bugs fixed.  Fred Barnes & Jamie Lokier, 1999
- * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G. 
+ * More PCI support now conditional on CONFIG_PCI, 03/2001, Paul G.
  * Various hacks, Fred Barnes, 04/2001
  * Updated probing logic - Adam Belay <ambx1@neo.rr.com>
  */
 #include <linux/pnp.h>
 #include <linux/platform_device.h>
 #include <linux/sysctl.h>
+#include <linux/io.h>
+#include <linux/uaccess.h>
 
-#include <asm/io.h>
 #include <asm/dma.h>
-#include <asm/uaccess.h>
 
 #include <linux/parport.h>
 #include <linux/parport_pc.h>
@@ -82,7 +82,7 @@
 #define ECR_TST 06
 #define ECR_CNF 07
 #define ECR_MODE_MASK 0xe0
-#define ECR_WRITE(p,v) frob_econtrol((p),0xff,(v))
+#define ECR_WRITE(p, v) frob_econtrol((p), 0xff, (v))
 
 #undef DEBUG
 
@@ -109,27 +109,27 @@ static int pci_registered_parport;
 static int pnp_registered_parport;
 
 /* frob_control, but for ECR */
-static void frob_econtrol (struct parport *pb, unsigned char m,
+static void frob_econtrol(struct parport *pb, unsigned char m,
                           unsigned char v)
 {
        unsigned char ectr = 0;
 
        if (m != 0xff)
-               ectr = inb (ECONTROL (pb));
+               ectr = inb(ECONTROL(pb));
 
-       DPRINTK (KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
+       DPRINTK(KERN_DEBUG "frob_econtrol(%02x,%02x): %02x -> %02x\n",
                m, v, ectr, (ectr & ~m) ^ v);
 
-       outb ((ectr & ~m) ^ v, ECONTROL (pb));
+       outb((ectr & ~m) ^ v, ECONTROL(pb));
 }
 
-static __inline__ void frob_set_mode (struct parport *p, int mode)
+static inline void frob_set_mode(struct parport *p, int mode)
 {
-       frob_econtrol (p, ECR_MODE_MASK, mode << 5);
+       frob_econtrol(p, ECR_MODE_MASK, mode << 5);
 }
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-/* Safely change the mode bits in the ECR 
+/* Safely change the mode bits in the ECR
    Returns:
            0    : Success
           -EBUSY: Could not drain FIFO in some finite amount of time,
@@ -141,17 +141,18 @@ static int change_mode(struct parport *p, int m)
        unsigned char oecr;
        int mode;
 
-       DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n",m);
+       DPRINTK(KERN_INFO "parport change_mode ECP-ISA to mode 0x%02x\n", m);
 
        if (!priv->ecr) {
-               printk (KERN_DEBUG "change_mode: but there's no ECR!\n");
+               printk(KERN_DEBUG "change_mode: but there's no ECR!\n");
                return 0;
        }
 
        /* Bits <7:5> contain the mode. */
-       oecr = inb (ECONTROL (p));
+       oecr = inb(ECONTROL(p));
        mode = (oecr >> 5) & 0x7;
-       if (mode == m) return 0;
+       if (mode == m)
+               return 0;
 
        if (mode >= 2 && !(priv->ctr & 0x20)) {
                /* This mode resets the FIFO, so we may
@@ -163,19 +164,21 @@ static int change_mode(struct parport *p, int m)
                case ECR_ECP: /* ECP Parallel Port mode */
                        /* Busy wait for 200us */
                        for (counter = 0; counter < 40; counter++) {
-                               if (inb (ECONTROL (p)) & 0x01)
+                               if (inb(ECONTROL(p)) & 0x01)
+                                       break;
+                               if (signal_pending(current))
                                        break;
-                               if (signal_pending (current)) break;
-                               udelay (5);
+                               udelay(5);
                        }
 
                        /* Poll slowly. */
-                       while (!(inb (ECONTROL (p)) & 0x01)) {
-                               if (time_after_eq (jiffies, expire))
+                       while (!(inb(ECONTROL(p)) & 0x01)) {
+                               if (time_after_eq(jiffies, expire))
                                        /* The FIFO is stuck. */
                                        return -EBUSY;
-                               schedule_timeout_interruptible(msecs_to_jiffies(10));
-                               if (signal_pending (current))
+                               schedule_timeout_interruptible(
+                                                       msecs_to_jiffies(10));
+                               if (signal_pending(current))
                                        break;
                        }
                }
@@ -185,20 +188,20 @@ static int change_mode(struct parport *p, int m)
                /* We have to go through mode 001 */
                oecr &= ~(7 << 5);
                oecr |= ECR_PS2 << 5;
-               ECR_WRITE (p, oecr);
+               ECR_WRITE(p, oecr);
        }
 
        /* Set the mode. */
        oecr &= ~(7 << 5);
        oecr |= m << 5;
-       ECR_WRITE (p, oecr);
+       ECR_WRITE(p, oecr);
        return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
 /* Find FIFO lossage; FIFO is reset */
 #if 0
-static int get_fifo_residue (struct parport *p)
+static int get_fifo_residue(struct parport *p)
 {
        int residue;
        int cnfga;
@@ -206,26 +209,26 @@ static int get_fifo_residue (struct parport *p)
 
        /* Adjust for the contents of the FIFO. */
        for (residue = priv->fifo_depth; ; residue--) {
-               if (inb (ECONTROL (p)) & 0x2)
+               if (inb(ECONTROL(p)) & 0x2)
                                /* Full up. */
                        break;
 
-               outb (0, FIFO (p));
+               outb(0, FIFO(p));
        }
 
-       printk (KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
+       printk(KERN_DEBUG "%s: %d PWords were left in FIFO\n", p->name,
                residue);
 
        /* Reset the FIFO. */
-       frob_set_mode (p, ECR_PS2);
+       frob_set_mode(p, ECR_PS2);
 
        /* Now change to config mode and clean up. FIXME */
-       frob_set_mode (p, ECR_CNF);
-       cnfga = inb (CONFIGA (p));
-       printk (KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
+       frob_set_mode(p, ECR_CNF);
+       cnfga = inb(CONFIGA(p));
+       printk(KERN_DEBUG "%s: cnfgA contains 0x%02x\n", p->name, cnfga);
 
        if (!(cnfga & (1<<2))) {
-               printk (KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
+               printk(KERN_DEBUG "%s: Accounting for extra byte\n", p->name);
                residue++;
        }
 
@@ -233,9 +236,11 @@ static int get_fifo_residue (struct parport *p)
         * PWord != 1 byte. */
 
        /* Back to PS2 mode. */
-       frob_set_mode (p, ECR_PS2);
+       frob_set_mode(p, ECR_PS2);
 
-       DPRINTK (KERN_DEBUG "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n", inb (ECONTROL (p)));
+       DPRINTK(KERN_DEBUG
+            "*** get_fifo_residue: done residue collecting (ecr = 0x%2.2x)\n",
+                                                       inb(ECONTROL(p)));
        return residue;
 }
 #endif  /*  0 */
@@ -257,8 +262,8 @@ static int clear_epp_timeout(struct parport *pb)
        /* To clear timeout some chips require double read */
        parport_pc_read_status(pb);
        r = parport_pc_read_status(pb);
-       outb (r | 0x01, STATUS (pb)); /* Some reset by writing 1 */
-       outb (r & 0xfe, STATUS (pb)); /* Others by writing 0 */
+       outb(r | 0x01, STATUS(pb)); /* Some reset by writing 1 */
+       outb(r & 0xfe, STATUS(pb)); /* Others by writing 0 */
        r = parport_pc_read_status(pb);
 
        return !(r & 0x01);
@@ -272,7 +277,8 @@ static int clear_epp_timeout(struct parport *pb)
  * of these are in parport_pc.h.
  */
 
-static void parport_pc_init_state(struct pardevice *dev, struct parport_state *s)
+static void parport_pc_init_state(struct pardevice *dev,
+                                               struct parport_state *s)
 {
        s->u.pc.ctr = 0xc;
        if (dev->irq_func &&
@@ -289,22 +295,23 @@ static void parport_pc_save_state(struct parport *p, struct parport_state *s)
        const struct parport_pc_private *priv = p->physport->private_data;
        s->u.pc.ctr = priv->ctr;
        if (priv->ecr)
-               s->u.pc.ecr = inb (ECONTROL (p));
+               s->u.pc.ecr = inb(ECONTROL(p));
 }
 
-static void parport_pc_restore_state(struct parport *p, struct parport_state *s)
+static void parport_pc_restore_state(struct parport *p,
+                                               struct parport_state *s)
 {
        struct parport_pc_private *priv = p->physport->private_data;
        register unsigned char c = s->u.pc.ctr & priv->ctr_writable;
-       outb (c, CONTROL (p));
+       outb(c, CONTROL(p));
        priv->ctr = c;
        if (priv->ecr)
-               ECR_WRITE (p, s->u.pc.ecr);
+               ECR_WRITE(p, s->u.pc.ecr);
 }
 
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
-                                       size_t length, int flags)
+static size_t parport_pc_epp_read_data(struct parport *port, void *buf,
+                                      size_t length, int flags)
 {
        size_t got = 0;
 
@@ -316,54 +323,52 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
                 *  nFault is 0 if there is at least 1 byte in the Warp's FIFO
                 *  pError is 1 if there are 16 bytes in the Warp's FIFO
                 */
-               status = inb (STATUS (port));
+               status = inb(STATUS(port));
 
-               while (!(status & 0x08) && (got < length)) {
-                       if ((left >= 16) && (status & 0x20) && !(status & 0x08)) {
+               while (!(status & 0x08) && got < length) {
+                       if (left >= 16 && (status & 0x20) && !(status & 0x08)) {
                                /* can grab 16 bytes from warp fifo */
-                               if (!((long)buf & 0x03)) {
-                                       insl (EPPDATA (port), buf, 4);
-                               } else {
-                                       insb (EPPDATA (port), buf, 16);
-                               }
+                               if (!((long)buf & 0x03))
+                                       insl(EPPDATA(port), buf, 4);
+                               else
+                                       insb(EPPDATA(port), buf, 16);
                                buf += 16;
                                got += 16;
                                left -= 16;
                        } else {
                                /* grab single byte from the warp fifo */
-                               *((char *)buf) = inb (EPPDATA (port));
+                               *((char *)buf) = inb(EPPDATA(port));
                                buf++;
                                got++;
                                left--;
                        }
-                       status = inb (STATUS (port));
+                       status = inb(STATUS(port));
                        if (status & 0x01) {
                                /* EPP timeout should never occur... */
-                               printk (KERN_DEBUG "%s: EPP timeout occurred while talking to "
-                                       "w91284pic (should not have done)\n", port->name);
-                               clear_epp_timeout (port);
+                               printk(KERN_DEBUG
+"%s: EPP timeout occurred while talking to w91284pic (should not have done)\n", port->name);
+                               clear_epp_timeout(port);
                        }
                }
                return got;
        }
        if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-               if (!(((long)buf | length) & 0x03)) {
-                       insl (EPPDATA (port), buf, (length >> 2));
-               } else {
-                       insb (EPPDATA (port), buf, length);
-               }
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               if (!(((long)buf | length) & 0x03))
+                       insl(EPPDATA(port), buf, (length >> 2));
+               else
+                       insb(EPPDATA(port), buf, length);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        return -EIO;
                }
                return length;
        }
        for (; got < length; got++) {
-               *((char*)buf) = inb (EPPDATA(port));
+               *((char *)buf) = inb(EPPDATA(port));
                buf++;
-               if (inb (STATUS (port)) & 0x01) {
+               if (inb(STATUS(port)) & 0x01) {
                        /* EPP timeout */
-                       clear_epp_timeout (port);
+                       clear_epp_timeout(port);
                        break;
                }
        }
@@ -371,28 +376,27 @@ static size_t parport_pc_epp_read_data (struct parport *port, void *buf,
        return got;
 }
 
-static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
-                                        size_t length, int flags)
+static size_t parport_pc_epp_write_data(struct parport *port, const void *buf,
+                                       size_t length, int flags)
 {
        size_t written = 0;
 
        if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-               if (!(((long)buf | length) & 0x03)) {
-                       outsl (EPPDATA (port), buf, (length >> 2));
-               } else {
-                       outsb (EPPDATA (port), buf, length);
-               }
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               if (!(((long)buf | length) & 0x03))
+                       outsl(EPPDATA(port), buf, (length >> 2));
+               else
+                       outsb(EPPDATA(port), buf, length);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        return -EIO;
                }
                return length;
        }
        for (; written < length; written++) {
-               outb (*((char*)buf), EPPDATA(port));
+               outb(*((char *)buf), EPPDATA(port));
                buf++;
-               if (inb (STATUS(port)) & 0x01) {
-                       clear_epp_timeout (port);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        break;
                }
        }
@@ -400,24 +404,24 @@ static size_t parport_pc_epp_write_data (struct parport *port, const void *buf,
        return written;
 }
 
-static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
+static size_t parport_pc_epp_read_addr(struct parport *port, void *buf,
                                        size_t length, int flags)
 {
        size_t got = 0;
 
        if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-               insb (EPPADDR (port), buf, length);
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               insb(EPPADDR(port), buf, length);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        return -EIO;
                }
                return length;
        }
        for (; got < length; got++) {
-               *((char*)buf) = inb (EPPADDR (port));
+               *((char *)buf) = inb(EPPADDR(port));
                buf++;
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        break;
                }
        }
@@ -425,25 +429,25 @@ static size_t parport_pc_epp_read_addr (struct parport *port, void *buf,
        return got;
 }
 
-static size_t parport_pc_epp_write_addr (struct parport *port,
+static size_t parport_pc_epp_write_addr(struct parport *port,
                                         const void *buf, size_t length,
                                         int flags)
 {
        size_t written = 0;
 
        if ((flags & PARPORT_EPP_FAST) && (length > 1)) {
-               outsb (EPPADDR (port), buf, length);
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               outsb(EPPADDR(port), buf, length);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        return -EIO;
                }
                return length;
        }
        for (; written < length; written++) {
-               outb (*((char*)buf), EPPADDR (port));
+               outb(*((char *)buf), EPPADDR(port));
                buf++;
-               if (inb (STATUS (port)) & 0x01) {
-                       clear_epp_timeout (port);
+               if (inb(STATUS(port)) & 0x01) {
+                       clear_epp_timeout(port);
                        break;
                }
        }
@@ -451,74 +455,74 @@ static size_t parport_pc_epp_write_addr (struct parport *port,
        return written;
 }
 
-static size_t parport_pc_ecpepp_read_data (struct parport *port, void *buf,
-                                          size_t length, int flags)
+static size_t parport_pc_ecpepp_read_data(struct parport *port, void *buf,
+                                         size_t length, int flags)
 {
        size_t got;
 
-       frob_set_mode (port, ECR_EPP);
-       parport_pc_data_reverse (port);
-       parport_pc_write_control (port, 0x4);
-       got = parport_pc_epp_read_data (port, buf, length, flags);
-       frob_set_mode (port, ECR_PS2);
+       frob_set_mode(port, ECR_EPP);
+       parport_pc_data_reverse(port);
+       parport_pc_write_control(port, 0x4);
+       got = parport_pc_epp_read_data(port, buf, length, flags);
+       frob_set_mode(port, ECR_PS2);
 
        return got;
 }
 
-static size_t parport_pc_ecpepp_write_data (struct parport *port,
-                                           const void *buf, size_t length,
-                                           int flags)
+static size_t parport_pc_ecpepp_write_data(struct parport *port,
+                                          const void *buf, size_t length,
+                                          int flags)
 {
        size_t written;
 
-       frob_set_mode (port, ECR_EPP);
-       parport_pc_write_control (port, 0x4);
-       parport_pc_data_forward (port);
-       written = parport_pc_epp_write_data (port, buf, length, flags);
-       frob_set_mode (port, ECR_PS2);
+       frob_set_mode(port, ECR_EPP);
+       parport_pc_write_control(port, 0x4);
+       parport_pc_data_forward(port);
+       written = parport_pc_epp_write_data(port, buf, length, flags);
+       frob_set_mode(port, ECR_PS2);
 
        return written;
 }
 
-static size_t parport_pc_ecpepp_read_addr (struct parport *port, void *buf,
-                                          size_t length, int flags)
+static size_t parport_pc_ecpepp_read_addr(struct parport *port, void *buf,
+                                         size_t length, int flags)
 {
        size_t got;
 
-       frob_set_mode (port, ECR_EPP);
-       parport_pc_data_reverse (port);
-       parport_pc_write_control (port, 0x4);
-       got = parport_pc_epp_read_addr (port, buf, length, flags);
-       frob_set_mode (port, ECR_PS2);
+       frob_set_mode(port, ECR_EPP);
+       parport_pc_data_reverse(port);
+       parport_pc_write_control(port, 0x4);
+       got = parport_pc_epp_read_addr(port, buf, length, flags);
+       frob_set_mode(port, ECR_PS2);
 
        return got;
 }
 
-static size_t parport_pc_ecpepp_write_addr (struct parport *port,
+static size_t parport_pc_ecpepp_write_addr(struct parport *port,
                                            const void *buf, size_t length,
                                            int flags)
 {
        size_t written;
 
-       frob_set_mode (port, ECR_EPP);
-       parport_pc_write_control (port, 0x4);
-       parport_pc_data_forward (port);
-       written = parport_pc_epp_write_addr (port, buf, length, flags);
-       frob_set_mode (port, ECR_PS2);
+       frob_set_mode(port, ECR_EPP);
+       parport_pc_write_control(port, 0x4);
+       parport_pc_data_forward(port);
+       written = parport_pc_epp_write_addr(port, buf, length, flags);
+       frob_set_mode(port, ECR_PS2);
 
        return written;
 }
 #endif /* IEEE 1284 support */
 
 #ifdef CONFIG_PARPORT_PC_FIFO
-static size_t parport_pc_fifo_write_block_pio (struct parport *port,
+static size_t parport_pc_fifo_write_block_pio(struct parport *port,
                                               const void *buf, size_t length)
 {
        int ret = 0;
        const unsigned char *bufp = buf;
        size_t left = length;
        unsigned long expire = jiffies + port->physport->cad->timeout;
-       const int fifo = FIFO (port);
+       const int fifo = FIFO(port);
        int poll_for = 8; /* 80 usecs */
        const struct parport_pc_private *priv = port->physport->private_data;
        const int fifo_depth = priv->fifo_depth;
@@ -526,25 +530,25 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
        port = port->physport;
 
        /* We don't want to be interrupted every character. */
-       parport_pc_disable_irq (port);
+       parport_pc_disable_irq(port);
        /* set nErrIntrEn and serviceIntr */
-       frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+       frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
        /* Forward mode. */
-       parport_pc_data_forward (port); /* Must be in PS2 mode */
+       parport_pc_data_forward(port); /* Must be in PS2 mode */
 
        while (left) {
                unsigned char byte;
-               unsigned char ecrval = inb (ECONTROL (port));
+               unsigned char ecrval = inb(ECONTROL(port));
                int i = 0;
 
-               if (need_resched() && time_before (jiffies, expire))
+               if (need_resched() && time_before(jiffies, expire))
                        /* Can't yield the port. */
-                       schedule ();
+                       schedule();
 
                /* Anyone else waiting for the port? */
                if (port->waithead) {
-                       printk (KERN_DEBUG "Somebody wants the port\n");
+                       printk(KERN_DEBUG "Somebody wants the port\n");
                        break;
                }
 
@@ -552,21 +556,22 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
                        /* FIFO is full. Wait for interrupt. */
 
                        /* Clear serviceIntr */
-                       ECR_WRITE (port, ecrval & ~(1<<2));
-               false_alarm:
-                       ret = parport_wait_event (port, HZ);
-                       if (ret < 0) break;
+                       ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+                       ret = parport_wait_event(port, HZ);
+                       if (ret < 0)
+                               break;
                        ret = 0;
-                       if (!time_before (jiffies, expire)) {
+                       if (!time_before(jiffies, expire)) {
                                /* Timed out. */
-                               printk (KERN_DEBUG "FIFO write timed out\n");
+                               printk(KERN_DEBUG "FIFO write timed out\n");
                                break;
                        }
-                       ecrval = inb (ECONTROL (port));
+                       ecrval = inb(ECONTROL(port));
                        if (!(ecrval & (1<<2))) {
                                if (need_resched() &&
-                                   time_before (jiffies, expire))
-                                       schedule ();
+                                   time_before(jiffies, expire))
+                                       schedule();
 
                                goto false_alarm;
                        }
@@ -577,38 +582,38 @@ static size_t parport_pc_fifo_write_block_pio (struct parport *port,
                /* Can't fail now. */
                expire = jiffies + port->cad->timeout;
 
-       poll:
-               if (signal_pending (current))
+poll:
+               if (signal_pending(current))
                        break;
 
                if (ecrval & 0x01) {
                        /* FIFO is empty. Blast it full. */
                        const int n = left < fifo_depth ? left : fifo_depth;
-                       outsb (fifo, bufp, n);
+                       outsb(fifo, bufp, n);
                        bufp += n;
                        left -= n;
 
                        /* Adjust the poll time. */
-                       if (i < (poll_for - 2)) poll_for--;
+                       if (i < (poll_for - 2))
+                               poll_for--;
                        continue;
                } else if (i++ < poll_for) {
-                       udelay (10);
-                       ecrval = inb (ECONTROL (port));
+                       udelay(10);
+                       ecrval = inb(ECONTROL(port));
                        goto poll;
                }
 
-               /* Half-full (call me an optimist) */
+               /* Half-full (call me an optimist) */
                byte = *bufp++;
-               outb (byte, fifo);
+               outb(byte, fifo);
                left--;
-        }
-
-dump_parport_state ("leave fifo_write_block_pio", port);
+       }
+       dump_parport_state("leave fifo_write_block_pio", port);
        return length - left;
 }
 
 #ifdef HAS_DMA
-static size_t parport_pc_fifo_write_block_dma (struct parport *port,
+static size_t parport_pc_fifo_write_block_dma(struct parport *port,
                                               const void *buf, size_t length)
 {
        int ret = 0;
@@ -621,7 +626,7 @@ static size_t parport_pc_fifo_write_block_dma (struct parport *port,
        unsigned long start = (unsigned long) buf;
        unsigned long end = (unsigned long) buf + length - 1;
 
-dump_parport_state ("enter fifo_write_block_dma", port);
+       dump_parport_state("enter fifo_write_block_dma", port);
        if (end < MAX_DMA_ADDRESS) {
                /* If it would cross a 64k boundary, cap it at the end. */
                if ((start ^ end) & ~0xffffUL)
@@ -629,8 +634,9 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
                dma_addr = dma_handle = dma_map_single(dev, (void *)buf, length,
                                                       DMA_TO_DEVICE);
-        } else {
-               /* above 16 MB we use a bounce buffer as ISA-DMA is not possible */
+       } else {
+               /* above 16 MB we use a bounce buffer as ISA-DMA
+                  is not possible */
                maxlen   = PAGE_SIZE;          /* sizeof(priv->dma_buf) */
                dma_addr = priv->dma_handle;
                dma_handle = 0;
@@ -639,12 +645,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
        port = port->physport;
 
        /* We don't want to be interrupted every character. */
-       parport_pc_disable_irq (port);
+       parport_pc_disable_irq(port);
        /* set nErrIntrEn and serviceIntr */
-       frob_econtrol (port, (1<<4) | (1<<2), (1<<4) | (1<<2));
+       frob_econtrol(port, (1<<4) | (1<<2), (1<<4) | (1<<2));
 
        /* Forward mode. */
-       parport_pc_data_forward (port); /* Must be in PS2 mode */
+       parport_pc_data_forward(port); /* Must be in PS2 mode */
 
        while (left) {
                unsigned long expire = jiffies + port->physport->cad->timeout;
@@ -665,10 +671,10 @@ dump_parport_state ("enter fifo_write_block_dma", port);
                set_dma_count(port->dma, count);
 
                /* Set DMA mode */
-               frob_econtrol (port, 1<<3, 1<<3);
+               frob_econtrol(port, 1<<3, 1<<3);
 
                /* Clear serviceIntr */
-               frob_econtrol (port, 1<<2, 0);
+               frob_econtrol(port, 1<<2, 0);
 
                enable_dma(port->dma);
                release_dma_lock(dmaflag);
@@ -676,20 +682,22 @@ dump_parport_state ("enter fifo_write_block_dma", port);
                /* assume DMA will be successful */
                left -= count;
                buf  += count;
-               if (dma_handle) dma_addr += count;
+               if (dma_handle)
+                       dma_addr += count;
 
                /* Wait for interrupt. */
-       false_alarm:
-               ret = parport_wait_event (port, HZ);
-               if (ret < 0) break;
+false_alarm:
+               ret = parport_wait_event(port, HZ);
+               if (ret < 0)
+                       break;
                ret = 0;
-               if (!time_before (jiffies, expire)) {
+               if (!time_before(jiffies, expire)) {
                        /* Timed out. */
-                       printk (KERN_DEBUG "DMA write timed out\n");
+                       printk(KERN_DEBUG "DMA write timed out\n");
                        break;
                }
                /* Is serviceIntr set? */
-               if (!(inb (ECONTROL (port)) & (1<<2))) {
+               if (!(inb(ECONTROL(port)) & (1<<2))) {
                        cond_resched();
 
                        goto false_alarm;
@@ -705,14 +713,15 @@ dump_parport_state ("enter fifo_write_block_dma", port);
 
                /* Anyone else waiting for the port? */
                if (port->waithead) {
-                       printk (KERN_DEBUG "Somebody wants the port\n");
+                       printk(KERN_DEBUG "Somebody wants the port\n");
                        break;
                }
 
                /* update for possible DMA residue ! */
                buf  -= count;
                left += count;
-               if (dma_handle) dma_addr -= count;
+               if (dma_handle)
+                       dma_addr -= count;
        }
 
        /* Maybe got here through break, so adjust for DMA residue! */
@@ -723,12 +732,12 @@ dump_parport_state ("enter fifo_write_block_dma", port);
        release_dma_lock(dmaflag);
 
        /* Turn off DMA mode */
-       frob_econtrol (port, 1<<3, 0);
+       frob_econtrol(port, 1<<3, 0);
 
        if (dma_handle)
                dma_unmap_single(dev, dma_handle, length, DMA_TO_DEVICE);
 
-dump_parport_state ("leave fifo_write_block_dma", port);
+       dump_parport_state("leave fifo_write_block_dma", port);
        return length - left;
 }
 #endif
@@ -738,13 +747,13 @@ static inline size_t parport_pc_fifo_write_block(struct parport *port,
 {
 #ifdef HAS_DMA
        if (port->dma != PARPORT_DMA_NONE)
-               return parport_pc_fifo_write_block_dma (port, buf, length);
+               return parport_pc_fifo_write_block_dma(port, buf, length);
 #endif
-       return parport_pc_fifo_write_block_pio (port, buf, length);
+       return parport_pc_fifo_write_block_pio(port, buf, length);
 }
 
 /* Parallel Port FIFO mode (ECP chipsets) */
-static size_t parport_pc_compat_write_block_pio (struct parport *port,
+static size_t parport_pc_compat_write_block_pio(struct parport *port,
                                                 const void *buf, size_t length,
                                                 int flags)
 {
@@ -756,14 +765,16 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
        /* Special case: a timeout of zero means we cannot call schedule().
         * Also if O_NONBLOCK is set then use the default implementation. */
        if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-               return parport_ieee1284_write_compat (port, buf,
+               return parport_ieee1284_write_compat(port, buf,
                                                      length, flags);
 
        /* Set up parallel port FIFO mode.*/
-       parport_pc_data_forward (port); /* Must be in PS2 mode */
-       parport_pc_frob_control (port, PARPORT_CONTROL_STROBE, 0);
-       r = change_mode (port, ECR_PPF); /* Parallel port FIFO */
-       if (r)  printk (KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n", port->name);
+       parport_pc_data_forward(port); /* Must be in PS2 mode */
+       parport_pc_frob_control(port, PARPORT_CONTROL_STROBE, 0);
+       r = change_mode(port, ECR_PPF); /* Parallel port FIFO */
+       if (r)
+               printk(KERN_DEBUG "%s: Warning change_mode ECR_PPF failed\n",
+                                                               port->name);
 
        port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
@@ -775,40 +786,39 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
         * the FIFO is empty, so allow 4 seconds for each position
         * in the fifo.
         */
-        expire = jiffies + (priv->fifo_depth * HZ * 4);
+       expire = jiffies + (priv->fifo_depth * HZ * 4);
        do {
                /* Wait for the FIFO to empty */
-               r = change_mode (port, ECR_PS2);
-               if (r != -EBUSY) {
+               r = change_mode(port, ECR_PS2);
+               if (r != -EBUSY)
                        break;
-               }
-       } while (time_before (jiffies, expire));
+       } while (time_before(jiffies, expire));
        if (r == -EBUSY) {
 
-               printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+               printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
                /* Prevent further data transfer. */
-               frob_set_mode (port, ECR_TST);
+               frob_set_mode(port, ECR_TST);
 
                /* Adjust for the contents of the FIFO. */
                for (written -= priv->fifo_depth; ; written++) {
-                       if (inb (ECONTROL (port)) & 0x2) {
+                       if (inb(ECONTROL(port)) & 0x2) {
                                /* Full up. */
                                break;
                        }
-                       outb (0, FIFO (port));
+                       outb(0, FIFO(port));
                }
 
                /* Reset the FIFO and return to PS2 mode. */
-               frob_set_mode (port, ECR_PS2);
+               frob_set_mode(port, ECR_PS2);
        }
 
-       r = parport_wait_peripheral (port,
+       r = parport_wait_peripheral(port,
                                     PARPORT_STATUS_BUSY,
                                     PARPORT_STATUS_BUSY);
        if (r)
-               printk (KERN_DEBUG
-                       "%s: BUSY timeout (%d) in compat_write_block_pio\n", 
+               printk(KERN_DEBUG
+                       "%s: BUSY timeout (%d) in compat_write_block_pio\n",
                        port->name, r);
 
        port->physport->ieee1284.phase = IEEE1284_PH_FWD_IDLE;
@@ -818,7 +828,7 @@ static size_t parport_pc_compat_write_block_pio (struct parport *port,
 
 /* ECP */
 #ifdef CONFIG_PARPORT_1284
-static size_t parport_pc_ecp_write_block_pio (struct parport *port,
+static size_t parport_pc_ecp_write_block_pio(struct parport *port,
                                              const void *buf, size_t length,
                                              int flags)
 {
@@ -830,36 +840,38 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
        /* Special case: a timeout of zero means we cannot call schedule().
         * Also if O_NONBLOCK is set then use the default implementation. */
        if (port->physport->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-               return parport_ieee1284_ecp_write_data (port, buf,
+               return parport_ieee1284_ecp_write_data(port, buf,
                                                        length, flags);
 
        /* Switch to forward mode if necessary. */
        if (port->physport->ieee1284.phase != IEEE1284_PH_FWD_IDLE) {
                /* Event 47: Set nInit high. */
-               parport_frob_control (port,
+               parport_frob_control(port,
                                      PARPORT_CONTROL_INIT
                                      | PARPORT_CONTROL_AUTOFD,
                                      PARPORT_CONTROL_INIT
                                      | PARPORT_CONTROL_AUTOFD);
 
                /* Event 49: PError goes high. */
-               r = parport_wait_peripheral (port,
+               r = parport_wait_peripheral(port,
                                             PARPORT_STATUS_PAPEROUT,
                                             PARPORT_STATUS_PAPEROUT);
                if (r) {
-                       printk (KERN_DEBUG "%s: PError timeout (%d) "
+                       printk(KERN_DEBUG "%s: PError timeout (%d) "
                                "in ecp_write_block_pio\n", port->name, r);
                }
        }
 
        /* Set up ECP parallel port mode.*/
-       parport_pc_data_forward (port); /* Must be in PS2 mode */
-       parport_pc_frob_control (port,
+       parport_pc_data_forward(port); /* Must be in PS2 mode */
+       parport_pc_frob_control(port,
                                 PARPORT_CONTROL_STROBE |
                                 PARPORT_CONTROL_AUTOFD,
                                 0);
-       r = change_mode (port, ECR_ECP); /* ECP FIFO */
-       if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+       r = change_mode(port, ECR_ECP); /* ECP FIFO */
+       if (r)
+               printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+                                                               port->name);
        port->physport->ieee1284.phase = IEEE1284_PH_FWD_DATA;
 
        /* Write the data to the FIFO. */
@@ -873,55 +885,54 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
        expire = jiffies + (priv->fifo_depth * (HZ * 4));
        do {
                /* Wait for the FIFO to empty */
-               r = change_mode (port, ECR_PS2);
-               if (r != -EBUSY) {
+               r = change_mode(port, ECR_PS2);
+               if (r != -EBUSY)
                        break;
-               }
-       } while (time_before (jiffies, expire));
+       } while (time_before(jiffies, expire));
        if (r == -EBUSY) {
 
-               printk (KERN_DEBUG "%s: FIFO is stuck\n", port->name);
+               printk(KERN_DEBUG "%s: FIFO is stuck\n", port->name);
 
                /* Prevent further data transfer. */
-               frob_set_mode (port, ECR_TST);
+               frob_set_mode(port, ECR_TST);
 
                /* Adjust for the contents of the FIFO. */
                for (written -= priv->fifo_depth; ; written++) {
-                       if (inb (ECONTROL (port)) & 0x2) {
+                       if (inb(ECONTROL(port)) & 0x2) {
                                /* Full up. */
                                break;
                        }
-                       outb (0, FIFO (port));
+                       outb(0, FIFO(port));
                }
 
                /* Reset the FIFO and return to PS2 mode. */
-               frob_set_mode (port, ECR_PS2);
+               frob_set_mode(port, ECR_PS2);
 
                /* Host transfer recovery. */
-               parport_pc_data_reverse (port); /* Must be in PS2 mode */
-               udelay (5);
-               parport_frob_control (port, PARPORT_CONTROL_INIT, 0);
-               r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
+               parport_pc_data_reverse(port); /* Must be in PS2 mode */
+               udelay(5);
+               parport_frob_control(port, PARPORT_CONTROL_INIT, 0);
+               r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
                if (r)
-                       printk (KERN_DEBUG "%s: PE,1 timeout (%d) "
+                       printk(KERN_DEBUG "%s: PE,1 timeout (%d) "
                                "in ecp_write_block_pio\n", port->name, r);
 
-               parport_frob_control (port,
+               parport_frob_control(port,
                                      PARPORT_CONTROL_INIT,
                                      PARPORT_CONTROL_INIT);
-               r = parport_wait_peripheral (port,
+               r = parport_wait_peripheral(port,
                                             PARPORT_STATUS_PAPEROUT,
                                             PARPORT_STATUS_PAPEROUT);
-                if (r)
-                        printk (KERN_DEBUG "%s: PE,2 timeout (%d) "
+               if (r)
+                       printk(KERN_DEBUG "%s: PE,2 timeout (%d) "
                                "in ecp_write_block_pio\n", port->name, r);
        }
 
-       r = parport_wait_peripheral (port,
-                                    PARPORT_STATUS_BUSY, 
+       r = parport_wait_peripheral(port,
+                                    PARPORT_STATUS_BUSY,
                                     PARPORT_STATUS_BUSY);
-       if(r)
-               printk (KERN_DEBUG
+       if (r)
+               printk(KERN_DEBUG
                        "%s: BUSY timeout (%d) in ecp_write_block_pio\n",
                        port->name, r);
 
@@ -931,7 +942,7 @@ static size_t parport_pc_ecp_write_block_pio (struct parport *port,
 }
 
 #if 0
-static size_t parport_pc_ecp_read_block_pio (struct parport *port,
+static size_t parport_pc_ecp_read_block_pio(struct parport *port,
                                             void *buf, size_t length,
                                             int flags)
 {
@@ -944,13 +955,13 @@ static size_t parport_pc_ecp_read_block_pio (struct parport *port,
        char *bufp = buf;
 
        port = port->physport;
-DPRINTK (KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
-dump_parport_state ("enter fcn", port);
+       DPRINTK(KERN_DEBUG "parport_pc: parport_pc_ecp_read_block_pio\n");
+       dump_parport_state("enter fcn", port);
 
        /* Special case: a timeout of zero means we cannot call schedule().
         * Also if O_NONBLOCK is set then use the default implementation. */
        if (port->cad->timeout <= PARPORT_INACTIVITY_O_NONBLOCK)
-               return parport_ieee1284_ecp_read_data (port, buf,
+               return parport_ieee1284_ecp_read_data(port, buf,
                                                       length, flags);
 
        if (port->ieee1284.mode == IEEE1284_MODE_ECPRLE) {
@@ -966,173 +977,178 @@ dump_parport_state ("enter fcn", port);
         * go through software emulation.  Otherwise we may have to throw
         * away data. */
        if (length < fifofull)
-               return parport_ieee1284_ecp_read_data (port, buf,
+               return parport_ieee1284_ecp_read_data(port, buf,
                                                       length, flags);
 
        if (port->ieee1284.phase != IEEE1284_PH_REV_IDLE) {
                /* change to reverse-idle phase (must be in forward-idle) */
 
                /* Event 38: Set nAutoFd low (also make sure nStrobe is high) */
-               parport_frob_control (port,
+               parport_frob_control(port,
                                      PARPORT_CONTROL_AUTOFD
                                      | PARPORT_CONTROL_STROBE,
                                      PARPORT_CONTROL_AUTOFD);
-               parport_pc_data_reverse (port); /* Must be in PS2 mode */
-               udelay (5);
+               parport_pc_data_reverse(port); /* Must be in PS2 mode */
+               udelay(5);
                /* Event 39: Set nInit low to initiate bus reversal */
-               parport_frob_control (port,
+               parport_frob_control(port,
                                      PARPORT_CONTROL_INIT,
                                      0);
                /* Event 40: Wait for  nAckReverse (PError) to go low */
-               r = parport_wait_peripheral (port, PARPORT_STATUS_PAPEROUT, 0);
-                if (r) {
-                        printk (KERN_DEBUG "%s: PE timeout Event 40 (%d) "
+               r = parport_wait_peripheral(port, PARPORT_STATUS_PAPEROUT, 0);
+               if (r) {
+                       printk(KERN_DEBUG "%s: PE timeout Event 40 (%d) "
                                "in ecp_read_block_pio\n", port->name, r);
                        return 0;
                }
        }
 
        /* Set up ECP FIFO mode.*/
-/*     parport_pc_frob_control (port,
+/*     parport_pc_frob_control(port,
                                 PARPORT_CONTROL_STROBE |
                                 PARPORT_CONTROL_AUTOFD,
                                 PARPORT_CONTROL_AUTOFD); */
-       r = change_mode (port, ECR_ECP); /* ECP FIFO */
-       if (r) printk (KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n", port->name);
+       r = change_mode(port, ECR_ECP); /* ECP FIFO */
+       if (r)
+               printk(KERN_DEBUG "%s: Warning change_mode ECR_ECP failed\n",
+                                                               port->name);
 
        port->ieee1284.phase = IEEE1284_PH_REV_DATA;
 
        /* the first byte must be collected manually */
-dump_parport_state ("pre 43", port);
+       dump_parport_state("pre 43", port);
        /* Event 43: Wait for nAck to go low */
-       r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, 0);
+       r = parport_wait_peripheral(port, PARPORT_STATUS_ACK, 0);
        if (r) {
                /* timed out while reading -- no data */
-               printk (KERN_DEBUG "PIO read timed out (initial byte)\n");
+               printk(KERN_DEBUG "PIO read timed out (initial byte)\n");
                goto out_no_data;
        }
        /* read byte */
-       *bufp++ = inb (DATA (port));
+       *bufp++ = inb(DATA(port));
        left--;
-dump_parport_state ("43-44", port);
+       dump_parport_state("43-44", port);
        /* Event 44: nAutoFd (HostAck) goes high to acknowledge */
-       parport_pc_frob_control (port,
+       parport_pc_frob_control(port,
                                 PARPORT_CONTROL_AUTOFD,
                                 0);
-dump_parport_state ("pre 45", port);
+       dump_parport_state("pre 45", port);
        /* Event 45: Wait for nAck to go high */
-/*     r = parport_wait_peripheral (port, PARPORT_STATUS_ACK, PARPORT_STATUS_ACK); */
-dump_parport_state ("post 45", port);
-r = 0;
+       /* r = parport_wait_peripheral(port, PARPORT_STATUS_ACK,
+                                               PARPORT_STATUS_ACK); */
+       dump_parport_state("post 45", port);
+       r = 0;
        if (r) {
                /* timed out while waiting for peripheral to respond to ack */
-               printk (KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
+               printk(KERN_DEBUG "ECP PIO read timed out (waiting for nAck)\n");
 
                /* keep hold of the byte we've got already */
                goto out_no_data;
        }
        /* Event 46: nAutoFd (HostAck) goes low to accept more data */
-       parport_pc_frob_control (port,
+       parport_pc_frob_control(port,
                                 PARPORT_CONTROL_AUTOFD,
                                 PARPORT_CONTROL_AUTOFD);
 
 
-dump_parport_state ("rev idle", port);
+       dump_parport_state("rev idle", port);
        /* Do the transfer. */
        while (left > fifofull) {
                int ret;
                unsigned long expire = jiffies + port->cad->timeout;
-               unsigned char ecrval = inb (ECONTROL (port));
+               unsigned char ecrval = inb(ECONTROL(port));
 
-               if (need_resched() && time_before (jiffies, expire))
+               if (need_resched() && time_before(jiffies, expire))
                        /* Can't yield the port. */
-                       schedule ();
+                       schedule();
 
                /* At this point, the FIFO may already be full. In
-                 * that case ECP is already holding back the
-                 * peripheral (assuming proper design) with a delayed
-                 * handshake.  Work fast to avoid a peripheral
-                 * timeout.  */
+                * that case ECP is already holding back the
+                * peripheral (assuming proper design) with a delayed
+                * handshake.  Work fast to avoid a peripheral
+                * timeout.  */
 
                if (ecrval & 0x01) {
                        /* FIFO is empty. Wait for interrupt. */
-dump_parport_state ("FIFO empty", port);
+                       dump_parport_state("FIFO empty", port);
 
                        /* Anyone else waiting for the port? */
                        if (port->waithead) {
-                               printk (KERN_DEBUG "Somebody wants the port\n");
+                               printk(KERN_DEBUG "Somebody wants the port\n");
                                break;
                        }
 
                        /* Clear serviceIntr */
-                       ECR_WRITE (port, ecrval & ~(1<<2));
-               false_alarm:
-dump_parport_state ("waiting", port);
-                       ret = parport_wait_event (port, HZ);
-DPRINTK (KERN_DEBUG "parport_wait_event returned %d\n", ret);
+                       ECR_WRITE(port, ecrval & ~(1<<2));
+false_alarm:
+                       dump_parport_state("waiting", port);
+                       ret = parport_wait_event(port, HZ);
+                       DPRINTK(KERN_DEBUG "parport_wait_event returned %d\n",
+                                                                       ret);
                        if (ret < 0)
                                break;
                        ret = 0;
-                       if (!time_before (jiffies, expire)) {
+                       if (!time_before(jiffies, expire)) {
                                /* Timed out. */
-dump_parport_state ("timeout", port);
-                               printk (KERN_DEBUG "PIO read timed out\n");
+                               dump_parport_state("timeout", port);
+                               printk(KERN_DEBUG "PIO read timed out\n");
                                break;
                        }
-                       ecrval = inb (ECONTROL (port));
+                       ecrval = inb(ECONTROL(port));
                        if (!(ecrval & (1<<2))) {
                                if (need_resched() &&
-                                   time_before (jiffies, expire)) {
-                                       schedule ();
+                                   time_before(jiffies, expire)) {
+                                       schedule();
                                }
                                goto false_alarm;
                        }
 
                        /* Depending on how the FIFO threshold was
-                         * set, how long interrupt service took, and
-                         * how fast the peripheral is, we might be
-                         * lucky and have a just filled FIFO. */
+                        * set, how long interrupt service took, and
+                        * how fast the peripheral is, we might be
+                        * lucky and have a just filled FIFO. */
                        continue;
                }
 
                if (ecrval & 0x02) {
                        /* FIFO is full. */
-dump_parport_state ("FIFO full", port);
-                       insb (fifo, bufp, fifo_depth);
+                       dump_parport_state("FIFO full", port);
+                       insb(fifo, bufp, fifo_depth);
                        bufp += fifo_depth;
                        left -= fifo_depth;
                        continue;
                }
 
-DPRINTK (KERN_DEBUG "*** ecp_read_block_pio: reading one byte from the FIFO\n");
+               DPRINTK(KERN_DEBUG
+                 "*** ecp_read_block_pio: reading one byte from the FIFO\n");
 
                /* FIFO not filled.  We will cycle this loop for a while
-                 * and either the peripheral will fill it faster,
-                 * tripping a fast empty with insb, or we empty it. */
-               *bufp++ = inb (fifo);
+                * and either the peripheral will fill it faster,
+                * tripping a fast empty with insb, or we empty it. */
+               *bufp++ = inb(fifo);
                left--;
        }
 
        /* scoop up anything left in the FIFO */
-       while (left && !(inb (ECONTROL (port) & 0x01))) {
-               *bufp++ = inb (fifo);
+       while (left && !(inb(ECONTROL(port)) & 0x01)) {
+               *bufp++ = inb(fifo);
                left--;
        }
 
        port->ieee1284.phase = IEEE1284_PH_REV_IDLE;
-dump_parport_state ("rev idle2", port);
+       dump_parport_state("rev idle2", port);
 
 out_no_data:
 
        /* Go to forward idle mode to shut the peripheral up (event 47). */
-       parport_frob_control (port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
+       parport_frob_control(port, PARPORT_CONTROL_INIT, PARPORT_CONTROL_INIT);
 
        /* event 49: PError goes high */
-       r = parport_wait_peripheral (port,
+       r = parport_wait_peripheral(port,
                                     PARPORT_STATUS_PAPEROUT,
                                     PARPORT_STATUS_PAPEROUT);
        if (r) {
-               printk (KERN_DEBUG
+               printk(KERN_DEBUG
                        "%s: PE timeout FWDIDLE (%d) in ecp_read_block_pio\n",
                        port->name, r);
        }
@@ -1141,14 +1157,14 @@ out_no_data:
 
        /* Finish up. */
        {
-               int lost = get_fifo_residue (port);
+               int lost = get_fifo_residue(port);
                if (lost)
                        /* Shouldn't happen with compliant peripherals. */
-                       printk (KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
+                       printk(KERN_DEBUG "%s: DATA LOSS (%d bytes)!\n",
                                port->name, lost);
        }
 
-dump_parport_state ("fwd idle", port);
+       dump_parport_state("fwd idle", port);
        return length - left;
 }
 #endif  /*  0  */
@@ -1164,8 +1180,7 @@ dump_parport_state ("fwd idle", port);
 
 /* GCC is not inlining extern inline function later overwriten to non-inline,
    so we use outlined_ variants here.  */
-static const struct parport_operations parport_pc_ops =
-{
+static const struct parport_operations parport_pc_ops = {
        .write_data     = parport_pc_write_data,
        .read_data      = parport_pc_read_data,
 
@@ -1202,88 +1217,107 @@ static const struct parport_operations parport_pc_ops =
 };
 
 #ifdef CONFIG_PARPORT_PC_SUPERIO
+
+static struct superio_struct *find_free_superio(void)
+{
+       int i;
+       for (i = 0; i < NR_SUPERIOS; i++)
+               if (superios[i].io == 0)
+                       return &superios[i];
+       return NULL;
+}
+
+
 /* Super-IO chipset detection, Winbond, SMSC */
 static void __devinit show_parconfig_smsc37c669(int io, int key)
 {
-       int cr1,cr4,cra,cr23,cr26,cr27,i=0;
-       static const char *const modes[]={
+       int cr1, cr4, cra, cr23, cr26, cr27;
+       struct superio_struct *s;
+
+       static const char *const modes[] = {
                "SPP and Bidirectional (PS/2)",
                "EPP and SPP",
                "ECP",
                "ECP and EPP" };
 
-       outb(key,io);
-       outb(key,io);
-       outb(1,io);
-       cr1=inb(io+1);
-       outb(4,io);
-       cr4=inb(io+1);
-       outb(0x0a,io);
-       cra=inb(io+1);
-       outb(0x23,io);
-       cr23=inb(io+1);
-       outb(0x26,io);
-       cr26=inb(io+1);
-       outb(0x27,io);
-       cr27=inb(io+1);
-       outb(0xaa,io);
+       outb(key, io);
+       outb(key, io);
+       outb(1, io);
+       cr1 = inb(io + 1);
+       outb(4, io);
+       cr4 = inb(io + 1);
+       outb(0x0a, io);
+       cra = inb(io + 1);
+       outb(0x23, io);
+       cr23 = inb(io + 1);
+       outb(0x26, io);
+       cr26 = inb(io + 1);
+       outb(0x27, io);
+       cr27 = inb(io + 1);
+       outb(0xaa, io);
 
        if (verbose_probing) {
-               printk (KERN_INFO "SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
+               printk(KERN_INFO
+                       "SMSC 37c669 LPT Config: cr_1=0x%02x, 4=0x%02x, "
                        "A=0x%2x, 23=0x%02x, 26=0x%02x, 27=0x%02x\n",
-                       cr1,cr4,cra,cr23,cr26,cr27);
-               
+                       cr1, cr4, cra, cr23, cr26, cr27);
+
                /* The documentation calls DMA and IRQ-Lines by letters, so
                   the board maker can/will wire them
                   appropriately/randomly...  G=reserved H=IDE-irq, */
-               printk (KERN_INFO "SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, "
-                       "fifo threshold=%d\n", cr23*4,
-                       (cr27 &0x0f) ? 'A'-1+(cr27 &0x0f): '-',
-                       (cr26 &0x0f) ? 'A'-1+(cr26 &0x0f): '-', cra & 0x0f);
+               printk(KERN_INFO
+       "SMSC LPT Config: io=0x%04x, irq=%c, dma=%c, fifo threshold=%d\n",
+                               cr23 * 4,
+                               (cr27 & 0x0f) ? 'A' - 1 + (cr27 & 0x0f) : '-',
+                               (cr26 & 0x0f) ? 'A' - 1 + (cr26 & 0x0f) : '-',
+                               cra & 0x0f);
                printk(KERN_INFO "SMSC LPT Config: enabled=%s power=%s\n",
-                      (cr23*4 >=0x100) ?"yes":"no", (cr1 & 4) ? "yes" : "no");
-               printk(KERN_INFO "SMSC LPT Config: Port mode=%s, EPP version =%s\n",
-                      (cr1 & 0x08 ) ? "Standard mode only (SPP)" : modes[cr4 & 0x03], 
-                      (cr4 & 0x40) ? "1.7" : "1.9");
+                      (cr23 * 4 >= 0x100) ? "yes" : "no",
+                      (cr1 & 4) ? "yes" : "no");
+               printk(KERN_INFO
+                       "SMSC LPT Config: Port mode=%s, EPP version =%s\n",
+                               (cr1 & 0x08) ? "Standard mode only (SPP)"
+                                             : modes[cr4 & 0x03],
+                               (cr4 & 0x40) ? "1.7" : "1.9");
        }
-               
+
        /* Heuristics !  BIOS setup for this mainboard device limits
           the choices to standard settings, i.e. io-address and IRQ
           are related, however DMA can be 1 or 3, assume DMA_A=DMA1,
           DMA_C=DMA3 (this is true e.g. for TYAN 1564D Tomcat IV) */
-       if(cr23*4 >=0x100) { /* if active */
-               while((superios[i].io!= 0) && (i<NR_SUPERIOS))
-                       i++;
-               if(i==NR_SUPERIOS)
+       if (cr23 * 4 >= 0x100) { /* if active */
+               s = find_free_superio();
+               if (s == NULL)
                        printk(KERN_INFO "Super-IO: too many chips!\n");
                else {
                        int d;
-                       switch (cr23*4) {
-                               case 0x3bc:
-                                       superios[i].io = 0x3bc;
-                                       superios[i].irq = 7;
-                                       break;
-                               case 0x378:
-                                       superios[i].io = 0x378;
-                                       superios[i].irq = 7;
-                                       break;
-                               case 0x278:
-                                       superios[i].io = 0x278;
-                                       superios[i].irq = 5;
+                       switch (cr23 * 4) {
+                       case 0x3bc:
+                               s->io = 0x3bc;
+                               s->irq = 7;
+                               break;
+                       case 0x378:
+                               s->io = 0x378;
+                               s->irq = 7;
+                               break;
+                       case 0x278:
+                               s->io = 0x278;
+                               s->irq = 5;
                        }
-                       d=(cr26 &0x0f);
-                       if((d==1) || (d==3)) 
-                               superios[i].dma= d;
+                       d = (cr26 & 0x0f);
+                       if (d == 1 || d == 3)
+                               s->dma = d;
                        else
-                               superios[i].dma= PARPORT_DMA_NONE;
+                               s->dma = PARPORT_DMA_NONE;
                }
-       }
+       }
 }
 
 
 static void __devinit show_parconfig_winbond(int io, int key)
 {
-       int cr30,cr60,cr61,cr70,cr74,crf0,i=0;
+       int cr30, cr60, cr61, cr70, cr74, crf0;
+       struct superio_struct *s;
        static const char *const modes[] = {
                "Standard (SPP) and Bidirectional(PS/2)", /* 0 */
                "EPP-1.9 and SPP",
@@ -1296,110 +1330,134 @@ static void __devinit show_parconfig_winbond(int io, int key)
        static char *const irqtypes[] = {
                "pulsed low, high-Z",
                "follows nACK" };
-               
+
        /* The registers are called compatible-PnP because the
-           register layout is modelled after ISA-PnP, the access
-           method is just another ... */
-       outb(key,io);
-       outb(key,io);
-       outb(0x07,io);   /* Register 7: Select Logical Device */
-       outb(0x01,io+1); /* LD1 is Parallel Port */
-       outb(0x30,io);
-       cr30=inb(io+1);
-       outb(0x60,io);
-       cr60=inb(io+1);
-       outb(0x61,io);
-       cr61=inb(io+1);
-       outb(0x70,io);
-       cr70=inb(io+1);
-       outb(0x74,io);
-       cr74=inb(io+1);
-       outb(0xf0,io);
-       crf0=inb(io+1);
-       outb(0xaa,io);
+          register layout is modelled after ISA-PnP, the access
+          method is just another ... */
+       outb(key, io);
+       outb(key, io);
+       outb(0x07, io);   /* Register 7: Select Logical Device */
+       outb(0x01, io + 1); /* LD1 is Parallel Port */
+       outb(0x30, io);
+       cr30 = inb(io + 1);
+       outb(0x60, io);
+       cr60 = inb(io + 1);
+       outb(0x61, io);
+       cr61 = inb(io + 1);
+       outb(0x70, io);
+       cr70 = inb(io + 1);
+       outb(0x74, io);
+       cr74 = inb(io + 1);
+       outb(0xf0, io);
+       crf0 = inb(io + 1);
+       outb(0xaa, io);
 
        if (verbose_probing) {
-               printk(KERN_INFO "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x "
-                      "70=%02x 74=%02x, f0=%02x\n", cr30,cr60,cr61,cr70,cr74,crf0);
-               printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ", 
-                      (cr30 & 0x01) ? "yes":"no", cr60,cr61,cr70&0x0f );
+               printk(KERN_INFO
+    "Winbond LPT Config: cr_30=%02x 60,61=%02x%02x 70=%02x 74=%02x, f0=%02x\n",
+                                       cr30, cr60, cr61, cr70, cr74, crf0);
+               printk(KERN_INFO "Winbond LPT Config: active=%s, io=0x%02x%02x irq=%d, ",
+                      (cr30 & 0x01) ? "yes" : "no", cr60, cr61, cr70 & 0x0f);
                if ((cr74 & 0x07) > 3)
                        printk("dma=none\n");
                else
-                       printk("dma=%d\n",cr74 & 0x07);
-               printk(KERN_INFO "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
-                      irqtypes[crf0>>7], (crf0>>3)&0x0f);
-               printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n", modes[crf0 & 0x07]);
+                       printk("dma=%d\n", cr74 & 0x07);
+               printk(KERN_INFO
+                   "Winbond LPT Config: irqtype=%s, ECP fifo threshold=%d\n",
+                                       irqtypes[crf0>>7], (crf0>>3)&0x0f);
+               printk(KERN_INFO "Winbond LPT Config: Port mode=%s\n",
+                                       modes[crf0 & 0x07]);
        }
 
-       if(cr30 & 0x01) { /* the settings can be interrogated later ... */
-               while((superios[i].io!= 0) && (i<NR_SUPERIOS))
-                       i++;
-               if(i==NR_SUPERIOS) 
+       if (cr30 & 0x01) { /* the settings can be interrogated later ... */
+               s = find_free_superio();
+               if (s == NULL)
                        printk(KERN_INFO "Super-IO: too many chips!\n");
                else {
-                       superios[i].io = (cr60<<8)|cr61;
-                       superios[i].irq = cr70&0x0f;
-                       superios[i].dma = (((cr74 & 0x07) > 3) ?
+                       s->io = (cr60 << 8) | cr61;
+                       s->irq = cr70 & 0x0f;
+                       s->dma = (((cr74 & 0x07) > 3) ?
                                           PARPORT_DMA_NONE : (cr74 & 0x07));
                }
        }
 }
 
-static void __devinit decode_winbond(int efer, int key, int devid, int devrev, int oldid)
+static void __devinit decode_winbond(int efer, int key, int devid,
+                                                       int devrev, int oldid)
 {
        const char *type = "unknown";
-       int id,progif=2;
+       int id, progif = 2;
 
        if (devid == devrev)
                /* simple heuristics, we happened to read some
-                   non-winbond register */
+                  non-winbond register */
                return;
 
-       id=(devid<<8) | devrev;
+       id = (devid << 8) | devrev;
 
        /* Values are from public data sheets pdf files, I can just
-           confirm 83977TF is correct :-) */
-       if      (id == 0x9771) type="83977F/AF";
-       else if (id == 0x9773) type="83977TF / SMSC 97w33x/97w34x";
-       else if (id == 0x9774) type="83977ATF";
-       else if ((id & ~0x0f) == 0x5270) type="83977CTF / SMSC 97w36x";
-       else if ((id & ~0x0f) == 0x52f0) type="83977EF / SMSC 97w35x";
-       else if ((id & ~0x0f) == 0x5210) type="83627";
-       else if ((id & ~0x0f) == 0x6010) type="83697HF";
-       else if ((oldid &0x0f ) == 0x0a) { type="83877F"; progif=1;}
-       else if ((oldid &0x0f ) == 0x0b) { type="83877AF"; progif=1;}
-       else if ((oldid &0x0f ) == 0x0c) { type="83877TF"; progif=1;}
-       else if ((oldid &0x0f ) == 0x0d) { type="83877ATF"; progif=1;}
-       else progif=0;
+          confirm 83977TF is correct :-) */
+       if (id == 0x9771)
+               type = "83977F/AF";
+       else if (id == 0x9773)
+               type = "83977TF / SMSC 97w33x/97w34x";
+       else if (id == 0x9774)
+               type = "83977ATF";
+       else if ((id & ~0x0f) == 0x5270)
+               type = "83977CTF / SMSC 97w36x";
+       else if ((id & ~0x0f) == 0x52f0)
+               type = "83977EF / SMSC 97w35x";
+       else if ((id & ~0x0f) == 0x5210)
+               type = "83627";
+       else if ((id & ~0x0f) == 0x6010)
+               type = "83697HF";
+       else if ((oldid & 0x0f) == 0x0a) {
+               type = "83877F";
+               progif = 1;
+       } else if ((oldid & 0x0f) == 0x0b) {
+               type = "83877AF";
+               progif = 1;
+       } else if ((oldid & 0x0f) == 0x0c) {
+               type = "83877TF";
+               progif = 1;
+       } else if ((oldid & 0x0f) == 0x0d) {
+               type = "83877ATF";
+               progif = 1;
+       } else
+               progif = 0;
 
        if (verbose_probing)
                printk(KERN_INFO "Winbond chip at EFER=0x%x key=0x%02x "
-                      "devid=%02x devrev=%02x oldid=%02x type=%s\n", 
+                      "devid=%02x devrev=%02x oldid=%02x type=%s\n",
                       efer, key, devid, devrev, oldid, type);
 
        if (progif == 2)
-               show_parconfig_winbond(efer,key);
+               show_parconfig_winbond(efer, key);
 }
 
 static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
 {
-        const char *type = "unknown";
+       const char *type = "unknown";
        void (*func)(int io, int key);
-        int id;
+       int id;
 
-        if (devid == devrev)
+       if (devid == devrev)
                /* simple heuristics, we happened to read some
-                   non-smsc register */
+                  non-smsc register */
                return;
 
-       func=NULL;
-        id=(devid<<8) | devrev;
+       func = NULL;
+       id = (devid << 8) | devrev;
 
-       if      (id==0x0302) {type="37c669"; func=show_parconfig_smsc37c669;}
-       else if (id==0x6582) type="37c665IR";
-       else if (devid==0x65) type="37c665GT";
-       else if (devid==0x66) type="37c666GT";
+       if (id == 0x0302) {
+               type = "37c669";
+               func = show_parconfig_smsc37c669;
+       } else if (id == 0x6582)
+               type = "37c665IR";
+       else if (devid == 0x65)
+               type = "37c665GT";
+       else if (devid == 0x66)
+               type = "37c666GT";
 
        if (verbose_probing)
                printk(KERN_INFO "SMSC chip at EFER=0x%x "
@@ -1407,138 +1465,138 @@ static void __devinit decode_smsc(int efer, int key, int devid, int devrev)
                       efer, key, devid, devrev, type);
 
        if (func)
-               func(efer,key);
+               func(efer, key);
 }
 
 
 static void __devinit winbond_check(int io, int key)
 {
-       int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+       int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
        if (!request_region(io, 3, __func__))
                return;
 
        /* First probe without key */
-       outb(0x20,io);
-       x_devid=inb(io+1);
-       outb(0x21,io);
-       x_devrev=inb(io+1);
-       outb(0x09,io);
-       x_oldid=inb(io+1);
-
-       outb(key,io);
-       outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-       outb(0x20,io);    /* Write EFIR, extended function index register */
-       devid=inb(io+1);  /* Read EFDR, extended function data register */
-       outb(0x21,io);
-       devrev=inb(io+1);
-       outb(0x09,io);
-       oldid=inb(io+1);
-       outb(0xaa,io);    /* Magic Seal */
+       outb(0x20, io);
+       x_devid = inb(io + 1);
+       outb(0x21, io);
+       x_devrev = inb(io + 1);
+       outb(0x09, io);
+       x_oldid = inb(io + 1);
+
+       outb(key, io);
+       outb(key, io);     /* Write Magic Sequence to EFER, extended
+                             function enable register */
+       outb(0x20, io);    /* Write EFIR, extended function index register */
+       devid = inb(io + 1);  /* Read EFDR, extended function data register */
+       outb(0x21, io);
+       devrev = inb(io + 1);
+       outb(0x09, io);
+       oldid = inb(io + 1);
+       outb(0xaa, io);    /* Magic Seal */
 
        if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
                goto out; /* protection against false positives */
 
-       decode_winbond(io,key,devid,devrev,oldid);
+       decode_winbond(io, key, devid, devrev, oldid);
 out:
        release_region(io, 3);
 }
 
-static void __devinit winbond_check2(int io,int key)
+static void __devinit winbond_check2(int io, int key)
 {
-        int devid,devrev,oldid,x_devid,x_devrev,x_oldid;
+       int devid, devrev, oldid, x_devid, x_devrev, x_oldid;
 
        if (!request_region(io, 3, __func__))
                return;
 
        /* First probe without the key */
-       outb(0x20,io+2);
-       x_devid=inb(io+2);
-       outb(0x21,io+1);
-       x_devrev=inb(io+2);
-       outb(0x09,io+1);
-       x_oldid=inb(io+2);
-
-        outb(key,io);     /* Write Magic Byte to EFER, extended
-                             funtion enable register */
-        outb(0x20,io+2);  /* Write EFIR, extended function index register */
-        devid=inb(io+2);  /* Read EFDR, extended function data register */
-        outb(0x21,io+1);
-        devrev=inb(io+2);
-        outb(0x09,io+1);
-        oldid=inb(io+2);
-        outb(0xaa,io);    /* Magic Seal */
-
-       if ((x_devid == devid) && (x_devrev == devrev) && (x_oldid == oldid))
+       outb(0x20, io + 2);
+       x_devid = inb(io + 2);
+       outb(0x21, io + 1);
+       x_devrev = inb(io + 2);
+       outb(0x09, io + 1);
+       x_oldid = inb(io + 2);
+
+       outb(key, io);     /* Write Magic Byte to EFER, extended
+                             function enable register */
+       outb(0x20, io + 2);  /* Write EFIR, extended function index register */
+       devid = inb(io + 2);  /* Read EFDR, extended function data register */
+       outb(0x21, io + 1);
+       devrev = inb(io + 2);
+       outb(0x09, io + 1);
+       oldid = inb(io + 2);
+       outb(0xaa, io);    /* Magic Seal */
+
+       if (x_devid == devid && x_devrev == devrev && x_oldid == oldid)
                goto out; /* protection against false positives */
 
-       decode_winbond(io,key,devid,devrev,oldid);
+       decode_winbond(io, key, devid, devrev, oldid);
 out:
        release_region(io, 3);
 }
 
 static void __devinit smsc_check(int io, int key)
 {
-        int id,rev,oldid,oldrev,x_id,x_rev,x_oldid,x_oldrev;
+       int id, rev, oldid, oldrev, x_id, x_rev, x_oldid, x_oldrev;
 
        if (!request_region(io, 3, __func__))
                return;
 
        /* First probe without the key */
-       outb(0x0d,io);
-       x_oldid=inb(io+1);
-       outb(0x0e,io);
-       x_oldrev=inb(io+1);
-       outb(0x20,io);
-       x_id=inb(io+1);
-       outb(0x21,io);
-       x_rev=inb(io+1);
-
-        outb(key,io);
-        outb(key,io);     /* Write Magic Sequence to EFER, extended
-                             funtion enable register */
-        outb(0x0d,io);    /* Write EFIR, extended function index register */
-        oldid=inb(io+1);  /* Read EFDR, extended function data register */
-        outb(0x0e,io);
-        oldrev=inb(io+1);
-       outb(0x20,io);
-       id=inb(io+1);
-       outb(0x21,io);
-       rev=inb(io+1);
-        outb(0xaa,io);    /* Magic Seal */
-
-       if ((x_id == id) && (x_oldrev == oldrev) &&
-           (x_oldid == oldid) && (x_rev == rev))
+       outb(0x0d, io);
+       x_oldid = inb(io + 1);
+       outb(0x0e, io);
+       x_oldrev = inb(io + 1);
+       outb(0x20, io);
+       x_id = inb(io + 1);
+       outb(0x21, io);
+       x_rev = inb(io + 1);
+
+       outb(key, io);
+       outb(key, io);     /* Write Magic Sequence to EFER, extended
+                             function enable register */
+       outb(0x0d, io);    /* Write EFIR, extended function index register */
+       oldid = inb(io + 1);  /* Read EFDR, extended function data register */
+       outb(0x0e, io);
+       oldrev = inb(io + 1);
+       outb(0x20, io);
+       id = inb(io + 1);
+       outb(0x21, io);
+       rev = inb(io + 1);
+       outb(0xaa, io);    /* Magic Seal */
+
+       if (x_id == id && x_oldrev == oldrev &&
+           x_oldid == oldid && x_rev == rev)
                goto out; /* protection against false positives */
 
-        decode_smsc(io,key,oldid,oldrev);
+       decode_smsc(io, key, oldid, oldrev);
 out:
        release_region(io, 3);
 }
 
 
-static void __devinit detect_and_report_winbond (void)
-{ 
+static void __devinit detect_and_report_winbond(void)
+{
        if (verbose_probing)
                printk(KERN_DEBUG "Winbond Super-IO detection, now testing ports 3F0,370,250,4E,2E ...\n");
-       winbond_check(0x3f0,0x87);
-       winbond_check(0x370,0x87);
-       winbond_check(0x2e ,0x87);
-       winbond_check(0x4e ,0x87);
-       winbond_check(0x3f0,0x86);
-       winbond_check2(0x250,0x88); 
-       winbond_check2(0x250,0x89);
+       winbond_check(0x3f0, 0x87);
+       winbond_check(0x370, 0x87);
+       winbond_check(0x2e , 0x87);
+       winbond_check(0x4e , 0x87);
+       winbond_check(0x3f0, 0x86);
+       winbond_check2(0x250, 0x88);
+       winbond_check2(0x250, 0x89);
 }
 
-static void __devinit detect_and_report_smsc (void)
+static void __devinit detect_and_report_smsc(void)
 {
        if (verbose_probing)
                printk(KERN_DEBUG "SMSC Super-IO detection, now testing Ports 2F0, 370 ...\n");
-       smsc_check(0x3f0,0x55);
-       smsc_check(0x370,0x55);
-       smsc_check(0x3f0,0x44);
-       smsc_check(0x370,0x44);
+       smsc_check(0x3f0, 0x55);
+       smsc_check(0x370, 0x55);
+       smsc_check(0x3f0, 0x44);
+       smsc_check(0x370, 0x44);
 }
 
 static void __devinit detect_and_report_it87(void)
@@ -1573,34 +1631,39 @@ static void __devinit detect_and_report_it87(void)
 }
 #endif /* CONFIG_PARPORT_PC_SUPERIO */
 
-static int get_superio_dma (struct parport *p)
+static struct superio_struct *find_superio(struct parport *p)
 {
-       int i=0;
-       while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
-               i++;
-       if (i!=NR_SUPERIOS)
-               return superios[i].dma;
+       int i; /* scan superios[] for the entry probed at p->base */
+       for (i = 0; i < NR_SUPERIOS; i++)
+               if (superios[i].io == p->base) /* match, not mismatch */
+                       return &superios[i];
+       return NULL; /* no Super-IO chip recorded at this base address */
+}
+
+static int get_superio_dma(struct parport *p)
+{
+       struct superio_struct *s = find_superio(p); /* NULL when no entry */
+       if (s)
+               return s->dma; /* DMA channel stored by the Super-IO probe */
        return PARPORT_DMA_NONE;
 }
 
-static int get_superio_irq (struct parport *p)
+static int get_superio_irq(struct parport *p)
 {
-       int i=0;
-        while( (superios[i].io != p->base) && (i<NR_SUPERIOS))
-                i++;
-        if (i!=NR_SUPERIOS)
-                return superios[i].irq;
-        return PARPORT_IRQ_NONE;
+       struct superio_struct *s = find_superio(p); /* NULL when no entry */
+       if (s)
+               return s->irq; /* IRQ line stored by the Super-IO probe */
+       return PARPORT_IRQ_NONE;
 }
-       
+
 
 /* --- Mode detection ------------------------------------- */
 
 /*
  * Checks for port existence, all ports support SPP MODE
- * Returns: 
+ * Returns:
  *         0           :  No parallel port at this address
- *  PARPORT_MODE_PCSPP :  SPP port detected 
+ *  PARPORT_MODE_PCSPP :  SPP port detected
  *                        (if the user specified an ioport himself,
  *                         this shall always be the case!)
  *
@@ -1610,7 +1673,7 @@ static int parport_SPP_supported(struct parport *pb)
        unsigned char r, w;
 
        /*
-        * first clear an eventually pending EPP timeout 
+        * first clear an eventually pending EPP timeout
         * I (sailer@ife.ee.ethz.ch) have an SMSC chipset
         * that does not even respond to SPP cycles if an EPP
         * timeout is pending
@@ -1619,19 +1682,19 @@ static int parport_SPP_supported(struct parport *pb)
 
        /* Do a simple read-write test to make sure the port exists. */
        w = 0xc;
-       outb (w, CONTROL (pb));
+       outb(w, CONTROL(pb));
 
        /* Is there a control register that we can read from?  Some
         * ports don't allow reads, so read_control just returns a
         * software copy. Some ports _do_ allow reads, so bypass the
         * software copy here.  In addition, some bits aren't
         * writable. */
-       r = inb (CONTROL (pb));
+       r = inb(CONTROL(pb));
        if ((r & 0xf) == w) {
                w = 0xe;
-               outb (w, CONTROL (pb));
-               r = inb (CONTROL (pb));
-               outb (0xc, CONTROL (pb));
+               outb(w, CONTROL(pb));
+               r = inb(CONTROL(pb));
+               outb(0xc, CONTROL(pb));
                if ((r & 0xf) == w)
                        return PARPORT_MODE_PCSPP;
        }
@@ -1639,18 +1702,18 @@ static int parport_SPP_supported(struct parport *pb)
        if (user_specified)
                /* That didn't work, but the user thinks there's a
                 * port here. */
-               printk (KERN_INFO "parport 0x%lx (WARNING): CTR: "
+               printk(KERN_INFO "parport 0x%lx (WARNING): CTR: "
                        "wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
 
        /* Try the data register.  The data lines aren't tri-stated at
         * this stage, so we expect back what we wrote. */
        w = 0xaa;
-       parport_pc_write_data (pb, w);
-       r = parport_pc_read_data (pb);
+       parport_pc_write_data(pb, w);
+       r = parport_pc_read_data(pb);
        if (r == w) {
                w = 0x55;
-               parport_pc_write_data (pb, w);
-               r = parport_pc_read_data (pb);
+               parport_pc_write_data(pb, w);
+               r = parport_pc_read_data(pb);
                if (r == w)
                        return PARPORT_MODE_PCSPP;
        }
@@ -1658,9 +1721,9 @@ static int parport_SPP_supported(struct parport *pb)
        if (user_specified) {
                /* Didn't work, but the user is convinced this is the
                 * place. */
-               printk (KERN_INFO "parport 0x%lx (WARNING): DATA: "
+               printk(KERN_INFO "parport 0x%lx (WARNING): DATA: "
                        "wrote 0x%02x, read 0x%02x\n", pb->base, w, r);
-               printk (KERN_INFO "parport 0x%lx: You gave this address, "
+               printk(KERN_INFO "parport 0x%lx: You gave this address, "
                        "but there is probably no parallel port there!\n",
                        pb->base);
        }
@@ -1691,33 +1754,33 @@ static int parport_ECR_present(struct parport *pb)
        struct parport_pc_private *priv = pb->private_data;
        unsigned char r = 0xc;
 
-       outb (r, CONTROL (pb));
-       if ((inb (ECONTROL (pb)) & 0x3) == (r & 0x3)) {
-               outb (r ^ 0x2, CONTROL (pb)); /* Toggle bit 1 */
+       outb(r, CONTROL(pb));
+       if ((inb(ECONTROL(pb)) & 0x3) == (r & 0x3)) {
+               outb(r ^ 0x2, CONTROL(pb)); /* Toggle bit 1 */
 
-               r = inb (CONTROL (pb));
-               if ((inb (ECONTROL (pb)) & 0x2) == (r & 0x2))
+               r = inb(CONTROL(pb));
+               if ((inb(ECONTROL(pb)) & 0x2) == (r & 0x2))
                        goto no_reg; /* Sure that no ECR register exists */
        }
-       
-       if ((inb (ECONTROL (pb)) & 0x3 ) != 0x1)
+
+       if ((inb(ECONTROL(pb)) & 0x3) != 0x1)
                goto no_reg;
 
-       ECR_WRITE (pb, 0x34);
-       if (inb (ECONTROL (pb)) != 0x35)
+       ECR_WRITE(pb, 0x34);
+       if (inb(ECONTROL(pb)) != 0x35)
                goto no_reg;
 
        priv->ecr = 1;
-       outb (0xc, CONTROL (pb));
-       
+       outb(0xc, CONTROL(pb));
+
        /* Go to mode 000 */
-       frob_set_mode (pb, ECR_SPP);
+       frob_set_mode(pb, ECR_SPP);
 
        return 1;
 
  no_reg:
-       outb (0xc, CONTROL (pb));
-       return 0; 
+       outb(0xc, CONTROL(pb));
+       return 0;
 }
 
 #ifdef CONFIG_PARPORT_1284
@@ -1727,7 +1790,7 @@ static int parport_ECR_present(struct parport *pb)
  * allows us to read data from the data lines.  In theory we would get back
  * 0xff but any peripheral attached to the port may drag some or all of the
  * lines down to zero.  So if we get back anything that isn't the contents
- * of the data register we deem PS/2 support to be present. 
+ * of the data register we deem PS/2 support to be present.
  *
  * Some SPP ports have "half PS/2" ability - you can't turn off the line
  * drivers, but an external peripheral with sufficiently beefy drivers of
@@ -1735,26 +1798,28 @@ static int parport_ECR_present(struct parport *pb)
  * where they can then be read back as normal.  Ports with this property
  * and the right type of device attached are likely to fail the SPP test,
  * (as they will appear to have stuck bits) and so the fact that they might
- * be misdetected here is rather academic. 
+ * be misdetected here is rather academic.
  */
 
 static int parport_PS2_supported(struct parport *pb)
 {
        int ok = 0;
-  
+
        clear_epp_timeout(pb);
 
        /* try to tri-state the buffer */
-       parport_pc_data_reverse (pb);
-       
+       parport_pc_data_reverse(pb);
+
        parport_pc_write_data(pb, 0x55);
-       if (parport_pc_read_data(pb) != 0x55) ok++;
+       if (parport_pc_read_data(pb) != 0x55)
+               ok++;
 
        parport_pc_write_data(pb, 0xaa);
-       if (parport_pc_read_data(pb) != 0xaa) ok++;
+       if (parport_pc_read_data(pb) != 0xaa)
+               ok++;
 
        /* cancel input mode */
-       parport_pc_data_forward (pb);
+       parport_pc_data_forward(pb);
 
        if (ok) {
                pb->modes |= PARPORT_MODE_TRISTATE;
@@ -1773,68 +1838,68 @@ static int parport_ECP_supported(struct parport *pb)
        int config, configb;
        int pword;
        struct parport_pc_private *priv = pb->private_data;
-       /* Translate ECP intrLine to ISA irq value */   
-       static const int intrline[]= { 0, 7, 9, 10, 11, 14, 15, 5 }; 
+       /* Translate ECP intrLine to ISA irq value */
+       static const int intrline[] = { 0, 7, 9, 10, 11, 14, 15, 5 };
 
        /* If there is no ECR, we have no hope of supporting ECP. */
        if (!priv->ecr)
                return 0;
 
        /* Find out FIFO depth */
-       ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-       ECR_WRITE (pb, ECR_TST << 5); /* TEST FIFO */
-       for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02); i++)
-               outb (0xaa, FIFO (pb));
+       ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+       ECR_WRITE(pb, ECR_TST << 5); /* TEST FIFO */
+       for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02); i++)
+               outb(0xaa, FIFO(pb));
 
        /*
         * Using LGS chipset it uses ECR register, but
         * it doesn't support ECP or FIFO MODE
         */
        if (i == 1024) {
-               ECR_WRITE (pb, ECR_SPP << 5);
+               ECR_WRITE(pb, ECR_SPP << 5);
                return 0;
        }
 
        priv->fifo_depth = i;
        if (verbose_probing)
-               printk (KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
+               printk(KERN_DEBUG "0x%lx: FIFO is %d bytes\n", pb->base, i);
 
        /* Find out writeIntrThreshold */
-       frob_econtrol (pb, 1<<2, 1<<2);
-       frob_econtrol (pb, 1<<2, 0);
+       frob_econtrol(pb, 1<<2, 1<<2);
+       frob_econtrol(pb, 1<<2, 0);
        for (i = 1; i <= priv->fifo_depth; i++) {
-               inb (FIFO (pb));
-               udelay (50);
-               if (inb (ECONTROL (pb)) & (1<<2))
+               inb(FIFO(pb));
+               udelay(50);
+               if (inb(ECONTROL(pb)) & (1<<2))
                        break;
        }
 
        if (i <= priv->fifo_depth) {
                if (verbose_probing)
-                       printk (KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
+                       printk(KERN_DEBUG "0x%lx: writeIntrThreshold is %d\n",
                                pb->base, i);
        } else
                /* Number of bytes we know we can write if we get an
-                   interrupt. */
+                  interrupt. */
                i = 0;
 
        priv->writeIntrThreshold = i;
 
        /* Find out readIntrThreshold */
-       frob_set_mode (pb, ECR_PS2); /* Reset FIFO and enable PS2 */
-       parport_pc_data_reverse (pb); /* Must be in PS2 mode */
-       frob_set_mode (pb, ECR_TST); /* Test FIFO */
-       frob_econtrol (pb, 1<<2, 1<<2);
-       frob_econtrol (pb, 1<<2, 0);
+       frob_set_mode(pb, ECR_PS2); /* Reset FIFO and enable PS2 */
+       parport_pc_data_reverse(pb); /* Must be in PS2 mode */
+       frob_set_mode(pb, ECR_TST); /* Test FIFO */
+       frob_econtrol(pb, 1<<2, 1<<2);
+       frob_econtrol(pb, 1<<2, 0);
        for (i = 1; i <= priv->fifo_depth; i++) {
-               outb (0xaa, FIFO (pb));
-               if (inb (ECONTROL (pb)) & (1<<2))
+               outb(0xaa, FIFO(pb));
+               if (inb(ECONTROL(pb)) & (1<<2))
                        break;
        }
 
        if (i <= priv->fifo_depth) {
                if (verbose_probing)
-                       printk (KERN_INFO "0x%lx: readIntrThreshold is %d\n",
+                       printk(KERN_INFO "0x%lx: readIntrThreshold is %d\n",
                                pb->base, i);
        } else
                /* Number of bytes we can read if we get an interrupt. */
@@ -1842,23 +1907,23 @@ static int parport_ECP_supported(struct parport *pb)
 
        priv->readIntrThreshold = i;
 
-       ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-       ECR_WRITE (pb, 0xf4); /* Configuration mode */
-       config = inb (CONFIGA (pb));
+       ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+       ECR_WRITE(pb, 0xf4); /* Configuration mode */
+       config = inb(CONFIGA(pb));
        pword = (config >> 4) & 0x7;
        switch (pword) {
        case 0:
                pword = 2;
-               printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+               printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
                        pb->base);
                break;
        case 2:
                pword = 4;
-               printk (KERN_WARNING "0x%lx: Unsupported pword size!\n",
+               printk(KERN_WARNING "0x%lx: Unsupported pword size!\n",
                        pb->base);
                break;
        default:
-               printk (KERN_WARNING "0x%lx: Unknown implementation ID\n",
+               printk(KERN_WARNING "0x%lx: Unknown implementation ID\n",
                        pb->base);
                /* Assume 1 */
        case 1:
@@ -1867,28 +1932,29 @@ static int parport_ECP_supported(struct parport *pb)
        priv->pword = pword;
 
        if (verbose_probing) {
-               printk (KERN_DEBUG "0x%lx: PWord is %d bits\n", pb->base, 8 * pword);
-               
-               printk (KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
+               printk(KERN_DEBUG "0x%lx: PWord is %d bits\n",
+                       pb->base, 8 * pword);
+
+               printk(KERN_DEBUG "0x%lx: Interrupts are ISA-%s\n", pb->base,
                        config & 0x80 ? "Level" : "Pulses");
 
-               configb = inb (CONFIGB (pb));
-               printk (KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
+               configb = inb(CONFIGB(pb));
+               printk(KERN_DEBUG "0x%lx: ECP port cfgA=0x%02x cfgB=0x%02x\n",
                        pb->base, config, configb);
-               printk (KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
-               if ((configb >>3) & 0x07)
-                       printk("%d",intrline[(configb >>3) & 0x07]);
+               printk(KERN_DEBUG "0x%lx: ECP settings irq=", pb->base);
+               if ((configb >> 3) & 0x07)
+                       printk("%d", intrline[(configb >> 3) & 0x07]);
                else
                        printk("<none or set by other means>");
-               printk (" dma=");
-               if( (configb & 0x03 ) == 0x00)
+               printk(" dma=");
+               if ((configb & 0x03) == 0x00)
                        printk("<none or set by other means>\n");
                else
-                       printk("%d\n",configb & 0x07);
+                       printk("%d\n", configb & 0x07);
        }
 
        /* Go back to mode 000 */
-       frob_set_mode (pb, ECR_SPP);
+       frob_set_mode(pb, ECR_SPP);
 
        return 1;
 }
@@ -1903,10 +1969,10 @@ static int parport_ECPPS2_supported(struct parport *pb)
        if (!priv->ecr)
                return 0;
 
-       oecr = inb (ECONTROL (pb));
-       ECR_WRITE (pb, ECR_PS2 << 5);
+       oecr = inb(ECONTROL(pb));
+       ECR_WRITE(pb, ECR_PS2 << 5);
        result = parport_PS2_supported(pb);
-       ECR_WRITE (pb, oecr);
+       ECR_WRITE(pb, oecr);
        return result;
 }
 
@@ -1930,16 +1996,15 @@ static int parport_EPP_supported(struct parport *pb)
         */
 
        /* If EPP timeout bit clear then EPP available */
-       if (!clear_epp_timeout(pb)) {
+       if (!clear_epp_timeout(pb))
                return 0;  /* No way to clear timeout */
-       }
 
        /* Check for Intel bug. */
        if (priv->ecr) {
                unsigned char i;
                for (i = 0x00; i < 0x80; i += 0x20) {
-                       ECR_WRITE (pb, i);
-                       if (clear_epp_timeout (pb)) {
+                       ECR_WRITE(pb, i);
+                       if (clear_epp_timeout(pb)) {
                                /* Phony EPP in ECP. */
                                return 0;
                        }
@@ -1963,17 +2028,16 @@ static int parport_ECPEPP_supported(struct parport *pb)
        int result;
        unsigned char oecr;
 
-       if (!priv->ecr) {
+       if (!priv->ecr)
                return 0;
-       }
 
-       oecr = inb (ECONTROL (pb));
+       oecr = inb(ECONTROL(pb));
        /* Search for SMC style EPP+ECP mode */
-       ECR_WRITE (pb, 0x80);
-       outb (0x04, CONTROL (pb));
+       ECR_WRITE(pb, 0x80);
+       outb(0x04, CONTROL(pb));
        result = parport_EPP_supported(pb);
 
-       ECR_WRITE (pb, oecr);
+       ECR_WRITE(pb, oecr);
 
        if (result) {
                /* Set up access functions to use ECP+EPP hardware. */
@@ -1991,11 +2055,25 @@ static int parport_ECPEPP_supported(struct parport *pb)
 /* Don't bother probing for modes we know we won't use. */
 static int __devinit parport_PS2_supported(struct parport *pb) { return 0; }
 #ifdef CONFIG_PARPORT_PC_FIFO
-static int parport_ECP_supported(struct parport *pb) { return 0; }
+static int parport_ECP_supported(struct parport *pb)
+{
+       return 0;
+}
 #endif
-static int __devinit parport_EPP_supported(struct parport *pb) { return 0; }
-static int __devinit parport_ECPEPP_supported(struct parport *pb){return 0;}
-static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
+static int __devinit parport_EPP_supported(struct parport *pb)
+{
+       return 0;
+}
+
+static int __devinit parport_ECPEPP_supported(struct parport *pb)
+{
+       return 0;
+}
+
+static int __devinit parport_ECPPS2_supported(struct parport *pb)
+{
+       return 0;
+}
 
 #endif /* No IEEE 1284 support */
 
@@ -2005,17 +2083,17 @@ static int __devinit parport_ECPPS2_supported(struct parport *pb){return 0;}
 static int programmable_irq_support(struct parport *pb)
 {
        int irq, intrLine;
-       unsigned char oecr = inb (ECONTROL (pb));
+       unsigned char oecr = inb(ECONTROL(pb));
        static const int lookup[8] = {
                PARPORT_IRQ_NONE, 7, 9, 10, 11, 14, 15, 5
        };
 
-       ECR_WRITE (pb, ECR_CNF << 5); /* Configuration MODE */
+       ECR_WRITE(pb, ECR_CNF << 5); /* Configuration MODE */
 
-       intrLine = (inb (CONFIGB (pb)) >> 3) & 0x07;
+       intrLine = (inb(CONFIGB(pb)) >> 3) & 0x07;
        irq = lookup[intrLine];
 
-       ECR_WRITE (pb, oecr);
+       ECR_WRITE(pb, oecr);
        return irq;
 }
 
@@ -2025,17 +2103,17 @@ static int irq_probe_ECP(struct parport *pb)
        unsigned long irqs;
 
        irqs = probe_irq_on();
-               
-       ECR_WRITE (pb, ECR_SPP << 5); /* Reset FIFO */
-       ECR_WRITE (pb, (ECR_TST << 5) | 0x04);
-       ECR_WRITE (pb, ECR_TST << 5);
+
+       ECR_WRITE(pb, ECR_SPP << 5); /* Reset FIFO */
+       ECR_WRITE(pb, (ECR_TST << 5) | 0x04);
+       ECR_WRITE(pb, ECR_TST << 5);
 
        /* If Full FIFO sure that writeIntrThreshold is generated */
-       for (i=0; i < 1024 && !(inb (ECONTROL (pb)) & 0x02) ; i++) 
-               outb (0xaa, FIFO (pb));
-               
+       for (i = 0; i < 1024 && !(inb(ECONTROL(pb)) & 0x02) ; i++)
+               outb(0xaa, FIFO(pb));
+
        pb->irq = probe_irq_off(irqs);
-       ECR_WRITE (pb, ECR_SPP << 5);
+       ECR_WRITE(pb, ECR_SPP << 5);
 
        if (pb->irq <= 0)
                pb->irq = PARPORT_IRQ_NONE;
@@ -2045,7 +2123,7 @@ static int irq_probe_ECP(struct parport *pb)
 
 /*
  * This detection seems that only works in National Semiconductors
- * This doesn't work in SMC, LGS, and Winbond 
+ * This doesn't work in SMC, LGS, and Winbond
  */
 static int irq_probe_EPP(struct parport *pb)
 {
@@ -2056,16 +2134,16 @@ static int irq_probe_EPP(struct parport *pb)
        unsigned char oecr;
 
        if (pb->modes & PARPORT_MODE_PCECR)
-               oecr = inb (ECONTROL (pb));
+               oecr = inb(ECONTROL(pb));
 
        irqs = probe_irq_on();
 
        if (pb->modes & PARPORT_MODE_PCECR)
-               frob_econtrol (pb, 0x10, 0x10);
-       
+               frob_econtrol(pb, 0x10, 0x10);
+
        clear_epp_timeout(pb);
-       parport_pc_frob_control (pb, 0x20, 0x20);
-       parport_pc_frob_control (pb, 0x10, 0x10);
+       parport_pc_frob_control(pb, 0x20, 0x20);
+       parport_pc_frob_control(pb, 0x10, 0x10);
        clear_epp_timeout(pb);
 
        /* Device isn't expecting an EPP read
@@ -2074,9 +2152,9 @@ static int irq_probe_EPP(struct parport *pb)
        parport_pc_read_epp(pb);
        udelay(20);
 
-       pb->irq = probe_irq_off (irqs);
+       pb->irq = probe_irq_off(irqs);
        if (pb->modes & PARPORT_MODE_PCECR)
-               ECR_WRITE (pb, oecr);
+               ECR_WRITE(pb, oecr);
        parport_pc_write_control(pb, 0xc);
 
        if (pb->irq <= 0)
@@ -2133,28 +2211,28 @@ static int parport_irq_probe(struct parport *pb)
 /* --- DMA detection -------------------------------------- */
 
 /* Only if chipset conforms to ECP ISA Interface Standard */
-static int programmable_dma_support (struct parport *p)
+static int programmable_dma_support(struct parport *p)
 {
-       unsigned char oecr = inb (ECONTROL (p));
+       unsigned char oecr = inb(ECONTROL(p));
        int dma;
 
-       frob_set_mode (p, ECR_CNF);
-       
-       dma = inb (CONFIGB(p)) & 0x07;
+       frob_set_mode(p, ECR_CNF);
+
+       dma = inb(CONFIGB(p)) & 0x07;
        /* 000: Indicates jumpered 8-bit DMA if read-only.
           100: Indicates jumpered 16-bit DMA if read-only. */
        if ((dma & 0x03) == 0)
                dma = PARPORT_DMA_NONE;
 
-       ECR_WRITE (p, oecr);
+       ECR_WRITE(p, oecr);
        return dma;
 }
 
-static int parport_dma_probe (struct parport *p)
+static int parport_dma_probe(struct parport *p)
 {
        const struct parport_pc_private *priv = p->private_data;
-       if (priv->ecr)
-               p->dma = programmable_dma_support(p); /* ask ECP chipset first */
+       if (priv->ecr)          /* ask ECP chipset first */
+               p->dma = programmable_dma_support(p);
        if (p->dma == PARPORT_DMA_NONE) {
                /* ask known Super-IO chips proper, although these
                   claim ECP compatible, some don't report their DMA
@@ -2212,7 +2290,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
        if (!base_res)
                goto out4;
 
-       memcpy(ops, &parport_pc_ops, sizeof (struct parport_operations));
+       memcpy(ops, &parport_pc_ops, sizeof(struct parport_operations));
        priv->ctr = 0xc;
        priv->ctr_writable = ~0x10;
        priv->ecr = 0;
@@ -2239,7 +2317,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
                        if (!parport_EPP_supported(p))
                                parport_ECPEPP_supported(p);
        }
-       if (!parport_SPP_supported (p))
+       if (!parport_SPP_supported(p))
                /* No port. */
                goto out5;
        if (priv->ecr)
@@ -2247,7 +2325,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
        else
                parport_PS2_supported(p);
 
-       p->size = (p->modes & PARPORT_MODE_EPP)?8:3;
+       p->size = (p->modes & PARPORT_MODE_EPP) ? 8 : 3;
 
        printk(KERN_INFO "%s: PC-style at 0x%lx", p->name, p->base);
        if (p->base_hi && priv->ecr)
@@ -2271,7 +2349,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
                }
        }
        if (p->dma == PARPORT_DMA_AUTO) /* To use DMA, giving the irq
-                                           is mandatory (see above) */
+                                          is mandatory (see above) */
                p->dma = PARPORT_DMA_NONE;
 
 #ifdef CONFIG_PARPORT_PC_FIFO
@@ -2288,16 +2366,23 @@ struct parport *parport_pc_probe_port(unsigned long int base,
                if (p->dma != PARPORT_DMA_NONE) {
                        printk(", dma %d", p->dma);
                        p->modes |= PARPORT_MODE_DMA;
-               }
-               else printk(", using FIFO");
-       }
-       else
+               } else
+                       printk(", using FIFO");
+       } else
                /* We can't use the DMA channel after all. */
                p->dma = PARPORT_DMA_NONE;
 #endif /* Allowed to use FIFO/DMA */
 
        printk(" [");
-#define printmode(x) {if(p->modes&PARPORT_MODE_##x){printk("%s%s",f?",":"",#x);f++;}}
+
+#define printmode(x) \
+       {\
+               if (p->modes & PARPORT_MODE_##x) {\
+                       printk("%s%s", f ? "," : "", #x);\
+                       f++;\
+               } \
+       }
+
        {
                int f = 0;
                printmode(PCSPP);
@@ -2309,10 +2394,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
        }
 #undef printmode
 #ifndef CONFIG_PARPORT_1284
-       printk ("(,...)");
+       printk("(,...)");
 #endif /* CONFIG_PARPORT_1284 */
        printk("]\n");
-       if (probedirq != PARPORT_IRQ_NONE) 
+       if (probedirq != PARPORT_IRQ_NONE)
                printk(KERN_INFO "%s: irq %d detected\n", p->name, probedirq);
 
        /* If No ECP release the ports grabbed above. */
@@ -2328,7 +2413,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
        if (p->irq != PARPORT_IRQ_NONE) {
                if (request_irq(p->irq, parport_irq_handler,
                                 irqflags, p->name, p)) {
-                       printk (KERN_WARNING "%s: irq %d in use, "
+                       printk(KERN_WARNING "%s: irq %d in use, "
                                "resorting to polled operation\n",
                                p->name, p->irq);
                        p->irq = PARPORT_IRQ_NONE;
@@ -2338,8 +2423,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
 #ifdef CONFIG_PARPORT_PC_FIFO
 #ifdef HAS_DMA
                if (p->dma != PARPORT_DMA_NONE) {
-                       if (request_dma (p->dma, p->name)) {
-                               printk (KERN_WARNING "%s: dma %d in use, "
+                       if (request_dma(p->dma, p->name)) {
+                               printk(KERN_WARNING "%s: dma %d in use, "
                                        "resorting to PIO operation\n",
                                        p->name, p->dma);
                                p->dma = PARPORT_DMA_NONE;
@@ -2349,8 +2434,8 @@ struct parport *parport_pc_probe_port(unsigned long int base,
                                                       PAGE_SIZE,
                                                       &priv->dma_handle,
                                                       GFP_KERNEL);
-                               if (! priv->dma_buf) {
-                                       printk (KERN_WARNING "%s: "
+                               if (!priv->dma_buf) {
+                                       printk(KERN_WARNING "%s: "
                                                "cannot get buffer for DMA, "
                                                "resorting to PIO operation\n",
                                                p->name);
@@ -2369,10 +2454,10 @@ struct parport *parport_pc_probe_port(unsigned long int base,
                 * Put the ECP detected port in PS2 mode.
                 * Do this also for ports that have ECR but don't do ECP.
                 */
-               ECR_WRITE (p, 0x34);
+               ECR_WRITE(p, 0x34);
 
        parport_pc_write_data(p, 0);
-       parport_pc_data_forward (p);
+       parport_pc_data_forward(p);
 
        /* Now that we've told the sharing engine about the port, and
           found out its characteristics, let the high-level drivers
@@ -2380,7 +2465,7 @@ struct parport *parport_pc_probe_port(unsigned long int base,
        spin_lock(&ports_lock);
        list_add(&priv->list, &ports_list);
        spin_unlock(&ports_lock);
-       parport_announce_port (p);
+       parport_announce_port(p);
 
        return p;
 
@@ -2393,18 +2478,17 @@ out5:
 out4:
        parport_put_port(p);
 out3:
-       kfree (priv);
+       kfree(priv);
 out2:
-       kfree (ops);
+       kfree(ops);
 out1:
        if (pdev)
                platform_device_unregister(pdev);
        return NULL;
 }
+EXPORT_SYMBOL(parport_pc_probe_port);
 
-EXPORT_SYMBOL (parport_pc_probe_port);
-
-void parport_pc_unregister_port (struct parport *p)
+void parport_pc_unregister_port(struct parport *p)
 {
        struct parport_pc_private *priv = p->private_data;
        struct parport_operations *ops = p->ops;
@@ -2430,17 +2514,16 @@ void parport_pc_unregister_port (struct parport *p)
                                    priv->dma_buf,
                                    priv->dma_handle);
 #endif
-       kfree (p->private_data);
+       kfree(p->private_data);
        parport_put_port(p);
-       kfree (ops); /* hope no-one cached it */
+       kfree(ops); /* hope no-one cached it */
 }
-
-EXPORT_SYMBOL (parport_pc_unregister_port);
+EXPORT_SYMBOL(parport_pc_unregister_port);
 
 #ifdef CONFIG_PCI
 
 /* ITE support maintained by Rich Liu <richliu@poorman.org> */
-static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_ite_8872_probe(struct pci_dev *pdev, int autoirq,
                                         int autodma,
                                         const struct parport_pc_via_data *via)
 {
@@ -2452,73 +2535,74 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
        int irq;
        int i;
 
-       DPRINTK (KERN_DEBUG "sio_ite_8872_probe()\n");
-       
-       // make sure which one chip
-       for(i = 0; i < 5; i++) {
+       DPRINTK(KERN_DEBUG "sio_ite_8872_probe()\n");
+
+       /* make sure which one chip */
+       for (i = 0; i < 5; i++) {
                base_res = request_region(inta_addr[i], 32, "it887x");
                if (base_res) {
                        int test;
-                       pci_write_config_dword (pdev, 0x60,
+                       pci_write_config_dword(pdev, 0x60,
                                                0xe5000000 | inta_addr[i]);
-                       pci_write_config_dword (pdev, 0x78,
+                       pci_write_config_dword(pdev, 0x78,
                                                0x00000000 | inta_addr[i]);
-                       test = inb (inta_addr[i]);
-                       if (test != 0xff) break;
+                       test = inb(inta_addr[i]);
+                       if (test != 0xff)
+                               break;
                        release_region(inta_addr[i], 0x8);
                }
        }
-       if(i >= 5) {
-               printk (KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
+       if (i >= 5) {
+               printk(KERN_INFO "parport_pc: cannot find ITE8872 INTA\n");
                return 0;
        }
 
-       type = inb (inta_addr[i] + 0x18);
+       type = inb(inta_addr[i] + 0x18);
        type &= 0x0f;
 
        switch (type) {
        case 0x2:
-               printk (KERN_INFO "parport_pc: ITE8871 found (1P)\n");
+               printk(KERN_INFO "parport_pc: ITE8871 found (1P)\n");
                ite8872set = 0x64200000;
                break;
        case 0xa:
-               printk (KERN_INFO "parport_pc: ITE8875 found (1P)\n");
+               printk(KERN_INFO "parport_pc: ITE8875 found (1P)\n");
                ite8872set = 0x64200000;
                break;
        case 0xe:
-               printk (KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
+               printk(KERN_INFO "parport_pc: ITE8872 found (2S1P)\n");
                ite8872set = 0x64e00000;
                break;
        case 0x6:
-               printk (KERN_INFO "parport_pc: ITE8873 found (1S)\n");
+               printk(KERN_INFO "parport_pc: ITE8873 found (1S)\n");
                return 0;
        case 0x8:
-               DPRINTK (KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
+               DPRINTK(KERN_DEBUG "parport_pc: ITE8874 found (2S)\n");
                return 0;
        default:
-               printk (KERN_INFO "parport_pc: unknown ITE887x\n");
-               printk (KERN_INFO "parport_pc: please mail 'lspci -nvv' "
+               printk(KERN_INFO "parport_pc: unknown ITE887x\n");
+               printk(KERN_INFO "parport_pc: please mail 'lspci -nvv' "
                        "output to Rich.Liu@ite.com.tw\n");
                return 0;
        }
 
-       pci_read_config_byte (pdev, 0x3c, &ite8872_irq);
-       pci_read_config_dword (pdev, 0x1c, &ite8872_lpt);
+       pci_read_config_byte(pdev, 0x3c, &ite8872_irq);
+       pci_read_config_dword(pdev, 0x1c, &ite8872_lpt);
        ite8872_lpt &= 0x0000ff00;
-       pci_read_config_dword (pdev, 0x20, &ite8872_lpthi);
+       pci_read_config_dword(pdev, 0x20, &ite8872_lpthi);
        ite8872_lpthi &= 0x0000ff00;
-       pci_write_config_dword (pdev, 0x6c, 0xe3000000 | ite8872_lpt);
-       pci_write_config_dword (pdev, 0x70, 0xe3000000 | ite8872_lpthi);
-       pci_write_config_dword (pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
-       // SET SPP&EPP , Parallel Port NO DMA , Enable All Function
-       // SET Parallel IRQ
-       pci_write_config_dword (pdev, 0x9c,
+       pci_write_config_dword(pdev, 0x6c, 0xe3000000 | ite8872_lpt);
+       pci_write_config_dword(pdev, 0x70, 0xe3000000 | ite8872_lpthi);
+       pci_write_config_dword(pdev, 0x80, (ite8872_lpthi<<16) | ite8872_lpt);
+       /* SET SPP&EPP , Parallel Port NO DMA , Enable All Function */
+       /* SET Parallel IRQ */
+       pci_write_config_dword(pdev, 0x9c,
                                ite8872set | (ite8872_irq * 0x11111));
 
-       DPRINTK (KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
-       DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
+       DPRINTK(KERN_DEBUG "ITE887x: The IRQ is %d.\n", ite8872_irq);
+       DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O port is 0x%x.\n",
                 ite8872_lpt);
-       DPRINTK (KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
+       DPRINTK(KERN_DEBUG "ITE887x: The PARALLEL I/O porthi is 0x%x.\n",
                 ite8872_lpthi);
 
        /* Let the user (or defaults) steer us away from interrupts */
@@ -2530,14 +2614,14 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
         * Release the resource so that parport_pc_probe_port can get it.
         */
        release_resource(base_res);
-       if (parport_pc_probe_port (ite8872_lpt, ite8872_lpthi,
+       if (parport_pc_probe_port(ite8872_lpt, ite8872_lpthi,
                                   irq, PARPORT_DMA_NONE, &pdev->dev, 0)) {
-               printk (KERN_INFO
+               printk(KERN_INFO
                        "parport_pc: ITE 8872 parallel port: io=0x%X",
-                       ite8872_lpt);
+                                                               ite8872_lpt);
                if (irq != PARPORT_IRQ_NONE)
-                       printk (", irq=%d", irq);
-               printk ("\n");
+                       printk(", irq=%d", irq);
+               printk("\n");
                return 1;
        }
 
@@ -2546,7 +2630,7 @@ static int __devinit sio_ite_8872_probe (struct pci_dev *pdev, int autoirq,
 
 /* VIA 8231 support by Pavel Fedin <sonic_amiga@rambler.ru>
    based on VIA 686a support code by Jeff Garzik <jgarzik@pobox.com> */
-static int __devinitdata parport_init_mode = 0;
+static int __devinitdata parport_init_mode;
 
 /* Data for two known VIA chips */
 static struct parport_pc_via_data via_686a_data __devinitdata = {
@@ -2568,7 +2652,7 @@ static struct parport_pc_via_data via_8231_data __devinitdata = {
        0xF6
 };
 
-static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
+static int __devinit sio_via_probe(struct pci_dev *pdev, int autoirq,
                                    int autodma,
                                    const struct parport_pc_via_data *via)
 {
@@ -2580,38 +2664,38 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
        printk(KERN_DEBUG "parport_pc: VIA 686A/8231 detected\n");
 
-       switch(parport_init_mode)
-       {
+       switch (parport_init_mode) {
        case 1:
-           printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
-           siofunc = VIA_FUNCTION_PARPORT_SPP;
-           break;
+               printk(KERN_DEBUG "parport_pc: setting SPP mode\n");
+               siofunc = VIA_FUNCTION_PARPORT_SPP;
+               break;
        case 2:
-           printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
-           siofunc = VIA_FUNCTION_PARPORT_SPP;
-           ppcontrol = VIA_PARPORT_BIDIR;
-           break;
+               printk(KERN_DEBUG "parport_pc: setting PS/2 mode\n");
+               siofunc = VIA_FUNCTION_PARPORT_SPP;
+               ppcontrol = VIA_PARPORT_BIDIR;
+               break;
        case 3:
-           printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
-           siofunc = VIA_FUNCTION_PARPORT_EPP;
-           ppcontrol = VIA_PARPORT_BIDIR;
-           have_epp = 1;
-           break;
+               printk(KERN_DEBUG "parport_pc: setting EPP mode\n");
+               siofunc = VIA_FUNCTION_PARPORT_EPP;
+               ppcontrol = VIA_PARPORT_BIDIR;
+               have_epp = 1;
+               break;
        case 4:
-           printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
-           siofunc = VIA_FUNCTION_PARPORT_ECP;
-           ppcontrol = VIA_PARPORT_BIDIR;
-           break;
+               printk(KERN_DEBUG "parport_pc: setting ECP mode\n");
+               siofunc = VIA_FUNCTION_PARPORT_ECP;
+               ppcontrol = VIA_PARPORT_BIDIR;
+               break;
        case 5:
-           printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
-           siofunc = VIA_FUNCTION_PARPORT_ECP;
-           ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
-           have_epp = 1;
-           break;
-        default:
-           printk(KERN_DEBUG "parport_pc: probing current configuration\n");
-           siofunc = VIA_FUNCTION_PROBE;
-           break;
+               printk(KERN_DEBUG "parport_pc: setting EPP+ECP mode\n");
+               siofunc = VIA_FUNCTION_PARPORT_ECP;
+               ppcontrol = VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP;
+               have_epp = 1;
+               break;
+       default:
+               printk(KERN_DEBUG
+                       "parport_pc: probing current configuration\n");
+               siofunc = VIA_FUNCTION_PROBE;
+               break;
        }
        /*
         * unlock super i/o configuration
@@ -2622,38 +2706,36 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
        /* Bits 1-0: Parallel Port Mode / Enable */
        outb(via->viacfg_function, VIA_CONFIG_INDEX);
-       tmp = inb (VIA_CONFIG_DATA);
+       tmp = inb(VIA_CONFIG_DATA);
        /* Bit 5: EPP+ECP enable; bit 7: PS/2 bidirectional port enable */
        outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-       tmp2 = inb (VIA_CONFIG_DATA);
-       if (siofunc == VIA_FUNCTION_PROBE)
-       {
-           siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
-           ppcontrol = tmp2;
+       tmp2 = inb(VIA_CONFIG_DATA);
+       if (siofunc == VIA_FUNCTION_PROBE) {
+               siofunc = tmp & VIA_FUNCTION_PARPORT_DISABLE;
+               ppcontrol = tmp2;
+       } else {
+               tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
+               tmp |= siofunc;
+               outb(via->viacfg_function, VIA_CONFIG_INDEX);
+               outb(tmp, VIA_CONFIG_DATA);
+               tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
+               tmp2 |= ppcontrol;
+               outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
+               outb(tmp2, VIA_CONFIG_DATA);
        }
-       else
-       {
-           tmp &= ~VIA_FUNCTION_PARPORT_DISABLE;
-           tmp |= siofunc;
-           outb(via->viacfg_function, VIA_CONFIG_INDEX);
-           outb(tmp, VIA_CONFIG_DATA);
-           tmp2 &= ~(VIA_PARPORT_BIDIR|VIA_PARPORT_ECPEPP);
-           tmp2 |= ppcontrol;
-           outb(via->viacfg_parport_control, VIA_CONFIG_INDEX);
-           outb(tmp2, VIA_CONFIG_DATA);
-       }
-       
+
        /* Parallel Port I/O Base Address, bits 9-2 */
        outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
        port1 = inb(VIA_CONFIG_DATA) << 2;
-       
-       printk (KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",port1);
-       if ((port1 == 0x3BC) && have_epp)
-       {
-           outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
-           outb((0x378 >> 2), VIA_CONFIG_DATA);
-           printk(KERN_DEBUG "parport_pc: Parallel port base changed to 0x378\n");
-           port1 = 0x378;
+
+       printk(KERN_DEBUG "parport_pc: Current parallel port base: 0x%X\n",
+                                                                       port1);
+       if (port1 == 0x3BC && have_epp) {
+               outb(via->viacfg_parport_base, VIA_CONFIG_INDEX);
+               outb((0x378 >> 2), VIA_CONFIG_DATA);
+               printk(KERN_DEBUG
+                       "parport_pc: Parallel port base changed to 0x378\n");
+               port1 = 0x378;
        }
 
        /*
@@ -2667,36 +2749,39 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
                printk(KERN_INFO "parport_pc: VIA parallel port disabled in BIOS\n");
                return 0;
        }
-       
+
        /* Bits 7-4: PnP Routing for Parallel Port IRQ */
        pci_read_config_byte(pdev, via->via_pci_parport_irq_reg, &tmp);
        irq = ((tmp & VIA_IRQCONTROL_PARALLEL) >> 4);
 
-       if (siofunc == VIA_FUNCTION_PARPORT_ECP)
-       {
-           /* Bits 3-2: PnP Routing for Parallel Port DMA */
-           pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
-           dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
-       }
-       else
-           /* if ECP not enabled, DMA is not enabled, assumed bogus 'dma' value */
-           dma = PARPORT_DMA_NONE;
+       if (siofunc == VIA_FUNCTION_PARPORT_ECP) {
+               /* Bits 3-2: PnP Routing for Parallel Port DMA */
+               pci_read_config_byte(pdev, via->via_pci_parport_dma_reg, &tmp);
+               dma = ((tmp & VIA_DMACONTROL_PARALLEL) >> 2);
+       } else
+               /* if ECP not enabled, DMA is not enabled, assumed
+                  bogus 'dma' value */
+               dma = PARPORT_DMA_NONE;
 
        /* Let the user (or defaults) steer us away from interrupts and DMA */
        if (autoirq == PARPORT_IRQ_NONE) {
-           irq = PARPORT_IRQ_NONE;
-           dma = PARPORT_DMA_NONE;
+               irq = PARPORT_IRQ_NONE;
+               dma = PARPORT_DMA_NONE;
        }
        if (autodma == PARPORT_DMA_NONE)
-           dma = PARPORT_DMA_NONE;
+               dma = PARPORT_DMA_NONE;
 
        switch (port1) {
-       case 0x3bc: port2 = 0x7bc; break;
-       case 0x378: port2 = 0x778; break;
-       case 0x278: port2 = 0x678; break;
+       case 0x3bc:
+               port2 = 0x7bc; break;
+       case 0x378:
+               port2 = 0x778; break;
+       case 0x278:
+               port2 = 0x678; break;
        default:
-               printk(KERN_INFO "parport_pc: Weird VIA parport base 0x%X, ignoring\n",
-                       port1);
+               printk(KERN_INFO
+                       "parport_pc: Weird VIA parport base 0x%X, ignoring\n",
+                                                                       port1);
                return 0;
        }
 
@@ -2714,17 +2799,17 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
        }
 
        /* finally, do the probe with values obtained */
-       if (parport_pc_probe_port (port1, port2, irq, dma, &pdev->dev, 0)) {
-               printk (KERN_INFO
+       if (parport_pc_probe_port(port1, port2, irq, dma, &pdev->dev, 0)) {
+               printk(KERN_INFO
                        "parport_pc: VIA parallel port: io=0x%X", port1);
                if (irq != PARPORT_IRQ_NONE)
-                       printk (", irq=%d", irq);
+                       printk(", irq=%d", irq);
                if (dma != PARPORT_DMA_NONE)
-                       printk (", dma=%d", dma);
-               printk ("\n");
+                       printk(", dma=%d", dma);
+               printk("\n");
                return 1;
        }
-       
+
        printk(KERN_WARNING "parport_pc: Strange, can't probe VIA parallel port: io=0x%X, irq=%d, dma=%d\n",
                port1, irq, dma);
        return 0;
@@ -2732,8 +2817,8 @@ static int __devinit sio_via_probe (struct pci_dev *pdev, int autoirq,
 
 
 enum parport_pc_sio_types {
-       sio_via_686a = 0,       /* Via VT82C686A motherboard Super I/O */
-       sio_via_8231,           /* Via VT8231 south bridge integrated Super IO */
+       sio_via_686a = 0,   /* Via VT82C686A motherboard Super I/O */
+       sio_via_8231,       /* Via VT8231 south bridge integrated Super IO */
        sio_ite_8872,
        last_sio
 };
@@ -2804,15 +2889,15 @@ enum parport_pc_pci_cards {
 };
 
 
-/* each element directly indexed from enum list, above 
+/* each element directly indexed from enum list, above
  * (but offset by last_sio) */
 static struct parport_pc_pci {
        int numports;
        struct { /* BAR (base address registers) numbers in the config
-                    space header */
+                   space header */
                int lo;
-               int hi; /* -1 if not there, >6 for offset-method (max
-                           BAR is 6) */
+               int hi;
+               /* -1 if not there, >6 for offset-method (max BAR is 6) */
        } addr[4];
 
        /* If set, this is called immediately after pci_enable_device.
@@ -2857,7 +2942,7 @@ static struct parport_pc_pci {
        /* timedia_4018  */             { 2, { { 0, 1 }, { 2, 3 }, } },
        /* timedia_9018a */             { 2, { { 0, 1 }, { 2, 3 }, } },
                                        /* SYBA uses fixed offsets in
-                                           a 1K io window */
+                                          a 1K io window */
        /* syba_2p_epp AP138B */        { 2, { { 0, 0x078 }, { 0, 0x178 }, } },
        /* syba_1p_ecp W83787 */        { 1, { { 0, 0x078 }, } },
        /* titan_010l */                { 1, { { 3, -1 }, } },
@@ -2873,11 +2958,14 @@ static struct parport_pc_pci {
        /* oxsemi_pcie_pport */         { 1, { { 0, 1 }, } },
        /* aks_0100 */                  { 1, { { 0, -1 }, } },
        /* mobility_pp */               { 1, { { 0, 1 }, } },
-       /* netmos_9705 */               { 1, { { 0, -1 }, } }, /* untested */
-        /* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-        /* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} }, /* untested */
-       /* netmos_9805 */               { 1, { { 0, -1 }, } }, /* untested */
-       /* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } }, /* untested */
+
+       /* The netmos entries below are untested */
+       /* netmos_9705 */               { 1, { { 0, -1 }, } },
+       /* netmos_9715 */               { 2, { { 0, 1 }, { 2, 3 },} },
+       /* netmos_9755 */               { 2, { { 0, 1 }, { 2, 3 },} },
+       /* netmos_9805 */               { 1, { { 0, -1 }, } },
+       /* netmos_9815 */               { 2, { { 0, -1 }, { 2, -1 }, } },
+
        /* quatech_sppxp100 */          { 1, { { 0, 1 }, } },
 };
 
@@ -2906,7 +2994,7 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
        { PCI_VENDOR_ID_LAVA, PCI_DEVICE_ID_LAVA_BOCA_IOPPAR,
          PCI_ANY_ID, PCI_ANY_ID, 0, 0, boca_ioppar },
        { PCI_VENDOR_ID_PLX, PCI_DEVICE_ID_PLX_9050,
-         PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0,0, plx_9050 },
+         PCI_SUBVENDOR_ID_EXSYS, PCI_SUBDEVICE_ID_EXSYS_4014, 0, 0, plx_9050 },
        /* PCI_VENDOR_ID_TIMEDIA/SUNIX has many differing cards ...*/
        { 0x1409, 0x7168, 0x1409, 0x4078, 0, 0, timedia_4078a },
        { 0x1409, 0x7168, 0x1409, 0x4079, 0, 0, timedia_4079h },
@@ -2940,7 +3028,8 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
        { 0x9710, 0x9805, 0x1000, 0x0010, 0, 0, titan_1284p1 },
        { 0x9710, 0x9815, 0x1000, 0x0020, 0, 0, titan_1284p2 },
        /* PCI_VENDOR_ID_AVLAB/Intek21 has another bunch of cards ...*/
-       { 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p}, /* AFAVLAB_TK9902 */
+       /* AFAVLAB_TK9902 */
+       { 0x14db, 0x2120, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_1p},
        { 0x14db, 0x2121, PCI_ANY_ID, PCI_ANY_ID, 0, 0, avlab_2p},
        { PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI952PP,
          PCI_ANY_ID, PCI_ANY_ID, 0, 0, oxsemi_952 },
@@ -2983,14 +3072,14 @@ static const struct pci_device_id parport_pc_pci_tbl[] = {
          PCI_ANY_ID, PCI_ANY_ID, 0, 0, quatech_sppxp100 },
        { 0, } /* terminate list */
 };
-MODULE_DEVICE_TABLE(pci,parport_pc_pci_tbl);
+MODULE_DEVICE_TABLE(pci, parport_pc_pci_tbl);
 
 struct pci_parport_data {
        int num;
        struct parport *ports[2];
 };
 
-static int parport_pc_pci_probe (struct pci_dev *dev,
+static int parport_pc_pci_probe(struct pci_dev *dev,
                                           const struct pci_device_id *id)
 {
        int err, count, n, i = id->driver_data;
@@ -3003,7 +3092,8 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
        /* This is a PCI card */
        i -= last_sio;
        count = 0;
-       if ((err = pci_enable_device (dev)) != 0)
+       err = pci_enable_device(dev);
+       if (err)
                return err;
 
        data = kmalloc(sizeof(struct pci_parport_data), GFP_KERNEL);
@@ -3011,7 +3101,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
                return -ENOMEM;
 
        if (cards[i].preinit_hook &&
-           cards[i].preinit_hook (dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
+           cards[i].preinit_hook(dev, PARPORT_IRQ_NONE, PARPORT_DMA_NONE)) {
                kfree(data);
                return -ENODEV;
        }
@@ -3021,25 +3111,25 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
                int hi = cards[i].addr[n].hi;
                int irq;
                unsigned long io_lo, io_hi;
-               io_lo = pci_resource_start (dev, lo);
+               io_lo = pci_resource_start(dev, lo);
                io_hi = 0;
                if ((hi >= 0) && (hi <= 6))
-                       io_hi = pci_resource_start (dev, hi);
+                       io_hi = pci_resource_start(dev, hi);
                else if (hi > 6)
                        io_lo += hi; /* Reinterpret the meaning of
-                                        "hi" as an offset (see SYBA
-                                        def.) */
+                                       "hi" as an offset (see SYBA
+                                       def.) */
                /* TODO: test if sharing interrupts works */
                irq = dev->irq;
                if (irq == IRQ_NONE) {
-                       printk (KERN_DEBUG
+                       printk(KERN_DEBUG
        "PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx)\n",
                                parport_pc_pci_tbl[i + last_sio].vendor,
                                parport_pc_pci_tbl[i + last_sio].device,
                                io_lo, io_hi);
                        irq = PARPORT_IRQ_NONE;
                } else {
-                       printk (KERN_DEBUG
+                       printk(KERN_DEBUG
        "PCI parallel port detected: %04x:%04x, I/O at %#lx(%#lx), IRQ %d\n",
                                parport_pc_pci_tbl[i + last_sio].vendor,
                                parport_pc_pci_tbl[i + last_sio].device,
@@ -3056,7 +3146,7 @@ static int parport_pc_pci_probe (struct pci_dev *dev,
        data->num = count;
 
        if (cards[i].postinit_hook)
-               cards[i].postinit_hook (dev, count == 0);
+               cards[i].postinit_hook(dev, count == 0);
 
        if (count) {
                pci_set_drvdata(dev, data);
@@ -3090,7 +3180,7 @@ static struct pci_driver parport_pc_pci_driver = {
        .remove         = __devexit_p(parport_pc_pci_remove),
 };
 
-static int __init parport_pc_init_superio (int autoirq, int autodma)
+static int __init parport_pc_init_superio(int autoirq, int autodma)
 {
        const struct pci_device_id *id;
        struct pci_dev *pdev = NULL;
@@ -3101,8 +3191,9 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
                if (id == NULL || id->driver_data >= last_sio)
                        continue;
 
-               if (parport_pc_superio_info[id->driver_data].probe
-                       (pdev, autoirq, autodma,parport_pc_superio_info[id->driver_data].via)) {
+               if (parport_pc_superio_info[id->driver_data].probe(
+                       pdev, autoirq, autodma,
+                       parport_pc_superio_info[id->driver_data].via)) {
                        ret++;
                }
        }
@@ -3111,7 +3202,10 @@ static int __init parport_pc_init_superio (int autoirq, int autodma)
 }
 #else
 static struct pci_driver parport_pc_pci_driver;
-static int __init parport_pc_init_superio(int autoirq, int autodma) {return 0;}
+static int __init parport_pc_init_superio(int autoirq, int autodma)
+{
+       return 0;
+}
 #endif /* CONFIG_PCI */
 
 #ifdef CONFIG_PNP
@@ -3124,44 +3218,45 @@ static const struct pnp_device_id parport_pc_pnp_tbl[] = {
        { }
 };
 
-MODULE_DEVICE_TABLE(pnp,parport_pc_pnp_tbl);
+MODULE_DEVICE_TABLE(pnp, parport_pc_pnp_tbl);
 
-static int parport_pc_pnp_probe(struct pnp_dev *dev, const struct pnp_device_id *id)
+static int parport_pc_pnp_probe(struct pnp_dev *dev,
+                                               const struct pnp_device_id *id)
 {
        struct parport *pdata;
        unsigned long io_lo, io_hi;
        int dma, irq;
 
-       if (pnp_port_valid(dev,0) &&
-               !(pnp_port_flags(dev,0) & IORESOURCE_DISABLED)) {
-               io_lo = pnp_port_start(dev,0);
+       if (pnp_port_valid(dev, 0) &&
+               !(pnp_port_flags(dev, 0) & IORESOURCE_DISABLED)) {
+               io_lo = pnp_port_start(dev, 0);
        } else
                return -EINVAL;
 
-       if (pnp_port_valid(dev,1) &&
-               !(pnp_port_flags(dev,1) & IORESOURCE_DISABLED)) {
-               io_hi = pnp_port_start(dev,1);
+       if (pnp_port_valid(dev, 1) &&
+               !(pnp_port_flags(dev, 1) & IORESOURCE_DISABLED)) {
+               io_hi = pnp_port_start(dev, 1);
        } else
                io_hi = 0;
 
-       if (pnp_irq_valid(dev,0) &&
-               !(pnp_irq_flags(dev,0) & IORESOURCE_DISABLED)) {
-               irq = pnp_irq(dev,0);
+       if (pnp_irq_valid(dev, 0) &&
+               !(pnp_irq_flags(dev, 0) & IORESOURCE_DISABLED)) {
+               irq = pnp_irq(dev, 0);
        } else
                irq = PARPORT_IRQ_NONE;
 
-       if (pnp_dma_valid(dev,0) &&
-               !(pnp_dma_flags(dev,0) & IORESOURCE_DISABLED)) {
-               dma = pnp_dma(dev,0);
+       if (pnp_dma_valid(dev, 0) &&
+               !(pnp_dma_flags(dev, 0) & IORESOURCE_DISABLED)) {
+               dma = pnp_dma(dev, 0);
        } else
                dma = PARPORT_DMA_NONE;
 
        dev_info(&dev->dev, "reported by %s\n", dev->protocol->name);
-       if (!(pdata = parport_pc_probe_port(io_lo, io_hi,
-                                       irq, dma, &dev->dev, 0)))
+       pdata = parport_pc_probe_port(io_lo, io_hi, irq, dma, &dev->dev, 0);
+       if (pdata == NULL)
                return -ENODEV;
 
-       pnp_set_drvdata(dev,pdata);
+       pnp_set_drvdata(dev, pdata);
        return 0;
 }
 
@@ -3203,7 +3298,7 @@ static struct platform_driver parport_pc_platform_driver = {
 
 /* This is called by parport_pc_find_nonpci_ports (in asm/parport.h) */
 static int __devinit __attribute__((unused))
-parport_pc_find_isa_ports (int autoirq, int autodma)
+parport_pc_find_isa_ports(int autoirq, int autodma)
 {
        int count = 0;
 
@@ -3227,7 +3322,7 @@ parport_pc_find_isa_ports (int autoirq, int autodma)
  * autoirq is PARPORT_IRQ_NONE, PARPORT_IRQ_AUTO, or PARPORT_IRQ_PROBEONLY
  * autodma is PARPORT_DMA_NONE or PARPORT_DMA_AUTO
  */
-static void __init parport_pc_find_ports (int autoirq, int autodma)
+static void __init parport_pc_find_ports(int autoirq, int autodma)
 {
        int count = 0, err;
 
@@ -3261,11 +3356,18 @@ static void __init parport_pc_find_ports (int autoirq, int autodma)
  *     syntax and keep in mind that code below is a cleaned up version.
  */
 
-static int __initdata io[PARPORT_PC_MAX_PORTS+1] = { [0 ... PARPORT_PC_MAX_PORTS] = 0 };
-static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] =
-       { [0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO };
-static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE };
-static int __initdata irqval[PARPORT_PC_MAX_PORTS] = { [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY };
+static int __initdata io[PARPORT_PC_MAX_PORTS+1] = {
+       [0 ... PARPORT_PC_MAX_PORTS] = 0
+};
+static int __initdata io_hi[PARPORT_PC_MAX_PORTS+1] = {
+       [0 ... PARPORT_PC_MAX_PORTS] = PARPORT_IOHI_AUTO
+};
+static int __initdata dmaval[PARPORT_PC_MAX_PORTS] = {
+       [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_DMA_NONE
+};
+static int __initdata irqval[PARPORT_PC_MAX_PORTS] = {
+       [0 ... PARPORT_PC_MAX_PORTS-1] = PARPORT_IRQ_PROBEONLY
+};
 
 static int __init parport_parse_param(const char *s, int *val,
                                int automatic, int none, int nofifo)
@@ -3306,18 +3408,19 @@ static int __init parport_parse_dma(const char *dmastr, int *val)
 #ifdef CONFIG_PCI
 static int __init parport_init_mode_setup(char *str)
 {
-       printk(KERN_DEBUG "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
-
-       if (!strcmp (str, "spp"))
-               parport_init_mode=1;
-       if (!strcmp (str, "ps2"))
-               parport_init_mode=2;
-       if (!strcmp (str, "epp"))
-               parport_init_mode=3;
-       if (!strcmp (str, "ecp"))
-               parport_init_mode=4;
-       if (!strcmp (str, "ecpepp"))
-               parport_init_mode=5;
+       printk(KERN_DEBUG
+            "parport_pc.c: Specified parameter parport_init_mode=%s\n", str);
+
+       if (!strcmp(str, "spp"))
+               parport_init_mode = 1;
+       if (!strcmp(str, "ps2"))
+               parport_init_mode = 2;
+       if (!strcmp(str, "epp"))
+               parport_init_mode = 3;
+       if (!strcmp(str, "ecp"))
+               parport_init_mode = 4;
+       if (!strcmp(str, "ecpepp"))
+               parport_init_mode = 5;
        return 1;
 }
 #endif
@@ -3341,7 +3444,8 @@ module_param(verbose_probing, int, 0644);
 #endif
 #ifdef CONFIG_PCI
 static char *init_mode;
-MODULE_PARM_DESC(init_mode, "Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
+MODULE_PARM_DESC(init_mode,
+       "Initialise mode for VIA VT8231 port (spp, ps2, epp, ecp or ecpepp)");
 module_param(init_mode, charp, 0);
 #endif
 
@@ -3372,7 +3476,7 @@ static int __init parse_parport_params(void)
                                irqval[0] = val;
                                break;
                        default:
-                               printk (KERN_WARNING
+                               printk(KERN_WARNING
                                        "parport_pc: irq specified "
                                        "without base address.  Use 'io=' "
                                        "to specify one\n");
@@ -3385,7 +3489,7 @@ static int __init parse_parport_params(void)
                                dmaval[0] = val;
                                break;
                        default:
-                               printk (KERN_WARNING
+                               printk(KERN_WARNING
                                        "parport_pc: dma specified "
                                        "without base address.  Use 'io=' "
                                        "to specify one\n");
@@ -3396,7 +3500,7 @@ static int __init parse_parport_params(void)
 
 #else
 
-static int parport_setup_ptr __initdata = 0;
+static int parport_setup_ptr __initdata;
 
 /*
  * Acceptable parameters:
@@ -3407,7 +3511,7 @@ static int parport_setup_ptr __initdata = 0;
  *
  * IRQ/DMA may be numeric or 'auto' or 'none'
  */
-static int __init parport_setup (char *str)
+static int __init parport_setup(char *str)
 {
        char *endptr;
        char *sep;
@@ -3419,15 +3523,15 @@ static int __init parport_setup (char *str)
                return 1;
        }
 
-       if (!strncmp (str, "auto", 4)) {
+       if (!strncmp(str, "auto", 4)) {
                irqval[0] = PARPORT_IRQ_AUTO;
                dmaval[0] = PARPORT_DMA_AUTO;
                return 1;
        }
 
-       val = simple_strtoul (str, &endptr, 0);
+       val = simple_strtoul(str, &endptr, 0);
        if (endptr == str) {
-               printk (KERN_WARNING "parport=%s not understood\n", str);
+               printk(KERN_WARNING "parport=%s not understood\n", str);
                return 1;
        }
 
@@ -3461,7 +3565,7 @@ static int __init parse_parport_params(void)
        return io[0] == PARPORT_DISABLE;
 }
 
-__setup ("parport=", parport_setup);
+__setup("parport=", parport_setup);
 
 /*
  * Acceptable parameters:
@@ -3469,7 +3573,7 @@ __setup ("parport=", parport_setup);
  * parport_init_mode=[spp|ps2|epp|ecp|ecpepp]
  */
 #ifdef CONFIG_PCI
-__setup("parport_init_mode=",parport_init_mode_setup);
+__setup("parport_init_mode=", parport_init_mode_setup);
 #endif
 #endif
 
@@ -3493,13 +3597,13 @@ static int __init parport_pc_init(void)
                for (i = 0; i < PARPORT_PC_MAX_PORTS; i++) {
                        if (!io[i])
                                break;
-                       if ((io_hi[i]) == PARPORT_IOHI_AUTO)
-                              io_hi[i] = 0x400 + io[i];
+                       if (io_hi[i] == PARPORT_IOHI_AUTO)
+                               io_hi[i] = 0x400 + io[i];
                        parport_pc_probe_port(io[i], io_hi[i],
-                                         irqval[i], dmaval[i], NULL, 0);
+                                       irqval[i], dmaval[i], NULL, 0);
                }
        } else
-               parport_pc_find_ports (irqval[0], dmaval[0]);
+               parport_pc_find_ports(irqval[0], dmaval[0]);
 
        return 0;
 }
@@ -3507,9 +3611,9 @@ static int __init parport_pc_init(void)
 static void __exit parport_pc_exit(void)
 {
        if (pci_registered_parport)
-               pci_unregister_driver (&parport_pc_pci_driver);
+               pci_unregister_driver(&parport_pc_pci_driver);
        if (pnp_registered_parport)
-               pnp_unregister_driver (&parport_pc_pnp_driver);
+               pnp_unregister_driver(&parport_pc_pnp_driver);
        platform_driver_unregister(&parport_pc_platform_driver);
 
        while (!list_empty(&ports_list)) {
index dd18f857dfb042d8d5802bfbbab57c617054b3f4..42e4260c3b12311e3793102b76ec136f5f94352e 100644 (file)
@@ -153,45 +153,47 @@ int ibmphp_init_devno(struct slot **cur_slot)
                return -1;
        }
        for (loop = 0; loop < len; loop++) {
-               if ((*cur_slot)->number == rtable->slots[loop].slot) {
-               if ((*cur_slot)->bus == rtable->slots[loop].bus) {
+               if ((*cur_slot)->number == rtable->slots[loop].slot &&
+                   (*cur_slot)->bus == rtable->slots[loop].bus) {
+                       struct io_apic_irq_attr irq_attr;
+
                        (*cur_slot)->device = PCI_SLOT(rtable->slots[loop].devfn);
                        for (i = 0; i < 4; i++)
                                (*cur_slot)->irq[i] = IO_APIC_get_PCI_irq_vector((int) (*cur_slot)->bus,
-                                               (int) (*cur_slot)->device, i);
-
-                               debug("(*cur_slot)->irq[0] = %x\n",
-                                               (*cur_slot)->irq[0]);
-                               debug("(*cur_slot)->irq[1] = %x\n",
-                                               (*cur_slot)->irq[1]);
-                               debug("(*cur_slot)->irq[2] = %x\n",
-                                               (*cur_slot)->irq[2]);
-                               debug("(*cur_slot)->irq[3] = %x\n",
-                                               (*cur_slot)->irq[3]);
-
-                               debug("rtable->exlusive_irqs = %x\n",
+                                               (int) (*cur_slot)->device, i,
+                                               &irq_attr);
+
+                       debug("(*cur_slot)->irq[0] = %x\n",
+                                       (*cur_slot)->irq[0]);
+                       debug("(*cur_slot)->irq[1] = %x\n",
+                                       (*cur_slot)->irq[1]);
+                       debug("(*cur_slot)->irq[2] = %x\n",
+                                       (*cur_slot)->irq[2]);
+                       debug("(*cur_slot)->irq[3] = %x\n",
+                                       (*cur_slot)->irq[3]);
+
+                       debug("rtable->exlusive_irqs = %x\n",
                                        rtable->exclusive_irqs);
-                               debug("rtable->slots[loop].irq[0].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[0].bitmap = %x\n",
                                        rtable->slots[loop].irq[0].bitmap);
-                               debug("rtable->slots[loop].irq[1].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[1].bitmap = %x\n",
                                        rtable->slots[loop].irq[1].bitmap);
-                               debug("rtable->slots[loop].irq[2].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[2].bitmap = %x\n",
                                        rtable->slots[loop].irq[2].bitmap);
-                               debug("rtable->slots[loop].irq[3].bitmap = %x\n",
+                       debug("rtable->slots[loop].irq[3].bitmap = %x\n",
                                        rtable->slots[loop].irq[3].bitmap);
 
-                               debug("rtable->slots[loop].irq[0].link = %x\n",
+                       debug("rtable->slots[loop].irq[0].link = %x\n",
                                        rtable->slots[loop].irq[0].link);
-                               debug("rtable->slots[loop].irq[1].link = %x\n",
+                       debug("rtable->slots[loop].irq[1].link = %x\n",
                                        rtable->slots[loop].irq[1].link);
-                               debug("rtable->slots[loop].irq[2].link = %x\n",
+                       debug("rtable->slots[loop].irq[2].link = %x\n",
                                        rtable->slots[loop].irq[2].link);
-                               debug("rtable->slots[loop].irq[3].link = %x\n",
+                       debug("rtable->slots[loop].irq[3].link = %x\n",
                                        rtable->slots[loop].irq[3].link);
-                               debug("end of init_devno\n");
-                               kfree(rtable);
-                               return 0;
-                       }
+                       debug("end of init_devno\n");
+                       kfree(rtable);
+                       return 0;
                }
        }
 
index 6808d8333ecc414390590f83122f4c6602b01cef..737a1c44b07af9a8ed81ed6e2191fce7934603fd 100644 (file)
@@ -98,6 +98,7 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
        int max_irq;
        int pos;
        int irq;
+       int node;
 
        pos = pci_find_ht_capability(dev, HT_CAPTYPE_IRQ);
        if (!pos)
@@ -125,7 +126,8 @@ int __ht_create_irq(struct pci_dev *dev, int idx, ht_irq_update_t *update)
        cfg->msg.address_lo = 0xffffffff;
        cfg->msg.address_hi = 0xffffffff;
 
-       irq = create_irq();
+       node = dev_to_node(&dev->dev);
+       irq = create_irq_nr(0, node);
 
        if (irq <= 0) {
                kfree(cfg);
index a563fbe559d0505b90d1f736beb3a8e8bf197717..cd389162735f3f9ff03dd596d549134ae052de5a 100644 (file)
@@ -1972,15 +1972,6 @@ static int __init init_dmars(void)
                }
        }
 
-#ifdef CONFIG_INTR_REMAP
-       if (!intr_remapping_enabled) {
-               ret = enable_intr_remapping(0);
-               if (ret)
-                       printk(KERN_ERR
-                              "IOMMU: enable interrupt remapping failed\n");
-       }
-#endif
-
        /*
         * For each rmrr
         *   for each dev attached to rmrr
index f5e0ea724a6f53a12d26443fba84b01ab898f35d..3a0cb0bb05933bd72277df67af0cfe2254a7b6e4 100644 (file)
@@ -15,6 +15,14 @@ static struct ioapic_scope ir_ioapic[MAX_IO_APICS];
 static int ir_ioapic_num;
 int intr_remapping_enabled;
 
+static int disable_intremap;
+static __init int setup_nointremap(char *str)
+{
+       disable_intremap = 1;
+       return 0;
+}
+early_param("nointremap", setup_nointremap);
+
 struct irq_2_iommu {
        struct intel_iommu *iommu;
        u16 irte_index;
@@ -23,15 +31,12 @@ struct irq_2_iommu {
 };
 
 #ifdef CONFIG_GENERIC_HARDIRQS
-static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
+static struct irq_2_iommu *get_one_free_irq_2_iommu(int node)
 {
        struct irq_2_iommu *iommu;
-       int node;
-
-       node = cpu_to_node(cpu);
 
        iommu = kzalloc_node(sizeof(*iommu), GFP_ATOMIC, node);
-       printk(KERN_DEBUG "alloc irq_2_iommu on cpu %d node %d\n", cpu, node);
+       printk(KERN_DEBUG "alloc irq_2_iommu on node %d\n", node);
 
        return iommu;
 }
@@ -48,7 +53,7 @@ static struct irq_2_iommu *irq_2_iommu(unsigned int irq)
        return desc->irq_2_iommu;
 }
 
-static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
+static struct irq_2_iommu *irq_2_iommu_alloc_node(unsigned int irq, int node)
 {
        struct irq_desc *desc;
        struct irq_2_iommu *irq_iommu;
@@ -56,7 +61,7 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
        /*
         * alloc irq desc if not allocated already.
         */
-       desc = irq_to_desc_alloc_cpu(irq, cpu);
+       desc = irq_to_desc_alloc_node(irq, node);
        if (!desc) {
                printk(KERN_INFO "can not get irq_desc for %d\n", irq);
                return NULL;
@@ -65,14 +70,14 @@ static struct irq_2_iommu *irq_2_iommu_alloc_cpu(unsigned int irq, int cpu)
        irq_iommu = desc->irq_2_iommu;
 
        if (!irq_iommu)
-               desc->irq_2_iommu = get_one_free_irq_2_iommu(cpu);
+               desc->irq_2_iommu = get_one_free_irq_2_iommu(node);
 
        return desc->irq_2_iommu;
 }
 
 static struct irq_2_iommu *irq_2_iommu_alloc(unsigned int irq)
 {
-       return irq_2_iommu_alloc_cpu(irq, boot_cpu_id);
+       return irq_2_iommu_alloc_node(irq, cpu_to_node(boot_cpu_id));
 }
 
 #else /* !CONFIG_SPARSE_IRQ */
@@ -423,20 +428,6 @@ static void iommu_set_intr_remapping(struct intel_iommu *iommu, int mode)
                      readl, (sts & DMA_GSTS_IRTPS), sts);
        spin_unlock_irqrestore(&iommu->register_lock, flags);
 
-       if (mode == 0) {
-               spin_lock_irqsave(&iommu->register_lock, flags);
-
-               /* enable comaptiblity format interrupt pass through */
-               cmd = iommu->gcmd | DMA_GCMD_CFI;
-               iommu->gcmd |= DMA_GCMD_CFI;
-               writel(cmd, iommu->reg + DMAR_GCMD_REG);
-
-               IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG,
-                             readl, (sts & DMA_GSTS_CFIS), sts);
-
-               spin_unlock_irqrestore(&iommu->register_lock, flags);
-       }
-
        /*
         * global invalidation of interrupt entry cache before enabling
         * interrupt-remapping.
@@ -516,6 +507,23 @@ end:
        spin_unlock_irqrestore(&iommu->register_lock, flags);
 }
 
+int __init intr_remapping_supported(void)
+{
+       struct dmar_drhd_unit *drhd;
+
+       if (disable_intremap)
+               return 0;
+
+       for_each_drhd_unit(drhd) {
+               struct intel_iommu *iommu = drhd->iommu;
+
+               if (!ecap_ir_support(iommu->ecap))
+                       return 0;
+       }
+
+       return 1;
+}
+
 int __init enable_intr_remapping(int eim)
 {
        struct dmar_drhd_unit *drhd;
index adf17856bacc187a8216af9ded5ec3cf2ed2662b..7f207f335beca2c53d8120c3a4681ff139cb7282 100644 (file)
@@ -123,7 +123,7 @@ static void pnpacpi_parse_allocated_irqresource(struct pnp_dev *dev,
        }
 
        flags = irq_flags(triggering, polarity, shareable);
-       irq = acpi_register_gsi(gsi, triggering, polarity);
+       irq = acpi_register_gsi(&dev->dev, gsi, triggering, polarity);
        if (irq >= 0)
                pcibios_penalize_isa_irq(irq, 1);
        else
index e1716f14cd4710cca0363eab985cdf370d9b9879..91e316fe6522f00a837a580808b1c9164f067748 100644 (file)
@@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp,
                return blk_trace_setup(sdp->device->request_queue,
                                       sdp->disk->disk_name,
                                       MKDEV(SCSI_GENERIC_MAJOR, sdp->index),
+                                      NULL,
                                       (char *)arg);
        case BLKTRACESTART:
                return blk_trace_startstop(sdp->device->request_queue, 1);
index a0127e93ade0a424c6125383150dd342e61710fc..fb867a9f55e94957e1a59ec5cb1f78b85865797e 100644 (file)
@@ -287,6 +287,13 @@ static const struct serial8250_config uart_config[] = {
                .fcr            = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_10,
                .flags          = UART_CAP_FIFO,
        },
+       [PORT_AR7] = {
+               .name           = "AR7",
+               .fifo_size      = 16,
+               .tx_loadsz      = 16,
+               .fcr            = UART_FCR_ENABLE_FIFO | UART_FCR_R_TRIG_00,
+               .flags          = UART_CAP_FIFO | UART_CAP_AFE,
+       },
 };
 
 #if defined (CONFIG_SERIAL_8250_AU1X00)
index 938bc1b6c3faada842cf6b667db5e4817fe6dc75..e371a9c15341e2598825affdbd2ef1d727dfbde7 100644 (file)
@@ -2776,6 +2776,9 @@ static struct pci_device_id serial_pci_tbl[] = {
        {       PCI_VENDOR_ID_OXSEMI, 0x950a,
                PCI_ANY_ID, PCI_ANY_ID, 0, 0,
                pbn_b0_2_1130000 },
+       {       PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_C950,
+               PCI_VENDOR_ID_OXSEMI, PCI_SUBDEVICE_ID_OXSEMI_C950, 0, 0,
+               pbn_b0_1_921600 },
        {       PCI_VENDOR_ID_OXSEMI, PCI_DEVICE_ID_OXSEMI_16PCI954,
                PCI_ANY_ID, PCI_ANY_ID, 0, 0,
                pbn_b0_4_115200 },
index 343e3a35b6a37e0b248ad0195610998b8a3b46e5..641e800ed69333fbafc3b06dd982a0a3ec7f7605 100644 (file)
@@ -833,6 +833,7 @@ config SERIAL_IMX
        bool "IMX serial port support"
        depends on ARM && (ARCH_IMX || ARCH_MXC)
        select SERIAL_CORE
+       select RATIONAL
        help
          If you have a machine based on a Motorola IMX CPU you
          can enable its onboard serial port by enabling this option.
@@ -1433,4 +1434,11 @@ config SPORT_BAUD_RATE
        default 19200 if (SERIAL_SPORT_BAUD_RATE_19200)
        default 9600 if (SERIAL_SPORT_BAUD_RATE_9600)
 
+config SERIAL_TIMBERDALE
+       tristate "Support for timberdale UART"
+       depends on MFD_TIMBERDALE
+       select SERIAL_CORE
+       ---help---
+       Add support for UART controller on timberdale.
+
 endmenu
index d438eb2a73defd29ca9cfab294fa95b20d09fc5f..45a8658f54d5154e1c31c328551ccb73f3da8db7 100644 (file)
@@ -77,3 +77,4 @@ obj-$(CONFIG_SERIAL_OF_PLATFORM_NWPSERIAL) += nwpserial.o
 obj-$(CONFIG_SERIAL_KS8695) += serial_ks8695.o
 obj-$(CONFIG_KGDB_SERIAL_CONSOLE) += kgdboc.o
 obj-$(CONFIG_SERIAL_QE) += ucc_uart.o
+obj-$(CONFIG_SERIAL_TIMBERDALE)        += timbuart.o
index d86123e03391e97623103682271da36f51350ab2..e2f6b1bfac98c726198f35f6bffc153abddc7423 100644 (file)
@@ -330,6 +330,11 @@ static void bfin_serial_tx_chars(struct bfin_serial_port *uart)
                /* Clear TFI bit */
                UART_PUT_LSR(uart, TFI);
 #endif
+               /* Anomaly notes:
+                *  05000215 -  we always clear ETBEI within the last UART TX
+                *              interrupt to end a string. It is always set
+                *              again when a new tx is started.
+                */
                UART_CLEAR_IER(uart, ETBEI);
                return;
        }
@@ -415,6 +420,7 @@ static void bfin_serial_dma_tx_chars(struct bfin_serial_port *uart)
        set_dma_start_addr(uart->tx_dma_channel, (unsigned long)(xmit->buf+xmit->tail));
        set_dma_x_count(uart->tx_dma_channel, uart->tx_count);
        set_dma_x_modify(uart->tx_dma_channel, 1);
+       SSYNC();
        enable_dma(uart->tx_dma_channel);
 
        UART_SET_IER(uart, ETBEI);
@@ -473,27 +479,41 @@ static void bfin_serial_dma_rx_chars(struct bfin_serial_port *uart)
 void bfin_serial_rx_dma_timeout(struct bfin_serial_port *uart)
 {
        int x_pos, pos;
-       unsigned long flags;
-
-       spin_lock_irqsave(&uart->port.lock, flags);
 
+       dma_disable_irq(uart->rx_dma_channel);
+       spin_lock_bh(&uart->port.lock);
+
+       /* A 2D DMA RX buffer ring is used. Because curr_y_count and
+        * curr_x_count can't be read in one atomic operation,
+        * curr_y_count is read before curr_x_count. When
+        * curr_x_count is read, curr_y_count may already indicate
+        * the next buffer line, but the position calculated here
+        * still indicates the old line. Such a wrong position may
+        * be smaller than the current buffer tail, which would cause
+        * garbage to be received if it were not rejected.
+        */
        uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
        x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
        uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
-       if (uart->rx_dma_nrows == DMA_RX_YCOUNT)
+       if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
                uart->rx_dma_nrows = 0;
        x_pos = DMA_RX_XCOUNT - x_pos;
        if (x_pos == DMA_RX_XCOUNT)
                x_pos = 0;
 
        pos = uart->rx_dma_nrows * DMA_RX_XCOUNT + x_pos;
-       if (pos != uart->rx_dma_buf.tail) {
+       /* Ignore the received data if the new position is in the same
+        * line as the current buffer tail and is not ahead of it.
+        */
+       if (pos > uart->rx_dma_buf.tail ||
+               uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
                uart->rx_dma_buf.head = pos;
                bfin_serial_dma_rx_chars(uart);
                uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
        }
 
-       spin_unlock_irqrestore(&uart->port.lock, flags);
+       spin_unlock_bh(&uart->port.lock);
+       dma_enable_irq(uart->rx_dma_channel);
 
        mod_timer(&(uart->rx_dma_timer), jiffies + DMA_RX_FLUSH_JIFFIES);
 }
@@ -514,6 +534,11 @@ static irqreturn_t bfin_serial_dma_tx_int(int irq, void *dev_id)
        if (!(get_dma_curr_irqstat(uart->tx_dma_channel)&DMA_RUN)) {
                disable_dma(uart->tx_dma_channel);
                clear_dma_irqstat(uart->tx_dma_channel);
+               /* Anomaly notes:
+                *  05000215 -  we always clear ETBEI within the last UART TX
+                *              interrupt to end a string. It is always set
+                *              again when a new tx is started.
+                */
                UART_CLEAR_IER(uart, ETBEI);
                xmit->tail = (xmit->tail + uart->tx_count) & (UART_XMIT_SIZE - 1);
                uart->port.icount.tx += uart->tx_count;
@@ -532,11 +557,26 @@ static irqreturn_t bfin_serial_dma_rx_int(int irq, void *dev_id)
 {
        struct bfin_serial_port *uart = dev_id;
        unsigned short irqstat;
+       int x_pos, pos;
 
        spin_lock(&uart->port.lock);
        irqstat = get_dma_curr_irqstat(uart->rx_dma_channel);
        clear_dma_irqstat(uart->rx_dma_channel);
-       bfin_serial_dma_rx_chars(uart);
+
+       uart->rx_dma_nrows = get_dma_curr_ycount(uart->rx_dma_channel);
+       x_pos = get_dma_curr_xcount(uart->rx_dma_channel);
+       uart->rx_dma_nrows = DMA_RX_YCOUNT - uart->rx_dma_nrows;
+       if (uart->rx_dma_nrows == DMA_RX_YCOUNT || x_pos == 0)
+               uart->rx_dma_nrows = 0;
+
+       pos = uart->rx_dma_nrows * DMA_RX_XCOUNT;
+       if (pos > uart->rx_dma_buf.tail ||
+               uart->rx_dma_nrows < (uart->rx_dma_buf.tail/DMA_RX_XCOUNT)) {
+               uart->rx_dma_buf.head = pos;
+               bfin_serial_dma_rx_chars(uart);
+               uart->rx_dma_buf.tail = uart->rx_dma_buf.head;
+       }
+
        spin_unlock(&uart->port.lock);
 
        return IRQ_HANDLED;
@@ -789,8 +829,16 @@ bfin_serial_set_termios(struct uart_port *port, struct ktermios *termios,
                        __func__);
        }
 
-       if (termios->c_cflag & CSTOPB)
-               lcr |= STB;
+       /* Anomaly notes:
+        *  05000231 -  STOP bit count is always 1, whatever the user requests.
+        */
+       if (termios->c_cflag & CSTOPB) {
+               if (ANOMALY_05000231)
+                       printk(KERN_WARNING "STOP bits other than 1 is not "
+                               "supported in case of anomaly 05000231.\n");
+               else
+                       lcr |= STB;
+       }
        if (termios->c_cflag & PARENB)
                lcr |= PEN;
        if (!(termios->c_cflag & PARODD))
@@ -940,6 +988,10 @@ static void bfin_serial_reset_irda(struct uart_port *port)
 }
 
 #ifdef CONFIG_CONSOLE_POLL
+/* Anomaly notes:
+ *  05000099 -  Because we only use THRE in poll_put and DR in poll_get,
+ *             losing other bits of UART_LSR is not a problem here.
+ */
 static void bfin_serial_poll_put_char(struct uart_port *port, unsigned char chr)
 {
        struct bfin_serial_port *uart = (struct bfin_serial_port *)port;
@@ -1245,12 +1297,17 @@ static __init void early_serial_write(struct console *con, const char *s,
        }
 }
 
+/*
+ * This should have a .setup or .early_setup in it, but then things get called
+ * without the command line options, and the baud rate gets messed up - so
+ * don't let the common infrastructure play with things. (see calls to setup
+ * & earlysetup in ./kernel/printk.c:register_console())
+ */
 static struct __initdata console bfin_early_serial_console = {
        .name = "early_BFuart",
        .write = early_serial_write,
        .device = uart_console_device,
        .flags = CON_PRINTBUFFER,
-       .setup = bfin_serial_console_setup,
        .index = -1,
        .data  = &bfin_serial_reg,
 };
index 529c0ff7952ceca111e8db153a3794448ca3b5ae..34b4ae0fe76041f4eda4132e1a81a54784652557 100644 (file)
@@ -101,15 +101,16 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 {
        pr_debug("%s value:%x\n", __func__, value);
        /* Place a Start and Stop bit */
-       __asm__ volatile (
-               "R2 = b#01111111100;\n\t"
-               "R3 = b#10000000001;\n\t"
-               "%0 <<= 2;\n\t"
-               "%0 = %0 & R2;\n\t"
-               "%0 = %0 | R3;\n\t"
-               :"=r"(value)
-               :"0"(value)
-               :"R2", "R3");
+       __asm__ __volatile__ (
+               "R2 = b#01111111100;"
+               "R3 = b#10000000001;"
+               "%0 <<= 2;"
+               "%0 = %0 & R2;"
+               "%0 = %0 | R3;"
+               : "=d"(value)
+               : "0"(value)    /* template reads %0 first: input must share its register */
+               : "ASTAT", "R2", "R3"
+       );
        pr_debug("%s value:%x\n", __func__, value);
 
        SPORT_PUT_TX(up, value);
@@ -118,27 +119,30 @@ static inline void tx_one_byte(struct sport_uart_port *up, unsigned int value)
 static inline unsigned int rx_one_byte(struct sport_uart_port *up)
 {
        unsigned int value, extract;
+       u32 tmp_mask1, tmp_mask2, tmp_shift, tmp;
 
        value = SPORT_GET_RX32(up);
        pr_debug("%s value:%x\n", __func__, value);
 
        /* Extract 8 bits data */
-       __asm__ volatile (
-               "R5 = 0;\n\t"
-               "P0 = 8;\n\t"
-               "R1 = 0x1801(Z);\n\t"
-               "R3 = 0x0300(Z);\n\t"
-               "R4 = 0;\n\t"
-               "LSETUP(loop_s, loop_e) LC0 = P0;\nloop_s:\t"
-               "R2 = extract(%1, R1.L)(Z);\n\t"
-               "R2 <<= R4;\n\t"
-               "R5 = R5 | R2;\n\t"
-               "R1 = R1 - R3;\nloop_e:\t"
-               "R4 += 1;\n\t"
-               "%0 = R5;\n\t"
-               :"=r"(extract)
-               :"r"(value)
-               :"P0", "R1", "R2","R3","R4", "R5");
+       __asm__ __volatile__ (
+               "%[extr] = 0;"
+               "%[mask1] = 0x1801(Z);"
+               "%[mask2] = 0x0300(Z);"
+               "%[shift] = 0;"
+               "LSETUP(.Lloop_s, .Lloop_e) LC0 = %[lc];"
+               ".Lloop_s:"
+               "%[tmp] = extract(%[val], %[mask1].L)(Z);"
+               "%[tmp] <<= %[shift];"
+               "%[extr] = %[extr] | %[tmp];"
+               "%[mask1] = %[mask1] - %[mask2];"
+               ".Lloop_e:"
+               "%[shift] += 1;"
+               : [extr]"=&d"(extract), [shift]"=&d"(tmp_shift), [tmp]"=&d"(tmp),
+                 [mask1]"=&d"(tmp_mask1), [mask2]"=&d"(tmp_mask2)
+               : [val]"d"(value), [lc]"a"(8)   /* val is only read; outputs are early-clobber */
+               : "ASTAT", "LB0", "LC0", "LT0"
+       );
 
        pr_debug("      extract:%x\n", extract);
        return extract;
@@ -149,7 +153,7 @@ static int sport_uart_setup(struct sport_uart_port *up, int sclk, int baud_rate)
        int tclkdiv, tfsdiv, rclkdiv;
 
        /* Set TCR1 and TCR2 */
-       SPORT_PUT_TCR1(up, (LTFS | ITFS | TFSR | TLSBIT | ITCLK));
+       SPORT_PUT_TCR1(up, (LATFS | ITFS | TFSR | TLSBIT | ITCLK));
        SPORT_PUT_TCR2(up, 10);
        pr_debug("%s TCR1:%x, TCR2:%x\n", __func__, SPORT_GET_TCR1(up), SPORT_GET_TCR2(up));
 
@@ -419,7 +423,7 @@ static void sport_shutdown(struct uart_port *port)
 }
 
 static void sport_set_termios(struct uart_port *port,
-               struct termios *termios, struct termios *old)
+               struct ktermios *termios, struct ktermios *old)
 {
        pr_debug("%s enter, c_cflag:%08x\n", __func__, termios->c_cflag);
        uart_update_timeout(port, CS8 ,port->uartclk);
index a461b3b2c72dcd63d79e02b1188843a87e092f13..9f2891c2c4a21f93e4cc6289e4b95c286a655455 100644 (file)
@@ -137,7 +137,12 @@ static LIST_HEAD(icom_adapter_head);
 static spinlock_t icom_lock;
 
 #ifdef ICOM_TRACE
-static inline void trace(struct icom_port *, char *, unsigned long) {};
+/* Log a trace point and its data word against the adapter's PCI device. */
+static inline void trace(struct icom_port *icom_port, char *trace_pt, unsigned long trace_data)
+{
+       struct device *dev = &icom_port->adapter->pci_dev->dev;
+       dev_info(dev, ":%d:%s - %lx\n", icom_port->port, trace_pt, trace_data);
+}
 #else
 static inline void trace(struct icom_port *icom_port, char *trace_pt, unsigned long trace_data) {};
 #endif
@@ -408,7 +413,7 @@ static void load_code(struct icom_port *icom_port)
        release_firmware(fw);
 
        /* Set Hardware level */
-       if ((icom_port->adapter->version | ADAPTER_V2) == ADAPTER_V2)
+       if (icom_port->adapter->version == ADAPTER_V2)
                writeb(V2_HARDWARE, &(icom_port->dram->misc_flags));
 
        /* Start the processor in Adapter */
@@ -861,7 +866,7 @@ static irqreturn_t icom_interrupt(int irq, void *dev_id)
        /* find icom_port for this interrupt */
        icom_adapter = (struct icom_adapter *) dev_id;
 
-       if ((icom_adapter->version | ADAPTER_V2) == ADAPTER_V2) {
+       if (icom_adapter->version == ADAPTER_V2) {
                int_reg = icom_adapter->base_addr + 0x8024;
 
                adapter_interrupts = readl(int_reg);
@@ -1647,15 +1652,6 @@ static void __exit icom_exit(void)
 module_init(icom_init);
 module_exit(icom_exit);
 
-#ifdef ICOM_TRACE
-static inline void trace(struct icom_port *icom_port, char *trace_pt,
-                 unsigned long trace_data)
-{
-       dev_info(&icom_port->adapter->pci_dev->dev, ":%d:%s - %lx\n",
-                icom_port->port, trace_pt, trace_data);
-}
-#endif
-
 MODULE_AUTHOR("Michael Anderson <mjanders@us.ibm.com>");
 MODULE_DESCRIPTION("IBM iSeries Serial IOA driver");
 MODULE_SUPPORTED_DEVICE
index 5f0be40dfdab7292ae65b4c763f4ee4105e36e24..7b5d1de9cfe39c3436d1074bb3b9c77933743507 100644 (file)
@@ -8,6 +8,9 @@
  *  Author: Sascha Hauer <sascha@saschahauer.de>
  *  Copyright (C) 2004 Pengutronix
  *
+ *  Copyright (C) 2009 emlix GmbH
+ *  Author: Fabian Godehardt (added IrDA support for iMX)
+ *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
  * the Free Software Foundation; either version 2 of the License, or
@@ -41,6 +44,8 @@
 #include <linux/serial_core.h>
 #include <linux/serial.h>
 #include <linux/clk.h>
+#include <linux/delay.h>
+#include <linux/rational.h>
 
 #include <asm/io.h>
 #include <asm/irq.h>
 #define  UCR4_DREN      (1<<0)  /* Recv data ready interrupt enable */
 #define  UFCR_RXTL_SHF   0       /* Receiver trigger level shift */
 #define  UFCR_RFDIV      (7<<7)  /* Reference freq divider mask */
+#define  UFCR_RFDIV_REG(x)     (((x) < 7 ? 6 - (x) : 6) << 7)
 #define  UFCR_TXTL_SHF   10      /* Transmitter trigger level shift */
 #define  USR1_PARITYERR  (1<<15) /* Parity error interrupt flag */
 #define  USR1_RTSS      (1<<14) /* RTS pin status */
@@ -211,10 +217,20 @@ struct imx_port {
        struct timer_list       timer;
        unsigned int            old_status;
        int                     txirq,rxirq,rtsirq;
-       int                     have_rtscts:1;
+       unsigned int            have_rtscts:1;
+       unsigned int            use_irda:1;
+       unsigned int            irda_inv_rx:1;
+       unsigned int            irda_inv_tx:1;
+       unsigned short          trcv_delay; /* transceiver delay */
        struct clk              *clk;
 };
 
+#ifdef CONFIG_IRDA
+#define USE_IRDA(sport)        ((sport)->use_irda)
+#else
+#define USE_IRDA(sport)        (0)
+#endif
+
 /*
  * Handle any change of modem status signal since we were last called.
  */
@@ -268,6 +284,48 @@ static void imx_stop_tx(struct uart_port *port)
        struct imx_port *sport = (struct imx_port *)port;
        unsigned long temp;
 
+       if (USE_IRDA(sport)) {
+               /* half duplex - wait for end of transmission */
+               int n = 256;
+               while ((--n > 0) &&
+                     !(readl(sport->port.membase + USR2) & USR2_TXDC)) {
+                       udelay(5);
+                       barrier();
+               }
+               /*
+                * irda transceiver - wait a bit more to avoid
+                * cutoff, hardware dependent
+                */
+               udelay(sport->trcv_delay);
+
+               /*
+                * half duplex - reactivate receive mode,
+                * flush receive pipe echo crap
+                */
+               if (readl(sport->port.membase + USR2) & USR2_TXDC) {
+                       temp = readl(sport->port.membase + UCR1);
+                       temp &= ~(UCR1_TXMPTYEN | UCR1_TRDYEN);
+                       writel(temp, sport->port.membase + UCR1);
+
+                       temp = readl(sport->port.membase + UCR4);
+                       temp &= ~(UCR4_TCEN);
+                       writel(temp, sport->port.membase + UCR4);
+
+                       while (readl(sport->port.membase + URXD0) &
+                              URXD_CHARRDY)
+                               barrier();
+
+                       temp = readl(sport->port.membase + UCR1);
+                       temp |= UCR1_RRDYEN;
+                       writel(temp, sport->port.membase + UCR1);
+
+                       temp = readl(sport->port.membase + UCR4);
+                       temp |= UCR4_DREN;
+                       writel(temp, sport->port.membase + UCR4);
+               }
+               return;
+       }
+
        temp = readl(sport->port.membase + UCR1);
        writel(temp & ~UCR1_TXMPTYEN, sport->port.membase + UCR1);
 }
@@ -302,13 +360,15 @@ static inline void imx_transmit_buffer(struct imx_port *sport)
                /* send xmit->buf[xmit->tail]
                 * out the port here */
                writel(xmit->buf[xmit->tail], sport->port.membase + URTX0);
-               xmit->tail = (xmit->tail + 1) &
-                        (UART_XMIT_SIZE - 1);
+               xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
                sport->port.icount.tx++;
                if (uart_circ_empty(xmit))
                        break;
        }
 
+       if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+               uart_write_wakeup(&sport->port);
+
        if (uart_circ_empty(xmit))
                imx_stop_tx(&sport->port);
 }
@@ -321,9 +381,30 @@ static void imx_start_tx(struct uart_port *port)
        struct imx_port *sport = (struct imx_port *)port;
        unsigned long temp;
 
+       if (USE_IRDA(sport)) {
+               /* half duplex in IrDA mode; have to disable receive mode */
+               temp = readl(sport->port.membase + UCR4);
+               temp &= ~(UCR4_DREN);
+               writel(temp, sport->port.membase + UCR4);
+
+               temp = readl(sport->port.membase + UCR1);
+               temp &= ~(UCR1_RRDYEN);
+               writel(temp, sport->port.membase + UCR1);
+       }
+
        temp = readl(sport->port.membase + UCR1);
        writel(temp | UCR1_TXMPTYEN, sport->port.membase + UCR1);
 
+       if (USE_IRDA(sport)) {
+               temp = readl(sport->port.membase + UCR1);
+               temp |= UCR1_TRDYEN;
+               writel(temp, sport->port.membase + UCR1);
+
+               temp = readl(sport->port.membase + UCR4);
+               temp |= UCR4_TCEN;
+               writel(temp, sport->port.membase + UCR4);
+       }
+
        if (readl(sport->port.membase + UTS) & UTS_TXEMPTY)
                imx_transmit_buffer(sport);
 }
@@ -395,8 +476,7 @@ static irqreturn_t imx_rxint(int irq, void *dev_id)
                                continue;
                }
 
-               if (uart_handle_sysrq_char
-                           (&sport->port, (unsigned char)rx))
+               if (uart_handle_sysrq_char(&sport->port, (unsigned char)rx))
                        continue;
 
                if (rx & (URXD_PRERR | URXD_OVRRUN | URXD_FRMERR) ) {
@@ -471,26 +551,26 @@ static unsigned int imx_tx_empty(struct uart_port *port)
  */
 static unsigned int imx_get_mctrl(struct uart_port *port)
 {
-        struct imx_port *sport = (struct imx_port *)port;
-        unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
+       struct imx_port *sport = (struct imx_port *)port;
+       unsigned int tmp = TIOCM_DSR | TIOCM_CAR;
 
-        if (readl(sport->port.membase + USR1) & USR1_RTSS)
-                tmp |= TIOCM_CTS;
+       if (readl(sport->port.membase + USR1) & USR1_RTSS)
+               tmp |= TIOCM_CTS;
 
-        if (readl(sport->port.membase + UCR2) & UCR2_CTS)
-                tmp |= TIOCM_RTS;
+       if (readl(sport->port.membase + UCR2) & UCR2_CTS)
+               tmp |= TIOCM_RTS;
 
-        return tmp;
+       return tmp;
 }
 
 static void imx_set_mctrl(struct uart_port *port, unsigned int mctrl)
 {
-        struct imx_port *sport = (struct imx_port *)port;
+       struct imx_port *sport = (struct imx_port *)port;
        unsigned long temp;
 
        temp = readl(sport->port.membase + UCR2) & ~UCR2_CTS;
 
-        if (mctrl & TIOCM_RTS)
+       if (mctrl & TIOCM_RTS)
                temp |= UCR2_CTS;
 
        writel(temp, sport->port.membase + UCR2);
@@ -534,12 +614,7 @@ static int imx_setup_ufcr(struct imx_port *sport, unsigned int mode)
        if(!ufcr_rfdiv)
                ufcr_rfdiv = 1;
 
-       if(ufcr_rfdiv >= 7)
-               ufcr_rfdiv = 6;
-       else
-               ufcr_rfdiv = 6 - ufcr_rfdiv;
-
-       val |= UFCR_RFDIV & (ufcr_rfdiv << 7);
+       val |= UFCR_RFDIV_REG(ufcr_rfdiv);
 
        writel(val, sport->port.membase + UFCR);
 
@@ -558,8 +633,24 @@ static int imx_startup(struct uart_port *port)
         * requesting IRQs
         */
        temp = readl(sport->port.membase + UCR4);
+
+       if (USE_IRDA(sport))
+               temp |= UCR4_IRSC;
+
        writel(temp & ~UCR4_DREN, sport->port.membase + UCR4);
 
+       if (USE_IRDA(sport)) {
+               /* reset FIFOs and state machines */
+               int i = 100;
+               temp = readl(sport->port.membase + UCR2);
+               temp &= ~UCR2_SRST;
+               writel(temp, sport->port.membase + UCR2);
+               while (!(readl(sport->port.membase + UCR2) & UCR2_SRST) &&
+                   (--i > 0)) {
+                       udelay(1);
+               }
+       }
+
        /*
         * Allocate the IRQ(s) i.MX1 has three interrupts whereas later
         * chips only have one interrupt.
@@ -575,12 +666,16 @@ static int imx_startup(struct uart_port *port)
                if (retval)
                        goto error_out2;
 
-               retval = request_irq(sport->rtsirq, imx_rtsint,
-                            (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
-                              IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING,
-                               DRIVER_NAME, sport);
-               if (retval)
-                       goto error_out3;
+               /* do not use RTS IRQ on IrDA */
+               if (!USE_IRDA(sport)) {
+                       retval = request_irq(sport->rtsirq, imx_rtsint,
+                                    (sport->rtsirq < MAX_INTERNAL_IRQ) ? 0 :
+                                      IRQF_TRIGGER_FALLING |
+                                      IRQF_TRIGGER_RISING,
+                                       DRIVER_NAME, sport);
+                       if (retval)
+                               goto error_out3;
+               }
        } else {
                retval = request_irq(sport->port.irq, imx_int, 0,
                                DRIVER_NAME, sport);
@@ -597,18 +692,49 @@ static int imx_startup(struct uart_port *port)
 
        temp = readl(sport->port.membase + UCR1);
        temp |= UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN;
+
+       if (USE_IRDA(sport)) {
+               temp |= UCR1_IREN;
+               temp &= ~(UCR1_RTSDEN);
+       }
+
        writel(temp, sport->port.membase + UCR1);
 
        temp = readl(sport->port.membase + UCR2);
        temp |= (UCR2_RXEN | UCR2_TXEN);
        writel(temp, sport->port.membase + UCR2);
 
+       if (USE_IRDA(sport)) {
+               /* clear RX-FIFO */
+               int i = 64;
+               while ((--i > 0) &&
+                       (readl(sport->port.membase + URXD0) & URXD_CHARRDY)) {
+                       barrier();
+               }
+       }
+
 #if defined CONFIG_ARCH_MX2 || defined CONFIG_ARCH_MX3
        temp = readl(sport->port.membase + UCR3);
        temp |= UCR3_RXDMUXSEL;
        writel(temp, sport->port.membase + UCR3);
 #endif
 
+       if (USE_IRDA(sport)) {
+               temp = readl(sport->port.membase + UCR4);
+               if (sport->irda_inv_rx)
+                       temp |= UCR4_INVR;
+               else
+                       temp &= ~(UCR4_INVR);
+               writel(temp | UCR4_DREN, sport->port.membase + UCR4);
+
+               temp = readl(sport->port.membase + UCR3);
+               if (sport->irda_inv_tx)
+                       temp |= UCR3_INVT;
+               else
+                       temp &= ~(UCR3_INVT);
+               writel(temp, sport->port.membase + UCR3);
+       }
+
        /*
         * Enable modem status interrupts
         */
@@ -616,6 +742,16 @@ static int imx_startup(struct uart_port *port)
        imx_enable_ms(&sport->port);
        spin_unlock_irqrestore(&sport->port.lock,flags);
 
+       if (USE_IRDA(sport)) {
+               struct imxuart_platform_data *pdata;
+               pdata = sport->port.dev->platform_data;
+               sport->irda_inv_rx = pdata->irda_inv_rx;
+               sport->irda_inv_tx = pdata->irda_inv_tx;
+               sport->trcv_delay = pdata->transceiver_delay;
+               if (pdata->irda_enable)
+                       pdata->irda_enable(1);
+       }
+
        return 0;
 
 error_out3:
@@ -633,6 +769,17 @@ static void imx_shutdown(struct uart_port *port)
        struct imx_port *sport = (struct imx_port *)port;
        unsigned long temp;
 
+       temp = readl(sport->port.membase + UCR2);
+       temp &= ~(UCR2_TXEN);
+       writel(temp, sport->port.membase + UCR2);
+
+       if (USE_IRDA(sport)) {
+               struct imxuart_platform_data *pdata;
+               pdata = sport->port.dev->platform_data;
+               if (pdata->irda_enable)
+                       pdata->irda_enable(0);
+       }
+
        /*
         * Stop our timer.
         */
@@ -642,7 +789,8 @@ static void imx_shutdown(struct uart_port *port)
         * Free the interrupts
         */
        if (sport->txirq > 0) {
-               free_irq(sport->rtsirq, sport);
+               if (!USE_IRDA(sport))
+                       free_irq(sport->rtsirq, sport);
                free_irq(sport->txirq, sport);
                free_irq(sport->rxirq, sport);
        } else
@@ -654,6 +802,9 @@ static void imx_shutdown(struct uart_port *port)
 
        temp = readl(sport->port.membase + UCR1);
        temp &= ~(UCR1_TXMPTYEN | UCR1_RRDYEN | UCR1_RTSDEN | UCR1_UARTEN);
+       if (USE_IRDA(sport))
+               temp &= ~(UCR1_IREN);
+
        writel(temp, sport->port.membase + UCR1);
 }
 
@@ -665,7 +816,9 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
        unsigned long flags;
        unsigned int ucr2, old_ucr1, old_txrxen, baud, quot;
        unsigned int old_csize = old ? old->c_cflag & CSIZE : CS8;
-       unsigned int div, num, denom, ufcr;
+       unsigned int div, ufcr;
+       unsigned long num, denom;
+       uint64_t tdiv64;
 
        /*
         * If we don't support modem control lines, don't allow
@@ -761,38 +914,39 @@ imx_set_termios(struct uart_port *port, struct ktermios *termios,
                        sport->port.membase + UCR2);
        old_txrxen &= (UCR2_TXEN | UCR2_RXEN);
 
-       div = sport->port.uartclk / (baud * 16);
-       if (div > 7)
-               div = 7;
-       if (!div)
+       if (USE_IRDA(sport)) {
+               /*
+                * use maximum available submodule frequency to
+                * avoid missing short pulses due to low sampling rate
+                */
                div = 1;
-
-       num = baud;
-       denom = port->uartclk / div / 16;
-
-       /* shift num and denom right until they fit into 16 bits */
-       while (num > 0x10000 || denom > 0x10000) {
-               num >>= 1;
-               denom >>= 1;
+       } else {
+               div = sport->port.uartclk / (baud * 16);
+               if (div > 7)
+                       div = 7;
+               if (!div)
+                       div = 1;
        }
-       if (num > 0)
-               num -= 1;
-       if (denom > 0)
-               denom -= 1;
 
-       writel(num, sport->port.membase + UBIR);
-       writel(denom, sport->port.membase + UBMR);
+       rational_best_approximation(16 * div * baud, sport->port.uartclk,
+               1 << 16, 1 << 16, &num, &denom);
 
-       if (div == 7)
-               div = 6; /* 6 in RFDIV means divide by 7 */
-       else
-               div = 6 - div;
+       tdiv64 = sport->port.uartclk;
+       tdiv64 *= num;
+       do_div(tdiv64, denom * 16 * div);
+       if (sport->port.info && sport->port.info->port.tty) /* no tty during console setup */
+               tty_encode_baud_rate(sport->port.info->port.tty, tdiv64, tdiv64);
+
+       num -= 1;
+       denom -= 1;
 
        ufcr = readl(sport->port.membase + UFCR);
-       ufcr = (ufcr & (~UFCR_RFDIV)) |
-           (div << 7);
+       ufcr = (ufcr & (~UFCR_RFDIV)) | UFCR_RFDIV_REG(div);
        writel(ufcr, sport->port.membase + UFCR);
 
+       writel(num, sport->port.membase + UBIR);
+       writel(denom, sport->port.membase + UBMR);
+
 #ifdef ONEMS
        writel(sport->port.uartclk / div / 1000, sport->port.membase + ONEMS);
 #endif
@@ -1072,22 +1226,22 @@ static struct uart_driver imx_reg = {
 
 static int serial_imx_suspend(struct platform_device *dev, pm_message_t state)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+       struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_suspend_port(&imx_reg, &sport->port);
+       if (sport)
+               uart_suspend_port(&imx_reg, &sport->port);
 
-        return 0;
+       return 0;
 }
 
 static int serial_imx_resume(struct platform_device *dev)
 {
-        struct imx_port *sport = platform_get_drvdata(dev);
+       struct imx_port *sport = platform_get_drvdata(dev);
 
-        if (sport)
-                uart_resume_port(&imx_reg, &sport->port);
+       if (sport)
+               uart_resume_port(&imx_reg, &sport->port);
 
-        return 0;
+       return 0;
 }
 
 static int serial_imx_probe(struct platform_device *pdev)
@@ -1143,19 +1297,29 @@ static int serial_imx_probe(struct platform_device *pdev)
        imx_ports[pdev->id] = sport;
 
        pdata = pdev->dev.platform_data;
-       if(pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
+       if (pdata && (pdata->flags & IMXUART_HAVE_RTSCTS))
                sport->have_rtscts = 1;
 
+#ifdef CONFIG_IRDA
+       if (pdata && (pdata->flags & IMXUART_IRDA))
+               sport->use_irda = 1;
+#endif
+
        if (pdata->init) {
                ret = pdata->init(pdev);
                if (ret)
                        goto clkput;
        }
 
-       uart_add_one_port(&imx_reg, &sport->port);
+       ret = uart_add_one_port(&imx_reg, &sport->port);
+       if (ret)
+               goto deinit;
        platform_set_drvdata(pdev, &sport->port);
 
        return 0;
+deinit:
+       if (pdata->exit)
+               pdata->exit(pdev);
 clkput:
        clk_put(sport->clk);
        clk_disable(sport->clk);
@@ -1193,13 +1357,13 @@ static int serial_imx_remove(struct platform_device *pdev)
 }
 
 static struct platform_driver serial_imx_driver = {
-        .probe          = serial_imx_probe,
-        .remove         = serial_imx_remove,
+       .probe          = serial_imx_probe,
+       .remove         = serial_imx_remove,
 
        .suspend        = serial_imx_suspend,
        .resume         = serial_imx_resume,
        .driver         = {
-               .name   = "imx-uart",
+               .name   = "imx-uart",
                .owner  = THIS_MODULE,
        },
 };
index c0a3e2734e240c02dd6e5a0b4da8a2f66800df86..4e5f3bde0461ca4c8eb4638bcc11643466c697ab 100644 (file)
@@ -61,6 +61,7 @@ enum {
        if ((DBG_##nlevel & jsm_debug))                 \
        dev_printk(KERN_##klevel, pdev->dev, fmt, ## args)
 
+#define        MAXLINES        256
 #define MAXPORTS       8
 #define MAX_STOPS_SENT 5
 
index 31496dc0a0d17d65206cb14ba280a6a5ffe29229..107ce2e187b8fc1c1773530a74b0779fdce283de 100644 (file)
@@ -33,6 +33,8 @@
 
 #include "jsm.h"
 
+static DECLARE_BITMAP(linemap, MAXLINES);
+
 static void jsm_carrier(struct jsm_channel *ch);
 
 static inline int jsm_get_mstat(struct jsm_channel *ch)
@@ -433,6 +435,7 @@ int __devinit jsm_tty_init(struct jsm_board *brd)
 int __devinit jsm_uart_port_init(struct jsm_board *brd)
 {
        int i;
+       unsigned int line;
        struct jsm_channel *ch;
 
        if (!brd)
@@ -459,9 +462,15 @@ int __devinit jsm_uart_port_init(struct jsm_board *brd)
                brd->channels[i]->uart_port.membase = brd->re_map_membase;
                brd->channels[i]->uart_port.fifosize = 16;
                brd->channels[i]->uart_port.ops = &jsm_ops;
-               brd->channels[i]->uart_port.line = brd->channels[i]->ch_portnum + brd->boardnum * 2;
+               line = find_first_zero_bit(linemap, MAXLINES);
+               if (line >= MAXLINES) {
+                       printk(KERN_INFO "jsm: linemap is full, added device failed\n");
+                       continue;
+               } else
+                       set_bit((int)line, linemap);
+               brd->channels[i]->uart_port.line = line;
                if (uart_add_one_port (&jsm_uart_driver, &brd->channels[i]->uart_port))
-                       printk(KERN_INFO "Added device failed\n");
+                       printk(KERN_INFO "jsm: add device failed\n");
                else
                        printk(KERN_INFO "Added device \n");
        }
@@ -494,6 +503,7 @@ int jsm_remove_uart_port(struct jsm_board *brd)
 
                ch = brd->channels[i];
 
+               clear_bit((int)(ch->uart_port.line), linemap);
                uart_remove_one_port(&jsm_uart_driver, &brd->channels[i]->uart_port);
        }
 
diff --git a/drivers/serial/timbuart.c b/drivers/serial/timbuart.c
new file mode 100644 (file)
index 0000000..ac9e5d5
--- /dev/null
@@ -0,0 +1,526 @@
+/*
+ * timbuart.c timberdale FPGA UART driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#include <linux/pci.h>
+#include <linux/interrupt.h>
+#include <linux/serial_core.h>
+#include <linux/kernel.h>
+#include <linux/platform_device.h>
+#include <linux/ioport.h>
+
+#include "timbuart.h"
+
+/* Per-device driver state; wraps the serial-core port structure. */
+struct timbuart_port {
+       struct uart_port        port;   /* embedded core port, recovered via container_of() */
+       struct tasklet_struct   tasklet; /* bottom half scheduled by the IRQ handler and start_tx */
+       int                     usedma; /* non-zero makes the tasklet skip the PIO rx/tx handlers */
+       u8                      last_ier; /* IER value saved by the IRQ handler before masking */
+       struct platform_device  *dev;   /* NOTE(review): never assigned in this file - confirm use */
+};
+
+/* Supported baud rates in ascending order. get_bindex() returns an index
+ * into this table and timbuart_set_termios() writes that index to the
+ * TIMBUART_BAUDRATE register.
+ */
+static int baudrates[] = {9600, 19200, 38400, 57600, 115200, 230400, 460800,
+       921600, 1843200, 3250000};
+
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier);
+
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid);
+
+/* Mask every receive interrupt source (RXFLAGS) in the interrupt enable
+ * register, leaving the other enable bits untouched.
+ */
+static void timbuart_stop_rx(struct uart_port *port)
+{
+       u8 enabled;
+
+       /* spin lock held by upper layer, disable all RX interrupts */
+       enabled = ioread8(port->membase + TIMBUART_IER);
+       enabled &= ~RXFLAGS;
+       iowrite8(enabled, port->membase + TIMBUART_IER);
+}
+
+/* Mask the "TX buffer almost empty" interrupt (TXBAE) in the interrupt
+ * enable register, leaving the other enable bits untouched.
+ */
+static void timbuart_stop_tx(struct uart_port *port)
+{
+       u8 enabled;
+
+       /* spinlock held by upper layer, disable TX interrupt */
+       enabled = ioread8(port->membase + TIMBUART_IER);
+       enabled &= ~TXBAE;
+       iowrite8(enabled, port->membase + TIMBUART_IER);
+}
+
+/* Kick transmission. No data is written to the FIFO here; the work is
+ * deferred to the driver tasklet.
+ */
+static void timbuart_start_tx(struct uart_port *port)
+{
+       struct timbuart_port *tp;
+
+       tp = container_of(port, struct timbuart_port, port);
+
+       /* do not transfer anything here -> fire off the tasklet */
+       tasklet_schedule(&tp->tasklet);
+}
+
+/* Set the TX flush bit in the control register (keeping the other control
+ * bits as read) and then write TXBF to the interrupt status register.
+ */
+static void timbuart_flush_buffer(struct uart_port *port)
+{
+       u8 ctrl;
+
+       ctrl = ioread8(port->membase + TIMBUART_CTRL);
+       ctrl |= TIMBUART_CTRL_FLSHTX;
+       iowrite8(ctrl, port->membase + TIMBUART_CTRL);
+
+       iowrite8(TXBF, port->membase + TIMBUART_ISR);
+}
+
+/* Drain the receive FIFO into the tty flip buffer for as long as the status
+ * register reports RXDP, then push the buffer to the tty layer. The port
+ * lock is dropped around the push and re-taken afterwards, so the caller
+ * must hold port->lock on entry.
+ */
+static void timbuart_rx_chars(struct uart_port *port)
+{
+       struct tty_struct *tty = port->info->port.tty;
+       u8 ch;
+
+       for (;;) {
+               if (!(ioread8(port->membase + TIMBUART_ISR) & RXDP))
+                       break;
+
+               ch = ioread8(port->membase + TIMBUART_RXFIFO);
+               port->icount.rx++;
+               tty_insert_flip_char(tty, ch, TTY_NORMAL);
+       }
+
+       /* the push must not run under the port lock */
+       spin_unlock(&port->lock);
+       tty_flip_buffer_push(port->info->port.tty);
+       spin_lock(&port->lock);
+
+       dev_dbg(port->dev, "%s - total read %d bytes\n",
+               __func__, port->icount.rx);
+}
+
+/* Copy bytes from the circular transmit buffer to the TX FIFO until the
+ * status register reports TXBF or the circular buffer is empty.
+ */
+static void timbuart_tx_chars(struct uart_port *port)
+{
+       struct circ_buf *xmit = &port->info->xmit;
+
+       for (;;) {
+               /* check the FIFO state first, then the software buffer */
+               if (ioread8(port->membase + TIMBUART_ISR) & TXBF)
+                       break;
+               if (uart_circ_empty(xmit))
+                       break;
+
+               iowrite8(xmit->buf[xmit->tail],
+                       port->membase + TIMBUART_TXFIFO);
+               xmit->tail = (xmit->tail + 1) & (UART_XMIT_SIZE - 1);
+               port->icount.tx++;
+       }
+
+       dev_dbg(port->dev,
+               "%s - total written %d bytes, CTL: %x, RTS: %x, baud: %x\n",
+               __func__, port->icount.tx,
+               ioread8(port->membase + TIMBUART_CTRL),
+               port->mctrl & TIOCM_RTS,
+               ioread8(port->membase + TIMBUART_BAUDRATE));
+}
+
+/* Transmit half of the tasklet.
+ *
+ * @port: port being serviced
+ * @isr:  interrupt status value sampled by the caller
+ * @ier:  in/out accumulator of interrupt-enable bits; the caller writes the
+ *        final value to TIMBUART_IER
+ *
+ * Does nothing when the transmit buffer is empty, transmission is stopped,
+ * or an x_char is pending.
+ * NOTE(review): a pending x_char is never written to the FIFO here - confirm
+ * whether it is sent elsewhere.
+ */
+static void timbuart_handle_tx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+       struct timbuart_port *uart =
+               container_of(port, struct timbuart_port, port);
+       struct circ_buf *xmit = &port->info->xmit;
+
+       if (uart_circ_empty(xmit) || uart_tx_stopped(port))
+               return;
+
+       if (port->x_char)
+               return;
+
+       if (isr & TXFLAGS) {
+               /* a TX status bit is set: refill the FIFO */
+               timbuart_tx_chars(port);
+               /* clear all TX interrupts */
+               iowrite8(TXFLAGS, port->membase + TIMBUART_ISR);
+
+               /* let writers queue more data once the backlog is small */
+               if (uart_circ_chars_pending(xmit) < WAKEUP_CHARS)
+                       uart_write_wakeup(port);
+       } else
+               /* Re-enable any tx interrupt */
+               *ier |= uart->last_ier & TXFLAGS;
+
+       /* enable interrupts if there are chars in the transmit buffer,
+        * Or if we delivered some bytes and want the almost empty interrupt
+        * we wake up the upper layer later when we got the interrupt
+        * to give it some time to go out...
+        */
+       if (!uart_circ_empty(xmit))
+               *ier |= TXBAE;
+
+       dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+/* Receive half of the tasklet.
+ *
+ * @port: port being serviced
+ * @isr:  interrupt status value sampled by the caller
+ * @ier:  in/out accumulator of interrupt-enable bits
+ *
+ * When RXBF is set the RX FIFO is flushed and an overrun is counted;
+ * otherwise pending data (RXDP) is read out. All RX status bits are then
+ * acknowledged. RXBAF, RXBF and RXTT are always added to *ier.
+ */
+void timbuart_handle_rx_port(struct uart_port *port, u8 isr, u8 *ier)
+{
+       u8 ctrl;
+
+       if (isr & RXFLAGS) {
+               /* Some RX status is set */
+               if (isr & RXBF) {
+                       ctrl = ioread8(port->membase + TIMBUART_CTRL);
+                       ctrl |= TIMBUART_CTRL_FLSHRX;
+                       iowrite8(ctrl, port->membase + TIMBUART_CTRL);
+                       port->icount.overrun++;
+               } else if (isr & RXDP) {
+                       timbuart_rx_chars(port);
+               }
+
+               /* ack all RX interrupts */
+               iowrite8(RXFLAGS, port->membase + TIMBUART_ISR);
+       }
+
+       /* always have the RX interrupts enabled */
+       *ier |= RXBAF | RXBF | RXTT;
+
+       dev_dbg(port->dev, "%s - leaving\n", __func__);
+}
+
+/* Bottom half: services TX, modem-control and RX events under the port
+ * lock and then programs the interrupt enable register.
+ *
+ * @arg: the struct timbuart_port pointer, cast to unsigned long
+ *
+ * The enable mask starts at 0 and each handler ORs in the sources it wants;
+ * the result is written to TIMBUART_IER, which the IRQ handler cleared
+ * before scheduling this tasklet.
+ */
+void timbuart_tasklet(unsigned long arg)
+{
+       struct timbuart_port *uart = (struct timbuart_port *)arg;
+       u8 isr, ier = 0;
+
+       spin_lock(&uart->port.lock);
+
+       /* one status snapshot is shared by all three handlers */
+       isr = ioread8(uart->port.membase + TIMBUART_ISR);
+       dev_dbg(uart->port.dev, "%s ISR: %x\n", __func__, isr);
+
+       /* PIO transmit path is skipped when usedma is set */
+       if (!uart->usedma)
+               timbuart_handle_tx_port(&uart->port, isr, &ier);
+
+       timbuart_mctrl_check(&uart->port, isr, &ier);
+
+       /* PIO receive path is skipped when usedma is set */
+       if (!uart->usedma)
+               timbuart_handle_rx_port(&uart->port, isr, &ier);
+
+       /* re-enable the interrupt sources collected above */
+       iowrite8(ier, uart->port.membase + TIMBUART_IER);
+
+       spin_unlock(&uart->port.lock);
+       dev_dbg(uart->port.dev, "%s leaving\n", __func__);
+}
+
+/* Report TIOCSER_TEMT when the TXBAE bit is set in the interrupt status
+ * register, 0 otherwise.
+ */
+static unsigned int timbuart_tx_empty(struct uart_port *port)
+{
+       u8 status = ioread8(port->membase + TIMBUART_ISR);
+
+       if (status & TXBAE)
+               return TIOCSER_TEMT;
+
+       return 0;
+}
+
+/* Return the modem status lines. DSR and CAR are always reported as
+ * asserted; CTS follows the CTS bit of the control register.
+ */
+static unsigned int timbuart_get_mctrl(struct uart_port *port)
+{
+       unsigned int lines = TIOCM_DSR | TIOCM_CAR;
+       u8 ctrl = ioread8(port->membase + TIMBUART_CTRL);
+
+       dev_dbg(port->dev, "%s - cts %x\n", __func__, ctrl);
+
+       if (ctrl & TIMBUART_CTRL_CTS)
+               lines |= TIOCM_CTS;
+
+       return lines;
+}
+
+/* Drive the RTS line from the requested modem-control state.
+ *
+ * @port:  port to program
+ * @mctrl: TIOCM_* bit mask; only TIOCM_RTS is honoured
+ *
+ * The control register is written with the RTS bit when TIOCM_RTS is
+ * requested and with 0 otherwise, so RTS can actually be deasserted.
+ */
+static void timbuart_set_mctrl(struct uart_port *port, unsigned int mctrl)
+{
+       dev_dbg(port->dev, "%s - %x\n", __func__, mctrl);
+
+       if (mctrl & TIOCM_RTS)
+               iowrite8(TIMBUART_CTRL_RTS, port->membase + TIMBUART_CTRL);
+       else
+               /* clear RTS; writing the RTS bit here would keep it set */
+               iowrite8(0, port->membase + TIMBUART_CTRL);
+}
+
+/* Modem-control part of the tasklet.
+ *
+ * @port: port being serviced
+ * @isr:  interrupt status value sampled by the caller
+ * @ier:  in/out accumulator of interrupt-enable bits
+ *
+ * CTS_DELTA is always added to *ier. When the status shows a CTS change it
+ * is acknowledged, reported to the serial core, and waiters on
+ * delta_msr_wait are woken.
+ */
+static void timbuart_mctrl_check(struct uart_port *port, u8 isr, u8 *ier)
+{
+       unsigned int mstat;
+
+       *ier |= CTS_DELTA;
+
+       if (!(isr & CTS_DELTA))
+               return;
+
+       /* ack */
+       iowrite8(CTS_DELTA, port->membase + TIMBUART_ISR);
+       mstat = timbuart_get_mctrl(port);
+       uart_handle_cts_change(port, mstat & TIOCM_CTS);
+       wake_up_interruptible(&port->info->delta_msr_wait);
+}
+
+/* uart_ops.enable_ms callback: intentionally a no-op in this driver. */
+static void timbuart_enable_ms(struct uart_port *port)
+{
+       /* N/A */
+}
+
+/* uart_ops.break_ctl callback: intentionally a no-op in this driver. */
+static void timbuart_break_ctl(struct uart_port *port, int ctl)
+{
+       /* N/A */
+}
+
+/* Prepare the port for use: flush the RX FIFO, write 0xff to the interrupt
+ * status register, enable the RX and CTS-change interrupt sources, and
+ * finally request the (shared) interrupt line.
+ *
+ * Returns the result of request_irq().
+ */
+static int timbuart_startup(struct uart_port *port)
+{
+       struct timbuart_port *tp;
+       int ret;
+
+       tp = container_of(port, struct timbuart_port, port);
+
+       dev_dbg(port->dev, "%s\n", __func__);
+
+       iowrite8(TIMBUART_CTRL_FLSHRX, port->membase + TIMBUART_CTRL);
+       iowrite8(0xff, port->membase + TIMBUART_ISR);
+       /* Enable all but TX interrupts */
+       iowrite8(RXBAF | RXBF | RXTT | CTS_DELTA,
+               port->membase + TIMBUART_IER);
+
+       ret = request_irq(port->irq, timbuart_handleinterrupt, IRQF_SHARED,
+               "timb-uart", tp);
+
+       return ret;
+}
+
+/* Release the interrupt line, then disable every interrupt source of the
+ * device by writing 0 to the interrupt enable register.
+ */
+static void timbuart_shutdown(struct uart_port *port)
+{
+       struct timbuart_port *tp;
+
+       tp = container_of(port, struct timbuart_port, port);
+
+       dev_dbg(port->dev, "%s\n", __func__);
+
+       free_irq(port->irq, tp);
+       iowrite8(0, port->membase + TIMBUART_IER);
+}
+
+/* Return the index of the first baudrates[] entry that is greater than or
+ * equal to @baud, or -1 when @baud exceeds every entry.
+ */
+static int get_bindex(int baud)
+{
+       int idx = 0;
+
+       while (idx < ARRAY_SIZE(baudrates)) {
+               if (baudrates[idx] >= baud)
+                       return idx;
+               idx++;
+       }
+
+       return -1;
+}
+
+/* Apply new terminal settings.
+ *
+ * Only the baud rate is programmable: the requested rate is mapped to an
+ * entry of baudrates[], every other hardware-related termios field is
+ * copied back from @old (when given), and the table index is written to
+ * the baud rate register under the port lock.
+ */
+static void timbuart_set_termios(struct uart_port *port,
+       struct ktermios *termios,
+       struct ktermios *old)
+{
+       unsigned long irqflags;
+       unsigned int rate;
+       short idx;
+
+       rate = uart_get_baud_rate(port, termios, old, 0, port->uartclk / 16);
+       idx = get_bindex(rate);
+       dev_dbg(port->dev, "%s - bindex %d\n", __func__, idx);
+
+       /* no table entry matched: use the first one */
+       if (idx < 0)
+               idx = 0;
+       rate = baudrates[idx];
+
+       /* The serial layer calls into this once with old = NULL when setting
+          up initially */
+       if (old)
+               tty_termios_copy_hw(termios, old);
+       tty_termios_encode_baud_rate(termios, rate, rate);
+
+       spin_lock_irqsave(&port->lock, irqflags);
+       iowrite8((u8)idx, port->membase + TIMBUART_BAUDRATE);
+       uart_update_timeout(port, termios->c_cflag, rate);
+       spin_unlock_irqrestore(&port->lock, irqflags);
+}
+
+/* uart_ops.type callback: return the human-readable port name.
+ *
+ * timbuart_config_port() sets port->type to PORT_TIMBUART, so the name is
+ * returned for that type and NULL for any other. Testing for PORT_UNKNOWN
+ * instead would leave a configured port without a name.
+ */
+static const char *timbuart_type(struct uart_port *port)
+{
+       return port->type == PORT_TIMBUART ? "timbuart" : NULL;
+}
+
+/* We do not request/release mappings of the registers here,
+ * currently it's done in the probe function.
+ */
+static void timbuart_release_port(struct uart_port *port)
+{
+       /* Undo timbuart_request_port(): unmap the registers when the port
+        * uses UPF_IOREMAP and release the memory region. The region size
+        * comes from the first memory resource of the platform device.
+        */
+       struct platform_device *pdev = to_platform_device(port->dev);
+       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       int size = resource_size(res);
+
+       if (port->flags & UPF_IOREMAP) {
+               iounmap(port->membase);
+               port->membase = NULL;
+       }
+
+       release_mem_region(port->mapbase, size);
+}
+
+/* Claim the register memory region and, for UPF_IOREMAP ports, map it.
+ *
+ * Returns 0 on success, -EBUSY when the region cannot be reserved, or
+ * -ENOMEM when the mapping fails (the region is released again).
+ */
+static int timbuart_request_port(struct uart_port *port)
+{
+       struct platform_device *pdev = to_platform_device(port->dev);
+       struct resource *res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+       int size = resource_size(res);
+
+       if (!request_mem_region(port->mapbase, size, "timb-uart"))
+               return -EBUSY;
+
+       /* nothing to map unless the port asks for it */
+       if (!(port->flags & UPF_IOREMAP))
+               return 0;
+
+       port->membase = ioremap(port->mapbase, size);
+       if (port->membase != NULL)
+               return 0;
+
+       release_mem_region(port->mapbase, size);
+       return -ENOMEM;
+}
+
+/* Top-half interrupt handler.
+ *
+ * @irq:   interrupt number (unused)
+ * @devid: the struct timbuart_port passed to request_irq()
+ *
+ * Returns IRQ_NONE when the interrupt pending register reads 0. Otherwise
+ * the current enable mask is saved in last_ier, all sources are masked,
+ * the tasklet is scheduled and IRQ_HANDLED is returned.
+ */
+static irqreturn_t timbuart_handleinterrupt(int irq, void *devid)
+{
+       struct timbuart_port *tp = (struct timbuart_port *)devid;
+
+       if (!ioread8(tp->port.membase + TIMBUART_IPR))
+               return IRQ_NONE;
+
+       tp->last_ier = ioread8(tp->port.membase + TIMBUART_IER);
+
+       /* disable interrupts, the tasklet enables them again */
+       iowrite8(0, tp->port.membase + TIMBUART_IER);
+
+       /* fire off bottom half */
+       tasklet_schedule(&tp->tasklet);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Configure/autoconfigure the port.
+ */
+static void timbuart_config_port(struct uart_port *port, int flags)
+{
+       /* only act when the core asks for the port type to be set */
+       if (!(flags & UART_CONFIG_TYPE))
+               return;
+
+       port->type = PORT_TIMBUART;
+       /* the result of the request is not checked here */
+       timbuart_request_port(port);
+}
+
+/* uart_ops.verify_port callback: unconditionally rejects the supplied
+ * settings by returning -EINVAL; neither argument is inspected.
+ */
+static int timbuart_verify_port(struct uart_port *port,
+       struct serial_struct *ser)
+{
+       /* we don't want the core code to modify any port params */
+       return -EINVAL;
+}
+
+/* Serial-core callback table; installed on the port in timbuart_probe(). */
+static struct uart_ops timbuart_ops = {
+       .tx_empty = timbuart_tx_empty,
+       .set_mctrl = timbuart_set_mctrl,
+       .get_mctrl = timbuart_get_mctrl,
+       .stop_tx = timbuart_stop_tx,
+       .start_tx = timbuart_start_tx,
+       .flush_buffer = timbuart_flush_buffer,
+       .stop_rx = timbuart_stop_rx,
+       .enable_ms = timbuart_enable_ms,        /* no-op */
+       .break_ctl = timbuart_break_ctl,        /* no-op */
+       .startup = timbuart_startup,
+       .shutdown = timbuart_shutdown,
+       .set_termios = timbuart_set_termios,
+       .type = timbuart_type,
+       .release_port = timbuart_release_port,
+       .request_port = timbuart_request_port,
+       .config_port = timbuart_config_port,
+       .verify_port = timbuart_verify_port
+};
+
+/* Serial-core driver description. It is registered in timbuart_probe() and
+ * unregistered in timbuart_remove(); .nr = 1 allows a single port.
+ */
+static struct uart_driver timbuart_driver = {
+       .owner = THIS_MODULE,
+       .driver_name = "timberdale_uart",
+       .dev_name = "ttyTU",
+       .major = TIMBUART_MAJOR,
+       .minor = TIMBUART_MINOR,
+       .nr = 1
+};
+
+/* Platform probe: allocate the per-device state, describe the port to the
+ * serial core, and register the uart driver and its single port.
+ *
+ * @dev: platform device providing one memory resource and one interrupt
+ *
+ * Returns 0 on success or a negative errno:
+ *   -ENOMEM  state allocation failed or the memory resource is missing
+ *   -EINVAL  no interrupt is available
+ *   other    error from uart_register_driver() / uart_add_one_port()
+ */
+static int timbuart_probe(struct platform_device *dev)
+{
+       int err;
+       int irq;
+       struct timbuart_port *uart;
+       struct resource *iomem;
+
+       dev_dbg(&dev->dev, "%s\n", __func__);
+
+       uart = kzalloc(sizeof(*uart), GFP_KERNEL);
+       if (!uart) {
+               /* allocation failure is an out-of-memory condition */
+               err = -ENOMEM;
+               goto err_mem;
+       }
+
+       uart->usedma = 0;
+
+       uart->port.uartclk = 3250000 * 16;
+       uart->port.fifosize  = TIMBUART_FIFO_SIZE;
+       uart->port.regshift  = 2;
+       uart->port.iotype  = UPIO_MEM;
+       uart->port.ops = &timbuart_ops;
+       uart->port.irq = 0;
+       uart->port.flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP;
+       uart->port.line  = 0;
+       uart->port.dev  = &dev->dev;
+
+       iomem = platform_get_resource(dev, IORESOURCE_MEM, 0);
+       if (!iomem) {
+               err = -ENOMEM;
+               goto err_register;
+       }
+       uart->port.mapbase = iomem->start;
+       uart->port.membase = NULL;
+
+       /* Check the result in a signed local: port.irq is unsigned, so a
+        * negative error code stored there could never be detected.
+        */
+       irq = platform_get_irq(dev, 0);
+       if (irq < 0) {
+               err = -EINVAL;
+               goto err_register;
+       }
+       uart->port.irq = irq;
+
+       tasklet_init(&uart->tasklet, timbuart_tasklet, (unsigned long)uart);
+
+       err = uart_register_driver(&timbuart_driver);
+       if (err)
+               goto err_register;
+
+       err = uart_add_one_port(&timbuart_driver, &uart->port);
+       if (err)
+               goto err_add_port;
+
+       platform_set_drvdata(dev, uart);
+
+       return 0;
+
+err_add_port:
+       uart_unregister_driver(&timbuart_driver);
+err_register:
+       kfree(uart);
+err_mem:
+       printk(KERN_ERR "timberdale: Failed to register Timberdale UART: %d\n",
+               err);
+
+       return err;
+}
+
+/* Platform remove: stop the tasklet, detach the port from the serial core,
+ * unregister the uart driver and free the per-device state. Always
+ * returns 0.
+ */
+static int timbuart_remove(struct platform_device *dev)
+{
+       struct timbuart_port *tp;
+
+       tp = platform_get_drvdata(dev);
+
+       tasklet_kill(&tp->tasklet);
+       uart_remove_one_port(&timbuart_driver, &tp->port);
+       uart_unregister_driver(&timbuart_driver);
+       kfree(tp);
+
+       return 0;
+}
+
+/* Platform driver glue; the name matches MODULE_ALIAS("platform:timb-uart"). */
+static struct platform_driver timbuart_platform_driver = {
+       .driver = {
+               .name   = "timb-uart",
+               .owner  = THIS_MODULE,
+       },
+       .probe          = timbuart_probe,
+       .remove         = timbuart_remove,
+};
+
+/*--------------------------------------------------------------------------*/
+
+/* Module entry point: register the platform driver and return the result. */
+static int __init timbuart_init(void)
+{
+       int ret;
+
+       ret = platform_driver_register(&timbuart_platform_driver);
+
+       return ret;
+}
+
+/* Module exit point: unregister the platform driver. */
+static void __exit timbuart_exit(void)
+{
+       platform_driver_unregister(&timbuart_platform_driver);
+}
+
+module_init(timbuart_init);
+module_exit(timbuart_exit);
+
+MODULE_DESCRIPTION("Timberdale UART driver");
+MODULE_LICENSE("GPL v2");
+MODULE_ALIAS("platform:timb-uart");
+
diff --git a/drivers/serial/timbuart.h b/drivers/serial/timbuart.h
new file mode 100644 (file)
index 0000000..7e56676
--- /dev/null
@@ -0,0 +1,58 @@
+/*
+ * timbuart.h timberdale FPGA UART driver
+ * Copyright (c) 2009 Intel Corporation
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+/* Supports:
+ * Timberdale FPGA UART
+ */
+
+#ifndef _TIMBUART_H
+#define _TIMBUART_H
+
+/* FIFO depth reported to the serial core as the port's fifosize */
+#define TIMBUART_FIFO_SIZE     2048
+
+/* Register offsets, added to the port's membase */
+#define TIMBUART_RXFIFO                0x08
+#define TIMBUART_TXFIFO                0x0c
+#define TIMBUART_IER           0x10
+#define TIMBUART_IPR           0x14
+#define TIMBUART_ISR           0x18
+#define TIMBUART_CTRL          0x1c
+#define TIMBUART_BAUDRATE      0x20
+
+/* Bits of the TIMBUART_CTRL register */
+#define TIMBUART_CTRL_RTS      0x01
+#define TIMBUART_CTRL_CTS      0x02
+#define TIMBUART_CTRL_FLSHTX   0x40
+#define TIMBUART_CTRL_FLSHRX   0x80
+
+/* Interrupt bits, used by the driver with TIMBUART_ISR and TIMBUART_IER.
+ * NOTE(review): TXBE (0x100) does not fit the 8-bit register accesses the
+ * driver performs and is not referenced by it - confirm intended use.
+ */
+#define TXBF           0x01
+#define TXBAE          0x02
+#define CTS_DELTA      0x04
+#define RXDP           0x08
+#define RXBAF          0x10
+#define RXBF           0x20
+#define RXTT           0x40
+#define RXBNAE         0x80
+#define TXBE           0x100
+
+/* Groups of receive-related and transmit-related interrupt bits */
+#define RXFLAGS (RXDP | RXBAF | RXBF | RXTT | RXBNAE)
+#define TXFLAGS (TXBF | TXBAE)
+
+/* Character device numbers used by the uart_driver definition */
+#define TIMBUART_MAJOR 204
+#define TIMBUART_MINOR 192
+
+#endif /* _TIMBUART_H */
+
index 7a1164dd1d37f4bf115b236ef088fe8c90a82f50..ddeb6919253734ebcb3b7a92b6294b30bb305d52 100644 (file)
@@ -16,7 +16,8 @@
  *     v0.9  - thorough cleaning, URBification, almost a rewrite
  *     v0.10 - some more cleanups
  *     v0.11 - fixed flow control, read error doesn't stop reads
- *     v0.12 - added TIOCM ioctls, added break handling, made struct acm kmalloced
+ *     v0.12 - added TIOCM ioctls, added break handling, made struct acm
+ *             kmalloced
  *     v0.13 - added termios, added hangup
  *     v0.14 - sized down struct acm
  *     v0.15 - fixed flow control again - characters could be lost
@@ -62,7 +63,7 @@
 #include <linux/tty_flip.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <asm/uaccess.h>
+#include <linux/uaccess.h>
 #include <linux/usb.h>
 #include <linux/usb/cdc.h>
 #include <asm/byteorder.h>
@@ -87,7 +88,10 @@ static struct acm *acm_table[ACM_TTY_MINORS];
 
 static DEFINE_MUTEX(open_mutex);
 
-#define ACM_READY(acm) (acm && acm->dev && acm->used)
+#define ACM_READY(acm) (acm && acm->dev && acm->port.count)
+
+static const struct tty_port_operations acm_port_ops = {
+};
 
 #ifdef VERBOSE_DEBUG
 #define verbose        1
@@ -99,13 +103,15 @@ static DEFINE_MUTEX(open_mutex);
  * Functions for ACM control messages.
  */
 
-static int acm_ctrl_msg(struct acm *acm, int request, int value, void *buf, int len)
+static int acm_ctrl_msg(struct acm *acm, int request, int value,
+                                                       void *buf, int len)
 {
        int retval = usb_control_msg(acm->dev, usb_sndctrlpipe(acm->dev, 0),
                request, USB_RT_ACM, value,
                acm->control->altsetting[0].desc.bInterfaceNumber,
                buf, len, 5000);
-       dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d", request, value, len, retval);
+       dbg("acm_control_msg: rq: 0x%02x val: %#x len: %#x result: %d",
+                                               request, value, len, retval);
        return retval < 0 ? retval : 0;
 }
 
@@ -150,9 +156,8 @@ static int acm_wb_is_avail(struct acm *acm)
 
        n = ACM_NW;
        spin_lock_irqsave(&acm->write_lock, flags);
-       for (i = 0; i < ACM_NW; i++) {
+       for (i = 0; i < ACM_NW; i++)
                n -= acm->wb[i].use;
-       }
        spin_unlock_irqrestore(&acm->write_lock, flags);
        return n;
 }
@@ -183,7 +188,8 @@ static int acm_start_wb(struct acm *acm, struct acm_wb *wb)
        wb->urb->transfer_buffer_length = wb->len;
        wb->urb->dev = acm->dev;
 
-       if ((rc = usb_submit_urb(wb->urb, GFP_ATOMIC)) < 0) {
+       rc = usb_submit_urb(wb->urb, GFP_ATOMIC);
+       if (rc < 0) {
                dbg("usb_submit_urb(write bulk) failed: %d", rc);
                acm_write_done(acm, wb);
        }
@@ -262,6 +268,7 @@ static void acm_ctrl_irq(struct urb *urb)
 {
        struct acm *acm = urb->context;
        struct usb_cdc_notification *dr = urb->transfer_buffer;
+       struct tty_struct *tty;
        unsigned char *data;
        int newctrl;
        int retval;
@@ -287,40 +294,45 @@ static void acm_ctrl_irq(struct urb *urb)
 
        data = (unsigned char *)(dr + 1);
        switch (dr->bNotificationType) {
+       case USB_CDC_NOTIFY_NETWORK_CONNECTION:
+               dbg("%s network", dr->wValue ?
+                                       "connected to" : "disconnected from");
+               break;
 
-               case USB_CDC_NOTIFY_NETWORK_CONNECTION:
-
-                       dbg("%s network", dr->wValue ? "connected to" : "disconnected from");
-                       break;
-
-               case USB_CDC_NOTIFY_SERIAL_STATE:
-
-                       newctrl = get_unaligned_le16(data);
+       case USB_CDC_NOTIFY_SERIAL_STATE:
+               tty = tty_port_tty_get(&acm->port);
+               newctrl = get_unaligned_le16(data);
 
-                       if (acm->tty && !acm->clocal && (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
+               if (tty) {
+                       if (!acm->clocal &&
+                               (acm->ctrlin & ~newctrl & ACM_CTRL_DCD)) {
                                dbg("calling hangup");
-                               tty_hangup(acm->tty);
+                               tty_hangup(tty);
                        }
+                       tty_kref_put(tty);
+               }
 
-                       acm->ctrlin = newctrl;
-
-                       dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
-                               acm->ctrlin & ACM_CTRL_DCD ? '+' : '-', acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
-                               acm->ctrlin & ACM_CTRL_BRK ? '+' : '-', acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
-                               acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',     acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
-                               acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
+               acm->ctrlin = newctrl;
 
+               dbg("input control lines: dcd%c dsr%c break%c ring%c framing%c parity%c overrun%c",
+                       acm->ctrlin & ACM_CTRL_DCD ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_DSR ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_BRK ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_RI  ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_FRAMING ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_PARITY ? '+' : '-',
+                       acm->ctrlin & ACM_CTRL_OVERRUN ? '+' : '-');
                        break;
 
-               default:
-                       dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
-                               dr->bNotificationType, dr->wIndex,
-                               dr->wLength, data[0], data[1]);
-                       break;
+       default:
+               dbg("unknown notification %d received: index %d len %d data0 %d data1 %d",
+                       dr->bNotificationType, dr->wIndex,
+                       dr->wLength, data[0], data[1]);
+               break;
        }
 exit:
        usb_mark_last_busy(acm->dev);
-       retval = usb_submit_urb (urb, GFP_ATOMIC);
+       retval = usb_submit_urb(urb, GFP_ATOMIC);
        if (retval)
                dev_err(&urb->dev->dev, "%s - usb_submit_urb failed with "
                        "result %d", __func__, retval);
@@ -371,15 +383,14 @@ static void acm_rx_tasklet(unsigned long _acm)
 {
        struct acm *acm = (void *)_acm;
        struct acm_rb *buf;
-       struct tty_struct *tty = acm->tty;
+       struct tty_struct *tty;
        struct acm_ru *rcv;
        unsigned long flags;
        unsigned char throttled;
 
        dbg("Entering acm_rx_tasklet");
 
-       if (!ACM_READY(acm))
-       {
+       if (!ACM_READY(acm)) {
                dbg("acm_rx_tasklet: ACM not ready");
                return;
        }
@@ -387,12 +398,13 @@ static void acm_rx_tasklet(unsigned long _acm)
        spin_lock_irqsave(&acm->throttle_lock, flags);
        throttled = acm->throttle;
        spin_unlock_irqrestore(&acm->throttle_lock, flags);
-       if (throttled)
-       {
+       if (throttled) {
                dbg("acm_rx_tasklet: throttled");
                return;
        }
 
+       tty = tty_port_tty_get(&acm->port);
+
 next_buffer:
        spin_lock_irqsave(&acm->read_lock, flags);
        if (list_empty(&acm->filled_read_bufs)) {
@@ -406,20 +418,22 @@ next_buffer:
 
        dbg("acm_rx_tasklet: procesing buf 0x%p, size = %d", buf, buf->size);
 
-       tty_buffer_request_room(tty, buf->size);
-       spin_lock_irqsave(&acm->throttle_lock, flags);
-       throttled = acm->throttle;
-       spin_unlock_irqrestore(&acm->throttle_lock, flags);
-       if (!throttled)
-               tty_insert_flip_string(tty, buf->base, buf->size);
-       tty_flip_buffer_push(tty);
-
-       if (throttled) {
-               dbg("Throttling noticed");
-               spin_lock_irqsave(&acm->read_lock, flags);
-               list_add(&buf->list, &acm->filled_read_bufs);
-               spin_unlock_irqrestore(&acm->read_lock, flags);
-               return;
+       if (tty) {
+               spin_lock_irqsave(&acm->throttle_lock, flags);
+               throttled = acm->throttle;
+               spin_unlock_irqrestore(&acm->throttle_lock, flags);
+               if (!throttled) {
+                       tty_buffer_request_room(tty, buf->size);
+                       tty_insert_flip_string(tty, buf->base, buf->size);
+                       tty_flip_buffer_push(tty);
+               } else {
+                       tty_kref_put(tty);
+                       dbg("Throttling noticed");
+                       spin_lock_irqsave(&acm->read_lock, flags);
+                       list_add(&buf->list, &acm->filled_read_bufs);
+                       spin_unlock_irqrestore(&acm->read_lock, flags);
+                       return;
+               }
        }
 
        spin_lock_irqsave(&acm->read_lock, flags);
@@ -428,6 +442,8 @@ next_buffer:
        goto next_buffer;
 
 urbs:
+       tty_kref_put(tty);
+
        while (!list_empty(&acm->spare_read_bufs)) {
                spin_lock_irqsave(&acm->read_lock, flags);
                if (list_empty(&acm->spare_read_urbs)) {
@@ -454,10 +470,11 @@ urbs:
                rcv->urb->transfer_dma = buf->dma;
                rcv->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
 
-               /* This shouldn't kill the driver as unsuccessful URBs are returned to the
-                  free-urbs-pool and resubmited ASAP */
+               /* This shouldn't kill the driver as unsuccessful URBs are
+                  returned to the free-urbs-pool and resubmited ASAP */
                spin_lock_irqsave(&acm->read_lock, flags);
-               if (acm->susp_count || usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
+               if (acm->susp_count ||
+                               usb_submit_urb(rcv->urb, GFP_ATOMIC) < 0) {
                        list_add(&buf->list, &acm->spare_read_bufs);
                        list_add(&rcv->list, &acm->spare_read_urbs);
                        acm->processing = 0;
@@ -499,11 +516,14 @@ static void acm_write_bulk(struct urb *urb)
+/* Deferred TX work: wake up writers on the tty attached to the port.
+ * tty_port_tty_get() returns NULL when no tty is attached, so the result
+ * is checked before it is passed to tty_wakeup().
+ */
 static void acm_softint(struct work_struct *work)
 {
        struct acm *acm = container_of(work, struct acm, work);
+       struct tty_struct *tty;
 
        dev_vdbg(&acm->data->dev, "tx work\n");
        if (!ACM_READY(acm))
                return;
-       tty_wakeup(acm->tty);
+       tty = tty_port_tty_get(&acm->port);
+       if (!tty)
+               return;
+       tty_wakeup(tty);
+       tty_kref_put(tty);
 }
 
 static void acm_waker(struct work_struct *waker)
@@ -543,8 +563,9 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
                rv = 0;
 
        set_bit(TTY_NO_WRITE_SPLIT, &tty->flags);
+
        tty->driver_data = acm;
-       acm->tty = tty;
+       tty_port_tty_set(&acm->port, tty);
 
        if (usb_autopm_get_interface(acm->control) < 0)
                goto early_bail;
@@ -552,11 +573,10 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
                acm->control->needs_remote_wakeup = 1;
 
        mutex_lock(&acm->mutex);
-       if (acm->used++) {
+       if (acm->port.count++) {
                usb_autopm_put_interface(acm->control);
                goto done;
-        }
-
+       }
 
        acm->ctrlurb->dev = acm->dev;
        if (usb_submit_urb(acm->ctrlurb, GFP_KERNEL)) {
@@ -567,22 +587,22 @@ static int acm_tty_open(struct tty_struct *tty, struct file *filp)
        if (0 > acm_set_control(acm, acm->ctrlout = ACM_CTRL_DTR | ACM_CTRL_RTS) &&
            (acm->ctrl_caps & USB_CDC_CAP_LINE))
                goto full_bailout;
+
        usb_autopm_put_interface(acm->control);
 
        INIT_LIST_HEAD(&acm->spare_read_urbs);
        INIT_LIST_HEAD(&acm->spare_read_bufs);
        INIT_LIST_HEAD(&acm->filled_read_bufs);
-       for (i = 0; i < acm->rx_buflimit; i++) {
+
+       for (i = 0; i < acm->rx_buflimit; i++)
                list_add(&(acm->ru[i].list), &acm->spare_read_urbs);
-       }
-       for (i = 0; i < acm->rx_buflimit; i++) {
+       for (i = 0; i < acm->rx_buflimit; i++)
                list_add(&(acm->rb[i].list), &acm->spare_read_bufs);
-       }
 
        acm->throttle = 0;
 
        tasklet_schedule(&acm->urb_task);
-
+       rv = tty_port_block_til_ready(&acm->port, tty, filp);
 done:
        mutex_unlock(&acm->mutex);
 err_out:
@@ -593,16 +613,17 @@ full_bailout:
        usb_kill_urb(acm->ctrlurb);
 bail_out:
        usb_autopm_put_interface(acm->control);
-       acm->used--;
+       acm->port.count--;
        mutex_unlock(&acm->mutex);
 early_bail:
        mutex_unlock(&open_mutex);
+       tty_port_tty_set(&acm->port, NULL);
        return -EIO;
 }
 
 static void acm_tty_unregister(struct acm *acm)
 {
-       int i,nr;
+       int i, nr;
 
        nr = acm->rx_buflimit;
        tty_unregister_device(acm_tty_driver, acm->minor);
@@ -619,41 +640,56 @@ static void acm_tty_unregister(struct acm *acm)
 
 static int acm_tty_chars_in_buffer(struct tty_struct *tty);
 
+static void acm_port_down(struct acm *acm, int drain)
+{
+       int i, nr = acm->rx_buflimit;
+       mutex_lock(&open_mutex);
+       if (acm->dev) {
+               usb_autopm_get_interface(acm->control);
+               acm_set_control(acm, acm->ctrlout = 0);
+               /* try letting the last writes drain naturally */
+               if (drain) {
+                       wait_event_interruptible_timeout(acm->drain_wait,
+                               (ACM_NW == acm_wb_is_avail(acm)) || !acm->dev,
+                                       ACM_CLOSE_TIMEOUT * HZ);
+               }
+               usb_kill_urb(acm->ctrlurb);
+               for (i = 0; i < ACM_NW; i++)
+                       usb_kill_urb(acm->wb[i].urb);
+               for (i = 0; i < nr; i++)
+                       usb_kill_urb(acm->ru[i].urb);
+               acm->control->needs_remote_wakeup = 0;
+               usb_autopm_put_interface(acm->control);
+       }
+       mutex_unlock(&open_mutex);
+}
+
+static void acm_tty_hangup(struct tty_struct *tty)
+{
+       struct acm *acm = tty->driver_data;
+       tty_port_hangup(&acm->port);
+       acm_port_down(acm, 0);
+}
+
 static void acm_tty_close(struct tty_struct *tty, struct file *filp)
 {
        struct acm *acm = tty->driver_data;
-       int i,nr;
 
-       if (!acm || !acm->used)
+       /* Perform the closing process and see if we need to do the hardware
+          shutdown */
+       if (tty_port_close_start(&acm->port, tty, filp) == 0)
                return;
-
-       nr = acm->rx_buflimit;
+       acm_port_down(acm, 0);
+       tty_port_close_end(&acm->port, tty);
        mutex_lock(&open_mutex);
-       if (!--acm->used) {
-               if (acm->dev) {
-                       usb_autopm_get_interface(acm->control);
-                       acm_set_control(acm, acm->ctrlout = 0);
-
-                       /* try letting the last writes drain naturally */
-                       wait_event_interruptible_timeout(acm->drain_wait,
-                                       (ACM_NW == acm_wb_is_avail(acm))
-                                               || !acm->dev,
-                                       ACM_CLOSE_TIMEOUT * HZ);
-
-                       usb_kill_urb(acm->ctrlurb);
-                       for (i = 0; i < ACM_NW; i++)
-                               usb_kill_urb(acm->wb[i].urb);
-                       for (i = 0; i < nr; i++)
-                               usb_kill_urb(acm->ru[i].urb);
-                       acm->control->needs_remote_wakeup = 0;
-                       usb_autopm_put_interface(acm->control);
-               } else
-                       acm_tty_unregister(acm);
-       }
+       tty_port_tty_set(&acm->port, NULL);
+       if (!acm->dev)
+               acm_tty_unregister(acm);
        mutex_unlock(&open_mutex);
 }
 
-static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int count)
+static int acm_tty_write(struct tty_struct *tty,
+                                       const unsigned char *buf, int count)
 {
        struct acm *acm = tty->driver_data;
        int stat;
@@ -669,7 +705,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
                return 0;
 
        spin_lock_irqsave(&acm->write_lock, flags);
-       if ((wbn = acm_wb_alloc(acm)) < 0) {
+       wbn = acm_wb_alloc(acm);
+       if (wbn < 0) {
                spin_unlock_irqrestore(&acm->write_lock, flags);
                return 0;
        }
@@ -681,7 +718,8 @@ static int acm_tty_write(struct tty_struct *tty, const unsigned char *buf, int c
        wb->len = count;
        spin_unlock_irqrestore(&acm->write_lock, flags);
 
-       if ((stat = acm_write_start(acm, wbn)) < 0)
+       stat = acm_write_start(acm, wbn);
+       if (stat < 0)
                return stat;
        return count;
 }
@@ -767,8 +805,10 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
                return -EINVAL;
 
        newctrl = acm->ctrlout;
-       set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
-       clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) | (clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+       set = (set & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+                                       (set & TIOCM_RTS ? ACM_CTRL_RTS : 0);
+       clear = (clear & TIOCM_DTR ? ACM_CTRL_DTR : 0) |
+                                       (clear & TIOCM_RTS ? ACM_CTRL_RTS : 0);
 
        newctrl = (newctrl & ~clear) | set;
 
@@ -777,7 +817,8 @@ static int acm_tty_tiocmset(struct tty_struct *tty, struct file *file,
        return acm_set_control(acm, acm->ctrlout = newctrl);
 }
 
-static int acm_tty_ioctl(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned long arg)
+static int acm_tty_ioctl(struct tty_struct *tty, struct file *file,
+                                       unsigned int cmd, unsigned long arg)
 {
        struct acm *acm = tty->driver_data;
 
@@ -799,7 +840,8 @@ static const __u8 acm_tty_size[] = {
        5, 6, 7, 8
 };
 
-static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios_old)
+static void acm_tty_set_termios(struct tty_struct *tty,
+                                               struct ktermios *termios_old)
 {
        struct acm *acm = tty->driver_data;
        struct ktermios *termios = tty->termios;
@@ -809,19 +851,23 @@ static void acm_tty_set_termios(struct tty_struct *tty, struct ktermios *termios
        if (!ACM_READY(acm))
                return;
 
+       /* FIXME: Needs to support the tty_baud interface */
+       /* FIXME: Broken on sparc */
        newline.dwDTERate = cpu_to_le32p(acm_tty_speed +
                (termios->c_cflag & CBAUD & ~CBAUDEX) + (termios->c_cflag & CBAUDEX ? 15 : 0));
        newline.bCharFormat = termios->c_cflag & CSTOPB ? 2 : 0;
        newline.bParityType = termios->c_cflag & PARENB ?
-               (termios->c_cflag & PARODD ? 1 : 2) + (termios->c_cflag & CMSPAR ? 2 : 0) : 0;
+                               (termios->c_cflag & PARODD ? 1 : 2) +
+                               (termios->c_cflag & CMSPAR ? 2 : 0) : 0;
        newline.bDataBits = acm_tty_size[(termios->c_cflag & CSIZE) >> 4];
-
+       /* FIXME: Needs to clear unsupported bits in the termios */
        acm->clocal = ((termios->c_cflag & CLOCAL) != 0);
 
        if (!newline.dwDTERate) {
                newline.dwDTERate = acm->line.dwDTERate;
                newctrl &= ~ACM_CTRL_DTR;
-       } else  newctrl |=  ACM_CTRL_DTR;
+       } else
+               newctrl |=  ACM_CTRL_DTR;
 
        if (newctrl != acm->ctrlout)
                acm_set_control(acm, acm->ctrlout = newctrl);
@@ -846,9 +892,8 @@ static void acm_write_buffers_free(struct acm *acm)
        struct acm_wb *wb;
        struct usb_device *usb_dev = interface_to_usbdev(acm->control);
 
-       for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++) {
+       for (wb = &acm->wb[0], i = 0; i < ACM_NW; i++, wb++)
                usb_buffer_free(usb_dev, acm->writesize, wb->buf, wb->dmah);
-       }
 }
 
 static void acm_read_buffers_free(struct acm *acm)
@@ -857,7 +902,8 @@ static void acm_read_buffers_free(struct acm *acm)
        int i, n = acm->rx_buflimit;
 
        for (i = 0; i < n; i++)
-               usb_buffer_free(usb_dev, acm->readsize, acm->rb[i].base, acm->rb[i].dma);
+               usb_buffer_free(usb_dev, acm->readsize,
+                                       acm->rb[i].base, acm->rb[i].dma);
 }
 
 /* Little helper: write buffers allocate */
@@ -882,8 +928,8 @@ static int acm_write_buffers_alloc(struct acm *acm)
        return 0;
 }
 
-static int acm_probe (struct usb_interface *intf,
-                     const struct usb_device_id *id)
+static int acm_probe(struct usb_interface *intf,
+                    const struct usb_device_id *id)
 {
        struct usb_cdc_union_desc *union_header = NULL;
        struct usb_cdc_country_functional_desc *cfd = NULL;
@@ -897,7 +943,7 @@ static int acm_probe (struct usb_interface *intf,
        struct usb_device *usb_dev = interface_to_usbdev(intf);
        struct acm *acm;
        int minor;
-       int ctrlsize,readsize;
+       int ctrlsize, readsize;
        u8 *buf;
        u8 ac_management_function = 0;
        u8 call_management_function = 0;
@@ -917,7 +963,7 @@ static int acm_probe (struct usb_interface *intf,
                control_interface = usb_ifnum_to_if(usb_dev, 0);
                goto skip_normal_probe;
        }
-       
+
        /* normal probing*/
        if (!buffer) {
                dev_err(&intf->dev, "Weird descriptor references\n");
@@ -925,8 +971,10 @@ static int acm_probe (struct usb_interface *intf,
        }
 
        if (!buflen) {
-               if (intf->cur_altsetting->endpoint->extralen && intf->cur_altsetting->endpoint->extra) {
-                       dev_dbg(&intf->dev,"Seeking extra descriptors on endpoint\n");
+               if (intf->cur_altsetting->endpoint->extralen &&
+                               intf->cur_altsetting->endpoint->extra) {
+                       dev_dbg(&intf->dev,
+                               "Seeking extra descriptors on endpoint\n");
                        buflen = intf->cur_altsetting->endpoint->extralen;
                        buffer = intf->cur_altsetting->endpoint->extra;
                } else {
@@ -937,47 +985,43 @@ static int acm_probe (struct usb_interface *intf,
        }
 
        while (buflen > 0) {
-               if (buffer [1] != USB_DT_CS_INTERFACE) {
+               if (buffer[1] != USB_DT_CS_INTERFACE) {
                        dev_err(&intf->dev, "skipping garbage\n");
                        goto next_desc;
                }
 
-               switch (buffer [2]) {
-                       case USB_CDC_UNION_TYPE: /* we've found it */
-                               if (union_header) {
-                                       dev_err(&intf->dev, "More than one "
-                                               "union descriptor, "
-                                               "skipping ...\n");
-                                       goto next_desc;
-                               }
-                               union_header = (struct usb_cdc_union_desc *)
-                                                       buffer;
-                               break;
-                       case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
-                               cfd = (struct usb_cdc_country_functional_desc *)buffer;
-                               break;
-                       case USB_CDC_HEADER_TYPE: /* maybe check version */ 
-                               break; /* for now we ignore it */ 
-                       case USB_CDC_ACM_TYPE:
-                               ac_management_function = buffer[3];
-                               break;
-                       case USB_CDC_CALL_MANAGEMENT_TYPE:
-                               call_management_function = buffer[3];
-                               call_interface_num = buffer[4];
-                               if ((call_management_function & 3) != 3)
-                                       dev_err(&intf->dev, "This device "
-                                               "cannot do calls on its own. "
-                                               "It is no modem.\n");
-                               break;
-                       default:
-                               /* there are LOTS more CDC descriptors that
-                                * could legitimately be found here.
-                                */
-                               dev_dbg(&intf->dev, "Ignoring descriptor: "
-                                               "type %02x, length %d\n",
-                                               buffer[2], buffer[0]);
-                               break;
+               switch (buffer[2]) {
+               case USB_CDC_UNION_TYPE: /* we've found it */
+                       if (union_header) {
+                               dev_err(&intf->dev, "More than one "
+                                       "union descriptor, skipping ...\n");
+                               goto next_desc;
                        }
+                       union_header = (struct usb_cdc_union_desc *)buffer;
+                       break;
+               case USB_CDC_COUNTRY_TYPE: /* export through sysfs*/
+                       cfd = (struct usb_cdc_country_functional_desc *)buffer;
+                       break;
+               case USB_CDC_HEADER_TYPE: /* maybe check version */
+                       break; /* for now we ignore it */
+               case USB_CDC_ACM_TYPE:
+                       ac_management_function = buffer[3];
+                       break;
+               case USB_CDC_CALL_MANAGEMENT_TYPE:
+                       call_management_function = buffer[3];
+                       call_interface_num = buffer[4];
+                       if ((call_management_function & 3) != 3)
+                               dev_err(&intf->dev, "This device cannot do calls on its own. It is not a modem.\n");
+                       break;
+               default:
+                       /* there are LOTS more CDC descriptors that
+                        * could legitimately be found here.
+                        */
+                       dev_dbg(&intf->dev, "Ignoring descriptor: "
+                                       "type %02x, length %d\n",
+                                       buffer[2], buffer[0]);
+                       break;
+               }
 next_desc:
                buflen -= buffer[0];
                buffer += buffer[0];
@@ -985,33 +1029,36 @@ next_desc:
 
        if (!union_header) {
                if (call_interface_num > 0) {
-                       dev_dbg(&intf->dev,"No union descriptor, using call management descriptor\n");
+                       dev_dbg(&intf->dev, "No union descriptor, using call management descriptor\n");
                        data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = call_interface_num));
                        control_interface = intf;
                } else {
-                       dev_dbg(&intf->dev,"No union descriptor, giving up\n");
+                       dev_dbg(&intf->dev,
+                                       "No union descriptor, giving up\n");
                        return -ENODEV;
                }
        } else {
                control_interface = usb_ifnum_to_if(usb_dev, union_header->bMasterInterface0);
                data_interface = usb_ifnum_to_if(usb_dev, (data_interface_num = union_header->bSlaveInterface0));
                if (!control_interface || !data_interface) {
-                       dev_dbg(&intf->dev,"no interfaces\n");
+                       dev_dbg(&intf->dev, "no interfaces\n");
                        return -ENODEV;
                }
        }
-       
+
        if (data_interface_num != call_interface_num)
-               dev_dbg(&intf->dev,"Separate call control interface. That is not fully supported.\n");
+               dev_dbg(&intf->dev, "Separate call control interface. That is not fully supported.\n");
 
 skip_normal_probe:
 
        /*workaround for switched interfaces */
-       if (data_interface->cur_altsetting->desc.bInterfaceClass != CDC_DATA_INTERFACE_TYPE) {
-               if (control_interface->cur_altsetting->desc.bInterfaceClass == CDC_DATA_INTERFACE_TYPE) {
+       if (data_interface->cur_altsetting->desc.bInterfaceClass
+                                               != CDC_DATA_INTERFACE_TYPE) {
+               if (control_interface->cur_altsetting->desc.bInterfaceClass
+                                               == CDC_DATA_INTERFACE_TYPE) {
                        struct usb_interface *t;
-                       dev_dbg(&intf->dev,"Your device has switched interfaces.\n");
-
+                       dev_dbg(&intf->dev,
+                               "Your device has switched interfaces.\n");
                        t = control_interface;
                        control_interface = data_interface;
                        data_interface = t;
@@ -1023,9 +1070,9 @@ skip_normal_probe:
        /* Accept probe requests only for the control interface */
        if (intf != control_interface)
                return -ENODEV;
-       
+
        if (usb_interface_claimed(data_interface)) { /* valid in this context */
-               dev_dbg(&intf->dev,"The data interface isn't available\n");
+               dev_dbg(&intf->dev, "The data interface isn't available\n");
                return -EBUSY;
        }
 
@@ -1042,8 +1089,8 @@ skip_normal_probe:
        if (!usb_endpoint_dir_in(epread)) {
                /* descriptors are swapped */
                struct usb_endpoint_descriptor *t;
-               dev_dbg(&intf->dev,"The data interface has switched endpoints\n");
-               
+               dev_dbg(&intf->dev,
+                       "The data interface has switched endpoints\n");
                t = epread;
                epread = epwrite;
                epwrite = t;
@@ -1056,13 +1103,15 @@ skip_normal_probe:
                return -ENODEV;
        }
 
-       if (!(acm = kzalloc(sizeof(struct acm), GFP_KERNEL))) {
+       acm = kzalloc(sizeof(struct acm), GFP_KERNEL);
+       if (acm == NULL) {
                dev_dbg(&intf->dev, "out of memory (acm kzalloc)\n");
                goto alloc_fail;
        }
 
        ctrlsize = le16_to_cpu(epctrl->wMaxPacketSize);
-       readsize = le16_to_cpu(epread->wMaxPacketSize)* ( quirks == SINGLE_RX_URB ? 1 : 2);
+       readsize = le16_to_cpu(epread->wMaxPacketSize) *
+                               (quirks == SINGLE_RX_URB ? 1 : 2);
        acm->writesize = le16_to_cpu(epwrite->wMaxPacketSize) * 20;
        acm->control = control_interface;
        acm->data = data_interface;
@@ -1082,6 +1131,8 @@ skip_normal_probe:
        spin_lock_init(&acm->read_lock);
        mutex_init(&acm->mutex);
        acm->rx_endpoint = usb_rcvbulkpipe(usb_dev, epread->bEndpointAddress);
+       tty_port_init(&acm->port);
+       acm->port.ops = &acm_port_ops;
 
        buf = usb_buffer_alloc(usb_dev, ctrlsize, GFP_KERNEL, &acm->ctrl_dma);
        if (!buf) {
@@ -1103,8 +1154,10 @@ skip_normal_probe:
        for (i = 0; i < num_rx_buf; i++) {
                struct acm_ru *rcv = &(acm->ru[i]);
 
-               if (!(rcv->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-                       dev_dbg(&intf->dev, "out of memory (read urbs usb_alloc_urb)\n");
+               rcv->urb = usb_alloc_urb(0, GFP_KERNEL);
+               if (rcv->urb == NULL) {
+                       dev_dbg(&intf->dev,
+                               "out of memory (read urbs usb_alloc_urb)\n");
                        goto alloc_fail7;
                }
 
@@ -1117,26 +1170,29 @@ skip_normal_probe:
                rb->base = usb_buffer_alloc(acm->dev, readsize,
                                GFP_KERNEL, &rb->dma);
                if (!rb->base) {
-                       dev_dbg(&intf->dev, "out of memory (read bufs usb_buffer_alloc)\n");
+                       dev_dbg(&intf->dev,
+                               "out of memory (read bufs usb_buffer_alloc)\n");
                        goto alloc_fail7;
                }
        }
-       for(i = 0; i < ACM_NW; i++)
-       {
+       for (i = 0; i < ACM_NW; i++) {
                struct acm_wb *snd = &(acm->wb[i]);
 
-               if (!(snd->urb = usb_alloc_urb(0, GFP_KERNEL))) {
-                       dev_dbg(&intf->dev, "out of memory (write urbs usb_alloc_urb)");
+               snd->urb = usb_alloc_urb(0, GFP_KERNEL);
+               if (snd->urb == NULL) {
+                       dev_dbg(&intf->dev,
+                               "out of memory (write urbs usb_alloc_urb)");
                        goto alloc_fail7;
                }
 
-               usb_fill_bulk_urb(snd->urb, usb_dev, usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
-                               NULL, acm->writesize, acm_write_bulk, snd);
+               usb_fill_bulk_urb(snd->urb, usb_dev,
+                       usb_sndbulkpipe(usb_dev, epwrite->bEndpointAddress),
+                       NULL, acm->writesize, acm_write_bulk, snd);
                snd->urb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
                snd->instance = acm;
        }
 
-       usb_set_intfdata (intf, acm);
+       usb_set_intfdata(intf, acm);
 
        i = device_create_file(&intf->dev, &dev_attr_bmCapabilities);
        if (i < 0)
@@ -1147,7 +1203,8 @@ skip_normal_probe:
                if (!acm->country_codes)
                        goto skip_countries;
                acm->country_code_size = cfd->bLength - 4;
-               memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0, cfd->bLength - 4);
+               memcpy(acm->country_codes, (u8 *)&cfd->wCountyCode0,
+                                                       cfd->bLength - 4);
                acm->country_rel_date = cfd->iCountryCodeRelDate;
 
                i = device_create_file(&intf->dev, &dev_attr_wCountryCodes);
@@ -1156,7 +1213,8 @@ skip_normal_probe:
                        goto skip_countries;
                }
 
-               i = device_create_file(&intf->dev, &dev_attr_iCountryCodeRelDate);
+               i = device_create_file(&intf->dev,
+                                               &dev_attr_iCountryCodeRelDate);
                if (i < 0) {
                        kfree(acm->country_codes);
                        goto skip_countries;
@@ -1164,8 +1222,10 @@ skip_normal_probe:
        }
 
 skip_countries:
-       usb_fill_int_urb(acm->ctrlurb, usb_dev, usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
-                        acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm, epctrl->bInterval);
+       usb_fill_int_urb(acm->ctrlurb, usb_dev,
+                       usb_rcvintpipe(usb_dev, epctrl->bEndpointAddress),
+                       acm->ctrl_buffer, ctrlsize, acm_ctrl_irq, acm,
+                       epctrl->bInterval);
        acm->ctrlurb->transfer_flags |= URB_NO_TRANSFER_DMA_MAP;
        acm->ctrlurb->transfer_dma = acm->ctrl_dma;
 
@@ -1212,7 +1272,7 @@ static void stop_data_traffic(struct acm *acm)
        tasklet_disable(&acm->urb_task);
 
        usb_kill_urb(acm->ctrlurb);
-       for(i = 0; i < ACM_NW; i++)
+       for (i = 0; i < ACM_NW; i++)
                usb_kill_urb(acm->wb[i].urb);
        for (i = 0; i < acm->rx_buflimit; i++)
                usb_kill_urb(acm->ru[i].urb);
@@ -1227,13 +1287,14 @@ static void acm_disconnect(struct usb_interface *intf)
 {
        struct acm *acm = usb_get_intfdata(intf);
        struct usb_device *usb_dev = interface_to_usbdev(intf);
+       struct tty_struct *tty;
 
        /* sibling interface is already cleaning up */
        if (!acm)
                return;
 
        mutex_lock(&open_mutex);
-       if (acm->country_codes){
+       if (acm->country_codes) {
                device_remove_file(&acm->control->dev,
                                &dev_attr_wCountryCodes);
                device_remove_file(&acm->control->dev,
@@ -1247,22 +1308,25 @@ static void acm_disconnect(struct usb_interface *intf)
        stop_data_traffic(acm);
 
        acm_write_buffers_free(acm);
-       usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer, acm->ctrl_dma);
+       usb_buffer_free(usb_dev, acm->ctrlsize, acm->ctrl_buffer,
+                                                               acm->ctrl_dma);
        acm_read_buffers_free(acm);
 
        usb_driver_release_interface(&acm_driver, intf == acm->control ?
                                        acm->data : acm->control);
 
-       if (!acm->used) {
+       if (acm->port.count == 0) {
                acm_tty_unregister(acm);
                mutex_unlock(&open_mutex);
                return;
        }
 
        mutex_unlock(&open_mutex);
-
-       if (acm->tty)
-               tty_hangup(acm->tty);
+       tty = tty_port_tty_get(&acm->port);
+       if (tty) {
+               tty_hangup(tty);
+               tty_kref_put(tty);
+       }
 }
 
 #ifdef CONFIG_PM
@@ -1297,7 +1361,7 @@ static int acm_suspend(struct usb_interface *intf, pm_message_t message)
        */
        mutex_lock(&acm->mutex);
 
-       if (acm->used)
+       if (acm->port.count)
                stop_data_traffic(acm);
 
        mutex_unlock(&acm->mutex);
@@ -1319,7 +1383,7 @@ static int acm_resume(struct usb_interface *intf)
                return 0;
 
        mutex_lock(&acm->mutex);
-       if (acm->used) {
+       if (acm->port.count) {
                rv = usb_submit_urb(acm->ctrlurb, GFP_NOIO);
                if (rv < 0)
                        goto err_out;
@@ -1398,7 +1462,7 @@ static struct usb_device_id acm_ids[] = {
        { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
                USB_CDC_ACM_PROTO_AT_GSM) },
        { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
-               USB_CDC_ACM_PROTO_AT_3G ) },
+               USB_CDC_ACM_PROTO_AT_3G) },
        { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_ACM,
                USB_CDC_ACM_PROTO_AT_CDMA) },
 
@@ -1406,7 +1470,7 @@ static struct usb_device_id acm_ids[] = {
        { }
 };
 
-MODULE_DEVICE_TABLE (usb, acm_ids);
+MODULE_DEVICE_TABLE(usb, acm_ids);
 
 static struct usb_driver acm_driver = {
        .name =         "cdc_acm",
@@ -1429,6 +1493,7 @@ static struct usb_driver acm_driver = {
 static const struct tty_operations acm_ops = {
        .open =                 acm_tty_open,
        .close =                acm_tty_close,
+       .hangup =               acm_tty_hangup,
        .write =                acm_tty_write,
        .write_room =           acm_tty_write_room,
        .ioctl =                acm_tty_ioctl,
@@ -1460,7 +1525,8 @@ static int __init acm_init(void)
        acm_tty_driver->subtype = SERIAL_TYPE_NORMAL,
        acm_tty_driver->flags = TTY_DRIVER_REAL_RAW | TTY_DRIVER_DYNAMIC_DEV;
        acm_tty_driver->init_termios = tty_std_termios;
-       acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD | HUPCL | CLOCAL;
+       acm_tty_driver->init_termios.c_cflag = B9600 | CS8 | CREAD |
+                                                               HUPCL | CLOCAL;
        tty_set_operations(acm_tty_driver, &acm_ops);
 
        retval = tty_register_driver(acm_tty_driver);
@@ -1492,7 +1558,7 @@ static void __exit acm_exit(void)
 module_init(acm_init);
 module_exit(acm_exit);
 
-MODULE_AUTHOR( DRIVER_AUTHOR );
-MODULE_DESCRIPTION( DRIVER_DESC );
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_LICENSE("GPL");
 MODULE_ALIAS_CHARDEV_MAJOR(ACM_TTY_MAJOR);
index 1f95e7aa1b665618bc1fa83f6b9977375a026298..4c3856420adde6c55761950e0697822d9e496d1b 100644 (file)
@@ -89,8 +89,8 @@ struct acm {
        struct usb_device *dev;                         /* the corresponding usb device */
        struct usb_interface *control;                  /* control interface */
        struct usb_interface *data;                     /* data interface */
-       struct tty_struct *tty;                         /* the corresponding tty */
-       struct urb *ctrlurb;                    /* urbs */
+       struct tty_port port;                           /* our tty port data */
+       struct urb *ctrlurb;                            /* urbs */
        u8 *ctrl_buffer;                                /* buffers of urbs */
        dma_addr_t ctrl_dma;                            /* dma handles of buffers */
        u8 *country_codes;                              /* country codes from device */
@@ -120,7 +120,6 @@ struct acm {
        unsigned int ctrlout;                           /* output control lines (DTR, RTS) */
        unsigned int writesize;                         /* max packet size for the output bulk endpoint */
        unsigned int readsize,ctrlsize;                 /* buffer sizes for freeing */
-       unsigned int used;                              /* someone has this acm's device open */
        unsigned int minor;                             /* acm minor number */
        unsigned char throttle;                         /* throttled by tty layer */
        unsigned char clocal;                           /* termios CLOCAL */
index b7eacad4d48cb4ae34097c8d2ebbe7cb0ad0a363..2bfd6dd85b5ad23624cef600a2e8e5f76de5fa18 100644 (file)
@@ -93,8 +93,7 @@ static int  belkin_sa_startup(struct usb_serial *serial);
 static void belkin_sa_shutdown(struct usb_serial *serial);
 static int  belkin_sa_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void belkin_sa_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void belkin_sa_close(struct usb_serial_port *port);
 static void belkin_sa_read_int_callback(struct urb *urb);
 static void belkin_sa_set_termios(struct tty_struct *tty,
                        struct usb_serial_port *port, struct ktermios * old);
@@ -244,8 +243,7 @@ exit:
 } /* belkin_sa_open */
 
 
-static void belkin_sa_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void belkin_sa_close(struct usb_serial_port *port)
 {
        dbg("%s port %d", __func__, port->number);
 
index ab4cc277aa659be685a154c6a5024cf02f14eba9..2830766f5b391cc7d177c82e49a058ba28b1be73 100644 (file)
@@ -262,13 +262,33 @@ error:    kfree(priv);
        return r;
 }
 
-static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port,
-                               struct file *filp)
+static int ch341_carrier_raised(struct usb_serial_port *port)
+{
+       struct ch341_private *priv = usb_get_serial_port_data(port);
+       if (priv->line_status & CH341_BIT_DCD)
+               return 1;
+       return 0;
+}
+
+static void ch341_dtr_rts(struct usb_serial_port *port, int on)
 {
        struct ch341_private *priv = usb_get_serial_port_data(port);
        unsigned long flags;
-       unsigned int c_cflag;
 
+       dbg("%s - port %d", __func__, port->number);
+       /* drop DTR and RTS */
+       spin_lock_irqsave(&priv->lock, flags);
+       if (on)
+               priv->line_control |= CH341_BIT_RTS | CH341_BIT_DTR;
+       else
+               priv->line_control &= ~(CH341_BIT_RTS | CH341_BIT_DTR);
+       spin_unlock_irqrestore(&priv->lock, flags);
+       ch341_set_handshake(port->serial->dev, priv->line_control);
+       wake_up_interruptible(&priv->delta_msr_wait);
+}
+
+static void ch341_close(struct usb_serial_port *port)
+{
        dbg("%s - port %d", __func__, port->number);
 
        /* shutdown our urbs */
@@ -276,18 +296,6 @@ static void ch341_close(struct tty_struct *tty, struct usb_serial_port *port,
        usb_kill_urb(port->write_urb);
        usb_kill_urb(port->read_urb);
        usb_kill_urb(port->interrupt_in_urb);
-
-       if (tty) {
-               c_cflag = tty->termios->c_cflag;
-               if (c_cflag & HUPCL) {
-                       /* drop DTR and RTS */
-                       spin_lock_irqsave(&priv->lock, flags);
-                       priv->line_control = 0;
-                       spin_unlock_irqrestore(&priv->lock, flags);
-                       ch341_set_handshake(port->serial->dev, 0);
-               }
-       }
-       wake_up_interruptible(&priv->delta_msr_wait);
 }
 
 
@@ -302,7 +310,6 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
        dbg("ch341_open()");
 
        priv->baud_rate = DEFAULT_BAUD_RATE;
-       priv->line_control = CH341_BIT_RTS | CH341_BIT_DTR;
 
        r = ch341_configure(serial->dev, priv);
        if (r)
@@ -322,7 +329,7 @@ static int ch341_open(struct tty_struct *tty, struct usb_serial_port *port,
        if (r) {
                dev_err(&port->dev, "%s - failed submitting interrupt urb,"
                        " error %d\n", __func__, r);
-               ch341_close(tty, port, NULL);
+               ch341_close(port);
                return -EPROTO;
        }
 
@@ -343,9 +350,6 @@ static void ch341_set_termios(struct tty_struct *tty,
 
        dbg("ch341_set_termios()");
 
-       if (!tty || !tty->termios)
-               return;
-
        baud_rate = tty_get_baud_rate(tty);
 
        priv->baud_rate = baud_rate;
@@ -568,6 +572,8 @@ static struct usb_serial_driver ch341_device = {
        .usb_driver        = &ch341_driver,
        .num_ports         = 1,
        .open              = ch341_open,
+       .dtr_rts           = ch341_dtr_rts,
+       .carrier_raised    = ch341_carrier_raised,
        .close             = ch341_close,
        .ioctl             = ch341_ioctl,
        .set_termios       = ch341_set_termios,
index 19e24045b137a2964b450afbf94cc930de3cd427..247b61bfb7f45f4d179abd6ef36f9611eea046da 100644 (file)
@@ -169,7 +169,9 @@ static int usb_console_setup(struct console *co, char *options)
                        kfree(tty);
                }
        }
-
+       /* Mark the port as a console so that a hangup on it does not
+          kill the hardware. We have also bumped the use count by one
+          so it won't go idle. */
        port->console = 1;
        retval = 0;
 
@@ -182,7 +184,7 @@ free_tty:
        kfree(tty);
 reset_open_count:
        port->port.count = 0;
-goto out;
+       goto out;
 }
 
 static void usb_console_write(struct console *co,
index e8d5133ce9c84f911d210a374f5af40821c7f6b4..16a154d3b2feeaaacf0e0c949405d992a66165be 100644 (file)
@@ -1,5 +1,5 @@
 /*
- * Silicon Laboratories CP2101/CP2102 USB to RS232 serial adaptor driver
+ * Silicon Laboratories CP210x USB to RS232 serial adaptor driver
  *
  * Copyright (C) 2005 Craig Shelley (craig@microtron.org.uk)
  *
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.08"
-#define DRIVER_DESC "Silicon Labs CP2101/CP2102 RS232 serial adaptor driver"
+#define DRIVER_VERSION "v0.09"
+#define DRIVER_DESC "Silicon Labs CP210x RS232 serial adaptor driver"
 
 /*
  * Function Prototypes
  */
-static int cp2101_open(struct tty_struct *, struct usb_serial_port *,
+static int cp210x_open(struct tty_struct *, struct usb_serial_port *,
                                                        struct file *);
-static void cp2101_cleanup(struct usb_serial_port *);
-static void cp2101_close(struct tty_struct *, struct usb_serial_port *,
-                                                       struct file*);
-static void cp2101_get_termios(struct tty_struct *,
+static void cp210x_cleanup(struct usb_serial_port *);
+static void cp210x_close(struct usb_serial_port *);
+static void cp210x_get_termios(struct tty_struct *,
        struct usb_serial_port *port);
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
        unsigned int *cflagp, unsigned int *baudp);
-static void cp2101_set_termios(struct tty_struct *, struct usb_serial_port *,
+static void cp210x_set_termios(struct tty_struct *, struct usb_serial_port *,
                                                        struct ktermios*);
-static int cp2101_tiocmget(struct tty_struct *, struct file *);
-static int cp2101_tiocmset(struct tty_struct *, struct file *,
+static int cp210x_tiocmget(struct tty_struct *, struct file *);
+static int cp210x_tiocmset(struct tty_struct *, struct file *,
                unsigned int, unsigned int);
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *,
                unsigned int, unsigned int);
-static void cp2101_break_ctl(struct tty_struct *, int);
-static int cp2101_startup(struct usb_serial *);
-static void cp2101_shutdown(struct usb_serial *);
+static void cp210x_break_ctl(struct tty_struct *, int);
+static int cp210x_startup(struct usb_serial *);
+static void cp210x_shutdown(struct usb_serial *);
 
 static int debug;
 
 static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x0471, 0x066A) }, /* AKTAKOM ACE-1001 cable */
        { USB_DEVICE(0x0489, 0xE000) }, /* Pirelli Broadband S.p.A, DP-L10 SIP/GSM Mobile */
+       { USB_DEVICE(0x0745, 0x1000) }, /* CipherLab USB CCD Barcode Scanner 1000 */
        { USB_DEVICE(0x08e6, 0x5501) }, /* Gemalto Prox-PU/CU contactless smartcard reader */
+       { USB_DEVICE(0x08FD, 0x000A) }, /* Digianswer A/S, ZigBee/802.15.4 MAC Device */
        { USB_DEVICE(0x0FCF, 0x1003) }, /* Dynastream ANT development board */
        { USB_DEVICE(0x0FCF, 0x1004) }, /* Dynastream ANT2USB */
        { USB_DEVICE(0x0FCF, 0x1006) }, /* Dynastream ANT development board */
        { USB_DEVICE(0x10A6, 0xAA26) }, /* Knock-off DCU-11 cable */
        { USB_DEVICE(0x10AB, 0x10C5) }, /* Siemens MC60 Cable */
        { USB_DEVICE(0x10B5, 0xAC70) }, /* Nokia CA-42 USB */
+       { USB_DEVICE(0x10C4, 0x0F91) }, /* Vstabi */
        { USB_DEVICE(0x10C4, 0x800A) }, /* SPORTident BSM7-D-USB main station */
        { USB_DEVICE(0x10C4, 0x803B) }, /* Pololu USB-serial converter */
        { USB_DEVICE(0x10C4, 0x8053) }, /* Enfora EDG1228 */
@@ -85,10 +87,12 @@ static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x10C4, 0x81C8) }, /* Lipowsky Industrie Elektronik GmbH, Baby-JTAG */
        { USB_DEVICE(0x10C4, 0x81E2) }, /* Lipowsky Industrie Elektronik GmbH, Baby-LIN */
        { USB_DEVICE(0x10C4, 0x81E7) }, /* Aerocomm Radio */
+       { USB_DEVICE(0x10C4, 0x81F2) }, /* C1007 HF band RFID controller */
        { USB_DEVICE(0x10C4, 0x8218) }, /* Lipowsky Industrie Elektronik GmbH, HARP-1 */
        { USB_DEVICE(0x10C4, 0x822B) }, /* Modem EDGE(GSM) Comander 2 */
        { USB_DEVICE(0x10C4, 0x826B) }, /* Cygnal Integrated Products, Inc., Fasttrax GPS demostration module */
        { USB_DEVICE(0x10c4, 0x8293) }, /* Telegesys ETRX2USB */
+       { USB_DEVICE(0x10C4, 0x82F9) }, /* Procyon AVS */
        { USB_DEVICE(0x10C4, 0x8341) }, /* Siemens MC35PU GPRS Modem */
        { USB_DEVICE(0x10C4, 0x83A8) }, /* Amber Wireless AMB2560 */
        { USB_DEVICE(0x10C4, 0x846E) }, /* BEI USB Sensor Interface (VCP) */
@@ -99,7 +103,9 @@ static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x10C4, 0xF003) }, /* Elan Digital Systems USBpulse100 */
        { USB_DEVICE(0x10C4, 0xF004) }, /* Elan Digital Systems USBcount50 */
        { USB_DEVICE(0x10C5, 0xEA61) }, /* Silicon Labs MobiData GPRS USB Modem */
+       { USB_DEVICE(0x10CE, 0xEA6A) }, /* Silicon Labs MobiData GPRS USB Modem 100EU */
        { USB_DEVICE(0x13AD, 0x9999) }, /* Baltech card reader */
+       { USB_DEVICE(0x1555, 0x0004) }, /* Owen AC4 USB-RS485 Converter */
        { USB_DEVICE(0x166A, 0x0303) }, /* Clipsal 5500PCU C-Bus USB interface */
        { USB_DEVICE(0x16D6, 0x0001) }, /* Jablotron serial interface */
        { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */
@@ -108,53 +114,70 @@ static struct usb_device_id id_table [] = {
 
 MODULE_DEVICE_TABLE(usb, id_table);
 
-static struct usb_driver cp2101_driver = {
-       .name           = "cp2101",
+static struct usb_driver cp210x_driver = {
+       .name           = "cp210x",
        .probe          = usb_serial_probe,
        .disconnect     = usb_serial_disconnect,
        .id_table       = id_table,
        .no_dynamic_id  =       1,
 };
 
-static struct usb_serial_driver cp2101_device = {
+static struct usb_serial_driver cp210x_device = {
        .driver = {
                .owner =        THIS_MODULE,
-               .name =         "cp2101",
+               .name =         "cp210x",
        },
-       .usb_driver             = &cp2101_driver,
+       .usb_driver             = &cp210x_driver,
        .id_table               = id_table,
        .num_ports              = 1,
-       .open                   = cp2101_open,
-       .close                  = cp2101_close,
-       .break_ctl              = cp2101_break_ctl,
-       .set_termios            = cp2101_set_termios,
-       .tiocmget               = cp2101_tiocmget,
-       .tiocmset               = cp2101_tiocmset,
-       .attach                 = cp2101_startup,
-       .shutdown               = cp2101_shutdown,
+       .open                   = cp210x_open,
+       .close                  = cp210x_close,
+       .break_ctl              = cp210x_break_ctl,
+       .set_termios            = cp210x_set_termios,
+       .tiocmget               = cp210x_tiocmget,
+       .tiocmset               = cp210x_tiocmset,
+       .attach                 = cp210x_startup,
+       .shutdown               = cp210x_shutdown,
 };
 
 /* Config request types */
 #define REQTYPE_HOST_TO_DEVICE 0x41
 #define REQTYPE_DEVICE_TO_HOST 0xc1
 
-/* Config SET requests. To GET, add 1 to the request number */
-#define CP2101_UART            0x00    /* Enable / Disable */
-#define CP2101_BAUDRATE                0x01    /* (BAUD_RATE_GEN_FREQ / baudrate) */
-#define CP2101_BITS            0x03    /* 0x(0)(databits)(parity)(stopbits) */
-#define CP2101_BREAK           0x05    /* On / Off */
-#define CP2101_CONTROL         0x07    /* Flow control line states */
-#define CP2101_MODEMCTL                0x13    /* Modem controls */
-#define CP2101_CONFIG_6                0x19    /* 6 bytes of config data ??? */
-
-/* CP2101_UART */
+/* Config request codes */
+#define CP210X_IFC_ENABLE      0x00
+#define CP210X_SET_BAUDDIV     0x01
+#define CP210X_GET_BAUDDIV     0x02
+#define CP210X_SET_LINE_CTL    0x03
+#define CP210X_GET_LINE_CTL    0x04
+#define CP210X_SET_BREAK       0x05
+#define CP210X_IMM_CHAR                0x06
+#define CP210X_SET_MHS         0x07
+#define CP210X_GET_MDMSTS      0x08
+#define CP210X_SET_XON         0x09
+#define CP210X_SET_XOFF                0x0A
+#define CP210X_SET_EVENTMASK   0x0B
+#define CP210X_GET_EVENTMASK   0x0C
+#define CP210X_SET_CHAR                0x0D
+#define CP210X_GET_CHARS       0x0E
+#define CP210X_GET_PROPS       0x0F
+#define CP210X_GET_COMM_STATUS 0x10
+#define CP210X_RESET           0x11
+#define CP210X_PURGE           0x12
+#define CP210X_SET_FLOW                0x13
+#define CP210X_GET_FLOW                0x14
+#define CP210X_EMBED_EVENTS    0x15
+#define CP210X_GET_EVENTSTATE  0x16
+#define CP210X_SET_CHARS       0x19
+
+/* CP210X_IFC_ENABLE */
 #define UART_ENABLE            0x0001
 #define UART_DISABLE           0x0000
 
-/* CP2101_BAUDRATE */
+/* CP210X_(SET|GET)_BAUDDIV */
 #define BAUD_RATE_GEN_FREQ     0x384000
 
-/* CP2101_BITS */
+/* CP210X_(SET|GET)_LINE_CTL */
 #define BITS_DATA_MASK         0X0f00
 #define BITS_DATA_5            0X0500
 #define BITS_DATA_6            0X0600
@@ -174,11 +197,11 @@ static struct usb_serial_driver cp2101_device = {
 #define BITS_STOP_1_5          0x0001
 #define BITS_STOP_2            0x0002
 
-/* CP2101_BREAK */
+/* CP210X_SET_BREAK */
 #define BREAK_ON               0x0000
 #define BREAK_OFF              0x0001
 
-/* CP2101_CONTROL */
+/* CP210X_(SET_MHS|GET_MDMSTS) */
 #define CONTROL_DTR            0x0001
 #define CONTROL_RTS            0x0002
 #define CONTROL_CTS            0x0010
@@ -189,13 +212,13 @@ static struct usb_serial_driver cp2101_device = {
 #define CONTROL_WRITE_RTS      0x0200
 
 /*
- * cp2101_get_config
- * Reads from the CP2101 configuration registers
+ * cp210x_get_config
+ * Reads from the CP210x configuration registers
  * 'size' is specified in bytes.
  * 'data' is a pointer to a pre-allocated array of integers large
  * enough to hold 'size' bytes (with 4 bytes to each integer)
  */
-static int cp2101_get_config(struct usb_serial_port *port, u8 request,
+static int cp210x_get_config(struct usb_serial_port *port, u8 request,
                unsigned int *data, int size)
 {
        struct usb_serial *serial = port->serial;
@@ -211,9 +234,6 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
                return -ENOMEM;
        }
 
-       /* For get requests, the request number must be incremented */
-       request++;
-
        /* Issue the request, attempting to read 'size' bytes */
        result = usb_control_msg(serial->dev, usb_rcvctrlpipe(serial->dev, 0),
                                request, REQTYPE_DEVICE_TO_HOST, 0x0000,
@@ -236,12 +256,12 @@ static int cp2101_get_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config
- * Writes to the CP2101 configuration registers
+ * cp210x_set_config
+ * Writes to the CP210x configuration registers
  * Values less than 16 bits wide are sent directly
  * 'size' is specified in bytes.
  */
-static int cp2101_set_config(struct usb_serial_port *port, u8 request,
+static int cp210x_set_config(struct usb_serial_port *port, u8 request,
                unsigned int *data, int size)
 {
        struct usb_serial *serial = port->serial;
@@ -292,21 +312,21 @@ static int cp2101_set_config(struct usb_serial_port *port, u8 request,
 }
 
 /*
- * cp2101_set_config_single
- * Convenience function for calling cp2101_set_config on single data values
+ * cp210x_set_config_single
+ * Convenience function for calling cp210x_set_config on single data values
  * without requiring an integer pointer
  */
-static inline int cp2101_set_config_single(struct usb_serial_port *port,
+static inline int cp210x_set_config_single(struct usb_serial_port *port,
                u8 request, unsigned int data)
 {
-       return cp2101_set_config(port, request, &data, 2);
+       return cp210x_set_config(port, request, &data, 2);
 }
 
 /*
- * cp2101_quantise_baudrate
+ * cp210x_quantise_baudrate
  * Quantises the baud rate as per AN205 Table 1
  */
-static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
+static unsigned int cp210x_quantise_baudrate(unsigned int baud) {
        if      (baud <= 56)       baud = 0;
        else if (baud <= 300)      baud = 300;
        else if (baud <= 600)      baud = 600;
@@ -343,7 +363,7 @@ static unsigned int cp2101_quantise_baudrate(unsigned int baud) {
        return baud;
 }
 
-static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
+static int cp210x_open(struct tty_struct *tty, struct usb_serial_port *port,
                                struct file *filp)
 {
        struct usb_serial *serial = port->serial;
@@ -351,7 +371,7 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
 
        dbg("%s - port %d", __func__, port->number);
 
-       if (cp2101_set_config_single(port, CP2101_UART, UART_ENABLE)) {
+       if (cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_ENABLE)) {
                dev_err(&port->dev, "%s - Unable to enable UART\n",
                                __func__);
                return -EPROTO;
@@ -373,17 +393,17 @@ static int cp2101_open(struct tty_struct *tty, struct usb_serial_port *port,
        }
 
        /* Configure the termios structure */
-       cp2101_get_termios(tty, port);
+       cp210x_get_termios(tty, port);
 
        /* Set the DTR and RTS pins low */
-       cp2101_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
+       cp210x_tiocmset_port(tty ? (struct usb_serial_port *) tty->driver_data
                        : port,
                NULL, TIOCM_DTR | TIOCM_RTS, 0);
 
        return 0;
 }
 
-static void cp2101_cleanup(struct usb_serial_port *port)
+static void cp210x_cleanup(struct usb_serial_port *port)
 {
        struct usb_serial *serial = port->serial;
 
@@ -398,8 +418,7 @@ static void cp2101_cleanup(struct usb_serial_port *port)
        }
 }
 
-static void cp2101_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                       struct file *filp)
+static void cp210x_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
@@ -410,23 +429,23 @@ static void cp2101_close(struct tty_struct *tty, struct usb_serial_port *port,
 
        mutex_lock(&port->serial->disc_mutex);
        if (!port->serial->disconnected)
-               cp2101_set_config_single(port, CP2101_UART, UART_DISABLE);
+               cp210x_set_config_single(port, CP210X_IFC_ENABLE, UART_DISABLE);
        mutex_unlock(&port->serial->disc_mutex);
 }
 
 /*
- * cp2101_get_termios
+ * cp210x_get_termios
  * Reads the baud rate, data bits, parity, stop bits and flow control mode
  * from the device, corrects any unsupported values, and configures the
  * termios structure to reflect the state of the device
  */
-static void cp2101_get_termios(struct tty_struct *tty,
+static void cp210x_get_termios(struct tty_struct *tty,
        struct usb_serial_port *port)
 {
        unsigned int baud;
 
        if (tty) {
-               cp2101_get_termios_port(tty->driver_data,
+               cp210x_get_termios_port(tty->driver_data,
                        &tty->termios->c_cflag, &baud);
                tty_encode_baud_rate(tty, baud, baud);
        }
@@ -434,15 +453,15 @@ static void cp2101_get_termios(struct tty_struct *tty,
        else {
                unsigned int cflag;
                cflag = 0;
-               cp2101_get_termios_port(port, &cflag, &baud);
+               cp210x_get_termios_port(port, &cflag, &baud);
        }
 }
 
 /*
- * cp2101_get_termios_port
- * This is the heart of cp2101_get_termios which always uses a &usb_serial_port.
+ * cp210x_get_termios_port
+ * This is the heart of cp210x_get_termios which always uses a &usb_serial_port.
  */
-static void cp2101_get_termios_port(struct usb_serial_port *port,
+static void cp210x_get_termios_port(struct usb_serial_port *port,
        unsigned int *cflagp, unsigned int *baudp)
 {
        unsigned int cflag, modem_ctl[4];
@@ -451,17 +470,17 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
 
        dbg("%s - port %d", __func__, port->number);
 
-       cp2101_get_config(port, CP2101_BAUDRATE, &baud, 2);
+       cp210x_get_config(port, CP210X_GET_BAUDDIV, &baud, 2);
        /* Convert to baudrate */
        if (baud)
-               baud = cp2101_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
+               baud = cp210x_quantise_baudrate((BAUD_RATE_GEN_FREQ + baud/2)/ baud);
 
        dbg("%s - baud rate = %d", __func__, baud);
        *baudp = baud;
 
        cflag = *cflagp;
 
-       cp2101_get_config(port, CP2101_BITS, &bits, 2);
+       cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
        cflag &= ~CSIZE;
        switch (bits & BITS_DATA_MASK) {
        case BITS_DATA_5:
@@ -486,14 +505,14 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
                cflag |= CS8;
                bits &= ~BITS_DATA_MASK;
                bits |= BITS_DATA_8;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        default:
                dbg("%s - Unknown number of data bits, using 8", __func__);
                cflag |= CS8;
                bits &= ~BITS_DATA_MASK;
                bits |= BITS_DATA_8;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        }
 
@@ -516,20 +535,20 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
                                __func__);
                cflag &= ~PARENB;
                bits &= ~BITS_PARITY_MASK;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        case BITS_PARITY_SPACE:
                dbg("%s - parity = SPACE (not supported, disabling parity)",
                                __func__);
                cflag &= ~PARENB;
                bits &= ~BITS_PARITY_MASK;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        default:
                dbg("%s - Unknown parity mode, disabling parity", __func__);
                cflag &= ~PARENB;
                bits &= ~BITS_PARITY_MASK;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        }
 
@@ -542,7 +561,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
                dbg("%s - stop bits = 1.5 (not supported, using 1 stop bit)",
                                                                __func__);
                bits &= ~BITS_STOP_MASK;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        case BITS_STOP_2:
                dbg("%s - stop bits = 2", __func__);
@@ -552,11 +571,11 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
                dbg("%s - Unknown number of stop bits, using 1 stop bit",
                                                                __func__);
                bits &= ~BITS_STOP_MASK;
-               cp2101_set_config(port, CP2101_BITS, &bits, 2);
+               cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2);
                break;
        }
 
-       cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+       cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
        if (modem_ctl[0] & 0x0008) {
                dbg("%s - flow control = CRTSCTS", __func__);
                cflag |= CRTSCTS;
@@ -568,7 +587,7 @@ static void cp2101_get_termios_port(struct usb_serial_port *port,
        *cflagp = cflag;
 }
 
-static void cp2101_set_termios(struct tty_struct *tty,
+static void cp210x_set_termios(struct tty_struct *tty,
                struct usb_serial_port *port, struct ktermios *old_termios)
 {
        unsigned int cflag, old_cflag;
@@ -583,13 +602,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
        tty->termios->c_cflag &= ~CMSPAR;
        cflag = tty->termios->c_cflag;
        old_cflag = old_termios->c_cflag;
-       baud = cp2101_quantise_baudrate(tty_get_baud_rate(tty));
+       baud = cp210x_quantise_baudrate(tty_get_baud_rate(tty));
 
        /* If the baud rate is to be updated*/
        if (baud != tty_termios_baud_rate(old_termios) && baud != 0) {
                dbg("%s - Setting baud rate to %d baud", __func__,
                                baud);
-               if (cp2101_set_config_single(port, CP2101_BAUDRATE,
+               if (cp210x_set_config_single(port, CP210X_SET_BAUDDIV,
                                        ((BAUD_RATE_GEN_FREQ + baud/2) / baud))) {
                        dbg("Baud rate requested not supported by device\n");
                        baud = tty_termios_baud_rate(old_termios);
@@ -600,7 +619,7 @@ static void cp2101_set_termios(struct tty_struct *tty,
 
        /* If the number of data bits is to be updated */
        if ((cflag & CSIZE) != (old_cflag & CSIZE)) {
-               cp2101_get_config(port, CP2101_BITS, &bits, 2);
+               cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
                bits &= ~BITS_DATA_MASK;
                switch (cflag & CSIZE) {
                case CS5:
@@ -624,19 +643,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
                        dbg("%s - data bits = 9", __func__);
                        break;*/
                default:
-                       dbg("cp2101 driver does not "
+                       dbg("cp210x driver does not "
                                        "support the number of bits requested,"
                                        " using 8 bit mode\n");
                                bits |= BITS_DATA_8;
                                break;
                }
-               if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+               if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
                        dbg("Number of data bits requested "
                                        "not supported by device\n");
        }
 
        if ((cflag & (PARENB|PARODD)) != (old_cflag & (PARENB|PARODD))) {
-               cp2101_get_config(port, CP2101_BITS, &bits, 2);
+               cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
                bits &= ~BITS_PARITY_MASK;
                if (cflag & PARENB) {
                        if (cflag & PARODD) {
@@ -647,13 +666,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
                                dbg("%s - parity = EVEN", __func__);
                        }
                }
-               if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+               if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
                        dbg("Parity mode not supported "
                                        "by device\n");
        }
 
        if ((cflag & CSTOPB) != (old_cflag & CSTOPB)) {
-               cp2101_get_config(port, CP2101_BITS, &bits, 2);
+               cp210x_get_config(port, CP210X_GET_LINE_CTL, &bits, 2);
                bits &= ~BITS_STOP_MASK;
                if (cflag & CSTOPB) {
                        bits |= BITS_STOP_2;
@@ -662,13 +681,13 @@ static void cp2101_set_termios(struct tty_struct *tty,
                        bits |= BITS_STOP_1;
                        dbg("%s - stop bits = 1", __func__);
                }
-               if (cp2101_set_config(port, CP2101_BITS, &bits, 2))
+               if (cp210x_set_config(port, CP210X_SET_LINE_CTL, &bits, 2))
                        dbg("Number of stop bits requested "
                                        "not supported by device\n");
        }
 
        if ((cflag & CRTSCTS) != (old_cflag & CRTSCTS)) {
-               cp2101_get_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+               cp210x_get_config(port, CP210X_GET_FLOW, modem_ctl, 16);
                dbg("%s - read modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
                                __func__, modem_ctl[0], modem_ctl[1],
                                modem_ctl[2], modem_ctl[3]);
@@ -688,19 +707,19 @@ static void cp2101_set_termios(struct tty_struct *tty,
                dbg("%s - write modem controls = 0x%.4x 0x%.4x 0x%.4x 0x%.4x",
                                __func__, modem_ctl[0], modem_ctl[1],
                                modem_ctl[2], modem_ctl[3]);
-               cp2101_set_config(port, CP2101_MODEMCTL, modem_ctl, 16);
+               cp210x_set_config(port, CP210X_SET_FLOW, modem_ctl, 16);
        }
 
 }
 
-static int cp2101_tiocmset (struct tty_struct *tty, struct file *file,
+static int cp210x_tiocmset (struct tty_struct *tty, struct file *file,
                unsigned int set, unsigned int clear)
 {
        struct usb_serial_port *port = tty->driver_data;
-       return cp2101_tiocmset_port(port, file, set, clear);
+       return cp210x_tiocmset_port(port, file, set, clear);
 }
 
-static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
+static int cp210x_tiocmset_port(struct usb_serial_port *port, struct file *file,
                unsigned int set, unsigned int clear)
 {
        unsigned int control = 0;
@@ -726,10 +745,10 @@ static int cp2101_tiocmset_port(struct usb_serial_port *port, struct file *file,
 
        dbg("%s - control = 0x%.4x", __func__, control);
 
-       return cp2101_set_config(port, CP2101_CONTROL, &control, 2);
+       return cp210x_set_config(port, CP210X_SET_MHS, &control, 2);
 }
 
-static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
+static int cp210x_tiocmget (struct tty_struct *tty, struct file *file)
 {
        struct usb_serial_port *port = tty->driver_data;
        unsigned int control;
@@ -737,7 +756,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
 
        dbg("%s - port %d", __func__, port->number);
 
-       cp2101_get_config(port, CP2101_CONTROL, &control, 1);
+       cp210x_get_config(port, CP210X_GET_MDMSTS, &control, 1);
 
        result = ((control & CONTROL_DTR) ? TIOCM_DTR : 0)
                |((control & CONTROL_RTS) ? TIOCM_RTS : 0)
@@ -751,7 +770,7 @@ static int cp2101_tiocmget (struct tty_struct *tty, struct file *file)
        return result;
 }
 
-static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
+static void cp210x_break_ctl (struct tty_struct *tty, int break_state)
 {
        struct usb_serial_port *port = tty->driver_data;
        unsigned int state;
@@ -763,17 +782,17 @@ static void cp2101_break_ctl (struct tty_struct *tty, int break_state)
                state = BREAK_ON;
        dbg("%s - turning break %s", __func__,
                        state == BREAK_OFF ? "off" : "on");
-       cp2101_set_config(port, CP2101_BREAK, &state, 2);
+       cp210x_set_config(port, CP210X_SET_BREAK, &state, 2);
 }
 
-static int cp2101_startup(struct usb_serial *serial)
+static int cp210x_startup(struct usb_serial *serial)
 {
-       /* CP2101 buffers behave strangely unless device is reset */
+       /* CP210x buffers behave strangely unless device is reset */
        usb_reset_device(serial->dev);
        return 0;
 }
 
-static void cp2101_shutdown(struct usb_serial *serial)
+static void cp210x_shutdown(struct usb_serial *serial)
 {
        int i;
 
@@ -781,21 +800,21 @@ static void cp2101_shutdown(struct usb_serial *serial)
 
        /* Stop reads and writes on all ports */
        for (i = 0; i < serial->num_ports; ++i)
-               cp2101_cleanup(serial->port[i]);
+               cp210x_cleanup(serial->port[i]);
 }
 
-static int __init cp2101_init(void)
+static int __init cp210x_init(void)
 {
        int retval;
 
-       retval = usb_serial_register(&cp2101_device);
+       retval = usb_serial_register(&cp210x_device);
        if (retval)
                return retval; /* Failed to register */
 
-       retval = usb_register(&cp2101_driver);
+       retval = usb_register(&cp210x_driver);
        if (retval) {
                /* Failed to register */
-               usb_serial_deregister(&cp2101_device);
+               usb_serial_deregister(&cp210x_device);
                return retval;
        }
 
@@ -805,14 +824,14 @@ static int __init cp2101_init(void)
        return 0;
 }
 
-static void __exit cp2101_exit(void)
+static void __exit cp210x_exit(void)
 {
-       usb_deregister(&cp2101_driver);
-       usb_serial_deregister(&cp2101_device);
+       usb_deregister(&cp210x_driver);
+       usb_serial_deregister(&cp210x_device);
 }
 
-module_init(cp2101_init);
-module_exit(cp2101_exit);
+module_init(cp210x_init);
+module_exit(cp210x_exit);
 
 MODULE_DESCRIPTION(DRIVER_DESC);
 MODULE_VERSION(DRIVER_VERSION);
index dd501bb63ed6173b2b1f31a35f93d3a7d202db0a..933ba913e66c5fc21950a3ccc2e5b8229f8c9d63 100644 (file)
@@ -61,8 +61,7 @@ static int cyberjack_startup(struct usb_serial *serial);
 static void cyberjack_shutdown(struct usb_serial *serial);
 static int  cyberjack_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void cyberjack_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void cyberjack_close(struct usb_serial_port *port);
 static int cyberjack_write(struct tty_struct *tty,
        struct usb_serial_port *port, const unsigned char *buf, int count);
 static int cyberjack_write_room(struct tty_struct *tty);
@@ -185,8 +184,7 @@ static int  cyberjack_open(struct tty_struct *tty,
        return result;
 }
 
-static void cyberjack_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void cyberjack_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
index e568710b263fa9dc6a6e258c6898c70254180d59..669f93848539560182b589131cc18788487477dc 100644 (file)
@@ -174,8 +174,8 @@ static int  cypress_ca42v2_startup(struct usb_serial *serial);
 static void cypress_shutdown(struct usb_serial *serial);
 static int  cypress_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void cypress_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void cypress_close(struct usb_serial_port *port);
+static void cypress_dtr_rts(struct usb_serial_port *port, int on);
 static int  cypress_write(struct tty_struct *tty, struct usb_serial_port *port,
                        const unsigned char *buf, int count);
 static void cypress_send(struct usb_serial_port *port);
@@ -218,6 +218,7 @@ static struct usb_serial_driver cypress_earthmate_device = {
        .shutdown =                     cypress_shutdown,
        .open =                         cypress_open,
        .close =                        cypress_close,
+       .dtr_rts =                      cypress_dtr_rts,
        .write =                        cypress_write,
        .write_room =                   cypress_write_room,
        .ioctl =                        cypress_ioctl,
@@ -244,6 +245,7 @@ static struct usb_serial_driver cypress_hidcom_device = {
        .shutdown =                     cypress_shutdown,
        .open =                         cypress_open,
        .close =                        cypress_close,
+       .dtr_rts =                      cypress_dtr_rts,
        .write =                        cypress_write,
        .write_room =                   cypress_write_room,
        .ioctl =                        cypress_ioctl,
@@ -270,6 +272,7 @@ static struct usb_serial_driver cypress_ca42v2_device = {
        .shutdown =                     cypress_shutdown,
        .open =                         cypress_open,
        .close =                        cypress_close,
+       .dtr_rts =                      cypress_dtr_rts,
        .write =                        cypress_write,
        .write_room =                   cypress_write_room,
        .ioctl =                        cypress_ioctl,
@@ -656,11 +659,7 @@ static int cypress_open(struct tty_struct *tty,
        priv->rx_flags = 0;
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* raise both lines and set termios */
-       spin_lock_irqsave(&priv->lock, flags);
-       priv->line_control = CONTROL_DTR | CONTROL_RTS;
-       priv->cmd_ctrl = 1;
-       spin_unlock_irqrestore(&priv->lock, flags);
+       /* Set termios */
        result = cypress_write(tty, port, NULL, 0);
 
        if (result) {
@@ -694,76 +693,42 @@ static int cypress_open(struct tty_struct *tty,
                                                        __func__, result);
                cypress_set_dead(port);
        }
-
+       port->port.drain_delay = 256;
        return result;
 } /* cypress_open */
 
+static void cypress_dtr_rts(struct usb_serial_port *port, int on)
+{
+       struct cypress_private *priv = usb_get_serial_port_data(port);
+       /* raise (on != 0) or drop (on == 0) both dtr and rts */
+       priv = usb_get_serial_port_data(port);
+       spin_lock_irq(&priv->lock);
+       if (on == 0)
+               priv->line_control = 0;
+       else 
+               priv->line_control = CONTROL_DTR | CONTROL_RTS;
+       priv->cmd_ctrl = 1;
+       spin_unlock_irq(&priv->lock);
+       cypress_write(NULL, port, NULL, 0);
+}
 
-static void cypress_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void cypress_close(struct usb_serial_port *port)
 {
        struct cypress_private *priv = usb_get_serial_port_data(port);
-       unsigned int c_cflag;
-       int bps;
-       long timeout;
-       wait_queue_t wait;
 
        dbg("%s - port %d", __func__, port->number);
 
-       /* wait for data to drain from buffer */
-       spin_lock_irq(&priv->lock);
-       timeout = CYPRESS_CLOSING_WAIT;
-       init_waitqueue_entry(&wait, current);
-       add_wait_queue(&tty->write_wait, &wait);
-       for (;;) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (cypress_buf_data_avail(priv->buf) == 0
-               || timeout == 0 || signal_pending(current)
-               /* without mutex, allowed due to harmless failure mode */
-               || port->serial->disconnected)
-                       break;
-               spin_unlock_irq(&priv->lock);
-               timeout = schedule_timeout(timeout);
-               spin_lock_irq(&priv->lock);
-       }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&tty->write_wait, &wait);
-       /* clear out any remaining data in the buffer */
-       cypress_buf_clear(priv->buf);
-       spin_unlock_irq(&priv->lock);
-
        /* writing is potentially harmful, lock must be taken */
        mutex_lock(&port->serial->disc_mutex);
        if (port->serial->disconnected) {
                mutex_unlock(&port->serial->disc_mutex);
                return;
        }
-       /* wait for characters to drain from device */
-       if (tty) {
-               bps = tty_get_baud_rate(tty);
-               if (bps > 1200)
-                       timeout = max((HZ * 2560) / bps, HZ / 10);
-               else
-                       timeout = 2 * HZ;
-               schedule_timeout_interruptible(timeout);
-       }
-
+       cypress_buf_clear(priv->buf);
        dbg("%s - stopping urbs", __func__);
        usb_kill_urb(port->interrupt_in_urb);
        usb_kill_urb(port->interrupt_out_urb);
 
-       if (tty) {
-               c_cflag = tty->termios->c_cflag;
-               if (c_cflag & HUPCL) {
-                       /* drop dtr and rts */
-                       priv = usb_get_serial_port_data(port);
-                       spin_lock_irq(&priv->lock);
-                       priv->line_control = 0;
-                       priv->cmd_ctrl = 1;
-                       spin_unlock_irq(&priv->lock);
-                       cypress_write(tty, port, NULL, 0);
-               }
-       }
 
        if (stats)
                dev_info(&port->dev, "Statistics: %d Bytes In | %d Bytes Out | %d Commands Issued\n",
index 38ba4ea8b6bfdf1b84b69d9338cd9c3154d60e51..30f5140eff03a3bbeef50e90f28c942223c99e0e 100644 (file)
@@ -422,7 +422,6 @@ struct digi_port {
        int dp_throttled;
        int dp_throttle_restart;
        wait_queue_head_t dp_flush_wait;
-       int dp_in_close;                        /* close in progress */
        wait_queue_head_t dp_close_wait;        /* wait queue for close */
        struct work_struct dp_wakeup_work;
        struct usb_serial_port *dp_port;
@@ -456,8 +455,9 @@ static int digi_write_room(struct tty_struct *tty);
 static int digi_chars_in_buffer(struct tty_struct *tty);
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
        struct file *filp);
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-       struct file *filp);
+static void digi_close(struct usb_serial_port *port);
+static int digi_carrier_raised(struct usb_serial_port *port);
+static void digi_dtr_rts(struct usb_serial_port *port, int on);
 static int digi_startup_device(struct usb_serial *serial);
 static int digi_startup(struct usb_serial *serial);
 static void digi_shutdown(struct usb_serial *serial);
@@ -510,6 +510,8 @@ static struct usb_serial_driver digi_acceleport_2_device = {
        .num_ports =                    3,
        .open =                         digi_open,
        .close =                        digi_close,
+       .dtr_rts =                      digi_dtr_rts,
+       .carrier_raised =               digi_carrier_raised,
        .write =                        digi_write,
        .write_room =                   digi_write_room,
        .write_bulk_callback =          digi_write_bulk_callback,
@@ -1328,6 +1330,19 @@ static int digi_chars_in_buffer(struct tty_struct *tty)
 
 }
 
+static void digi_dtr_rts(struct usb_serial_port *port, int on)
+{
+       /* Adjust DTR and RTS */
+       digi_set_modem_signals(port, on * (TIOCM_DTR|TIOCM_RTS), 1);
+}
+
+static int digi_carrier_raised(struct usb_serial_port *port)
+{
+       struct digi_port *priv = usb_get_serial_port_data(port);
+       if (priv->dp_modem_signals & TIOCM_CD)
+               return 1;
+       return 0;
+}
 
 static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
                                struct file *filp)
@@ -1336,7 +1351,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
        unsigned char buf[32];
        struct digi_port *priv = usb_get_serial_port_data(port);
        struct ktermios not_termios;
-       unsigned long flags = 0;
 
        dbg("digi_open: TOP: port=%d, open_count=%d",
                priv->dp_port_num, port->port.count);
@@ -1345,26 +1359,6 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
        if (digi_startup_device(port->serial) != 0)
                return -ENXIO;
 
-       spin_lock_irqsave(&priv->dp_port_lock, flags);
-
-       /* don't wait on a close in progress for non-blocking opens */
-       if (priv->dp_in_close && (filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0) {
-               spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-               return -EAGAIN;
-       }
-
-       /* wait for a close in progress to finish */
-       while (priv->dp_in_close) {
-               cond_wait_interruptible_timeout_irqrestore(
-                       &priv->dp_close_wait, DIGI_RETRY_TIMEOUT,
-                       &priv->dp_port_lock, flags);
-               if (signal_pending(current))
-                       return -EINTR;
-               spin_lock_irqsave(&priv->dp_port_lock, flags);
-       }
-
-       spin_unlock_irqrestore(&priv->dp_port_lock, flags);
-
        /* read modem signals automatically whenever they change */
        buf[0] = DIGI_CMD_READ_INPUT_SIGNALS;
        buf[1] = priv->dp_port_num;
@@ -1387,16 +1381,11 @@ static int digi_open(struct tty_struct *tty, struct usb_serial_port *port,
                not_termios.c_iflag = ~tty->termios->c_iflag;
                digi_set_termios(tty, port, &not_termios);
        }
-
-       /* set DTR and RTS */
-       digi_set_modem_signals(port, TIOCM_DTR|TIOCM_RTS, 1);
-
        return 0;
 }
 
 
-static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
-                               struct file *filp)
+static void digi_close(struct usb_serial_port *port)
 {
        DEFINE_WAIT(wait);
        int ret;
@@ -1411,28 +1400,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
        if (port->serial->disconnected)
                goto exit;
 
-       /* do cleanup only after final close on this port */
-       spin_lock_irq(&priv->dp_port_lock);
-       priv->dp_in_close = 1;
-       spin_unlock_irq(&priv->dp_port_lock);
-
-       /* tell line discipline to process only XON/XOFF */
-       tty->closing = 1;
-
-       /* wait for output to drain */
-       if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-               tty_wait_until_sent(tty, DIGI_CLOSE_TIMEOUT);
-
-       /* flush driver and line discipline buffers */
-       tty_driver_flush_buffer(tty);
-       tty_ldisc_flush(tty);
-
        if (port->serial->dev) {
-               /* wait for transmit idle */
-               if ((filp->f_flags&(O_NDELAY|O_NONBLOCK)) == 0)
-                       digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
-               /* drop DTR and RTS */
-               digi_set_modem_signals(port, 0, 0);
+               /* FIXME: Transmit idle belongs in the wait_until_sent path */
+               digi_transmit_idle(port, DIGI_CLOSE_TIMEOUT);
 
                /* disable input flow control */
                buf[0] = DIGI_CMD_SET_INPUT_FLOW_CONTROL;
@@ -1477,11 +1447,9 @@ static void digi_close(struct tty_struct *tty, struct usb_serial_port *port,
                /* shutdown any outstanding bulk writes */
                usb_kill_urb(port->write_urb);
        }
-       tty->closing = 0;
 exit:
        spin_lock_irq(&priv->dp_port_lock);
        priv->dp_write_urb_in_use = 0;
-       priv->dp_in_close = 0;
        wake_up_interruptible(&priv->dp_close_wait);
        spin_unlock_irq(&priv->dp_port_lock);
        mutex_unlock(&port->serial->disc_mutex);
@@ -1560,7 +1528,6 @@ static int digi_startup(struct usb_serial *serial)
                priv->dp_throttled = 0;
                priv->dp_throttle_restart = 0;
                init_waitqueue_head(&priv->dp_flush_wait);
-               priv->dp_in_close = 0;
                init_waitqueue_head(&priv->dp_close_wait);
                INIT_WORK(&priv->dp_wakeup_work, digi_wakeup_write_lock);
                priv->dp_port = serial->port[i];
index c709ec474a80b8540e91d5ae6e25fde2690f6b09..2b141ccb0cd958ca56d51c476075dc621e0a5a4a 100644 (file)
@@ -81,8 +81,7 @@ static int debug;
 /* function prototypes for an empeg-car player */
 static int  empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
                                                struct file *filp);
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                               struct file *filp);
+static void empeg_close(struct usb_serial_port *port);
 static int  empeg_write(struct tty_struct *tty, struct usb_serial_port *port,
                                                const unsigned char *buf,
                                                int count);
@@ -181,8 +180,7 @@ static int empeg_open(struct tty_struct *tty, struct usb_serial_port *port,
 }
 
 
-static void empeg_close(struct tty_struct *tty, struct usb_serial_port *port,
-                               struct file *filp)
+static void empeg_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
index d9fcdaedf389d9c670b57847004939c44a25dc32..683304d60615a334554c3b6a0328de5d96879df5 100644 (file)
@@ -89,6 +89,7 @@ struct ftdi_private {
        int force_rtscts;       /* if non-zero, force RTS-CTS to always
                                   be enabled */
 
+       unsigned int latency;           /* latency setting in use */
        spinlock_t tx_lock;     /* spinlock for transmit state */
        unsigned long tx_bytes;
        unsigned long tx_outstanding_bytes;
@@ -719,8 +720,8 @@ static int  ftdi_sio_port_probe(struct usb_serial_port *port);
 static int  ftdi_sio_port_remove(struct usb_serial_port *port);
 static int  ftdi_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void ftdi_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void ftdi_close(struct usb_serial_port *port);
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on);
 static int  ftdi_write(struct tty_struct *tty, struct usb_serial_port *port,
                        const unsigned char *buf, int count);
 static int  ftdi_write_room(struct tty_struct *tty);
@@ -758,6 +759,7 @@ static struct usb_serial_driver ftdi_sio_device = {
        .port_remove =          ftdi_sio_port_remove,
        .open =                 ftdi_open,
        .close =                ftdi_close,
+       .dtr_rts =              ftdi_dtr_rts,
        .throttle =             ftdi_throttle,
        .unthrottle =           ftdi_unthrottle,
        .write =                ftdi_write,
@@ -1037,7 +1039,54 @@ static int change_speed(struct tty_struct *tty, struct usb_serial_port *port)
        return rv;
 }
 
+static int write_latency_timer(struct usb_serial_port *port)
+{
+       struct ftdi_private *priv = usb_get_serial_port_data(port);
+       struct usb_device *udev = port->serial->dev;
+       char buf[1];
+       int rv = 0;
+       int l = priv->latency;
+
+       if (priv->flags & ASYNC_LOW_LATENCY)
+               l = 1;
+
+       dbg("%s: setting latency timer = %i", __func__, l);
+
+       rv = usb_control_msg(udev,
+                            usb_sndctrlpipe(udev, 0),
+                            FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
+                            FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
+                            l, priv->interface,
+                            buf, 0, WDR_TIMEOUT);
+
+       if (rv < 0)
+               dev_err(&port->dev, "Unable to write latency timer: %i\n", rv);
+       return rv;
+}
+
+static int read_latency_timer(struct usb_serial_port *port)
+{
+       struct ftdi_private *priv = usb_get_serial_port_data(port);
+       struct usb_device *udev = port->serial->dev;
+       unsigned short latency = 0;
+       int rv = 0;
+
 
+       dbg("%s", __func__);
+
+       rv = usb_control_msg(udev,
+                            usb_rcvctrlpipe(udev, 0),
+                            FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
+                            FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
+                            0, priv->interface,
+                            (char *) &latency, 1, WDR_TIMEOUT);
+
+       if (rv < 0) {
+               dev_err(&port->dev, "Unable to read latency timer: %i\n", rv);
+               return -EIO;
+       }
+       return latency;
+}
 
 static int get_serial_info(struct usb_serial_port *port,
                                struct serial_struct __user *retinfo)
@@ -1097,6 +1146,7 @@ static int set_serial_info(struct tty_struct *tty,
        priv->custom_divisor = new_serial.custom_divisor;
 
        tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
+       write_latency_timer(port);
 
 check_and_exit:
        if ((old_priv.flags & ASYNC_SPD_MASK) !=
@@ -1192,27 +1242,13 @@ static ssize_t show_latency_timer(struct device *dev,
 {
        struct usb_serial_port *port = to_usb_serial_port(dev);
        struct ftdi_private *priv = usb_get_serial_port_data(port);
-       struct usb_device *udev = port->serial->dev;
-       unsigned short latency = 0;
-       int rv = 0;
-
-
-       dbg("%s", __func__);
-
-       rv = usb_control_msg(udev,
-                            usb_rcvctrlpipe(udev, 0),
-                            FTDI_SIO_GET_LATENCY_TIMER_REQUEST,
-                            FTDI_SIO_GET_LATENCY_TIMER_REQUEST_TYPE,
-                            0, priv->interface,
-                            (char *) &latency, 1, WDR_TIMEOUT);
-
-       if (rv < 0) {
-               dev_err(dev, "Unable to read latency timer: %i\n", rv);
-               return -EIO;
-       }
-       return sprintf(buf, "%i\n", latency);
+       if (priv->flags & ASYNC_LOW_LATENCY)
+               return sprintf(buf, "1\n");
+       else
+               return sprintf(buf, "%i\n", priv->latency);
 }
 
+
 /* Write a new value of the latency timer, in units of milliseconds. */
 static ssize_t store_latency_timer(struct device *dev,
                        struct device_attribute *attr, const char *valbuf,
@@ -1220,25 +1256,13 @@ static ssize_t store_latency_timer(struct device *dev,
 {
        struct usb_serial_port *port = to_usb_serial_port(dev);
        struct ftdi_private *priv = usb_get_serial_port_data(port);
-       struct usb_device *udev = port->serial->dev;
-       char buf[1];
        int v = simple_strtoul(valbuf, NULL, 10);
        int rv = 0;
 
-       dbg("%s: setting latency timer = %i", __func__, v);
-
-       rv = usb_control_msg(udev,
-                            usb_sndctrlpipe(udev, 0),
-                            FTDI_SIO_SET_LATENCY_TIMER_REQUEST,
-                            FTDI_SIO_SET_LATENCY_TIMER_REQUEST_TYPE,
-                            v, priv->interface,
-                            buf, 0, WDR_TIMEOUT);
-
-       if (rv < 0) {
-               dev_err(dev, "Unable to write latency timer: %i\n", rv);
+       priv->latency = v;
+       rv = write_latency_timer(port);
+       if (rv < 0)
                return -EIO;
-       }
-
        return count;
 }
 
@@ -1392,6 +1416,7 @@ static int ftdi_sio_port_probe(struct usb_serial_port *port)
        usb_set_serial_port_data(port, priv);
 
        ftdi_determine_type(port);
+       read_latency_timer(port);
        create_sysfs_attrs(port);
        return 0;
 }
@@ -1514,6 +1539,8 @@ static int ftdi_open(struct tty_struct *tty,
        if (tty)
                tty->low_latency = (priv->flags & ASYNC_LOW_LATENCY) ? 1 : 0;
 
+       write_latency_timer(port);
+
        /* No error checking for this (will get errors later anyway) */
        /* See ftdi_sio.h for description of what is reset */
        usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
@@ -1529,11 +1556,6 @@ static int ftdi_open(struct tty_struct *tty,
        if (tty)
                ftdi_set_termios(tty, port, tty->termios);
 
-       /* FIXME: Flow control might be enabled, so it should be checked -
-          we have no control of defaults! */
-       /* Turn on RTS and DTR since we are not flow controlling by default */
-       set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-
        /* Not throttled */
        spin_lock_irqsave(&priv->rx_lock, flags);
        priv->rx_flags &= ~(THROTTLED | ACTUALLY_THROTTLED);
@@ -1558,6 +1580,30 @@ static int ftdi_open(struct tty_struct *tty,
 } /* ftdi_open */
 
 
+static void ftdi_dtr_rts(struct usb_serial_port *port, int on)
+{
+       struct ftdi_private *priv = usb_get_serial_port_data(port);
+       char buf[1];
+
+       mutex_lock(&port->serial->disc_mutex);
+       if (!port->serial->disconnected) {
+               /* Disable flow control */
+               if (!on && usb_control_msg(port->serial->dev,
+                           usb_sndctrlpipe(port->serial->dev, 0),
+                           FTDI_SIO_SET_FLOW_CTRL_REQUEST,
+                           FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
+                           0, priv->interface, buf, 0,
+                           WDR_TIMEOUT) < 0) {
+                           dev_err(&port->dev, "error from flowcontrol urb\n");
+               }
+               /* raise or drop RTS and DTR as requested */
+               if (on)
+                       set_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+               else
+                       clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
+       }
+       mutex_unlock(&port->serial->disc_mutex);
+}
 
 /*
  * usbserial:__serial_close  only calls ftdi_close if the point is open
@@ -1567,31 +1613,12 @@ static int ftdi_open(struct tty_struct *tty,
  *
  */
 
-static void ftdi_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void ftdi_close(struct usb_serial_port *port)
 { /* ftdi_close */
-       unsigned int c_cflag = tty->termios->c_cflag;
        struct ftdi_private *priv = usb_get_serial_port_data(port);
-       char buf[1];
 
        dbg("%s", __func__);
 
-       mutex_lock(&port->serial->disc_mutex);
-       if (c_cflag & HUPCL && !port->serial->disconnected) {
-               /* Disable flow control */
-               if (usb_control_msg(port->serial->dev,
-                                   usb_sndctrlpipe(port->serial->dev, 0),
-                                   FTDI_SIO_SET_FLOW_CTRL_REQUEST,
-                                   FTDI_SIO_SET_FLOW_CTRL_REQUEST_TYPE,
-                                   0, priv->interface, buf, 0,
-                                   WDR_TIMEOUT) < 0) {
-                       dev_err(&port->dev, "error from flowcontrol urb\n");
-               }
-
-               /* drop RTS and DTR */
-               clear_mctrl(port, TIOCM_DTR | TIOCM_RTS);
-       } /* Note change no line if hupcl is off */
-       mutex_unlock(&port->serial->disc_mutex);
 
        /* cancel any scheduled reading */
        cancel_delayed_work_sync(&priv->rx_work);
index 586d30ff450b2eb9cc8346bd26ea3697056661ac..ee25a3fe3b09c317b0ecbb86f8dde60301bbc9b9 100644 (file)
@@ -993,8 +993,7 @@ static int garmin_open(struct tty_struct *tty,
 }
 
 
-static void garmin_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void garmin_close(struct usb_serial_port *port)
 {
        struct usb_serial *serial = port->serial;
        struct garmin_data *garmin_data_p = usb_get_serial_port_data(port);
index 4cec9906ccf394ef312f07d152bbc64785f2cce0..be82ea956720184d545a0c43bce6f684c032fb7e 100644 (file)
@@ -184,8 +184,7 @@ int usb_serial_generic_resume(struct usb_serial *serial)
 }
 EXPORT_SYMBOL_GPL(usb_serial_generic_resume);
 
-void usb_serial_generic_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+void usb_serial_generic_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
        generic_cleanup(port);
index fb4a73d090f6b9137daa847301a37545c48edc9b..53ef5996e33de377b6290c84e4183f2669ef646e 100644 (file)
@@ -207,8 +207,7 @@ static void edge_bulk_out_cmd_callback(struct urb *urb);
 /* function prototypes for the usbserial callbacks */
 static int edge_open(struct tty_struct *tty, struct usb_serial_port *port,
                                        struct file *filp);
-static void edge_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                       struct file *filp);
+static void edge_close(struct usb_serial_port *port);
 static int edge_write(struct tty_struct *tty, struct usb_serial_port *port,
                                        const unsigned char *buf, int count);
 static int edge_write_room(struct tty_struct *tty);
@@ -965,7 +964,7 @@ static int edge_open(struct tty_struct *tty,
 
        if (!edge_port->txfifo.fifo) {
                dbg("%s - no memory", __func__);
-               edge_close(tty, port, filp);
+               edge_close(port);
                return -ENOMEM;
        }
 
@@ -975,7 +974,7 @@ static int edge_open(struct tty_struct *tty,
 
        if (!edge_port->write_urb) {
                dbg("%s - no memory", __func__);
-               edge_close(tty, port, filp);
+               edge_close(port);
                return -ENOMEM;
        }
 
@@ -1099,8 +1098,7 @@ static void block_until_tx_empty(struct edgeport_port *edge_port)
  * edge_close
  *     this function is called by the tty driver when a port is closed
  *****************************************************************************/
-static void edge_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
        struct edgeport_serial *edge_serial;
        struct edgeport_port *edge_port;
index 513b25e044c166749f062843d246350bd2ea8ea3..eabf20eeb370ee4ce578c831b36003a7837d2bd9 100644 (file)
@@ -2009,8 +2009,7 @@ release_es_lock:
        return status;
 }
 
-static void edge_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void edge_close(struct usb_serial_port *port)
 {
        struct edgeport_serial *edge_serial;
        struct edgeport_port *edge_port;
index cd62825a9ac325e520d6cf0ba93f6e7cab763d66..c610a99fa47741c51d2b9312150f9ffc05868e23 100644 (file)
@@ -76,8 +76,7 @@ static int initial_wait;
 /* Function prototypes for an ipaq */
 static int  ipaq_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void ipaq_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void ipaq_close(struct usb_serial_port *port);
 static int  ipaq_calc_num_ports(struct usb_serial *serial);
 static int  ipaq_startup(struct usb_serial *serial);
 static void ipaq_shutdown(struct usb_serial *serial);
@@ -714,8 +713,7 @@ error:
 }
 
 
-static void ipaq_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void ipaq_close(struct usb_serial_port *port)
 {
        struct ipaq_private     *priv = usb_get_serial_port_data(port);
 
index da2a2b46644a5ddada6bd79abf71504df4148fb7..29ad038b9c8de2536a9373bb60b28f9ad7354044 100644 (file)
@@ -302,23 +302,17 @@ static int ipw_open(struct tty_struct *tty,
        return 0;
 }
 
-static void ipw_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void ipw_dtr_rts(struct usb_serial_port *port, int on)
 {
        struct usb_device *dev = port->serial->dev;
        int result;
 
-       if (tty_hung_up_p(filp)) {
-               dbg("%s: tty_hung_up_p ...", __func__);
-               return;
-       }
-
        /*--1: drop the dtr */
        dbg("%s:dropping dtr", __func__);
        result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
                         IPW_SIO_SET_PIN,
                         USB_TYPE_VENDOR | USB_RECIP_INTERFACE | USB_DIR_OUT,
-                        IPW_PIN_CLRDTR,
+                        on ? IPW_PIN_SETDTR : IPW_PIN_CLRDTR,
                         0,
                         NULL,
                         0,
@@ -332,7 +326,7 @@ static void ipw_close(struct tty_struct *tty,
        result = usb_control_msg(dev, usb_sndctrlpipe(dev, 0),
                         IPW_SIO_SET_PIN, USB_TYPE_VENDOR |
                                        USB_RECIP_INTERFACE | USB_DIR_OUT,
-                        IPW_PIN_CLRRTS,
+                        on ? IPW_PIN_SETRTS : IPW_PIN_CLRRTS,
                         0,
                         NULL,
                         0,
@@ -340,7 +334,12 @@ static void ipw_close(struct tty_struct *tty,
        if (result < 0)
                dev_err(&port->dev,
                                "dropping rts failed (error = %d)\n", result);
+}
 
+static void ipw_close(struct usb_serial_port *port)
+{
+       struct usb_device *dev = port->serial->dev;
+       int result;
 
        /*--3: purge */
        dbg("%s:sending purge", __func__);
@@ -461,6 +460,7 @@ static struct usb_serial_driver ipw_device = {
        .num_ports =            1,
        .open =                 ipw_open,
        .close =                ipw_close,
+       .dtr_rts =              ipw_dtr_rts,
        .port_probe =           ipw_probe,
        .port_remove =          ipw_disconnect,
        .write =                ipw_write,
index 4e2cda93da596f7d5b98db2ae9a60b001418b392..66009b6b763a29a511551a9600d50802de63d3c9 100644 (file)
@@ -88,8 +88,7 @@ static int xbof = -1;
 static int  ir_startup (struct usb_serial *serial);
 static int  ir_open(struct tty_struct *tty, struct usb_serial_port *port,
                                        struct file *filep);
-static void ir_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                       struct file *filep);
+static void ir_close(struct usb_serial_port *port);
 static int  ir_write(struct tty_struct *tty, struct usb_serial_port *port,
                                        const unsigned char *buf, int count);
 static void ir_write_bulk_callback (struct urb *urb);
@@ -346,8 +345,7 @@ static int ir_open(struct tty_struct *tty,
        return result;
 }
 
-static void ir_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file * filp)
+static void ir_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
index 4473d442b2aa45357f7bfcea14aac39445de87da..76a3cc327bb9c2d0dbfdfa0a0e0df55015e15036 100644 (file)
@@ -40,7 +40,7 @@ static int debug;
 /*
  * Version Information
  */
-#define DRIVER_VERSION "v0.5"
+#define DRIVER_VERSION "v0.10"
 #define DRIVER_DESC "Infinity USB Unlimited Phoenix driver"
 
 static struct usb_device_id id_table[] = {
@@ -70,7 +70,6 @@ static void read_rxcmd_callback(struct urb *urb);
 struct iuu_private {
        spinlock_t lock;        /* store irq state */
        wait_queue_head_t delta_msr_wait;
-       u8 line_control;
        u8 line_status;
        u8 termios_initialized;
        int tiostatus;          /* store IUART SIGNAL for tiocmget call */
@@ -651,32 +650,33 @@ static int iuu_bulk_write(struct usb_serial_port *port)
        unsigned long flags;
        int result;
        int i;
+       int buf_len;
        char *buf_ptr = port->write_urb->transfer_buffer;
        dbg("%s - enter", __func__);
 
+       spin_lock_irqsave(&priv->lock, flags);
        *buf_ptr++ = IUU_UART_ESC;
        *buf_ptr++ = IUU_UART_TX;
        *buf_ptr++ = priv->writelen;
 
-       memcpy(buf_ptr, priv->writebuf,
-              priv->writelen);
+       memcpy(buf_ptr, priv->writebuf, priv->writelen);
+       buf_len = priv->writelen;
+       priv->writelen = 0;
+       spin_unlock_irqrestore(&priv->lock, flags);
        if (debug == 1) {
-               for (i = 0; i < priv->writelen; i++)
+               for (i = 0; i < buf_len; i++)
                        sprintf(priv->dbgbuf + i*2 ,
                                "%02X", priv->writebuf[i]);
-               priv->dbgbuf[priv->writelen+i*2] = 0;
+               priv->dbgbuf[buf_len+i*2] = 0;
                dbg("%s - writing %i chars : %s", __func__,
-                   priv->writelen, priv->dbgbuf);
+                   buf_len, priv->dbgbuf);
        }
        usb_fill_bulk_urb(port->write_urb, port->serial->dev,
                          usb_sndbulkpipe(port->serial->dev,
                                          port->bulk_out_endpointAddress),
-                         port->write_urb->transfer_buffer, priv->writelen + 3,
+                         port->write_urb->transfer_buffer, buf_len + 3,
                          iuu_rxcmd, port);
        result = usb_submit_urb(port->write_urb, GFP_ATOMIC);
-       spin_lock_irqsave(&priv->lock, flags);
-       priv->writelen = 0;
-       spin_unlock_irqrestore(&priv->lock, flags);
        usb_serial_port_softint(port);
        return result;
 }
@@ -770,14 +770,10 @@ static int iuu_uart_write(struct tty_struct *tty, struct usb_serial_port *port,
                return -ENOMEM;
 
        spin_lock_irqsave(&priv->lock, flags);
-       if (priv->writelen > 0) {
-               /* buffer already filled but not commited */
-               spin_unlock_irqrestore(&priv->lock, flags);
-               return 0;
-       }
+
        /* fill the buffer */
-       memcpy(priv->writebuf, buf, count);
-       priv->writelen = count;
+       memcpy(priv->writebuf + priv->writelen, buf, count);
+       priv->writelen += count;
        spin_unlock_irqrestore(&priv->lock, flags);
 
        return count;
@@ -819,7 +815,7 @@ static int iuu_uart_on(struct usb_serial_port *port)
        buf[0] = IUU_UART_ENABLE;
        buf[1] = (u8) ((IUU_BAUD_9600 >> 8) & 0x00FF);
        buf[2] = (u8) (0x00FF & IUU_BAUD_9600);
-       buf[3] = (u8) (0x0F0 & IUU_TWO_STOP_BITS) | (0x07 & IUU_PARITY_EVEN);
+       buf[3] = (u8) (0x0F0 & IUU_ONE_STOP_BIT) | (0x07 & IUU_PARITY_EVEN);
 
        status = bulk_immediate(port, buf, 4);
        if (status != IUU_OPERATION_OK) {
@@ -946,19 +942,59 @@ static int iuu_uart_baud(struct usb_serial_port *port, u32 baud,
        return status;
 }
 
-static int set_control_lines(struct usb_device *dev, u8 value)
+static void iuu_set_termios(struct tty_struct *tty,
+               struct usb_serial_port *port, struct ktermios *old_termios)
 {
-       return 0;
+       const u32 supported_mask = CMSPAR|PARENB|PARODD;
+
+       unsigned int cflag = tty->termios->c_cflag;
+       int status;
+       u32 actual;
+       u32 parity;
+       int csize = CS7;
+       int baud = 9600;        /* Fixed for the moment */
+       u32 newval = cflag & supported_mask;
+
+       /* compute the parity parameter */
+       parity = 0;
+       if (cflag & CMSPAR) {   /* Using mark space */
+               if (cflag & PARODD)
+                       parity |= IUU_PARITY_SPACE;
+               else
+                       parity |= IUU_PARITY_MARK;
+       } else if (!(cflag & PARENB)) {
+               parity |= IUU_PARITY_NONE;
+               csize = CS8;
+       } else if (cflag & PARODD)
+               parity |= IUU_PARITY_ODD;
+       else
+               parity |= IUU_PARITY_EVEN;
+
+       parity |= (cflag & CSTOPB ? IUU_TWO_STOP_BITS : IUU_ONE_STOP_BIT);
+
+       /* set it */
+       status = iuu_uart_baud(port,
+                       (clockmode == 2) ? 16457 : 9600 * boost / 100,
+                       &actual, parity);
+
+       /* set the termios value to the real one, so the user knows what has
+        * changed. We support few fields so it's easiest to copy the old hw
+        * settings back over and then adjust them
+        */
+       if (old_termios)
+               tty_termios_copy_hw(tty->termios, old_termios);
+       if (status != 0)        /* Set failed - return old bits */
+               return;
+       /* Re-encode speed, parity and csize */
+       tty_encode_baud_rate(tty, baud, baud);
+       tty->termios->c_cflag &= ~(supported_mask|CSIZE);
+       tty->termios->c_cflag |= newval | csize;
 }
 
-static void iuu_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void iuu_close(struct usb_serial_port *port)
 {
        /* iuu_led (port,255,0,0,0); */
        struct usb_serial *serial;
-       struct iuu_private *priv = usb_get_serial_port_data(port);
-       unsigned long flags;
-       unsigned int c_cflag;
 
        serial = port->serial;
        if (!serial)
@@ -968,17 +1004,6 @@ static void iuu_close(struct tty_struct *tty,
 
        iuu_uart_off(port);
        if (serial->dev) {
-               if (tty) {
-                       c_cflag = tty->termios->c_cflag;
-                       if (c_cflag & HUPCL) {
-                               /* drop DTR and RTS */
-                               priv = usb_get_serial_port_data(port);
-                               spin_lock_irqsave(&priv->lock, flags);
-                               priv->line_control = 0;
-                               spin_unlock_irqrestore(&priv->lock, flags);
-                               set_control_lines(port->serial->dev, 0);
-                       }
-               }
                /* free writebuf */
                /* shutdown our urbs */
                dbg("%s - shutting down urbs", __func__);
@@ -1154,7 +1179,7 @@ static int iuu_open(struct tty_struct *tty,
        if (result) {
                dev_err(&port->dev, "%s - failed submitting read urb,"
                        " error %d\n", __func__, result);
-               iuu_close(tty, port, NULL);
+               iuu_close(port);
                return -EPROTO;
        } else {
                dbg("%s - rxcmd OK", __func__);
@@ -1175,6 +1200,7 @@ static struct usb_serial_driver iuu_device = {
        .read_bulk_callback = iuu_uart_read_callback,
        .tiocmget = iuu_tiocmget,
        .tiocmset = iuu_tiocmset,
+       .set_termios = iuu_set_termios,
        .attach = iuu_startup,
        .shutdown = iuu_shutdown,
 };
index 00daa8f7759a99946fc3a902dc1e46c9a2bf55af..f1195a98f316df62f0d465b144b32fc3f6f9da66 100644 (file)
@@ -1298,8 +1298,16 @@ static inline void stop_urb(struct urb *urb)
                usb_kill_urb(urb);
 }
 
-static void keyspan_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void keyspan_dtr_rts(struct usb_serial_port *port, int on)
+{      /* Store 'on' as both the RTS and DTR state, then send a setup */
+       struct keyspan_port_private *p_priv = usb_get_serial_port_data(port);
+
+       p_priv->rts_state = on;         /* requested RTS state */
+       p_priv->dtr_state = on;         /* requested DTR state */
+       keyspan_send_setup(port, 0);    /* pass the updated state on */
+}
+
+static void keyspan_close(struct usb_serial_port *port)
 {
        int                     i;
        struct usb_serial       *serial = port->serial;
@@ -1336,7 +1344,6 @@ static void keyspan_close(struct tty_struct *tty,
                        stop_urb(p_priv->out_urbs[i]);
                }
        }
-       tty_port_tty_set(&port->port, NULL);
 }
 
 /* download the firmware to a pre-renumeration device */
index 38b4582e073446ad6513b7ed11145ccc6a26e61c..0d4569b60768a736a1e180db3621aa9053f22e72 100644 (file)
@@ -38,9 +38,8 @@
 static int  keyspan_open               (struct tty_struct *tty,
                                         struct usb_serial_port *port,
                                         struct file *filp);
-static void keyspan_close              (struct tty_struct *tty,
-                                        struct usb_serial_port *port,
-                                        struct file *filp);
+static void keyspan_close              (struct usb_serial_port *port);
+static void keyspan_dtr_rts            (struct usb_serial_port *port, int on);
 static int  keyspan_startup            (struct usb_serial *serial);
 static void keyspan_shutdown           (struct usb_serial *serial);
 static int  keyspan_write_room         (struct tty_struct *tty);
@@ -562,6 +561,7 @@ static struct usb_serial_driver keyspan_1port_device = {
        .num_ports              = 1,
        .open                   = keyspan_open,
        .close                  = keyspan_close,
+       .dtr_rts                = keyspan_dtr_rts,
        .write                  = keyspan_write,
        .write_room             = keyspan_write_room,
        .set_termios            = keyspan_set_termios,
@@ -582,6 +582,7 @@ static struct usb_serial_driver keyspan_2port_device = {
        .num_ports              = 2,
        .open                   = keyspan_open,
        .close                  = keyspan_close,
+       .dtr_rts                = keyspan_dtr_rts,
        .write                  = keyspan_write,
        .write_room             = keyspan_write_room,
        .set_termios            = keyspan_set_termios,
@@ -602,6 +603,7 @@ static struct usb_serial_driver keyspan_4port_device = {
        .num_ports              = 4,
        .open                   = keyspan_open,
        .close                  = keyspan_close,
+       .dtr_rts                = keyspan_dtr_rts,
        .write                  = keyspan_write,
        .write_room             = keyspan_write_room,
        .set_termios            = keyspan_set_termios,
index bf1ae247da66133f64ee79d57f15ef6619f82aa2..ab769dbea1b3e6f66d4df15ce6166caca8a17a50 100644 (file)
@@ -651,6 +651,35 @@ static int keyspan_pda_chars_in_buffer(struct tty_struct *tty)
 }
 
 
+static void keyspan_pda_dtr_rts(struct usb_serial_port *port, int on)
+{      /* Write the modem-control bits for 'on' via set_modem_info */
+       struct usb_serial *serial = port->serial;
+
+       if (serial->dev) {      /* nothing is sent when serial->dev is NULL */
+               if (on)         /* bits 7 and 2: presumably DTR and RTS */
+                       keyspan_pda_set_modem_info(serial, (1<<7) | (1<< 2));
+               else            /* all modem-control bits cleared */
+                       keyspan_pda_set_modem_info(serial, 0);
+       }
+}
+
+static int keyspan_pda_carrier_raised(struct usb_serial_port *port)
+{
+       struct usb_serial *serial = port->serial;
+       unsigned char modembits;        /* filled by keyspan_pda_get_modem_info() */
+
+       /* Return 0 only when the modem status was read successfully and
+          its DCD bit (bit 6) is clear, i.e. carrier is not raised yet */
+       if (keyspan_pda_get_modem_info(serial, &modembits) >= 0) {
+               if (!(modembits & (1<<6)))      /* mask must be 1<<6; 1>>6 is 0 */
+                       return 0;
+       }
+       /* Carrier raised, or we failed (eg disconnected) so
+          progress accordingly */
+       return 1;
+}
+
+
 static int keyspan_pda_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp)
 {
@@ -682,13 +711,6 @@ static int keyspan_pda_open(struct tty_struct *tty,
        priv->tx_room = room;
        priv->tx_throttled = room ? 0 : 1;
 
-       /* the normal serial device seems to always turn on DTR and RTS here,
-          so do the same */
-       if (tty && (tty->termios->c_cflag & CBAUD))
-               keyspan_pda_set_modem_info(serial, (1<<7) | (1<<2));
-       else
-               keyspan_pda_set_modem_info(serial, 0);
-
        /*Start reading from the device*/
        port->interrupt_in_urb->dev = serial->dev;
        rc = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
@@ -700,19 +722,11 @@ static int keyspan_pda_open(struct tty_struct *tty,
 error:
        return rc;
 }
-
-
-static void keyspan_pda_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void keyspan_pda_close(struct usb_serial_port *port)
 {
        struct usb_serial *serial = port->serial;
 
        if (serial->dev) {
-               /* the normal serial device seems to always shut
-                  off DTR and RTS now */
-               if (tty->termios->c_cflag & HUPCL)
-                       keyspan_pda_set_modem_info(serial, 0);
-
                /* shutdown our bulk reads and writes */
                usb_kill_urb(port->write_urb);
                usb_kill_urb(port->interrupt_in_urb);
@@ -839,6 +853,8 @@ static struct usb_serial_driver keyspan_pda_device = {
        .usb_driver =           &keyspan_pda_driver,
        .id_table =             id_table_std,
        .num_ports =            1,
+       .dtr_rts =              keyspan_pda_dtr_rts,
+       .carrier_raised =       keyspan_pda_carrier_raised,
        .open =                 keyspan_pda_open,
        .close =                keyspan_pda_close,
        .write =                keyspan_pda_write,
index fcd9082f3e7f05d93fedc3073e3f6a87c318a6a8..fa817c66b3e8c827bb6ce0837591f2406dd73f70 100644 (file)
@@ -76,8 +76,7 @@ static int  klsi_105_startup(struct usb_serial *serial);
 static void klsi_105_shutdown(struct usb_serial *serial);
 static int  klsi_105_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void klsi_105_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void klsi_105_close(struct usb_serial_port *port);
 static int  klsi_105_write(struct tty_struct *tty,
        struct usb_serial_port *port, const unsigned char *buf, int count);
 static void klsi_105_write_bulk_callback(struct urb *urb);
@@ -447,8 +446,7 @@ exit:
 } /* klsi_105_open */
 
 
-static void klsi_105_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void klsi_105_close(struct usb_serial_port *port)
 {
        struct klsi_105_private *priv = usb_get_serial_port_data(port);
        int rc;
index c148544953b37df33a00fc9873377a31168de993..6b570498287f3f5d62d337e6a347945101b638c3 100644 (file)
@@ -72,8 +72,7 @@ static int  kobil_startup(struct usb_serial *serial);
 static void kobil_shutdown(struct usb_serial *serial);
 static int  kobil_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void kobil_close(struct tty_struct *tty, struct usb_serial_port *port,
-                       struct file *filp);
+static void kobil_close(struct usb_serial_port *port);
 static int  kobil_write(struct tty_struct *tty, struct usb_serial_port *port,
                         const unsigned char *buf, int count);
 static int  kobil_write_room(struct tty_struct *tty);
@@ -209,7 +208,7 @@ static void kobil_shutdown(struct usb_serial *serial)
 
        for (i = 0; i < serial->num_ports; ++i) {
                while (serial->port[i]->port.count > 0)
-                       kobil_close(NULL, serial->port[i], NULL);
+                       kobil_close(serial->port[i]);
                kfree(usb_get_serial_port_data(serial->port[i]));
                usb_set_serial_port_data(serial->port[i], NULL);
        }
@@ -346,11 +345,11 @@ static int kobil_open(struct tty_struct *tty,
 }
 
 
-static void kobil_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void kobil_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
+       /* FIXME: Add rts/dtr methods */
        if (port->write_urb) {
                usb_kill_urb(port->write_urb);
                usb_free_urb(port->write_urb);
index 82930a7d509327c2797ef474042e0a3d2a4106ac..873795548fc0a976a5bd129e3316933c9eda069b 100644 (file)
@@ -95,8 +95,8 @@ static int  mct_u232_startup(struct usb_serial *serial);
 static void mct_u232_shutdown(struct usb_serial *serial);
 static int  mct_u232_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void mct_u232_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void mct_u232_close(struct usb_serial_port *port);
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on);
 static void mct_u232_read_int_callback(struct urb *urb);
 static void mct_u232_set_termios(struct tty_struct *tty,
                        struct usb_serial_port *port, struct ktermios *old);
@@ -140,6 +140,7 @@ static struct usb_serial_driver mct_u232_device = {
        .num_ports =         1,
        .open =              mct_u232_open,
        .close =             mct_u232_close,
+       .dtr_rts =           mct_u232_dtr_rts,
        .throttle =          mct_u232_throttle,
        .unthrottle =        mct_u232_unthrottle,
        .read_int_callback = mct_u232_read_int_callback,
@@ -496,29 +497,29 @@ error:
        return retval;
 } /* mct_u232_open */
 
-
-static void mct_u232_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void mct_u232_dtr_rts(struct usb_serial_port *port, int on)
 {
-       unsigned int c_cflag;
        unsigned int control_state;
        struct mct_u232_private *priv = usb_get_serial_port_data(port);
-       dbg("%s port %d", __func__, port->number);
 
-       if (tty) {
-               c_cflag = tty->termios->c_cflag;
-               mutex_lock(&port->serial->disc_mutex);
-               if (c_cflag & HUPCL && !port->serial->disconnected) {
-                       /* drop DTR and RTS */
-                       spin_lock_irq(&priv->lock);
+       mutex_lock(&port->serial->disc_mutex);
+       if (!port->serial->disconnected) {
+               /* raise or drop DTR and RTS as requested */
+               spin_lock_irq(&priv->lock);
+               if (on)
+                       priv->control_state |= TIOCM_DTR | TIOCM_RTS;
+               else
                        priv->control_state &= ~(TIOCM_DTR | TIOCM_RTS);
-                       control_state = priv->control_state;
-                       spin_unlock_irq(&priv->lock);
-                       mct_u232_set_modem_ctrl(port->serial, control_state);
-               }
-               mutex_unlock(&port->serial->disc_mutex);
+               control_state = priv->control_state;
+               spin_unlock_irq(&priv->lock);
+               mct_u232_set_modem_ctrl(port->serial, control_state);
        }
+       mutex_unlock(&port->serial->disc_mutex);
+}
 
+static void mct_u232_close(struct usb_serial_port *port)
+{
+       dbg("%s port %d", __func__, port->number);
 
        if (port->serial->dev) {
                /* shutdown our urbs */
index 24e3b5d4b4d49f088a9891718bc36c815c4a409c..9e1a013ee7f679177e7b588aabbb2b6534d4cda3 100644 (file)
@@ -533,8 +533,7 @@ static int mos7720_chars_in_buffer(struct tty_struct *tty)
        return chars;
 }
 
-static void mos7720_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void mos7720_close(struct usb_serial_port *port)
 {
        struct usb_serial *serial;
        struct moschip_port *mos7720_port;
index 84fb1dcd30dc3eacdf186c3666375775ff9b1fad..10b78a37214f6c337b1ac1bb3cb93219a9bbd3ba 100644 (file)
@@ -1135,54 +1135,12 @@ static int mos7840_chars_in_buffer(struct tty_struct *tty)
 
 }
 
-/************************************************************************
- *
- * mos7840_block_until_tx_empty
- *
- *     This function will block the close until one of the following:
- *             1. TX count are 0
- *             2. The mos7840 has stopped
- *             3. A timeout of 3 seconds without activity has expired
- *
- ************************************************************************/
-static void mos7840_block_until_tx_empty(struct tty_struct *tty,
-                               struct moschip_port *mos7840_port)
-{
-       int timeout = HZ / 10;
-       int wait = 30;
-       int count;
-
-       while (1) {
-
-               count = mos7840_chars_in_buffer(tty);
-
-               /* Check for Buffer status */
-               if (count <= 0)
-                       return;
-
-               /* Block the thread for a while */
-               interruptible_sleep_on_timeout(&mos7840_port->wait_chase,
-                                              timeout);
-
-               /* No activity.. count down section */
-               wait--;
-               if (wait == 0) {
-                       dbg("%s - TIMEOUT", __func__);
-                       return;
-               } else {
-                       /* Reset timeout value back to seconds */
-                       wait = 30;
-               }
-       }
-}
-
 /*****************************************************************************
  * mos7840_close
  *     this function is called by the tty driver when a port is closed
  *****************************************************************************/
 
-static void mos7840_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void mos7840_close(struct usb_serial_port *port)
 {
        struct usb_serial *serial;
        struct moschip_port *mos7840_port;
@@ -1223,10 +1181,6 @@ static void mos7840_close(struct tty_struct *tty,
                }
        }
 
-       if (serial->dev)
-               /* flush and block until tx is empty */
-               mos7840_block_until_tx_empty(tty, mos7840_port);
-
        /* While closing port, shutdown all bulk read, write  *
         * and interrupt read if they exists                  */
        if (serial->dev) {
index bcdcbb822705fd952863dcd5d041abef4d8daad3..f5f3751a888ced2547abf459d63e99a6cafe3a7e 100644 (file)
@@ -98,8 +98,7 @@ static int navman_open(struct tty_struct *tty,
        return result;
 }
 
-static void navman_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void navman_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
 
index df6539712726b93b8c007ecea018a5ed9ea4cb50..1104617334f50d9ff4e9afbbe3d3b21934f92cc3 100644 (file)
@@ -66,8 +66,7 @@ static int debug;
 /* function prototypes */
 static int  omninet_open(struct tty_struct *tty, struct usb_serial_port *port,
                                                        struct file *filp);
-static void omninet_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                                       struct file *filp);
+static void omninet_close(struct usb_serial_port *port);
 static void omninet_read_bulk_callback(struct urb *urb);
 static void omninet_write_bulk_callback(struct urb *urb);
 static int  omninet_write(struct tty_struct *tty, struct usb_serial_port *port,
@@ -189,8 +188,7 @@ static int omninet_open(struct tty_struct *tty,
        return result;
 }
 
-static void omninet_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void omninet_close(struct usb_serial_port *port)
 {
        dbg("%s - port %d", __func__, port->number);
        usb_kill_urb(port->read_urb);
index b500ad10b7589185087bbcb64cc0e4ba97fbfd3b..c20480aa975558c9fdac755fe0323991e6c1b4f6 100644 (file)
@@ -173,8 +173,7 @@ static int opticon_open(struct tty_struct *tty, struct usb_serial_port *port,
        return result;
 }
 
-static void opticon_close(struct tty_struct *tty, struct usb_serial_port *port,
-                         struct file *filp)
+static void opticon_close(struct usb_serial_port *port)
 {
        struct opticon_private *priv = usb_get_serial_data(port->serial);
 
index 7817b82889ca5e56eeb0c63a01c9351cf1a02612..a16d69fadba1cd9fa7088b9baa3a598b6402c2d9 100644 (file)
@@ -45,8 +45,9 @@
 /* Function prototypes */
 static int  option_open(struct tty_struct *tty, struct usb_serial_port *port,
                                                        struct file *filp);
-static void option_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                                       struct file *filp);
+static void option_close(struct usb_serial_port *port);
+static void option_dtr_rts(struct usb_serial_port *port, int on);
+
 static int  option_startup(struct usb_serial *serial);
 static void option_shutdown(struct usb_serial *serial);
 static int  option_write_room(struct tty_struct *tty);
@@ -61,7 +62,7 @@ static void option_set_termios(struct tty_struct *tty,
 static int  option_tiocmget(struct tty_struct *tty, struct file *file);
 static int  option_tiocmset(struct tty_struct *tty, struct file *file,
                                unsigned int set, unsigned int clear);
-static int  option_send_setup(struct tty_struct *tty, struct usb_serial_port *port);
+static int  option_send_setup(struct usb_serial_port *port);
 static int  option_suspend(struct usb_serial *serial, pm_message_t message);
 static int  option_resume(struct usb_serial *serial);
 
@@ -551,6 +552,7 @@ static struct usb_serial_driver option_1port_device = {
        .num_ports         = 1,
        .open              = option_open,
        .close             = option_close,
+       .dtr_rts           = option_dtr_rts,
        .write             = option_write,
        .write_room        = option_write_room,
        .chars_in_buffer   = option_chars_in_buffer,
@@ -630,7 +632,7 @@ static void option_set_termios(struct tty_struct *tty,
        dbg("%s", __func__);
        /* Doesn't support option setting */
        tty_termios_copy_hw(tty->termios, old_termios);
-       option_send_setup(tty, port);
+       option_send_setup(port);
 }
 
 static int option_tiocmget(struct tty_struct *tty, struct file *file)
@@ -669,7 +671,7 @@ static int option_tiocmset(struct tty_struct *tty, struct file *file,
                portdata->rts_state = 0;
        if (clear & TIOCM_DTR)
                portdata->dtr_state = 0;
-       return option_send_setup(tty, port);
+       return option_send_setup(port);
 }
 
 /* Write */
@@ -897,10 +899,6 @@ static int option_open(struct tty_struct *tty,
 
        dbg("%s", __func__);
 
-       /* Set some sane defaults */
-       portdata->rts_state = 1;
-       portdata->dtr_state = 1;
-
        /* Reset low level data toggle and start reading from endpoints */
        for (i = 0; i < N_IN_URB; i++) {
                urb = portdata->in_urbs[i];
@@ -936,37 +934,43 @@ static int option_open(struct tty_struct *tty,
                                usb_pipeout(urb->pipe), 0); */
        }
 
-       option_send_setup(tty, port);
+       option_send_setup(port);
 
        return 0;
 }
 
-static void option_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void option_dtr_rts(struct usb_serial_port *port, int on)
 {
-       int i;
        struct usb_serial *serial = port->serial;
        struct option_port_private *portdata;
 
        dbg("%s", __func__);
        portdata = usb_get_serial_port_data(port);
+       mutex_lock(&serial->disc_mutex);
+       portdata->rts_state = on;
+       portdata->dtr_state = on;
+       if (serial->dev)
+               option_send_setup(port);
+       mutex_unlock(&serial->disc_mutex);
+}
 
-       portdata->rts_state = 0;
-       portdata->dtr_state = 0;
 
-       if (serial->dev) {
-               mutex_lock(&serial->disc_mutex);
-               if (!serial->disconnected)
-                       option_send_setup(tty, port);
-               mutex_unlock(&serial->disc_mutex);
+static void option_close(struct usb_serial_port *port)
+{
+       int i;
+       struct usb_serial *serial = port->serial;
+       struct option_port_private *portdata;
+
+       dbg("%s", __func__);
+       portdata = usb_get_serial_port_data(port);
 
+       if (serial->dev) {
                /* Stop reading/writing urbs */
                for (i = 0; i < N_IN_URB; i++)
                        usb_kill_urb(portdata->in_urbs[i]);
                for (i = 0; i < N_OUT_URB; i++)
                        usb_kill_urb(portdata->out_urbs[i]);
        }
-       tty_port_tty_set(&port->port, NULL);
 }
 
 /* Helper functions used by option_setup_urbs */
@@ -1032,28 +1036,24 @@ static void option_setup_urbs(struct usb_serial *serial)
  * This is exactly the same as SET_CONTROL_LINE_STATE from the PSTN
  * CDC.
 */
-static int option_send_setup(struct tty_struct *tty,
-                                               struct usb_serial_port *port)
+static int option_send_setup(struct usb_serial_port *port)
 {
        struct usb_serial *serial = port->serial;
        struct option_port_private *portdata;
        int ifNum = serial->interface->cur_altsetting->desc.bInterfaceNumber;
+       int val = 0;
        dbg("%s", __func__);
 
        portdata = usb_get_serial_port_data(port);
 
-       if (tty) {
-               int val = 0;
-               if (portdata->dtr_state)
-                       val |= 0x01;
-               if (portdata->rts_state)
-                       val |= 0x02;
+       if (portdata->dtr_state)
+               val |= 0x01;
+       if (portdata->rts_state)
+               val |= 0x02;
 
-               return usb_control_msg(serial->dev,
-                       usb_rcvctrlpipe(serial->dev, 0),
-                       0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
-       }
-       return 0;
+       return usb_control_msg(serial->dev,
+               usb_rcvctrlpipe(serial->dev, 0),
+               0x22, 0x21, val, ifNum, NULL, 0, USB_CTRL_SET_TIMEOUT);
 }
 
 static int option_startup(struct usb_serial *serial)
index ba551f00f16ff1ec30fbcd90422f1e1112e8c00a..7de54781fe614e96d32ee69327822f02416c3801 100644 (file)
@@ -143,8 +143,7 @@ struct oti6858_control_pkt {
 /* function prototypes */
 static int oti6858_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void oti6858_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void oti6858_close(struct usb_serial_port *port);
 static void oti6858_set_termios(struct tty_struct *tty,
                        struct usb_serial_port *port, struct ktermios *old);
 static int oti6858_ioctl(struct tty_struct *tty, struct file *file,
@@ -622,67 +621,30 @@ static int oti6858_open(struct tty_struct *tty,
        if (result != 0) {
                dev_err(&port->dev, "%s(): usb_submit_urb() failed"
                               " with error %d\n", __func__, result);
-               oti6858_close(tty, port, NULL);
+               oti6858_close(port);
                return -EPROTO;
        }
 
        /* setup termios */
        if (tty)
                oti6858_set_termios(tty, port, &tmp_termios);
-
+       port->port.drain_delay = 256;   /* FIXME: check the FIFO length */
        return 0;
 }
 
-static void oti6858_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void oti6858_close(struct usb_serial_port *port)
 {
        struct oti6858_private *priv = usb_get_serial_port_data(port);
        unsigned long flags;
-       long timeout;
-       wait_queue_t wait;
 
        dbg("%s(port = %d)", __func__, port->number);
 
-       /* wait for data to drain from the buffer */
        spin_lock_irqsave(&priv->lock, flags);
-       timeout = 30 * HZ;      /* PL2303_CLOSING_WAIT */
-       init_waitqueue_entry(&wait, current);
-       add_wait_queue(&tty->write_wait, &wait);
-       dbg("%s(): entering wait loop", __func__);
-       for (;;) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (oti6858_buf_data_avail(priv->buf) == 0
-               || timeout == 0 || signal_pending(current)
-               || port->serial->disconnected)
-                       break;
-               spin_unlock_irqrestore(&priv->lock, flags);
-               timeout = schedule_timeout(timeout);
-               spin_lock_irqsave(&priv->lock, flags);
-       }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&tty->write_wait, &wait);
-       dbg("%s(): after wait loop", __func__);
-
        /* clear out any remaining data in the buffer */
        oti6858_buf_clear(priv->buf);
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* wait for characters to drain from the device */
-       /* (this is long enough for the entire 256 byte */
-       /* pl2303 hardware buffer to drain with no flow */
-       /* control for data rates of 1200 bps or more, */
-       /* for lower rates we should really know how much */
-       /* data is in the buffer to compute a delay */
-       /* that is not unnecessarily long) */
-       /* FIXME
-       bps = tty_get_baud_rate(tty);
-       if (bps > 1200)
-               timeout = max((HZ*2560)/bps,HZ/10);
-       else
-       */
-               timeout = 2*HZ;
-       schedule_timeout_interruptible(timeout);
-       dbg("%s(): after schedule_timeout_interruptible()", __func__);
+       dbg("%s(): after buf_clear()", __func__);
 
        /* cancel scheduled setup */
        cancel_delayed_work(&priv->delayed_setup_work);
@@ -694,15 +656,6 @@ static void oti6858_close(struct tty_struct *tty,
        usb_kill_urb(port->write_urb);
        usb_kill_urb(port->read_urb);
        usb_kill_urb(port->interrupt_in_urb);
-
-       /*
-       if (tty && (tty->termios->c_cflag) & HUPCL) {
-               // drop DTR and RTS
-               spin_lock_irqsave(&priv->lock, flags);
-               priv->pending_setup.control &= ~CONTROL_MASK;
-               spin_unlock_irqrestore(&priv->lock, flags);
-       }
-       */
 }
 
 static int oti6858_tiocmset(struct tty_struct *tty, struct file *file,
index 751a533a4347e7324725afb8948b75fcb2437bbd..e02dc3d643c7ae1279f467b21584e2899a82dfb7 100644 (file)
@@ -652,69 +652,41 @@ static void pl2303_set_termios(struct tty_struct *tty,
        kfree(buf);
 }
 
-static void pl2303_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+/*
+ * Raise (on != 0) or drop (on == 0) both DTR and RTS. The cached
+ * line_control is updated under priv->lock; the resulting value is passed
+ * to set_control_lines() after the lock has been released.
+ */
+static void pl2303_dtr_rts(struct usb_serial_port *port, int on)
+{
+       struct pl2303_private *pl = usb_get_serial_port_data(port);
+       unsigned long irqflags;
+       u8 lines;
+
+       spin_lock_irqsave(&pl->lock, irqflags);
+       if (!on)
+               pl->line_control &= ~(CONTROL_DTR | CONTROL_RTS);
+       else
+               pl->line_control |= (CONTROL_DTR | CONTROL_RTS);
+       lines = pl->line_control;
+       spin_unlock_irqrestore(&pl->lock, irqflags);
+
+       set_control_lines(port->serial->dev, lines);
+}
+
+static void pl2303_close(struct usb_serial_port *port)
 {
        struct pl2303_private *priv = usb_get_serial_port_data(port);
        unsigned long flags;
-       unsigned int c_cflag;
-       int bps;
-       long timeout;
-       wait_queue_t wait;
 
        dbg("%s - port %d", __func__, port->number);
 
-       /* wait for data to drain from the buffer */
        spin_lock_irqsave(&priv->lock, flags);
-       timeout = PL2303_CLOSING_WAIT;
-       init_waitqueue_entry(&wait, current);
-       add_wait_queue(&tty->write_wait, &wait);
-       for (;;) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (pl2303_buf_data_avail(priv->buf) == 0 ||
-                   timeout == 0 || signal_pending(current) ||
-                   port->serial->disconnected)
-                       break;
-               spin_unlock_irqrestore(&priv->lock, flags);
-               timeout = schedule_timeout(timeout);
-               spin_lock_irqsave(&priv->lock, flags);
-       }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&tty->write_wait, &wait);
        /* clear out any remaining data in the buffer */
        pl2303_buf_clear(priv->buf);
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* wait for characters to drain from the device */
-       /* (this is long enough for the entire 256 byte */
-       /* pl2303 hardware buffer to drain with no flow */
-       /* control for data rates of 1200 bps or more, */
-       /* for lower rates we should really know how much */
-       /* data is in the buffer to compute a delay */
-       /* that is not unnecessarily long) */
-       bps = tty_get_baud_rate(tty);
-       if (bps > 1200)
-               timeout = max((HZ*2560)/bps, HZ/10);
-       else
-               timeout = 2*HZ;
-       schedule_timeout_interruptible(timeout);
-
        /* shutdown our urbs */
        dbg("%s - shutting down urbs", __func__);
        usb_kill_urb(port->write_urb);
        usb_kill_urb(port->read_urb);
        usb_kill_urb(port->interrupt_in_urb);
 
-       if (tty) {
-               c_cflag = tty->termios->c_cflag;
-               if (c_cflag & HUPCL) {
-                       /* drop DTR and RTS */
-                       spin_lock_irqsave(&priv->lock, flags);
-                       priv->line_control = 0;
-                       spin_unlock_irqrestore(&priv->lock, flags);
-                       set_control_lines(port->serial->dev, 0);
-               }
-       }
 }
 
 static int pl2303_open(struct tty_struct *tty,
@@ -748,7 +720,7 @@ static int pl2303_open(struct tty_struct *tty,
        if (result) {
                dev_err(&port->dev, "%s - failed submitting read urb,"
                        " error %d\n", __func__, result);
-               pl2303_close(tty, port, NULL);
+               pl2303_close(port);
                return -EPROTO;
        }
 
@@ -758,9 +730,10 @@ static int pl2303_open(struct tty_struct *tty,
        if (result) {
                dev_err(&port->dev, "%s - failed submitting interrupt urb,"
                        " error %d\n", __func__, result);
-               pl2303_close(tty, port, NULL);
+               pl2303_close(port);
                return -EPROTO;
        }
+       port->port.drain_delay = 256;
        return 0;
 }
 
@@ -821,6 +794,14 @@ static int pl2303_tiocmget(struct tty_struct *tty, struct file *file)
        return result;
 }
 
+/*
+ * Carrier detect query: returns 1 when the UART_DCD bit is set in the
+ * cached line_status, 0 otherwise.
+ */
+static int pl2303_carrier_raised(struct usb_serial_port *port)
+{
+       struct pl2303_private *pl = usb_get_serial_port_data(port);
+
+       return (pl->line_status & UART_DCD) ? 1 : 0;
+}
+
 static int wait_modem_info(struct usb_serial_port *port, unsigned int arg)
 {
        struct pl2303_private *priv = usb_get_serial_port_data(port);
@@ -1125,6 +1106,8 @@ static struct usb_serial_driver pl2303_device = {
        .num_ports =            1,
        .open =                 pl2303_open,
        .close =                pl2303_close,
+       .dtr_rts =              pl2303_dtr_rts,
+       .carrier_raised =       pl2303_carrier_raised,
        .write =                pl2303_write,
        .ioctl =                pl2303_ioctl,
        .break_ctl =            pl2303_break_ctl,
index 913225c6161037ad78596f5ba68b9fa078f510b5..17ac34f4d66823deb49f02a571a09f28de07fe0a 100644 (file)
 #include <linux/module.h>
 #include <linux/usb.h>
 #include <linux/usb/serial.h>
-#include <linux/usb/ch9.h>
 
 #define SWIMS_USB_REQUEST_SetPower     0x00
 #define SWIMS_USB_REQUEST_SetNmea      0x07
 
-/* per port private data */
 #define N_IN_URB       4
 #define N_OUT_URB      4
 #define IN_BUFLEN      4096
 static int debug;
 static int nmea;
 
+/* Used in interface blacklisting: describes an array of interface numbers.
+ * is_blacklisted() compares an interface number against the first infolen
+ * entries of ifaceinfo.
+ */
+struct sierra_iface_info {
+       const u32 infolen;      /* number of interface numbers on blacklist */
+       const u8  *ifaceinfo;   /* pointer to the array holding the numbers */
+};
+
 static int sierra_set_power_state(struct usb_device *udev, __u16 swiState)
 {
        int result;
@@ -85,6 +89,23 @@ static int sierra_calc_num_ports(struct usb_serial *serial)
        return result;
 }
 
+/*
+ * Check whether interface number ifnum appears in the given blacklist.
+ * Returns 1 on a match and 0 otherwise, including when blacklist is NULL.
+ */
+static int is_blacklisted(const u8 ifnum,
+                               const struct sierra_iface_info *blacklist)
+{
+       int idx;
+
+       if (!blacklist)
+               return 0;
+
+       for (idx = 0; idx < blacklist->infolen; idx++)
+               if (blacklist->ifaceinfo[idx] == ifnum)
+                       return 1;
+
+       return 0;
+}
+
 static int sierra_calc_interface(struct usb_serial *serial)
 {
        int interface;
@@ -153,9 +174,25 @@ static int sierra_probe(struct usb_serial *serial,
         */
        usb_set_serial_data(serial, (void *)num_ports);
 
+       /* ifnum could have changed - by calling usb_set_interface */
+       ifnum = sierra_calc_interface(serial);
+
+       if (is_blacklisted(ifnum,
+                               (struct sierra_iface_info *)id->driver_info)) {
+               dev_dbg(&serial->dev->dev,
+                       "Ignoring blacklisted interface #%d\n", ifnum);
+               return -ENODEV;
+       }
+
        return result;
 }
 
+/* Interface numbers wrapped in a sierra_iface_info for is_blacklisted() */
+static const u8 direct_ip_non_serial_ifaces[] = { 7, 8, 9, 10, 11 };
+static const struct sierra_iface_info direct_ip_interface_blacklist = {
+       .infolen = ARRAY_SIZE(direct_ip_non_serial_ifaces),
+       .ifaceinfo = direct_ip_non_serial_ifaces,
+};
+
 static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x1199, 0x0017) }, /* Sierra Wireless EM5625 */
        { USB_DEVICE(0x1199, 0x0018) }, /* Sierra Wireless MC5720 */
@@ -188,9 +225,11 @@ static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x1199, 0x6833) }, /* Sierra Wireless MC8781 */
        { USB_DEVICE(0x1199, 0x683A) }, /* Sierra Wireless MC8785 */
        { USB_DEVICE(0x1199, 0x683B) }, /* Sierra Wireless MC8785 Composite */
-       { USB_DEVICE(0x1199, 0x683C) }, /* Sierra Wireless MC8790 */
-       { USB_DEVICE(0x1199, 0x683D) }, /* Sierra Wireless MC8790 */
-       { USB_DEVICE(0x1199, 0x683E) }, /* Sierra Wireless MC8790 */
+       /* Sierra Wireless MC8790, MC8791, MC8792 Composite */
+       { USB_DEVICE(0x1199, 0x683C) },
+       { USB_DEVICE(0x1199, 0x683D) }, /* Sierra Wireless MC8791 Composite */
+       /* Sierra Wireless MC8790, MC8791, MC8792 */
+       { USB_DEVICE(0x1199, 0x683E) },
        { USB_DEVICE(0x1199, 0x6850) }, /* Sierra Wireless AirCard 880 */
        { USB_DEVICE(0x1199, 0x6851) }, /* Sierra Wireless AirCard 881 */
        { USB_DEVICE(0x1199, 0x6852) }, /* Sierra Wireless AirCard 880 E */
@@ -211,6 +250,10 @@ static struct usb_device_id id_table [] = {
        { USB_DEVICE(0x1199, 0x0112) }, /* Sierra Wireless AirCard 580 */
        { USB_DEVICE(0x0F3D, 0x0112) }, /* Airprime/Sierra PC 5220 */
 
+       { USB_DEVICE(0x1199, 0x68A3),   /* Sierra Wireless Direct IP modems */
+         .driver_info = (kernel_ulong_t)&direct_ip_interface_blacklist
+       },
+
        { }
 };
 MODULE_DEVICE_TABLE(usb, id_table);
@@ -229,7 +272,6 @@ struct sierra_port_private {
 
        /* Input endpoints and buffers for this port */
        struct urb *in_urbs[N_IN_URB];
-       char *in_buffer[N_IN_URB];
 
        /* Settings for the port */
        int rts_state;  /* Handshaking pins (outputs) */
@@ -240,57 +282,50 @@ struct sierra_port_private {
        int ri_state;
 };
 
-static int sierra_send_setup(struct tty_struct *tty,
-                                               struct usb_serial_port *port)
+static int sierra_send_setup(struct usb_serial_port *port)
 {
        struct usb_serial *serial = port->serial;
        struct sierra_port_private *portdata;
        __u16 interface = 0;
+       int val = 0;
 
        dev_dbg(&port->dev, "%s", __func__);
 
        portdata = usb_get_serial_port_data(port);
 
-       if (tty) {
-               int val = 0;
-               if (portdata->dtr_state)
-                       val |= 0x01;
-               if (portdata->rts_state)
-                       val |= 0x02;
-
-               /* If composite device then properly report interface */
-               if (serial->num_ports == 1) {
-                       interface = sierra_calc_interface(serial);
-
-                       /* Control message is sent only to interfaces with
-                        * interrupt_in endpoints
-                        */
-                       if (port->interrupt_in_urb) {
-                               /* send control message */
-                               return usb_control_msg(serial->dev,
-                                       usb_rcvctrlpipe(serial->dev, 0),
-                                       0x22, 0x21, val, interface,
-                                       NULL, 0, USB_CTRL_SET_TIMEOUT);
-                       }
-               }
-
-               /* Otherwise the need to do non-composite mapping */
-               else {
-                       if (port->bulk_out_endpointAddress == 2)
-                               interface = 0;
-                       else if (port->bulk_out_endpointAddress == 4)
-                               interface = 1;
-                       else if (port->bulk_out_endpointAddress == 5)
-                               interface = 2;
+       if (portdata->dtr_state)
+               val |= 0x01;
+       if (portdata->rts_state)
+               val |= 0x02;
 
+       /* If composite device then properly report interface */
+       if (serial->num_ports == 1) {
+               interface = sierra_calc_interface(serial);
+               /* Control message is sent only to interfaces with
+                * interrupt_in endpoints
+                */
+               if (port->interrupt_in_urb) {
+                       /* send control message */
                        return usb_control_msg(serial->dev,
                                usb_rcvctrlpipe(serial->dev, 0),
                                0x22, 0x21, val, interface,
                                NULL, 0, USB_CTRL_SET_TIMEOUT);
-
                }
        }
 
+       /* Otherwise we need to do the non-composite mapping */
+       else {
+               if (port->bulk_out_endpointAddress == 2)
+                       interface = 0;
+               else if (port->bulk_out_endpointAddress == 4)
+                       interface = 1;
+               else if (port->bulk_out_endpointAddress == 5)
+                       interface = 2;
+               return usb_control_msg(serial->dev,
+                       usb_rcvctrlpipe(serial->dev, 0),
+                       0x22, 0x21, val, interface,
+                       NULL, 0, USB_CTRL_SET_TIMEOUT);
+       }
        return 0;
 }
 
@@ -299,7 +334,7 @@ static void sierra_set_termios(struct tty_struct *tty,
 {
        dev_dbg(&port->dev, "%s", __func__);
        tty_termios_copy_hw(tty->termios, old_termios);
-       sierra_send_setup(tty, port);
+       sierra_send_setup(port);
 }
 
 static int sierra_tiocmget(struct tty_struct *tty, struct file *file)
@@ -338,7 +373,18 @@ static int sierra_tiocmset(struct tty_struct *tty, struct file *file,
                portdata->rts_state = 0;
        if (clear & TIOCM_DTR)
                portdata->dtr_state = 0;
-       return sierra_send_setup(tty, port);
+       return sierra_send_setup(port);
+}
+
+/*
+ * Free an URB together with its transfer buffer; a NULL urb is ignored.
+ * urb->context is read as the usb_serial_port used for the debug message.
+ */
+static void sierra_release_urb(struct urb *urb)
+{
+       struct usb_serial_port *owner;
+
+       if (!urb)
+               return;
+
+       owner = urb->context;
+       dev_dbg(&owner->dev, "%s: %p\n", __func__, urb);
+       kfree(urb->transfer_buffer);
+       usb_free_urb(urb);
 }
 
 static void sierra_outdat_callback(struct urb *urb)
@@ -465,7 +511,7 @@ static void sierra_indat_callback(struct urb *urb)
                                " received", __func__);
 
                /* Resubmit urb so we continue receiving */
-               if (port->port.count && status != -ESHUTDOWN) {
+               if (port->port.count && status != -ESHUTDOWN && status != -EPERM) {
                        err = usb_submit_urb(urb, GFP_ATOMIC);
                        if (err)
                                dev_err(&port->dev, "resubmit read urb failed."
@@ -557,67 +603,99 @@ static int sierra_write_room(struct tty_struct *tty)
        return 2048;
 }
 
-static int sierra_open(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void sierra_stop_rx_urbs(struct usb_serial_port *port)
 {
-       struct sierra_port_private *portdata;
-       struct usb_serial *serial = port->serial;
        int i;
-       struct urb *urb;
-       int result;
+       struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-       portdata = usb_get_serial_port_data(port);
+       for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++)
+               usb_kill_urb(portdata->in_urbs[i]);
 
-       dev_dbg(&port->dev, "%s", __func__);
+       usb_kill_urb(port->interrupt_in_urb);
+}
 
-       /* Set some sane defaults */
-       portdata->rts_state = 1;
-       portdata->dtr_state = 1;
+static int sierra_submit_rx_urbs(struct usb_serial_port *port, gfp_t mem_flags)
+{
+       int ok_cnt;
+       int err = -EINVAL;
+       int i;
+       struct urb *urb;
+       struct sierra_port_private *portdata = usb_get_serial_port_data(port);
 
-       /* Reset low level data toggle and start reading from endpoints */
-       for (i = 0; i < N_IN_URB; i++) {
+       ok_cnt = 0;
+       for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
                urb = portdata->in_urbs[i];
                if (!urb)
                        continue;
-               if (urb->dev != serial->dev) {
-                       dev_dbg(&port->dev, "%s: dev %p != %p",
-                                __func__, urb->dev, serial->dev);
-                       continue;
+               err = usb_submit_urb(urb, mem_flags);
+               if (err) {
+                       dev_err(&port->dev, "%s: submit urb failed: %d\n",
+                               __func__, err);
+               } else {
+                       ok_cnt++;
                }
+       }
 
-               /*
-                * make sure endpoint data toggle is synchronized with the
-                * device
-                */
-               usb_clear_halt(urb->dev, urb->pipe);
-
-               result = usb_submit_urb(urb, GFP_KERNEL);
-               if (result) {
-                       dev_err(&port->dev, "submit urb %d failed (%d) %d\n",
-                               i, result, urb->transfer_buffer_length);
+       if (ok_cnt && port->interrupt_in_urb) {
+               err = usb_submit_urb(port->interrupt_in_urb, mem_flags);
+               if (err) {
+                       dev_err(&port->dev, "%s: submit intr urb failed: %d\n",
+                               __func__, err);
                }
        }
 
-       sierra_send_setup(tty, port);
+       if (ok_cnt > 0) /* at least one rx urb submitted */
+               return 0;
+       else
+               return err;
+}
+
+static struct urb *sierra_setup_urb(struct usb_serial *serial, int endpoint,
+                                       int dir, void *ctx, int len,
+                                       gfp_t mem_flags,
+                                       usb_complete_t callback)
+{
+       struct urb      *urb;
+       u8              *buf;
+
+       if (endpoint == -1)
+               return NULL;
 
-       /* start up the interrupt endpoint if we have one */
-       if (port->interrupt_in_urb) {
-               result = usb_submit_urb(port->interrupt_in_urb, GFP_KERNEL);
-               if (result)
-                       dev_err(&port->dev, "submit irq_in urb failed %d\n",
-                               result);
+       urb = usb_alloc_urb(0, mem_flags);
+       if (urb == NULL) {
+               dev_dbg(&serial->dev->dev, "%s: alloc for endpoint %d failed\n",
+                       __func__, endpoint);
+               return NULL;
        }
-       return 0;
+
+       buf = kmalloc(len, mem_flags);
+       if (buf) {
+               /* Fill URB using supplied data */
+               usb_fill_bulk_urb(urb, serial->dev,
+                       usb_sndbulkpipe(serial->dev, endpoint) | dir,
+                       buf, len, callback, ctx);
+
+               /* debug */
+               dev_dbg(&serial->dev->dev, "%s %c u : %p d:%p\n", __func__,
+                               dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+       } else {
+               dev_dbg(&serial->dev->dev, "%s %c u:%p d:%p\n", __func__,
+                               dir == USB_DIR_IN ? 'i' : 'o', urb, buf);
+
+               sierra_release_urb(urb);
+               urb = NULL;
+       }
+
+       return urb;
 }
 
-static void sierra_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void sierra_close(struct usb_serial_port *port)
 {
        int i;
        struct usb_serial *serial = port->serial;
        struct sierra_port_private *portdata;
 
-       dev_dbg(&port->dev, "%s", __func__);
+       dev_dbg(&port->dev, "%s\n", __func__);
        portdata = usb_get_serial_port_data(port);
 
        portdata->rts_state = 0;
@@ -626,25 +704,83 @@ static void sierra_close(struct tty_struct *tty,
        if (serial->dev) {
                mutex_lock(&serial->disc_mutex);
                if (!serial->disconnected)
-                       sierra_send_setup(tty, port);
+                       sierra_send_setup(port);
                mutex_unlock(&serial->disc_mutex);
 
-               /* Stop reading/writing urbs */
-               for (i = 0; i < N_IN_URB; i++)
-                       usb_kill_urb(portdata->in_urbs[i]);
+               /* Stop reading urbs */
+               sierra_stop_rx_urbs(port);
+               /* .. and release them */
+               for (i = 0; i < N_IN_URB; i++) {
+                       sierra_release_urb(portdata->in_urbs[i]);
+                       portdata->in_urbs[i] = NULL;
+               }
        }
+}
 
-       usb_kill_urb(port->interrupt_in_urb);
-       tty_port_tty_set(&port->port, NULL);
+/*
+ * Open a port: mark RTS and DTR as asserted, allocate the bulk-in URBs,
+ * clear a halt condition on the bulk-in endpoint, submit the URBs and send
+ * the control-line setup to the device.
+ *
+ * Returns 0 on success, or the error from sierra_submit_rx_urbs(); in that
+ * case sierra_close() has already been called to release the URBs.
+ */
+static int sierra_open(struct tty_struct *tty,
+                       struct usb_serial_port *port, struct file *filp)
+{
+       struct sierra_port_private *portdata;
+       struct usb_serial *serial = port->serial;
+       int i;
+       int err;
+       int endpoint;
+       struct urb *urb;
+
+       portdata = usb_get_serial_port_data(port);
+
+       /* newline-terminated, matching the dev_dbg in sierra_close() */
+       dev_dbg(&port->dev, "%s\n", __func__);
+
+       /* Set some sane defaults */
+       portdata->rts_state = 1;
+       portdata->dtr_state = 1;
+
+       /* a failed allocation leaves a NULL slot, which submit skips */
+       endpoint = port->bulk_in_endpointAddress;
+       for (i = 0; i < ARRAY_SIZE(portdata->in_urbs); i++) {
+               urb = sierra_setup_urb(serial, endpoint, USB_DIR_IN, port,
+                                       IN_BUFLEN, GFP_KERNEL,
+                                       sierra_indat_callback);
+               portdata->in_urbs[i] = urb;
+       }
+       /* clear halt condition */
+       usb_clear_halt(serial->dev,
+                       usb_sndbulkpipe(serial->dev, endpoint) | USB_DIR_IN);
+
+       err = sierra_submit_rx_urbs(port, GFP_KERNEL);
+       if (err) {
+               /* get rid of everything as in close */
+               sierra_close(port);
+               return err;
+       }
+       sierra_send_setup(port);
+
+       return 0;
+}
+
+
+/*
+ * Set both the RTS and DTR state to `on` and, when serial->dev is set and
+ * the device is not marked disconnected, send the new state with
+ * sierra_send_setup().
+ */
+static void sierra_dtr_rts(struct usb_serial_port *port, int on)
+{
+       struct sierra_port_private *pd = usb_get_serial_port_data(port);
+       struct usb_serial *ser = port->serial;
+
+       pd->rts_state = on;
+       pd->dtr_state = on;
+
+       if (!ser->dev)
+               return;
+
+       /* the disconnected flag is checked with disc_mutex held */
+       mutex_lock(&ser->disc_mutex);
+       if (!ser->disconnected)
+               sierra_send_setup(port);
+       mutex_unlock(&ser->disc_mutex);
 }
 
 static int sierra_startup(struct usb_serial *serial)
 {
        struct usb_serial_port *port;
        struct sierra_port_private *portdata;
-       struct urb *urb;
        int i;
-       int j;
 
        dev_dbg(&serial->dev->dev, "%s", __func__);
 
@@ -666,34 +802,8 @@ static int sierra_startup(struct usb_serial *serial)
                        return -ENOMEM;
                }
                spin_lock_init(&portdata->lock);
-               for (j = 0; j < N_IN_URB; j++) {
-                       portdata->in_buffer[j] = kmalloc(IN_BUFLEN, GFP_KERNEL);
-                       if (!portdata->in_buffer[j]) {
-                               for (--j; j >= 0; j--)
-                                       kfree(portdata->in_buffer[j]);
-                               kfree(portdata);
-                               return -ENOMEM;
-                       }
-               }
-
+               /* Set the port private data pointer */
                usb_set_serial_port_data(port, portdata);
-
-               /* initialize the in urbs */
-               for (j = 0; j < N_IN_URB; ++j) {
-                       urb = usb_alloc_urb(0, GFP_KERNEL);
-                       if (urb == NULL) {
-                               dev_dbg(&port->dev, "%s: alloc for in "
-                                       "port failed.", __func__);
-                               continue;
-                       }
-                       /* Fill URB using supplied data. */
-                       usb_fill_bulk_urb(urb, serial->dev,
-                                         usb_rcvbulkpipe(serial->dev,
-                                               port->bulk_in_endpointAddress),
-                                         portdata->in_buffer[j], IN_BUFLEN,
-                                         sierra_indat_callback, port);
-                       portdata->in_urbs[j] = urb;
-               }
        }
 
        return 0;
@@ -701,7 +811,7 @@ static int sierra_startup(struct usb_serial *serial)
 
 static void sierra_shutdown(struct usb_serial *serial)
 {
-       int i, j;
+       int i;
        struct usb_serial_port *port;
        struct sierra_port_private *portdata;
 
@@ -714,12 +824,6 @@ static void sierra_shutdown(struct usb_serial *serial)
                portdata = usb_get_serial_port_data(port);
                if (!portdata)
                        continue;
-
-               for (j = 0; j < N_IN_URB; j++) {
-                       usb_kill_urb(portdata->in_urbs[j]);
-                       usb_free_urb(portdata->in_urbs[j]);
-                       kfree(portdata->in_buffer[j]);
-               }
                kfree(portdata);
                usb_set_serial_port_data(port, NULL);
        }
@@ -737,6 +841,7 @@ static struct usb_serial_driver sierra_device = {
        .probe             = sierra_probe,
        .open              = sierra_open,
        .close             = sierra_close,
+       .dtr_rts           = sierra_dtr_rts,
        .write             = sierra_write,
        .write_room        = sierra_write_room,
        .set_termios       = sierra_set_termios,
index 5e7528cc81a8e624f8ef64a869eca06b18918126..8f7ed8f13996185857021d7d2f730c08f60b2403 100644 (file)
@@ -446,66 +446,47 @@ static void spcp8x5_set_workMode(struct usb_device *dev, u16 value,
                        "RTSCTS usb_control_msg(enable flowctrl) = %d\n", ret);
 }
 
+/*
+ * Carrier detect query: returns 1 when MSR_STATUS_LINE_DCD is set in the
+ * cached line_status, 0 otherwise.
+ */
+static int spcp8x5_carrier_raised(struct usb_serial_port *port)
+{
+       struct spcp8x5_private *sp = usb_get_serial_port_data(port);
+
+       return (sp->line_status & MSR_STATUS_LINE_DCD) ? 1 : 0;
+}
+
+/*
+ * Raise (on != 0) or drop (on == 0) DTR and RTS. Only these two bits of
+ * the cached line_control are modified, under priv->lock, so both branches
+ * preserve any other bits. The resulting value is passed to
+ * spcp8x5_set_ctrlLine() after the lock has been released.
+ */
+static void spcp8x5_dtr_rts(struct usb_serial_port *port, int on)
+{
+       struct spcp8x5_private *priv = usb_get_serial_port_data(port);
+       unsigned long flags;
+       u8 control;
+
+       spin_lock_irqsave(&priv->lock, flags);
+       if (on)
+               priv->line_control |= MCR_CONTROL_LINE_DTR
+                                               | MCR_CONTROL_LINE_RTS;
+       else
+               priv->line_control &= ~(MCR_CONTROL_LINE_DTR
+                                               | MCR_CONTROL_LINE_RTS);
+       control = priv->line_control;
+       spin_unlock_irqrestore(&priv->lock, flags);
+       spcp8x5_set_ctrlLine(port->serial->dev, control, priv->type);
+}
+
 /* close the serial port. We should wait for data sending to device 1st and
  * then kill all urb. */
-static void spcp8x5_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void spcp8x5_close(struct usb_serial_port *port)
 {
        struct spcp8x5_private *priv = usb_get_serial_port_data(port);
        unsigned long flags;
-       unsigned int c_cflag;
-       int bps;
-       long timeout;
-       wait_queue_t wait;
        int result;
 
        dbg("%s - port %d", __func__, port->number);
 
-       /* wait for data to drain from the buffer */
        spin_lock_irqsave(&priv->lock, flags);
-       timeout = SPCP8x5_CLOSING_WAIT;
-       init_waitqueue_entry(&wait, current);
-       add_wait_queue(&tty->write_wait, &wait);
-       for (;;) {
-               set_current_state(TASK_INTERRUPTIBLE);
-               if (ringbuf_avail_data(priv->buf) == 0 ||
-                   timeout == 0 || signal_pending(current))
-                       break;
-               spin_unlock_irqrestore(&priv->lock, flags);
-               timeout = schedule_timeout(timeout);
-               spin_lock_irqsave(&priv->lock, flags);
-       }
-       set_current_state(TASK_RUNNING);
-       remove_wait_queue(&tty->write_wait, &wait);
-
        /* clear out any remaining data in the buffer */
        clear_ringbuf(priv->buf);
        spin_unlock_irqrestore(&priv->lock, flags);
 
-       /* wait for characters to drain from the device (this is long enough
-        * for the entire all byte spcp8x5 hardware buffer to drain with no
-        * flow control for data rates of 1200 bps or more, for lower rates we
-        * should really know how much data is in the buffer to compute a delay
-        * that is not unnecessarily long) */
-       bps = tty_get_baud_rate(tty);
-       if (bps > 1200)
-               timeout = max((HZ*2560) / bps, HZ/10);
-       else
-               timeout = 2*HZ;
-       set_current_state(TASK_INTERRUPTIBLE);
-       schedule_timeout(timeout);
-
-       /* clear control lines */
-       if (tty) {
-               c_cflag = tty->termios->c_cflag;
-               if (c_cflag & HUPCL) {
-                       spin_lock_irqsave(&priv->lock, flags);
-                       priv->line_control = 0;
-                       spin_unlock_irqrestore(&priv->lock, flags);
-                       spcp8x5_set_ctrlLine(port->serial->dev, 0 , priv->type);
-               }
-       }
-
        /* kill urb */
        if (port->write_urb != NULL) {
                result = usb_unlink_urb(port->write_urb);
@@ -665,13 +646,6 @@ static int spcp8x5_open(struct tty_struct *tty,
        if (ret)
                return ret;
 
-       spin_lock_irqsave(&priv->lock, flags);
-       if (tty && (tty->termios->c_cflag & CBAUD))
-               priv->line_control = MCR_DTR | MCR_RTS;
-       else
-               priv->line_control = 0;
-       spin_unlock_irqrestore(&priv->lock, flags);
-
        spcp8x5_set_ctrlLine(serial->dev, priv->line_control , priv->type);
 
        /* Setup termios */
@@ -691,9 +665,10 @@ static int spcp8x5_open(struct tty_struct *tty,
        port->read_urb->dev = serial->dev;
        ret = usb_submit_urb(port->read_urb, GFP_KERNEL);
        if (ret) {
-               spcp8x5_close(tty, port, NULL);
+               spcp8x5_close(port);
                return -EPROTO;
        }
+       port->port.drain_delay = 256;
        return 0;
 }
 
@@ -1033,6 +1008,8 @@ static struct usb_serial_driver spcp8x5_device = {
        .num_ports              = 1,
        .open                   = spcp8x5_open,
        .close                  = spcp8x5_close,
+       .dtr_rts                = spcp8x5_dtr_rts,
+       .carrier_raised         = spcp8x5_carrier_raised,
        .write                  = spcp8x5_write,
        .set_termios            = spcp8x5_set_termios,
        .ioctl                  = spcp8x5_ioctl,
index 69879e4379402f63ad53579c6a1184d762322260..8b07ebc6baeb4e40ba036e07cacee5414b508628 100644 (file)
@@ -152,8 +152,7 @@ static int symbol_open(struct tty_struct *tty, struct usb_serial_port *port,
        return result;
 }
 
-static void symbol_close(struct tty_struct *tty, struct usb_serial_port *port,
-                         struct file *filp)
+static void symbol_close(struct usb_serial_port *port)
 {
        struct symbol_private *priv = usb_get_serial_data(port->serial);
 
index 0a64bac306ee693f02e9b11ce20a2a70cf452965..42cb04c403beee3900e6fa33451913d0e0a3b29e 100644 (file)
@@ -100,8 +100,7 @@ static int ti_startup(struct usb_serial *serial);
 static void ti_shutdown(struct usb_serial *serial);
 static int ti_open(struct tty_struct *tty, struct usb_serial_port *port,
                struct file *file);
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-               struct file *file);
+static void ti_close(struct usb_serial_port *port);
 static int ti_write(struct tty_struct *tty, struct usb_serial_port *port,
                const unsigned char *data, int count);
 static int ti_write_room(struct tty_struct *tty);
@@ -647,8 +646,7 @@ release_lock:
 }
 
 
-static void ti_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                                       struct file *file)
+static void ti_close(struct usb_serial_port *port)
 {
        struct ti_device *tdev;
        struct ti_port *tport;
index f331e2bde88acbb0c2b4e36fee01eff68ffc9b5c..1967a7edc10c51fab8263aafc43a425a2b28e686 100644 (file)
@@ -238,9 +238,11 @@ static int serial_open (struct tty_struct *tty, struct file *filp)
                        goto bailout_interface_put;
                mutex_unlock(&serial->disc_mutex);
        }
-
        mutex_unlock(&port->mutex);
-       return 0;
+       /* Now do the correct tty layer semantics */
+       retval = tty_port_block_til_ready(&port->port, tty, filp);
+       if (retval == 0)
+               return 0;
 
 bailout_interface_put:
        usb_autopm_put_interface(serial->interface);
@@ -259,64 +261,89 @@ bailout_serial_put:
        return retval;
 }
 
-static void serial_close(struct tty_struct *tty, struct file *filp)
+/**
+ *     serial_do_down          -       shut down hardware
+ *     @port: port to shut down
+ *
+ *     Shut down a USB port unless it is the console. We never shut down the
+ *     console hardware as it will always be in use.
+ *
+ *     Don't free any resources at this point
+ */
+static void serial_do_down(struct usb_serial_port *port)
 {
-       struct usb_serial_port *port = tty->driver_data;
+       struct usb_serial_driver *drv = port->serial->type;
        struct usb_serial *serial;
        struct module *owner;
-       int count;
 
-       if (!port)
+       /* The console is magical, do not hang up the console hardware
+          or there will be tears */
+       if (port->console)
                return;
 
-       dbg("%s - port %d", __func__, port->number);
-
        mutex_lock(&port->mutex);
        serial = port->serial;
        owner = serial->type->driver.owner;
 
-       if (port->port.count == 0) {
-               mutex_unlock(&port->mutex);
-               return;
-       }
-
-       if (port->port.count == 1)
-               /* only call the device specific close if this
-                * port is being closed by the last owner. Ensure we do
-                * this before we drop the port count. The call is protected
-                * by the port mutex
-                */
-               serial->type->close(tty, port, filp);
-
-       if (port->port.count == (port->console ? 2 : 1)) {
-               struct tty_struct *tty = tty_port_tty_get(&port->port);
-               if (tty) {
-                       /* We must do this before we drop the port count to
-                          zero. */
-                       if (tty->driver_data)
-                               tty->driver_data = NULL;
-                       tty_port_tty_set(&port->port, NULL);
-                       tty_kref_put(tty);
-               }
-       }
+       if (drv->close)
+               drv->close(port);
 
-       --port->port.count;
-       count = port->port.count;
        mutex_unlock(&port->mutex);
-       put_device(&port->dev);
+}
+
+/**
+ *     serial_do_free          -       free resources post close/hangup
+ *     @port: port to free up
+ *
+ *     Do the resource freeing and refcount dropping for the port. We must
+ *     be careful about ordering and we must avoid freeing up the console.
+ */
 
+static void serial_do_free(struct usb_serial_port *port)
+{
+       struct usb_serial *serial;
+       struct module *owner;
+
+       /* The console is magical, do not hang up the console hardware
+          or there will be tears */
+       if (port->console)
+               return;
+
+       serial = port->serial;
+       owner = serial->type->driver.owner;
+       put_device(&port->dev);
        /* Mustn't dereference port any more */
-       if (count == 0) {
-               mutex_lock(&serial->disc_mutex);
-               if (!serial->disconnected)
-                       usb_autopm_put_interface(serial->interface);
-               mutex_unlock(&serial->disc_mutex);
-       }
+       mutex_lock(&serial->disc_mutex);
+       if (!serial->disconnected)
+               usb_autopm_put_interface(serial->interface);
+       mutex_unlock(&serial->disc_mutex);
        usb_serial_put(serial);
-
        /* Mustn't dereference serial any more */
-       if (count == 0)
-               module_put(owner);
+       module_put(owner);
+}
+
+/**
+ *     serial_close            -       tty layer close method
+ *     @tty: tty being closed
+ *     @filp: file handle doing the close
+ *
+ *     Nothing is torn down when tty_port_close_start() returns 0.
+ *     Otherwise the hardware is shut down, the tty_port close is
+ *     completed, the tty is detached from the port and the port is
+ *     released through serial_do_free().
+ */
+static void serial_close(struct tty_struct *tty, struct file *filp)
+{
+       struct usb_serial_port *port = tty->driver_data;
+
+       /* driver_data may be NULL; the previous implementation of this
+          function guarded against it and the guard must not be lost */
+       if (!port)
+               return;
+
+       dbg("%s - port %d", __func__, port->number);
+
+       if (tty_port_close_start(&port->port, tty, filp) == 0)
+               return;
+
+       serial_do_down(port);
+       tty_port_close_end(&port->port, tty);
+       tty_port_tty_set(&port->port, NULL);
+       serial_do_free(port);
+}
+
+/* tty hangup method: shut the port hardware down, let the tty_port layer
+   process the hangup, then release the port through serial_do_free().
+   NOTE(review): driver_data is not checked for NULL here - confirm that
+   a hangup cannot arrive on a tty whose driver_data was cleared. */
+static void serial_hangup(struct tty_struct *tty)
+{
+       struct usb_serial_port *port = tty->driver_data;
+       serial_do_down(port);
+       tty_port_hangup(&port->port);
+       serial_do_free(port);
 }
 
 static int serial_write(struct tty_struct *tty, const unsigned char *buf,
@@ -648,6 +675,29 @@ static struct usb_serial_driver *search_serial_device(
        return NULL;
 }
 
+/* tty_port carrier_raised hook: hand the question to the USB serial
+   sub-driver, reporting carrier as raised when it has no such method */
+static int serial_carrier_raised(struct tty_port *port)
+{
+       struct usb_serial_port *usp;
+
+       usp = container_of(port, struct usb_serial_port, port);
+
+       /* No carrier control - don't block */
+       if (!usp->serial->type->carrier_raised)
+               return 1;
+
+       return usp->serial->type->carrier_raised(usp);
+}
+
+/* tty_port dtr_rts hook: forward the request to the sub-driver's dtr_rts
+   method; nothing happens for drivers that do not provide one */
+static void serial_dtr_rts(struct tty_port *port, int on)
+{
+       struct usb_serial_port *usp;
+       struct usb_serial_driver *type;
+
+       usp = container_of(port, struct usb_serial_port, port);
+       type = usp->serial->type;
+
+       if (!type->dtr_rts)
+               return;
+
+       type->dtr_rts(usp, on);
+}
+
+static const struct tty_port_operations serial_port_ops = {
+       .carrier_raised = serial_carrier_raised,
+       .dtr_rts = serial_dtr_rts,
+};
+
 int usb_serial_probe(struct usb_interface *interface,
                               const struct usb_device_id *id)
 {
@@ -841,6 +891,7 @@ int usb_serial_probe(struct usb_interface *interface,
                if (!port)
                        goto probe_error;
                tty_port_init(&port->port);
+               port->port.ops = &serial_port_ops;
                port->serial = serial;
                spin_lock_init(&port->lock);
                mutex_init(&port->mutex);
@@ -1071,6 +1122,9 @@ void usb_serial_disconnect(struct usb_interface *interface)
                if (port) {
                        struct tty_struct *tty = tty_port_tty_get(&port->port);
                        if (tty) {
+                               /* The hangup will occur asynchronously but
+                                  the object refcounts will sort out all the
+                                  cleanup */
                                tty_hangup(tty);
                                tty_kref_put(tty);
                        }
@@ -1135,6 +1189,7 @@ static const struct tty_operations serial_ops = {
        .open =                 serial_open,
        .close =                serial_close,
        .write =                serial_write,
+       .hangup =               serial_hangup,
        .write_room =           serial_write_room,
        .ioctl =                serial_ioctl,
        .set_termios =          serial_set_termios,
@@ -1147,6 +1202,7 @@ static const struct tty_operations serial_ops = {
        .proc_fops =            &serial_proc_fops,
 };
 
+
 struct tty_driver *usb_serial_tty_driver;
 
 static int __init usb_serial_init(void)
index 5ac414bda718103191d6489f613e3d5d3c361dd6..b15f1c0e1d4acba4adcbadf66ac98a9870e8c994 100644 (file)
@@ -38,8 +38,7 @@
 /* function prototypes for a handspring visor */
 static int  visor_open(struct tty_struct *tty, struct usb_serial_port *port,
                                        struct file *filp);
-static void visor_close(struct tty_struct *tty, struct usb_serial_port *port,
-                                       struct file *filp);
+static void visor_close(struct usb_serial_port *port);
 static int  visor_write(struct tty_struct *tty, struct usb_serial_port *port,
                                        const unsigned char *buf, int count);
 static int  visor_write_room(struct tty_struct *tty);
@@ -324,8 +323,7 @@ exit:
 }
 
 
-static void visor_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void visor_close(struct usb_serial_port *port)
 {
        struct visor_private *priv = usb_get_serial_port_data(port);
        unsigned char *transfer_buffer;
index 5335d3211c073c1c661d895aeaa0488fa0196ba1..7c7295d09f344cf7b4b167346e140e75d6394207 100644 (file)
@@ -147,8 +147,7 @@ static int  whiteheat_attach(struct usb_serial *serial);
 static void whiteheat_shutdown(struct usb_serial *serial);
 static int  whiteheat_open(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-static void whiteheat_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+static void whiteheat_close(struct usb_serial_port *port);
 static int  whiteheat_write(struct tty_struct *tty,
                        struct usb_serial_port *port,
                        const unsigned char *buf, int count);
@@ -712,8 +711,7 @@ exit:
 }
 
 
-static void whiteheat_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp)
+static void whiteheat_close(struct usb_serial_port *port)
 {
        struct whiteheat_private *info = usb_get_serial_port_data(port);
        struct whiteheat_urb_wrap *wrap;
@@ -723,31 +721,7 @@ static void whiteheat_close(struct tty_struct *tty,
 
        dbg("%s - port %d", __func__, port->number);
 
-       mutex_lock(&port->serial->disc_mutex);
-       /* filp is NULL when called from usb_serial_disconnect */
-       if ((filp && (tty_hung_up_p(filp))) || port->serial->disconnected) {
-               mutex_unlock(&port->serial->disc_mutex);
-               return;
-       }
-       mutex_unlock(&port->serial->disc_mutex);
-
-       tty->closing = 1;
-
-/*
- * Not currently in use; tty_wait_until_sent() calls
- * serial_chars_in_buffer() which deadlocks on the second semaphore
- * acquisition. This should be fixed at some point. Greg's been
- * notified.
-       if ((filp->f_flags & (O_NDELAY | O_NONBLOCK)) == 0) {
-               tty_wait_until_sent(tty, CLOSING_DELAY);
-       }
-*/
-
-       tty_driver_flush_buffer(tty);
-       tty_ldisc_flush(tty);
-
        firm_report_tx_done(port);
-
        firm_close(port);
 
        /* shutdown our bulk reads and writes */
@@ -775,10 +749,7 @@ static void whiteheat_close(struct tty_struct *tty,
        }
        spin_unlock_irq(&info->lock);
        mutex_unlock(&info->deathwarrant);
-
        stop_command_port(port->serial);
-
-       tty->closing = 0;
 }
 
 
index 8ac9cddac5754fdbf8ef9df33550a47bfba318f4..cab100acf983905b53d5c85b6ae76496d7b9b7c8 100644 (file)
@@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES
          secure, but slightly less efficient.
          If in doubt, say yes.
 
+config XEN_DEV_EVTCHN
+       tristate "Xen /dev/xen/evtchn device"
+       depends on XEN
+       default y
+       help
+         The evtchn driver allows a userspace process to trigger event
+         channels and to receive notification of an event channel
+         firing.
+         If in doubt, say yes.
+
 config XENFS
        tristate "Xen filesystem"
        depends on XEN
@@ -41,3 +51,13 @@ config XEN_COMPAT_XENFS
          a xen platform.
          If in doubt, say yes.
 
+config XEN_SYS_HYPERVISOR
+       bool "Create xen entries under /sys/hypervisor"
+       depends on XEN && SYSFS
+       select SYS_HYPERVISOR
+       default y
+       help
+         Create entries under /sys/hypervisor describing the Xen
+        hypervisor environment.  When running native or in another
+        virtual environment, /sys/hypervisor will still be present,
+        but will have no xen contents.
\ No newline at end of file
index ff8accc9e103f9d2a4fc21dcbcc2e88d479a0ade..ec2a39b1e26f0438f6a2266cd77ff6173fd7bbc7 100644 (file)
@@ -4,4 +4,6 @@ obj-y   += xenbus/
 obj-$(CONFIG_HOTPLUG_CPU)      += cpu_hotplug.o
 obj-$(CONFIG_XEN_XENCOMM)      += xencomm.o
 obj-$(CONFIG_XEN_BALLOON)      += balloon.o
-obj-$(CONFIG_XENFS)            += xenfs/
\ No newline at end of file
+obj-$(CONFIG_XEN_DEV_EVTCHN)   += evtchn.o
+obj-$(CONFIG_XENFS)            += xenfs/
+obj-$(CONFIG_XEN_SYS_HYPERVISOR)       += sys-hypervisor.o
\ No newline at end of file
index 30963af5dba02960143f9ebf37b6f85d44de0dec..891d2e90753ab6e9dd312b3120ce6904a96058c2 100644 (file)
@@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq)
        return info_for_irq(irq)->evtchn;
 }
 
+/*
+ * Return the irq recorded for @evtchn in evtchn_to_irq[].
+ * The index is used as given; no range check is done here.
+ */
+unsigned irq_from_evtchn(unsigned int evtchn)
+{
+       return evtchn_to_irq[evtchn];
+}
+EXPORT_SYMBOL_GPL(irq_from_evtchn);
+
 static enum ipi_vector ipi_from_irq(unsigned irq)
 {
        struct irq_info *info = info_for_irq(irq);
@@ -335,7 +341,7 @@ static int find_unbound_irq(void)
        if (irq == nr_irqs)
                panic("No available IRQ to bind to: increase nr_irqs!\n");
 
-       desc = irq_to_desc_alloc_cpu(irq, 0);
+       desc = irq_to_desc_alloc_node(irq, 0);
        if (WARN_ON(desc == NULL))
                return -1;
 
@@ -688,13 +694,13 @@ void rebind_evtchn_irq(int evtchn, int irq)
 }
 
 /* Rebind an evtchn so that it gets delivered to a specific cpu */
-static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
+static int rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
 {
        struct evtchn_bind_vcpu bind_vcpu;
        int evtchn = evtchn_from_irq(irq);
 
        if (!VALID_EVTCHN(evtchn))
-               return;
+               return -1;
 
        /* Send future instances of this interrupt to other vcpu. */
        bind_vcpu.port = evtchn;
@@ -707,13 +713,15 @@ static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu)
         */
        if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0)
                bind_evtchn_to_cpu(evtchn, tcpu);
-}
 
+       return 0;
+}
 
-static void set_affinity_irq(unsigned irq, const struct cpumask *dest)
+static int set_affinity_irq(unsigned irq, const struct cpumask *dest)
 {
        unsigned tcpu = cpumask_first(dest);
-       rebind_irq_to_cpu(irq, tcpu);
+
+       return rebind_irq_to_cpu(irq, tcpu);
 }
 
 int resend_irq_on_evtchn(unsigned int irq)
diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c
new file mode 100644 (file)
index 0000000..af03195
--- /dev/null
@@ -0,0 +1,507 @@
+/******************************************************************************
+ * evtchn.c
+ *
+ * Driver for receiving and demuxing event-channel signals.
+ *
+ * Copyright (c) 2004-2005, K A Fraser
+ * Multi-process extensions Copyright (c) 2004, Steven Smith
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/fs.h>
+#include <linux/errno.h>
+#include <linux/miscdevice.h>
+#include <linux/major.h>
+#include <linux/proc_fs.h>
+#include <linux/stat.h>
+#include <linux/poll.h>
+#include <linux/irq.h>
+#include <linux/init.h>
+#include <linux/gfp.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <xen/events.h>
+#include <xen/evtchn.h>
+#include <asm/xen/hypervisor.h>
+
+/* Per-open-file state; evtchn_open() stores it in file->private_data. */
+struct per_user_data {
+       struct mutex bind_mutex; /* serialize bind/unbind operations */
+
+       /* Notification ring, accessed via /dev/xen/evtchn. */
+#define EVTCHN_RING_SIZE     (PAGE_SIZE / sizeof(evtchn_port_t))
+#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1))
+       /* one page, allocated in evtchn_open() */
+       evtchn_port_t *ring;
+       /* consumer/producer counters (masked with EVTCHN_RING_MASK on use)
+          and a flag set when an event arrives while the ring is full */
+       unsigned int ring_cons, ring_prod, ring_overflow;
+       struct mutex ring_cons_mutex; /* protect against concurrent readers */
+
+       /* Processes wait on this queue when ring is empty. */
+       wait_queue_head_t evtchn_wait;
+       struct fasync_struct *evtchn_async_queue;
+       /* "evtchn:<comm>", passed to bind_evtchn_to_irqhandler() */
+       const char *name;
+};
+
+/* Who's bound to each port? */
+static struct per_user_data *port_user[NR_EVENT_CHANNELS];
+static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */
+
+/*
+ * Interrupt handler for an event channel bound to a user; @data carries
+ * the port number.  The irq is disabled here and is re-enabled when
+ * userspace writes the port number back (see evtchn_write()).  The port
+ * is queued on the owner's ring, or ring_overflow is set when the ring
+ * is already full.
+ *
+ * Only referenced from within this file, so it has internal linkage.
+ */
+static irqreturn_t evtchn_interrupt(int irq, void *data)
+{
+       unsigned int port = (unsigned long)data;
+       struct per_user_data *u;
+
+       spin_lock(&port_user_lock);
+
+       u = port_user[port];
+
+       disable_irq_nosync(irq);
+
+       if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) {
+               u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port;
+               wmb(); /* Ensure ring contents visible */
+               /* wake readers only on the empty -> non-empty transition */
+               if (u->ring_cons == u->ring_prod++) {
+                       wake_up_interruptible(&u->evtchn_wait);
+                       kill_fasync(&u->evtchn_async_queue,
+                                   SIGIO, POLL_IN);
+               }
+       } else {
+               u->ring_overflow = 1;
+       }
+
+       spin_unlock(&port_user_lock);
+
+       return IRQ_HANDLED;
+}
+
+/*
+ * Read pending port numbers from the notification ring.
+ *
+ * @count is rounded down to a whole number of evtchn_port_t entries and
+ * capped at PAGE_SIZE.  Returns the number of bytes copied, 0 if @count
+ * rounds down to zero, -EFBIG once the ring has overflowed, -EAGAIN if
+ * the ring is empty and the file is O_NONBLOCK, -EFAULT if the copy to
+ * userspace fails, or the value returned by wait_event_interruptible()
+ * when the wait does not complete normally.
+ */
+static ssize_t evtchn_read(struct file *file, char __user *buf,
+                          size_t count, loff_t *ppos)
+{
+       int rc;
+       unsigned int c, p, bytes1 = 0, bytes2 = 0;
+       struct per_user_data *u = file->private_data;
+
+       /* Whole number of ports. */
+       count &= ~(sizeof(evtchn_port_t)-1);
+
+       if (count == 0)
+               return 0;
+
+       if (count > PAGE_SIZE)
+               count = PAGE_SIZE;
+
+       /* Loop until the ring holds data; the loop is left via break with
+          ring_cons_mutex still held. */
+       for (;;) {
+               mutex_lock(&u->ring_cons_mutex);
+
+               rc = -EFBIG;
+               if (u->ring_overflow)
+                       goto unlock_out;
+
+               c = u->ring_cons;
+               p = u->ring_prod;
+               if (c != p)
+                       break;
+
+               mutex_unlock(&u->ring_cons_mutex);
+
+               if (file->f_flags & O_NONBLOCK)
+                       return -EAGAIN;
+
+               rc = wait_event_interruptible(u->evtchn_wait,
+                                             u->ring_cons != u->ring_prod);
+               if (rc)
+                       return rc;
+       }
+
+       /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */
+       if (((c ^ p) & EVTCHN_RING_SIZE) != 0) {
+               bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) *
+                       sizeof(evtchn_port_t);
+               bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t);
+       } else {
+               bytes1 = (p - c) * sizeof(evtchn_port_t);
+               bytes2 = 0;
+       }
+
+       /* Truncate chunks according to caller's maximum byte count. */
+       if (bytes1 > count) {
+               bytes1 = count;
+               bytes2 = 0;
+       } else if ((bytes1 + bytes2) > count) {
+               bytes2 = count - bytes1;
+       }
+
+       rc = -EFAULT;
+       rmb(); /* Ensure that we see the port before we copy it. */
+       if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) ||
+           ((bytes2 != 0) &&
+            copy_to_user(&buf[bytes1], &u->ring[0], bytes2)))
+               goto unlock_out;
+
+       /* Consume only what was actually handed to userspace. */
+       u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t);
+       rc = bytes1 + bytes2;
+
+ unlock_out:
+       mutex_unlock(&u->ring_cons_mutex);
+       return rc;
+}
+
+/*
+ * Userspace writes back port numbers it has finished with; the irq of
+ * every listed port that is in range and owned by this file is
+ * re-enabled.  At most one page of whole port numbers is consumed.
+ * Returns the number of bytes consumed, -ENOMEM if the bounce page
+ * cannot be allocated, or -EFAULT if the user buffer cannot be read.
+ */
+static ssize_t evtchn_write(struct file *file, const char __user *buf,
+                           size_t count, loff_t *ppos)
+{
+       struct per_user_data *owner = file->private_data;
+       evtchn_port_t *ports;
+       int ret, idx;
+
+       ports = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+       if (ports == NULL)
+               return -ENOMEM;
+
+       ret = 0;
+
+       /* Whole number of ports. */
+       count &= ~(sizeof(evtchn_port_t)-1);
+       if (count == 0)
+               goto free_buf;
+
+       if (count > PAGE_SIZE)
+               count = PAGE_SIZE;
+
+       if (copy_from_user(ports, buf, count) != 0) {
+               ret = -EFAULT;
+               goto free_buf;
+       }
+
+       spin_lock_irq(&port_user_lock);
+       for (idx = 0; idx < (count/sizeof(evtchn_port_t)); idx++) {
+               evtchn_port_t port = ports[idx];
+
+               /* silently ignore ports this file does not own */
+               if (port >= NR_EVENT_CHANNELS)
+                       continue;
+               if (port_user[port] != owner)
+                       continue;
+
+               enable_irq(irq_from_evtchn(port));
+       }
+       spin_unlock_irq(&port_user_lock);
+
+       ret = count;
+
+ free_buf:
+       free_page((unsigned long)ports);
+       return ret;
+}
+
+/*
+ * Record @u as the owner of @port and bind the port to
+ * evtchn_interrupt().  Returns 0 on success or the negative value
+ * returned by bind_evtchn_to_irqhandler(); on failure the port is left
+ * unowned.
+ */
+static int evtchn_bind_to_user(struct per_user_data *u, int port)
+{
+       int rc;
+
+       /*
+        * Ports are never reused, so every caller should pass in a
+        * unique port.
+        *
+        * (Locking not necessary because we haven't registered the
+        * interrupt handler yet, and our caller has already
+        * serialized bind operations.)
+        */
+       BUG_ON(port_user[port] != NULL);
+       port_user[port] = u;
+
+       rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED,
+                                      u->name, (void *)(unsigned long)port);
+       if (rc < 0) {
+               /*
+                * No handler was installed, so the port must not stay
+                * marked as owned: the unbind ioctl and evtchn_release()
+                * would otherwise try to unbind an irq that was never
+                * bound for it.
+                */
+               port_user[port] = NULL;
+               return rc;
+       }
+
+       return 0;
+}
+
+/*
+ * Remove the irq handler bound to @port and clear the port's owner
+ * entry in port_user[].
+ */
+static void evtchn_unbind_from_user(struct per_user_data *u, int port)
+{
+       void *cookie = (void *)(unsigned long)port;
+
+       unbind_from_irqhandler(irq_from_evtchn(port), cookie);
+
+       /* the handler has to be unbound before the port entry is cleared */
+       barrier();
+
+       port_user[port] = NULL;
+}
+
+/*
+ * ioctl handler for the event-channel device; every command runs under
+ * u->bind_mutex.
+ *
+ * BIND_VIRQ, BIND_INTERDOMAIN and BIND_UNBOUND_PORT obtain a port from
+ * the hypervisor, bind it to this file and return the port number.
+ * UNBIND releases a port owned by this file, NOTIFY signals one, and
+ * RESET empties the ring and clears the overflow flag.
+ *
+ * Returns -EFAULT if the argument cannot be copied in, -EINVAL for a
+ * port number out of range, -ENOTCONN if the port is not bound to this
+ * file, -ENOSYS for an unknown command, or whatever the hypercall or
+ * the bind step returned on failure.
+ */
+static long evtchn_ioctl(struct file *file,
+                        unsigned int cmd, unsigned long arg)
+{
+       int rc;
+       struct per_user_data *u = file->private_data;
+       void __user *uarg = (void __user *) arg;
+
+       /* Prevent bind from racing with unbind */
+       mutex_lock(&u->bind_mutex);
+
+       switch (cmd) {
+       case IOCTL_EVTCHN_BIND_VIRQ: {
+               struct ioctl_evtchn_bind_virq bind;
+               struct evtchn_bind_virq bind_virq;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               bind_virq.virq = bind.virq;
+               bind_virq.vcpu = 0;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq,
+                                                &bind_virq);
+               if (rc != 0)
+                       break;
+
+               /* on success the ioctl returns the new port number */
+               rc = evtchn_bind_to_user(u, bind_virq.port);
+               if (rc == 0)
+                       rc = bind_virq.port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_BIND_INTERDOMAIN: {
+               struct ioctl_evtchn_bind_interdomain bind;
+               struct evtchn_bind_interdomain bind_interdomain;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               bind_interdomain.remote_dom  = bind.remote_domain;
+               bind_interdomain.remote_port = bind.remote_port;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain,
+                                                &bind_interdomain);
+               if (rc != 0)
+                       break;
+
+               rc = evtchn_bind_to_user(u, bind_interdomain.local_port);
+               if (rc == 0)
+                       rc = bind_interdomain.local_port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_BIND_UNBOUND_PORT: {
+               struct ioctl_evtchn_bind_unbound_port bind;
+               struct evtchn_alloc_unbound alloc_unbound;
+
+               rc = -EFAULT;
+               if (copy_from_user(&bind, uarg, sizeof(bind)))
+                       break;
+
+               alloc_unbound.dom        = DOMID_SELF;
+               alloc_unbound.remote_dom = bind.remote_domain;
+               rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
+                                                &alloc_unbound);
+               if (rc != 0)
+                       break;
+
+               rc = evtchn_bind_to_user(u, alloc_unbound.port);
+               if (rc == 0)
+                       rc = alloc_unbound.port;
+               break;
+       }
+
+       case IOCTL_EVTCHN_UNBIND: {
+               struct ioctl_evtchn_unbind unbind;
+
+               rc = -EFAULT;
+               if (copy_from_user(&unbind, uarg, sizeof(unbind)))
+                       break;
+
+               rc = -EINVAL;
+               if (unbind.port >= NR_EVENT_CHANNELS)
+                       break;
+
+               spin_lock_irq(&port_user_lock);
+
+               /* only the file that owns the port may unbind it */
+               rc = -ENOTCONN;
+               if (port_user[unbind.port] != u) {
+                       spin_unlock_irq(&port_user_lock);
+                       break;
+               }
+
+               evtchn_unbind_from_user(u, unbind.port);
+
+               spin_unlock_irq(&port_user_lock);
+
+               rc = 0;
+               break;
+       }
+
+       case IOCTL_EVTCHN_NOTIFY: {
+               struct ioctl_evtchn_notify notify;
+
+               rc = -EFAULT;
+               if (copy_from_user(&notify, uarg, sizeof(notify)))
+                       break;
+
+               if (notify.port >= NR_EVENT_CHANNELS) {
+                       rc = -EINVAL;
+               } else if (port_user[notify.port] != u) {
+                       rc = -ENOTCONN;
+               } else {
+                       notify_remote_via_evtchn(notify.port);
+                       rc = 0;
+               }
+               break;
+       }
+
+       case IOCTL_EVTCHN_RESET: {
+               /* Initialise the ring to empty. Clear errors. */
+               mutex_lock(&u->ring_cons_mutex);
+               spin_lock_irq(&port_user_lock);
+               u->ring_cons = u->ring_prod = u->ring_overflow = 0;
+               spin_unlock_irq(&port_user_lock);
+               mutex_unlock(&u->ring_cons_mutex);
+               rc = 0;
+               break;
+       }
+
+       default:
+               rc = -ENOSYS;
+               break;
+       }
+       mutex_unlock(&u->bind_mutex);
+
+       return rc;
+}
+
+/*
+ * poll method: the device is always writable, readable while the ring
+ * holds entries, and reports POLLERR alone once the ring has overflowed.
+ */
+static unsigned int evtchn_poll(struct file *file, poll_table *wait)
+{
+       struct per_user_data *user = file->private_data;
+       unsigned int events;
+
+       poll_wait(file, &user->evtchn_wait, wait);
+
+       events = POLLOUT | POLLWRNORM;
+       if (user->ring_cons != user->ring_prod)
+               events |= POLLIN | POLLRDNORM;
+
+       /* an overflow replaces every other event bit */
+       return user->ring_overflow ? POLLERR : events;
+}
+
+/* fasync method: maintain this file's entry on its own async queue. */
+static int evtchn_fasync(int fd, struct file *filp, int on)
+{
+       struct per_user_data *user = filp->private_data;
+       struct fasync_struct **queue = &user->evtchn_async_queue;
+
+       return fasync_helper(fd, filp, on, queue);
+}
+
+/*
+ * open method: allocate the per-file state (name, wait queue, one-page
+ * ring, mutexes) and attach it to the file.  Returns 0, or -ENOMEM if
+ * any of the allocations fails.
+ */
+static int evtchn_open(struct inode *inode, struct file *filp)
+{
+       struct per_user_data *user;
+
+       user = kzalloc(sizeof(*user), GFP_KERNEL);
+       if (!user)
+               goto err;
+
+       user->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm);
+       if (!user->name)
+               goto err_free_user;
+
+       init_waitqueue_head(&user->evtchn_wait);
+
+       user->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL);
+       if (!user->ring)
+               goto err_free_name;
+
+       mutex_init(&user->bind_mutex);
+       mutex_init(&user->ring_cons_mutex);
+
+       filp->private_data = user;
+
+       return 0;
+
+ err_free_name:
+       kfree(user->name);
+ err_free_user:
+       kfree(user);
+ err:
+       return -ENOMEM;
+}
+
+/*
+ * release method: free the ring, unbind every port still owned by this
+ * file and free the per-file state.  Always returns 0.
+ *
+ * NOTE(review): evtchn_unbind_from_user() runs with port_user_lock held
+ * and interrupts disabled - confirm unbind_from_irqhandler() is safe to
+ * call in that context.
+ */
+static int evtchn_release(struct inode *inode, struct file *filp)
+{
+       struct per_user_data *user = filp->private_data;
+       int port;
+
+       spin_lock_irq(&port_user_lock);
+
+       free_page((unsigned long)user->ring);
+
+       for (port = 0; port < NR_EVENT_CHANNELS; port++) {
+               if (port_user[port] == user)
+                       evtchn_unbind_from_user(user, port);
+       }
+
+       spin_unlock_irq(&port_user_lock);
+
+       kfree(user->name);
+       kfree(user);
+
+       return 0;
+}
+
+static const struct file_operations evtchn_fops = {
+       .owner   = THIS_MODULE,
+       .read    = evtchn_read,
+       .write   = evtchn_write,
+       .unlocked_ioctl = evtchn_ioctl,
+       .poll    = evtchn_poll,
+       .fasync  = evtchn_fasync,
+       .open    = evtchn_open,
+       .release = evtchn_release,
+};
+
+static struct miscdevice evtchn_miscdev = {
+       .minor        = MISC_DYNAMIC_MINOR,
+       .name         = "evtchn",
+       .fops         = &evtchn_fops,
+};
+/*
+ * Module init: refuse to load outside a Xen domain, reset the port
+ * ownership table and register the misc device.  Returns 0, -ENODEV
+ * when not running on Xen, or the misc_register() error.
+ */
+static int __init evtchn_init(void)
+{
+       int rc;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       spin_lock_init(&port_user_lock);
+       memset(port_user, 0, sizeof(port_user));
+
+       /* Create '/dev/misc/evtchn'. */
+       rc = misc_register(&evtchn_miscdev);
+       if (rc == 0) {
+               printk(KERN_INFO "Event-channel device installed.\n");
+               return 0;
+       }
+
+       printk(KERN_ALERT "Could not register /dev/misc/evtchn\n");
+       return rc;
+}
+
+/* Module exit: unregister the misc device registered by evtchn_init(). */
+static void __exit evtchn_cleanup(void)
+{
+       misc_deregister(&evtchn_miscdev);
+}
+
+module_init(evtchn_init);
+module_exit(evtchn_cleanup);
+
+MODULE_LICENSE("GPL");
index 4b5b84837ee13c85fab14a73c7ab213f74af90e2..fddc2025dece777c513a7a7a72f8537751bf1834 100644 (file)
@@ -98,9 +98,8 @@ static void do_suspend(void)
                goto out;
        }
 
-       printk("suspending xenbus...\n");
-       /* XXX use normal device tree? */
-       xenbus_suspend();
+       printk(KERN_DEBUG "suspending xenstore...\n");
+       xs_suspend();
 
        err = device_power_down(PMSG_SUSPEND);
        if (err) {
@@ -116,9 +115,9 @@ static void do_suspend(void)
 
        if (!cancelled) {
                xen_arch_resume();
-               xenbus_resume();
+               xs_resume();
        } else
-               xenbus_suspend_cancel();
+               xs_suspend_cancel();
 
        device_power_up(PMSG_RESUME);
 
diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c
new file mode 100644 (file)
index 0000000..88a60e0
--- /dev/null
@@ -0,0 +1,445 @@
+/*
+ *  copyright (c) 2006 IBM Corporation
+ *  Authored by: Mike D. Day <ncmike@us.ibm.com>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/kobject.h>
+
+#include <asm/xen/hypervisor.h>
+#include <asm/xen/hypercall.h>
+
+#include <xen/xenbus.h>
+#include <xen/interface/xen.h>
+#include <xen/interface/version.h>
+
+#define HYPERVISOR_ATTR_RO(_name) \
+static struct hyp_sysfs_attr  _name##_attr = __ATTR_RO(_name)
+
+#define HYPERVISOR_ATTR_RW(_name) \
+static struct hyp_sysfs_attr _name##_attr = \
+       __ATTR(_name, 0644, _name##_show, _name##_store)
+
+struct hyp_sysfs_attr {
+       struct attribute attr;
+       ssize_t (*show)(struct hyp_sysfs_attr *, char *);
+       ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t);
+       void *hyp_attr_data;
+};
+
+static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       return sprintf(buffer, "xen\n");
+}
+
+HYPERVISOR_ATTR_RO(type);
+
+static int __init xen_sysfs_type_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &type_attr.attr);
+}
+
+static void xen_sysfs_type_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &type_attr.attr);
+}
+
+/* xen version attributes */
+static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       if (version)
+               return sprintf(buffer, "%d\n", version >> 16);
+       return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(major);
+
+/*
+ * Show the minor number of the running Xen version.
+ *
+ * XENVER_version reports the version packed as major:minor in 16:16 bits
+ * (major_show() takes the upper half with ">> 16"), so the minor number is
+ * the whole low 16 bits rather than only the low byte.
+ *
+ * Returns the number of bytes written to buffer, or -ENODEV when the
+ * hypercall reports a version of 0.
+ */
+static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int version = HYPERVISOR_xen_version(XENVER_version, NULL);
+       if (version)
+               return sprintf(buffer, "%d\n", version & 0xffff);
+       return -ENODEV;
+}
+
+HYPERVISOR_ATTR_RO(minor);
+
+static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *extra;
+
+       extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL);
+       if (extra) {
+               ret = HYPERVISOR_xen_version(XENVER_extraversion, extra);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", extra);
+               kfree(extra);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(extra);
+
+static struct attribute *version_attrs[] = {
+       &major_attr.attr,
+       &minor_attr.attr,
+       &extra_attr.attr,
+       NULL
+};
+
+static struct attribute_group version_group = {
+       .name = "version",
+       .attrs = version_attrs,
+};
+
+static int __init xen_sysfs_version_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &version_group);
+}
+
+static void xen_sysfs_version_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &version_group);
+}
+
+/* UUID */
+
+static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       char *vm, *val;
+       int ret;
+       extern int xenstored_ready;
+
+       if (!xenstored_ready)
+               return -EBUSY;
+
+       vm = xenbus_read(XBT_NIL, "vm", "", NULL);
+       if (IS_ERR(vm))
+               return PTR_ERR(vm);
+       val = xenbus_read(XBT_NIL, vm, "uuid", NULL);
+       kfree(vm);
+       if (IS_ERR(val))
+               return PTR_ERR(val);
+       ret = sprintf(buffer, "%s\n", val);
+       kfree(val);
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(uuid);
+
+static int __init xen_sysfs_uuid_init(void)
+{
+       return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+static void xen_sysfs_uuid_destroy(void)
+{
+       sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr);
+}
+
+/* xen compilation attributes */
+
+static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compiler);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiler);
+
+static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compile_by);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compiled_by);
+
+static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_compile_info *info;
+
+       info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL);
+       if (info) {
+               ret = HYPERVISOR_xen_version(XENVER_compile_info, info);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", info->compile_date);
+               kfree(info);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(compile_date);
+
+static struct attribute *xen_compile_attrs[] = {
+       &compiler_attr.attr,
+       &compiled_by_attr.attr,
+       &compile_date_attr.attr,
+       NULL
+};
+
+static struct attribute_group xen_compilation_group = {
+       .name = "compilation",
+       .attrs = xen_compile_attrs,
+};
+
+/*
+ * Create the "compilation" attribute group under hypervisor_kobj.
+ * Returns the result of sysfs_create_group().
+ */
+static int __init xen_compilation_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+static void xen_compilation_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_compilation_group);
+}
+
+/* xen properties info */
+
+static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *caps;
+
+       caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL);
+       if (caps) {
+               ret = HYPERVISOR_xen_version(XENVER_capabilities, caps);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", caps);
+               kfree(caps);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(capabilities);
+
+static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       char *cset;
+
+       cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL);
+       if (cset) {
+               ret = HYPERVISOR_xen_version(XENVER_changeset, cset);
+               if (!ret)
+                       ret = sprintf(buffer, "%s\n", cset);
+               kfree(cset);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(changeset);
+
+static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret = -ENOMEM;
+       struct xen_platform_parameters *parms;
+
+       parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL);
+       if (parms) {
+               ret = HYPERVISOR_xen_version(XENVER_platform_parameters,
+                                            parms);
+               if (!ret)
+                       ret = sprintf(buffer, "%lx\n", parms->virt_start);
+               kfree(parms);
+       }
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(virtual_start);
+
+static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       int ret;
+
+       ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL);
+       if (ret > 0)
+               ret = sprintf(buffer, "%x\n", ret);
+
+       return ret;
+}
+
+HYPERVISOR_ATTR_RO(pagesize);
+
+static ssize_t xen_feature_show(int index, char *buffer)
+{
+       ssize_t ret;
+       struct xen_feature_info info;
+
+       info.submap_idx = index;
+       ret = HYPERVISOR_xen_version(XENVER_get_features, &info);
+       if (!ret)
+               ret = sprintf(buffer, "%08x", info.submap);
+
+       return ret;
+}
+
+static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer)
+{
+       ssize_t len;
+       int i;
+
+       len = 0;
+       for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) {
+               int ret = xen_feature_show(i, buffer + len);
+               if (ret < 0) {
+                       if (len == 0)
+                               len = ret;
+                       break;
+               }
+               len += ret;
+       }
+       if (len > 0)
+               buffer[len++] = '\n';
+
+       return len;
+}
+
+HYPERVISOR_ATTR_RO(features);
+
+static struct attribute *xen_properties_attrs[] = {
+       &capabilities_attr.attr,
+       &changeset_attr.attr,
+       &virtual_start_attr.attr,
+       &pagesize_attr.attr,
+       &features_attr.attr,
+       NULL
+};
+
+static struct attribute_group xen_properties_group = {
+       .name = "properties",
+       .attrs = xen_properties_attrs,
+};
+
+static int __init xen_properties_init(void)
+{
+       return sysfs_create_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static void xen_properties_destroy(void)
+{
+       sysfs_remove_group(hypervisor_kobj, &xen_properties_group);
+}
+
+static int __init hyper_sysfs_init(void)
+{
+       int ret;
+
+       if (!xen_domain())
+               return -ENODEV;
+
+       ret = xen_sysfs_type_init();
+       if (ret)
+               goto out;
+       ret = xen_sysfs_version_init();
+       if (ret)
+               goto version_out;
+       ret = xen_compilation_init();
+       if (ret)
+               goto comp_out;
+       ret = xen_sysfs_uuid_init();
+       if (ret)
+               goto uuid_out;
+       ret = xen_properties_init();
+       if (ret)
+               goto prop_out;
+
+       goto out;
+
+prop_out:
+       xen_sysfs_uuid_destroy();
+uuid_out:
+       xen_compilation_destroy();
+comp_out:
+       xen_sysfs_version_destroy();
+version_out:
+       xen_sysfs_type_destroy();
+out:
+       return ret;
+}
+
+static void __exit hyper_sysfs_exit(void)
+{
+       xen_properties_destroy();
+       xen_compilation_destroy();
+       xen_sysfs_uuid_destroy();
+       xen_sysfs_version_destroy();
+       xen_sysfs_type_destroy();
+
+}
+module_init(hyper_sysfs_init);
+module_exit(hyper_sysfs_exit);
+
+static ssize_t hyp_sysfs_show(struct kobject *kobj,
+                             struct attribute *attr,
+                             char *buffer)
+{
+       struct hyp_sysfs_attr *hyp_attr;
+       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+       if (hyp_attr->show)
+               return hyp_attr->show(hyp_attr, buffer);
+       return 0;
+}
+
+static ssize_t hyp_sysfs_store(struct kobject *kobj,
+                              struct attribute *attr,
+                              const char *buffer,
+                              size_t len)
+{
+       struct hyp_sysfs_attr *hyp_attr;
+       hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr);
+       if (hyp_attr->store)
+               return hyp_attr->store(hyp_attr, buffer, len);
+       return 0;
+}
+
+static struct sysfs_ops hyp_sysfs_ops = {
+       .show = hyp_sysfs_show,
+       .store = hyp_sysfs_store,
+};
+
+static struct kobj_type hyp_sysfs_kobj_type = {
+       .sysfs_ops = &hyp_sysfs_ops,
+};
+
+static int __init hypervisor_subsys_init(void)
+{
+       if (!xen_domain())
+               return -ENODEV;
+
+       hypervisor_kobj->ktype = &hyp_sysfs_kobj_type;
+       return 0;
+}
+device_initcall(hypervisor_subsys_init);
index 773d1cf2328334b9c62cc52fdf90c7107df02778..d42e25d5968dcf48acd5448a7a6af56d21abd5fb 100644 (file)
@@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name);
 
 static void xenbus_dev_shutdown(struct device *_dev);
 
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state);
+static int xenbus_dev_resume(struct device *dev);
+
 /* If something in array of ids matches this device, return it. */
 static const struct xenbus_device_id *
 match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev)
@@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = {
                .remove    = xenbus_dev_remove,
                .shutdown  = xenbus_dev_shutdown,
                .dev_attrs = xenbus_dev_attrs,
+
+               .suspend   = xenbus_dev_suspend,
+               .resume    = xenbus_dev_resume,
        },
 };
 
@@ -654,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus)
 
        kfree(root);
 }
+EXPORT_SYMBOL_GPL(xenbus_dev_changed);
 
 static void frontend_changed(struct xenbus_watch *watch,
                             const char **vec, unsigned int len)
@@ -669,7 +676,7 @@ static struct xenbus_watch fe_watch = {
        .callback = frontend_changed,
 };
 
-static int suspend_dev(struct device *dev, void *data)
+static int xenbus_dev_suspend(struct device *dev, pm_message_t state)
 {
        int err = 0;
        struct xenbus_driver *drv;
@@ -682,35 +689,14 @@ static int suspend_dev(struct device *dev, void *data)
        drv = to_xenbus_driver(dev->driver);
        xdev = container_of(dev, struct xenbus_device, dev);
        if (drv->suspend)
-               err = drv->suspend(xdev);
+               err = drv->suspend(xdev, state);
        if (err)
                printk(KERN_WARNING
                       "xenbus: suspend %s failed: %i\n", dev_name(dev), err);
        return 0;
 }
 
-static int suspend_cancel_dev(struct device *dev, void *data)
-{
-       int err = 0;
-       struct xenbus_driver *drv;
-       struct xenbus_device *xdev;
-
-       DPRINTK("");
-
-       if (dev->driver == NULL)
-               return 0;
-       drv = to_xenbus_driver(dev->driver);
-       xdev = container_of(dev, struct xenbus_device, dev);
-       if (drv->suspend_cancel)
-               err = drv->suspend_cancel(xdev);
-       if (err)
-               printk(KERN_WARNING
-                      "xenbus: suspend_cancel %s failed: %i\n",
-                      dev_name(dev), err);
-       return 0;
-}
-
-static int resume_dev(struct device *dev, void *data)
+static int xenbus_dev_resume(struct device *dev)
 {
        int err;
        struct xenbus_driver *drv;
@@ -755,33 +741,6 @@ static int resume_dev(struct device *dev, void *data)
        return 0;
 }
 
-void xenbus_suspend(void)
-{
-       DPRINTK("");
-
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev);
-       xenbus_backend_suspend(suspend_dev);
-       xs_suspend();
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend);
-
-void xenbus_resume(void)
-{
-       xb_init_comms();
-       xs_resume();
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev);
-       xenbus_backend_resume(resume_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_resume);
-
-void xenbus_suspend_cancel(void)
-{
-       xs_suspend_cancel();
-       bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev);
-       xenbus_backend_resume(suspend_cancel_dev);
-}
-EXPORT_SYMBOL_GPL(xenbus_suspend_cancel);
-
 /* A flag to determine if xenstored is 'ready' (i.e. has started) */
 int xenstored_ready = 0;
 
index e325eab4724d8cb0fab667b26f8c5b104c965c05..eab33f1dbdf7013f451af491b882973ea017f718 100644 (file)
@@ -673,6 +673,8 @@ void xs_resume(void)
        struct xenbus_watch *watch;
        char token[sizeof(watch) * 2 + 1];
 
+       xb_init_comms();
+
        mutex_unlock(&xs_state.response_mutex);
        mutex_unlock(&xs_state.request_mutex);
        up_write(&xs_state.transaction_mutex);
index 515741a8e6b8ecaae5bcd2d36af6538a2ee77345..6559e0c752ce1494baa127921834e388832cbd75 100644 (file)
 MODULE_DESCRIPTION("Xen filesystem");
 MODULE_LICENSE("GPL");
 
+static ssize_t capabilities_read(struct file *file, char __user *buf,
+                                size_t size, loff_t *off)
+{
+       char *tmp = "";
+
+       if (xen_initial_domain())
+               tmp = "control_d\n";
+
+       return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp));
+}
+
+static const struct file_operations capabilities_file_ops = {
+       .read = capabilities_read,
+};
+
 static int xenfs_fill_super(struct super_block *sb, void *data, int silent)
 {
        static struct tree_descr xenfs_files[] = {
-               [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR},
+               [1] = {},
+               { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR },
+               { "capabilities", &capabilities_file_ops, S_IRUGO },
                {""},
        };
 
index 98711647ece49548a94409a99ce4b680ee085ca1..740699c4f90c5cf6775873a2ee9b61a5fde4d0c0 100644 (file)
--- a/fs/bio.c
+++ b/fs/bio.c
 #include <linux/mempool.h>
 #include <linux/workqueue.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <scsi/sg.h>           /* for struct sg_iovec */
 
-DEFINE_TRACE(block_split);
+#include <trace/events/block.h>
 
 /*
  * Test patch to inline a certain number of bi_io_vec's inside the bio
index 49106127a4aa5f8f480dbd61968c0c6d0acd6b58..1864d0b63088949c9887bede245b7fc4784d713d 100644 (file)
@@ -2935,6 +2935,8 @@ int submit_bh(int rw, struct buffer_head * bh)
        BUG_ON(!buffer_locked(bh));
        BUG_ON(!buffer_mapped(bh));
        BUG_ON(!bh->b_end_io);
+       BUG_ON(buffer_delay(bh));
+       BUG_ON(buffer_unwritten(bh));
 
        /*
         * Mask in barrier bit for a write (could be either a WRITE or a
index f20c4069c22005681bc851bdc270f54c8a2475c1..b48689839428805efd6175eacd0a7396068cdafe 100644 (file)
@@ -1,3 +1,12 @@
+Version 1.59
+------------
+Client uses server inode numbers (which are persistent) rather than
+client-generated ones by default (mount option "serverino" is turned
+on by default if the server supports it).  Add forceuid and forcegid
+mount options (so that, when unix extensions are negotiated, specifying
+which uid mounted does not immediately force the server's reported
+uids to be overridden).
+
 Version 1.58
 ------------
 Guard against buffer overruns in various UCS-2 to UTF-8 string conversions
@@ -10,6 +19,8 @@ we converted from).  Fix endianness of the vcnum field used during
 session setup to distinguish multiple mounts to same server from different
 userids. Raw NTLMSSP fixed (it requires /proc/fs/cifs/experimental
 flag to be set to 2, and mount must enable krb5 to turn on extended security).
+Performance of file create to Samba improved (posix create on lookup
+removes 1 of 2 network requests sent on file create)
  
 Version 1.57
 ------------
index db208ddb989910d469f958054d000558c528a8d7..ad92921dbde415b7a00d0c063d6ca9adfb0d582d 100644 (file)
@@ -262,7 +262,8 @@ A partial list of the supported mount options follows:
                mount.  
   domain       Set the SMB/CIFS workgroup name prepended to the
                username during CIFS session establishment
-  uid          Set the default uid for inodes. For mounts to servers
+  forceuid     Set the default uid for inodes based on the uid
+               passed in. For mounts to servers
                which do support the CIFS Unix extensions, such as a
                properly configured Samba server, the server provides
                the uid, gid and mode so this parameter should  not be
@@ -292,6 +293,12 @@ A partial list of the supported mount options follows:
                the client.  Note that the mount.cifs helper must be
                at version 1.10 or higher to support specifying the uid
                (or gid) in non-numeric form.
+  forcegid     (similar to above but for the groupid instead of uid)
+  uid          Set the default uid for inodes, and indicate to the
+               cifs kernel driver which local user mounted . If the server
+               supports the unix extensions the default uid is
+               not used to fill in the owner fields of inodes (files)
+               unless the "forceuid" parameter is specified.
   gid          Set the default gid for inodes (similar to above).
   file_mode     If CIFS Unix extensions are not supported by the server
                this overrides the default mode for file inodes.
@@ -388,8 +395,13 @@ A partial list of the supported mount options follows:
                or the CIFS Unix Extensions equivalent and for those
                this mount option will have no effect.  Exporting cifs mounts
                under nfsd requires this mount option on the cifs mount.
+               This is now the default if the server supports the
+               required network operation.
   noserverino   Client generates inode numbers (rather than using the actual one
-               from the server) by default.
+               from the server). These inode numbers will vary after
+               unmount or reboot which can confuse some applications,
+               but not all server filesystems support unique inode
+               numbers.
   setuids       If the CIFS Unix extensions are negotiated with the server
                the client will attempt to set the effective uid and gid of
                the local process on newly created files, directories, and
index 67bf93a40d2eeb46a42175183810c25c3cd1db58..4a4581cb2b5e3442e1cc0da0f0a7cd2f6de930bd 100644 (file)
@@ -23,6 +23,7 @@
 #include <linux/string.h>
 #include <keys/user-type.h>
 #include <linux/key-type.h>
+#include <linux/inet.h>
 #include "cifsglob.h"
 #include "cifs_spnego.h"
 #include "cifs_debug.h"
@@ -73,9 +74,6 @@ struct key_type cifs_spnego_key_type = {
  * strlen(";sec=ntlmsspi") */
 #define MAX_MECH_STR_LEN       13
 
-/* max possible addr len eg FEDC:BA98:7654:3210:FEDC:BA98:7654:3210/128 */
-#define MAX_IPV6_ADDR_LEN      43
-
 /* strlen of "host=" */
 #define HOST_KEY_LEN           5
 
@@ -102,7 +100,7 @@ cifs_get_spnego_key(struct cifsSesInfo *sesInfo)
           host=hostname sec=mechanism uid=0xFF user=username */
        desc_len = MAX_VER_STR_LEN +
                   HOST_KEY_LEN + strlen(hostname) +
-                  IP_KEY_LEN + MAX_IPV6_ADDR_LEN +
+                  IP_KEY_LEN + INET6_ADDRSTRLEN +
                   MAX_MECH_STR_LEN +
                   UID_KEY_LEN + (sizeof(uid_t) * 2) +
                   USER_KEY_LEN + strlen(sesInfo->userName) + 1;
index 57ecdc83c26f3852440e67957ca031577cba8543..1403b5d86a739d67725e0dc94db1bec593e5ddab 100644 (file)
@@ -552,130 +552,138 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd,
        return rc;
 }
 
-
-/* Retrieve an ACL from the server */
-static struct cifs_ntsd *get_cifs_acl(u32 *pacllen, struct inode *inode,
-                                      const char *path, const __u16 *pfid)
+static struct cifs_ntsd *get_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb,
+               __u16 fid, u32 *pacllen)
 {
-       struct cifsFileInfo *open_file = NULL;
-       bool unlock_file = false;
-       int xid;
-       int rc = -EIO;
-       __u16 fid;
-       struct super_block *sb;
-       struct cifs_sb_info *cifs_sb;
        struct cifs_ntsd *pntsd = NULL;
+       int xid, rc;
+
+       xid = GetXid();
+       rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
+       FreeXid(xid);
 
-       cFYI(1, ("get mode from ACL for %s", path));
 
-       if (inode == NULL)
-               return NULL;
+       cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
+       return pntsd;
+}
+
+static struct cifs_ntsd *get_cifs_acl_by_path(struct cifs_sb_info *cifs_sb,
+               const char *path, u32 *pacllen)
+{
+       struct cifs_ntsd *pntsd = NULL;
+       int oplock = 0;
+       int xid, rc;
+       __u16 fid;
 
        xid = GetXid();
-       if (pfid == NULL)
-               open_file = find_readable_file(CIFS_I(inode));
-       else
-               fid = *pfid;
 
-       sb = inode->i_sb;
-       if (sb == NULL) {
-               FreeXid(xid);
-               return NULL;
-       }
-       cifs_sb = CIFS_SB(sb);
-
-       if (open_file) {
-               unlock_file = true;
-               fid = open_file->netfid;
-       } else if (pfid == NULL) {
-               int oplock = 0;
-               /* open file */
-               rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-                               READ_CONTROL, 0, &fid, &oplock, NULL,
-                               cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-                                       CIFS_MOUNT_MAP_SPECIAL_CHR);
-               if (rc != 0) {
-                       cERROR(1, ("Unable to open file to get ACL"));
-                       FreeXid(xid);
-                       return NULL;
-               }
+       rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, READ_CONTROL, 0,
+                        &fid, &oplock, NULL, cifs_sb->local_nls,
+                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+       if (rc) {
+               cERROR(1, ("Unable to open file to get ACL"));
+               goto out;
        }
 
        rc = CIFSSMBGetCIFSACL(xid, cifs_sb->tcon, fid, &pntsd, pacllen);
        cFYI(1, ("GetCIFSACL rc = %d ACL len %d", rc, *pacllen));
-       if (unlock_file == true) /* find_readable_file increments ref count */
-               atomic_dec(&open_file->wrtPending);
-       else if (pfid == NULL) /* if opened above we have to close the handle */
-               CIFSSMBClose(xid, cifs_sb->tcon, fid);
-       /* else handle was passed in by caller */
 
+       CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
        FreeXid(xid);
        return pntsd;
 }
 
-/* Set an ACL on the server */
-static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
-                               struct inode *inode, const char *path)
+/* Retrieve an ACL from the server */
+static struct cifs_ntsd *get_cifs_acl(struct cifs_sb_info *cifs_sb,
+                                     struct inode *inode, const char *path,
+                                     u32 *pacllen)
 {
-       struct cifsFileInfo *open_file;
-       bool unlock_file = false;
-       int xid;
-       int rc = -EIO;
-       __u16 fid;
-       struct super_block *sb;
-       struct cifs_sb_info *cifs_sb;
+       struct cifs_ntsd *pntsd = NULL;
+       struct cifsFileInfo *open_file = NULL;
 
-       cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+       if (inode)
+               open_file = find_readable_file(CIFS_I(inode));
+       if (!open_file)
+               return get_cifs_acl_by_path(cifs_sb, path, pacllen);
 
-       if (!inode)
-               return rc;
+       pntsd = get_cifs_acl_by_fid(cifs_sb, open_file->netfid, pacllen);
+       atomic_dec(&open_file->wrtPending);
+       return pntsd;
+}
 
-       sb = inode->i_sb;
-       if (sb == NULL)
-               return rc;
+static int set_cifs_acl_by_fid(struct cifs_sb_info *cifs_sb, __u16 fid,
+               struct cifs_ntsd *pnntsd, u32 acllen)
+{
+       int xid, rc;
 
-       cifs_sb = CIFS_SB(sb);
        xid = GetXid();
+       rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
+       FreeXid(xid);
 
-       open_file = find_readable_file(CIFS_I(inode));
-       if (open_file) {
-               unlock_file = true;
-               fid = open_file->netfid;
-       } else {
-               int oplock = 0;
-               /* open file */
-               rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN,
-                               WRITE_DAC, 0, &fid, &oplock, NULL,
-                               cifs_sb->local_nls, cifs_sb->mnt_cifs_flags &
-                                       CIFS_MOUNT_MAP_SPECIAL_CHR);
-               if (rc != 0) {
-                       cERROR(1, ("Unable to open file to set ACL"));
-                       FreeXid(xid);
-                       return rc;
-               }
+       cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
+       return rc;
+}
+
+static int set_cifs_acl_by_path(struct cifs_sb_info *cifs_sb, const char *path,
+               struct cifs_ntsd *pnntsd, u32 acllen)
+{
+       int oplock = 0;
+       int xid, rc;
+       __u16 fid;
+
+       xid = GetXid();
+
+       rc = CIFSSMBOpen(xid, cifs_sb->tcon, path, FILE_OPEN, WRITE_DAC, 0,
+                        &fid, &oplock, NULL, cifs_sb->local_nls,
+                        cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR);
+       if (rc) {
+               cERROR(1, ("Unable to open file to set ACL"));
+               goto out;
        }
 
        rc = CIFSSMBSetCIFSACL(xid, cifs_sb->tcon, fid, pnntsd, acllen);
        cFYI(DBG2, ("SetCIFSACL rc = %d", rc));
-       if (unlock_file)
-               atomic_dec(&open_file->wrtPending);
-       else
-               CIFSSMBClose(xid, cifs_sb->tcon, fid);
 
+       CIFSSMBClose(xid, cifs_sb->tcon, fid);
+ out:
        FreeXid(xid);
+       return rc;
+}
 
+/* Set an ACL on the server */
+static int set_cifs_acl(struct cifs_ntsd *pnntsd, __u32 acllen,
+                               struct inode *inode, const char *path)
+{
+       struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb);
+       struct cifsFileInfo *open_file;
+       int rc;
+
+       cFYI(DBG2, ("set ACL for %s from mode 0x%x", path, inode->i_mode));
+
+       open_file = find_readable_file(CIFS_I(inode));
+       if (!open_file)
+               return set_cifs_acl_by_path(cifs_sb, path, pnntsd, acllen);
+
+       rc = set_cifs_acl_by_fid(cifs_sb, open_file->netfid, pnntsd, acllen);
+       atomic_dec(&open_file->wrtPending);
        return rc;
 }
 
 /* Translate the CIFS ACL (simlar to NTFS ACL) for a file into mode bits */
-void acl_to_uid_mode(struct inode *inode, const char *path, const __u16 *pfid)
+void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+                    const char *path, const __u16 *pfid)
 {
        struct cifs_ntsd *pntsd = NULL;
        u32 acllen = 0;
        int rc = 0;
 
        cFYI(DBG2, ("converting ACL to mode for %s", path));
-       pntsd = get_cifs_acl(&acllen, inode, path, pfid);
+
+       if (pfid)
+               pntsd = get_cifs_acl_by_fid(cifs_sb, *pfid, &acllen);
+       else
+               pntsd = get_cifs_acl(cifs_sb, inode, path, &acllen);
 
        /* if we can retrieve the ACL, now parse Access Control Entries, ACEs */
        if (pntsd)
@@ -698,7 +706,7 @@ int mode_to_acl(struct inode *inode, const char *path, __u64 nmode)
        cFYI(DBG2, ("set ACL from mode for %s", path));
 
        /* Get the security descriptor */
-       pntsd = get_cifs_acl(&secdesclen, inode, path, NULL);
+       pntsd = get_cifs_acl(CIFS_SB(inode->i_sb), inode, path, &secdesclen);
 
        /* Add three ACEs for owner, group, everyone getting rid of
           other ACEs as chmod disables ACEs and set the security descriptor */
index 5e6d35804d73511162230f67626aeaf331333982..0a10a59b6392891e4f0f319d6fa6cf62ec4d0cbd 100644 (file)
@@ -146,7 +146,7 @@ cifs_read_super(struct super_block *sb, void *data,
 #endif
        sb->s_blocksize = CIFS_MAX_MSGSIZE;
        sb->s_blocksize_bits = 14;      /* default 2**14 = CIFS_MAX_MSGSIZE */
-       inode = cifs_iget(sb, ROOT_I);
+       inode = cifs_root_iget(sb, ROOT_I);
 
        if (IS_ERR(inode)) {
                rc = PTR_ERR(inode);
index 051b71cfdea9b9725b12026fd6aec17325da26fd..9570a0e8023f4258941d1f4cdc0e8141188a622d 100644 (file)
@@ -36,7 +36,7 @@ extern void cifs_read_inode(struct inode *);
 
 /* Functions related to inodes */
 extern const struct inode_operations cifs_dir_inode_ops;
-extern struct inode *cifs_iget(struct super_block *, unsigned long);
+extern struct inode *cifs_root_iget(struct super_block *, unsigned long);
 extern int cifs_create(struct inode *, struct dentry *, int,
                       struct nameidata *);
 extern struct dentry *cifs_lookup(struct inode *, struct dentry *,
@@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg);
 extern const struct export_operations cifs_export_ops;
 #endif /* EXPERIMENTAL */
 
-#define CIFS_VERSION   "1.58"
+#define CIFS_VERSION   "1.59"
 #endif                         /* _CIFSFS_H */
index fae083930eee2210d1ae3e950c7d1639e1c9505e..f9452329bcce5b02929c5090a79220170d329813 100644 (file)
@@ -90,10 +90,10 @@ extern struct oplock_q_entry *AllocOplockQEntry(struct inode *, u16,
                                                 struct cifsTconInfo *);
 extern void DeleteOplockQEntry(struct oplock_q_entry *);
 extern void DeleteTconOplockQEntries(struct cifsTconInfo *);
-extern struct timespec cifs_NTtimeToUnix(u64 utc_nanoseconds_since_1601);
+extern struct timespec cifs_NTtimeToUnix(__le64 utc_nanoseconds_since_1601);
 extern u64 cifs_UnixTimeToNT(struct timespec);
-extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time);
-extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time);
+extern struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time,
+                                     int offset);
 
 extern int cifs_posix_open(char *full_path, struct inode **pinode,
                           struct super_block *sb, int mode, int oflags,
@@ -108,8 +108,8 @@ extern int cifs_get_inode_info(struct inode **pinode,
 extern int cifs_get_inode_info_unix(struct inode **pinode,
                        const unsigned char *search_path,
                        struct super_block *sb, int xid);
-extern void acl_to_uid_mode(struct inode *inode, const char *path,
-                           const __u16 *pfid);
+extern void acl_to_uid_mode(struct cifs_sb_info *cifs_sb, struct inode *inode,
+                           const char *path, const __u16 *pfid);
 extern int mode_to_acl(struct inode *inode, const char *path, __u64);
 
 extern int cifs_mount(struct super_block *, struct cifs_sb_info *, char *,
index d06260251c304692dc2789c14059170245ad2957..b84c61d5bca419d2ea221484636508f1ddaceaa1 100644 (file)
@@ -524,8 +524,8 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses)
                        int val, seconds, remain, result;
                        struct timespec ts, utc;
                        utc = CURRENT_TIME;
-                       ts = cnvrtDosUnixTm(le16_to_cpu(rsp->SrvTime.Date),
-                                               le16_to_cpu(rsp->SrvTime.Time));
+                       ts = cnvrtDosUnixTm(rsp->SrvTime.Date,
+                                           rsp->SrvTime.Time, 0);
                        cFYI(1, ("SrvTime %d sec since 1970 (utc: %d) diff: %d",
                                (int)ts.tv_sec, (int)utc.tv_sec,
                                (int)(utc.tv_sec - ts.tv_sec)));
@@ -2427,8 +2427,7 @@ querySymLinkRetry:
        params = 2 /* level */  + 4 /* rsrvd */  + name_len /* incl null */ ;
        pSMB->TotalDataCount = 0;
        pSMB->MaxParameterCount = cpu_to_le16(2);
-       /* BB find exact max data count below from sess structure BB */
-       pSMB->MaxDataCount = cpu_to_le16(4000);
+       pSMB->MaxDataCount = cpu_to_le16(CIFSMaxBufSize);
        pSMB->MaxSetupCount = 0;
        pSMB->Reserved = 0;
        pSMB->Flags = 0;
index 4aa81a507b741110657eeb1e7899235296d829d8..97f4311b9a8ea2f40325ac2fda8a2929153402eb 100644 (file)
@@ -35,6 +35,7 @@
 #include <linux/namei.h>
 #include <asm/uaccess.h>
 #include <asm/processor.h>
+#include <linux/inet.h>
 #include <net/ipv6.h>
 #include "cifspdu.h"
 #include "cifsglob.h"
@@ -61,7 +62,6 @@ struct smb_vol {
        char *domainname;
        char *UNC;
        char *UNCip;
-       char *in6_addr;   /* ipv6 address as human readable form of in6_addr */
        char *iocharset;  /* local code page for mapping to and from Unicode */
        char source_rfc1001_name[16]; /* netbios name of client */
        char target_rfc1001_name[16]; /* netbios name of server for Win9x/ME */
@@ -827,14 +827,16 @@ cifs_parse_mount_options(char *options, const char *devname,
        vol->target_rfc1001_name[0] = 0;
        vol->linux_uid = current_uid();  /* use current_euid() instead? */
        vol->linux_gid = current_gid();
-       vol->dir_mode = S_IRWXUGO;
-       /* 2767 perms indicate mandatory locking support */
-       vol->file_mode = (S_IRWXUGO | S_ISGID) & (~S_IXGRP);
+
+       /* default to only allowing write access to owner of the mount */
+       vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR;
 
        /* vol->retry default is 0 (i.e. "soft" limited retry not hard retry) */
        vol->rw = true;
        /* default is always to request posix paths. */
        vol->posix_paths = 1;
+       /* default to using server inode numbers where available */
+       vol->server_ino = 1;
 
        if (!options)
                return 1;
@@ -955,10 +957,12 @@ cifs_parse_mount_options(char *options, const char *devname,
                                }
                                strcpy(vol->password, value);
                        }
-               } else if (strnicmp(data, "ip", 2) == 0) {
+               } else if (!strnicmp(data, "ip", 2) ||
+                          !strnicmp(data, "addr", 4)) {
                        if (!value || !*value) {
                                vol->UNCip = NULL;
-                       } else if (strnlen(value, 35) < 35) {
+                       } else if (strnlen(value, INET6_ADDRSTRLEN) <
+                                                       INET6_ADDRSTRLEN) {
                                vol->UNCip = value;
                        } else {
                                printk(KERN_WARNING "CIFS: ip address "
@@ -1092,17 +1096,17 @@ cifs_parse_mount_options(char *options, const char *devname,
                                return 1;
                        }
                } else if (strnicmp(data, "uid", 3) == 0) {
-                       if (value && *value) {
+                       if (value && *value)
                                vol->linux_uid =
                                        simple_strtoul(value, &value, 0);
+               } else if (strnicmp(data, "forceuid", 8) == 0) {
                                vol->override_uid = 1;
-                       }
                } else if (strnicmp(data, "gid", 3) == 0) {
-                       if (value && *value) {
+                       if (value && *value)
                                vol->linux_gid =
                                        simple_strtoul(value, &value, 0);
+               } else if (strnicmp(data, "forcegid", 8) == 0) {
                                vol->override_gid = 1;
-                       }
                } else if (strnicmp(data, "file_mode", 4) == 0) {
                        if (value && *value) {
                                vol->file_mode =
@@ -1315,16 +1319,6 @@ cifs_parse_mount_options(char *options, const char *devname,
                        vol->direct_io = 1;
                } else if (strnicmp(data, "forcedirectio", 13) == 0) {
                        vol->direct_io = 1;
-               } else if (strnicmp(data, "in6_addr", 8) == 0) {
-                       if (!value || !*value) {
-                               vol->in6_addr = NULL;
-                       } else if (strnlen(value, 49) == 48) {
-                               vol->in6_addr = value;
-                       } else {
-                               printk(KERN_WARNING "CIFS: ip v6 address not "
-                                                   "48 characters long\n");
-                               return 1;
-                       }
                } else if (strnicmp(data, "noac", 4) == 0) {
                        printk(KERN_WARNING "CIFS: Mount option noac not "
                                "supported. Instead set "
index 302ea15f02e611fb2d254e68740cb8e1cbdd81c2..06866841b97f1b9993ea3dd577f3f4478d06880e 100644 (file)
@@ -241,7 +241,7 @@ static inline int cifs_open_inode_helper(struct inode *inode, struct file *file,
        /* BB need same check in cifs_create too? */
        /* if not oplocked, invalidate inode pages if mtime or file
           size changed */
-       temp = cifs_NTtimeToUnix(le64_to_cpu(buf->LastWriteTime));
+       temp = cifs_NTtimeToUnix(buf->LastWriteTime);
        if (timespec_equal(&file->f_path.dentry->d_inode->i_mtime, &temp) &&
                           (file->f_path.dentry->d_inode->i_size ==
                            (loff_t)le64_to_cpu(buf->EndOfFile))) {
index 9c869a6dcba18cb496085625386922d8521ef513..fad882b075ba50358b113e58808b9845f6baca4c 100644 (file)
@@ -85,10 +85,10 @@ static void cifs_unix_info_to_inode(struct inode *inode,
        __u64 num_of_bytes = le64_to_cpu(info->NumOfBytes);
        __u64 end_of_file = le64_to_cpu(info->EndOfFile);
 
-       inode->i_atime = cifs_NTtimeToUnix(le64_to_cpu(info->LastAccessTime));
+       inode->i_atime = cifs_NTtimeToUnix(info->LastAccessTime);
        inode->i_mtime =
-               cifs_NTtimeToUnix(le64_to_cpu(info->LastModificationTime));
-       inode->i_ctime = cifs_NTtimeToUnix(le64_to_cpu(info->LastStatusChange));
+               cifs_NTtimeToUnix(info->LastModificationTime);
+       inode->i_ctime = cifs_NTtimeToUnix(info->LastStatusChange);
        inode->i_mode = le64_to_cpu(info->Permissions);
 
        /*
@@ -554,14 +554,11 @@ int cifs_get_inode_info(struct inode **pinode,
 
        /* Linux can not store file creation time so ignore it */
        if (pfindData->LastAccessTime)
-               inode->i_atime = cifs_NTtimeToUnix
-                       (le64_to_cpu(pfindData->LastAccessTime));
+               inode->i_atime = cifs_NTtimeToUnix(pfindData->LastAccessTime);
        else /* do not need to use current_fs_time - time not stored */
                inode->i_atime = CURRENT_TIME;
-       inode->i_mtime =
-                   cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
-       inode->i_ctime =
-           cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+       inode->i_mtime = cifs_NTtimeToUnix(pfindData->LastWriteTime);
+       inode->i_ctime = cifs_NTtimeToUnix(pfindData->ChangeTime);
        cFYI(DBG2, ("Attributes came in as 0x%x", attr));
        if (adjustTZ && (pTcon->ses) && (pTcon->ses->server)) {
                inode->i_ctime.tv_sec += pTcon->ses->server->timeAdj;
@@ -629,7 +626,7 @@ int cifs_get_inode_info(struct inode **pinode,
        /* fill in 0777 bits from ACL */
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) {
                cFYI(1, ("Getting mode bits from ACL"));
-               acl_to_uid_mode(inode, full_path, pfid);
+               acl_to_uid_mode(cifs_sb, inode, full_path, pfid);
        }
 #endif
        if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) {
@@ -699,7 +696,7 @@ char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb)
 }
 
 /* gets root inode */
-struct inode *cifs_iget(struct super_block *sb, unsigned long ino)
+struct inode *cifs_root_iget(struct super_block *sb, unsigned long ino)
 {
        int xid;
        struct cifs_sb_info *cifs_sb;
index e2fe998989a381ddc8625fde5334fae628f1dacd..32d6baa0a54fa907963d976c56d75c09d301e055 100644 (file)
@@ -853,12 +853,12 @@ smbCalcSize_LE(struct smb_hdr *ptr)
 
 #define NTFS_TIME_OFFSET ((u64)(369*365 + 89) * 24 * 3600 * 10000000)
 
-    /*
    * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
    * into Unix UTC (based 1970-01-01, in seconds).
    */
+/*
+ * Convert the NT UTC (based 1601-01-01, in hundred nanosecond units)
+ * into Unix UTC (based 1970-01-01, in seconds).
+ */
 struct timespec
-cifs_NTtimeToUnix(u64 ntutc)
+cifs_NTtimeToUnix(__le64 ntutc)
 {
        struct timespec ts;
        /* BB what about the timezone? BB */
@@ -866,7 +866,7 @@ cifs_NTtimeToUnix(u64 ntutc)
        /* Subtract the NTFS time offset, then convert to 1s intervals. */
        u64 t;
 
-       t = ntutc - NTFS_TIME_OFFSET;
+       t = le64_to_cpu(ntutc) - NTFS_TIME_OFFSET;
        ts.tv_nsec = do_div(t, 10000000) * 100;
        ts.tv_sec = t;
        return ts;
@@ -883,16 +883,12 @@ cifs_UnixTimeToNT(struct timespec t)
 static int total_days_of_prev_months[] =
 {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334};
 
-
-__le64 cnvrtDosCifsTm(__u16 date, __u16 time)
-{
-       return cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm(date, time)));
-}
-
-struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
+struct timespec cnvrtDosUnixTm(__le16 le_date, __le16 le_time, int offset)
 {
        struct timespec ts;
        int sec, min, days, month, year;
+       u16 date = le16_to_cpu(le_date);
+       u16 time = le16_to_cpu(le_time);
        SMB_TIME *st = (SMB_TIME *)&time;
        SMB_DATE *sd = (SMB_DATE *)&date;
 
@@ -933,7 +929,7 @@ struct timespec cnvrtDosUnixTm(__u16 date, __u16 time)
                days -= ((year & 0x03) == 0) && (month < 2 ? 1 : 0);
        sec += 24 * 60 * 60 * days;
 
-       ts.tv_sec = sec;
+       ts.tv_sec = sec + offset;
 
        /* cFYI(1,("sec after cnvrt dos to unix time %d",sec)); */
 
index 964e097c82035b75f865275f3f7c6b45b3c69a63..86d0055dc529906df58a3dfddbc4b5601af08d0f 100644 (file)
@@ -115,17 +115,6 @@ construct_dentry(struct qstr *qstring, struct file *file,
        return rc;
 }
 
-static void AdjustForTZ(struct cifsTconInfo *tcon, struct inode *inode)
-{
-       if ((tcon) && (tcon->ses) && (tcon->ses->server)) {
-               inode->i_ctime.tv_sec += tcon->ses->server->timeAdj;
-               inode->i_mtime.tv_sec += tcon->ses->server->timeAdj;
-               inode->i_atime.tv_sec += tcon->ses->server->timeAdj;
-       }
-       return;
-}
-
-
 static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                          char *buf, unsigned int *pobject_type, int isNewInode)
 {
@@ -150,26 +139,25 @@ static void fill_in_inode(struct inode *tmp_inode, int new_buf_type,
                allocation_size = le64_to_cpu(pfindData->AllocationSize);
                end_of_file = le64_to_cpu(pfindData->EndOfFile);
                tmp_inode->i_atime =
-                     cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+                       cifs_NTtimeToUnix(pfindData->LastAccessTime);
                tmp_inode->i_mtime =
-                     cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastWriteTime));
+                       cifs_NTtimeToUnix(pfindData->LastWriteTime);
                tmp_inode->i_ctime =
-                     cifs_NTtimeToUnix(le64_to_cpu(pfindData->ChangeTime));
+                       cifs_NTtimeToUnix(pfindData->ChangeTime);
        } else { /* legacy, OS2 and DOS style */
-/*             struct timespec ts;*/
+               int offset = cifs_sb->tcon->ses->server->timeAdj;
                FIND_FILE_STANDARD_INFO *pfindData =
                        (FIND_FILE_STANDARD_INFO *)buf;
 
-               tmp_inode->i_mtime = cnvrtDosUnixTm(
-                               le16_to_cpu(pfindData->LastWriteDate),
-                               le16_to_cpu(pfindData->LastWriteTime));
-               tmp_inode->i_atime = cnvrtDosUnixTm(
-                               le16_to_cpu(pfindData->LastAccessDate),
-                               le16_to_cpu(pfindData->LastAccessTime));
-               tmp_inode->i_ctime = cnvrtDosUnixTm(
-                               le16_to_cpu(pfindData->LastWriteDate),
-                               le16_to_cpu(pfindData->LastWriteTime));
-               AdjustForTZ(cifs_sb->tcon, tmp_inode);
+               tmp_inode->i_mtime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+                                                   pfindData->LastWriteTime,
+                                                   offset);
+               tmp_inode->i_atime = cnvrtDosUnixTm(pfindData->LastAccessDate,
+                                                   pfindData->LastAccessTime,
+                                                   offset);
+               tmp_inode->i_ctime = cnvrtDosUnixTm(pfindData->LastWriteDate,
+                                                   pfindData->LastWriteTime,
+                                                   offset);
                attr = le16_to_cpu(pfindData->Attributes);
                allocation_size = le32_to_cpu(pfindData->AllocationSize);
                end_of_file = le32_to_cpu(pfindData->DataSize);
@@ -331,11 +319,11 @@ static void unix_fill_in_inode(struct inode *tmp_inode,
        local_size  = tmp_inode->i_size;
 
        tmp_inode->i_atime =
-           cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastAccessTime));
+           cifs_NTtimeToUnix(pfindData->LastAccessTime);
        tmp_inode->i_mtime =
-           cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastModificationTime));
+           cifs_NTtimeToUnix(pfindData->LastModificationTime);
        tmp_inode->i_ctime =
-           cifs_NTtimeToUnix(le64_to_cpu(pfindData->LastStatusChange));
+           cifs_NTtimeToUnix(pfindData->LastStatusChange);
 
        tmp_inode->i_mode = le64_to_cpu(pfindData->Permissions);
        /* since we set the inode type below we need to mask off type
index 681ed81e6be03a44d4d6722441513c9b0fac621e..bb2a9b2e81738819491a7ae23e0c66be36abc39f 100644 (file)
@@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename,
        if (!bprm)
                goto out_files;
 
-       retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+       retval = mutex_lock_interruptible(&current->cred_guard_mutex);
        if (retval < 0)
                goto out_free;
        current->in_execve = 1;
@@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
-       mutex_unlock(&current->cred_exec_mutex);
+       mutex_unlock(&current->cred_guard_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
        if (displaced)
@@ -1573,7 +1573,7 @@ out_unmark:
 
 out_unlock:
        current->in_execve = 0;
-       mutex_unlock(&current->cred_exec_mutex);
+       mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
        free_bprm(bprm);
index c68edb969441c2f199c5b1475ec755ebdbc0fe62..9b1d285f9fe6eb069c9fc62d02beea8df018cffc 100644 (file)
@@ -557,8 +557,10 @@ static int __init init_devpts_fs(void)
        int err = register_filesystem(&devpts_fs_type);
        if (!err) {
                devpts_mnt = kern_mount(&devpts_fs_type);
-               if (IS_ERR(devpts_mnt))
+               if (IS_ERR(devpts_mnt)) {
                        err = PTR_ERR(devpts_mnt);
+                       unregister_filesystem(&devpts_fs_type);
+               }
        }
        return err;
 }
index 895823d0149d9cfbd75bb34b670662dafb7f9ef6..a7fcd975c6b264d534f63f7958cc8918027010ae 100644 (file)
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1016,7 +1016,7 @@ void install_exec_creds(struct linux_binprm *bprm)
        commit_creds(bprm->cred);
        bprm->cred = NULL;
 
-       /* cred_exec_mutex must be held at least to this point to prevent
+       /* cred_guard_mutex must be held at least to this point to prevent
         * ptrace_attach() from altering our determination of the task's
         * credentials; any time after this it may be unlocked */
 
@@ -1026,7 +1026,7 @@ EXPORT_SYMBOL(install_exec_creds);
 
 /*
  * determine how safe it is to execute the proposed program
- * - the caller must hold current->cred_exec_mutex to protect against
+ * - the caller must hold current->cred_guard_mutex to protect against
  *   PTRACE_ATTACH
  */
 int check_unsafe_exec(struct linux_binprm *bprm)
@@ -1268,7 +1268,7 @@ int do_execve(char * filename,
        if (!bprm)
                goto out_files;
 
-       retval = mutex_lock_interruptible(&current->cred_exec_mutex);
+       retval = mutex_lock_interruptible(&current->cred_guard_mutex);
        if (retval < 0)
                goto out_free;
        current->in_execve = 1;
@@ -1331,7 +1331,7 @@ int do_execve(char * filename,
        /* execve succeeded */
        current->fs->in_exec = 0;
        current->in_execve = 0;
-       mutex_unlock(&current->cred_exec_mutex);
+       mutex_unlock(&current->cred_guard_mutex);
        acct_update_integrals(current);
        free_bprm(bprm);
        if (displaced)
@@ -1354,7 +1354,7 @@ out_unmark:
 
 out_unlock:
        current->in_execve = 0;
-       mutex_unlock(&current->cred_exec_mutex);
+       mutex_unlock(&current->cred_guard_mutex);
 
 out_free:
        free_bprm(bprm);
index 5c4afe652245a9d045825f26f3ad3dbe0a06e13a..e3c748faf2dbcd14ceafb13c6353479e6b072eb9 100644 (file)
@@ -1093,6 +1093,7 @@ failed_mount:
        brelse(bh);
 failed_sbi:
        sb->s_fs_info = NULL;
+       kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        return ret;
 }
index 599dbfe504c39ef8370dd28a4c65aa05346499b4..d8b73d4abe3ebeaa3d993a9e4c5f7e33410ca83d 100644 (file)
@@ -2021,6 +2021,7 @@ failed_mount:
        brelse(bh);
 out_fail:
        sb->s_fs_info = NULL;
+       kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
        return ret;
index a8ff003a00f70b8e09958fcdfa3491c0a4bc5a8f..8a34710ecf40ef1f3577be07f13c9086e5ea888d 100644 (file)
@@ -5,8 +5,8 @@
 obj-$(CONFIG_EXT4_FS) += ext4.o
 
 ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \
-                  ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
-                  ext4_jbd2.o migrate.o mballoc.o
+               ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \
+               ext4_jbd2.o migrate.o mballoc.o block_validity.o
 
 ext4-$(CONFIG_EXT4_FS_XATTR)           += xattr.o xattr_user.o xattr_trusted.o
 ext4-$(CONFIG_EXT4_FS_POSIX_ACL)       += acl.o
index 53c72ad85877314c16c0eef0f19fb3889a6743da..e2126d70dff5bd5fce57506849a91ca1f0a2052e 100644 (file)
@@ -19,7 +19,6 @@
 #include <linux/buffer_head.h>
 #include "ext4.h"
 #include "ext4_jbd2.h"
-#include "group.h"
 #include "mballoc.h"
 
 /*
@@ -88,6 +87,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 ext4_group_t block_group, struct ext4_group_desc *gdp)
 {
        int bit, bit_max;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned free_blocks, group_blocks;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
@@ -123,7 +123,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                bit_max += ext4_bg_num_gdb(sb, block_group);
        }
 
-       if (block_group == sbi->s_groups_count - 1) {
+       if (block_group == ngroups - 1) {
                /*
                 * Even though mke2fs always initialize first and last group
                 * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need
@@ -131,7 +131,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
                 */
                group_blocks = ext4_blocks_count(sbi->s_es) -
                        le32_to_cpu(sbi->s_es->s_first_data_block) -
-                       (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1));
+                       (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1));
        } else {
                group_blocks = EXT4_BLOCKS_PER_GROUP(sb);
        }
@@ -205,18 +205,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb,
 {
        unsigned int group_desc;
        unsigned int offset;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
-       if (block_group >= sbi->s_groups_count) {
+       if (block_group >= ngroups) {
                ext4_error(sb, "ext4_get_group_desc",
                           "block_group >= groups_count - "
                           "block_group = %u, groups_count = %u",
-                          block_group, sbi->s_groups_count);
+                          block_group, ngroups);
 
                return NULL;
        }
-       smp_rmb();
 
        group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb);
        offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1);
@@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
-       spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                ext4_init_block_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-               spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+               ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
-       spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
        down_write(&grp->alloc_sem);
        for (i = 0, blocks_freed = 0; i < count; i++) {
                BUFFER_TRACE(bitmap_bh, "clear bit");
-               if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group),
+               if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
                                                bit + i, bitmap_bh->b_data)) {
                        ext4_error(sb, __func__,
                                   "bit already cleared for block %llu",
@@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
                        blocks_freed++;
                }
        }
-       spin_lock(sb_bgl_lock(sbi, block_group));
+       ext4_lock_group(sb, block_group);
        blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc);
        ext4_free_blks_set(sb, desc, blk_free_count);
        desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc);
-       spin_unlock(sb_bgl_lock(sbi, block_group));
+       ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed);
 
        if (sbi->s_log_groups_per_flex) {
@@ -665,7 +665,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        ext4_fsblk_t desc_count;
        struct ext4_group_desc *gdp;
        ext4_group_t i;
-       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        ext4_fsblk_t bitmap_count;
@@ -677,7 +677,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        bitmap_count = 0;
        gdp = NULL;
 
-       smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
@@ -700,7 +699,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb)
        return bitmap_count;
 #else
        desc_count = 0;
-       smp_rmb();
        for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c
new file mode 100644 (file)
index 0000000..50784ef
--- /dev/null
@@ -0,0 +1,244 @@
+/*
+ *  linux/fs/ext4/block_validity.c
+ *
+ * Copyright (C) 2009
+ * Theodore Ts'o (tytso@mit.edu)
+ *
+ * Track which blocks in the filesystem are metadata blocks that
+ * should never be used as data blocks by files or directories.
+ */
+
+#include <linux/time.h>
+#include <linux/fs.h>
+#include <linux/namei.h>
+#include <linux/quotaops.h>
+#include <linux/buffer_head.h>
+#include <linux/module.h>
+#include <linux/swap.h>
+#include <linux/pagemap.h>
+#include <linux/version.h>
+#include <linux/blkdev.h>
+#include <linux/mutex.h>
+#include "ext4.h"
+
+/*
+ * One contiguous run of metadata ("system zone") blocks.  Zones are kept
+ * as nodes of the rbtree rooted at sbi->system_blks, which the code in
+ * this file searches and orders by start_blk.
+ */
+struct ext4_system_zone {
+       struct rb_node  node;           /* linkage into sbi->system_blks */
+       ext4_fsblk_t    start_blk;      /* first block of the run */
+       unsigned int    count;          /* number of blocks in the run */
+};
+
+static struct kmem_cache *ext4_system_zone_cachep;
+
+/*
+ * Create the slab cache that backs struct ext4_system_zone allocations.
+ * Returns 0 on success, or -ENOMEM if the cache could not be created.
+ */
+int __init init_ext4_system_zone(void)
+{
+       struct kmem_cache *cachep;
+
+       cachep = KMEM_CACHE(ext4_system_zone, SLAB_RECLAIM_ACCOUNT);
+       ext4_system_zone_cachep = cachep;
+
+       return cachep ? 0 : -ENOMEM;
+}
+
+/*
+ * Counterpart of init_ext4_system_zone(): destroy the slab cache used
+ * for struct ext4_system_zone objects.
+ */
+void exit_ext4_system_zone(void)
+{
+       kmem_cache_destroy(ext4_system_zone_cachep);
+}
+
+/*
+ * Two zones can be merged when entry2 begins at the block immediately
+ * following the last block of entry1.  Returns 1 if so, 0 otherwise.
+ */
+static inline int can_merge(struct ext4_system_zone *entry1,
+                    struct ext4_system_zone *entry2)
+{
+       ext4_fsblk_t end_of_first = entry1->start_blk + entry1->count;
+
+       return end_of_first == entry2->start_blk;
+}
+
+/*
+ * Mark a range of blocks as belonging to the "system zone" --- that
+ * is, filesystem metadata blocks which should never be used by
+ * inodes.
+ *
+ * The range [start_blk, start_blk + count) is recorded in the rbtree
+ * sbi->system_blks.  If start_blk falls inside an existing zone, that
+ * zone is extended when the new range runs past its end; otherwise a
+ * new zone is allocated and inserted.  The resulting zone is then
+ * merged with its left and right neighbours when they are exactly
+ * adjacent (see can_merge()).
+ *
+ * Returns 0 on success, or -ENOMEM if a new zone cannot be allocated.
+ *
+ * NOTE(review): a range that starts before an existing zone but runs
+ * into it takes the rb_left branch below and appears to end up as a
+ * separate, overlapping node --- confirm callers never pass such ranges.
+ */
+static int add_system_zone(struct ext4_sb_info *sbi,
+                          ext4_fsblk_t start_blk,
+                          unsigned int count)
+{
+       struct ext4_system_zone *new_entry = NULL, *entry;
+       struct rb_node **n = &sbi->system_blks.rb_node, *node;
+       struct rb_node *parent = NULL, *new_node = NULL;
+
+       /*
+        * Descend by start_blk until we either reach an empty link (the
+        * insertion point) or find a zone that already contains start_blk.
+        */
+       while (*n) {
+               parent = *n;
+               entry = rb_entry(parent, struct ext4_system_zone, node);
+               if (start_blk < entry->start_blk)
+                       n = &(*n)->rb_left;
+               else if (start_blk >= (entry->start_blk + entry->count))
+                       n = &(*n)->rb_right;
+               else {
+                       /* start_blk is inside this zone; grow it if needed */
+                       if (start_blk + count > (entry->start_blk + 
+                                                entry->count))
+                               entry->count = (start_blk + count - 
+                                               entry->start_blk);
+                       new_node = *n;
+                       new_entry = rb_entry(new_node, struct ext4_system_zone,
+                                            node);
+                       break;
+               }
+       }
+
+       /* No existing zone contained start_blk: allocate and link a new one */
+       if (!new_entry) {
+               new_entry = kmem_cache_alloc(ext4_system_zone_cachep,
+                                            GFP_KERNEL);
+               if (!new_entry)
+                       return -ENOMEM;
+               new_entry->start_blk = start_blk;
+               new_entry->count = count;
+               new_node = &new_entry->node;
+
+               rb_link_node(new_node, parent, n);
+               rb_insert_color(new_node, &sbi->system_blks);
+       }
+
+       /* Can we merge to the left? */
+       node = rb_prev(new_node);
+       if (node) {
+               entry = rb_entry(node, struct ext4_system_zone, node);
+               if (can_merge(entry, new_entry)) {
+                       /* absorb the predecessor into new_entry, then drop it */
+                       new_entry->start_blk = entry->start_blk;
+                       new_entry->count += entry->count;
+                       rb_erase(node, &sbi->system_blks);
+                       kmem_cache_free(ext4_system_zone_cachep, entry);
+               }
+       }
+
+       /* Can we merge to the right? */
+       node = rb_next(new_node);
+       if (node) {
+               entry = rb_entry(node, struct ext4_system_zone, node);
+               if (can_merge(new_entry, entry)) {
+                       /* absorb the successor into new_entry, then drop it */
+                       new_entry->count += entry->count;
+                       rb_erase(node, &sbi->system_blks);
+                       kmem_cache_free(ext4_system_zone_cachep, entry);
+               }
+       }
+       return 0;
+}
+
+/*
+ * Print every system zone to the kernel log as a comma-separated list
+ * of "first-last" block ranges, walking the tree from rb_first() via
+ * rb_next().
+ */
+static void debug_print_tree(struct ext4_sb_info *sbi)
+{
+       struct ext4_system_zone *zone;
+       struct rb_node *cur;
+       const char *sep = "";
+
+       printk(KERN_INFO "System zones: ");
+       for (cur = rb_first(&sbi->system_blks); cur; cur = rb_next(cur)) {
+               zone = rb_entry(cur, struct ext4_system_zone, node);
+               printk("%s%llu-%llu", sep,
+                      zone->start_blk, zone->start_blk + zone->count - 1);
+               /* every range after the first is preceded by a separator */
+               sep = ", ";
+       }
+       printk("\n");
+}
+
+/*
+ * Build the tree of system zones for a filesystem.
+ *
+ * When the BLOCK_VALIDITY mount option is off, any existing tree is
+ * released and 0 is returned.  When a tree already exists, it is left
+ * untouched and 0 is returned.  Otherwise every block group contributes
+ * its block bitmap, inode bitmap and inode table, and the groups
+ * selected by the ext4_bg_has_super() test below also contribute their
+ * first s_gdb_count + 1 blocks.
+ *
+ * Returns 0 on success, or the error returned by add_system_zone().
+ * On failure the partially built tree is released, so that a later
+ * call does not mistake an incomplete tree for a finished one.
+ */
+int ext4_setup_system_zone(struct super_block *sb)
+{
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
+       struct ext4_sb_info *sbi = EXT4_SB(sb);
+       struct ext4_group_desc *gdp;
+       ext4_group_t i;
+       int flex_size = ext4_flex_bg_size(sbi);
+       int ret;
+
+       if (!test_opt(sb, BLOCK_VALIDITY)) {
+               if (sbi->system_blks.rb_node)
+                       ext4_release_system_zone(sb);
+               return 0;
+       }
+       /* a non-empty tree means the zones were already set up */
+       if (sbi->system_blks.rb_node)
+               return 0;
+
+       for (i = 0; i < ngroups; i++) {
+               if (ext4_bg_has_super(sb, i) &&
+                   ((i < 5) || ((i % flex_size) == 0))) {
+                       /* this result used to be dropped on the floor */
+                       ret = add_system_zone(sbi,
+                                       ext4_group_first_block_no(sb, i),
+                                       sbi->s_gdb_count + 1);
+                       if (ret)
+                               goto err;
+               }
+               gdp = ext4_get_group_desc(sb, i, NULL);
+               ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1);
+               if (ret)
+                       goto err;
+               ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1);
+               if (ret)
+                       goto err;
+               ret = add_system_zone(sbi, ext4_inode_table(sb, gdp),
+                               sbi->s_itb_per_group);
+               if (ret)
+                       goto err;
+       }
+
+       if (test_opt(sb, DEBUG))
+               debug_print_tree(sbi);
+       return 0;
+
+err:
+       /* do not leave a half-populated tree behind */
+       ext4_release_system_zone(sb);
+       return ret;
+}
+
+/*
+ * Free every node of the system-zone tree and leave the tree empty.
+ *
+ * Called when the filesystem is unmounted; ext4_setup_system_zone()
+ * also calls it when the BLOCK_VALIDITY option is off but a tree
+ * exists.  Calling it on an empty tree does nothing beyond resetting
+ * the root to NULL.
+ *
+ * The tree is torn down leaf-first without rebalancing: the walk only
+ * follows child and parent links and clears the parent's link to each
+ * freed node.
+ */
+void ext4_release_system_zone(struct super_block *sb)
+{
+       struct rb_node  *n = EXT4_SB(sb)->system_blks.rb_node;
+       struct rb_node  *parent;
+       struct ext4_system_zone *entry;
+
+       while (n) {
+               /* Do the node's children first */
+               if (n->rb_left) {
+                       n = n->rb_left;
+                       continue;
+               }
+               if (n->rb_right) {
+                       n = n->rb_right;
+                       continue;
+               }
+               /*
+                * The node has no children; free it, and then zero
+                * out parent's link to it.  Finally go to the
+                * beginning of the loop and try to free the parent
+                * node.
+                */
+               /* read the parent before the node's memory is freed */
+               parent = rb_parent(n);
+               entry = rb_entry(n, struct ext4_system_zone, node);
+               kmem_cache_free(ext4_system_zone_cachep, entry);
+               /* n is only compared by address below, never dereferenced */
+               if (!parent)
+                       EXT4_SB(sb)->system_blks.rb_node = NULL;
+               else if (parent->rb_left == n)
+                       parent->rb_left = NULL;
+               else if (parent->rb_right == n)
+                       parent->rb_right = NULL;
+               n = parent;
+       }
+       EXT4_SB(sb)->system_blks.rb_node = NULL;
+}
+
+/*
+ * Returns 1 if the passed-in block region [start_blk,
+ * start_blk+count) is valid; 0 if the region starts at or before the
+ * first data block, wraps around the block number space, runs past
+ * the end of the filesystem, or overlaps with filesystem metadata
+ * blocks recorded in sbi->system_blks.
+ */
+int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk,
+                         unsigned int count)
+{
+       struct ext4_system_zone *entry;
+       struct rb_node *n = sbi->system_blks.rb_node;
+
+       /*
+        * The middle test rejects a range whose end wraps around; without
+        * it such a range would pass the upper-bound comparison.
+        */
+       if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) ||
+           (start_blk + count < start_blk) ||
+           (start_blk + count > ext4_blocks_count(sbi->s_es)))
+               return 0;
+       while (n) {
+               entry = rb_entry(n, struct ext4_system_zone, node);
+               if (start_blk + count - 1 < entry->start_blk)
+                       /* range ends before this zone begins */
+                       n = n->rb_left;
+               else if (start_blk >= (entry->start_blk + entry->count))
+                       /* range begins after this zone ends */
+                       n = n->rb_right;
+               else
+                       /* range intersects a metadata zone */
+                       return 0;
+       }
+       return 1;
+}
+
index b64789929a65bc00243e9b10298f056aadaeb306..9dc93168e2623ae09d26b1c7287f4fc30975e077 100644 (file)
@@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp,
                struct buffer_head *bh = NULL;
 
                map_bh.b_state = 0;
-               err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh,
-                                               0, 0, 0);
+               err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0);
                if (err > 0) {
                        pgoff_t index = map_bh.b_blocknr >>
                                        (PAGE_CACHE_SHIFT - inode->i_blkbits);
index d0f15ef56de1b1b6b325d0a86207e78260b1f40a..cc7d5edc38c904646469094191e4a41f305f8d5b 100644 (file)
 #include <linux/magic.h>
 #include <linux/jbd2.h>
 #include <linux/quota.h>
-#include "ext4_i.h"
+#include <linux/rwsem.h>
+#include <linux/rbtree.h>
+#include <linux/seqlock.h>
+#include <linux/mutex.h>
+#include <linux/timer.h>
+#include <linux/wait.h>
+#include <linux/blockgroup_lock.h>
+#include <linux/percpu_counter.h>
 
 /*
  * The fourth extended filesystem constants/structures
 #define ext4_debug(f, a...)    do {} while (0)
 #endif
 
+/* data type for a block offset within a block group */
+typedef int ext4_grpblk_t;
+
+/* data type for a filesystem-wide block number */
+typedef unsigned long long ext4_fsblk_t;
+
+/* data type for a logical block number within a file */
+typedef __u32 ext4_lblk_t;
+
+/* data type for a block group number */
+typedef unsigned int ext4_group_t;
+
+
 /* prefer goal again. length */
 #define EXT4_MB_HINT_MERGE             1
 /* blocks already reserved */
@@ -179,9 +199,6 @@ struct flex_groups {
 #define EXT4_BG_BLOCK_UNINIT   0x0002 /* Block bitmap not in use */
 #define EXT4_BG_INODE_ZEROED   0x0004 /* On-disk itable initialized to zero */
 
-#ifdef __KERNEL__
-#include "ext4_sb.h"
-#endif
 /*
  * Macro-instructions used to manage group descriptors
  */
@@ -297,10 +314,23 @@ struct ext4_new_group_data {
 };
 
 /*
- * Following is used by preallocation code to tell get_blocks() that we
- * want uninitialzed extents.
+ * Flags used by ext4_get_blocks()
  */
-#define EXT4_CREATE_UNINITIALIZED_EXT          2
+       /* Allocate any needed blocks and/or convert an uninitialized
+          extent to be an initialized extent */
+#define EXT4_GET_BLOCKS_CREATE                 0x0001
+       /* Request the creation of an uninitialized extent */
+#define EXT4_GET_BLOCKS_UNINIT_EXT             0x0002
+#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT      (EXT4_GET_BLOCKS_UNINIT_EXT|\
+                                                EXT4_GET_BLOCKS_CREATE)
+       /* Caller is from the delayed allocation writeout path,
+          so set the magic i_delalloc_reserved_flag after taking the
+          inode allocation semaphore */
+#define EXT4_GET_BLOCKS_DELALLOC_RESERVE       0x0004
+       /* Call ext4_da_update_reserve_space() after successfully
+          allocating the blocks */
+#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE   0x0008
+
 
 /*
  * ioctl commands
@@ -515,6 +545,110 @@ do {                                                                             \
 
 #endif /* defined(__KERNEL__) || defined(__linux__) */
 
+/*
+ * storage for a single cached extent (embedded in struct
+ * ext4_inode_info as i_cached_extent)
+ */
+struct ext4_ext_cache {
+       ext4_fsblk_t    ec_start;       /* filesystem-wide block number; presumably the extent's first physical block -- TODO confirm */
+       ext4_lblk_t     ec_block;       /* file logical block number; presumably the extent's first logical block -- TODO confirm */
+       __u32           ec_len; /* must be 32bit to return holes */
+       __u32           ec_type;        /* kind of cache entry; values are defined outside this hunk */
+};
+
+/*
+ * fourth extended file system inode data in memory
+ */
+struct ext4_inode_info {
+       __le32  i_data[15];     /* unconverted (kept little-endian, per the __le32 type) */
+       __u32   i_flags;
+       ext4_fsblk_t    i_file_acl;
+       __u32   i_dtime;
+
+       /*
+        * i_block_group is the number of the block group which contains
+        * this file's inode.  Constant across the lifetime of the inode,
+        * it is used for making block allocation decisions - we try to
+        * place a file's data blocks near its inode block, and new inodes
+        * near to their parent directory's inode.
+        */
+       ext4_group_t    i_block_group;
+       __u32   i_state;                /* Dynamic state flags for ext4 */
+
+       ext4_lblk_t             i_dir_start_lookup;
+#ifdef CONFIG_EXT4_FS_XATTR
+       /*
+        * Extended attributes can be read independently of the main file
+        * data. Taking i_mutex even when reading would cause contention
+        * between readers of EAs and writers of regular file data, so
+        * instead we synchronize on xattr_sem when reading or changing
+        * EAs.
+        */
+       struct rw_semaphore xattr_sem;
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+       struct posix_acl        *i_acl;
+       struct posix_acl        *i_default_acl;
+#endif
+
+       struct list_head i_orphan;      /* unlinked but open inodes */
+
+       /*
+        * i_disksize keeps track of what the inode size is ON DISK, not
+        * in memory.  During truncate, i_size is set to the new size by
+        * the VFS prior to calling ext4_truncate(), but the filesystem won't
+        * set i_disksize to 0 until the truncate is actually under way.
+        *
+        * The intent is that i_disksize always represents the blocks which
+        * are used by this file.  This allows recovery to restart truncate
+        * on orphans if we crash during truncate.  We actually write i_disksize
+        * into the on-disk inode when writing inodes out, instead of i_size.
+        *
+        * The only time when i_disksize and i_size may be different is when
+        * a truncate is in progress.  The only things which change i_disksize
+        * are ext4_get_block (growth) and ext4_truncate (shrinkth).
+        */
+       loff_t  i_disksize;
+
+       /*
+        * i_data_sem is for serialising ext4_truncate() against
+        * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
+        * data tree are chopped off during truncate. We can't do that in
+        * ext4 because whenever we perform intermediate commits during
+        * truncate, the inode and all the metadata blocks *must* be in a
+        * consistent state which allows truncation of the orphans to restart
+        * during recovery.  Hence we must fix the get_block-vs-truncate race
+        * by other means, so we have i_data_sem.
+        */
+       struct rw_semaphore i_data_sem;
+       struct inode vfs_inode; /* generic VFS inode, embedded by value */
+       struct jbd2_inode jinode;       /* jbd2 per-inode structure, embedded by value */
+
+       struct ext4_ext_cache i_cached_extent;
+       /*
+        * File creation time. Its function is same as that of
+        * struct timespec i_{a,c,m}time in the generic inode.
+        */
+       struct timespec i_crtime;
+
+       /* mballoc */
+       struct list_head i_prealloc_list;
+       spinlock_t i_prealloc_lock;     /* presumably guards i_prealloc_list -- TODO confirm */
+
+       /* ialloc */
+       ext4_group_t    i_last_alloc_group;
+
+       /* allocation reservation info for delalloc */
+       unsigned int i_reserved_data_blocks;
+       unsigned int i_reserved_meta_blocks;
+       unsigned int i_allocated_meta_blocks;
+       unsigned short i_delalloc_reserved_flag;        /* NOTE(review): presumably what EXT4_GET_BLOCKS_DELALLOC_RESERVE asks to have set -- confirm */
+
+       /* on-disk additional length */
+       __u16 i_extra_isize;
+
+       spinlock_t i_block_reservation_lock;    /* presumably guards the i_reserved_* counters above -- TODO confirm */
+};
+
 /*
  * File system states
  */
@@ -560,6 +694,7 @@ do {                                                                               \
 #define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_DELALLOC            0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT      0x10000000 /* Abort on file data write */
+#define EXT4_MOUNT_BLOCK_VALIDITY      0x20000000 /* Block validity checking */
 
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -689,6 +824,137 @@ struct ext4_super_block {
 };
 
 #ifdef __KERNEL__
+/*
+ * fourth extended-fs super-block data in memory
+ */
+struct ext4_sb_info {
+       unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
+       unsigned long s_inodes_per_block;/* Number of inodes per block */
+       unsigned long s_blocks_per_group;/* Number of blocks in a group */
+       unsigned long s_inodes_per_group;/* Number of inodes in a group */
+       unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
+       unsigned long s_gdb_count;      /* Number of group descriptor blocks */
+       unsigned long s_desc_per_block; /* Number of group descriptors per block */
+       ext4_group_t s_groups_count;    /* Number of groups in the fs; readers go through ext4_get_groups_count() */
+       unsigned long s_overhead_last;  /* Last calculated overhead */
+       unsigned long s_blocks_last;    /* Last seen block count */
+       loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
+       struct buffer_head * s_sbh;     /* Buffer containing the super block */
+       struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
+       struct buffer_head **s_group_desc;
+       unsigned long  s_mount_opt;
+       ext4_fsblk_t s_sb_block;
+       uid_t s_resuid;
+       gid_t s_resgid;
+       unsigned short s_mount_state;
+       unsigned short s_pad;
+       int s_addr_per_block_bits;
+       int s_desc_per_block_bits;
+       int s_inode_size;
+       int s_first_ino;
+       unsigned int s_inode_readahead_blks;
+       spinlock_t s_next_gen_lock;
+       u32 s_next_generation;
+       u32 s_hash_seed[4];
+       int s_def_hash_version;
+       int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not -- NOTE(review): the field name says "unsigned"; confirm the polarity against the code that sets it */
+       struct percpu_counter s_freeblocks_counter;
+       struct percpu_counter s_freeinodes_counter;
+       struct percpu_counter s_dirs_counter;
+       struct percpu_counter s_dirtyblocks_counter;
+       struct blockgroup_lock *s_blockgroup_lock;      /* per-group spinlocks, looked up with bgl_lock_ptr() */
+       struct proc_dir_entry *s_proc;
+       struct kobject s_kobj;
+       struct completion s_kobj_unregister;
+
+       /* Journaling */
+       struct inode *s_journal_inode;
+       struct journal_s *s_journal;
+       struct list_head s_orphan;
+       struct mutex s_orphan_lock;     /* presumably protects s_orphan -- TODO confirm */
+       struct mutex s_resize_lock;     /* presumably serialises online resize -- TODO confirm */
+       unsigned long s_commit_interval;
+       u32 s_max_batch_time;
+       u32 s_min_batch_time;
+       struct block_device *journal_bdev;
+#ifdef CONFIG_JBD2_DEBUG
+       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
+       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
+#endif
+#ifdef CONFIG_QUOTA
+       char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
+       int s_jquota_fmt;                       /* Format of quota to use */
+#endif
+       unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
+       struct rb_root system_blks;     /* rb-tree of struct ext4_system_zone; searched by ext4_data_block_valid() */
+
+#ifdef EXTENTS_STATS
+       /* ext4 extents stats */
+       unsigned long s_ext_min;
+       unsigned long s_ext_max;
+       unsigned long s_depth_max;
+       spinlock_t s_ext_stats_lock;
+       unsigned long s_ext_blocks;
+       unsigned long s_ext_extents;
+#endif
+
+       /* for buddy allocator */
+       struct ext4_group_info ***s_group_info; /* two-level table, indexed as [indexv][indexh] by ext4_get_group_info() */
+       struct inode *s_buddy_cache;
+       long s_blocks_reserved;
+       spinlock_t s_reserve_lock;
+       spinlock_t s_md_lock;
+       tid_t s_last_transaction;
+       unsigned short *s_mb_offsets;
+       unsigned int *s_mb_maxs;
+
+       /* tunables */
+       unsigned long s_stripe;
+       unsigned int s_mb_stream_request;
+       unsigned int s_mb_max_to_scan;
+       unsigned int s_mb_min_to_scan;
+       unsigned int s_mb_stats;
+       unsigned int s_mb_order2_reqs;
+       unsigned int s_mb_group_prealloc;
+       /* where last allocation was done - for stream allocation */
+       unsigned long s_mb_last_group;
+       unsigned long s_mb_last_start;
+
+       /* history to debug policy */
+       struct ext4_mb_history *s_mb_history;
+       int s_mb_history_cur;
+       int s_mb_history_max;
+       int s_mb_history_num;
+       spinlock_t s_mb_history_lock;
+       int s_mb_history_filter;
+
+       /* stats for buddy allocator */
+       spinlock_t s_mb_pa_lock;
+       atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
+       atomic_t s_bal_success; /* we found long enough chunks */
+       atomic_t s_bal_allocated;       /* in blocks */
+       atomic_t s_bal_ex_scanned;      /* total extents scanned */
+       atomic_t s_bal_goals;   /* goal hits */
+       atomic_t s_bal_breaks;  /* too long searches */
+       atomic_t s_bal_2orders; /* 2^order hits */
+       spinlock_t s_bal_lock;
+       unsigned long s_mb_buddies_generated;
+       unsigned long long s_mb_generation_time;
+       atomic_t s_mb_lost_chunks;
+       atomic_t s_mb_preallocated;
+       atomic_t s_mb_discarded;
+
+       /* locality groups */
+       struct ext4_locality_group *s_locality_groups;
+
+       /* for write statistics */
+       unsigned long s_sectors_written_start;
+       u64 s_kbytes_written;
+
+       unsigned int s_log_groups_per_flex;
+       struct flex_groups *s_flex_groups;
+};
+
 static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb)
 {
        return sb->s_fs_info;
@@ -704,7 +970,6 @@ static inline struct timespec ext4_current_time(struct inode *inode)
                current_fs_time(inode->i_sb) : CURRENT_TIME_SEC;
 }
 
-
 static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
 {
        return ino == EXT4_ROOT_INO ||
@@ -1014,6 +1279,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb,
                                                    ext4_group_t block_group,
                                                    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
+struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+                                     ext4_group_t block_group);
+extern unsigned ext4_init_block_bitmap(struct super_block *sb,
+                                      struct buffer_head *bh,
+                                      ext4_group_t group,
+                                      struct ext4_group_desc *desc);
+#define ext4_free_blocks_after_init(sb, group, desc)                   \
+               ext4_init_block_bitmap(sb, NULL, group, desc)
 
 /* dir.c */
 extern int ext4_check_dir_entry(const char *, struct inode *,
@@ -1038,6 +1311,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long);
 extern unsigned long ext4_count_free_inodes(struct super_block *);
 extern unsigned long ext4_count_dirs(struct super_block *);
 extern void ext4_check_inodes_bitmap(struct super_block *);
+extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
+                                      struct buffer_head *bh,
+                                      ext4_group_t group,
+                                      struct ext4_group_desc *desc);
+extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
@@ -1123,6 +1401,8 @@ extern void ext4_abort(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
 extern void ext4_warning(struct super_block *, const char *, const char *, ...)
        __attribute__ ((format (printf, 3, 4)));
+extern void ext4_msg(struct super_block *, const char *, const char *, ...)
+       __attribute__ ((format (printf, 3, 4)));
 extern void ext4_grp_locked_error(struct super_block *, ext4_group_t,
                                const char *, const char *, ...)
        __attribute__ ((format (printf, 4, 5)));
@@ -1161,6 +1441,10 @@ extern void ext4_used_dirs_set(struct super_block *sb,
                                struct ext4_group_desc *bg, __u32 count);
 extern void ext4_itable_unused_set(struct super_block *sb,
                                   struct ext4_group_desc *bg, __u32 count);
+extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
+                                  struct ext4_group_desc *gdp);
+extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
+                                      struct ext4_group_desc *gdp);
 
 static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es)
 {
@@ -1228,6 +1512,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb,
         return grp_info[indexv][indexh];
 }
 
+/*
+ * Return the filesystem's group count.  The s_groups_count load must
+ * be followed by smp_rmb(); see the locking protocol documented in
+ * the comments of ext4_group_add() in resize.c
+ */
+static inline ext4_group_t ext4_get_groups_count(struct super_block *sb)
+{
+       const ext4_group_t groups_snapshot = EXT4_SB(sb)->s_groups_count;
+
+       smp_rmb();      /* must come after the load above */
+       return groups_snapshot;
+}
 
 static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi,
                                             ext4_group_t block_group)
@@ -1283,33 +1579,25 @@ struct ext4_group_info {
 };
 
 #define EXT4_GROUP_INFO_NEED_INIT_BIT  0
-#define EXT4_GROUP_INFO_LOCKED_BIT     1
 
 #define EXT4_MB_GRP_NEED_INIT(grp)     \
        (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state)))
 
-static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
+static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb,
+                                             ext4_group_t group)
 {
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+       return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group);
 }
 
-static inline void ext4_unlock_group(struct super_block *sb,
-                                       ext4_group_t group)
+static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group)
 {
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state));
+       spin_lock(ext4_group_lock_ptr(sb, group));
 }
 
-static inline int ext4_is_group_locked(struct super_block *sb,
+static inline void ext4_unlock_group(struct super_block *sb,
                                        ext4_group_t group)
 {
-       struct ext4_group_info *grinfo = ext4_get_group_info(sb, group);
-
-       return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT,
-                                               &(grinfo->bb_state));
+       spin_unlock(ext4_group_lock_ptr(sb, group));
 }
 
 /*
@@ -1326,11 +1614,21 @@ extern const struct file_operations ext4_file_operations;
 /* namei.c */
 extern const struct inode_operations ext4_dir_inode_operations;
 extern const struct inode_operations ext4_special_inode_operations;
+extern struct dentry *ext4_get_parent(struct dentry *child);
 
 /* symlink.c */
 extern const struct inode_operations ext4_symlink_inode_operations;
 extern const struct inode_operations ext4_fast_symlink_inode_operations;
 
+/* block_validity */
+extern void ext4_release_system_zone(struct super_block *sb);
+extern int ext4_setup_system_zone(struct super_block *sb);
+extern int __init init_ext4_system_zone(void);
+extern void exit_ext4_system_zone(void);
+extern int ext4_data_block_valid(struct ext4_sb_info *sbi,
+                                ext4_fsblk_t start_blk,
+                                unsigned int count);
+
 /* extents.c */
 extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
 extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
@@ -1338,17 +1636,15 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks,
                                       int chunk);
 extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                               ext4_lblk_t iblock, unsigned int max_blocks,
-                              struct buffer_head *bh_result,
-                              int create, int extend_disksize);
+                              struct buffer_head *bh_result, int flags);
 extern void ext4_ext_truncate(struct inode *);
 extern void ext4_ext_init(struct super_block *);
 extern void ext4_ext_release(struct super_block *);
 extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset,
                          loff_t len);
-extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode,
-                       sector_t block, unsigned int max_blocks,
-                       struct buffer_head *bh, int create,
-                       int extend_disksize, int flag);
+extern int ext4_get_blocks(handle_t *handle, struct inode *inode,
+                          sector_t block, unsigned int max_blocks,
+                          struct buffer_head *bh, int flags);
 extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                        __u64 start, __u64 len);
 
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
deleted file mode 100644 (file)
index 4ce2187..0000000
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- *  ext4_i.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_i.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_I
-#define _EXT4_I
-
-#include <linux/rwsem.h>
-#include <linux/rbtree.h>
-#include <linux/seqlock.h>
-#include <linux/mutex.h>
-
-/* data type for block offset of block group */
-typedef int ext4_grpblk_t;
-
-/* data type for filesystem-wide blocks number */
-typedef unsigned long long ext4_fsblk_t;
-
-/* data type for file logical block number */
-typedef __u32 ext4_lblk_t;
-
-/* data type for block group number */
-typedef unsigned int ext4_group_t;
-
-/*
- * storage for cached extent
- */
-struct ext4_ext_cache {
-       ext4_fsblk_t    ec_start;
-       ext4_lblk_t     ec_block;
-       __u32           ec_len; /* must be 32bit to return holes */
-       __u32           ec_type;
-};
-
-/*
- * fourth extended file system inode data in memory
- */
-struct ext4_inode_info {
-       __le32  i_data[15];     /* unconverted */
-       __u32   i_flags;
-       ext4_fsblk_t    i_file_acl;
-       __u32   i_dtime;
-
-       /*
-        * i_block_group is the number of the block group which contains
-        * this file's inode.  Constant across the lifetime of the inode,
-        * it is ued for making block allocation decisions - we try to
-        * place a file's data blocks near its inode block, and new inodes
-        * near to their parent directory's inode.
-        */
-       ext4_group_t    i_block_group;
-       __u32   i_state;                /* Dynamic state flags for ext4 */
-
-       ext4_lblk_t             i_dir_start_lookup;
-#ifdef CONFIG_EXT4_FS_XATTR
-       /*
-        * Extended attributes can be read independently of the main file
-        * data. Taking i_mutex even when reading would cause contention
-        * between readers of EAs and writers of regular file data, so
-        * instead we synchronize on xattr_sem when reading or changing
-        * EAs.
-        */
-       struct rw_semaphore xattr_sem;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-       struct posix_acl        *i_acl;
-       struct posix_acl        *i_default_acl;
-#endif
-
-       struct list_head i_orphan;      /* unlinked but open inodes */
-
-       /*
-        * i_disksize keeps track of what the inode size is ON DISK, not
-        * in memory.  During truncate, i_size is set to the new size by
-        * the VFS prior to calling ext4_truncate(), but the filesystem won't
-        * set i_disksize to 0 until the truncate is actually under way.
-        *
-        * The intent is that i_disksize always represents the blocks which
-        * are used by this file.  This allows recovery to restart truncate
-        * on orphans if we crash during truncate.  We actually write i_disksize
-        * into the on-disk inode when writing inodes out, instead of i_size.
-        *
-        * The only time when i_disksize and i_size may be different is when
-        * a truncate is in progress.  The only things which change i_disksize
-        * are ext4_get_block (growth) and ext4_truncate (shrinkth).
-        */
-       loff_t  i_disksize;
-
-       /*
-        * i_data_sem is for serialising ext4_truncate() against
-        * ext4_getblock().  In the 2.4 ext2 design, great chunks of inode's
-        * data tree are chopped off during truncate. We can't do that in
-        * ext4 because whenever we perform intermediate commits during
-        * truncate, the inode and all the metadata blocks *must* be in a
-        * consistent state which allows truncation of the orphans to restart
-        * during recovery.  Hence we must fix the get_block-vs-truncate race
-        * by other means, so we have i_data_sem.
-        */
-       struct rw_semaphore i_data_sem;
-       struct inode vfs_inode;
-       struct jbd2_inode jinode;
-
-       struct ext4_ext_cache i_cached_extent;
-       /*
-        * File creation time. Its function is same as that of
-        * struct timespec i_{a,c,m}time in the generic inode.
-        */
-       struct timespec i_crtime;
-
-       /* mballoc */
-       struct list_head i_prealloc_list;
-       spinlock_t i_prealloc_lock;
-
-       /* ialloc */
-       ext4_group_t    i_last_alloc_group;
-
-       /* allocation reservation info for delalloc */
-       unsigned int i_reserved_data_blocks;
-       unsigned int i_reserved_meta_blocks;
-       unsigned int i_allocated_meta_blocks;
-       unsigned short i_delalloc_reserved_flag;
-
-       /* on-disk additional length */
-       __u16 i_extra_isize;
-
-       spinlock_t i_block_reservation_lock;
-};
-
-#endif /* _EXT4_I */
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
deleted file mode 100644 (file)
index 57b71fe..0000000
+++ /dev/null
@@ -1,161 +0,0 @@
-/*
- *  ext4_sb.h
- *
- * Copyright (C) 1992, 1993, 1994, 1995
- * Remy Card (card@masi.ibp.fr)
- * Laboratoire MASI - Institut Blaise Pascal
- * Universite Pierre et Marie Curie (Paris VI)
- *
- *  from
- *
- *  linux/include/linux/minix_fs_sb.h
- *
- *  Copyright (C) 1991, 1992  Linus Torvalds
- */
-
-#ifndef _EXT4_SB
-#define _EXT4_SB
-
-#ifdef __KERNEL__
-#include <linux/timer.h>
-#include <linux/wait.h>
-#include <linux/blockgroup_lock.h>
-#include <linux/percpu_counter.h>
-#endif
-#include <linux/rbtree.h>
-
-/*
- * fourth extended-fs super-block data in memory
- */
-struct ext4_sb_info {
-       unsigned long s_desc_size;      /* Size of a group descriptor in bytes */
-       unsigned long s_inodes_per_block;/* Number of inodes per block */
-       unsigned long s_blocks_per_group;/* Number of blocks in a group */
-       unsigned long s_inodes_per_group;/* Number of inodes in a group */
-       unsigned long s_itb_per_group;  /* Number of inode table blocks per group */
-       unsigned long s_gdb_count;      /* Number of group descriptor blocks */
-       unsigned long s_desc_per_block; /* Number of group descriptors per block */
-       ext4_group_t s_groups_count;    /* Number of groups in the fs */
-       unsigned long s_overhead_last;  /* Last calculated overhead */
-       unsigned long s_blocks_last;    /* Last seen block count */
-       loff_t s_bitmap_maxbytes;       /* max bytes for bitmap files */
-       struct buffer_head * s_sbh;     /* Buffer containing the super block */
-       struct ext4_super_block *s_es;  /* Pointer to the super block in the buffer */
-       struct buffer_head **s_group_desc;
-       unsigned long  s_mount_opt;
-       ext4_fsblk_t s_sb_block;
-       uid_t s_resuid;
-       gid_t s_resgid;
-       unsigned short s_mount_state;
-       unsigned short s_pad;
-       int s_addr_per_block_bits;
-       int s_desc_per_block_bits;
-       int s_inode_size;
-       int s_first_ino;
-       unsigned int s_inode_readahead_blks;
-       spinlock_t s_next_gen_lock;
-       u32 s_next_generation;
-       u32 s_hash_seed[4];
-       int s_def_hash_version;
-       int s_hash_unsigned;    /* 3 if hash should be signed, 0 if not */
-       struct percpu_counter s_freeblocks_counter;
-       struct percpu_counter s_freeinodes_counter;
-       struct percpu_counter s_dirs_counter;
-       struct percpu_counter s_dirtyblocks_counter;
-       struct blockgroup_lock *s_blockgroup_lock;
-       struct proc_dir_entry *s_proc;
-       struct kobject s_kobj;
-       struct completion s_kobj_unregister;
-
-       /* Journaling */
-       struct inode *s_journal_inode;
-       struct journal_s *s_journal;
-       struct list_head s_orphan;
-       unsigned long s_commit_interval;
-       u32 s_max_batch_time;
-       u32 s_min_batch_time;
-       struct block_device *journal_bdev;
-#ifdef CONFIG_JBD2_DEBUG
-       struct timer_list turn_ro_timer;        /* For turning read-only (crash simulation) */
-       wait_queue_head_t ro_wait_queue;        /* For people waiting for the fs to go read-only */
-#endif
-#ifdef CONFIG_QUOTA
-       char *s_qf_names[MAXQUOTAS];            /* Names of quota files with journalled quota */
-       int s_jquota_fmt;                       /* Format of quota to use */
-#endif
-       unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */
-
-#ifdef EXTENTS_STATS
-       /* ext4 extents stats */
-       unsigned long s_ext_min;
-       unsigned long s_ext_max;
-       unsigned long s_depth_max;
-       spinlock_t s_ext_stats_lock;
-       unsigned long s_ext_blocks;
-       unsigned long s_ext_extents;
-#endif
-
-       /* for buddy allocator */
-       struct ext4_group_info ***s_group_info;
-       struct inode *s_buddy_cache;
-       long s_blocks_reserved;
-       spinlock_t s_reserve_lock;
-       spinlock_t s_md_lock;
-       tid_t s_last_transaction;
-       unsigned short *s_mb_offsets;
-       unsigned int *s_mb_maxs;
-
-       /* tunables */
-       unsigned long s_stripe;
-       unsigned int s_mb_stream_request;
-       unsigned int s_mb_max_to_scan;
-       unsigned int s_mb_min_to_scan;
-       unsigned int s_mb_stats;
-       unsigned int s_mb_order2_reqs;
-       unsigned int s_mb_group_prealloc;
-       /* where last allocation was done - for stream allocation */
-       unsigned long s_mb_last_group;
-       unsigned long s_mb_last_start;
-
-       /* history to debug policy */
-       struct ext4_mb_history *s_mb_history;
-       int s_mb_history_cur;
-       int s_mb_history_max;
-       int s_mb_history_num;
-       spinlock_t s_mb_history_lock;
-       int s_mb_history_filter;
-
-       /* stats for buddy allocator */
-       spinlock_t s_mb_pa_lock;
-       atomic_t s_bal_reqs;    /* number of reqs with len > 1 */
-       atomic_t s_bal_success; /* we found long enough chunks */
-       atomic_t s_bal_allocated;       /* in blocks */
-       atomic_t s_bal_ex_scanned;      /* total extents scanned */
-       atomic_t s_bal_goals;   /* goal hits */
-       atomic_t s_bal_breaks;  /* too long searches */
-       atomic_t s_bal_2orders; /* 2^order hits */
-       spinlock_t s_bal_lock;
-       unsigned long s_mb_buddies_generated;
-       unsigned long long s_mb_generation_time;
-       atomic_t s_mb_lost_chunks;
-       atomic_t s_mb_preallocated;
-       atomic_t s_mb_discarded;
-
-       /* locality groups */
-       struct ext4_locality_group *s_locality_groups;
-
-       /* for write statistics */
-       unsigned long s_sectors_written_start;
-       u64 s_kbytes_written;
-
-       unsigned int s_log_groups_per_flex;
-       struct flex_groups *s_flex_groups;
-};
-
-static inline spinlock_t *
-sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
-{
-       return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
-}
-
-#endif /* _EXT4_SB */
index e3a55eb8b26a5df9cc41fb5efd79e23fc275da22..2593f748c3a48040ef3e70c6a8032d58b8c714e5 100644 (file)
@@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth)
 
 static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
 {
-       ext4_fsblk_t block = ext_pblock(ext), valid_block;
+       ext4_fsblk_t block = ext_pblock(ext);
        int len = ext4_ext_get_actual_len(ext);
-       struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 
-       valid_block = le32_to_cpu(es->s_first_data_block) +
-               EXT4_SB(inode->i_sb)->s_gdb_count;
-       if (unlikely(block <= valid_block ||
-                    ((block + len) > ext4_blocks_count(es))))
-               return 0;
-       else
-               return 1;
+       return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
 static int ext4_valid_extent_idx(struct inode *inode,
                                struct ext4_extent_idx *ext_idx)
 {
-       ext4_fsblk_t block = idx_pblock(ext_idx), valid_block;
-       struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+       ext4_fsblk_t block = idx_pblock(ext_idx);
 
-       valid_block = le32_to_cpu(es->s_first_data_block) +
-               EXT4_SB(inode->i_sb)->s_gdb_count;
-       if (unlikely(block <= valid_block ||
-                    (block >= ext4_blocks_count(es))))
-               return 0;
-       else
-               return 1;
+       return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1);
 }
 
 static int ext4_valid_extent_entries(struct inode *inode,
@@ -2097,12 +2083,16 @@ ext4_ext_rm_leaf(handle_t *handle, struct inode *inode,
        ex = EXT_LAST_EXTENT(eh);
 
        ex_ee_block = le32_to_cpu(ex->ee_block);
-       if (ext4_ext_is_uninitialized(ex))
-               uninitialized = 1;
        ex_ee_len = ext4_ext_get_actual_len(ex);
 
        while (ex >= EXT_FIRST_EXTENT(eh) &&
                        ex_ee_block + ex_ee_len > start) {
+
+               if (ext4_ext_is_uninitialized(ex))
+                       uninitialized = 1;
+               else
+                       uninitialized = 0;
+
                ext_debug("remove ext %lu:%u\n", ex_ee_block, ex_ee_len);
                path[depth].p_ext = ex;
 
@@ -2784,7 +2774,7 @@ fix_extent_len:
 int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                        ext4_lblk_t iblock,
                        unsigned int max_blocks, struct buffer_head *bh_result,
-                       int create, int extend_disksize)
+                       int flags)
 {
        struct ext4_ext_path *path = NULL;
        struct ext4_extent_header *eh;
@@ -2793,7 +2783,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        int err = 0, depth, ret, cache_type;
        unsigned int allocated = 0;
        struct ext4_allocation_request ar;
-       loff_t disksize;
 
        __clear_bit(BH_New, &bh_result->b_state);
        ext_debug("blocks %u/%u requested for inode %u\n",
@@ -2803,7 +2792,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        cache_type = ext4_ext_in_cache(inode, iblock, &newex);
        if (cache_type) {
                if (cache_type == EXT4_EXT_CACHE_GAP) {
-                       if (!create) {
+                       if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                                /*
                                 * block isn't allocated yet and
                                 * user doesn't want to allocate it
@@ -2869,9 +2858,11 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                                        EXT4_EXT_CACHE_EXTENT);
                                goto out;
                        }
-                       if (create == EXT4_CREATE_UNINITIALIZED_EXT)
+                       if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)
                                goto out;
-                       if (!create) {
+                       if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
+                               if (allocated > max_blocks)
+                                       allocated = max_blocks;
                                /*
                                 * We have blocks reserved already.  We
                                 * return allocated blocks so that delalloc
@@ -2879,8 +2870,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
                                 * the buffer head will be unmapped so that
                                 * a read from the block returns 0s.
                                 */
-                               if (allocated > max_blocks)
-                                       allocated = max_blocks;
                                set_buffer_unwritten(bh_result);
                                bh_result->b_bdev = inode->i_sb->s_bdev;
                                bh_result->b_blocknr = newblock;
@@ -2903,7 +2892,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * requested block isn't allocated yet;
         * we couldn't try to create block if create flag is zero
         */
-       if (!create) {
+       if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) {
                /*
                 * put just found gap into cache to speed up
                 * subsequent requests
@@ -2932,10 +2921,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
         * EXT_UNINIT_MAX_LEN.
         */
        if (max_blocks > EXT_INIT_MAX_LEN &&
-           create != EXT4_CREATE_UNINITIALIZED_EXT)
+           !(flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                max_blocks = EXT_INIT_MAX_LEN;
        else if (max_blocks > EXT_UNINIT_MAX_LEN &&
-                create == EXT4_CREATE_UNINITIALIZED_EXT)
+                (flags & EXT4_GET_BLOCKS_UNINIT_EXT))
                max_blocks = EXT_UNINIT_MAX_LEN;
 
        /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */
@@ -2966,7 +2955,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        /* try to insert new extent into found leaf and return */
        ext4_ext_store_pblock(&newex, newblock);
        newex.ee_len = cpu_to_le16(ar.len);
-       if (create == EXT4_CREATE_UNINITIALIZED_EXT)  /* Mark uninitialized */
+       if (flags & EXT4_GET_BLOCKS_UNINIT_EXT)  /* Mark uninitialized */
                ext4_ext_mark_uninitialized(&newex);
        err = ext4_ext_insert_extent(handle, inode, path, &newex);
        if (err) {
@@ -2983,18 +2972,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
        newblock = ext_pblock(&newex);
        allocated = ext4_ext_get_actual_len(&newex);
 outnew:
-       if (extend_disksize) {
-               disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits;
-               if (disksize > i_size_read(inode))
-                       disksize = i_size_read(inode);
-               if (disksize > EXT4_I(inode)->i_disksize)
-                       EXT4_I(inode)->i_disksize = disksize;
-       }
-
        set_buffer_new(bh_result);
 
        /* Cache only when it is _not_ an uninitialized extent */
-       if (create != EXT4_CREATE_UNINITIALIZED_EXT)
+       if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0)
                ext4_ext_put_in_cache(inode, iblock, allocated, newblock,
                                                EXT4_EXT_CACHE_EXTENT);
 out:
@@ -3150,9 +3131,10 @@ retry:
                        ret = PTR_ERR(handle);
                        break;
                }
-               ret = ext4_get_blocks_wrap(handle, inode, block,
-                                         max_blocks, &map_bh,
-                                         EXT4_CREATE_UNINITIALIZED_EXT, 0, 0);
+               map_bh.b_state = 0;
+               ret = ext4_get_blocks(handle, inode, block,
+                                     max_blocks, &map_bh,
+                                     EXT4_GET_BLOCKS_CREATE_UNINIT_EXT);
                if (ret <= 0) {
 #ifdef EXT4FS_DEBUG
                        WARN_ON(ret <= 0);
@@ -3195,7 +3177,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
                       void *data)
 {
        struct fiemap_extent_info *fieinfo = data;
-       unsigned long blksize_bits = inode->i_sb->s_blocksize_bits;
+       unsigned char blksize_bits = inode->i_sb->s_blocksize_bits;
        __u64   logical;
        __u64   physical;
        __u64   length;
@@ -3242,9 +3224,16 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path,
         *
         * XXX this might miss a single-block extent at EXT_MAX_BLOCK
         */
-       if (logical + length - 1 == EXT_MAX_BLOCK ||
-           ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK)
+       if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK ||
+           newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) {
+               loff_t size = i_size_read(inode);
+               loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb);
+
                flags |= FIEMAP_EXTENT_LAST;
+               if ((flags & FIEMAP_EXTENT_DELALLOC) &&
+                   logical+length > size)
+                       length = (size - logical + bs - 1) & ~(bs-1);
+       }
 
        error = fiemap_fill_next_extent(fieinfo, logical, physical,
                                        length, flags);
@@ -3318,10 +3307,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
                 * Walk the extent tree gathering extent information.
                 * ext4_ext_fiemap_cb will push extents back to user.
                 */
-               down_write(&EXT4_I(inode)->i_data_sem);
+               down_read(&EXT4_I(inode)->i_data_sem);
                error = ext4_ext_walk_space(inode, start_blk, len_blks,
                                          ext4_ext_fiemap_cb, fieinfo);
-               up_write(&EXT4_I(inode)->i_data_sem);
+               up_read(&EXT4_I(inode)->i_data_sem);
        }
 
        return error;
diff --git a/fs/ext4/group.h b/fs/ext4/group.h
deleted file mode 100644 (file)
index c2c0a8d..0000000
+++ /dev/null
@@ -1,29 +0,0 @@
-/*
- *  linux/fs/ext4/group.h
- *
- * Copyright (C) 2007 Cluster File Systems, Inc
- *
- * Author: Andreas Dilger <adilger@clusterfs.com>
- */
-
-#ifndef _LINUX_EXT4_GROUP_H
-#define _LINUX_EXT4_GROUP_H
-
-extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group,
-                                  struct ext4_group_desc *gdp);
-extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group,
-                                      struct ext4_group_desc *gdp);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-                                     ext4_group_t block_group);
-extern unsigned ext4_init_block_bitmap(struct super_block *sb,
-                                      struct buffer_head *bh,
-                                      ext4_group_t group,
-                                      struct ext4_group_desc *desc);
-#define ext4_free_blocks_after_init(sb, group, desc)                   \
-               ext4_init_block_bitmap(sb, NULL, group, desc)
-extern unsigned ext4_init_inode_bitmap(struct super_block *sb,
-                                      struct buffer_head *bh,
-                                      ext4_group_t group,
-                                      struct ext4_group_desc *desc);
-extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
-#endif /* _LINUX_EXT4_GROUP_H */
index f18e0a08a6b5080cf4b788a06c98abc59c8aca34..3743bd849bce83d4085940acdec32bee72bbd999 100644 (file)
@@ -27,7 +27,6 @@
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "group.h"
 
 /*
  * ialloc.c contains the inodes allocation and deallocation routines
@@ -123,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
                unlock_buffer(bh);
                return bh;
        }
-       spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       ext4_lock_group(sb, block_group);
        if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
                ext4_init_inode_bitmap(sb, bh, block_group, desc);
                set_bitmap_uptodate(bh);
                set_buffer_uptodate(bh);
-               spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+               ext4_unlock_group(sb, block_group);
                unlock_buffer(bh);
                return bh;
        }
-       spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group));
+       ext4_unlock_group(sb, block_group);
        if (buffer_uptodate(bh)) {
                /*
                 * if not uninit if bh is uptodate,
@@ -247,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                goto error_return;
 
        /* Ok, now we can actually update the inode bitmaps.. */
-       spin_lock(sb_bgl_lock(sbi, block_group));
-       cleared = ext4_clear_bit(bit, bitmap_bh->b_data);
-       spin_unlock(sb_bgl_lock(sbi, block_group));
+       cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group),
+                                       bit, bitmap_bh->b_data);
        if (!cleared)
                ext4_error(sb, "ext4_free_inode",
                           "bit already cleared for inode %lu", ino);
@@ -261,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                if (fatal) goto error_return;
 
                if (gdp) {
-                       spin_lock(sb_bgl_lock(sbi, block_group));
+                       ext4_lock_group(sb, block_group);
                        count = ext4_free_inodes_count(sb, gdp) + 1;
                        ext4_free_inodes_set(sb, gdp, count);
                        if (is_directory) {
@@ -277,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
                        }
                        gdp->bg_checksum = ext4_group_desc_csum(sbi,
                                                        block_group, gdp);
-                       spin_unlock(sb_bgl_lock(sbi, block_group));
+                       ext4_unlock_group(sb, block_group);
                        percpu_counter_inc(&sbi->s_freeinodes_counter);
                        if (is_directory)
                                percpu_counter_dec(&sbi->s_dirs_counter);
@@ -316,7 +314,7 @@ error_return:
 static int find_group_dir(struct super_block *sb, struct inode *parent,
                                ext4_group_t *best_group)
 {
-       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        unsigned int freei, avefreei;
        struct ext4_group_desc *desc, *best_desc = NULL;
        ext4_group_t group;
@@ -349,11 +347,10 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *desc;
-       struct buffer_head *bh;
        struct flex_groups *flex_group = sbi->s_flex_groups;
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group);
-       ext4_group_t ngroups = sbi->s_groups_count;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        int flex_size = ext4_flex_bg_size(sbi);
        ext4_group_t best_flex = parent_fbg_group;
        int blocks_per_flex = sbi->s_blocks_per_group * flex_size;
@@ -362,7 +359,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
        ext4_group_t n_fbg_groups;
        ext4_group_t i;
 
-       n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >>
+       n_fbg_groups = (ngroups + flex_size - 1) >>
                sbi->s_log_groups_per_flex;
 
 find_close_to_parent:
@@ -404,7 +401,7 @@ find_close_to_parent:
 found_flexbg:
        for (i = best_flex * flex_size; i < ngroups &&
                     i < (best_flex + 1) * flex_size; i++) {
-               desc = ext4_get_group_desc(sb, i, &bh);
+               desc = ext4_get_group_desc(sb, i, NULL);
                if (ext4_free_inodes_count(sb, desc)) {
                        *best_group = i;
                        goto out;
@@ -478,20 +475,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
-       ext4_group_t ngroups = sbi->s_groups_count;
+       ext4_group_t real_ngroups = ext4_get_groups_count(sb);
        int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
        unsigned int freei, avefreei;
        ext4_fsblk_t freeb, avefreeb;
        unsigned int ndirs;
        int max_dirs, min_inodes;
        ext4_grpblk_t min_blocks;
-       ext4_group_t i, grp, g;
+       ext4_group_t i, grp, g, ngroups;
        struct ext4_group_desc *desc;
        struct orlov_stats stats;
        int flex_size = ext4_flex_bg_size(sbi);
 
+       ngroups = real_ngroups;
        if (flex_size > 1) {
-               ngroups = (ngroups + flex_size - 1) >>
+               ngroups = (real_ngroups + flex_size - 1) >>
                        sbi->s_log_groups_per_flex;
                parent_group >>= sbi->s_log_groups_per_flex;
        }
@@ -543,7 +541,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
                 */
                grp *= flex_size;
                for (i = 0; i < flex_size; i++) {
-                       if (grp+i >= sbi->s_groups_count)
+                       if (grp+i >= real_ngroups)
                                break;
                        desc = ext4_get_group_desc(sb, grp+i, NULL);
                        if (desc && ext4_free_inodes_count(sb, desc)) {
@@ -583,7 +581,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
        }
 
 fallback:
-       ngroups = sbi->s_groups_count;
+       ngroups = real_ngroups;
        avefreei = freei / ngroups;
 fallback_retry:
        parent_group = EXT4_I(parent)->i_block_group;
@@ -613,9 +611,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
                            ext4_group_t *group, int mode)
 {
        ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
-       ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
+       ext4_group_t i, last, ngroups = ext4_get_groups_count(sb);
        struct ext4_group_desc *desc;
-       ext4_group_t i, last;
        int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
 
        /*
@@ -708,10 +705,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent,
 
 /*
  * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's sb_bgl_lock
+ * is uninit we need to take the group's ext4_group_lock
  * and clear the uninit flag. The inode bitmap update
  * and group desc uninit flag clear should be done
- * after holding sb_bgl_lock so that ext4_read_inode_bitmap
+ * after holding ext4_group_lock so that ext4_read_inode_bitmap
  * doesn't race with the ext4_claim_inode
  */
 static int ext4_claim_inode(struct super_block *sb,
@@ -722,7 +719,7 @@ static int ext4_claim_inode(struct super_block *sb,
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
 
-       spin_lock(sb_bgl_lock(sbi, group));
+       ext4_lock_group(sb, group);
        if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) {
                /* not a free inode */
                retval = 1;
@@ -731,7 +728,7 @@ static int ext4_claim_inode(struct super_block *sb,
        ino++;
        if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
                        ino > EXT4_INODES_PER_GROUP(sb)) {
-               spin_unlock(sb_bgl_lock(sbi, group));
+               ext4_unlock_group(sb, group);
                ext4_error(sb, __func__,
                           "reserved inode or inode > inodes count - "
                           "block_group = %u, inode=%lu", group,
@@ -780,7 +777,7 @@ static int ext4_claim_inode(struct super_block *sb,
        }
        gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
 err_ret:
-       spin_unlock(sb_bgl_lock(sbi, group));
+       ext4_unlock_group(sb, group);
        return retval;
 }
 
@@ -799,11 +796,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
        struct super_block *sb;
        struct buffer_head *inode_bitmap_bh = NULL;
        struct buffer_head *group_desc_bh;
-       ext4_group_t group = 0;
+       ext4_group_t ngroups, group = 0;
        unsigned long ino = 0;
        struct inode *inode;
        struct ext4_group_desc *gdp = NULL;
-       struct ext4_super_block *es;
        struct ext4_inode_info *ei;
        struct ext4_sb_info *sbi;
        int ret2, err = 0;
@@ -818,15 +814,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
                return ERR_PTR(-EPERM);
 
        sb = dir->i_sb;
+       ngroups = ext4_get_groups_count(sb);
        trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id,
                   dir->i_ino, mode);
        inode = new_inode(sb);
        if (!inode)
                return ERR_PTR(-ENOMEM);
        ei = EXT4_I(inode);
-
        sbi = EXT4_SB(sb);
-       es = sbi->s_es;
 
        if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
                ret2 = find_group_flex(sb, dir, &group);
@@ -856,7 +851,7 @@ got_group:
        if (ret2 == -1)
                goto out;
 
-       for (i = 0; i < sbi->s_groups_count; i++) {
+       for (i = 0; i < ngroups; i++) {
                err = -EIO;
 
                gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
@@ -917,7 +912,7 @@ repeat_in_this_group:
                 * group descriptor metadata has not yet been updated.
                 * So we just go onto the next blockgroup.
                 */
-               if (++group == sbi->s_groups_count)
+               if (++group == ngroups)
                        group = 0;
        }
        err = -ENOSPC;
@@ -938,7 +933,7 @@ got:
                }
 
                free = 0;
-               spin_lock(sb_bgl_lock(sbi, group));
+               ext4_lock_group(sb, group);
                /* recheck and clear flag under lock if we still need to */
                if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        free = ext4_free_blocks_after_init(sb, group, gdp);
@@ -947,7 +942,7 @@ got:
                        gdp->bg_checksum = ext4_group_desc_csum(sbi, group,
                                                                gdp);
                }
-               spin_unlock(sb_bgl_lock(sbi, group));
+               ext4_unlock_group(sb, group);
 
                /* Don't need to dirty bitmap block if we didn't change it */
                if (free) {
@@ -1158,7 +1153,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 {
        unsigned long desc_count;
        struct ext4_group_desc *gdp;
-       ext4_group_t i;
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 #ifdef EXT4FS_DEBUG
        struct ext4_super_block *es;
        unsigned long bitmap_count, x;
@@ -1168,7 +1163,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
        desc_count = 0;
        bitmap_count = 0;
        gdp = NULL;
-       for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+       for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
@@ -1190,7 +1185,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
        return desc_count;
 #else
        desc_count = 0;
-       for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+       for (i = 0; i < ngroups; i++) {
                gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
@@ -1205,9 +1200,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb)
 unsigned long ext4_count_dirs(struct super_block * sb)
 {
        unsigned long count = 0;
-       ext4_group_t i;
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
 
-       for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+       for (i = 0; i < ngroups; i++) {
                struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL);
                if (!gdp)
                        continue;
index 2a9ffd528dd1271e179a35322ce2e0a522f4d5b2..875db944b22f19889727ed0fcc2b4c60dd4043e4 100644 (file)
@@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode,
 }
 
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-                                __le32 *p, unsigned int max) {
-
-       unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+                                __le32 *p, unsigned int max)
+{
        __le32 *bref = p;
+       unsigned int blk;
+
        while (bref < p+max) {
-               if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
+               blk = le32_to_cpu(*bref++);
+               if (blk && 
+                   unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), 
+                                                   blk, 1))) {
                        ext4_error(inode->i_sb, function,
-                                  "block reference %u >= max (%u) "
-                                  "in inode #%lu, offset=%d",
-                                  le32_to_cpu(*bref), maxblocks,
-                                  inode->i_ino, (int)(bref-p));
+                                  "invalid block reference %u "
+                                  "in inode #%lu", blk, inode->i_ino);
                        return -EIO;
                }
-               bref++;
        }
        return 0;
 }
@@ -892,6 +893,10 @@ err_out:
 }
 
 /*
+ * The ext4_ind_get_blocks() function handles non-extents inodes
+ * (i.e., using the traditional indirect/double-indirect i_blocks
+ * scheme) for ext4_get_blocks().
+ *
  * Allocation strategy is simple: if we have to allocate something, we will
  * have to go the whole way to leaf. So let's do it before attaching anything
  * to tree, set linkage between the newborn blocks, write them if sync is
@@ -909,15 +914,16 @@ err_out:
  * return = 0, if plain lookup failed.
  * return < 0, error case.
  *
- *
- * Need to be called with
- * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block
- * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem)
+ * The ext4_ind_get_blocks() function should be called with
+ * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem
+ * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or
+ * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system
+ * blocks.
  */
-static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
+static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode,
                                  ext4_lblk_t iblock, unsigned int maxblocks,
                                  struct buffer_head *bh_result,
-                                 int create, int extend_disksize)
+                                 int flags)
 {
        int err = -EIO;
        ext4_lblk_t offsets[4];
@@ -927,14 +933,11 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        int indirect_blks;
        int blocks_to_boundary = 0;
        int depth;
-       struct ext4_inode_info *ei = EXT4_I(inode);
        int count = 0;
        ext4_fsblk_t first_block = 0;
-       loff_t disksize;
-
 
        J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL));
-       J_ASSERT(handle != NULL || create == 0);
+       J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0);
        depth = ext4_block_to_path(inode, iblock, offsets,
                                        &blocks_to_boundary);
 
@@ -963,7 +966,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        }
 
        /* Next simple case - plain lookup or failed read of indirect block */
-       if (!create || err == -EIO)
+       if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO)
                goto cleanup;
 
        /*
@@ -997,19 +1000,7 @@ static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode,
        if (!err)
                err = ext4_splice_branch(handle, inode, iblock,
                                        partial, indirect_blks, count);
-       /*
-        * i_disksize growing is protected by i_data_sem.  Don't forget to
-        * protect it if you're about to implement concurrent
-        * ext4_get_block() -bzzz
-       */
-       if (!err && extend_disksize) {
-               disksize = ((loff_t) iblock + count) << inode->i_blkbits;
-               if (disksize > i_size_read(inode))
-                       disksize = i_size_read(inode);
-               if (disksize > ei->i_disksize)
-                       ei->i_disksize = disksize;
-       }
        if (err)
                goto cleanup;
 
        set_buffer_new(bh_result);
@@ -1120,8 +1111,23 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
                ext4_discard_preallocations(inode);
 }
 
+static int check_block_validity(struct inode *inode, sector_t logical,
+                               sector_t phys, int len)
+{
+       if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) {
+               ext4_error(inode->i_sb, "check_block_validity",
+                          "inode #%lu logical block %llu mapped to %llu "
+                          "(size %d)", inode->i_ino,
+                          (unsigned long long) logical,
+                          (unsigned long long) phys, len);
+               WARN_ON(1);
+               return -EIO;
+       }
+       return 0;
+}
+
 /*
- * The ext4_get_blocks_wrap() function try to look up the requested blocks,
+ * The ext4_get_blocks() function tries to look up the requested blocks,
  * and returns if the blocks are already mapped.
  *
  * Otherwise it takes the write lock of the i_data_sem and allocate blocks
@@ -1129,7 +1135,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  * mapped.
  *
  * If file type is extents based, it will call ext4_ext_get_blocks(),
+ * otherwise it calls ext4_ind_get_blocks() to handle indirect mapping
+ * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping
  * based files
  *
  * On success, it returns the number of blocks being mapped or allocate.
@@ -1142,9 +1148,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
  *
  * It returns the error in case of allocation failure.
  */
-int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
-                       unsigned int max_blocks, struct buffer_head *bh,
-                       int create, int extend_disksize, int flag)
+int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block,
+                   unsigned int max_blocks, struct buffer_head *bh,
+                   int flags)
 {
        int retval;
 
@@ -1152,21 +1158,28 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
        clear_buffer_unwritten(bh);
 
        /*
-        * Try to see if we can get  the block without requesting
-        * for new file system block.
+        * Try to see if we can get the block without requesting a new
+        * file system block.
         */
        down_read((&EXT4_I(inode)->i_data_sem));
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-                               bh, 0, 0);
+                               bh, 0);
        } else {
-               retval = ext4_get_blocks_handle(handle,
-                               inode, block, max_blocks, bh, 0, 0);
+               retval = ext4_ind_get_blocks(handle, inode, block, max_blocks,
+                                            bh, 0);
        }
        up_read((&EXT4_I(inode)->i_data_sem));
 
+       if (retval > 0 && buffer_mapped(bh)) {
+               int ret = check_block_validity(inode, block, 
+                                              bh->b_blocknr, retval);
+               if (ret != 0)
+                       return ret;
+       }
+
        /* If it is only a block(s) look up */
-       if (!create)
+       if ((flags & EXT4_GET_BLOCKS_CREATE) == 0)
                return retval;
 
        /*
@@ -1205,7 +1218,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         * let the underlying get_block() function know to
         * avoid double accounting
         */
-       if (flag)
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 1;
        /*
         * We need to check for EXT4 here because migrate
@@ -1213,10 +1226,10 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
         */
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
                retval =  ext4_ext_get_blocks(handle, inode, block, max_blocks,
-                               bh, create, extend_disksize);
+                                             bh, flags);
        } else {
-               retval = ext4_get_blocks_handle(handle, inode, block,
-                               max_blocks, bh, create, extend_disksize);
+               retval = ext4_ind_get_blocks(handle, inode, block,
+                                            max_blocks, bh, flags);
 
                if (retval > 0 && buffer_new(bh)) {
                        /*
@@ -1229,18 +1242,23 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
                }
        }
 
-       if (flag) {
+       if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)
                EXT4_I(inode)->i_delalloc_reserved_flag = 0;
-               /*
-                * Update reserved blocks/metadata blocks
-                * after successful block allocation
-                * which were deferred till now
-                */
-               if ((retval > 0) && buffer_delay(bh))
-                       ext4_da_update_reserve_space(inode, retval);
-       }
+
+       /*
+        * Update reserved blocks/metadata blocks after successful
+        * block allocation which had been deferred till now.
+        */
+       if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE))
+               ext4_da_update_reserve_space(inode, retval);
 
        up_write((&EXT4_I(inode)->i_data_sem));
+       if (retval > 0 && buffer_mapped(bh)) {
+               int ret = check_block_validity(inode, block, 
+                                              bh->b_blocknr, retval);
+               if (ret != 0)
+                       return ret;
+       }
        return retval;
 }
 
@@ -1268,8 +1286,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock,
                started = 1;
        }
 
-       ret = ext4_get_blocks_wrap(handle, inode, iblock,
-                                       max_blocks, bh_result, create, 0, 0);
+       ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result,
+                             create ? EXT4_GET_BLOCKS_CREATE : 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -1288,17 +1306,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode,
 {
        struct buffer_head dummy;
        int fatal = 0, err;
+       int flags = 0;
 
        J_ASSERT(handle != NULL || create == 0);
 
        dummy.b_state = 0;
        dummy.b_blocknr = -1000;
        buffer_trace_init(&dummy.b_history);
-       err = ext4_get_blocks_wrap(handle, inode, block, 1,
-                                       &dummy, create, 1, 0);
+       if (create)
+               flags |= EXT4_GET_BLOCKS_CREATE;
+       err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags);
        /*
-        * ext4_get_blocks_handle() returns number of blocks
-        * mapped. 0 in case of a HOLE.
+        * ext4_get_blocks() returns number of blocks mapped. 0 in
+        * case of a HOLE.
         */
        if (err > 0) {
                if (err > 1)
@@ -1439,7 +1459,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                                struct page **pagep, void **fsdata)
 {
        struct inode *inode = mapping->host;
-       int ret, needed_blocks = ext4_writepage_trans_blocks(inode);
+       int ret, needed_blocks;
        handle_t *handle;
        int retries = 0;
        struct page *page;
@@ -1450,6 +1470,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
                   "dev %s ino %lu pos %llu len %u flags %u",
                   inode->i_sb->s_id, inode->i_ino,
                   (unsigned long long) pos, len, flags);
+       /*
+        * Reserve one block more for addition to orphan list in case
+        * we allocate blocks but write fails for some reason
+        */
+       needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
        index = pos >> PAGE_CACHE_SHIFT;
        from = pos & (PAGE_CACHE_SIZE - 1);
        to = from + len;
@@ -1483,15 +1508,30 @@ retry:
 
        if (ret) {
                unlock_page(page);
-               ext4_journal_stop(handle);
                page_cache_release(page);
                /*
                 * block_write_begin may have instantiated a few blocks
                 * outside i_size.  Trim these off again. Don't need
                 * i_size_read because we hold i_mutex.
+                *
+                * Add inode to orphan list in case we crash before
+                * truncate finishes
                 */
                if (pos + len > inode->i_size)
+                       ext4_orphan_add(handle, inode);
+
+               ext4_journal_stop(handle);
+               if (pos + len > inode->i_size) {
                        vmtruncate(inode, inode->i_size);
+                       /* 
+                        * If vmtruncate failed early the inode might
+                        * still be on the orphan list; we need to
+                        * make sure the inode is removed from the
+                        * orphan list in that case.
+                        */
+                       if (inode->i_nlink)
+                               ext4_orphan_del(NULL, inode);
+               }
        }
 
        if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
@@ -1509,6 +1549,52 @@ static int write_end_fn(handle_t *handle, struct buffer_head *bh)
        return ext4_handle_dirty_metadata(handle, NULL, bh);
 }
 
+static int ext4_generic_write_end(struct file *file,
+                               struct address_space *mapping,
+                               loff_t pos, unsigned len, unsigned copied,
+                               struct page *page, void *fsdata)
+{
+       int i_size_changed = 0;
+       struct inode *inode = mapping->host;
+       handle_t *handle = ext4_journal_current_handle();
+
+       copied = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+       /*
+        * No need to use i_size_read() here, the i_size
+        * cannot change under us because we hold i_mutex.
+        *
+        * But it's important to update i_size while still holding page lock:
+        * page writeout could otherwise come in and zero beyond i_size.
+        */
+       if (pos + copied > inode->i_size) {
+               i_size_write(inode, pos + copied);
+               i_size_changed = 1;
+       }
+
+       if (pos + copied >  EXT4_I(inode)->i_disksize) {
+               /* We need to mark inode dirty even if
+                * pos + copied is less than inode->i_size
+                * but greater than i_disksize (hint: delalloc).
+                */
+               ext4_update_i_disksize(inode, (pos + copied));
+               i_size_changed = 1;
+       }
+       unlock_page(page);
+       page_cache_release(page);
+
+       /*
+        * Don't mark the inode dirty under page lock. First, it unnecessarily
+        * makes the holding time of page lock longer. Second, it forces lock
+        * ordering of page lock and transaction start for journaling
+        * filesystems.
+        */
+       if (i_size_changed)
+               ext4_mark_inode_dirty(handle, inode);
+
+       return copied;
+}
+
 /*
  * We need to pick up the new inode size which generic_commit_write gave us
  * `file' can be NULL - eg, when called from page_symlink().
@@ -1532,21 +1618,15 @@ static int ext4_ordered_write_end(struct file *file,
        ret = ext4_jbd2_file_inode(handle, inode);
 
        if (ret == 0) {
-               loff_t new_i_size;
-
-               new_i_size = pos + copied;
-               if (new_i_size > EXT4_I(inode)->i_disksize) {
-                       ext4_update_i_disksize(inode, new_i_size);
-                       /* We need to mark inode dirty even if
-                        * new_i_size is less that inode->i_size
-                        * bu greater than i_disksize.(hint delalloc)
-                        */
-                       ext4_mark_inode_dirty(handle, inode);
-               }
-
-               ret2 = generic_write_end(file, mapping, pos, len, copied,
+               ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
                copied = ret2;
+               if (pos + len > inode->i_size)
+                       /* If we have allocated more blocks than we
+                        * copied, we will have blocks allocated outside
+                        * inode->i_size, so truncate them.
+                        */
+                       ext4_orphan_add(handle, inode);
                if (ret2 < 0)
                        ret = ret2;
        }
@@ -1554,6 +1634,18 @@ static int ext4_ordered_write_end(struct file *file,
        if (!ret)
                ret = ret2;
 
+       if (pos + len > inode->i_size) {
+               vmtruncate(inode, inode->i_size);
+               /* 
+                * If vmtruncate failed early the inode might still be
+                * on the orphan list; we need to make sure the inode
+                * is removed from the orphan list in that case.
+                */
+               if (inode->i_nlink)
+                       ext4_orphan_del(NULL, inode);
+       }
+
+
        return ret ? ret : copied;
 }
 
@@ -1565,25 +1657,21 @@ static int ext4_writeback_write_end(struct file *file,
        handle_t *handle = ext4_journal_current_handle();
        struct inode *inode = mapping->host;
        int ret = 0, ret2;
-       loff_t new_i_size;
 
        trace_mark(ext4_writeback_write_end,
                   "dev %s ino %lu pos %llu len %u copied %u",
                   inode->i_sb->s_id, inode->i_ino,
                   (unsigned long long) pos, len, copied);
-       new_i_size = pos + copied;
-       if (new_i_size > EXT4_I(inode)->i_disksize) {
-               ext4_update_i_disksize(inode, new_i_size);
-               /* We need to mark inode dirty even if
-                * new_i_size is less that inode->i_size
-                * bu greater than i_disksize.(hint delalloc)
-                */
-               ext4_mark_inode_dirty(handle, inode);
-       }
-
-       ret2 = generic_write_end(file, mapping, pos, len, copied,
+       ret2 = ext4_generic_write_end(file, mapping, pos, len, copied,
                                                        page, fsdata);
        copied = ret2;
+       if (pos + len > inode->i_size)
+               /* If we have allocated more blocks than we
+                * copied, we will have blocks allocated outside
+                * inode->i_size, so truncate them.
+                */
+               ext4_orphan_add(handle, inode);
+
        if (ret2 < 0)
                ret = ret2;
 
@@ -1591,6 +1679,17 @@ static int ext4_writeback_write_end(struct file *file,
        if (!ret)
                ret = ret2;
 
+       if (pos + len > inode->i_size) {
+               vmtruncate(inode, inode->i_size);
+               /* 
+                * If vmtruncate failed early the inode might still be
+                * on the orphan list; we need to make sure the inode
+                * is removed from the orphan list in that case.
+                */
+               if (inode->i_nlink)
+                       ext4_orphan_del(NULL, inode);
+       }
+
        return ret ? ret : copied;
 }
 
@@ -1635,10 +1734,27 @@ static int ext4_journalled_write_end(struct file *file,
        }
 
        unlock_page(page);
+       page_cache_release(page);
+       if (pos + len > inode->i_size)
+               /* If we have allocated more blocks than we
+                * copied, we will have blocks allocated outside
+                * inode->i_size, so truncate them.
+                */
+               ext4_orphan_add(handle, inode);
+
        ret2 = ext4_journal_stop(handle);
        if (!ret)
                ret = ret2;
-       page_cache_release(page);
+       if (pos + len > inode->i_size) {
+               vmtruncate(inode, inode->i_size);
+               /* 
+                * If vmtruncate failed early the inode might still be
+                * on the orphan list; we need to make sure the inode
+                * is removed from the orphan list in that case.
+                */
+               if (inode->i_nlink)
+                       ext4_orphan_del(NULL, inode);
+       }
 
        return ret ? ret : copied;
 }
@@ -1852,7 +1968,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
  * @logical - first logical block to start assignment with
  *
  * the function goes through all passed space and put actual disk
- * block numbers into buffer heads, dropping BH_Delay
+ * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten
  */
 static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                                 struct buffer_head *exbh)
@@ -1902,16 +2018,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical,
                        do {
                                if (cur_logical >= logical + blocks)
                                        break;
-                               if (buffer_delay(bh)) {
-                                       bh->b_blocknr = pblock;
-                                       clear_buffer_delay(bh);
-                                       bh->b_bdev = inode->i_sb->s_bdev;
-                               } else if (buffer_unwritten(bh)) {
-                                       bh->b_blocknr = pblock;
-                                       clear_buffer_unwritten(bh);
-                                       set_buffer_mapped(bh);
-                                       set_buffer_new(bh);
-                                       bh->b_bdev = inode->i_sb->s_bdev;
+
+                               if (buffer_delay(bh) ||
+                                               buffer_unwritten(bh)) {
+
+                                       BUG_ON(bh->b_bdev != inode->i_sb->s_bdev);
+
+                                       if (buffer_delay(bh)) {
+                                               clear_buffer_delay(bh);
+                                               bh->b_blocknr = pblock;
+                                       } else {
+                                               /*
+                                                * unwritten already should have
+                                                * blocknr assigned. Verify that
+                                                */
+                                               clear_buffer_unwritten(bh);
+                                               BUG_ON(bh->b_blocknr != pblock);
+                                       }
+
                                } else if (buffer_mapped(bh))
                                        BUG_ON(bh->b_blocknr != pblock);
 
@@ -1990,51 +2114,6 @@ static void ext4_print_free_blocks(struct inode *inode)
        return;
 }
 
-#define                EXT4_DELALLOC_RSVED     1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
-                                  struct buffer_head *bh_result, int create)
-{
-       int ret;
-       unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
-       loff_t disksize = EXT4_I(inode)->i_disksize;
-       handle_t *handle = NULL;
-
-       handle = ext4_journal_current_handle();
-       BUG_ON(!handle);
-       ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
-                                  bh_result, create, 0, EXT4_DELALLOC_RSVED);
-       if (ret <= 0)
-               return ret;
-
-       bh_result->b_size = (ret << inode->i_blkbits);
-
-       if (ext4_should_order_data(inode)) {
-               int retval;
-               retval = ext4_jbd2_file_inode(handle, inode);
-               if (retval)
-                       /*
-                        * Failed to add inode for ordered mode. Don't
-                        * update file size
-                        */
-                       return retval;
-       }
-
-       /*
-        * Update on-disk size along with block allocation we don't
-        * use 'extend_disksize' as size may change within already
-        * allocated block -bzzz
-        */
-       disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
-       if (disksize > i_size_read(inode))
-               disksize = i_size_read(inode);
-       if (disksize > EXT4_I(inode)->i_disksize) {
-               ext4_update_i_disksize(inode, disksize);
-               ret = ext4_mark_inode_dirty(handle, inode);
-               return ret;
-       }
-       return 0;
-}
-
 /*
  * mpage_da_map_blocks - go through given space
  *
@@ -2045,29 +2124,57 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
  */
 static int mpage_da_map_blocks(struct mpage_da_data *mpd)
 {
-       int err = 0;
+       int err, blks, get_blocks_flags;
        struct buffer_head new;
-       sector_t next;
+       sector_t next = mpd->b_blocknr;
+       unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits;
+       loff_t disksize = EXT4_I(mpd->inode)->i_disksize;
+       handle_t *handle = NULL;
 
        /*
         * We consider only non-mapped and non-allocated blocks
         */
        if ((mpd->b_state  & (1 << BH_Mapped)) &&
-           !(mpd->b_state & (1 << BH_Delay)))
+               !(mpd->b_state & (1 << BH_Delay)) &&
+               !(mpd->b_state & (1 << BH_Unwritten)))
                return 0;
-       new.b_state = mpd->b_state;
-       new.b_blocknr = 0;
-       new.b_size = mpd->b_size;
-       next = mpd->b_blocknr;
+
        /*
-        * If we didn't accumulate anything
-        * to write simply return
+        * If we didn't accumulate anything to write simply return
         */
-       if (!new.b_size)
+       if (!mpd->b_size)
                return 0;
 
-       err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
-       if (err) {
+       handle = ext4_journal_current_handle();
+       BUG_ON(!handle);
+
+       /*
+        * Call ext4_get_blocks() to allocate any delayed allocation
+        * blocks, or to convert an uninitialized extent to be
+        * initialized (in the case where we have written into
+        * one or more preallocated blocks).
+        *
+        * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to
+        * indicate that we are on the delayed allocation path.  This
+        * affects functions in many different parts of the allocation
+        * call path.  This flag exists primarily because we don't
+        * want to change *many* call functions, so ext4_get_blocks()
+        * will set the magic i_delalloc_reserved_flag once the
+        * inode's allocation semaphore is taken.
+        *
+        * If the blocks in question were delalloc blocks, also set
+        * EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE so the delalloc accounting
+        * variables are updated after the blocks have been allocated.
+        */
+       new.b_state = 0;
+       get_blocks_flags = (EXT4_GET_BLOCKS_CREATE |
+                           EXT4_GET_BLOCKS_DELALLOC_RESERVE);
+       if (mpd->b_state & (1 << BH_Delay))
+               get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE;
+       blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks,
+                              &new, get_blocks_flags);
+       if (blks < 0) {
+               err = blks;
                /*
                 * If get block returns with error we simply
                 * return. Later writepage will redirty the page and
@@ -2100,12 +2207,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
                if (err == -ENOSPC) {
                        ext4_print_free_blocks(mpd->inode);
                }
-               /* invlaidate all the pages */
+               /* invalidate all the pages */
                ext4_da_block_invalidatepages(mpd, next,
                                mpd->b_size >> mpd->inode->i_blkbits);
                return err;
        }
-       BUG_ON(new.b_size == 0);
+       BUG_ON(blks == 0);
+
+       new.b_size = (blks << mpd->inode->i_blkbits);
 
        if (buffer_new(&new))
                __unmap_underlying_blocks(mpd->inode, &new);
@@ -2118,6 +2227,23 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
            (mpd->b_state & (1 << BH_Unwritten)))
                mpage_put_bnr_to_bhs(mpd, next, &new);
 
+       if (ext4_should_order_data(mpd->inode)) {
+               err = ext4_jbd2_file_inode(handle, mpd->inode);
+               if (err)
+                       return err;
+       }
+
+       /*
+        * Update on-disk size along with block allocation.
+        */
+       disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits;
+       if (disksize > i_size_read(mpd->inode))
+               disksize = i_size_read(mpd->inode);
+       if (disksize > EXT4_I(mpd->inode)->i_disksize) {
+               ext4_update_i_disksize(mpd->inode, disksize);
+               return ext4_mark_inode_dirty(handle, mpd->inode);
+       }
+
        return 0;
 }
 
@@ -2192,6 +2318,17 @@ flush_it:
        return;
 }
 
+static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
+{
+       /*
+        * unmapped buffer is possible for holes.
+        * delay buffer is possible with delayed allocation.
+        * We also need to consider unwritten buffer as unmapped.
+        */
+       return (!buffer_mapped(bh) || buffer_delay(bh) ||
+                               buffer_unwritten(bh)) && buffer_dirty(bh);
+}
+
 /*
  * __mpage_da_writepage - finds extent of pages and blocks
  *
@@ -2276,8 +2413,7 @@ static int __mpage_da_writepage(struct page *page,
                         * Otherwise we won't make progress
                         * with the page in ext4_da_writepage
                         */
-                       if (buffer_dirty(bh) &&
-                           (!buffer_mapped(bh) || buffer_delay(bh))) {
+                       if (ext4_bh_unmapped_or_delay(NULL, bh)) {
                                mpage_add_bh_to_extent(mpd, logical,
                                                       bh->b_size,
                                                       bh->b_state);
@@ -2303,8 +2439,16 @@ static int __mpage_da_writepage(struct page *page,
 }
 
 /*
- * this is a special callback for ->write_begin() only
- * it's intention is to return mapped block or reserve space
+ * This is a special get_block_t callback which is used by
+ * ext4_da_write_begin().  It will either return mapped block or
+ * reserve space for a single block.
+ *
+ * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set.
+ * We also have b_blocknr = -1 and b_bdev initialized properly
+ *
+ * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set.
+ * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev
+ * initialized properly.
  */
 static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                                  struct buffer_head *bh_result, int create)
@@ -2323,7 +2467,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
         * preallocated blocks are unmapped but should treated
         * the same as allocated blocks.
         */
-       ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1,  bh_result, 0, 0, 0);
+       ret = ext4_get_blocks(NULL, inode, iblock, 1,  bh_result, 0);
        if ((ret == 0) && !buffer_delay(bh_result)) {
                /* the block isn't (pre)allocated yet, let's reserve space */
                /*
@@ -2340,40 +2484,53 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
                set_buffer_delay(bh_result);
        } else if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
-               /*
-                * With sub-block writes into unwritten extents
-                * we also need to mark the buffer as new so that
-                * the unwritten parts of the buffer gets correctly zeroed.
-                */
-               if (buffer_unwritten(bh_result))
+               if (buffer_unwritten(bh_result)) {
+                       /* A delayed write to unwritten bh should
+                        * be marked new and mapped.  Mapped ensures
+                        * that we don't do get_block multiple times
+                        * when we write to the same offset and new
+                        * ensures that we do proper zero out for
+                        * partial write.
+                        */
                        set_buffer_new(bh_result);
+                       set_buffer_mapped(bh_result);
+               }
                ret = 0;
        }
 
        return ret;
 }
 
-static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
-{
-       /*
-        * unmapped buffer is possible for holes.
-        * delay buffer is possible with delayed allocation
-        */
-       return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh));
-}
-
-static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
+/*
+ * This function is used as a standard get_block_t callback function
+ * when there is no desire to allocate any blocks.  It is used as a
+ * callback function for block_prepare_write(), nobh_writepage(), and
+ * block_write_full_page().  These functions should only try to map a
+ * single block at a time.
+ *
+ * Since this function doesn't do block allocations even if the caller
+ * requests it by passing in create=1, it is critically important that
+ * any caller checks to make sure that any buffer heads returned by
+ * this function are either all already mapped or marked for delayed
+ * allocation before calling nobh_writepage() or
+ * block_write_full_page().  Otherwise, b_blocknr could be left
+ * uninitialized, and the page write functions will be taken by
+ * surprise.
+ */
+static int noalloc_get_block_write(struct inode *inode, sector_t iblock,
                                   struct buffer_head *bh_result, int create)
 {
        int ret = 0;
        unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
 
+       BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize);
+
        /*
         * we don't want to do block allocation in writepage
         * so call get_block_wrap with create = 0
         */
-       ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks,
-                                  bh_result, 0, 0, 0);
+       ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0);
+       BUG_ON(create && ret == 0);
        if (ret > 0) {
                bh_result->b_size = (ret << inode->i_blkbits);
                ret = 0;
@@ -2382,10 +2539,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock,
 }
 
 /*
- * get called vi ext4_da_writepages after taking page lock (have journal handle)
- * get called via journal_submit_inode_data_buffers (no journal handle)
- * get called via shrink_page_list via pdflush (no journal handle)
- * or grab_page_cache when doing write_begin (have journal handle)
+ * This function can get called via...
+ *   - ext4_da_writepages after taking page lock (have journal handle)
+ *   - journal_submit_inode_data_buffers (no journal handle)
+ *   - shrink_page_list via pdflush (no journal handle)
+ *   - grab_page_cache when doing write_begin (have journal handle)
  */
 static int ext4_da_writepage(struct page *page,
                                struct writeback_control *wbc)
@@ -2436,7 +2594,7 @@ static int ext4_da_writepage(struct page *page,
                 * do block allocation here.
                 */
                ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                               ext4_normal_get_block_write);
+                                         noalloc_get_block_write);
                if (!ret) {
                        page_bufs = page_buffers(page);
                        /* check whether all are mapped and non delay */
@@ -2461,11 +2619,10 @@ static int ext4_da_writepage(struct page *page,
        }
 
        if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode))
-               ret = nobh_writepage(page, ext4_normal_get_block_write, wbc);
+               ret = nobh_writepage(page, noalloc_get_block_write, wbc);
        else
-               ret = block_write_full_page(page,
-                                               ext4_normal_get_block_write,
-                                               wbc);
+               ret = block_write_full_page(page, noalloc_get_block_write,
+                                           wbc);
 
        return ret;
 }
@@ -2777,7 +2934,7 @@ retry:
        *pagep = page;
 
        ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata,
-                                                       ext4_da_get_block_prep);
+                               ext4_da_get_block_prep);
        if (ret < 0) {
                unlock_page(page);
                ext4_journal_stop(handle);
@@ -2815,7 +2972,7 @@ static int ext4_da_should_update_i_disksize(struct page *page,
        for (i = 0; i < idx; i++)
                bh = bh->b_this_page;
 
-       if (!buffer_mapped(bh) || (buffer_delay(bh)))
+       if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh))
                return 0;
        return 1;
 }
@@ -3085,12 +3242,10 @@ static int __ext4_normal_writepage(struct page *page,
        struct inode *inode = page->mapping->host;
 
        if (test_opt(inode->i_sb, NOBH))
-               return nobh_writepage(page,
-                                       ext4_normal_get_block_write, wbc);
+               return nobh_writepage(page, noalloc_get_block_write, wbc);
        else
-               return block_write_full_page(page,
-                                               ext4_normal_get_block_write,
-                                               wbc);
+               return block_write_full_page(page, noalloc_get_block_write,
+                                            wbc);
 }
 
 static int ext4_normal_writepage(struct page *page,
@@ -3142,7 +3297,7 @@ static int __ext4_journalled_writepage(struct page *page,
        int err;
 
        ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
-                                       ext4_normal_get_block_write);
+                                 noalloc_get_block_write);
        if (ret != 0)
                goto out_unlock;
 
@@ -3227,9 +3382,8 @@ static int ext4_journalled_writepage(struct page *page,
                 * really know unless we go poke around in the buffer_heads.
                 * But block_write_full_page will do the right thing.
                 */
-               return block_write_full_page(page,
-                                               ext4_normal_get_block_write,
-                                               wbc);
+               return block_write_full_page(page, noalloc_get_block_write,
+                                            wbc);
        }
 no_write:
        redirty_page_for_writepage(wbc, page);
@@ -3973,7 +4127,8 @@ void ext4_truncate(struct inode *inode)
        if (!ext4_can_truncate(inode))
                return;
 
-       if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+       if (ei->i_disksize && inode->i_size == 0 &&
+           !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
                ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
 
        if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
@@ -4715,25 +4870,6 @@ int ext4_write_inode(struct inode *inode, int wait)
        return ext4_force_commit(inode->i_sb);
 }
 
-int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh)
-{
-       int err = 0;
-
-       mark_buffer_dirty(bh);
-       if (inode && inode_needs_sync(inode)) {
-               sync_dirty_buffer(bh);
-               if (buffer_req(bh) && !buffer_uptodate(bh)) {
-                       ext4_error(inode->i_sb, __func__,
-                                  "IO error syncing inode, "
-                                  "inode=%lu, block=%llu",
-                                  inode->i_ino,
-                                  (unsigned long long)bh->b_blocknr);
-                       err = -EIO;
-               }
-       }
-       return err;
-}
-
 /*
  * ext4_setattr()
  *
@@ -4930,7 +5066,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk)
  */
 int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
 {
-       int groups, gdpblocks;
+       ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
+       int gdpblocks;
        int idxblocks;
        int ret = 0;
 
@@ -4957,8 +5094,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk)
                groups += nrblocks;
 
        gdpblocks = groups;
-       if (groups > EXT4_SB(inode->i_sb)->s_groups_count)
-               groups = EXT4_SB(inode->i_sb)->s_groups_count;
+       if (groups > ngroups)
+               groups = ngroups;
        if (groups > EXT4_SB(inode->i_sb)->s_gdb_count)
                gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
 
@@ -4998,7 +5135,7 @@ int ext4_writepage_trans_blocks(struct inode *inode)
  * Calculate the journal credits for a chunk of data modification.
  *
  * This is called from DIO, fallocate or whoever calling
- * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks.
+ * ext4_get_blocks() to map/allocate a chunk of contiguous disk blocks.
  *
  * journal buffers for data blocks are not included here, as DIO
  * and fallocate do no need to journal data buffers.
index f871677a798499c4327629cf282d4f75b38d44b3..ed8482e22c0ea7623641c28efbede7bb7b65091e 100644 (file)
@@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr)
        ext4_set_bit(bit, addr);
 }
 
-static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-       addr = mb_correct_addr_and_bit(&bit, addr);
-       ext4_set_bit_atomic(lock, bit, addr);
-}
-
 static inline void mb_clear_bit(int bit, void *addr)
 {
        addr = mb_correct_addr_and_bit(&bit, addr);
        ext4_clear_bit(bit, addr);
 }
 
-static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr)
-{
-       addr = mb_correct_addr_and_bit(&bit, addr);
-       ext4_clear_bit_atomic(lock, bit, addr);
-}
-
 static inline int mb_find_next_zero_bit(void *addr, int max, int start)
 {
        int fix = 0, ret, tmpmax;
@@ -448,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b,
 
        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
-       BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+       assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) {
                        ext4_fsblk_t blocknr;
@@ -472,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count)
 
        if (unlikely(e4b->bd_info->bb_bitmap == NULL))
                return;
-       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        for (i = 0; i < count; i++) {
                BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap));
                mb_set_bit(first + i, e4b->bd_info->bb_bitmap);
@@ -739,6 +727,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb,
 
 static int ext4_mb_init_cache(struct page *page, char *incore)
 {
+       ext4_group_t ngroups;
        int blocksize;
        int blocks_per_page;
        int groups_per_page;
@@ -757,6 +746,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
 
        inode = page->mapping->host;
        sb = inode->i_sb;
+       ngroups = ext4_get_groups_count(sb);
        blocksize = 1 << inode->i_blkbits;
        blocks_per_page = PAGE_CACHE_SIZE / blocksize;
 
@@ -780,7 +770,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
        for (i = 0; i < groups_per_page; i++) {
                struct ext4_group_desc *desc;
 
-               if (first_group + i >= EXT4_SB(sb)->s_groups_count)
+               if (first_group + i >= ngroups)
                        break;
 
                err = -EIO;
@@ -801,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                        unlock_buffer(bh[i]);
                        continue;
                }
-               spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+               ext4_lock_group(sb, first_group + i);
                if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                        ext4_init_block_bitmap(sb, bh[i],
                                                first_group + i, desc);
                        set_bitmap_uptodate(bh[i]);
                        set_buffer_uptodate(bh[i]);
-                       spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+                       ext4_unlock_group(sb, first_group + i);
                        unlock_buffer(bh[i]);
                        continue;
                }
-               spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i));
+               ext4_unlock_group(sb, first_group + i);
                if (buffer_uptodate(bh[i])) {
                        /*
                         * if not uninit if bh is uptodate,
@@ -852,7 +842,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore)
                struct ext4_group_info *grinfo;
 
                group = (first_block + i) >> 1;
-               if (group >= EXT4_SB(sb)->s_groups_count)
+               if (group >= ngroups)
                        break;
 
                /*
@@ -1078,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block)
        return 0;
 }
 
-static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_clear_bits(void *bm, int cur, int len)
 {
        __u32 *addr;
 
@@ -1091,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-               if (lock)
-                       mb_clear_bit_atomic(lock, cur, bm);
-               else
-                       mb_clear_bit(cur, bm);
+               mb_clear_bit(cur, bm);
                cur++;
        }
 }
 
-static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
+static void mb_set_bits(void *bm, int cur, int len)
 {
        __u32 *addr;
 
@@ -1112,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len)
                        cur += 32;
                        continue;
                }
-               if (lock)
-                       mb_set_bit_atomic(lock, cur, bm);
-               else
-                       mb_set_bit(cur, bm);
+               mb_set_bit(cur, bm);
                cur++;
        }
 }
@@ -1131,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b,
        struct super_block *sb = e4b->bd_sb;
 
        BUG_ON(first + count > (sb->s_blocksize << 3));
-       BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group));
+       assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_free_blocks_double(inode, e4b, first, count);
 
@@ -1212,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block,
        int ord;
        void *buddy;
 
-       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        BUG_ON(ex == NULL);
 
        buddy = mb_find_buddy(e4b, order, &max);
@@ -1276,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
 
        BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3));
        BUG_ON(e4b->bd_group != ex->fe_group);
-       BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group));
+       assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group));
        mb_check_buddy(e4b);
        mb_mark_used_double(e4b, start, len);
 
@@ -1330,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex)
                e4b->bd_info->bb_counters[ord]++;
        }
 
-       mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group),
-                       EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+       mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
        mb_check_buddy(e4b);
 
        return ret;
@@ -1726,7 +1709,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        unsigned free, fragments;
        unsigned i, bits;
        int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
-       struct ext4_group_desc *desc;
        struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
 
        BUG_ON(cr < 0 || cr >= 4);
@@ -1742,10 +1724,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
        switch (cr) {
        case 0:
                BUG_ON(ac->ac_2order == 0);
-               /* If this group is uninitialized, skip it initially */
-               desc = ext4_get_group_desc(ac->ac_sb, group, NULL);
-               if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
-                       return 0;
 
                /* Avoid using the first bg of a flexgroup for data files */
                if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
@@ -1788,6 +1766,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
        int block, pnum;
        int blocks_per_page;
        int groups_per_page;
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t first_group;
        struct ext4_group_info *grp;
 
@@ -1807,7 +1786,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group)
        /* read all groups the page covers into the cache */
        for (i = 0; i < groups_per_page; i++) {
 
-               if ((first_group + i) >= EXT4_SB(sb)->s_groups_count)
+               if ((first_group + i) >= ngroups)
                        break;
                grp = ext4_get_group_info(sb, first_group + i);
                /* take all groups write allocation
@@ -1945,8 +1924,7 @@ err:
 static noinline_for_stack int
 ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 {
-       ext4_group_t group;
-       ext4_group_t i;
+       ext4_group_t ngroups, group, i;
        int cr;
        int err = 0;
        int bsbits;
@@ -1957,6 +1935,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
 
        sb = ac->ac_sb;
        sbi = EXT4_SB(sb);
+       ngroups = ext4_get_groups_count(sb);
        BUG_ON(ac->ac_status == AC_STATUS_FOUND);
 
        /* first, try the goal */
@@ -2017,11 +1996,11 @@ repeat:
                 */
                group = ac->ac_g_ex.fe_group;
 
-               for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) {
+               for (i = 0; i < ngroups; group++, i++) {
                        struct ext4_group_info *grp;
                        struct ext4_group_desc *desc;
 
-                       if (group == EXT4_SB(sb)->s_groups_count)
+                       if (group == ngroups)
                                group = 0;
 
                        /* quick check to skip empty groups */
@@ -2064,9 +2043,7 @@ repeat:
 
                        ac->ac_groups_scanned++;
                        desc = ext4_get_group_desc(sb, group, NULL);
-                       if (cr == 0 || (desc->bg_flags &
-                                       cpu_to_le16(EXT4_BG_BLOCK_UNINIT) &&
-                                       ac->ac_2order != 0))
+                       if (cr == 0)
                                ext4_mb_simple_scan_group(ac, &e4b);
                        else if (cr == 1 &&
                                        ac->ac_g_ex.fe_len == sbi->s_stripe)
@@ -2315,12 +2292,10 @@ static struct file_operations ext4_mb_seq_history_fops = {
 static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 {
        struct super_block *sb = seq->private;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
 
-       if (*pos < 0 || *pos >= sbi->s_groups_count)
+       if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                return NULL;
-
        group = *pos + 1;
        return (void *) ((unsigned long) group);
 }
@@ -2328,11 +2303,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos)
 static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos)
 {
        struct super_block *sb = seq->private;
-       struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t group;
 
        ++*pos;
-       if (*pos < 0 || *pos >= sbi->s_groups_count)
+       if (*pos < 0 || *pos >= ext4_get_groups_count(sb))
                return NULL;
        group = *pos + 1;
        return (void *) ((unsigned long) group);
@@ -2420,7 +2394,8 @@ static void ext4_mb_history_release(struct super_block *sb)
 
        if (sbi->s_proc != NULL) {
                remove_proc_entry("mb_groups", sbi->s_proc);
-               remove_proc_entry("mb_history", sbi->s_proc);
+               if (sbi->s_mb_history_max)
+                       remove_proc_entry("mb_history", sbi->s_proc);
        }
        kfree(sbi->s_mb_history);
 }
@@ -2431,17 +2406,17 @@ static void ext4_mb_history_init(struct super_block *sb)
        int i;
 
        if (sbi->s_proc != NULL) {
-               proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
-                                &ext4_mb_seq_history_fops, sb);
+               if (sbi->s_mb_history_max)
+                       proc_create_data("mb_history", S_IRUGO, sbi->s_proc,
+                                        &ext4_mb_seq_history_fops, sb);
                proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
                                 &ext4_mb_seq_groups_fops, sb);
        }
 
-       sbi->s_mb_history_max = 1000;
        sbi->s_mb_history_cur = 0;
        spin_lock_init(&sbi->s_mb_history_lock);
        i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history);
-       sbi->s_mb_history = kzalloc(i, GFP_KERNEL);
+       sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL;
        /* if we can't allocate history, then we simple won't use it */
 }
 
@@ -2451,7 +2426,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac)
        struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
        struct ext4_mb_history h;
 
-       if (unlikely(sbi->s_mb_history == NULL))
+       if (sbi->s_mb_history == NULL)
                return;
 
        if (!(ac->ac_op & sbi->s_mb_history_filter))
@@ -2587,6 +2562,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add)
 
 static int ext4_mb_init_backend(struct super_block *sb)
 {
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        int metalen;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -2598,7 +2574,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        struct ext4_group_desc *desc;
 
        /* This is the number of blocks used by GDT */
-       num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) -
+       num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) -
                                1) >> EXT4_DESC_PER_BLOCK_BITS(sb);
 
        /*
@@ -2644,7 +2620,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
        for (i = 0; i < num_meta_group_infos; i++) {
                if ((i + 1) == num_meta_group_infos)
                        metalen = sizeof(*meta_group_info) *
-                               (sbi->s_groups_count -
+                               (ngroups -
                                        (i << EXT4_DESC_PER_BLOCK_BITS(sb)));
                meta_group_info = kmalloc(metalen, GFP_KERNEL);
                if (meta_group_info == NULL) {
@@ -2655,7 +2631,7 @@ static int ext4_mb_init_backend(struct super_block *sb)
                sbi->s_group_info[i] = meta_group_info;
        }
 
-       for (i = 0; i < sbi->s_groups_count; i++) {
+       for (i = 0; i < ngroups; i++) {
                desc = ext4_get_group_desc(sb, i, NULL);
                if (desc == NULL) {
                        printk(KERN_ERR
@@ -2761,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
        return 0;
 }
 
-/* need to called with ext4 group lock (ext4_lock_group) */
+/* needs to be called with the ext4 group lock held */
 static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 {
        struct ext4_prealloc_space *pa;
@@ -2781,13 +2757,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp)
 
 int ext4_mb_release(struct super_block *sb)
 {
+       ext4_group_t ngroups = ext4_get_groups_count(sb);
        ext4_group_t i;
        int num_meta_group_infos;
        struct ext4_group_info *grinfo;
        struct ext4_sb_info *sbi = EXT4_SB(sb);
 
        if (sbi->s_group_info) {
-               for (i = 0; i < sbi->s_groups_count; i++) {
+               for (i = 0; i < ngroups; i++) {
                        grinfo = ext4_get_group_info(sb, i);
 #ifdef DOUBLE_CHECK
                        kfree(grinfo->bb_bitmap);
@@ -2797,7 +2774,7 @@ int ext4_mb_release(struct super_block *sb)
                        ext4_unlock_group(sb, i);
                        kfree(grinfo);
                }
-               num_meta_group_infos = (sbi->s_groups_count +
+               num_meta_group_infos = (ngroups +
                                EXT4_DESC_PER_BLOCK(sb) - 1) >>
                        EXT4_DESC_PER_BLOCK_BITS(sb);
                for (i = 0; i < num_meta_group_infos; i++)
@@ -2984,27 +2961,25 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                + le32_to_cpu(es->s_first_data_block);
 
        len = ac->ac_b_ex.fe_len;
-       if (in_range(ext4_block_bitmap(sb, gdp), block, len) ||
-           in_range(ext4_inode_bitmap(sb, gdp), block, len) ||
-           in_range(block, ext4_inode_table(sb, gdp),
-                    EXT4_SB(sb)->s_itb_per_group) ||
-           in_range(block + len - 1, ext4_inode_table(sb, gdp),
-                    EXT4_SB(sb)->s_itb_per_group)) {
+       if (!ext4_data_block_valid(sbi, block, len)) {
                ext4_error(sb, __func__,
-                          "Allocating block %llu in system zone of %d group\n",
-                          block, ac->ac_b_ex.fe_group);
+                          "Allocating blocks %llu-%llu which overlap "
+                          "fs metadata\n", block, block+len);
                /* File system mounted not to panic on error
                 * Fix the bitmap and repeat the block allocation
                 * We leak some of the blocks here.
                 */
-               mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group),
-                               bitmap_bh->b_data, ac->ac_b_ex.fe_start,
-                               ac->ac_b_ex.fe_len);
+               ext4_lock_group(sb, ac->ac_b_ex.fe_group);
+               mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,
+                           ac->ac_b_ex.fe_len);
+               ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
                err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
                if (!err)
                        err = -EAGAIN;
                goto out_err;
        }
+
+       ext4_lock_group(sb, ac->ac_b_ex.fe_group);
 #ifdef AGGRESSIVE_CHECK
        {
                int i;
@@ -3014,9 +2989,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
                }
        }
 #endif
-       spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
-       mb_set_bits(NULL, bitmap_bh->b_data,
-                               ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len);
+       mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len);
        if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
                gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
                ext4_free_blks_set(sb, gdp,
@@ -3026,7 +2999,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
        len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len;
        ext4_free_blks_set(sb, gdp, len);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp);
-       spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group));
+
+       ext4_unlock_group(sb, ac->ac_b_ex.fe_group);
        percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len);
        /*
         * Now reduce the dirty block count also. Should not go negative
@@ -3459,7 +3433,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
  * the function goes through all block freed in the group
  * but not yet committed and marks them used in in-core bitmap.
  * buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with the ext4 group lock held
  */
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
                                                ext4_group_t group)
@@ -3473,9 +3447,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 
        while (n) {
                entry = rb_entry(n, struct ext4_free_data, node);
-               mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-                               bitmap, entry->start_blk,
-                               entry->count);
+               mb_set_bits(bitmap, entry->start_blk, entry->count);
                n = rb_next(n);
        }
        return;
@@ -3484,7 +3456,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 /*
  * the function goes through all preallocation in this group and marks them
  * used in in-core bitmap. buddy must be generated from this bitmap
- * Need to be called with ext4 group lock (ext4_lock_group)
+ * Need to be called with ext4 group lock held
  */
 static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                                        ext4_group_t group)
@@ -3516,8 +3488,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
                if (unlikely(len == 0))
                        continue;
                BUG_ON(groupnr != group);
-               mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group),
-                                               bitmap, start, len);
+               mb_set_bits(bitmap, start, len);
                preallocated += len;
                count++;
        }
@@ -4121,7 +4092,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode,
 static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
 {
        struct super_block *sb = ac->ac_sb;
-       ext4_group_t i;
+       ext4_group_t ngroups, i;
 
        printk(KERN_ERR "EXT4-fs: Can't allocate:"
                        " Allocation context details:\n");
@@ -4145,7 +4116,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac)
        printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned,
                ac->ac_found);
        printk(KERN_ERR "EXT4-fs: groups: \n");
-       for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) {
+       ngroups = ext4_get_groups_count(sb);
+       for (i = 0; i < ngroups; i++) {
                struct ext4_group_info *grp = ext4_get_group_info(sb, i);
                struct ext4_prealloc_space *pa;
                ext4_grpblk_t start;
@@ -4469,13 +4441,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
 
 static int ext4_mb_discard_preallocations(struct super_block *sb, int needed)
 {
-       ext4_group_t i;
+       ext4_group_t i, ngroups = ext4_get_groups_count(sb);
        int ret;
        int freed = 0;
 
        trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d",
                   sb->s_id, needed);
-       for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) {
+       for (i = 0; i < ngroups && needed > 0; i++) {
                ret = ext4_mb_discard_group_preallocations(sb, i, needed);
                freed += ret;
                needed -= ret;
@@ -4859,29 +4831,25 @@ do_more:
                new_entry->group  = block_group;
                new_entry->count = count;
                new_entry->t_tid = handle->h_transaction->t_tid;
+
                ext4_lock_group(sb, block_group);
-               mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-                               bit, count);
+               mb_clear_bits(bitmap_bh->b_data, bit, count);
                ext4_mb_free_metadata(handle, &e4b, new_entry);
-               ext4_unlock_group(sb, block_group);
        } else {
-               ext4_lock_group(sb, block_group);
                /* need to update group_info->bb_free and bitmap
                 * with group lock held. generate_buddy look at
                 * them with group lock_held
                 */
-               mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data,
-                               bit, count);
+               ext4_lock_group(sb, block_group);
+               mb_clear_bits(bitmap_bh->b_data, bit, count);
                mb_free_blocks(inode, &e4b, bit, count);
                ext4_mb_return_to_preallocation(inode, &e4b, block, count);
-               ext4_unlock_group(sb, block_group);
        }
 
-       spin_lock(sb_bgl_lock(sbi, block_group));
        ret = ext4_free_blks_count(sb, gdp) + count;
        ext4_free_blks_set(sb, gdp, ret);
        gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp);
-       spin_unlock(sb_bgl_lock(sbi, block_group));
+       ext4_unlock_group(sb, block_group);
        percpu_counter_add(&sbi->s_freeblocks_counter, count);
 
        if (sbi->s_log_groups_per_flex) {
index dd9e6cd5f6cf4dc97ae6c97b4c6c5c9d5bee9e4d..75e34f69215bfb3c24a6ecf9545eace9a36053f7 100644 (file)
@@ -23,7 +23,6 @@
 #include <linux/mutex.h>
 #include "ext4_jbd2.h"
 #include "ext4.h"
-#include "group.h"
 
 /*
  * with AGGRESSIVE_CHECK allocator runs consistency checks over
index 22098e1cd085911a7aeffc975ac7a0773a32bbc0..07eb6649e4fa1396d80e03186ffd76559b4c6709 100644 (file)
@@ -37,7 +37,6 @@
 #include "ext4.h"
 #include "ext4_jbd2.h"
 
-#include "namei.h"
 #include "xattr.h"
 #include "acl.h"
 
@@ -750,7 +749,7 @@ static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
                        ext4fs_dirhash(de->name, de->name_len, &h);
                        map_tail--;
                        map_tail->hash = h.hash;
-                       map_tail->offs = (u16) ((char *) de - base);
+                       map_tail->offs = ((char *) de - base)>>2;
                        map_tail->size = le16_to_cpu(de->rec_len);
                        count++;
                        cond_resched();
@@ -1148,7 +1147,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
        unsigned rec_len = 0;
 
        while (count--) {
-               struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + map->offs);
+               struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) 
+                                               (from + (map->offs<<2));
                rec_len = EXT4_DIR_REC_LEN(de->name_len);
                memcpy (to, de, rec_len);
                ((struct ext4_dir_entry_2 *) to)->rec_len =
@@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        if (!ext4_handle_valid(handle))
                return 0;
 
-       lock_super(sb);
+       mutex_lock(&EXT4_SB(sb)->s_orphan_lock);
        if (!list_empty(&EXT4_I(inode)->i_orphan))
                goto out_unlock;
 
@@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
 
        /* @@@ FIXME: Observation from aviro:
         * I think I can trigger J_ASSERT in ext4_orphan_add().  We block
-        * here (on lock_super()), so race with ext4_link() which might bump
+        * here (on s_orphan_lock), so race with ext4_link() which might bump
         * ->i_nlink. For, say it, character device. Not a regular file,
         * not a directory, not a symlink and ->i_nlink > 0.
+        *
+        * tytso, 4/25/2009: I'm not sure how that could happen;
+        * shouldn't the fs core protect us from these sort of
+        * unlink()/link() races?
         */
        J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
                  S_ISLNK(inode->i_mode)) || inode->i_nlink == 0);
@@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode)
        jbd_debug(4, "orphan inode %lu will point to %d\n",
                        inode->i_ino, NEXT_ORPHAN(inode));
 out_unlock:
-       unlock_super(sb);
+       mutex_unlock(&EXT4_SB(sb)->s_orphan_lock);
        ext4_std_error(inode->i_sb, err);
        return err;
 }
@@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
        if (!ext4_handle_valid(handle))
                return 0;
 
-       lock_super(inode->i_sb);
-       if (list_empty(&ei->i_orphan)) {
-               unlock_super(inode->i_sb);
-               return 0;
-       }
+       mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
+       if (list_empty(&ei->i_orphan))
+               goto out;
 
        ino_next = NEXT_ORPHAN(inode);
        prev = ei->i_orphan.prev;
@@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode)
 out_err:
        ext4_std_error(inode->i_sb, err);
 out:
-       unlock_super(inode->i_sb);
+       mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock);
        return err;
 
 out_brelse:
@@ -2533,6 +2535,7 @@ const struct inode_operations ext4_dir_inode_operations = {
        .removexattr    = generic_removexattr,
 #endif
        .permission     = ext4_permission,
+       .fiemap         = ext4_fiemap,
 };
 
 const struct inode_operations ext4_special_inode_operations = {
diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h
deleted file mode 100644 (file)
index 5e4dfff..0000000
+++ /dev/null
@@ -1,8 +0,0 @@
-/*  linux/fs/ext4/namei.h
- *
- * Copyright (C) 2005 Simtec Electronics
- *     Ben Dooks <ben@simtec.co.uk>
- *
-*/
-
-extern struct dentry *ext4_get_parent(struct dentry *child);
index 546c7dd869e19176e77cb54278e9bdfc2b520da4..27eb289eea3708fb4ee71dc26e31def645c5a03f 100644 (file)
@@ -15,7 +15,6 @@
 #include <linux/slab.h>
 
 #include "ext4_jbd2.h"
-#include "group.h"
 
 #define outside(b, first, last)        ((b) < (first) || (b) >= (last))
 #define inside(b, first, last) ((b) >= (first) && (b) < (last))
@@ -193,7 +192,7 @@ static int setup_new_group_blocks(struct super_block *sb,
        if (IS_ERR(handle))
                return PTR_ERR(handle);
 
-       lock_super(sb);
+       mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                err = -EBUSY;
                goto exit_journal;
@@ -302,7 +301,7 @@ exit_bh:
        brelse(bh);
 
 exit_journal:
-       unlock_super(sb);
+       mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext4_journal_stop(handle)) && !err)
                err = err2;
 
@@ -643,11 +642,12 @@ exit_free:
  * important part is that the new block and inode counts are in the backup
  * superblocks, and the location of the new group metadata in the GDT backups.
  *
- * We do not need lock_super() for this, because these blocks are not
- * otherwise touched by the filesystem code when it is mounted.  We don't
- * need to worry about last changing from sbi->s_groups_count, because the
- * worst that can happen is that we do not copy the full number of backups
- * at this time.  The resize which changed s_groups_count will backup again.
+ * We do not need to take the s_resize_lock for this, because these
+ * blocks are not otherwise touched by the filesystem code when it is
+ * mounted.  We don't need to worry about last changing from
+ * sbi->s_groups_count, because the worst that can happen is that we
+ * do not copy the full number of backups at this time.  The resize
+ * which changed s_groups_count will backup again.
  */
 static void update_backups(struct super_block *sb,
                           int blk_off, char *data, int size)
@@ -809,7 +809,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
                goto exit_put;
        }
 
-       lock_super(sb);
+       mutex_lock(&sbi->s_resize_lock);
        if (input->group != sbi->s_groups_count) {
                ext4_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
@@ -840,7 +840,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         /*
          * OK, now we've set up the new group.  Time to make it active.
          *
-         * Current kernels don't lock all allocations via lock_super(),
+         * We do not lock all allocations via s_resize_lock
          * so we have to be safe wrt. concurrent accesses the group
          * data.  So we need to be careful to set all of the relevant
          * group descriptor data etc. *before* we enable the group.
@@ -900,12 +900,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
         *
         * The precise rules we use are:
         *
-        * * Writers of s_groups_count *must* hold lock_super
+        * * Writers of s_groups_count *must* hold s_resize_lock
         * AND
         * * Writers must perform a smp_wmb() after updating all dependent
         *   data and before modifying the groups count
         *
-        * * Readers must hold lock_super() over the access
+        * * Readers must hold s_resize_lock over the access
         * OR
         * * Readers must perform an smp_rmb() after reading the groups count
         *   and before reading any dependent data.
@@ -948,7 +948,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
        sb->s_dirt = 1;
 
 exit_journal:
-       unlock_super(sb);
+       mutex_unlock(&sbi->s_resize_lock);
        if ((err2 = ext4_journal_stop(handle)) && !err)
                err = err2;
        if (!err) {
@@ -986,7 +986,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
 
        /* We don't need to worry about locking wrt other resizers just
         * yet: we're going to revalidate es->s_blocks_count after
-        * taking lock_super() below. */
+        * taking the s_resize_lock below. */
        o_blocks_count = ext4_blocks_count(es);
        o_groups_count = EXT4_SB(sb)->s_groups_count;
 
@@ -1056,11 +1056,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                goto exit_put;
        }
 
-       lock_super(sb);
+       mutex_lock(&EXT4_SB(sb)->s_resize_lock);
        if (o_blocks_count != ext4_blocks_count(es)) {
                ext4_warning(sb, __func__,
                             "multiple resizers run on filesystem!");
-               unlock_super(sb);
+               mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                err = -EBUSY;
                goto exit_put;
@@ -1070,14 +1070,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es,
                                                 EXT4_SB(sb)->s_sbh))) {
                ext4_warning(sb, __func__,
                             "error %d on journal write access", err);
-               unlock_super(sb);
+               mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
                ext4_journal_stop(handle);
                goto exit_put;
        }
        ext4_blocks_count_set(es, o_blocks_count + add);
        ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh);
        sb->s_dirt = 1;
-       unlock_super(sb);
+       mutex_unlock(&EXT4_SB(sb)->s_resize_lock);
        ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
                   o_blocks_count + add);
        /* We add the blocks to the bitmap and set the group need init bit */
index 2958f4e6f222a8956f595145b1fe31e16e4eec9d..c191d0f65fedc441ee601fbfa783dea6b889db50 100644 (file)
@@ -20,6 +20,7 @@
 #include <linux/string.h>
 #include <linux/fs.h>
 #include <linux/time.h>
+#include <linux/vmalloc.h>
 #include <linux/jbd2.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include "ext4_jbd2.h"
 #include "xattr.h"
 #include "acl.h"
-#include "namei.h"
-#include "group.h"
+
+static int default_mb_history_length = 1000;
+
+module_param_named(default_mb_history_length, default_mb_history_length,
+                  int, 0644);
+MODULE_PARM_DESC(default_mb_history_length,
+                "Default number of entries saved for mb_history");
 
 struct proc_dir_entry *ext4_proc_root;
 static struct kset *ext4_kset;
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
                             unsigned long journal_devnum);
-static int ext4_commit_super(struct super_block *sb,
-                             struct ext4_super_block *es, int sync);
+static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
                                        struct ext4_super_block *es);
 static void ext4_clear_journal_err(struct super_block *sb,
@@ -74,7 +79,7 @@ ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_block_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
+                (ext4_fsblk_t)le32_to_cpu(bg->bg_block_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
@@ -82,7 +87,7 @@ ext4_fsblk_t ext4_inode_bitmap(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_inode_bitmap_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
+                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_bitmap_hi) << 32 : 0);
 }
 
 ext4_fsblk_t ext4_inode_table(struct super_block *sb,
@@ -90,7 +95,7 @@ ext4_fsblk_t ext4_inode_table(struct super_block *sb,
 {
        return le32_to_cpu(bg->bg_inode_table_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
+                (ext4_fsblk_t)le32_to_cpu(bg->bg_inode_table_hi) << 32 : 0);
 }
 
 __u32 ext4_free_blks_count(struct super_block *sb,
@@ -98,7 +103,7 @@ __u32 ext4_free_blks_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_free_blocks_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
+                (__u32)le16_to_cpu(bg->bg_free_blocks_count_hi) << 16 : 0);
 }
 
 __u32 ext4_free_inodes_count(struct super_block *sb,
@@ -106,7 +111,7 @@ __u32 ext4_free_inodes_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_free_inodes_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
+                (__u32)le16_to_cpu(bg->bg_free_inodes_count_hi) << 16 : 0);
 }
 
 __u32 ext4_used_dirs_count(struct super_block *sb,
@@ -114,7 +119,7 @@ __u32 ext4_used_dirs_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_used_dirs_count_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
+                (__u32)le16_to_cpu(bg->bg_used_dirs_count_hi) << 16 : 0);
 }
 
 __u32 ext4_itable_unused_count(struct super_block *sb,
@@ -122,7 +127,7 @@ __u32 ext4_itable_unused_count(struct super_block *sb,
 {
        return le16_to_cpu(bg->bg_itable_unused_lo) |
                (EXT4_DESC_SIZE(sb) >= EXT4_MIN_DESC_SIZE_64BIT ?
-               (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
+                (__u32)le16_to_cpu(bg->bg_itable_unused_hi) << 16 : 0);
 }
 
 void ext4_block_bitmap_set(struct super_block *sb,
@@ -202,8 +207,7 @@ handle_t *ext4_journal_start_sb(struct super_block *sb, int nblocks)
        journal = EXT4_SB(sb)->s_journal;
        if (journal) {
                if (is_journal_aborted(journal)) {
-                       ext4_abort(sb, __func__,
-                                  "Detected aborted journal");
+                       ext4_abort(sb, __func__, "Detected aborted journal");
                        return ERR_PTR(-EROFS);
                }
                return jbd2_journal_start(journal, nblocks);
@@ -302,10 +306,10 @@ static void ext4_handle_error(struct super_block *sb)
                        jbd2_journal_abort(journal, -EIO);
        }
        if (test_opt(sb, ERRORS_RO)) {
-               printk(KERN_CRIT "Remounting filesystem read-only\n");
+               ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
                sb->s_flags |= MS_RDONLY;
        }
-       ext4_commit_super(sb, es, 1);
+       ext4_commit_super(sb, 1);
        if (test_opt(sb, ERRORS_PANIC))
                panic("EXT4-fs (device %s): panic forced after error\n",
                        sb->s_id);
@@ -395,8 +399,6 @@ void ext4_abort(struct super_block *sb, const char *function,
 {
        va_list args;
 
-       printk(KERN_CRIT "ext4_abort called.\n");
-
        va_start(args, fmt);
        printk(KERN_CRIT "EXT4-fs error (device %s): %s: ", sb->s_id, function);
        vprintk(fmt, args);
@@ -409,7 +411,7 @@ void ext4_abort(struct super_block *sb, const char *function,
        if (sb->s_flags & MS_RDONLY)
                return;
 
-       printk(KERN_CRIT "Remounting filesystem read-only\n");
+       ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only");
        EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
        sb->s_flags |= MS_RDONLY;
        EXT4_SB(sb)->s_mount_opt |= EXT4_MOUNT_ABORT;
@@ -417,6 +419,18 @@ void ext4_abort(struct super_block *sb, const char *function,
                jbd2_journal_abort(EXT4_SB(sb)->s_journal, -EIO);
 }
 
+/* Print "<prefix>EXT4-fs (<s_id>): <msg>\n"; callers pass a KERN_* level. */
+void ext4_msg(struct super_block *sb, const char *prefix, const char *fmt, ...)
+{
+       va_list args;
+
+       va_start(args, fmt);
+       printk("%sEXT4-fs (%s): ", prefix, sb->s_id);
+       vprintk(fmt, args);
+       printk("\n");           /* callers omit the trailing newline */
+       va_end(args);
+}
+
 void ext4_warning(struct super_block *sb, const char *function,
                  const char *fmt, ...)
 {
@@ -431,7 +445,7 @@ void ext4_warning(struct super_block *sb, const char *function,
 }
 
 void ext4_grp_locked_error(struct super_block *sb, ext4_group_t grp,
-                               const char *function, const char *fmt, ...)
+                          const char *function, const char *fmt, ...)
 __releases(bitlock)
 __acquires(bitlock)
 {
@@ -447,7 +461,7 @@ __acquires(bitlock)
        if (test_opt(sb, ERRORS_CONT)) {
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-               ext4_commit_super(sb, es, 0);
+               ext4_commit_super(sb, 0);
                return;
        }
        ext4_unlock_group(sb, grp);
@@ -467,7 +481,6 @@ __acquires(bitlock)
        return;
 }
 
-
 void ext4_update_dynamic_rev(struct super_block *sb)
 {
        struct ext4_super_block *es = EXT4_SB(sb)->s_es;
@@ -496,7 +509,7 @@ void ext4_update_dynamic_rev(struct super_block *sb)
 /*
  * Open the external journal device
  */
-static struct block_device *ext4_blkdev_get(dev_t dev)
+static struct block_device *ext4_blkdev_get(dev_t dev, struct super_block *sb)
 {
        struct block_device *bdev;
        char b[BDEVNAME_SIZE];
@@ -507,7 +520,7 @@ static struct block_device *ext4_blkdev_get(dev_t dev)
        return bdev;
 
 fail:
-       printk(KERN_ERR "EXT4-fs: failed to open journal device %s: %ld\n",
+       ext4_msg(sb, KERN_ERR, "failed to open journal device %s: %ld",
                        __bdevname(dev, b), PTR_ERR(bdev));
        return NULL;
 }
@@ -543,8 +556,8 @@ static void dump_orphan_list(struct super_block *sb, struct ext4_sb_info *sbi)
 {
        struct list_head *l;
 
-       printk(KERN_ERR "sb orphan head is %d\n",
-              le32_to_cpu(sbi->s_es->s_last_orphan));
+       ext4_msg(sb, KERN_ERR, "sb orphan head is %d",
+                le32_to_cpu(sbi->s_es->s_last_orphan));
 
        printk(KERN_ERR "sb_info orphan list:\n");
        list_for_each(l, &sbi->s_orphan) {
@@ -563,6 +576,7 @@ static void ext4_put_super(struct super_block *sb)
        struct ext4_super_block *es = sbi->s_es;
        int i, err;
 
+       ext4_release_system_zone(sb);
        ext4_mb_release(sb);
        ext4_ext_release(sb);
        ext4_xattr_put_super(sb);
@@ -576,7 +590,7 @@ static void ext4_put_super(struct super_block *sb)
        if (!(sb->s_flags & MS_RDONLY)) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
                es->s_state = cpu_to_le16(sbi->s_mount_state);
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
        }
        if (sbi->s_proc) {
                remove_proc_entry(sb->s_id, ext4_proc_root);
@@ -586,7 +600,10 @@ static void ext4_put_super(struct super_block *sb)
        for (i = 0; i < sbi->s_gdb_count; i++)
                brelse(sbi->s_group_desc[i]);
        kfree(sbi->s_group_desc);
-       kfree(sbi->s_flex_groups);
+       if (is_vmalloc_addr(sbi->s_flex_groups))
+               vfree(sbi->s_flex_groups);
+       else
+               kfree(sbi->s_flex_groups);
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -629,7 +646,6 @@ static void ext4_put_super(struct super_block *sb)
        lock_kernel();
        kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
-       return;
 }
 
 static struct kmem_cache *ext4_inode_cachep;
@@ -644,6 +660,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei = kmem_cache_alloc(ext4_inode_cachep, GFP_NOFS);
        if (!ei)
                return NULL;
+
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
        ei->i_acl = EXT4_ACL_NOT_CACHED;
        ei->i_default_acl = EXT4_ACL_NOT_CACHED;
@@ -664,14 +681,16 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
        ei->i_allocated_meta_blocks = 0;
        ei->i_delalloc_reserved_flag = 0;
        spin_lock_init(&(ei->i_block_reservation_lock));
+
        return &ei->vfs_inode;
 }
 
 static void ext4_destroy_inode(struct inode *inode)
 {
        if (!list_empty(&(EXT4_I(inode)->i_orphan))) {
-               printk("EXT4 Inode %p: orphan list check failed!\n",
-                       EXT4_I(inode));
+               ext4_msg(inode->i_sb, KERN_ERR,
+                        "Inode %lu (%p): orphan list check failed!",
+                        inode->i_ino, EXT4_I(inode));
                print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4,
                                EXT4_I(inode), sizeof(struct ext4_inode_info),
                                true);
@@ -870,12 +889,12 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
                seq_puts(seq, ",noauto_da_alloc");
 
        ext4_show_quota_options(seq, sb);
+
        return 0;
 }
 
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
-               u64 ino, u32 generation)
+                                       u64 ino, u32 generation)
 {
        struct inode *inode;
 
@@ -904,14 +923,14 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 }
 
 static struct dentry *ext4_fh_to_dentry(struct super_block *sb, struct fid *fid,
-               int fh_len, int fh_type)
+                                       int fh_len, int fh_type)
 {
        return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
 }
 
 static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
-               int fh_len, int fh_type)
+                                       int fh_len, int fh_type)
 {
        return generic_fh_to_parent(sb, fid, fh_len, fh_type,
                                    ext4_nfs_get_inode);
@@ -923,7 +942,8 @@ static struct dentry *ext4_fh_to_parent(struct super_block *sb, struct fid *fid,
  * which would prevent try_to_free_buffers() from freeing them, we must use
  * jbd2 layer's try_to_free_buffers() function to release them.
  */
-static int bdev_try_to_free_page(struct super_block *sb, struct page *page, gfp_t wait)
+static int bdev_try_to_free_page(struct super_block *sb, struct page *page,
+                                gfp_t wait)
 {
        journal_t *journal = EXT4_SB(sb)->s_journal;
 
@@ -992,7 +1012,6 @@ static const struct super_operations ext4_sops = {
        .dirty_inode    = ext4_dirty_inode,
        .delete_inode   = ext4_delete_inode,
        .put_super      = ext4_put_super,
-       .write_super    = ext4_write_super,
        .sync_fs        = ext4_sync_fs,
        .freeze_fs      = ext4_freeze,
        .unfreeze_fs    = ext4_unfreeze,
@@ -1007,6 +1026,25 @@ static const struct super_operations ext4_sops = {
        .bdev_try_to_free_page = bdev_try_to_free_page,
 };
 
+static const struct super_operations ext4_nojournal_sops = {
+       .alloc_inode    = ext4_alloc_inode,
+       .destroy_inode  = ext4_destroy_inode,
+       .write_inode    = ext4_write_inode,
+       .dirty_inode    = ext4_dirty_inode,
+       .delete_inode   = ext4_delete_inode,
+       .write_super    = ext4_write_super,     /* not set in ext4_sops */
+       .put_super      = ext4_put_super,
+       .statfs         = ext4_statfs,
+       .remount_fs     = ext4_remount,
+       .clear_inode    = ext4_clear_inode,
+       .show_options   = ext4_show_options,
+#ifdef CONFIG_QUOTA
+       .quota_read     = ext4_quota_read,
+       .quota_write    = ext4_quota_write,
+#endif
+       .bdev_try_to_free_page = bdev_try_to_free_page,
+};
+
 static const struct export_operations ext4_export_ops = {
        .fh_to_dentry = ext4_fh_to_dentry,
        .fh_to_parent = ext4_fh_to_parent,
@@ -1023,12 +1061,13 @@ enum {
        Opt_journal_update, Opt_journal_dev,
        Opt_journal_checksum, Opt_journal_async_commit,
        Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
-       Opt_data_err_abort, Opt_data_err_ignore,
+       Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length,
        Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
        Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
        Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
        Opt_usrquota, Opt_grpquota, Opt_i_version,
        Opt_stripe, Opt_delalloc, Opt_nodelalloc,
+       Opt_block_validity, Opt_noblock_validity,
        Opt_inode_readahead_blks, Opt_journal_ioprio
 };
 
@@ -1069,6 +1108,7 @@ static const match_table_t tokens = {
        {Opt_data_writeback, "data=writeback"},
        {Opt_data_err_abort, "data_err=abort"},
        {Opt_data_err_ignore, "data_err=ignore"},
+       {Opt_mb_history_length, "mb_history_length=%u"},
        {Opt_offusrjquota, "usrjquota="},
        {Opt_usrjquota, "usrjquota=%s"},
        {Opt_offgrpjquota, "grpjquota="},
@@ -1087,6 +1127,8 @@ static const match_table_t tokens = {
        {Opt_resize, "resize"},
        {Opt_delalloc, "delalloc"},
        {Opt_nodelalloc, "nodelalloc"},
+       {Opt_block_validity, "block_validity"},
+       {Opt_noblock_validity, "noblock_validity"},
        {Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
        {Opt_journal_ioprio, "journal_ioprio=%u"},
        {Opt_auto_da_alloc, "auto_da_alloc=%u"},
@@ -1102,8 +1144,9 @@ static ext4_fsblk_t get_sb_block(void **data)
 
        if (!options || strncmp(options, "sb=", 3) != 0)
                return 1;       /* Default location */
+
        options += 3;
-       /*todo: use simple_strtoll with >32bit ext4 */
+       /* TODO: use simple_strtoll with >32bit ext4 */
        sb_block = simple_strtoul(options, &options, 0);
        if (*options && *options != ',') {
                printk(KERN_ERR "EXT4-fs: Invalid sb specification: %s\n",
@@ -1113,6 +1156,7 @@ static ext4_fsblk_t get_sb_block(void **data)
        if (*options == ',')
                options++;
        *data = (void *) options;
+
        return sb_block;
 }
 
@@ -1206,8 +1250,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
                case Opt_user_xattr:
                case Opt_nouser_xattr:
-                       printk(KERN_ERR "EXT4 (no)user_xattr options "
-                              "not supported\n");
+                       ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
                        break;
 #endif
 #ifdef CONFIG_EXT4_FS_POSIX_ACL
@@ -1220,8 +1263,7 @@ static int parse_options(char *options, struct super_block *sb,
 #else
                case Opt_acl:
                case Opt_noacl:
-                       printk(KERN_ERR "EXT4 (no)acl options "
-                              "not supported\n");
+                       ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
                        break;
 #endif
                case Opt_journal_update:
@@ -1231,16 +1273,16 @@ static int parse_options(char *options, struct super_block *sb,
                           user to specify an existing inode to be the
                           journal file. */
                        if (is_remount) {
-                               printk(KERN_ERR "EXT4-fs: cannot specify "
-                                      "journal on remount\n");
+                               ext4_msg(sb, KERN_ERR,
+                                        "Cannot specify journal on remount");
                                return 0;
                        }
                        set_opt(sbi->s_mount_opt, UPDATE_JOURNAL);
                        break;
                case Opt_journal_dev:
                        if (is_remount) {
-                               printk(KERN_ERR "EXT4-fs: cannot specify "
-                                      "journal on remount\n");
+                               ext4_msg(sb, KERN_ERR,
+                                       "Cannot specify journal on remount");
                                return 0;
                        }
                        if (match_int(&args[0], &option))
@@ -1294,9 +1336,8 @@ static int parse_options(char *options, struct super_block *sb,
                        if (is_remount) {
                                if ((sbi->s_mount_opt & EXT4_MOUNT_DATA_FLAGS)
                                                != data_opt) {
-                                       printk(KERN_ERR
-                                               "EXT4-fs: cannot change data "
-                                               "mode on remount\n");
+                                       ext4_msg(sb, KERN_ERR,
+                                               "Cannot change data mode on remount");
                                        return 0;
                                }
                        } else {
@@ -1310,6 +1351,13 @@ static int parse_options(char *options, struct super_block *sb,
                case Opt_data_err_ignore:
                        clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT);
                        break;
+               case Opt_mb_history_length:
+                       if (match_int(&args[0], &option))
+                               return 0;
+                       if (option < 0)
+                               return 0;
+                       sbi->s_mb_history_max = option;
+                       break;
 #ifdef CONFIG_QUOTA
                case Opt_usrjquota:
                        qtype = USRQUOTA;
@@ -1319,31 +1367,31 @@ static int parse_options(char *options, struct super_block *sb,
 set_qf_name:
                        if (sb_any_quota_loaded(sb) &&
                            !sbi->s_qf_names[qtype]) {
-                               printk(KERN_ERR
-                                      "EXT4-fs: Cannot change journaled "
-                                      "quota options when quota turned on.\n");
+                               ext4_msg(sb, KERN_ERR,
+                                      "Cannot change journaled "
+                                      "quota options when quota turned on");
                                return 0;
                        }
                        qname = match_strdup(&args[0]);
                        if (!qname) {
-                               printk(KERN_ERR
-                                       "EXT4-fs: not enough memory for "
-                                       "storing quotafile name.\n");
+                               ext4_msg(sb, KERN_ERR,
+                                       "Not enough memory for "
+                                       "storing quotafile name");
                                return 0;
                        }
                        if (sbi->s_qf_names[qtype] &&
                            strcmp(sbi->s_qf_names[qtype], qname)) {
-                               printk(KERN_ERR
-                                       "EXT4-fs: %s quota file already "
-                                       "specified.\n", QTYPE2NAME(qtype));
+                               ext4_msg(sb, KERN_ERR,
+                                       "%s quota file already "
+                                       "specified", QTYPE2NAME(qtype));
                                kfree(qname);
                                return 0;
                        }
                        sbi->s_qf_names[qtype] = qname;
                        if (strchr(sbi->s_qf_names[qtype], '/')) {
-                               printk(KERN_ERR
-                                       "EXT4-fs: quotafile must be on "
-                                       "filesystem root.\n");
+                               ext4_msg(sb, KERN_ERR,
+                                       "quotafile must be on "
+                                       "filesystem root");
                                kfree(sbi->s_qf_names[qtype]);
                                sbi->s_qf_names[qtype] = NULL;
                                return 0;
@@ -1358,9 +1406,9 @@ set_qf_name:
 clear_qf_name:
                        if (sb_any_quota_loaded(sb) &&
                            sbi->s_qf_names[qtype]) {
-                               printk(KERN_ERR "EXT4-fs: Cannot change "
+                               ext4_msg(sb, KERN_ERR, "Cannot change "
                                        "journaled quota options when "
-                                       "quota turned on.\n");
+                                       "quota turned on");
                                return 0;
                        }
                        /*
@@ -1377,9 +1425,9 @@ clear_qf_name:
 set_qf_format:
                        if (sb_any_quota_loaded(sb) &&
                            sbi->s_jquota_fmt != qfmt) {
-                               printk(KERN_ERR "EXT4-fs: Cannot change "
+                               ext4_msg(sb, KERN_ERR, "Cannot change "
                                        "journaled quota options when "
-                                       "quota turned on.\n");
+                                       "quota turned on");
                                return 0;
                        }
                        sbi->s_jquota_fmt = qfmt;
@@ -1395,8 +1443,8 @@ set_qf_format:
                        break;
                case Opt_noquota:
                        if (sb_any_quota_loaded(sb)) {
-                               printk(KERN_ERR "EXT4-fs: Cannot change quota "
-                                       "options when quota turned on.\n");
+                               ext4_msg(sb, KERN_ERR, "Cannot change quota "
+                                       "options when quota turned on");
                                return 0;
                        }
                        clear_opt(sbi->s_mount_opt, QUOTA);
@@ -1407,8 +1455,8 @@ set_qf_format:
                case Opt_quota:
                case Opt_usrquota:
                case Opt_grpquota:
-                       printk(KERN_ERR
-                               "EXT4-fs: quota options not supported.\n");
+                       ext4_msg(sb, KERN_ERR,
+                               "quota options not supported");
                        break;
                case Opt_usrjquota:
                case Opt_grpjquota:
@@ -1416,9 +1464,8 @@ set_qf_format:
                case Opt_offgrpjquota:
                case Opt_jqfmt_vfsold:
                case Opt_jqfmt_vfsv0:
-                       printk(KERN_ERR
-                               "EXT4-fs: journaled quota options not "
-                               "supported.\n");
+                       ext4_msg(sb, KERN_ERR,
+                               "journaled quota options not supported");
                        break;
                case Opt_noquota:
                        break;
@@ -1443,8 +1490,9 @@ set_qf_format:
                        break;
                case Opt_resize:
                        if (!is_remount) {
-                               printk("EXT4-fs: resize option only available "
-                                       "for remount\n");
+                               ext4_msg(sb, KERN_ERR,
+                                       "resize option only available "
+                                       "for remount");
                                return 0;
                        }
                        if (match_int(&args[0], &option) != 0)
@@ -1474,14 +1522,21 @@ set_qf_format:
                case Opt_delalloc:
                        set_opt(sbi->s_mount_opt, DELALLOC);
                        break;
+               case Opt_block_validity:
+                       set_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                       break;
+               case Opt_noblock_validity:
+                       clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY);
+                       break;
                case Opt_inode_readahead_blks:
                        if (match_int(&args[0], &option))
                                return 0;
                        if (option < 0 || option > (1 << 30))
                                return 0;
-                       if (option & (option - 1)) {
-                               printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
-                                      " must be a power of 2\n");
+                       if (option && !is_power_of_2(option)) {
+                               /* 0 stays valid; ext4_msg() adds "EXT4-fs" */
+                               ext4_msg(sb, KERN_ERR, "inode_readahead_blks"
+                                        " must be a power of 2");
                                return 0;
                        }
                        sbi->s_inode_readahead_blks = option;
@@ -1508,9 +1563,9 @@ set_qf_format:
                                set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
                        break;
                default:
-                       printk(KERN_ERR
-                              "EXT4-fs: Unrecognized mount option \"%s\" "
-                              "or missing value\n", p);
+                       ext4_msg(sb, KERN_ERR,
+                              "Unrecognized mount option \"%s\" "
+                              "or missing value", p);
                        return 0;
                }
        }
@@ -1528,21 +1583,21 @@ set_qf_format:
                                (sbi->s_mount_opt & EXT4_MOUNT_GRPQUOTA)) ||
                    (sbi->s_qf_names[GRPQUOTA] &&
                                (sbi->s_mount_opt & EXT4_MOUNT_USRQUOTA))) {
-                       printk(KERN_ERR "EXT4-fs: old and new quota "
-                                       "format mixing.\n");
+                       ext4_msg(sb, KERN_ERR, "old and new quota "
+                                       "format mixing");
                        return 0;
                }
 
                if (!sbi->s_jquota_fmt) {
-                       printk(KERN_ERR "EXT4-fs: journaled quota format "
-                                       "not specified.\n");
+                       ext4_msg(sb, KERN_ERR, "journaled quota format "
+                                       "not specified");
                        return 0;
                }
        } else {
                if (sbi->s_jquota_fmt) {
-                       printk(KERN_ERR "EXT4-fs: journaled quota format "
+                       ext4_msg(sb, KERN_ERR, "journaled quota format "
                                        "specified with no journaling "
-                                       "enabled.\n");
+                                       "enabled");
                        return 0;
                }
        }
@@ -1557,32 +1612,32 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        int res = 0;
 
        if (le32_to_cpu(es->s_rev_level) > EXT4_MAX_SUPP_REV) {
-               printk(KERN_ERR "EXT4-fs warning: revision level too high, "
-                      "forcing read-only mode\n");
+               ext4_msg(sb, KERN_ERR, "revision level too high, "
+                        "forcing read-only mode");
                res = MS_RDONLY;
        }
        if (read_only)
                return res;
        if (!(sbi->s_mount_state & EXT4_VALID_FS))
-               printk(KERN_WARNING "EXT4-fs warning: mounting unchecked fs, "
-                      "running e2fsck is recommended\n");
+               ext4_msg(sb, KERN_WARNING, "warning: mounting unchecked fs, "
+                        "running e2fsck is recommended");
        else if ((sbi->s_mount_state & EXT4_ERROR_FS))
-               printk(KERN_WARNING
-                      "EXT4-fs warning: mounting fs with errors, "
-                      "running e2fsck is recommended\n");
+               ext4_msg(sb, KERN_WARNING,
+                        "warning: mounting fs with errors, "
+                        "running e2fsck is recommended");
        else if ((__s16) le16_to_cpu(es->s_max_mnt_count) >= 0 &&
                 le16_to_cpu(es->s_mnt_count) >=
                 (unsigned short) (__s16) le16_to_cpu(es->s_max_mnt_count))
-               printk(KERN_WARNING
-                      "EXT4-fs warning: maximal mount count reached, "
-                      "running e2fsck is recommended\n");
+               ext4_msg(sb, KERN_WARNING,
+                        "warning: maximal mount count reached, "
+                        "running e2fsck is recommended");
        else if (le32_to_cpu(es->s_checkinterval) &&
                (le32_to_cpu(es->s_lastcheck) +
                        le32_to_cpu(es->s_checkinterval) <= get_seconds()))
-               printk(KERN_WARNING
-                      "EXT4-fs warning: checktime reached, "
-                      "running e2fsck is recommended\n");
-       if (!sbi->s_journal) 
+               ext4_msg(sb, KERN_WARNING,
+                        "warning: checktime reached, "
+                        "running e2fsck is recommended");
+       if (!sbi->s_journal)
                es->s_state &= cpu_to_le16(~EXT4_VALID_FS);
        if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
                es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
@@ -1592,7 +1647,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
        if (sbi->s_journal)
                EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 
-       ext4_commit_super(sb, es, 1);
+       ext4_commit_super(sb, 1);
        if (test_opt(sb, DEBUG))
                printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, "
                                "bpg=%lu, ipg=%lu, mo=%04lx]\n",
@@ -1603,11 +1658,11 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
                        sbi->s_mount_opt);
 
        if (EXT4_SB(sb)->s_journal) {
-               printk(KERN_INFO "EXT4 FS on %s, %s journal on %s\n",
-                      sb->s_id, EXT4_SB(sb)->s_journal->j_inode ? "internal" :
+               ext4_msg(sb, KERN_INFO, "%s journal on %s",
+                      EXT4_SB(sb)->s_journal->j_inode ? "internal" :
                       "external", EXT4_SB(sb)->s_journal->j_devname);
        } else {
-               printk(KERN_INFO "EXT4 FS on %s, no journal\n", sb->s_id);
+               ext4_msg(sb, KERN_INFO, "no journal");
        }
        return res;
 }
@@ -1616,10 +1671,10 @@ static int ext4_fill_flex_info(struct super_block *sb)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        struct ext4_group_desc *gdp = NULL;
-       struct buffer_head *bh;
        ext4_group_t flex_group_count;
        ext4_group_t flex_group;
        int groups_per_flex = 0;
+       size_t size;
        int i;
 
        if (!sbi->s_es->s_log_groups_per_flex) {
@@ -1634,16 +1689,21 @@ static int ext4_fill_flex_info(struct super_block *sb)
        flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) +
                        ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) <<
                              EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex;
-       sbi->s_flex_groups = kzalloc(flex_group_count *
-                                    sizeof(struct flex_groups), GFP_KERNEL);
+       size = flex_group_count * sizeof(struct flex_groups);
+       sbi->s_flex_groups = kzalloc(size, GFP_KERNEL);
+       if (sbi->s_flex_groups == NULL) {
+               sbi->s_flex_groups = vmalloc(size);
+               if (sbi->s_flex_groups)
+                       memset(sbi->s_flex_groups, 0, size);
+       }
        if (sbi->s_flex_groups == NULL) {
-               printk(KERN_ERR "EXT4-fs: not enough memory for "
-                               "%u flex groups\n", flex_group_count);
+               ext4_msg(sb, KERN_ERR, "not enough memory for "
+                               "%u flex groups", flex_group_count);
                goto failed;
        }
 
        for (i = 0; i < sbi->s_groups_count; i++) {
-               gdp = ext4_get_group_desc(sb, i, &bh);
+               gdp = ext4_get_group_desc(sb, i, NULL);
 
                flex_group = ext4_flex_group(sbi, i);
                atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
@@ -1724,44 +1784,44 @@ static int ext4_check_descriptors(struct super_block *sb)
 
                block_bitmap = ext4_block_bitmap(sb, gdp);
                if (block_bitmap < first_block || block_bitmap > last_block) {
-                       printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                       ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Block bitmap for group %u not in group "
-                              "(block %llu)!\n", i, block_bitmap);
+                              "(block %llu)!", i, block_bitmap);
                        return 0;
                }
                inode_bitmap = ext4_inode_bitmap(sb, gdp);
                if (inode_bitmap < first_block || inode_bitmap > last_block) {
-                       printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                       ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode bitmap for group %u not in group "
-                              "(block %llu)!\n", i, inode_bitmap);
+                              "(block %llu)!", i, inode_bitmap);
                        return 0;
                }
                inode_table = ext4_inode_table(sb, gdp);
                if (inode_table < first_block ||
                    inode_table + sbi->s_itb_per_group - 1 > last_block) {
-                       printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
+                       ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
                               "Inode table for group %u not in group "
-                              "(block %llu)!\n", i, inode_table);
+                              "(block %llu)!", i, inode_table);
                        return 0;
                }
-               spin_lock(sb_bgl_lock(sbi, i));
+               ext4_lock_group(sb, i);
                if (!ext4_group_desc_csum_verify(sbi, i, gdp)) {
-                       printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: "
-                              "Checksum for group %u failed (%u!=%u)\n",
-                              i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
-                              gdp)), le16_to_cpu(gdp->bg_checksum));
+                       ext4_msg(sb, KERN_ERR, "ext4_check_descriptors: "
+                                "Checksum for group %u failed (%u!=%u)",
+                                i, le16_to_cpu(ext4_group_desc_csum(sbi, i,
+                                    gdp)), le16_to_cpu(gdp->bg_checksum));
                        if (!(sb->s_flags & MS_RDONLY)) {
-                               spin_unlock(sb_bgl_lock(sbi, i));
+                               ext4_unlock_group(sb, i);
                                return 0;
                        }
                }
-               spin_unlock(sb_bgl_lock(sbi, i));
+               ext4_unlock_group(sb, i);
                if (!flexbg_flag)
                        first_block += EXT4_BLOCKS_PER_GROUP(sb);
        }
 
        ext4_free_blocks_count_set(sbi->s_es, ext4_count_free_blocks(sb));
-       sbi->s_es->s_free_inodes_count = cpu_to_le32(ext4_count_free_inodes(sb));
+       sbi->s_es->s_free_inodes_count =cpu_to_le32(ext4_count_free_inodes(sb));
        return 1;
 }
 
@@ -1796,8 +1856,8 @@ static void ext4_orphan_cleanup(struct super_block *sb,
        }
 
        if (bdev_read_only(sb->s_bdev)) {
-               printk(KERN_ERR "EXT4-fs: write access "
-                       "unavailable, skipping orphan cleanup.\n");
+               ext4_msg(sb, KERN_ERR, "write access "
+                       "unavailable, skipping orphan cleanup");
                return;
        }
 
@@ -1811,8 +1871,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
        }
 
        if (s_flags & MS_RDONLY) {
-               printk(KERN_INFO "EXT4-fs: %s: orphan cleanup on readonly fs\n",
-                      sb->s_id);
+               ext4_msg(sb, KERN_INFO, "orphan cleanup on readonly fs");
                sb->s_flags &= ~MS_RDONLY;
        }
 #ifdef CONFIG_QUOTA
@@ -1823,9 +1882,9 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                if (EXT4_SB(sb)->s_qf_names[i]) {
                        int ret = ext4_quota_on_mount(sb, i);
                        if (ret < 0)
-                               printk(KERN_ERR
-                                       "EXT4-fs: Cannot turn on journaled "
-                                       "quota: error %d\n", ret);
+                               ext4_msg(sb, KERN_ERR,
+                                       "Cannot turn on journaled "
+                                       "quota: error %d", ret);
                }
        }
 #endif
@@ -1842,16 +1901,16 @@ static void ext4_orphan_cleanup(struct super_block *sb,
                list_add(&EXT4_I(inode)->i_orphan, &EXT4_SB(sb)->s_orphan);
                vfs_dq_init(inode);
                if (inode->i_nlink) {
-                       printk(KERN_DEBUG
-                               "%s: truncating inode %lu to %lld bytes\n",
+                       ext4_msg(sb, KERN_DEBUG,
+                               "%s: truncating inode %lu to %lld bytes",
                                __func__, inode->i_ino, inode->i_size);
                        jbd_debug(2, "truncating inode %lu to %lld bytes\n",
                                  inode->i_ino, inode->i_size);
                        ext4_truncate(inode);
                        nr_truncates++;
                } else {
-                       printk(KERN_DEBUG
-                               "%s: deleting unreferenced inode %lu\n",
+                       ext4_msg(sb, KERN_DEBUG,
+                               "%s: deleting unreferenced inode %lu",
                                __func__, inode->i_ino);
                        jbd_debug(2, "deleting unreferenced inode %lu\n",
                                  inode->i_ino);
@@ -1863,11 +1922,11 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #define PLURAL(x) (x), ((x) == 1) ? "" : "s"
 
        if (nr_orphans)
-               printk(KERN_INFO "EXT4-fs: %s: %d orphan inode%s deleted\n",
-                      sb->s_id, PLURAL(nr_orphans));
+               ext4_msg(sb, KERN_INFO, "%d orphan inode%s deleted",
+                      PLURAL(nr_orphans));
        if (nr_truncates)
-               printk(KERN_INFO "EXT4-fs: %s: %d truncate%s cleaned up\n",
-                      sb->s_id, PLURAL(nr_truncates));
+               ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up",
+                      PLURAL(nr_truncates));
 #ifdef CONFIG_QUOTA
        /* Turn quotas off */
        for (i = 0; i < MAXQUOTAS; i++) {
@@ -1877,6 +1936,7 @@ static void ext4_orphan_cleanup(struct super_block *sb,
 #endif
        sb->s_flags = s_flags; /* Restore MS_RDONLY status */
 }
+
 /*
  * Maximal extent format file size.
  * Resulting logical blkno at s_maxbytes must fit in our on-disk
@@ -1927,19 +1987,19 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
        loff_t res = EXT4_NDIR_BLOCKS;
        int meta_blocks;
        loff_t upper_limit;
-       /* This is calculated to be the largest file size for a
-        * dense, bitmapped file such that the total number of
-        * sectors in the file, including data and all indirect blocks,
-        * does not exceed 2^48 -1
-        * __u32 i_blocks_lo and _u16 i_blocks_high representing the
-        * total number of  512 bytes blocks of the file
+       /* This is calculated to be the largest file size for a dense, block
+        * mapped file such that the file's total number of 512-byte sectors,
+        * including data and all indirect blocks, does not exceed (2^48 - 1).
+        *
+        * __u32 i_blocks_lo and __u16 i_blocks_high represent the total
+        * number of 512-byte sectors of the file.
         */
 
        if (!has_huge_files || sizeof(blkcnt_t) < sizeof(u64)) {
                /*
-                * !has_huge_files or CONFIG_LBD is not enabled
-                * implies the inode i_block represent total blocks in
-                * 512 bytes 32 == size of vfs inode i_blocks * 8
+                * !has_huge_files or CONFIG_LBD not enabled implies that
+                * the inode i_block field represents total file blocks in
+                * 2^32 512-byte sectors == size of vfs inode i_blocks * 8
                 */
                upper_limit = (1LL << 32) - 1;
 
@@ -1981,7 +2041,7 @@ static loff_t ext4_max_bitmap_size(int bits, int has_huge_files)
 }
 
 static ext4_fsblk_t descriptor_loc(struct super_block *sb,
-                               ext4_fsblk_t logical_sb_block, int nr)
+                                  ext4_fsblk_t logical_sb_block, int nr)
 {
        struct ext4_sb_info *sbi = EXT4_SB(sb);
        ext4_group_t bg, first_meta_bg;
@@ -1995,6 +2055,7 @@ static ext4_fsblk_t descriptor_loc(struct super_block *sb,
        bg = sbi->s_desc_per_block * nr;
        if (ext4_bg_has_super(sb, bg))
                has_super = 1;
+
        return (has_super + ext4_group_first_block_no(sb, bg));
 }
 
@@ -2091,8 +2152,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
        if (parse_strtoul(buf, 0x40000000, &t))
                return -EINVAL;
 
-       /* inode_readahead_blks must be a power of 2 */
-       if (t & (t-1))
+       if (!is_power_of_2(t))
                return -EINVAL;
 
        sbi->s_inode_readahead_blks = t;
@@ -2100,7 +2160,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
 }
 
 static ssize_t sbi_ui_show(struct ext4_attr *a,
-                               struct ext4_sb_info *sbi, char *buf)
+                          struct ext4_sb_info *sbi, char *buf)
 {
        unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
 
@@ -2205,7 +2265,6 @@ static struct kobj_type ext4_ktype = {
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                                __releases(kernel_lock)
                                __acquires(kernel_lock)
-
 {
        struct buffer_head *bh;
        struct ext4_super_block *es = NULL;
@@ -2256,7 +2315,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        blocksize = sb_min_blocksize(sb, EXT4_MIN_BLOCK_SIZE);
        if (!blocksize) {
-               printk(KERN_ERR "EXT4-fs: unable to set blocksize\n");
+               ext4_msg(sb, KERN_ERR, "unable to set blocksize");
                goto out_fail;
        }
 
@@ -2272,7 +2331,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        if (!(bh = sb_bread(sb, logical_sb_block))) {
-               printk(KERN_ERR "EXT4-fs: unable to read superblock\n");
+               ext4_msg(sb, KERN_ERR, "unable to read superblock");
                goto out_fail;
        }
        /*
@@ -2321,6 +2380,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ;
        sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
        sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
+       sbi->s_mb_history_max = default_mb_history_length;
 
        set_opt(sbi->s_mount_opt, BARRIER);
 
@@ -2330,7 +2390,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        set_opt(sbi->s_mount_opt, DELALLOC);
 
-
        if (!parse_options((char *) data, sb, &journal_devnum,
                           &journal_ioprio, NULL, 0))
                goto failed_mount;
@@ -2342,9 +2401,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
            (EXT4_HAS_COMPAT_FEATURE(sb, ~0U) ||
             EXT4_HAS_RO_COMPAT_FEATURE(sb, ~0U) ||
             EXT4_HAS_INCOMPAT_FEATURE(sb, ~0U)))
-               printk(KERN_WARNING
-                      "EXT4-fs warning: feature flags set on rev 0 fs, "
-                      "running e2fsck is recommended\n");
+               ext4_msg(sb, KERN_WARNING,
+                      "feature flags set on rev 0 fs, "
+                      "running e2fsck is recommended");
 
        /*
         * Check feature flags regardless of the revision level, since we
@@ -2353,16 +2412,18 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
         */
        features = EXT4_HAS_INCOMPAT_FEATURE(sb, ~EXT4_FEATURE_INCOMPAT_SUPP);
        if (features) {
-               printk(KERN_ERR "EXT4-fs: %s: couldn't mount because of "
-                      "unsupported optional features (%x).\n", sb->s_id,
+               ext4_msg(sb, KERN_ERR,
+                       "Couldn't mount because of "
+                       "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_incompat) &
                        ~EXT4_FEATURE_INCOMPAT_SUPP));
                goto failed_mount;
        }
        features = EXT4_HAS_RO_COMPAT_FEATURE(sb, ~EXT4_FEATURE_RO_COMPAT_SUPP);
        if (!(sb->s_flags & MS_RDONLY) && features) {
-               printk(KERN_ERR "EXT4-fs: %s: couldn't mount RDWR because of "
-                      "unsupported optional features (%x).\n", sb->s_id,
+               ext4_msg(sb, KERN_ERR,
+                       "Couldn't mount RDWR because of "
+                       "unsupported optional features (%x)",
                        (le32_to_cpu(EXT4_SB(sb)->s_es->s_feature_ro_compat) &
                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                goto failed_mount;
@@ -2376,9 +2437,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                 */
                if (sizeof(root->i_blocks) < sizeof(u64) &&
                                !(sb->s_flags & MS_RDONLY)) {
-                       printk(KERN_ERR "EXT4-fs: %s: Filesystem with huge "
+                       ext4_msg(sb, KERN_ERR, "Filesystem with huge "
                                        "files cannot be mounted read-write "
-                                       "without CONFIG_LBD.\n", sb->s_id);
+                                       "without CONFIG_LBD");
                        goto failed_mount;
                }
        }
@@ -2386,17 +2447,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 
        if (blocksize < EXT4_MIN_BLOCK_SIZE ||
            blocksize > EXT4_MAX_BLOCK_SIZE) {
-               printk(KERN_ERR
-                      "EXT4-fs: Unsupported filesystem blocksize %d on %s.\n",
-                      blocksize, sb->s_id);
+               ext4_msg(sb, KERN_ERR,
+                      "Unsupported filesystem blocksize %d", blocksize);
                goto failed_mount;
        }
 
        if (sb->s_blocksize != blocksize) {
-
                /* Validate the filesystem blocksize */
                if (!sb_set_blocksize(sb, blocksize)) {
-                       printk(KERN_ERR "EXT4-fs: bad block size %d.\n",
+                       ext4_msg(sb, KERN_ERR, "bad block size %d",
                                        blocksize);
                        goto failed_mount;
                }
@@ -2406,15 +2465,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                offset = do_div(logical_sb_block, blocksize);
                bh = sb_bread(sb, logical_sb_block);
                if (!bh) {
-                       printk(KERN_ERR
-                              "EXT4-fs: Can't read superblock on 2nd try.\n");
+                       ext4_msg(sb, KERN_ERR,
+                              "Can't read superblock on 2nd try");
                        goto failed_mount;
                }
                es = (struct ext4_super_block *)(((char *)bh->b_data) + offset);
                sbi->s_es = es;
                if (es->s_magic != cpu_to_le16(EXT4_SUPER_MAGIC)) {
-                       printk(KERN_ERR
-                              "EXT4-fs: Magic mismatch, very weird !\n");
+                       ext4_msg(sb, KERN_ERR,
+                              "Magic mismatch, very weird!");
                        goto failed_mount;
                }
        }
@@ -2432,30 +2491,33 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                if ((sbi->s_inode_size < EXT4_GOOD_OLD_INODE_SIZE) ||
                    (!is_power_of_2(sbi->s_inode_size)) ||
                    (sbi->s_inode_size > blocksize)) {
-                       printk(KERN_ERR
-                              "EXT4-fs: unsupported inode size: %d\n",
+                       ext4_msg(sb, KERN_ERR,
+                              "unsupported inode size: %d",
                               sbi->s_inode_size);
                        goto failed_mount;
                }
                if (sbi->s_inode_size > EXT4_GOOD_OLD_INODE_SIZE)
                        sb->s_time_gran = 1 << (EXT4_EPOCH_BITS - 2);
        }
+
        sbi->s_desc_size = le16_to_cpu(es->s_desc_size);
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_64BIT)) {
                if (sbi->s_desc_size < EXT4_MIN_DESC_SIZE_64BIT ||
                    sbi->s_desc_size > EXT4_MAX_DESC_SIZE ||
                    !is_power_of_2(sbi->s_desc_size)) {
-                       printk(KERN_ERR
-                              "EXT4-fs: unsupported descriptor size %lu\n",
+                       ext4_msg(sb, KERN_ERR,
+                              "unsupported descriptor size %lu",
                               sbi->s_desc_size);
                        goto failed_mount;
                }
        } else
                sbi->s_desc_size = EXT4_MIN_DESC_SIZE;
+
        sbi->s_blocks_per_group = le32_to_cpu(es->s_blocks_per_group);
        sbi->s_inodes_per_group = le32_to_cpu(es->s_inodes_per_group);
        if (EXT4_INODE_SIZE(sb) == 0 || EXT4_INODES_PER_GROUP(sb) == 0)
                goto cantfind_ext4;
+
        sbi->s_inodes_per_block = blocksize / EXT4_INODE_SIZE(sb);
        if (sbi->s_inodes_per_block == 0)
                goto cantfind_ext4;
@@ -2466,6 +2528,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_mount_state = le16_to_cpu(es->s_state);
        sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
        sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
+
        for (i = 0; i < 4; i++)
                sbi->s_hash_seed[i] = le32_to_cpu(es->s_hash_seed[i]);
        sbi->s_def_hash_version = es->s_def_hash_version;
@@ -2483,25 +2546,24 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        }
 
        if (sbi->s_blocks_per_group > blocksize * 8) {
-               printk(KERN_ERR
-                      "EXT4-fs: #blocks per group too big: %lu\n",
+               ext4_msg(sb, KERN_ERR,
+                      "#blocks per group too big: %lu",
                       sbi->s_blocks_per_group);
                goto failed_mount;
        }
        if (sbi->s_inodes_per_group > blocksize * 8) {
-               printk(KERN_ERR
-                      "EXT4-fs: #inodes per group too big: %lu\n",
+               ext4_msg(sb, KERN_ERR,
+                      "#inodes per group too big: %lu",
                       sbi->s_inodes_per_group);
                goto failed_mount;
        }
 
        if (ext4_blocks_count(es) >
                    (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-               printk(KERN_ERR "EXT4-fs: filesystem on %s:"
-                       " too large to mount safely\n", sb->s_id);
+               ext4_msg(sb, KERN_ERR, "filesystem"
+                       " too large to mount safely");
                if (sizeof(sector_t) < 8)
-                       printk(KERN_WARNING "EXT4-fs: CONFIG_LBD not "
-                                       "enabled\n");
+                       ext4_msg(sb, KERN_WARNING, "CONFIG_LBD not enabled");
                goto failed_mount;
        }
 
@@ -2511,21 +2573,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /* check blocks count against device size */
        blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
        if (blocks_count && ext4_blocks_count(es) > blocks_count) {
-               printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
-                      "exceeds size of device (%llu blocks)\n",
+               ext4_msg(sb, KERN_WARNING, "bad geometry: block count %llu "
+                      "exceeds size of device (%llu blocks)",
                       ext4_blocks_count(es), blocks_count);
                goto failed_mount;
        }
 
-        /*
-         * It makes no sense for the first data block to be beyond the end
-         * of the filesystem.
-         */
-        if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
-                printk(KERN_WARNING "EXT4-fs: bad geometry: first data"
-                      "block %u is beyond end of filesystem (%llu)\n",
-                      le32_to_cpu(es->s_first_data_block),
-                      ext4_blocks_count(es));
+       /*
+        * It makes no sense for the first data block to be beyond the end
+        * of the filesystem.
+        */
+       if (le32_to_cpu(es->s_first_data_block) >= ext4_blocks_count(es)) {
+                ext4_msg(sb, KERN_WARNING, "bad geometry: first data "
+                        "block %u is beyond end of filesystem (%llu)",
+                        le32_to_cpu(es->s_first_data_block),
+                        ext4_blocks_count(es));
                goto failed_mount;
        }
        blocks_count = (ext4_blocks_count(es) -
@@ -2533,9 +2595,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        EXT4_BLOCKS_PER_GROUP(sb) - 1);
        do_div(blocks_count, EXT4_BLOCKS_PER_GROUP(sb));
        if (blocks_count > ((uint64_t)1<<32) - EXT4_DESC_PER_BLOCK(sb)) {
-               printk(KERN_WARNING "EXT4-fs: groups count too large: %u "
+               ext4_msg(sb, KERN_WARNING, "groups count too large: %u "
                       "(block count %llu, first data block %u, "
-                      "blocks per group %lu)\n", sbi->s_groups_count,
+                      "blocks per group %lu)", sbi->s_groups_count,
                       ext4_blocks_count(es),
                       le32_to_cpu(es->s_first_data_block),
                       EXT4_BLOCKS_PER_GROUP(sb));
@@ -2547,7 +2609,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sbi->s_group_desc = kmalloc(db_count * sizeof(struct buffer_head *),
                                    GFP_KERNEL);
        if (sbi->s_group_desc == NULL) {
-               printk(KERN_ERR "EXT4-fs: not enough memory\n");
+               ext4_msg(sb, KERN_ERR, "not enough memory");
                goto failed_mount;
        }
 
@@ -2562,21 +2624,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                block = descriptor_loc(sb, logical_sb_block, i);
                sbi->s_group_desc[i] = sb_bread(sb, block);
                if (!sbi->s_group_desc[i]) {
-                       printk(KERN_ERR "EXT4-fs: "
-                              "can't read group descriptor %d\n", i);
+                       ext4_msg(sb, KERN_ERR,
+                              "can't read group descriptor %d", i);
                        db_count = i;
                        goto failed_mount2;
                }
        }
        if (!ext4_check_descriptors(sb)) {
-               printk(KERN_ERR "EXT4-fs: group descriptors corrupted!\n");
+               ext4_msg(sb, KERN_ERR, "group descriptors corrupted!");
                goto failed_mount2;
        }
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG))
                if (!ext4_fill_flex_info(sb)) {
-                       printk(KERN_ERR
-                              "EXT4-fs: unable to initialize "
-                              "flex_bg meta info!\n");
+                       ext4_msg(sb, KERN_ERR,
+                              "unable to initialize "
+                              "flex_bg meta info!");
                        goto failed_mount2;
                }
 
@@ -2598,7 +2660,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                err = percpu_counter_init(&sbi->s_dirtyblocks_counter, 0);
        }
        if (err) {
-               printk(KERN_ERR "EXT4-fs: insufficient memory\n");
+               ext4_msg(sb, KERN_ERR, "insufficient memory");
                goto failed_mount3;
        }
 
@@ -2607,7 +2669,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        /*
         * set up enough so that it can read an inode
         */
-       sb->s_op = &ext4_sops;
+       if (!test_opt(sb, NOLOAD) &&
+           EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL))
+               sb->s_op = &ext4_sops;
+       else
+               sb->s_op = &ext4_nojournal_sops;
        sb->s_export_op = &ext4_export_ops;
        sb->s_xattr = ext4_xattr_handlers;
 #ifdef CONFIG_QUOTA
@@ -2615,6 +2681,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        sb->dq_op = &ext4_quota_operations;
 #endif
        INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */
+       mutex_init(&sbi->s_orphan_lock);
+       mutex_init(&sbi->s_resize_lock);
 
        sb->s_root = NULL;
 
@@ -2632,13 +2700,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        goto failed_mount3;
                if (!(sb->s_flags & MS_RDONLY) &&
                    EXT4_SB(sb)->s_journal->j_failed_commit) {
-                       printk(KERN_CRIT "EXT4-fs error (device %s): "
+                       ext4_msg(sb, KERN_CRIT, "error: "
                               "ext4_fill_super: Journal transaction "
-                              "%u is corrupt\n", sb->s_id,
+                              "%u is corrupt",
                               EXT4_SB(sb)->s_journal->j_failed_commit);
                        if (test_opt(sb, ERRORS_RO)) {
-                               printk(KERN_CRIT
-                                      "Mounting filesystem read-only\n");
+                               ext4_msg(sb, KERN_CRIT,
+                                      "Mounting filesystem read-only");
                                sb->s_flags |= MS_RDONLY;
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
@@ -2646,14 +2714,14 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
                        if (test_opt(sb, ERRORS_PANIC)) {
                                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-                               ext4_commit_super(sb, es, 1);
+                               ext4_commit_super(sb, 1);
                                goto failed_mount4;
                        }
                }
        } else if (test_opt(sb, NOLOAD) && !(sb->s_flags & MS_RDONLY) &&
              EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
-               printk(KERN_ERR "EXT4-fs: required journal recovery "
-                      "suppressed and not mounted read-only\n");
+               ext4_msg(sb, KERN_ERR, "required journal recovery "
+                      "suppressed and not mounted read-only");
                goto failed_mount4;
        } else {
                clear_opt(sbi->s_mount_opt, DATA_FLAGS);
@@ -2666,7 +2734,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        if (ext4_blocks_count(es) > 0xffffffffULL &&
            !jbd2_journal_set_features(EXT4_SB(sb)->s_journal, 0, 0,
                                       JBD2_FEATURE_INCOMPAT_64BIT)) {
-               printk(KERN_ERR "EXT4-fs: Failed to set 64-bit journal feature\n");
+               ext4_msg(sb, KERN_ERR, "Failed to set 64-bit journal feature");
                goto failed_mount4;
        }
 
@@ -2704,8 +2772,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
        case EXT4_MOUNT_WRITEBACK_DATA:
                if (!jbd2_journal_check_available_features
                    (sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_REVOKE)) {
-                       printk(KERN_ERR "EXT4-fs: Journal does not support "
-                              "requested data journaling mode\n");
+                       ext4_msg(sb, KERN_ERR, "Journal does not support "
+                              "requested data journaling mode");
                        goto failed_mount4;
                }
        default:
@@ -2717,8 +2785,8 @@ no_journal:
 
        if (test_opt(sb, NOBH)) {
                if (!(test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)) {
-                       printk(KERN_WARNING "EXT4-fs: Ignoring nobh option - "
-                               "its supported only with writeback mode\n");
+                       ext4_msg(sb, KERN_WARNING, "Ignoring nobh option - "
+                               "its supported only with writeback mode");
                        clear_opt(sbi->s_mount_opt, NOBH);
                }
        }
@@ -2729,18 +2797,18 @@ no_journal:
 
        root = ext4_iget(sb, EXT4_ROOT_INO);
        if (IS_ERR(root)) {
-               printk(KERN_ERR "EXT4-fs: get root inode failed\n");
+               ext4_msg(sb, KERN_ERR, "get root inode failed");
                ret = PTR_ERR(root);
                goto failed_mount4;
        }
        if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
                iput(root);
-               printk(KERN_ERR "EXT4-fs: corrupt root inode, run e2fsck\n");
+               ext4_msg(sb, KERN_ERR, "corrupt root inode, run e2fsck");
                goto failed_mount4;
        }
        sb->s_root = d_alloc_root(root);
        if (!sb->s_root) {
-               printk(KERN_ERR "EXT4-fs: get root dentry failed\n");
+               ext4_msg(sb, KERN_ERR, "get root dentry failed");
                iput(root);
                ret = -ENOMEM;
                goto failed_mount4;
@@ -2769,22 +2837,29 @@ no_journal:
                                                        sbi->s_inode_size) {
                sbi->s_want_extra_isize = sizeof(struct ext4_inode) -
                                                       EXT4_GOOD_OLD_INODE_SIZE;
-               printk(KERN_INFO "EXT4-fs: required extra inode space not"
-                       "available.\n");
+               ext4_msg(sb, KERN_INFO, "required extra inode space not"
+                        "available");
        }
 
        if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
-               printk(KERN_WARNING "EXT4-fs: Ignoring delalloc option - "
-                               "requested data journaling mode\n");
+               ext4_msg(sb, KERN_WARNING, "Ignoring delalloc option - "
+                        "requested data journaling mode");
                clear_opt(sbi->s_mount_opt, DELALLOC);
        } else if (test_opt(sb, DELALLOC))
-               printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n");
+               ext4_msg(sb, KERN_INFO, "delayed allocation enabled");
+
+       err = ext4_setup_system_zone(sb);
+       if (err) {
+               ext4_msg(sb, KERN_ERR, "failed to initialize system "
+                        "zone (%d)\n", err);
+               goto failed_mount4;
+       }
 
        ext4_ext_init(sb);
        err = ext4_mb_init(sb, needs_recovery);
        if (err) {
-               printk(KERN_ERR "EXT4-fs: failed to initalize mballoc (%d)\n",
-                      err);
+               ext4_msg(sb, KERN_ERR, "failed to initalize mballoc (%d)",
+                        err);
                goto failed_mount4;
        }
 
@@ -2798,19 +2873,11 @@ no_journal:
                goto failed_mount4;
        };
 
-       /*
-        * akpm: core read_super() calls in here with the superblock locked.
-        * That deadlocks, because orphan cleanup needs to lock the superblock
-        * in numerous places.  Here we just pop the lock - it's relatively
-        * harmless, because we are now ready to accept write_super() requests,
-        * and aviro says that's the only reason for hanging onto the
-        * superblock lock.
-        */
        EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS;
        ext4_orphan_cleanup(sb, es);
        EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS;
        if (needs_recovery) {
-               printk(KERN_INFO "EXT4-fs: recovery complete.\n");
+               ext4_msg(sb, KERN_INFO, "recovery complete");
                ext4_mark_recovery_complete(sb, es);
        }
        if (EXT4_SB(sb)->s_journal) {
@@ -2823,25 +2890,30 @@ no_journal:
        } else
                descr = "out journal";
 
-       printk(KERN_INFO "EXT4-fs: mounted filesystem %s with%s\n",
-              sb->s_id, descr);
+       ext4_msg(sb, KERN_INFO, "mounted filesystem with%s", descr);
 
        lock_kernel();
        return 0;
 
 cantfind_ext4:
        if (!silent)
-               printk(KERN_ERR "VFS: Can't find ext4 filesystem on dev %s.\n",
-                      sb->s_id);
+               ext4_msg(sb, KERN_ERR, "VFS: Can't find ext4 filesystem");
        goto failed_mount;
 
 failed_mount4:
-       printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id);
+       ext4_msg(sb, KERN_ERR, "mount failed");
+       ext4_release_system_zone(sb);
        if (sbi->s_journal) {
                jbd2_journal_destroy(sbi->s_journal);
                sbi->s_journal = NULL;
        }
 failed_mount3:
+       if (sbi->s_flex_groups) {
+               if (is_vmalloc_addr(sbi->s_flex_groups))
+                       vfree(sbi->s_flex_groups);
+               else
+                       kfree(sbi->s_flex_groups);
+       }
        percpu_counter_destroy(&sbi->s_freeblocks_counter);
        percpu_counter_destroy(&sbi->s_freeinodes_counter);
        percpu_counter_destroy(&sbi->s_dirs_counter);
@@ -2862,6 +2934,7 @@ failed_mount:
        brelse(bh);
 out_fail:
        sb->s_fs_info = NULL;
+       kfree(sbi->s_blockgroup_lock);
        kfree(sbi);
        lock_kernel();
        return ret;
@@ -2906,27 +2979,27 @@ static journal_t *ext4_get_journal(struct super_block *sb,
 
        journal_inode = ext4_iget(sb, journal_inum);
        if (IS_ERR(journal_inode)) {
-               printk(KERN_ERR "EXT4-fs: no journal found.\n");
+               ext4_msg(sb, KERN_ERR, "no journal found");
                return NULL;
        }
        if (!journal_inode->i_nlink) {
                make_bad_inode(journal_inode);
                iput(journal_inode);
-               printk(KERN_ERR "EXT4-fs: journal inode is deleted.\n");
+               ext4_msg(sb, KERN_ERR, "journal inode is deleted");
                return NULL;
        }
 
        jbd_debug(2, "Journal inode found at %p: %lld bytes\n",
                  journal_inode, journal_inode->i_size);
        if (!S_ISREG(journal_inode->i_mode)) {
-               printk(KERN_ERR "EXT4-fs: invalid journal inode.\n");
+               ext4_msg(sb, KERN_ERR, "invalid journal inode");
                iput(journal_inode);
                return NULL;
        }
 
        journal = jbd2_journal_init_inode(journal_inode);
        if (!journal) {
-               printk(KERN_ERR "EXT4-fs: Could not load journal inode\n");
+               ext4_msg(sb, KERN_ERR, "Could not load journal inode");
                iput(journal_inode);
                return NULL;
        }
@@ -2950,13 +3023,13 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
 
        BUG_ON(!EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL));
 
-       bdev = ext4_blkdev_get(j_dev);
+       bdev = ext4_blkdev_get(j_dev, sb);
        if (bdev == NULL)
                return NULL;
 
        if (bd_claim(bdev, sb)) {
-               printk(KERN_ERR
-                       "EXT4-fs: failed to claim external journal device.\n");
+               ext4_msg(sb, KERN_ERR,
+                       "failed to claim external journal device");
                blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
                return NULL;
        }
@@ -2964,8 +3037,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        blocksize = sb->s_blocksize;
        hblock = bdev_hardsect_size(bdev);
        if (blocksize < hblock) {
-               printk(KERN_ERR
-                       "EXT4-fs: blocksize too small for journal device.\n");
+               ext4_msg(sb, KERN_ERR,
+                       "blocksize too small for journal device");
                goto out_bdev;
        }
 
@@ -2973,8 +3046,8 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        offset = EXT4_MIN_BLOCK_SIZE % blocksize;
        set_blocksize(bdev, blocksize);
        if (!(bh = __bread(bdev, sb_block, blocksize))) {
-               printk(KERN_ERR "EXT4-fs: couldn't read superblock of "
-                      "external journal\n");
+               ext4_msg(sb, KERN_ERR, "couldn't read superblock of "
+                      "external journal");
                goto out_bdev;
        }
 
@@ -2982,14 +3055,14 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        if ((le16_to_cpu(es->s_magic) != EXT4_SUPER_MAGIC) ||
            !(le32_to_cpu(es->s_feature_incompat) &
              EXT4_FEATURE_INCOMPAT_JOURNAL_DEV)) {
-               printk(KERN_ERR "EXT4-fs: external journal has "
-                                       "bad superblock\n");
+               ext4_msg(sb, KERN_ERR, "external journal has "
+                                       "bad superblock");
                brelse(bh);
                goto out_bdev;
        }
 
        if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) {
-               printk(KERN_ERR "EXT4-fs: journal UUID does not match\n");
+               ext4_msg(sb, KERN_ERR, "journal UUID does not match");
                brelse(bh);
                goto out_bdev;
        }
@@ -3001,25 +3074,26 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb,
        journal = jbd2_journal_init_dev(bdev, sb->s_bdev,
                                        start, len, blocksize);
        if (!journal) {
-               printk(KERN_ERR "EXT4-fs: failed to create device journal\n");
+               ext4_msg(sb, KERN_ERR, "failed to create device journal");
                goto out_bdev;
        }
        journal->j_private = sb;
        ll_rw_block(READ, 1, &journal->j_sb_buffer);
        wait_on_buffer(journal->j_sb_buffer);
        if (!buffer_uptodate(journal->j_sb_buffer)) {
-               printk(KERN_ERR "EXT4-fs: I/O error on journal device\n");
+               ext4_msg(sb, KERN_ERR, "I/O error on journal device");
                goto out_journal;
        }
        if (be32_to_cpu(journal->j_superblock->s_nr_users) != 1) {
-               printk(KERN_ERR "EXT4-fs: External journal has more than one "
-                                       "user (unsupported) - %d\n",
+               ext4_msg(sb, KERN_ERR, "External journal has more than one "
+                                       "user (unsupported) - %d",
                        be32_to_cpu(journal->j_superblock->s_nr_users));
                goto out_journal;
        }
        EXT4_SB(sb)->journal_bdev = bdev;
        ext4_init_journal_params(sb, journal);
        return journal;
+
 out_journal:
        jbd2_journal_destroy(journal);
 out_bdev:
@@ -3041,8 +3115,8 @@ static int ext4_load_journal(struct super_block *sb,
 
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
-               printk(KERN_INFO "EXT4-fs: external journal device major/minor "
-                       "numbers have changed\n");
+               ext4_msg(sb, KERN_INFO, "external journal device major/minor "
+                       "numbers have changed");
                journal_dev = new_decode_dev(journal_devnum);
        } else
                journal_dev = new_decode_dev(le32_to_cpu(es->s_journal_dev));
@@ -3054,24 +3128,23 @@ static int ext4_load_journal(struct super_block *sb,
         * crash?  For recovery, we need to check in advance whether we
         * can get read-write access to the device.
         */
-
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER)) {
                if (sb->s_flags & MS_RDONLY) {
-                       printk(KERN_INFO "EXT4-fs: INFO: recovery "
-                                       "required on readonly filesystem.\n");
+                       ext4_msg(sb, KERN_INFO, "INFO: recovery "
+                                       "required on readonly filesystem");
                        if (really_read_only) {
-                               printk(KERN_ERR "EXT4-fs: write access "
-                                       "unavailable, cannot proceed.\n");
+                               ext4_msg(sb, KERN_ERR, "write access "
+                                       "unavailable, cannot proceed");
                                return -EROFS;
                        }
-                       printk(KERN_INFO "EXT4-fs: write access will "
-                              "be enabled during recovery.\n");
+                       ext4_msg(sb, KERN_INFO, "write access will "
+                              "be enabled during recovery");
                }
        }
 
        if (journal_inum && journal_dev) {
-               printk(KERN_ERR "EXT4-fs: filesystem has both journal "
-                      "and inode journals!\n");
+               ext4_msg(sb, KERN_ERR, "filesystem has both journal "
+                      "and inode journals!");
                return -EINVAL;
        }
 
@@ -3084,14 +3157,14 @@ static int ext4_load_journal(struct super_block *sb,
        }
 
        if (journal->j_flags & JBD2_BARRIER)
-               printk(KERN_INFO "EXT4-fs: barriers enabled\n");
+               ext4_msg(sb, KERN_INFO, "barriers enabled");
        else
-               printk(KERN_INFO "EXT4-fs: barriers disabled\n");
+               ext4_msg(sb, KERN_INFO, "barriers disabled");
 
        if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
                err = jbd2_journal_update_format(journal);
                if (err)  {
-                       printk(KERN_ERR "EXT4-fs: error updating journal.\n");
+                       ext4_msg(sb, KERN_ERR, "error updating journal");
                        jbd2_journal_destroy(journal);
                        return err;
                }
@@ -3103,7 +3176,7 @@ static int ext4_load_journal(struct super_block *sb,
                err = jbd2_journal_load(journal);
 
        if (err) {
-               printk(KERN_ERR "EXT4-fs: error loading journal.\n");
+               ext4_msg(sb, KERN_ERR, "error loading journal");
                jbd2_journal_destroy(journal);
                return err;
        }
@@ -3114,18 +3187,17 @@ static int ext4_load_journal(struct super_block *sb,
        if (journal_devnum &&
            journal_devnum != le32_to_cpu(es->s_journal_dev)) {
                es->s_journal_dev = cpu_to_le32(journal_devnum);
-               sb->s_dirt = 1;
 
                /* Make sure we flush the recovery flag to disk. */
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
        }
 
        return 0;
 }
 
-static int ext4_commit_super(struct super_block *sb,
-                             struct ext4_super_block *es, int sync)
+static int ext4_commit_super(struct super_block *sb, int sync)
 {
+       struct ext4_super_block *es = EXT4_SB(sb)->s_es;
        struct buffer_head *sbh = EXT4_SB(sb)->s_sbh;
        int error = 0;
 
@@ -3140,8 +3212,8 @@ static int ext4_commit_super(struct super_block *sb,
                 * be remapped.  Nothing we can do but to retry the
                 * write and hope for the best.
                 */
-               printk(KERN_ERR "EXT4-fs: previous I/O error to "
-                      "superblock detected for %s.\n", sb->s_id);
+               ext4_msg(sb, KERN_ERR, "previous I/O error to "
+                      "superblock detected");
                clear_buffer_write_io_error(sbh);
                set_buffer_uptodate(sbh);
        }
@@ -3154,7 +3226,7 @@ static int ext4_commit_super(struct super_block *sb,
                                        &EXT4_SB(sb)->s_freeblocks_counter));
        es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
                                        &EXT4_SB(sb)->s_freeinodes_counter));
-
+       sb->s_dirt = 0;
        BUFFER_TRACE(sbh, "marking dirty");
        mark_buffer_dirty(sbh);
        if (sync) {
@@ -3164,8 +3236,8 @@ static int ext4_commit_super(struct super_block *sb,
 
                error = buffer_write_io_error(sbh);
                if (error) {
-                       printk(KERN_ERR "EXT4-fs: I/O error while writing "
-                              "superblock for %s.\n", sb->s_id);
+                       ext4_msg(sb, KERN_ERR, "I/O error while writing "
+                              "superblock");
                        clear_buffer_write_io_error(sbh);
                        set_buffer_uptodate(sbh);
                }
@@ -3173,7 +3245,6 @@ static int ext4_commit_super(struct super_block *sb,
        return error;
 }
 
-
 /*
  * Have we just finished recovery?  If so, and if we are mounting (or
  * remounting) the filesystem readonly, then we will end up with a
@@ -3192,14 +3263,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb,
        if (jbd2_journal_flush(journal) < 0)
                goto out;
 
-       lock_super(sb);
        if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) &&
            sb->s_flags & MS_RDONLY) {
                EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               sb->s_dirt = 0;
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
        }
-       unlock_super(sb);
 
 out:
        jbd2_journal_unlock_updates(journal);
@@ -3238,7 +3306,7 @@ static void ext4_clear_journal_err(struct super_block *sb,
 
                EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS;
                es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
 
                jbd2_journal_clear_err(journal);
        }
@@ -3257,29 +3325,15 @@ int ext4_force_commit(struct super_block *sb)
                return 0;
 
        journal = EXT4_SB(sb)->s_journal;
-       if (journal) {
-               sb->s_dirt = 0;
+       if (journal)
                ret = ext4_journal_force_commit(journal);
-       }
 
        return ret;
 }
 
-/*
- * Ext4 always journals updates to the superblock itself, so we don't
- * have to propagate any other updates to the superblock on disk at this
- * point.  (We can probably nuke this function altogether, and remove
- * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...)
- */
 static void ext4_write_super(struct super_block *sb)
 {
-       if (EXT4_SB(sb)->s_journal) {
-               if (mutex_trylock(&sb->s_lock) != 0)
-                       BUG();
-               sb->s_dirt = 0;
-       } else {
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-       }
+       ext4_commit_super(sb, 1);
 }
 
 static int ext4_sync_fs(struct super_block *sb, int wait)
@@ -3288,16 +3342,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait)
        tid_t target;
 
        trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait);
-       sb->s_dirt = 0;
-       if (EXT4_SB(sb)->s_journal) {
-               if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal,
-                                             &target)) {
-                       if (wait)
-                               jbd2_log_wait_commit(EXT4_SB(sb)->s_journal,
-                                                    target);
-               }
-       } else {
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait);
+       if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) {
+               if (wait)
+                       jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target);
        }
        return ret;
 }
@@ -3310,34 +3357,32 @@ static int ext4_freeze(struct super_block *sb)
 {
        int error = 0;
        journal_t *journal;
-       sb->s_dirt = 0;
 
-       if (!(sb->s_flags & MS_RDONLY)) {
-               journal = EXT4_SB(sb)->s_journal;
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
 
-               if (journal) {
-                       /* Now we set up the journal barrier. */
-                       jbd2_journal_lock_updates(journal);
+       journal = EXT4_SB(sb)->s_journal;
 
-                       /*
-                        * We don't want to clear needs_recovery flag when we
-                        * failed to flush the journal.
-                        */
-                       error = jbd2_journal_flush(journal);
-                       if (error < 0)
-                               goto out;
-               }
+       /* Now we set up the journal barrier. */
+       jbd2_journal_lock_updates(journal);
 
-               /* Journal blocked and flushed, clear needs_recovery flag. */
-               EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-               if (error)
-                       goto out;
+       /*
+        * Don't clear the needs_recovery flag if we failed to flush
+        * the journal.
+        */
+       error = jbd2_journal_flush(journal);
+       if (error < 0) {
+       out:
+               jbd2_journal_unlock_updates(journal);
+               return error;
        }
+
+       /* Journal blocked and flushed, clear needs_recovery flag. */
+       EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       error = ext4_commit_super(sb, 1);
+       if (error)
+               goto out;
        return 0;
-out:
-       jbd2_journal_unlock_updates(journal);
-       return error;
 }
 
 /*
@@ -3346,14 +3391,15 @@ out:
  */
 static int ext4_unfreeze(struct super_block *sb)
 {
-       if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) {
-               lock_super(sb);
-               /* Reser the needs_recovery flag before the fs is unlocked. */
-               EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
-               ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1);
-               unlock_super(sb);
-               jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
-       }
+       if (sb->s_flags & MS_RDONLY)
+               return 0;
+
+       lock_super(sb);
+       /* Reset the needs_recovery flag before the fs is unlocked. */
+       EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
+       ext4_commit_super(sb, 1);
+       unlock_super(sb);
+       jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal);
        return 0;
 }
 
@@ -3432,22 +3478,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                            (sbi->s_mount_state & EXT4_VALID_FS))
                                es->s_state = cpu_to_le16(sbi->s_mount_state);
 
-                       /*
-                        * We have to unlock super so that we can wait for
-                        * transactions.
-                        */
-                       if (sbi->s_journal) {
-                               unlock_super(sb);
+                       if (sbi->s_journal)
                                ext4_mark_recovery_complete(sb, es);
-                               lock_super(sb);
-                       }
                } else {
                        int ret;
                        if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb,
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP))) {
-                               printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+                               ext4_msg(sb, KERN_WARNING, "couldn't "
                                       "remount RDWR because of unsupported "
-                                      "optional features (%x).\n", sb->s_id,
+                                      "optional features (%x)",
                                (le32_to_cpu(sbi->s_es->s_feature_ro_compat) &
                                        ~EXT4_FEATURE_RO_COMPAT_SUPP));
                                err = -EROFS;
@@ -3456,17 +3495,15 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
 
                        /*
                         * Make sure the group descriptor checksums
-                        * are sane.  If they aren't, refuse to
-                        * remount r/w.
+                        * are sane.  If they aren't, refuse to remount r/w.
                         */
                        for (g = 0; g < sbi->s_groups_count; g++) {
                                struct ext4_group_desc *gdp =
                                        ext4_get_group_desc(sb, g, NULL);
 
                                if (!ext4_group_desc_csum_verify(sbi, g, gdp)) {
-                                       printk(KERN_ERR
-              "EXT4-fs: ext4_remount: "
-               "Checksum for group %u failed (%u!=%u)\n",
+                                       ext4_msg(sb, KERN_ERR,
+              "ext4_remount: Checksum for group %u failed (%u!=%u)",
                g, le16_to_cpu(ext4_group_desc_csum(sbi, g, gdp)),
                                               le16_to_cpu(gdp->bg_checksum));
                                        err = -EINVAL;
@@ -3480,11 +3517,10 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                         * require a full umount/remount for now.
                         */
                        if (es->s_last_orphan) {
-                               printk(KERN_WARNING "EXT4-fs: %s: couldn't "
+                               ext4_msg(sb, KERN_WARNING, "Couldn't "
                                       "remount RDWR because of unprocessed "
                                       "orphan inode list.  Please "
-                                      "umount/remount instead.\n",
-                                      sb->s_id);
+                                      "umount/remount instead");
                                err = -EINVAL;
                                goto restore_opts;
                        }
@@ -3504,8 +3540,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                                sb->s_flags &= ~MS_RDONLY;
                }
        }
+       ext4_setup_system_zone(sb);
        if (sbi->s_journal == NULL)
-               ext4_commit_super(sb, es, 1);
+               ext4_commit_super(sb, 1);
 
 #ifdef CONFIG_QUOTA
        /* Release old quota file names */
@@ -3515,6 +3552,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
                        kfree(old_opts.s_qf_names[i]);
 #endif
        return 0;
+
 restore_opts:
        sb->s_flags = old_sb_flags;
        sbi->s_mount_opt = old_opts.s_mount_opt;
@@ -3545,9 +3583,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
        if (test_opt(sb, MINIX_DF)) {
                sbi->s_overhead_last = 0;
        } else if (sbi->s_blocks_last != ext4_blocks_count(es)) {
-               ext4_group_t ngroups = sbi->s_groups_count, i;
+               ext4_group_t i, ngroups = ext4_get_groups_count(sb);
                ext4_fsblk_t overhead = 0;
-               smp_rmb();
 
                /*
                 * Compute the overhead (FS structures).  This is constant
@@ -3599,11 +3636,12 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf)
               le64_to_cpup((void *)es->s_uuid + sizeof(u64));
        buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL;
        buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL;
+
        return 0;
 }
 
-/* Helper function for writing quotas on sync - we need to start transaction before quota file
- * is locked for write. Otherwise the are possible deadlocks:
+/* Helper function for writing quotas on sync - we need to start transaction
+ * before quota file is locked for write. Otherwise deadlocks are possible:
  * Process 1                         Process 2
  * ext4_create()                     quota_sync()
  *   jbd2_journal_start()                  write_dquot()
@@ -3627,7 +3665,7 @@ static int ext4_write_dquot(struct dquot *dquot)
 
        inode = dquot_to_inode(dquot);
        handle = ext4_journal_start(inode,
-                                       EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+                                   EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ret = dquot_commit(dquot);
@@ -3643,7 +3681,7 @@ static int ext4_acquire_dquot(struct dquot *dquot)
        handle_t *handle;
 
        handle = ext4_journal_start(dquot_to_inode(dquot),
-                                       EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
+                                   EXT4_QUOTA_INIT_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle))
                return PTR_ERR(handle);
        ret = dquot_acquire(dquot);
@@ -3659,7 +3697,7 @@ static int ext4_release_dquot(struct dquot *dquot)
        handle_t *handle;
 
        handle = ext4_journal_start(dquot_to_inode(dquot),
-                                       EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
+                                   EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb));
        if (IS_ERR(handle)) {
                /* Release dquot anyway to avoid endless cycle in dqput() */
                dquot_release(dquot);
@@ -3707,7 +3745,7 @@ static int ext4_write_info(struct super_block *sb, int type)
 static int ext4_quota_on_mount(struct super_block *sb, int type)
 {
        return vfs_quota_on_mount(sb, EXT4_SB(sb)->s_qf_names[type],
-                       EXT4_SB(sb)->s_jquota_fmt, type);
+                                 EXT4_SB(sb)->s_jquota_fmt, type);
 }
 
 /*
@@ -3738,9 +3776,9 @@ static int ext4_quota_on(struct super_block *sb, int type, int format_id,
        if (EXT4_SB(sb)->s_qf_names[type]) {
                /* Quotafile not in fs root? */
                if (path.dentry->d_parent != sb->s_root)
-                       printk(KERN_WARNING
-                               "EXT4-fs: Quota file not on filesystem root. "
-                               "Journaled quota will not work.\n");
+                       ext4_msg(sb, KERN_WARNING,
+                               "Quota file not on filesystem root. "
+                               "Journaled quota will not work");
        }
 
        /*
@@ -3823,8 +3861,8 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type,
        handle_t *handle = journal_current_handle();
 
        if (EXT4_SB(sb)->s_journal && !handle) {
-               printk(KERN_WARNING "EXT4-fs: Quota write (off=%llu, len=%llu)"
-                       " cancelled because transaction is not started.\n",
+               ext4_msg(sb, KERN_WARNING, "Quota write (off=%llu, len=%llu)"
+                       " cancelled because transaction is not started",
                        (unsigned long long)off, (unsigned long long)len);
                return -EIO;
        }
@@ -3878,10 +3916,10 @@ out:
 
 #endif
 
-static int ext4_get_sb(struct file_system_type *fs_type,
-       int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4_get_sb(struct file_system_type *fs_type, int flags,
+                      const char *dev_name, void *data, struct vfsmount *mnt)
 {
-       return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+       return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4_fs_type = {
@@ -3893,14 +3931,14 @@ static struct file_system_type ext4_fs_type = {
 };
 
 #ifdef CONFIG_EXT4DEV_COMPAT
-static int ext4dev_get_sb(struct file_system_type *fs_type,
-       int flags, const char *dev_name, void *data, struct vfsmount *mnt)
+static int ext4dev_get_sb(struct file_system_type *fs_type, int flags,
+                         const char *dev_name, void *data,struct vfsmount *mnt)
 {
-       printk(KERN_WARNING "EXT4-fs: Update your userspace programs "
-              "to mount using ext4\n");
-       printk(KERN_WARNING "EXT4-fs: ext4dev backwards compatibility "
-              "will go away by 2.6.31\n");
-       return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
+       printk(KERN_WARNING "EXT4-fs (%s): Update your userspace programs "
+              "to mount using ext4\n", dev_name);
+       printk(KERN_WARNING "EXT4-fs (%s): ext4dev backwards compatibility "
+              "will go away by 2.6.31\n", dev_name);
+       return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super,mnt);
 }
 
 static struct file_system_type ext4dev_fs_type = {
@@ -3917,13 +3955,16 @@ static int __init init_ext4_fs(void)
 {
        int err;
 
+       err = init_ext4_system_zone();
+       if (err)
+               return err;
        ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
        if (!ext4_kset)
-               return -ENOMEM;
+               goto out4;
        ext4_proc_root = proc_mkdir("fs/ext4", NULL);
        err = init_ext4_mballoc();
        if (err)
-               return err;
+               goto out3;
 
        err = init_ext4_xattr();
        if (err)
@@ -3948,6 +3989,11 @@ out1:
        exit_ext4_xattr();
 out2:
        exit_ext4_mballoc();
+out3:
+       remove_proc_entry("fs/ext4", NULL);
+       kset_unregister(ext4_kset);
+out4:
+       exit_ext4_system_zone();
        return err;
 }
 
@@ -3962,6 +4008,7 @@ static void __exit exit_ext4_fs(void)
        exit_ext4_mballoc();
        remove_proc_entry("fs/ext4", NULL);
        kset_unregister(ext4_kset);
+       exit_ext4_system_zone();
 }
 
 MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
index c1462d43e7217de23f046852d9fd466af0617e65..941c8425c10b34493e9ad6fefef7fe54f451c84a 100644 (file)
@@ -30,6 +30,7 @@
 #include <linux/dnotify.h>
 #include <linux/statfs.h>
 #include <linux/security.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -986,6 +987,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
                        &hugetlbfs_file_operations);
        if (!file)
                goto out_dentry; /* inode is already attached */
+       ima_counts_get(file);
 
        return file;
 
index 82d9c42b8bac951f54fe0a09298eb90a0627f667..286f38dfc6c0748d90d471c0878e4d08e8f7ece5 100644 (file)
@@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd,
        switch (cmd) {
        case FIBMAP:
                return ioctl_fibmap(filp, p);
-       case FS_IOC_FIEMAP:
-               return ioctl_fiemap(filp, arg);
-       case FIGETBSZ:
-               return put_user(inode->i_sb->s_blocksize, p);
        case FIONREAD:
                return put_user(i_size_read(inode) - filp->f_pos, p);
        }
@@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
                error = ioctl_fsthaw(filp);
                break;
 
+       case FS_IOC_FIEMAP:
+               return ioctl_fiemap(filp, arg);
+
+       case FIGETBSZ:
+       {
+               struct inode *inode = filp->f_path.dentry->d_inode;
+               int __user *p = (int __user *)arg;
+               return put_user(inode->i_sb->s_blocksize, p);
+       }
+
        default:
                if (S_ISREG(filp->f_path.dentry->d_inode->i_mode))
                        error = file_ioctl(filp, cmd, arg);
index 58144102bf253b2244fee8fe44ce1a7c02cfbc79..62be7d294ec26eb71130692cc9187314141fd34d 100644 (file)
@@ -1781,7 +1781,7 @@ int jbd2_journal_wipe(journal_t *journal, int write)
  * Journal abort has very specific semantics, which we describe
  * for journal abort.
  *
- * Two internal function, which provide abort to te jbd layer
+ * Two internal functions, which provide abort to the jbd layer
  * itself are here.
  */
 
@@ -1879,7 +1879,7 @@ void jbd2_journal_abort(journal_t *journal, int errno)
  * int jbd2_journal_errno () - returns the journal's error state.
  * @journal: journal to examine.
  *
- * This is the errno numbet set with jbd2_journal_abort(), the last
+ * This is the errno number set with jbd2_journal_abort(), the last
  * time the journal was mounted - if the journal was stopped
  * without calling abort this will be 0.
  *
@@ -1903,7 +1903,7 @@ int jbd2_journal_errno(journal_t *journal)
  * int jbd2_journal_clear_err () - clears the journal's error state
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 int jbd2_journal_clear_err(journal_t *journal)
@@ -1923,7 +1923,7 @@ int jbd2_journal_clear_err(journal_t *journal)
  * void jbd2_journal_ack_err() - Ack journal err.
  * @journal: journal to act on.
  *
- * An error must be cleared or Acked to take a FS out of readonly
+ * An error must be cleared or acked to take a FS out of readonly
  * mode.
  */
 void jbd2_journal_ack_err(journal_t *journal)
index 680ba60863ffb2dee0f81ffbda6974c419c79668..42381bd6543b1abd5446961424234f6d83234f13 100644 (file)
@@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages,
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
 
-       clear_buffer_mapped(&map_bh);
+       map_bh.b_state = 0;
+       map_bh.b_size = 0;
        for (page_idx = 0; page_idx < nr_pages; page_idx++) {
                struct page *page = list_entry(pages->prev, struct page, lru);
 
@@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block)
        struct buffer_head map_bh;
        unsigned long first_logical_block = 0;
 
-       clear_buffer_mapped(&map_bh);
+       map_bh.b_state = 0;
+       map_bh.b_size = 0;
        bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio,
                        &map_bh, &first_logical_block, get_block);
        if (bio)
index 967c3db9272453e5cdd597d8f9fb50b4ceae0367..c82805d088e1d5c0bfcd0229fc0cd7bd0c3a69db 100644 (file)
@@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd)
                        err = inode_permission(nd->path.dentry->d_inode,
                                               MAY_EXEC);
                if (!err)
-                       err = ima_path_check(&nd->path, MAY_EXEC);
+                       err = ima_path_check(&nd->path, MAY_EXEC,
+                                            IMA_COUNT_UPDATE);
                if (err)
                        break;
 
@@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag)
                return error;
 
        error = ima_path_check(path,
-                              acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC));
+                              acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC),
+                              IMA_COUNT_UPDATE);
        if (error)
                return error;
        /*
index b660435978d288de89bf42070b3a6e83399b2356..bd584bcf1d9f9690b1304794472c3dfdf4a88f79 100644 (file)
@@ -55,6 +55,7 @@
 #include <linux/security.h>
 #endif /* CONFIG_NFSD_V4 */
 #include <linux/jhash.h>
+#include <linux/ima.h>
 
 #include <asm/uaccess.h>
 
@@ -735,6 +736,8 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
                            flags, cred);
        if (IS_ERR(*filp))
                host_err = PTR_ERR(*filp);
+       else
+               ima_counts_get(*filp);
 out_nfserr:
        err = nfserrno(host_err);
 out:
@@ -2024,6 +2027,7 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
                                        struct dentry *dentry, int acc)
 {
        struct inode    *inode = dentry->d_inode;
+       struct path     path;
        int             err;
 
        if (acc == NFSD_MAY_NOP)
@@ -2096,7 +2100,17 @@ nfsd_permission(struct svc_rqst *rqstp, struct svc_export *exp,
        if (err == -EACCES && S_ISREG(inode->i_mode) &&
            acc == (NFSD_MAY_READ | NFSD_MAY_OWNER_OVERRIDE))
                err = inode_permission(inode, MAY_EXEC);
+       if (err)
+               goto nfsd_out;
 
+       /* Do integrity (permission) checking now, but defer incrementing
+        * IMA counts to the actual file open.
+        */
+       path.mnt = exp->ex_path.mnt;
+       path.dentry = dentry;
+       err = ima_path_check(&path, acc & (MAY_READ | MAY_WRITE | MAY_EXEC),
+                            IMA_COUNT_LEAVE);
+nfsd_out:
        return err? nfserrno(err) : 0;
 }
 
index 3326bbf9ab95222dbc2466a846650c7935e58cf5..1539e630c47d524b1df251236e638dbb1e8d279b 100644 (file)
@@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf,
        if (copy_from_user(page, buf, count))
                goto out_free;
 
+       /* Guard against adverse ptrace interaction */
+       length = mutex_lock_interruptible(&task->cred_guard_mutex);
+       if (length < 0)
+               goto out_free;
+
        length = security_setprocattr(task,
                                      (char*)file->f_path.dentry->d_name.name,
                                      (void*)page, count);
+       mutex_unlock(&task->cred_guard_mutex);
 out_free:
        free_page((unsigned long) page);
 out:
index 9bca39cf99eeb56d6c32c4f0f9f4c49073683f19..1afa4dd4cae24167a0c4e3f47137a13cf8a6d199 100644 (file)
 
 static int loadavg_proc_show(struct seq_file *m, void *v)
 {
-       int a, b, c;
-       unsigned long seq;
+       unsigned long avnrun[3];
 
-       do {
-               seq = read_seqbegin(&xtime_lock);
-               a = avenrun[0] + (FIXED_1/200);
-               b = avenrun[1] + (FIXED_1/200);
-               c = avenrun[2] + (FIXED_1/200);
-       } while (read_seqretry(&xtime_lock, seq));
+       get_avenrun(avnrun, FIXED_1/200, 0);
 
-       seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n",
-               LOAD_INT(a), LOAD_FRAC(a),
-               LOAD_INT(b), LOAD_FRAC(b),
-               LOAD_INT(c), LOAD_FRAC(c),
+       seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n",
+               LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]),
+               LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]),
+               LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]),
                nr_running(), nr_threads,
                task_active_pid_ns(current)->last_pid);
        return 0;
index d8c3e3cbf41603eb983e5922da3ee796c7fece84..fe36accd43283b77f372adcc2c9f815a7910a233 100644 (file)
@@ -8,3 +8,4 @@ header-y += mtd/
 header-y += rdma/
 header-y += video/
 header-y += drm/
+header-y += xen/
index 8e6d0ca70aba987b4663db6e512d79236400f183..e410f602cab1b31fdd71681708d17049181026d5 100644 (file)
@@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm,
 #endif
 
 /*
- * A facility to provide batching of the reload of page tables with the
- * actual context switch code for paravirtualized guests.  By convention,
- * only one of the lazy modes (CPU, MMU) should be active at any given
- * time, entry should never be nested, and entry and exits should always
- * be paired.  This is for sanity of maintaining and reasoning about the
- * kernel code.
+ * A facility to provide batching of the reload of page tables and
+ * other process state with the actual context switch code for
+ * paravirtualized guests.  By convention, only one of the batched
+ * update (lazy) modes (CPU, MMU) should be active at any given time,
+ * entry should never be nested, and entry and exits should always be
+ * paired.  This is for sanity of maintaining and reasoning about the
+ * kernel code.  In this case, the exit (end of the context switch) is
+ * in architecture-specific code, and so doesn't need a generic
+ * definition.
  */
-#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE
-#define arch_enter_lazy_cpu_mode()     do {} while (0)
-#define arch_leave_lazy_cpu_mode()     do {} while (0)
-#define arch_flush_lazy_cpu_mode()     do {} while (0)
+#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH
+#define arch_start_context_switch(prev)        do {} while (0)
 #endif
 
 #ifndef __HAVE_PFNMAP_TRACKING
index 89853bcd27a658ac0aef934c61ba723057933457..f1736ca7922cb0022244087119f73a70e85f8b23 100644 (file)
@@ -63,7 +63,7 @@
 #define BRANCH_PROFILE()
 #endif
 
-#ifdef CONFIG_EVENT_TRACER
+#ifdef CONFIG_EVENT_TRACING
 #define FTRACE_EVENTS()        VMLINUX_SYMBOL(__start_ftrace_events) = .;      \
                        *(_ftrace_events)                               \
                        VMLINUX_SYMBOL(__stop_ftrace_events) = .;
index 88be890ee3c7e402cfd398e5113d2f78bb29338b..51b4b0a5ce8cf00b00fd849ee26e0e7bc504a795 100644 (file)
@@ -119,7 +119,7 @@ extern int pci_mmcfg_config_num;
 extern int sbf_port;
 extern unsigned long acpi_realmode_flags;
 
-int acpi_register_gsi (u32 gsi, int triggering, int polarity);
+int acpi_register_gsi (struct device *dev, u32 gsi, int triggering, int polarity);
 int acpi_gsi_to_irq (u32 gsi, unsigned int *irq);
 
 #ifdef CONFIG_X86_IO_APIC
index d960889e92efa17a73ee8038603a622be8dcf8a0..7e4350ece0f8dd495398ca4cbf41c95e995b6525 100644 (file)
@@ -116,9 +116,9 @@ struct blk_io_trace {
  * The remap event
  */
 struct blk_io_trace_remap {
-       __be32 device;
        __be32 device_from;
-       __be64 sector;
+       __be32 device_to;
+       __be64 sector_from;
 };
 
 enum {
@@ -165,8 +165,9 @@ struct blk_trace {
 
 extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *);
 extern void blk_trace_shutdown(struct request_queue *);
-extern int do_blk_trace_setup(struct request_queue *q,
-       char *name, dev_t dev, struct blk_user_trace_setup *buts);
+extern int do_blk_trace_setup(struct request_queue *q, char *name,
+                             dev_t dev, struct block_device *bdev,
+                             struct blk_user_trace_setup *buts);
 extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 
 /**
@@ -193,22 +194,42 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...);
 extern void blk_add_driver_data(struct request_queue *q, struct request *rq,
                                void *data, size_t len);
 extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+                          struct block_device *bdev,
                           char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
+extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
 
 #else /* !CONFIG_BLK_DEV_IO_TRACE */
-#define blk_trace_ioctl(bdev, cmd, arg)                (-ENOTTY)
-#define blk_trace_shutdown(q)                  do { } while (0)
-#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY)
-#define blk_add_driver_data(q, rq, data, len)  do {} while (0)
-#define blk_trace_setup(q, name, dev, arg)     (-ENOTTY)
-#define blk_trace_startstop(q, start)          (-ENOTTY)
-#define blk_trace_remove(q)                    (-ENOTTY)
-#define blk_add_trace_msg(q, fmt, ...)         do { } while (0)
+# define blk_trace_ioctl(bdev, cmd, arg)               (-ENOTTY)
+# define blk_trace_shutdown(q)                         do { } while (0)
+# define do_blk_trace_setup(q, name, dev, bdev, buts)  (-ENOTTY)
+# define blk_add_driver_data(q, rq, data, len)         do {} while (0)
+# define blk_trace_setup(q, name, dev, bdev, arg)      (-ENOTTY)
+# define blk_trace_startstop(q, start)                 (-ENOTTY)
+# define blk_trace_remove(q)                           (-ENOTTY)
+# define blk_add_trace_msg(q, fmt, ...)                        do { } while (0)
+static inline int blk_trace_init_sysfs(struct device *dev)
+{
+       return 0;
+}
 
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#if defined(CONFIG_EVENT_TRACING) && defined(CONFIG_BLOCK)
+
+static inline int blk_cmd_buf_len(struct request *rq)
+{
+       return blk_pc_request(rq) ? rq->cmd_len * 3 : 1;
+}
+
+extern void blk_dump_cmd(char *buf, struct request *rq);
+extern void blk_fill_rwbs(char *rwbs, u32 rw, int bytes);
+extern void blk_fill_rwbs_rq(char *rwbs, struct request *rq);
+
+#endif /* CONFIG_EVENT_TRACING && CONFIG_BLOCK */
+
 #endif /* __KERNEL__ */
 #endif
index f2ded21f9a3c37cdef070315659c57d5a066f4f9..af931ee43dd8e43454b9cd9d389ec0bd5b1a7b21 100644 (file)
@@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from);
 int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from);
 int get_compat_sigevent(struct sigevent *event,
                const struct compat_sigevent __user *u_event);
+long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+                                 struct compat_siginfo __user *uinfo);
 
 static inline int compat_timeval_compare(struct compat_timeval *lhs,
                                        struct compat_timeval *rhs)
index 788850ba4e7577def670ea0fba81b14ba0a932a0..1fbdea4f08ebbb059d060cb01e431b7d65f23efc 100644 (file)
@@ -142,19 +142,6 @@ struct CYZ_BOOT_CTRL {
 
 
 #ifndef DP_WINDOW_SIZE
-/* #include "cyclomz.h" */
-/****************** ****************** *******************/
-/*
- *     The data types defined below are used in all ZFIRM interface
- *     data structures. They accomodate differences between HW
- *     architectures and compilers.
- */
-
-typedef __u64  ucdouble;               /* 64 bits, unsigned */
-typedef __u32  uclong;                 /* 32 bits, unsigned */
-typedef __u16  ucshort;                /* 16 bits, unsigned */
-typedef __u8   ucchar;                 /* 8 bits, unsigned */
-
 /*
  *     Memory Window Sizes
  */
@@ -507,16 +494,20 @@ struct ZFW_CTRL {
 
 /* Per card data structure */
 struct cyclades_card {
-    void __iomem *base_addr;
-    void __iomem *ctl_addr;
-    int irq;
-    unsigned int num_chips;    /* 0 if card absent, -1 if Z/PCI, else Y */
-    unsigned int first_line;   /* minor number of first channel on card */
-    unsigned int nports;       /* Number of ports in the card */
-    int bus_index;             /* address shift - 0 for ISA, 1 for PCI */
-    int intr_enabled;          /* FW Interrupt flag - 0 disabled, 1 enabled */
-    spinlock_t card_lock;
-    struct cyclades_port *ports;
+       void __iomem *base_addr;
+       union {
+               void __iomem *p9050;
+               struct RUNTIME_9060 __iomem *p9060;
+       } ctl_addr;
+       int irq;
+       unsigned int num_chips; /* 0 if card absent, -1 if Z/PCI, else Y */
+       unsigned int first_line;        /* minor number of first channel on card */
+       unsigned int nports;    /* Number of ports in the card */
+       int bus_index;          /* address shift - 0 for ISA, 1 for PCI */
+       int intr_enabled;               /* FW Interrupt flag - 0 disabled, 1 enabled */
+       u32 hw_ver;
+       spinlock_t card_lock;
+       struct cyclades_port *ports;
 };
 
 /***************************************
index 28d53cb7b5a22c099feb9a2aa19b2e3010e8d124..171ad8aedc835258e152b94b2bc92242045d9c3e 100644 (file)
@@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus);
 
 extern void dma_debug_init(u32 num_entries);
 
+extern int dma_debug_resize_entries(u32 num_entries);
+
 extern void debug_dma_map_page(struct device *dev, struct page *page,
                               size_t offset, size_t size,
                               int direction, dma_addr_t dma_addr,
@@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries)
 {
 }
 
+static inline int dma_debug_resize_entries(u32 num_entries)
+{
+       return 0;
+}
+
 static inline void debug_dma_map_page(struct device *dev, struct page *page,
                                      size_t offset, size_t size,
                                      int direction, dma_addr_t dma_addr,
index e397dc342cdaf1eaf99b022b4523f858d9f3e886..10ff5c498824b5b8dc5e963b07563878a13c2667 100644 (file)
@@ -108,6 +108,7 @@ struct irte {
 };
 #ifdef CONFIG_INTR_REMAP
 extern int intr_remapping_enabled;
+extern int intr_remapping_supported(void);
 extern int enable_intr_remapping(int);
 extern void disable_intr_remapping(void);
 extern int reenable_intr_remapping(int);
@@ -157,6 +158,8 @@ static inline struct intel_iommu *map_ioapic_to_ir(int apic)
 }
 #define irq_remapped(irq)              (0)
 #define enable_intr_remapping(mode)    (-1)
+#define disable_intr_remapping()       (0)
+#define reenable_intr_remapping(mode)  (0)
 #define intr_remapping_enabled         (0)
 #endif
 
index 8a0c2f221e6b95b448991b1e291610c9e52ea91e..39b95c56587e8f1f8852aed9318148fcc16aac56 100644 (file)
@@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
 extern int skip_trace(unsigned long ip);
 
-extern void ftrace_release(void *start, unsigned long size);
-
 extern void ftrace_disable_daemon(void);
 extern void ftrace_enable_daemon(void);
 #else
@@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled)
 
 #ifdef CONFIG_FTRACE_MCOUNT_RECORD
 extern void ftrace_init(void);
-extern void ftrace_init_module(struct module *mod,
-                              unsigned long *start, unsigned long *end);
 #else
 static inline void ftrace_init(void) { }
-static inline void
-ftrace_init_module(struct module *mod,
-                  unsigned long *start, unsigned long *end) { }
 #endif
 
 /*
@@ -368,6 +361,7 @@ struct ftrace_ret_stack {
        unsigned long ret;
        unsigned long func;
        unsigned long long calltime;
+       unsigned long long subtime;
 };
 
 /*
@@ -379,8 +373,6 @@ extern void return_to_handler(void);
 
 extern int
 ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
-extern void
-ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
 
 /*
  * Sometimes we don't want to trace a function with the function
@@ -496,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
 
 extern int ftrace_dump_on_oops;
 
+#ifdef CONFIG_PREEMPT
+#define INIT_TRACE_RECURSION           .trace_recursion = 0,
+#endif
+
 #endif /* CONFIG_TRACING */
 
+#ifndef INIT_TRACE_RECURSION
+#define INIT_TRACE_RECURSION
+#endif
 
 #ifdef CONFIG_HW_BRANCH_TRACER
 
diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h
new file mode 100644 (file)
index 0000000..5c093ff
--- /dev/null
@@ -0,0 +1,172 @@
+#ifndef _LINUX_FTRACE_EVENT_H
+#define _LINUX_FTRACE_EVENT_H
+
+#include <linux/trace_seq.h>
+#include <linux/ring_buffer.h>
+#include <linux/percpu.h>
+
+struct trace_array;
+struct tracer;
+struct dentry;
+
+DECLARE_PER_CPU(struct trace_seq, ftrace_event_seq);
+
+struct trace_print_flags {
+       unsigned long           mask;
+       const char              *name;
+};
+
+const char *ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+                                  unsigned long flags,
+                                  const struct trace_print_flags *flag_array);
+
+const char *ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+                                    const struct trace_print_flags *symbol_array);
+
+/*
+ * The trace entry - the most basic unit of tracing. This is what
+ * is printed in the end as a single line in the trace output, such as:
+ *
+ *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
+ */
+struct trace_entry {
+       unsigned short          type;
+       unsigned char           flags;
+       unsigned char           preempt_count;
+       int                     pid;
+       int                     tgid;
+};
+
+#define FTRACE_MAX_EVENT                                               \
+       ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1)
+
+/*
+ * Trace iterator - used by the printout routines that present trace
+ * results to users; these routines might sleep, etc:
+ */
+struct trace_iterator {
+       struct trace_array      *tr;
+       struct tracer           *trace;
+       void                    *private;
+       int                     cpu_file;
+       struct mutex            mutex;
+       struct ring_buffer_iter *buffer_iter[NR_CPUS];
+       unsigned long           iter_flags;
+
+       /* The below is zeroed out in pipe_read */
+       struct trace_seq        seq;
+       struct trace_entry      *ent;
+       int                     cpu;
+       u64                     ts;
+
+       loff_t                  pos;
+       long                    idx;
+
+       cpumask_var_t           started;
+};
+
+
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+                                             int flags);
+struct trace_event {
+       struct hlist_node       node;
+       struct list_head        list;
+       int                     type;
+       trace_print_func        trace;
+       trace_print_func        raw;
+       trace_print_func        hex;
+       trace_print_func        binary;
+};
+
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
+
+/* Return values for print_line callback */
+enum print_line_t {
+       TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
+       TRACE_TYPE_HANDLED      = 1,
+       TRACE_TYPE_UNHANDLED    = 2,    /* Relay to other output functions */
+       TRACE_TYPE_NO_CONSUME   = 3     /* Handled but ask to not consume */
+};
+
+
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(int type, unsigned long len,
+                                 unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+                                       unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+                                       unsigned long flags, int pc);
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event);
+
+void tracing_record_cmdline(struct task_struct *tsk);
+
+struct ftrace_event_call {
+       struct list_head        list;
+       char                    *name;
+       char                    *system;
+       struct dentry           *dir;
+       struct trace_event      *event;
+       int                     enabled;
+       int                     (*regfunc)(void);
+       void                    (*unregfunc)(void);
+       int                     id;
+       int                     (*raw_init)(void);
+       int                     (*show_format)(struct trace_seq *s);
+       int                     (*define_fields)(void);
+       struct list_head        fields;
+       int                     filter_active;
+       void                    *filter;
+       void                    *mod;
+
+#ifdef CONFIG_EVENT_PROFILE
+       atomic_t        profile_count;
+       int             (*profile_enable)(struct ftrace_event_call *);
+       void            (*profile_disable)(struct ftrace_event_call *);
+#endif
+};
+
+#define MAX_FILTER_PRED                32
+#define MAX_FILTER_STR_VAL     128
+
+extern int init_preds(struct ftrace_event_call *call);
+extern void destroy_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern int filter_current_check_discard(struct ftrace_event_call *call,
+                                       void *rec,
+                                       struct ring_buffer_event *event);
+
+extern int trace_define_field(struct ftrace_event_call *call, char *type,
+                             char *name, int offset, int size, int is_signed);
+
+#define is_signed_type(type)   (((type)(-1)) < 0)
+
+int trace_set_clr_event(const char *system, const char *event, int set);
+
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to initialize the static variable with fmt when fmt is not
+ * a constant, even when the outer if statement is optimized out.
+ */
+#define event_trace_printk(ip, fmt, args...)                           \
+do {                                                                   \
+       __trace_printk_check_format(fmt, ##args);                       \
+       tracing_record_cmdline(current);                                \
+       if (__builtin_constant_p(fmt)) {                                \
+               static const char *trace_printk_fmt                     \
+                 __attribute__((section("__trace_printk_fmt"))) =      \
+                       __builtin_constant_p(fmt) ? fmt : NULL;         \
+                                                                       \
+               __trace_bprintk(ip, trace_printk_fmt, ##args);          \
+       } else                                                          \
+               __trace_printk(ip, fmt, ##args);                        \
+} while (0)
+
+#define __common_field(type, item, is_signed)                          \
+       ret = trace_define_field(event_call, #type, "common_" #item,    \
+                                offsetof(typeof(field.ent), item),     \
+                                sizeof(field.ent.item), is_signed);    \
+       if (ret)                                                        \
+               return ret;
+
+#endif /* _LINUX_FTRACE_EVENT_H */
index 3bf5bb5a34f9fba43b9248caf434eb6059e6329e..34956c8fdebf8df63ab44c1f84b136406b3d0f34 100644 (file)
@@ -23,6 +23,8 @@ union ktime;
 #define FUTEX_TRYLOCK_PI       8
 #define FUTEX_WAIT_BITSET      9
 #define FUTEX_WAKE_BITSET      10
+#define FUTEX_WAIT_REQUEUE_PI  11
+#define FUTEX_CMP_REQUEUE_PI   12
 
 #define FUTEX_PRIVATE_FLAG     128
 #define FUTEX_CLOCK_REALTIME   256
@@ -38,6 +40,10 @@ union ktime;
 #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAIT_BITSET_PRIVATE      (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG)
 #define FUTEX_WAKE_BITSET_PRIVATE      (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG)
+#define FUTEX_WAIT_REQUEUE_PI_PRIVATE  (FUTEX_WAIT_REQUEUE_PI | \
+                                        FUTEX_PRIVATE_FLAG)
+#define FUTEX_CMP_REQUEUE_PI_PRIVATE   (FUTEX_CMP_REQUEUE_PI | \
+                                        FUTEX_PRIVATE_FLAG)
 
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
index 9fed365a598b89845d83271f93d9fe3affef15f2..867cb68d84619a8631670f839bf908a2db99b56b 100644 (file)
@@ -26,6 +26,9 @@
 #include <asm/io.h>
 #include <asm/mutex.h>
 
+/* for request_sense */
+#include <linux/cdrom.h>
+
 #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300)
 # define SUPPORT_VLB_SYNC 0
 #else
@@ -324,7 +327,6 @@ struct ide_cmd {
        unsigned int            cursg_ofs;
 
        struct request          *rq;            /* copy of request */
-       void                    *special;       /* valid_t generally */
 };
 
 /* ATAPI packet command flags */
@@ -360,11 +362,7 @@ struct ide_atapi_pc {
 
        /* data buffer */
        u8 *buf;
-       /* current buffer position */
-       u8 *cur_pos;
        int buf_size;
-       /* missing/available data on the current buffer */
-       int b_count;
 
        /* the corresponding request */
        struct request *rq;
@@ -377,10 +375,6 @@ struct ide_atapi_pc {
         */
        u8 pc_buf[IDE_PC_BUFFER_SIZE];
 
-       /* idetape only */
-       struct idetape_bh *bh;
-       char *b_data;
-
        unsigned long timeout;
 };
 
@@ -593,16 +587,16 @@ struct ide_drive_s {
        /* callback for packet commands */
        int  (*pc_callback)(struct ide_drive_s *, int);
 
-       void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *);
-       int  (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *,
-                             unsigned int, int);
-
        ide_startstop_t (*irq_handler)(struct ide_drive_s *);
 
        unsigned long atapi_flags;
 
        struct ide_atapi_pc request_sense_pc;
-       struct request request_sense_rq;
+
+       /* current sense rq and buffer */
+       bool sense_rq_armed;
+       struct request sense_rq;
+       struct request_sense sense_data;
 };
 
 typedef struct ide_drive_s ide_drive_t;
@@ -1174,7 +1168,10 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *);
 int ide_do_start_stop(ide_drive_t *, struct gendisk *, int);
 int ide_set_media_lock(ide_drive_t *, struct gendisk *, int);
 void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *);
-void ide_retry_pc(ide_drive_t *, struct gendisk *);
+void ide_retry_pc(ide_drive_t *drive);
+
+void ide_prep_sense(ide_drive_t *drive, struct request *rq);
+int ide_queue_sense_rq(ide_drive_t *drive, void *special);
 
 int ide_cd_expiry(ide_drive_t *);
 
index 0e2aa45cb0cefdfa3fc3727fe0a62926f1755172..b1b827d091a995e3d5b2e4a0d1e7f18e975e9319 100644 (file)
 #include <linux/fs.h>
 struct linux_binprm;
 
+#define IMA_COUNT_UPDATE 1
+#define IMA_COUNT_LEAVE 0
+
 #ifdef CONFIG_IMA
 extern int ima_bprm_check(struct linux_binprm *bprm);
 extern int ima_inode_alloc(struct inode *inode);
 extern void ima_inode_free(struct inode *inode);
-extern int ima_path_check(struct path *path, int mask);
+extern int ima_path_check(struct path *path, int mask, int update_counts);
 extern void ima_file_free(struct file *file);
 extern int ima_file_mmap(struct file *file, unsigned long prot);
-extern void ima_shm_check(struct file *file);
+extern void ima_counts_get(struct file *file);
 
 #else
 static inline int ima_bprm_check(struct linux_binprm *bprm)
@@ -38,7 +41,7 @@ static inline void ima_inode_free(struct inode *inode)
        return;
 }
 
-static inline int ima_path_check(struct path *path, int mask)
+static inline int ima_path_check(struct path *path, int mask, int update_counts)
 {
        return 0;
 }
@@ -53,7 +56,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot)
        return 0;
 }
 
-static inline void ima_shm_check(struct file *file)
+static inline void ima_counts_get(struct file *file)
 {
        return;
 }
index d87247d2641f5cbf4eb69a90fea392b95684e588..6646bfc7b8929aeadbc1c12dbdd6f48dc07d7722 100644 (file)
@@ -145,8 +145,8 @@ extern struct cred init_cred;
        .group_leader   = &tsk,                                         \
        .real_cred      = &init_cred,                                   \
        .cred           = &init_cred,                                   \
-       .cred_exec_mutex =                                              \
-                __MUTEX_INITIALIZER(tsk.cred_exec_mutex),              \
+       .cred_guard_mutex =                                             \
+                __MUTEX_INITIALIZER(tsk.cred_guard_mutex),             \
        .comm           = "swapper",                                    \
        .thread         = INIT_THREAD,                                  \
        .fs             = &init_fs,                                     \
@@ -174,6 +174,7 @@ extern struct cred init_cred;
        INIT_TRACE_IRQFLAGS                                             \
        INIT_LOCKDEP                                                    \
        INIT_FTRACE_GRAPH                                               \
+       INIT_TRACE_RECURSION                                            \
 }
 
 
index 91bb76f44f14e6488e909669ba3e3e94fba1104a..ff374ceface08d55c889c5698cdf6bcfda753655 100644 (file)
@@ -566,6 +566,6 @@ struct irq_desc;
 extern int early_irq_init(void);
 extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
-extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
+extern int arch_init_chip_data(struct irq_desc *desc, int node);
 
 #endif
index b7cbeed972e425b694f1666343f6fc0558ac439e..eedbb8e5e0ccff99c79594b6485fd9259049d088 100644 (file)
@@ -117,7 +117,7 @@ struct irq_chip {
        void            (*eoi)(unsigned int irq);
 
        void            (*end)(unsigned int irq);
-       void            (*set_affinity)(unsigned int irq,
+       int             (*set_affinity)(unsigned int irq,
                                        const struct cpumask *dest);
        int             (*retrigger)(unsigned int irq);
        int             (*set_type)(unsigned int irq, unsigned int flow_type);
@@ -187,7 +187,7 @@ struct irq_desc {
        spinlock_t              lock;
 #ifdef CONFIG_SMP
        cpumask_var_t           affinity;
-       unsigned int            cpu;
+       unsigned int            node;
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        cpumask_var_t           pending_mask;
 #endif
@@ -201,26 +201,23 @@ struct irq_desc {
 } ____cacheline_internodealigned_in_smp;
 
 extern void arch_init_copy_chip_data(struct irq_desc *old_desc,
-                                       struct irq_desc *desc, int cpu);
+                                       struct irq_desc *desc, int node);
 extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc);
 
 #ifndef CONFIG_SPARSE_IRQ
 extern struct irq_desc irq_desc[NR_IRQS];
-#else /* CONFIG_SPARSE_IRQ */
-extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
-#endif /* CONFIG_SPARSE_IRQ */
-
-extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
+#endif
 
-static inline struct irq_desc *
-irq_remap_to_desc(unsigned int irq, struct irq_desc *desc)
-{
-#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC
-       return irq_to_desc(irq);
+#ifdef CONFIG_NUMA_IRQ_DESC
+extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int node);
 #else
+static inline struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
+{
        return desc;
-#endif
 }
+#endif
+
+extern struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node);
 
 /*
  * Migration helpers for obsolete names, they will go away:
@@ -386,7 +383,7 @@ extern void set_irq_noprobe(unsigned int irq);
 extern void set_irq_probe(unsigned int irq);
 
 /* Handle dynamic irq creation and destruction */
-extern unsigned int create_irq_nr(unsigned int irq_want);
+extern unsigned int create_irq_nr(unsigned int irq_want, int node);
 extern int create_irq(void);
 extern void destroy_irq(unsigned int irq);
 
@@ -424,47 +421,48 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #ifdef CONFIG_SMP
 /**
- * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:      pointer to irq_desc struct
  * @node:      node on which the cpumasks are allocated
  * @boot:      true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
- * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
                                                                bool boot)
 {
-       int node;
-
+#ifdef CONFIG_CPUMASK_OFFSTACK
        if (boot) {
                alloc_bootmem_cpumask_var(&desc->affinity);
-               cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
                alloc_bootmem_cpumask_var(&desc->pending_mask);
-               cpumask_clear(desc->pending_mask);
 #endif
                return true;
        }
 
-       node = cpu_to_node(cpu);
-
        if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
                return false;
-       cpumask_setall(desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
        if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
                free_cpumask_var(desc->affinity);
                return false;
        }
-       cpumask_clear(desc->pending_mask);
+#endif
 #endif
        return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+       cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+       cpumask_clear(desc->pending_mask);
+#endif
+}
+
 /**
  * init_copy_desc_masks - copy cpumasks for irq_desc
  * @old_desc:  pointer to old irq_desc struct
@@ -478,7 +476,7 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
                                        struct irq_desc *new_desc)
 {
-#ifdef CONFIG_CPUMASKS_OFFSTACK
+#ifdef CONFIG_CPUMASK_OFFSTACK
        cpumask_copy(new_desc->affinity, old_desc->affinity);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
@@ -499,12 +497,16 @@ static inline void free_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
+static inline bool alloc_desc_masks(struct irq_desc *desc, int node,
                                                                bool boot)
 {
        return true;
 }
 
+static inline void init_desc_masks(struct irq_desc *desc)
+{
+}
+
 static inline void init_copy_desc_masks(struct irq_desc *old_desc,
                                        struct irq_desc *new_desc)
 {
diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
new file mode 100644 (file)
index 0000000..b616d39
--- /dev/null
@@ -0,0 +1,25 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <trace/events/kmem.h>
+
+#ifdef CONFIG_KMEMTRACE
+extern void kmemtrace_init(void);
+#else
+static inline void kmemtrace_init(void)
+{
+}
+#endif
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+
diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h
new file mode 100644 (file)
index 0000000..e461b2c
--- /dev/null
@@ -0,0 +1,111 @@
+/*
+ * Common LSM logging functions
+ * Heavily borrowed from selinux/avc.h
+ *
+ * Author : Etienne BASSET  <etienne.basset@ensta.org>
+ *
+ * All credits to : Stephen Smalley, <sds@epoch.ncsc.mil>
+ * All BUGS to : Etienne BASSET  <etienne.basset@ensta.org>
+ */
+#ifndef _LSM_COMMON_LOGGING_
+#define _LSM_COMMON_LOGGING_
+
+#include <linux/stddef.h>
+#include <linux/errno.h>
+#include <linux/kernel.h>
+#include <linux/kdev_t.h>
+#include <linux/spinlock.h>
+#include <linux/init.h>
+#include <linux/audit.h>
+#include <linux/in6.h>
+#include <linux/path.h>
+#include <linux/key.h>
+#include <linux/skbuff.h>
+#include <asm/system.h>
+
+
+/* Auxiliary data to use in generating the audit record. */
+struct common_audit_data {
+       char    type;
+#define LSM_AUDIT_DATA_FS      1
+#define LSM_AUDIT_DATA_NET     2
+#define LSM_AUDIT_DATA_CAP     3
+#define LSM_AUDIT_DATA_IPC     4
+#define LSM_AUDIT_DATA_TASK    5
+#define LSM_AUDIT_DATA_KEY     6
+       struct task_struct *tsk;
+       union   {
+               struct {
+                       struct path path;
+                       struct inode *inode;
+               } fs;
+               struct {
+                       int netif;
+                       struct sock *sk;
+                       u16 family;
+                       __be16 dport;
+                       __be16 sport;
+                       union {
+                               struct {
+                                       __be32 daddr;
+                                       __be32 saddr;
+                               } v4;
+                               struct {
+                                       struct in6_addr daddr;
+                                       struct in6_addr saddr;
+                               } v6;
+                       } fam;
+               } net;
+               int cap;
+               int ipc_id;
+               struct task_struct *tsk;
+#ifdef CONFIG_KEYS
+               struct {
+                       key_serial_t key;
+                       char *key_desc;
+               } key_struct;
+#endif
+       } u;
+       const char *function;
+       /* this union contains LSM specific data */
+       union {
+               /* SMACK data */
+               struct smack_audit_data {
+                       char *subject;
+                       char *object;
+                       char *request;
+                       int result;
+               } smack_audit_data;
+               /* SELinux data */
+               struct {
+                       u32 ssid;
+                       u32 tsid;
+                       u16 tclass;
+                       u32 requested;
+                       u32 audited;
+                       struct av_decision *avd;
+                       int result;
+               } selinux_audit_data;
+       } lsm_priv;
+       /* these callbacks will be implemented by a specific LSM */
+       void (*lsm_pre_audit)(struct audit_buffer *, void *);
+       void (*lsm_post_audit)(struct audit_buffer *, void *);
+};
+
+#define v4info fam.v4
+#define v6info fam.v6
+
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+               struct common_audit_data *ad, u8 *proto);
+
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+               struct common_audit_data *ad, u8 *proto);
+
+/* Initialize an LSM audit data structure. */
+#define COMMON_AUDIT_DATA_INIT(_d, _t) \
+       { memset((_d), 0, sizeof(struct common_audit_data)); \
+        (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; }
+
+void common_lsm_audit(struct common_audit_data *a);
+
+#endif
index 5b4e28bcb788de92ffc3993c39e912f675a11c7f..927138cf30502c757f806b76471c86bb45466f5d 100644 (file)
@@ -9,6 +9,7 @@
 #define DEBUGFS_MAGIC          0x64626720
 #define SYSFS_MAGIC            0x62656572
 #define SECURITYFS_MAGIC       0x73636673
+#define SELINUX_MAGIC          0xf97cff8c
 #define TMPFS_MAGIC            0x01021994
 #define SQUASHFS_MAGIC         0x73717368
 #define EFS_SUPER_MAGIC                0x414A53
index bff1f0d475c7593240d5a464ae8ecc7c03e07299..ad613ed66ab07b60e0f859eba21be6c6c4a1202e 100644 (file)
@@ -19,6 +19,7 @@ struct anon_vma;
 struct file_ra_state;
 struct user_struct;
 struct writeback_control;
+struct rlimit;
 
 #ifndef CONFIG_DISCONTIGMEM          /* Don't use mapnrs, do it properly */
 extern unsigned long max_mapnr;
@@ -580,12 +581,10 @@ static inline void set_page_links(struct page *page, enum zone_type zone,
  */
 static inline unsigned long round_hint_to_min(unsigned long hint)
 {
-#ifdef CONFIG_SECURITY
        hint &= PAGE_MASK;
        if (((void *)hint != NULL) &&
            (hint < mmap_min_addr))
                return PAGE_ALIGN(mmap_min_addr);
-#endif
        return hint;
 }
 
@@ -1031,8 +1030,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn,
                                        unsigned long end_pfn);
 extern void remove_active_range(unsigned int nid, unsigned long start_pfn,
                                        unsigned long end_pfn);
-extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn,
-                                       unsigned long end_pfn);
 extern void remove_all_active_ranges(void);
 extern unsigned long absent_pages_in_range(unsigned long start_pfn,
                                                unsigned long end_pfn);
@@ -1319,8 +1316,8 @@ int vmemmap_populate_basepages(struct page *start_page,
 int vmemmap_populate(struct page *start_page, unsigned long pages, int node);
 void vmemmap_populate_print_last(void);
 
-extern void *alloc_locked_buffer(size_t size);
-extern void free_locked_buffer(void *buffer, size_t size);
-extern void release_locked_buffer(void *buffer, size_t size);
+extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+                                size_t size);
+extern void refund_locked_memory(struct mm_struct *mm, size_t size);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_MM_H */
index 3d1b7bde128361d0905bfbb6b28375c91d6465c3..97491f78b08cd72138ed25750145275ca1dabb90 100644 (file)
@@ -30,6 +30,8 @@ extern unsigned int kmmio_count;
 
 extern int register_kmmio_probe(struct kmmio_probe *p);
 extern void unregister_kmmio_probe(struct kmmio_probe *p);
+extern int kmmio_init(void);
+extern void kmmio_cleanup(void);
 
 #ifdef CONFIG_MMIOTRACE
 /* kmmio is active by some kmmio_probes? */
index 627ac082e2a64ad6734a665b270a3d11705e86d2..a8f2c0aa4c328af87718285f715bc72b19d7f55f 100644 (file)
@@ -337,6 +337,14 @@ struct module
        const char **trace_bprintk_fmt_start;
        unsigned int num_trace_bprintk_fmt;
 #endif
+#ifdef CONFIG_EVENT_TRACING
+       struct ftrace_event_call *trace_events;
+       unsigned int num_trace_events;
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+       unsigned long *ftrace_callsites;
+       unsigned int num_ftrace_callsites;
+#endif
 
 #ifdef CONFIG_MODULE_UNLOAD
        /* What modules depend on me? */
index 3069ec7e0ab84ca54a282c72d73385e44e69d63a..878cab4f5fcc5db95585184c22aea5d905a34295 100644 (file)
@@ -150,5 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock);
  */
 extern int mutex_trylock(struct mutex *lock);
 extern void mutex_unlock(struct mutex *lock);
+extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock);
 
 #endif
index 0f71812d67d327283b551d2a3c84b762fa2617b4..d7d1c41a0b17db03da5768b35233f409b98b9b7d 100644 (file)
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_U       0xC118
 #define PCI_DEVICE_ID_OXSEMI_PCIe952_1_GU      0xC11C
 #define PCI_DEVICE_ID_OXSEMI_16PCI954  0x9501
+#define PCI_DEVICE_ID_OXSEMI_C950      0x950B
 #define PCI_DEVICE_ID_OXSEMI_16PCI95N  0x9511
 #define PCI_DEVICE_ID_OXSEMI_16PCI954PP        0x9513
 #define PCI_DEVICE_ID_OXSEMI_16PCI952  0x9521
 #define PCI_DEVICE_ID_OXSEMI_16PCI952PP        0x9523
+#define PCI_SUBDEVICE_ID_OXSEMI_C950   0x0001
 
 #define PCI_VENDOR_ID_CHELSIO          0x1425
 
index 67c15653fc230c9341ed1131c1e846156c6772bb..59e133d39d5091c4e5628212b4d563756fe02bec 100644 (file)
@@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child,
                          struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
 extern void exit_ptrace(struct task_struct *tracer);
-extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
 /* Returns 0 on success, -errno on denial. */
@@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task)
 #define arch_ptrace_untrace(task)              do { } while (0)
 #endif
 
-#ifndef arch_ptrace_fork
-/*
- * Do machine-specific work to initialize a new task.
- *
- * This is called from copy_process().
- */
-#define arch_ptrace_fork(child, clone_flags)   do { } while (0)
-#endif
-
 extern int task_current_syscall(struct task_struct *target, long *callno,
                                unsigned long args[6], unsigned int maxargs,
                                unsigned long *sp, unsigned long *pc);
diff --git a/include/linux/rational.h b/include/linux/rational.h
new file mode 100644 (file)
index 0000000..4f532fc
--- /dev/null
@@ -0,0 +1,19 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers,
+ * e.g. when calculating optimum numerator/denominator pairs for
+ * pll configuration taking into account restricted register size
+ */
+
+#ifndef _LINUX_RATIONAL_H
+#define _LINUX_RATIONAL_H
+
+void rational_best_approximation(
+       unsigned long given_numerator, unsigned long given_denominator,
+       unsigned long max_numerator, unsigned long max_denominator,
+       unsigned long *best_numerator, unsigned long *best_denominator);
+
+#endif /* _LINUX_RATIONAL_H */
index e649bd3f2c976c3f5bed58c067c351a336403e75..5710f43bbc9ec0aa42eabd74f851eda4a9a6f213 100644 (file)
@@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list,
        at->prev = last;
 }
 
+/**
+ * list_entry_rcu - get the struct for this entry
+ * @ptr:        the &struct list_head pointer.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_entry_rcu(ptr, type, member) \
+       container_of(rcu_dereference(ptr), type, member)
+
+/**
+ * list_first_entry_rcu - get the first element from a list
+ * @ptr:        the list head to take the element from.
+ * @type:       the type of the struct this is embedded in.
+ * @member:     the name of the list_struct within the struct.
+ *
+ * Note that the list is expected to be non-empty.
+ *
+ * This primitive may safely run concurrently with the _rcu list-mutation
+ * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock().
+ */
+#define list_first_entry_rcu(ptr, type, member) \
+       list_entry_rcu((ptr)->next, type, member)
+
 #define __list_for_each_rcu(pos, head) \
        for (pos = rcu_dereference((head)->next); \
                pos != (head); \
@@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list,
  * as long as the traversal is guarded by rcu_read_lock().
  */
 #define list_for_each_entry_rcu(pos, head, member) \
-       for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \
+       for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \
                prefetch(pos->member.next), &pos->member != (head); \
-               pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member))
+               pos = list_entry_rcu(pos->member.next, typeof(*pos), member))
 
 
 /**
index 58b2aa5312b9caaae76f7f9dcb7e9cfcc7f06605..5a5153806c42562c730e8911099fea05c57bf4ad 100644 (file)
@@ -161,8 +161,15 @@ struct rcu_data {
        unsigned long offline_fqs;      /* Kicked due to being offline. */
        unsigned long resched_ipi;      /* Sent a resched IPI. */
 
-       /* 5) For future __rcu_pending statistics. */
+       /* 5) __rcu_pending() statistics. */
        long n_rcu_pending;             /* rcu_pending() calls since boot. */
+       long n_rp_qs_pending;
+       long n_rp_cb_ready;
+       long n_rp_cpu_needs_gp;
+       long n_rp_gp_completed;
+       long n_rp_gp_started;
+       long n_rp_need_fqs;
+       long n_rp_need_nothing;
 
        int cpu;
 };
index e1b7b2173885f8f14f4a8da3979d0a6eff08e186..8670f1575fe19abe7793b700aa70d525080c83bf 100644 (file)
@@ -11,7 +11,7 @@ struct ring_buffer_iter;
  * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
-       u32             type:2, len:3, time_delta:27;
+       u32             type_len:5, time_delta:27;
        u32             array[];
 };
 
@@ -24,7 +24,8 @@ struct ring_buffer_event {
  *                               size is variable depending on how much
  *                               padding is needed
  *                              If time_delta is non zero:
- *                               everything else same as RINGBUF_TYPE_DATA
+ *                               array[0] holds the actual length
+ *                               size = 4 + length (bytes)
  *
  * @RINGBUF_TYPE_TIME_EXTEND:  Extend the time delta
  *                              array[0] = time delta (28 .. 59)
@@ -35,22 +36,23 @@ struct ring_buffer_event {
  *                              array[1..2] = tv_sec
  *                              size = 16 bytes
  *
- * @RINGBUF_TYPE_DATA:         Data record
- *                              If len is zero:
+ * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX:
+ *                             Data record
+ *                              If type_len is zero:
  *                               array[0] holds the actual length
  *                               array[1..(length+3)/4] holds data
- *                               size = 4 + 4 + length (bytes)
+ *                               size = 4 + length (bytes)
  *                              else
- *                               length = len << 2
+ *                               length = type_len << 2
  *                               array[0..(length+3)/4-1] holds data
  *                               size = 4 + length (bytes)
  */
 enum ring_buffer_type {
+       RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28,
        RINGBUF_TYPE_PADDING,
        RINGBUF_TYPE_TIME_EXTEND,
        /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */
        RINGBUF_TYPE_TIME_STAMP,
-       RINGBUF_TYPE_DATA,
 };
 
 unsigned ring_buffer_event_length(struct ring_buffer_event *event);
@@ -68,13 +70,54 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
        return event->time_delta;
 }
 
+/*
+ * ring_buffer_event_discard can discard any event in the ring buffer.
+ *   it is up to the caller to protect against a reader from
+ *   consuming it or a writer from wrapping and replacing it.
+ *
+ * No external protection is needed if this is called before
+ * the event is committed. But in that case it would be better to
+ * use ring_buffer_discard_commit.
+ *
+ * Note, if an event that has not been committed is discarded
+ * with ring_buffer_event_discard, it must still be committed.
+ */
 void ring_buffer_event_discard(struct ring_buffer_event *event);
 
+/*
+ * ring_buffer_discard_commit will remove an event that has not
+ *   been committed yet. If this is used, then ring_buffer_unlock_commit
+ *   must not be called on the discarded event. This function
+ *   will try to remove the event from the ring buffer completely
+ *   if another event has not been written after it.
+ *
+ * Example use:
+ *
+ *  if (some_condition)
+ *    ring_buffer_discard_commit(buffer, event);
+ *  else
+ *    ring_buffer_unlock_commit(buffer, event);
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+                               struct ring_buffer_event *event);
+
 /*
  * size is in bytes for each per CPU buffer.
  */
 struct ring_buffer *
-ring_buffer_alloc(unsigned long size, unsigned flags);
+__ring_buffer_alloc(unsigned long size, unsigned flags, struct lock_class_key *key);
+
+/*
+ * Because the ring buffer is generic, if other users of the ring buffer get
+ * traced by ftrace, it can produce lockdep warnings. We need to keep each
+ * ring buffer's lock class separate.
+ */
+#define ring_buffer_alloc(size, flags)                 \
+({                                                     \
+       static struct lock_class_key __key;             \
+       __ring_buffer_alloc((size), (flags), &__key);   \
+})
+
 void ring_buffer_free(struct ring_buffer *buffer);
 
 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
@@ -122,6 +165,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer);
 unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu);
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu);
 
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
 void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
@@ -137,6 +182,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
 int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
                          size_t len, int cpu, int full);
 
+struct trace_seq;
+
+int ring_buffer_print_entry_header(struct trace_seq *s);
+int ring_buffer_print_page_header(struct trace_seq *s);
+
 enum ring_buffer_flags {
        RB_FL_OVERWRITE         = 1 << 0,
 };
index b4c38bc8049cbbea17e0ca4f929f35df9cddbe1f..42bf2766111e5585f24812fbc7183d81316b8f62 100644 (file)
@@ -77,6 +77,7 @@ struct sched_param {
 #include <linux/proportions.h>
 #include <linux/seccomp.h>
 #include <linux/rcupdate.h>
+#include <linux/rculist.h>
 #include <linux/rtmutex.h>
 
 #include <linux/time.h>
@@ -96,8 +97,8 @@ struct exec_domain;
 struct futex_pi_state;
 struct robust_list_head;
 struct bio;
-struct bts_tracer;
 struct fs_struct;
+struct bts_context;
 
 /*
  * List of flags we want to share for kernel threads,
@@ -116,6 +117,7 @@ struct fs_struct;
  *    11 bit fractions.
  */
 extern unsigned long avenrun[];                /* Load averages */
+extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
 
 #define FSHIFT         11              /* nr of bits of precision */
 #define FIXED_1                (1<<FSHIFT)     /* 1.0 as fixed-point */
@@ -135,8 +137,8 @@ DECLARE_PER_CPU(unsigned long, process_counts);
 extern int nr_processes(void);
 extern unsigned long nr_running(void);
 extern unsigned long nr_uninterruptible(void);
-extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
+extern void calc_global_load(void);
 
 extern unsigned long get_parent_ip(unsigned long addr);
 
@@ -838,7 +840,17 @@ struct sched_group {
         */
        u32 reciprocal_cpu_power;
 
-       unsigned long cpumask[];
+       /*
+        * The CPUs this group covers.
+        *
+        * NOTE: this field is variable length. (Allocated dynamically
+        * by attaching extra space to the end of the structure,
+        * depending on how many CPUs the kernel has booted up with)
+        *
+        * It is also embedded into static data structures at build
+        * time. (See 'struct static_sched_group' in kernel/sched.c)
+        */
+       unsigned long cpumask[0];
 };
 
 static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
@@ -924,8 +936,17 @@ struct sched_domain {
        char *name;
 #endif
 
-       /* span of all CPUs in this domain */
-       unsigned long span[];
+       /*
+        * Span of all CPUs in this domain.
+        *
+        * NOTE: this field is variable length. (Allocated dynamically
+        * by attaching extra space to the end of the structure,
+        * depending on how many CPUs the kernel has booted up with)
+        *
+        * It can also be embedded into static data structures at build
+        * time. (See 'struct static_sched_domain' in kernel/sched.c)
+        */
+       unsigned long span[0];
 };
 
 static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
@@ -1209,18 +1230,11 @@ struct task_struct {
        struct list_head ptraced;
        struct list_head ptrace_entry;
 
-#ifdef CONFIG_X86_PTRACE_BTS
        /*
         * This is the tracer handle for the ptrace BTS extension.
         * This field actually belongs to the ptracer task.
         */
-       struct bts_tracer *bts;
-       /*
-        * The buffer to hold the BTS data.
-        */
-       void *bts_buffer;
-       size_t bts_size;
-#endif /* CONFIG_X86_PTRACE_BTS */
+       struct bts_context *bts;
 
        /* PID/PID hash table linkage. */
        struct pid_link pids[PIDTYPE_MAX];
@@ -1247,7 +1261,9 @@ struct task_struct {
                                         * credentials (COW) */
        const struct cred *cred;        /* effective (overridable) subjective task
                                         * credentials (COW) */
-       struct mutex cred_exec_mutex;   /* execve vs ptrace cred calculation mutex */
+       struct mutex cred_guard_mutex;  /* guard against foreign influences on
+                                        * credential calculations
+                                        * (notably, ptrace) */
 
        char comm[TASK_COMM_LEN]; /* executable name excluding path
                                     - access with [gs]et_task_comm (which lock
@@ -1428,7 +1444,9 @@ struct task_struct {
 #ifdef CONFIG_TRACING
        /* state flags for use by tracers */
        unsigned long trace;
-#endif
+       /* bitmask of trace recursion */
+       unsigned long trace_recursion;
+#endif /* CONFIG_TRACING */
 };
 
 /* Future-safe accessor for struct task_struct's cpus_allowed. */
@@ -1885,6 +1903,7 @@ extern void sched_dead(struct task_struct *p);
 
 extern void proc_caches_init(void);
 extern void flush_signals(struct task_struct *);
+extern void __flush_signals(struct task_struct *);
 extern void ignore_signals(struct task_struct *);
 extern void flush_signal_handlers(struct task_struct *, int force_default);
 extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
@@ -2001,8 +2020,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from);
 extern char *get_task_comm(char *to, struct task_struct *tsk);
 
 #ifdef CONFIG_SMP
+extern void wait_task_context_switch(struct task_struct *p);
 extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
 #else
+static inline void wait_task_context_switch(struct task_struct *p) {}
 static inline unsigned long wait_task_inactive(struct task_struct *p,
                                               long match_state)
 {
@@ -2010,7 +2031,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p,
 }
 #endif
 
-#define next_task(p)   list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks)
+#define next_task(p) \
+       list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
 
 #define for_each_process(p) \
        for (p = &init_task ; (p = next_task(p)) != &init_task ; )
@@ -2049,8 +2071,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2)
 
 static inline struct task_struct *next_thread(const struct task_struct *p)
 {
-       return list_entry(rcu_dereference(p->thread_group.next),
-                         struct task_struct, thread_group);
+       return list_entry_rcu(p->thread_group.next,
+                             struct task_struct, thread_group);
 }
 
 static inline int thread_group_empty(struct task_struct *p)
index d5fd6163606fa028f7db538825cc055a051e0d51..5eff459b38338ace06bb7d9e83d10a462b906203 100644 (file)
@@ -2197,6 +2197,8 @@ static inline int security_file_mmap(struct file *file, unsigned long reqprot,
                                     unsigned long addr,
                                     unsigned long addr_only)
 {
+       if ((addr < mmap_min_addr) && !capable(CAP_SYS_RAWIO))
+               return -EACCES;
        return 0;
 }
 
index 9136cc5608c3df2b15145ab7f1fbe09c966fd8f9..e5bb75a6380219256c52dce6c75be8aebddb2aaf 100644 (file)
@@ -96,54 +96,76 @@ struct serial_uart_config {
 
 /*
  * Definitions for async_struct (and serial_struct) flags field
+ *
+ * Define ASYNCB_* for convenient use with {test,set,clear}_bit.
  */
-#define ASYNC_HUP_NOTIFY 0x0001 /* Notify getty on hangups and closes 
-                                  on the callout port */
-#define ASYNC_FOURPORT  0x0002 /* Set OU1, OUT2 per AST Fourport settings */
-#define ASYNC_SAK      0x0004  /* Secure Attention Key (Orange book) */
-#define ASYNC_SPLIT_TERMIOS 0x0008 /* Separate termios for dialin/callout */
-
-#define ASYNC_SPD_MASK 0x1030
-#define ASYNC_SPD_HI   0x0010  /* Use 56000 instead of 38400 bps */
-
-#define ASYNC_SPD_VHI  0x0020  /* Use 115200 instead of 38400 bps */
-#define ASYNC_SPD_CUST 0x0030  /* Use user-specified divisor */
-
-#define ASYNC_SKIP_TEST        0x0040 /* Skip UART test during autoconfiguration */
-#define ASYNC_AUTO_IRQ  0x0080 /* Do automatic IRQ during autoconfiguration */
-#define ASYNC_SESSION_LOCKOUT 0x0100 /* Lock out cua opens based on session */
-#define ASYNC_PGRP_LOCKOUT    0x0200 /* Lock out cua opens based on pgrp */
-#define ASYNC_CALLOUT_NOHUP   0x0400 /* Don't do hangups for cua device */
-
-#define ASYNC_HARDPPS_CD       0x0800  /* Call hardpps when CD goes high  */
-
-#define ASYNC_SPD_SHI  0x1000  /* Use 230400 instead of 38400 bps */
-#define ASYNC_SPD_WARP 0x1010  /* Use 460800 instead of 38400 bps */
-
-#define ASYNC_LOW_LATENCY 0x2000 /* Request low latency behaviour */
-
-#define ASYNC_BUGGY_UART  0x4000 /* This is a buggy UART, skip some safety
-                                 * checks.  Note: can be dangerous! */
-
-#define ASYNC_AUTOPROBE         0x8000 /* Port was autoprobed by PCI or PNP code */
-
-#define ASYNC_FLAGS    0x7FFF  /* Possible legal async flags */
-#define ASYNC_USR_MASK 0x3430  /* Legal flags that non-privileged
-                                * users can set or reset */
-
-/* Internal flags used only by kernel/chr_drv/serial.c */
-#define ASYNC_INITIALIZED      0x80000000 /* Serial port was initialized */
-#define ASYNC_NORMAL_ACTIVE    0x20000000 /* Normal device is active */
-#define ASYNC_BOOT_AUTOCONF    0x10000000 /* Autoconfigure port on bootup */
-#define ASYNC_CLOSING          0x08000000 /* Serial port is closing */
-#define ASYNC_CTS_FLOW         0x04000000 /* Do CTS flow control */
-#define ASYNC_CHECK_CD         0x02000000 /* i.e., CLOCAL */
-#define ASYNC_SHARE_IRQ                0x01000000 /* for multifunction cards
-                                            --- no longer used */
-#define ASYNC_CONS_FLOW                0x00800000 /* flow control for console  */
-
-#define ASYNC_BOOT_ONLYMCA     0x00400000 /* Probe only if MCA bus */
-#define ASYNC_INTERNAL_FLAGS   0xFFC00000 /* Internal flags */
+#define ASYNCB_HUP_NOTIFY       0 /* Notify getty on hangups and closes
+                                   * on the callout port */
+#define ASYNCB_FOURPORT                 1 /* Set OUT1, OUT2 per AST Fourport settings */
+#define ASYNCB_SAK              2 /* Secure Attention Key (Orange book) */
+#define ASYNCB_SPLIT_TERMIOS    3 /* Separate termios for dialin/callout */
+#define ASYNCB_SPD_HI           4 /* Use 56000 instead of 38400 bps */
+#define ASYNCB_SPD_VHI          5 /* Use 115200 instead of 38400 bps */
+#define ASYNCB_SKIP_TEST        6 /* Skip UART test during autoconfiguration */
+#define ASYNCB_AUTO_IRQ                 7 /* Do automatic IRQ during
+                                   * autoconfiguration */
+#define ASYNCB_SESSION_LOCKOUT  8 /* Lock out cua opens based on session */
+#define ASYNCB_PGRP_LOCKOUT     9 /* Lock out cua opens based on pgrp */
+#define ASYNCB_CALLOUT_NOHUP   10 /* Don't do hangups for cua device */
+#define ASYNCB_HARDPPS_CD      11 /* Call hardpps when CD goes high  */
+#define ASYNCB_SPD_SHI         12 /* Use 230400 instead of 38400 bps */
+#define ASYNCB_LOW_LATENCY     13 /* Request low latency behaviour */
+#define ASYNCB_BUGGY_UART      14 /* This is a buggy UART, skip some safety
+                                   * checks.  Note: can be dangerous! */
+#define ASYNCB_AUTOPROBE       15 /* Port was autoprobed by PCI or PNP code */
+#define ASYNCB_LAST_USER       15
+
+/* Internal flags used only by kernel */
+#define ASYNCB_INITIALIZED     31 /* Serial port was initialized */
+#define ASYNCB_NORMAL_ACTIVE   29 /* Normal device is active */
+#define ASYNCB_BOOT_AUTOCONF   28 /* Autoconfigure port on bootup */
+#define ASYNCB_CLOSING         27 /* Serial port is closing */
+#define ASYNCB_CTS_FLOW                26 /* Do CTS flow control */
+#define ASYNCB_CHECK_CD                25 /* i.e., CLOCAL */
+#define ASYNCB_SHARE_IRQ       24 /* for multifunction cards, no longer used */
+#define ASYNCB_CONS_FLOW       23 /* flow control for console  */
+#define ASYNCB_BOOT_ONLYMCA    22 /* Probe only if MCA bus */
+#define ASYNCB_FIRST_KERNEL    22
+
+#define ASYNC_HUP_NOTIFY       (1U << ASYNCB_HUP_NOTIFY)
+#define ASYNC_FOURPORT         (1U << ASYNCB_FOURPORT)
+#define ASYNC_SAK              (1U << ASYNCB_SAK)
+#define ASYNC_SPLIT_TERMIOS    (1U << ASYNCB_SPLIT_TERMIOS)
+#define ASYNC_SPD_HI           (1U << ASYNCB_SPD_HI)
+#define ASYNC_SPD_VHI          (1U << ASYNCB_SPD_VHI)
+#define ASYNC_SKIP_TEST                (1U << ASYNCB_SKIP_TEST)
+#define ASYNC_AUTO_IRQ         (1U << ASYNCB_AUTO_IRQ)
+#define ASYNC_SESSION_LOCKOUT  (1U << ASYNCB_SESSION_LOCKOUT)
+#define ASYNC_PGRP_LOCKOUT     (1U << ASYNCB_PGRP_LOCKOUT)
+#define ASYNC_CALLOUT_NOHUP    (1U << ASYNCB_CALLOUT_NOHUP)
+#define ASYNC_HARDPPS_CD       (1U << ASYNCB_HARDPPS_CD)
+#define ASYNC_SPD_SHI          (1U << ASYNCB_SPD_SHI)
+#define ASYNC_LOW_LATENCY      (1U << ASYNCB_LOW_LATENCY)
+#define ASYNC_BUGGY_UART       (1U << ASYNCB_BUGGY_UART)
+#define ASYNC_AUTOPROBE                (1U << ASYNCB_AUTOPROBE)
+
+#define ASYNC_FLAGS            ((1U << ASYNCB_LAST_USER) - 1)
+#define ASYNC_USR_MASK         (ASYNC_SPD_HI|ASYNC_SPD_VHI| \
+               ASYNC_CALLOUT_NOHUP|ASYNC_SPD_SHI|ASYNC_LOW_LATENCY)
+#define ASYNC_SPD_CUST         (ASYNC_SPD_HI|ASYNC_SPD_VHI)
+#define ASYNC_SPD_WARP         (ASYNC_SPD_HI|ASYNC_SPD_SHI)
+#define ASYNC_SPD_MASK         (ASYNC_SPD_HI|ASYNC_SPD_VHI|ASYNC_SPD_SHI)
+
+#define ASYNC_INITIALIZED      (1U << ASYNCB_INITIALIZED)
+#define ASYNC_NORMAL_ACTIVE    (1U << ASYNCB_NORMAL_ACTIVE)
+#define ASYNC_BOOT_AUTOCONF    (1U << ASYNCB_BOOT_AUTOCONF)
+#define ASYNC_CLOSING          (1U << ASYNCB_CLOSING)
+#define ASYNC_CTS_FLOW         (1U << ASYNCB_CTS_FLOW)
+#define ASYNC_CHECK_CD         (1U << ASYNCB_CHECK_CD)
+#define ASYNC_SHARE_IRQ                (1U << ASYNCB_SHARE_IRQ)
+#define ASYNC_CONS_FLOW                (1U << ASYNCB_CONS_FLOW)
+#define ASYNC_BOOT_ONLYMCA     (1U << ASYNCB_BOOT_ONLYMCA)
+#define ASYNC_INTERNAL_FLAGS   (~((1U << ASYNCB_FIRST_KERNEL) - 1))
 
 /*
  * Multiport serial configuration structure --- external structure
index 57a97e52e58d0339d0e3dfd1271e143e865d68a4..6fd80c4243f1bd089d8356d7ed0e181ed911753e 100644 (file)
@@ -41,7 +41,8 @@
 #define PORT_XSCALE    15
 #define PORT_RM9000    16      /* PMC-Sierra RM9xxx internal UART */
 #define PORT_OCTEON    17      /* Cavium OCTEON internal UART */
-#define PORT_MAX_8250  17      /* max port ID */
+#define PORT_AR7       18      /* Texas Instruments AR7 internal UART */
+#define PORT_MAX_8250  18      /* max port ID */
 
 /*
  * ARM specific type numbers.  These are not currently guaranteed
 /* MAX3100 */
 #define PORT_MAX3100    86
 
+/* Timberdale UART */
+#define PORT_TIMBUART  87
+
 #ifdef __KERNEL__
 
 #include <linux/compiler.h>
index 84f997f8aa53cfcc8b04ff86bb4b0263f0e7dc1a..c7552836bd954a0f0a0235feea5cf9acb5c1a2c8 100644 (file)
@@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig)
 extern int next_signal(struct sigpending *pending, sigset_t *mask);
 extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p);
 extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *);
+extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
+                                siginfo_t *info);
 extern long do_sigpending(void __user *, unsigned long);
 extern int sigprocmask(int, sigset_t *, sigset_t *);
 extern int show_unhandled_signals;
index 5ac9b0bcaf9adef1fdfddebd2aff6290a44f7b9f..713f841ecaa914e74aead8e4d8ff5d74cb5040d3 100644 (file)
@@ -14,7 +14,7 @@
 #include <asm/page.h>          /* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>         /* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 /* Size description struct for general caches. */
 struct cache_sizes {
index 5046f90c11710178127a20a7bcb8677b4e80d943..be5d40c43bd2e7aa96264e624b074f4a884826f4 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 enum stat_item {
        ALLOC_FASTPATH,         /* Allocation from cpu slab */
index 938234c4a996ba6e78521ffc60ecc197034fed9d..d4841ed8215b755ae2a291245687187479d4d27c 100644 (file)
@@ -60,6 +60,7 @@ static inline void __raw_spin_unlock(raw_spinlock_t *lock)
 #define __raw_spin_is_locked(lock)     ((void)(lock), 0)
 /* for sched.c and kernel_lock.c: */
 # define __raw_spin_lock(lock)         do { (void)(lock); } while (0)
+# define __raw_spin_lock_flags(lock, flags)    do { (void)(lock); } while (0)
 # define __raw_spin_unlock(lock)       do { (void)(lock); } while (0)
 # define __raw_spin_trylock(lock)      ({ (void)(lock); 1; })
 #endif /* DEBUG_SPINLOCK */
index ac9ff54f7cb305943e5444f27be83460ef398206..cb1a6631b8f449bcf2738f37208783dd23be87ba 100644 (file)
@@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs);
 
 extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev,
                                      phys_addr_t address);
-extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address);
+extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev,
+                                      dma_addr_t address);
 
 extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size);
 
index e6b820f8b56b8ae3bf50ac980d1e95ce3ad6536e..a8cc4e13434c4261ae8ab536566b23fcd777286a 100644 (file)
@@ -21,13 +21,14 @@ struct restart_block {
                struct {
                        unsigned long arg0, arg1, arg2, arg3;
                };
-               /* For futex_wait */
+               /* For futex_wait and futex_wait_requeue_pi */
                struct {
                        u32 *uaddr;
                        u32 val;
                        u32 flags;
                        u32 bitset;
                        u64 time;
+                       u32 *uaddr2;
                } futex;
                /* For nanosleep */
                struct {
diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
new file mode 100644 (file)
index 0000000..c68bccb
--- /dev/null
@@ -0,0 +1,92 @@
+#ifndef _LINUX_TRACE_SEQ_H
+#define _LINUX_TRACE_SEQ_H
+
+#include <linux/fs.h>
+
+/*
+ * Trace sequences are used to allow a function to call several other functions
+ * to create a string of data to use (up to a max of PAGE_SIZE).
+ */
+
+struct trace_seq {
+       unsigned char           buffer[PAGE_SIZE];
+       unsigned int            len;
+       unsigned int            readpos;
+};
+
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+       s->len = 0;
+       s->readpos = 0;
+}
+
+/*
+ * Currently only defined when tracing is enabled.
+ */
+#ifdef CONFIG_TRACING
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+       __attribute__ ((format (printf, 2, 3)));
+extern int trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+       __attribute__ ((format (printf, 2, 0)));
+extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern void trace_print_seq(struct seq_file *m, struct trace_seq *s);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                size_t cnt);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+                               size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+
+#else /* CONFIG_TRACING */
+static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+       return 0;
+}
+static inline int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+       return 0;
+}
+
+static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+}
+static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+                                size_t cnt)
+{
+       return 0;
+}
+static inline int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+       return 0;
+}
+static inline int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+       return 0;
+}
+static inline int
+trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
+{
+       return 0;
+}
+static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+                                      size_t len)
+{
+       return 0;
+}
+static inline void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+       return NULL;
+}
+static inline int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+       return 0;
+}
+#endif /* CONFIG_TRACING */
+
+#endif /* _LINUX_TRACE_SEQ_H */
index d35a7ee7611fe1025ed5ccaf01c6ea614c03d688..14df7e635d439e07d438e9890f0c8f718fa2aa15 100644 (file)
@@ -31,6 +31,8 @@ struct tracepoint {
                                         * Keep in sync with vmlinux.lds.h.
                                         */
 
+#ifndef DECLARE_TRACE
+
 #define TP_PROTO(args...)      args
 #define TP_ARGS(args...)               args
 
@@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin,
        struct tracepoint *end)
 { }
 #endif /* CONFIG_TRACEPOINTS */
+#endif /* DECLARE_TRACE */
 
 /*
  * Connect a probe to a tracepoint.
@@ -154,10 +157,8 @@ static inline void tracepoint_synchronize_unregister(void)
 }
 
 #define PARAMS(args...) args
-#define TRACE_FORMAT(name, proto, args, fmt)           \
-       DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
-
 
+#ifndef TRACE_EVENT
 /*
  * For use with the TRACE_EVENT macro:
  *
@@ -262,5 +263,6 @@ static inline void tracepoint_synchronize_unregister(void)
 
 #define TRACE_EVENT(name, proto, args, struct, assign, print)  \
        DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+#endif
 
 #endif
index fc39db95499fbab4142800b24faf52b182fae113..1488d8c81aac6587c9edc8002f39f89d104391cf 100644 (file)
@@ -185,7 +185,7 @@ struct tty_port;
 struct tty_port_operations {
        /* Return 1 if the carrier is raised */
        int (*carrier_raised)(struct tty_port *port);
-       void (*raise_dtr_rts)(struct tty_port *port);
+       void (*dtr_rts)(struct tty_port *port, int raise);
 };
        
 struct tty_port {
@@ -201,6 +201,9 @@ struct tty_port {
        unsigned char           *xmit_buf;      /* Optional buffer */
        int                     close_delay;    /* Close port delay */
        int                     closing_wait;   /* Delay for output */
+       int                     drain_delay;    /* Set to zero if no pure time
+                                                  based drain is needed else
+                                                  set to size of fifo */
 };
 
 /*
@@ -223,8 +226,11 @@ struct tty_struct {
        struct tty_driver *driver;
        const struct tty_operations *ops;
        int index;
-       /* The ldisc objects are protected by tty_ldisc_lock at the moment */
-       struct tty_ldisc ldisc;
+
+       /* Protects ldisc changes: Lock tty not pty */
+       struct mutex ldisc_mutex;
+       struct tty_ldisc *ldisc;
+
        struct mutex termios_mutex;
        spinlock_t ctrl_lock;
        /* Termios values are protected by the termios mutex */
@@ -311,6 +317,7 @@ struct tty_struct {
 #define TTY_CLOSING            7       /* ->close() in progress */
 #define TTY_LDISC              9       /* Line discipline attached */
 #define TTY_LDISC_CHANGING     10      /* Line discipline changing */
+#define TTY_LDISC_OPEN         11      /* Line discipline is open */
 #define TTY_HW_COOK_OUT        14      /* Hardware can do output cooking */
 #define TTY_HW_COOK_IN                 15      /* Hardware can do input cooking */
 #define TTY_PTY_LOCK           16      /* pty private */
@@ -403,6 +410,7 @@ extern int tty_termios_hw_change(struct ktermios *a, struct ktermios *b);
 extern struct tty_ldisc *tty_ldisc_ref(struct tty_struct *);
 extern void tty_ldisc_deref(struct tty_ldisc *);
 extern struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *);
+extern void tty_ldisc_hangup(struct tty_struct *tty);
 extern const struct file_operations tty_ldiscs_proc_fops;
 
 extern void tty_wakeup(struct tty_struct *tty);
@@ -425,6 +433,9 @@ extern struct tty_struct *tty_init_dev(struct tty_driver *driver, int idx,
 extern void tty_release_dev(struct file *filp);
 extern int tty_init_termios(struct tty_struct *tty);
 
+extern struct tty_struct *tty_pair_get_tty(struct tty_struct *tty);
+extern struct tty_struct *tty_pair_get_pty(struct tty_struct *tty);
+
 extern struct mutex tty_mutex;
 
 extern void tty_write_unlock(struct tty_struct *tty);
@@ -438,6 +449,7 @@ extern struct tty_struct *tty_port_tty_get(struct tty_port *port);
 extern void tty_port_tty_set(struct tty_port *port, struct tty_struct *tty);
 extern int tty_port_carrier_raised(struct tty_port *port);
 extern void tty_port_raise_dtr_rts(struct tty_port *port);
+extern void tty_port_lower_dtr_rts(struct tty_port *port);
 extern void tty_port_hangup(struct tty_port *port);
 extern int tty_port_block_til_ready(struct tty_port *port,
                                struct tty_struct *tty, struct file *filp);
index bcba84ea2d86ae280a2778ebcce9dc617c0b7e9c..3566129384a4d77e83d6496db87ea456eca2e9f6 100644 (file)
  *     the line discipline are close to full, and it should somehow
  *     signal that no more characters should be sent to the tty.
  *
- *     Optional: Always invoke via tty_throttle();
+ *     Optional: Always invoke via tty_throttle(), called under the
+ *     termios lock.
  * 
  * void (*unthrottle)(struct tty_struct * tty);
  *
  *     that characters can now be sent to the tty without fear of
  *     overrunning the input buffers of the line disciplines.
  * 
- *     Optional: Always invoke via tty_unthrottle();
+ *     Optional: Always invoke via tty_unthrottle(), called under the
+ *     termios lock.
  *
  * void (*stop)(struct tty_struct *tty);
  *
index 625e9e4639c68f3e9fc2b6b1d3ce4a734fa2113b..8cdfed738fe4b8b28d0f7208c8553f0cf2eaee03 100644 (file)
@@ -224,8 +224,7 @@ struct usb_serial_driver {
        /* Called by console with tty = NULL and by tty */
        int  (*open)(struct tty_struct *tty,
                        struct usb_serial_port *port, struct file *filp);
-       void (*close)(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+       void (*close)(struct usb_serial_port *port);
        int  (*write)(struct tty_struct *tty, struct usb_serial_port *port,
                        const unsigned char *buf, int count);
        /* Called only by the tty layer */
@@ -241,6 +240,10 @@ struct usb_serial_driver {
        int  (*tiocmget)(struct tty_struct *tty, struct file *file);
        int  (*tiocmset)(struct tty_struct *tty, struct file *file,
                         unsigned int set, unsigned int clear);
+       /* Called by the tty layer for port level work. There may or may not
+          be an attached tty at this point */
+       void (*dtr_rts)(struct usb_serial_port *port, int on);
+       int  (*carrier_raised)(struct usb_serial_port *port);
        /* USB events */
        void (*read_int_callback)(struct urb *urb);
        void (*write_int_callback)(struct urb *urb);
@@ -283,8 +286,7 @@ extern int usb_serial_generic_open(struct tty_struct *tty,
                struct usb_serial_port *port, struct file *filp);
 extern int usb_serial_generic_write(struct tty_struct *tty,
        struct usb_serial_port *port, const unsigned char *buf, int count);
-extern void usb_serial_generic_close(struct tty_struct *tty,
-                       struct usb_serial_port *port, struct file *filp);
+extern void usb_serial_generic_close(struct usb_serial_port *port);
 extern int usb_serial_generic_resume(struct usb_serial *serial);
 extern int usb_serial_generic_write_room(struct tty_struct *tty);
 extern int usb_serial_generic_chars_in_buffer(struct tty_struct *tty);
index bc024632f365915f99894f5d799938f5e4e69b45..6788e1a4d4ca63e8cfc3b288f87d456334064c29 100644 (file)
@@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
        list_del(&old->task_list);
 }
 
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
-                       int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
 void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
diff --git a/include/trace/block.h b/include/trace/block.h
deleted file mode 100644 (file)
index 25b7068..0000000
+++ /dev/null
@@ -1,76 +0,0 @@
-#ifndef _TRACE_BLOCK_H
-#define _TRACE_BLOCK_H
-
-#include <linux/blkdev.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(block_rq_abort,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_insert,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_issue,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_requeue,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_rq_complete,
-       TP_PROTO(struct request_queue *q, struct request *rq),
-             TP_ARGS(q, rq));
-
-DECLARE_TRACE(block_bio_bounce,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_complete,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_backmerge,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_frontmerge,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_bio_queue,
-       TP_PROTO(struct request_queue *q, struct bio *bio),
-             TP_ARGS(q, bio));
-
-DECLARE_TRACE(block_getrq,
-       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-             TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_sleeprq,
-       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
-             TP_ARGS(q, bio, rw));
-
-DECLARE_TRACE(block_plug,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_timer,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_unplug_io,
-       TP_PROTO(struct request_queue *q),
-             TP_ARGS(q));
-
-DECLARE_TRACE(block_split,
-       TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-             TP_ARGS(q, bio, pdu));
-
-DECLARE_TRACE(block_remap,
-       TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-                sector_t from, sector_t to),
-             TP_ARGS(q, bio, dev, from, to));
-
-#endif
diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h
new file mode 100644 (file)
index 0000000..f7a7ae1
--- /dev/null
@@ -0,0 +1,75 @@
+/*
+ * Trace files that want to automate creation of all tracepoints defined
+ * in their file should include this file. The following are macros that the
+ * trace file may define:
+ *
+ * TRACE_SYSTEM defines the system the tracepoint is for
+ *
+ * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h
+ *     This macro may be defined to tell define_trace.h what file to include.
+ *     Note, leave off the ".h".
+ *
+ * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace
+ *     then this macro can define the path to use. Note, the path is relative to
+ *     define_trace.h, not the file including it. Full path names for out of tree
+ *     modules must be used.
+ */
+
+#ifdef CREATE_TRACE_POINTS
+
+/* Prevent recursion */
+#undef CREATE_TRACE_POINTS
+
+#include <linux/stringify.h>
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+       DEFINE_TRACE(name)
+
+#undef DECLARE_TRACE
+#define DECLARE_TRACE(name, proto, args)       \
+       DEFINE_TRACE(name)
+
+#undef TRACE_INCLUDE
+#undef __TRACE_INCLUDE
+
+#ifndef TRACE_INCLUDE_FILE
+# define TRACE_INCLUDE_FILE TRACE_SYSTEM
+# define UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifndef TRACE_INCLUDE_PATH
+# define __TRACE_INCLUDE(system) <trace/events/system.h>
+# define UNDEF_TRACE_INCLUDE_PATH
+#else
+# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h)
+#endif
+
+# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system)
+
+/* Let the trace headers be reread */
+#define TRACE_HEADER_MULTI_READ
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#ifdef CONFIG_EVENT_TRACING
+#include <trace/ftrace.h>
+#endif
+
+#undef TRACE_HEADER_MULTI_READ
+
+/* Only undef what we defined in this file */
+#ifdef UNDEF_TRACE_INCLUDE_FILE
+# undef TRACE_INCLUDE_FILE
+# undef UNDEF_TRACE_INCLUDE_FILE
+#endif
+
+#ifdef UNDEF_TRACE_INCLUDE_PATH
+# undef TRACE_INCLUDE_PATH
+# undef UNDEF_TRACE_INCLUDE_PATH
+#endif
+
+/* We may be processing more files */
+#define CREATE_TRACE_POINTS
+
+#endif /* CREATE_TRACE_POINTS */
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
new file mode 100644 (file)
index 0000000..53effd4
--- /dev/null
@@ -0,0 +1,498 @@
+#if !defined(_TRACE_BLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_BLOCK_H
+
+#include <linux/blktrace_api.h>
+#include <linux/blkdev.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM block
+
+/**
+ * block_rq_abort - trace event describing a block request
+ * @q: request queue (passed in but not recorded)
+ * @rq: request whose state is recorded
+ *
+ * Records the device number (0 when @rq->rq_disk is NULL), the start
+ * sector and sector count (both 0 when blk_pc_request() is true),
+ * @rq->errors, the rwbs string filled by blk_fill_rwbs_rq() and the
+ * command buffer filled by blk_dump_cmd(), sized by blk_cmd_buf_len().
+ * Presumably emitted when a request is aborted; the call site is not
+ * part of this header.
+ */
+TRACE_EVENT(block_rq_abort,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       /* Output: "major,minor rwbs (cmd) sector + nr_sector [errors]" */
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+
+/**
+ * block_rq_insert - trace event describing a block request and its task
+ * @q: request queue (passed in but not recorded)
+ * @rq: request whose state is recorded
+ *
+ * Records the device number (0 when @rq->rq_disk is NULL), the start
+ * sector and sector count (both 0 when blk_pc_request() is true), the
+ * byte count (@rq->data_len when blk_pc_request() is true, otherwise 0),
+ * the rwbs string, the command buffer filled by blk_dump_cmd() and the
+ * comm of the current task. Presumably emitted when a request is inserted
+ * into the queue; the call site is not part of this header.
+ */
+TRACE_EVENT(block_rq_insert,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  unsigned int, bytes                   )
+               __array(  char,         rwbs,   6               )
+               __array(  char,         comm,   TASK_COMM_LEN   )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs bytes (cmd) sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __entry->bytes, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_rq_issue - trace event describing a block request and its task
+ * @q: request queue (passed in but not recorded)
+ * @rq: request whose state is recorded
+ *
+ * Records the device number (0 when @rq->rq_disk is NULL), the start
+ * sector and sector count (both 0 when blk_pc_request() is true), the
+ * byte count (@rq->data_len when blk_pc_request() is true, otherwise 0),
+ * the rwbs string, the command buffer filled by blk_dump_cmd() and the
+ * comm of the current task. Presumably emitted when a request is issued
+ * to the driver; the call site is not part of this header.
+ */
+TRACE_EVENT(block_rq_issue,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  unsigned int, bytes                   )
+               __array(  char,         rwbs,   6               )
+               __array(  char,         comm,   TASK_COMM_LEN   )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->bytes     = blk_pc_request(rq) ? rq->data_len : 0;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs bytes (cmd) sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %u (%s) %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __entry->bytes, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_rq_requeue - trace event describing a block request
+ * @q: request queue (passed in but not recorded)
+ * @rq: request whose state is recorded
+ *
+ * Records the device number (0 when @rq->rq_disk is NULL), the start
+ * sector and sector count (both 0 when blk_pc_request() is true),
+ * @rq->errors, the rwbs string filled by blk_fill_rwbs_rq() and the
+ * command buffer filled by blk_dump_cmd(), sized by blk_cmd_buf_len().
+ * Presumably emitted when a request is put back on the queue; the call
+ * site is not part of this header.
+ */
+TRACE_EVENT(block_rq_requeue,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       /* Output: "major,minor rwbs (cmd) sector + nr_sector [errors]" */
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+
+/**
+ * block_rq_complete - trace event describing a block request
+ * @q: request queue (passed in but not recorded)
+ * @rq: request whose state is recorded
+ *
+ * Records the device number (0 when @rq->rq_disk is NULL), the start
+ * sector and sector count (both 0 when blk_pc_request() is true),
+ * @rq->errors, the rwbs string filled by blk_fill_rwbs_rq() and the
+ * command buffer filled by blk_dump_cmd(), sized by blk_cmd_buf_len().
+ * Presumably emitted when a request completes; the call site is not part
+ * of this header.
+ */
+TRACE_EVENT(block_rq_complete,
+
+       TP_PROTO(struct request_queue *q, struct request *rq),
+
+       TP_ARGS(q, rq),
+
+       TP_STRUCT__entry(
+               __field(  dev_t,        dev                     )
+               __field(  sector_t,     sector                  )
+               __field(  unsigned int, nr_sector               )
+               __field(  int,          errors                  )
+               __array(  char,         rwbs,   6               )
+               __dynamic_array( char,  cmd,    blk_cmd_buf_len(rq)     )
+       ),
+
+       TP_fast_assign(
+               __entry->dev       = rq->rq_disk ? disk_devt(rq->rq_disk) : 0;
+               __entry->sector    = blk_pc_request(rq) ? 0 : rq->hard_sector;
+               __entry->nr_sector = blk_pc_request(rq) ?
+                                               0 : rq->hard_nr_sectors;
+               __entry->errors    = rq->errors;
+
+               blk_fill_rwbs_rq(__entry->rwbs, rq);
+               blk_dump_cmd(__get_str(cmd), rq);
+       ),
+
+       /* Output: "major,minor rwbs (cmd) sector + nr_sector [errors]" */
+       TP_printk("%d,%d %s (%s) %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev),
+                 __entry->rwbs, __get_str(cmd),
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->errors)
+);
+/**
+ * block_bio_bounce - trace event describing a bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a bio is
+ * bounced; the call site is not part of this header.
+ */
+TRACE_EVENT(block_bio_bounce,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_bio_complete - trace event describing a bio and its error state
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), an error indication and the rwbs string
+ * filled by blk_fill_rwbs(). Presumably emitted when a bio completes; the
+ * call site is not part of this header.
+ *
+ * The tracepoint prototype carries no error code, so the error field is
+ * derived from the bio itself: 1 when BIO_UPTODATE is not set, else 0.
+ */
+TRACE_EVENT(block_bio_complete,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev             )
+               __field( sector_t,      sector          )
+               __field( unsigned,      nr_sector       )
+               __field( int,           error           )
+               __array( char,          rwbs,   6       )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               /*
+                * Must be assigned: the field is printed below and would
+                * otherwise expose stale ring buffer contents.
+                */
+               __entry->error          = !bio_flagged(bio, BIO_UPTODATE);
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [error]" */
+       TP_printk("%d,%d %s %llu + %u [%d]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->error)
+);
+
+/**
+ * block_bio_backmerge - trace event describing a bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a bio is
+ * merged at the back of an existing request; the call site is not part of
+ * this header.
+ */
+TRACE_EVENT(block_bio_backmerge,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_bio_frontmerge - trace event describing a bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a bio is
+ * merged at the front of an existing request; the call site is not part
+ * of this header.
+ */
+TRACE_EVENT(block_bio_frontmerge,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned,      nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_bio_queue - trace event describing a bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a bio is
+ * queued; the call site is not part of this header.
+ */
+TRACE_EVENT(block_bio_queue,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio),
+
+       TP_ARGS(q, bio),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_getrq - trace event describing an optional bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio to describe; may be NULL, in which case the device, sector and
+ *       sector count are recorded as 0
+ * @rw: passed in but not used by this event
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a request
+ * structure is obtained; the call site is not part of this header.
+ *
+ * NOTE(review): when @bio is NULL the rwbs string is built from 0 rather
+ * than from @rw - confirm that dropping @rw is intended.
+ * NOTE(review): blk_fill_rwbs() is given the sector count here, while
+ * other bio events in this header pass bio->bi_size; the two differ for
+ * bios smaller than 512 bytes - confirm.
+ */
+TRACE_EVENT(block_getrq,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+       TP_ARGS(q, bio, rw),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+        ),
+
+       TP_fast_assign(
+               __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
+               __entry->sector         = bio ? bio->bi_sector : 0;
+               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               blk_fill_rwbs(__entry->rwbs,
+                             bio ? bio->bi_rw : 0, __entry->nr_sector);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+        ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_sleeprq - trace event describing an optional bio and the current task
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio to describe; may be NULL, in which case the device, sector and
+ *       sector count are recorded as 0
+ * @rw: passed in but not used by this event
+ *
+ * Records the device number, the start sector, the size in 512-byte
+ * sectors (@bio->bi_size >> 9), the rwbs string filled by blk_fill_rwbs()
+ * and the comm of the current task. Presumably emitted when a task has to
+ * sleep waiting for a request structure; the call site is not part of
+ * this header.
+ *
+ * NOTE(review): when @bio is NULL the rwbs string is built from 0 rather
+ * than from @rw - confirm that dropping @rw is intended.
+ * NOTE(review): blk_fill_rwbs() is given the sector count here, while
+ * other bio events in this header pass bio->bi_size; the two differ for
+ * bios smaller than 512 bytes - confirm.
+ */
+TRACE_EVENT(block_sleeprq,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+
+       TP_ARGS(q, bio, rw),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                     )
+               __field( sector_t,      sector                  )
+               __field( unsigned int,  nr_sector               )
+               __array( char,          rwbs,   6               )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio ? bio->bi_bdev->bd_dev : 0;
+               __entry->sector         = bio ? bio->bi_sector : 0;
+               __entry->nr_sector      = bio ? bio->bi_size >> 9 : 0;
+               blk_fill_rwbs(__entry->rwbs,
+                           bio ? bio->bi_rw : 0, __entry->nr_sector);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector [comm]" */
+       TP_printk("%d,%d %s %llu + %u [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector, __entry->comm)
+);
+
+/**
+ * block_plug - trace event recording the current task name
+ * @q: request queue (passed in but not recorded)
+ *
+ * Only the comm of the current task is stored and printed. Presumably
+ * emitted when a queue is plugged; the call site is not part of this
+ * header.
+ */
+TRACE_EVENT(block_plug,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       TP_printk("[%s]", __entry->comm)
+);
+
+/**
+ * block_unplug_timer - trace event recording a queue's request count
+ * @q: request queue whose counters are read
+ *
+ * Stores the sum of @q->rq.count[READ] and @q->rq.count[WRITE] together
+ * with the comm of the current task. Presumably emitted when a queue is
+ * unplugged by its timer; the call site is not part of this header.
+ */
+TRACE_EVENT(block_unplug_timer,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __field( int,           nr_rq                   )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->nr_rq  = q->rq.count[READ] + q->rq.count[WRITE];
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "[comm] nr_rq" */
+       TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+/**
+ * block_unplug_io - trace event recording a queue's request count
+ * @q: request queue whose counters are read
+ *
+ * Stores the sum of @q->rq.count[READ] and @q->rq.count[WRITE] together
+ * with the comm of the current task. Presumably emitted when a queue is
+ * unplugged explicitly; the call site is not part of this header.
+ */
+TRACE_EVENT(block_unplug_io,
+
+       TP_PROTO(struct request_queue *q),
+
+       TP_ARGS(q),
+
+       TP_STRUCT__entry(
+               __field( int,           nr_rq                   )
+               __array( char,          comm,   TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->nr_rq  = q->rq.count[READ] + q->rq.count[WRITE];
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "[comm] nr_rq" */
+       TP_printk("[%s] %d", __entry->comm, __entry->nr_rq)
+);
+
+/**
+ * block_split - trace event describing a bio and a second sector
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ * @new_sector: sector value stored alongside the bio's own start sector
+ *
+ * Records the device number, @bio->bi_sector, @new_sector, the rwbs
+ * string filled by blk_fill_rwbs() and the comm of the current task.
+ * Presumably emitted when a bio is split; the call site is not part of
+ * this header.
+ *
+ * NOTE(review): @new_sector is an unsigned int in the prototype but is
+ * stored in a sector_t field, so the caller's value is limited to the
+ * range of unsigned int - confirm that this is wide enough.
+ */
+TRACE_EVENT(block_split,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio,
+                unsigned int new_sector),
+
+       TP_ARGS(q, bio, new_sector),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev                             )
+               __field( sector_t,      sector                          )
+               __field( sector_t,      new_sector                      )
+               __array( char,          rwbs,           6               )
+               __array( char,          comm,           TASK_COMM_LEN   )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->new_sector     = new_sector;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+       ),
+
+       /* Output: "major,minor rwbs sector / new_sector [comm]" */
+       TP_printk("%d,%d %s %llu / %llu [%s]",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 (unsigned long long)__entry->new_sector,
+                 __entry->comm)
+);
+
+/**
+ * block_remap - trace event describing a bio and a second device/sector
+ * @q: request queue (passed in but not recorded)
+ * @bio: bio whose current state is recorded; dereferenced unconditionally,
+ *       including @bio->bi_bdev
+ * @dev: device number stored as old_dev
+ * @from: sector stored as old_sector
+ *
+ * Records the bio's device number, start sector and size in 512-byte
+ * sectors (@bio->bi_size >> 9), then @dev and @from, and the rwbs string
+ * filled by blk_fill_rwbs(). Presumably emitted when a bio is remapped
+ * to a different device; the call site is not part of this header.
+ */
+TRACE_EVENT(block_remap,
+
+       TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+                sector_t from),
+
+       TP_ARGS(q, bio, dev, from),
+
+       TP_STRUCT__entry(
+               __field( dev_t,         dev             )
+               __field( sector_t,      sector          )
+               __field( unsigned int,  nr_sector       )
+               __field( dev_t,         old_dev         )
+               __field( sector_t,      old_sector      )
+               __array( char,          rwbs,   6       )
+       ),
+
+       TP_fast_assign(
+               __entry->dev            = bio->bi_bdev->bd_dev;
+               __entry->sector         = bio->bi_sector;
+               __entry->nr_sector      = bio->bi_size >> 9;
+               __entry->old_dev        = dev;
+               __entry->old_sector     = from;
+               blk_fill_rwbs(__entry->rwbs, bio->bi_rw, bio->bi_size);
+       ),
+
+       /* Output: "major,minor rwbs sector + nr_sector <- (old dev) old sector" */
+       TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+                 MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+                 (unsigned long long)__entry->sector,
+                 __entry->nr_sector,
+                 MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+                 (unsigned long long)__entry->old_sector)
+);
+
+#endif /* _TRACE_BLOCK_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
+
diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h
new file mode 100644 (file)
index 0000000..b0c7ede
--- /dev/null
@@ -0,0 +1,145 @@
+#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_IRQ_H
+
+#include <linux/tracepoint.h>
+#include <linux/interrupt.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM irq
+
+/*
+ * softirq_name(X) builds the initializer pair { X_SOFTIRQ, "X" }.
+ * show_softirq_name(val) hands val and the pairs listed below to
+ * __print_symbolic() so that a softirq number can be printed by name.
+ */
+#define softirq_name(sirq) { sirq##_SOFTIRQ, #sirq }
+#define show_softirq_name(val)                 \
+       __print_symbolic(val,                   \
+                        softirq_name(HI),      \
+                        softirq_name(TIMER),   \
+                        softirq_name(NET_TX),  \
+                        softirq_name(NET_RX),  \
+                        softirq_name(BLOCK),   \
+                        softirq_name(TASKLET), \
+                        softirq_name(SCHED),   \
+                        softirq_name(HRTIMER), \
+                        softirq_name(RCU))
+
+/**
+ * irq_handler_entry - called immediately before the irq action handler
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ *
+ * The struct irqaction pointed to by @action contains various
+ * information about the handler, including the device name,
+ * @action->name, and the device id, @action->dev_id. When used in
+ * conjunction with the irq_handler_exit tracepoint, we can figure
+ * out irq handler latencies.
+ *
+ * The event itself stores only @irq and a copy of @action->name.
+ */
+TRACE_EVENT(irq_handler_entry,
+
+       TP_PROTO(int irq, struct irqaction *action),
+
+       TP_ARGS(irq, action),
+
+       TP_STRUCT__entry(
+               __field(        int,    irq             )
+               __string(       name,   action->name    )
+       ),
+
+       TP_fast_assign(
+               __entry->irq = irq;
+               __assign_str(name, action->name);
+       ),
+
+       TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name))
+);
+
+/**
+ * irq_handler_exit - called immediately after the irq action handler returns
+ * @irq: irq number
+ * @action: pointer to struct irqaction
+ * @ret: return value
+ *
+ * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding
+ * @action->handler successfully handled this irq. Otherwise, the irq might be
+ * a shared irq line, or the irq was not handled successfully. Can be used in
+ * conjunction with the irq_handler_entry to understand irq handler latencies.
+ *
+ * The event stores @irq and @ret; @action is not recorded. When printed,
+ * any non-zero @ret is shown as "handled" and zero as "unhandled".
+ */
+TRACE_EVENT(irq_handler_exit,
+
+       TP_PROTO(int irq, struct irqaction *action, int ret),
+
+       TP_ARGS(irq, action, ret),
+
+       TP_STRUCT__entry(
+               __field(        int,    irq     )
+               __field(        int,    ret     )
+       ),
+
+       TP_fast_assign(
+               __entry->irq    = irq;
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("irq=%d return=%s",
+                 __entry->irq, __entry->ret ? "handled" : "unhandled")
+);
+
+/**
+ * softirq_entry - called immediately before the softirq handler
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * which has a pointer to the action handler that is called. By subtracting
+ * the @vec pointer from the @h pointer, we can determine the softirq
+ * number. Also, when used in combination with the softirq_exit tracepoint
+ * we can determine the softirq latency.
+ *
+ * Only the computed softirq number is stored; it is printed both as a
+ * number and as a name via show_softirq_name().
+ */
+TRACE_EVENT(softirq_entry,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec),
+
+       TP_STRUCT__entry(
+               __field(        int,    vec                     )
+       ),
+
+       TP_fast_assign(
+               /* element index of h within the array starting at vec */
+               __entry->vec = (int)(h - vec);
+       ),
+
+       TP_printk("softirq=%d action=%s", __entry->vec,
+                 show_softirq_name(__entry->vec))
+);
+
+/**
+ * softirq_exit - called immediately after the softirq handler returns
+ * @h: pointer to struct softirq_action
+ * @vec: pointer to first struct softirq_action in softirq_vec array
+ *
+ * The @h parameter contains a pointer to the struct softirq_action
+ * that has handled the softirq. By subtracting the @vec pointer from
+ * the @h pointer, we can determine the softirq number. Also, when used in
+ * combination with the softirq_entry tracepoint we can determine the softirq
+ * latency.
+ *
+ * Only the computed softirq number is stored; it is printed both as a
+ * number and as a name via show_softirq_name().
+ */
+TRACE_EVENT(softirq_exit,
+
+       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+
+       TP_ARGS(h, vec),
+
+       TP_STRUCT__entry(
+               __field(        int,    vec                     )
+       ),
+
+       TP_fast_assign(
+               /* element index of h within the array starting at vec */
+               __entry->vec = (int)(h - vec);
+       ),
+
+       TP_printk("softirq=%d action=%s", __entry->vec,
+                 show_softirq_name(__entry->vec))
+);
+
+#endif /*  _TRACE_IRQ_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/kmem.h b/include/trace/events/kmem.h
new file mode 100644 (file)
index 0000000..9baba50
--- /dev/null
@@ -0,0 +1,231 @@
+#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_KMEM_H
+
+#include <linux/types.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM kmem
+
+/*
+ * show_gfp_flags(flags) - print a gfp mask as a "|"-separated list of names
+ *
+ * A zero mask is printed as "GFP_NOWAIT"; anything else is decoded by
+ * __print_flags() using the table below. The single-bit __GFP_* entries
+ * are printed without their leading underscores.
+ *
+ * The order of these masks is important. Matching masks will be seen
+ * first and the left over flags will end up showing by themselves.
+ *
+ * For example, if we have GFP_KERNEL before GFP_USER we will get:
+ *
+ *  GFP_KERNEL|GFP_HARDWALL
+ *
+ * Thus most bits set go first.
+ */
+#define show_gfp_flags(flags)                                          \
+       (flags) ? __print_flags(flags, "|",                             \
+       {(unsigned long)GFP_HIGHUSER_MOVABLE,   "GFP_HIGHUSER_MOVABLE"}, \
+       {(unsigned long)GFP_HIGHUSER,           "GFP_HIGHUSER"},        \
+       {(unsigned long)GFP_USER,               "GFP_USER"},            \
+       {(unsigned long)GFP_TEMPORARY,          "GFP_TEMPORARY"},       \
+       {(unsigned long)GFP_KERNEL,             "GFP_KERNEL"},          \
+       {(unsigned long)GFP_NOFS,               "GFP_NOFS"},            \
+       {(unsigned long)GFP_ATOMIC,             "GFP_ATOMIC"},          \
+       {(unsigned long)GFP_NOIO,               "GFP_NOIO"},            \
+       {(unsigned long)__GFP_HIGH,             "GFP_HIGH"},            \
+       {(unsigned long)__GFP_WAIT,             "GFP_WAIT"},            \
+       {(unsigned long)__GFP_IO,               "GFP_IO"},              \
+       {(unsigned long)__GFP_COLD,             "GFP_COLD"},            \
+       {(unsigned long)__GFP_NOWARN,           "GFP_NOWARN"},          \
+       {(unsigned long)__GFP_REPEAT,           "GFP_REPEAT"},          \
+       {(unsigned long)__GFP_NOFAIL,           "GFP_NOFAIL"},          \
+       {(unsigned long)__GFP_NORETRY,          "GFP_NORETRY"},         \
+       {(unsigned long)__GFP_COMP,             "GFP_COMP"},            \
+       {(unsigned long)__GFP_ZERO,             "GFP_ZERO"},            \
+       {(unsigned long)__GFP_NOMEMALLOC,       "GFP_NOMEMALLOC"},      \
+       {(unsigned long)__GFP_HARDWALL,         "GFP_HARDWALL"},        \
+       {(unsigned long)__GFP_THISNODE,         "GFP_THISNODE"},        \
+       {(unsigned long)__GFP_RECLAIMABLE,      "GFP_RECLAIMABLE"},     \
+       {(unsigned long)__GFP_MOVABLE,          "GFP_MOVABLE"}          \
+       ) : "GFP_NOWAIT"
+
+/**
+ * kmalloc - trace event recording the parameters of one allocation
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the allocation's result)
+ * @bytes_req: size value to record (presumably the bytes requested)
+ * @bytes_alloc: size value to record (presumably the bytes really allocated)
+ * @gfp_flags: gfp mask; printed symbolically via show_gfp_flags()
+ *
+ * All five arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kmalloc,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags))
+);
+
+/**
+ * kmem_cache_alloc - trace event recording the parameters of one allocation
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the allocation's result)
+ * @bytes_req: size value to record (presumably the bytes requested)
+ * @bytes_alloc: size value to record (presumably the bytes really allocated)
+ * @gfp_flags: gfp mask; printed symbolically via show_gfp_flags()
+ *
+ * All five arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kmem_cache_alloc,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags))
+);
+
+/**
+ * kmalloc_node - trace event recording the parameters of one allocation
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the allocation's result)
+ * @bytes_req: size value to record (presumably the bytes requested)
+ * @bytes_alloc: size value to record (presumably the bytes really allocated)
+ * @gfp_flags: gfp mask; printed symbolically via show_gfp_flags()
+ * @node: node number to record
+ *
+ * All six arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kmalloc_node,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags,
+                int node),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+               __field(        int,            node            )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+               __entry->node           = node;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags),
+               __entry->node)
+);
+
+/**
+ * kmem_cache_alloc_node - trace event recording the parameters of one
+ *                         allocation
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the allocation's result)
+ * @bytes_req: size value to record (presumably the bytes requested)
+ * @bytes_alloc: size value to record (presumably the bytes really allocated)
+ * @gfp_flags: gfp mask; printed symbolically via show_gfp_flags()
+ * @node: node number to record
+ *
+ * All six arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kmem_cache_alloc_node,
+
+       TP_PROTO(unsigned long call_site,
+                const void *ptr,
+                size_t bytes_req,
+                size_t bytes_alloc,
+                gfp_t gfp_flags,
+                int node),
+
+       TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+               __field(        size_t,         bytes_req       )
+               __field(        size_t,         bytes_alloc     )
+               __field(        gfp_t,          gfp_flags       )
+               __field(        int,            node            )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+               __entry->bytes_req      = bytes_req;
+               __entry->bytes_alloc    = bytes_alloc;
+               __entry->gfp_flags      = gfp_flags;
+               __entry->node           = node;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%s node=%d",
+               __entry->call_site,
+               __entry->ptr,
+               __entry->bytes_req,
+               __entry->bytes_alloc,
+               show_gfp_flags(__entry->gfp_flags),
+               __entry->node)
+);
+
+/**
+ * kfree - trace event recording a call site and a pointer
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the object being freed)
+ *
+ * Both arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kfree,
+
+       TP_PROTO(unsigned long call_site, const void *ptr),
+
+       TP_ARGS(call_site, ptr),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+
+/**
+ * kmem_cache_free - trace event recording a call site and a pointer
+ * @call_site: value identifying the caller; printed in hex
+ * @ptr: pointer value to record (presumably the object being freed)
+ *
+ * Both arguments are copied into the event unchanged.
+ */
+TRACE_EVENT(kmem_cache_free,
+
+       TP_PROTO(unsigned long call_site, const void *ptr),
+
+       TP_ARGS(call_site, ptr),
+
+       TP_STRUCT__entry(
+               __field(        unsigned long,  call_site       )
+               __field(        const void *,   ptr             )
+       ),
+
+       TP_fast_assign(
+               __entry->call_site      = call_site;
+               __entry->ptr            = ptr;
+       ),
+
+       TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr)
+);
+#endif /* _TRACE_KMEM_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h
new file mode 100644 (file)
index 0000000..0e956c9
--- /dev/null
@@ -0,0 +1,96 @@
+#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_LOCKDEP_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lockdep
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_EVENT(lock_acquire,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
+               int trylock, int read, int check,
+               struct lockdep_map *next_lock, unsigned long ip),
+
+       TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+
+       TP_STRUCT__entry(
+               __field(unsigned int, flags)
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0);
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "",
+                 (__entry->flags & 2) ? "read " : "",
+                 __get_str(name))
+);
+
+TRACE_EVENT(lock_release,
+
+       TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+
+       TP_ARGS(lock, nested, ip),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s", __get_str(name))
+);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_EVENT(lock_contended,
+
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+
+       TP_ARGS(lock, ip),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+       ),
+
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+       ),
+
+       TP_printk("%s", __get_str(name))
+);
+
+TRACE_EVENT(lock_acquired,
+       TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime),
+
+       TP_ARGS(lock, ip, waittime),
+
+       TP_STRUCT__entry(
+               __string(name, lock->name)
+               __field(unsigned long, wait_usec)
+               __field(unsigned long, wait_nsec_rem)
+       ),
+       TP_fast_assign(
+               __assign_str(name, lock->name);
+               __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC);
+               __entry->wait_usec = (unsigned long) waittime;
+       ),
+       TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec,
+                                      __entry->wait_nsec_rem)
+);
+
+#endif /* CONFIG_LOCK_STAT */
+#endif /* CONFIG_LOCKDEP */
+
+#endif /* _TRACE_LOCKDEP_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
new file mode 100644 (file)
index 0000000..24ab5bc
--- /dev/null
@@ -0,0 +1,346 @@
+#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SCHED_H
+
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sched
+
+/*
+ * Tracepoint for calling kthread_stop, performed to end a kthread:
+ */
+TRACE_EVENT(sched_kthread_stop,
+
+       TP_PROTO(struct task_struct *t),
+
+       TP_ARGS(t),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
+               __entry->pid    = t->pid;
+       ),
+
+       TP_printk("task %s:%d", __entry->comm, __entry->pid)
+);
+
+/*
+ * Tracepoint for the return value of the kthread stopping:
+ */
+TRACE_EVENT(sched_kthread_stop_ret,
+
+       TP_PROTO(int ret),
+
+       TP_ARGS(ret),
+
+       TP_STRUCT__entry(
+               __field(        int,    ret     )
+       ),
+
+       TP_fast_assign(
+               __entry->ret    = ret;
+       ),
+
+       TP_printk("ret %d", __entry->ret)
+);
+
+/*
+ * Tracepoint for waiting on task to unschedule:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wait_task,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p),
+
+       TP_ARGS(rq, p),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid    = p->pid;
+               __entry->prio   = p->prio;
+       ),
+
+       TP_printk("task %s:%d [%d]",
+                 __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for waking up a task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+       TP_ARGS(rq, p, success),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+               __field(        int,    success                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+               __entry->success        = success;
+       ),
+
+       TP_printk("task %s:%d [%d] success=%d",
+                 __entry->comm, __entry->pid, __entry->prio,
+                 __entry->success)
+);
+
+/*
+ * Tracepoint for waking up a new task:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_wakeup_new,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+
+       TP_ARGS(rq, p, success),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+               __field(        int,    success                 )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+               __entry->success        = success;
+       ),
+
+       TP_printk("task %s:%d [%d] success=%d",
+                 __entry->comm, __entry->pid, __entry->prio,
+                 __entry->success)
+);
+
+/*
+ * Tracepoint for task switches, performed by the scheduler:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_switch,
+
+       TP_PROTO(struct rq *rq, struct task_struct *prev,
+                struct task_struct *next),
+
+       TP_ARGS(rq, prev, next),
+
+       TP_STRUCT__entry(
+               __array(        char,   prev_comm,      TASK_COMM_LEN   )
+               __field(        pid_t,  prev_pid                        )
+               __field(        int,    prev_prio                       )
+               __field(        long,   prev_state                      )
+               __array(        char,   next_comm,      TASK_COMM_LEN   )
+               __field(        pid_t,  next_pid                        )
+               __field(        int,    next_prio                       )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+               __entry->prev_pid       = prev->pid;
+               __entry->prev_prio      = prev->prio;
+               __entry->prev_state     = prev->state;
+               memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
+               __entry->next_pid       = next->pid;
+               __entry->next_prio      = next->prio;
+       ),
+
+       TP_printk("task %s:%d [%d] (%s) ==> %s:%d [%d]",
+               __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+               __entry->prev_state ?
+                 __print_flags(__entry->prev_state, "|",
+                               { 1, "S"} , { 2, "D" }, { 4, "T" }, { 8, "t" },
+                               { 16, "Z" }, { 32, "X" }, { 64, "x" },
+                               { 128, "W" }) : "R",
+               __entry->next_comm, __entry->next_pid, __entry->next_prio)
+);
+
+/*
+ * Tracepoint for a task being migrated:
+ */
+TRACE_EVENT(sched_migrate_task,
+
+       TP_PROTO(struct task_struct *p, int dest_cpu),
+
+       TP_ARGS(p, dest_cpu),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+               __field(        int,    orig_cpu                )
+               __field(        int,    dest_cpu                )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+               __entry->orig_cpu       = task_cpu(p);
+               __entry->dest_cpu       = dest_cpu;
+       ),
+
+       TP_printk("task %s:%d [%d] from: %d  to: %d",
+                 __entry->comm, __entry->pid, __entry->prio,
+                 __entry->orig_cpu, __entry->dest_cpu)
+);
+
+/*
+ * Tracepoint for freeing a task:
+ */
+TRACE_EVENT(sched_process_free,
+
+       TP_PROTO(struct task_struct *p),
+
+       TP_ARGS(p),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+       ),
+
+       TP_printk("task %s:%d [%d]",
+                 __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a task exiting:
+ */
+TRACE_EVENT(sched_process_exit,
+
+       TP_PROTO(struct task_struct *p),
+
+       TP_ARGS(p),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->prio           = p->prio;
+       ),
+
+       TP_printk("task %s:%d [%d]",
+                 __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for a task waiting on 'pid' (comm and prio are those of current):
+ */
+TRACE_EVENT(sched_process_wait,
+
+       TP_PROTO(struct pid *pid),
+
+       TP_ARGS(pid),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        int,    prio                    )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
+               __entry->pid            = pid_nr(pid);
+               __entry->prio           = current->prio;
+       ),
+
+       TP_printk("task %s:%d [%d]",
+                 __entry->comm, __entry->pid, __entry->prio)
+);
+
+/*
+ * Tracepoint for do_fork:
+ */
+TRACE_EVENT(sched_process_fork,
+
+       TP_PROTO(struct task_struct *parent, struct task_struct *child),
+
+       TP_ARGS(parent, child),
+
+       TP_STRUCT__entry(
+               __array(        char,   parent_comm,    TASK_COMM_LEN   )
+               __field(        pid_t,  parent_pid                      )
+               __array(        char,   child_comm,     TASK_COMM_LEN   )
+               __field(        pid_t,  child_pid                       )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
+               __entry->parent_pid     = parent->pid;
+               memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
+               __entry->child_pid      = child->pid;
+       ),
+
+       TP_printk("parent %s:%d  child %s:%d",
+               __entry->parent_comm, __entry->parent_pid,
+               __entry->child_comm, __entry->child_pid)
+);
+
+/*
+ * Tracepoint for sending a signal:
+ */
+TRACE_EVENT(sched_signal_send,
+
+       TP_PROTO(int sig, struct task_struct *p),
+
+       TP_ARGS(sig, p),
+
+       TP_STRUCT__entry(
+               __field(        int,    sig                     )
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid    = p->pid;
+               __entry->sig    = sig;
+       ),
+
+       TP_printk("sig: %d  task %s:%d",
+                 __entry->sig, __entry->comm, __entry->pid)
+);
+
+#endif /* _TRACE_SCHED_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/skb.h b/include/trace/events/skb.h
new file mode 100644 (file)
index 0000000..1e8fabb
--- /dev/null
@@ -0,0 +1,40 @@
+#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_SKB_H
+
+#include <linux/skbuff.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM skb
+
+/*
+ * Tracepoint for freeing an sk_buff:
+ */
+TRACE_EVENT(kfree_skb,
+
+       TP_PROTO(struct sk_buff *skb, void *location),
+
+       TP_ARGS(skb, location),
+
+       TP_STRUCT__entry(
+               __field(        void *,         skbaddr         )
+               __field(        unsigned short, protocol        )
+               __field(        void *,         location        )
+       ),
+
+       TP_fast_assign(
+               __entry->skbaddr = skb;
+               __entry->protocol = 0;  /* defined value when skb is NULL */
+               if (skb)
+                       __entry->protocol = ntohs(skb->protocol);
+               __entry->location = location;
+       ),
+
+       TP_printk("skbaddr=%p protocol=%u location=%p",
+               __entry->skbaddr, __entry->protocol, __entry->location)
+);
+
+#endif /* _TRACE_SKB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/events/workqueue.h b/include/trace/events/workqueue.h
new file mode 100644 (file)
index 0000000..035f1bf
--- /dev/null
@@ -0,0 +1,100 @@
+#if !defined(_TRACE_WORKQUEUE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_WORKQUEUE_H
+
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/tracepoint.h>
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM workqueue
+
+TRACE_EVENT(workqueue_insertion,
+
+       TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+       TP_ARGS(wq_thread, work),
+
+       TP_STRUCT__entry(
+               __array(char,           thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,          thread_pid)
+               __field(work_func_t,    func)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->func           = work->func;
+       ),
+
+       TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+               __entry->thread_pid, __entry->func)
+);
+
+TRACE_EVENT(workqueue_execution,
+
+       TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+
+       TP_ARGS(wq_thread, work),
+
+       TP_STRUCT__entry(
+               __array(char,           thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,          thread_pid)
+               __field(work_func_t,    func)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->func           = work->func;
+       ),
+
+       TP_printk("thread=%s:%d func=%pF", __entry->thread_comm,
+               __entry->thread_pid, __entry->func)
+);
+
+/* Trace the creation of one workqueue thread on a cpu */
+TRACE_EVENT(workqueue_creation,
+
+       TP_PROTO(struct task_struct *wq_thread, int cpu),
+
+       TP_ARGS(wq_thread, cpu),
+
+       TP_STRUCT__entry(
+               __array(char,   thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,  thread_pid)
+               __field(int,    cpu)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+               __entry->cpu            = cpu;
+       ),
+
+       TP_printk("thread=%s:%d cpu=%d", __entry->thread_comm,
+               __entry->thread_pid, __entry->cpu)
+);
+
+TRACE_EVENT(workqueue_destruction,
+
+       TP_PROTO(struct task_struct *wq_thread),
+
+       TP_ARGS(wq_thread),
+
+       TP_STRUCT__entry(
+               __array(char,   thread_comm,    TASK_COMM_LEN)
+               __field(pid_t,  thread_pid)
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->thread_comm, wq_thread->comm, TASK_COMM_LEN);
+               __entry->thread_pid     = wq_thread->pid;
+       ),
+
+       TP_printk("thread=%s:%d", __entry->thread_comm, __entry->thread_pid)
+);
+
+#endif /* _TRACE_WORKQUEUE_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h
new file mode 100644 (file)
index 0000000..1867553
--- /dev/null
@@ -0,0 +1,591 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *     struct trace_entry              ent;
+ *     <type>                          <item>;
+ *     <type2>                         <item2>[<len>];
+ *     [...]
+ * };
+ *
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#include <linux/ftrace_event.h>
+
+#undef __field
+#define __field(type, item)            type    item;
+
+#undef __array
+#define __array(type, item, len)       type    item[len];
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len) unsigned short __data_loc_##item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
+       struct ftrace_raw_##name {                              \
+               struct trace_entry      ent;                    \
+               tstruct                                         \
+               char                    __data[0];              \
+       };                                                      \
+       static struct ftrace_event_call event_##name
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+
+/*
+ * Stage 2 of the trace events.
+ *
+ * Include the following:
+ *
+ * struct ftrace_data_offsets_<call> {
+ *     int                             <item1>;
+ *     int                             <item2>;
+ *     [...]
+ * };
+ *
+ * The __dynamic_array() macro creates each int <item>; it is used
+ * to keep the offset of each array from the beginning of the event.
+ */
+
+#undef __field
+#define __field(type, item)            /* adds no member to the offsets struct */
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)       int item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+       struct ftrace_data_offsets_##call {                             \
+               tstruct;                                                \
+       };
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Set up the format description shown for each trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *     struct ftrace_raw_##call field;
+ *     int ret;
+ *
+ *     ret = trace_seq_printf(s, #type " " #item ";"
+ *                            " offset:%u; size:%u;\n",
+ *                            offsetof(struct ftrace_raw_##call, item),
+ *                            sizeof(field.item));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)                                    \
+       ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
+                              "offset:%u;\tsize:%u;\n",                \
+                              (unsigned int)offsetof(typeof(field), item), \
+                              (unsigned int)sizeof(field.item));       \
+       if (!ret)                                                       \
+               return 0;
+
+#undef __array
+#define __array(type, item, len)                                               \
+       ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"    \
+                              "offset:%u;\tsize:%u;\n",                \
+                              (unsigned int)offsetof(typeof(field), item), \
+                              (unsigned int)sizeof(field.item));       \
+       if (!ret)                                                       \
+               return 0;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                                      \
+       ret = trace_seq_printf(s, "\tfield:__data_loc " #item ";\t"            \
+                              "offset:%u;\tsize:%u;\n",                       \
+                              (unsigned int)offsetof(typeof(field),           \
+                                       __data_loc_##item),                    \
+                              (unsigned int)sizeof(field.__data_loc_##item)); \
+       if (!ret)                                                              \
+               return 0;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef __entry
+#define __entry REC
+
+#undef __print_symbolic
+#undef __get_dynamic_array
+#undef __get_str
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+static int                                                             \
+ftrace_format_##call(struct trace_seq *s)                              \
+{                                                                      \
+       struct ftrace_raw_##call field __attribute__((unused));         \
+       int ret = 0;                                                    \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       trace_seq_printf(s, "\nprint fmt: " print);                     \
+                                                                       \
+       return ret;                                                     \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *     struct trace_seq *s = &iter->seq;
+ *     struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *     struct trace_entry *entry;
+ *     struct trace_seq *p;
+ *     int ret;
+ *
+ *     entry = iter->ent;
+ *
+ *     if (entry->type != event_<call>.id) {
+ *             WARN_ON_ONCE(1);
+ *             return TRACE_TYPE_UNHANDLED;
+ *     }
+ *
+ *     field = (typeof(field))entry;
+ *
+ *     p = &get_cpu_var(ftrace_event_seq);
+ *     trace_seq_init(p);
+ *     ret = trace_seq_printf(s, "<call>: " <TP_printk>);
+ *     put_cpu();
+ *     if (!ret)
+ *             return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *     return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef __entry
+#define __entry field
+
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
+
+#undef __get_dynamic_array
+#define __get_dynamic_array(field)     \
+               ((void *)__entry + __entry->__data_loc_##field)
+
+#undef __get_str
+#define __get_str(field) (char *)__get_dynamic_array(field)
+
+#undef __print_flags
+#define __print_flags(flag, delim, flag_array...)                      \
+       ({                                                              \
+               static const struct trace_print_flags flags[] =         \
+                       { flag_array, { -1, NULL }};                    \
+               ftrace_print_flags_seq(p, delim, flag, flags);          \
+       })
+
+#undef __print_symbolic
+#define __print_symbolic(value, symbol_array...)                       \
+       ({                                                              \
+               static const struct trace_print_flags symbols[] =       \
+                       { symbol_array, { -1, NULL }};                  \
+               ftrace_print_symbols_seq(p, value, symbols);            \
+       })
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+enum print_line_t                                                      \
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
+{                                                                      \
+       struct trace_seq *s = &iter->seq;                               \
+       struct ftrace_raw_##call *field;                                \
+       struct trace_entry *entry;                                      \
+       struct trace_seq *p;                                            \
+       int ret;                                                        \
+                                                                       \
+       entry = iter->ent;                                              \
+                                                                       \
+       if (entry->type != event_##call.id) {                           \
+               WARN_ON_ONCE(1);                                        \
+               return TRACE_TYPE_UNHANDLED;                            \
+       }                                                               \
+                                                                       \
+       field = (typeof(field))entry;                                   \
+                                                                       \
+       p = &get_cpu_var(ftrace_event_seq);                             \
+       trace_seq_init(p);                                              \
+       ret = trace_seq_printf(s, #call ": " print);                    \
+       put_cpu();                                                      \
+       if (!ret)                                                       \
+               return TRACE_TYPE_PARTIAL_LINE;                         \
+                                                                       \
+       return TRACE_TYPE_HANDLED;                                      \
+}
+       
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef __field
+#define __field(type, item)                                            \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed_type(type));     \
+       if (ret)                                                        \
+               return ret;
+
+#undef __array
+#define __array(type, item, len)                                       \
+       BUILD_BUG_ON(len > MAX_FILTER_STR_VAL);                         \
+       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), 0);                \
+       if (ret)                                                        \
+               return ret;
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                                      \
+       ret = trace_define_field(event_call, "__data_loc" "[" #type "]", #item,\
+                               offsetof(typeof(field), __data_loc_##item),    \
+                                sizeof(field.__data_loc_##item), 0);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
+int                                                                    \
+ftrace_define_fields_##call(void)                                      \
+{                                                                      \
+       struct ftrace_raw_##call field;                                 \
+       struct ftrace_event_call *event_call = &event_##call;           \
+       int ret;                                                        \
+                                                                       \
+       __common_field(int, type, 1);                                   \
+       __common_field(unsigned char, flags, 0);                        \
+       __common_field(unsigned char, preempt_count, 0);                \
+       __common_field(int, pid, 1);                                    \
+       __common_field(int, tgid, 1);                                   \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return ret;                                                     \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * remember the offset of each array from the beginning of the event.
+ */
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                               \
+       __data_offsets->item = __data_size +                            \
+                              offsetof(typeof(*entry), __data);        \
+       __data_size += (len) * sizeof(type);
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, strlen(src) + 1)
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+static inline int ftrace_get_offsets_##call(                           \
+       struct ftrace_data_offsets_##call *__data_offsets, proto)       \
+{                                                                      \
+       int __data_size = 0;                                            \
+       struct ftrace_raw_##call __maybe_unused *entry;                 \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return __data_size;                                             \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+/*
+ * Stage 4 of the trace events.
+ *
+ * Override the macros in <trace/trace_events.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ *     int ret;
+ *
+ *     ret = register_trace_<call>(ftrace_event_<call>);
+ *     if (ret)
+ *             pr_info("event trace: Could not activate trace point "
+ *                     "probe to  <call>");
+ *     return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ *     unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ *     struct ring_buffer_event *event;
+ *     struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *     unsigned long irq_flags;
+ *     int pc;
+ *
+ *     local_save_flags(irq_flags);
+ *     pc = preempt_count();
+ *
+ *     event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *                               sizeof(struct ftrace_raw_<call>),
+ *                               irq_flags, pc);
+ *     if (!event)
+ *             return;
+ *     entry   = ring_buffer_event_data(event);
+ *
+ *     <assign>;  <-- Here we assign the entries by the __field and
+ *                     __array macros.
+ *
+ *     trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ *     int ret;
+ *
+ *     ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *     if (ret)
+ *             pr_info("event trace: Could not activate trace point "
+ *                     "probe to <call>");
+ *     return ret;
+ * }
+ *
+ * static void ftrace_raw_unreg_event_<call>(void)
+ * {
+ *     unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ *     int id;
+ *
+ *     id = register_ftrace_event(&ftrace_event_type_<call>);
+ *     if (!id)
+ *             return -ENODEV;
+ *     event_<call>.id = id;
+ *     return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ *     .name                   = "<call>",
+ *     .system                 = "<system>",
+ *     .raw_init               = ftrace_raw_init_event_<call>,
+ *     .regfunc                = ftrace_raw_reg_event_<call>,
+ *     .unregfunc              = ftrace_raw_unreg_event_<call>,
+ *     .show_format            = ftrace_format_<call>,
+ * }
+ *
+ */
+
+#undef TP_FMT
+#define TP_FMT(fmt, args...)   fmt "\n", ##args
+
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)                              \
+static void ftrace_profile_##call(proto)                               \
+{                                                                      \
+       extern void perf_tpcounter_event(int);                          \
+       perf_tpcounter_event(event_##call.id);                          \
+}                                                                      \
+                                                                       \
+static int ftrace_profile_enable_##call(struct ftrace_event_call *event_call) \
+{                                                                      \
+       int ret = 0;                                                    \
+                                                                       \
+       if (!atomic_inc_return(&event_call->profile_count))             \
+               ret = register_trace_##call(ftrace_profile_##call);     \
+                                                                       \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static void ftrace_profile_disable_##call(struct ftrace_event_call *event_call)\
+{                                                                      \
+       if (atomic_add_negative(-1, &event_call->profile_count))        \
+               unregister_trace_##call(ftrace_profile_##call);         \
+}
+
+#define _TRACE_PROFILE_INIT(call)                                      \
+       .profile_count = ATOMIC_INIT(-1),                               \
+       .profile_enable = ftrace_profile_enable_##call,                 \
+       .profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
+#undef __entry
+#define __entry entry
+
+#undef __field
+#define __field(type, item)
+
+#undef __array
+#define __array(type, item, len)
+
+#undef __dynamic_array
+#define __dynamic_array(type, item, len)                               \
+       __entry->__data_loc_##item = __data_offsets.item;
+
+#undef __string
+#define __string(item, src) __dynamic_array(char, item, -1)            \
+
+#undef __assign_str
+#define __assign_str(dst, src)                                         \
+       strcpy(__get_str(dst), src);
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
+                                                                       \
+static struct ftrace_event_call event_##call;                          \
+                                                                       \
+static void ftrace_raw_event_##call(proto)                             \
+{                                                                      \
+       struct ftrace_data_offsets_##call __maybe_unused __data_offsets;\
+       struct ftrace_event_call *event_call = &event_##call;           \
+       struct ring_buffer_event *event;                                \
+       struct ftrace_raw_##call *entry;                                \
+       unsigned long irq_flags;                                        \
+       int __data_size;                                                \
+       int pc;                                                         \
+                                                                       \
+       local_save_flags(irq_flags);                                    \
+       pc = preempt_count();                                           \
+                                                                       \
+       __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \
+                                                                       \
+       event = trace_current_buffer_lock_reserve(event_##call.id,      \
+                                sizeof(*entry) + __data_size,          \
+                                irq_flags, pc);                        \
+       if (!event)                                                     \
+               return;                                                 \
+       entry   = ring_buffer_event_data(event);                        \
+                                                                       \
+                                                                       \
+       tstruct                                                         \
+                                                                       \
+       { assign; }                                                     \
+                                                                       \
+       if (!filter_current_check_discard(event_call, entry, event))    \
+               trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \
+}                                                                      \
+                                                                       \
+static int ftrace_raw_reg_event_##call(void)                           \
+{                                                                      \
+       int ret;                                                        \
+                                                                       \
+       ret = register_trace_##call(ftrace_raw_event_##call);           \
+       if (ret)                                                        \
+               pr_info("event trace: Could not activate trace point "  \
+                       "probe to " #call "\n");                        \
+       return ret;                                                     \
+}                                                                      \
+                                                                       \
+static void ftrace_raw_unreg_event_##call(void)                                \
+{                                                                      \
+       unregister_trace_##call(ftrace_raw_event_##call);               \
+}                                                                      \
+                                                                       \
+static struct trace_event ftrace_event_type_##call = {                 \
+       .trace                  = ftrace_raw_output_##call,             \
+};                                                                     \
+                                                                       \
+static int ftrace_raw_init_event_##call(void)                          \
+{                                                                      \
+       int id;                                                         \
+                                                                       \
+       id = register_ftrace_event(&ftrace_event_type_##call);          \
+       if (!id)                                                        \
+               return -ENODEV;                                         \
+       event_##call.id = id;                                           \
+       INIT_LIST_HEAD(&event_##call.fields);                           \
+       init_preds(&event_##call);                                      \
+       return 0;                                                       \
+}                                                                      \
+                                                                       \
+static struct ftrace_event_call __used                                 \
+__attribute__((__aligned__(4)))                                                \
+__attribute__((section("_ftrace_events"))) event_##call = {            \
+       .name                   = #call,                                \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .event                  = &ftrace_event_type_##call,            \
+       .raw_init               = ftrace_raw_init_event_##call,         \
+       .regfunc                = ftrace_raw_reg_event_##call,          \
+       .unregfunc              = ftrace_raw_unreg_event_##call,        \
+       .show_format            = ftrace_format_##call,                 \
+       .define_fields          = ftrace_define_fields_##call,          \
+       _TRACE_PROFILE_INIT(call)                                       \
+}
+
+#include TRACE_INCLUDE(TRACE_INCLUDE_FILE)
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+
diff --git a/include/trace/irq.h b/include/trace/irq.h
deleted file mode 100644 (file)
index ff5d449..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_IRQ_H
-#define _TRACE_IRQ_H
-
-#include <linux/interrupt.h>
-#include <linux/tracepoint.h>
-
-#include <trace/irq_event_types.h>
-
-#endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
deleted file mode 100644 (file)
index 85964eb..0000000
+++ /dev/null
@@ -1,55 +0,0 @@
-
-/* use <trace/irq.h> instead */
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM irq
-
-/*
- * Tracepoint for entry of interrupt handler:
- */
-TRACE_FORMAT(irq_handler_entry,
-       TP_PROTO(int irq, struct irqaction *action),
-       TP_ARGS(irq, action),
-       TP_FMT("irq=%d handler=%s", irq, action->name)
-       );
-
-/*
- * Tracepoint for return of an interrupt handler:
- */
-TRACE_EVENT(irq_handler_exit,
-
-       TP_PROTO(int irq, struct irqaction *action, int ret),
-
-       TP_ARGS(irq, action, ret),
-
-       TP_STRUCT__entry(
-               __field(        int,    irq     )
-               __field(        int,    ret     )
-       ),
-
-       TP_fast_assign(
-               __entry->irq    = irq;
-               __entry->ret    = ret;
-       ),
-
-       TP_printk("irq=%d return=%s",
-                 __entry->irq, __entry->ret ? "handled" : "unhandled")
-);
-
-TRACE_FORMAT(softirq_entry,
-       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-       TP_ARGS(h, vec),
-       TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-       );
-
-TRACE_FORMAT(softirq_exit,
-       TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
-       TP_ARGS(h, vec),
-       TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
-       );
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h
deleted file mode 100644 (file)
index 28ee69f..0000000
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <linux/tracepoint.h>
-#include <linux/types.h>
-
-#ifdef CONFIG_KMEMTRACE
-extern void kmemtrace_init(void);
-#else
-static inline void kmemtrace_init(void)
-{
-}
-#endif
-
-DECLARE_TRACE(kmalloc,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmem_cache_alloc,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
-DECLARE_TRACE(kmalloc_node,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags,
-                     int node),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kmem_cache_alloc_node,
-             TP_PROTO(unsigned long call_site,
-                     const void *ptr,
-                     size_t bytes_req,
-                     size_t bytes_alloc,
-                     gfp_t gfp_flags,
-                     int node),
-             TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
-DECLARE_TRACE(kfree,
-             TP_PROTO(unsigned long call_site, const void *ptr),
-             TP_ARGS(call_site, ptr));
-DECLARE_TRACE(kmem_cache_free,
-             TP_PROTO(unsigned long call_site, const void *ptr),
-             TP_ARGS(call_site, ptr));
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
deleted file mode 100644 (file)
index 5ca67df..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_LOCKDEP_H
-#define _TRACE_LOCKDEP_H
-
-#include <linux/lockdep.h>
-#include <linux/tracepoint.h>
-
-#include <trace/lockdep_event_types.h>
-
-#endif
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
deleted file mode 100644 (file)
index adccfcd..0000000
+++ /dev/null
@@ -1,44 +0,0 @@
-
-#ifndef TRACE_FORMAT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM lock
-
-#ifdef CONFIG_LOCKDEP
-
-TRACE_FORMAT(lock_acquire,
-       TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
-               int trylock, int read, int check,
-               struct lockdep_map *next_lock, unsigned long ip),
-       TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
-       TP_FMT("%s%s%s", trylock ? "try " : "",
-               read ? "read " : "", lock->name)
-       );
-
-TRACE_FORMAT(lock_release,
-       TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-       TP_ARGS(lock, nested, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-#ifdef CONFIG_LOCK_STAT
-
-TRACE_FORMAT(lock_contended,
-       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-       TP_ARGS(lock, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-TRACE_FORMAT(lock_acquired,
-       TP_PROTO(struct lockdep_map *lock, unsigned long ip),
-       TP_ARGS(lock, ip),
-       TP_FMT("%s", lock->name)
-       );
-
-#endif
-#endif
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/sched.h b/include/trace/sched.h
deleted file mode 100644 (file)
index 4e372a1..0000000
+++ /dev/null
@@ -1,9 +0,0 @@
-#ifndef _TRACE_SCHED_H
-#define _TRACE_SCHED_H
-
-#include <linux/sched.h>
-#include <linux/tracepoint.h>
-
-#include <trace/sched_event_types.h>
-
-#endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
deleted file mode 100644 (file)
index 63547dc..0000000
+++ /dev/null
@@ -1,337 +0,0 @@
-
-/* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT
-# error Do not include this file directly.
-# error Unless you know what you are doing.
-#endif
-
-#undef TRACE_SYSTEM
-#define TRACE_SYSTEM sched
-
-/*
- * Tracepoint for calling kthread_stop, performed to end a kthread:
- */
-TRACE_EVENT(sched_kthread_stop,
-
-       TP_PROTO(struct task_struct *t),
-
-       TP_ARGS(t),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
-               __entry->pid    = t->pid;
-       ),
-
-       TP_printk("task %s:%d", __entry->comm, __entry->pid)
-);
-
-/*
- * Tracepoint for the return value of the kthread stopping:
- */
-TRACE_EVENT(sched_kthread_stop_ret,
-
-       TP_PROTO(int ret),
-
-       TP_ARGS(ret),
-
-       TP_STRUCT__entry(
-               __field(        int,    ret     )
-       ),
-
-       TP_fast_assign(
-               __entry->ret    = ret;
-       ),
-
-       TP_printk("ret %d", __entry->ret)
-);
-
-/*
- * Tracepoint for waiting on task to unschedule:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wait_task,
-
-       TP_PROTO(struct rq *rq, struct task_struct *p),
-
-       TP_ARGS(rq, p),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid    = p->pid;
-               __entry->prio   = p->prio;
-       ),
-
-       TP_printk("task %s:%d [%d]",
-                 __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for waking up a task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup,
-
-       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-       TP_ARGS(rq, p, success),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-               __field(        int,    success                 )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-               __entry->success        = success;
-       ),
-
-       TP_printk("task %s:%d [%d] success=%d",
-                 __entry->comm, __entry->pid, __entry->prio,
-                 __entry->success)
-);
-
-/*
- * Tracepoint for waking up a new task:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_wakeup_new,
-
-       TP_PROTO(struct rq *rq, struct task_struct *p, int success),
-
-       TP_ARGS(rq, p, success),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-               __field(        int,    success                 )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-               __entry->success        = success;
-       ),
-
-       TP_printk("task %s:%d [%d] success=%d",
-                 __entry->comm, __entry->pid, __entry->prio,
-                 __entry->success)
-);
-
-/*
- * Tracepoint for task switches, performed by the scheduler:
- *
- * (NOTE: the 'rq' argument is not used by generic trace events,
- *        but used by the latency tracer plugin. )
- */
-TRACE_EVENT(sched_switch,
-
-       TP_PROTO(struct rq *rq, struct task_struct *prev,
-                struct task_struct *next),
-
-       TP_ARGS(rq, prev, next),
-
-       TP_STRUCT__entry(
-               __array(        char,   prev_comm,      TASK_COMM_LEN   )
-               __field(        pid_t,  prev_pid                        )
-               __field(        int,    prev_prio                       )
-               __array(        char,   next_comm,      TASK_COMM_LEN   )
-               __field(        pid_t,  next_pid                        )
-               __field(        int,    next_prio                       )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
-               __entry->prev_pid       = prev->pid;
-               __entry->prev_prio      = prev->prio;
-               memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
-               __entry->next_pid       = next->pid;
-               __entry->next_prio      = next->prio;
-       ),
-
-       TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
-               __entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-               __entry->next_comm, __entry->next_pid, __entry->next_prio)
-);
-
-/*
- * Tracepoint for a task being migrated:
- */
-TRACE_EVENT(sched_migrate_task,
-
-       TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-
-       TP_ARGS(p, orig_cpu, dest_cpu),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-               __field(        int,    orig_cpu                )
-               __field(        int,    dest_cpu                )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-               __entry->orig_cpu       = orig_cpu;
-               __entry->dest_cpu       = dest_cpu;
-       ),
-
-       TP_printk("task %s:%d [%d] from: %d  to: %d",
-                 __entry->comm, __entry->pid, __entry->prio,
-                 __entry->orig_cpu, __entry->dest_cpu)
-);
-
-/*
- * Tracepoint for freeing a task:
- */
-TRACE_EVENT(sched_process_free,
-
-       TP_PROTO(struct task_struct *p),
-
-       TP_ARGS(p),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-       ),
-
-       TP_printk("task %s:%d [%d]",
-                 __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a task exiting:
- */
-TRACE_EVENT(sched_process_exit,
-
-       TP_PROTO(struct task_struct *p),
-
-       TP_ARGS(p),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->prio           = p->prio;
-       ),
-
-       TP_printk("task %s:%d [%d]",
-                 __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for a waiting task:
- */
-TRACE_EVENT(sched_process_wait,
-
-       TP_PROTO(struct pid *pid),
-
-       TP_ARGS(pid),
-
-       TP_STRUCT__entry(
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-               __field(        int,    prio                    )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
-               __entry->pid            = pid_nr(pid);
-               __entry->prio           = current->prio;
-       ),
-
-       TP_printk("task %s:%d [%d]",
-                 __entry->comm, __entry->pid, __entry->prio)
-);
-
-/*
- * Tracepoint for do_fork:
- */
-TRACE_EVENT(sched_process_fork,
-
-       TP_PROTO(struct task_struct *parent, struct task_struct *child),
-
-       TP_ARGS(parent, child),
-
-       TP_STRUCT__entry(
-               __array(        char,   parent_comm,    TASK_COMM_LEN   )
-               __field(        pid_t,  parent_pid                      )
-               __array(        char,   child_comm,     TASK_COMM_LEN   )
-               __field(        pid_t,  child_pid                       )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
-               __entry->parent_pid     = parent->pid;
-               memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
-               __entry->child_pid      = child->pid;
-       ),
-
-       TP_printk("parent %s:%d  child %s:%d",
-               __entry->parent_comm, __entry->parent_pid,
-               __entry->child_comm, __entry->child_pid)
-);
-
-/*
- * Tracepoint for sending a signal:
- */
-TRACE_EVENT(sched_signal_send,
-
-       TP_PROTO(int sig, struct task_struct *p),
-
-       TP_ARGS(sig, p),
-
-       TP_STRUCT__entry(
-               __field(        int,    sig                     )
-               __array(        char,   comm,   TASK_COMM_LEN   )
-               __field(        pid_t,  pid                     )
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid    = p->pid;
-               __entry->sig    = sig;
-       ),
-
-       TP_printk("sig: %d  task %s:%d",
-                 __entry->sig, __entry->comm, __entry->pid)
-);
-
-#undef TRACE_SYSTEM
diff --git a/include/trace/skb.h b/include/trace/skb.h
deleted file mode 100644 (file)
index b66206d..0000000
+++ /dev/null
@@ -1,11 +0,0 @@
-#ifndef _TRACE_SKB_H_
-#define _TRACE_SKB_H_
-
-#include <linux/skbuff.h>
-#include <linux/tracepoint.h>
-
-DECLARE_TRACE(kfree_skb,
-       TP_PROTO(struct sk_buff *skb, void *location),
-       TP_ARGS(skb, location));
-
-#endif
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
deleted file mode 100644 (file)
index df56f56..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-/* trace/<type>_event_types.h here */
-
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
-#include <trace/lockdep_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
deleted file mode 100644 (file)
index fd13750..0000000
+++ /dev/null
@@ -1,5 +0,0 @@
-/* trace/<type>.h here */
-
-#include <trace/sched.h>
-#include <trace/irq.h>
-#include <trace/lockdep.h>
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
deleted file mode 100644 (file)
index 7626523..0000000
+++ /dev/null
@@ -1,25 +0,0 @@
-#ifndef __TRACE_WORKQUEUE_H
-#define __TRACE_WORKQUEUE_H
-
-#include <linux/tracepoint.h>
-#include <linux/workqueue.h>
-#include <linux/sched.h>
-
-DECLARE_TRACE(workqueue_insertion,
-          TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-          TP_ARGS(wq_thread, work));
-
-DECLARE_TRACE(workqueue_execution,
-          TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
-          TP_ARGS(wq_thread, work));
-
-/* Trace the creation of one workqueue thread on a cpu */
-DECLARE_TRACE(workqueue_creation,
-          TP_PROTO(struct task_struct *wq_thread, int cpu),
-          TP_ARGS(wq_thread, cpu));
-
-DECLARE_TRACE(workqueue_destruction,
-          TP_PROTO(struct task_struct *wq_thread),
-          TP_ARGS(wq_thread));
-
-#endif /* __TRACE_WORKQUEUE_H */
diff --git a/include/xen/Kbuild b/include/xen/Kbuild
new file mode 100644 (file)
index 0000000..4e65c16
--- /dev/null
@@ -0,0 +1 @@
+header-y += evtchn.h
index 0d5f1adc0363e9e125cb8786ca96b0917440b6b5..e68d59a90ca88b73c21b431c3cf538165916d5a7 100644 (file)
@@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq);
    irq will be disabled so it won't deliver an interrupt. */
 void xen_poll_irq(int irq);
 
+/* Determine the IRQ which is bound to an event channel */
+unsigned irq_from_evtchn(unsigned int evtchn);
+
 #endif /* _XEN_EVENTS_H */
diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h
new file mode 100644 (file)
index 0000000..14e833e
--- /dev/null
@@ -0,0 +1,88 @@
+/******************************************************************************
+ * evtchn.h
+ *
+ * Interface to /dev/xen/evtchn.
+ *
+ * Copyright (c) 2003-2005, K A Fraser
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation; or, when distributed
+ * separately from the Linux kernel or incorporated into other
+ * software packages, subject to the following license:
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this source file (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy, modify,
+ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
+ * and to permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef __LINUX_PUBLIC_EVTCHN_H__
+#define __LINUX_PUBLIC_EVTCHN_H__
+
+/*
+ * Bind a fresh port to VIRQ @virq.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_VIRQ                         \
+       _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq))
+struct ioctl_evtchn_bind_virq {
+       unsigned int virq;
+};
+
+/*
+ * Bind a fresh port to remote <@remote_domain, @remote_port>.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_INTERDOMAIN                  \
+       _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain))
+struct ioctl_evtchn_bind_interdomain {
+       unsigned int remote_domain, remote_port;
+};
+
+/*
+ * Allocate a fresh port for binding to @remote_domain.
+ * Return allocated port.
+ */
+#define IOCTL_EVTCHN_BIND_UNBOUND_PORT                 \
+       _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port))
+struct ioctl_evtchn_bind_unbound_port {
+       unsigned int remote_domain;
+};
+
+/*
+ * Unbind previously allocated @port.
+ */
+#define IOCTL_EVTCHN_UNBIND                            \
+       _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind))
+struct ioctl_evtchn_unbind {
+       unsigned int port;
+};
+
+/*
+ * Send a notification via previously allocated @port.
+ */
+#define IOCTL_EVTCHN_NOTIFY                            \
+       _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify))
+struct ioctl_evtchn_notify {
+       unsigned int port;
+};
+
+/* Clear and reinitialise the event buffer. Clear error condition. */
+#define IOCTL_EVTCHN_RESET                             \
+       _IOC(_IOC_NONE, 'E', 5, 0)
+
+#endif /* __LINUX_PUBLIC_EVTCHN_H__ */
index 453235e923f0be0d3ae801e754bfb83a68caf892..e8b6519d47e9df818ebbb6d19323f032b8dde988 100644 (file)
@@ -57,4 +57,7 @@ struct xen_feature_info {
 /* Declares the features reported by XENVER_get_features. */
 #include "features.h"
 
+/* arg == NULL; returns host memory page size. */
+#define XENVER_pagesize 7
+
 #endif /* __XEN_PUBLIC_VERSION_H__ */
index f87f9614844db69e7eeacb2784646f5458ed954a..b9763badbd77da07813dae421f20a90a08df2e11 100644 (file)
@@ -91,8 +91,7 @@ struct xenbus_driver {
        void (*otherend_changed)(struct xenbus_device *dev,
                                 enum xenbus_state backend_state);
        int (*remove)(struct xenbus_device *dev);
-       int (*suspend)(struct xenbus_device *dev);
-       int (*suspend_cancel)(struct xenbus_device *dev);
+       int (*suspend)(struct xenbus_device *dev, pm_message_t state);
        int (*resume)(struct xenbus_device *dev);
        int (*uevent)(struct xenbus_device *, char **, int, char *, int);
        struct device_driver driver;
index 7be4d3836745a049596e7be06b5e850c035b1f72..d4e9671347ee907ba9f59f5a87f0e446970b9725 100644 (file)
@@ -308,7 +308,7 @@ menu "RCU Subsystem"
 
 choice
        prompt "RCU Implementation"
-       default CLASSIC_RCU
+       default TREE_RCU
 
 config CLASSIC_RCU
        bool "Classic RCU"
index d721dad05dd722fa065170e9f573a4f74c49d38d..bb7dc57eee36ed8079e63bdf1a4c725d4c97e45f 100644 (file)
@@ -64,6 +64,7 @@
 #include <linux/idr.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/kmemtrace.h>
 #include <trace/boot.h>
 
 #include <asm/io.h>
@@ -71,7 +72,6 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#include <trace/kmemtrace.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
index 16a2189e96f9458d7626660b4c537dd718bd0fd6..87c2b641fd7b475b192adf281fcdc1f5cf249893 100644 (file)
--- a/ipc/sem.c
+++ b/ipc/sem.c
@@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk)
                int i;
 
                rcu_read_lock();
-               un = list_entry(rcu_dereference(ulp->list_proc.next),
-                                       struct sem_undo, list_proc);
+               un = list_entry_rcu(ulp->list_proc.next,
+                                   struct sem_undo, list_proc);
                if (&un->list_proc == &ulp->list_proc)
                        semid = -1;
                 else
index 4259716004853bf727491980208e01686958085b..15dd238e533887b1e84f17b070bb64f0b4660961 100644 (file)
--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -384,7 +384,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
        error = PTR_ERR(file);
        if (IS_ERR(file))
                goto no_file;
-       ima_shm_check(file);
 
        id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni);
        if (id < 0) {
@@ -891,7 +890,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr)
        file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations);
        if (!file)
                goto out_free;
-       ima_shm_check(file);
+       ima_counts_get(file);
 
        file->private_data = sfd;
        file->f_mapping = shp->shm_file->f_mapping;
index 42423665660a3d6e0a1fc6a37dd0da997643b47f..a35eee3436de081a7a35d9b8184a9349facee17d 100644 (file)
@@ -93,6 +93,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o
 obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
+obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
index 42d56544460f21b353cc279aa8aa52301eded59b..f6c204f07ea6084c4849d52358d6b1b2aab1f0da 100644 (file)
@@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese,
 
 }
 
+asmlinkage long
+compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig,
+                            struct compat_siginfo __user *uinfo)
+{
+       siginfo_t info;
+
+       if (copy_siginfo_from_user32(&info, uinfo))
+               return -EFAULT;
+       return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 #ifdef __ARCH_WANT_COMPAT_SYS_TIME
 
 /* compat_time_t is a 32 bit "long" and needs to get converted. */
index 3a039189d70748b294082301d9636d467c019c3b..1bb4d7e5d61694a6b8c01ba47f8432dcd37c828e 100644 (file)
@@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds);
 
 /*
  * Prepare credentials for current to perform an execve()
- * - The caller must hold current->cred_exec_mutex
+ * - The caller must hold current->cred_guard_mutex
  */
 struct cred *prepare_exec_creds(void)
 {
@@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
        struct cred *new;
        int ret;
 
-       mutex_init(&p->cred_exec_mutex);
+       mutex_init(&p->cred_guard_mutex);
 
        if (
 #ifdef CONFIG_KEYS
index abf9cf3b95c609f12ccb0c6992cc3a8221bdcf8b..51d1fe3fb7ad5b60666c7f229d6bacd8942479e9 100644 (file)
@@ -48,7 +48,7 @@
 #include <linux/tracehook.h>
 #include <linux/fs_struct.h>
 #include <linux/init_task.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include <asm/mmu_context.h>
 #include "cred-internals.h"
 
-DEFINE_TRACE(sched_process_free);
-DEFINE_TRACE(sched_process_exit);
-DEFINE_TRACE(sched_process_wait);
-
 static void exit_mm(struct task_struct * tsk);
 
 static void __unhash_process(struct task_struct *p)
@@ -1476,6 +1472,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
                 */
                if (*notask_error)
                        *notask_error = ret;
+               return 0;
        }
 
        if (likely(!ptrace) && unlikely(p->ptrace)) {
index 875ffbdd96d09ccb2a9587636f472b8b8278cbfd..bb762b4dd21769e9c4035128b2317ca643818a1b 100644 (file)
@@ -61,7 +61,6 @@
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
 #include <linux/fs_struct.h>
-#include <trace/sched.h>
 #include <linux/magic.h>
 
 #include <asm/pgtable.h>
@@ -71,6 +70,8 @@
 #include <asm/cacheflush.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/sched.h>
+
 /*
  * Protected counters by write_lock_irq(&tasklist_lock)
  */
@@ -83,8 +84,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0;
 
 __cacheline_aligned DEFINE_RWLOCK(tasklist_lock);  /* outer */
 
-DEFINE_TRACE(sched_process_fork);
-
 int nr_processes(void)
 {
        int cpu;
@@ -982,6 +981,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (!p)
                goto fork_out;
 
+       ftrace_graph_init_task(p);
+
        rt_mutex_init_task(p);
 
 #ifdef CONFIG_PROVE_LOCKING
@@ -1089,8 +1090,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
        p->blocked_on = NULL; /* not blocked yet */
 #endif
-       if (unlikely(current->ptrace))
-               ptrace_fork(p, clone_flags);
+
+       p->bts = NULL;
 
        /* Perform scheduler related setup. Assign this task to a CPU. */
        sched_fork(p, clone_flags);
@@ -1131,8 +1132,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                }
        }
 
-       ftrace_graph_init_task(p);
-
        p->pid = pid_nr(pid);
        p->tgid = p->pid;
        if (clone_flags & CLONE_THREAD)
@@ -1141,7 +1140,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        if (current->nsproxy != p->nsproxy) {
                retval = ns_cgroup_clone(p, pid);
                if (retval)
-                       goto bad_fork_free_graph;
+                       goto bad_fork_free_pid;
        }
 
        p->set_child_tid = (clone_flags & CLONE_CHILD_SETTID) ? child_tidptr : NULL;
@@ -1233,7 +1232,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
                spin_unlock(&current->sighand->siglock);
                write_unlock_irq(&tasklist_lock);
                retval = -ERESTARTNOINTR;
-               goto bad_fork_free_graph;
+               goto bad_fork_free_pid;
        }
 
        if (clone_flags & CLONE_THREAD) {
@@ -1268,8 +1267,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        cgroup_post_fork(p);
        return p;
 
-bad_fork_free_graph:
-       ftrace_graph_exit_task(p);
 bad_fork_free_pid:
        if (pid != &init_struct_pid)
                free_pid(pid);
index d546b2d53a62ba700756c8b7ff53b11373923162..80b5ce716596a95b797a00c417b5a514f9d5d2a2 100644 (file)
  *  PRIVATE futexes by Eric Dumazet
  *  Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com>
  *
+ *  Requeue-PI support by Darren Hart <dvhltc@us.ibm.com>
+ *  Copyright (C) IBM Corporation, 2009
+ *  Thanks to Thomas Gleixner for conceptual design and careful reviews.
+ *
  *  Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly
  *  enough at me, Linus for the original (flawed) idea, Matthew
  *  Kirkwood for proof-of-concept implementation.
@@ -96,8 +100,8 @@ struct futex_pi_state {
  */
 struct futex_q {
        struct plist_node list;
-       /* There can only be a single waiter */
-       wait_queue_head_t waiter;
+       /* Waiter reference */
+       struct task_struct *task;
 
        /* Which hash list lock to use: */
        spinlock_t *lock_ptr;
@@ -107,7 +111,9 @@ struct futex_q {
 
        /* Optional priority inheritance state: */
        struct futex_pi_state *pi_state;
-       struct task_struct *task;
+
+       /* rt_waiter storage for requeue_pi: */
+       struct rt_mutex_waiter *rt_waiter;
 
        /* Bitset for the optional bitmasked wakeup */
        u32 bitset;
@@ -278,6 +284,25 @@ void put_futex_key(int fshared, union futex_key *key)
        drop_futex_key_refs(key);
 }
 
+/**
+ * futex_top_waiter() - Return the highest priority waiter on a futex
+ * @hb:     the hash bucket the futex_q's reside in
+ * @key:    the futex key (to distinguish it from other futexes' futex_q's)
+ *
+ * Walks hb->chain and returns the first futex_q whose key matches @key, or
+ * NULL if no entry on the chain matches.
+ *
+ * Must be called with the hb lock held.
+ */
+static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb,
+                                       union futex_key *key)
+{
+       struct futex_q *this;
+
+       plist_for_each_entry(this, &hb->chain, list) {
+               if (match_futex(&this->key, key))
+                       return this;
+       }
+       return NULL;
+}
+
 static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
 {
        u32 curval;
@@ -539,28 +564,160 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
        return 0;
 }
 
+/**
+ * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * @uaddr:             the pi futex user address
+ * @hb:                        the pi futex hash bucket
+ * @key:               the futex key associated with uaddr and hb
+ * @ps:                        the pi_state pointer where we store the result of the
+ *                     lookup
+ * @task:              the task to perform the atomic lock work for.  This will
+ *                     be "current" except in the case of requeue pi.
+ * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Returns:
+ *  0 - ready to wait
+ *  1 - acquired the lock
+ * <0 - error (-EFAULT, -EDEADLK, or a failure from lookup_pi_state())
+ *
+ * The hb->lock and futex_key refs shall be held by the caller.
+ */
+static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
+                               union futex_key *key,
+                               struct futex_pi_state **ps,
+                               struct task_struct *task, int set_waiters)
+{
+       int lock_taken, ret, ownerdied = 0;
+       u32 uval, newval, curval;
+
+retry:
+       ret = lock_taken = 0;
+
+       /*
+        * To avoid races, we attempt to take the lock here again
+        * (by doing a 0 -> TID atomic cmpxchg), while holding all
+        * the locks. It will most likely not succeed.
+        */
+       newval = task_pid_vnr(task);
+       if (set_waiters)
+               newval |= FUTEX_WAITERS;
+
+       curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
+
+       if (unlikely(curval == -EFAULT))
+               return -EFAULT;
+
+       /*
+        * Detect deadlocks: the futex is already owned by @task.
+        */
+       if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task))))
+               return -EDEADLK;
+
+       /*
+        * Surprise - we got the lock. Just return to userspace:
+        */
+       if (unlikely(!curval))
+               return 1;
+
+       uval = curval;
+
+       /*
+        * Set the FUTEX_WAITERS flag, so the owner will know it has someone
+        * to wake at the next unlock.
+        */
+       newval = curval | FUTEX_WAITERS;
+
+       /*
+        * There are two cases where we take over the futex: it has no
+        * owner (the owner TID is 0), or the -ESRCH handling below set
+        * ownerdied because the OWNER_DIED bit was found, in which case
+        * the take over is unconditional.
+        *
+        * This is safe as we are protected by the hash bucket lock !
+        */
+       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
+               /* Keep the OWNER_DIED bit */
+               newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task);
+               ownerdied = 0;
+               lock_taken = 1;
+       }
+
+       curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+       if (unlikely(curval == -EFAULT))
+               return -EFAULT;
+       if (unlikely(curval != uval))
+               goto retry;
+
+       /*
+        * We took the lock due to owner died take over.
+        */
+       if (unlikely(lock_taken))
+               return 1;
+
+       /*
+        * We don't have the lock. Look up the PI state (or create it if
+        * we are the first waiter):
+        */
+       ret = lookup_pi_state(uval, hb, key, ps);
+
+       if (unlikely(ret)) {
+               switch (ret) {
+               case -ESRCH:
+                       /*
+                        * No owner found for this futex. Check if the
+                        * OWNER_DIED bit is set to figure out whether
+                        * this is a robust futex or not.
+                        */
+                       if (get_futex_value_locked(&curval, uaddr))
+                               return -EFAULT;
+
+                       /*
+                        * We simply start over in case of a robust
+                        * futex. The code above will take the futex
+                        * and return happy. Otherwise fall through to
+                        * default and return -ESRCH to the caller.
+                        */
+                       if (curval & FUTEX_OWNER_DIED) {
+                               ownerdied = 1;
+                               goto retry;
+                       }
+               default:
+                       break;
+               }
+       }
+
+       return ret;
+}
+
 /*
  * The hash bucket lock must be held when this is called.
  * Afterwards, the futex_q must not be accessed.
  */
 static void wake_futex(struct futex_q *q)
 {
-       plist_del(&q->list, &q->list.plist);
+       struct task_struct *p = q->task;
+
        /*
-        * The lock in wake_up_all() is a crucial memory barrier after the
-        * plist_del() and also before assigning to q->lock_ptr.
+        * We set q->lock_ptr = NULL _before_ we wake up the task. If
+        * a non futex wake up happens on another CPU then the task
+        * might exit and p would dereference a non existing task
+        * struct. Prevent this by holding a reference on p across the
+        * wake up.
         */
-       wake_up(&q->waiter);
+       get_task_struct(p);
+
+       plist_del(&q->list, &q->list.plist);
        /*
-        * The waiting task can free the futex_q as soon as this is written,
-        * without taking any locks.  This must come last.
-        *
-        * A memory barrier is required here to prevent the following store to
-        * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-        * end of wake_up() does not prevent this store from moving.
+        * The waiting task can free the futex_q as soon as
+        * q->lock_ptr = NULL is written, without taking any locks. A
+        * memory barrier is required here to prevent the following
+        * store to lock_ptr from getting ahead of the plist_del.
         */
        smp_wmb();
        q->lock_ptr = NULL;
+
+       wake_up_state(p, TASK_NORMAL);
+       put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -689,7 +846,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 
        plist_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key)) {
-                       if (this->pi_state) {
+                       if (this->pi_state || this->rt_waiter) {
                                ret = -EINVAL;
                                break;
                        }
@@ -802,24 +959,185 @@ out:
        return ret;
 }
 
-/*
- * Requeue all waiters hashed on one physical page to another
- * physical page.
+/**
+ * requeue_futex() - Requeue a futex_q from one hb to another
+ * @q:         the futex_q to requeue
+ * @hb1:       the source hash_bucket
+ * @hb2:       the target hash_bucket
+ * @key2:      the new key for the requeued futex_q
+ *
+ * When hb1 and hb2 are different buckets, @q is moved from hb1's chain to
+ * hb2's chain and q->lock_ptr is pointed at hb2's lock.  In every case a
+ * reference is taken on @key2 and it is copied into q->key.  The reference
+ * on the previous key is not dropped here.
+ */
+static inline
+void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
+                  struct futex_hash_bucket *hb2, union futex_key *key2)
+{
+
+       /*
+        * If both keys hash to the same bucket (hb1 and hb2 share a
+        * chain), there is no need to move the futex_q.
+        */
+       if (likely(&hb1->chain != &hb2->chain)) {
+               plist_del(&q->list, &hb1->chain);
+               plist_add(&q->list, &hb2->chain);
+               q->lock_ptr = &hb2->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+               q->list.plist.lock = &hb2->lock;
+#endif
+       }
+       get_futex_key_refs(key2);
+       q->key = *key2;
+}
+
+/**
+ * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
+ * @q:         the futex_q
+ * @key:       the key of the requeue target futex
+ *
+ * During futex_requeue, with requeue_pi=1, it is possible to acquire the
+ * target futex if it is uncontended or via a lock steal.  Set the futex_q key
+ * to the requeue target futex so the waiter can detect the wakeup on the right
+ * futex, but remove it from the hb and NULL the rt_waiter so it can detect
+ * atomic lock acquisition.  The reference on the old key is dropped and one
+ * is taken on @key.  Must be called with the q->lock_ptr held.
+ */
+static inline
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+{
+       drop_futex_key_refs(&q->key);
+       get_futex_key_refs(key);
+       q->key = *key;
+
+       WARN_ON(plist_node_empty(&q->list));
+       plist_del(&q->list, &q->list.plist);
+
+       WARN_ON(!q->rt_waiter);
+       q->rt_waiter = NULL;
+
+       wake_up_state(q->task, TASK_NORMAL);
+}
+
+/**
+ * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
+ * @pifutex:           the user address of the to futex
+ * @hb1:               the from futex hash bucket, must be locked by the caller
+ * @hb2:               the to futex hash bucket, must be locked by the caller
+ * @key1:              the from futex key
+ * @key2:              the to futex key
+ * @ps:                        address to store the pi_state pointer
+ * @set_waiters:       force setting the FUTEX_WAITERS bit (1) or not (0)
+ *
+ * Try and get the lock on behalf of the top waiter if we can do it atomically.
+ * Wake the top waiter if we succeed.  If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
+ *
+ * The initial read of @pifutex only checks that the user address can be
+ * read (-EFAULT otherwise); the value read is not used further here.
+ *
+ * Returns:
+ *  0 - failed to acquire the lock atomically (or there is no waiter on key1)
+ *  1 - acquired the lock
+ * <0 - error
+ */
+static int futex_proxy_trylock_atomic(u32 __user *pifutex,
+                                struct futex_hash_bucket *hb1,
+                                struct futex_hash_bucket *hb2,
+                                union futex_key *key1, union futex_key *key2,
+                                struct futex_pi_state **ps, int set_waiters)
+{
+       struct futex_q *top_waiter = NULL;
+       u32 curval;
+       int ret;
+
+       if (get_futex_value_locked(&curval, pifutex))
+               return -EFAULT;
+
+       /*
+        * Find the top_waiter and determine if there are additional waiters.
+        * If the caller intends to requeue more than 1 waiter to pifutex,
+        * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+        * as we have means to handle the possible fault.  If not, don't set
+        * the bit unnecessarily as it will force the subsequent unlock to enter
+        * the kernel.
+        */
+       top_waiter = futex_top_waiter(hb1, key1);
+
+       /* There are no waiters, nothing for us to do. */
+       if (!top_waiter)
+               return 0;
+
+       /*
+        * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
+        * the contended case or if set_waiters is 1.  The pi_state is returned
+        * in ps in contended cases.
+        */
+       ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+                                  set_waiters);
+       if (ret == 1)
+               requeue_pi_wake_futex(top_waiter, key2);
+
+       return ret;
+}
+
+/**
+ * futex_requeue() - Requeue waiters from uaddr1 to uaddr2
+ * @uaddr1:    source futex user address
+ * @uaddr2:    target futex user address
+ * @nr_wake:   number of waiters to wake (must be 1 for requeue_pi)
+ * @nr_requeue: number of waiters to requeue (0-INT_MAX)
+ * @requeue_pi: if we are attempting to requeue from a non-pi futex to a
+ *             pi futex (pi to pi requeue is not supported)
+ *
+ * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire
+ * uaddr2 atomically on behalf of the top waiter.
+ *
+ * Returns:
+ * >=0 - on success, the number of tasks requeued or woken
+ *  <0 - on error
  */
 static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
-                        int nr_wake, int nr_requeue, u32 *cmpval)
+                        int nr_wake, int nr_requeue, u32 *cmpval,
+                        int requeue_pi)
 {
        union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
+       int drop_count = 0, task_count = 0, ret;
+       struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb1, *hb2;
        struct plist_head *head1;
        struct futex_q *this, *next;
-       int ret, drop_count = 0;
+       u32 curval2;
+
+       if (requeue_pi) {
+               /*
+                * requeue_pi requires a pi_state, try to allocate it now
+                * without any locks in case it fails.
+                */
+               if (refill_pi_state_cache())
+                       return -ENOMEM;
+               /*
+                * requeue_pi must wake as many tasks as it can, up to nr_wake
+                * + nr_requeue, since it acquires the rt_mutex prior to
+                * returning to userspace, so as to not leave the rt_mutex with
+                * waiters and no owner.  However, second and third wake-ups
+                * cannot be predicted as they involve race conditions with the
+                * first wake and a fault while looking up the pi_state.  Both
+                * pthread_cond_signal() and pthread_cond_broadcast() should
+                * use nr_wake=1.
+                */
+               if (nr_wake != 1)
+                       return -EINVAL;
+       }
 
 retry:
+       if (pi_state != NULL) {
+               /*
+                * We will have to lookup the pi_state again, so free this one
+                * to keep the accounting correct.
+                */
+               free_pi_state(pi_state);
+               pi_state = NULL;
+       }
+
        ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_READ);
+       ret = get_futex_key(uaddr2, fshared, &key2,
+                           requeue_pi ? VERIFY_WRITE : VERIFY_READ);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -854,32 +1172,99 @@ retry_private:
                }
        }
 
+       if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
+               /*
+                * Attempt to acquire uaddr2 and wake the top waiter. If we
+                * intend to requeue waiters, force setting the FUTEX_WAITERS
+                * bit.  We force this here where we are able to easily handle
+                * faults rather than in the requeue loop below.
+                */
+               ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
+                                                &key2, &pi_state, nr_requeue);
+
+               /*
+                * At this point the top_waiter has either taken uaddr2 or is
+                * waiting on it.  If the former, then the pi_state will not
+                * exist yet, look it up one more time to ensure we have a
+                * reference to it.
+                */
+               if (ret == 1) {
+                       WARN_ON(pi_state);
+                       task_count++;
+                       ret = get_futex_value_locked(&curval2, uaddr2);
+                       if (!ret)
+                               ret = lookup_pi_state(curval2, hb2, &key2,
+                                                     &pi_state);
+               }
+
+               switch (ret) {
+               case 0:
+                       break;
+               case -EFAULT:
+                       double_unlock_hb(hb1, hb2);
+                       put_futex_key(fshared, &key2);
+                       put_futex_key(fshared, &key1);
+                       ret = get_user(curval2, uaddr2);
+                       if (!ret)
+                               goto retry;
+                       goto out;
+               case -EAGAIN:
+                       /* The owner was exiting, try again. */
+                       double_unlock_hb(hb1, hb2);
+                       put_futex_key(fshared, &key2);
+                       put_futex_key(fshared, &key1);
+                       cond_resched();
+                       goto retry;
+               default:
+                       goto out_unlock;
+               }
+       }
+
        head1 = &hb1->chain;
        plist_for_each_entry_safe(this, next, head1, list) {
-               if (!match_futex (&this->key, &key1))
+               if (task_count - nr_wake >= nr_requeue)
+                       break;
+
+               if (!match_futex(&this->key, &key1))
                        continue;
-               if (++ret <= nr_wake) {
+
+               WARN_ON(!requeue_pi && this->rt_waiter);
+               WARN_ON(requeue_pi && !this->rt_waiter);
+
+               /*
+                * Wake nr_wake waiters.  For requeue_pi, if we acquired the
+                * lock, we already woke the top_waiter.  If not, it will be
+                * woken by futex_unlock_pi().
+                */
+               if (++task_count <= nr_wake && !requeue_pi) {
                        wake_futex(this);
-               } else {
-                       /*
-                        * If key1 and key2 hash to the same bucket, no need to
-                        * requeue.
-                        */
-                       if (likely(head1 != &hb2->chain)) {
-                               plist_del(&this->list, &hb1->chain);
-                               plist_add(&this->list, &hb2->chain);
-                               this->lock_ptr = &hb2->lock;
-#ifdef CONFIG_DEBUG_PI_LIST
-                               this->list.plist.lock = &hb2->lock;
-#endif
-                       }
-                       this->key = key2;
-                       get_futex_key_refs(&key2);
-                       drop_count++;
+                       continue;
+               }
 
-                       if (ret - nr_wake >= nr_requeue)
-                               break;
+               /*
+                * Requeue nr_requeue waiters and possibly one more in the case
+                * of requeue_pi if we couldn't acquire the lock atomically.
+                */
+               if (requeue_pi) {
+                       /* Prepare the waiter to take the rt_mutex. */
+                       atomic_inc(&pi_state->refcount);
+                       this->pi_state = pi_state;
+                       ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
+                                                       this->rt_waiter,
+                                                       this->task, 1);
+                       if (ret == 1) {
+                               /* We got the lock. */
+                               requeue_pi_wake_futex(this, &key2);
+                               continue;
+                       } else if (ret) {
+                               /* -EDEADLK */
+                               this->pi_state = NULL;
+                               free_pi_state(pi_state);
+                               goto out_unlock;
+                       }
                }
+               requeue_futex(this, hb1, hb2, &key2);
+               drop_count++;
        }
 
 out_unlock:
@@ -899,7 +1284,9 @@ out_put_keys:
 out_put_key1:
        put_futex_key(fshared, &key1);
 out:
-       return ret;
+       if (pi_state != NULL)
+               free_pi_state(pi_state);
+       return ret ? ret : task_count;
 }
 
 /* The key must be already stored in q->key. */
@@ -907,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
 
-       init_waitqueue_head(&q->waiter);
-
        get_futex_key_refs(&q->key);
        hb = hash_futex(&q->key);
        q->lock_ptr = &hb->lock;
@@ -1119,39 +1504,153 @@ handle_fault:
  */
 #define FLAGS_SHARED           0x01
 #define FLAGS_CLOCKRT          0x02
+#define FLAGS_HAS_TIMEOUT      0x04
 
 static long futex_wait_restart(struct restart_block *restart);
 
-static int futex_wait(u32 __user *uaddr, int fshared,
-                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+/**
+ * fixup_owner() - Post lock pi_state and corner case management
+ * @uaddr:     user address of the futex
+ * @fshared:   whether the futex is shared (1) or not (0)
+ * @q:         futex_q (contains pi_state and access to the rt_mutex)
+ * @locked:    if the attempt to take the rt_mutex succeeded (1) or not (0)
+ *
+ * After attempting to lock an rt_mutex, this function is called to cleanup
+ * the pi_state owner as well as handle race conditions that may allow us to
+ * acquire the lock. Must be called with the hb lock held.
+ *
+ * Returns:
+ *  1 - success, lock taken
+ *  0 - success, lock not taken
+ * <0 - on error (-EFAULT)
+ */
+static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q,
+                      int locked)
 {
-       struct task_struct *curr = current;
-       struct restart_block *restart;
-       DECLARE_WAITQUEUE(wait, curr);
-       struct futex_hash_bucket *hb;
-       struct futex_q q;
-       u32 uval;
-       int ret;
-       struct hrtimer_sleeper t;
-       int rem = 0;
-
-       if (!bitset)
-               return -EINVAL;
+       struct task_struct *owner;
+       int ret = 0;
 
-       q.pi_state = NULL;
-       q.bitset = bitset;
-retry:
-       q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_READ);
-       if (unlikely(ret != 0))
+       if (locked) {
+               /*
+                * Got the lock. We might not be the anticipated owner if we
+                * did a lock-steal - fix up the PI-state in that case:
+                */
+               if (q->pi_state->owner != current)
+                       ret = fixup_pi_state_owner(uaddr, q, current, fshared);
                goto out;
+       }
 
-retry_private:
-       hb = queue_lock(&q);
+       /*
+        * Catch the rare case, where the lock was released when we were on the
+        * way back before we locked the hash bucket.
+        */
+       if (q->pi_state->owner == current) {
+               /*
+                * Try to get the rt_mutex now. This might fail as some other
+                * task acquired the rt_mutex after we removed ourself from the
+                * rt_mutex waiters list.
+                */
+               if (rt_mutex_trylock(&q->pi_state->pi_mutex)) {
+                       locked = 1;
+                       goto out;
+               }
+
+               /*
+                * pi_state is incorrect, some other task did a lock steal and
+                * we returned due to timeout or signal without taking the
+                * rt_mutex. Too late. We can access the rt_mutex_owner without
+                * locking, as the other task is now blocked on the hash bucket
+                * lock. Fix the state up.
+                */
+               owner = rt_mutex_owner(&q->pi_state->pi_mutex);
+               ret = fixup_pi_state_owner(uaddr, q, owner, fshared);
+               goto out;
+       }
 
        /*
-        * Access the page AFTER the hash-bucket is locked.
-        * Order is important:
+        * Paranoia check. If we did not take the lock, then we should not be
+        * the owner, nor the pending owner, of the rt_mutex.
+        */
+       if (rt_mutex_owner(&q->pi_state->pi_mutex) == current)
+               printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p "
+                               "pi-state %p\n", ret,
+                               q->pi_state->pi_mutex.owner,
+                               q->pi_state->owner);
+
+out:
+       return ret ? ret : locked;
+}
+
+/**
+ * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal
+ * @hb:                the futex hash bucket, must be locked by the caller
+ * @q:         the futex_q to queue up on
+ * @timeout:   the prepared hrtimer_sleeper, or NULL for no timeout
+ */
+static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
+                               struct hrtimer_sleeper *timeout)
+{
+       queue_me(q, hb);
+
+       /*
+        * There might have been scheduling since the queue_me(), as we
+        * cannot hold a spinlock across the get_user() in case it
+        * faults, and we cannot just set TASK_INTERRUPTIBLE state when
+        * queueing ourselves into the futex hash. This code thus has to
+        * rely on the futex_wake() code removing us from hash when it
+        * wakes us up.
+        */
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       /* Arm the timer; a NULL timeout->task marks it as already expired. */
+       if (timeout) {
+               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+               if (!hrtimer_active(&timeout->timer))
+                       timeout->task = NULL;
+       }
+
+       /*
+        * !plist_node_empty() is safe here without any lock.
+        * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
+        */
+       if (likely(!plist_node_empty(&q->list))) {
+               /*
+                * If the timer has already expired, current will already be
+                * flagged for rescheduling. Only call schedule if there
+                * is no timeout, or if it has yet to expire.
+                */
+               if (!timeout || timeout->task)
+                       schedule();
+       }
+       __set_current_state(TASK_RUNNING);
+}
+
+/**
+ * futex_wait_setup() - Prepare to wait on a futex
+ * @uaddr:     the futex userspace address
+ * @val:       the expected value
+ * @fshared:   whether the futex is shared (1) or not (0)
+ * @q:         the associated futex_q
+ * @hb:                storage for hash_bucket pointer to be returned to caller
+ *
+ * Setup the futex_q and locate the hash_bucket.  Get the futex value and
+ * compare it with the expected value.  Handle atomic faults internally.
+ * Return with the hb lock held and a q.key reference on success, and unlocked
+ * with no q.key reference on failure.
+ *
+ * Returns:
+ *  0 - uaddr contains val and hb has been locked
+ * <0 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlocked
+ */
+static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
+                          struct futex_q *q, struct futex_hash_bucket **hb)
+{
+       u32 uval;
+       int ret;
+
+       /*
+        * Access the page AFTER the hash-bucket is locked.
+        * Order is important:
         *
         *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
         *   Userspace waker:  if (cond(var)) { var = new; futex_wake(&var); }
@@ -1165,95 +1664,83 @@ retry_private:
         * A consequence is that futex_wait() can return zero and absorb
         * a wakeup when *uaddr != val on entry to the syscall.  This is
         * rare, but normal.
-        *
-        * For shared futexes, we hold the mmap semaphore, so the mapping
-        * cannot have changed since we looked it up in get_futex_key.
         */
+retry:
+       q->key = FUTEX_KEY_INIT;
+       ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+       if (unlikely(ret != 0))
+               return ret;
+
+retry_private:
+       *hb = queue_lock(q);
+
        ret = get_futex_value_locked(&uval, uaddr);
 
-       if (unlikely(ret)) {
-               queue_unlock(&q, hb);
+       if (ret) {
+               queue_unlock(q, *hb);
 
                ret = get_user(uval, uaddr);
                if (ret)
-                       goto out_put_key;
+                       goto out;
 
                if (!fshared)
                        goto retry_private;
 
-               put_futex_key(fshared, &q.key);
+               put_futex_key(fshared, &q->key);
                goto retry;
        }
-       ret = -EWOULDBLOCK;
-       if (unlikely(uval != val)) {
-               queue_unlock(&q, hb);
-               goto out_put_key;
-       }
 
-       /* Only actually queue if *uaddr contained val.  */
-       queue_me(&q, hb);
+       if (uval != val) {
+               queue_unlock(q, *hb);
+               ret = -EWOULDBLOCK;
+       }
 
-       /*
-        * There might have been scheduling since the queue_me(), as we
-        * cannot hold a spinlock across the get_user() in case it
-        * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-        * queueing ourselves into the futex hash.  This code thus has to
-        * rely on the futex_wake() code removing us from hash when it
-        * wakes us up.
-        */
+out:
+       if (ret)
+               put_futex_key(fshared, &q->key);
+       return ret;
+}
 
-       /* add_wait_queue is the barrier after __set_current_state. */
-       __set_current_state(TASK_INTERRUPTIBLE);
-       add_wait_queue(&q.waiter, &wait);
-       /*
-        * !plist_node_empty() is safe here without any lock.
-        * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
-        */
-       if (likely(!plist_node_empty(&q.list))) {
-               if (!abs_time)
-                       schedule();
-               else {
-                       hrtimer_init_on_stack(&t.timer,
-                                             clockrt ? CLOCK_REALTIME :
-                                             CLOCK_MONOTONIC,
-                                             HRTIMER_MODE_ABS);
-                       hrtimer_init_sleeper(&t, current);
-                       hrtimer_set_expires_range_ns(&t.timer, *abs_time,
-                                                    current->timer_slack_ns);
-
-                       hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
-                       if (!hrtimer_active(&t.timer))
-                               t.task = NULL;
+static int futex_wait(u32 __user *uaddr, int fshared,
+                     u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct restart_block *restart;
+       struct futex_hash_bucket *hb;
+       struct futex_q q;
+       int ret;
 
-                       /*
-                        * the timer could have already expired, in which
-                        * case current would be flagged for rescheduling.
-                        * Don't bother calling schedule.
-                        */
-                       if (likely(t.task))
-                               schedule();
+       if (!bitset)
+               return -EINVAL;
 
-                       hrtimer_cancel(&t.timer);
+       q.pi_state = NULL;
+       q.bitset = bitset;
+       q.rt_waiter = NULL;
 
-                       /* Flag if a timeout occured */
-                       rem = (t.task == NULL);
+       if (abs_time) {
+               to = &timeout;
 
-                       destroy_hrtimer_on_stack(&t.timer);
-               }
+               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_sleeper(to, current);
+               hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                            current->timer_slack_ns);
        }
-       __set_current_state(TASK_RUNNING);
 
-       /*
-        * NOTE: we don't remove ourselves from the waitqueue because
-        * we are the only user of it.
-        */
+       /* Prepare to wait on uaddr. */
+       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       if (ret)
+               goto out;
+
+       /* queue_me and wait for wakeup, timeout, or a signal. */
+       futex_wait_queue_me(hb, &q, to);
 
        /* If we were woken (and unqueued), we succeeded, whatever. */
        ret = 0;
        if (!unqueue_me(&q))
                goto out_put_key;
        ret = -ETIMEDOUT;
-       if (rem)
+       if (to && !to->task)
                goto out_put_key;
 
        /*
@@ -1270,7 +1757,7 @@ retry_private:
        restart->futex.val = val;
        restart->futex.time = abs_time->tv64;
        restart->futex.bitset = bitset;
-       restart->futex.flags = 0;
+       restart->futex.flags = FLAGS_HAS_TIMEOUT;
 
        if (fshared)
                restart->futex.flags |= FLAGS_SHARED;
@@ -1282,6 +1769,10 @@ retry_private:
 out_put_key:
        put_futex_key(fshared, &q.key);
 out:
+       if (to) {
+               hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
+       }
        return ret;
 }
 
@@ -1290,13 +1781,16 @@ static long futex_wait_restart(struct restart_block *restart)
 {
        u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
        int fshared = 0;
-       ktime_t t;
+       ktime_t t, *tp = NULL;
 
-       t.tv64 = restart->futex.time;
+       if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
+               t.tv64 = restart->futex.time;
+               tp = &t;
+       }
        restart->fn = do_no_restart_syscall;
        if (restart->futex.flags & FLAGS_SHARED)
                fshared = 1;
-       return (long)futex_wait(uaddr, fshared, restart->futex.val, &t,
+       return (long)futex_wait(uaddr, fshared, restart->futex.val, tp,
                                restart->futex.bitset,
                                restart->futex.flags & FLAGS_CLOCKRT);
 }
@@ -1312,11 +1806,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
                         int detect, ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
-       struct task_struct *curr = current;
        struct futex_hash_bucket *hb;
-       u32 uval, newval, curval;
+       u32 uval;
        struct futex_q q;
-       int ret, lock_taken, ownerdied = 0;
+       int res, ret;
 
        if (refill_pi_state_cache())
                return -ENOMEM;
@@ -1330,6 +1823,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
        }
 
        q.pi_state = NULL;
+       q.rt_waiter = NULL;
 retry:
        q.key = FUTEX_KEY_INIT;
        ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
@@ -1339,81 +1833,15 @@ retry:
 retry_private:
        hb = queue_lock(&q);
 
-retry_locked:
-       ret = lock_taken = 0;
-
-       /*
-        * To avoid races, we attempt to take the lock here again
-        * (by doing a 0 -> TID atomic cmpxchg), while holding all
-        * the locks. It will most likely not succeed.
-        */
-       newval = task_pid_vnr(current);
-
-       curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
-
-       if (unlikely(curval == -EFAULT))
-               goto uaddr_faulted;
-
-       /*
-        * Detect deadlocks. In case of REQUEUE_PI this is a valid
-        * situation and we return success to user space.
-        */
-       if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) {
-               ret = -EDEADLK;
-               goto out_unlock_put_key;
-       }
-
-       /*
-        * Surprise - we got the lock. Just return to userspace:
-        */
-       if (unlikely(!curval))
-               goto out_unlock_put_key;
-
-       uval = curval;
-
-       /*
-        * Set the WAITERS flag, so the owner will know it has someone
-        * to wake at next unlock
-        */
-       newval = curval | FUTEX_WAITERS;
-
-       /*
-        * There are two cases, where a futex might have no owner (the
-        * owner TID is 0): OWNER_DIED. We take over the futex in this
-        * case. We also do an unconditional take over, when the owner
-        * of the futex died.
-        *
-        * This is safe as we are protected by the hash bucket lock !
-        */
-       if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) {
-               /* Keep the OWNER_DIED bit */
-               newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current);
-               ownerdied = 0;
-               lock_taken = 1;
-       }
-
-       curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
-
-       if (unlikely(curval == -EFAULT))
-               goto uaddr_faulted;
-       if (unlikely(curval != uval))
-               goto retry_locked;
-
-       /*
-        * We took the lock due to owner died take over.
-        */
-       if (unlikely(lock_taken))
-               goto out_unlock_put_key;
-
-       /*
-        * We dont have the lock. Look up the PI state (or create it if
-        * we are the first waiter):
-        */
-       ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state);
-
+       ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
        if (unlikely(ret)) {
                switch (ret) {
-
+               case 1:
+                       /* We got the lock. */
+                       ret = 0;
+                       goto out_unlock_put_key;
+               case -EFAULT:
+                       goto uaddr_faulted;
                case -EAGAIN:
                        /*
                         * Task is exiting and we just wait for the
@@ -1423,25 +1851,6 @@ retry_locked:
                        put_futex_key(fshared, &q.key);
                        cond_resched();
                        goto retry;
-
-               case -ESRCH:
-                       /*
-                        * No owner found for this futex. Check if the
-                        * OWNER_DIED bit is set to figure out whether
-                        * this is a robust futex or not.
-                        */
-                       if (get_futex_value_locked(&curval, uaddr))
-                               goto uaddr_faulted;
-
-                       /*
-                        * We simply start over in case of a robust
-                        * futex. The code above will take the futex
-                        * and return happy.
-                        */
-                       if (curval & FUTEX_OWNER_DIED) {
-                               ownerdied = 1;
-                               goto retry_locked;
-                       }
                default:
                        goto out_unlock_put_key;
                }
@@ -1465,71 +1874,21 @@ retry_locked:
        }
 
        spin_lock(q.lock_ptr);
-
-       if (!ret) {
-               /*
-                * Got the lock. We might not be the anticipated owner
-                * if we did a lock-steal - fix up the PI-state in
-                * that case:
-                */
-               if (q.pi_state->owner != curr)
-                       ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
-       } else {
-               /*
-                * Catch the rare case, where the lock was released
-                * when we were on the way back before we locked the
-                * hash bucket.
-                */
-               if (q.pi_state->owner == curr) {
-                       /*
-                        * Try to get the rt_mutex now. This might
-                        * fail as some other task acquired the
-                        * rt_mutex after we removed ourself from the
-                        * rt_mutex waiters list.
-                        */
-                       if (rt_mutex_trylock(&q.pi_state->pi_mutex))
-                               ret = 0;
-                       else {
-                               /*
-                                * pi_state is incorrect, some other
-                                * task did a lock steal and we
-                                * returned due to timeout or signal
-                                * without taking the rt_mutex. Too
-                                * late. We can access the
-                                * rt_mutex_owner without locking, as
-                                * the other task is now blocked on
-                                * the hash bucket lock. Fix the state
-                                * up.
-                                */
-                               struct task_struct *owner;
-                               int res;
-
-                               owner = rt_mutex_owner(&q.pi_state->pi_mutex);
-                               res = fixup_pi_state_owner(uaddr, &q, owner,
-                                                          fshared);
-
-                               /* propagate -EFAULT, if the fixup failed */
-                               if (res)
-                                       ret = res;
-                       }
-               } else {
-                       /*
-                        * Paranoia check. If we did not take the lock
-                        * in the trylock above, then we should not be
-                        * the owner of the rtmutex, neither the real
-                        * nor the pending one:
-                        */
-                       if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr)
-                               printk(KERN_ERR "futex_lock_pi: ret = %d "
-                                      "pi-mutex: %p pi-state %p\n", ret,
-                                      q.pi_state->pi_mutex.owner,
-                                      q.pi_state->owner);
-               }
-       }
+       /*
+        * Fixup the pi_state owner and possibly acquire the lock if we
+        * haven't already.
+        */
+       res = fixup_owner(uaddr, fshared, &q, !ret);
+       /*
+        * If fixup_owner() returned an error, propagate that.  If it acquired
+        * the lock, clear our -ETIMEDOUT or -EINTR.
+        */
+       if (res)
+               ret = (res < 0) ? res : 0;
 
        /*
-        * If fixup_pi_state_owner() faulted and was unable to handle the
-        * fault, unlock it and return the fault to userspace.
+        * If fixup_owner() faulted and was unable to handle the fault, unlock
+        * it and return the fault to userspace.
         */
        if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
                rt_mutex_unlock(&q.pi_state->pi_mutex);
@@ -1537,9 +1896,7 @@ retry_locked:
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
 
-       if (to)
-               destroy_hrtimer_on_stack(&to->timer);
-       return ret != -EINTR ? ret : -ERESTARTNOINTR;
+       goto out;
 
 out_unlock_put_key:
        queue_unlock(&q, hb);
@@ -1549,7 +1906,7 @@ out_put_key:
 out:
        if (to)
                destroy_hrtimer_on_stack(&to->timer);
-       return ret;
+       return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
        /*
@@ -1572,7 +1929,6 @@ uaddr_faulted:
        goto retry;
 }
 
-
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1674,6 +2030,229 @@ pi_faulted:
        return ret;
 }
 
+/**
+ * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex
+ * @hb:                the hash_bucket futex_q was originally enqueued on
+ * @q:         the futex_q woken while waiting to be requeued
+ * @key2:      the futex_key of the requeue target futex
+ * @timeout:   the timeout associated with the wait (NULL if none)
+ *
+ * Detect if the task was woken on the initial futex as opposed to the requeue
+ * target futex.  If so, determine if it was a timeout or a signal that caused
+ * the wakeup and return the appropriate error code to the caller.  Must be
+ * called with the hb lock held.
+ *
+ * Returns:
+ *  0 - no early wakeup detected
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
+ */
+static inline
+int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
+                                  struct futex_q *q, union futex_key *key2,
+                                  struct hrtimer_sleeper *timeout)
+{
+       int ret = 0;
+
+       /*
+        * With the hb lock held, we avoid races while we process the wakeup.
+        * We only need to hold hb (and not hb2) to ensure atomicity as the
+        * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb.
+        * It can't be requeued from uaddr2 to something else since we don't
+        * support a PI aware source futex for requeue.
+        */
+       if (!match_futex(&q->key, key2)) {
+               WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr));
+               /*
+                * We were woken prior to requeue by a timeout or a signal.
+                * Unqueue the futex_q and determine which it was.
+                */
+               plist_del(&q->list, &q->list.plist);
+               drop_futex_key_refs(&q->key);
+
+               if (timeout && !timeout->task)
+                       ret = -ETIMEDOUT;
+               else
+                       ret = -ERESTARTNOINTR;
+       }
+       return ret;
+}
+
+/**
+ * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
+ * @uaddr:     the futex we initially wait on (non-pi)
+ * @fshared:   whether the futexes are shared (1) or not (0).  They must be
+ *             the same type, no requeueing from private to shared, etc.
+ * @val:       the expected value of uaddr
+ * @abs_time:  absolute timeout
+ * @bitset:    32 bit wakeup bitset set by userspace, defaults to all.
+ * @clockrt:   whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
+ * @uaddr2:    the pi futex we will take prior to returning to user-space
+ *
+ * The caller will wait on uaddr and will be requeued by futex_requeue() to
+ * uaddr2 which must be PI aware.  Normal wakeup will wake on uaddr2 and
+ * complete the acquisition of the rt_mutex prior to returning to userspace.
+ * This ensures the rt_mutex maintains an owner when it has waiters; without
+ * one, the pi logic wouldn't know which task to boost/deboost, if there was a
+ * need to.
+ *
+ * We call schedule in futex_wait_queue_me() when we enqueue and return there
+ * via the following:
+ * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
+ * 2) wakeup on uaddr2 after a requeue and subsequent unlock
+ * 3) signal (before or after requeue)
+ * 4) timeout (before or after requeue)
+ *
+ * If 3 occurs before the requeue, we return -ERESTARTNOINTR.
+ *
+ * If 2, we may then block on trying to take the rt_mutex and return via:
+ * 5) successful lock
+ * 6) signal
+ * 7) timeout
+ * 8) other lock acquisition failure
+ *
+ * If 6, we return -EWOULDBLOCK rather than restarting (see -EINTR below).
+ *
+ * If 4 or 7, we cleanup and return with -ETIMEDOUT.
+ *
+ * Returns:
+ *  0 - On success
+ * <0 - On error
+ */
+static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
+                                u32 val, ktime_t *abs_time, u32 bitset,
+                                int clockrt, u32 __user *uaddr2)
+{
+       struct hrtimer_sleeper timeout, *to = NULL;
+       struct rt_mutex_waiter rt_waiter;
+       struct rt_mutex *pi_mutex = NULL;
+       struct futex_hash_bucket *hb;
+       union futex_key key2;
+       struct futex_q q;
+       int res, ret;
+
+       if (!bitset)
+               return -EINVAL;
+
+       if (abs_time) {
+               to = &timeout;
+               hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME :
+                                     CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+               hrtimer_init_sleeper(to, current);
+               hrtimer_set_expires_range_ns(&to->timer, *abs_time,
+                                            current->timer_slack_ns);
+       }
+
+       /*
+        * The waiter is allocated on our stack, manipulated by the requeue
+        * code while we sleep on uaddr.
+        */
+       debug_rt_mutex_init_waiter(&rt_waiter);
+       rt_waiter.task = NULL;
+
+       q.pi_state = NULL;
+       q.bitset = bitset;
+       q.rt_waiter = &rt_waiter;
+
+       key2 = FUTEX_KEY_INIT;
+       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+       if (unlikely(ret != 0))
+               goto out;
+
+       /* Prepare to wait on uaddr. */
+       ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
+       if (ret)
+               goto out_key2;
+
+       /* Queue the futex_q, drop the hb lock, wait for wakeup. */
+       futex_wait_queue_me(hb, &q, to);
+
+       spin_lock(&hb->lock);
+       ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
+       spin_unlock(&hb->lock);
+       if (ret)
+               goto out_put_keys;
+
+       /*
+        * In order for us to be here, we know our q.key == key2, and since
+        * we took the hb->lock above, we also know that futex_requeue() has
+        * completed and we no longer have to concern ourselves with a wakeup
+        * race with the atomic proxy lock acquisition by the requeue code.
+        */
+
+       /* Check if the requeue code acquired the second futex for us. */
+       if (!q.rt_waiter) {
+               /*
+                * Got the lock. We might not be the anticipated owner if we
+                * did a lock-steal - fix up the PI-state in that case.
+                */
+               if (q.pi_state && (q.pi_state->owner != current)) {
+                       spin_lock(q.lock_ptr);
+                       ret = fixup_pi_state_owner(uaddr2, &q, current,
+                                                  fshared);
+                       spin_unlock(q.lock_ptr);
+               }
+       } else {
+               /*
+                * We have been woken up by futex_unlock_pi(), a timeout, or a
+                * signal.  futex_unlock_pi() will not destroy the lock_ptr nor
+                * the pi_state.
+                */
+               WARN_ON(!q.pi_state);
+               pi_mutex = &q.pi_state->pi_mutex;
+               ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1);
+               debug_rt_mutex_free_waiter(&rt_waiter);
+
+               spin_lock(q.lock_ptr);
+               /*
+                * Fixup the pi_state owner and possibly acquire the lock if we
+                * haven't already.
+                */
+               res = fixup_owner(uaddr2, fshared, &q, !ret);
+               /*
+                * If fixup_owner() returned an error, propagate that.  If it
+                * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+                */
+               if (res)
+                       ret = (res < 0) ? res : 0;
+
+               /* Unqueue and drop the lock. */
+               unqueue_me_pi(&q);
+       }
+
+       /*
+        * If fixup_pi_state_owner() faulted, drop the rt_mutex if we hold it
+        * and return the fault; pi_mutex is NULL on the atomic-acquire path.
+        */
+       if (ret == -EFAULT) {
+               if (pi_mutex && rt_mutex_owner(pi_mutex) == current)
+                       rt_mutex_unlock(pi_mutex);
+       } else if (ret == -EINTR) {
+               /*
+                * We've already been requeued, but we have no way to
+                * restart by calling futex_lock_pi() directly. We
+                * could restart the syscall, but that will look at
+                * the user space value and return right away. So we
+                * drop back with EWOULDBLOCK to tell user space that
+                * "val" has been changed. That's the same what the
+                * restart of the syscall would do in
+                * futex_wait_setup().
+                */
+               ret = -EWOULDBLOCK;
+       }
+
+out_put_keys:
+       put_futex_key(fshared, &q.key);
+out_key2:
+       put_futex_key(fshared, &key2);
+
+out:
+       if (to) {
+               hrtimer_cancel(&to->timer);
+               destroy_hrtimer_on_stack(&to->timer);
+       }
+       return ret;
+}
+
 /*
  * Support for robust futexes: the kernel cleans up held futexes at
  * thread exit time.
@@ -1896,7 +2475,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                fshared = 1;
 
        clockrt = op & FUTEX_CLOCK_REALTIME;
-       if (clockrt && cmd != FUTEX_WAIT_BITSET)
+       if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
                return -ENOSYS;
 
        switch (cmd) {
@@ -1911,10 +2490,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                ret = futex_wake(uaddr, fshared, val, val3);
                break;
        case FUTEX_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0);
                break;
        case FUTEX_CMP_REQUEUE:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3);
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                   0);
                break;
        case FUTEX_WAKE_OP:
                ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3);
@@ -1931,6 +2511,15 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                if (futex_cmpxchg_enabled)
                        ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1);
                break;
+       case FUTEX_WAIT_REQUEUE_PI:
+               val3 = FUTEX_BITSET_MATCH_ANY;
+               ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
+                                           clockrt, uaddr2);
+               break;
+       case FUTEX_CMP_REQUEUE_PI:
+               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
+                                   1);
+               break;
        default:
                ret = -ENOSYS;
        }
@@ -1948,7 +2537,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
        int cmd = op & FUTEX_CMD_MASK;
 
        if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI ||
-                     cmd == FUTEX_WAIT_BITSET)) {
+                     cmd == FUTEX_WAIT_BITSET ||
+                     cmd == FUTEX_WAIT_REQUEUE_PI)) {
                if (copy_from_user(&ts, utime, sizeof(ts)) != 0)
                        return -EFAULT;
                if (!timespec_valid(&ts))
@@ -1960,11 +2550,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
                tp = &t;
        }
        /*
-        * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE.
+        * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*.
         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
         */
        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-           cmd == FUTEX_WAKE_OP)
+           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
                val2 = (u32) (unsigned long) utime;
 
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
index 3394f8f52964b3d9b4433a3560ff94eb32b20cb8..7d047808419da88e273fba8cd1efd88d9aaa5bcd 100644 (file)
@@ -3,5 +3,5 @@ obj-y := handle.o manage.o spurious.o resend.o chip.o devres.o
 obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
-obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_NUMA_IRQ_DESC) += numa_migrate.o
 obj-$(CONFIG_PM_SLEEP) += pm.o
index c687ba4363f2b4a95a5c3988998286a1e8ab699b..13c68e71b726c674619633de2f6e200b00adc039 100644 (file)
@@ -359,7 +359,6 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
 
        spin_lock(&desc->lock);
        mask_ack_irq(desc, irq);
-       desc = irq_remap_to_desc(irq, desc);
 
        if (unlikely(desc->status & IRQ_INPROGRESS))
                goto out_unlock;
@@ -438,7 +437,6 @@ handle_fasteoi_irq(unsigned int irq, struct irq_desc *desc)
        desc->status &= ~IRQ_INPROGRESS;
 out:
        desc->chip->eoi(irq);
-       desc = irq_remap_to_desc(irq, desc);
 
        spin_unlock(&desc->lock);
 }
@@ -475,7 +473,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
                    !desc->action)) {
                desc->status |= (IRQ_PENDING | IRQ_MASKED);
                mask_ack_irq(desc, irq);
-               desc = irq_remap_to_desc(irq, desc);
                goto out_unlock;
        }
        kstat_incr_irqs_this_cpu(irq, desc);
@@ -483,7 +480,6 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
        /* Start handling the irq */
        if (desc->chip->ack)
                desc->chip->ack(irq);
-       desc = irq_remap_to_desc(irq, desc);
 
        /* Mark the IRQ currently in progress.*/
        desc->status |= IRQ_INPROGRESS;
@@ -544,10 +540,8 @@ handle_percpu_irq(unsigned int irq, struct irq_desc *desc)
        if (!noirqdebug)
                note_interrupt(irq, desc, action_ret);
 
-       if (desc->chip->eoi) {
+       if (desc->chip->eoi)
                desc->chip->eoi(irq);
-               desc = irq_remap_to_desc(irq, desc);
-       }
 }
 
 void
@@ -582,10 +576,8 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
 
        /* Uninstall? */
        if (handle == handle_bad_irq) {
-               if (desc->chip != &no_irq_chip) {
+               if (desc->chip != &no_irq_chip)
                        mask_ack_irq(desc, irq);
-                       desc = irq_remap_to_desc(irq, desc);
-               }
                desc->status |= IRQ_DISABLED;
                desc->depth = 1;
        }
index 26e08754744fa13dd9da3e919be6adb3976b2b04..a60018402f4228b46d5870ee93f9e86ae417137a 100644 (file)
  */
 
 #include <linux/irq.h>
+#include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
-#include <trace/irq.h>
 #include <linux/bootmem.h>
+#include <trace/events/irq.h>
 
 #include "internals.h"
 
@@ -81,45 +82,48 @@ static struct irq_desc irq_desc_init = {
        .lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
 };
 
-void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
+void __ref init_kstat_irqs(struct irq_desc *desc, int node, int nr)
 {
-       int node;
        void *ptr;
 
-       node = cpu_to_node(cpu);
-       ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
+       if (slab_is_available())
+               ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs),
+                                  GFP_ATOMIC, node);
+       else
+               ptr = alloc_bootmem_node(NODE_DATA(node),
+                               nr * sizeof(*desc->kstat_irqs));
 
        /*
         * don't overwite if can not get new one
         * init_copy_kstat_irqs() could still use old one
         */
        if (ptr) {
-               printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
-                        cpu, node);
+               printk(KERN_DEBUG "  alloc kstat_irqs on node %d\n", node);
                desc->kstat_irqs = ptr;
        }
 }
 
-static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
+static void init_one_irq_desc(int irq, struct irq_desc *desc, int node)
 {
        memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
        spin_lock_init(&desc->lock);
        desc->irq = irq;
 #ifdef CONFIG_SMP
-       desc->cpu = cpu;
+       desc->node = node;
 #endif
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_kstat_irqs(desc, cpu, nr_cpu_ids);
+       init_kstat_irqs(desc, node, nr_cpu_ids);
        if (!desc->kstat_irqs) {
                printk(KERN_ERR "can not alloc kstat_irqs\n");
                BUG_ON(1);
        }
-       if (!init_alloc_desc_masks(desc, cpu, false)) {
+       if (!alloc_desc_masks(desc, node, false)) {
                printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
                BUG_ON(1);
        }
-       arch_init_chip_data(desc, cpu);
+       init_desc_masks(desc);
+       arch_init_chip_data(desc, node);
 }
 
 /*
@@ -169,7 +173,8 @@ int __init early_irq_init(void)
                desc[i].irq = i;
                desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
                lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-               init_alloc_desc_masks(&desc[i], 0, true);
+               alloc_desc_masks(&desc[i], 0, true);
+               init_desc_masks(&desc[i]);
                irq_desc_ptrs[i] = desc + i;
        }
 
@@ -187,11 +192,10 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc * __ref irq_to_desc_alloc_node(unsigned int irq, int node)
 {
        struct irq_desc *desc;
        unsigned long flags;
-       int node;
 
        if (irq >= nr_irqs) {
                WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
@@ -210,15 +214,17 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
        if (desc)
                goto out_unlock;
 
-       node = cpu_to_node(cpu);
-       desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
-       printk(KERN_DEBUG "  alloc irq_desc for %d on cpu %d node %d\n",
-                irq, cpu, node);
+       if (slab_is_available())
+               desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
+       else
+               desc = alloc_bootmem_node(NODE_DATA(node), sizeof(*desc));
+
+       printk(KERN_DEBUG "  alloc irq_desc for %d on node %d\n", irq, node);
        if (!desc) {
                printk(KERN_ERR "can not alloc irq_desc\n");
                BUG_ON(1);
        }
-       init_one_irq_desc(irq, desc, cpu);
+       init_one_irq_desc(irq, desc, node);
 
        irq_desc_ptrs[irq] = desc;
 
@@ -256,7 +262,8 @@ int __init early_irq_init(void)
 
        for (i = 0; i < count; i++) {
                desc[i].irq = i;
-               init_alloc_desc_masks(&desc[i], 0, true);
+               alloc_desc_masks(&desc[i], 0, true);
+               init_desc_masks(&desc[i]);
                desc[i].kstat_irqs = kstat_irqs_all[i];
        }
        return arch_early_irq_init();
@@ -267,7 +274,7 @@ struct irq_desc *irq_to_desc(unsigned int irq)
        return (irq < NR_IRQS) ? irq_desc + irq : NULL;
 }
 
-struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
+struct irq_desc *irq_to_desc_alloc_node(unsigned int irq, int node)
 {
        return irq_to_desc(irq);
 }
@@ -348,9 +355,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action)
               "but no thread function available.", irq, action->name);
 }
 
-DEFINE_TRACE(irq_handler_entry);
-DEFINE_TRACE(irq_handler_exit);
-
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:       the interrupt number
@@ -453,11 +457,8 @@ unsigned int __do_IRQ(unsigned int irq)
                /*
                 * No locking required for CPU-local interrupts:
                 */
-               if (desc->chip->ack) {
+               if (desc->chip->ack)
                        desc->chip->ack(irq);
-                       /* get new one */
-                       desc = irq_remap_to_desc(irq, desc);
-               }
                if (likely(!(desc->status & IRQ_DISABLED))) {
                        action_ret = handle_IRQ_event(irq, desc->action);
                        if (!noirqdebug)
@@ -468,10 +469,8 @@ unsigned int __do_IRQ(unsigned int irq)
        }
 
        spin_lock(&desc->lock);
-       if (desc->chip->ack) {
+       if (desc->chip->ack)
                desc->chip->ack(irq);
-               desc = irq_remap_to_desc(irq, desc);
-       }
        /*
         * REPLAY is when Linux resends an IRQ that was dropped earlier
         * WAITING is used by probe to mark irqs that are being tested
index 01ce20eab38fed96a7b5a861940ba58d47c18fbf..73468253143ba55883072d1020791c11b1e30210 100644 (file)
@@ -16,7 +16,7 @@ extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
 extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern struct lock_class_key irq_desc_lock_class;
-extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void init_kstat_irqs(struct irq_desc *desc, int node, int nr);
 extern void clear_kstat_irqs(struct irq_desc *desc);
 extern spinlock_t sparse_irq_lock;
 
@@ -42,6 +42,9 @@ static inline void unregister_handler_proc(unsigned int irq,
 
 extern int irq_select_affinity_usr(unsigned int irq);
 
+extern void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask);
+
 /*
  * Debugging printout:
  */
index 2734eca59243bd8b0783b3553622363b2ce8129e..aaf5c9d05770378b42acc2d41905ceb543f1f280 100644 (file)
@@ -80,7 +80,7 @@ int irq_can_set_affinity(unsigned int irq)
        return 1;
 }
 
-static void
+void
 irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
 {
        struct irqaction *action = desc->action;
@@ -109,17 +109,22 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
        spin_lock_irqsave(&desc->lock, flags);
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-       if (desc->status & IRQ_MOVE_PCNTXT)
-               desc->chip->set_affinity(irq, cpumask);
+       if (desc->status & IRQ_MOVE_PCNTXT) {
+               if (!desc->chip->set_affinity(irq, cpumask)) {
+                       cpumask_copy(desc->affinity, cpumask);
+                       irq_set_thread_affinity(desc, cpumask);
+               }
+       }
        else {
                desc->status |= IRQ_MOVE_PENDING;
                cpumask_copy(desc->pending_mask, cpumask);
        }
 #else
-       cpumask_copy(desc->affinity, cpumask);
-       desc->chip->set_affinity(irq, cpumask);
+       if (!desc->chip->set_affinity(irq, cpumask)) {
+               cpumask_copy(desc->affinity, cpumask);
+               irq_set_thread_affinity(desc, cpumask);
+       }
 #endif
-       irq_set_thread_affinity(desc, cpumask);
        desc->status |= IRQ_AFFINITY_SET;
        spin_unlock_irqrestore(&desc->lock, flags);
        return 0;
index e05ad9be43b7a5fa7e6d6a24f94e49828a73572a..cfe767ca154501460f9f7e757c08d881611e9c95 100644 (file)
@@ -1,5 +1,8 @@
 
 #include <linux/irq.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
 
 void move_masked_irq(int irq)
 {
@@ -39,11 +42,12 @@ void move_masked_irq(int irq)
         * masking the irqs.
         */
        if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
-                  < nr_cpu_ids)) {
-               cpumask_and(desc->affinity,
-                           desc->pending_mask, cpu_online_mask);
-               desc->chip->set_affinity(irq, desc->affinity);
-       }
+                  < nr_cpu_ids))
+               if (!desc->chip->set_affinity(irq, desc->pending_mask)) {
+                       cpumask_copy(desc->affinity, desc->pending_mask);
+                       irq_set_thread_affinity(desc, desc->pending_mask);
+               }
+
        cpumask_clear(desc->pending_mask);
 }
 
index 44bbdcbaf8d2c8193f2d631b54061047546d0065..2f69bee57bf21ca5a42027834ddccee96670d989 100644 (file)
@@ -15,9 +15,9 @@
 
 static void init_copy_kstat_irqs(struct irq_desc *old_desc,
                                 struct irq_desc *desc,
-                                int cpu, int nr)
+                                int node, int nr)
 {
-       init_kstat_irqs(desc, cpu, nr);
+       init_kstat_irqs(desc, node, nr);
 
        if (desc->kstat_irqs != old_desc->kstat_irqs)
                memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
@@ -34,20 +34,20 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
-                struct irq_desc *desc, int cpu)
+                struct irq_desc *desc, int node)
 {
        memcpy(desc, old_desc, sizeof(struct irq_desc));
-       if (!init_alloc_desc_masks(desc, cpu, false)) {
+       if (!alloc_desc_masks(desc, node, false)) {
                printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
                                "for migration.\n", irq);
                return false;
        }
        spin_lock_init(&desc->lock);
-       desc->cpu = cpu;
+       desc->node = node;
        lockdep_set_class(&desc->lock, &irq_desc_lock_class);
-       init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+       init_copy_kstat_irqs(old_desc, desc, node, nr_cpu_ids);
        init_copy_desc_masks(old_desc, desc);
-       arch_init_copy_chip_data(old_desc, desc, cpu);
+       arch_init_copy_chip_data(old_desc, desc, node);
        return true;
 }
 
@@ -59,12 +59,11 @@ static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
 }
 
 static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
-                                               int cpu)
+                                               int node)
 {
        struct irq_desc *desc;
        unsigned int irq;
        unsigned long flags;
-       int node;
 
        irq = old_desc->irq;
 
@@ -76,7 +75,6 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
        if (desc && old_desc != desc)
                goto out_unlock;
 
-       node = cpu_to_node(cpu);
        desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
        if (!desc) {
                printk(KERN_ERR "irq %d: can not get new irq_desc "
@@ -85,7 +83,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
                desc = old_desc;
                goto out_unlock;
        }
-       if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
+       if (!init_copy_one_irq_desc(irq, old_desc, desc, node)) {
                /* still use old one */
                kfree(desc);
                desc = old_desc;
@@ -97,9 +95,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 
        /* free the old one */
        free_one_irq_desc(old_desc, desc);
-       spin_unlock(&old_desc->lock);
        kfree(old_desc);
-       spin_lock(&desc->lock);
 
        return desc;
 
@@ -109,24 +105,14 @@ out_unlock:
        return desc;
 }
 
-struct irq_desc *move_irq_desc(struct irq_desc *desc, int cpu)
+struct irq_desc *move_irq_desc(struct irq_desc *desc, int node)
 {
-       int old_cpu;
-       int node, old_node;
-
        /* those all static, do move them */
        if (desc->irq < NR_IRQS_LEGACY)
                return desc;
 
-       old_cpu = desc->cpu;
-       if (old_cpu != cpu) {
-               node = cpu_to_node(cpu);
-               old_node = cpu_to_node(old_cpu);
-               if (old_node != node)
-                       desc = __real_move_irq_desc(desc, cpu);
-               else
-                       desc->cpu = cpu;
-       }
+       if (desc->node != node)
+               desc = __real_move_irq_desc(desc, node);
 
        return desc;
 }
index 4ebaf8519abf64fb0cc4e93ef42d7ad0cf286c9f..41c88fe40500399c76f0382d51a3fb05aef03211 100644 (file)
@@ -13,7 +13,7 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #define KTHREAD_NICE_LEVEL (-5)
 
@@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock);
 static LIST_HEAD(kthread_create_list);
 struct task_struct *kthreadd_task;
 
-DEFINE_TRACE(sched_kthread_stop);
-DEFINE_TRACE(sched_kthread_stop_ret);
-
 struct kthread_create_info
 {
        /* Information passed to kthread() from kthreadd. */
index accb40cdb12a4cd0c3d5fd880c706d986568557d..8bbeef996c76598b7e45e3b48186fc18c03b5be0 100644 (file)
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
-#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
 #include "lockdep_internals.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/lockdep.h>
+
 #ifdef CONFIG_PROVE_LOCKING
 int prove_locking = 1;
 module_param(prove_locking, int, 0644);
@@ -2935,8 +2937,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
-DEFINE_TRACE(lock_acquire);
-
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2963,8 +2963,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
-DEFINE_TRACE(lock_release);
-
 void lock_release(struct lockdep_map *lock, int nested,
                          unsigned long ip)
 {
@@ -3105,6 +3103,8 @@ found_it:
                hlock->holdtime_stamp = now;
        }
 
+       trace_lock_acquired(lock, ip, waittime);
+
        stats = get_lock_stats(hlock_class(hlock));
        if (waittime) {
                if (hlock->read)
@@ -3120,8 +3120,6 @@ found_it:
        lock->ip = ip;
 }
 
-DEFINE_TRACE(lock_contended);
-
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
        unsigned long flags;
@@ -3143,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
-DEFINE_TRACE(lock_acquired);
-
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
        unsigned long flags;
 
-       trace_lock_acquired(lock, ip);
-
        if (unlikely(!lock_stat))
                return;
 
index e797812a4d95f164bb377447a62de3089c0ba182..278e9b6762bb703c1f2c82809e1ad311db900c76 100644 (file)
@@ -18,6 +18,7 @@
 */
 #include <linux/module.h>
 #include <linux/moduleloader.h>
+#include <linux/ftrace_event.h>
 #include <linux/init.h>
 #include <linux/kallsyms.h>
 #include <linux/fs.h>
@@ -72,6 +73,9 @@ DEFINE_MUTEX(module_mutex);
 EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
 
+/* Block module loading/unloading? */
+int modules_disabled = 0;
+
 /* Waiting for a module to finish initializing? */
 static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
@@ -777,7 +781,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
        char name[MODULE_NAME_LEN];
        int ret, forced = 0;
 
-       if (!capable(CAP_SYS_MODULE))
+       if (!capable(CAP_SYS_MODULE) || modules_disabled)
                return -EPERM;
 
        if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0)
@@ -1489,9 +1493,6 @@ static void free_module(struct module *mod)
        /* Free any allocated parameters. */
        destroy_params(mod->kp, mod->num_kp);
 
-       /* release any pointers to mcount in this module */
-       ftrace_release(mod->module_core, mod->core_size);
-
        /* This may be NULL, but that's OK */
        module_free(mod, mod->module_init);
        kfree(mod->args);
@@ -1892,11 +1893,9 @@ static noinline struct module *load_module(void __user *umod,
        unsigned int symindex = 0;
        unsigned int strindex = 0;
        unsigned int modindex, versindex, infoindex, pcpuindex;
-       unsigned int num_mcount;
        struct module *mod;
        long err = 0;
        void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-       unsigned long *mseg;
        mm_segment_t old_fs;
 
        DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
@@ -2172,7 +2171,19 @@ static noinline struct module *load_module(void __user *umod,
                                        sizeof(*mod->tracepoints),
                                        &mod->num_tracepoints);
 #endif
-
+#ifdef CONFIG_EVENT_TRACING
+       mod->trace_events = section_objs(hdr, sechdrs, secstrings,
+                                        "_ftrace_events",
+                                        sizeof(*mod->trace_events),
+                                        &mod->num_trace_events);
+#endif
+#ifdef CONFIG_FTRACE_MCOUNT_RECORD
+       /* sechdrs[0].sh_size is always zero */
+       mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings,
+                                            "__mcount_loc",
+                                            sizeof(*mod->ftrace_callsites),
+                                            &mod->num_ftrace_callsites);
+#endif
 #ifdef CONFIG_MODVERSIONS
        if ((mod->num_syms && !mod->crcs)
            || (mod->num_gpl_syms && !mod->gpl_crcs)
@@ -2237,11 +2248,6 @@ static noinline struct module *load_module(void __user *umod,
                        dynamic_debug_setup(debug, num_debug);
        }
 
-       /* sechdrs[0].sh_size is always zero */
-       mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc",
-                           sizeof(*mseg), &num_mcount);
-       ftrace_init_module(mod, mseg, mseg + num_mcount);
-
        err = module_finalize(hdr, sechdrs, mod);
        if (err < 0)
                goto cleanup;
@@ -2302,7 +2308,6 @@ static noinline struct module *load_module(void __user *umod,
  cleanup:
        kobject_del(&mod->mkobj.kobj);
        kobject_put(&mod->mkobj.kobj);
-       ftrace_release(mod->module_core, mod->core_size);
  free_unload:
        module_unload_free(mod);
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
@@ -2336,7 +2341,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod,
        int ret = 0;
 
        /* Must have permission */
-       if (!capable(CAP_SYS_MODULE))
+       if (!capable(CAP_SYS_MODULE) || modules_disabled)
                return -EPERM;
 
        /* Only one module load at a time, please */
index 507cf2b5e9f1e6328b2e335a3a05c0ead6a0ff3e..e5cc0cd28d541eb64526b4f7d68c9dca29eaa571 100644 (file)
@@ -249,7 +249,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass,
 
                /* didnt get the lock, go to sleep: */
                spin_unlock_mutex(&lock->wait_lock, flags);
-               __schedule();
+               preempt_enable_no_resched();
+               schedule();
+               preempt_disable();
                spin_lock_mutex(&lock->wait_lock, flags);
        }
 
@@ -471,5 +473,28 @@ int __sched mutex_trylock(struct mutex *lock)
 
        return ret;
 }
-
 EXPORT_SYMBOL(mutex_trylock);
+
+/**
+ * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0
+ * @cnt: the atomic which we are to dec
+ * @lock: the mutex to return holding if we dec to 0
+ *
+ * return true and hold lock if we dec to 0, return false otherwise
+ */
+int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock)
+{
+       /* dec if we can't possibly hit 0 */
+       if (atomic_add_unless(cnt, -1, 1))
+               return 0;
+       /* we might hit 0, so take the lock */
+       mutex_lock(lock);
+       if (!atomic_dec_and_test(cnt)) {
+               /* when we actually did the dec, we didn't hit 0 */
+               mutex_unlock(lock);
+               return 0;
+       }
+       /* we hit 0, and we hold the lock */
+       return 1;
+}
+EXPORT_SYMBOL(atomic_dec_and_mutex_lock);
index 42c317874cfa8d2703c13f3c44757a6f397459ac..f6d8b8cb5e34b64e23227314b0ab7400abe28287 100644 (file)
 #include <linux/uaccess.h>
 
 
-/*
- * Initialize a new task whose father had been ptraced.
- *
- * Called from copy_process().
- */
-void ptrace_fork(struct task_struct *child, unsigned long clone_flags)
-{
-       arch_ptrace_fork(child, clone_flags);
-}
-
 /*
  * ptrace a task: make the debugger its new parent and
  * move it to the ptrace list.
@@ -185,10 +175,11 @@ int ptrace_attach(struct task_struct *task)
        if (same_thread_group(task, current))
                goto out;
 
-       /* Protect exec's credential calculations against our interference;
-        * SUID, SGID and LSM creds get determined differently under ptrace.
+       /* Protect the target's credential calculations against our
+        * interference; SUID, SGID and LSM creds get determined differently
+        * under ptrace.
         */
-       retval = mutex_lock_interruptible(&task->cred_exec_mutex);
+       retval = mutex_lock_interruptible(&task->cred_guard_mutex);
        if (retval  < 0)
                goto out;
 
@@ -232,7 +223,7 @@ repeat:
 bad:
        write_unlock_irqrestore(&tasklist_lock, flags);
        task_unlock(task);
-       mutex_unlock(&task->cred_exec_mutex);
+       mutex_unlock(&task->cred_guard_mutex);
 out:
        return retval;
 }
index ce97a4df64d3539edc785ff9db119c3ba3edbe07..beb0e659adcc60be60bcc1efd2c6e9ca45367afe 100644 (file)
@@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg)
 
                rcu_ctrlblk.sched_sleep = rcu_sched_sleeping;
                spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags);
-               ret = 0;
+               ret = 0; /* unused */
                __wait_event_interruptible(rcu_ctrlblk.sched_wq,
                        rcu_ctrlblk.sched_sleep != rcu_sched_sleeping,
                        ret);
 
-               /*
-                * Signals would prevent us from sleeping, and we cannot
-                * do much with them in any case.  So flush them.
-                */
-               if (ret)
-                       flush_signals(current);
                couldsleepnext = 0;
 
        } while (!kthread_should_stop());
index d2a372fb0b9b511cfe9ebdc1f5431f2ec5faafcd..0dccfbba6d267ad59b62314d665a3a91b3104252 100644 (file)
@@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
        check_cpu_stall(rsp, rdp);
 
        /* Is the RCU core waiting for a quiescent state from this CPU? */
-       if (rdp->qs_pending)
+       if (rdp->qs_pending) {
+               rdp->n_rp_qs_pending++;
                return 1;
+       }
 
        /* Does this CPU have callbacks ready to invoke? */
-       if (cpu_has_callbacks_ready_to_invoke(rdp))
+       if (cpu_has_callbacks_ready_to_invoke(rdp)) {
+               rdp->n_rp_cb_ready++;
                return 1;
+       }
 
        /* Has RCU gone idle with this CPU needing another grace period? */
-       if (cpu_needs_another_gp(rsp, rdp))
+       if (cpu_needs_another_gp(rsp, rdp)) {
+               rdp->n_rp_cpu_needs_gp++;
                return 1;
+       }
 
        /* Has another RCU grace period completed?  */
-       if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */
+       if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */
+               rdp->n_rp_gp_completed++;
                return 1;
+       }
 
        /* Has a new RCU grace period started? */
-       if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */
+       if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */
+               rdp->n_rp_gp_started++;
                return 1;
+       }
 
        /* Has an RCU GP gone long enough to send resched IPIs &c? */
        if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
-           ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0))
+           ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
+               rdp->n_rp_need_fqs++;
                return 1;
+       }
 
        /* nothing to do */
+       rdp->n_rp_need_nothing++;
        return 0;
 }
 
index 4b1875ba94044216250d938713ddf76ac2280f1c..fe1dcdbf1ca340558191a02ee408fea17e04fadd 100644 (file)
@@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = {
        .release = single_release,
 };
 
-static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir;
+/*
+ * Emit one line of rcu_pending statistics for the CPU described by @rdp.
+ *
+ * The line starts with the CPU number, followed by '!' when that CPU is
+ * currently offline (a space otherwise).  The remaining fields map to
+ * rcu_data counters in argument order:
+ *   np  - n_rcu_pending        qsp - n_rp_qs_pending
+ *   cbr - n_rp_cb_ready        cng - n_rp_cpu_needs_gp
+ *   gpc - n_rp_gp_completed    gps - n_rp_gp_started
+ *   nf  - n_rp_need_fqs        nn  - n_rp_need_nothing
+ */
+static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp)
+{
+       seq_printf(m, "%3d%cnp=%ld "
+                  "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n",
+                  rdp->cpu,
+                  cpu_is_offline(rdp->cpu) ? '!' : ' ',
+                  rdp->n_rcu_pending,
+                  rdp->n_rp_qs_pending,
+                  rdp->n_rp_cb_ready,
+                  rdp->n_rp_cpu_needs_gp,
+                  rdp->n_rp_gp_completed,
+                  rdp->n_rp_gp_started,
+                  rdp->n_rp_need_fqs,
+                  rdp->n_rp_need_nothing);
+}
+
+/*
+ * Print the rcu_pending statistics of the RCU flavor @rsp, one line per CPU.
+ * All possible CPUs are visited, but only those whose rcu_data has
+ * ->beenonline set are printed.
+ */
+static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp)
+{
+       int cpu;
+       struct rcu_data *rdp;
+
+       for_each_possible_cpu(cpu) {
+               rdp = rsp->rda[cpu];
+               if (rdp->beenonline)
+                       print_one_rcu_pending(m, rdp);
+       }
+}
+
+/*
+ * seq_file show routine for the rcu_pending file: dumps the per-CPU
+ * statistics of rcu_state under an "rcu:" heading, then those of
+ * rcu_bh_state under "rcu_bh:".  Always returns 0.
+ */
+static int show_rcu_pending(struct seq_file *m, void *unused)
+{
+       seq_puts(m, "rcu:\n");
+       print_rcu_pendings(m, &rcu_state);
+       seq_puts(m, "rcu_bh:\n");
+       print_rcu_pendings(m, &rcu_bh_state);
+       return 0;
+}
+
+/*
+ * open() handler: attaches show_rcu_pending() to @file through
+ * single_open(), passing no private data.
+ */
+static int rcu_pending_open(struct inode *inode, struct file *file)
+{
+       return single_open(file, show_rcu_pending, NULL);
+}
+
+/*
+ * File operations for the read-only "rcu_pending" debugfs file; reads and
+ * seeks are served by the seq_file helpers.
+ */
+static struct file_operations rcu_pending_fops = {
+       .owner = THIS_MODULE,
+       .open = rcu_pending_open,
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .release = single_release,
+};
+
+static struct dentry *rcudir;
+static struct dentry *datadir;
+static struct dentry *datadir_csv;
+static struct dentry *gpdir;
+static struct dentry *hierdir;
+static struct dentry *rcu_pendingdir;
+
 static int __init rcuclassic_trace_init(void)
 {
        rcudir = debugfs_create_dir("rcu", NULL);
@@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void)
                                                NULL, &rcuhier_fops);
        if (!hierdir)
                goto free_out;
+
+       rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir,
+                                               NULL, &rcu_pending_fops);
+       if (!rcu_pendingdir)
+               goto free_out;
        return 0;
 free_out:
        if (datadir)
@@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void)
        debugfs_remove(datadir_csv);
        debugfs_remove(gpdir);
        debugfs_remove(hierdir);
+       debugfs_remove(rcu_pendingdir);
        debugfs_remove(rcudir);
 }
 
index 69d9cb921ffa657ef6939be6dcfb053570fb019f..820c5af44f3ec6472db64bf32daeccba458ab7d7 100644 (file)
@@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task,
  * assigned pending owner [which might not have taken the
  * lock yet]:
  */
-static inline int try_to_steal_lock(struct rt_mutex *lock)
+static inline int try_to_steal_lock(struct rt_mutex *lock,
+                                   struct task_struct *task)
 {
        struct task_struct *pendowner = rt_mutex_owner(lock);
        struct rt_mutex_waiter *next;
@@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
        if (!rt_mutex_owner_pending(lock))
                return 0;
 
-       if (pendowner == current)
+       if (pendowner == task)
                return 1;
 
        spin_lock_irqsave(&pendowner->pi_lock, flags);
-       if (current->prio >= pendowner->prio) {
+       if (task->prio >= pendowner->prio) {
                spin_unlock_irqrestore(&pendowner->pi_lock, flags);
                return 0;
        }
@@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock)
         * We are going to steal the lock and a waiter was
         * enqueued on the pending owners pi_waiters queue. So
         * we have to enqueue this waiter into
-        * current->pi_waiters list. This covers the case,
-        * where current is boosted because it holds another
+        * task->pi_waiters list. This covers the case,
+        * where task is boosted because it holds another
         * lock and gets unboosted because the booster is
         * interrupted, so we would delay a waiter with higher
-        * priority as current->normal_prio.
+        * priority than task->normal_prio.
         *
         * Note: in the rare case of a SCHED_OTHER task changing
         * its priority and thus stealing the lock, next->task
-        * might be current:
+        * might be task:
         */
-       if (likely(next->task != current)) {
-               spin_lock_irqsave(&current->pi_lock, flags);
-               plist_add(&next->pi_list_entry, &current->pi_waiters);
-               __rt_mutex_adjust_prio(current);
-               spin_unlock_irqrestore(&current->pi_lock, flags);
+       if (likely(next->task != task)) {
+               spin_lock_irqsave(&task->pi_lock, flags);
+               plist_add(&next->pi_list_entry, &task->pi_waiters);
+               __rt_mutex_adjust_prio(task);
+               spin_unlock_irqrestore(&task->pi_lock, flags);
        }
        return 1;
 }
@@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
         */
        mark_rt_mutex_waiters(lock);
 
-       if (rt_mutex_owner(lock) && !try_to_steal_lock(lock))
+       if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current))
                return 0;
 
        /* We got the lock. */
@@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock)
  */
 static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
                                   struct rt_mutex_waiter *waiter,
+                                  struct task_struct *task,
                                   int detect_deadlock)
 {
        struct task_struct *owner = rt_mutex_owner(lock);
@@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        unsigned long flags;
        int chain_walk = 0, res;
 
-       spin_lock_irqsave(&current->pi_lock, flags);
-       __rt_mutex_adjust_prio(current);
-       waiter->task = current;
+       spin_lock_irqsave(&task->pi_lock, flags);
+       __rt_mutex_adjust_prio(task);
+       waiter->task = task;
        waiter->lock = lock;
-       plist_node_init(&waiter->list_entry, current->prio);
-       plist_node_init(&waiter->pi_list_entry, current->prio);
+       plist_node_init(&waiter->list_entry, task->prio);
+       plist_node_init(&waiter->pi_list_entry, task->prio);
 
        /* Get the top priority waiter on the lock */
        if (rt_mutex_has_waiters(lock))
                top_waiter = rt_mutex_top_waiter(lock);
        plist_add(&waiter->list_entry, &lock->wait_list);
 
-       current->pi_blocked_on = waiter;
+       task->pi_blocked_on = waiter;
 
-       spin_unlock_irqrestore(&current->pi_lock, flags);
+       spin_unlock_irqrestore(&task->pi_lock, flags);
 
        if (waiter == rt_mutex_top_waiter(lock)) {
                spin_lock_irqsave(&owner->pi_lock, flags);
@@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock,
        spin_unlock(&lock->wait_lock);
 
        res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter,
-                                        current);
+                                        task);
 
        spin_lock(&lock->wait_lock);
 
@@ -605,37 +607,25 @@ void rt_mutex_adjust_pi(struct task_struct *task)
        rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task);
 }
 
-/*
- * Slow path lock function:
+/**
+ * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop
+ * @lock:               the rt_mutex to take
+ * @state:              the state the task should block in (TASK_INTERRUPTIBLE
+ *                      or TASK_UNINTERRUPTIBLE)
+ * @timeout:            the pre-initialized and started timer, or NULL for none
+ * @waiter:             the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:    passed to task_blocks_on_rt_mutex
+ *
+ * lock->wait_lock must be held by the caller.
  */
 static int __sched
-rt_mutex_slowlock(struct rt_mutex *lock, int state,
-                 struct hrtimer_sleeper *timeout,
-                 int detect_deadlock)
+__rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                   struct hrtimer_sleeper *timeout,
+                   struct rt_mutex_waiter *waiter,
+                   int detect_deadlock)
 {
-       struct rt_mutex_waiter waiter;
        int ret = 0;
 
-       debug_rt_mutex_init_waiter(&waiter);
-       waiter.task = NULL;
-
-       spin_lock(&lock->wait_lock);
-
-       /* Try to acquire the lock again: */
-       if (try_to_take_rt_mutex(lock)) {
-               spin_unlock(&lock->wait_lock);
-               return 0;
-       }
-
-       set_current_state(state);
-
-       /* Setup the timer, when timeout != NULL */
-       if (unlikely(timeout)) {
-               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
-               if (!hrtimer_active(&timeout->timer))
-                       timeout->task = NULL;
-       }
-
        for (;;) {
                /* Try to acquire the lock: */
                if (try_to_take_rt_mutex(lock))
@@ -656,19 +646,19 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
                }
 
                /*
-                * waiter.task is NULL the first time we come here and
+                * waiter->task is NULL the first time we come here and
                 * when we have been woken up by the previous owner
                 * but the lock got stolen by a higher prio task.
                 */
-               if (!waiter.task) {
-                       ret = task_blocks_on_rt_mutex(lock, &waiter,
+               if (!waiter->task) {
+                       ret = task_blocks_on_rt_mutex(lock, waiter, current,
                                                      detect_deadlock);
                        /*
                         * If we got woken up by the owner then start loop
                         * all over without going into schedule to try
                         * to get the lock now:
                         */
-                       if (unlikely(!waiter.task)) {
+                       if (unlikely(!waiter->task)) {
                                /*
                                 * Reset the return value. We might
                                 * have returned with -EDEADLK and the
@@ -684,15 +674,52 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state,
 
                spin_unlock(&lock->wait_lock);
 
-               debug_rt_mutex_print_deadlock(&waiter);
+               debug_rt_mutex_print_deadlock(waiter);
 
-               if (waiter.task)
+               if (waiter->task)
                        schedule_rt_mutex(lock);
 
                spin_lock(&lock->wait_lock);
                set_current_state(state);
        }
 
+       return ret;
+}
+
+/*
+ * Slow path lock function:
+ */
+static int __sched
+rt_mutex_slowlock(struct rt_mutex *lock, int state,
+                 struct hrtimer_sleeper *timeout,
+                 int detect_deadlock)
+{
+       struct rt_mutex_waiter waiter;
+       int ret = 0;
+
+       debug_rt_mutex_init_waiter(&waiter);
+       waiter.task = NULL;
+
+       spin_lock(&lock->wait_lock);
+
+       /* Try to acquire the lock again: */
+       if (try_to_take_rt_mutex(lock)) {
+               spin_unlock(&lock->wait_lock);
+               return 0;
+       }
+
+       set_current_state(state);
+
+       /* Setup the timer, when timeout != NULL */
+       if (unlikely(timeout)) {
+               hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS);
+               if (!hrtimer_active(&timeout->timer))
+                       timeout->task = NULL;
+       }
+
+       ret = __rt_mutex_slowlock(lock, state, timeout, &waiter,
+                                 detect_deadlock);
+
        set_current_state(TASK_RUNNING);
 
        if (unlikely(waiter.task))
@@ -864,9 +891,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock,
 EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible);
 
 /**
- * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible
- *                                    the timeout structure is provided
- *                                    by the caller
+ * rt_mutex_timed_lock - lock a rt_mutex interruptible
+ *                     the timeout structure is provided
+ *                     by the caller
  *
  * @lock:              the rt_mutex to be locked
  * @timeout:           timeout structure or NULL (no timeout)
@@ -913,7 +940,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock)
 }
 EXPORT_SYMBOL_GPL(rt_mutex_unlock);
 
-/***
+/**
  * rt_mutex_destroy - mark a mutex unusable
  * @lock: the mutex to be destroyed
  *
@@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock,
        rt_mutex_deadlock_account_unlock(proxy_owner);
 }
 
+/**
+ * rt_mutex_start_proxy_lock() - Start lock acquisition for another task
+ * @lock:              the rt_mutex to take
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @task:              the task to prepare
+ * @detect_deadlock:   perform deadlock detection (1) or not (0)
+ *
+ * Takes lock->wait_lock internally and releases it again on every return
+ * path, so the caller must not hold it.
+ *
+ * Returns:
+ *  0 - task blocked on lock
+ *  1 - acquired the lock for task, caller should wake it up
+ * <0 - error
+ *
+ * Special API call for FUTEX_REQUEUE_PI support.
+ */
+int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                             struct rt_mutex_waiter *waiter,
+                             struct task_struct *task, int detect_deadlock)
+{
+       int ret;
+
+       spin_lock(&lock->wait_lock);
+
+       mark_rt_mutex_waiters(lock);
+
+       if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) {
+               /* We got the lock for task. */
+               debug_rt_mutex_lock(lock);
+               rt_mutex_set_owner(lock, task, 0);
+               /*
+                * The early-return path must drop wait_lock just like
+                * the blocking path below; returning with it held would
+                * deadlock the next user of this lock.
+                */
+               spin_unlock(&lock->wait_lock);
+               rt_mutex_deadlock_account_lock(lock, task);
+               return 1;
+       }
+
+       ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock);
+
+       if (ret && !waiter->task) {
+               /*
+                * Reset the return value. We might have
+                * returned with -EDEADLK and the owner
+                * released the lock while we were walking the
+                * pi chain.  Let the waiter sort it out.
+                */
+               ret = 0;
+       }
+       spin_unlock(&lock->wait_lock);
+
+       debug_rt_mutex_print_deadlock(waiter);
+
+       return ret;
+}
+
 /**
  * rt_mutex_next_owner - return the next owner of the lock
  *
@@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock)
 
        return rt_mutex_top_waiter(lock)->task;
 }
+
+/**
+ * rt_mutex_finish_proxy_lock() - Complete lock acquisition
+ * @lock:              the rt_mutex we were woken on
+ * @to:                the timeout, NULL if none. The hrtimer should already
+ *                     have been started.
+ * @waiter:            the pre-initialized rt_mutex_waiter
+ * @detect_deadlock:   perform deadlock detection (1) or not (0)
+ *
+ * Complete the lock acquisition started on our behalf by another thread.
+ *
+ * Runs the wait loop in TASK_INTERRUPTIBLE state.  lock->wait_lock is taken
+ * and released internally.  If @waiter is still enqueued when the loop
+ * returns, it is removed from @lock before returning.
+ *
+ * Returns:
+ *  0 - success
+ * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK
+ *
+ * Special API call for PI-futex requeue support
+ */
+int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                              struct hrtimer_sleeper *to,
+                              struct rt_mutex_waiter *waiter,
+                              int detect_deadlock)
+{
+       int ret;
+
+       spin_lock(&lock->wait_lock);
+
+       set_current_state(TASK_INTERRUPTIBLE);
+
+       ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter,
+                                 detect_deadlock);
+
+       set_current_state(TASK_RUNNING);
+
+       if (unlikely(waiter->task))
+               remove_waiter(lock, waiter);
+
+       /*
+        * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might
+        * have to fix that up.
+        */
+       fixup_rt_mutex_waiters(lock);
+
+       spin_unlock(&lock->wait_lock);
+
+       /*
+        * Readjust priority, when we did not get the lock. We might have been
+        * the pending owner and boosted. Since we did not take the lock, the
+        * PI boost has to go.
+        */
+       if (unlikely(ret))
+               rt_mutex_adjust_prio(current);
+
+       return ret;
+}
index e124bf5800ea140ceeb936c7cba6429fea0e49b5..97a2f81866afdb6507607c4a30348f3f74564e0e 100644 (file)
@@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock,
                                       struct task_struct *proxy_owner);
 extern void rt_mutex_proxy_unlock(struct rt_mutex *lock,
                                  struct task_struct *proxy_owner);
+extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock,
+                                    struct rt_mutex_waiter *waiter,
+                                    struct task_struct *task,
+                                    int detect_deadlock);
+extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock,
+                                     struct hrtimer_sleeper *to,
+                                     struct rt_mutex_waiter *waiter,
+                                     int detect_deadlock);
 
 #ifdef CONFIG_DEBUG_RT_MUTEXES
 # include "rtmutex-debug.h"
index 26efa475bdc143f6e4459067c18ce57e71608764..14c447ae5d53a23235a44f426737e46313df7f57 100644 (file)
 #include <linux/debugfs.h>
 #include <linux/ctype.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
 
 #include <asm/tlb.h>
 #include <asm/irq_regs.h>
 
 #include "sched_cpupri.h"
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/sched.h>
+
 /*
  * Convert user-nice values [ -20 ... 0 ... 19 ]
  * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
  */
 #define RUNTIME_INF    ((u64)~0ULL)
 
-DEFINE_TRACE(sched_wait_task);
-DEFINE_TRACE(sched_wakeup);
-DEFINE_TRACE(sched_wakeup_new);
-DEFINE_TRACE(sched_switch);
-DEFINE_TRACE(sched_migrate_task);
-
 #ifdef CONFIG_SMP
 
 static void double_rq_lock(struct rq *rq1, struct rq *rq2);
@@ -630,6 +626,10 @@ struct rq {
        struct list_head migration_queue;
 #endif
 
+       /* calc_load related fields */
+       unsigned long calc_load_update;
+       long calc_load_active;
+
 #ifdef CONFIG_SCHED_HRTICK
 #ifdef CONFIG_SMP
        int hrtick_csd_pending;
@@ -1728,6 +1728,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
 }
 #endif
 
+static void calc_load_account_active(struct rq *this_rq);
+
 #include "sched_stats.h"
 #include "sched_idletask.c"
 #include "sched_fair.c"
@@ -1958,7 +1960,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
 
        clock_offset = old_rq->clock - new_rq->clock;
 
-       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+       trace_sched_migrate_task(p, new_cpu);
 
 #ifdef CONFIG_SCHEDSTATS
        if (p->se.wait_start)
@@ -2014,6 +2016,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req)
        return 1;
 }
 
+/*
+ * wait_task_context_switch -  wait for a thread to complete at least one
+ *                             context switch.
+ *
+ * @p must not be current.
+ *
+ * This spins with cpu_relax() instead of sleeping.  It returns as soon as
+ * @p is observed not running while its runqueue lock is held, or once
+ * either of @p's switch counters has advanced by more than one since entry.
+ */
+void wait_task_context_switch(struct task_struct *p)
+{
+       unsigned long nvcsw, nivcsw, flags;
+       int running;
+       struct rq *rq;
+
+       /* Snapshot both switch counters so later progress can be detected. */
+       nvcsw   = p->nvcsw;
+       nivcsw  = p->nivcsw;
+       for (;;) {
+               /*
+                * The runqueue is assigned before the actual context
+                * switch. We need to take the runqueue lock.
+                *
+                * We could check initially without the lock but it is
+                * very likely that we need to take the lock in every
+                * iteration.
+                */
+               rq = task_rq_lock(p, &flags);
+               running = task_running(rq, p);
+               task_rq_unlock(rq, &flags);
+
+               if (likely(!running))
+                       break;
+               /*
+                * The switch count is incremented before the actual
+                * context switch. We thus wait for two switches to be
+                * sure at least one completed.
+                */
+               if ((p->nvcsw - nvcsw) > 1)
+                       break;
+               if ((p->nivcsw - nivcsw) > 1)
+                       break;
+
+               cpu_relax();
+       }
+}
+
 /*
  * wait_task_inactive - wait for a thread to unschedule.
  *
@@ -2458,6 +2503,17 @@ out:
        return success;
 }
 
+/**
+ * wake_up_process - Wake up a specific process
+ * @p: The process to be woken up.
+ *
+ * Attempt to wake up the nominated process and move it to the set of runnable
+ * processes.  Returns 1 if the process was woken up, 0 if it was already
+ * running.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
+ */
 int wake_up_process(struct task_struct *p)
 {
        return try_to_wake_up(p, TASK_ALL, 0);
@@ -2766,7 +2822,7 @@ context_switch(struct rq *rq, struct task_struct *prev,
         * combine the page table reload and the switch backend into
         * one hypercall.
         */
-       arch_enter_lazy_cpu_mode();
+       arch_start_context_switch(prev);
 
        if (unlikely(!mm)) {
                next->active_mm = oldmm;
@@ -2856,19 +2912,72 @@ unsigned long nr_iowait(void)
        return sum;
 }
 
-unsigned long nr_active(void)
+/* Variables and functions for calc_load */
+static atomic_long_t calc_load_tasks;
+static unsigned long calc_load_update;
+unsigned long avenrun[3];
+EXPORT_SYMBOL(avenrun);
+
+/**
+ * get_avenrun - get the load average array
+ * @loads:     pointer to dest load array; entries 0..2 are written, so it
+ *             must have room for three values
+ * @offset:    offset to add to each avenrun[] entry before shifting
+ * @shift:     shift count to shift the result left
+ *
+ * Each output entry is (avenrun[i] + @offset) << @shift.
+ *
+ * These values are estimates at best, so no need for locking.
+ */
+void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
+{
+       loads[0] = (avenrun[0] + offset) << shift;
+       loads[1] = (avenrun[1] + offset) << shift;
+       loads[2] = (avenrun[2] + offset) << shift;
+}
+
+static unsigned long
+calc_load(unsigned long load, unsigned long exp, unsigned long active)
 {
-       unsigned long i, running = 0, uninterruptible = 0;
+       load *= exp;
+       load += active * (FIXED_1 - exp);
+       return load >> FSHIFT;
+}
 
-       for_each_online_cpu(i) {
-               running += cpu_rq(i)->nr_running;
-               uninterruptible += cpu_rq(i)->nr_uninterruptible;
-       }
+/*
+ * calc_global_load - update the avenrun load estimates 10 ticks after the
+ * CPUs have updated calc_load_tasks.
+ */
+void calc_global_load(void)
+{
+       unsigned long upd = calc_load_update + 10;
+       long active;
 
-       if (unlikely((long)uninterruptible < 0))
-               uninterruptible = 0;
+       if (time_before(jiffies, upd))
+               return;
 
-       return running + uninterruptible;
+       active = atomic_long_read(&calc_load_tasks);
+       active = active > 0 ? active * FIXED_1 : 0;
+
+       avenrun[0] = calc_load(avenrun[0], EXP_1, active);
+       avenrun[1] = calc_load(avenrun[1], EXP_5, active);
+       avenrun[2] = calc_load(avenrun[2], EXP_15, active);
+
+       calc_load_update += LOAD_FREQ;
+}
+
+/*
+ * Either called from update_cpu_load() or from a cpu going idle
+ */
+static void calc_load_account_active(struct rq *this_rq)
+{
+       long nr_active, delta;
+
+       nr_active = this_rq->nr_running;
+       nr_active += (long) this_rq->nr_uninterruptible;
+
+       if (nr_active != this_rq->calc_load_active) {
+               delta = nr_active - this_rq->calc_load_active;
+               this_rq->calc_load_active = nr_active;
+               atomic_long_add(delta, &calc_load_tasks);
+       }
 }
 
 /*
@@ -2899,6 +3008,11 @@ static void update_cpu_load(struct rq *this_rq)
                        new_load += scale-1;
                this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i;
        }
+
+       if (time_after_eq(jiffies, this_rq->calc_load_update)) {
+               this_rq->calc_load_update += LOAD_FREQ;
+               calc_load_account_active(this_rq);
+       }
 }
 
 #ifdef CONFIG_SMP
@@ -4240,10 +4354,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu)
 static struct {
        atomic_t load_balancer;
        cpumask_var_t cpu_mask;
+       cpumask_var_t ilb_grp_nohz_mask;
 } nohz ____cacheline_aligned = {
        .load_balancer = ATOMIC_INIT(-1),
 };
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * lowest_flag_domain - Return lowest sched_domain containing flag.
+ * @cpu:       The cpu whose lowest level of sched domain is to
+ *             be returned.
+ * @flag:      The flag to check for the lowest sched_domain
+ *             for the given cpu.
+ *
+ * Returns the lowest sched_domain of a cpu which contains the given flag.
+ * The walk stops at the first domain that has any bit of @flag set.
+ * NOTE(review): if no domain matches, the result is whatever @sd holds when
+ * for_each_domain() terminates - presumably NULL; confirm against the macro.
+ */
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd)
+               if (sd && (sd->flags & flag))
+                       break;
+
+       return sd;
+}
+
+/**
+ * for_each_flag_domain - Iterates over sched_domains containing the flag.
+ * @cpu:       The cpu whose domains we're iterating over.
+ * @sd:                variable holding the value of the power_savings_sd
+ *             for cpu.
+ * @flag:      The flag to filter the sched_domains to be iterated.
+ *
+ * Iterates over all the scheduler domains for a given cpu that has the 'flag'
+ * set, starting from the lowest sched_domain to the highest.
+ *
+ * The iteration stops at the first parent that is NULL or that does not have
+ * @flag set.  @sd and @flag are expanded more than once and are not
+ * parenthesized, so pass plain identifiers/constants without side effects.
+ */
+#define for_each_flag_domain(cpu, sd, flag) \
+       for (sd = lowest_flag_domain(cpu, flag); \
+               (sd && (sd->flags & flag)); sd = sd->parent)
+
+/**
+ * is_semi_idle_group - Checks if the given sched_group is semi-idle.
+ * @ilb_group: group to be checked for semi-idleness
+ *
+ * Returns:    1 if the group is semi-idle. 0 otherwise.
+ *
+ * We define a sched_group to be semi idle if it has at least one idle-CPU
+ * and at least one non-idle CPU. This helper function checks if the given
+ * sched_group is semi-idle or not.
+ *
+ * Side effect: nohz.ilb_grp_nohz_mask is overwritten with the intersection
+ * of nohz.cpu_mask and the group's cpus; find_new_ilb() reads that mask
+ * after a positive result.
+ */
+static inline int is_semi_idle_group(struct sched_group *ilb_group)
+{
+       cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask,
+                                       sched_group_cpus(ilb_group));
+
+       /*
+        * A sched_group is semi-idle when it has at least one busy cpu
+        * and at least one idle cpu.
+        */
+       if (cpumask_empty(nohz.ilb_grp_nohz_mask))
+               return 0;
+
+       if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group)))
+               return 0;
+
+       return 1;
+}
+/**
+ * find_new_ilb - Finds the optimum idle load balancer for nomination.
+ * @cpu:       The cpu which is nominating a new idle_load_balancer.
+ *
+ * Returns:    The id of the idle load balancer if it exists,
+ *             else a value >= nr_cpu_ids.
+ *
+ * This algorithm picks the idle load balancer such that it belongs to a
+ * semi-idle powersavings sched_domain. The idea is to try and avoid
+ * completely idle packages/cores just for the purpose of idle load balancing
+ * when there are other idle cpu's which are better suited for that job.
+ *
+ * Falls back to the first cpu in nohz.cpu_mask when power-aware balancing
+ * is disabled, when fewer than two cpus are in nohz.cpu_mask, or when no
+ * semi-idle group is found in @cpu's SD_POWERSAVINGS_BALANCE domains.
+ */
+static int find_new_ilb(int cpu)
+{
+       struct sched_domain *sd;
+       struct sched_group *ilb_group;
+
+       /*
+        * Have idle load balancer selection from semi-idle packages only
+        * when power-aware load balancing is enabled
+        */
+       if (!(sched_smt_power_savings || sched_mc_power_savings))
+               goto out_done;
+
+       /*
+        * Optimize for the case when we have no idle CPUs or only one
+        * idle CPU. Don't walk the sched_domain hierarchy in such cases
+        */
+       if (cpumask_weight(nohz.cpu_mask) < 2)
+               goto out_done;
+
+       for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) {
+               ilb_group = sd->groups;
+
+               /* Walk the circular list of groups in this domain once. */
+               do {
+                       /*
+                        * On success the scratch mask holds the idle cpus
+                        * of this group; nominate the first of them.
+                        */
+                       if (is_semi_idle_group(ilb_group))
+                               return cpumask_first(nohz.ilb_grp_nohz_mask);
+
+                       ilb_group = ilb_group->next;
+
+               } while (ilb_group != sd->groups);
+       }
+
+out_done:
+       return cpumask_first(nohz.cpu_mask);
+}
+#else /*  (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */
+/*
+ * Without CONFIG_SCHED_MC/CONFIG_SCHED_SMT there is no power-aware
+ * selection: nominate the first cpu in nohz.cpu_mask.  @call_cpu is unused.
+ */
+static inline int find_new_ilb(int call_cpu)
+{
+       return cpumask_first(nohz.cpu_mask);
+}
+#endif
+
 /*
  * This routine will try to nominate the ilb (idle load balancing)
  * owner among the cpus whose ticks are stopped. ilb owner will do the idle
@@ -4298,8 +4528,24 @@ int select_nohz_load_balancer(int stop_tick)
                        /* make me the ilb owner */
                        if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1)
                                return 1;
-               } else if (atomic_read(&nohz.load_balancer) == cpu)
+               } else if (atomic_read(&nohz.load_balancer) == cpu) {
+                       int new_ilb;
+
+                       if (!(sched_smt_power_savings ||
+                                               sched_mc_power_savings))
+                               return 1;
+                       /*
+                        * Check to see if there is a more power-efficient
+                        * ilb.
+                        */
+                       new_ilb = find_new_ilb(cpu);
+                       if (new_ilb < nr_cpu_ids && new_ilb != cpu) {
+                               atomic_set(&nohz.load_balancer, -1);
+                               resched_cpu(new_ilb);
+                               return 0;
+                       }
                        return 1;
+               }
        } else {
                if (!cpumask_test_cpu(cpu, nohz.cpu_mask))
                        return 0;
@@ -4468,15 +4714,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
                }
 
                if (atomic_read(&nohz.load_balancer) == -1) {
-                       /*
-                        * simple selection for now: Nominate the
-                        * first cpu in the nohz list to be the next
-                        * ilb owner.
-                        *
-                        * TBD: Traverse the sched domains and nominate
-                        * the nearest cpu in the nohz.cpu_mask.
-                        */
-                       int ilb = cpumask_first(nohz.cpu_mask);
+                       int ilb = find_new_ilb(cpu);
 
                        if (ilb < nr_cpu_ids)
                                resched_cpu(ilb);
@@ -5007,13 +5245,15 @@ pick_next_task(struct rq *rq)
 /*
  * schedule() is the main scheduler function.
  */
-asmlinkage void __sched __schedule(void)
+asmlinkage void __sched schedule(void)
 {
        struct task_struct *prev, *next;
        unsigned long *switch_count;
        struct rq *rq;
        int cpu;
 
+need_resched:
+       preempt_disable();
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
        rcu_qsctr_inc(cpu);
@@ -5070,15 +5310,9 @@ need_resched_nonpreemptible:
 
        if (unlikely(reacquire_kernel_lock(current) < 0))
                goto need_resched_nonpreemptible;
-}
 
-asmlinkage void __sched schedule(void)
-{
-need_resched:
-       preempt_disable();
-       __schedule();
        preempt_enable_no_resched();
-       if (unlikely(test_thread_flag(TIF_NEED_RESCHED)))
+       if (need_resched())
                goto need_resched;
 }
 EXPORT_SYMBOL(schedule);
@@ -5221,7 +5455,7 @@ EXPORT_SYMBOL(default_wake_function);
  * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
  * zero in this (rare) case, and we handle it by continuing to scan the queue.
  */
-void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, int sync, void *key)
 {
        wait_queue_t *curr, *next;
@@ -5241,6 +5475,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
  * @key: is directly passed to the wakeup function
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5279,6 +5516,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
  * with each other. This can prevent needless bouncing between CPUs.
  *
  * On UP it can prevent extra preemption.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
                        int nr_exclusive, void *key)
@@ -5315,6 +5555,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync);        /* For internal use only */
  * awakened in the same order in which they were queued.
  *
  * See also complete_all(), wait_for_completion() and related routines.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete(struct completion *x)
 {
@@ -5332,6 +5575,9 @@ EXPORT_SYMBOL(complete);
  * @x:  holds the state of this particular completion
  *
  * This will wake up all threads waiting on this particular completion event.
+ *
+ * It may be assumed that this function implies a write memory barrier before
+ * changing the task state if and only if any tasks are woken up.
  */
 void complete_all(struct completion *x)
 {
@@ -6490,8 +6736,9 @@ void sched_show_task(struct task_struct *p)
 #ifdef CONFIG_DEBUG_STACK_USAGE
        free = stack_not_used(p);
 #endif
-       printk(KERN_CONT "%5lu %5d %6d\n", free,
-               task_pid_nr(p), task_pid_nr(p->real_parent));
+       printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
+               task_pid_nr(p), task_pid_nr(p->real_parent),
+               (unsigned long)task_thread_info(p)->flags);
 
        show_stack(p, NULL);
 }
@@ -6970,6 +7217,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 
        }
 }
+
+/*
+ * remove the tasks which were accounted by rq from calc_load_tasks.
+ */
+static void calc_global_load_remove(struct rq *rq)
+{
+       atomic_long_sub(rq->calc_load_active, &calc_load_tasks);
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL)
@@ -7204,6 +7459,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                /* Update our root-domain */
                rq = cpu_rq(cpu);
                spin_lock_irqsave(&rq->lock, flags);
+               rq->calc_load_update = calc_load_update;
+               rq->calc_load_active = 0;
                if (rq->rd) {
                        BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
 
@@ -7243,7 +7500,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                cpuset_unlock();
                migrate_nr_uninterruptible(rq);
                BUG_ON(rq->nr_running != 0);
-
+               calc_global_load_remove(rq);
                /*
                 * No need to migrate the tasks: it was best-effort if
                 * they didn't take sched_hotcpu_mutex. Just wake up
@@ -7753,8 +8010,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0;
 
 /*
  * The cpus mask in sched_group and sched_domain hangs off the end.
- * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space
- * for nr_cpu_ids < CONFIG_NR_CPUS.
+ *
+ * ( See the comments in include/linux/sched.h:struct sched_group
+ *   and struct sched_domain. )
  */
 struct static_sched_group {
        struct sched_group sg;
@@ -7875,7 +8133,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head)
                        struct sched_domain *sd;
 
                        sd = &per_cpu(phys_domains, j).sd;
-                       if (j != cpumask_first(sched_group_cpus(sd->groups))) {
+                       if (j != group_first_cpu(sd->groups)) {
                                /*
                                 * Only add "power" once for each
                                 * physical package.
@@ -7953,7 +8211,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd)
 
        WARN_ON(!sd || !sd->groups);
 
-       if (cpu != cpumask_first(sched_group_cpus(sd->groups)))
+       if (cpu != group_first_cpu(sd->groups))
                return;
 
        child = sd->child;
@@ -8938,6 +9196,8 @@ void __init sched_init(void)
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
                rq->nr_running = 0;
+               rq->calc_load_active = 0;
+               rq->calc_load_update = jiffies + LOAD_FREQ;
                init_cfs_rq(&rq->cfs, rq);
                init_rt_rq(&rq->rt, rq);
 #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -9045,6 +9305,9 @@ void __init sched_init(void)
         * when this runqueue becomes "idle".
         */
        init_idle(current, smp_processor_id());
+
+       calc_load_update = jiffies + LOAD_FREQ;
+
        /*
         * During early bootup we pretend to be a normal task:
         */
@@ -9055,6 +9318,7 @@ void __init sched_init(void)
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
        alloc_bootmem_cpumask_var(&nohz.cpu_mask);
+       alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask);
 #endif
        alloc_bootmem_cpumask_var(&cpu_isolated_map);
 #endif /* SMP */
@@ -9800,6 +10064,13 @@ static int sched_rt_global_constraints(void)
        if (sysctl_sched_rt_period <= 0)
                return -EINVAL;
 
+       /*
+        * There's always some RT tasks in the root group
+        * -- migration, kstopmachine etc..
+        */
+       if (sysctl_sched_rt_runtime == 0)
+               return -EBUSY;
+
        spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
        for_each_possible_cpu(i) {
                struct rt_rq *rt_rq = &cpu_rq(i)->rt;
index 3816f217f119ef635cf169f268b5140c6e0f66c6..5f9650e8fe7587938de1bfa71e48cd4a04061d36 100644 (file)
@@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 
        find_matching_se(&se, &pse);
 
-       while (se) {
-               BUG_ON(!pse);
+       BUG_ON(!pse);
 
-               if (wakeup_preempt_entity(se, pse) == 1) {
-                       resched_task(curr);
-                       break;
-               }
-
-               se = parent_entity(se);
-               pse = parent_entity(pse);
-       }
+       if (wakeup_preempt_entity(se, pse) == 1)
+               resched_task(curr);
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
index 8a21a2e28c13ac2e4620e980888702e22a4f2387..499672c10cbd615141a362cafe804711bbcf1ff3 100644 (file)
@@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy
 static struct task_struct *pick_next_task_idle(struct rq *rq)
 {
        schedstat_inc(rq, sched_goidle);
-
+       /* adjust the active tasks as we might go into a long sleep */
+       calc_load_account_active(rq);
        return rq->idle;
 }
 
index d8034737db4cb86f8adca044560a3765ac65dba5..809a228019adeb06c8979d86f52473a0d4a168ac 100644 (file)
@@ -27,7 +27,7 @@
 #include <linux/freezer.h>
 #include <linux/pid_namespace.h>
 #include <linux/nsproxy.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/param.h>
 #include <asm/uaccess.h>
@@ -41,8 +41,6 @@
 
 static struct kmem_cache *sigqueue_cachep;
 
-DEFINE_TRACE(sched_signal_send);
-
 static void __user *sig_handler(struct task_struct *t, int sig)
 {
        return t->sighand->action[sig - 1].sa.sa_handler;
@@ -249,14 +247,19 @@ void flush_sigqueue(struct sigpending *queue)
 /*
  * Flush all pending signals for a task.
  */
+void __flush_signals(struct task_struct *t)
+{
+       clear_tsk_thread_flag(t, TIF_SIGPENDING);
+       flush_sigqueue(&t->pending);
+       flush_sigqueue(&t->signal->shared_pending);
+}
+
 void flush_signals(struct task_struct *t)
 {
        unsigned long flags;
 
        spin_lock_irqsave(&t->sighand->siglock, flags);
-       clear_tsk_thread_flag(t, TIF_SIGPENDING);
-       flush_sigqueue(&t->pending);
-       flush_sigqueue(&t->signal->shared_pending);
+       __flush_signals(t);
        spin_unlock_irqrestore(&t->sighand->siglock, flags);
 }
 
@@ -2278,24 +2281,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
        return kill_something_info(sig, &info, pid);
 }
 
-static int do_tkill(pid_t tgid, pid_t pid, int sig)
+static int
+do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info)
 {
-       int error;
-       struct siginfo info;
        struct task_struct *p;
        unsigned long flags;
-
-       error = -ESRCH;
-       info.si_signo = sig;
-       info.si_errno = 0;
-       info.si_code = SI_TKILL;
-       info.si_pid = task_tgid_vnr(current);
-       info.si_uid = current_uid();
+       int error = -ESRCH;
 
        rcu_read_lock();
        p = find_task_by_vpid(pid);
        if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) {
-               error = check_kill_permission(sig, &info, p);
+               error = check_kill_permission(sig, info, p);
                /*
                 * The null signal is a permissions and process existence
                 * probe.  No signal is actually delivered.
@@ -2305,7 +2301,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
                 * signal is private anyway.
                 */
                if (!error && sig && lock_task_sighand(p, &flags)) {
-                       error = specific_send_sig_info(sig, &info, p);
+                       error = specific_send_sig_info(sig, info, p);
                        unlock_task_sighand(p, &flags);
                }
        }
@@ -2314,6 +2310,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig)
        return error;
 }
 
+static int do_tkill(pid_t tgid, pid_t pid, int sig)
+{
+       struct siginfo info;
+
+       info.si_signo = sig;
+       info.si_errno = 0;
+       info.si_code = SI_TKILL;
+       info.si_pid = task_tgid_vnr(current);
+       info.si_uid = current_uid();
+
+       return do_send_specific(tgid, pid, sig, &info);
+}
+
 /**
  *  sys_tgkill - send signal to one specific thread
  *  @tgid: the thread group ID of the thread
@@ -2363,6 +2372,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig,
        return kill_proc_info(sig, &info, pid);
 }
 
+long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info)
+{
+       /* This is only valid for single tasks */
+       if (pid <= 0 || tgid <= 0)
+               return -EINVAL;
+
+       /* Not even root can pretend to send signals from the kernel.
+          Nor can they impersonate a kill(), which adds source info.  */
+       if (info->si_code >= 0)
+               return -EPERM;
+       info->si_signo = sig;
+
+       return do_send_specific(tgid, pid, sig, info);
+}
+
+SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig,
+               siginfo_t __user *, uinfo)
+{
+       siginfo_t info;
+
+       if (copy_from_user(&info, uinfo, sizeof(siginfo_t)))
+               return -EFAULT;
+
+       return do_rt_tgsigqueueinfo(tgid, pid, sig, &info);
+}
+
 int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact)
 {
        struct task_struct *t = current;
index b525dd348511b0bda3b08c19ffef6df0cf053206..258885a543db60692a695aa682abb6c95f3ffcdf 100644 (file)
@@ -24,7 +24,9 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
-#include <trace/irq.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/irq.h>
 
 #include <asm/irq.h>
 /*
@@ -186,9 +188,6 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
-DEFINE_TRACE(softirq_entry);
-DEFINE_TRACE(softirq_exit);
-
 asmlinkage void __do_softirq(void)
 {
        struct softirq_action *h;
@@ -828,7 +827,7 @@ int __init __weak arch_early_irq_init(void)
        return 0;
 }
 
-int __weak arch_init_chip_data(struct irq_desc *desc, int cpu)
+int __weak arch_init_chip_data(struct irq_desc *desc, int node)
 {
        return 0;
 }
index b2970d56fb7678d6493e44c766db536f691656b7..944ba03cae199ad6139f96f65113bca007fe4fda 100644 (file)
@@ -114,6 +114,7 @@ static int ngroups_max = NGROUPS_MAX;
 
 #ifdef CONFIG_MODULES
 extern char modprobe_path[];
+extern int modules_disabled;
 #endif
 #ifdef CONFIG_CHR_DEV_SG
 extern int sg_big_buff;
@@ -534,6 +535,17 @@ static struct ctl_table kern_table[] = {
                .proc_handler   = &proc_dostring,
                .strategy       = &sysctl_string,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "modules_disabled",
+               .data           = &modules_disabled,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               /* only handle a transition from default "0" to "1" */
+               .proc_handler   = &proc_dointvec_minmax,
+               .extra1         = &one,
+               .extra2         = &one,
+       },
 #endif
 #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET)
        {
@@ -729,6 +741,14 @@ static struct ctl_table kern_table[] = {
                .mode           = 0444,
                .proc_handler   = &proc_dointvec,
        },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "bootloader_version",
+               .data           = &bootloader_version,
+               .maxlen         = sizeof (int),
+               .mode           = 0444,
+               .proc_handler   = &proc_dointvec,
+       },
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "kstack_depth_to_print",
@@ -1225,7 +1245,6 @@ static struct ctl_table vm_table[] = {
                .strategy       = &sysctl_jiffies,
        },
 #endif
-#ifdef CONFIG_SECURITY
        {
                .ctl_name       = CTL_UNNUMBERED,
                .procname       = "mmap_min_addr",
@@ -1234,7 +1253,6 @@ static struct ctl_table vm_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_doulongvec_minmax,
        },
-#endif
 #ifdef CONFIG_NUMA
        {
                .ctl_name       = CTL_UNNUMBERED,
index 687dff49f6e7da5ec92199f6bbae08de57e5ba43..52a8bf8931f3903128af16a13d32c5d4b8b376aa 100644 (file)
@@ -22,7 +22,7 @@
 
 /*
  * This read-write spinlock protects us from races in SMP while
- * playing with xtime and avenrun.
+ * playing with xtime.
  */
 __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock);
 
index cffffad01c31cf1670490f2860f0e5de0f25d301..a26ed294f938708241c41b19b35fff551506a56a 100644 (file)
@@ -1122,47 +1122,6 @@ void update_process_times(int user_tick)
        run_posix_cpu_timers(p);
 }
 
-/*
- * Nr of active tasks - counted in fixed-point numbers
- */
-static unsigned long count_active_tasks(void)
-{
-       return nr_active() * FIXED_1;
-}
-
-/*
- * Hmm.. Changed this, as the GNU make sources (load.c) seems to
- * imply that avenrun[] is the standard name for this kind of thing.
- * Nothing else seems to be standardized: the fractional size etc
- * all seem to differ on different machines.
- *
- * Requires xtime_lock to access.
- */
-unsigned long avenrun[3];
-
-EXPORT_SYMBOL(avenrun);
-
-/*
- * calc_load - given tick count, update the avenrun load estimates.
- * This is called while holding a write_lock on xtime_lock.
- */
-static inline void calc_load(unsigned long ticks)
-{
-       unsigned long active_tasks; /* fixed-point */
-       static int count = LOAD_FREQ;
-
-       count -= ticks;
-       if (unlikely(count < 0)) {
-               active_tasks = count_active_tasks();
-               do {
-                       CALC_LOAD(avenrun[0], EXP_1, active_tasks);
-                       CALC_LOAD(avenrun[1], EXP_5, active_tasks);
-                       CALC_LOAD(avenrun[2], EXP_15, active_tasks);
-                       count += LOAD_FREQ;
-               } while (count < 0);
-       }
-}
-
 /*
  * This function runs timers and the timer-tq in bottom half context.
  */
@@ -1186,16 +1145,6 @@ void run_local_timers(void)
        softlockup_tick();
 }
 
-/*
- * Called by the timer interrupt. xtime_lock must already be taken
- * by the timer IRQ!
- */
-static inline void update_times(unsigned long ticks)
-{
-       update_wall_time();
-       calc_load(ticks);
-}
-
 /*
  * The 64-bit jiffies value is not atomic - you MUST NOT read it
  * without sampling the sequence number in xtime_lock.
@@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks)
 void do_timer(unsigned long ticks)
 {
        jiffies_64 += ticks;
-       update_times(ticks);
+       update_wall_time();
+       calc_global_load();
 }
 
 #ifdef __ARCH_WANT_SYS_ALARM
@@ -1406,37 +1356,17 @@ int do_sysinfo(struct sysinfo *info)
 {
        unsigned long mem_total, sav_total;
        unsigned int mem_unit, bitcount;
-       unsigned long seq;
+       struct timespec tp;
 
        memset(info, 0, sizeof(struct sysinfo));
 
-       do {
-               struct timespec tp;
-               seq = read_seqbegin(&xtime_lock);
-
-               /*
-                * This is annoying.  The below is the same thing
-                * posix_get_clock_monotonic() does, but it wants to
-                * take the lock which we want to cover the loads stuff
-                * too.
-                */
-
-               getnstimeofday(&tp);
-               tp.tv_sec += wall_to_monotonic.tv_sec;
-               tp.tv_nsec += wall_to_monotonic.tv_nsec;
-               monotonic_to_bootbased(&tp);
-               if (tp.tv_nsec - NSEC_PER_SEC >= 0) {
-                       tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC;
-                       tp.tv_sec++;
-               }
-               info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
+       ktime_get_ts(&tp);
+       monotonic_to_bootbased(&tp);
+       info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0);
 
-               info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT);
-               info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT);
-               info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT);
+       get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT);
 
-               info->procs = nr_threads;
-       } while (read_seqretry(&xtime_lock, seq));
+       info->procs = nr_threads;
 
        si_meminfo(info);
        si_swapinfo(info);
index 417d1985e29911784adbf506523462f009933c02..4a13e5a01ce318c62c75c991a8694096e4821f40 100644 (file)
@@ -48,6 +48,21 @@ config FTRACE_NMI_ENTER
        depends on HAVE_FTRACE_NMI_ENTER
        default y
 
+config EVENT_TRACING
+       select CONTEXT_SWITCH_TRACER
+       bool
+
+config CONTEXT_SWITCH_TRACER
+       select MARKERS
+       bool
+
+# All tracer options should select GENERIC_TRACER. For those options that are
+# enabled by all tracers (context switch and event tracer) they select TRACING.
+# This allows those options to appear when no other tracer is selected. But the
+# options do not appear when something else selects it. We need the two options
+# GENERIC_TRACER and TRACING to avoid circular dependencies to accomplish the
+# hiding of the automatic options.
+
 config TRACING
        bool
        select DEBUG_FS
@@ -56,6 +71,11 @@ config TRACING
        select TRACEPOINTS
        select NOP_TRACER
        select BINARY_PRINTF
+       select EVENT_TRACING
+
+config GENERIC_TRACER
+       bool
+       select TRACING
 
 #
 # Minimum requirements an architecture has to meet for us to
@@ -73,14 +93,20 @@ config TRACING_SUPPORT
 
 if TRACING_SUPPORT
 
-menu "Tracers"
+menuconfig FTRACE
+       bool "Tracers"
+       default y if DEBUG_KERNEL
+       help
+        Enable the kernel tracing infrastructure.
+
+if FTRACE
 
 config FUNCTION_TRACER
        bool "Kernel Function Tracer"
        depends on HAVE_FUNCTION_TRACER
        select FRAME_POINTER
        select KALLSYMS
-       select TRACING
+       select GENERIC_TRACER
        select CONTEXT_SWITCH_TRACER
        help
          Enable the kernel to trace every kernel function. This is done
@@ -104,13 +130,14 @@ config FUNCTION_GRAPH_TRACER
          the return value. This is done by setting the current return 
          address on the current task structure into a stack of calls.
 
+
 config IRQSOFF_TRACER
        bool "Interrupts-off Latency Tracer"
        default n
        depends on TRACE_IRQFLAGS_SUPPORT
        depends on GENERIC_TIME
        select TRACE_IRQFLAGS
-       select TRACING
+       select GENERIC_TRACER
        select TRACER_MAX_TRACE
        help
          This option measures the time spent in irqs-off critical
@@ -131,7 +158,7 @@ config PREEMPT_TRACER
        default n
        depends on GENERIC_TIME
        depends on PREEMPT
-       select TRACING
+       select GENERIC_TRACER
        select TRACER_MAX_TRACE
        help
          This option measures the time spent in preemption off critical
@@ -150,7 +177,7 @@ config PREEMPT_TRACER
 config SYSPROF_TRACER
        bool "Sysprof Tracer"
        depends on X86
-       select TRACING
+       select GENERIC_TRACER
        select CONTEXT_SWITCH_TRACER
        help
          This tracer provides the trace needed by the 'Sysprof' userspace
@@ -158,40 +185,33 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
        bool "Scheduling Latency Tracer"
-       select TRACING
+       select GENERIC_TRACER
        select CONTEXT_SWITCH_TRACER
        select TRACER_MAX_TRACE
        help
          This tracer tracks the latency of the highest priority task
          to be scheduled in, starting from the point it has woken up.
 
-config CONTEXT_SWITCH_TRACER
-       bool "Trace process context switches"
-       select TRACING
-       select MARKERS
-       help
-         This tracer gets called from the context switch and records
-         all switching of tasks.
-
-config EVENT_TRACER
-       bool "Trace various events in the kernel"
+config ENABLE_DEFAULT_TRACERS
+       bool "Trace process context switches and events"
+       depends on !GENERIC_TRACER
        select TRACING
        help
          This tracer hooks to various trace points in the kernel
          allowing the user to pick and choose which trace point they
-         want to trace.
+         want to trace. It also includes the sched_switch tracer plugin.
 
 config FTRACE_SYSCALLS
        bool "Trace syscalls"
        depends on HAVE_FTRACE_SYSCALLS
-       select TRACING
+       select GENERIC_TRACER
        select KALLSYMS
        help
          Basic tracer to catch the syscall entry and exit events.
 
 config BOOT_TRACER
        bool "Trace boot initcalls"
-       select TRACING
+       select GENERIC_TRACER
        select CONTEXT_SWITCH_TRACER
        help
          This tracer helps developers to optimize boot times: it records
@@ -207,8 +227,36 @@ config BOOT_TRACER
          to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
+       bool
+       select GENERIC_TRACER
+
+choice
+       prompt "Branch Profiling"
+       default BRANCH_PROFILE_NONE
+       help
+        The branch profiling is a software profiler. It will add hooks
+        into the C conditionals to test which path a branch takes.
+
+        The likely/unlikely profiler only looks at the conditions that
+        are annotated with a likely or unlikely macro.
+
+        The "all branch" profiler will profile every if statement in the
+        kernel. This profiler will also enable the likely/unlikely
+        profiler as well.
+
+        Either of the above profilers add a bit of overhead to the system.
+        If unsure choose "No branch profiling".
+
+config BRANCH_PROFILE_NONE
+       bool "No branch profiling"
+       help
+        No branch profiling. Branch profiling adds a bit of overhead.
+        Only enable it if you want to analyse the branching behavior.
+        Otherwise keep it disabled.
+
+config PROFILE_ANNOTATED_BRANCHES
        bool "Trace likely/unlikely profiler"
-       select TRACING
+       select TRACE_BRANCH_PROFILING
        help
          This tracer profiles all the the likely and unlikely macros
          in the kernel. It will display the results in:
@@ -218,11 +266,9 @@ config TRACE_BRANCH_PROFILING
          Note: this will add a significant overhead, only turn this
          on if you need to profile the system's use of these macros.
 
-         Say N if unsure.
-
 config PROFILE_ALL_BRANCHES
        bool "Profile all if conditionals"
-       depends on TRACE_BRANCH_PROFILING
+       select TRACE_BRANCH_PROFILING
        help
          This tracer profiles all branch conditions. Every if ()
          taken in the kernel is recorded whether it hit or miss.
@@ -230,11 +276,12 @@ config PROFILE_ALL_BRANCHES
 
          /debugfs/tracing/profile_branch
 
+         This option also enables the likely/unlikely profiler.
+
          This configuration, when enabled, will impose a great overhead
          on the system. This should only be enabled when the system
          is to be analyzed
-
-         Say N if unsure.
+endchoice
 
 config TRACING_BRANCHES
        bool
@@ -261,7 +308,7 @@ config BRANCH_TRACER
 config POWER_TRACER
        bool "Trace power consumption behavior"
        depends on X86
-       select TRACING
+       select GENERIC_TRACER
        help
          This tracer helps developers to analyze and optimize the kernels
          power management decisions, specifically the C-state and P-state
@@ -295,14 +342,14 @@ config STACK_TRACER
 config HW_BRANCH_TRACER
        depends on HAVE_HW_BRANCH_TRACER
        bool "Trace hw branches"
-       select TRACING
+       select GENERIC_TRACER
        help
          This tracer records all branches on the system in a circular
          buffer giving access to the last N branches for each cpu.
 
 config KMEMTRACE
        bool "Trace SLAB allocations"
-       select TRACING
+       select GENERIC_TRACER
        help
          kmemtrace provides tracing for slab allocator functions, such as
          kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
@@ -322,7 +369,7 @@ config KMEMTRACE
 
 config WORKQUEUE_TRACER
        bool "Trace workqueues"
-       select TRACING
+       select GENERIC_TRACER
        help
          The workqueue tracer provides some statistical informations
           about each cpu workqueue thread such as the number of the
@@ -338,7 +385,7 @@ config BLK_DEV_IO_TRACE
        select RELAY
        select DEBUG_FS
        select TRACEPOINTS
-       select TRACING
+       select GENERIC_TRACER
        select STACKTRACE
        help
          Say Y here if you want to be able to trace the block layer actions
@@ -375,6 +422,20 @@ config DYNAMIC_FTRACE
         were made. If so, it runs stop_machine (stops all CPUS)
         and modifies the code to jump over the call to ftrace.
 
+config FUNCTION_PROFILER
+       bool "Kernel function profiler"
+       depends on FUNCTION_TRACER
+       default n
+       help
+        This option enables the kernel function profiler. A file is created
+        in debugfs called function_profile_enabled which defaults to zero.
+        When a 1 is echoed into this file profiling begins, and when a
+        zero is entered, profiling stops. A file called functions in the
+        trace_stats directory shows the list of functions that have been
+        hit and their counters.
+
+        If in doubt, say N
+
 config FTRACE_MCOUNT_RECORD
        def_bool y
        depends on DYNAMIC_FTRACE
@@ -385,7 +446,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
        bool "Perform a startup test on ftrace"
-       depends on TRACING
+       depends on GENERIC_TRACER
        select FTRACE_SELFTEST
        help
          This option performs a series of startup tests on ftrace. On bootup
@@ -396,7 +457,7 @@ config FTRACE_STARTUP_TEST
 config MMIOTRACE
        bool "Memory mapped IO tracing"
        depends on HAVE_MMIOTRACE_SUPPORT && PCI
-       select TRACING
+       select GENERIC_TRACER
        help
          Mmiotrace traces Memory Mapped I/O access and is meant for
          debugging and reverse engineering. It is called from the ioremap
@@ -416,7 +477,23 @@ config MMIOTRACE_TEST
 
          Say N, unless you absolutely know what you are doing.
 
-endmenu
+config RING_BUFFER_BENCHMARK
+       tristate "Ring buffer benchmark stress tester"
+       depends on RING_BUFFER
+       help
+         This option creates a test to stress the ring buffer and benchmark it.
+         It creates its own ring buffer such that it will not interfere with
+         any other users of the ring buffer (such as ftrace). It then creates
+         a producer and consumer that will run for 10 seconds and sleep for
+         10 seconds. Each interval it will print out the number of events
+         it recorded and give a rough estimate of how long each iteration took.
+
+         It does not disable interrupts or raise its priority, so it may be
+         affected by processes that are running.
+
+         If unsure, say N
+
+endif # FTRACE
 
 endif # TRACING_SUPPORT
 
index 2630f5121ec12467a4b05958670aac40f357dcb8..844164dca90ae1ec9039ccfd66c5a89aaa8d1a3d 100644 (file)
@@ -15,11 +15,17 @@ ifdef CONFIG_TRACING_BRANCHES
 KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
 endif
 
+#
+# Make the trace clocks available generally: it's infrastructure
+# relied on by ptrace for example:
+#
+obj-y += trace_clock.o
+
 obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
+obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o
 
 obj-$(CONFIG_TRACING) += trace.o
-obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_TRACING) += trace_printk.o
@@ -39,12 +45,14 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
-obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events.o
-obj-$(CONFIG_EVENT_TRACER) += events.o
-obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
+ifeq ($(CONFIG_BLOCK),y)
+obj-$(CONFIG_EVENT_TRACING) += blktrace.o
+endif
+obj-$(CONFIG_EVENT_TRACING) += trace_events.o
+obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
-obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
 
 libftrace-y := ftrace.o
index 921ef5d1f0ba95e7497faa55afb293c50ae7ee47..7bd6a9893c247e875098a57f640c664fba7783c8 100644 (file)
 #include <linux/mutex.h>
 #include <linux/debugfs.h>
 #include <linux/time.h>
-#include <trace/block.h>
 #include <linux/uaccess.h>
+
+#include <trace/events/block.h>
+
 #include "trace_output.h"
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
@@ -147,7 +151,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 {
        if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
                return 1;
-       if (sector < bt->start_lba || sector > bt->end_lba)
+       if (sector && (sector < bt->start_lba || sector > bt->end_lba))
                return 1;
        if (bt->pid && pid != bt->pid)
                return 1;
@@ -192,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
        what |= MASK_TC_BIT(rw, DISCARD);
 
        pid = tsk->pid;
-       if (unlikely(act_log_check(bt, what, sector, pid)))
+       if (act_log_check(bt, what, sector, pid))
                return;
        cpu = raw_smp_processor_id();
 
@@ -262,6 +266,7 @@ static void blk_trace_free(struct blk_trace *bt)
 {
        debugfs_remove(bt->msg_file);
        debugfs_remove(bt->dropped_file);
+       debugfs_remove(bt->dir);
        relay_close(bt->rchan);
        free_percpu(bt->sequence);
        free_percpu(bt->msg_data);
@@ -403,11 +408,29 @@ static struct rchan_callbacks blk_relay_callbacks = {
        .remove_buf_file        = blk_remove_buf_file_callback,
 };
 
+/*
+ * Initialise the LBA window of @bt from the partition attached to @bdev.
+ * Without a block device, or without a partition on it, the window is
+ * left fully open (0 .. -1ULL).
+ */
+static void blk_trace_setup_lba(struct blk_trace *bt,
+                               struct block_device *bdev)
+{
+       struct hd_struct *part = bdev ? bdev->bd_part : NULL;
+
+       if (!part) {
+               bt->start_lba = 0;
+               bt->end_lba = -1ULL;
+               return;
+       }
+
+       bt->start_lba = part->start_sect;
+       bt->end_lba = part->start_sect + part->nr_sects;
+}
+
 /*
  * Setup everything required to start tracing
  */
 int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-                       struct blk_user_trace_setup *buts)
+                      struct block_device *bdev,
+                      struct blk_user_trace_setup *buts)
 {
        struct blk_trace *old_bt, *bt = NULL;
        struct dentry *dir = NULL;
@@ -480,10 +503,13 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (!bt->act_mask)
                bt->act_mask = (u16) -1;
 
-       bt->start_lba = buts->start_lba;
-       bt->end_lba = buts->end_lba;
-       if (!bt->end_lba)
-               bt->end_lba = -1ULL;
+       blk_trace_setup_lba(bt, bdev);
+
+       /* overwrite with user settings */
+       if (buts->start_lba)
+               bt->start_lba = buts->start_lba;
+       if (buts->end_lba)
+               bt->end_lba = buts->end_lba;
 
        bt->pid = buts->pid;
        bt->trace_state = Blktrace_setup;
@@ -505,6 +531,7 @@ err:
 }
 
 int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+                   struct block_device *bdev,
                    char __user *arg)
 {
        struct blk_user_trace_setup buts;
@@ -514,7 +541,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
        if (ret)
                return -EFAULT;
 
-       ret = do_blk_trace_setup(q, name, dev, &buts);
+       ret = do_blk_trace_setup(q, name, dev, bdev, &buts);
        if (ret)
                return ret;
 
@@ -582,7 +609,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
        switch (cmd) {
        case BLKTRACESETUP:
                bdevname(bdev, b);
-               ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+               ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg);
                break;
        case BLKTRACESTART:
                start = 1;
@@ -809,7 +836,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  * @bio:       the source bio
  * @dev:       target device
  * @from:      source sector
- * @to:                target sector
  *
  * Description:
  *     Device mapper or raid target sometimes need to split a bio because
@@ -817,7 +843,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
  *
  **/
 static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-                                      dev_t dev, sector_t from, sector_t to)
+                                      dev_t dev, sector_t from)
 {
        struct blk_trace *bt = q->blk_trace;
        struct blk_io_trace_remap r;
@@ -825,12 +851,13 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
        if (likely(!bt))
                return;
 
-       r.device = cpu_to_be32(dev);
-       r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-       r.sector = cpu_to_be64(to);
+       r.device_from = cpu_to_be32(dev);
+       r.device_to   = cpu_to_be32(bio->bi_bdev->bd_dev);
+       r.sector_from = cpu_to_be64(from);
 
-       __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-                       !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+       __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+                       BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE),
+                       sizeof(r), &r);
 }
 
 /**
@@ -971,6 +998,16 @@ static inline const void *pdu_start(const struct trace_entry *ent)
        return te_blk_io_trace(ent) + 1;
 }
 
+/* Return the action field of the blk_io_trace record carried by @ent. */
+static inline u32 t_action(const struct trace_entry *ent)
+{
+       return te_blk_io_trace(ent)->action;
+}
+
+/* Return the bytes field of the blk_io_trace record carried by @ent. */
+static inline u32 t_bytes(const struct trace_entry *ent)
+{
+       return te_blk_io_trace(ent)->bytes;
+}
+
 static inline u32 t_sec(const struct trace_entry *ent)
 {
        return te_blk_io_trace(ent)->bytes >> 9;
@@ -996,11 +1033,11 @@ static void get_pdu_remap(const struct trace_entry *ent,
                          struct blk_io_trace_remap *r)
 {
        const struct blk_io_trace_remap *__r = pdu_start(ent);
-       __u64 sector = __r->sector;
+       __u64 sector_from = __r->sector_from;
 
-       r->device = be32_to_cpu(__r->device);
        r->device_from = be32_to_cpu(__r->device_from);
-       r->sector = be64_to_cpu(sector);
+       r->device_to   = be32_to_cpu(__r->device_to);
+       r->sector_from = be64_to_cpu(sector_from);
 }
 
 typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
@@ -1031,36 +1068,98 @@ static int blk_log_action(struct trace_iterator *iter, const char *act)
                                MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
+/*
+ * Print the pdu attached to @ent as space separated hex bytes wrapped in
+ * parentheses.  A run of trailing zero bytes is cut short and marked
+ * with "..".  Returns 1 when there is no pdu, 0 when a write to @s
+ * fails, and otherwise the result of the final trace_seq_puts().
+ */
+static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent)
+{
+       const unsigned char *pdu_buf = pdu_start(ent);
+       int pdu_len = te_blk_io_trace(ent)->pdu_len;
+       int i, end;
+
+       if (!pdu_len)
+               return 1;
+
+       /* end becomes the index just past the last non-zero byte */
+       end = pdu_len;
+       while (end > 0 && !pdu_buf[end - 1])
+               end--;
+
+       if (!trace_seq_putc(s, '('))
+               return 0;
+
+       for (i = 0; i < pdu_len; i++) {
+               const char *sep = i ? " " : "";
+
+               if (!trace_seq_printf(s, "%s%02x", sep, pdu_buf[i]))
+                       return 0;
+
+               /*
+                * everything after this byte is zero: stop here and say
+                * so with a ".." appended
+                */
+               if (i == end && end != pdu_len - 1)
+                       return trace_seq_puts(s, " ..) ");
+       }
+
+       return trace_seq_puts(s, ") ");
+}
+
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
        char cmd[TASK_COMM_LEN];
 
        trace_find_cmdline(ent->pid, cmd);
 
-       if (t_sec(ent))
-               return trace_seq_printf(s, "%llu + %u [%s]\n",
-                                       t_sector(ent), t_sec(ent), cmd);
-       return trace_seq_printf(s, "[%s]\n", cmd);
+       if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+               int ret;
+
+               ret = trace_seq_printf(s, "%u ", t_bytes(ent));
+               if (!ret)
+                       return 0;
+               ret = blk_log_dump_pdu(s, ent);
+               if (!ret)
+                       return 0;
+               return trace_seq_printf(s, "[%s]\n", cmd);
+       } else {
+               if (t_sec(ent))
+                       return trace_seq_printf(s, "%llu + %u [%s]\n",
+                                               t_sector(ent), t_sec(ent), cmd);
+               return trace_seq_printf(s, "[%s]\n", cmd);
+       }
 }
 
 static int blk_log_with_error(struct trace_seq *s,
                              const struct trace_entry *ent)
 {
-       if (t_sec(ent))
-               return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-                                       t_sec(ent), t_error(ent));
-       return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+       if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) {
+               int ret;
+
+               ret = blk_log_dump_pdu(s, ent);
+               if (ret)
+                       return trace_seq_printf(s, "[%d]\n", t_error(ent));
+               return 0;
+       } else {
+               if (t_sec(ent))
+                       return trace_seq_printf(s, "%llu + %u [%d]\n",
+                                               t_sector(ent),
+                                               t_sec(ent), t_error(ent));
+               return trace_seq_printf(s, "%llu [%d]\n",
+                                       t_sector(ent), t_error(ent));
+       }
 }
 
 static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 {
-       struct blk_io_trace_remap r = { .device = 0, };
+       struct blk_io_trace_remap r = { .device_from = 0, };
 
        get_pdu_remap(ent, &r);
        return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-                              t_sector(ent),
-                              t_sec(ent), MAJOR(r.device), MINOR(r.device),
-                              (unsigned long long)r.sector);
+                               t_sector(ent), t_sec(ent),
+                               MAJOR(r.device_from), MINOR(r.device_from),
+                               (unsigned long long)r.sector_from);
 }
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
@@ -1117,7 +1216,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 static void blk_tracer_start(struct trace_array *tr)
 {
        blk_tracer_enabled = true;
-       trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
 static int blk_tracer_init(struct trace_array *tr)
@@ -1130,7 +1228,6 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
        blk_tracer_enabled = false;
-       trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
@@ -1182,7 +1279,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
        }
 
        if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-               ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+               ret = trace_seq_printf(s, "Unknown action %x\n", what);
        else {
                ret = log_action(iter, what2act[what].act[long_act]);
                if (ret)
@@ -1195,9 +1292,6 @@ out:
 static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
                                               int flags)
 {
-       if (!trace_print_context(iter))
-               return TRACE_TYPE_PARTIAL_LINE;
-
        return print_one_line(iter, false);
 }
 
@@ -1232,6 +1326,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
        return print_one_line(iter, true);
 }
 
+/*
+ * Tracer option callback.  Only the blk_classic option is acted upon:
+ * setting it turns context-info output off, clearing it turns it back on.
+ * Always returns 0.
+ */
+static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set)
+{
+       if (bit != TRACE_BLK_OPT_CLASSIC)
+               return 0;
+
+       /* don't output context-info for blk_classic output */
+       if (set)
+               trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+       else
+               trace_flags |= TRACE_ITER_CONTEXT_INFO;
+
+       return 0;
+}
+
 static struct tracer blk_tracer __read_mostly = {
        .name           = "blk",
        .init           = blk_tracer_init,
@@ -1241,6 +1347,7 @@ static struct tracer blk_tracer __read_mostly = {
        .print_header   = blk_tracer_print_header,
        .print_line     = blk_tracer_print_line,
        .flags          = &blk_tracer_flags,
+       .set_flag       = blk_tracer_set_flag,
 };
 
 static struct trace_event trace_blk_event = {
@@ -1285,7 +1392,8 @@ static int blk_trace_remove_queue(struct request_queue *q)
 /*
  * Setup everything required to start tracing
  */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+static int blk_trace_setup_queue(struct request_queue *q,
+                                struct block_device *bdev)
 {
        struct blk_trace *old_bt, *bt = NULL;
        int ret = -ENOMEM;
@@ -1298,9 +1406,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
        if (!bt->msg_data)
                goto free_bt;
 
-       bt->dev = dev;
+       bt->dev = bdev->bd_dev;
        bt->act_mask = (u16)-1;
-       bt->end_lba = -1ULL;
+
+       blk_trace_setup_lba(bt, bdev);
 
        old_bt = xchg(&q->blk_trace, bt);
        if (old_bt != NULL) {
@@ -1517,7 +1626,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
        if (attr == &dev_attr_enable) {
                if (value)
-                       ret = blk_trace_setup_queue(q, bdev->bd_dev);
+                       ret = blk_trace_setup_queue(q, bdev);
                else
                        ret = blk_trace_remove_queue(q);
                goto out_unlock_bdev;
@@ -1525,7 +1634,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 
        ret = 0;
        if (q->blk_trace == NULL)
-               ret = blk_trace_setup_queue(q, bdev->bd_dev);
+               ret = blk_trace_setup_queue(q, bdev);
 
        if (ret == 0) {
                if (attr == &dev_attr_act_mask)
@@ -1548,3 +1657,80 @@ out:
        return ret ? ret : count;
 }
 
+/*
+ * Create the blk_trace attribute group on @dev's kobject.
+ * Returns whatever sysfs_create_group() returns.
+ */
+int blk_trace_init_sysfs(struct device *dev)
+{
+       return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
+}
+
+#endif /* CONFIG_BLK_DEV_IO_TRACE */
+
+#ifdef CONFIG_EVENT_TRACING
+
+/*
+ * Format the command bytes of @rq into @buf as space separated hex.
+ * For anything that is not a PC request @buf becomes the empty string.
+ * A run of trailing zero bytes is cut short and marked with " ..".
+ *
+ * No length is checked here: the caller must supply a buffer big enough
+ * for the formatted command.
+ */
+void blk_dump_cmd(char *buf, struct request *rq)
+{
+       int i, end;
+       int len = rq->cmd_len;
+       unsigned char *cmd = rq->cmd;
+
+       /*
+        * Terminate up front: with cmd_len == 0 the formatting loop never
+        * runs and would otherwise leave @buf without a terminator.
+        */
+       buf[0] = '\0';
+
+       if (!blk_pc_request(rq))
+               return;
+
+       /* end becomes the index just past the last non-zero byte */
+       for (end = len - 1; end >= 0; end--)
+               if (cmd[end])
+                       break;
+       end++;
+
+       for (i = 0; i < len; i++) {
+               buf += sprintf(buf, "%s%02x", i == 0 ? "" : " ", cmd[i]);
+               if (i == end && end != len - 1) {
+                       sprintf(buf, " ..");
+                       break;
+               }
+       }
+}
+
+/*
+ * Build the short flag string for a request in @rwbs: one letter for the
+ * kind of transfer (W, D, R, or N when there are no bytes), followed by
+ * any of A, B, S, M for the ahead, barrier, sync and meta bits of @rw.
+ * The result is NUL terminated.
+ */
+void blk_fill_rwbs(char *rwbs, u32 rw, int bytes)
+{
+       char *p = rwbs;
+
+       if (rw & WRITE)
+               *p++ = 'W';
+       else if (rw & (1 << BIO_RW_DISCARD))
+               *p++ = 'D';
+       else if (bytes)
+               *p++ = 'R';
+       else
+               *p++ = 'N';
+
+       if (rw & (1 << BIO_RW_AHEAD))
+               *p++ = 'A';
+       if (rw & (1 << BIO_RW_BARRIER))
+               *p++ = 'B';
+       if (rw & (1 << BIO_RW_SYNCIO))
+               *p++ = 'S';
+       if (rw & (1 << BIO_RW_META))
+               *p++ = 'M';
+
+       *p = '\0';
+}
+
+/*
+ * Request flavour of blk_fill_rwbs(): derive the rw bits and the byte
+ * count from @rq and format them into @rwbs.
+ */
+void blk_fill_rwbs_rq(char *rwbs, struct request *rq)
+{
+       int rw = rq->cmd_flags & 0x03;
+       int bytes;
+
+       if (blk_discard_rq(rq))
+               rw |= (1 << BIO_RW_DISCARD);
+
+       /* PC requests carry a byte length, everything else a sector count */
+       if (!blk_pc_request(rq))
+               bytes = rq->hard_nr_sectors << 9;
+       else
+               bytes = rq->data_len;
+
+       blk_fill_rwbs(rwbs, rw, bytes);
+}
+
+#endif /* CONFIG_EVENT_TRACING */
+
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
deleted file mode 100644 (file)
index 246f2aa..0000000
+++ /dev/null
@@ -1,14 +0,0 @@
-/*
- * This is the place to register all trace points as events.
- */
-
-#include <linux/stringify.h>
-
-#include <trace/trace_events.h>
-
-#include "trace_output.h"
-
-#include "trace_events_stage_1.h"
-#include "trace_events_stage_2.h"
-#include "trace_events_stage_3.h"
-
index f1ed080406c31f6bf61025e34e626f71b2865ad6..bb60732ade0cc7bca54fdbff32a9dfda16bacde9 100644 (file)
 #include <linux/list.h>
 #include <linux/hash.h>
 
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include <asm/ftrace.h>
+#include <asm/setup.h>
 
-#include "trace.h"
+#include "trace_output.h"
+#include "trace_stat.h"
 
 #define FTRACE_WARN_ON(cond)                   \
        do {                                    \
@@ -68,7 +70,7 @@ static DEFINE_MUTEX(ftrace_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
-       .func = ftrace_stub,
+       .func           = ftrace_stub,
 };
 
 static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end;
@@ -240,6 +242,580 @@ static void ftrace_update_pid_func(void)
 #endif
 }
 
+#ifdef CONFIG_FUNCTION_PROFILER
+/*
+ * One profiled function.
+ *
+ * @node:    linkage into a bucket of the per-cpu profile hash
+ * @ip:      address of the function, also used as the hash key
+ * @counter: number of hits recorded for the function
+ * @time:    accumulated call time (function graph tracer only)
+ */
+struct ftrace_profile {
+       struct hlist_node               node;
+       unsigned long                   ip;
+       unsigned long                   counter;
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       unsigned long long              time;
+#endif
+};
+
+/*
+ * One page worth of profile records.
+ *
+ * @next:    following page in the chain, NULL on the last page
+ * @index:   number of entries of @records handed out so far
+ * @records: the records themselves, filling the rest of the page
+ */
+struct ftrace_profile_page {
+       struct ftrace_profile_page      *next;
+       unsigned long                   index;
+       struct ftrace_profile           records[];
+};
+
+/*
+ * Per-cpu profiler state.
+ *
+ * @disabled: recursion guard taken while a new record is handed out
+ * @hash:     hash table of records, keyed by function address
+ * @pages:    page that new records are currently taken from
+ * @start:    first page of the chain
+ * @stat:     stat tracer registered for this cpu
+ */
+struct ftrace_profile_stat {
+       atomic_t                        disabled;
+       struct hlist_head               *hash;
+       struct ftrace_profile_page      *pages;
+       struct ftrace_profile_page      *start;
+       struct tracer_stat              stat;
+};
+
+#define PROFILE_RECORDS_SIZE                                           \
+       (PAGE_SIZE - offsetof(struct ftrace_profile_page, records))
+
+#define PROFILES_PER_PAGE                                      \
+       (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile))
+
+static int ftrace_profile_bits __read_mostly;
+static int ftrace_profile_enabled __read_mostly;
+
+/* ftrace_profile_lock - synchronize the enable and disable of the profiler */
+static DEFINE_MUTEX(ftrace_profile_lock);
+
+static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats);
+
+#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */
+
+/*
+ * Stat iterator step.  With @idx == 0 the record @v itself is the
+ * candidate, otherwise the record following @v.  Once the used part of
+ * a page is exhausted the walk moves on to the next page; NULL is
+ * returned after the last page.
+ *
+ * NOTE(review): relies on the stat core passing a non-zero @idx on every
+ * call after the first one -- confirm against trace_stat.c.
+ */
+static void *
+function_stat_next(void *v, int idx)
+{
+       struct ftrace_profile *rec = v;
+       struct ftrace_profile_page *pg;
+
+       /* the page header sits at the start of the page holding the record */
+       pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK);
+
+ again:
+       /*
+        * function_stat_start() hands in the very first record with
+        * idx == 0.  Stepping past it unconditionally would drop that
+        * record from the output.
+        */
+       if (idx != 0)
+               rec++;
+
+       if ((void *)rec >= (void *)&pg->records[pg->index]) {
+               pg = pg->next;
+               if (!pg)
+                       return NULL;
+               rec = &pg->records[0];
+               if (!rec->counter)
+                       goto again;
+       }
+
+       return rec;
+}
+
+/*
+ * Stat iterator start: begin the walk at the first record of the first
+ * page of the profile embedding @trace, or return NULL when no pages
+ * have been set up.
+ */
+static void *function_stat_start(struct tracer_stat *trace)
+{
+       struct ftrace_profile_stat *pstat;
+
+       pstat = container_of(trace, struct ftrace_profile_stat, stat);
+       if (pstat && pstat->start)
+               return function_stat_next(&pstat->start->records[0], 0);
+
+       return NULL;
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* with the function graph tracer, records are ordered by total time */
+static int function_stat_cmp(void *p1, void *p2)
+{
+       struct ftrace_profile *lhs = p1;
+       struct ftrace_profile *rhs = p2;
+
+       if (lhs->time == rhs->time)
+               return 0;
+
+       return lhs->time < rhs->time ? -1 : 1;
+}
+#else
+/* without the function graph tracer, records are ordered by hit count */
+static int function_stat_cmp(void *p1, void *p2)
+{
+       struct ftrace_profile *lhs = p1;
+       struct ftrace_profile *rhs = p2;
+
+       if (lhs->counter == rhs->counter)
+               return 0;
+
+       return lhs->counter < rhs->counter ? -1 : 1;
+}
+#endif
+
+/*
+ * Print the column headers of the function stat file into @m.  The
+ * Time and Avg columns only exist with the function graph tracer.
+ * Always returns 0.
+ */
+static int function_stat_headers(struct seq_file *m)
+{
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       seq_printf(m, "  Function                               "
+                  "Hit    Time            Avg\n"
+                     "  --------                               "
+                  "---    ----            ---\n");
+#else
+       seq_printf(m, "  Function                               Hit\n"
+                     "  --------                               ---\n");
+#endif
+       return 0;
+}
+
+/*
+ * Print one profile record @v into @m: symbol name and hit count, and
+ * with the function graph tracer also the total and the average time.
+ * Always returns 0.
+ */
+static int function_stat_show(struct seq_file *m, void *v)
+{
+       struct ftrace_profile *rec = v;
+       char str[KSYM_SYMBOL_LEN];
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       /* the scratch trace_seq is static, so its users are serialised */
+       static DEFINE_MUTEX(mutex);
+       static struct trace_seq s;
+       unsigned long long avg;
+#endif
+
+       kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+       seq_printf(m, "  %-30.30s  %10lu", str, rec->counter);
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+       seq_printf(m, "    ");
+       /* average = total time / number of hits, computed in place */
+       avg = rec->time;
+       do_div(avg, rec->counter);
+
+       mutex_lock(&mutex);
+       trace_seq_init(&s);
+       trace_print_graph_duration(rec->time, &s);
+       trace_seq_puts(&s, "    ");
+       trace_print_graph_duration(avg, &s);
+       trace_print_seq(m, &s);
+       mutex_unlock(&mutex);
+#endif
+       seq_putc(m, '\n');
+
+       return 0;
+}
+
+/*
+ * Forget everything recorded in @stat: rewind to the first page, wipe
+ * the records and fill level of every page, and empty the hash.
+ */
+static void ftrace_profile_reset(struct ftrace_profile_stat *stat)
+{
+       struct ftrace_profile_page *pg;
+
+       stat->pages = stat->start;
+
+       for (pg = stat->start; pg; pg = pg->next) {
+               memset(pg->records, 0, PROFILE_RECORDS_SIZE);
+               pg->index = 0;
+       }
+
+       memset(stat->hash, 0,
+              FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head));
+}
+
+/*
+ * Preallocate the chain of record pages for @stat.
+ *
+ * Does nothing when the pages already exist.  Returns 0 on success and
+ * -ENOMEM when a page cannot be allocated; in that case every page
+ * allocated so far is released and @stat is left without pages.
+ */
+int ftrace_profile_pages_init(struct ftrace_profile_stat *stat)
+{
+       struct ftrace_profile_page *pg;
+       int functions;
+       int pages;
+       int i;
+
+       /* If we already allocated, do nothing */
+       if (stat->pages)
+               return 0;
+
+       stat->pages = (void *)get_zeroed_page(GFP_KERNEL);
+       if (!stat->pages)
+               return -ENOMEM;
+
+#ifdef CONFIG_DYNAMIC_FTRACE
+       functions = ftrace_update_tot_cnt;
+#else
+       /*
+        * We do not know the number of functions that exist because
+        * dynamic tracing is what counts them. With past experience
+        * we have around 20K functions. That should be more than enough.
+        * It is highly unlikely we will execute every function in
+        * the kernel.
+        */
+       functions = 20000;
+#endif
+
+       pg = stat->start = stat->pages;
+
+       pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE);
+
+       /* chain the remaining pages behind the first one */
+       for (i = 0; i < pages; i++) {
+               pg->next = (void *)get_zeroed_page(GFP_KERNEL);
+               if (!pg->next)
+                       goto out_free;
+               pg = pg->next;
+       }
+
+       return 0;
+
+ out_free:
+       /*
+        * stat->start heads the whole chain and stat->pages still points
+        * at that same first page, so this walk releases every page.
+        * Freeing stat->pages again afterwards would be a double free.
+        */
+       pg = stat->start;
+       while (pg) {
+               unsigned long tmp = (unsigned long)pg;
+
+               pg = pg->next;
+               free_page(tmp);
+       }
+
+       stat->pages = NULL;
+       stat->start = NULL;
+
+       return -ENOMEM;
+}
+
+/*
+ * Set up (or, when it already exists, reset) the profile of @cpu.
+ * Returns 0 on success and -ENOMEM when the hash or the record pages
+ * cannot be allocated.
+ */
+static int ftrace_profile_init_cpu(int cpu)
+{
+       struct ftrace_profile_stat *stat = &per_cpu(ftrace_profile_stats, cpu);
+       int size = FTRACE_PROFILE_HASH_SIZE;
+
+       /* If the profile is already created, simply reset it */
+       if (stat->hash) {
+               ftrace_profile_reset(stat);
+               return 0;
+       }
+
+       /*
+        * We are profiling all functions, but usually only a few thousand
+        * functions are hit. We'll make a hash of 1024 items.
+        */
+       stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL);
+       if (!stat->hash)
+               return -ENOMEM;
+
+       /* count the bits needed to index the hash, first time through only */
+       if (!ftrace_profile_bits) {
+               for (size--; size; size >>= 1)
+                       ftrace_profile_bits++;
+       }
+
+       /* Preallocate the function profiling pages */
+       if (ftrace_profile_pages_init(stat) >= 0)
+               return 0;
+
+       kfree(stat->hash);
+       stat->hash = NULL;
+       return -ENOMEM;
+}
+
+/*
+ * Initialise the profile of every online cpu, stopping at the first
+ * failure.  Returns 0 on success or the error of the failing cpu.
+ */
+static int ftrace_profile_init(void)
+{
+       int cpu;
+
+       for_each_online_cpu(cpu) {
+               int ret = ftrace_profile_init_cpu(cpu);
+
+               if (ret)
+                       return ret;
+       }
+
+       return 0;
+}
+
+/*
+ * Look up the record for @ip in the hash of @stat.  Returns the record,
+ * or NULL when @ip has not been recorded.
+ *
+ * interrupts must be disabled
+ */
+static struct ftrace_profile *
+ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+       struct ftrace_profile *entry;
+       struct hlist_head *bucket;
+       struct hlist_node *pos;
+
+       bucket = &stat->hash[hash_long(ip, ftrace_profile_bits)];
+       if (hlist_empty(bucket))
+               return NULL;
+
+       hlist_for_each_entry_rcu(entry, pos, bucket, node) {
+               if (entry->ip == ip)
+                       return entry;
+       }
+
+       return NULL;
+}
+
+/* Insert @rec into the hash of @stat, in the bucket selected by rec->ip. */
+static void ftrace_add_profile(struct ftrace_profile_stat *stat,
+                              struct ftrace_profile *rec)
+{
+       unsigned long key;
+
+       key = hash_long(rec->ip, ftrace_profile_bits);
+       hlist_add_head_rcu(&rec->node, &stat->hash[key]);
+}
+
+/*
+ * The memory is already allocated, this simply finds a new record to use.
+ *
+ * Returns the record for @ip (an existing one if it turned up in the
+ * meantime), or NULL when called recursively or when the preallocated
+ * pages are used up.
+ */
+static struct ftrace_profile *
+ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip)
+{
+       struct ftrace_profile *rec = NULL;
+
+       /* prevent recursion (from NMIs) */
+       if (atomic_inc_return(&stat->disabled) != 1)
+               goto out;
+
+       /*
+        * Try to find the function again since an NMI
+        * could have added it
+        */
+       rec = ftrace_find_profiled_func(stat, ip);
+       if (rec)
+               goto out;
+
+       /* current page is full: move to the next one, give up if none */
+       if (stat->pages->index == PROFILES_PER_PAGE) {
+               if (!stat->pages->next)
+                       goto out;
+               stat->pages = stat->pages->next;
+       }
+
+       rec = &stat->pages->records[stat->pages->index++];
+       rec->ip = ip;
+       ftrace_add_profile(stat, rec);
+
+ out:
+       /* balances the increment above on every path */
+       atomic_dec(&stat->disabled);
+
+       return rec;
+}
+
+/*
+ * Count one hit of the function at @ip in the profile of the current
+ * cpu, creating its record on first use.  @parent_ip is not used.
+ */
+static void
+function_profile_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct ftrace_profile_stat *stat;
+       struct ftrace_profile *rec;
+       unsigned long flags;
+
+       if (!ftrace_profile_enabled)
+               return;
+
+       local_irq_save(flags);
+
+       /* checked again now that interrupts are off */
+       stat = &__get_cpu_var(ftrace_profile_stats);
+       if (!stat->hash || !ftrace_profile_enabled)
+               goto out;
+
+       rec = ftrace_find_profiled_func(stat, ip);
+       if (!rec) {
+               rec = ftrace_profile_alloc(stat, ip);
+               if (!rec)
+                       goto out;
+       }
+
+       rec->counter++;
+ out:
+       local_irq_restore(flags);
+}
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/* Graph entry hook: count a hit for the entered function; always returns 1. */
+static int profile_graph_entry(struct ftrace_graph_ent *trace)
+{
+       function_profile_call(trace->func, 0);
+       return 1;
+}
+
+/*
+ * Graph return hook: add the duration of the finished call to the time
+ * of its record on the current cpu.  Unless TRACE_ITER_GRAPH_TIME is
+ * set, the time already attributed to nested calls is subtracted first.
+ */
+static void profile_graph_return(struct ftrace_graph_ret *trace)
+{
+       struct ftrace_profile_stat *stat;
+       unsigned long long calltime;
+       struct ftrace_profile *rec;
+       unsigned long flags;
+
+       local_irq_save(flags);
+       stat = &__get_cpu_var(ftrace_profile_stats);
+       if (!stat->hash || !ftrace_profile_enabled)
+               goto out;
+
+       calltime = trace->rettime - trace->calltime;
+
+       if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) {
+               int index;
+
+               index = trace->depth;
+
+               /* Append this call time to the parent time to subtract */
+               if (index)
+                       current->ret_stack[index - 1].subtime += calltime;
+
+               /* take out what the callees used, never going below zero */
+               if (current->ret_stack[index].subtime < calltime)
+                       calltime -= current->ret_stack[index].subtime;
+               else
+                       calltime = 0;
+       }
+
+       /* no record is created here; an unknown function is ignored */
+       rec = ftrace_find_profiled_func(stat, trace->func);
+       if (rec)
+               rec->time += calltime;
+
+ out:
+       local_irq_restore(flags);
+}
+
+/* Hook the profiler into the function graph tracer. */
+static int register_ftrace_profiler(void)
+{
+       return register_ftrace_graph(&profile_graph_return,
+                                    &profile_graph_entry);
+}
+
+/* Remove the profiler hooks from the function graph tracer. */
+static void unregister_ftrace_profiler(void)
+{
+       unregister_ftrace_graph();
+}
+#else
+/* ftrace_ops used by the profiler when the graph tracer is not built in */
+static struct ftrace_ops ftrace_profile_ops __read_mostly =
+{
+       .func           = function_profile_call,
+};
+
+/* Hook the profiler into the function tracer. */
+static int register_ftrace_profiler(void)
+{
+       return register_ftrace_function(&ftrace_profile_ops);
+}
+
+/* Remove the profiler hook from the function tracer. */
+static void unregister_ftrace_profiler(void)
+{
+       unregister_ftrace_function(&ftrace_profile_ops);
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+/*
+ * Write handler of the "function_profile_enabled" file.  The input is
+ * parsed as a decimal number; any non-zero value enables the profiler
+ * and zero disables it.  Writing the current state changes nothing.
+ *
+ * Returns @cnt on success, -EINVAL for oversized input, -EFAULT when
+ * the user buffer cannot be read, or the error from parsing, profile
+ * setup or registration.
+ */
+static ssize_t
+ftrace_profile_write(struct file *filp, const char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       unsigned long val;
+       char buf[64];           /* big enough to hold a number */
+       int ret;
+
+       /* leave room for the terminator added below */
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       val = !!val;
+
+       mutex_lock(&ftrace_profile_lock);
+       if (ftrace_profile_enabled ^ val) {
+               if (val) {
+                       ret = ftrace_profile_init();
+                       if (ret < 0) {
+                               cnt = ret;
+                               goto out;
+                       }
+
+                       ret = register_ftrace_profiler();
+                       if (ret < 0) {
+                               cnt = ret;
+                               goto out;
+                       }
+                       ftrace_profile_enabled = 1;
+               } else {
+                       ftrace_profile_enabled = 0;
+                       /*
+                        * unregister_ftrace_profiler calls stop_machine
+                        * so this acts like a synchronize_sched.
+                        */
+                       unregister_ftrace_profiler();
+               }
+       }
+ out:
+       mutex_unlock(&ftrace_profile_lock);
+
+       filp->f_pos += cnt;
+
+       return cnt;
+}
+
+/*
+ * Read handler of the "function_profile_enabled" file: report the
+ * current state of the profiler as a decimal number and a newline.
+ */
+static ssize_t
+ftrace_profile_read(struct file *filp, char __user *ubuf,
+                    size_t cnt, loff_t *ppos)
+{
+       char state[64];         /* big enough to hold a number */
+       int len = sprintf(state, "%u\n", ftrace_profile_enabled);
+
+       return simple_read_from_buffer(ubuf, cnt, ppos, state, len);
+}
+
+/* file operations of the "function_profile_enabled" debugfs file */
+static const struct file_operations ftrace_profile_fops = {
+       .open           = tracing_open_generic,
+       .read           = ftrace_profile_read,
+       .write          = ftrace_profile_write,
+};
+
+/*
+ * used to initialize the real stat files: this template is copied into
+ * the stat member of every per-cpu profile, which then gets its own name
+ */
+static struct tracer_stat function_stats __initdata = {
+       .name           = "functions",
+       .stat_start     = function_stat_start,
+       .stat_next      = function_stat_next,
+       .stat_cmp       = function_stat_cmp,
+       .stat_headers   = function_stat_headers,
+       .stat_show      = function_stat_show
+};
+
+/*
+ * Register one "function<cpu>" stat tracer per possible cpu and create
+ * the "function_profile_enabled" control file below @d_tracer.
+ *
+ * Failures are reported with a warning only; stat tracers registered
+ * before the failure stay in place.
+ */
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+       struct ftrace_profile_stat *stat;
+       struct dentry *entry;
+       char *name;
+       int ret;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               stat = &per_cpu(ftrace_profile_stats, cpu);
+
+               /* allocate enough for function name + cpu number */
+               name = kmalloc(32, GFP_KERNEL);
+               if (!name) {
+                       /*
+                        * The files created are permanent, if something happens
+                        * we still do not free memory.  Note that @stat is
+                        * per-cpu data, not a kmalloc() allocation, so it
+                        * must not be passed to kfree() either.
+                        */
+                       WARN(1,
+                            "Could not allocate stat file for cpu %d\n",
+                            cpu);
+                       return;
+               }
+               stat->stat = function_stats;
+               snprintf(name, 32, "function%d", cpu);
+               stat->stat.name = name;
+               ret = register_stat_tracer(&stat->stat);
+               if (ret) {
+                       WARN(1,
+                            "Could not register function stat for cpu %d\n",
+                            cpu);
+                       kfree(name);
+                       return;
+               }
+       }
+
+       entry = debugfs_create_file("function_profile_enabled", 0644,
+                                   d_tracer, NULL, &ftrace_profile_fops);
+       if (!entry)
+               pr_warning("Could not create debugfs "
+                          "'function_profile_enabled' entry\n");
+}
+
+#else /* CONFIG_FUNCTION_PROFILER */
+/* Without CONFIG_FUNCTION_PROFILER there is nothing to create. */
+static void ftrace_profile_debugfs(struct dentry *d_tracer)
+{
+}
+#endif /* CONFIG_FUNCTION_PROFILER */
+
 /* set when tracing only a pid */
 struct pid *ftrace_pid_trace;
 static struct pid * const ftrace_swapper_pid = &init_struct_pid;
@@ -261,7 +837,6 @@ struct ftrace_func_probe {
        struct rcu_head         rcu;
 };
 
-
 enum {
        FTRACE_ENABLE_CALLS             = (1 << 0),
        FTRACE_DISABLE_CALLS            = (1 << 1),
@@ -346,30 +921,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec)
        rec->flags |= FTRACE_FL_FREE;
 }
 
-void ftrace_release(void *start, unsigned long size)
-{
-       struct dyn_ftrace *rec;
-       struct ftrace_page *pg;
-       unsigned long s = (unsigned long)start;
-       unsigned long e = s + size;
-
-       if (ftrace_disabled || !start)
-               return;
-
-       mutex_lock(&ftrace_lock);
-       do_for_each_ftrace_rec(pg, rec) {
-               if ((rec->ip >= s) && (rec->ip < e)) {
-                       /*
-                        * rec->ip is changed in ftrace_free_rec()
-                        * It should not between s and e if record was freed.
-                        */
-                       FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
-                       ftrace_free_rec(rec);
-               }
-       } while_for_each_ftrace_rec();
-       mutex_unlock(&ftrace_lock);
-}
-
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 {
        struct dyn_ftrace *rec;
@@ -1408,7 +1959,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 
 static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-       .func = function_trace_probe_call,
+       .func           = function_trace_probe_call,
 };
 
 static int ftrace_probe_registered;
@@ -1823,6 +2374,45 @@ void ftrace_set_notrace(unsigned char *buf, int len, int reset)
        ftrace_set_regex(buf, len, reset, 0);
 }
 
+/*
+ * command line interface to allow users to set filters on boot up.
+ */
+#define FTRACE_FILTER_SIZE             COMMAND_LINE_SIZE
+static char ftrace_notrace_buf[FTRACE_FILTER_SIZE] __initdata;
+static char ftrace_filter_buf[FTRACE_FILTER_SIZE] __initdata;
+
+/*
+ * Handler for the "ftrace_notrace=" boot parameter: save the value in
+ * ftrace_notrace_buf so set_ftrace_early_filters() can apply it later.
+ * Values longer than the buffer are truncated.  Always returns 1.
+ */
+static int __init set_ftrace_notrace(char *str)
+{
+       /* strlcpy() always NUL-terminates; strncpy() does not on truncation */
+       strlcpy(ftrace_notrace_buf, str, FTRACE_FILTER_SIZE);
+       return 1;
+}
+__setup("ftrace_notrace=", set_ftrace_notrace);
+
+/*
+ * Handler for the "ftrace_filter=" boot parameter: save the value in
+ * ftrace_filter_buf so set_ftrace_early_filters() can apply it later.
+ * Values longer than the buffer are truncated.  Always returns 1.
+ */
+static int __init set_ftrace_filter(char *str)
+{
+       /* strlcpy() always NUL-terminates; strncpy() does not on truncation */
+       strlcpy(ftrace_filter_buf, str, FTRACE_FILTER_SIZE);
+       return 1;
+}
+__setup("ftrace_filter=", set_ftrace_filter);
+
+/*
+ * Split @buf in place on ',' and hand every piece to ftrace_set_regex()
+ * with @enable.  A NULL @buf results in no calls.
+ */
+static void __init set_ftrace_early_filter(char *buf, int enable)
+{
+       char *rest = buf;
+       char *token;
+
+       /* strsep() returns NULL once the remaining string pointer is NULL */
+       while ((token = strsep(&rest, ",")) != NULL)
+               ftrace_set_regex(token, strlen(token), 0, enable);
+}
+
+/*
+ * Apply the lists saved from the boot command line: the filter list
+ * first, then the notrace list.  An empty buffer is skipped.
+ */
+static void __init set_ftrace_early_filters(void)
+{
+       if (*ftrace_filter_buf != '\0')
+               set_ftrace_early_filter(ftrace_filter_buf, 1);
+
+       if (*ftrace_notrace_buf != '\0')
+               set_ftrace_early_filter(ftrace_notrace_buf, 0);
+}
+
 static int
 ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 {
@@ -2128,38 +2718,23 @@ static const struct file_operations ftrace_graph_fops = {
 
 static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer)
 {
-       struct dentry *entry;
 
-       entry = debugfs_create_file("available_filter_functions", 0444,
-                                   d_tracer, NULL, &ftrace_avail_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'available_filter_functions' entry\n");
+       trace_create_file("available_filter_functions", 0444,
+                       d_tracer, NULL, &ftrace_avail_fops);
 
-       entry = debugfs_create_file("failures", 0444,
-                                   d_tracer, NULL, &ftrace_failures_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'failures' entry\n");
+       trace_create_file("failures", 0444,
+                       d_tracer, NULL, &ftrace_failures_fops);
 
-       entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer,
-                                   NULL, &ftrace_filter_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_filter' entry\n");
+       trace_create_file("set_ftrace_filter", 0644, d_tracer,
+                       NULL, &ftrace_filter_fops);
 
-       entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer,
+       trace_create_file("set_ftrace_notrace", 0644, d_tracer,
                                    NULL, &ftrace_notrace_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_notrace' entry\n");
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-       entry = debugfs_create_file("set_graph_function", 0444, d_tracer,
+       trace_create_file("set_graph_function", 0444, d_tracer,
                                    NULL,
                                    &ftrace_graph_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_graph_function' entry\n");
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
        return 0;
@@ -2197,14 +2772,72 @@ static int ftrace_convert_nops(struct module *mod,
        return 0;
 }
 
-void ftrace_init_module(struct module *mod,
-                       unsigned long *start, unsigned long *end)
+#ifdef CONFIG_MODULES
+/*
+ * Free, under ftrace_lock, every dyn_ftrace record whose ip lies in
+ * [@start, @end).  Does nothing when ftrace is disabled, @start is NULL
+ * or the range is empty.
+ */
+void ftrace_release(void *start, void *end)
+{
+       unsigned long range_start = (unsigned long)start;
+       unsigned long range_end = (unsigned long)end;
+       struct ftrace_page *pg;
+       struct dyn_ftrace *rec;
+
+       if (ftrace_disabled)
+               return;
+       if (!start || start == end)
+               return;
+
+       mutex_lock(&ftrace_lock);
+       do_for_each_ftrace_rec(pg, rec) {
+               if (range_start <= rec->ip && rec->ip < range_end) {
+                       /*
+                        * rec->ip is changed in ftrace_free_rec(), so a
+                        * record that was already freed should not fall
+                        * inside the range.
+                        */
+                       FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
+                       ftrace_free_rec(rec);
+               }
+       } while_for_each_ftrace_rec();
+       mutex_unlock(&ftrace_lock);
+}
+
+/*
+ * Pass the call sites in [@start, @end) of module @mod to
+ * ftrace_convert_nops().  Does nothing when ftrace is disabled or the
+ * range is empty.
+ */
+static void ftrace_init_module(struct module *mod,
+                              unsigned long *start, unsigned long *end)
 {
        if (ftrace_disabled || start == end)
                return;
        ftrace_convert_nops(mod, start, end);
 }
 
+/*
+ * Module notifier callback.  @data is the struct module the event is for.
+ * On MODULE_STATE_COMING the module's ftrace call sites are handed to
+ * ftrace_init_module(); on MODULE_STATE_GOING the same range is handed to
+ * ftrace_release().  Other states are ignored.  Always returns 0.
+ */
+static int ftrace_module_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       if (val == MODULE_STATE_COMING)
+               ftrace_init_module(mod, mod->ftrace_callsites,
+                                  mod->ftrace_callsites +
+                                  mod->num_ftrace_callsites);
+       else if (val == MODULE_STATE_GOING)
+               ftrace_release(mod->ftrace_callsites,
+                              mod->ftrace_callsites +
+                              mod->num_ftrace_callsites);
+
+       return 0;
+}
+#else
+/* Stub used when CONFIG_MODULES is not set: ignores the event, returns 0. */
+static int ftrace_module_notify(struct notifier_block *self,
+                               unsigned long val, void *data)
+{
+       return 0;
+}
+#endif /* CONFIG_MODULES */
+
+/*
+ * Notifier block that routes module events to ftrace_module_notify().
+ * It is handed to register_module_notifier() from ftrace_init().
+ */
+struct notifier_block ftrace_module_nb = {
+       .notifier_call = ftrace_module_notify,
+       .priority = 0,
+};
+
 extern unsigned long __start_mcount_loc[];
 extern unsigned long __stop_mcount_loc[];
 
@@ -2236,6 +2869,12 @@ void __init ftrace_init(void)
                                  __start_mcount_loc,
                                  __stop_mcount_loc);
 
+       ret = register_module_notifier(&ftrace_module_nb);
+       if (ret)
+               pr_warning("Failed to register trace ftrace module notifier\n");
+
+       set_ftrace_early_filters();
+
        return;
  failed:
        ftrace_disabled = 1;
@@ -2417,7 +3056,6 @@ static const struct file_operations ftrace_pid_fops = {
 static __init int ftrace_init_debugfs(void)
 {
        struct dentry *d_tracer;
-       struct dentry *entry;
 
        d_tracer = tracing_init_dentry();
        if (!d_tracer)
@@ -2425,11 +3063,11 @@ static __init int ftrace_init_debugfs(void)
 
        ftrace_init_dyn_debugfs(d_tracer);
 
-       entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer,
-                                   NULL, &ftrace_pid_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'set_ftrace_pid' entry\n");
+       trace_create_file("set_ftrace_pid", 0644, d_tracer,
+                           NULL, &ftrace_pid_fops);
+
+       ftrace_profile_debugfs(d_tracer);
+
        return 0;
 }
 fs_initcall(ftrace_init_debugfs);
@@ -2538,7 +3176,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
-static atomic_t ftrace_graph_active;
+static int ftrace_graph_active;
 static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
@@ -2580,12 +3218,12 @@ static int alloc_retstack_tasklist(struct ftrace_ret_stack **ret_stack_list)
                }
 
                if (t->ret_stack == NULL) {
-                       t->curr_ret_stack = -1;
-                       /* Make sure IRQs see the -1 first: */
-                       barrier();
-                       t->ret_stack = ret_stack_list[start++];
                        atomic_set(&t->tracing_graph_pause, 0);
                        atomic_set(&t->trace_overrun, 0);
+                       t->curr_ret_stack = -1;
+                       /* Make sure the tasks see the -1 first: */
+                       smp_wmb();
+                       t->ret_stack = ret_stack_list[start++];
                }
        } while_each_thread(g, t);
 
@@ -2643,8 +3281,10 @@ static int start_graph_tracing(void)
                return -ENOMEM;
 
        /* The cpu_boot init_task->ret_stack will never be freed */
-       for_each_online_cpu(cpu)
-               ftrace_graph_init_task(idle_task(cpu));
+       for_each_online_cpu(cpu) {
+               if (!idle_task(cpu)->ret_stack)
+                       ftrace_graph_init_task(idle_task(cpu));
+       }
 
        do {
                ret = alloc_retstack_tasklist(ret_stack_list);
@@ -2690,7 +3330,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
        mutex_lock(&ftrace_lock);
 
        /* we currently allow only one tracer registered at a time */
-       if (atomic_read(&ftrace_graph_active)) {
+       if (ftrace_graph_active) {
                ret = -EBUSY;
                goto out;
        }
@@ -2698,10 +3338,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
        ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
        register_pm_notifier(&ftrace_suspend_notifier);
 
-       atomic_inc(&ftrace_graph_active);
+       ftrace_graph_active++;
        ret = start_graph_tracing();
        if (ret) {
-               atomic_dec(&ftrace_graph_active);
+               ftrace_graph_active--;
                goto out;
        }
 
@@ -2719,10 +3359,10 @@ void unregister_ftrace_graph(void)
 {
        mutex_lock(&ftrace_lock);
 
-       if (!unlikely(atomic_read(&ftrace_graph_active)))
+       if (unlikely(!ftrace_graph_active))
                goto out;
 
-       atomic_dec(&ftrace_graph_active);
+       ftrace_graph_active--;
        unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
        ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
        ftrace_graph_entry = ftrace_graph_entry_stub;
@@ -2736,18 +3376,25 @@ void unregister_ftrace_graph(void)
 /* Allocate a return stack for newly created task */
 void ftrace_graph_init_task(struct task_struct *t)
 {
-       if (atomic_read(&ftrace_graph_active)) {
-               t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
+       /* Make sure we do not use the parent ret_stack */
+       t->ret_stack = NULL;
+
+       if (ftrace_graph_active) {
+               struct ftrace_ret_stack *ret_stack;
+
+               ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH
                                * sizeof(struct ftrace_ret_stack),
                                GFP_KERNEL);
-               if (!t->ret_stack)
+               if (!ret_stack)
                        return;
                t->curr_ret_stack = -1;
                atomic_set(&t->tracing_graph_pause, 0);
                atomic_set(&t->trace_overrun, 0);
                t->ftrace_timestamp = 0;
-       } else
-               t->ret_stack = NULL;
+               /* make curr_ret_stack visible before we add the ret_stack */
+               smp_wmb();
+               t->ret_stack = ret_stack;
+       }
 }
 
 void ftrace_graph_exit_task(struct task_struct *t)
index 5011f4d91e375c2895cdc074299caafdf058ca38..86cdf671d7e288c0998a4b14e2b2f2f2c709fdb2 100644 (file)
@@ -12,7 +12,7 @@
 #include <linux/dcache.h>
 #include <linux/fs.h>
 
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 
 #include "trace_output.h"
 #include "trace.h"
@@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
                                   gfp_t gfp_flags,
                                   int node)
 {
+       struct ftrace_event_call *call = &event_kmem_alloc;
        struct trace_array *tr = kmemtrace_array;
        struct kmemtrace_alloc_entry *entry;
        struct ring_buffer_event *event;
@@ -62,7 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
        entry->gfp_flags        = gfp_flags;
        entry->node             = node;
 
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 
        trace_wake_up();
 }
@@ -71,6 +73,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
                                  unsigned long call_site,
                                  const void *ptr)
 {
+       struct ftrace_event_call *call = &event_kmem_free;
        struct trace_array *tr = kmemtrace_array;
        struct kmemtrace_free_entry *entry;
        struct ring_buffer_event *event;
@@ -86,7 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
        entry->call_site        = call_site;
        entry->ptr              = ptr;
 
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 
        trace_wake_up();
 }
index 960cbf44c844a17dd156b25927d6e98c89840600..2e642b2b7253d11ddce40c657a471f007561be65 100644 (file)
 
 #include "trace.h"
 
+/*
+ * The ring buffer event header is special: its layout is described by
+ * hand here, so this text must be kept up to date manually.
+ *
+ * Writes the description into @s.  Each trace_seq_printf() result
+ * overwrites the previous one, so the value returned is that of the
+ * final call only.
+ */
+int ring_buffer_print_entry_header(struct trace_seq *s)
+{
+       int ret;
+
+       ret = trace_seq_printf(s, "# compressed entry header\n");
+       ret = trace_seq_printf(s, "\ttype_len    :    5 bits\n");
+       ret = trace_seq_printf(s, "\ttime_delta  :   27 bits\n");
+       ret = trace_seq_printf(s, "\tarray       :   32 bits\n");
+       ret = trace_seq_printf(s, "\n");
+       ret = trace_seq_printf(s, "\tpadding     : type == %d\n",
+                              RINGBUF_TYPE_PADDING);
+       ret = trace_seq_printf(s, "\ttime_extend : type == %d\n",
+                              RINGBUF_TYPE_TIME_EXTEND);
+       ret = trace_seq_printf(s, "\tdata max type_len  == %d\n",
+                              RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
+
+       return ret;
+}
+
 /*
  * The ring buffer is made up of a list of pages. A separate list of pages is
  * allocated for each CPU. A writer may only write to a buffer that is
@@ -182,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT           4U
-#define RB_MAX_SMALL_DATA      28
+#define RB_MAX_SMALL_DATA      (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
+
+/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */
+#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX
 
 enum {
        RB_LEN_TIME_EXTEND = 8,
@@ -191,48 +216,28 @@ enum {
 
 static inline int rb_null_event(struct ring_buffer_event *event)
 {
-       return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+       return event->type_len == RINGBUF_TYPE_PADDING
+                       && event->time_delta == 0;
 }
 
 static inline int rb_discarded_event(struct ring_buffer_event *event)
 {
-       return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+       return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta;
 }
 
 static void rb_event_set_padding(struct ring_buffer_event *event)
 {
-       event->type = RINGBUF_TYPE_PADDING;
+       event->type_len = RINGBUF_TYPE_PADDING;
        event->time_delta = 0;
 }
 
-/**
- * ring_buffer_event_discard - discard an event in the ring buffer
- * @buffer: the ring buffer
- * @event: the event to discard
- *
- * Sometimes a event that is in the ring buffer needs to be ignored.
- * This function lets the user discard an event in the ring buffer
- * and then that event will not be read later.
- *
- * Note, it is up to the user to be careful with this, and protect
- * against races. If the user discards an event that has been consumed
- * it is possible that it could corrupt the ring buffer.
- */
-void ring_buffer_event_discard(struct ring_buffer_event *event)
-{
-       event->type = RINGBUF_TYPE_PADDING;
-       /* time delta must be non zero */
-       if (!event->time_delta)
-               event->time_delta = 1;
-}
-
 static unsigned
 rb_event_data_length(struct ring_buffer_event *event)
 {
        unsigned length;
 
-       if (event->len)
-               length = event->len * RB_ALIGNMENT;
+       if (event->type_len)
+               length = event->type_len * RB_ALIGNMENT;
        else
                length = event->array[0];
        return length + RB_EVNT_HDR_SIZE;
@@ -242,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event)
 static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
-       switch (event->type) {
+       switch (event->type_len) {
        case RINGBUF_TYPE_PADDING:
                if (rb_null_event(event))
                        /* undefined */
                        return -1;
-               return rb_event_data_length(event);
+               return  event->array[0] + RB_EVNT_HDR_SIZE;
 
        case RINGBUF_TYPE_TIME_EXTEND:
                return RB_LEN_TIME_EXTEND;
@@ -271,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event)
 unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 {
        unsigned length = rb_event_length(event);
-       if (event->type != RINGBUF_TYPE_DATA)
+       if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX)
                return length;
        length -= RB_EVNT_HDR_SIZE;
        if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0]))
@@ -284,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 static void *
 rb_event_data(struct ring_buffer_event *event)
 {
-       BUG_ON(event->type != RINGBUF_TYPE_DATA);
+       BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX);
        /* If length is in len field, then array[0] has the data */
-       if (event->len)
+       if (event->type_len)
                return (void *)&event->array[0];
        /* Otherwise length is in array[0] and array[1] has the data */
        return (void *)&event->array[1];
@@ -316,9 +321,10 @@ struct buffer_data_page {
 };
 
 struct buffer_page {
+       struct list_head list;          /* list of buffer pages */
        local_t          write;         /* index for next write */
        unsigned         read;          /* index for next read */
-       struct list_head list;          /* list of free pages */
+       local_t          entries;       /* entries on this page */
        struct buffer_data_page *page;  /* Actual data page */
 };
 
@@ -361,6 +367,34 @@ static inline int test_time_stamp(u64 delta)
 
 #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
+/* Max payload is BUF_PAGE_SIZE - header (8bytes) */
+#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2))
+
+/* Max number of timestamps that can fit on a page */
+#define RB_TIMESTAMPS_PER_PAGE (BUF_PAGE_SIZE / RB_LEN_TIME_STAMP)
+
+/*
+ * Write a description of struct buffer_data_page into @s: the time_stamp,
+ * commit and data fields with their offsets and sizes.  The time_stamp
+ * offset is printed as a literal 0 and the data size as BUF_PAGE_SIZE.
+ *
+ * Returns the result of the last trace_seq_printf() call only; earlier
+ * results are overwritten.
+ */
+int ring_buffer_print_page_header(struct trace_seq *s)
+{
+       /* only used as an operand of sizeof()/offsetof(), never read */
+       struct buffer_data_page field;
+       int ret;
+
+       ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t"
+                              "offset:0;\tsize:%u;\n",
+                              (unsigned int)sizeof(field.time_stamp));
+
+       ret = trace_seq_printf(s, "\tfield: local_t commit;\t"
+                              "offset:%u;\tsize:%u;\n",
+                              (unsigned int)offsetof(typeof(field), commit),
+                              (unsigned int)sizeof(field.commit));
+
+       ret = trace_seq_printf(s, "\tfield: char data;\t"
+                              "offset:%u;\tsize:%u;\n",
+                              (unsigned int)offsetof(typeof(field), data),
+                              (unsigned int)BUF_PAGE_SIZE);
+
+       return ret;
+}
+
 /*
  * head_page == tail_page && head == tail then buffer is empty.
  */
@@ -375,8 +409,11 @@ struct ring_buffer_per_cpu {
        struct buffer_page              *tail_page;     /* write to tail */
        struct buffer_page              *commit_page;   /* committed pages */
        struct buffer_page              *reader_page;
+       unsigned long                   nmi_dropped;
+       unsigned long                   commit_overrun;
        unsigned long                   overrun;
-       unsigned long                   entries;
+       unsigned long                   read;
+       local_t                         entries;
        u64                             write_stamp;
        u64                             read_stamp;
        atomic_t                        record_disabled;
@@ -389,6 +426,8 @@ struct ring_buffer {
        atomic_t                        record_disabled;
        cpumask_var_t                   cpumask;
 
+       struct lock_class_key           *reader_lock_key;
+
        struct mutex                    mutex;
 
        struct ring_buffer_per_cpu      **buffers;
@@ -420,13 +459,18 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
+/*
+ * Return the value of @buffer's clock callback shifted left by
+ * DEBUG_SHIFT.  @cpu is not used.  This helper does not disable
+ * preemption itself; ring_buffer_time_stamp() does so around its call.
+ */
+static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+       /* shift to debug/test normalization and TIME_EXTENTS */
+       return buffer->clock() << DEBUG_SHIFT;
+}
+
 u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 {
        u64 time;
 
        preempt_disable_notrace();
-       /* shift to debug/test normalization and TIME_EXTENTS */
-       time = buffer->clock() << DEBUG_SHIFT;
+       time = rb_time_stamp(buffer, cpu);
        preempt_enable_no_resched_notrace();
 
        return time;
@@ -523,6 +567,7 @@ rb_allocate_cpu_buffer(struct ring_buffer *buffer, int cpu)
        cpu_buffer->cpu = cpu;
        cpu_buffer->buffer = buffer;
        spin_lock_init(&cpu_buffer->reader_lock);
+       lockdep_set_class(&cpu_buffer->reader_lock, buffer->reader_lock_key);
        cpu_buffer->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
        INIT_LIST_HEAD(&cpu_buffer->pages);
 
@@ -593,7 +638,8 @@ static int rb_cpu_notify(struct notifier_block *self,
  * when the buffer wraps. If this flag is not set, the buffer will
  * drop data when the tail hits the head.
  */
-struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
+struct ring_buffer *__ring_buffer_alloc(unsigned long size, unsigned flags,
+                                       struct lock_class_key *key)
 {
        struct ring_buffer *buffer;
        int bsize;
@@ -616,6 +662,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
        buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
        buffer->flags = flags;
        buffer->clock = trace_clock_local;
+       buffer->reader_lock_key = key;
 
        /* need at least two pages */
        if (buffer->pages == 1)
@@ -673,7 +720,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
        kfree(buffer);
        return NULL;
 }
-EXPORT_SYMBOL_GPL(ring_buffer_alloc);
+EXPORT_SYMBOL_GPL(__ring_buffer_alloc);
 
 /**
  * ring_buffer_free - free a ring buffer.
@@ -947,31 +994,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer)
        return rb_page_commit(cpu_buffer->head_page);
 }
 
-/*
- * When the tail hits the head and the buffer is in overwrite mode,
- * the head jumps to the next page and all content on the previous
- * page is discarded. But before doing so, we update the overrun
- * variable of the buffer.
- */
-static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer)
-{
-       struct ring_buffer_event *event;
-       unsigned long head;
-
-       for (head = 0; head < rb_head_size(cpu_buffer);
-            head += rb_event_length(event)) {
-
-               event = __rb_page_index(cpu_buffer->head_page, head);
-               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                       return;
-               /* Only count data entries */
-               if (event->type != RINGBUF_TYPE_DATA)
-                       continue;
-               cpu_buffer->overrun++;
-               cpu_buffer->entries--;
-       }
-}
-
 static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer,
                               struct buffer_page **bpage)
 {
@@ -991,7 +1013,7 @@ rb_event_index(struct ring_buffer_event *event)
        return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static int
+static inline int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
             struct ring_buffer_event *event)
 {
@@ -1110,28 +1132,21 @@ static void
 rb_update_event(struct ring_buffer_event *event,
                         unsigned type, unsigned length)
 {
-       event->type = type;
+       event->type_len = type;
 
        switch (type) {
 
        case RINGBUF_TYPE_PADDING:
-               break;
-
        case RINGBUF_TYPE_TIME_EXTEND:
-               event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
-               break;
-
        case RINGBUF_TYPE_TIME_STAMP:
-               event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
                break;
 
-       case RINGBUF_TYPE_DATA:
+       case 0:
                length -= RB_EVNT_HDR_SIZE;
-               if (length > RB_MAX_SMALL_DATA) {
-                       event->len = 0;
+               if (length > RB_MAX_SMALL_DATA)
                        event->array[0] = length;
-               } else
-                       event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
+               else
+                       event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT);
                break;
        default:
                BUG();
@@ -1155,131 +1170,156 @@ static unsigned rb_calculate_event_length(unsigned length)
        return length;
 }
 
+
 static struct ring_buffer_event *
-__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
-                 unsigned type, unsigned long length, u64 *ts)
+rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
+            unsigned long length, unsigned long tail,
+            struct buffer_page *commit_page,
+            struct buffer_page *tail_page, u64 *ts)
 {
-       struct buffer_page *tail_page, *head_page, *reader_page, *commit_page;
-       unsigned long tail, write;
+       struct buffer_page *next_page, *head_page, *reader_page;
        struct ring_buffer *buffer = cpu_buffer->buffer;
        struct ring_buffer_event *event;
-       unsigned long flags;
        bool lock_taken = false;
+       unsigned long flags;
 
-       commit_page = cpu_buffer->commit_page;
-       /* we just need to protect against interrupts */
-       barrier();
-       tail_page = cpu_buffer->tail_page;
-       write = local_add_return(length, &tail_page->write);
-       tail = write - length;
+       next_page = tail_page;
 
-       /* See if we shot pass the end of this buffer page */
-       if (write > BUF_PAGE_SIZE) {
-               struct buffer_page *next_page = tail_page;
+       local_irq_save(flags);
+       /*
+        * Since the write to the buffer is still not
+        * fully lockless, we must be careful with NMIs.
+        * The locks in the writers are taken when a write
+        * crosses to a new page. The locks protect against
+        * races with the readers (this will soon be fixed
+        * with a lockless solution).
+        *
+        * Because we can not protect against NMIs, and we
+        * want to keep traces reentrant, we need to manage
+        * what happens when we are in an NMI.
+        *
+        * NMIs can happen after we take the lock.
+        * If we are in an NMI, only take the lock
+        * if it is not already taken. Otherwise
+        * simply fail.
+        */
+       if (unlikely(in_nmi())) {
+               if (!__raw_spin_trylock(&cpu_buffer->lock)) {
+                       cpu_buffer->nmi_dropped++;
+                       goto out_reset;
+               }
+       } else
+               __raw_spin_lock(&cpu_buffer->lock);
 
-               local_irq_save(flags);
-               /*
-                * Since the write to the buffer is still not
-                * fully lockless, we must be careful with NMIs.
-                * The locks in the writers are taken when a write
-                * crosses to a new page. The locks protect against
-                * races with the readers (this will soon be fixed
-                * with a lockless solution).
-                *
-                * Because we can not protect against NMIs, and we
-                * want to keep traces reentrant, we need to manage
-                * what happens when we are in an NMI.
-                *
-                * NMIs can happen after we take the lock.
-                * If we are in an NMI, only take the lock
-                * if it is not already taken. Otherwise
-                * simply fail.
-                */
-               if (unlikely(in_nmi())) {
-                       if (!__raw_spin_trylock(&cpu_buffer->lock))
-                               goto out_reset;
-               } else
-                       __raw_spin_lock(&cpu_buffer->lock);
+       lock_taken = true;
 
-               lock_taken = true;
+       rb_inc_page(cpu_buffer, &next_page);
 
-               rb_inc_page(cpu_buffer, &next_page);
+       head_page = cpu_buffer->head_page;
+       reader_page = cpu_buffer->reader_page;
 
-               head_page = cpu_buffer->head_page;
-               reader_page = cpu_buffer->reader_page;
+       /* we grabbed the lock before incrementing */
+       if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
+               goto out_reset;
 
-               /* we grabbed the lock before incrementing */
-               if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-                       goto out_reset;
+       /*
+        * If for some reason, we had an interrupt storm that made
+        * it all the way around the buffer, bail, and warn
+        * about it.
+        */
+       if (unlikely(next_page == commit_page)) {
+               cpu_buffer->commit_overrun++;
+               goto out_reset;
+       }
 
-               /*
-                * If for some reason, we had an interrupt storm that made
-                * it all the way around the buffer, bail, and warn
-                * about it.
-                */
-               if (unlikely(next_page == commit_page)) {
-                       WARN_ON_ONCE(1);
+       if (next_page == head_page) {
+               if (!(buffer->flags & RB_FL_OVERWRITE))
                        goto out_reset;
-               }
 
-               if (next_page == head_page) {
-                       if (!(buffer->flags & RB_FL_OVERWRITE))
-                               goto out_reset;
-
-                       /* tail_page has not moved yet? */
-                       if (tail_page == cpu_buffer->tail_page) {
-                               /* count overflows */
-                               rb_update_overflow(cpu_buffer);
+               /* tail_page has not moved yet? */
+               if (tail_page == cpu_buffer->tail_page) {
+                       /* count overflows */
+                       cpu_buffer->overrun +=
+                               local_read(&head_page->entries);
 
-                               rb_inc_page(cpu_buffer, &head_page);
-                               cpu_buffer->head_page = head_page;
-                               cpu_buffer->head_page->read = 0;
-                       }
+                       rb_inc_page(cpu_buffer, &head_page);
+                       cpu_buffer->head_page = head_page;
+                       cpu_buffer->head_page->read = 0;
                }
+       }
 
-               /*
-                * If the tail page is still the same as what we think
-                * it is, then it is up to us to update the tail
-                * pointer.
-                */
-               if (tail_page == cpu_buffer->tail_page) {
-                       local_set(&next_page->write, 0);
-                       local_set(&next_page->page->commit, 0);
-                       cpu_buffer->tail_page = next_page;
+       /*
+        * If the tail page is still the same as what we think
+        * it is, then it is up to us to update the tail
+        * pointer.
+        */
+       if (tail_page == cpu_buffer->tail_page) {
+               local_set(&next_page->write, 0);
+               local_set(&next_page->entries, 0);
+               local_set(&next_page->page->commit, 0);
+               cpu_buffer->tail_page = next_page;
+
+               /* reread the time stamp */
+               *ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+               cpu_buffer->tail_page->page->time_stamp = *ts;
+       }
 
-                       /* reread the time stamp */
-                       *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
-                       cpu_buffer->tail_page->page->time_stamp = *ts;
-               }
+       /*
+        * The actual tail page has moved forward.
+        */
+       if (tail < BUF_PAGE_SIZE) {
+               /* Mark the rest of the page with padding */
+               event = __rb_page_index(tail_page, tail);
+               rb_event_set_padding(event);
+       }
 
-               /*
-                * The actual tail page has moved forward.
-                */
-               if (tail < BUF_PAGE_SIZE) {
-                       /* Mark the rest of the page with padding */
-                       event = __rb_page_index(tail_page, tail);
-                       rb_event_set_padding(event);
-               }
+       /* Set the write back to the previous setting */
+       local_sub(length, &tail_page->write);
 
-               if (tail <= BUF_PAGE_SIZE)
-                       /* Set the write back to the previous setting */
-                       local_set(&tail_page->write, tail);
+       /*
+        * If this was a commit entry that failed,
+        * increment that too
+        */
+       if (tail_page == cpu_buffer->commit_page &&
+           tail == rb_commit_index(cpu_buffer)) {
+               rb_set_commit_to_write(cpu_buffer);
+       }
 
-               /*
-                * If this was a commit entry that failed,
-                * increment that too
-                */
-               if (tail_page == cpu_buffer->commit_page &&
-                   tail == rb_commit_index(cpu_buffer)) {
-                       rb_set_commit_to_write(cpu_buffer);
-               }
+       __raw_spin_unlock(&cpu_buffer->lock);
+       local_irq_restore(flags);
+
+       /* fail and let the caller try again */
+       return ERR_PTR(-EAGAIN);
+
+ out_reset:
+       /* reset write */
+       local_sub(length, &tail_page->write);
 
+       if (likely(lock_taken))
                __raw_spin_unlock(&cpu_buffer->lock);
-               local_irq_restore(flags);
+       local_irq_restore(flags);
+       return NULL;
+}
 
-               /* fail and let the caller try again */
-               return ERR_PTR(-EAGAIN);
-       }
+static struct ring_buffer_event *
+__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
+                 unsigned type, unsigned long length, u64 *ts)
+{
+       struct buffer_page *tail_page, *commit_page;
+       struct ring_buffer_event *event;
+       unsigned long tail, write;
+
+       commit_page = cpu_buffer->commit_page;
+       /* we just need to protect against interrupts */
+       barrier();
+       tail_page = cpu_buffer->tail_page;
+       write = local_add_return(length, &tail_page->write);
+       tail = write - length;
+
+       /* See if we shot pass the end of this buffer page */
+       if (write > BUF_PAGE_SIZE)
+               return rb_move_tail(cpu_buffer, length, tail,
+                                   commit_page, tail_page, ts);
 
        /* We reserved something on the buffer */
 
@@ -1289,6 +1329,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
        event = __rb_page_index(tail_page, tail);
        rb_update_event(event, type, length);
 
+       /* The passed in type is zero for DATA */
+       if (likely(!type))
+               local_inc(&tail_page->entries);
+
        /*
         * If this is a commit and the tail is zero, then update
         * this page's time stamp.
@@ -1297,16 +1341,38 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
                cpu_buffer->commit_page->page->time_stamp = *ts;
 
        return event;
+}
 
- out_reset:
-       /* reset write */
-       if (tail <= BUF_PAGE_SIZE)
-               local_set(&tail_page->write, tail);
+static inline int
+rb_try_to_discard(struct ring_buffer_per_cpu *cpu_buffer,
+                 struct ring_buffer_event *event)
+{
+       unsigned long new_index, old_index;
+       struct buffer_page *bpage;
+       unsigned long index;
+       unsigned long addr;
 
-       if (likely(lock_taken))
-               __raw_spin_unlock(&cpu_buffer->lock);
-       local_irq_restore(flags);
-       return NULL;
+       new_index = rb_event_index(event);
+       old_index = new_index + rb_event_length(event);
+       addr = (unsigned long)event;
+       addr &= PAGE_MASK;
+
+       bpage = cpu_buffer->tail_page;
+
+       if (bpage->page == (void *)addr && rb_page_write(bpage) == old_index) {
+               /*
+                * This is on the tail page. It is possible that
+                * a write could come in and move the tail page
+                * and write to the next page. That is fine
+                * because we just shorten what is on this page.
+                */
+               index = local_cmpxchg(&bpage->write, old_index, new_index);
+               if (index == old_index)
+                       return 1;
+       }
+
+       /* could not discard */
+       return 0;
 }
 
 static int
@@ -1351,16 +1417,23 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
                        event->array[0] = *delta >> TS_SHIFT;
                } else {
                        cpu_buffer->commit_page->page->time_stamp = *ts;
-                       event->time_delta = 0;
-                       event->array[0] = 0;
+                       /* try to discard, since we do not need this */
+                       if (!rb_try_to_discard(cpu_buffer, event)) {
+                               /* nope, just zero it */
+                               event->time_delta = 0;
+                               event->array[0] = 0;
+                       }
                }
                cpu_buffer->write_stamp = *ts;
                /* let the caller know this was the commit */
                ret = 1;
        } else {
-               /* Darn, this is just wasted space */
-               event->time_delta = 0;
-               event->array[0] = 0;
+               /* Try to discard the event */
+               if (!rb_try_to_discard(cpu_buffer, event)) {
+                       /* Darn, this is just wasted space */
+                       event->time_delta = 0;
+                       event->array[0] = 0;
+               }
                ret = 0;
        }
 
@@ -1371,13 +1444,14 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 
 static struct ring_buffer_event *
 rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
-                     unsigned type, unsigned long length)
+                     unsigned long length)
 {
        struct ring_buffer_event *event;
-       u64 ts, delta;
+       u64 ts, delta = 0;
        int commit = 0;
        int nr_loops = 0;
 
+       length = rb_calculate_event_length(length);
  again:
        /*
         * We allow for interrupts to reenter here and do a trace.
@@ -1391,7 +1465,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
        if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
                return NULL;
 
-       ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+       ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
        /*
         * Only the first commit can update the timestamp.
@@ -1401,23 +1475,24 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         * also be made. But only the entry that did the actual
         * commit will be something other than zero.
         */
-       if (cpu_buffer->tail_page == cpu_buffer->commit_page &&
-           rb_page_write(cpu_buffer->tail_page) ==
-           rb_commit_index(cpu_buffer)) {
+       if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page &&
+                  rb_page_write(cpu_buffer->tail_page) ==
+                  rb_commit_index(cpu_buffer))) {
+               u64 diff;
 
-               delta = ts - cpu_buffer->write_stamp;
+               diff = ts - cpu_buffer->write_stamp;
 
-               /* make sure this delta is calculated here */
+               /* make sure this diff is calculated here */
                barrier();
 
                /* Did the write stamp get updated already? */
                if (unlikely(ts < cpu_buffer->write_stamp))
-                       delta = 0;
+                       goto get_event;
 
-               if (test_time_stamp(delta)) {
+               delta = diff;
+               if (unlikely(test_time_stamp(delta))) {
 
                        commit = rb_add_time_stamp(cpu_buffer, &ts, &delta);
-
                        if (commit == -EBUSY)
                                return NULL;
 
@@ -1426,12 +1501,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 
                        RB_WARN_ON(cpu_buffer, commit < 0);
                }
-       } else
-               /* Non commits have zero deltas */
-               delta = 0;
+       }
 
-       event = __rb_reserve_next(cpu_buffer, type, length, &ts);
-       if (PTR_ERR(event) == -EAGAIN)
+ get_event:
+       event = __rb_reserve_next(cpu_buffer, 0, length, &ts);
+       if (unlikely(PTR_ERR(event) == -EAGAIN))
                goto again;
 
        if (!event) {
@@ -1448,7 +1522,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
         * If the timestamp was commited, make the commit our entry
         * now so that we will update it when needed.
         */
-       if (commit)
+       if (unlikely(commit))
                rb_set_commit_event(cpu_buffer, event);
        else if (!rb_is_commit(cpu_buffer, event))
                delta = 0;
@@ -1458,6 +1532,36 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
        return event;
 }
 
+#define TRACE_RECURSIVE_DEPTH 16
+
+static int trace_recursive_lock(void)
+{
+       current->trace_recursion++;
+
+       if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH))
+               return 0;
+
+       /* Disable all tracing before we do anything else */
+       tracing_off_permanent();
+
+       printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:"
+                   "HC[%lu]:SC[%lu]:NMI[%lu]\n",
+                   current->trace_recursion,
+                   hardirq_count() >> HARDIRQ_SHIFT,
+                   softirq_count() >> SOFTIRQ_SHIFT,
+                   in_nmi());
+
+       WARN_ON_ONCE(1);
+       return -1;
+}
+
+static void trace_recursive_unlock(void)
+{
+       WARN_ON_ONCE(!current->trace_recursion);
+
+       current->trace_recursion--;
+}
+
 static DEFINE_PER_CPU(int, rb_need_resched);
 
 /**
@@ -1491,6 +1595,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
        /* If we are tracing schedule, we don't want to recurse */
        resched = ftrace_preempt_disable();
 
+       if (trace_recursive_lock())
+               goto out_nocheck;
+
        cpu = raw_smp_processor_id();
 
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
@@ -1501,11 +1608,10 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
        if (atomic_read(&cpu_buffer->record_disabled))
                goto out;
 
-       length = rb_calculate_event_length(length);
-       if (length > BUF_PAGE_SIZE)
+       if (length > BUF_MAX_DATA_SIZE)
                goto out;
 
-       event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length);
+       event = rb_reserve_next_event(cpu_buffer, length);
        if (!event)
                goto out;
 
@@ -1520,6 +1626,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
        return event;
 
  out:
+       trace_recursive_unlock();
+
+ out_nocheck:
        ftrace_preempt_enable(resched);
        return NULL;
 }
@@ -1528,7 +1637,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve);
 static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
                      struct ring_buffer_event *event)
 {
-       cpu_buffer->entries++;
+       local_inc(&cpu_buffer->entries);
 
        /* Only process further if we own the commit */
        if (!rb_is_commit(cpu_buffer, event))
@@ -1558,6 +1667,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 
        rb_commit(cpu_buffer, event);
 
+       trace_recursive_unlock();
+
        /*
         * Only the last preempt count needs to restore preemption.
         */
@@ -1570,6 +1681,99 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit);
 
+static inline void rb_event_discard(struct ring_buffer_event *event)
+{
+       /* array[0] holds the actual length for the discarded event */
+       event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE;
+       event->type_len = RINGBUF_TYPE_PADDING;
+       /* time delta must be non zero */
+       if (!event->time_delta)
+               event->time_delta = 1;
+}
+
+/**
+ * ring_buffer_event_discard - discard any event in the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes an event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+       rb_event_discard(event);
+}
+EXPORT_SYMBOL_GPL(ring_buffer_event_discard);
+
+/**
+ * ring_buffer_discard_commit - discard an event that has not been committed
+ * @buffer: the ring buffer
+ * @event: non committed event to discard
+ *
+ * This is similar to ring_buffer_event_discard but must only be
+ * performed on an event that has not been committed yet. The difference
+ * is that this will also try to free the event from the ring buffer
+ * if another event has not been added behind it.
+ *
+ * If another event has been added behind it, it will set the event
+ * up as discarded, and perform the commit.
+ *
+ * If this function is called, do not call ring_buffer_unlock_commit on
+ * the event.
+ */
+void ring_buffer_discard_commit(struct ring_buffer *buffer,
+                               struct ring_buffer_event *event)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       int cpu;
+
+       /* The event is discarded regardless */
+       rb_event_discard(event);
+
+       /*
+        * This must only be called if the event has not been
+        * committed yet. Thus we can assume that preemption
+        * is still disabled.
+        */
+       RB_WARN_ON(buffer, preemptible());
+
+       cpu = smp_processor_id();
+       cpu_buffer = buffer->buffers[cpu];
+
+       if (rb_try_to_discard(cpu_buffer, event))
+               goto out;
+
+       /*
+        * Could not remove the event: it stays in the buffer as padding
+        * and is still visible by the reader, so it must be counted.
+        */
+       local_inc(&cpu_buffer->entries);
+ out:
+       /*
+        * If a write came in and pushed the tail page
+        * we still need to update the commit pointer
+        * if we were the commit.
+        */
+       if (rb_is_commit(cpu_buffer, event))
+               rb_set_commit_to_write(cpu_buffer);
+
+       trace_recursive_unlock();
+
+       /*
+        * Only the last preempt count needs to restore preemption.
+        */
+       if (preempt_count() == 1)
+               ftrace_preempt_enable(per_cpu(rb_need_resched, cpu));
+       else
+               preempt_enable_no_resched_notrace();
+
+}
+EXPORT_SYMBOL_GPL(ring_buffer_discard_commit);
+
 /**
  * ring_buffer_write - write data to the buffer without reserving
  * @buffer: The ring buffer to write to.
@@ -1589,7 +1793,6 @@ int ring_buffer_write(struct ring_buffer *buffer,
 {
        struct ring_buffer_per_cpu *cpu_buffer;
        struct ring_buffer_event *event;
-       unsigned long event_length;
        void *body;
        int ret = -EBUSY;
        int cpu, resched;
@@ -1612,9 +1815,10 @@ int ring_buffer_write(struct ring_buffer *buffer,
        if (atomic_read(&cpu_buffer->record_disabled))
                goto out;
 
-       event_length = rb_calculate_event_length(length);
-       event = rb_reserve_next_event(cpu_buffer,
-                                     RINGBUF_TYPE_DATA, event_length);
+       if (length > BUF_MAX_DATA_SIZE)
+               goto out;
+
+       event = rb_reserve_next_event(cpu_buffer, length);
        if (!event)
                goto out;
 
@@ -1728,7 +1932,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
                return 0;
 
        cpu_buffer = buffer->buffers[cpu];
-       ret = cpu_buffer->entries;
+       ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun)
+               - cpu_buffer->read;
 
        return ret;
 }
@@ -1754,6 +1959,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
+/**
+ * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of NMI drops from
+ */
+unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       unsigned long ret;
+
+       if (!cpumask_test_cpu(cpu, buffer->cpumask))
+               return 0;
+
+       cpu_buffer = buffer->buffers[cpu];
+       ret = cpu_buffer->nmi_dropped;
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu);
+
+/**
+ * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits
+ * @buffer: The ring buffer
+ * @cpu: The per CPU buffer to get the number of overruns from
+ */
+unsigned long
+ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu)
+{
+       struct ring_buffer_per_cpu *cpu_buffer;
+       unsigned long ret;
+
+       if (!cpumask_test_cpu(cpu, buffer->cpumask))
+               return 0;
+
+       cpu_buffer = buffer->buffers[cpu];
+       ret = cpu_buffer->commit_overrun;
+
+       return ret;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu);
+
 /**
  * ring_buffer_entries - get the number of entries in a buffer
  * @buffer: The ring buffer
@@ -1770,7 +2016,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
        /* if you care about this being correct, lock the buffer */
        for_each_buffer_cpu(buffer, cpu) {
                cpu_buffer = buffer->buffers[cpu];
-               entries += cpu_buffer->entries;
+               entries += (local_read(&cpu_buffer->entries) -
+                           cpu_buffer->overrun) - cpu_buffer->read;
        }
 
        return entries;
@@ -1862,7 +2109,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer,
 {
        u64 delta;
 
-       switch (event->type) {
+       switch (event->type_len) {
        case RINGBUF_TYPE_PADDING:
                return;
 
@@ -1893,7 +2140,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter,
 {
        u64 delta;
 
-       switch (event->type) {
+       switch (event->type_len) {
        case RINGBUF_TYPE_PADDING:
                return;
 
@@ -1966,6 +2213,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->reader_page->list.prev = reader->list.prev;
 
        local_set(&cpu_buffer->reader_page->write, 0);
+       local_set(&cpu_buffer->reader_page->entries, 0);
        local_set(&cpu_buffer->reader_page->page->commit, 0);
 
        /* Make the reader page now replace the head */
@@ -2008,8 +2256,9 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
        event = rb_reader_event(cpu_buffer);
 
-       if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
-               cpu_buffer->entries--;
+       if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX
+                       || rb_discarded_event(event))
+               cpu_buffer->read++;
 
        rb_update_read_stamp(cpu_buffer, event);
 
@@ -2031,8 +2280,8 @@ static void rb_advance_iter(struct ring_buffer_iter *iter)
         * Check if we are at the end of the buffer.
         */
        if (iter->head >= rb_page_size(iter->head_page)) {
-               if (RB_WARN_ON(buffer,
-                              iter->head_page == cpu_buffer->commit_page))
+               /* discarded commits can make the page empty */
+               if (iter->head_page == cpu_buffer->commit_page)
                        return;
                rb_inc_iter(iter);
                return;
@@ -2075,12 +2324,10 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        /*
         * We repeat when a timestamp is encountered. It is possible
         * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written. The max times
-        * that this can happen is the number of nested interrupts we
-        * can have.  Nesting 10 deep of interrupts is clearly
-        * an anomaly.
+        * as one timestamp is about to be written, or from discarded
+        * commits. The most that we can have is the number on a single page.
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                return NULL;
 
        reader = rb_get_reader_page(cpu_buffer);
@@ -2089,7 +2336,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
        event = rb_reader_event(cpu_buffer);
 
-       switch (event->type) {
+       switch (event->type_len) {
        case RINGBUF_TYPE_PADDING:
                if (rb_null_event(event))
                        RB_WARN_ON(cpu_buffer, 1);
@@ -2146,14 +2393,14 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
  again:
        /*
-        * We repeat when a timestamp is encountered. It is possible
-        * to get multiple timestamps from an interrupt entering just
-        * as one timestamp is about to be written. The max times
-        * that this can happen is the number of nested interrupts we
-        * can have. Nesting 10 deep of interrupts is clearly
-        * an anomaly.
+        * We repeat when a timestamp is encountered.
+        * We can get multiple timestamps by nested interrupts or also
+        * if filtering is on (discarding commits). Since discarding
+        * commits can be frequent we can get a lot of timestamps.
+        * But we limit them by not adding timestamps if they begin
+        * at the start of a page.
         */
-       if (RB_WARN_ON(cpu_buffer, ++nr_loops > 10))
+       if (RB_WARN_ON(cpu_buffer, ++nr_loops > RB_TIMESTAMPS_PER_PAGE))
                return NULL;
 
        if (rb_per_cpu_empty(cpu_buffer))
@@ -2161,7 +2408,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
        event = rb_iter_head_event(iter);
 
-       switch (event->type) {
+       switch (event->type_len) {
        case RINGBUF_TYPE_PADDING:
                if (rb_null_event(event)) {
                        rb_inc_iter(iter);
@@ -2220,7 +2467,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
        event = rb_buffer_peek(buffer, cpu, ts);
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                cpu_relax();
                goto again;
        }
@@ -2248,7 +2495,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
        event = rb_iter_peek(iter, ts);
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                cpu_relax();
                goto again;
        }
@@ -2293,7 +2540,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
        preempt_enable();
 
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                cpu_relax();
                goto again;
        }
@@ -2386,7 +2633,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-       if (event && event->type == RINGBUF_TYPE_PADDING) {
+       if (event && event->type_len == RINGBUF_TYPE_PADDING) {
                cpu_relax();
                goto again;
        }
@@ -2411,6 +2658,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
        cpu_buffer->head_page
                = list_entry(cpu_buffer->pages.next, struct buffer_page, list);
        local_set(&cpu_buffer->head_page->write, 0);
+       local_set(&cpu_buffer->head_page->entries, 0);
        local_set(&cpu_buffer->head_page->page->commit, 0);
 
        cpu_buffer->head_page->read = 0;
@@ -2420,11 +2668,15 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
        INIT_LIST_HEAD(&cpu_buffer->reader_page->list);
        local_set(&cpu_buffer->reader_page->write, 0);
+       local_set(&cpu_buffer->reader_page->entries, 0);
        local_set(&cpu_buffer->reader_page->page->commit, 0);
        cpu_buffer->reader_page->read = 0;
 
+       cpu_buffer->nmi_dropped = 0;
+       cpu_buffer->commit_overrun = 0;
        cpu_buffer->overrun = 0;
-       cpu_buffer->entries = 0;
+       cpu_buffer->read = 0;
+       local_set(&cpu_buffer->entries, 0);
 
        cpu_buffer->write_stamp = 0;
        cpu_buffer->read_stamp = 0;
@@ -2443,6 +2695,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
        if (!cpumask_test_cpu(cpu, buffer->cpumask))
                return;
 
+       atomic_inc(&cpu_buffer->record_disabled);
+
        spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
        __raw_spin_lock(&cpu_buffer->lock);
@@ -2452,6 +2706,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
        __raw_spin_unlock(&cpu_buffer->lock);
 
        spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+
+       atomic_dec(&cpu_buffer->record_disabled);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2578,28 +2834,6 @@ out:
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
-static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-                             struct buffer_data_page *bpage,
-                             unsigned int offset)
-{
-       struct ring_buffer_event *event;
-       unsigned long head;
-
-       __raw_spin_lock(&cpu_buffer->lock);
-       for (head = offset; head < local_read(&bpage->commit);
-            head += rb_event_length(event)) {
-
-               event = __rb_data_page_index(bpage, head);
-               if (RB_WARN_ON(cpu_buffer, rb_null_event(event)))
-                       return;
-               /* Only count data entries */
-               if (event->type != RINGBUF_TYPE_DATA)
-                       continue;
-               cpu_buffer->entries--;
-       }
-       __raw_spin_unlock(&cpu_buffer->lock);
-}
-
 /**
  * ring_buffer_alloc_read_page - allocate a page to read from buffer
  * @buffer: the buffer to allocate for.
@@ -2630,6 +2864,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
        return bpage;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page);
 
 /**
  * ring_buffer_free_read_page - free an allocated read page
@@ -2642,6 +2877,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
 {
        free_page((unsigned long)data);
 }
+EXPORT_SYMBOL_GPL(ring_buffer_free_read_page);
 
 /**
  * ring_buffer_read_page - extract a page from the ring buffer
@@ -2768,16 +3004,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
                /* we copied everything to the beginning */
                read = 0;
        } else {
+               /* update the entry counter */
+               cpu_buffer->read += local_read(&reader->entries);
+
                /* swap the pages */
                rb_init_page(bpage);
                bpage = reader->page;
                reader->page = *data_page;
                local_set(&reader->write, 0);
+               local_set(&reader->entries, 0);
                reader->read = 0;
                *data_page = bpage;
-
-               /* update the entry counter */
-               rb_remove_entries(cpu_buffer, bpage, read);
        }
        ret = read;
 
@@ -2787,6 +3024,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
  out:
        return ret;
 }
+EXPORT_SYMBOL_GPL(ring_buffer_read_page);
 
 static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
@@ -2845,14 +3083,11 @@ static const struct file_operations rb_simple_fops = {
 static __init int rb_init_debugfs(void)
 {
        struct dentry *d_tracer;
-       struct dentry *entry;
 
        d_tracer = tracing_init_dentry();
 
-       entry = debugfs_create_file("tracing_on", 0644, d_tracer,
-                                   &ring_buffer_flags, &rb_simple_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_on' entry\n");
+       trace_create_file("tracing_on", 0644, d_tracer,
+                           &ring_buffer_flags, &rb_simple_fops);
 
        return 0;
 }
diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c
new file mode 100644 (file)
index 0000000..8d68e14
--- /dev/null
@@ -0,0 +1,416 @@
+/*
+ * ring buffer tester and benchmark
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/ring_buffer.h>
+#include <linux/completion.h>
+#include <linux/kthread.h>
+#include <linux/module.h>
+#include <linux/time.h>
+
+/*
+ * Layout that read_page() overlays on the page it gets back from
+ * ring_buffer_read_page(): @commit is read as the number of valid bytes
+ * in @data, and the events are walked starting at @data[0].
+ * @ts is not referenced in this file (presumably the page time stamp -
+ * TODO confirm against the ring buffer's own page layout).
+ */
+struct rb_page {
+       u64             ts;
+       local_t         commit;
+       char            data[4080];
+};
+
+/* run time and sleep time in seconds */
+#define RUN_TIME       10
+#define SLEEP_TIME     10
+
+/* number of events for writer to wake up the reader */
+static int wakeup_interval = 100;
+
+static int reader_finish;
+static struct completion read_start;
+static struct completion read_done;
+
+static struct ring_buffer *buffer;
+static struct task_struct *producer;
+static struct task_struct *consumer;
+static unsigned long read;
+
+/*
+ * When non-zero, ring_buffer_benchmark_init() does not create the consumer
+ * thread and only the producer runs.  Declared unsigned so that the
+ * variable's type matches the "uint" type given to module_param().
+ */
+static unsigned int disable_reader;
+module_param(disable_reader, uint, 0644);
+MODULE_PARM_DESC(disable_reader, "only run producer");
+
+static int read_events;
+
+static int kill_test;
+
+/*
+ * Flag the benchmark as failed.  Only the first caller sets kill_test and
+ * triggers the WARN_ON(); later calls while kill_test is already set do
+ * nothing.  The loops in this file test kill_test to bail out.
+ */
+#define KILL_TEST()                            \
+       do {                                    \
+               if (!kill_test) {               \
+                       kill_test = 1;          \
+                       WARN_ON(1);             \
+               }                               \
+       } while (0)
+
+/*
+ * Result of one read attempt by read_event() or read_page():
+ * EVENT_FOUND when data was read and validated, EVENT_DROPPED when nothing
+ * could be read (or, in read_event(), when validation failed).
+ */
+enum event_status {
+       EVENT_FOUND,
+       EVENT_DROPPED,
+};
+
+/*
+ * Consume a single event from @cpu's buffer and check that its payload
+ * holds @cpu.  Returns EVENT_FOUND (and bumps the global read counter) on
+ * a valid event; returns EVENT_DROPPED when no event is available or when
+ * the payload does not match, in which case the test is also killed.
+ */
+static enum event_status read_event(int cpu)
+{
+       struct ring_buffer_event *ev;
+       int *payload;
+       u64 stamp;
+
+       ev = ring_buffer_consume(buffer, cpu, &stamp);
+       if (ev) {
+               payload = ring_buffer_event_data(ev);
+               if (*payload == cpu) {
+                       read++;
+                       return EVENT_FOUND;
+               }
+               /* the writer stores its own cpu id; anything else is corruption */
+               KILL_TEST();
+       }
+
+       return EVENT_DROPPED;
+}
+
+/*
+ * Read a whole page of events for @cpu with ring_buffer_read_page() and
+ * walk every event in it, checking that each data event's payload holds
+ * @cpu and counting it in the global read counter.
+ *
+ * Any inconsistency (padding event, payload mismatch, zero length, walking
+ * past the end of the data area) kills the test via KILL_TEST().
+ *
+ * Returns EVENT_DROPPED when no read page could be allocated or when
+ * ring_buffer_read_page() returned a negative value, EVENT_FOUND otherwise.
+ */
+static enum event_status read_page(int cpu)
+{
+       struct ring_buffer_event *event;
+       struct rb_page *rpage;
+       unsigned long commit;
+       void *bpage;
+       int *entry;
+       int ret;
+       int inc;
+       int i;
+
+       bpage = ring_buffer_alloc_read_page(buffer);
+       if (!bpage)
+               return EVENT_DROPPED;
+
+       ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1);
+       if (ret >= 0) {
+               rpage = bpage;
+               commit = local_read(&rpage->commit);
+               for (i = 0; i < commit && !kill_test; i += inc) {
+
+                       /* never index beyond the data area of the page */
+                       if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) {
+                               KILL_TEST();
+                               break;
+                       }
+
+                       /* stays negative unless a case below sets a real size */
+                       inc = -1;
+                       event = (void *)&rpage->data[i];
+                       switch (event->type_len) {
+                       case RINGBUF_TYPE_PADDING:
+                               /* We don't expect any padding */
+                               KILL_TEST();
+                               break;
+                       case RINGBUF_TYPE_TIME_EXTEND:
+                               inc = 8;
+                               break;
+                       case 0:
+                               /* length of the event is stored in array[0] */
+                               entry = ring_buffer_event_data(event);
+                               if (*entry != cpu) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               read++;
+                               if (!event->array[0]) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               /*
+                                * Step over the 4 byte event header too, the
+                                * same way the default case accounts for it
+                                * with its "+ 1"; without it the next
+                                * iteration would start inside this event.
+                                */
+                               inc = event->array[0] + 4;
+                               break;
+                       default:
+                               /* length is encoded in type_len, in 4 byte units */
+                               entry = ring_buffer_event_data(event);
+                               if (*entry != cpu) {
+                                       KILL_TEST();
+                                       break;
+                               }
+                               read++;
+                               inc = ((event->type_len + 1) * 4);
+                       }
+                       if (kill_test)
+                               break;
+
+                       /* a non-positive step would loop forever or go backwards */
+                       if (inc <= 0) {
+                               KILL_TEST();
+                               break;
+                       }
+               }
+       }
+       ring_buffer_free_read_page(buffer, bpage);
+
+       if (ret < 0)
+               return EVENT_DROPPED;
+       return EVENT_FOUND;
+}
+
+/*
+ * One consumer pass.  Flips between the per-event reader and the per-page
+ * reader on every call, zeroes the global read counter and then keeps
+ * draining all online cpus until reader_finish or kill_test is set.
+ * Whenever a full sweep over the cpus finds nothing, the task goes to
+ * sleep until somebody wakes it up.  On exit reader_finish is cleared and
+ * read_done is completed, which is what ring_buffer_producer() waits for.
+ */
+static void ring_buffer_consumer(void)
+{
+       /* toggle between reading pages and events */
+       read_events ^= 1;
+
+       read = 0;
+       while (!reader_finish && !kill_test) {
+               int found;
+
+               /* sweep all cpus again as long as the last sweep found data */
+               do {
+                       int cpu;
+
+                       found = 0;
+                       for_each_online_cpu(cpu) {
+                               enum event_status stat;
+
+                               if (read_events)
+                                       stat = read_event(cpu);
+                               else
+                                       stat = read_page(cpu);
+
+                               if (kill_test)
+                                       break;
+                               if (stat == EVENT_FOUND)
+                                       found = 1;
+                       }
+               } while (found && !kill_test);
+
+               /*
+                * The task state is changed before reader_finish is
+                * re-checked, so a wake up issued after the flag was set
+                * is not lost between the check and schedule().
+                */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (reader_finish)
+                       break;
+
+               schedule();
+               __set_current_state(TASK_RUNNING);
+       }
+       reader_finish = 0;
+       complete(&read_done);
+}
+
+/*
+ * One producer pass.  For RUN_TIME seconds (or until kill_test is set)
+ * reserve a 10 byte event, store the current cpu id in it and commit it,
+ * counting successful reservations in hit and failed ones in missed.
+ * Every wakeup_interval iterations the consumer, if there is one, is
+ * woken up.  Afterwards the consumer is told to finish and waited for,
+ * and the statistics of the run are printed.
+ */
+static void ring_buffer_producer(void)
+{
+       struct timeval start_tv;
+       struct timeval end_tv;
+       unsigned long long time;
+       unsigned long long entries;
+       unsigned long long overruns;
+       unsigned long missed = 0;
+       unsigned long hit = 0;
+       unsigned long avg;
+       int cnt = 0;
+
+       /*
+        * Hammer the buffer for 10 secs (this may
+        * make the system stall)
+        */
+       pr_info("Starting ring buffer hammer\n");
+       do_gettimeofday(&start_tv);
+       do {
+               struct ring_buffer_event *event;
+               int *entry;
+
+               event = ring_buffer_lock_reserve(buffer, 10);
+               if (!event) {
+                       missed++;
+               } else {
+                       hit++;
+                       entry = ring_buffer_event_data(event);
+                       /* the readers verify this value against the cpu they read */
+                       *entry = smp_processor_id();
+                       ring_buffer_unlock_commit(buffer, event);
+               }
+               do_gettimeofday(&end_tv);
+
+               cnt++;
+               if (consumer && !(cnt % wakeup_interval))
+                       wake_up_process(consumer);
+
+#ifndef CONFIG_PREEMPT
+               /*
+                * If we are a non preempt kernel, the 10 second run will
+                * stop everything while it runs. Instead, we will call
+                * cond_resched and also add any time that was lost by a
+                * reschedule.
+                *
+                * Do a cond resched at the same frequency we would wake up
+                * the reader.
+                *
+                * NOTE(review): the test below is true on every iteration
+                * that is NOT a multiple of wakeup_interval, which is the
+                * opposite of the wake up test above - confirm whether
+                * !(cnt % wakeup_interval) was intended.
+                */
+               if (cnt % wakeup_interval)
+                       cond_resched();
+#endif
+
+       } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test);
+       pr_info("End ring buffer hammer\n");
+
+       if (consumer) {
+               /* Init both completions here to avoid races */
+               init_completion(&read_start);
+               init_completion(&read_done);
+               /* the completions must be visible before the finish var */
+               smp_wmb();
+               reader_finish = 1;
+               /* finish var visible before waking up the consumer */
+               smp_wmb();
+               wake_up_process(consumer);
+               wait_for_completion(&read_done);
+       }
+
+       /* elapsed time in usecs; the usec part may be negative */
+       time = end_tv.tv_sec - start_tv.tv_sec;
+       time *= USEC_PER_SEC;
+       time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec);
+
+       entries = ring_buffer_entries(buffer);
+       overruns = ring_buffer_overruns(buffer);
+
+       if (kill_test)
+               pr_info("ERROR!\n");
+       pr_info("Time:     %lld (usecs)\n", time);
+       pr_info("Overruns: %lld\n", overruns);
+       if (disable_reader)
+               pr_info("Read:     (reader disabled)\n");
+       else
+               pr_info("Read:     %ld  (by %s)\n", read,
+                       read_events ? "events" : "pages");
+       pr_info("Entries:  %lld\n", entries);
+       pr_info("Total:    %lld\n", entries + overruns + read);
+       pr_info("Missed:   %ld\n", missed);
+       pr_info("Hit:      %ld\n", hit);
+
+       /* Convert time from usecs to millisecs */
+       do_div(time, USEC_PER_MSEC);
+       /* from here on hit is a rate (per millisec), not a count */
+       if (time)
+               hit /= (long)time;
+       else
+               pr_info("TIME IS ZERO??\n");
+
+       pr_info("Entries per millisec: %ld\n", hit);
+
+       if (hit) {
+               /* Calculate the average time in nanosecs */
+               avg = NSEC_PER_MSEC / hit;
+               pr_info("%ld ns per entry\n", avg);
+       }
+
+       if (missed) {
+               if (time)
+                       missed /= (long)time;
+
+               pr_info("Total iterations per millisec: %ld\n", hit + missed);
+
+               /* it is possible that hit + missed will overflow and be zero */
+               if (!(hit + missed)) {
+                       pr_info("hit + missed overflowed and totalled zero!\n");
+                       hit--; /* make it non zero */
+               }
+
+               /* Calculate the average time in nanosecs */
+               avg = NSEC_PER_MSEC / (hit + missed);
+               pr_info("%ld ns per entry\n", avg);
+       }
+}
+
+/*
+ * Park the calling kthread: sleep interruptibly until
+ * kthread_should_stop() reports true.  The task state is set before every
+ * check of the stop condition and is put back to TASK_RUNNING on the way
+ * out.
+ */
+static void wait_to_die(void)
+{
+       for (;;) {
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop())
+                       break;
+               schedule();
+       }
+       __set_current_state(TASK_RUNNING);
+}
+
+/*
+ * Thread function of the consumer kthread (@arg is unused).  Each round
+ * completes read_start, which ring_buffer_producer_thread() waits on, runs
+ * one ring_buffer_consumer() pass and then sleeps until it is woken up
+ * again.  The loop ends when the thread is asked to stop or the test was
+ * killed; in the latter case the thread parks in wait_to_die() until it is
+ * stopped.  Always returns 0.
+ */
+static int ring_buffer_consumer_thread(void *arg)
+{
+       while (!kthread_should_stop() && !kill_test) {
+               complete(&read_start);
+
+               ring_buffer_consumer();
+
+               /* set the state before re-checking so a wake up is not lost */
+               set_current_state(TASK_INTERRUPTIBLE);
+               if (kthread_should_stop() || kill_test)
+                       break;
+
+               schedule();
+               __set_current_state(TASK_RUNNING);
+       }
+       /* the break above leaves the task in TASK_INTERRUPTIBLE */
+       __set_current_state(TASK_RUNNING);
+
+       if (kill_test)
+               wait_to_die();
+
+       return 0;
+}
+
+/*
+ * Thread function of the producer kthread (@arg is unused).  Each round
+ * resets the ring buffer, wakes the consumer (if any) and waits for it to
+ * complete read_start, runs one ring_buffer_producer() pass and then
+ * sleeps for SLEEP_TIME seconds.  The loop ends when the thread is asked
+ * to stop or the test was killed; in the latter case the thread parks in
+ * wait_to_die() until it is stopped.  Always returns 0.
+ */
+static int ring_buffer_producer_thread(void *arg)
+{
+       init_completion(&read_start);
+
+       while (!kthread_should_stop() && !kill_test) {
+               ring_buffer_reset(buffer);
+
+               if (consumer) {
+                       /* presumably orders the reset before the wake up - TODO confirm */
+                       smp_wmb();
+                       wake_up_process(consumer);
+                       wait_for_completion(&read_start);
+               }
+
+               ring_buffer_producer();
+
+               pr_info("Sleeping for 10 secs\n");
+               set_current_state(TASK_INTERRUPTIBLE);
+               schedule_timeout(HZ * SLEEP_TIME);
+               __set_current_state(TASK_RUNNING);
+       }
+
+       if (kill_test)
+               wait_to_die();
+
+       return 0;
+}
+
+/*
+ * Module init: allocate the ring buffer, create the consumer thread
+ * (unless disable_reader is set) and start the producer thread.
+ *
+ * Returns 0 on success, -ENOMEM when the buffer cannot be allocated, or
+ * the error encoded in the pointer returned by the failing kthread call.
+ * On failure everything set up so far is torn down again.
+ */
+static int __init ring_buffer_benchmark_init(void)
+{
+       int ret;
+
+       /* make a one meg buffer in overwrite mode */
+       buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE);
+       if (!buffer)
+               return -ENOMEM;
+
+       /* the consumer is only created here; the producer wakes it up */
+       if (!disable_reader) {
+               consumer = kthread_create(ring_buffer_consumer_thread,
+                                         NULL, "rb_consumer");
+               if (IS_ERR(consumer)) {
+                       ret = PTR_ERR(consumer);
+                       goto out_fail;
+               }
+       }
+
+       producer = kthread_run(ring_buffer_producer_thread,
+                              NULL, "rb_producer");
+       if (!IS_ERR(producer))
+               return 0;
+
+       /* producer failed to start: undo the consumer, then the buffer */
+       ret = PTR_ERR(producer);
+       if (consumer)
+               kthread_stop(consumer);
+
+ out_fail:
+       ring_buffer_free(buffer);
+       return ret;
+}
+
+/*
+ * Module exit: stop the producer thread first, then the consumer thread
+ * if one was created, and finally free the ring buffer.
+ */
+static void __exit ring_buffer_benchmark_exit(void)
+{
+       kthread_stop(producer);
+       if (consumer)
+               kthread_stop(consumer);
+       ring_buffer_free(buffer);
+}
+
+module_init(ring_buffer_benchmark_init);
+module_exit(ring_buffer_benchmark_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("ring_buffer_benchmark");
+MODULE_LICENSE("GPL");
index cda81ec58d9fe1e723388826de8966b274b71d0a..8acd9b81a5d76046ee52c9d1dc9224cc43edb1c2 100644 (file)
@@ -171,6 +171,13 @@ static struct trace_array  global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+/*
+ * Wrapper around filter_check_discard() that always operates on the
+ * buffer of global_trace; @call, @rec and @event are passed through
+ * unchanged and the callee's result is returned.
+ */
+int filter_current_check_discard(struct ftrace_event_call *call, void *rec,
+                                struct ring_buffer_event *event)
+{
+       return filter_check_discard(call, rec, global_trace.buffer, event);
+}
+EXPORT_SYMBOL_GPL(filter_current_check_discard);
+
 cycle_t ftrace_now(int cpu)
 {
        u64 ts;
@@ -255,7 +262,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-       TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
+       TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME |
+       TRACE_ITER_GRAPH_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -317,6 +325,7 @@ static const char *trace_options[] = {
        "latency-format",
        "global-clock",
        "sleep-time",
+       "graph-time",
        NULL
 };
 
@@ -402,17 +411,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
        return cnt;
 }
 
-static void
-trace_print_seq(struct seq_file *m, struct trace_seq *s)
-{
-       int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
-
-       s->buffer[len] = 0;
-       seq_puts(m, s->buffer);
-
-       trace_seq_init(s);
-}
-
 /**
  * update_max_tr - snapshot all trace buffers from global_trace to max_tr
  * @tr: tracer
@@ -641,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr)
                tracing_reset(tr, cpu);
 }
 
+/* Call tracing_reset() for @cpu on global_trace. */
+void tracing_reset_current(int cpu)
+{
+       tracing_reset(&global_trace, cpu);
+}
+
+/* Call tracing_reset_online_cpus() on global_trace. */
+void tracing_reset_current_online_cpus(void)
+{
+       tracing_reset_online_cpus(&global_trace);
+}
+
 #define SAVED_CMDLINES 128
 #define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
@@ -800,6 +808,7 @@ void trace_find_cmdline(int pid, char comm[])
                return;
        }
 
+       preempt_disable();
        __raw_spin_lock(&trace_cmdline_lock);
        map = map_pid_to_cmdline[pid];
        if (map != NO_CMDLINE_MAP)
@@ -808,6 +817,7 @@ void trace_find_cmdline(int pid, char comm[])
                strcpy(comm, "<...>");
 
        __raw_spin_unlock(&trace_cmdline_lock);
+       preempt_enable();
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
@@ -840,7 +850,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-                                                   unsigned char type,
+                                                   int type,
                                                    unsigned long len,
                                                    unsigned long flags, int pc)
 {
@@ -883,30 +893,40 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 }
 
 struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+trace_current_buffer_lock_reserve(int type, unsigned long len,
                                  unsigned long flags, int pc)
 {
        return trace_buffer_lock_reserve(&global_trace,
                                         type, len, flags, pc);
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve);
 
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
                                        unsigned long flags, int pc)
 {
-       return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+       __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit);
 
 void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
                                        unsigned long flags, int pc)
 {
-       return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+       __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
+}
+EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit);
+
+void trace_current_buffer_discard_commit(struct ring_buffer_event *event)
+{
+       ring_buffer_discard_commit(global_trace.buffer, event);
 }
+EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit);
 
 void
 trace_function(struct trace_array *tr,
               unsigned long ip, unsigned long parent_ip, unsigned long flags,
               int pc)
 {
+       struct ftrace_event_call *call = &event_function;
        struct ring_buffer_event *event;
        struct ftrace_entry *entry;
 
@@ -921,7 +941,9 @@ trace_function(struct trace_array *tr,
        entry   = ring_buffer_event_data(event);
        entry->ip                       = ip;
        entry->parent_ip                = parent_ip;
-       ring_buffer_unlock_commit(tr->buffer, event);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -930,6 +952,7 @@ static int __trace_graph_entry(struct trace_array *tr,
                                unsigned long flags,
                                int pc)
 {
+       struct ftrace_event_call *call = &event_funcgraph_entry;
        struct ring_buffer_event *event;
        struct ftrace_graph_ent_entry *entry;
 
@@ -942,7 +965,8 @@ static int __trace_graph_entry(struct trace_array *tr,
                return 0;
        entry   = ring_buffer_event_data(event);
        entry->graph_ent                        = *trace;
-       ring_buffer_unlock_commit(global_trace.buffer, event);
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(global_trace.buffer, event);
 
        return 1;
 }
@@ -952,6 +976,7 @@ static void __trace_graph_return(struct trace_array *tr,
                                unsigned long flags,
                                int pc)
 {
+       struct ftrace_event_call *call = &event_funcgraph_exit;
        struct ring_buffer_event *event;
        struct ftrace_graph_ret_entry *entry;
 
@@ -964,7 +989,8 @@ static void __trace_graph_return(struct trace_array *tr,
                return;
        entry   = ring_buffer_event_data(event);
        entry->ret                              = *trace;
-       ring_buffer_unlock_commit(global_trace.buffer, event);
+       if (!filter_current_check_discard(call, entry, event))
+               ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -982,6 +1008,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
                                 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+       struct ftrace_event_call *call = &event_kernel_stack;
        struct ring_buffer_event *event;
        struct stack_entry *entry;
        struct stack_trace trace;
@@ -999,7 +1026,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
        trace.entries           = entry->caller;
 
        save_stack_trace(&trace);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1024,6 +1052,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
                                   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
+       struct ftrace_event_call *call = &event_user_stack;
        struct ring_buffer_event *event;
        struct userstack_entry *entry;
        struct stack_trace trace;
@@ -1045,7 +1074,8 @@ static void ftrace_trace_userstack(struct trace_array *tr,
        trace.entries           = entry->caller;
 
        save_stack_trace_user(&trace);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -1089,6 +1119,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
                           struct task_struct *next,
                           unsigned long flags, int pc)
 {
+       struct ftrace_event_call *call = &event_context_switch;
        struct ring_buffer_event *event;
        struct ctx_switch_entry *entry;
 
@@ -1104,7 +1135,9 @@ tracing_sched_switch_trace(struct trace_array *tr,
        entry->next_prio                = next->prio;
        entry->next_state               = next->state;
        entry->next_cpu = task_cpu(next);
-       trace_buffer_unlock_commit(tr, event, flags, pc);
+
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1113,6 +1146,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
                           struct task_struct *curr,
                           unsigned long flags, int pc)
 {
+       struct ftrace_event_call *call = &event_wakeup;
        struct ring_buffer_event *event;
        struct ctx_switch_entry *entry;
 
@@ -1129,7 +1163,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
        entry->next_state               = wakee->state;
        entry->next_cpu                 = task_cpu(wakee);
 
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
        ftrace_trace_stack(tr, flags, 6, pc);
        ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1230,11 +1265,13 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
                (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
        static u32 trace_buf[TRACE_BUF_SIZE];
 
+       struct ftrace_event_call *call = &event_bprint;
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
        struct bprint_entry *entry;
        unsigned long flags;
+       int disable;
        int resched;
        int cpu, len = 0, size, pc;
 
@@ -1249,7 +1286,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
 
-       if (unlikely(atomic_read(&data->disabled)))
+       disable = atomic_inc_return(&data->disabled);
+       if (unlikely(disable != 1))
                goto out;
 
        /* Lockdep uses trace_printk for lock tracing */
@@ -1269,13 +1307,15 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
        entry->fmt                      = fmt;
 
        memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
        __raw_spin_unlock(&trace_buf_lock);
        local_irq_restore(flags);
 
 out:
+       atomic_dec_return(&data->disabled);
        ftrace_preempt_enable(resched);
        unpause_graph_tracing();
 
@@ -1288,12 +1328,14 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
        static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
        static char trace_buf[TRACE_BUF_SIZE];
 
+       struct ftrace_event_call *call = &event_print;
        struct ring_buffer_event *event;
        struct trace_array *tr = &global_trace;
        struct trace_array_cpu *data;
        int cpu, len = 0, size, pc;
        struct print_entry *entry;
        unsigned long irq_flags;
+       int disable;
 
        if (tracing_disabled || tracing_selftest_running)
                return 0;
@@ -1303,7 +1345,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
        cpu = raw_smp_processor_id();
        data = tr->data[cpu];
 
-       if (unlikely(atomic_read(&data->disabled)))
+       disable = atomic_inc_return(&data->disabled);
+       if (unlikely(disable != 1))
                goto out;
 
        pause_graph_tracing();
@@ -1323,13 +1366,15 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 
        memcpy(&entry->buf, trace_buf, len);
        entry->buf[len] = 0;
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
        __raw_spin_unlock(&trace_buf_lock);
        raw_local_irq_restore(irq_flags);
        unpause_graph_tracing();
  out:
+       atomic_dec_return(&data->disabled);
        preempt_enable_notrace();
 
        return len;
@@ -1526,12 +1571,14 @@ static void *s_start(struct seq_file *m, loff_t *pos)
                p = s_next(m, p, &l);
        }
 
+       trace_event_read_lock();
        return p;
 }
 
 static void s_stop(struct seq_file *m, void *p)
 {
        atomic_dec(&trace_record_cmdline_disabled);
+       trace_event_read_unlock();
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1774,6 +1821,7 @@ static int trace_empty(struct trace_iterator *iter)
        return 1;
 }
 
+/*  Called with trace_event_read_lock() held. */
 static enum print_line_t print_trace_line(struct trace_iterator *iter)
 {
        enum print_line_t ret;
@@ -2396,6 +2444,56 @@ static const struct file_operations tracing_readme_fops = {
        .read           = tracing_readme_read,
 };
 
+/*
+ * Read handler that lists the saved pid to command name mappings.
+ *
+ * Walks all SAVED_CMDLINES slots of map_cmdline_to_pid, skips unused ones
+ * and formats each remaining entry as "<pid> <comm>\n" into a temporary
+ * buffer, which is then handed to simple_read_from_buffer() to serve the
+ * @cnt bytes at *@ppos requested through @ubuf.
+ *
+ * Returns the value of simple_read_from_buffer(), or -ENOMEM when one of
+ * the two temporary buffers cannot be allocated.
+ */
+static ssize_t
+tracing_saved_cmdlines_read(struct file *file, char __user *ubuf,
+                               size_t cnt, loff_t *ppos)
+{
+       char *comm;
+       char *text;
+       char *cursor;
+       int total = 0;
+       int slot;
+
+       text = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL);
+       if (!text)
+               return -ENOMEM;
+
+       comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL);
+       if (!comm) {
+               kfree(text);
+               return -ENOMEM;
+       }
+
+       cursor = text;
+       slot = 0;
+       while (slot < SAVED_CMDLINES) {
+               int pid = map_cmdline_to_pid[slot++];
+               int written;
+
+               /* slot holds no mapping */
+               if (pid == -1 || pid == NO_CMDLINE_MAP)
+                       continue;
+
+               trace_find_cmdline(pid, comm);
+               written = sprintf(cursor, "%d %s\n", pid, comm);
+               cursor += written;
+               total += written;
+       }
+
+       total = simple_read_from_buffer(ubuf, cnt, ppos,
+                                       text, total);
+
+       kfree(text);
+       kfree(comm);
+
+       return total;
+}
+
+/* File operations pairing tracing_open_generic() with tracing_saved_cmdlines_read(). */
+static const struct file_operations tracing_saved_cmdlines_fops = {
+    .open       = tracing_open_generic,
+    .read       = tracing_saved_cmdlines_read,
+};
+
 static ssize_t
 tracing_ctrl_read(struct file *filp, char __user *ubuf,
                  size_t cnt, loff_t *ppos)
@@ -2728,6 +2826,9 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
        /* trace pipe does not show start of buffer */
        cpumask_setall(iter->started);
 
+       if (trace_flags & TRACE_ITER_LATENCY_FMT)
+               iter->iter_flags |= TRACE_FILE_LAT_FMT;
+
        iter->cpu_file = cpu_file;
        iter->tr = &global_trace;
        mutex_init(&iter->mutex);
@@ -2915,6 +3016,7 @@ waitagain:
               offsetof(struct trace_iterator, seq));
        iter->pos = -1;
 
+       trace_event_read_lock();
        while (find_next_entry_inc(iter) != NULL) {
                enum print_line_t ret;
                int len = iter->seq.len;
@@ -2931,6 +3033,7 @@ waitagain:
                if (iter->seq.len >= cnt)
                        break;
        }
+       trace_event_read_unlock();
 
        /* Now copy what we have to the user */
        sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
@@ -3053,6 +3156,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                goto out_err;
        }
 
+       trace_event_read_lock();
+
        /* Fill as many pages as possible. */
        for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
                pages[i] = alloc_page(GFP_KERNEL);
@@ -3075,6 +3180,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
                trace_seq_init(&iter->seq);
        }
 
+       trace_event_read_unlock();
        mutex_unlock(&iter->mutex);
 
        spd.nr_pages = i;
@@ -3425,7 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                .spd_release    = buffer_spd_release,
        };
        struct buffer_ref *ref;
-       int size, i;
+       int entries, size, i;
        size_t ret;
 
        if (*ppos & (PAGE_SIZE - 1)) {
@@ -3440,7 +3546,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                len &= PAGE_MASK;
        }
 
-       for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) {
+       entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
+
+       for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) {
                struct page *page;
                int r;
 
@@ -3457,7 +3565,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                }
 
                r = ring_buffer_read_page(ref->buffer, &ref->page,
-                                         len, info->cpu, 0);
+                                         len, info->cpu, 1);
                if (r < 0) {
                        ring_buffer_free_read_page(ref->buffer,
                                                   ref->page);
@@ -3481,6 +3589,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos,
                spd.partial[i].private = (unsigned long)ref;
                spd.nr_pages++;
                *ppos += PAGE_SIZE;
+
+               entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu);
        }
 
        spd.nr_pages = i;
@@ -3508,6 +3618,45 @@ static const struct file_operations tracing_buffers_fops = {
        .llseek         = no_llseek,
 };
 
+/*
+ * Read handler for the per-cpu "stats" file: prints the ring buffer counters
+ * of the cpu held in filp->private_data; returns bytes copied or -ENOMEM.
+ */
+static ssize_t
+tracing_stats_read(struct file *filp, char __user *ubuf,
+                  size_t count, loff_t *ppos)
+{
+       unsigned long cpu = (unsigned long)filp->private_data;
+       struct trace_array *tr = &global_trace;
+       struct trace_seq *s;
+       unsigned long cnt;
+
+       /* This path copies to user space and may sleep; no GFP_ATOMIC needed. */
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM; /* negative errno, never a positive "length" */
+
+       trace_seq_init(s);
+
+       cnt = ring_buffer_entries_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "entries: %ld\n", cnt);
+       cnt = ring_buffer_overrun_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "overrun: %ld\n", cnt);
+       cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "commit overrun: %ld\n", cnt);
+       cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu);
+       trace_seq_printf(s, "nmi dropped: %ld\n", cnt);
+
+       count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len);
+       kfree(s);
+       return count;
+}
+
+static const struct file_operations tracing_stats_fops = { /* ops of the per-cpu "stats" file */
+       .open           = tracing_open_generic,
+       .read           = tracing_stats_read,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3597,7 +3746,7 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
        struct dentry *d_percpu = tracing_dentry_percpu();
-       struct dentry *entry, *d_cpu;
+       struct dentry *d_cpu;
        /* strlen(cpu) + MAX(log10(cpu)) + '\0' */
        char cpu_dir[7];
 
@@ -3612,21 +3761,18 @@ static void tracing_init_debugfs_percpu(long cpu)
        }
 
        /* per cpu trace_pipe */
-       entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
-                               (void *) cpu, &tracing_pipe_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_pipe' entry\n");
+       trace_create_file("trace_pipe", 0444, d_cpu,
+                       (void *) cpu, &tracing_pipe_fops);
 
        /* per cpu trace */
-       entry = debugfs_create_file("trace", 0644, d_cpu,
-                               (void *) cpu, &tracing_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace' entry\n");
+       trace_create_file("trace", 0644, d_cpu,
+                       (void *) cpu, &tracing_fops);
+
+       trace_create_file("trace_pipe_raw", 0444, d_cpu,
+                       (void *) cpu, &tracing_buffers_fops);
 
-       entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
-                                   (void *) cpu, &tracing_buffers_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
+       trace_create_file("stats", 0444, d_cpu,
+                       (void *) cpu, &tracing_stats_fops);
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3782,6 +3928,22 @@ static const struct file_operations trace_options_core_fops = {
        .write = trace_options_core_write,
 };
 
+/* debugfs_create_file() wrapper that logs a warning when creation fails. */
+struct dentry *trace_create_file(const char *name, mode_t mode,
+                                struct dentry *parent, void *data,
+                                const struct file_operations *fops)
+{
+       struct dentry *d;
+
+       d = debugfs_create_file(name, mode, parent, data, fops);
+       if (d)
+               return d;
+
+       pr_warning("Could not create debugfs '%s' entry\n", name);
+       return NULL;
+}
+
+
 static struct dentry *trace_options_init_dentry(void)
 {
        struct dentry *d_tracer;
@@ -3809,7 +3971,6 @@ create_trace_option_file(struct trace_option_dentry *topt,
                         struct tracer_opt *opt)
 {
        struct dentry *t_options;
-       struct dentry *entry;
 
        t_options = trace_options_init_dentry();
        if (!t_options)
@@ -3818,11 +3979,9 @@ create_trace_option_file(struct trace_option_dentry *topt,
        topt->flags = flags;
        topt->opt = opt;
 
-       entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+       topt->entry = trace_create_file(opt->name, 0644, t_options, topt,
                                    &trace_options_fops);
 
-       topt->entry = entry;
-
 }
 
 static struct trace_option_dentry *
@@ -3877,123 +4036,84 @@ static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
        struct dentry *t_options;
-       struct dentry *entry;
 
        t_options = trace_options_init_dentry();
        if (!t_options)
                return NULL;
 
-       entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+       return trace_create_file(option, 0644, t_options, (void *)index,
                                    &trace_options_core_fops);
-
-       return entry;
 }
 
 static __init void create_trace_options_dir(void)
 {
        struct dentry *t_options;
-       struct dentry *entry;
        int i;
 
        t_options = trace_options_init_dentry();
        if (!t_options)
                return;
 
-       for (i = 0; trace_options[i]; i++) {
-               entry = create_trace_option_core_file(trace_options[i], i);
-               if (!entry)
-                       pr_warning("Could not create debugfs %s entry\n",
-                                  trace_options[i]);
-       }
+       for (i = 0; trace_options[i]; i++)
+               create_trace_option_core_file(trace_options[i], i);
 }
 
 static __init int tracer_init_debugfs(void)
 {
        struct dentry *d_tracer;
-       struct dentry *entry;
        int cpu;
 
        d_tracer = tracing_init_dentry();
 
-       entry = debugfs_create_file("tracing_enabled", 0644, d_tracer,
-                                   &global_trace, &tracing_ctrl_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_enabled' entry\n");
+       trace_create_file("tracing_enabled", 0644, d_tracer,
+                       &global_trace, &tracing_ctrl_fops);
 
-       entry = debugfs_create_file("trace_options", 0644, d_tracer,
-                                   NULL, &tracing_iter_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace_options' entry\n");
+       trace_create_file("trace_options", 0644, d_tracer,
+                       NULL, &tracing_iter_fops);
 
-       create_trace_options_dir();
+       trace_create_file("tracing_cpumask", 0644, d_tracer,
+                       NULL, &tracing_cpumask_fops);
+
+       trace_create_file("trace", 0644, d_tracer,
+                       (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 
-       entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
-                                   NULL, &tracing_cpumask_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
-
-       entry = debugfs_create_file("trace", 0644, d_tracer,
-                                (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'trace' entry\n");
-
-       entry = debugfs_create_file("available_tracers", 0444, d_tracer,
-                                   &global_trace, &show_traces_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'available_tracers' entry\n");
-
-       entry = debugfs_create_file("current_tracer", 0444, d_tracer,
-                                   &global_trace, &set_tracer_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'current_tracer' entry\n");
-
-       entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer,
-                                   &tracing_max_latency,
-                                   &tracing_max_lat_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'tracing_max_latency' entry\n");
-
-       entry = debugfs_create_file("tracing_thresh", 0644, d_tracer,
-                                   &tracing_thresh, &tracing_max_lat_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'tracing_thresh' entry\n");
-       entry = debugfs_create_file("README", 0644, d_tracer,
-                                   NULL, &tracing_readme_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'README' entry\n");
-
-       entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+       trace_create_file("available_tracers", 0444, d_tracer,
+                       &global_trace, &show_traces_fops);
+
+       trace_create_file("current_tracer", 0644, d_tracer,
+                       &global_trace, &set_tracer_fops);
+
+       trace_create_file("tracing_max_latency", 0644, d_tracer,
+                       &tracing_max_latency, &tracing_max_lat_fops);
+
+       trace_create_file("tracing_thresh", 0644, d_tracer,
+                       &tracing_thresh, &tracing_max_lat_fops);
+
+       trace_create_file("README", 0444, d_tracer,
+                       NULL, &tracing_readme_fops);
+
+       trace_create_file("trace_pipe", 0444, d_tracer,
                        (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'trace_pipe' entry\n");
-
-       entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer,
-                                   &global_trace, &tracing_entries_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'buffer_size_kb' entry\n");
-
-       entry = debugfs_create_file("trace_marker", 0220, d_tracer,
-                                   NULL, &tracing_mark_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'trace_marker' entry\n");
+
+       trace_create_file("buffer_size_kb", 0644, d_tracer,
+                       &global_trace, &tracing_entries_fops);
+
+       trace_create_file("trace_marker", 0220, d_tracer,
+                       NULL, &tracing_mark_fops);
+
+       trace_create_file("saved_cmdlines", 0444, d_tracer,
+                       NULL, &tracing_saved_cmdlines_fops);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
-       entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
-                                   &ftrace_update_tot_cnt,
-                                   &tracing_dyn_info_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'dyn_ftrace_total_info' entry\n");
+       trace_create_file("dyn_ftrace_total_info", 0444, d_tracer,
+                       &ftrace_update_tot_cnt, &tracing_dyn_info_fops);
 #endif
 #ifdef CONFIG_SYSPROF_TRACER
        init_tracer_sysprof_debugfs(d_tracer);
 #endif
 
+       create_trace_options_dir();
+
        for_each_tracing_cpu(cpu)
                tracing_init_debugfs_percpu(cpu);
 
@@ -4064,7 +4184,8 @@ trace_printk_seq(struct trace_seq *s)
 
 static void __ftrace_dump(bool disable_tracing)
 {
-       static DEFINE_SPINLOCK(ftrace_dump_lock);
+       static raw_spinlock_t ftrace_dump_lock =
+               (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
        /* use static because iter can be a bit big for the stack */
        static struct trace_iterator iter;
        unsigned int old_userobj;
@@ -4073,7 +4194,8 @@ static void __ftrace_dump(bool disable_tracing)
        int cnt = 0, cpu;
 
        /* only one dump */
-       spin_lock_irqsave(&ftrace_dump_lock, flags);
+       local_irq_save(flags);
+       __raw_spin_lock(&ftrace_dump_lock);
        if (dump_ran)
                goto out;
 
@@ -4145,7 +4267,8 @@ static void __ftrace_dump(bool disable_tracing)
        }
 
  out:
-       spin_unlock_irqrestore(&ftrace_dump_lock, flags);
+       __raw_spin_unlock(&ftrace_dump_lock);
+       local_irq_restore(flags);
 }
 
 /* By default: disable tracing after the dump */
index e685ac2b2ba10f1dcf24fef92180d94b75f76367..6e735d4771f8ad9ddd971ee1806f904d7e44e176 100644 (file)
@@ -9,9 +9,12 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <trace/power.h>
 
+#include <linux/trace_seq.h>
+#include <linux/ftrace_event.h>
+
 enum trace_type {
        __TRACE_FIRST_TYPE = 0,
 
@@ -41,20 +44,6 @@ enum trace_type {
        __TRACE_LAST_TYPE,
 };
 
-/*
- * The trace entry - the most basic unit of tracing. This is what
- * is printed in the end as a single line in the trace output, such as:
- *
- *     bash-15816 [01]   235.197585: idle_cpu <- irq_enter
- */
-struct trace_entry {
-       unsigned char           type;
-       unsigned char           flags;
-       unsigned char           preempt_count;
-       int                     pid;
-       int                     tgid;
-};
-
 /*
  * Function trace entry - function address and parent function addres:
  */
@@ -263,8 +252,6 @@ struct trace_array_cpu {
        char                    comm[TASK_COMM_LEN];
 };
 
-struct trace_iterator;
-
 /*
  * The trace array - an array of per-CPU trace arrays. This is the
  * highest level data structure that individual tracers deal with.
@@ -339,15 +326,6 @@ extern void __ftrace_bad_type(void);
                __ftrace_bad_type();                                    \
        } while (0)
 
-/* Return values for print_line callback */
-enum print_line_t {
-       TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
-       TRACE_TYPE_HANDLED      = 1,
-       TRACE_TYPE_UNHANDLED    = 2,    /* Relay to other output functions */
-       TRACE_TYPE_NO_CONSUME   = 3     /* Handled but ask to not consume */
-};
-
-
 /*
  * An option specific to a tracer. This is a boolean value.
  * The bit is the bit index that sets its value on the
@@ -423,60 +401,30 @@ struct tracer {
        struct tracer_stat      *stats;
 };
 
-struct trace_seq {
-       unsigned char           buffer[PAGE_SIZE];
-       unsigned int            len;
-       unsigned int            readpos;
-};
-
-static inline void
-trace_seq_init(struct trace_seq *s)
-{
-       s->len = 0;
-       s->readpos = 0;
-}
-
 
 #define TRACE_PIPE_ALL_CPU     -1
 
-/*
- * Trace iterator - used by printout routines who present trace
- * results to users and which routines might sleep, etc:
- */
-struct trace_iterator {
-       struct trace_array      *tr;
-       struct tracer           *trace;
-       void                    *private;
-       int                     cpu_file;
-       struct mutex            mutex;
-       struct ring_buffer_iter *buffer_iter[NR_CPUS];
-
-       /* The below is zeroed out in pipe_read */
-       struct trace_seq        seq;
-       struct trace_entry      *ent;
-       int                     cpu;
-       u64                     ts;
-
-       unsigned long           iter_flags;
-       loff_t                  pos;
-       long                    idx;
-
-       cpumask_var_t           started;
-};
-
 int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
 void tracing_reset_online_cpus(struct trace_array *tr);
+void tracing_reset_current(int cpu);
+void tracing_reset_current_online_cpus(void);
 int tracing_open_generic(struct inode *inode, struct file *filp);
+struct dentry *trace_create_file(const char *name,
+                                mode_t mode,
+                                struct dentry *parent,
+                                void *data,
+                                const struct file_operations *fops);
+
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
 struct ring_buffer_event;
 
 struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
-                                                   unsigned char type,
+                                                   int type,
                                                    unsigned long len,
                                                    unsigned long flags,
                                                    int pc);
@@ -484,14 +432,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
                                struct ring_buffer_event *event,
                                unsigned long flags, int pc);
 
-struct ring_buffer_event *
-trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
-                                 unsigned long flags, int pc);
-void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
-                                       unsigned long flags, int pc);
-void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
-                                       unsigned long flags, int pc);
-
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
                                                struct trace_array_cpu *data);
 
@@ -514,7 +454,6 @@ void tracing_sched_switch_trace(struct trace_array *tr,
                                struct task_struct *prev,
                                struct task_struct *next,
                                unsigned long flags, int pc);
-void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
                                struct task_struct *wakee,
@@ -599,6 +538,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace,
                                               struct trace_array *tr);
 extern int trace_selftest_startup_branch(struct tracer *trace,
                                         struct trace_array *tr);
+extern int trace_selftest_startup_hw_branches(struct tracer *trace,
+                                             struct trace_array *tr);
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
@@ -613,6 +554,8 @@ extern unsigned long trace_flags;
 /* Standard output formatting function used for function return traces */
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern enum print_line_t print_graph_function(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* TODO: make this variable */
@@ -644,7 +587,6 @@ static inline int ftrace_graph_addr(unsigned long addr)
        return 1;
 }
 #endif /* CONFIG_DYNAMIC_FTRACE */
-
 #else /* CONFIG_FUNCTION_GRAPH_TRACER */
 static inline enum print_line_t
 print_graph_function(struct trace_iterator *iter)
@@ -692,6 +634,7 @@ enum trace_iterator_flags {
        TRACE_ITER_LATENCY_FMT          = 0x40000,
        TRACE_ITER_GLOBAL_CLK           = 0x80000,
        TRACE_ITER_SLEEP_TIME           = 0x100000,
+       TRACE_ITER_GRAPH_TIME           = 0x200000,
 };
 
 /*
@@ -790,103 +733,113 @@ struct ftrace_event_field {
        char                    *type;
        int                     offset;
        int                     size;
+       int                     is_signed;
 };
 
-struct ftrace_event_call {
-       char                    *name;
-       char                    *system;
-       struct dentry           *dir;
-       int                     enabled;
-       int                     (*regfunc)(void);
-       void                    (*unregfunc)(void);
-       int                     id;
-       int                     (*raw_init)(void);
-       int                     (*show_format)(struct trace_seq *s);
-       int                     (*define_fields)(void);
-       struct list_head        fields;
+struct event_filter {
+       int                     n_preds;
        struct filter_pred      **preds;
-
-#ifdef CONFIG_EVENT_PROFILE
-       atomic_t        profile_count;
-       int             (*profile_enable)(struct ftrace_event_call *);
-       void            (*profile_disable)(struct ftrace_event_call *);
-#endif
+       char                    *filter_string;
 };
 
 struct event_subsystem {
        struct list_head        list;
        const char              *name;
        struct dentry           *entry;
-       struct filter_pred      **preds;
+       void                    *filter;
 };
 
-#define events_for_each(event)                                         \
-       for (event = __start_ftrace_events;                             \
-            (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-            event++)
-
-#define MAX_FILTER_PRED 8
-
 struct filter_pred;
 
-typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event,
+                                int val1, int val2);
 
 struct filter_pred {
        filter_pred_fn_t fn;
        u64 val;
-       char *str_val;
+       char str_val[MAX_FILTER_STR_VAL];
        int str_len;
        char *field_name;
        int offset;
        int not;
-       int or;
-       int compound;
-       int clear;
+       int op;
+       int pop_n;
 };
 
-int trace_define_field(struct ftrace_event_call *call, char *type,
-                      char *name, int offset, int size);
-extern void filter_free_pred(struct filter_pred *pred);
-extern void filter_print_preds(struct filter_pred **preds,
+extern void print_event_filter(struct ftrace_event_call *call,
                               struct trace_seq *s);
-extern int filter_parse(char **pbuf, struct filter_pred *pred);
-extern int filter_add_pred(struct ftrace_event_call *call,
-                          struct filter_pred *pred);
-extern void filter_free_preds(struct ftrace_event_call *call);
-extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
-extern void filter_free_subsystem_preds(struct event_subsystem *system);
-extern int filter_add_subsystem_pred(struct event_subsystem *system,
-                                    struct filter_pred *pred);
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#define for_each_event(event)                                          \
-       for (event = __start_ftrace_events;                             \
-            (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-            event++)
+extern int apply_event_filter(struct ftrace_event_call *call,
+                             char *filter_string);
+extern int apply_subsystem_event_filter(struct event_subsystem *system,
+                                       char *filter_string);
+extern void print_subsystem_event_filter(struct event_subsystem *system,
+                                        struct trace_seq *s);
+
+/* Discard @event and return 1 when an active filter rejects @rec, else 0. */
+static inline int
+filter_check_discard(struct ftrace_event_call *call, void *rec,
+                    struct ring_buffer *buffer,
+                    struct ring_buffer_event *event)
+{
+       if (likely(!call->filter_active) || filter_match_preds(call, rec))
+               return 0;
+
+       ring_buffer_discard_commit(buffer, event);
+       return 1;
+}
+
+/*
+ * DEFINE_COMPARISON_PRED(type) expands to filter_pred_<type>(), which reads
+ * a <type> value from @event at byte offset pred->offset and compares it
+ * against pred->val (cast to <type>) with the ordering operator named by
+ * pred->op: OP_LT, OP_LE, OP_GT or OP_GE. Any other operator gives 0.
+ * The val1 and val2 arguments are not used.
+ */
+#define DEFINE_COMPARISON_PRED(type)                                   \
+static int filter_pred_##type(struct filter_pred *pred, void *event,   \
+                             int val1, int val2)                       \
+{                                                                      \
+       const type *field = (type *)(event + pred->offset);             \
+       type limit = (type)pred->val;                                   \
+                                                                       \
+       switch (pred->op) {                                             \
+       case OP_LT:                                                     \
+               return *field < limit;                                  \
+       case OP_LE:                                                     \
+               return *field <= limit;                                 \
+       case OP_GT:                                                     \
+               return *field > limit;                                  \
+       case OP_GE:                                                     \
+               return *field >= limit;                                 \
+       default:                                                        \
+               return 0;                                               \
+       }                                                               \
+}
+
+/*
+ * DEFINE_EQUALITY_PRED(size) expands to filter_pred_<size>(): tests the
+ * u<size> at @event + pred->offset against pred->val, XORed with pred->not.
+ */
+#define DEFINE_EQUALITY_PRED(size)                                     \
+static int filter_pred_##size(struct filter_pred *pred, void *event,   \
+                             int val1, int val2)                       \
+{                                                                      \
+       const u##size *field = (u##size *)(event + pred->offset);       \
+                                                                       \
+       return (*field == (u##size)pred->val) ^ pred->not;              \
+}
+
+extern struct mutex event_mutex;
+extern struct list_head ftrace_events;
 
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
-/*
- * The double __builtin_constant_p is because gcc will give us an error
- * if we try to allocate the static variable to fmt if it is not a
- * constant. Even with the outer if statement optimizing out.
- */
-#define event_trace_printk(ip, fmt, args...)                           \
-do {                                                                   \
-       __trace_printk_check_format(fmt, ##args);                       \
-       tracing_record_cmdline(current);                                \
-       if (__builtin_constant_p(fmt)) {                                \
-               static const char *trace_printk_fmt                     \
-                 __attribute__((section("__trace_printk_fmt"))) =      \
-                       __builtin_constant_p(fmt) ? fmt : NULL;         \
-                                                                       \
-               __trace_bprintk(ip, trace_printk_fmt, ##args);          \
-       } else                                                          \
-               __trace_printk(ip, fmt, ##args);                        \
-} while (0)
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+       extern struct ftrace_event_call event_##call;
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt)
+#include "trace_event_types.h"
 
 #endif /* _LINUX_KERNEL_TRACE_H */
index 7a30fc4c36423fc29f7e4bb97985ccd9e5ca9540..a29ef23ffb47080d81c41950ac4385566b193058 100644 (file)
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/time.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter)
        trace_assign_type(field, entry);
        call = &field->boot_call;
        ts = iter->ts;
-       nsec_rem = do_div(ts, 1000000000);
+       nsec_rem = do_div(ts, NSEC_PER_SEC);
 
        ret = trace_seq_printf(s, "[%5ld.%09ld] calling  %s @ %i\n",
                        (unsigned long)ts, nsec_rem, call->func, call->caller);
@@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter)
        trace_assign_type(field, entry);
        init_ret = &field->boot_ret;
        ts = iter->ts;
-       nsec_rem = do_div(ts, 1000000000);
+       nsec_rem = do_div(ts, NSEC_PER_SEC);
 
        ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s "
                        "returned %d after %llu msecs\n",
index 8333715e4066eed26e53ecda1077d69c317679d3..7a7a9fd249a9c7c158d136dfc998efc9e908f35c 100644 (file)
@@ -30,6 +30,7 @@ static struct trace_array *branch_tracer;
 static void
 probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 {
+       struct ftrace_event_call *call = &event_branch;
        struct trace_array *tr = branch_tracer;
        struct ring_buffer_event *event;
        struct trace_branch *entry;
@@ -73,7 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
        entry->line = f->line;
        entry->correct = val == expect;
 
-       ring_buffer_unlock_commit(tr->buffer, event);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
        atomic_dec(&tr->data[cpu]->disabled);
@@ -271,7 +273,7 @@ static int branch_stat_show(struct seq_file *m, void *v)
        return 0;
 }
 
-static void *annotated_branch_stat_start(void)
+static void *annotated_branch_stat_start(struct tracer_stat *trace)
 {
        return __start_annotated_branch_profile;
 }
@@ -346,7 +348,7 @@ static int all_branch_stat_headers(struct seq_file *m)
        return 0;
 }
 
-static void *all_branch_stat_start(void)
+static void *all_branch_stat_start(struct tracer_stat *trace)
 {
        return __start_branch_profile;
 }
index 22cba9970776cf4984cb93b3e3a7fc0c5687f912..5b5895afecfe425f5c917af29b13d29ff4acf918 100644 (file)
 int ftrace_profile_enable(int event_id)
 {
        struct ftrace_event_call *event;
+       int ret = -EINVAL;
 
-       for_each_event(event) {
-               if (event->id == event_id)
-                       return event->profile_enable(event);
+       mutex_lock(&event_mutex);
+       list_for_each_entry(event, &ftrace_events, list) {
+               if (event->id == event_id) {
+                       ret = event->profile_enable(event);
+                       break;
+               }
        }
+       mutex_unlock(&event_mutex);
 
-       return -EINVAL;
+       return ret;
 }
 
 void ftrace_profile_disable(int event_id)
 {
        struct ftrace_event_call *event;
 
-       for_each_event(event) {
-               if (event->id == event_id)
-                       return event->profile_disable(event);
+       mutex_lock(&event_mutex);
+       list_for_each_entry(event, &ftrace_events, list) {
+               if (event->id == event_id) {
+                       event->profile_disable(event);
+                       break;
+               }
        }
+       mutex_unlock(&event_mutex);
 }
-
index fd78bee71dd7519e03d45d38e3de9e58579af234..5e32e375134d976183bb0ae5df79abda3f029a54 100644 (file)
@@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
        TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
-TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore,
        TRACE_STRUCT(
                TRACE_FIELD(unsigned long, arg1, arg1)
                TRACE_FIELD(unsigned long, arg2, arg2)
@@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
        TRACE_STRUCT(
                TRACE_FIELD(unsigned int, line, line)
-               TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
-               TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+               TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func,
+                                   TRACE_FUNC_SIZE+1, func)
+               TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file,
+                                   TRACE_FUNC_SIZE+1, file)
                TRACE_FIELD(char, correct, correct)
        ),
        TP_RAW_FMT("%u:%s:%s (%u)")
@@ -139,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
        TRACE_STRUCT(
-               TRACE_FIELD(ktime_t, state_data.stamp, stamp)
-               TRACE_FIELD(ktime_t, state_data.end, end)
+               TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1)
+               TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1)
                TRACE_FIELD(int, state_data.type, type)
                TRACE_FIELD(int, state_data.state, state)
        ),
index 576f4fa2af0da22cad87d1c20fb044d89c16a066..aa08be69a1b6c1fcc026359ece8613812ad1a492 100644 (file)
@@ -8,19 +8,25 @@
  *
  */
 
+#include <linux/workqueue.h>
+#include <linux/spinlock.h>
+#include <linux/kthread.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/delay.h>
 
 #include "trace_output.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
-static DEFINE_MUTEX(event_mutex);
+DEFINE_MUTEX(event_mutex);
+
+LIST_HEAD(ftrace_events);
 
 int trace_define_field(struct ftrace_event_call *call, char *type,
-                      char *name, int offset, int size)
+                      char *name, int offset, int size, int is_signed)
 {
        struct ftrace_event_field *field;
 
@@ -38,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 
        field->offset = offset;
        field->size = size;
+       field->is_signed = is_signed;
        list_add(&field->link, &call->fields);
 
        return 0;
@@ -51,47 +58,94 @@ err:
 
        return -ENOMEM;
 }
+EXPORT_SYMBOL_GPL(trace_define_field);
 
-static void ftrace_clear_events(void)
-{
-       struct ftrace_event_call *call = (void *)__start_ftrace_events;
-
+#ifdef CONFIG_MODULES
 
-       while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+static void trace_destroy_fields(struct ftrace_event_call *call)
+{
+       struct ftrace_event_field *field, *next;
 
-               if (call->enabled) {
-                       call->enabled = 0;
-                       call->unregfunc();
-               }
-               call++;
+       list_for_each_entry_safe(field, next, &call->fields, link) {
+               list_del(&field->link);
+               kfree(field->type);
+               kfree(field->name);
+               kfree(field);
        }
 }
 
+#endif /* CONFIG_MODULES */
+
 static void ftrace_event_enable_disable(struct ftrace_event_call *call,
                                        int enable)
 {
-
        switch (enable) {
        case 0:
                if (call->enabled) {
                        call->enabled = 0;
+                       tracing_stop_cmdline_record();
                        call->unregfunc();
                }
                break;
        case 1:
                if (!call->enabled) {
                        call->enabled = 1;
+                       tracing_start_cmdline_record();
                        call->regfunc();
                }
                break;
        }
 }
 
+static void ftrace_clear_events(void)
+{
+       struct ftrace_event_call *call;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               ftrace_event_enable_disable(call, 0);
+       }
+       mutex_unlock(&event_mutex);
+}
+
+/*
+ * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events.
+ */
+static int __ftrace_set_clr_event(const char *match, const char *sub,
+                                 const char *event, int set)
+{
+       struct ftrace_event_call *call;
+       int ret = -EINVAL;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+
+               if (!call->name || !call->regfunc)
+                       continue;
+
+               if (match &&
+                   strcmp(match, call->name) != 0 &&
+                   strcmp(match, call->system) != 0)
+                       continue;
+
+               if (sub && strcmp(sub, call->system) != 0)
+                       continue;
+
+               if (event && strcmp(event, call->name) != 0)
+                       continue;
+
+               ftrace_event_enable_disable(call, set);
+
+               ret = 0;
+       }
+       mutex_unlock(&event_mutex);
+
+       return ret;
+}
+
 static int ftrace_set_clr_event(char *buf, int set)
 {
-       struct ftrace_event_call *call = __start_ftrace_events;
        char *event = NULL, *sub = NULL, *match;
-       int ret = -EINVAL;
 
        /*
         * The buf format can be <subsystem>:<event-name>
@@ -117,30 +171,24 @@ static int ftrace_set_clr_event(char *buf, int set)
                        event = NULL;
        }
 
-       mutex_lock(&event_mutex);
-       for_each_event(call) {
-
-               if (!call->name || !call->regfunc)
-                       continue;
-
-               if (match &&
-                   strcmp(match, call->name) != 0 &&
-                   strcmp(match, call->system) != 0)
-                       continue;
-
-               if (sub && strcmp(sub, call->system) != 0)
-                       continue;
-
-               if (event && strcmp(event, call->name) != 0)
-                       continue;
-
-               ftrace_event_enable_disable(call, set);
-
-               ret = 0;
-       }
-       mutex_unlock(&event_mutex);
+       return __ftrace_set_clr_event(match, sub, event, set);
+}
 
-       return ret;
+/**
+ * trace_set_clr_event - enable or disable an event
+ * @system: system name to match (NULL for any system)
+ * @event: event name to match (NULL for all events, within system)
+ * @set: 1 to enable, 0 to disable
+ *
+ * This is a way for other parts of the kernel to enable or disable
+ * event recording.
+ *
+ * Returns 0 on success, -EINVAL if the parameters do not match any
+ * registered events.
+ */
+int trace_set_clr_event(const char *system, const char *event, int set)
+{
+       return __ftrace_set_clr_event(NULL, system, event, set);
 }
 
 /* 128 should be much more than enough */
@@ -224,15 +272,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
-       struct ftrace_event_call *call = m->private;
-       struct ftrace_event_call *next = call;
+       struct list_head *list = m->private;
+       struct ftrace_event_call *call;
 
        (*pos)++;
 
        for (;;) {
-               if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+               if (list == &ftrace_events)
                        return NULL;
 
+               call = list_entry(list, struct ftrace_event_call, list);
+
                /*
                 * The ftrace subsystem is for showing formats only.
                 * They can not be enabled or disabled via the event files.
@@ -240,45 +290,51 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
                if (call->regfunc)
                        break;
 
-               call++;
-               next = call;
+               list = list->next;
        }
 
-       m->private = ++next;
+       m->private = list->next;
 
        return call;
 }
 
 static void *t_start(struct seq_file *m, loff_t *pos)
 {
+       mutex_lock(&event_mutex);
+       if (*pos == 0)
+               m->private = ftrace_events.next;
        return t_next(m, NULL, pos);
 }
 
 static void *
 s_next(struct seq_file *m, void *v, loff_t *pos)
 {
-       struct ftrace_event_call *call = m->private;
-       struct ftrace_event_call *next;
+       struct list_head *list = m->private;
+       struct ftrace_event_call *call;
 
        (*pos)++;
 
  retry:
-       if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+       if (list == &ftrace_events)
                return NULL;
 
+       call = list_entry(list, struct ftrace_event_call, list);
+
        if (!call->enabled) {
-               call++;
+               list = list->next;
                goto retry;
        }
 
-       next = call;
-       m->private = ++next;
+       m->private = list->next;
 
        return call;
 }
 
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
+       mutex_lock(&event_mutex);
+       if (*pos == 0)
+               m->private = ftrace_events.next;
        return s_next(m, NULL, pos);
 }
 
@@ -295,12 +351,12 @@ static int t_show(struct seq_file *m, void *v)
 
 static void t_stop(struct seq_file *m, void *p)
 {
+       mutex_unlock(&event_mutex);
 }
 
 static int
 ftrace_event_seq_open(struct inode *inode, struct file *file)
 {
-       int ret;
        const struct seq_operations *seq_ops;
 
        if ((file->f_mode & FMODE_WRITE) &&
@@ -308,13 +364,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
                ftrace_clear_events();
 
        seq_ops = inode->i_private;
-       ret = seq_open(file, seq_ops);
-       if (!ret) {
-               struct seq_file *m = file->private_data;
-
-               m->private = __start_ftrace_events;
-       }
-       return ret;
+       return seq_open(file, seq_ops);
 }
 
 static ssize_t
@@ -374,8 +424,93 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
        return cnt;
 }
 
+static ssize_t
+system_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+                  loff_t *ppos)
+{
+       const char set_to_char[4] = { '?', '0', '1', 'X' };
+       const char *system = filp->private_data;
+       struct ftrace_event_call *call;
+       char buf[2];
+       int set = 0;
+       int ret;
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (!call->name || !call->regfunc)
+                       continue;
+
+               if (system && strcmp(call->system, system) != 0)
+                       continue;
+
+               /*
+                * We need to find out if all the events are set
+                * or if all events are cleared, or if we have
+                * a mixture.
+                */
+               set |= (1 << !!call->enabled);
+
+               /*
+                * If we have a mixture, no need to look further.
+                */
+               if (set == 3)
+                       break;
+       }
+       mutex_unlock(&event_mutex);
+
+       buf[0] = set_to_char[set];
+       buf[1] = '\n';
+
+       ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+
+       return ret;
+}
+
+static ssize_t
+system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+                   loff_t *ppos)
+{
+       const char *system = filp->private_data;
+       unsigned long val;
+       char buf[64];
+       ssize_t ret;
+
+       if (cnt >= sizeof(buf))
+               return -EINVAL;
+
+       if (copy_from_user(&buf, ubuf, cnt))
+               return -EFAULT;
+
+       buf[cnt] = 0;
+
+       ret = strict_strtoul(buf, 10, &val);
+       if (ret < 0)
+               return ret;
+
+       ret = tracing_update_buffers();
+       if (ret < 0)
+               return ret;
+
+       if (val != 0 && val != 1)
+               return -EINVAL;
+
+       ret = __ftrace_set_clr_event(NULL, system, NULL, val);
+       if (ret)
+               goto out;
+
+       ret = cnt;
+
+out:
+       *ppos += cnt;
+
+       return ret;
+}
+
+extern char *__bad_type_size(void);
+
 #undef FIELD
 #define FIELD(type, name)                                              \
+       sizeof(type) != sizeof(field.name) ? __bad_type_size() :        \
        #type, "common_" #name, offsetof(typeof(field), name),          \
                sizeof(field.name)
 
@@ -391,7 +526,7 @@ static int trace_write_header(struct trace_seq *s)
                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
                                "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
                                "\n",
-                               FIELD(unsigned char, type),
+                               FIELD(unsigned short, type),
                                FIELD(unsigned char, flags),
                                FIELD(unsigned char, preempt_count),
                                FIELD(int, pid),
@@ -481,7 +616,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
        trace_seq_init(s);
 
-       filter_print_preds(call->preds, s);
+       print_event_filter(call, s);
        r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
        kfree(s);
@@ -494,38 +629,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
                   loff_t *ppos)
 {
        struct ftrace_event_call *call = filp->private_data;
-       char buf[64], *pbuf = buf;
-       struct filter_pred *pred;
+       char *buf;
        int err;
 
-       if (cnt >= sizeof(buf))
+       if (cnt >= PAGE_SIZE)
                return -EINVAL;
 
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = '\0';
-
-       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-       if (!pred)
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
                return -ENOMEM;
 
-       err = filter_parse(&pbuf, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
-               return err;
-       }
-
-       if (pred->clear) {
-               filter_free_preds(call);
-               filter_free_pred(pred);
-               return cnt;
+       if (copy_from_user(buf, ubuf, cnt)) {
+               free_page((unsigned long) buf);
+               return -EFAULT;
        }
+       buf[cnt] = '\0';
 
-       err = filter_add_pred(call, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
+       err = apply_event_filter(call, buf);
+       free_page((unsigned long) buf);
+       if (err < 0)
                return err;
-       }
 
        *ppos += cnt;
 
@@ -549,7 +672,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
        trace_seq_init(s);
 
-       filter_print_preds(system->preds, s);
+       print_subsystem_event_filter(system, s);
        r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
        kfree(s);
@@ -562,45 +685,56 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
                       loff_t *ppos)
 {
        struct event_subsystem *system = filp->private_data;
-       char buf[64], *pbuf = buf;
-       struct filter_pred *pred;
+       char *buf;
        int err;
 
-       if (cnt >= sizeof(buf))
+       if (cnt >= PAGE_SIZE)
                return -EINVAL;
 
-       if (copy_from_user(&buf, ubuf, cnt))
-               return -EFAULT;
-       buf[cnt] = '\0';
-
-       pred = kzalloc(sizeof(*pred), GFP_KERNEL);
-       if (!pred)
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
                return -ENOMEM;
 
-       err = filter_parse(&pbuf, pred);
-       if (err < 0) {
-               filter_free_pred(pred);
-               return err;
-       }
-
-       if (pred->clear) {
-               filter_free_subsystem_preds(system);
-               filter_free_pred(pred);
-               return cnt;
+       if (copy_from_user(buf, ubuf, cnt)) {
+               free_page((unsigned long) buf);
+               return -EFAULT;
        }
+       buf[cnt] = '\0';
 
-       err = filter_add_subsystem_pred(system, pred);
-       if (err < 0) {
-               filter_free_subsystem_preds(system);
-               filter_free_pred(pred);
+       err = apply_subsystem_event_filter(system, buf);
+       free_page((unsigned long) buf);
+       if (err < 0)
                return err;
-       }
 
        *ppos += cnt;
 
        return cnt;
 }
 
+static ssize_t
+show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+       int (*func)(struct trace_seq *s) = filp->private_data;
+       struct trace_seq *s;
+       int r;
+
+       if (*ppos)
+               return 0;
+
+       s = kmalloc(sizeof(*s), GFP_KERNEL);
+       if (!s)
+               return -ENOMEM;
+
+       trace_seq_init(s);
+
+       func(s);
+       r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
+
+       kfree(s);
+
+       return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
        .start = t_start,
        .next = t_next,
@@ -658,6 +792,17 @@ static const struct file_operations ftrace_subsystem_filter_fops = {
        .write = subsystem_filter_write,
 };
 
+static const struct file_operations ftrace_system_enable_fops = {
+       .open = tracing_open_generic,
+       .read = system_enable_read,
+       .write = system_enable_write,
+};
+
+static const struct file_operations ftrace_show_header_fops = {
+       .open = tracing_open_generic,
+       .read = show_header,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
        static struct dentry *d_tracer;
@@ -684,6 +829,7 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
        struct event_subsystem *system;
+       struct dentry *entry;
 
        /* First see if we did not already create this dir */
        list_for_each_entry(system, &event_subsystems, list) {
@@ -707,16 +853,46 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
                return d_events;
        }
 
-       system->name = name;
+       system->name = kstrdup(name, GFP_KERNEL);
+       if (!system->name) {
+               debugfs_remove(system->entry);
+               kfree(system);
+               return d_events;
+       }
+
        list_add(&system->list, &event_subsystems);
 
-       system->preds = NULL;
+       system->filter = NULL;
+
+       system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL);
+       if (!system->filter) {
+               pr_warning("Could not allocate filter for subsystem "
+                          "'%s'\n", name);
+               return system->entry;
+       }
+
+       entry = debugfs_create_file("filter", 0644, system->entry, system,
+                                   &ftrace_subsystem_filter_fops);
+       if (!entry) {
+               kfree(system->filter);
+               system->filter = NULL;
+               pr_warning("Could not create debugfs "
+                          "'%s/filter' entry\n", name);
+       }
+
+       entry = trace_create_file("enable", 0644, system->entry,
+                                 (void *)system->name,
+                                 &ftrace_system_enable_fops);
 
        return system->entry;
 }
 
 static int
-event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events,
+                const struct file_operations *id,
+                const struct file_operations *enable,
+                const struct file_operations *filter,
+                const struct file_operations *format)
 {
        struct dentry *entry;
        int ret;
@@ -725,7 +901,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
         * If the trace point header did not define TRACE_SYSTEM
         * then the system would be called "TRACE_SYSTEM".
         */
-       if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+       if (strcmp(call->system, TRACE_SYSTEM) != 0)
                d_events = event_subsystem_dir(call->system, d_events);
 
        if (call->raw_init) {
@@ -744,21 +920,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
                return -1;
        }
 
-       if (call->regfunc) {
-               entry = debugfs_create_file("enable", 0644, call->dir, call,
-                                           &ftrace_enable_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs "
-                                  "'%s/enable' entry\n", call->name);
-       }
+       if (call->regfunc)
+               entry = trace_create_file("enable", 0644, call->dir, call,
+                                         enable);
 
-       if (call->id) {
-               entry = debugfs_create_file("id", 0444, call->dir, call,
-                               &ftrace_event_id_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs '%s/id' entry\n",
-                                       call->name);
-       }
+       if (call->id)
+               entry = trace_create_file("id", 0444, call->dir, call,
+                                         id);
 
        if (call->define_fields) {
                ret = call->define_fields();
@@ -767,32 +935,195 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
                                   " events/%s\n", call->name);
                        return ret;
                }
-               entry = debugfs_create_file("filter", 0644, call->dir, call,
-                                           &ftrace_event_filter_fops);
-               if (!entry)
-                       pr_warning("Could not create debugfs "
-                                  "'%s/filter' entry\n", call->name);
+               entry = trace_create_file("filter", 0644, call->dir, call,
+                                         filter);
        }
 
        /* A trace may not want to export its format */
        if (!call->show_format)
                return 0;
 
-       entry = debugfs_create_file("format", 0444, call->dir, call,
-                                   &ftrace_event_format_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'%s/format' entry\n", call->name);
+       entry = trace_create_file("format", 0444, call->dir, call,
+                                 format);
+
+       return 0;
+}
+
+#define for_each_event(event, start, end)                      \
+       for (event = start;                                     \
+            (unsigned long)event < (unsigned long)end;         \
+            event++)
+
+#ifdef CONFIG_MODULES
+
+static LIST_HEAD(ftrace_module_file_list);
+
+/*
+ * Modules must own their file_operations to keep up with
+ * reference counting.
+ */
+struct ftrace_module_file_ops {
+       struct list_head                list;
+       struct module                   *mod;
+       struct file_operations          id;
+       struct file_operations          enable;
+       struct file_operations          format;
+       struct file_operations          filter;
+};
+
+static struct ftrace_module_file_ops *
+trace_create_file_ops(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops;
+
+       /*
+        * This is a bit of a PITA. To allow for correct reference
+        * counting, modules must "own" their file_operations.
+        * To do this, we allocate the file operations that will be
+        * used in the event directory.
+        */
+
+       file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL);
+       if (!file_ops)
+               return NULL;
+
+       file_ops->mod = mod;
+
+       file_ops->id = ftrace_event_id_fops;
+       file_ops->id.owner = mod;
+
+       file_ops->enable = ftrace_enable_fops;
+       file_ops->enable.owner = mod;
+
+       file_ops->filter = ftrace_event_filter_fops;
+       file_ops->filter.owner = mod;
+
+       file_ops->format = ftrace_event_format_fops;
+       file_ops->format.owner = mod;
+
+       list_add(&file_ops->list, &ftrace_module_file_list);
+
+       return file_ops;
+}
+
+static void trace_module_add_events(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops = NULL;
+       struct ftrace_event_call *call, *start, *end;
+       struct dentry *d_events;
+
+       start = mod->trace_events;
+       end = mod->trace_events + mod->num_trace_events;
+
+       if (start == end)
+               return;
+
+       d_events = event_trace_events_dir();
+       if (!d_events)
+               return;
+
+       for_each_event(call, start, end) {
+               /* The linker may leave blanks */
+               if (!call->name)
+                       continue;
+
+               /*
+                * This module has events, create file ops for this module
+                * if not already done.
+                */
+               if (!file_ops) {
+                       file_ops = trace_create_file_ops(mod);
+                       if (!file_ops)
+                               return;
+               }
+               call->mod = mod;
+               list_add(&call->list, &ftrace_events);
+               event_create_dir(call, d_events,
+                                &file_ops->id, &file_ops->enable,
+                                &file_ops->filter, &file_ops->format);
+       }
+}
+
+static void trace_module_remove_events(struct module *mod)
+{
+       struct ftrace_module_file_ops *file_ops;
+       struct ftrace_event_call *call, *p;
+       bool found = false;
+
+       down_write(&trace_event_mutex);
+       list_for_each_entry_safe(call, p, &ftrace_events, list) {
+               if (call->mod == mod) {
+                       found = true;
+                       ftrace_event_enable_disable(call, 0);
+                       if (call->event)
+                               __unregister_ftrace_event(call->event);
+                       debugfs_remove_recursive(call->dir);
+                       list_del(&call->list);
+                       trace_destroy_fields(call);
+                       destroy_preds(call);
+               }
+       }
+
+       /* Now free the file_operations */
+       list_for_each_entry(file_ops, &ftrace_module_file_list, list) {
+               if (file_ops->mod == mod)
+                       break;
+       }
+       if (&file_ops->list != &ftrace_module_file_list) {
+               list_del(&file_ops->list);
+               kfree(file_ops);
+       }
+
+       /*
+        * It is safest to reset the ring buffer if the module being unloaded
+        * registered any events.
+        */
+       if (found)
+               tracing_reset_current_online_cpus();
+       up_write(&trace_event_mutex);
+}
+
+static int trace_module_notify(struct notifier_block *self,
+                              unsigned long val, void *data)
+{
+       struct module *mod = data;
+
+       mutex_lock(&event_mutex);
+       switch (val) {
+       case MODULE_STATE_COMING:
+               trace_module_add_events(mod);
+               break;
+       case MODULE_STATE_GOING:
+               trace_module_remove_events(mod);
+               break;
+       }
+       mutex_unlock(&event_mutex);
 
        return 0;
 }
+#else
+static int trace_module_notify(struct notifier_block *self,
+                              unsigned long val, void *data)
+{
+       return 0;
+}
+#endif /* CONFIG_MODULES */
+
+struct notifier_block trace_module_nb = {
+       .notifier_call = trace_module_notify,
+       .priority = 0,
+};
+
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
 
 static __init int event_trace_init(void)
 {
-       struct ftrace_event_call *call = __start_ftrace_events;
+       struct ftrace_event_call *call;
        struct dentry *d_tracer;
        struct dentry *entry;
        struct dentry *d_events;
+       int ret;
 
        d_tracer = tracing_init_dentry();
        if (!d_tracer)
@@ -816,13 +1147,243 @@ static __init int event_trace_init(void)
        if (!d_events)
                return 0;
 
-       for_each_event(call) {
+       /* ring buffer internal formats */
+       trace_create_file("header_page", 0444, d_events,
+                         ring_buffer_print_page_header,
+                         &ftrace_show_header_fops);
+
+       trace_create_file("header_event", 0444, d_events,
+                         ring_buffer_print_entry_header,
+                         &ftrace_show_header_fops);
+
+       trace_create_file("enable", 0644, d_events,
+                         NULL, &ftrace_system_enable_fops);
+
+       for_each_event(call, __start_ftrace_events, __stop_ftrace_events) {
                /* The linker may leave blanks */
                if (!call->name)
                        continue;
-               event_create_dir(call, d_events);
+               list_add(&call->list, &ftrace_events);
+               event_create_dir(call, d_events, &ftrace_event_id_fops,
+                                &ftrace_enable_fops, &ftrace_event_filter_fops,
+                                &ftrace_event_format_fops);
        }
 
+       ret = register_module_notifier(&trace_module_nb);
+       if (ret)
+               pr_warning("Failed to register trace events module notifier\n");
+
        return 0;
 }
 fs_initcall(event_trace_init);
+
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+
+static DEFINE_SPINLOCK(test_spinlock);
+static DEFINE_SPINLOCK(test_spinlock_irq);
+static DEFINE_MUTEX(test_mutex);
+
+/*
+ * Work item used by the self test: takes and releases a plain spinlock,
+ * an irq-disabling spinlock (nested inside the first) and a mutex, with a
+ * short delay/sleep while each is held.
+ */
+static __init void test_work(struct work_struct *dummy)
+{
+       spin_lock(&test_spinlock);
+       spin_lock_irq(&test_spinlock_irq);
+       udelay(1);
+       spin_unlock_irq(&test_spinlock_irq);
+       spin_unlock(&test_spinlock);
+
+       mutex_lock(&test_mutex);
+       msleep(1);
+       mutex_unlock(&test_mutex);
+}
+
+/*
+ * Body of the "test-events" kernel thread.
+ *
+ * Performs an allocation, runs test_work() on every CPU, frees the
+ * allocation and then sleeps until kthread_stop() is called on it.
+ * Always returns 0.
+ */
+static __init int event_test_thread(void *unused)
+{
+       void *test_malloc;
+
+       test_malloc = kmalloc(1234, GFP_KERNEL);
+       if (!test_malloc)
+               pr_info("failed to kmalloc\n");
+
+       schedule_on_each_cpu(test_work);
+
+       /* kfree(NULL) is a no-op, so a failed allocation is fine here */
+       kfree(test_malloc);
+
+       /*
+        * The task state must be set again before every schedule(): a
+        * wakeup puts the task back into TASK_RUNNING, and calling
+        * schedule() in that state returns immediately, which would turn
+        * this wait into a busy loop.
+        */
+       set_current_state(TASK_INTERRUPTIBLE);
+       while (!kthread_should_stop()) {
+               schedule();
+               set_current_state(TASK_INTERRUPTIBLE);
+       }
+       /* Do not leave the thread function in a sleeping state */
+       __set_current_state(TASK_RUNNING);
+
+       return 0;
+}
+
+/*
+ * Do various things that may trigger events.
+ *
+ * Starts the "test-events" thread, gives it a moment to run and then
+ * stops it again.  If the thread cannot be created the step is skipped
+ * with a warning.
+ */
+static __init void event_test_stuff(void)
+{
+       struct task_struct *test_thread;
+
+       test_thread = kthread_run(event_test_thread, NULL, "test-events");
+       /*
+        * kthread_run() reports failure through an ERR_PTR value, which
+        * must never be handed to kthread_stop().
+        */
+       if (IS_ERR(test_thread)) {
+               pr_warning("failed to start test-events thread\n");
+               return;
+       }
+       msleep(1);
+       kthread_stop(test_thread);
+}
+
+/*
+ * For every trace event defined, we will test each trace point separately,
+ * and then by groups, and finally all trace points.
+ *
+ * Each step enables the event(s), calls event_test_stuff() and disables
+ * them again.  Progress is printed with pr_info()/pr_cont(); failures to
+ * enable or disable are reported with a warning.
+ */
+static __init void event_trace_self_tests(void)
+{
+       struct ftrace_event_call *call;
+       struct event_subsystem *system;
+       int ret;
+
+       pr_info("Running tests on trace events:\n");
+
+       list_for_each_entry(call, &ftrace_events, list) {
+
+               /* Only test those that have a regfunc */
+               if (!call->regfunc)
+                       continue;
+
+               pr_info("Testing event %s: ", call->name);
+
+               /*
+                * If an event is already enabled, someone is using
+                * it and the self test should not be on.
+                */
+               if (call->enabled) {
+                       pr_warning("Enabled event during self test!\n");
+                       WARN_ON_ONCE(1);
+                       continue;
+               }
+
+               ftrace_event_enable_disable(call, 1);
+               event_test_stuff();
+               ftrace_event_enable_disable(call, 0);
+
+               pr_cont("OK\n");
+       }
+
+       /* Now test at the sub system level */
+
+       pr_info("Running tests on trace event systems:\n");
+
+       list_for_each_entry(system, &event_subsystems, list) {
+
+               /* the ftrace system is special, skip it */
+               if (strcmp(system->name, "ftrace") == 0)
+                       continue;
+
+               pr_info("Testing event system %s: ", system->name);
+
+               /* Enable every event of this subsystem */
+               ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1);
+               if (WARN_ON_ONCE(ret)) {
+                       pr_warning("error enabling system %s\n",
+                                  system->name);
+                       continue;
+               }
+
+               event_test_stuff();
+
+               /* A failure to disable is reported but "OK" is still printed */
+               ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0);
+               if (WARN_ON_ONCE(ret))
+                       pr_warning("error disabling system %s\n",
+                                  system->name);
+
+               pr_cont("OK\n");
+       }
+
+       /* Test with all events enabled */
+
+       pr_info("Running tests on all trace events:\n");
+       pr_info("Testing all events: ");
+
+       ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1);
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error enabling all events\n");
+               return;
+       }
+
+       event_test_stuff();
+
+       /* Disable all events again */
+       ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0);
+       if (WARN_ON_ONCE(ret)) {
+               pr_warning("error disabling all events\n");
+               return;
+       }
+
+       pr_cont("OK\n");
+}
+
+#ifdef CONFIG_FUNCTION_TRACER
+
+static DEFINE_PER_CPU(atomic_t, test_event_disable);
+
+/*
+ * Function tracer callback used by the self test: writes one TRACE_FN
+ * entry holding @ip and @parent_ip into the current trace buffer.
+ *
+ * A per-cpu counter (test_event_disable) is incremented on entry and
+ * decremented on exit; if it was already non-zero on this CPU the call
+ * returns without recording anything.
+ */
+static void
+function_test_events_call(unsigned long ip, unsigned long parent_ip)
+{
+       struct ring_buffer_event *event;
+       struct ftrace_entry *entry;
+       unsigned long flags;
+       long disabled;
+       int resched;
+       int cpu;
+       int pc;
+
+       /* Sample the preempt count before preemption is disabled below */
+       pc = preempt_count();
+       resched = ftrace_preempt_disable();
+       cpu = raw_smp_processor_id();
+       disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu));
+
+       /* Another invocation is already in progress on this CPU */
+       if (disabled != 1)
+               goto out;
+
+       local_save_flags(flags);
+
+       event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry),
+                                                 flags, pc);
+       if (!event)
+               goto out;
+       entry   = ring_buffer_event_data(event);
+       entry->ip                       = ip;
+       entry->parent_ip                = parent_ip;
+
+       trace_nowake_buffer_unlock_commit(event, flags, pc);
+
+ out:
+       /* Undo the increment and the preempt disable on every path */
+       atomic_dec(&per_cpu(test_event_disable, cpu));
+       ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_ops __initdata  =
+{
+       .func = function_test_events_call,
+};
+
+/*
+ * Re-run the event self tests while function_test_events_call() is
+ * registered as a function tracer callback.  The pass is skipped when
+ * the callback cannot be registered.
+ */
+static __init void event_trace_self_test_with_function(void)
+{
+       int ret;
+
+       ret = register_ftrace_function(&trace_ops);
+       /*
+        * Without the callback the second pass would only repeat the
+        * first one, and unregistering ops that were never registered
+        * would be wrong.
+        */
+       if (WARN_ON(ret < 0)) {
+               pr_info("Failed to enable function tracer for event tests\n");
+               return;
+       }
+       pr_info("Running tests again, along with the function tracer\n");
+       event_trace_self_tests();
+       unregister_ftrace_function(&trace_ops);
+}
+#else
+static __init void event_trace_self_test_with_function(void)
+{
+}
+#endif
+
+/*
+ * Initcall entry point of the self tests: runs the event tests once on
+ * their own and once more through event_trace_self_test_with_function().
+ * Always returns 0.
+ */
+static __init int event_trace_self_tests_init(void)
+{
+
+       event_trace_self_tests();
+
+       event_trace_self_test_with_function();
+
+       return 0;
+}
+
+late_initcall(event_trace_self_tests_init);
+
+#endif
index e03cbf1e38f36b306f8eafd2ed2d79837b901a37..db6e54bdb596d90b07fef8fed03520157ecf4390 100644 (file)
 #include <linux/uaccess.h>
 #include <linux/module.h>
 #include <linux/ctype.h>
+#include <linux/mutex.h>
 
 #include "trace.h"
 #include "trace_output.h"
 
-static int filter_pred_64(struct filter_pred *pred, void *event)
+static DEFINE_MUTEX(filter_mutex);
+
+enum filter_op_ids
+{
+       OP_OR,
+       OP_AND,
+       OP_NE,
+       OP_EQ,
+       OP_LT,
+       OP_LE,
+       OP_GT,
+       OP_GE,
+       OP_NONE,
+       OP_OPEN_PAREN,
+};
+
+struct filter_op {
+       int id;
+       char *string;
+       int precedence;
+};
+
+/*
+ * Operator table: id, textual form and precedence of each operator.
+ *
+ * is_precedence_lower() indexes the parser's ops table directly by
+ * operator id, so - assuming this is the table handed to parse_init() -
+ * the entries must stay in the same order as enum filter_op_ids.  The
+ * "OP_NONE" entry also terminates the scans in is_op_char() and
+ * infix_get_op(), so real operators must come before it.
+ */
+static struct filter_op filter_ops[] = {
+       { OP_OR, "||", 1 },
+       { OP_AND, "&&", 2 },
+       { OP_NE, "!=", 4 },
+       { OP_EQ, "==", 4 },
+       { OP_LT, "<", 5 },
+       { OP_LE, "<=", 5 },
+       { OP_GT, ">", 5 },
+       { OP_GE, ">=", 5 },
+       { OP_NONE, "OP_NONE", 0 },
+       { OP_OPEN_PAREN, "(", 0 },
+};
+
+enum {
+       FILT_ERR_NONE,
+       FILT_ERR_INVALID_OP,
+       FILT_ERR_UNBALANCED_PAREN,
+       FILT_ERR_TOO_MANY_OPERANDS,
+       FILT_ERR_OPERAND_TOO_LONG,
+       FILT_ERR_FIELD_NOT_FOUND,
+       FILT_ERR_ILLEGAL_FIELD_OP,
+       FILT_ERR_ILLEGAL_INTVAL,
+       FILT_ERR_BAD_SUBSYS_FILTER,
+       FILT_ERR_TOO_MANY_PREDS,
+       FILT_ERR_MISSING_FIELD,
+       FILT_ERR_INVALID_FILTER,
+};
+
+/*
+ * Human readable messages, indexed by the FILT_ERR_* value stored in
+ * filter_parse_state.lasterr (see append_filter_err()).  Keep the entries
+ * in the same order as the FILT_ERR_* enum.
+ */
+static char *err_text[] = {
+       "No error",
+       "Invalid operator",
+       "Unbalanced parens",
+       "Too many operands",
+       "Operand too long",
+       "Field not found",
+       "Illegal operation for field type",
+       "Illegal integer value",
+       "Couldn't find or set field in one of a subsystem's events",
+       "Too many terms in predicate expression",
+       "Missing field name and/or value",
+       "Meaningless filter expression",
+};
+
+struct opstack_op {
+       int op;
+       struct list_head list;
+};
+
+struct postfix_elt {
+       int op;
+       char *operand;
+       struct list_head list;
+};
+
+struct filter_parse_state {
+       struct filter_op *ops;
+       struct list_head opstack;
+       struct list_head postfix;
+       int lasterr;
+       int lasterr_pos;
+
+       struct {
+               char *string;
+               unsigned int cnt;
+               unsigned int tail;
+       } infix;
+
+       struct {
+               char string[MAX_FILTER_STR_VAL];
+               int pos;
+               unsigned int tail;
+       } operand;
+};
+
+DEFINE_COMPARISON_PRED(s64);
+DEFINE_COMPARISON_PRED(u64);
+DEFINE_COMPARISON_PRED(s32);
+DEFINE_COMPARISON_PRED(u32);
+DEFINE_COMPARISON_PRED(s16);
+DEFINE_COMPARISON_PRED(u16);
+DEFINE_COMPARISON_PRED(s8);
+DEFINE_COMPARISON_PRED(u8);
+
+DEFINE_EQUALITY_PRED(64);
+DEFINE_EQUALITY_PRED(32);
+DEFINE_EQUALITY_PRED(16);
+DEFINE_EQUALITY_PRED(8);
+
+/* Logical AND of two earlier results; @pred and @event are not used. */
+static int filter_pred_and(struct filter_pred *pred __attribute((unused)),
+                          void *event __attribute((unused)),
+                          int val1, int val2)
+{
+       return val1 && val2;
+}
+
+/* Logical OR of two earlier results; @pred and @event are not used. */
+static int filter_pred_or(struct filter_pred *pred __attribute((unused)),
+                         void *event __attribute((unused)),
+                         int val1, int val2)
+{
+       return val1 || val2;
+}
+
+/* Filter predicate for fixed sized arrays of characters */
+static int filter_pred_string(struct filter_pred *pred, void *event,
+                             int val1, int val2)
 {
-       u64 *addr = (u64 *)(event + pred->offset);
-       u64 val = (u64)pred->val;
-       int match;
+       char *addr = (char *)(event + pred->offset);
+       int cmp, match;
+
+       cmp = strncmp(addr, pred->str_val, pred->str_len);
 
-       match = (val == *addr) ^ pred->not;
+       match = (!cmp) ^ pred->not;
 
        return match;
 }
 
-static int filter_pred_32(struct filter_pred *pred, void *event)
+/*
+ * Filter predicate for dynamic sized arrays of characters.
+ * These are implemented through a list of strings at the end
+ * of the entry.
+ * Also each of these strings have a field in the entry which
+ * contains its offset from the beginning of the entry.
+ * We have then first to get this field, dereference it
+ * and add it to the address of the entry, and at last we have
+ * the address of the string.
+ */
+static int filter_pred_strloc(struct filter_pred *pred, void *event,
+                             int val1, int val2)
 {
-       u32 *addr = (u32 *)(event + pred->offset);
-       u32 val = (u32)pred->val;
-       int match;
+       int str_loc = *(int *)(event + pred->offset);
+       char *addr = (char *)(event + str_loc);
+       int cmp, match;
+
+       cmp = strncmp(addr, pred->str_val, pred->str_len);
 
-       match = (val == *addr) ^ pred->not;
+       match = (!cmp) ^ pred->not;
 
        return match;
 }
 
-static int filter_pred_16(struct filter_pred *pred, void *event)
+static int filter_pred_none(struct filter_pred *pred, void *event,
+                           int val1, int val2)
+{
+       return 0;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
 {
-       u16 *addr = (u16 *)(event + pred->offset);
-       u16 val = (u16)pred->val;
-       int match;
+       struct event_filter *filter = call->filter;
+       int match, top = 0, val1 = 0, val2 = 0;
+       int stack[MAX_FILTER_PRED];
+       struct filter_pred *pred;
+       int i;
+
+       for (i = 0; i < filter->n_preds; i++) {
+               pred = filter->preds[i];
+               if (!pred->pop_n) {
+                       match = pred->fn(pred, rec, val1, val2);
+                       stack[top++] = match;
+                       continue;
+               }
+               if (pred->pop_n > top) {
+                       WARN_ON_ONCE(1);
+                       return 0;
+               }
+               val1 = stack[--top];
+               val2 = stack[--top];
+               match = pred->fn(pred, rec, val1, val2);
+               stack[top++] = match;
+       }
 
-       match = (val == *addr) ^ pred->not;
+       return stack[--top];
+}
+EXPORT_SYMBOL_GPL(filter_match_preds);
 
-       return match;
+static void parse_error(struct filter_parse_state *ps, int err, int pos)
+{
+       ps->lasterr = err;
+       ps->lasterr_pos = pos;
 }
 
-static int filter_pred_8(struct filter_pred *pred, void *event)
+static void remove_filter_string(struct event_filter *filter)
 {
-       u8 *addr = (u8 *)(event + pred->offset);
-       u8 val = (u8)pred->val;
-       int match;
+       kfree(filter->filter_string);
+       filter->filter_string = NULL;
+}
 
-       match = (val == *addr) ^ pred->not;
+static int replace_filter_string(struct event_filter *filter,
+                                char *filter_string)
+{
+       kfree(filter->filter_string);
+       filter->filter_string = kstrdup(filter_string, GFP_KERNEL);
+       if (!filter->filter_string)
+               return -ENOMEM;
 
-       return match;
+       return 0;
 }
 
-static int filter_pred_string(struct filter_pred *pred, void *event)
+static int append_filter_string(struct event_filter *filter,
+                               char *string)
 {
-       char *addr = (char *)(event + pred->offset);
-       int cmp, match;
+       int newlen;
+       char *new_filter_string;
 
-       cmp = strncmp(addr, pred->str_val, pred->str_len);
+       BUG_ON(!filter->filter_string);
+       newlen = strlen(filter->filter_string) + strlen(string) + 1;
+       new_filter_string = kmalloc(newlen, GFP_KERNEL);
+       if (!new_filter_string)
+               return -ENOMEM;
 
-       match = (!cmp) ^ pred->not;
+       strcpy(new_filter_string, filter->filter_string);
+       strcat(new_filter_string, string);
+       kfree(filter->filter_string);
+       filter->filter_string = new_filter_string;
 
-       return match;
+       return 0;
 }
 
-/* return 1 if event matches, 0 otherwise (discard) */
-int filter_match_preds(struct ftrace_event_call *call, void *rec)
+static void append_filter_err(struct filter_parse_state *ps,
+                             struct event_filter *filter)
 {
-       int i, matched, and_failed = 0;
-       struct filter_pred *pred;
+       int pos = ps->lasterr_pos;
+       char *buf, *pbuf;
 
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (call->preds[i]) {
-                       pred = call->preds[i];
-                       if (and_failed && !pred->or)
-                               continue;
-                       matched = pred->fn(pred, rec);
-                       if (!matched && !pred->or) {
-                               and_failed = 1;
-                               continue;
-                       } else if (matched && pred->or)
-                               return 1;
-               } else
-                       break;
-       }
+       buf = (char *)__get_free_page(GFP_TEMPORARY);
+       if (!buf)
+               return;
 
-       if (and_failed)
-               return 0;
+       append_filter_string(filter, "\n");
+       memset(buf, ' ', PAGE_SIZE);
+       if (pos > PAGE_SIZE - 128)
+               pos = 0;
+       buf[pos] = '^';
+       pbuf = &buf[pos] + 1;
 
-       return 1;
+       sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]);
+       append_filter_string(filter, buf);
+       free_page((unsigned long) buf);
 }
 
-void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
+void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s)
 {
-       char *field_name;
-       struct filter_pred *pred;
-       int i;
+       struct event_filter *filter = call->filter;
 
-       if (!preds) {
+       mutex_lock(&filter_mutex);
+       if (filter->filter_string)
+               trace_seq_printf(s, "%s\n", filter->filter_string);
+       else
                trace_seq_printf(s, "none\n");
-               return;
-       }
+       mutex_unlock(&filter_mutex);
+}
 
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (preds[i]) {
-                       pred = preds[i];
-                       field_name = pred->field_name;
-                       if (i)
-                               trace_seq_printf(s, pred->or ? "|| " : "&& ");
-                       trace_seq_printf(s, "%s ", field_name);
-                       trace_seq_printf(s, pred->not ? "!= " : "== ");
-                       if (pred->str_val)
-                               trace_seq_printf(s, "%s\n", pred->str_val);
-                       else
-                               trace_seq_printf(s, "%llu\n", pred->val);
-               } else
-                       break;
-       }
+/*
+ * Write the filter string of @system into @s, or "none" when no filter
+ * string is set.  The string is read with filter_mutex held.
+ */
+void print_subsystem_event_filter(struct event_subsystem *system,
+                                 struct trace_seq *s)
+{
+       struct event_filter *filter = system->filter;
+
+       mutex_lock(&filter_mutex);
+       if (filter->filter_string)
+               trace_seq_printf(s, "%s\n", filter->filter_string);
+       else
+               trace_seq_printf(s, "none\n");
+       mutex_unlock(&filter_mutex);
 }
 
 static struct ftrace_event_field *
@@ -150,284 +328,828 @@ find_event_field(struct ftrace_event_call *call, char *name)
        return NULL;
 }
 
-void filter_free_pred(struct filter_pred *pred)
+static void filter_free_pred(struct filter_pred *pred)
 {
        if (!pred)
                return;
 
        kfree(pred->field_name);
-       kfree(pred->str_val);
        kfree(pred);
 }
 
-void filter_free_preds(struct ftrace_event_call *call)
+static void filter_clear_pred(struct filter_pred *pred)
 {
-       int i;
+       kfree(pred->field_name);
+       pred->field_name = NULL;
+       pred->str_len = 0;
+}
 
-       if (call->preds) {
-               for (i = 0; i < MAX_FILTER_PRED; i++)
-                       filter_free_pred(call->preds[i]);
-               kfree(call->preds);
-               call->preds = NULL;
+static int filter_set_pred(struct filter_pred *dest,
+                          struct filter_pred *src,
+                          filter_pred_fn_t fn)
+{
+       *dest = *src;
+       if (src->field_name) {
+               dest->field_name = kstrdup(src->field_name, GFP_KERNEL);
+               if (!dest->field_name)
+                       return -ENOMEM;
        }
+       dest->fn = fn;
+
+       return 0;
 }
 
-void filter_free_subsystem_preds(struct event_subsystem *system)
+static void filter_disable_preds(struct ftrace_event_call *call)
 {
-       struct ftrace_event_call *call = __start_ftrace_events;
+       struct event_filter *filter = call->filter;
        int i;
 
-       if (system->preds) {
-               for (i = 0; i < MAX_FILTER_PRED; i++)
-                       filter_free_pred(system->preds[i]);
-               kfree(system->preds);
-               system->preds = NULL;
-       }
+       call->filter_active = 0;
+       filter->n_preds = 0;
 
-       events_for_each(call) {
-               if (!call->name || !call->regfunc)
-                       continue;
+       for (i = 0; i < MAX_FILTER_PRED; i++)
+               filter->preds[i]->fn = filter_pred_none;
+}
+
+/*
+ * Free the event filter of @call: every predicate, the predicate array
+ * and the filter itself, then clear call->filter.
+ *
+ * Safe to call on a partially initialised filter (NULL preds array or
+ * NULL array slots) and on a call that has no filter at all.
+ */
+void destroy_preds(struct ftrace_event_call *call)
+{
+       struct event_filter *filter = call->filter;
+       int i;
+
+       /* Nothing was allocated, or it has already been torn down */
+       if (!filter)
+               return;
 
-               if (!strcmp(call->system, system->name))
-                       filter_free_preds(call);
+       /*
+        * init_preds() comes here when allocating the preds array itself
+        * fails, so filter->preds may still be NULL.
+        */
+       for (i = 0; filter->preds && i < MAX_FILTER_PRED; i++) {
+               if (filter->preds[i])
+                       filter_free_pred(filter->preds[i]);
        }
+       kfree(filter->preds);
+       kfree(filter);
+       call->filter = NULL;
 }
 
-static int __filter_add_pred(struct ftrace_event_call *call,
-                            struct filter_pred *pred)
+int init_preds(struct ftrace_event_call *call)
 {
+       struct event_filter *filter;
+       struct filter_pred *pred;
        int i;
 
-       if (call->preds && !pred->compound)
-               filter_free_preds(call);
+       filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL);
+       if (!call->filter)
+               return -ENOMEM;
 
-       if (!call->preds) {
-               call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
-                                     GFP_KERNEL);
-               if (!call->preds)
-                       return -ENOMEM;
-       }
+       call->filter_active = 0;
+       filter->n_preds = 0;
+
+       filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL);
+       if (!filter->preds)
+               goto oom;
 
        for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (!call->preds[i]) {
-                       call->preds[i] = pred;
-                       return 0;
+               pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+               if (!pred)
+                       goto oom;
+               pred->fn = filter_pred_none;
+               filter->preds[i] = pred;
+       }
+
+       return 0;
+
+oom:
+       destroy_preds(call);
+
+       return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(init_preds);
+
+/*
+ * Drop the filter of a whole subsystem.
+ *
+ * Frees the predicates stored in system->filter together with the array
+ * holding them, then walks ftrace_events under event_mutex and, for each
+ * event of this subsystem that has a define_fields callback, disables its
+ * predicates and removes its filter string.
+ */
+static void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+       struct event_filter *filter = system->filter;
+       struct ftrace_event_call *call;
+       int i;
+
+       if (filter->n_preds) {
+               for (i = 0; i < filter->n_preds; i++)
+                       filter_free_pred(filter->preds[i]);
+               kfree(filter->preds);
+               filter->preds = NULL;
+               filter->n_preds = 0;
+       }
+
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
+               if (!call->define_fields)
+                       continue;
+
+               if (!strcmp(call->system, system->name)) {
+                       filter_disable_preds(call);
+                       remove_filter_string(call->filter);
                }
        }
+       mutex_unlock(&event_mutex);
+}
+
+static int filter_add_pred_fn(struct filter_parse_state *ps,
+                             struct ftrace_event_call *call,
+                             struct filter_pred *pred,
+                             filter_pred_fn_t fn)
+{
+       struct event_filter *filter = call->filter;
+       int idx, err;
+
+       if (filter->n_preds == MAX_FILTER_PRED) {
+               parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+               return -ENOSPC;
+       }
+
+       idx = filter->n_preds;
+       filter_clear_pred(filter->preds[idx]);
+       err = filter_set_pred(filter->preds[idx], pred, fn);
+       if (err)
+               return err;
 
-       return -ENOSPC;
+       filter->n_preds++;
+       call->filter_active = 1;
+
+       return 0;
 }
 
+enum {
+       FILTER_STATIC_STRING = 1,
+       FILTER_DYN_STRING
+};
+
 static int is_string_field(const char *type)
 {
+       if (strstr(type, "__data_loc") && strstr(type, "char"))
+               return FILTER_DYN_STRING;
+
        if (strchr(type, '[') && strstr(type, "char"))
-               return 1;
+               return FILTER_STATIC_STRING;
 
        return 0;
 }
 
-int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+static int is_legal_op(struct ftrace_event_field *field, int op)
 {
-       struct ftrace_event_field *field;
-
-       field = find_event_field(call, pred->field_name);
-       if (!field)
-               return -EINVAL;
+       if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE))
+               return 0;
 
-       pred->offset = field->offset;
+       return 1;
+}
 
-       if (is_string_field(field->type)) {
-               if (!pred->str_val)
-                       return -EINVAL;
-               pred->fn = filter_pred_string;
-               pred->str_len = field->size;
-               return __filter_add_pred(call, pred);
-       } else {
-               if (pred->str_val)
-                       return -EINVAL;
-       }
+static filter_pred_fn_t select_comparison_fn(int op, int field_size,
+                                            int field_is_signed)
+{
+       filter_pred_fn_t fn = NULL;
 
-       switch (field->size) {
+       switch (field_size) {
        case 8:
-               pred->fn = filter_pred_64;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_64;
+               else if (field_is_signed)
+                       fn = filter_pred_s64;
+               else
+                       fn = filter_pred_u64;
                break;
        case 4:
-               pred->fn = filter_pred_32;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_32;
+               else if (field_is_signed)
+                       fn = filter_pred_s32;
+               else
+                       fn = filter_pred_u32;
                break;
        case 2:
-               pred->fn = filter_pred_16;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_16;
+               else if (field_is_signed)
+                       fn = filter_pred_s16;
+               else
+                       fn = filter_pred_u16;
                break;
        case 1:
-               pred->fn = filter_pred_8;
+               if (op == OP_EQ || op == OP_NE)
+                       fn = filter_pred_8;
+               else if (field_is_signed)
+                       fn = filter_pred_s8;
+               else
+                       fn = filter_pred_u8;
                break;
-       default:
-               return -EINVAL;
        }
 
-       return __filter_add_pred(call, pred);
+       return fn;
 }
 
-static struct filter_pred *copy_pred(struct filter_pred *pred)
+static int filter_add_pred(struct filter_parse_state *ps,
+                          struct ftrace_event_call *call,
+                          struct filter_pred *pred)
 {
-       struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
-       if (!new_pred)
-               return NULL;
+       struct ftrace_event_field *field;
+       filter_pred_fn_t fn;
+       unsigned long long val;
+       int string_type;
+
+       pred->fn = filter_pred_none;
+
+       if (pred->op == OP_AND) {
+               pred->pop_n = 2;
+               return filter_add_pred_fn(ps, call, pred, filter_pred_and);
+       } else if (pred->op == OP_OR) {
+               pred->pop_n = 2;
+               return filter_add_pred_fn(ps, call, pred, filter_pred_or);
+       }
+
+       field = find_event_field(call, pred->field_name);
+       if (!field) {
+               parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0);
+               return -EINVAL;
+       }
 
-       memcpy(new_pred, pred, sizeof(*pred));
+       pred->offset = field->offset;
 
-       if (pred->field_name) {
-               new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-               if (!new_pred->field_name) {
-                       kfree(new_pred);
-                       return NULL;
-               }
+       if (!is_legal_op(field, pred->op)) {
+               parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0);
+               return -EINVAL;
        }
 
-       if (pred->str_val) {
-               new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-               if (!new_pred->str_val) {
-                       filter_free_pred(new_pred);
-                       return NULL;
+       string_type = is_string_field(field->type);
+       if (string_type) {
+               if (string_type == FILTER_STATIC_STRING)
+                       fn = filter_pred_string;
+               else
+                       fn = filter_pred_strloc;
+               pred->str_len = field->size;
+               if (pred->op == OP_NE)
+                       pred->not = 1;
+               return filter_add_pred_fn(ps, call, pred, fn);
+       } else {
+               if (strict_strtoull(pred->str_val, 0, &val)) {
+                       parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0);
+                       return -EINVAL;
                }
+               pred->val = val;
+       }
+
+       fn = select_comparison_fn(pred->op, field->size, field->is_signed);
+       if (!fn) {
+               parse_error(ps, FILT_ERR_INVALID_OP, 0);
+               return -EINVAL;
        }
 
-       return new_pred;
+       if (pred->op == OP_NE)
+               pred->not = 1;
+
+       return filter_add_pred_fn(ps, call, pred, fn);
 }
 
-int filter_add_subsystem_pred(struct event_subsystem *system,
-                             struct filter_pred *pred)
+static int filter_add_subsystem_pred(struct filter_parse_state *ps,
+                                    struct event_subsystem *system,
+                                    struct filter_pred *pred,
+                                    char *filter_string)
 {
-       struct ftrace_event_call *call = __start_ftrace_events;
-       struct filter_pred *event_pred;
-       int i;
-
-       if (system->preds && !pred->compound)
-               filter_free_subsystem_preds(system);
+       struct event_filter *filter = system->filter;
+       struct ftrace_event_call *call;
+       int err = 0;
 
-       if (!system->preds) {
-               system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+       if (!filter->preds) {
+               filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
                                        GFP_KERNEL);
-               if (!system->preds)
+
+               if (!filter->preds)
                        return -ENOMEM;
        }
 
-       for (i = 0; i < MAX_FILTER_PRED; i++) {
-               if (!system->preds[i]) {
-                       system->preds[i] = pred;
-                       break;
-               }
+       if (filter->n_preds == MAX_FILTER_PRED) {
+               parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0);
+               return -ENOSPC;
        }
 
-       if (i == MAX_FILTER_PRED)
-               return -ENOSPC;
+       filter->preds[filter->n_preds] = pred;
+       filter->n_preds++;
 
-       events_for_each(call) {
-               int err;
+       mutex_lock(&event_mutex);
+       list_for_each_entry(call, &ftrace_events, list) {
 
-               if (!call->name || !call->regfunc)
+               if (!call->define_fields)
                        continue;
 
                if (strcmp(call->system, system->name))
                        continue;
 
-               if (!find_event_field(call, pred->field_name))
-                       continue;
+               err = filter_add_pred(ps, call, pred);
+               if (err) {
+                       mutex_unlock(&event_mutex);
+                       filter_free_subsystem_preds(system);
+                       parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0);
+                       goto out;
+               }
+               replace_filter_string(call->filter, filter_string);
+       }
+       mutex_unlock(&event_mutex);
+out:
+       return err;
+}
 
-               event_pred = copy_pred(pred);
-               if (!event_pred)
-                       goto oom;
+/*
+ * Prepare a parse state for a new filter string.
+ *
+ * Zeroes *ps, records @infix_string and its length as the infix input,
+ * stores the operator table @ops, and initializes the (empty) operator
+ * stack and postfix lists.
+ */
+static void parse_init(struct filter_parse_state *ps,
+                      struct filter_op *ops,
+                      char *infix_string)
+{
+       memset(ps, '\0', sizeof(*ps));
 
-               err = filter_add_pred(call, event_pred);
-               if (err)
-                       filter_free_pred(event_pred);
-               if (err == -ENOMEM)
-                       goto oom;
+       ps->infix.string = infix_string;
+       ps->infix.cnt = strlen(infix_string);
+       ps->ops = ops;
+
+       INIT_LIST_HEAD(&ps->opstack);
+       INIT_LIST_HEAD(&ps->postfix);
+}
+
+/*
+ * Consume one character of the infix string: fetch the character at the
+ * current read index, then advance the index and decrement infix.cnt.
+ */
+static char infix_next(struct filter_parse_state *ps)
+{
+       char c = ps->infix.string[ps->infix.tail];
+
+       ps->infix.tail++;
+       ps->infix.cnt--;
+
+       return c;
+}
+
+/*
+ * Look at the next unread infix character without consuming it.
+ * Returns 0 when the read index sits exactly at the end of the string.
+ */
+static char infix_peek(struct filter_parse_state *ps)
+{
+       const char *str = ps->infix.string;
+
+       return ps->infix.tail == strlen(str) ? 0 : str[ps->infix.tail];
+}
+
+/* Skip one infix character: advance the read index, decrement infix.cnt. */
+static void infix_advance(struct filter_parse_state *ps)
+{
+       ps->infix.tail++;
+       ps->infix.cnt--;
+}
+
+/*
+ * Compare two operators by their index into ps->ops: non-zero when the
+ * precedence value of @a is strictly smaller than that of @b.
+ */
+static inline int is_precedence_lower(struct filter_parse_state *ps,
+                                     int a, int b)
+{
+       return ps->ops[b].precedence > ps->ops[a].precedence;
+}
+
+/*
+ * Return 1 if @c is the first character of any operator string in ps->ops,
+ * 0 otherwise.  The table is scanned until the entry whose string is
+ * "OP_NONE", which acts as the terminator.
+ */
+static inline int is_op_char(struct filter_parse_state *ps, char c)
+{
+       int i;
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (ps->ops[i].string[0] == c)
+                       return 1;
        }
 
        return 0;
+}
 
-oom:
-       system->preds[i] = NULL;
-       return -ENOMEM;
+/*
+ * Identify the operator that begins with @firstc (already consumed).
+ *
+ * A two-character match built from @firstc plus the next unread character
+ * is tried first; on a hit that second character is consumed as well.
+ * Otherwise @firstc alone is looked up.  Returns the matching op id, or
+ * OP_NONE when neither form is in the operator table.
+ */
+static int infix_get_op(struct filter_parse_state *ps, char firstc)
+{
+       char nextc = infix_peek(ps);
+       char opstr[3];
+       int i;
+
+       opstr[0] = firstc;
+       opstr[1] = nextc;
+       opstr[2] = '\0';
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (!strcmp(opstr, ps->ops[i].string)) {
+                       infix_advance(ps);
+                       return ps->ops[i].id;
+               }
+       }
+
+       /* No two-character operator matched: retry with @firstc only. */
+       opstr[1] = '\0';
+
+       for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) {
+               if (!strcmp(opstr, ps->ops[i].string))
+                       return ps->ops[i].id;
+       }
+
+       return OP_NONE;
 }
 
-int filter_parse(char **pbuf, struct filter_pred *pred)
+/*
+ * Reset the operand accumulator: zero MAX_FILTER_STR_VAL bytes of the
+ * operand buffer and rewind its write index to 0.
+ */
+static inline void clear_operand_string(struct filter_parse_state *ps)
 {
-       char *tmp, *tok, *val_str = NULL;
-       int tok_n = 0;
+       memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL);
+       ps->operand.tail = 0;
+}
 
-       /* field ==/!= number, or/and field ==/!= number, number */
-       while ((tok = strsep(pbuf, " \n"))) {
-               if (tok_n == 0) {
-                       if (!strcmp(tok, "0")) {
-                               pred->clear = 1;
-                               return 0;
-                       } else if (!strcmp(tok, "&&")) {
-                               pred->or = 0;
-                               pred->compound = 1;
-                       } else if (!strcmp(tok, "||")) {
-                               pred->or = 1;
-                               pred->compound = 1;
-                       } else
-                               pred->field_name = tok;
-                       tok_n = 1;
+/*
+ * Append @c to the operand being accumulated.
+ * Returns 0 on success, or -EINVAL when the write index has reached
+ * MAX_FILTER_STR_VAL - 1 (the character is not stored in that case).
+ */
+static inline int append_operand_char(struct filter_parse_state *ps, char c)
+{
+       if (ps->operand.tail != MAX_FILTER_STR_VAL - 1) {
+               ps->operand.string[ps->operand.tail] = c;
+               ps->operand.tail++;
+               return 0;
+       }
+
+       return -EINVAL;
+}
+
+/*
+ * Push operator @op onto the head of the operator stack.
+ * Returns 0 on success or -ENOMEM if the stack entry cannot be allocated.
+ */
+static int filter_opstack_push(struct filter_parse_state *ps, int op)
+{
+       struct opstack_op *entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+
+       if (!entry)
+               return -ENOMEM;
+
+       entry->op = op;
+       list_add(&entry->list, &ps->opstack);
+
+       return 0;
+}
+
+/* Non-zero when the operator stack holds no entries. */
+static int filter_opstack_empty(struct filter_parse_state *ps)
+{
+       int empty = list_empty(&ps->opstack);
+
+       return empty;
+}
+
+/*
+ * Return the operator at the top of the stack without removing it,
+ * or OP_NONE when the stack is empty.
+ */
+static int filter_opstack_top(struct filter_parse_state *ps)
+{
+       if (!filter_opstack_empty(ps))
+               return list_first_entry(&ps->opstack,
+                                       struct opstack_op, list)->op;
+
+       return OP_NONE;
+}
+
+/*
+ * Remove the top entry of the operator stack, free it, and return its
+ * operator.  Returns OP_NONE when the stack is empty.
+ */
+static int filter_opstack_pop(struct filter_parse_state *ps)
+{
+       struct opstack_op *top;
+       int popped = OP_NONE;
+
+       if (!filter_opstack_empty(ps)) {
+               top = list_first_entry(&ps->opstack, struct opstack_op, list);
+               popped = top->op;
+               list_del(&top->list);
+               kfree(top);
+       }
+
+       return popped;
+}
+
+/* Pop (and thereby free) entries until the operator stack is empty. */
+static void filter_opstack_clear(struct filter_parse_state *ps)
+{
+       while (!list_empty(&ps->opstack))
+               filter_opstack_pop(ps);
+}
+
+/* Return a pointer to the start of the operand currently being accumulated. */
+static char *curr_operand(struct filter_parse_state *ps)
+{
+       return &ps->operand.string[0];
+}
+
+/*
+ * Append an operand element (op == OP_NONE) holding a private copy of
+ * @operand to the tail of the postfix list.
+ * Returns 0 on success or -ENOMEM if either allocation fails.
+ */
+static int postfix_append_operand(struct filter_parse_state *ps, char *operand)
+{
+       struct postfix_elt *node;
+       char *copy;
+
+       node = kmalloc(sizeof(*node), GFP_KERNEL);
+       if (!node)
+               return -ENOMEM;
+
+       copy = kstrdup(operand, GFP_KERNEL);
+       if (!copy) {
+               kfree(node);
+               return -ENOMEM;
+       }
+
+       node->op = OP_NONE;
+       node->operand = copy;
+       list_add_tail(&node->list, &ps->postfix);
+
+       return 0;
+}
+
+/*
+ * Append an operator element (no operand string) for @op to the tail of
+ * the postfix list.  Returns 0 on success or -ENOMEM.
+ */
+static int postfix_append_op(struct filter_parse_state *ps, int op)
+{
+       struct postfix_elt *node = kmalloc(sizeof(*node), GFP_KERNEL);
+
+       if (!node)
+               return -ENOMEM;
+
+       node->operand = NULL;
+       node->op = op;
+       list_add_tail(&node->list, &ps->postfix);
+
+       return 0;
+}
+
+/*
+ * Release every element on the postfix list.
+ *
+ * Each element is unlinked, its operand string is freed (operator elements
+ * carry a NULL operand, which kfree() accepts), and then the element itself
+ * is freed so that nothing allocated by postfix_append_operand() or
+ * postfix_append_op() is leaked.
+ */
+static void postfix_clear(struct filter_parse_state *ps)
+{
+       struct postfix_elt *elt;
+
+       while (!list_empty(&ps->postfix)) {
+               elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
+               list_del(&elt->list);
+               kfree(elt->operand);
+               kfree(elt);
+       }
+}
+
+/*
+ * Convert the infix filter string held in @ps into a postfix sequence.
+ *
+ * Characters are read one at a time.  Operand characters are accumulated
+ * and flushed to the postfix list when an operator, a ')' or the end of
+ * input is reached.  Operators go through the operator stack: entries on
+ * the stack whose precedence is not lower than the incoming operator are
+ * moved to the postfix list first.  A '"' toggles string mode, in which
+ * every character (including whitespace and operator characters) is
+ * treated as part of the operand; the quotes themselves are dropped.
+ *
+ * Returns 0 on success, or -EINVAL after recording a parse error for an
+ * invalid operator, unbalanced parentheses or an over-long operand.
+ *
+ * NOTE(review): the return values of postfix_append_operand(),
+ * postfix_append_op() and filter_opstack_push() are ignored throughout,
+ * so an allocation failure in them is not reported to the caller.
+ */
+static int filter_parse(struct filter_parse_state *ps)
+{
+       int in_string = 0;
+       int op, top_op;
+       char ch;
+
+       while ((ch = infix_next(ps))) {
+               if (ch == '"') {
+                       in_string ^= 1;
                        continue;
                }
-               if (tok_n == 1) {
-                       if (!pred->field_name)
-                               pred->field_name = tok;
-                       else if (!strcmp(tok, "!="))
-                               pred->not = 1;
-                       else if (!strcmp(tok, "=="))
-                               pred->not = 0;
-                       else {
-                               pred->field_name = NULL;
+
+               if (in_string)
+                       goto parse_operand;
+
+               if (isspace(ch))
+                       continue;
+
+               if (is_op_char(ps, ch)) {
+                       op = infix_get_op(ps, ch);
+                       if (op == OP_NONE) {
+                               parse_error(ps, FILT_ERR_INVALID_OP, 0);
                                return -EINVAL;
                        }
-                       tok_n = 2;
+
+                       if (strlen(curr_operand(ps))) {
+                               postfix_append_operand(ps, curr_operand(ps));
+                               clear_operand_string(ps);
+                       }
+
+                       /* Emit stacked operators that do not bind weaker than op. */
+                       while (!filter_opstack_empty(ps)) {
+                               top_op = filter_opstack_top(ps);
+                               if (!is_precedence_lower(ps, top_op, op)) {
+                                       top_op = filter_opstack_pop(ps);
+                                       postfix_append_op(ps, top_op);
+                                       continue;
+                               }
+                               break;
+                       }
+
+                       filter_opstack_push(ps, op);
                        continue;
                }
-               if (tok_n == 2) {
-                       if (pred->compound) {
-                               if (!strcmp(tok, "!="))
-                                       pred->not = 1;
-                               else if (!strcmp(tok, "=="))
-                                       pred->not = 0;
-                               else {
-                                       pred->field_name = NULL;
-                                       return -EINVAL;
-                               }
-                       } else {
-                               val_str = tok;
-                               break; /* done */
+
+               if (ch == '(') {
+                       filter_opstack_push(ps, OP_OPEN_PAREN);
+                       continue;
+               }
+
+               if (ch == ')') {
+                       if (strlen(curr_operand(ps))) {
+                               postfix_append_operand(ps, curr_operand(ps));
+                               clear_operand_string(ps);
+                       }
+
+                       /* Unwind to the matching '('; running dry means none. */
+                       top_op = filter_opstack_pop(ps);
+                       while (top_op != OP_NONE) {
+                               if (top_op == OP_OPEN_PAREN)
+                                       break;
+                               postfix_append_op(ps, top_op);
+                               top_op = filter_opstack_pop(ps);
+                       }
+                       if (top_op == OP_NONE) {
+                               parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+                               return -EINVAL;
                        }
-                       tok_n = 3;
                        continue;
                }
-               if (tok_n == 3) {
-                       val_str = tok;
-                       break; /* done */
+parse_operand:
+               if (append_operand_char(ps, ch)) {
+                       parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0);
+                       return -EINVAL;
                }
        }
 
-       if (!val_str) {
-               pred->field_name = NULL;
-               return -EINVAL;
+       if (strlen(curr_operand(ps)))
+               postfix_append_operand(ps, curr_operand(ps));
+
+       /* Flush remaining operators; a leftover '(' was never closed. */
+       while (!filter_opstack_empty(ps)) {
+               top_op = filter_opstack_pop(ps);
+               if (top_op == OP_NONE)
+                       break;
+               if (top_op == OP_OPEN_PAREN) {
+                       parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0);
+                       return -EINVAL;
+               }
+               postfix_append_op(ps, top_op);
        }
 
-       pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
-       if (!pred->field_name)
-               return -ENOMEM;
+       return 0;
+}
 
-       pred->val = simple_strtoull(val_str, &tmp, 0);
-       if (tmp == val_str) {
-               pred->str_val = kstrdup(val_str, GFP_KERNEL);
-               if (!pred->str_val)
-                       return -ENOMEM;
-       } else if (*tmp != '\0')
+/*
+ * Allocate a zeroed comparison predicate for "@operand1 <op> @operand2".
+ *
+ * The field name is a private copy of @operand1; @operand2 is copied into
+ * the predicate's str_val buffer and its length stored in str_len.
+ * Returns the new predicate, or NULL if an allocation fails.
+ */
+static struct filter_pred *create_pred(int op, char *operand1, char *operand2)
+{
+       struct filter_pred *new_pred = kzalloc(sizeof(*new_pred), GFP_KERNEL);
+
+       if (!new_pred)
+               return NULL;
+
+       new_pred->op = op;
+       new_pred->field_name = kstrdup(operand1, GFP_KERNEL);
+       if (!new_pred->field_name)
+               goto free_pred;
+
+       new_pred->str_len = strlen(operand2);
+       strcpy(new_pred->str_val, operand2);
+
+       return new_pred;
+
+free_pred:
+       kfree(new_pred);
+       return NULL;
+}
+
+/*
+ * Allocate a zeroed predicate that carries only the operator @op.
+ * Returns the new predicate, or NULL if the allocation fails.
+ */
+static struct filter_pred *create_logical_pred(int op)
+{
+       struct filter_pred *logical = kzalloc(sizeof(*logical), GFP_KERNEL);
+
+       if (logical)
+               logical->op = op;
+
+       return logical;
+}
+
+/*
+ * Sanity-check the operator mix of the parsed postfix list.
+ *
+ * Operand elements (OP_NONE) are skipped; OP_AND/OP_OR count as logical
+ * predicates and every other operator as a normal predicate.  The filter
+ * is rejected with FILT_ERR_INVALID_FILTER / -EINVAL when there is no
+ * normal predicate at all, or when the logical predicates are not fewer
+ * than the normal ones.  Returns 0 otherwise.
+ */
+static int check_preds(struct filter_parse_state *ps)
+{
+       int n_normal_preds = 0, n_logical_preds = 0;
+       struct postfix_elt *elt;
+
+       list_for_each_entry(elt, &ps->postfix, list) {
+               if (elt->op == OP_NONE)
+                       continue;
+
+               if (elt->op == OP_AND || elt->op == OP_OR) {
+                       n_logical_preds++;
+                       continue;
+               }
+               n_normal_preds++;
+       }
+
+       if (!n_normal_preds || n_logical_preds >= n_normal_preds) {
+               parse_error(ps, FILT_ERR_INVALID_FILTER, 0);
                return -EINVAL;
+       }
 
        return 0;
 }
 
+/*
+ * Turn the parsed postfix list in @ps into filter predicates.
+ *
+ * Operand elements are collected (at most two at a time); each operator
+ * element then produces one predicate: a logical predicate for
+ * OP_AND/OP_OR, otherwise a comparison predicate built from the two
+ * collected operands.
+ *
+ * When @call is non-NULL the predicate is added to that event with
+ * filter_add_pred() and then freed here (presumably filter_add_pred()
+ * keeps its own copy -- confirm against its definition).  Otherwise it is
+ * handed to filter_add_subsystem_pred() for @system together with
+ * @filter_string.
+ *
+ * Returns 0 on success, -EINVAL for a malformed postfix sequence (a parse
+ * error is recorded in @ps), -ENOMEM if a predicate cannot be allocated,
+ * or the error returned by the add function.
+ */
+static int replace_preds(struct event_subsystem *system,
+                        struct ftrace_event_call *call,
+                        struct filter_parse_state *ps,
+                        char *filter_string)
+{
+       char *operand1 = NULL, *operand2 = NULL;
+       struct filter_pred *pred;
+       struct postfix_elt *elt;
+       int err;
+
+       err = check_preds(ps);
+       if (err)
+               return err;
+
+       list_for_each_entry(elt, &ps->postfix, list) {
+               if (elt->op == OP_NONE) {
+                       if (!operand1)
+                               operand1 = elt->operand;
+                       else if (!operand2)
+                               operand2 = elt->operand;
+                       else {
+                               parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0);
+                               return -EINVAL;
+                       }
+                       continue;
+               }
+
+               if (elt->op == OP_AND || elt->op == OP_OR) {
+                       pred = create_logical_pred(elt->op);
+               } else {
+                       if (!operand1 || !operand2) {
+                               parse_error(ps, FILT_ERR_MISSING_FIELD, 0);
+                               return -EINVAL;
+                       }
+                       pred = create_pred(elt->op, operand1, operand2);
+               }
+
+               /* Allocation failed: do not pass a NULL predicate onward. */
+               if (!pred)
+                       return -ENOMEM;
+
+               if (call) {
+                       err = filter_add_pred(ps, call, pred);
+                       filter_free_pred(pred);
+               } else
+                       err = filter_add_subsystem_pred(ps, system, pred,
+                                                       filter_string);
+               if (err)
+                       return err;
+
+               operand1 = operand2 = NULL;
+       }
+
+       return 0;
+}
+
+/*
+ * Install @filter_string as the filter of a single event @call.
+ *
+ * Runs under filter_mutex.  A string that is exactly "0" once stripped
+ * clears the event's predicates and filter string.  Any other string
+ * replaces the current filter: the old predicates are disabled, the string
+ * is stored, parsed to postfix form and converted into predicates.  If
+ * parsing or conversion fails, the error is appended to the event's filter
+ * via append_filter_err().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int apply_event_filter(struct ftrace_event_call *call, char *filter_string)
+{
+       struct filter_parse_state *state;
+       int ret = 0;
+
+       mutex_lock(&filter_mutex);
+
+       if (strcmp(strstrip(filter_string), "0") == 0) {
+               filter_disable_preds(call);
+               remove_filter_string(call->filter);
+               goto unlock;
+       }
+
+       state = kzalloc(sizeof(*state), GFP_KERNEL);
+       if (!state) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       filter_disable_preds(call);
+       replace_filter_string(call->filter, filter_string);
+
+       parse_init(state, filter_ops, filter_string);
+
+       ret = filter_parse(state);
+       if (!ret)
+               ret = replace_preds(NULL, call, state, filter_string);
+       if (ret)
+               append_filter_err(state, call->filter);
+
+       filter_opstack_clear(state);
+       postfix_clear(state);
+       kfree(state);
+unlock:
+       mutex_unlock(&filter_mutex);
+
+       return ret;
+}
+
+/*
+ * Install @filter_string as the filter of a whole event subsystem.
+ *
+ * Runs under filter_mutex.  A string that is exactly "0" once stripped
+ * frees the subsystem's predicates and removes its filter string.  Any
+ * other string replaces the current filter: the old predicates are freed,
+ * the string is stored, parsed to postfix form and converted into
+ * predicates.  If parsing or conversion fails, the error is appended to
+ * the subsystem's filter via append_filter_err().
+ *
+ * Returns 0 on success or a negative errno.
+ */
+int apply_subsystem_event_filter(struct event_subsystem *system,
+                                char *filter_string)
+{
+       struct filter_parse_state *state;
+       int ret = 0;
+
+       mutex_lock(&filter_mutex);
+
+       if (strcmp(strstrip(filter_string), "0") == 0) {
+               filter_free_subsystem_preds(system);
+               remove_filter_string(system->filter);
+               goto unlock;
+       }
+
+       state = kzalloc(sizeof(*state), GFP_KERNEL);
+       if (!state) {
+               ret = -ENOMEM;
+               goto unlock;
+       }
+
+       filter_free_subsystem_preds(system);
+       replace_filter_string(system->filter, filter_string);
+
+       parse_init(state, filter_ops, filter_string);
+
+       ret = filter_parse(state);
+       if (!ret)
+               ret = replace_preds(system, NULL, state, filter_string);
+       if (ret)
+               append_filter_err(state, system->filter);
+
+       filter_opstack_clear(state);
+       postfix_clear(state);
+       kfree(state);
+unlock:
+       mutex_unlock(&filter_mutex);
+
+       return ret;
+}
 
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
deleted file mode 100644 (file)
index 38985f9..0000000
+++ /dev/null
@@ -1,39 +0,0 @@
-/*
- * Stage 1 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * struct ftrace_raw_<call> {
- *     struct trace_entry              ent;
- *     <type>                          <item>;
- *     <type2>                         <item2>[<len>];
- *     [...]
- * };
- *
- * The <type> <item> is created by the __field(type, item) macro or
- * the __array(type2, item2, len) macro.
- * We simply do "type item;", and that will create the fields
- * in the structure.
- */
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)
-
-#undef __array
-#define __array(type, item, len)       type    item[len];
-
-#undef __field
-#define __field(type, item)            type    item;
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \
-       struct ftrace_raw_##name {                              \
-               struct trace_entry      ent;                    \
-               tstruct                                         \
-       };                                                      \
-       static struct ftrace_event_call event_##name
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
deleted file mode 100644 (file)
index d363c66..0000000
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Stage 2 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * enum print_line_t
- * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
- * {
- *     struct trace_seq *s = &iter->seq;
- *     struct ftrace_raw_<call> *field; <-- defined in stage 1
- *     struct trace_entry *entry;
- *     int ret;
- *
- *     entry = iter->ent;
- *
- *     if (entry->type != event_<call>.id) {
- *             WARN_ON_ONCE(1);
- *             return TRACE_TYPE_UNHANDLED;
- *     }
- *
- *     field = (typeof(field))entry;
- *
- *     ret = trace_seq_printf(s, <TP_printk> "\n");
- *     if (!ret)
- *             return TRACE_TYPE_PARTIAL_LINE;
- *
- *     return TRACE_TYPE_HANDLED;
- * }
- *
- * This is the method used to print the raw event to the trace
- * output format. Note, this is not needed if the data is read
- * in binary.
- */
-
-#undef __entry
-#define __entry field
-
-#undef TP_printk
-#define TP_printk(fmt, args...) fmt "\n", args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
-enum print_line_t                                                      \
-ftrace_raw_output_##call(struct trace_iterator *iter, int flags)       \
-{                                                                      \
-       struct trace_seq *s = &iter->seq;                               \
-       struct ftrace_raw_##call *field;                                \
-       struct trace_entry *entry;                                      \
-       int ret;                                                        \
-                                                                       \
-       entry = iter->ent;                                              \
-                                                                       \
-       if (entry->type != event_##call.id) {                           \
-               WARN_ON_ONCE(1);                                        \
-               return TRACE_TYPE_UNHANDLED;                            \
-       }                                                               \
-                                                                       \
-       field = (typeof(field))entry;                                   \
-                                                                       \
-       ret = trace_seq_printf(s, #call ": " print);                    \
-       if (!ret)                                                       \
-               return TRACE_TYPE_PARTIAL_LINE;                         \
-                                                                       \
-       return TRACE_TYPE_HANDLED;                                      \
-}
-       
-#include <trace/trace_event_types.h>
-
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *     struct ftrace_raw_##call field;
- *     int ret;
- *
- *     ret = trace_seq_printf(s, #type " " #item ";"
- *                            " offset:%u; size:%u;\n",
- *                            offsetof(struct ftrace_raw_##call, item),
- *                            sizeof(field.type));
- *
- * }
- */
-
-#undef TP_STRUCT__entry
-#define TP_STRUCT__entry(args...) args
-
-#undef __field
-#define __field(type, item)                                    \
-       ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
-                              "offset:%u;\tsize:%u;\n",                \
-                              (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
-       if (!ret)                                                       \
-               return 0;
-
-#undef __array
-#define __array(type, item, len)                                               \
-       ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"    \
-                              "offset:%u;\tsize:%u;\n",                \
-                              (unsigned int)offsetof(typeof(field), item), \
-                              (unsigned int)sizeof(field.item));       \
-       if (!ret)                                                       \
-               return 0;
-
-#undef __entry
-#define __entry REC
-
-#undef TP_printk
-#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args)
-
-#undef TP_fast_assign
-#define TP_fast_assign(args...) args
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
-static int                                                             \
-ftrace_format_##call(struct trace_seq *s)                              \
-{                                                                      \
-       struct ftrace_raw_##call field;                                 \
-       int ret;                                                        \
-                                                                       \
-       tstruct;                                                        \
-                                                                       \
-       trace_seq_printf(s, "\nprint fmt: " print);                     \
-                                                                       \
-       return ret;                                                     \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef __field
-#define __field(type, item)                                            \
-       ret = trace_define_field(event_call, #type, #item,              \
-                                offsetof(typeof(field), item),         \
-                                sizeof(field.item));                   \
-       if (ret)                                                        \
-               return ret;
-
-#undef __array
-#define __array(type, item, len)                                       \
-       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
-                                offsetof(typeof(field), item),         \
-                                sizeof(field.item));                   \
-       if (ret)                                                        \
-               return ret;
-
-#define __common_field(type, item)                                     \
-       ret = trace_define_field(event_call, #type, "common_" #item,    \
-                                offsetof(typeof(field.ent), item),     \
-                                sizeof(field.ent.item));               \
-       if (ret)                                                        \
-               return ret;
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, func, print)           \
-int                                                                    \
-ftrace_define_fields_##call(void)                                      \
-{                                                                      \
-       struct ftrace_raw_##call field;                                 \
-       struct ftrace_event_call *event_call = &event_##call;           \
-       int ret;                                                        \
-                                                                       \
-       __common_field(unsigned char, type);                            \
-       __common_field(unsigned char, flags);                           \
-       __common_field(unsigned char, preempt_count);                   \
-       __common_field(int, pid);                                       \
-       __common_field(int, tgid);                                      \
-                                                                       \
-       tstruct;                                                        \
-                                                                       \
-       return ret;                                                     \
-}
-
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
deleted file mode 100644 (file)
index 9d2fa78..0000000
+++ /dev/null
@@ -1,281 +0,0 @@
-/*
- * Stage 3 of the trace events.
- *
- * Override the macros in <trace/trace_event_types.h> to include the following:
- *
- * static void ftrace_event_<call>(proto)
- * {
- *     event_trace_printk(_RET_IP_, "<call>: " <fmt>);
- * }
- *
- * static int ftrace_reg_event_<call>(void)
- * {
- *     int ret;
- *
- *     ret = register_trace_<call>(ftrace_event_<call>);
- *     if (!ret)
- *             pr_info("event trace: Could not activate trace point "
- *                     "probe to  <call>");
- *     return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *     unregister_trace_<call>(ftrace_event_<call>);
- * }
- *
- * For those macros defined with TRACE_FORMAT:
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *     .name                   = "<call>",
- *     .regfunc                = ftrace_reg_event_<call>,
- *     .unregfunc              = ftrace_unreg_event_<call>,
- * }
- *
- *
- * For those macros defined with TRACE_EVENT:
- *
- * static struct ftrace_event_call event_<call>;
- *
- * static void ftrace_raw_event_<call>(proto)
- * {
- *     struct ring_buffer_event *event;
- *     struct ftrace_raw_<call> *entry; <-- defined in stage 1
- *     unsigned long irq_flags;
- *     int pc;
- *
- *     local_save_flags(irq_flags);
- *     pc = preempt_count();
- *
- *     event = trace_current_buffer_lock_reserve(event_<call>.id,
- *                               sizeof(struct ftrace_raw_<call>),
- *                               irq_flags, pc);
- *     if (!event)
- *             return;
- *     entry   = ring_buffer_event_data(event);
- *
- *     <assign>;  <-- Here we assign the entries by the __field and
- *                     __array macros.
- *
- *     trace_current_buffer_unlock_commit(event, irq_flags, pc);
- * }
- *
- * static int ftrace_raw_reg_event_<call>(void)
- * {
- *     int ret;
- *
- *     ret = register_trace_<call>(ftrace_raw_event_<call>);
- *     if (!ret)
- *             pr_info("event trace: Could not activate trace point "
- *                     "probe to <call>");
- *     return ret;
- * }
- *
- * static void ftrace_unreg_event_<call>(void)
- * {
- *     unregister_trace_<call>(ftrace_raw_event_<call>);
- * }
- *
- * static struct trace_event ftrace_event_type_<call> = {
- *     .trace                  = ftrace_raw_output_<call>, <-- stage 2
- * };
- *
- * static int ftrace_raw_init_event_<call>(void)
- * {
- *     int id;
- *
- *     id = register_ftrace_event(&ftrace_event_type_<call>);
- *     if (!id)
- *             return -ENODEV;
- *     event_<call>.id = id;
- *     return 0;
- * }
- *
- * static struct ftrace_event_call __used
- * __attribute__((__aligned__(4)))
- * __attribute__((section("_ftrace_events"))) event_<call> = {
- *     .name                   = "<call>",
- *     .system                 = "<system>",
- *     .raw_init               = ftrace_raw_init_event_<call>,
- *     .regfunc                = ftrace_reg_event_<call>,
- *     .unregfunc              = ftrace_unreg_event_<call>,
- *     .show_format            = ftrace_format_<call>,
- * }
- *
- */
-
-#undef TP_FMT
-#define TP_FMT(fmt, args...)   fmt "\n", ##args
-
-#ifdef CONFIG_EVENT_PROFILE
-#define _TRACE_PROFILE(call, proto, args)                              \
-static void ftrace_profile_##call(proto)                               \
-{                                                                      \
-       extern void perf_tpcounter_event(int);                          \
-       perf_tpcounter_event(event_##call.id);                          \
-}                                                                      \
-                                                                       \
-static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
-{                                                                      \
-       int ret = 0;                                                    \
-                                                                       \
-       if (!atomic_inc_return(&call->profile_count))                   \
-               ret = register_trace_##call(ftrace_profile_##call);     \
-                                                                       \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
-{                                                                      \
-       if (atomic_add_negative(-1, &call->profile_count))              \
-               unregister_trace_##call(ftrace_profile_##call);         \
-}
-
-#define _TRACE_PROFILE_INIT(call)                                      \
-       .profile_count = ATOMIC_INIT(-1),                               \
-       .profile_enable = ftrace_profile_enable_##call,                 \
-       .profile_disable = ftrace_profile_disable_##call,
-
-#else
-#define _TRACE_PROFILE(call, proto, args)
-#define _TRACE_PROFILE_INIT(call)
-#endif
-
-#define _TRACE_FORMAT(call, proto, args, fmt)                          \
-static void ftrace_event_##call(proto)                                 \
-{                                                                      \
-       event_trace_printk(_RET_IP_, #call ": " fmt);                   \
-}                                                                      \
-                                                                       \
-static int ftrace_reg_event_##call(void)                               \
-{                                                                      \
-       int ret;                                                        \
-                                                                       \
-       ret = register_trace_##call(ftrace_event_##call);               \
-       if (ret)                                                        \
-               pr_info("event trace: Could not activate trace point "  \
-                       "probe to " #call "\n");                        \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_unreg_event_##call(void)                            \
-{                                                                      \
-       unregister_trace_##call(ftrace_event_##call);                   \
-}                                                                      \
-                                                                       \
-static struct ftrace_event_call event_##call;                          \
-                                                                       \
-static int ftrace_init_event_##call(void)                              \
-{                                                                      \
-       int id;                                                         \
-                                                                       \
-       id = register_ftrace_event(NULL);                               \
-       if (!id)                                                        \
-               return -ENODEV;                                         \
-       event_##call.id = id;                                           \
-       return 0;                                                       \
-}
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)                           \
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))          \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
-static struct ftrace_event_call __used                                 \
-__attribute__((__aligned__(4)))                                                \
-__attribute__((section("_ftrace_events"))) event_##call = {            \
-       .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_init_event_##call,             \
-       .regfunc                = ftrace_reg_event_##call,              \
-       .unregfunc              = ftrace_unreg_event_##call,            \
-       _TRACE_PROFILE_INIT(call)                                       \
-}
-
-#undef __entry
-#define __entry entry
-
-#undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, assign, print)         \
-_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))                      \
-                                                                       \
-static struct ftrace_event_call event_##call;                          \
-                                                                       \
-static void ftrace_raw_event_##call(proto)                             \
-{                                                                      \
-       struct ftrace_event_call *call = &event_##call;                 \
-       struct ring_buffer_event *event;                                \
-       struct ftrace_raw_##call *entry;                                \
-       unsigned long irq_flags;                                        \
-       int pc;                                                         \
-                                                                       \
-       local_save_flags(irq_flags);                                    \
-       pc = preempt_count();                                           \
-                                                                       \
-       event = trace_current_buffer_lock_reserve(event_##call.id,      \
-                                 sizeof(struct ftrace_raw_##call),     \
-                                 irq_flags, pc);                       \
-       if (!event)                                                     \
-               return;                                                 \
-       entry   = ring_buffer_event_data(event);                        \
-                                                                       \
-       assign;                                                         \
-                                                                       \
-       if (call->preds && !filter_match_preds(call, entry))            \
-               ring_buffer_event_discard(event);                       \
-                                                                       \
-       trace_nowake_buffer_unlock_commit(event, irq_flags, pc);        \
-                                                                       \
-}                                                                      \
-                                                                       \
-static int ftrace_raw_reg_event_##call(void)                           \
-{                                                                      \
-       int ret;                                                        \
-                                                                       \
-       ret = register_trace_##call(ftrace_raw_event_##call);           \
-       if (ret)                                                        \
-               pr_info("event trace: Could not activate trace point "  \
-                       "probe to " #call "\n");                        \
-       return ret;                                                     \
-}                                                                      \
-                                                                       \
-static void ftrace_raw_unreg_event_##call(void)                                \
-{                                                                      \
-       unregister_trace_##call(ftrace_raw_event_##call);               \
-}                                                                      \
-                                                                       \
-static struct trace_event ftrace_event_type_##call = {                 \
-       .trace                  = ftrace_raw_output_##call,             \
-};                                                                     \
-                                                                       \
-static int ftrace_raw_init_event_##call(void)                          \
-{                                                                      \
-       int id;                                                         \
-                                                                       \
-       id = register_ftrace_event(&ftrace_event_type_##call);          \
-       if (!id)                                                        \
-               return -ENODEV;                                         \
-       event_##call.id = id;                                           \
-       INIT_LIST_HEAD(&event_##call.fields);                           \
-       return 0;                                                       \
-}                                                                      \
-                                                                       \
-static struct ftrace_event_call __used                                 \
-__attribute__((__aligned__(4)))                                                \
-__attribute__((section("_ftrace_events"))) event_##call = {            \
-       .name                   = #call,                                \
-       .system                 = __stringify(TRACE_SYSTEM),            \
-       .raw_init               = ftrace_raw_init_event_##call,         \
-       .regfunc                = ftrace_raw_reg_event_##call,          \
-       .unregfunc              = ftrace_raw_unreg_event_##call,        \
-       .show_format            = ftrace_format_##call,                 \
-       .define_fields          = ftrace_define_fields_##call,          \
-       _TRACE_PROFILE_INIT(call)                                       \
-}
-
-#include <trace/trace_event_types.h>
-
-#undef _TRACE_PROFILE
-#undef _TRACE_PROFILE_INIT
-
index 07a22c33ebf3c31b4d07b7125a9062118ca90490..d06cf898dc86aeb012a5f2e5bb1abed214aa8102 100644 (file)
 #undef TRACE_STRUCT
 #define TRACE_STRUCT(args...) args
 
+extern void __bad_type_size(void);
+
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)                                        \
+       if (sizeof(type) != sizeof(field.item))                         \
+               __bad_type_size();                                      \
        ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"      \
                               "offset:%u;\tsize:%u;\n",                \
                               (unsigned int)offsetof(typeof(field), item), \
@@ -30,7 +34,7 @@
 
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)                      \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd)                 \
        ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"   \
                               "offset:%u;\tsize:%u;\n",                \
                               (unsigned int)offsetof(typeof(field), item), \
@@ -46,6 +50,9 @@
        if (!ret)                                                       \
                return 0;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)        \
+       TRACE_FIELD(type, item, assign)
 
 #undef TP_RAW_FMT
 #define TP_RAW_FMT(args...) args
@@ -65,6 +72,22 @@ ftrace_format_##call(struct trace_seq *s)                            \
        return ret;                                                     \
 }
 
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)                              \
+static int                                                             \
+ftrace_format_##call(struct trace_seq *s)                              \
+{                                                                      \
+       struct args field;                                              \
+       int ret;                                                        \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);            \
+                                                                       \
+       return ret;                                                     \
+}
+
 #include "trace_event_types.h"
 
 #undef TRACE_ZERO_CHAR
@@ -78,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s)                           \
 #define TRACE_FIELD(type, item, assign)\
        entry->item = assign;
 
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)        \
+       TRACE_FIELD(type, item, assign)
+
 #undef TP_CMD
 #define TP_CMD(cmd...) cmd
 
@@ -85,18 +112,95 @@ ftrace_format_##call(struct trace_seq *s)                          \
 #define TRACE_ENTRY    entry
 
 #undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \
        cmd;
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+int ftrace_define_fields_##call(void);                                 \
+static int ftrace_raw_init_event_##call(void);                         \
+                                                                       \
+struct ftrace_event_call __used                                                \
+__attribute__((__aligned__(4)))                                                \
+__attribute__((section("_ftrace_events"))) event_##call = {            \
+       .name                   = #call,                                \
+       .id                     = proto,                                \
+       .system                 = __stringify(TRACE_SYSTEM),            \
+       .raw_init               = ftrace_raw_init_event_##call,         \
+       .show_format            = ftrace_format_##call,                 \
+       .define_fields          = ftrace_define_fields_##call,          \
+};                                                                     \
+static int ftrace_raw_init_event_##call(void)                          \
+{                                                                      \
+       INIT_LIST_HEAD(&event_##call.fields);                           \
+       init_preds(&event_##call);                                      \
+       return 0;                                                       \
+}                                                                      \
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)                              \
                                                                        \
-static struct ftrace_event_call __used                                 \
+struct ftrace_event_call __used                                                \
 __attribute__((__aligned__(4)))                                                \
 __attribute__((section("_ftrace_events"))) event_##call = {            \
        .name                   = #call,                                \
        .id                     = proto,                                \
        .system                 = __stringify(TRACE_SYSTEM),            \
        .show_format            = ftrace_format_##call,                 \
+};
+
+#include "trace_event_types.h"
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)                                        \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed_type(type));     \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type, item, len, cmd)                      \
+       ret = trace_define_field(event_call, #type "[" #len "]", #item, \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), 0);                \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_SIGN
+#define TRACE_FIELD_SIGN(type, item, assign, is_signed)                        \
+       ret = trace_define_field(event_call, #type, #item,              \
+                                offsetof(typeof(field), item),         \
+                                sizeof(field.item), is_signed);        \
+       if (ret)                                                        \
+               return ret;
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)     \
+int                                                                    \
+ftrace_define_fields_##call(void)                                      \
+{                                                                      \
+       struct ftrace_event_call *event_call = &event_##call;           \
+       struct args field;                                              \
+       int ret;                                                        \
+                                                                       \
+       __common_field(unsigned char, type, 0);                         \
+       __common_field(unsigned char, flags, 0);                        \
+       __common_field(unsigned char, preempt_count, 0);                \
+       __common_field(int, pid, 1);                                    \
+       __common_field(int, tgid, 1);                                   \
+                                                                       \
+       tstruct;                                                        \
+                                                                       \
+       return ret;                                                     \
 }
+
+#undef TRACE_EVENT_FORMAT_NOFILTER
+#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct,   \
+                                   tpfmt)
+
 #include "trace_event_types.h"
index d28687e7b3a7b36859f1ac48fa1ac62d5e52f456..8b592418d8b28953a28e40c2202a50cb2b40dd37 100644 (file)
@@ -65,6 +65,12 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
        if (!current->ret_stack)
                return -EBUSY;
 
+       /*
+        * We must make sure the ret_stack is tested before we read
+        * anything else.
+        */
+       smp_rmb();
+
        /* The return trace stack is full */
        if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
                atomic_inc(&current->trace_overrun);
@@ -78,13 +84,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
        current->ret_stack[index].ret = ret;
        current->ret_stack[index].func = func;
        current->ret_stack[index].calltime = calltime;
+       current->ret_stack[index].subtime = 0;
        *depth = index;
 
        return 0;
 }
 
 /* Retrieve a function return address to the trace stack on thread info.*/
-void
+static void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
 {
        int index;
@@ -104,9 +111,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
        trace->calltime = current->ret_stack[index].calltime;
        trace->overrun = atomic_read(&current->trace_overrun);
        trace->depth = index;
-       barrier();
-       current->curr_ret_stack--;
-
 }
 
 /*
@@ -121,6 +125,8 @@ unsigned long ftrace_return_to_handler(void)
        ftrace_pop_return_trace(&trace, &ret);
        trace.rettime = trace_clock_local();
        ftrace_graph_return(&trace);
+       barrier();
+       current->curr_ret_stack--;
 
        if (unlikely(!ret)) {
                ftrace_graph_stop();
@@ -426,8 +432,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr,
        return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t
-print_graph_duration(unsigned long long duration, struct trace_seq *s)
+enum print_line_t
+trace_print_graph_duration(unsigned long long duration, struct trace_seq *s)
 {
        unsigned long nsecs_rem = do_div(duration, 1000);
        /* log10(ULONG_MAX) + '\0' */
@@ -464,12 +470,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
                if (!ret)
                        return TRACE_TYPE_PARTIAL_LINE;
        }
+       return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+print_graph_duration(unsigned long long duration, struct trace_seq *s)
+{
+       int ret;
+
+       ret = trace_print_graph_duration(duration, s);
+       if (ret != TRACE_TYPE_HANDLED)
+               return ret;
 
        ret = trace_seq_printf(s, "|  ");
        if (!ret)
                return TRACE_TYPE_PARTIAL_LINE;
-       return TRACE_TYPE_HANDLED;
 
+       return TRACE_TYPE_HANDLED;
 }
 
 /* Case of a leaf function on its call entry */
index 7bfdf4c2347f38251acb30e6d3ef3ad588c89e30..ca7d7c4d0c2aef41b74585996001d993ff4505e2 100644 (file)
@@ -1,10 +1,9 @@
 /*
- * h/w branch tracer for x86 based on bts
+ * h/w branch tracer for x86 based on BTS
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  */
-#include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 
 #include <asm/ds.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 
-#define SIZEOF_BTS (1 << 13)
+#define BTS_BUFFER_SIZE (1 << 13)
 
-/*
- * The tracer lock protects the below per-cpu tracer array.
- * It needs to be held to:
- * - start tracing on all cpus
- * - stop tracing on all cpus
- * - start tracing on a single hotplug cpu
- * - stop tracing on a single hotplug cpu
- * - read the trace from all cpus
- * - read the trace from a single cpu
- */
-static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
-static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
+static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
-#define this_buffer per_cpu(buffer, smp_processor_id())
 
-static int __read_mostly trace_hw_branches_enabled;
+static int trace_hw_branches_enabled __read_mostly;
+static int trace_hw_branches_suspended __read_mostly;
 static struct trace_array *hw_branch_trace __read_mostly;
 
 
-/*
- * Start tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_start_cpu(void *arg)
+static void bts_trace_init_cpu(int cpu)
 {
-       if (this_tracer)
-               ds_release_bts(this_tracer);
-
-       this_tracer =
-               ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS,
-                              /* ovfl = */ NULL, /* th = */ (size_t)-1,
-                              BTS_KERNEL);
-       if (IS_ERR(this_tracer)) {
-               this_tracer = NULL;
-               return;
-       }
+       per_cpu(tracer, cpu) =
+               ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE,
+                                  NULL, (size_t)-1, BTS_KERNEL);
+
+       if (IS_ERR(per_cpu(tracer, cpu)))
+               per_cpu(tracer, cpu) = NULL;
 }
 
-static void bts_trace_start(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
+
+       hw_branch_trace = tr;
+       trace_hw_branches_enabled = 0;
 
-       on_each_cpu(bts_trace_start_cpu, NULL, 1);
-       trace_hw_branches_enabled = 1;
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               bts_trace_init_cpu(cpu);
 
-       spin_unlock(&bts_tracer_lock);
+               if (likely(per_cpu(tracer, cpu)))
+                       trace_hw_branches_enabled = 1;
+       }
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
+
+       /* If we could not enable tracing on any cpu, we fail. */
+       return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP;
 }
 
-/*
- * Stop tracing on the current cpu.
- * The argument is ignored.
- *
- * pre: bts_tracer_lock must be locked.
- */
-static void bts_trace_stop_cpu(void *arg)
+static void bts_trace_reset(struct trace_array *tr)
 {
-       if (this_tracer) {
-               ds_release_bts(this_tracer);
-               this_tracer = NULL;
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu) {
+               if (likely(per_cpu(tracer, cpu))) {
+                       ds_release_bts(per_cpu(tracer, cpu));
+                       per_cpu(tracer, cpu) = NULL;
+               }
        }
+       trace_hw_branches_enabled = 0;
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
 }
 
-static void bts_trace_stop(struct trace_array *tr)
+static void bts_trace_start(struct trace_array *tr)
 {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
 
-       trace_hw_branches_enabled = 0;
-       on_each_cpu(bts_trace_stop_cpu, NULL, 1);
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_resume_bts(per_cpu(tracer, cpu));
+       trace_hw_branches_suspended = 0;
+       put_online_cpus();
+}
 
-       spin_unlock(&bts_tracer_lock);
+static void bts_trace_stop(struct trace_array *tr)
+{
+       int cpu;
+
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_suspend_bts(per_cpu(tracer, cpu));
+       trace_hw_branches_suspended = 1;
+       put_online_cpus();
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
                                     unsigned long action, void *hcpu)
 {
-       unsigned int cpu = (unsigned long)hcpu;
-
-       spin_lock(&bts_tracer_lock);
-
-       if (!trace_hw_branches_enabled)
-               goto out;
+       int cpu = (long)hcpu;
 
        switch (action) {
        case CPU_ONLINE:
        case CPU_DOWN_FAILED:
-               smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+               /* The notification is sent with interrupts enabled. */
+               if (trace_hw_branches_enabled) {
+                       bts_trace_init_cpu(cpu);
+
+                       if (trace_hw_branches_suspended &&
+                           likely(per_cpu(tracer, cpu)))
+                               ds_suspend_bts(per_cpu(tracer, cpu));
+               }
                break;
+
        case CPU_DOWN_PREPARE:
-               smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
-               break;
+               /* The notification is sent with interrupts enabled. */
+               if (likely(per_cpu(tracer, cpu))) {
+                       ds_release_bts(per_cpu(tracer, cpu));
+                       per_cpu(tracer, cpu) = NULL;
+               }
        }
 
- out:
-       spin_unlock(&bts_tracer_lock);
        return NOTIFY_DONE;
 }
 
@@ -126,20 +134,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
        .notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
-{
-       hw_branch_trace = tr;
-
-       bts_trace_start(tr);
-
-       return 0;
-}
-
-static void bts_trace_reset(struct trace_array *tr)
-{
-       bts_trace_stop(tr);
-}
-
 static void bts_trace_print_header(struct seq_file *m)
 {
        seq_puts(m, "# CPU#        TO  <-  FROM\n");
@@ -147,10 +141,10 @@ static void bts_trace_print_header(struct seq_file *m)
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 {
+       unsigned long symflags = TRACE_ITER_SYM_OFFSET;
        struct trace_entry *entry = iter->ent;
        struct trace_seq *seq = &iter->seq;
        struct hw_branch_entry *it;
-       unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
        trace_assign_type(it, entry);
 
@@ -168,6 +162,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 
 void trace_hw_branch(u64 from, u64 to)
 {
+       struct ftrace_event_call *call = &event_hw_branch;
        struct trace_array *tr = hw_branch_trace;
        struct ring_buffer_event *event;
        struct hw_branch_entry *entry;
@@ -194,7 +189,8 @@ void trace_hw_branch(u64 from, u64 to)
        entry->ent.type = TRACE_HW_BRANCHES;
        entry->from = from;
        entry->to   = to;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
        atomic_dec(&tr->data[cpu]->disabled);
@@ -224,11 +220,11 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_lock must be locked
+ * pre: tracing must be suspended on the current cpu
  */
 static void trace_bts_cpu(void *arg)
 {
-       struct trace_array *tr = (struct trace_array *) arg;
+       struct trace_array *tr = (struct trace_array *)arg;
        const struct bts_trace *trace;
        unsigned char *at;
 
@@ -241,10 +237,9 @@ static void trace_bts_cpu(void *arg)
        if (unlikely(!this_tracer))
                return;
 
-       ds_suspend_bts(this_tracer);
        trace = ds_read_bts(this_tracer);
        if (!trace)
-               goto out;
+               return;
 
        for (at = trace->ds.top; (void *)at < trace->ds.end;
             at += trace->ds.size)
@@ -253,18 +248,27 @@ static void trace_bts_cpu(void *arg)
        for (at = trace->ds.begin; (void *)at < trace->ds.top;
             at += trace->ds.size)
                trace_bts_at(trace, at);
-
-out:
-       ds_resume_bts(this_tracer);
 }
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-       spin_lock(&bts_tracer_lock);
+       int cpu;
 
+       get_online_cpus();
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_suspend_bts(per_cpu(tracer, cpu));
+       /*
+        * We need to collect the trace on the respective cpu since ftrace
+        * implicitly adds the record for the current cpu.
+        * Once that is more flexible, we could collect the data from any cpu.
+        */
        on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-       spin_unlock(&bts_tracer_lock);
+       for_each_online_cpu(cpu)
+               if (likely(per_cpu(tracer, cpu)))
+                       ds_resume_bts(per_cpu(tracer, cpu));
+       put_online_cpus();
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -274,11 +278,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-       spin_lock(&bts_tracer_lock);
-
-       trace_bts_cpu(hw_branch_trace);
-
-       spin_unlock(&bts_tracer_lock);
+       if (this_tracer) {
+               ds_suspend_bts_noirq(this_tracer);
+               trace_bts_cpu(hw_branch_trace);
+               ds_resume_bts_noirq(this_tracer);
+       }
 }
 
 struct tracer bts_tracer __read_mostly =
@@ -291,7 +295,10 @@ struct tracer bts_tracer __read_mostly =
        .start          = bts_trace_start,
        .stop           = bts_trace_stop,
        .open           = trace_bts_prepare,
-       .close          = trace_bts_close
+       .close          = trace_bts_close,
+#ifdef CONFIG_FTRACE_SELFTEST
+       .selftest       = trace_selftest_startup_hw_branches,
+#endif /* CONFIG_FTRACE_SELFTEST */
 };
 
 __init static int init_bts_trace(void)
index 8e37fcddd8b49fdaf46e7cc145b4de0382b69b1e..d53b45ed080622933b659ca85774f6b13ae0538e 100644 (file)
@@ -9,6 +9,8 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <linux/time.h>
+
 #include <asm/atomic.h>
 
 #include "trace.h"
@@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
        struct mmiotrace_rw *rw;
        struct trace_seq *s     = &iter->seq;
        unsigned long long t    = ns2usecs(iter->ts);
-       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       unsigned long usec_rem  = do_div(t, USEC_PER_SEC);
        unsigned secs           = (unsigned long)t;
        int ret = 1;
 
@@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
        struct mmiotrace_map *m;
        struct trace_seq *s     = &iter->seq;
        unsigned long long t    = ns2usecs(iter->ts);
-       unsigned long usec_rem  = do_div(t, 1000000ULL);
+       unsigned long usec_rem  = do_div(t, USEC_PER_SEC);
        unsigned secs           = (unsigned long)t;
        int ret;
 
index 64b54a59c55b585ff3b979f216ba0451273ee6e4..7938f3ae93e3dbe9d898e037c47027c4908a8d94 100644 (file)
 /* must be a power of 2 */
 #define EVENT_HASHSIZE 128
 
-static DEFINE_MUTEX(trace_event_mutex);
+DECLARE_RWSEM(trace_event_mutex);
+
+DEFINE_PER_CPU(struct trace_seq, ftrace_event_seq);
+EXPORT_PER_CPU_SYMBOL(ftrace_event_seq);
+
 static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+void trace_print_seq(struct seq_file *m, struct trace_seq *s)
+{
+       int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len;
+
+       s->buffer[len] = 0;
+       seq_puts(m, s->buffer);
+
+       trace_seq_init(s);
+}
+
 enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
 {
        struct trace_seq *s = &iter->seq;
@@ -84,6 +98,39 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 
        return len;
 }
+EXPORT_SYMBOL_GPL(trace_seq_printf);
+
+/**
+ * trace_seq_vprintf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formatting of a trace,
+ * trace_seq_vprintf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args)
+{
+       int len = (PAGE_SIZE - 1) - s->len;
+       int ret;
+
+       if (!len)
+               return 0;
+
+       ret = vsnprintf(s->buffer + s->len, len, fmt, args);
+
+       /* If we can't write it all, don't bother writing anything */
+       if (ret >= len)
+               return 0;
+
+       s->len += ret;
+
+       return len;
+}
+EXPORT_SYMBOL_GPL(trace_seq_vprintf);
 
 int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
@@ -201,6 +248,67 @@ int trace_seq_path(struct trace_seq *s, struct path *path)
        return 0;
 }
 
+const char *
+ftrace_print_flags_seq(struct trace_seq *p, const char *delim,
+                      unsigned long flags,
+                      const struct trace_print_flags *flag_array)
+{
+       unsigned long mask;
+       const char *str;
+       const char *ret = p->buffer + p->len;
+       int i;
+
+       for (i = 0;  flag_array[i].name && flags; i++) {
+
+               mask = flag_array[i].mask;
+               if ((flags & mask) != mask)
+                       continue;
+
+               str = flag_array[i].name;
+               flags &= ~mask;
+               if (p->len && delim)
+                       trace_seq_puts(p, delim);
+               trace_seq_puts(p, str);
+       }
+
+       /* check for left over flags */
+       if (flags) {
+               if (p->len && delim)
+                       trace_seq_puts(p, delim);
+               trace_seq_printf(p, "0x%lx", flags);
+       }
+
+       trace_seq_putc(p, 0);
+
+       return ret;
+}
+EXPORT_SYMBOL(ftrace_print_flags_seq);
+
+const char *
+ftrace_print_symbols_seq(struct trace_seq *p, unsigned long val,
+                        const struct trace_print_flags *symbol_array)
+{
+       int i;
+       const char *ret = p->buffer + p->len;
+
+       for (i = 0;  symbol_array[i].name; i++) {
+
+               if (val != symbol_array[i].mask)
+                       continue;
+
+               trace_seq_puts(p, symbol_array[i].name);
+               break;
+       }
+
+       if (!p->len)
+               trace_seq_printf(p, "0x%lx", val);
+               
+       trace_seq_putc(p, 0);
+
+       return ret;
+}
+EXPORT_SYMBOL(ftrace_print_symbols_seq);
+
 #ifdef CONFIG_KRETPROBES
 static inline const char *kretprobed(const char *name)
 {
@@ -311,17 +419,20 @@ seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
 
                if (ip == ULONG_MAX || !ret)
                        break;
-               if (i && ret)
-                       ret = trace_seq_puts(s, " <- ");
+               if (ret)
+                       ret = trace_seq_puts(s, " => ");
                if (!ip) {
                        if (ret)
                                ret = trace_seq_puts(s, "??");
+                       if (ret)
+                               ret = trace_seq_puts(s, "\n");
                        continue;
                }
                if (!ret)
                        break;
                if (ret)
                        ret = seq_print_user_ip(s, mm, ip, sym_flags);
+               ret = trace_seq_puts(s, "\n");
        }
 
        if (mm)
@@ -455,6 +566,7 @@ static int task_state_char(unsigned long state)
  * @type: the type of event to look for
  *
  * Returns an event of type @type otherwise NULL
+ * Called with trace_event_read_lock() held.
  */
 struct trace_event *ftrace_find_event(int type)
 {
@@ -464,7 +576,7 @@ struct trace_event *ftrace_find_event(int type)
 
        key = type & (EVENT_HASHSIZE - 1);
 
-       hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+       hlist_for_each_entry(event, n, &event_hash[key], node) {
                if (event->type == type)
                        return event;
        }
@@ -472,6 +584,46 @@ struct trace_event *ftrace_find_event(int type)
        return NULL;
 }
 
+static LIST_HEAD(ftrace_event_list);
+
+static int trace_search_list(struct list_head **list)
+{
+       struct trace_event *e;
+       int last = __TRACE_LAST_TYPE;
+
+       if (list_empty(&ftrace_event_list)) {
+               *list = &ftrace_event_list;
+               return last + 1;
+       }
+
+       /*
+        * We used up all possible max events,
+        * let's see if somebody freed one.
+        */
+       list_for_each_entry(e, &ftrace_event_list, list) {
+               if (e->type != last + 1)
+                       break;
+               last++;
+       }
+
+       /* Did we use up all 65 thousand events??? */
+       if ((last + 1) > FTRACE_MAX_EVENT)
+               return 0;
+
+       *list = &e->list;
+       return last + 1;
+}
+
+void trace_event_read_lock(void)
+{
+       down_read(&trace_event_mutex);
+}
+
+void trace_event_read_unlock(void)
+{
+       up_read(&trace_event_mutex);
+}
+
 /**
  * register_ftrace_event - register output for an event type
  * @event: the event type to register
@@ -492,22 +644,42 @@ int register_ftrace_event(struct trace_event *event)
        unsigned key;
        int ret = 0;
 
-       mutex_lock(&trace_event_mutex);
+       down_write(&trace_event_mutex);
 
-       if (!event) {
-               ret = next_event_type++;
+       if (WARN_ON(!event))
                goto out;
-       }
 
-       if (!event->type)
-               event->type = next_event_type++;
-       else if (event->type > __TRACE_LAST_TYPE) {
+       INIT_LIST_HEAD(&event->list);
+
+       if (!event->type) {
+               struct list_head *list = NULL;
+
+               if (next_event_type > FTRACE_MAX_EVENT) {
+
+                       event->type = trace_search_list(&list);
+                       if (!event->type)
+                               goto out;
+
+               } else {
+                       
+                       event->type = next_event_type++;
+                       list = &ftrace_event_list;
+               }
+
+               if (WARN_ON(ftrace_find_event(event->type)))
+                       goto out;
+
+               list_add_tail(&event->list, list);
+
+       } else if (event->type > __TRACE_LAST_TYPE) {
                printk(KERN_WARNING "Need to add type to trace.h\n");
                WARN_ON(1);
-       }
-
-       if (ftrace_find_event(event->type))
                goto out;
+       } else {
+               /* Is this event already used */
+               if (ftrace_find_event(event->type))
+                       goto out;
+       }
 
        if (event->trace == NULL)
                event->trace = trace_nop_print;
@@ -520,14 +692,25 @@ int register_ftrace_event(struct trace_event *event)
 
        key = event->type & (EVENT_HASHSIZE - 1);
 
-       hlist_add_head_rcu(&event->node, &event_hash[key]);
+       hlist_add_head(&event->node, &event_hash[key]);
 
        ret = event->type;
  out:
-       mutex_unlock(&trace_event_mutex);
+       up_write(&trace_event_mutex);
 
        return ret;
 }
+EXPORT_SYMBOL_GPL(register_ftrace_event);
+
+/*
+ * Used by module code with the trace_event_mutex held for write.
+ */
+int __unregister_ftrace_event(struct trace_event *event)
+{
+       hlist_del(&event->node);
+       list_del(&event->list);
+       return 0;
+}
 
 /**
  * unregister_ftrace_event - remove a no longer used event
@@ -535,12 +718,13 @@ int register_ftrace_event(struct trace_event *event)
  */
 int unregister_ftrace_event(struct trace_event *event)
 {
-       mutex_lock(&trace_event_mutex);
-       hlist_del(&event->node);
-       mutex_unlock(&trace_event_mutex);
+       down_write(&trace_event_mutex);
+       __unregister_ftrace_event(event);
+       up_write(&trace_event_mutex);
 
        return 0;
 }
+EXPORT_SYMBOL_GPL(unregister_ftrace_event);
 
 /*
  * Standard events
@@ -833,14 +1017,16 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 
        trace_assign_type(field, iter->ent);
 
+       if (!trace_seq_puts(s, "<stack trace>\n"))
+               goto partial;
        for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-               if (i) {
-                       if (!trace_seq_puts(s, " <= "))
-                               goto partial;
+               if (!field->caller[i] || (field->caller[i] == ULONG_MAX))
+                       break;
+               if (!trace_seq_puts(s, " => "))
+                       goto partial;
 
-                       if (!seq_print_ip_sym(s, field->caller[i], flags))
-                               goto partial;
-               }
+               if (!seq_print_ip_sym(s, field->caller[i], flags))
+                       goto partial;
                if (!trace_seq_puts(s, "\n"))
                        goto partial;
        }
@@ -868,10 +1054,10 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 
        trace_assign_type(field, iter->ent);
 
-       if (!seq_print_userip_objs(field, s, flags))
+       if (!trace_seq_puts(s, "<user stack trace>\n"))
                goto partial;
 
-       if (!trace_seq_putc(s, '\n'))
+       if (!seq_print_userip_objs(field, s, flags))
                goto partial;
 
        return TRACE_TYPE_HANDLED;
index e0bde39c2dd9b00ab8ac5388aa0b5168fe5f37d2..d38bec4a9c3081e52bca0dec51e04e94827cba67 100644 (file)
@@ -1,41 +1,17 @@
 #ifndef __TRACE_EVENTS_H
 #define __TRACE_EVENTS_H
 
+#include <linux/trace_seq.h>
 #include "trace.h"
 
-typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
-                                             int flags);
-
-struct trace_event {
-       struct hlist_node       node;
-       int                     type;
-       trace_print_func        trace;
-       trace_print_func        raw;
-       trace_print_func        hex;
-       trace_print_func        binary;
-};
-
 extern enum print_line_t
 trace_print_bprintk_msg_only(struct trace_iterator *iter);
 extern enum print_line_t
 trace_print_printk_msg_only(struct trace_iterator *iter);
 
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-       __attribute__ ((format (printf, 2, 3)));
-extern int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
                unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-                                size_t cnt);
-extern int trace_seq_puts(struct trace_seq *s, const char *str);
-extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
-                               size_t len);
-extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
-extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
                                 struct trace_seq *s, unsigned long sym_flags);
 extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
@@ -44,13 +20,17 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 extern int trace_print_context(struct trace_iterator *iter);
 extern int trace_print_lat_context(struct trace_iterator *iter);
 
+extern void trace_event_read_lock(void);
+extern void trace_event_read_unlock(void);
 extern struct trace_event *ftrace_find_event(int type);
-extern int register_ftrace_event(struct trace_event *event);
-extern int unregister_ftrace_event(struct trace_event *event);
 
 extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
                                         int flags);
 
+/* used by module unregistering */
+extern int __unregister_ftrace_event(struct trace_event *event);
+extern struct rw_semaphore trace_event_mutex;
+
 #define MAX_MEMHEX_BYTES       8
 #define HEX_CHARS              (MAX_MEMHEX_BYTES*2 + 1)
 
index 118439709fb771f4fa2ad576454e0b64d1274144..8a30d9874cd430d507c3f9e997610430f1bcb894 100644 (file)
@@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type,
 
 static void probe_power_end(struct power_trace *it)
 {
+       struct ftrace_event_call *call = &event_power;
        struct ring_buffer_event *event;
        struct trace_power *entry;
        struct trace_array_cpu *data;
@@ -54,7 +55,8 @@ static void probe_power_end(struct power_trace *it)
                goto out;
        entry   = ring_buffer_event_data(event);
        entry->state_data = *it;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
        preempt_enable();
 }
@@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it)
 static void probe_power_mark(struct power_trace *it, unsigned int type,
                                unsigned int level)
 {
+       struct ftrace_event_call *call = &event_power;
        struct ring_buffer_event *event;
        struct trace_power *entry;
        struct trace_array_cpu *data;
@@ -84,7 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type,
                goto out;
        entry   = ring_buffer_event_data(event);
        entry->state_data = *it;
-       trace_buffer_unlock_commit(tr, event, 0, 0);
+       if (!filter_check_discard(call, entry, tr->buffer, event))
+               trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
        preempt_enable();
 }
index eb81556107fec71dc8ba53278df9264862df0f3b..9bece9687b62a8d2ab6b59199017ef0ad7898b98 100644 (file)
@@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = {
 static __init int init_trace_printk_function_export(void)
 {
        struct dentry *d_tracer;
-       struct dentry *entry;
 
        d_tracer = tracing_init_dentry();
        if (!d_tracer)
                return 0;
 
-       entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+       trace_create_file("printk_formats", 0444, d_tracer,
                                    NULL, &ftrace_formats_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs "
-                          "'printk_formats' entry\n");
 
        return 0;
 }
index 9117cea6f1ae78f17f2be7fc9894aa9b0abe2075..a98106dd979cfd7bd305cfab5a506fec61e95e46 100644 (file)
@@ -10,7 +10,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
@@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
        int cpu;
        int pc;
 
-       if (!sched_ref || sched_stopped)
+       if (unlikely(!sched_ref))
                return;
 
        tracing_record_cmdline(prev);
        tracing_record_cmdline(next);
 
-       if (!tracer_enabled)
+       if (!tracer_enabled || sched_stopped)
                return;
 
        pc = preempt_count();
@@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
        unsigned long flags;
        int cpu, pc;
 
-       if (!likely(tracer_enabled))
+       if (unlikely(!sched_ref))
                return;
 
-       pc = preempt_count();
        tracing_record_cmdline(current);
 
-       if (sched_stopped)
+       if (!tracer_enabled || sched_stopped)
                return;
 
+       pc = preempt_count();
        local_irq_save(flags);
        cpu = raw_smp_processor_id();
        data = ctx_trace->data[cpu];
index 5bc00e8f153ebb8682589caa37b553b05800f6b0..eacb272251736276335a02d04a83b856ec550dfb 100644 (file)
@@ -15,7 +15,7 @@
 #include <linux/kallsyms.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
-#include <trace/sched.h>
+#include <trace/events/sched.h>
 
 #include "trace.h"
 
@@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 
        pc = preempt_count();
 
-       /* The task we are waiting for is waking up */
-       data = wakeup_trace->data[wakeup_cpu];
-
        /* disable local data, not wakeup_cpu data */
        cpu = raw_smp_processor_id();
        disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled);
@@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
        if (unlikely(!tracer_enabled || next != wakeup_task))
                goto out_unlock;
 
+       /* The task we are waiting for is waking up */
+       data = wakeup_trace->data[wakeup_cpu];
+
        trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
        tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
index 08f4eb2763d141bbae33c518a4de1e15d1c3614a..00dd6485bdd7e7abf390a6fcbe1c27d626a8531e 100644 (file)
@@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry)
        case TRACE_BRANCH:
        case TRACE_GRAPH_ENT:
        case TRACE_GRAPH_RET:
+       case TRACE_HW_BRANCHES:
                return 1;
        }
        return 0;
@@ -188,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 #else
 # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; })
 #endif /* CONFIG_DYNAMIC_FTRACE */
+
 /*
  * Simple verification test of ftrace function tracer.
  * Enable ftrace, sleep 1/10 second, and then read the trace
@@ -749,3 +751,59 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
        return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+int
+trace_selftest_startup_hw_branches(struct tracer *trace,
+                                  struct trace_array *tr)
+{
+       struct trace_iterator *iter;
+       struct tracer tracer;
+       unsigned long count;
+       int ret;
+
+       if (!trace->open) {
+               printk(KERN_CONT "missing open function...");
+               return -1;
+       }
+
+       ret = tracer_init(trace, tr);
+       if (ret) {
+               warn_failed_init_tracer(trace, ret);
+               return ret;
+       }
+
+       /*
+        * The hw-branch tracer needs to collect the trace from the various
+        * cpu trace buffers - before tracing is stopped.
+        */
+       iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+       if (!iter)
+               return -ENOMEM;
+
+       memcpy(&tracer, trace, sizeof(tracer));
+
+       iter->trace = &tracer;
+       iter->tr = tr;
+       iter->pos = -1;
+       mutex_init(&iter->mutex);
+
+       trace->open(iter);
+
+       mutex_destroy(&iter->mutex);
+       kfree(iter);
+
+       tracing_stop();
+
+       ret = trace_test_buffer(tr, &count);
+       trace->reset(tr);
+       tracing_start();
+
+       if (!ret && !count) {
+               printk(KERN_CONT "no entries found..");
+               ret = -1;
+       }
+
+       return ret;
+}
+#endif /* CONFIG_HW_BRANCH_TRACER */
index c750f65f9661531273ee16553623b405268fc758..2d7aebd71dbd4e3489b8bf59fd29585b422be8e9 100644 (file)
@@ -265,7 +265,7 @@ static int t_show(struct seq_file *m, void *v)
                seq_printf(m, "        Depth    Size   Location"
                           "    (%d entries)\n"
                           "        -----    ----   --------\n",
-                          max_stack_trace.nr_entries);
+                          max_stack_trace.nr_entries - 1);
 
                if (!stack_tracer_enabled && !max_stack_size)
                        print_disabled(m);
@@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace);
 static __init int stack_trace_init(void)
 {
        struct dentry *d_tracer;
-       struct dentry *entry;
 
        d_tracer = tracing_init_dentry();
 
-       entry = debugfs_create_file("stack_max_size", 0644, d_tracer,
-                                   &max_stack_size, &stack_max_size_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'stack_max_size' entry\n");
+       trace_create_file("stack_max_size", 0644, d_tracer,
+                       &max_stack_size, &stack_max_size_fops);
 
-       entry = debugfs_create_file("stack_trace", 0444, d_tracer,
-                                   NULL, &stack_trace_fops);
-       if (!entry)
-               pr_warning("Could not create debugfs 'stack_trace' entry\n");
+       trace_create_file("stack_trace", 0444, d_tracer,
+                       NULL, &stack_trace_fops);
 
        if (stack_tracer_enabled)
                register_ftrace_function(&trace_ops);
index acdebd771a93b9623e57b8ed922c988524254ba7..c00643733f4ccca7be15b22ef311ec03711edc48 100644 (file)
@@ -1,7 +1,7 @@
 /*
  * Infrastructure for statistic tracing (histogram output).
  *
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (C) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  *
  * Based on the code from trace_branch.c which is
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
 
 
 #include <linux/list.h>
+#include <linux/rbtree.h>
 #include <linux/debugfs.h>
 #include "trace_stat.h"
 #include "trace.h"
 
 
-/* List of stat entries from a tracer */
-struct trace_stat_list {
-       struct list_head        list;
+/*
+ * List of stat red-black nodes from a tracer
+ * We use such a tree to quickly sort the stat
+ * entries from the tracer.
+ */
+struct stat_node {
+       struct rb_node          node;
        void                    *stat;
 };
 
 /* A stat session is the stats output in one file */
-struct tracer_stat_session {
+struct stat_session {
        struct list_head        session_list;
        struct tracer_stat      *ts;
-       struct list_head        stat_list;
+       struct rb_root          stat_root;
        struct mutex            stat_mutex;
        struct dentry           *file;
 };
@@ -37,18 +42,48 @@ static DEFINE_MUTEX(all_stat_sessions_mutex);
 /* The root directory for all stat files */
 static struct dentry           *stat_dir;
 
+/*
+ * Iterate through the rbtree using a post order traversal path
+ * to release the next node.
+ * It won't necessarily release one at each iteration
+ * but it will at least advance closer to the next one
+ * to be released.
+ */
+static struct rb_node *release_next(struct rb_node *node)
+{
+       struct stat_node *snode;
+       struct rb_node *parent = rb_parent(node);
+
+       if (node->rb_left)
+               return node->rb_left;
+       else if (node->rb_right)
+               return node->rb_right;
+       else {
+               if (!parent)
+                       ;
+               else if (parent->rb_left == node)
+                       parent->rb_left = NULL;
+               else
+                       parent->rb_right = NULL;
+
+               snode = container_of(node, struct stat_node, node);
+               kfree(snode);
+
+               return parent;
+       }
+}
 
-static void reset_stat_session(struct tracer_stat_session *session)
+static void reset_stat_session(struct stat_session *session)
 {
-       struct trace_stat_list *node, *next;
+       struct rb_node *node = session->stat_root.rb_node;
 
-       list_for_each_entry_safe(node, next, &session->stat_list, list)
-               kfree(node);
+       while (node)
+               node = release_next(node);
 
-       INIT_LIST_HEAD(&session->stat_list);
+       session->stat_root = RB_ROOT;
 }
 
-static void destroy_session(struct tracer_stat_session *session)
+static void destroy_session(struct stat_session *session)
 {
        debugfs_remove(session->file);
        reset_stat_session(session);
@@ -56,25 +91,60 @@ static void destroy_session(struct tracer_stat_session *session)
        kfree(session);
 }
 
+typedef int (*cmp_stat_t)(void *, void *);
+
+static int insert_stat(struct rb_root *root, void *stat, cmp_stat_t cmp)
+{
+       struct rb_node **new = &(root->rb_node), *parent = NULL;
+       struct stat_node *data;
+
+       data = kzalloc(sizeof(*data), GFP_KERNEL);
+       if (!data)
+               return -ENOMEM;
+       data->stat = stat;
+
+       /*
+        * Figure out where to put new node
+        * This is a descending sort
+        */
+       while (*new) {
+               struct stat_node *this;
+               int result;
+
+               this = container_of(*new, struct stat_node, node);
+               result = cmp(data->stat, this->stat);
+
+               parent = *new;
+               if (result >= 0)
+                       new = &((*new)->rb_left);
+               else
+                       new = &((*new)->rb_right);
+       }
+
+       rb_link_node(&data->node, parent, new);
+       rb_insert_color(&data->node, root);
+       return 0;
+}
+
 /*
  * For tracers that don't provide a stat_cmp callback.
- * This one will force an immediate insertion on tail of
- * the list.
+ * This one will force an insertion as right-most node
+ * in the rbtree.
  */
 static int dummy_cmp(void *p1, void *p2)
 {
-       return 1;
+       return -1;
 }
 
 /*
- * Initialize the stat list at each trace_stat file opening.
+ * Initialize the stat rbtree at each trace_stat file opening.
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(struct tracer_stat_session *session)
+static int stat_seq_init(struct stat_session *session)
 {
-       struct trace_stat_list *iter_entry, *new_entry;
        struct tracer_stat *ts = session->ts;
+       struct rb_root *root = &session->stat_root;
        void *stat;
        int ret = 0;
        int i;
@@ -85,29 +155,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
        if (!ts->stat_cmp)
                ts->stat_cmp = dummy_cmp;
 
-       stat = ts->stat_start();
+       stat = ts->stat_start(ts);
        if (!stat)
                goto exit;
 
-       /*
-        * The first entry. Actually this is the second, but the first
-        * one (the stat_list head) is pointless.
-        */
-       new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
-       if (!new_entry) {
-               ret = -ENOMEM;
+       ret = insert_stat(root, stat, ts->stat_cmp);
+       if (ret)
                goto exit;
-       }
-
-       INIT_LIST_HEAD(&new_entry->list);
-
-       list_add(&new_entry->list, &session->stat_list);
-
-       new_entry->stat = stat;
 
        /*
-        * Iterate over the tracer stat entries and store them in a sorted
-        * list.
+        * Iterate over the tracer stat entries and store them in an rbtree.
         */
        for (i = 1; ; i++) {
                stat = ts->stat_next(stat, i);
@@ -116,36 +173,16 @@ static int stat_seq_init(struct tracer_stat_session *session)
                if (!stat)
                        break;
 
-               new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
-               if (!new_entry) {
-                       ret = -ENOMEM;
-                       goto exit_free_list;
-               }
-
-               INIT_LIST_HEAD(&new_entry->list);
-               new_entry->stat = stat;
-
-               list_for_each_entry_reverse(iter_entry, &session->stat_list,
-                               list) {
-
-                       /* Insertion with a descendent sorting */
-                       if (ts->stat_cmp(iter_entry->stat,
-                                       new_entry->stat) >= 0) {
-
-                               list_add(&new_entry->list, &iter_entry->list);
-                               break;
-                       }
-               }
-
-               /* The current larger value */
-               if (list_empty(&new_entry->list))
-                       list_add(&new_entry->list, &session->stat_list);
+               ret = insert_stat(root, stat, ts->stat_cmp);
+               if (ret)
+                       goto exit_free_rbtree;
        }
+
 exit:
        mutex_unlock(&session->stat_mutex);
        return ret;
 
-exit_free_list:
+exit_free_rbtree:
        reset_stat_session(session);
        mutex_unlock(&session->stat_mutex);
        return ret;
@@ -154,38 +191,51 @@ exit_free_list:
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
+       struct rb_node *node;
+       int i;
 
-       /* Prevent from tracer switch or stat_list modification */
+       /* Prevent from tracer switch or rbtree modification */
        mutex_lock(&session->stat_mutex);
 
        /* If we are in the beginning of the file, print the headers */
-       if (!*pos && session->ts->stat_headers)
+       if (!*pos && session->ts->stat_headers) {
+               (*pos)++;
                return SEQ_START_TOKEN;
+       }
 
-       return seq_list_start(&session->stat_list, *pos);
+       node = rb_first(&session->stat_root);
+       for (i = 0; node && i < *pos; i++)
+               node = rb_next(node);
+
+       (*pos)++;
+
+       return node;
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
+       struct rb_node *node = p;
+
+       (*pos)++;
 
        if (p == SEQ_START_TOKEN)
-               return seq_list_start(&session->stat_list, *pos);
+               return rb_first(&session->stat_root);
 
-       return seq_list_next(p, &session->stat_list, pos);
+       return rb_next(node);
 }
 
 static void stat_seq_stop(struct seq_file *s, void *p)
 {
-       struct tracer_stat_session *session = s->private;
+       struct stat_session *session = s->private;
        mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-       struct tracer_stat_session *session = s->private;
-       struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+       struct stat_session *session = s->private;
+       struct stat_node *l = container_of(v, struct stat_node, node);
 
        if (v == SEQ_START_TOKEN)
                return session->ts->stat_headers(s);
@@ -205,7 +255,7 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 {
        int ret;
 
-       struct tracer_stat_session *session = inode->i_private;
+       struct stat_session *session = inode->i_private;
 
        ret = seq_open(file, &trace_stat_seq_ops);
        if (!ret) {
@@ -218,11 +268,11 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 }
 
 /*
- * Avoid consuming memory with our now useless list.
+ * Avoid consuming memory with our now useless rbtree.
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-       struct tracer_stat_session *session = i->i_private;
+       struct stat_session *session = i->i_private;
 
        mutex_lock(&session->stat_mutex);
        reset_stat_session(session);
@@ -251,7 +301,7 @@ static int tracing_stat_init(void)
        return 0;
 }
 
-static int init_stat_file(struct tracer_stat_session *session)
+static int init_stat_file(struct stat_session *session)
 {
        if (!stat_dir && tracing_stat_init())
                return -ENODEV;
@@ -266,7 +316,7 @@ static int init_stat_file(struct tracer_stat_session *session)
 
 int register_stat_tracer(struct tracer_stat *trace)
 {
-       struct tracer_stat_session *session, *node, *tmp;
+       struct stat_session *session, *node;
        int ret;
 
        if (!trace)
@@ -277,7 +327,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
        /* Already registered? */
        mutex_lock(&all_stat_sessions_mutex);
-       list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+       list_for_each_entry(node, &all_stat_sessions, session_list) {
                if (node->ts == trace) {
                        mutex_unlock(&all_stat_sessions_mutex);
                        return -EINVAL;
@@ -286,15 +336,13 @@ int register_stat_tracer(struct tracer_stat *trace)
        mutex_unlock(&all_stat_sessions_mutex);
 
        /* Init the session */
-       session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+       session = kzalloc(sizeof(*session), GFP_KERNEL);
        if (!session)
                return -ENOMEM;
 
        session->ts = trace;
        INIT_LIST_HEAD(&session->session_list);
-       INIT_LIST_HEAD(&session->stat_list);
        mutex_init(&session->stat_mutex);
-       session->file = NULL;
 
        ret = init_stat_file(session);
        if (ret) {
@@ -312,7 +360,7 @@ int register_stat_tracer(struct tracer_stat *trace)
 
 void unregister_stat_tracer(struct tracer_stat *trace)
 {
-       struct tracer_stat_session *node, *tmp;
+       struct stat_session *node, *tmp;
 
        mutex_lock(&all_stat_sessions_mutex);
        list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
index 202274cf7f3d06d1145437ab8cd27d43eaf90e04..f3546a2cd826bc6ae0ecb85c3c631bd411884aab 100644 (file)
@@ -12,7 +12,7 @@ struct tracer_stat {
        /* The name of your stat file */
        const char              *name;
        /* Iteration over statistic entries */
-       void                    *(*stat_start)(void);
+       void                    *(*stat_start)(struct tracer_stat *trace);
        void                    *(*stat_next)(void *prev, int idx);
        /* Compare two entries for stats sorting */
        int                     (*stat_cmp)(void *p1, void *p2);
index 91fd19c2149f5c57e37e808286bc4a3f1929510e..e04b76cc238a69816cd96b57146afbc97cc36da1 100644 (file)
@@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = {
 
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 {
-       struct dentry *entry;
 
-       entry = debugfs_create_file("sysprof_sample_period", 0644,
+       trace_create_file("sysprof_sample_period", 0644,
                        d_tracer, NULL, &sysprof_sample_fops);
-       if (entry)
-               return;
-       pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
index 797201e4a1376af5987bbce519398b14de3aeada..97fcea4acce156228e29490345373938d92277d8 100644 (file)
@@ -6,7 +6,7 @@
  */
 
 
-#include <trace/workqueue.h>
+#include <trace/events/workqueue.h>
 #include <linux/list.h>
 #include <linux/percpu.h>
 #include "trace_stat.h"
@@ -16,8 +16,6 @@
 /* A cpu workqueue thread */
 struct cpu_workqueue_stats {
        struct list_head            list;
-/* Useful to know if we print the cpu headers */
-       bool                        first_entry;
        int                         cpu;
        pid_t                       pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
@@ -47,12 +45,11 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
                          struct work_struct *work)
 {
        int cpu = cpumask_first(&wq_thread->cpus_allowed);
-       struct cpu_workqueue_stats *node, *next;
+       struct cpu_workqueue_stats *node;
        unsigned long flags;
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-                                                       list) {
+       list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
                if (node->pid == wq_thread->pid) {
                        atomic_inc(&node->inserted);
                        goto found;
@@ -69,12 +66,11 @@ probe_workqueue_execution(struct task_struct *wq_thread,
                          struct work_struct *work)
 {
        int cpu = cpumask_first(&wq_thread->cpus_allowed);
-       struct cpu_workqueue_stats *node, *next;
+       struct cpu_workqueue_stats *node;
        unsigned long flags;
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
-                                                       list) {
+       list_for_each_entry(node, &workqueue_cpu_stat(cpu)->list, list) {
                if (node->pid == wq_thread->pid) {
                        node->executed++;
                        goto found;
@@ -105,8 +101,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
        cws->pid = wq_thread->pid;
 
        spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       if (list_empty(&workqueue_cpu_stat(cpu)->list))
-               cws->first_entry = true;
        list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
        spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
@@ -152,7 +146,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
        return ret;
 }
 
-static void *workqueue_stat_start(void)
+static void *workqueue_stat_start(struct tracer_stat *trace)
 {
        int cpu;
        void *ret = NULL;
@@ -191,16 +185,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 static int workqueue_stat_show(struct seq_file *s, void *p)
 {
        struct cpu_workqueue_stats *cws = p;
-       unsigned long flags;
-       int cpu = cws->cpu;
        struct pid *pid;
        struct task_struct *tsk;
 
-       spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-       if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-               seq_printf(s, "\n");
-       spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
        pid = find_get_pid(cws->pid);
        if (pid) {
                tsk = get_pid_task(pid, PIDTYPE_PID);
index 42a2dbc181c89ca708360d84ee53a5ca21640b44..ea7c3b4275cf362f75cee939aa7861f7673b95cc 100644 (file)
@@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
        if (!list_empty(&wait->task_list))
                list_del_init(&wait->task_list);
        else if (waitqueue_active(q))
-               __wake_up_common(q, mode, 1, 0, key);
+               __wake_up_locked_key(q, mode, key);
        spin_unlock_irqrestore(&q->lock, flags);
 }
 EXPORT_SYMBOL(abort_exclusive_wait);
index f71fb2a089503534eec8d8790f82d9702a3545cc..0668795d8818477683d184edff057f6fe19144ad 100644 (file)
@@ -33,7 +33,8 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
-#include <trace/workqueue.h>
+#define CREATE_TRACE_POINTS
+#include <trace/events/workqueue.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -124,8 +125,6 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
        return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
-DEFINE_TRACE(workqueue_insertion);
-
 static void insert_work(struct cpu_workqueue_struct *cwq,
                        struct work_struct *work, struct list_head *head)
 {
@@ -262,8 +261,6 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
-DEFINE_TRACE(workqueue_execution);
-
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
        spin_lock_irq(&cwq->lock);
@@ -753,8 +750,6 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
        return cwq;
 }
 
-DEFINE_TRACE(workqueue_creation);
-
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
        struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -860,8 +855,6 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
-DEFINE_TRACE(workqueue_destruction);
-
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
        /*
index 8ade0a7a91e09ae11e4921338d5363ee74803d05..9960be04cbbe608cad81733096dd49d73928dbba 100644 (file)
@@ -10,6 +10,9 @@ menu "Library routines"
 config BITREVERSE
        tristate
 
+config RATIONAL
+       boolean
+
 config GENERIC_FIND_FIRST_BIT
        bool
 
index 33a40e40e3ee44807b0347add1a81aaf0390d7a7..1f6edefebffe3f7fd94d0348b97d0dd6d80a9dc1 100644 (file)
@@ -50,6 +50,7 @@ ifneq ($(CONFIG_HAVE_DEC_LOCK),y)
 endif
 
 obj-$(CONFIG_BITREVERSE) += bitrev.o
+obj-$(CONFIG_RATIONAL) += rational.o
 obj-$(CONFIG_CRC_CCITT)        += crc-ccitt.o
 obj-$(CONFIG_CRC16)    += crc16.o
 obj-$(CONFIG_CRC_T10DIF)+= crc-t10dif.o
index 69da09a085a1943ea6a70f4871fd2df1683110fb..ad65fc0317d93b6c1dd20171e625e7e619631d21 100644 (file)
 #include <linux/dma-debug.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
+#include <linux/uaccess.h>
 #include <linux/device.h>
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/ctype.h>
 #include <linux/list.h>
 #include <linux/slab.h>
 
@@ -85,6 +87,7 @@ static u32 show_num_errors = 1;
 
 static u32 num_free_entries;
 static u32 min_free_entries;
+static u32 nr_total_entries;
 
 /* number of preallocated entries requested by kernel cmdline */
 static u32 req_entries;
@@ -97,6 +100,16 @@ static struct dentry *show_all_errors_dent  __read_mostly;
 static struct dentry *show_num_errors_dent  __read_mostly;
 static struct dentry *num_free_entries_dent __read_mostly;
 static struct dentry *min_free_entries_dent __read_mostly;
+static struct dentry *filter_dent           __read_mostly;
+
+/* per-driver filter related state */
+
+#define NAME_MAX_LEN   64
+
+static char                  current_driver_name[NAME_MAX_LEN] __read_mostly;
+static struct device_driver *current_driver                    __read_mostly;
+
+static DEFINE_RWLOCK(driver_name_lock);
 
 static const char *type2name[4] = { "single", "page",
                                    "scather-gather", "coherent" };
@@ -104,6 +117,11 @@ static const char *type2name[4] = { "single", "page",
 static const char *dir2name[4] = { "DMA_BIDIRECTIONAL", "DMA_TO_DEVICE",
                                   "DMA_FROM_DEVICE", "DMA_NONE" };
 
+/* little merge helper - remove it after the merge window */
+#ifndef BUS_NOTIFY_UNBOUND_DRIVER
+#define BUS_NOTIFY_UNBOUND_DRIVER 0x0005
+#endif
+
 /*
  * The access to some variables in this macro is racy. We can't use atomic_t
  * here because all these variables are exported to debugfs. Some of them even
@@ -121,15 +139,54 @@ static inline void dump_entry_trace(struct dma_debug_entry *entry)
 {
 #ifdef CONFIG_STACKTRACE
        if (entry) {
-               printk(KERN_WARNING "Mapped at:\n");
+               pr_warning("Mapped at:\n");
                print_stack_trace(&entry->stacktrace, 0);
        }
 #endif
 }
 
+static bool driver_filter(struct device *dev)
+{
+       struct device_driver *drv;
+       unsigned long flags;
+       bool ret;
+
+       /* driver filter off */
+       if (likely(!current_driver_name[0]))
+               return true;
+
+       /* driver filter on and initialized */
+       if (current_driver && dev->driver == current_driver)
+               return true;
+
+       if (current_driver || !current_driver_name[0])
+               return false;
+
+       /* driver filter on but not yet initialized */
+       drv = get_driver(dev->driver);
+       if (!drv)
+               return false;
+
+       /* lock to protect against change of current_driver_name */
+       read_lock_irqsave(&driver_name_lock, flags);
+
+       ret = false;
+       if (drv->name &&
+           strncmp(current_driver_name, drv->name, NAME_MAX_LEN - 1) == 0) {
+               current_driver = drv;
+               ret = true;
+       }
+
+       read_unlock_irqrestore(&driver_name_lock, flags);
+       put_driver(drv);
+
+       return ret;
+}
+
 #define err_printk(dev, entry, format, arg...) do {            \
                error_count += 1;                               \
-               if (show_all_errors || show_num_errors > 0) {   \
+               if (driver_filter(dev) &&                       \
+                   (show_all_errors || show_num_errors > 0)) { \
                        WARN(1, "%s %s: " format,               \
                             dev_driver_string(dev),            \
                             dev_name(dev) , ## arg);           \
@@ -185,15 +242,50 @@ static void put_hash_bucket(struct hash_bucket *bucket,
 static struct dma_debug_entry *hash_bucket_find(struct hash_bucket *bucket,
                                                struct dma_debug_entry *ref)
 {
-       struct dma_debug_entry *entry;
+       struct dma_debug_entry *entry, *ret = NULL;
+       int matches = 0, match_lvl, last_lvl = 0;
 
        list_for_each_entry(entry, &bucket->list, list) {
-               if ((entry->dev_addr == ref->dev_addr) &&
-                   (entry->dev == ref->dev))
+               if ((entry->dev_addr != ref->dev_addr) ||
+                   (entry->dev != ref->dev))
+                       continue;
+
+               /*
+                * Some drivers map the same physical address multiple
+                * times. Without a hardware IOMMU this results in the
+                * same device addresses being put into the dma-debug
+                * hash multiple times too. This can result in false
+                * positives being reported. Therefore we implement a
+                * best-fit algorithm here which returns the entry from
+                * the hash which fits best to the reference value
+                * instead of the first-fit.
+                */
+               matches += 1;
+               match_lvl = 0;
+               entry->size      == ref->size      ? ++match_lvl : match_lvl;
+               entry->type      == ref->type      ? ++match_lvl : match_lvl;
+               entry->direction == ref->direction ? ++match_lvl : match_lvl;
+
+               if (match_lvl == 3) {
+                       /* perfect-fit - return the result */
                        return entry;
+               } else if (match_lvl > last_lvl) {
+                       /*
+                        * We found an entry that fits better than the
+                        * previous one
+                        */
+                       last_lvl = match_lvl;
+                       ret      = entry;
+               }
        }
 
-       return NULL;
+       /*
+        * If we have multiple matches but no perfect-fit, just return
+        * NULL.
+        */
+       ret = (matches == 1) ? ret : NULL;
+
+       return ret;
 }
 
 /*
@@ -257,6 +349,21 @@ static void add_dma_entry(struct dma_debug_entry *entry)
        put_hash_bucket(bucket, &flags);
 }
 
+static struct dma_debug_entry *__dma_entry_alloc(void)
+{
+       struct dma_debug_entry *entry;
+
+       entry = list_entry(free_entries.next, struct dma_debug_entry, list);
+       list_del(&entry->list);
+       memset(entry, 0, sizeof(*entry));
+
+       num_free_entries -= 1;
+       if (num_free_entries < min_free_entries)
+               min_free_entries = num_free_entries;
+
+       return entry;
+}
+
 /* struct dma_entry allocator
  *
  * The next two functions implement the allocator for
@@ -270,15 +377,12 @@ static struct dma_debug_entry *dma_entry_alloc(void)
        spin_lock_irqsave(&free_entries_lock, flags);
 
        if (list_empty(&free_entries)) {
-               printk(KERN_ERR "DMA-API: debugging out of memory "
-                               "- disabling\n");
+               pr_err("DMA-API: debugging out of memory - disabling\n");
                global_disable = true;
                goto out;
        }
 
-       entry = list_entry(free_entries.next, struct dma_debug_entry, list);
-       list_del(&entry->list);
-       memset(entry, 0, sizeof(*entry));
+       entry = __dma_entry_alloc();
 
 #ifdef CONFIG_STACKTRACE
        entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES;
@@ -286,9 +390,6 @@ static struct dma_debug_entry *dma_entry_alloc(void)
        entry->stacktrace.skip = 2;
        save_stack_trace(&entry->stacktrace);
 #endif
-       num_free_entries -= 1;
-       if (num_free_entries < min_free_entries)
-               min_free_entries = num_free_entries;
 
 out:
        spin_unlock_irqrestore(&free_entries_lock, flags);
@@ -310,6 +411,53 @@ static void dma_entry_free(struct dma_debug_entry *entry)
        spin_unlock_irqrestore(&free_entries_lock, flags);
 }
 
+int dma_debug_resize_entries(u32 num_entries)
+{
+       int i, delta, ret = 0;
+       unsigned long flags;
+       struct dma_debug_entry *entry;
+       LIST_HEAD(tmp);
+
+       spin_lock_irqsave(&free_entries_lock, flags);
+
+       if (nr_total_entries < num_entries) {
+               delta = num_entries - nr_total_entries;
+
+               spin_unlock_irqrestore(&free_entries_lock, flags);
+
+               for (i = 0; i < delta; i++) {
+                       entry = kzalloc(sizeof(*entry), GFP_KERNEL);
+                       if (!entry)
+                               break;
+
+                       list_add_tail(&entry->list, &tmp);
+               }
+
+               spin_lock_irqsave(&free_entries_lock, flags);
+
+               list_splice(&tmp, &free_entries);
+               nr_total_entries += i;
+               num_free_entries += i;
+       } else {
+               delta = nr_total_entries - num_entries;
+
+               for (i = 0; i < delta && !list_empty(&free_entries); i++) {
+                       entry = __dma_entry_alloc();
+                       kfree(entry);
+               }
+
+               nr_total_entries -= i;
+       }
+
+       if (nr_total_entries != num_entries)
+               ret = 1;
+
+       spin_unlock_irqrestore(&free_entries_lock, flags);
+
+       return ret;
+}
+EXPORT_SYMBOL(dma_debug_resize_entries);
+
 /*
  * DMA-API debugging init code
  *
@@ -334,8 +482,7 @@ static int prealloc_memory(u32 num_entries)
        num_free_entries = num_entries;
        min_free_entries = num_entries;
 
-       printk(KERN_INFO "DMA-API: preallocated %d debug entries\n",
-                       num_entries);
+       pr_info("DMA-API: preallocated %d debug entries\n", num_entries);
 
        return 0;
 
@@ -349,11 +496,102 @@ out_err:
        return -ENOMEM;
 }
 
+static ssize_t filter_read(struct file *file, char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+       char buf[NAME_MAX_LEN + 1];
+       unsigned long flags;
+       int len;
+
+       if (!current_driver_name[0])
+               return 0;
+
+       /*
+        * We can't copy to userspace directly because current_driver_name can
+        * only be read under the driver_name_lock with irqs disabled. So
+        * create a temporary copy first.
+        */
+       read_lock_irqsave(&driver_name_lock, flags);
+       len = scnprintf(buf, NAME_MAX_LEN + 1, "%s\n", current_driver_name);
+       read_unlock_irqrestore(&driver_name_lock, flags);
+
+       return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+static ssize_t filter_write(struct file *file, const char __user *userbuf,
+                           size_t count, loff_t *ppos)
+{
+       char buf[NAME_MAX_LEN];
+       unsigned long flags;
+       size_t len;
+       int i;
+
+       /*
+        * We can't copy from userspace directly. Access to
+        * current_driver_name is protected with a write_lock with irqs
+        * disabled. Since copy_from_user can fault and may sleep we
+        * need to copy to a temporary buffer first.
+        */
+       len = min(count, (size_t)(NAME_MAX_LEN - 1));
+       if (copy_from_user(buf, userbuf, len))
+               return -EFAULT;
+
+       buf[len] = 0;
+
+       write_lock_irqsave(&driver_name_lock, flags);
+
+       /*
+        * Now handle the string we got from userspace very carefully.
+        * The rules are:
+        *         - only use the first token we got
+        *         - token delimiter is everything looking like a space
+        *           character (' ', '\n', '\t' ...)
+        *
+        */
+       if (!isalnum(buf[0])) {
+               /*
+                * If the first character userspace gave us is not
+                * alphanumerical then assume the filter should be
+                * switched off.
+                */
+               if (current_driver_name[0])
+                       pr_info("DMA-API: switching off dma-debug driver filter\n");
+               current_driver_name[0] = 0;
+               current_driver = NULL;
+               goto out_unlock;
+       }
+
+       /*
+        * Now parse out the first token and use it as the name for the
+        * driver to filter for.
+        */
+       for (i = 0; i < NAME_MAX_LEN; ++i) {
+               current_driver_name[i] = buf[i];
+               if (isspace(buf[i]) || buf[i] == ' ' || buf[i] == 0)
+                       break;
+       }
+       current_driver_name[i] = 0;
+       current_driver = NULL;
+
+       pr_info("DMA-API: enable driver filter for driver [%s]\n",
+               current_driver_name);
+
+out_unlock:
+       write_unlock_irqrestore(&driver_name_lock, flags);
+
+       return count;
+}
+
+const struct file_operations filter_fops = {
+       .read  = filter_read,
+       .write = filter_write,
+};
+
 static int dma_debug_fs_init(void)
 {
        dma_debug_dent = debugfs_create_dir("dma-api", NULL);
        if (!dma_debug_dent) {
-               printk(KERN_ERR "DMA-API: can not create debugfs directory\n");
+               pr_err("DMA-API: can not create debugfs directory\n");
                return -ENOMEM;
        }
 
@@ -392,6 +630,11 @@ static int dma_debug_fs_init(void)
        if (!min_free_entries_dent)
                goto out_err;
 
+       filter_dent = debugfs_create_file("driver_filter", 0644,
+                                         dma_debug_dent, NULL, &filter_fops);
+       if (!filter_dent)
+               goto out_err;
+
        return 0;
 
 out_err:
@@ -400,9 +643,64 @@ out_err:
        return -ENOMEM;
 }
 
+static int device_dma_allocations(struct device *dev)
+{
+       struct dma_debug_entry *entry;
+       unsigned long flags;
+       int count = 0, i;
+
+       local_irq_save(flags);
+
+       for (i = 0; i < HASH_SIZE; ++i) {
+               spin_lock(&dma_entry_hash[i].lock);
+               list_for_each_entry(entry, &dma_entry_hash[i].list, list) {
+                       if (entry->dev == dev)
+                               count += 1;
+               }
+               spin_unlock(&dma_entry_hash[i].lock);
+       }
+
+       local_irq_restore(flags);
+
+       return count;
+}
+
+static int dma_debug_device_change(struct notifier_block *nb,
+                                   unsigned long action, void *data)
+{
+       struct device *dev = data;
+       int count;
+
+
+       switch (action) {
+       case BUS_NOTIFY_UNBOUND_DRIVER:
+               count = device_dma_allocations(dev);
+               if (count == 0)
+                       break;
+               err_printk(dev, NULL, "DMA-API: device driver has pending "
+                               "DMA allocations while released from device "
+                               "[count=%d]\n", count);
+               break;
+       default:
+               break;
+       }
+
+       return 0;
+}
+
 void dma_debug_add_bus(struct bus_type *bus)
 {
-       /* FIXME: register notifier */
+       struct notifier_block *nb;
+
+       nb = kzalloc(sizeof(struct notifier_block), GFP_KERNEL);
+       if (nb == NULL) {
+               pr_err("dma_debug_add_bus: out of memory\n");
+               return;
+       }
+
+       nb->notifier_call = dma_debug_device_change;
+
+       bus_register_notifier(bus, nb);
 }
 
 /*
@@ -421,8 +719,7 @@ void dma_debug_init(u32 num_entries)
        }
 
        if (dma_debug_fs_init() != 0) {
-               printk(KERN_ERR "DMA-API: error creating debugfs entries "
-                               "- disabling\n");
+               pr_err("DMA-API: error creating debugfs entries - disabling\n");
                global_disable = true;
 
                return;
@@ -432,14 +729,15 @@ void dma_debug_init(u32 num_entries)
                num_entries = req_entries;
 
        if (prealloc_memory(num_entries) != 0) {
-               printk(KERN_ERR "DMA-API: debugging out of memory error "
-                               "- disabled\n");
+               pr_err("DMA-API: debugging out of memory error - disabled\n");
                global_disable = true;
 
                return;
        }
 
-       printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n");
+       nr_total_entries = num_free_entries;
+
+       pr_info("DMA-API: debugging enabled by kernel config\n");
 }
 
 static __init int dma_debug_cmdline(char *str)
@@ -448,8 +746,7 @@ static __init int dma_debug_cmdline(char *str)
                return -EINVAL;
 
        if (strncmp(str, "off", 3) == 0) {
-               printk(KERN_INFO "DMA-API: debugging disabled on kernel "
-                                "command line\n");
+               pr_info("DMA-API: debugging disabled on kernel command line\n");
                global_disable = true;
        }
 
@@ -723,15 +1020,15 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
                entry->type           = dma_debug_sg;
                entry->dev            = dev;
                entry->paddr          = sg_phys(s);
-               entry->size           = s->length;
-               entry->dev_addr       = s->dma_address;
+               entry->size           = sg_dma_len(s);
+               entry->dev_addr       = sg_dma_address(s);
                entry->direction      = direction;
                entry->sg_call_ents   = nents;
                entry->sg_mapped_ents = mapped_ents;
 
                if (!PageHighMem(sg_page(s))) {
                        check_for_stack(dev, sg_virt(s));
-                       check_for_illegal_area(dev, sg_virt(s), s->length);
+                       check_for_illegal_area(dev, sg_virt(s), sg_dma_len(s));
                }
 
                add_dma_entry(entry);
@@ -739,13 +1036,33 @@ void debug_dma_map_sg(struct device *dev, struct scatterlist *sg,
 }
 EXPORT_SYMBOL(debug_dma_map_sg);
 
+static int get_nr_mapped_entries(struct device *dev, struct scatterlist *s)
+{
+       struct dma_debug_entry *entry, ref;
+       struct hash_bucket *bucket;
+       unsigned long flags;
+       int mapped_ents;
+
+       ref.dev      = dev;
+       ref.dev_addr = sg_dma_address(s);
+       ref.size     = sg_dma_len(s);
+
+       bucket       = get_hash_bucket(&ref, &flags);
+       entry        = hash_bucket_find(bucket, &ref);
+       mapped_ents  = 0;
+
+       if (entry)
+               mapped_ents = entry->sg_mapped_ents;
+       put_hash_bucket(bucket, &flags);
+
+       return mapped_ents;
+}
+
 void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                        int nelems, int dir)
 {
-       struct dma_debug_entry *entry;
        struct scatterlist *s;
        int mapped_ents = 0, i;
-       unsigned long flags;
 
        if (unlikely(global_disable))
                return;
@@ -756,8 +1073,8 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                        .type           = dma_debug_sg,
                        .dev            = dev,
                        .paddr          = sg_phys(s),
-                       .dev_addr       = s->dma_address,
-                       .size           = s->length,
+                       .dev_addr       = sg_dma_address(s),
+                       .size           = sg_dma_len(s),
                        .direction      = dir,
                        .sg_call_ents   = 0,
                };
@@ -765,14 +1082,9 @@ void debug_dma_unmap_sg(struct device *dev, struct scatterlist *sglist,
                if (mapped_ents && i >= mapped_ents)
                        break;
 
-               if (mapped_ents == 0) {
-                       struct hash_bucket *bucket;
+               if (!i) {
                        ref.sg_call_ents = nelems;
-                       bucket = get_hash_bucket(&ref, &flags);
-                       entry = hash_bucket_find(bucket, &ref);
-                       if (entry)
-                               mapped_ents = entry->sg_mapped_ents;
-                       put_hash_bucket(bucket, &flags);
+                       mapped_ents = get_nr_mapped_entries(dev, s);
                }
 
                check_unmap(&ref);
@@ -874,14 +1186,20 @@ void debug_dma_sync_sg_for_cpu(struct device *dev, struct scatterlist *sg,
                               int nelems, int direction)
 {
        struct scatterlist *s;
-       int i;
+       int mapped_ents = 0, i;
 
        if (unlikely(global_disable))
                return;
 
        for_each_sg(sg, s, nelems, i) {
-               check_sync(dev, s->dma_address, s->dma_length, 0,
-                               direction, true);
+               if (!i)
+                       mapped_ents = get_nr_mapped_entries(dev, s);
+
+               if (i >= mapped_ents)
+                       break;
+
+               check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
+                          direction, true);
        }
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_cpu);
@@ -890,15 +1208,39 @@ void debug_dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg,
                                  int nelems, int direction)
 {
        struct scatterlist *s;
-       int i;
+       int mapped_ents = 0, i;
 
        if (unlikely(global_disable))
                return;
 
        for_each_sg(sg, s, nelems, i) {
-               check_sync(dev, s->dma_address, s->dma_length, 0,
-                               direction, false);
+               if (!i)
+                       mapped_ents = get_nr_mapped_entries(dev, s);
+
+               if (i >= mapped_ents)
+                       break;
+
+               check_sync(dev, sg_dma_address(s), sg_dma_len(s), 0,
+                          direction, false);
        }
 }
 EXPORT_SYMBOL(debug_dma_sync_sg_for_device);
 
+static int __init dma_debug_driver_setup(char *str)
+{
+       int i;
+
+       for (i = 0; i < NAME_MAX_LEN - 1; ++i, ++str) {
+               current_driver_name[i] = *str;
+               if (*str == 0)
+                       break;
+       }
+
+       if (current_driver_name[0])
+               pr_info("DMA-API: enable driver filter for driver [%s]\n",
+                       current_driver_name);
+
+
+       return 1;
+}
+__setup("dma_debug_driver=", dma_debug_driver_setup);
diff --git a/lib/rational.c b/lib/rational.c
new file mode 100644 (file)
index 0000000..b3c099b
--- /dev/null
@@ -0,0 +1,62 @@
+/*
+ * rational fractions
+ *
+ * Copyright (C) 2009 emlix GmbH, Oskar Schirmer <os@emlix.com>
+ *
+ * helper functions when coping with rational numbers
+ */
+
+#include <linux/rational.h>
+
+/*
+ * calculate best rational approximation for a given fraction
+ * taking into account restricted register size, e.g. to find
+ * appropriate values for a pll with 5 bit denominator and
+ * 8 bit numerator register fields, trying to set up with a
+ * frequency ratio of 3.1415, one would say:
+ *
+ * rational_best_approximation(31415, 10000,
+ *             (1 << 8) - 1, (1 << 5) - 1, &n, &d);
+ *
+ * you may look at given_numerator as a fixed point number,
+ * with the fractional part size described in given_denominator.
+ *
+ * for theoretical background, see:
+ * http://en.wikipedia.org/wiki/Continued_fraction
+ */
+
+void rational_best_approximation(
+       unsigned long given_numerator, unsigned long given_denominator,
+       unsigned long max_numerator, unsigned long max_denominator,
+       unsigned long *best_numerator, unsigned long *best_denominator)
+{
+       unsigned long n, d, n0, d0, n1, d1;
+       n = given_numerator;
+       d = given_denominator;
+       n0 = d1 = 0;
+       n1 = d0 = 1;
+       for (;;) {
+               unsigned long t, a;
+               if ((n1 > max_numerator) || (d1 > max_denominator)) {
+                       n1 = n0;
+                       d1 = d0;
+                       break;
+               }
+               if (d == 0)
+                       break;
+               t = d;
+               a = n / d;
+               d = n % d;
+               n = t;
+               t = n0 + a * n1;
+               n0 = n1;
+               n1 = t;
+               t = d0 + a * d1;
+               d0 = d1;
+               d1 = t;
+       }
+       *best_numerator = n1;
+       *best_denominator = d1;
+}
+
+EXPORT_SYMBOL(rational_best_approximation);
index 2b0b5a7d2ced165b1e2b83e2df888c237349f23f..bffe6d7ef9d9a01a52db11e12e54207bc9b41a0c 100644 (file)
@@ -60,8 +60,8 @@ enum dma_sync_target {
 int swiotlb_force;
 
 /*
- * Used to do a quick range check in swiotlb_unmap_single and
- * swiotlb_sync_single_*, to see if the memory was in fact allocated by this
+ * Used to do a quick range check in unmap_single and
+ * sync_single_*, to see if the memory was in fact allocated by this
  * API.
  */
 static char *io_tlb_start, *io_tlb_end;
@@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr)
        return paddr;
 }
 
-phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr)
+phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr)
 {
        return baddr;
 }
@@ -140,9 +140,15 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev,
        return swiotlb_phys_to_bus(hwdev, virt_to_phys(address));
 }
 
-static void *swiotlb_bus_to_virt(dma_addr_t address)
+void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address)
 {
-       return phys_to_virt(swiotlb_bus_to_phys(address));
+       return phys_to_virt(swiotlb_bus_to_phys(hwdev, address));
+}
+
+int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev,
+                                              dma_addr_t addr, size_t size)
+{
+       return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
 }
 
 int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -309,10 +315,10 @@ cleanup1:
        return -ENOMEM;
 }
 
-static int
+static inline int
 address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size)
 {
-       return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size);
+       return swiotlb_arch_address_needs_mapping(hwdev, addr, size);
 }
 
 static inline int range_needs_mapping(phys_addr_t paddr, size_t size)
@@ -341,7 +347,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size,
                unsigned long flags;
 
                while (size) {
-                       sz = min(PAGE_SIZE - offset, size);
+                       sz = min_t(size_t, PAGE_SIZE - offset, size);
 
                        local_irq_save(flags);
                        buffer = kmap_atomic(pfn_to_page(pfn),
@@ -476,7 +482,7 @@ found:
  * dma_addr is the kernel virtual address of the bounce buffer to unmap.
  */
 static void
-unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
+do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir)
 {
        unsigned long flags;
        int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT;
@@ -560,7 +566,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                                   size)) {
                /*
                 * The allocated memory isn't reachable by the device.
-                * Fall back on swiotlb_map_single().
                 */
                free_pages((unsigned long) ret, order);
                ret = NULL;
@@ -568,9 +573,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
        if (!ret) {
                /*
                 * We are either out of memory or the device can't DMA
-                * to GFP_DMA memory; fall back on
-                * swiotlb_map_single(), which will grab memory from
-                * the lowest available address range.
+                * to GFP_DMA memory; fall back on map_single(), which
+                * will grab memory from the lowest available address range.
                 */
                ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE);
                if (!ret)
@@ -587,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size,
                       (unsigned long long)dev_addr);
 
                /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-               unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
+               do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE);
                return NULL;
        }
        *dma_handle = dev_addr;
@@ -604,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr,
                free_pages((unsigned long) vaddr, get_order(size));
        else
                /* DMA_TO_DEVICE to avoid memcpy in unmap_single */
-               unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
+               do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE);
 }
 EXPORT_SYMBOL(swiotlb_free_coherent);
 
@@ -634,7 +638,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic)
  * physical address to use is returned.
  *
  * Once the device is given the dma address, the device owns this memory until
- * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed.
+ * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed.
  */
 dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
                            unsigned long offset, size_t size,
@@ -642,18 +646,17 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page,
                            struct dma_attrs *attrs)
 {
        phys_addr_t phys = page_to_phys(page) + offset;
-       void *ptr = page_address(page) + offset;
        dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys);
        void *map;
 
        BUG_ON(dir == DMA_NONE);
        /*
-        * If the pointer passed in happens to be in the device's DMA window,
+        * If the address happens to be in the device's DMA window,
         * we can safely return the device addr and not worry about bounce
         * buffering it.
         */
        if (!address_needs_mapping(dev, dev_addr, size) &&
-           !range_needs_mapping(virt_to_phys(ptr), size))
+           !range_needs_mapping(phys, size))
                return dev_addr;
 
        /*
@@ -679,23 +682,35 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page);
 
 /*
  * Unmap a single streaming mode DMA translation.  The dma_addr and size must
- * match what was provided for in a previous swiotlb_map_single call.  All
+ * match what was provided for in a previous swiotlb_map_page call.  All
  * other usages are undefined.
  *
  * After this call, reads by the cpu to the buffer are guaranteed to see
  * whatever the device wrote there.
  */
+static void unmap_single(struct device *hwdev, dma_addr_t dev_addr,
+                        size_t size, int dir)
+{
+       char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
+
+       BUG_ON(dir == DMA_NONE);
+
+       if (is_swiotlb_buffer(dma_addr)) {
+               do_unmap_single(hwdev, dma_addr, size, dir);
+               return;
+       }
+
+       if (dir != DMA_FROM_DEVICE)
+               return;
+
+       dma_mark_clean(dma_addr, size);
+}
+
 void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr,
                        size_t size, enum dma_data_direction dir,
                        struct dma_attrs *attrs)
 {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr);
-
-       BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
-               unmap_single(hwdev, dma_addr, size, dir);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+       unmap_single(hwdev, dev_addr, size, dir);
 }
 EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
 
@@ -703,7 +718,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page);
  * Make physical memory consistent for a single streaming mode DMA translation
  * after a transfer.
  *
- * If you perform a swiotlb_map_single() but wish to interrogate the buffer
+ * If you perform a swiotlb_map_page() but wish to interrogate the buffer
  * using the cpu, yet do not wish to teardown the dma mapping, you must
  * call this function before doing so.  At the next point you give the dma
  * address back to the card, you must first perform a
@@ -713,13 +728,19 @@ static void
 swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr,
                    size_t size, int dir, int target)
 {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr);
+       char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr);
 
        BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
+
+       if (is_swiotlb_buffer(dma_addr)) {
                sync_single(hwdev, dma_addr, size, dir, target);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+               return;
+       }
+
+       if (dir != DMA_FROM_DEVICE)
+               return;
+
+       dma_mark_clean(dma_addr, size);
 }
 
 void
@@ -746,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr,
                          unsigned long offset, size_t size,
                          int dir, int target)
 {
-       char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset;
-
-       BUG_ON(dir == DMA_NONE);
-       if (is_swiotlb_buffer(dma_addr))
-               sync_single(hwdev, dma_addr, size, dir, target);
-       else if (dir == DMA_FROM_DEVICE)
-               dma_mark_clean(dma_addr, size);
+       swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target);
 }
 
 void
@@ -777,7 +792,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
 
 /*
  * Map a set of buffers described by scatterlist in streaming mode for DMA.
- * This is the scatter-gather version of the above swiotlb_map_single
+ * This is the scatter-gather version of the above swiotlb_map_page
  * interface.  Here the scatter gather list elements are each tagged with the
  * appropriate dma address and length.  They are obtained via
  * sg_dma_{address,length}(SG).
@@ -788,7 +803,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device);
  *       The routine returns the number of addr/length pairs actually
  *       used, at most nents.
  *
- * Device ownership issues as mentioned above for swiotlb_map_single are the
+ * Device ownership issues as mentioned above for swiotlb_map_page are the
  * same here.
  */
 int
@@ -836,7 +851,7 @@ EXPORT_SYMBOL(swiotlb_map_sg);
 
 /*
  * Unmap a set of streaming mode DMA translations.  Again, cpu read rules
- * concerning calls here are the same as for swiotlb_unmap_single() above.
+ * concerning calls here are the same as for swiotlb_unmap_page() above.
  */
 void
 swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
@@ -847,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl,
 
        BUG_ON(dir == DMA_NONE);
 
-       for_each_sg(sgl, sg, nelems, i) {
-               if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-                       unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
-                                    sg->dma_length, dir);
-               else if (dir == DMA_FROM_DEVICE)
-                       dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-       }
+       for_each_sg(sgl, sg, nelems, i)
+               unmap_single(hwdev, sg->dma_address, sg->dma_length, dir);
+
 }
 EXPORT_SYMBOL(swiotlb_unmap_sg_attrs);
 
@@ -879,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl,
        struct scatterlist *sg;
        int i;
 
-       BUG_ON(dir == DMA_NONE);
-
-       for_each_sg(sgl, sg, nelems, i) {
-               if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg)))
-                       sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address),
+       for_each_sg(sgl, sg, nelems, i)
+               swiotlb_sync_single(hwdev, sg->dma_address,
                                    sg->dma_length, dir, target);
-               else if (dir == DMA_FROM_DEVICE)
-                       dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length);
-       }
 }
 
 void
index 7536acea135ba4069c1dc04ad36d5024ac747d8c..756ccafa9cec04ada5c940f7031e6dd56aeb7837 100644 (file)
@@ -408,6 +408,8 @@ enum format_type {
        FORMAT_TYPE_LONG_LONG,
        FORMAT_TYPE_ULONG,
        FORMAT_TYPE_LONG,
+       FORMAT_TYPE_UBYTE,
+       FORMAT_TYPE_BYTE,
        FORMAT_TYPE_USHORT,
        FORMAT_TYPE_SHORT,
        FORMAT_TYPE_UINT,
@@ -573,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec)
 }
 
 static char *symbol_string(char *buf, char *end, void *ptr,
-                               struct printf_spec spec)
+                               struct printf_spec spec, char ext)
 {
        unsigned long value = (unsigned long) ptr;
 #ifdef CONFIG_KALLSYMS
        char sym[KSYM_SYMBOL_LEN];
-       sprint_symbol(sym, value);
+       if (ext != 'f')
+               sprint_symbol(sym, value);
+       else
+               kallsyms_lookup(value, NULL, NULL, NULL, sym);
        return string(buf, end, sym, spec);
 #else
        spec.field_width = 2*sizeof(void *);
@@ -690,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr,
  *
  * Right now we handle:
  *
- * - 'F' For symbolic function descriptor pointers
+ * - 'F' For symbolic function descriptor pointers with offset
+ * - 'f' For simple symbolic function names without offset
  * - 'S' For symbolic direct pointers
  * - 'R' For a struct resource pointer, it prints the range of
  *       addresses (not the name nor the flags)
@@ -713,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr,
 
        switch (*fmt) {
        case 'F':
+       case 'f':
                ptr = dereference_function_descriptor(ptr);
                /* Fallthrough */
        case 'S':
-               return symbol_string(buf, end, ptr, spec);
+               return symbol_string(buf, end, ptr, spec, *fmt);
        case 'R':
                return resource_string(buf, end, ptr, spec);
        case 'm':
@@ -853,11 +860,15 @@ qualifier:
        spec->qualifier = -1;
        if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' ||
            *fmt == 'Z' || *fmt == 'z' || *fmt == 't') {
-               spec->qualifier = *fmt;
-               ++fmt;
-               if (spec->qualifier == 'l' && *fmt == 'l') {
-                       spec->qualifier = 'L';
-                       ++fmt;
+               spec->qualifier = *fmt++;
+               if (unlikely(spec->qualifier == *fmt)) {
+                       if (spec->qualifier == 'l') {
+                               spec->qualifier = 'L';
+                               ++fmt;
+                       } else if (spec->qualifier == 'h') {
+                               spec->qualifier = 'H';
+                               ++fmt;
+                       }
                }
        }
 
@@ -919,6 +930,11 @@ qualifier:
                spec->type = FORMAT_TYPE_SIZE_T;
        } else if (spec->qualifier == 't') {
                spec->type = FORMAT_TYPE_PTRDIFF;
+       } else if (spec->qualifier == 'H') {
+               if (spec->flags & SIGN)
+                       spec->type = FORMAT_TYPE_BYTE;
+               else
+                       spec->type = FORMAT_TYPE_UBYTE;
        } else if (spec->qualifier == 'h') {
                if (spec->flags & SIGN)
                        spec->type = FORMAT_TYPE_SHORT;
@@ -943,7 +959,8 @@ qualifier:
  *
  * This function follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  *
  * The return value is the number of characters which would
@@ -1087,6 +1104,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args)
                        case FORMAT_TYPE_PTRDIFF:
                                num = va_arg(args, ptrdiff_t);
                                break;
+                       case FORMAT_TYPE_UBYTE:
+                               num = (unsigned char) va_arg(args, int);
+                               break;
+                       case FORMAT_TYPE_BYTE:
+                               num = (signed char) va_arg(args, int);
+                               break;
                        case FORMAT_TYPE_USHORT:
                                num = (unsigned short) va_arg(args, int);
                                break;
@@ -1363,6 +1386,10 @@ do {                                                                     \
                        case FORMAT_TYPE_PTRDIFF:
                                save_arg(ptrdiff_t);
                                break;
+                       case FORMAT_TYPE_UBYTE:
+                       case FORMAT_TYPE_BYTE:
+                               save_arg(char);
+                               break;
                        case FORMAT_TYPE_USHORT:
                        case FORMAT_TYPE_SHORT:
                                save_arg(short);
@@ -1391,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf);
  *
  * The format follows C99 vsnprintf, but has some extensions:
  * %pS output the name of a text symbol
- * %pF output the name of a function pointer
+ * %pF output the name of a function pointer with its offset
+ * %pf output the name of a function pointer without its offset
  * %pR output the address range in a struct resource
  * %n is ignored
  *
@@ -1538,6 +1566,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf)
                        case FORMAT_TYPE_PTRDIFF:
                                num = get_arg(ptrdiff_t);
                                break;
+                       case FORMAT_TYPE_UBYTE:
+                               num = get_arg(unsigned char);
+                               break;
+                       case FORMAT_TYPE_BYTE:
+                               num = get_arg(signed char);
+                               break;
                        case FORMAT_TYPE_USHORT:
                                num = get_arg(unsigned short);
                                break;
index c2b57d81e153077bec9bc8be39af8bb658537065..71830ba7b986a8e4da4b448793791bf2bc65c245 100644 (file)
@@ -226,6 +226,25 @@ config HAVE_MLOCKED_PAGE_BIT
 config MMU_NOTIFIER
        bool
 
+config DEFAULT_MMAP_MIN_ADDR
+        int "Low address space to protect from user allocation"
+        default 4096
+        help
+         This is the portion of low virtual memory which should be protected
+         from userspace allocation.  Keeping a user from writing to low pages
+         can help reduce the impact of kernel NULL pointer bugs.
+
+         For most ia64, ppc64 and x86 users with lots of address space
+         a value of 65536 is reasonable and should cause no problems.
+         On arm and other archs it should not be higher than 32768.
+         Programs which use vm86 functionality need either additional
+         permissions from the LSM or the capabilities module, or must have
+         this protection disabled.
+
+         This value can be changed after boot using the
+         /proc/sys/vm/mmap_min_addr tunable.
+
+
 config NOMMU_INITIAL_TRIM_EXCESS
        int "Turn on mmap() excess space trimming before booting"
        depends on !MMU
index e590272fe7a8f3e40acb21059bb0082f74354f26..65f5e17e411aaf78913a41129d8d63a8a7503a81 100644 (file)
 #include <linux/hash.h>
 #include <linux/highmem.h>
 #include <linux/blktrace_api.h>
-#include <trace/block.h>
 #include <asm/tlbflush.h>
 
+#include <trace/events/block.h>
+
 #define POOL_SIZE      64
 #define ISA_POOL_SIZE  16
 
 static mempool_t *page_pool, *isa_page_pool;
 
-DEFINE_TRACE(block_bio_bounce);
-
 #ifdef CONFIG_HIGHMEM
 static __init int init_emergency_pool(void)
 {
index cbe9e0581b75dcaf06335ccc017a68789d6247d2..ac130433c7d35da275b1ad081bfddd9fbf017173 100644 (file)
@@ -629,52 +629,43 @@ void user_shm_unlock(size_t size, struct user_struct *user)
        free_uid(user);
 }
 
-void *alloc_locked_buffer(size_t size)
+int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim,
+                         size_t size)
 {
-       unsigned long rlim, vm, pgsz;
-       void *buffer = NULL;
+       unsigned long lim, vm, pgsz;
+       int error = -ENOMEM;
 
        pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-       down_write(&current->mm->mmap_sem);
-
-       rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->total_vm + pgsz;
-       if (rlim < vm)
-               goto out;
+       down_write(&mm->mmap_sem);
 
-       rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
-       vm   = current->mm->locked_vm + pgsz;
-       if (rlim < vm)
+       lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT;
+       vm   = mm->total_vm + pgsz;
+       if (lim < vm)
                goto out;
 
-       buffer = kzalloc(size, GFP_KERNEL);
-       if (!buffer)
+       lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT;
+       vm   = mm->locked_vm + pgsz;
+       if (lim < vm)
                goto out;
 
-       current->mm->total_vm  += pgsz;
-       current->mm->locked_vm += pgsz;
+       mm->total_vm  += pgsz;
+       mm->locked_vm += pgsz;
 
+       error = 0;
  out:
-       up_write(&current->mm->mmap_sem);
-       return buffer;
+       up_write(&mm->mmap_sem);
+       return error;
 }
 
-void release_locked_buffer(void *buffer, size_t size)
+void refund_locked_memory(struct mm_struct *mm, size_t size)
 {
        unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT;
 
-       down_write(&current->mm->mmap_sem);
-
-       current->mm->total_vm  -= pgsz;
-       current->mm->locked_vm -= pgsz;
-
-       up_write(&current->mm->mmap_sem);
-}
+       down_write(&mm->mmap_sem);
 
-void free_locked_buffer(void *buffer, size_t size)
-{
-       release_locked_buffer(buffer, size);
+       mm->total_vm  -= pgsz;
+       mm->locked_vm -= pgsz;
 
-       kfree(buffer);
+       up_write(&mm->mmap_sem);
 }
index 6b7b1a95944bf267bd21cd981703280fe25a39e9..2b43fa1aa3c8318cbaa5e6b80da0c26944b90b4a 100644 (file)
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -87,6 +87,9 @@ int sysctl_overcommit_ratio = 50;     /* default is 50% */
 int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
 struct percpu_counter vm_committed_as;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 /*
  * Check that a process has enough memory to allocate a new virtual
  * mapping. 0 means there is enough memory for the allocation to
index b571ef707428c5e171fa27b7fc09d0212018c5df..2fd2ad5da98e5d82e751b76f4e6369c57f20e6db 100644 (file)
@@ -69,6 +69,9 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = CONFIG_NOMMU_INITIAL_TRIM_EXCESS;
 int heap_stack_gap = 0;
 
+/* amount of vm to protect from userspace access */
+unsigned long mmap_min_addr = CONFIG_DEFAULT_MMAP_MIN_ADDR;
+
 atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
index fe753ecf2aa5fd234dedb6916c333055bcfe8b36..474c7e9dd51ac66f97b027bf5b384a1bacd42da7 100644 (file)
@@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve;
   static int __meminitdata nr_nodemap_entries;
   static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
   static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-  static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES];
-  static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES];
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
   static unsigned long __initdata required_kernelcore;
   static unsigned long __initdata required_movablecore;
   static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES];
@@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid)
                                early_node_map[i].end_pfn);
 }
 
-/**
- * push_node_boundaries - Push node boundaries to at least the requested boundary
- * @nid: The nid of the node to push the boundary for
- * @start_pfn: The start pfn of the node
- * @end_pfn: The end pfn of the node
- *
- * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
- * time. Specifically, on x86_64, SRAT will report ranges that can potentially
- * be hotplugged even though no physical memory exists. This function allows
- * an arch to push out the node boundaries so mem_map is allocated that can
- * be used later.
- */
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering push_node_boundaries(%u, %lu, %lu)\n",
-                       nid, start_pfn, end_pfn);
-
-       /* Initialise the boundary for this node if necessary */
-       if (node_boundary_end_pfn[nid] == 0)
-               node_boundary_start_pfn[nid] = -1UL;
-
-       /* Update the boundaries */
-       if (node_boundary_start_pfn[nid] > start_pfn)
-               node_boundary_start_pfn[nid] = start_pfn;
-       if (node_boundary_end_pfn[nid] < end_pfn)
-               node_boundary_end_pfn[nid] = end_pfn;
-}
-
-/* If necessary, push the node boundary out for reserve hotadd */
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn)
-{
-       mminit_dprintk(MMINIT_TRACE, "zoneboundary",
-                       "Entering account_node_boundary(%u, %lu, %lu)\n",
-                       nid, *start_pfn, *end_pfn);
-
-       /* Return if boundary information has not been provided */
-       if (node_boundary_end_pfn[nid] == 0)
-               return;
-
-       /* Check the boundaries and update if necessary */
-       if (node_boundary_start_pfn[nid] < *start_pfn)
-               *start_pfn = node_boundary_start_pfn[nid];
-       if (node_boundary_end_pfn[nid] > *end_pfn)
-               *end_pfn = node_boundary_end_pfn[nid];
-}
-#else
-void __init push_node_boundaries(unsigned int nid,
-               unsigned long start_pfn, unsigned long end_pfn) {}
-
-static void __meminit account_node_boundary(unsigned int nid,
-               unsigned long *start_pfn, unsigned long *end_pfn) {}
-#endif
-
-
 /**
  * get_pfn_range_for_nid - Return the start and end page frames for a node
  * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
@@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid,
 
        if (*start_pfn == -1UL)
                *start_pfn = 0;
-
-       /* Push the node boundaries out if requested */
-       account_node_boundary(nid, start_pfn, end_pfn);
 }
 
 /*
@@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void)
 {
        memset(early_node_map, 0, sizeof(early_node_map));
        nr_nodemap_entries = 0;
-#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
-       memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
-       memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
-#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
 }
 
 /* Compare two active node_active_regions */
index 1aa5d8fbca121d434f3cb812e10569f1b8e05f82..c0b2c1a76e81c280398b27ed310b1e03500e3a23 100644 (file)
@@ -23,7 +23,7 @@
  * Allocation is done in offset-size areas of single unit space.  Ie,
  * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
  * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
- * percpu base registers UNIT_SIZE apart.
+ * percpu base registers pcpu_unit_size apart.
  *
  * There are usually many small percpu allocations many of them as
  * small as 4 bytes.  The allocator organizes chunks into lists
@@ -38,8 +38,8 @@
  * region and negative allocated.  Allocation inside a chunk is done
  * by scanning this map sequentially and serving the first matching
  * entry.  This is mostly copied from the percpu_modalloc() allocator.
- * Chunks are also linked into a rb tree to ease address to chunk
- * mapping during free.
+ * Chunks can be determined from the address using the index field
+ * in the page struct. The index field contains a pointer to the chunk.
  *
  * To use this allocator, arch code should do the followings.
  *
@@ -61,7 +61,6 @@
 #include <linux/mutex.h>
 #include <linux/percpu.h>
 #include <linux/pfn.h>
-#include <linux/rbtree.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/vmalloc.h>
@@ -88,7 +87,6 @@
 
 struct pcpu_chunk {
        struct list_head        list;           /* linked to pcpu_slot lists */
-       struct rb_node          rb_node;        /* key is chunk->vm->addr */
        int                     free_size;      /* free bytes in the chunk */
        int                     contig_hint;    /* max contiguous size hint */
        struct vm_struct        *vm;            /* mapped vmalloc region */
@@ -110,9 +108,21 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-/* optional reserved chunk, only accessible for reserved allocations */
+/*
+ * The first chunk which always exists.  Note that unlike other
+ * chunks, this one can be allocated and mapped in several different
+ * ways and thus often doesn't live in the vmalloc area.
+ */
+static struct pcpu_chunk *pcpu_first_chunk;
+
+/*
+ * Optional reserved chunk.  This chunk reserves part of the first
+ * chunk and serves it for reserved allocations.  The amount of
+ * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
+ * area doesn't exist, the following variables contain NULL and 0
+ * respectively.
+ */
 static struct pcpu_chunk *pcpu_reserved_chunk;
-/* offset limit of the reserved chunk */
 static int pcpu_reserved_chunk_limit;
 
 /*
@@ -121,7 +131,7 @@ static int pcpu_reserved_chunk_limit;
  * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
  * protects allocation/reclaim paths, chunks and chunk->page arrays.
  * The latter is a spinlock and protects the index data structures -
- * chunk slots, rbtree, chunks and area maps in chunks.
+ * chunk slots, chunks and area maps in chunks.
  *
  * During allocation, pcpu_alloc_mutex is kept locked all the time and
  * pcpu_lock is grabbed and released as necessary.  All actual memory
@@ -140,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex);      /* protects whole alloc and reclaim */
 static DEFINE_SPINLOCK(pcpu_lock);     /* protects index data structures */
 
 static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
-static struct rb_root pcpu_addr_root = RB_ROOT;        /* chunks by address */
 
 /* reclaim work to release fully free chunks, scheduled from free path */
 static void pcpu_reclaim(struct work_struct *work);
@@ -191,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
        return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
 }
 
+/*
+ * Set the pointer to a chunk in a page struct: the chunk pointer is
+ * stored, cast to unsigned long, in page->index.
+ */
+static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
+{
+       page->index = (unsigned long)pcpu;
+}
+
+/*
+ * Obtain the pointer to a chunk from a page struct: page->index is
+ * cast back from unsigned long to struct pcpu_chunk *.
+ */
+static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
+{
+       return (struct pcpu_chunk *)page->index;
+}
+
 /**
  * pcpu_mem_alloc - allocate memory
  * @size: bytes to allocate
@@ -257,93 +278,26 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
        }
 }
 
-static struct rb_node **pcpu_chunk_rb_search(void *addr,
-                                            struct rb_node **parentp)
-{
-       struct rb_node **p = &pcpu_addr_root.rb_node;
-       struct rb_node *parent = NULL;
-       struct pcpu_chunk *chunk;
-
-       while (*p) {
-               parent = *p;
-               chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
-
-               if (addr < chunk->vm->addr)
-                       p = &(*p)->rb_left;
-               else if (addr > chunk->vm->addr)
-                       p = &(*p)->rb_right;
-               else
-                       break;
-       }
-
-       if (parentp)
-               *parentp = parent;
-       return p;
-}
-
 /**
- * pcpu_chunk_addr_search - search for chunk containing specified address
- * @addr: address to search for
- *
- * Look for chunk which might contain @addr.  More specifically, it
- * searchs for the chunk with the highest start address which isn't
- * beyond @addr.
- *
- * CONTEXT:
- * pcpu_lock.
+ * pcpu_chunk_addr_search - determine chunk containing specified address
+ * @addr: address for which the chunk needs to be determined.
  *
  * RETURNS:
  * The address of the found chunk.
  */
 static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 {
-       struct rb_node *n, *parent;
-       struct pcpu_chunk *chunk;
+       void *first_start = pcpu_first_chunk->vm->addr;
 
-       /* is it in the reserved chunk? */
-       if (pcpu_reserved_chunk) {
-               void *start = pcpu_reserved_chunk->vm->addr;
-
-               if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+       /* is it in the first chunk? */
+       if (addr >= first_start && addr < first_start + pcpu_chunk_size) {
+               /* is it in the reserved area? */
+               if (addr < first_start + pcpu_reserved_chunk_limit)
                        return pcpu_reserved_chunk;
+               return pcpu_first_chunk;
        }
 
-       /* nah... search the regular ones */
-       n = *pcpu_chunk_rb_search(addr, &parent);
-       if (!n) {
-               /* no exactly matching chunk, the parent is the closest */
-               n = parent;
-               BUG_ON(!n);
-       }
-       chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-
-       if (addr < chunk->vm->addr) {
-               /* the parent was the next one, look for the previous one */
-               n = rb_prev(n);
-               BUG_ON(!n);
-               chunk = rb_entry(n, struct pcpu_chunk, rb_node);
-       }
-
-       return chunk;
-}
-
-/**
- * pcpu_chunk_addr_insert - insert chunk into address rb tree
- * @new: chunk to insert
- *
- * Insert @new into address rb tree.
- *
- * CONTEXT:
- * pcpu_lock.
- */
-static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
-{
-       struct rb_node **p, *parent;
-
-       p = pcpu_chunk_rb_search(new->vm->addr, &parent);
-       BUG_ON(*p);
-       rb_link_node(&new->rb_node, parent, p);
-       rb_insert_color(&new->rb_node, &pcpu_addr_root);
+       return pcpu_get_page_chunk(vmalloc_to_page(addr));
 }
 
 /**
@@ -755,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
                                                  alloc_mask, 0);
                        if (!*pagep)
                                goto err;
+                       pcpu_set_page_chunk(*pagep, chunk);
                }
        }
 
@@ -879,7 +834,6 @@ restart:
 
        spin_lock_irq(&pcpu_lock);
        pcpu_chunk_relocate(chunk, -1);
-       pcpu_chunk_addr_insert(chunk);
        goto restart;
 
 area_found:
@@ -968,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work)
                if (chunk == list_first_entry(head, struct pcpu_chunk, list))
                        continue;
 
-               rb_erase(&chunk->rb_node, &pcpu_addr_root);
                list_move(&chunk->list, &todo);
        }
 
@@ -1147,7 +1100,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 
        if (reserved_size) {
                schunk->free_size = reserved_size;
-               pcpu_reserved_chunk = schunk;   /* not for dynamic alloc */
+               pcpu_reserved_chunk = schunk;
+               pcpu_reserved_chunk_limit = static_size + reserved_size;
        } else {
                schunk->free_size = dyn_size;
                dyn_size = 0;                   /* dynamic area covered */
@@ -1158,8 +1112,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
        if (schunk->free_size)
                schunk->map[schunk->map_used++] = schunk->free_size;
 
-       pcpu_reserved_chunk_limit = static_size + schunk->free_size;
-
        /* init dynamic chunk if necessary */
        if (dyn_size) {
                dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
@@ -1226,13 +1178,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
        }
 
        /* link the first chunk in */
-       if (!dchunk) {
-               pcpu_chunk_relocate(schunk, -1);
-               pcpu_chunk_addr_insert(schunk);
-       } else {
-               pcpu_chunk_relocate(dchunk, -1);
-               pcpu_chunk_addr_insert(dchunk);
-       }
+       pcpu_first_chunk = dchunk ?: schunk;
+       pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
        /* we're done */
        pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
index b25f95ce3db76bb3e04658b3d6a0273729fbae55..0132fbd45a23837d5abc3cdea527a6d3cafb53a7 100644 (file)
@@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags)
        if (error)
                goto close_file;
 #endif
+       ima_counts_get(file);
        return file;
 
 close_file:
@@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma)
        if (IS_ERR(file))
                return PTR_ERR(file);
 
-       ima_shm_check(file);
        if (vma->vm_file)
                fput(vma->vm_file);
        vma->vm_file = file;
index 9a90b00d2f9140e2aaa67ecb949e33d89431e9c5..f85831da9080d22e4abe0b2e2f2405cd067a0d85 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
 #include       <linux/cpu.h>
 #include       <linux/sysctl.h>
 #include       <linux/module.h>
-#include       <trace/kmemtrace.h>
+#include       <linux/kmemtrace.h>
 #include       <linux/rcupdate.h>
 #include       <linux/string.h>
 #include       <linux/uaccess.h>
index f92e66d558bd3608f5c758d7b7c39748cf936f51..9b1737b0787bf53740fd538e697df549e140620b 100644 (file)
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -66,7 +66,7 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <asm/atomic.h>
 
 /*
index 65ffda5934b09b8220e9a00332945dc19ba88de6..5e805a6fe36c46a200bc9d1ded7224bed5fce3ba 100644 (file)
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -17,7 +17,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <trace/kmemtrace.h>
+#include <linux/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
index 55bef160b9f1484c032220b506f4ebfaeffe9a47..abc65aa7cdfc7bcfc76817afe2451cce7a4ece3e 100644 (file)
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,9 +4,11 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
-#include <linux/tracepoint.h>
 #include <asm/uaccess.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/kmem.h>
+
 /**
  * kstrdup - allocate space for and copy an existing string
  * @s: the string to duplicate
@@ -255,13 +257,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
 
 /* Tracepoints definitions. */
-DEFINE_TRACE(kmalloc);
-DEFINE_TRACE(kmem_cache_alloc);
-DEFINE_TRACE(kmalloc_node);
-DEFINE_TRACE(kmem_cache_alloc_node);
-DEFINE_TRACE(kfree);
-DEFINE_TRACE(kmem_cache_free);
-
 EXPORT_TRACEPOINT_SYMBOL(kmalloc);
 EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
 EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
index 9fd0dc3cca99f5bad5af9fdbc189c528573fcc90..b75b6cea49dab47cbca9098a259ac44e4e51b4c8 100644 (file)
@@ -23,7 +23,7 @@
 #include <linux/bitops.h>
 #include <net/genetlink.h>
 
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include <asm/unaligned.h>
 
index c8fb45665e4f4af4cf6ced67fe90fff64f74e3ed..499a67eaf3ae201262c4a2e53f20870721b8f184 100644 (file)
 #include <linux/workqueue.h>
 #include <linux/netlink.h>
 #include <linux/net_dropmon.h>
-#include <trace/skb.h>
 
 #include <asm/unaligned.h>
 #include <asm/bitops.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/skb.h>
 
-DEFINE_TRACE(kfree_skb);
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
index e505b5392e1e511b278eda64e71914750ffc6577..c2e4fb8f3546c06a39b22b8c8538ecdf7187c3e7 100644 (file)
@@ -65,7 +65,7 @@
 
 #include <asm/uaccess.h>
 #include <asm/system.h>
-#include <trace/skb.h>
+#include <trace/events/skb.h>
 
 #include "kmap_skb.h"
 
index 4b02f5a0e6560f4c886d413409d5507ffe6b0356..b75d28cba3f75fef74d90d01b2fb8990984a1c55 100644 (file)
@@ -19,6 +19,12 @@ config SAMPLE_TRACEPOINTS
        help
          This build tracepoints example modules.
 
+config SAMPLE_TRACE_EVENTS
+       tristate "Build trace_events examples -- loadable modules only"
+       depends on EVENT_TRACING && m
+       help
+         This builds the trace event example modules.
+
 config SAMPLE_KOBJECT
        tristate "Build kobject examples"
        help
index 10eaca89fe17913875f90cf0354a9478978db018..13e4b470b5399b41a8f140b935bade02511c688e 100644 (file)
@@ -1,3 +1,3 @@
 # Makefile for Linux samples code
 
-obj-$(CONFIG_SAMPLES)  += markers/ kobject/ kprobes/ tracepoints/
+obj-$(CONFIG_SAMPLES)  += markers/ kobject/ kprobes/ tracepoints/ trace_events/
diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile
new file mode 100644 (file)
index 0000000..0d428dc
--- /dev/null
@@ -0,0 +1,6 @@
+# builds the trace events example kernel modules;
+# then to use one (as root):  insmod <module_name.ko>
+
+CFLAGS_trace-events-sample.o := -I$(src)
+
+obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o
diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c
new file mode 100644 (file)
index 0000000..aabc4e9
--- /dev/null
@@ -0,0 +1,52 @@
+#include <linux/module.h>
+#include <linux/kthread.h>
+
+/*
+ * Any file that uses trace points must include the header.
+ * But exactly one file must define CREATE_TRACE_POINTS before
+ * including it.  That makes the header emit the C code that
+ * creates the handles for the trace points.
+ */
+#define CREATE_TRACE_POINTS
+#include "trace-events-sample.h"
+
+
+static void simple_thread_func(int cnt)        /* one pass of the sample thread */
+{
+       set_current_state(TASK_INTERRUPTIBLE);  /* task state is set before sleeping */
+       schedule_timeout(HZ);                   /* sleep with a timeout of HZ */
+       trace_foo_bar("hello", cnt);            /* fire the foo_bar sample tracepoint */
+}
+
+/*
+ * Thread body: call simple_thread_func() with an incrementing counter
+ * (starting at 0) until kthread_should_stop() reports true, then
+ * return 0.  @arg is unused.
+ */
+static int simple_thread(void *arg)
+{
+       int cnt;
+
+       for (cnt = 0; !kthread_should_stop(); cnt++)
+               simple_thread_func(cnt);
+
+       return 0;
+}
+
+static struct task_struct *simple_tsk;
+
+/*
+ * Module init: start the "event-sample" kernel thread running
+ * simple_thread() and remember it in simple_tsk.
+ *
+ * Returns 0 on success.  On failure kthread_run() hands back an
+ * ERR_PTR, so propagate the errno it carries rather than a bare -1
+ * (which would be reported as -EPERM).
+ */
+static int __init trace_event_init(void)
+{
+       simple_tsk = kthread_run(simple_thread, NULL, "event-sample");
+       if (IS_ERR(simple_tsk))
+               return PTR_ERR(simple_tsk);
+
+       return 0;
+}
+
+static void __exit trace_event_exit(void)      /* module exit hook */
+{
+       kthread_stop(simple_tsk);               /* stop the thread started by trace_event_init() */
+}
+
+module_init(trace_event_init);
+module_exit(trace_event_exit);
+
+MODULE_AUTHOR("Steven Rostedt");
+MODULE_DESCRIPTION("trace-events-sample");
+MODULE_LICENSE("GPL");
diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h
new file mode 100644 (file)
index 0000000..128a897
--- /dev/null
@@ -0,0 +1,129 @@
+/*
+ * Notice that this file is not protected like a normal header.
+ * We also must allow for rereading of this file. The
+ *
+ *  || defined(TRACE_HEADER_MULTI_READ)
+ *
+ * serves this purpose.
+ */
+#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_EVENT_SAMPLE_H
+
+/*
+ * All trace headers should include tracepoint.h, until we finally
+ * make it into a standard header.
+ */
+#include <linux/tracepoint.h>
+
+/*
+ * If TRACE_SYSTEM is defined, that will be the directory created
+ * in the ftrace directory under /debugfs/tracing/events/<system>
+ *
+ * The define_trace.h below will also look for a file name of
+ * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here.
+ *
+ * If you want a different system than file name, you can override
+ * the header name by defining TRACE_INCLUDE_FILE
+ *
+ * If this file was called, goofy.h, then we would define:
+ *
+ * #define TRACE_INCLUDE_FILE goofy
+ *
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM sample
+
+/*
+ * The TRACE_EVENT macro is broken up into 6 parts.
+ *
+ * name: name of the trace point. This is also how to enable the tracepoint.
+ *   A function called trace_foo_bar() will be created.
+ *
+ * proto: the prototype of the function trace_foo_bar()
+ *   Here it is trace_foo_bar(char *foo, int bar).
+ *
+ * args:  must match the arguments in the prototype.
+ *    Here it is simply "foo, bar".
+ *
+ * struct:  This defines the way the data will be stored in the ring buffer.
+ *    There are currently two types of elements. __field and __array.
+ *    A __field is broken up into (type, name), where type can be any
+ *    type but an array.
+ *    For an array, there are three fields (type, name, size): the
+ *    type of elements in the array, the name of the field and the size
+ *    of the array.
+ *
+ *    __array( char, foo, 10) is the same as saying   char foo[10].
+ *
+ * fast_assign: This is a C like function that is used to store the items
+ *    into the ring buffer.
+ *
+ * printk: This is a way to print out the data in pretty print. This is
+ *    useful if the system crashes and you are logging via a serial line,
+ *    the data can be printed to the console using this "printk" method.
+ *
+ * Note, that for both the assign and the printk, __entry is the handler
+ * to the data structure in the ring buffer, and is defined by the
+ * TP_STRUCT__entry.
+ */
+TRACE_EVENT(foo_bar,
+
+       TP_PROTO(char *foo, int bar),
+
+       TP_ARGS(foo, bar),
+
+       TP_STRUCT__entry(
+               __array(        char,   foo,    10              )
+               __field(        int,    bar                     )
+       ),
+
+       TP_fast_assign(
+               /*
+                * strlcpy() always NUL-terminates (truncating if needed);
+                * strncpy() would leave foo unterminated for inputs of
+                * 10 or more characters, and TP_printk reads it with %s.
+                */
+               strlcpy(__entry->foo, foo, sizeof(__entry->foo));
+               __entry->bar    = bar;
+       ),
+
+       TP_printk("foo %s %d", __entry->foo, __entry->bar)
+);
+#endif
+
+/***** NOTICE! The #if protection ends here. *****/
+
+
+/*
+ * There are several ways I could have done this. If I left out the
+ * TRACE_INCLUDE_PATH, then it would default to the kernel source
+ * include/trace/events directory.
+ *
+ * I could specify a path from the define_trace.h file back to this
+ * file.
+ *
+ * #define TRACE_INCLUDE_PATH ../../samples/trace_events
+ *
+ * But I chose to simply make it use the current directory and then in
+ * the Makefile I added:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(src)
+ *
+ * This will make sure the current path is part of the include
+ * structure for our file so that we can find it.
+ *
+ * I could have made only the top level directory the include:
+ *
+ * CFLAGS_trace-events-sample.o := -I$(PWD)
+ *
+ * And then let the path to this directory be the TRACE_INCLUDE_PATH:
+ *
+ * #define TRACE_INCLUDE_PATH samples/trace_events
+ *
+ * But then if something defines "samples" or "trace_events" then we
+ * could risk that being converted too, and give us an unexpected
+ * result.
+ */
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH .
+/*
+ * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal
+ */
+#define TRACE_INCLUDE_FILE trace-events-sample
+#include <trace/define_trace.h>
index cba61ca403cacb644bf07fed6e11f340f90c8f50..2b706617c89a806c69b83a112f046f16046904b4 100644 (file)
@@ -188,20 +188,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@
 # ---------------------------------------------------------------------------
 
 quiet_cmd_gzip = GZIP    $@
-cmd_gzip = gzip -f -9 < $< > $@
+cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \
+       (rm -f $@ ; false)
 
 
 # Bzip2
 # ---------------------------------------------------------------------------
 
-# Bzip2 does not include size in file... so we have to fake that
-size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size
-
-quiet_cmd_bzip2 = BZIP2    $@
-cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false)
+# Bzip2 and LZMA do not include size in file... so we have to fake that;
+# append the size as a 32-bit littleendian number as gzip does.
+size_append = echo -ne $(shell                                         \
+dec_size=0;                                                            \
+for F in $1; do                                                                \
+       fsize=$$(stat -c "%s" $$F);                                     \
+       dec_size=$$(expr $$dec_size + $$fsize);                         \
+done;                                                                  \
+printf "%08x" $$dec_size |                                             \
+       sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \
+)
+
+quiet_cmd_bzip2 = BZIP2   $@
+cmd_bzip2 = (cat $(filter-out FORCE,$^) | \
+       bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+       (rm -f $@ ; false)
 
 # Lzma
 # ---------------------------------------------------------------------------
 
 quiet_cmd_lzma = LZMA    $@
-cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false)
+cmd_lzma = (cat $(filter-out FORCE,$^) | \
+       lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \
+       (rm -f $@ ; false)
diff --git a/scripts/bin_size b/scripts/bin_size
deleted file mode 100644 (file)
index 43e1b36..0000000
+++ /dev/null
@@ -1,10 +0,0 @@
-#!/bin/sh
-
-if [ $# = 0 ] ; then
-   echo Usage: $0 file
-fi
-
-size_dec=`stat -c "%s" $1`
-size_hex_echo_string=`printf "%08x" $size_dec |
-     sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'`
-/bin/echo -ne $size_hex_echo_string
index 3208a3a7e7fe5e3f9ad07a20a5bb9827d099d5ae..acd8c4a8e3e0b00f24eaf3331b70156becbecf60 100755 (executable)
@@ -1828,6 +1828,25 @@ sub reset_state {
     $state = 0;
 }
 
+# Rewrite a TRACE_EVENT() declaration held in $prototype into a
+# "static inline void trace_<name>(<args>)" prototype.
+#   $file - file name, used only in the warning message.
+# The name is the first macro argument (text up to the first comma) and
+# the arguments come from TP_PROTO(); that match is non-greedy, so it
+# ends at the first ')' it meets.  If either part cannot be found, a
+# warning goes to STDERR and $prototype is left unchanged.
+sub tracepoint_munge($) {
+       my $file = shift;
+       my $tracepointname;
+       my $tracepointargs;
+
+       if ($prototype =~ m/TRACE_EVENT\((.*?),/) {
+               $tracepointname = $1;
+       }
+       if ($prototype =~ m/TP_PROTO\((.*?)\)/) {
+               $tracepointargs = $1;
+       }
+       # undef, not a "0" sentinel, marks "no match"
+       if (!defined($tracepointname) || !defined($tracepointargs)) {
+               print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n".
+                            "$prototype\n";
+       } else {
+               $prototype = "static inline void trace_$tracepointname($tracepointargs)";
+       }
+}
+
 sub syscall_munge() {
        my $void = 0;
 
@@ -1882,6 +1901,9 @@ sub process_state3_function($$) {
        if ($prototype =~ /SYSCALL_DEFINE/) {
                syscall_munge();
        }
+       if ($prototype =~ /TRACE_EVENT/) {
+               tracepoint_munge($file);
+       }
        dump_function($prototype, $file);
        reset_state();
     }
index 409596eca124f7ada601417a59050eb12f55210a..0fae7da0529cac497a5337be4b6dd0e89cbf13d6 100755 (executable)
@@ -26,7 +26,7 @@
 # which will also be the location of that section after final link.
 # e.g.
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  .globl my_func
 #  my_func:
 #        [...]
@@ -39,7 +39,7 @@
 #        [...]
 #
 # Both relocation offsets for the mcounts in the above example will be
-# offset from .text.sched. If we make another file called tmp.s with:
+# offset from .sched.text. If we make another file called tmp.s with:
 #
 #  .section __mcount_loc
 #  .quad  my_func + 0x5
@@ -51,7 +51,7 @@
 # But this gets hard if my_func is not globl (a static function).
 # In such a case we have:
 #
-#  .section ".text.sched"
+#  .section ".sched.text", "ax"
 #  my_func:
 #        [...]
 #        call mcount  (offset: 0x5)
index bb244774e9d765ae10e41768f5dfd5c40f1f3069..d23c839038f00836cb96a51e53e27db8b8ec163c 100644 (file)
@@ -110,28 +110,8 @@ config SECURITY_ROOTPLUG
 
          See <http://www.linuxjournal.com/article.php?sid=6279> for
          more information about this module.
-         
-         If you are unsure how to answer this question, answer N.
-
-config SECURITY_DEFAULT_MMAP_MIN_ADDR
-        int "Low address space to protect from user allocation"
-        depends on SECURITY
-        default 0
-        help
-         This is the portion of low virtual memory which should be protected
-         from userspace allocation.  Keeping a user from writing to low pages
-         can help reduce the impact of kernel NULL pointer bugs.
-
-         For most ia64, ppc64 and x86 users with lots of address space
-         a value of 65536 is reasonable and should cause no problems.
-         On arm and other archs it should not be higher than 32768.
-         Programs which use vm86 functionality would either need additional
-         permissions from either the LSM or the capabilities module or have
-         this protection disabled.
-
-         This value can be changed after boot using the
-         /proc/sys/vm/mmap_min_addr tunable.
 
+         If you are unsure how to answer this question, answer N.
 
 source security/selinux/Kconfig
 source security/smack/Kconfig
index fa77021d9778ac1f32d7ec10e5322a0e6825e5c9..c67557cdaa857f9046d30cacb7f3ecc42196f9bb 100644 (file)
@@ -16,6 +16,9 @@ obj-$(CONFIG_SECURITYFS)              += inode.o
 # Must precede capability.o in order to stack properly.
 obj-$(CONFIG_SECURITY_SELINUX)         += selinux/built-in.o
 obj-$(CONFIG_SECURITY_SMACK)           += smack/built-in.o
+ifeq ($(CONFIG_AUDIT),y)
+obj-$(CONFIG_SECURITY_SMACK)           += lsm_audit.o
+endif
 obj-$(CONFIG_SECURITY_TOMOYO)          += tomoyo/built-in.o
 obj-$(CONFIG_SECURITY_ROOTPLUG)                += root_plug.o
 obj-$(CONFIG_CGROUP_DEVICE)            += device_cgroup.o
index beac0258c2a8f3a0cbad52ec9adf12e019264626..48b7e0228fa38455ee6c2bf0cb37876e96c99afb 100644 (file)
 #include <linux/prctl.h>
 #include <linux/securebits.h>
 
+/*
+ * If a non-root user executes a setuid-root binary in
+ * !secure(SECURE_NOROOT) mode, then we raise capabilities.
+ * However if fE is also set, then the intent is for only
+ * the file capabilities to be applied, and the setuid-root
+ * bit is left on either to change the uid (plausible) or
+ * to get full privilege on a kernel without file capabilities
+ * support.  So in that case we do not raise capabilities.
+ *
+ * Warn if that happens, once per boot.
+ *
+ * @fname: name of the offending file; only read, for the log message.
+ */
+static void warn_setuid_and_fcaps_mixed(const char *fname)
+{
+       /* latched after the first message so the warning is not repeated */
+       static int warned;
+
+       if (warned)
+               return;
+
+       printk(KERN_INFO "warning: `%s' has both setuid-root and"
+               " effective capabilities. Therefore not raising all"
+               " capabilities.\n", fname);
+       warned = 1;
+}
+
 int cap_netlink_send(struct sock *sk, struct sk_buff *skb)
 {
        NETLINK_CB(skb).eff_cap = current_cap();
@@ -463,6 +485,15 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
                return ret;
 
        if (!issecure(SECURE_NOROOT)) {
+               /*
+                * If the legacy file capability is set, then don't set privs
+                * for a setuid root binary run by a non-root user.  Do set it
+                * for a root user just to cause least surprise to an admin.
+                */
+               if (effective && new->uid != 0 && new->euid == 0) {
+                       warn_setuid_and_fcaps_mixed(bprm->filename);
+                       goto skip;
+               }
                /*
                 * To support inheritance of root-permissions and suid-root
                 * executables under compatibility mode, we override the
@@ -478,6 +509,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm)
                if (new->euid == 0)
                        effective = true;
        }
+skip:
 
        /* Don't let someone trace a set[ug]id/setpcap binary with the revised
         * credentials unless they have the appropriate permit
index f3b91bfbe4cb9483ea55e7e64c5c1c52b8c71d0d..f7496c6a022b7c2213f061c17b435bf0328085ed 100644 (file)
@@ -287,7 +287,7 @@ void securityfs_remove(struct dentry *dentry)
 {
        struct dentry *parent;
 
-       if (!dentry)
+       if (!dentry || IS_ERR(dentry))
                return;
 
        parent = dentry->d_parent;
index 1e082bb987beef1ca3322fd7409f57472bac60cc..ff513ff737f5c62861ff20e181567c195d126357 100644 (file)
@@ -22,18 +22,9 @@ static int ima_audit;
 static int __init ima_audit_setup(char *str)
 {
        unsigned long audit;
-       int rc, result = 0;
-       char *op = "ima_audit";
-       char *cause;
 
-       rc = strict_strtoul(str, 0, &audit);
-       if (rc || audit > 1)
-               result = 1;
-       else
-               ima_audit = audit;
-       cause = ima_audit ? "enabled" : "not_enabled";
-       integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL,
-                           op, cause, result, 0);
+       if (!strict_strtoul(str, 0, &audit))
+               ima_audit = audit ? 1 : 0;
        return 1;
 }
 __setup("ima_audit=", ima_audit_setup);
@@ -50,23 +41,14 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode,
 
        ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno);
        audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u ses=%u",
-                        current->pid, current->cred->uid,
+                        current->pid, current_cred()->uid,
                         audit_get_loginuid(current),
                         audit_get_sessionid(current));
        audit_log_task_context(ab);
-       switch (audit_msgno) {
-       case AUDIT_INTEGRITY_DATA:
-       case AUDIT_INTEGRITY_METADATA:
-       case AUDIT_INTEGRITY_PCR:
-       case AUDIT_INTEGRITY_STATUS:
-               audit_log_format(ab, " op=%s cause=%s", op, cause);
-               break;
-       case AUDIT_INTEGRITY_HASH:
-               audit_log_format(ab, " op=%s hash=%s", op, cause);
-               break;
-       default:
-               audit_log_format(ab, " op=%s", op);
-       }
+       audit_log_format(ab, " op=");
+       audit_log_string(ab, op);
+       audit_log_format(ab, " cause=");
+       audit_log_string(ab, cause);
        audit_log_format(ab, " comm=");
        audit_log_untrustedstring(ab, current->comm);
        if (fname) {
index 50d572b74caff78b157f9c713e6148d299b3c387..63003a63aaeedbc6fd6e6badaa3eaa5a705b67a8 100644 (file)
@@ -103,7 +103,7 @@ int ima_calc_template_hash(int template_len, void *template, char *digest)
        return rc;
 }
 
-static void ima_pcrread(int idx, u8 *pcr)
+static void __init ima_pcrread(int idx, u8 *pcr)
 {
        if (!ima_used_chip)
                return;
@@ -115,7 +115,7 @@ static void ima_pcrread(int idx, u8 *pcr)
 /*
  * Calculate the boot aggregate hash
  */
-int ima_calc_boot_aggregate(char *digest)
+int __init ima_calc_boot_aggregate(char *digest)
 {
        struct hash_desc desc;
        struct scatterlist sg;
index ffbe259700b10b54852cc5d4b29ae91a754230e4..6bfc7eaebfdabb88d7fbc9c5c61d7089d2c83f23 100644 (file)
@@ -15,6 +15,7 @@
  *     implemenents security file system for reporting
  *     current measurement list and IMA statistics
  */
+#include <linux/fcntl.h>
 #include <linux/module.h>
 #include <linux/seq_file.h>
 #include <linux/rculist.h>
@@ -84,8 +85,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos)
         * against concurrent list-extension
         */
        rcu_read_lock();
-       qe = list_entry(rcu_dereference(qe->later.next),
-                       struct ima_queue_entry, later);
+       qe = list_entry_rcu(qe->later.next,
+                           struct ima_queue_entry, later);
        rcu_read_unlock();
        (*pos)++;
 
@@ -283,6 +284,9 @@ static atomic_t policy_opencount = ATOMIC_INIT(1);
  */
 int ima_open_policy(struct inode * inode, struct file * filp)
 {
+       /* No point in being allowed to open it if you aren't going to write */
+       if (!(filp->f_flags & O_WRONLY))
+               return -EACCES;
        if (atomic_dec_and_test(&policy_opencount))
                return 0;
        return -EBUSY;
@@ -315,7 +319,7 @@ static struct file_operations ima_measure_policy_ops = {
        .release = ima_release_policy
 };
 
-int ima_fs_init(void)
+int __init ima_fs_init(void)
 {
        ima_dir = securityfs_create_dir("ima", NULL);
        if (IS_ERR(ima_dir))
@@ -349,7 +353,7 @@ int ima_fs_init(void)
                goto out;
 
        ima_policy = securityfs_create_file("policy",
-                                           S_IRUSR | S_IRGRP | S_IWUSR,
+                                           S_IWUSR,
                                            ima_dir, NULL,
                                            &ima_measure_policy_ops);
        if (IS_ERR(ima_policy))
index ec79f1ee992cb94c9362bc28f160633978c8c2d3..b8dd693f8790a62a4ae55b848bb8398cadbe3248 100644 (file)
@@ -196,7 +196,7 @@ static void init_once(void *foo)
        kref_set(&iint->refcount, 1);
 }
 
-void ima_iintcache_init(void)
+void __init ima_iintcache_init(void)
 {
        iint_cache =
            kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0,
index 0b0bb8c978cc8575c6948a294fc08fa7efa14815..a40da7ae590021933bbb2d5982fc7a61494947ff 100644 (file)
@@ -38,7 +38,7 @@ int ima_used_chip;
  * a different value.) Violations add a zero entry to the measurement
  * list and extend the aggregate PCR value with ff...ff's.
  */
-static void ima_add_boot_aggregate(void)
+static void __init ima_add_boot_aggregate(void)
 {
        struct ima_template_entry *entry;
        const char *op = "add_boot_aggregate";
@@ -71,7 +71,7 @@ err_out:
                            audit_cause, result, 0);
 }
 
-int ima_init(void)
+int __init ima_init(void)
 {
        u8 pcr_i[IMA_DIGEST_SIZE];
        int rc;
index f4e7266f5aeec4f68de76ef040155153ae95ae1a..6f611874d10e9f7bbea81fab4fc537d68a711b75 100644 (file)
@@ -29,20 +29,8 @@ int ima_initialized;
 char *ima_hash = "sha1";
 static int __init hash_setup(char *str)
 {
-       const char *op = "hash_setup";
-       const char *hash = "sha1";
-       int result = 0;
-       int audit_info = 0;
-
-       if (strncmp(str, "md5", 3) == 0) {
-               hash = "md5";
-               ima_hash = str;
-       } else if (strncmp(str, "sha1", 4) != 0) {
-               hash = "invalid_hash_type";
-               result = 1;
-       }
-       integrity_audit_msg(AUDIT_INTEGRITY_HASH, NULL, NULL, op, hash,
-                           result, audit_info);
+       if (strncmp(str, "md5", 3) == 0)
+               ima_hash = "md5";
        return 1;
 }
 __setup("ima_hash=", hash_setup);
@@ -128,10 +116,6 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
 {
        int rc = 0;
 
-       if (IS_ERR(file)) {
-               pr_info("%s dentry_open failed\n", filename);
-               return rc;
-       }
        iint->opencount++;
        iint->readcount++;
 
@@ -141,6 +125,15 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
        return rc;
 }
 
+static void ima_update_counts(struct ima_iint_cache *iint, int mask)
+{
+       iint->opencount++;
+       if ((mask & MAY_WRITE) || (mask == 0))
+               iint->writecount++;
+       else if (mask & (MAY_READ | MAY_EXEC))
+               iint->readcount++;
+}
+
 /**
  * ima_path_check - based on policy, collect/store measurement.
  * @path: contains a pointer to the path to be measured
@@ -156,10 +149,10 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file,
  *     - Opening a file for read when already open for write,
  *       could result in a file measurement error.
  *
- * Return 0 on success, an error code on failure.
- * (Based on the results of appraise_measurement().)
+ * Always return 0 and audit dentry_open failures.
+ * (Return code will be based upon measurement appraisal.)
  */
-int ima_path_check(struct path *path, int mask)
+int ima_path_check(struct path *path, int mask, int update_counts)
 {
        struct inode *inode = path->dentry->d_inode;
        struct ima_iint_cache *iint;
@@ -173,11 +166,8 @@ int ima_path_check(struct path *path, int mask)
                return 0;
 
        mutex_lock(&iint->mutex);
-       iint->opencount++;
-       if ((mask & MAY_WRITE) || (mask == 0))
-               iint->writecount++;
-       else if (mask & (MAY_READ | MAY_EXEC))
-               iint->readcount++;
+       if (update_counts)
+               ima_update_counts(iint, mask);
 
        rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK);
        if (rc < 0)
@@ -196,7 +186,19 @@ int ima_path_check(struct path *path, int mask)
                struct dentry *dentry = dget(path->dentry);
                struct vfsmount *mnt = mntget(path->mnt);
 
-               file = dentry_open(dentry, mnt, O_RDONLY, current->cred);
+               file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE,
+                                  current_cred());
+               if (IS_ERR(file)) {
+                       int audit_info = 0;
+
+                       integrity_audit_msg(AUDIT_INTEGRITY_PCR, inode,
+                                           dentry->d_name.name,
+                                           "add_measurement",
+                                           "dentry_open failed",
+                                           1, audit_info);
+                       file = NULL;
+                       goto out;
+               }
                rc = get_path_measurement(iint, file, dentry->d_name.name);
        }
 out:
@@ -206,6 +208,7 @@ out:
        kref_put(&iint->refcount, iint_free);
        return 0;
 }
+EXPORT_SYMBOL_GPL(ima_path_check);
 
 static int process_measurement(struct file *file, const unsigned char *filename,
                               int mask, int function)
@@ -234,7 +237,16 @@ out:
        return rc;
 }
 
-static void opencount_get(struct file *file)
+/*
+ * ima_counts_get - increment file counts
+ *
+ * - for IPC shm and shmat file.
+ * - for nfsd exported files.
+ *
+ * Increment the counts for these files to prevent unnecessary
+ * imbalance messages.
+ */
+void ima_counts_get(struct file *file)
 {
        struct inode *inode = file->f_dentry->d_inode;
        struct ima_iint_cache *iint;
@@ -246,8 +258,14 @@ static void opencount_get(struct file *file)
                return;
        mutex_lock(&iint->mutex);
        iint->opencount++;
+       if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
+               iint->readcount++;
+
+       if (file->f_mode & FMODE_WRITE)
+               iint->writecount++;
        mutex_unlock(&iint->mutex);
 }
+EXPORT_SYMBOL_GPL(ima_counts_get);
 
 /**
  * ima_file_mmap - based on policy, collect/store measurement.
@@ -272,18 +290,6 @@ int ima_file_mmap(struct file *file, unsigned long prot)
        return 0;
 }
 
-/*
- * ima_shm_check - IPC shm and shmat create/fput a file
- *
- * Maintain the opencount for these files to prevent unnecessary
- * imbalance messages.
- */
-void ima_shm_check(struct file *file)
-{
-       opencount_get(file);
-       return;
-}
-
 /**
  * ima_bprm_check - based on policy, collect/store measurement.
  * @bprm: contains the linux_binprm structure
index b5291ad5ef563b4a5707f0dc6b926fd2aafdf997..e1278399b34546a8ba5bbb4e8fdb90f9a8ec009f 100644 (file)
@@ -45,24 +45,30 @@ struct ima_measure_rule_entry {
        } lsm[MAX_LSM_RULES];
 };
 
-/* Without LSM specific knowledge, the default policy can only be
+/*
+ * Without LSM specific knowledge, the default policy can only be
  * written in terms of .action, .func, .mask, .fsmagic, and .uid
  */
+
+/*
+ * The minimum rule set to allow for full TCB coverage.  Measures all files
+ * opened or mmapped for exec and everything read by root.  Dangerous because
+ * normal users can easily run the machine out of memory simply building
+ * and running executables.
+ */
 static struct ima_measure_rule_entry default_rules[] = {
-       {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,
-        .flags = IMA_FSMAGIC},
+       {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC},
        {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC},
-       {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,
-        .flags = IMA_FSMAGIC},
-       {.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC},
+       {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC},
+       {.action = DONT_MEASURE,.fsmagic = SELINUX_MAGIC,.flags = IMA_FSMAGIC},
        {.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC,
         .flags = IMA_FUNC | IMA_MASK},
        {.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0,
-        .flags = IMA_FUNC | IMA_MASK | IMA_UID}
+        .flags = IMA_FUNC | IMA_MASK | IMA_UID},
 };
 
 static LIST_HEAD(measure_default_rules);
@@ -71,6 +77,14 @@ static struct list_head *ima_measure;
 
 static DEFINE_MUTEX(ima_measure_mutex);
 
+static bool ima_use_tcb __initdata;
+static int __init default_policy_setup(char *str)
+{
+       ima_use_tcb = 1;
+       return 1;
+}
+__setup("ima_tcb", default_policy_setup);
+
 /**
  * ima_match_rules - determine whether an inode matches the measure rule.
  * @rule: a pointer to a rule
@@ -96,7 +110,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
        if ((rule->flags & IMA_UID) && rule->uid != tsk->cred->uid)
                return false;
        for (i = 0; i < MAX_LSM_RULES; i++) {
-               int rc;
+               int rc = 0;
                u32 osid, sid;
 
                if (!rule->lsm[i].rule)
@@ -109,7 +123,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
                        security_inode_getsecid(inode, &osid);
                        rc = security_filter_rule_match(osid,
                                                        rule->lsm[i].type,
-                                                       AUDIT_EQUAL,
+                                                       Audit_equal,
                                                        rule->lsm[i].rule,
                                                        NULL);
                        break;
@@ -119,7 +133,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule,
                        security_task_getsecid(tsk, &sid);
                        rc = security_filter_rule_match(sid,
                                                        rule->lsm[i].type,
-                                                       AUDIT_EQUAL,
+                                                       Audit_equal,
                                                        rule->lsm[i].rule,
                                                        NULL);
                default:
@@ -164,11 +178,17 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask)
  * ima_measure points to either the measure_default_rules or the
  * the new measure_policy_rules.
  */
-void ima_init_policy(void)
+void __init ima_init_policy(void)
 {
-       int i;
+       int i, entries;
+
+       /* if !ima_use_tcb set entries = 0 so we load NO default rules */
+       if (ima_use_tcb)
+               entries = ARRAY_SIZE(default_rules);
+       else
+               entries = 0;
 
-       for (i = 0; i < ARRAY_SIZE(default_rules); i++)
+       for (i = 0; i < entries; i++)
                list_add_tail(&default_rules[i].list, &measure_default_rules);
        ima_measure = &measure_default_rules;
 }
@@ -227,7 +247,7 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry,
 
        entry->lsm[lsm_rule].type = audit_type;
        result = security_filter_rule_init(entry->lsm[lsm_rule].type,
-                                          AUDIT_EQUAL, args,
+                                          Audit_equal, args,
                                           &entry->lsm[lsm_rule].rule);
        return result;
 }
diff --git a/security/lsm_audit.c b/security/lsm_audit.c
new file mode 100644 (file)
index 0000000..94b8684
--- /dev/null
@@ -0,0 +1,386 @@
+/*
+ * common LSM auditing functions
+ *
+ * Based on code written for SELinux by :
+ *                     Stephen Smalley, <sds@epoch.ncsc.mil>
+ *                     James Morris <jmorris@redhat.com>
+ * Author : Etienne Basset, <etienne.basset@ensta.org>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2,
+ * as published by the Free Software Foundation.
+ */
+
+#include <linux/types.h>
+#include <linux/stddef.h>
+#include <linux/kernel.h>
+#include <linux/fs.h>
+#include <linux/init.h>
+#include <net/sock.h>
+#include <linux/un.h>
+#include <net/af_unix.h>
+#include <linux/audit.h>
+#include <linux/ipv6.h>
+#include <linux/ip.h>
+#include <net/ip.h>
+#include <net/ipv6.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/dccp.h>
+#include <linux/sctp.h>
+#include <linux/lsm_audit.h>
+
+/**
+ * ipv4_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv4_skb_to_auditdata(struct sk_buff *skb,
+               struct common_audit_data *ad, u8 *proto)
+{
+       int ret = 0;
+       struct iphdr *ih;
+
+       ih = ip_hdr(skb);
+       if (ih == NULL)
+               return -EINVAL;
+
+       ad->u.net.v4info.saddr = ih->saddr;
+       ad->u.net.v4info.daddr = ih->daddr;
+
+       if (proto)
+               *proto = ih->protocol;
+       /* non initial fragment */
+       if (ntohs(ih->frag_off) & IP_OFFSET)
+               return 0;
+
+       switch (ih->protocol) {
+       case IPPROTO_TCP: {
+               struct tcphdr *th = tcp_hdr(skb);
+               if (th == NULL)
+                       break;
+
+               ad->u.net.sport = th->source;
+               ad->u.net.dport = th->dest;
+               break;
+       }
+       case IPPROTO_UDP: {
+               struct udphdr *uh = udp_hdr(skb);
+               if (uh == NULL)
+                       break;
+
+               ad->u.net.sport = uh->source;
+               ad->u.net.dport = uh->dest;
+               break;
+       }
+       case IPPROTO_DCCP: {
+               struct dccp_hdr *dh = dccp_hdr(skb);
+               if (dh == NULL)
+                       break;
+
+               ad->u.net.sport = dh->dccph_sport;
+               ad->u.net.dport = dh->dccph_dport;
+               break;
+       }
+       case IPPROTO_SCTP: {
+               struct sctphdr *sh = sctp_hdr(skb);
+               if (sh == NULL)
+                       break;
+               ad->u.net.sport = sh->source;
+               ad->u.net.dport = sh->dest;
+               break;
+       }
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+/**
+ * ipv6_skb_to_auditdata : fill auditdata from skb
+ * @skb : the skb
+ * @ad : the audit data to fill
+ * @proto : the layer 4 protocol
+ *
+ * return  0 on success
+ */
+int ipv6_skb_to_auditdata(struct sk_buff *skb,
+               struct common_audit_data *ad, u8 *proto)
+{
+       int offset, ret = 0;
+       struct ipv6hdr *ip6;
+       u8 nexthdr;
+
+       ip6 = ipv6_hdr(skb);
+       if (ip6 == NULL)
+               return -EINVAL;
+       ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr);
+       ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr);
+       ret = 0;
+       /* IPv6 can have several extension headers before the transport
+        * header; skip them */
+       offset = skb_network_offset(skb);
+       offset += sizeof(*ip6);
+       nexthdr = ip6->nexthdr;
+       offset = ipv6_skip_exthdr(skb, offset, &nexthdr);
+       if (offset < 0)
+               return 0;
+       if (proto)
+               *proto = nexthdr;
+       switch (nexthdr) {
+       case IPPROTO_TCP: {
+               struct tcphdr _tcph, *th;
+
+               th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
+               if (th == NULL)
+                       break;
+
+               ad->u.net.sport = th->source;
+               ad->u.net.dport = th->dest;
+               break;
+       }
+       case IPPROTO_UDP: {
+               struct udphdr _udph, *uh;
+
+               uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph);
+               if (uh == NULL)
+                       break;
+
+               ad->u.net.sport = uh->source;
+               ad->u.net.dport = uh->dest;
+               break;
+       }
+       case IPPROTO_DCCP: {
+               struct dccp_hdr _dccph, *dh;
+
+               dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph);
+               if (dh == NULL)
+                       break;
+
+               ad->u.net.sport = dh->dccph_sport;
+               ad->u.net.dport = dh->dccph_dport;
+               break;
+       }
+       case IPPROTO_SCTP: {
+               struct sctphdr _sctph, *sh;
+
+               sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph);
+               if (sh == NULL)
+                       break;
+               ad->u.net.sport = sh->source;
+               ad->u.net.dport = sh->dest;
+               break;
+       }
+       default:
+               ret = -EINVAL;
+       }
+       return ret;
+}
+#endif
+
+
+static inline void print_ipv6_addr(struct audit_buffer *ab,
+                                  struct in6_addr *addr, __be16 port,
+                                  char *name1, char *name2)
+{
+       if (!ipv6_addr_any(addr))
+               audit_log_format(ab, " %s=%pI6", name1, addr);
+       if (port)
+               audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr,
+                                  __be16 port, char *name1, char *name2)
+{
+       if (addr)
+               audit_log_format(ab, " %s=%pI4", name1, &addr);
+       if (port)
+               audit_log_format(ab, " %s=%d", name2, ntohs(port));
+}
+
+/**
+ * dump_common_audit_data - helper to dump common audit data
+ * @ab : the audit buffer the data is appended to
+ * @a : common audit data
+ */
+static void dump_common_audit_data(struct audit_buffer *ab,
+                                  struct common_audit_data *a)
+{
+       struct inode *inode = NULL;
+       struct task_struct *tsk = current;
+
+       if (a->tsk)
+               tsk = a->tsk;
+       if (tsk && tsk->pid) {
+               audit_log_format(ab, " pid=%d comm=", tsk->pid);
+               audit_log_untrustedstring(ab, tsk->comm);
+       }
+
+       switch (a->type) {
+       case LSM_AUDIT_DATA_IPC:
+               audit_log_format(ab, " key=%d ", a->u.ipc_id);
+               break;
+       case LSM_AUDIT_DATA_CAP:
+               audit_log_format(ab, " capability=%d ", a->u.cap);
+               break;
+       case LSM_AUDIT_DATA_FS:
+               if (a->u.fs.path.dentry) {
+                       struct dentry *dentry = a->u.fs.path.dentry;
+                       if (a->u.fs.path.mnt) {
+                               audit_log_d_path(ab, "path=", &a->u.fs.path);
+                       } else {
+                               audit_log_format(ab, " name=");
+                               audit_log_untrustedstring(ab,
+                                                dentry->d_name.name);
+                       }
+                       inode = dentry->d_inode;
+               } else if (a->u.fs.inode) {
+                       struct dentry *dentry;
+                       inode = a->u.fs.inode;
+                       dentry = d_find_alias(inode);
+                       if (dentry) {
+                               audit_log_format(ab, " name=");
+                               audit_log_untrustedstring(ab,
+                                                dentry->d_name.name);
+                               dput(dentry);
+                       }
+               }
+               if (inode)
+                       audit_log_format(ab, " dev=%s ino=%lu",
+                                       inode->i_sb->s_id,
+                                       inode->i_ino);
+               break;
+       case LSM_AUDIT_DATA_TASK:
+               tsk = a->u.tsk;
+               if (tsk && tsk->pid) {
+                       audit_log_format(ab, " pid=%d comm=", tsk->pid);
+                       audit_log_untrustedstring(ab, tsk->comm);
+               }
+               break;
+       case LSM_AUDIT_DATA_NET:
+               if (a->u.net.sk) {
+                       struct sock *sk = a->u.net.sk;
+                       struct unix_sock *u;
+                       int len = 0;
+                       char *p = NULL;
+
+                       switch (sk->sk_family) {
+                       case AF_INET: {
+                               struct inet_sock *inet = inet_sk(sk);
+
+                               print_ipv4_addr(ab, inet->rcv_saddr,
+                                               inet->sport,
+                                               "laddr", "lport");
+                               print_ipv4_addr(ab, inet->daddr,
+                                               inet->dport,
+                                               "faddr", "fport");
+                               break;
+                       }
+                       case AF_INET6: {
+                               struct inet_sock *inet = inet_sk(sk);
+                               struct ipv6_pinfo *inet6 = inet6_sk(sk);
+
+                               print_ipv6_addr(ab, &inet6->rcv_saddr,
+                                               inet->sport,
+                                               "laddr", "lport");
+                               print_ipv6_addr(ab, &inet6->daddr,
+                                               inet->dport,
+                                               "faddr", "fport");
+                               break;
+                       }
+                       case AF_UNIX:
+                               u = unix_sk(sk);
+                               if (u->dentry) {
+                                       struct path path = {
+                                               .dentry = u->dentry,
+                                               .mnt = u->mnt
+                                       };
+                                       audit_log_d_path(ab, "path=", &path);
+                                       break;
+                               }
+                               if (!u->addr)
+                                       break;
+                               len = u->addr->len-sizeof(short);
+                               p = &u->addr->name->sun_path[0];
+                               audit_log_format(ab, " path=");
+                               if (*p)
+                                       audit_log_untrustedstring(ab, p);
+                               else
+                                       audit_log_n_hex(ab, p, len);
+                               break;
+                       }
+               }
+
+               switch (a->u.net.family) {
+               case AF_INET:
+                       print_ipv4_addr(ab, a->u.net.v4info.saddr,
+                                       a->u.net.sport,
+                                       "saddr", "src");
+                       print_ipv4_addr(ab, a->u.net.v4info.daddr,
+                                       a->u.net.dport,
+                                       "daddr", "dest");
+                       break;
+               case AF_INET6:
+                       print_ipv6_addr(ab, &a->u.net.v6info.saddr,
+                                       a->u.net.sport,
+                                       "saddr", "src");
+                       print_ipv6_addr(ab, &a->u.net.v6info.daddr,
+                                       a->u.net.dport,
+                                       "daddr", "dest");
+                       break;
+               }
+               if (a->u.net.netif > 0) {
+                       struct net_device *dev;
+
+                       /* NOTE: we always use init's namespace */
+                       dev = dev_get_by_index(&init_net, a->u.net.netif);
+                       if (dev) {
+                               audit_log_format(ab, " netif=%s", dev->name);
+                               dev_put(dev);
+                       }
+               }
+               break;
+#ifdef CONFIG_KEYS
+       case LSM_AUDIT_DATA_KEY:
+               audit_log_format(ab, " key_serial=%u", a->u.key_struct.key);
+               if (a->u.key_struct.key_desc) {
+                       audit_log_format(ab, " key_desc=");
+                       audit_log_untrustedstring(ab, a->u.key_struct.key_desc);
+               }
+               break;
+#endif
+       } /* switch (a->type) */
+}
+
+/**
+ * common_lsm_audit - generic LSM auditing function
+ * @a:  auxiliary audit data
+ *
+ * Sets up the audit buffer for common security information and calls
+ * the optional pre/post callbacks in @a to print LSM specific information.
+ */
+void common_lsm_audit(struct common_audit_data *a)
+{
+       struct audit_buffer *ab;
+
+       if (a == NULL)
+               return;
+       /* we use GFP_ATOMIC so we won't sleep */
+       ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC);
+
+       if (ab == NULL)
+               return;
+
+       if (a->lsm_pre_audit)
+               a->lsm_pre_audit(ab, a);
+
+       dump_common_audit_data(ab, a);
+
+       if (a->lsm_post_audit)
+               a->lsm_post_audit(ab, a);
+
+       audit_log_end(ab);
+}
index 40fb4f15e27b6d360e1634c4d0f795e0edf621c1..2f7ffa67c4d2db7069f6c2e9b141eb544c409d25 100644 (file)
@@ -71,18 +71,6 @@ static int rootplug_bprm_check_security (struct linux_binprm *bprm)
 }
 
 static struct security_operations rootplug_security_ops = {
-       /* Use the capability functions for some of the hooks */
-       .ptrace_may_access =            cap_ptrace_may_access,
-       .ptrace_traceme =               cap_ptrace_traceme,
-       .capget =                       cap_capget,
-       .capset =                       cap_capset,
-       .capable =                      cap_capable,
-
-       .bprm_set_creds =               cap_bprm_set_creds,
-
-       .task_fix_setuid =              cap_task_fix_setuid,
-       .task_prctl =                   cap_task_prctl,
-
        .bprm_check_security =          rootplug_bprm_check_security,
 };
 
index 5284255c5cdff9869ac4086c3a8976a4d9ad0a1a..dc7674fbfc7a4a6fcfb132bc974f2c9480b90922 100644 (file)
@@ -26,9 +26,6 @@ extern void security_fixup_ops(struct security_operations *ops);
 
 struct security_operations *security_ops;      /* Initialized to NULL */
 
-/* amount of vm to protect from userspace access */
-unsigned long mmap_min_addr = CONFIG_SECURITY_DEFAULT_MMAP_MIN_ADDR;
-
 static inline int verify(struct security_operations *ops)
 {
        /* verify the security_operations structure exists */
index 7f9b5fac87793a19faf3d310a4e04f8a158bb476..b2ab608598325bcea26430f6ebd1eac30a67ad61 100644 (file)
@@ -927,7 +927,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid,
        if (denied) {
                if (flags & AVC_STRICT)
                        rc = -EACCES;
-               else if (!selinux_enforcing || security_permissive_sid(ssid))
+               else if (!selinux_enforcing || (avd->flags & AVD_FLAGS_PERMISSIVE))
                        avc_update_node(AVC_CALLBACK_GRANT, requested, ssid,
                                        tsid, tclass, avd->seqno);
                else
index 2fcad7c33eafd43a5e67beef49aea5f7cd020a89..195906bce2663f09e4fb65c4e39edb2cd20d0d33 100644 (file)
@@ -1980,10 +1980,6 @@ static int selinux_sysctl(ctl_table *table, int op)
        u32 tsid, sid;
        int rc;
 
-       rc = secondary_ops->sysctl(table, op);
-       if (rc)
-               return rc;
-
        sid = current_sid();
 
        rc = selinux_sysctl_get_sid(table, (op == 0001) ?
@@ -2375,10 +2371,8 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
 {
        const struct task_security_struct *tsec = current_security();
        struct itimerval itimer;
-       struct sighand_struct *psig;
        u32 osid, sid;
        int rc, i;
-       unsigned long flags;
 
        osid = tsec->osid;
        sid = tsec->sid;
@@ -2398,22 +2392,20 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm)
                memset(&itimer, 0, sizeof itimer);
                for (i = 0; i < 3; i++)
                        do_setitimer(i, &itimer, NULL);
-               flush_signals(current);
                spin_lock_irq(&current->sighand->siglock);
-               flush_signal_handlers(current, 1);
-               sigemptyset(&current->blocked);
-               recalc_sigpending();
+               if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) {
+                       __flush_signals(current);
+                       flush_signal_handlers(current, 1);
+                       sigemptyset(&current->blocked);
+               }
                spin_unlock_irq(&current->sighand->siglock);
        }
 
        /* Wake up the parent if it is waiting so that it can recheck
         * wait permission to the new task SID. */
-       read_lock_irq(&tasklist_lock);
-       psig = current->parent->sighand;
-       spin_lock_irqsave(&psig->siglock, flags);
-       wake_up_interruptible(&current->parent->signal->wait_chldexit);
-       spin_unlock_irqrestore(&psig->siglock, flags);
-       read_unlock_irq(&tasklist_lock);
+       read_lock(&tasklist_lock);
+       wake_up_interruptible(&current->real_parent->signal->wait_chldexit);
+       read_unlock(&tasklist_lock);
 }
 
 /* superblock security operations */
index 5c3434f7626fdd5cbe81ff0120ad3f092713fd95..ca835795a8b322e7e4398d065d2eb0976741e2ad 100644 (file)
@@ -8,14 +8,13 @@
 #ifndef _SELINUX_SECURITY_H_
 #define _SELINUX_SECURITY_H_
 
+#include <linux/magic.h>
 #include "flask.h"
 
 #define SECSID_NULL                    0x00000000 /* unspecified SID */
 #define SECSID_WILD                    0xffffffff /* wildcard SID */
 #define SECCLASS_NULL                  0x0000 /* no class */
 
-#define SELINUX_MAGIC 0xf97cff8c
-
 /* Identify specific policy version changes */
 #define POLICYDB_VERSION_BASE          15
 #define POLICYDB_VERSION_BOOL          16
@@ -91,9 +90,11 @@ struct av_decision {
        u32 auditallow;
        u32 auditdeny;
        u32 seqno;
+       u32 flags;
 };
 
-int security_permissive_sid(u32 sid);
+/* definitions of av_decision.flags */
+#define AVD_FLAGS_PERMISSIVE   0x0001
 
 int security_compute_av(u32 ssid, u32 tsid,
        u16 tclass, u32 requested,
index c6875fd3b9d61445009f22d3ea8b91037b864d25..dd7cc6de77f9e3a47118b6e8dffca3538b36adb3 100644 (file)
@@ -112,6 +112,8 @@ static struct nlmsg_perm nlmsg_audit_perms[] =
        { AUDIT_DEL_RULE,       NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_USER,           NETLINK_AUDIT_SOCKET__NLMSG_RELAY    },
        { AUDIT_SIGNAL_INFO,    NETLINK_AUDIT_SOCKET__NLMSG_READ     },
+       { AUDIT_TRIM,           NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
+       { AUDIT_MAKE_EQUIV,     NETLINK_AUDIT_SOCKET__NLMSG_WRITE    },
        { AUDIT_TTY_GET,        NETLINK_AUDIT_SOCKET__NLMSG_READ     },
        { AUDIT_TTY_SET,        NETLINK_AUDIT_SOCKET__NLMSG_TTY_AUDIT   },
 };
index 2d5136ec3d5451c945f89e59b2f9ea4433bfa6a4..b4fc506e7a87c8aa69a71ccc1a9c1b6c55c00ce8 100644 (file)
@@ -527,10 +527,10 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size)
                goto out2;
 
        length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT,
-                         "%x %x %x %x %u",
+                         "%x %x %x %x %u %x",
                          avd.allowed, 0xffffffff,
                          avd.auditallow, avd.auditdeny,
-                         avd.seqno);
+                         avd.seqno, avd.flags);
 out2:
        kfree(tcon);
 out:
@@ -803,10 +803,6 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf,
                goto out;
        }
 
-       if (count > PAGE_SIZE) {
-               ret = -EINVAL;
-               goto out;
-       }
        page = (char *)get_zeroed_page(GFP_KERNEL);
        if (!page) {
                ret = -ENOMEM;
index deeec6c013aef6dee9d664e3fe46b0a892d9f27e..500e6f78e1159e1d568355ce74a113b4b49859c1 100644 (file)
@@ -410,6 +410,7 @@ static int context_struct_compute_av(struct context *scontext,
        avd->auditallow = 0;
        avd->auditdeny = 0xffffffff;
        avd->seqno = latest_granting;
+       avd->flags = 0;
 
        /*
         * Check for all the invalid cases.
@@ -528,31 +529,6 @@ inval_class:
        return 0;
 }
 
-/*
- * Given a sid find if the type has the permissive flag set
- */
-int security_permissive_sid(u32 sid)
-{
-       struct context *context;
-       u32 type;
-       int rc;
-
-       read_lock(&policy_rwlock);
-
-       context = sidtab_search(&sidtab, sid);
-       BUG_ON(!context);
-
-       type = context->type;
-       /*
-        * we are intentionally using type here, not type-1, the 0th bit may
-        * someday indicate that we are globally setting permissive in policy.
-        */
-       rc = ebitmap_get_bit(&policydb.permissive_map, type);
-
-       read_unlock(&policy_rwlock);
-       return rc;
-}
-
 static int security_validtrans_handle_fail(struct context *ocontext,
                                           struct context *ncontext,
                                           struct context *tcontext,
@@ -767,6 +743,10 @@ int security_compute_av(u32 ssid,
 
        rc = context_struct_compute_av(scontext, tcontext, tclass,
                                       requested, avd);
+
+       /* permissive domain? */
+       if (ebitmap_get_bit(&policydb.permissive_map, scontext->type))
+           avd->flags |= AVD_FLAGS_PERMISSIVE;
 out:
        read_unlock(&policy_rwlock);
        return rc;
index 42ef313f98560451b2b45d563d1f5e26e23ca795..243bec175be050f930189a25d4a4bb6d5dc85ef3 100644 (file)
@@ -20,6 +20,7 @@
 #include <net/netlabel.h>
 #include <linux/list.h>
 #include <linux/rculist.h>
+#include <linux/lsm_audit.h>
 
 /*
  * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is
@@ -178,6 +179,20 @@ struct smack_known {
 #define MAY_READWRITE  (MAY_READ | MAY_WRITE)
 #define MAY_NOT                0
 
+/*
+ * Number of access types used by Smack (rwxa)
+ */
+#define SMK_NUM_ACCESS_TYPE 4
+
+/*
+ * Smack audit data; is empty if CONFIG_AUDIT not set
+ * to save some stack
+ */
+struct smk_audit_info {
+#ifdef CONFIG_AUDIT
+       struct common_audit_data a;
+#endif
+};
 /*
  * These functions are in smack_lsm.c
  */
@@ -186,8 +201,8 @@ struct inode_smack *new_inode_smack(char *);
 /*
  * These functions are in smack_access.c
  */
-int smk_access(char *, char *, int);
-int smk_curacc(char *, u32);
+int smk_access(char *, char *, int, struct smk_audit_info *);
+int smk_curacc(char *, u32, struct smk_audit_info *);
 int smack_to_cipso(const char *, struct smack_cipso *);
 void smack_from_cipso(u32, char *, char *);
 char *smack_from_secid(const u32);
@@ -237,4 +252,93 @@ static inline char *smk_of_inode(const struct inode *isp)
        return sip->smk_inode;
 }
 
+/*
+ * logging functions
+ */
+#define SMACK_AUDIT_DENIED 0x1
+#define SMACK_AUDIT_ACCEPT 0x2
+extern int log_policy;
+
+void smack_log(char *subject_label, char *object_label,
+               int request,
+               int result, struct smk_audit_info *auditdata);
+
+#ifdef CONFIG_AUDIT
+
+/*
+ * some inline functions to set up audit data
+ * they do nothing if CONFIG_AUDIT is not set
+ *
+ */
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+                              char type)
+{
+       memset(a, 0, sizeof(*a));
+       a->a.type = type;
+       a->a.function = func;
+}
+
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+                                        struct task_struct *t)
+{
+       a->a.u.tsk = t;
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+                                                   struct dentry *d)
+{
+       a->a.u.fs.path.dentry = d;
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+                                                struct vfsmount *m)
+{
+       a->a.u.fs.path.mnt = m;
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+                                             struct inode *i)
+{
+       a->a.u.fs.inode = i;
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+                                            struct path p)
+{
+       a->a.u.fs.path = p;
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+                                           struct sock *sk)
+{
+       a->a.u.net.sk = sk;
+}
+
+#else /* no AUDIT */
+
+static inline void smk_ad_init(struct smk_audit_info *a, const char *func,
+                              char type)
+{
+}
+static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a,
+                                        struct task_struct *t)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a,
+                                                   struct dentry *d)
+{
+}
+static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a,
+                                                struct vfsmount *m)
+{
+}
+static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a,
+                                             struct inode *i)
+{
+}
+static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a,
+                                            struct path p)
+{
+}
+static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a,
+                                           struct sock *sk)
+{
+}
+#endif
+
 #endif  /* _SECURITY_SMACK_H */
index ac0a2707f6d41583e9e325819ab2265483320853..513dc1aa16dd1ff5594745b463dd67118648705a 100644 (file)
@@ -59,11 +59,18 @@ LIST_HEAD(smack_known_list);
  */
 static u32 smack_next_secid = 10;
 
+/*
+ * what events do we log
+ * can be overwritten at run-time by /smack/logging
+ */
+int log_policy = SMACK_AUDIT_DENIED;
+
 /**
  * smk_access - determine if a subject has a specific access to an object
  * @subject_label: a pointer to the subject's Smack label
  * @object_label: a pointer to the object's Smack label
  * @request: the access requested, in "MAY" format
+ * @a : a pointer to the audit data
  *
  * This function looks up the subject/object pair in the
  * access rule list and returns 0 if the access is permitted,
@@ -78,10 +85,12 @@ static u32 smack_next_secid = 10;
  * will be on the list, so checking the pointers may be a worthwhile
  * optimization.
  */
-int smk_access(char *subject_label, char *object_label, int request)
+int smk_access(char *subject_label, char *object_label, int request,
+              struct smk_audit_info *a)
 {
        u32 may = MAY_NOT;
        struct smack_rule *srp;
+       int rc = 0;
 
        /*
         * Hardcoded comparisons.
@@ -89,8 +98,10 @@ int smk_access(char *subject_label, char *object_label, int request)
         * A star subject can't access any object.
         */
        if (subject_label == smack_known_star.smk_known ||
-           strcmp(subject_label, smack_known_star.smk_known) == 0)
-               return -EACCES;
+           strcmp(subject_label, smack_known_star.smk_known) == 0) {
+               rc = -EACCES;
+               goto out_audit;
+       }
        /*
         * An internet object can be accessed by any subject.
         * Tasks cannot be assigned the internet label.
@@ -100,20 +111,20 @@ int smk_access(char *subject_label, char *object_label, int request)
            subject_label == smack_known_web.smk_known ||
            strcmp(object_label, smack_known_web.smk_known) == 0 ||
            strcmp(subject_label, smack_known_web.smk_known) == 0)
-               return 0;
+               goto out_audit;
        /*
         * A star object can be accessed by any subject.
         */
        if (object_label == smack_known_star.smk_known ||
            strcmp(object_label, smack_known_star.smk_known) == 0)
-               return 0;
+               goto out_audit;
        /*
         * An object can be accessed in any way by a subject
         * with the same label.
         */
        if (subject_label == object_label ||
            strcmp(subject_label, object_label) == 0)
-               return 0;
+               goto out_audit;
        /*
         * A hat subject can read any object.
         * A floor object can be read by any subject.
@@ -121,10 +132,10 @@ int smk_access(char *subject_label, char *object_label, int request)
        if ((request & MAY_ANYREAD) == request) {
                if (object_label == smack_known_floor.smk_known ||
                    strcmp(object_label, smack_known_floor.smk_known) == 0)
-                       return 0;
+                       goto out_audit;
                if (subject_label == smack_known_hat.smk_known ||
                    strcmp(subject_label, smack_known_hat.smk_known) == 0)
-                       return 0;
+                       goto out_audit;
        }
        /*
         * Beyond here an explicit relationship is required.
@@ -148,28 +159,36 @@ int smk_access(char *subject_label, char *object_label, int request)
         * This is a bit map operation.
         */
        if ((request & may) == request)
-               return 0;
-
-       return -EACCES;
+               goto out_audit;
+
+       rc = -EACCES;
+out_audit:
+#ifdef CONFIG_AUDIT
+       if (a)
+               smack_log(subject_label, object_label, request, rc, a);
+#endif
+       return rc;
 }
 
 /**
  * smk_curacc - determine if current has a specific access to an object
  * @obj_label: a pointer to the object's Smack label
  * @mode: the access requested, in "MAY" format
+ * @a : common audit data
  *
  * This function checks the current subject label/object label pair
  * in the access rule list and returns 0 if the access is permitted,
  * non zero otherwise. It allows that current may have the capability
  * to override the rules.
  */
-int smk_curacc(char *obj_label, u32 mode)
+int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a)
 {
        int rc;
+       char *sp = current_security();
 
-       rc = smk_access(current_security(), obj_label, mode);
+       rc = smk_access(sp, obj_label, mode, NULL);
        if (rc == 0)
-               return 0;
+               goto out_audit;
 
        /*
         * Return if a specific label has been designated as the
@@ -177,14 +196,105 @@ int smk_curacc(char *obj_label, u32 mode)
         * have that label.
         */
        if (smack_onlycap != NULL && smack_onlycap != current->cred->security)
-               return rc;
+               goto out_audit;
 
        if (capable(CAP_MAC_OVERRIDE))
                return 0;
 
+out_audit:
+#ifdef CONFIG_AUDIT
+       if (a)
+               smack_log(sp, obj_label, mode, rc, a);
+#endif
        return rc;
 }
 
+#ifdef CONFIG_AUDIT
+/**
+ * smack_str_from_perm : helper to translate an access mask to a
+ * readable string
+ * @string : the buffer to fill, at least SMK_NUM_ACCESS_TYPE + 1 bytes
+ * @access : the access bits, in "MAY" format
+ *
+ */
+static inline void smack_str_from_perm(char *string, int access)
+{
+       int i = 0;
+       if (access & MAY_READ)
+               string[i++] = 'r';
+       if (access & MAY_WRITE)
+               string[i++] = 'w';
+       if (access & MAY_EXEC)
+               string[i++] = 'x';
+       if (access & MAY_APPEND)
+               string[i++] = 'a';
+       string[i] = '\0';
+}
+/**
+ * smack_log_callback - SMACK specific information
+ * will be called by generic audit code
+ * @ab : the audit_buffer
+ * @a  : audit_data
+ *
+ */
+static void smack_log_callback(struct audit_buffer *ab, void *a)
+{
+       struct common_audit_data *ad = a;
+       struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data;
+       audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function,
+                        sad->result ? "denied" : "granted");
+       audit_log_format(ab, " subject=");
+       audit_log_untrustedstring(ab, sad->subject);
+       audit_log_format(ab, " object=");
+       audit_log_untrustedstring(ab, sad->object);
+       audit_log_format(ab, " requested=%s", sad->request);
+}
+
+/**
+ *  smack_log - Audit the granting or denial of permissions.
+ *  @subject_label : smack label of the requester
+ *  @object_label  : smack label of the object being accessed
+ *  @request: requested permissions
+ *  @result: result from smk_access
+ *  @ad: auxiliary audit data
+ *
+ * Audit the granting or denial of permissions in accordance
+ * with the policy.
+ */
+void smack_log(char *subject_label, char *object_label, int request,
+              int result, struct smk_audit_info *ad)
+{
+       char request_buffer[SMK_NUM_ACCESS_TYPE + 1];
+       struct smack_audit_data *sad;
+       struct common_audit_data *a = &ad->a;
+
+       /* check if we have to log the current event */
+       if (result != 0 && (log_policy & SMACK_AUDIT_DENIED) == 0)
+               return;
+       if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0)
+               return;
+
+       if (a->function == NULL)
+               a->function = "unknown";
+
+       /* end preparing the audit data */
+       sad = &a->lsm_priv.smack_audit_data;
+       smack_str_from_perm(request_buffer, request);
+       sad->subject = subject_label;
+       sad->object  = object_label;
+       sad->request = request_buffer;
+       sad->result  = result;
+       a->lsm_pre_audit = smack_log_callback;
+
+       common_lsm_audit(a);
+}
+#else /* #ifdef CONFIG_AUDIT */
+void smack_log(char *subject_label, char *object_label, int request,
+               int result, struct smk_audit_info *ad)
+{
+}
+#endif
+
 static DEFINE_MUTEX(smack_known_lock);
 
 /**
@@ -209,7 +319,8 @@ struct smack_known *smk_import_entry(const char *string, int len)
                if (found)
                        smack[i] = '\0';
                else if (i >= len || string[i] > '~' || string[i] <= ' ' ||
-                        string[i] == '/') {
+                        string[i] == '/' || string[i] == '"' ||
+                        string[i] == '\\' || string[i] == '\'') {
                        smack[i] = '\0';
                        found = 1;
                } else
index 98b3195347ab46d84749920de3b6b45df134de5f..0023182078c726797de5ccab784a1922eba17859 100644 (file)
@@ -30,7 +30,6 @@
 #include <net/netlabel.h>
 #include <net/cipso_ipv4.h>
 #include <linux/audit.h>
-
 #include "smack.h"
 
 #define task_security(task)    (task_cred_xxx((task), security))
@@ -103,14 +102,24 @@ struct inode_smack *new_inode_smack(char *smack)
 static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 {
        int rc;
+       struct smk_audit_info ad;
+       char *sp, *tsp;
 
        rc = cap_ptrace_may_access(ctp, mode);
        if (rc != 0)
                return rc;
 
-       rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE);
+       sp = current_security();
+       tsp = task_security(ctp);
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, ctp);
+
+       /* we won't log here, because rc can be overridden */
+       rc = smk_access(sp, tsp, MAY_READWRITE, NULL);
        if (rc != 0 && capable(CAP_MAC_OVERRIDE))
-               return 0;
+               rc = 0;
+
+       smack_log(sp, tsp, MAY_READWRITE, rc, &ad);
        return rc;
 }
 
@@ -125,14 +134,24 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode)
 static int smack_ptrace_traceme(struct task_struct *ptp)
 {
        int rc;
+       struct smk_audit_info ad;
+       char *sp, *tsp;
 
        rc = cap_ptrace_traceme(ptp);
        if (rc != 0)
                return rc;
 
-       rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE);
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, ptp);
+
+       sp = current_security();
+       tsp = task_security(ptp);
+       /* we won't log here, because rc can be overridden */
+       rc = smk_access(tsp, sp, MAY_READWRITE, NULL);
        if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE))
-               return 0;
+               rc = 0;
+
+       smack_log(tsp, sp, MAY_READWRITE, rc, &ad);
        return rc;
 }
 
@@ -327,8 +346,14 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data)
 static int smack_sb_statfs(struct dentry *dentry)
 {
        struct superblock_smack *sbp = dentry->d_sb->s_security;
+       int rc;
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-       return smk_curacc(sbp->smk_floor, MAY_READ);
+       rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad);
+       return rc;
 }
 
 /**
@@ -346,8 +371,12 @@ static int smack_sb_mount(char *dev_name, struct path *path,
                          char *type, unsigned long flags, void *data)
 {
        struct superblock_smack *sbp = path->mnt->mnt_sb->s_security;
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path(&ad, *path);
 
-       return smk_curacc(sbp->smk_floor, MAY_WRITE);
+       return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /**
@@ -361,10 +390,14 @@ static int smack_sb_mount(char *dev_name, struct path *path,
 static int smack_sb_umount(struct vfsmount *mnt, int flags)
 {
        struct superblock_smack *sbp;
+       struct smk_audit_info ad;
 
-       sbp = mnt->mnt_sb->s_security;
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, mnt->mnt_mountpoint);
+       smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
 
-       return smk_curacc(sbp->smk_floor, MAY_WRITE);
+       sbp = mnt->mnt_sb->s_security;
+       return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad);
 }
 
 /*
@@ -441,15 +474,20 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir,
 static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
                            struct dentry *new_dentry)
 {
-       int rc;
        char *isp;
+       struct smk_audit_info ad;
+       int rc;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
        isp = smk_of_inode(old_dentry->d_inode);
-       rc = smk_curacc(isp, MAY_WRITE);
+       rc = smk_curacc(isp, MAY_WRITE, &ad);
 
        if (rc == 0 && new_dentry->d_inode != NULL) {
                isp = smk_of_inode(new_dentry->d_inode);
-               rc = smk_curacc(isp, MAY_WRITE);
+               smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+               rc = smk_curacc(isp, MAY_WRITE, &ad);
        }
 
        return rc;
@@ -466,18 +504,24 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir,
 static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
 {
        struct inode *ip = dentry->d_inode;
+       struct smk_audit_info ad;
        int rc;
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
        /*
         * You need write access to the thing you're unlinking
         */
-       rc = smk_curacc(smk_of_inode(ip), MAY_WRITE);
-       if (rc == 0)
+       rc = smk_curacc(smk_of_inode(ip), MAY_WRITE, &ad);
+       if (rc == 0) {
                /*
                 * You also need write access to the containing directory
                 */
-               rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
-
+               smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+               smk_ad_setfield_u_fs_inode(&ad, dir);
+               rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+       }
        return rc;
 }
 
@@ -491,17 +535,24 @@ static int smack_inode_unlink(struct inode *dir, struct dentry *dentry)
  */
 static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry)
 {
+       struct smk_audit_info ad;
        int rc;
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
        /*
         * You need write access to the thing you're removing
         */
-       rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
-       if (rc == 0)
+       rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
+       if (rc == 0) {
                /*
                 * You also need write access to the containing directory
                 */
-               rc = smk_curacc(smk_of_inode(dir), MAY_WRITE);
+               smk_ad_setfield_u_fs_path_dentry(&ad, NULL);
+               smk_ad_setfield_u_fs_inode(&ad, dir);
+               rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad);
+       }
 
        return rc;
 }
@@ -525,15 +576,19 @@ static int smack_inode_rename(struct inode *old_inode,
 {
        int rc;
        char *isp;
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry);
 
        isp = smk_of_inode(old_dentry->d_inode);
-       rc = smk_curacc(isp, MAY_READWRITE);
+       rc = smk_curacc(isp, MAY_READWRITE, &ad);
 
        if (rc == 0 && new_dentry->d_inode != NULL) {
                isp = smk_of_inode(new_dentry->d_inode);
-               rc = smk_curacc(isp, MAY_READWRITE);
+               smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry);
+               rc = smk_curacc(isp, MAY_READWRITE, &ad);
        }
-
        return rc;
 }
 
@@ -548,13 +603,15 @@ static int smack_inode_rename(struct inode *old_inode,
  */
 static int smack_inode_permission(struct inode *inode, int mask)
 {
+       struct smk_audit_info ad;
        /*
         * No permission to check. Existence test. Yup, it's there.
         */
        if (mask == 0)
                return 0;
-
-       return smk_curacc(smk_of_inode(inode), mask);
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_inode(&ad, inode);
+       return smk_curacc(smk_of_inode(inode), mask, &ad);
 }
 
 /**
@@ -566,13 +623,16 @@ static int smack_inode_permission(struct inode *inode, int mask)
  */
 static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
 {
+       struct smk_audit_info ad;
        /*
         * Need to allow for clearing the setuid bit.
         */
        if (iattr->ia_valid & ATTR_FORCE)
                return 0;
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
 
-       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 }
 
 /**
@@ -584,7 +644,12 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr)
  */
 static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 {
-       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+       smk_ad_setfield_u_fs_path_mnt(&ad, mnt);
+       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /**
@@ -602,6 +667,7 @@ static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry)
 static int smack_inode_setxattr(struct dentry *dentry, const char *name,
                                const void *value, size_t size, int flags)
 {
+       struct smk_audit_info ad;
        int rc = 0;
 
        if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -619,8 +685,11 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name,
        } else
                rc = cap_inode_setxattr(dentry, name, value, size, flags);
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
        if (rc == 0)
-               rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+               rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
        return rc;
 }
@@ -672,7 +741,12 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name,
  */
 static int smack_inode_getxattr(struct dentry *dentry, const char *name)
 {
-       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ);
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
+
+       return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad);
 }
 
 /*
@@ -686,6 +760,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name)
  */
 static int smack_inode_removexattr(struct dentry *dentry, const char *name)
 {
+       struct smk_audit_info ad;
        int rc = 0;
 
        if (strcmp(name, XATTR_NAME_SMACK) == 0 ||
@@ -696,8 +771,10 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name)
        } else
                rc = cap_inode_removexattr(dentry, name);
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, dentry);
        if (rc == 0)
-               rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE);
+               rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad);
 
        return rc;
 }
@@ -856,12 +933,16 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
        int rc = 0;
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path(&ad, file->f_path);
 
        if (_IOC_DIR(cmd) & _IOC_WRITE)
-               rc = smk_curacc(file->f_security, MAY_WRITE);
+               rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
 
        if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ))
-               rc = smk_curacc(file->f_security, MAY_READ);
+               rc = smk_curacc(file->f_security, MAY_READ, &ad);
 
        return rc;
 }
@@ -875,7 +956,11 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd,
  */
 static int smack_file_lock(struct file *file, unsigned int cmd)
 {
-       return smk_curacc(file->f_security, MAY_WRITE);
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path_dentry(&ad, file->f_path.dentry);
+       return smk_curacc(file->f_security, MAY_WRITE, &ad);
 }
 
 /**
@@ -889,8 +974,12 @@ static int smack_file_lock(struct file *file, unsigned int cmd)
 static int smack_file_fcntl(struct file *file, unsigned int cmd,
                            unsigned long arg)
 {
+       struct smk_audit_info ad;
        int rc;
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS);
+       smk_ad_setfield_u_fs_path(&ad, file->f_path);
+
        switch (cmd) {
        case F_DUPFD:
        case F_GETFD:
@@ -898,7 +987,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
        case F_GETLK:
        case F_GETOWN:
        case F_GETSIG:
-               rc = smk_curacc(file->f_security, MAY_READ);
+               rc = smk_curacc(file->f_security, MAY_READ, &ad);
                break;
        case F_SETFD:
        case F_SETFL:
@@ -906,10 +995,10 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd,
        case F_SETLKW:
        case F_SETOWN:
        case F_SETSIG:
-               rc = smk_curacc(file->f_security, MAY_WRITE);
+               rc = smk_curacc(file->f_security, MAY_WRITE, &ad);
                break;
        default:
-               rc = smk_curacc(file->f_security, MAY_READWRITE);
+               rc = smk_curacc(file->f_security, MAY_READWRITE, &ad);
        }
 
        return rc;
@@ -944,14 +1033,21 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 {
        struct file *file;
        int rc;
+       char *tsp = tsk->cred->security;
+       struct smk_audit_info ad;
 
        /*
         * struct fown_struct is never outside the context of a struct file
         */
        file = container_of(fown, struct file, f_owner);
-       rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE);
+       /* we don't log here as rc can be overridden */
+       rc = smk_access(file->f_security, tsp, MAY_WRITE, NULL);
        if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE))
-               return 0;
+               rc = 0;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, tsk);
+       smack_log(file->f_security, tsp, MAY_WRITE, rc, &ad);
        return rc;
 }
 
@@ -964,7 +1060,10 @@ static int smack_file_send_sigiotask(struct task_struct *tsk,
 static int smack_file_receive(struct file *file)
 {
        int may = 0;
+       struct smk_audit_info ad;
 
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_fs_path(&ad, file->f_path);
        /*
         * This code relies on bitmasks.
         */
@@ -973,7 +1072,7 @@ static int smack_file_receive(struct file *file)
        if (file->f_mode & FMODE_WRITE)
                may |= MAY_WRITE;
 
-       return smk_curacc(file->f_security, may);
+       return smk_curacc(file->f_security, may, &ad);
 }
 
 /*
@@ -1052,6 +1151,22 @@ static int smack_kernel_create_files_as(struct cred *new,
        return 0;
 }
 
+/**
+ * smk_curacc_on_task - helper to log task related access
+ * @p: the task object
+ * @access : the access requested
+ *
+ * Return 0 if access is permitted
+ */
+static int smk_curacc_on_task(struct task_struct *p, int access)
+{
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, p);
+       return smk_curacc(task_security(p), access, &ad);
+}
+
 /**
  * smack_task_setpgid - Smack check on setting pgid
  * @p: the task object
@@ -1061,7 +1176,7 @@ static int smack_kernel_create_files_as(struct cred *new,
  */
 static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
 {
-       return smk_curacc(task_security(p), MAY_WRITE);
+       return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1072,7 +1187,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid)
  */
 static int smack_task_getpgid(struct task_struct *p)
 {
-       return smk_curacc(task_security(p), MAY_READ);
+       return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1083,7 +1198,7 @@ static int smack_task_getpgid(struct task_struct *p)
  */
 static int smack_task_getsid(struct task_struct *p)
 {
-       return smk_curacc(task_security(p), MAY_READ);
+       return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1111,7 +1226,7 @@ static int smack_task_setnice(struct task_struct *p, int nice)
 
        rc = cap_task_setnice(p, nice);
        if (rc == 0)
-               rc = smk_curacc(task_security(p), MAY_WRITE);
+               rc = smk_curacc_on_task(p, MAY_WRITE);
        return rc;
 }
 
@@ -1128,7 +1243,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
 
        rc = cap_task_setioprio(p, ioprio);
        if (rc == 0)
-               rc = smk_curacc(task_security(p), MAY_WRITE);
+               rc = smk_curacc_on_task(p, MAY_WRITE);
        return rc;
 }
 
@@ -1140,7 +1255,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio)
  */
 static int smack_task_getioprio(struct task_struct *p)
 {
-       return smk_curacc(task_security(p), MAY_READ);
+       return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1158,7 +1273,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
 
        rc = cap_task_setscheduler(p, policy, lp);
        if (rc == 0)
-               rc = smk_curacc(task_security(p), MAY_WRITE);
+               rc = smk_curacc_on_task(p, MAY_WRITE);
        return rc;
 }
 
@@ -1170,7 +1285,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy,
  */
 static int smack_task_getscheduler(struct task_struct *p)
 {
-       return smk_curacc(task_security(p), MAY_READ);
+       return smk_curacc_on_task(p, MAY_READ);
 }
 
 /**
@@ -1181,7 +1296,7 @@ static int smack_task_getscheduler(struct task_struct *p)
  */
 static int smack_task_movememory(struct task_struct *p)
 {
-       return smk_curacc(task_security(p), MAY_WRITE);
+       return smk_curacc_on_task(p, MAY_WRITE);
 }
 
 /**
@@ -1199,18 +1314,23 @@ static int smack_task_movememory(struct task_struct *p)
 static int smack_task_kill(struct task_struct *p, struct siginfo *info,
                           int sig, u32 secid)
 {
+       struct smk_audit_info ad;
+
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, p);
        /*
         * Sending a signal requires that the sender
         * can write the receiver.
         */
        if (secid == 0)
-               return smk_curacc(task_security(p), MAY_WRITE);
+               return smk_curacc(task_security(p), MAY_WRITE, &ad);
        /*
         * If the secid isn't 0 we're dealing with some USB IO
         * specific behavior. This is not clean. For one thing
         * we can't take privilege into account.
         */
-       return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE);
+       return smk_access(smack_from_secid(secid), task_security(p),
+                         MAY_WRITE, &ad);
 }
 
 /**
@@ -1221,11 +1341,15 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info,
  */
 static int smack_task_wait(struct task_struct *p)
 {
+       struct smk_audit_info ad;
+       char *sp = current_security();
+       char *tsp = task_security(p);
        int rc;
 
-       rc = smk_access(current_security(), task_security(p), MAY_WRITE);
+       /* we don't log here, we can be overridden */
+       rc = smk_access(sp, tsp, MAY_WRITE, NULL);
        if (rc == 0)
-               return 0;
+               goto out_log;
 
        /*
         * Allow the operation to succeed if either task
@@ -1239,8 +1363,12 @@ static int smack_task_wait(struct task_struct *p)
         * the smack value.
         */
        if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE))
-               return 0;
-
+               rc = 0;
+       /* log the final result, after any capability override */
+ out_log:
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK);
+       smk_ad_setfield_u_tsk(&ad, p);
+       smack_log(sp, tsp, MAY_WRITE, rc, &ad);
        return rc;
 }
 
@@ -1456,12 +1584,19 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap)
        int sk_lbl;
        char *hostsp;
        struct socket_smack *ssp = sk->sk_security;
+       struct smk_audit_info ad;
 
        rcu_read_lock();
        hostsp = smack_host_label(sap);
        if (hostsp != NULL) {
                sk_lbl = SMACK_UNLABELED_SOCKET;
-               rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE);
+#ifdef CONFIG_AUDIT
+               smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+               ad.a.u.net.family = sap->sin_family;
+               ad.a.u.net.dport = sap->sin_port;
+               ad.a.u.net.v4info.daddr = sap->sin_addr.s_addr;
+#endif
+               rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE, &ad);
        } else {
                sk_lbl = SMACK_CIPSO_SOCKET;
                rc = 0;
@@ -1656,6 +1791,25 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
        isp->security = NULL;
 }
 
+/**
+ * smk_curacc_shm - check if current has access on shm
+ * @shp: the shm object whose Smack label and IPC id are used
+ * @access: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_shm(struct shmid_kernel *shp, int access)
+{
+       char *ssp = smack_of_shm(shp);
+       struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+       ad.a.u.ipc_id = shp->shm_perm.id;
+#endif
+       return smk_curacc(ssp, access, &ad);
+}
+
 /**
  * smack_shm_associate - Smack access check for shm
  * @shp: the object
@@ -1665,11 +1819,10 @@ static void smack_shm_free_security(struct shmid_kernel *shp)
  */
 static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
 {
-       char *ssp = smack_of_shm(shp);
        int may;
 
        may = smack_flags_to_may(shmflg);
-       return smk_curacc(ssp, may);
+       return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1681,7 +1834,6 @@ static int smack_shm_associate(struct shmid_kernel *shp, int shmflg)
  */
 static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 {
-       char *ssp;
        int may;
 
        switch (cmd) {
@@ -1704,9 +1856,7 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
        default:
                return -EINVAL;
        }
-
-       ssp = smack_of_shm(shp);
-       return smk_curacc(ssp, may);
+       return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1720,11 +1870,10 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd)
 static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr,
                           int shmflg)
 {
-       char *ssp = smack_of_shm(shp);
        int may;
 
        may = smack_flags_to_may(shmflg);
-       return smk_curacc(ssp, may);
+       return smk_curacc_shm(shp, may);
 }
 
 /**
@@ -1765,6 +1914,25 @@ static void smack_sem_free_security(struct sem_array *sma)
        isp->security = NULL;
 }
 
+/**
+ * smk_curacc_sem - check if current has access on sem
+ * @sma: the sem object whose Smack label and IPC id are used
+ * @access: access requested
+ *
+ * Returns 0 if current has the requested access, error code otherwise
+ */
+static int smk_curacc_sem(struct sem_array *sma, int access)
+{
+       char *ssp = smack_of_sem(sma);
+       struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+       ad.a.u.ipc_id = sma->sem_perm.id;
+#endif
+       return smk_curacc(ssp, access, &ad);
+}
+
 /**
  * smack_sem_associate - Smack access check for sem
  * @sma: the object
@@ -1774,11 +1942,10 @@ static void smack_sem_free_security(struct sem_array *sma)
  */
 static int smack_sem_associate(struct sem_array *sma, int semflg)
 {
-       char *ssp = smack_of_sem(sma);
        int may;
 
        may = smack_flags_to_may(semflg);
-       return smk_curacc(ssp, may);
+       return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1790,7 +1957,6 @@ static int smack_sem_associate(struct sem_array *sma, int semflg)
  */
 static int smack_sem_semctl(struct sem_array *sma, int cmd)
 {
-       char *ssp;
        int may;
 
        switch (cmd) {
@@ -1819,8 +1985,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
                return -EINVAL;
        }
 
-       ssp = smack_of_sem(sma);
-       return smk_curacc(ssp, may);
+       return smk_curacc_sem(sma, may);
 }
 
 /**
@@ -1837,9 +2002,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd)
 static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops,
                           unsigned nsops, int alter)
 {
-       char *ssp = smack_of_sem(sma);
-
-       return smk_curacc(ssp, MAY_READWRITE);
+       return smk_curacc_sem(sma, MAY_READWRITE);
 }
 
 /**
@@ -1880,6 +2043,25 @@ static char *smack_of_msq(struct msg_queue *msq)
        return (char *)msq->q_perm.security;
 }
 
+/**
+ * smk_curacc_msq - helper to check if current has access on msq
+ * @msq: the msq whose Smack label and IPC id are used
+ * @access: access requested
+ *
+ * Returns 0 if current has access, error otherwise
+ */
+static int smk_curacc_msq(struct msg_queue *msq, int access)
+{
+       char *msp = smack_of_msq(msq);
+       struct smk_audit_info ad;
+
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+       ad.a.u.ipc_id = msq->q_perm.id;
+#endif
+       return smk_curacc(msp, access, &ad);
+}
+
 /**
  * smack_msg_queue_associate - Smack access check for msg_queue
  * @msq: the object
@@ -1889,11 +2071,10 @@ static char *smack_of_msq(struct msg_queue *msq)
  */
 static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
 {
-       char *msp = smack_of_msq(msq);
        int may;
 
        may = smack_flags_to_may(msqflg);
-       return smk_curacc(msp, may);
+       return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1905,7 +2086,6 @@ static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg)
  */
 static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 {
-       char *msp;
        int may;
 
        switch (cmd) {
@@ -1927,8 +2107,7 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
                return -EINVAL;
        }
 
-       msp = smack_of_msq(msq);
-       return smk_curacc(msp, may);
+       return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1942,11 +2121,10 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd)
 static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
                                  int msqflg)
 {
-       char *msp = smack_of_msq(msq);
-       int rc;
+       int may;
 
-       rc = smack_flags_to_may(msqflg);
-       return smk_curacc(msp, rc);
+       may = smack_flags_to_may(msqflg);
+       return smk_curacc_msq(msq, may);
 }
 
 /**
@@ -1962,9 +2140,7 @@ static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
                        struct task_struct *target, long type, int mode)
 {
-       char *msp = smack_of_msq(msq);
-
-       return smk_curacc(msp, MAY_READWRITE);
+       return smk_curacc_msq(msq, MAY_READWRITE);
 }
 
 /**
@@ -1977,10 +2153,14 @@ static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg,
 static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag)
 {
        char *isp = ipp->security;
-       int may;
+       int may = smack_flags_to_may(flag);
+       struct smk_audit_info ad;
 
-       may = smack_flags_to_may(flag);
-       return smk_curacc(isp, may);
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC);
+       ad.a.u.ipc_id = ipp->id;
+#endif
+       return smk_curacc(isp, may, &ad);
 }
 
 /**
@@ -2239,8 +2419,12 @@ static int smack_unix_stream_connect(struct socket *sock,
 {
        struct inode *sp = SOCK_INODE(sock);
        struct inode *op = SOCK_INODE(other);
+       struct smk_audit_info ad;
 
-       return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE);
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+       smk_ad_setfield_u_net_sk(&ad, other->sk);
+       return smk_access(smk_of_inode(sp), smk_of_inode(op),
+                                MAY_READWRITE, &ad);
 }
 
 /**
@@ -2255,8 +2439,11 @@ static int smack_unix_may_send(struct socket *sock, struct socket *other)
 {
        struct inode *sp = SOCK_INODE(sock);
        struct inode *op = SOCK_INODE(other);
+       struct smk_audit_info ad;
 
-       return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE);
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+       smk_ad_setfield_u_net_sk(&ad, other->sk);
+       return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE, &ad);
 }
 
 /**
@@ -2371,7 +2558,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
        char smack[SMK_LABELLEN];
        char *csp;
        int rc;
-
+       struct smk_audit_info ad;
        if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6)
                return 0;
 
@@ -2389,13 +2576,19 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
        netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+       ad.a.u.net.family = sk->sk_family;
+       ad.a.u.net.netif = skb->iif;
+       ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
        /*
         * Receiving a packet requires that the other end
         * be able to write here. Read access is not required.
         * This is the simplist possible security model
         * for networking.
         */
-       rc = smk_access(csp, ssp->smk_in, MAY_WRITE);
+       rc = smk_access(csp, ssp->smk_in, MAY_WRITE, &ad);
        if (rc != 0)
                netlbl_skbuff_err(skb, rc, 0);
        return rc;
@@ -2524,6 +2717,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
        struct iphdr *hdr;
        char smack[SMK_LABELLEN];
        int rc;
+       struct smk_audit_info ad;
 
        /* handle mapped IPv4 packets arriving via IPv6 sockets */
        if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP))
@@ -2537,11 +2731,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb,
                strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN);
        netlbl_secattr_destroy(&secattr);
 
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET);
+       ad.a.u.net.family = family;
+       ad.a.u.net.netif = skb->iif;
+       ipv4_skb_to_auditdata(skb, &ad.a, NULL);
+#endif
        /*
         * Receiving a packet requires that the other end be able to write
         * here. Read access is not required.
         */
-       rc = smk_access(smack, ssp->smk_in, MAY_WRITE);
+       rc = smk_access(smack, ssp->smk_in, MAY_WRITE, &ad);
        if (rc != 0)
                return rc;
 
@@ -2643,6 +2843,7 @@ static int smack_key_permission(key_ref_t key_ref,
                                const struct cred *cred, key_perm_t perm)
 {
        struct key *keyp;
+       struct smk_audit_info ad;
 
        keyp = key_ref_to_ptr(key_ref);
        if (keyp == NULL)
@@ -2658,8 +2859,13 @@ static int smack_key_permission(key_ref_t key_ref,
         */
        if (cred->security == NULL)
                return -EACCES;
-
-       return smk_access(cred->security, keyp->security, MAY_READWRITE);
+#ifdef CONFIG_AUDIT
+       smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY);
+       ad.a.u.key_struct.key = keyp->serial;
+       ad.a.u.key_struct.key_desc = keyp->description;
+#endif
+       return smk_access(cred->security, keyp->security,
+                                MAY_READWRITE, &ad);
 }
 #endif /* CONFIG_KEYS */
 
@@ -2828,15 +3034,7 @@ struct security_operations smack_ops = {
 
        .ptrace_may_access =            smack_ptrace_may_access,
        .ptrace_traceme =               smack_ptrace_traceme,
-       .capget =                       cap_capget,
-       .capset =                       cap_capset,
-       .capable =                      cap_capable,
        .syslog =                       smack_syslog,
-       .settime =                      cap_settime,
-       .vm_enough_memory =             cap_vm_enough_memory,
-
-       .bprm_set_creds =               cap_bprm_set_creds,
-       .bprm_secureexec =              cap_bprm_secureexec,
 
        .sb_alloc_security =            smack_sb_alloc_security,
        .sb_free_security =             smack_sb_free_security,
@@ -2860,8 +3058,6 @@ struct security_operations smack_ops = {
        .inode_post_setxattr =          smack_inode_post_setxattr,
        .inode_getxattr =               smack_inode_getxattr,
        .inode_removexattr =            smack_inode_removexattr,
-       .inode_need_killpriv =          cap_inode_need_killpriv,
-       .inode_killpriv =               cap_inode_killpriv,
        .inode_getsecurity =            smack_inode_getsecurity,
        .inode_setsecurity =            smack_inode_setsecurity,
        .inode_listsecurity =           smack_inode_listsecurity,
@@ -2882,7 +3078,6 @@ struct security_operations smack_ops = {
        .cred_commit =                  smack_cred_commit,
        .kernel_act_as =                smack_kernel_act_as,
        .kernel_create_files_as =       smack_kernel_create_files_as,
-       .task_fix_setuid =              cap_task_fix_setuid,
        .task_setpgid =                 smack_task_setpgid,
        .task_getpgid =                 smack_task_getpgid,
        .task_getsid =                  smack_task_getsid,
@@ -2896,7 +3091,6 @@ struct security_operations smack_ops = {
        .task_kill =                    smack_task_kill,
        .task_wait =                    smack_task_wait,
        .task_to_inode =                smack_task_to_inode,
-       .task_prctl =                   cap_task_prctl,
 
        .ipc_permission =               smack_ipc_permission,
        .ipc_getsecid =                 smack_ipc_getsecid,
@@ -2923,9 +3117,6 @@ struct security_operations smack_ops = {
        .sem_semctl =                   smack_sem_semctl,
        .sem_semop =                    smack_sem_semop,
 
-       .netlink_send =                 cap_netlink_send,
-       .netlink_recv =                 cap_netlink_recv,
-
        .d_instantiate =                smack_d_instantiate,
 
        .getprocattr =                  smack_getprocattr,
index e03a7e19c73b3a0b49d7533db45ecb719d250f30..f83a809807263beacdb756adae4696b6cda33309 100644 (file)
@@ -41,6 +41,7 @@ enum smk_inos {
        SMK_AMBIENT     = 7,    /* internet ambient label */
        SMK_NETLBLADDR  = 8,    /* single label hosts */
        SMK_ONLYCAP     = 9,    /* the only "capable" label */
+       SMK_LOGGING     = 10,   /* logging */
 };
 
 /*
@@ -734,8 +735,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
                return;
        }
 
-       m = list_entry(rcu_dereference(smk_netlbladdr_list.next),
-                        struct smk_netlbladdr, list);
+       m = list_entry_rcu(smk_netlbladdr_list.next,
+                          struct smk_netlbladdr, list);
 
        /* the comparison '>' is a bit hacky, but works */
        if (new->smk_mask.s_addr > m->smk_mask.s_addr) {
@@ -748,8 +749,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new)
                        list_add_rcu(&new->list, &m->list);
                        return;
                }
-               m_next = list_entry(rcu_dereference(m->list.next),
-                                struct smk_netlbladdr, list);
+               m_next = list_entry_rcu(m->list.next,
+                                       struct smk_netlbladdr, list);
                if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) {
                        list_add_rcu(&new->list, &m->list);
                        return;
@@ -775,7 +776,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf,
        struct sockaddr_in newname;
        char smack[SMK_LABELLEN];
        char *sp;
-       char data[SMK_NETLBLADDRMAX];
+       char data[SMK_NETLBLADDRMAX + 1];
        char *host = (char *)&newname.sin_addr.s_addr;
        int rc;
        struct netlbl_audit audit_info;
@@ -1191,6 +1192,69 @@ static const struct file_operations smk_onlycap_ops = {
        .write          = smk_write_onlycap,
 };
 
+/**
+ * smk_read_logging - read() for /smack/logging
+ * @filp: file pointer, not actually used
+ * @buf: where to put the result
+ * @count: maximum to send along
+ * @ppos: where to start; a nonzero position reads nothing
+ *
+ * Returns number of bytes read or error code, as appropriate
+ */
+static ssize_t smk_read_logging(struct file *filp, char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       char temp[32];
+       ssize_t rc;
+
+       if (*ppos != 0)
+               return 0;
+
+       sprintf(temp, "%d\n", log_policy);
+       rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp));
+       return rc;
+}
+
+/**
+ * smk_write_logging - write() for /smack/logging
+ * @file: file pointer, not actually used
+ * @buf: where to get the data from
+ * @count: bytes sent
+ * @ppos: file position, not actually used
+ *
+ * Returns @count on success, or -EPERM, -EINVAL or -EFAULT on failure
+ */
+static ssize_t smk_write_logging(struct file *file, const char __user *buf,
+                               size_t count, loff_t *ppos)
+{
+       char temp[32];
+       int i;
+
+       if (!capable(CAP_MAC_ADMIN))
+               return -EPERM;
+
+       if (count >= sizeof(temp) || count == 0)
+               return -EINVAL;
+
+       if (copy_from_user(temp, buf, count) != 0)
+               return -EFAULT;
+
+       temp[count] = '\0';
+
+       if (sscanf(temp, "%d", &i) != 1)
+               return -EINVAL;
+       if (i < 0 || i > 3)
+               return -EINVAL;
+       log_policy = i;
+       return count;
+}
+
+
+/* File operations backing the "logging" smackfs entry */
+static const struct file_operations smk_logging_ops = {
+       .read           = smk_read_logging,
+       .write          = smk_write_logging,
+};
 /**
  * smk_fill_super - fill the /smackfs superblock
  * @sb: the empty superblock
@@ -1221,6 +1285,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent)
                        {"netlabel", &smk_netlbladdr_ops, S_IRUGO|S_IWUSR},
                [SMK_ONLYCAP]   =
                        {"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR},
+               [SMK_LOGGING]   =
+                       {"logging", &smk_logging_ops, S_IRUGO|S_IWUSR},
                /* last one */ {""}
        };
 
index ddfb9cccf468aee271c4a84757c5f475a8c47524..fdd1f4b8c448e4c1fa6611bb9456f5e9c9d30070 100644 (file)
@@ -28,7 +28,13 @@ static const char *tomoyo_mode_2[4] = {
        "disabled", "enabled", "enabled", "enabled"
 };
 
-/* Table for profile. */
+/*
+ * tomoyo_control_array is static data which contains
+ *
+ *  (1) functionality name used by /sys/kernel/security/tomoyo/profile .
+ *  (2) initial values for "struct tomoyo_profile".
+ *  (3) max values for "struct tomoyo_profile".
+ */
 static struct {
        const char *keyword;
        unsigned int current_value;
@@ -39,7 +45,13 @@ static struct {
        [TOMOYO_VERBOSE]          = { "TOMOYO_VERBOSE",      1,       1 },
 };
 
-/* Profile table. Memory is allocated as needed. */
+/*
+ * tomoyo_profile is a structure which is used for holding the mode of access
+ * controls. TOMOYO has 4 modes: disabled, learning, permissive, enforcing.
+ * An administrator can define up to 256 profiles.
+ * The ->profile of "struct tomoyo_domain_info" is used for remembering
+ * the profile's number (0 - 255) assigned to that domain.
+ */
 static struct tomoyo_profile {
        unsigned int value[TOMOYO_MAX_CONTROL_INDEX];
        const struct tomoyo_path_info *comment;
@@ -428,7 +440,6 @@ void tomoyo_fill_path_info(struct tomoyo_path_info *ptr)
        const char *name = ptr->name;
        const int len = strlen(name);
 
-       ptr->total_len = len;
        ptr->const_len = tomoyo_const_part_length(name);
        ptr->is_dir = len && (name[len - 1] == '/');
        ptr->is_patterned = (ptr->const_len < len);
@@ -866,7 +877,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
 
        if (profile >= TOMOYO_MAX_PROFILES)
                return NULL;
-       /***** EXCLUSIVE SECTION START *****/
        mutex_lock(&lock);
        ptr = tomoyo_profile_ptr[profile];
        if (ptr)
@@ -880,7 +890,6 @@ static struct tomoyo_profile *tomoyo_find_or_assign_new_profile(const unsigned
        tomoyo_profile_ptr[profile] = ptr;
  ok:
        mutex_unlock(&lock);
-       /***** EXCLUSIVE SECTION END *****/
        return ptr;
 }
 
@@ -1009,7 +1018,19 @@ static int tomoyo_read_profile(struct tomoyo_io_buffer *head)
        return 0;
 }
 
-/* Structure for policy manager. */
+/*
+ * tomoyo_policy_manager_entry is a structure which is used for holding list of
+ * domainnames or programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ * It has the following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_policy_manager_list .
+ *  (2) "manager" is a domainname or a program's pathname.
+ *  (3) "is_domain" is a bool which is true if "manager" is a domainname, false
+ *      otherwise.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_policy_manager_entry {
        struct list_head list;
        /* A path to program or a domainname. */
@@ -1018,7 +1039,36 @@ struct tomoyo_policy_manager_entry {
        bool is_deleted; /* True if this entry is deleted. */
 };
 
-/* The list for "struct tomoyo_policy_manager_entry". */
+/*
+ * tomoyo_policy_manager_list is used for holding list of domainnames or
+ * programs which are permitted to modify configuration via
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * An entry is added by
+ *
+ * # echo '<kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a domainname)
+ *
+ *  or
+ *
+ * # echo '/usr/lib/ccs/editpolicy' > /sys/kernel/security/tomoyo/manager
+ *  (if you want to specify by a program's location)
+ *
+ * and is deleted by
+ *
+ * # echo 'delete <kernel> /sbin/mingetty /bin/login /bin/bash' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ *  or
+ *
+ * # echo 'delete /usr/lib/ccs/editpolicy' > \
+ *                                        /sys/kernel/security/tomoyo/manager
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/manager
+ */
 static LIST_HEAD(tomoyo_policy_manager_list);
 static DECLARE_RWSEM(tomoyo_policy_manager_list_lock);
 
@@ -1050,7 +1100,6 @@ static int tomoyo_update_manager_entry(const char *manager,
        saved_manager = tomoyo_save_name(manager);
        if (!saved_manager)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_policy_manager_list_lock);
        list_for_each_entry(ptr, &tomoyo_policy_manager_list, list) {
                if (ptr->manager != saved_manager)
@@ -1072,7 +1121,6 @@ static int tomoyo_update_manager_entry(const char *manager,
        error = 0;
  out:
        up_write(&tomoyo_policy_manager_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -1117,10 +1165,9 @@ static int tomoyo_read_manager_policy(struct tomoyo_io_buffer *head)
                                 list);
                if (ptr->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, "%s\n", ptr->manager->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, "%s\n", ptr->manager->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_policy_manager_list_lock);
        head->read_eof = done;
@@ -1197,13 +1244,11 @@ static bool tomoyo_is_select_one(struct tomoyo_io_buffer *head,
 
        if (sscanf(data, "pid=%u", &pid) == 1) {
                struct task_struct *p;
-               /***** CRITICAL SECTION START *****/
                read_lock(&tasklist_lock);
                p = find_task_by_vpid(pid);
                if (p)
                        domain = tomoyo_real_domain(p);
                read_unlock(&tasklist_lock);
-               /***** CRITICAL SECTION END *****/
        } else if (!strncmp(data, "domain=", 7)) {
                if (tomoyo_is_domain_def(data + 7)) {
                        down_read(&tomoyo_domain_list_lock);
@@ -1447,15 +1492,14 @@ static int tomoyo_read_domain_policy(struct tomoyo_io_buffer *head)
                    TOMOYO_DOMAIN_FLAGS_IGNORE_GLOBAL_ALLOW_READ)
                        ignore_global_allow_read
                                = TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ "\n";
-               if (!tomoyo_io_printf(head,
-                                     "%s\n" TOMOYO_KEYWORD_USE_PROFILE "%u\n"
-                                     "%s%s%s\n", domain->domainname->name,
-                                     domain->profile, quota_exceeded,
-                                     transition_failed,
-                                     ignore_global_allow_read)) {
-                       done = false;
+               done = tomoyo_io_printf(head, "%s\n" TOMOYO_KEYWORD_USE_PROFILE
+                                       "%u\n%s%s%s\n",
+                                       domain->domainname->name,
+                                       domain->profile, quota_exceeded,
+                                       transition_failed,
+                                       ignore_global_allow_read);
+               if (!done)
                        break;
-               }
                head->read_step = 2;
 acl_loop:
                if (head->read_step == 3)
@@ -1463,24 +1507,22 @@ acl_loop:
                /* Print ACL entries in the domain. */
                down_read(&tomoyo_domain_acl_info_list_lock);
                list_for_each_cookie(apos, head->read_var2,
-                                     &domain->acl_info_list) {
+                                    &domain->acl_info_list) {
                        struct tomoyo_acl_info *ptr
                                = list_entry(apos, struct tomoyo_acl_info,
-                                             list);
-                       if (!tomoyo_print_entry(head, ptr)) {
-                               done = false;
+                                            list);
+                       done = tomoyo_print_entry(head, ptr);
+                       if (!done)
                                break;
-                       }
                }
                up_read(&tomoyo_domain_acl_info_list_lock);
                if (!done)
                        break;
                head->read_step = 3;
 tail_mark:
-               if (!tomoyo_io_printf(head, "\n")) {
-                       done = false;
+               done = tomoyo_io_printf(head, "\n");
+               if (!done)
                        break;
-               }
                head->read_step = 1;
                if (head->read_single_domain)
                        break;
@@ -1550,11 +1592,10 @@ static int tomoyo_read_domain_profile(struct tomoyo_io_buffer *head)
                domain = list_entry(pos, struct tomoyo_domain_info, list);
                if (domain->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, "%u %s\n", domain->profile,
-                                     domain->domainname->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, "%u %s\n", domain->profile,
+                                       domain->domainname->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_domain_list_lock);
        head->read_eof = done;
@@ -1594,13 +1635,11 @@ static int tomoyo_read_pid(struct tomoyo_io_buffer *head)
                const int pid = head->read_step;
                struct task_struct *p;
                struct tomoyo_domain_info *domain = NULL;
-               /***** CRITICAL SECTION START *****/
                read_lock(&tasklist_lock);
                p = find_task_by_vpid(pid);
                if (p)
                        domain = tomoyo_real_domain(p);
                read_unlock(&tasklist_lock);
-               /***** CRITICAL SECTION END *****/
                if (domain)
                        tomoyo_io_printf(head, "%d %u %s", pid, domain->profile,
                                         domain->domainname->name);
@@ -2138,7 +2177,13 @@ static ssize_t tomoyo_write(struct file *file, const char __user *buf,
        return tomoyo_write_control(file, buf, count);
 }
 
-/* Operations for /sys/kernel/security/tomoyo/ interface. */
+/*
+ * tomoyo_operations is a "struct file_operations" which is used for handling
+ * /sys/kernel/security/tomoyo/ interface.
+ *
+ * Some files under /sys/kernel/security/tomoyo/ directory accept open(O_RDWR).
+ * See tomoyo_io_buffer for internals.
+ */
 static const struct file_operations tomoyo_operations = {
        .open    = tomoyo_open,
        .release = tomoyo_release,
index 678f4ff16aa4477eb7b6f4e44d744dc6aac4f932..6d6ba09af4576b27d6aba92edad7d3c7b03515b3 100644 (file)
 struct dentry;
 struct vfsmount;
 
-/* Temporary buffer for holding pathnames. */
+/*
+ * tomoyo_page_buffer is a structure which is used for holding a pathname
+ * obtained from "struct dentry" and "struct vfsmount" pair.
+ * As of now, it is 4096 bytes. If users complain that 4096 bytes is too small
+ * (because TOMOYO escapes non ASCII printable characters using \ooo format),
+ * we will make the buffer larger.
+ */
 struct tomoyo_page_buffer {
        char buffer[4096];
 };
 
-/* Structure for holding a token. */
+/*
+ * tomoyo_path_info is a structure which is used for holding a string data
+ * used by TOMOYO.
+ * This structure has several fields for supporting pattern matching.
+ *
+ * (1) "name" is the '\0' terminated string data.
+ * (2) "hash" is full_name_hash(name, strlen(name)).
+ *     This allows tomoyo_pathcmp() to compare by hash before actually
+ *     comparing using strcmp().
+ * (3) "const_len" is the length of the initial segment of "name" which
+ *     consists entirely of non-wildcard characters. In other words, the length
+ *     up to which we can compare two strings using strncmp().
+ * (4) "is_dir" is a bool which is true if "name" ends with "/",
+ *     false otherwise.
+ *     TOMOYO distinguishes directory and non-directory. A directory ends with
+ *     "/" and non-directory does not end with "/".
+ * (5) "is_patterned" is a bool which is true if "name" contains wildcard
+ *     characters, false otherwise. This allows TOMOYO to use "hash" and
+ *     strcmp() for string comparison if "is_patterned" is false.
+ * (6) "depth" is calculated using the number of "/" characters in "name".
+ *     This allows TOMOYO to avoid comparing two pathnames which never match
+ *     (e.g. whether "/var/www/html/index.html" matches "/tmp/sh-thd-\$").
+ */
 struct tomoyo_path_info {
        const char *name;
        u32 hash;          /* = full_name_hash(name, strlen(name)) */
-       u16 total_len;     /* = strlen(name)                       */
        u16 const_len;     /* = tomoyo_const_part_length(name)     */
        bool is_dir;       /* = tomoyo_strendswith(name, "/")      */
        bool is_patterned; /* = tomoyo_path_contains_pattern(name) */
@@ -51,7 +78,20 @@ struct tomoyo_path_info {
  */
 #define TOMOYO_MAX_PATHNAME_LEN 4000
 
-/* Structure for holding requested pathname. */
+/*
+ * tomoyo_path_info_with_data is a structure which is used for holding a
+ * pathname obtained from "struct dentry" and "struct vfsmount" pair.
+ *
+ * "struct tomoyo_path_info_with_data" consists of "struct tomoyo_path_info"
+ * and buffer for the pathname, while "struct tomoyo_page_buffer" consists of
+ * buffer for the pathname only.
+ *
+ * "struct tomoyo_path_info_with_data" is intended to allow TOMOYO to release
+ * both "struct tomoyo_path_info" and buffer for the pathname by single kfree()
+ * so that we don't need to return two pointers to the caller. If the caller
+ * puts "struct tomoyo_path_info" on stack memory, we will be able to remove
+ * "struct tomoyo_path_info_with_data".
+ */
 struct tomoyo_path_info_with_data {
        /* Keep "head" first, for this pointer is passed to tomoyo_free(). */
        struct tomoyo_path_info head;
@@ -61,7 +101,15 @@ struct tomoyo_path_info_with_data {
 };
 
 /*
- * Common header for holding ACL entries.
+ * tomoyo_acl_info is a structure which is used for holding
+ *
+ *  (1) "list" which is linked to the ->acl_info_list of
+ *      "struct tomoyo_domain_info"
+ *  (2) "type" which tells
+ *      (a) type & 0x7F : type of the entry (either
+ *          "struct tomoyo_single_path_acl_record" or
+ *          "struct tomoyo_double_path_acl_record")
+ *      (b) type & 0x80 : whether the entry is marked as "deleted".
  *
  * Packing "struct tomoyo_acl_info" allows
  * "struct tomoyo_single_path_acl_record" to embed "u16" and
@@ -81,7 +129,28 @@ struct tomoyo_acl_info {
 /* This ACL entry is deleted.           */
 #define TOMOYO_ACL_DELETED        0x80
 
-/* Structure for domain information. */
+/*
+ * tomoyo_domain_info is a structure which is used for holding permissions
+ * (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_list .
+ *  (2) "acl_info_list" which is linked to "struct tomoyo_acl_info".
+ *  (3) "domainname" which holds the name of the domain.
+ *  (4) "profile" which remembers profile number assigned to this domain.
+ *  (5) "is_deleted" is a bool which is true if this domain is marked as
+ *      "deleted", false otherwise.
+ *  (6) "quota_warned" is a bool which is used for suppressing the warning
+ *      message when learning mode has learned too many entries.
+ *  (7) "flags" which remembers this domain's attributes.
+ *
+ * A domain's lifecycle is analogous to that of files in the / directory.
+ * Multiple domains with the same domainname cannot be created (just as
+ * creating files with the same filename fails with -EEXIST).
+ * If a process reached a domain, that process can reside in that domain after
+ * that domain is marked as "deleted" (just as a process can access an already
+ * open()ed file after that file was unlink()ed).
+ */
 struct tomoyo_domain_info {
        struct list_head list;
        struct list_head acl_info_list;
@@ -108,10 +177,18 @@ struct tomoyo_domain_info {
 #define TOMOYO_DOMAIN_FLAGS_TRANSITION_FAILED        2
 
 /*
- * Structure for "allow_read/write", "allow_execute", "allow_read",
- * "allow_write", "allow_create", "allow_unlink", "allow_mkdir", "allow_rmdir",
- * "allow_mkfifo", "allow_mksock", "allow_mkblock", "allow_mkchar",
- * "allow_truncate", "allow_symlink" and "allow_rewrite" directive.
+ * tomoyo_single_path_acl_record is a structure which is used for holding an
+ * entry with a one-pathname operation (e.g. open(), mkdir()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename" is the pathname.
+ *
+ * Directives held by this structure are "allow_read/write", "allow_execute",
+ * "allow_read", "allow_write", "allow_create", "allow_unlink", "allow_mkdir",
+ * "allow_rmdir", "allow_mkfifo", "allow_mksock", "allow_mkblock",
+ * "allow_mkchar", "allow_truncate", "allow_symlink" and "allow_rewrite".
  */
 struct tomoyo_single_path_acl_record {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_SINGLE_PATH_ACL */
@@ -120,7 +197,18 @@ struct tomoyo_single_path_acl_record {
        const struct tomoyo_path_info *filename;
 };
 
-/* Structure for "allow_rename" and "allow_link" directive. */
+/*
+ * tomoyo_double_path_acl_record is a structure which is used for holding an
+ * entry with a two-pathname operation (i.e. link() and rename()).
+ * It has following fields.
+ *
+ *  (1) "head" which is a "struct tomoyo_acl_info".
+ *  (2) "perm" which is a bitmask of permitted operations.
+ *  (3) "filename1" is the source/old pathname.
+ *  (4) "filename2" is the destination/new pathname.
+ *
+ * Directives held by this structure are "allow_rename" and "allow_link".
+ */
 struct tomoyo_double_path_acl_record {
        struct tomoyo_acl_info head; /* type = TOMOYO_TYPE_DOUBLE_PATH_ACL */
        u8 perm;
@@ -153,7 +241,29 @@ struct tomoyo_double_path_acl_record {
 #define TOMOYO_VERBOSE                       2
 #define TOMOYO_MAX_CONTROL_INDEX             3
 
-/* Structure for reading/writing policy via securityfs interfaces. */
+/*
+ * tomoyo_io_buffer is a structure which is used for reading and modifying
+ * configuration via /sys/kernel/security/tomoyo/ interface.
+ * It has many fields. ->read_var1 , ->read_var2 , ->write_var1 are used as
+ * cursors.
+ *
+ * Since the content of /sys/kernel/security/tomoyo/domain_policy is a list of
+ * "struct tomoyo_domain_info" entries and each "struct tomoyo_domain_info"
+ * entry has a list of "struct tomoyo_acl_info", we need two cursors when
+ * reading (one is for traversing tomoyo_domain_list and the other is for
+ * traversing "struct tomoyo_acl_info"->acl_info_list ).
+ *
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "select ", TOMOYO seeks the cursor ->read_var1 and ->write_var1 to the
+ * domain with the domainname specified by the rest of that line (NULL is set
+ * if seek failed).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * "delete ", TOMOYO deletes an entry or a domain specified by the rest of that
+ * line (->write_var1 is set to NULL if a domain was deleted).
+ * If a line written to /sys/kernel/security/tomoyo/domain_policy starts with
+ * neither "select " nor "delete ", an entry or a domain specified by that line
+ * is appended.
+ */
 struct tomoyo_io_buffer {
        int (*read) (struct tomoyo_io_buffer *);
        int (*write) (struct tomoyo_io_buffer *);
index 2d6748741a26e4f08e28872d79d846d06e07c6c7..1d8b16960576891615d7d523626d527030cf31bc 100644 (file)
 /* The initial domain. */
 struct tomoyo_domain_info tomoyo_kernel_domain;
 
-/* The list for "struct tomoyo_domain_info". */
+/*
+ * tomoyo_domain_list is used for holding list of domains.
+ * The ->acl_info_list of "struct tomoyo_domain_info" is used for holding
+ * permissions (e.g. "allow_read /lib/libc-2.5.so") given to each domain.
+ *
+ * An entry is added by
+ *
+ * # ( echo "<kernel>"; echo "allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # ( echo "<kernel>"; echo "delete allow_execute /sbin/init" ) > \
+ *                                  /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # cat /sys/kernel/security/tomoyo/domain_policy
+ *
+ * A domain is added by
+ *
+ * # echo "<kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and is deleted by
+ *
+ * # echo "delete <kernel>" > /sys/kernel/security/tomoyo/domain_policy
+ *
+ * and all domains are retrieved by
+ *
+ * # grep '^<kernel>' /sys/kernel/security/tomoyo/domain_policy
+ *
+ * Normally, a domainname is monotonically getting longer because a domainname
+ * which the process will belong to if an execve() operation succeeds is
+ * defined as a concatenation of "current domainname" + "pathname passed to
+ * execve()".
+ * See tomoyo_domain_initializer_list and tomoyo_domain_keeper_list for
+ * exceptions.
+ */
 LIST_HEAD(tomoyo_domain_list);
 DECLARE_RWSEM(tomoyo_domain_list_lock);
 
-/* Structure for "initialize_domain" and "no_initialize_domain" keyword. */
+/*
+ * tomoyo_domain_initializer_entry is a structure which is used for holding
+ * "initialize_domain" and "no_initialize_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_initializer_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname". This field is NULL if "from" clause is not specified.
+ *  (3) "program" which is a program's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_initialize_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_initializer_entry {
        struct list_head list;
        const struct tomoyo_path_info *domainname;    /* This may be NULL */
@@ -34,7 +86,23 @@ struct tomoyo_domain_initializer_entry {
        bool is_last_name;
 };
 
-/* Structure for "keep_domain" and "no_keep_domain" keyword. */
+/*
+ * tomoyo_domain_keeper_entry is a structure which is used for holding
+ * "keep_domain" and "no_keep_domain" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_domain_keeper_list .
+ *  (2) "domainname" which is "a domainname" or "the last component of a
+ *      domainname".
+ *  (3) "program" which is a program's pathname.
+ *      This field is NULL if "from" clause is not specified.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ *  (5) "is_not" is a bool which is true if "no_keep_domain", false
+ *      otherwise.
+ *  (6) "is_last_name" is a bool which is true if "domainname" is "the last
+ *      component of a domainname", false otherwise.
+ */
 struct tomoyo_domain_keeper_entry {
        struct list_head list;
        const struct tomoyo_path_info *domainname;
@@ -45,7 +113,16 @@ struct tomoyo_domain_keeper_entry {
        bool is_last_name;
 };
 
-/* Structure for "alias" keyword. */
+/*
+ * tomoyo_alias_entry is a structure which is used for holding "alias" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_alias_list .
+ *  (2) "original_name" which is a dereferenced pathname.
+ *  (3) "aliased_name" which is a symlink's pathname.
+ *  (4) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_alias_entry {
        struct list_head list;
        const struct tomoyo_path_info *original_name;
@@ -67,14 +144,12 @@ void tomoyo_set_domain_flag(struct tomoyo_domain_info *domain,
 {
        /* We need to serialize because this is bitfield operation. */
        static DEFINE_SPINLOCK(lock);
-       /***** CRITICAL SECTION START *****/
        spin_lock(&lock);
        if (!is_delete)
                domain->flags |= flags;
        else
                domain->flags &= ~flags;
        spin_unlock(&lock);
-       /***** CRITICAL SECTION END *****/
 }
 
 /**
@@ -94,7 +169,42 @@ const char *tomoyo_get_last_name(const struct tomoyo_domain_info *domain)
        return cp0;
 }
 
-/* The list for "struct tomoyo_domain_initializer_entry". */
+/*
+ * tomoyo_domain_initializer_list is used for holding list of programs which
+ * trigger reinitialization of domainname. Normally, a domainname is
+ * monotonically getting longer. But sometimes, we restart daemon programs.
+ * It would be convenient for us that "a daemon started upon system boot" and
+ * "the daemon restarted from console" belong to the same domain. Thus, TOMOYO
+ * provides a way to shorten domainnames.
+ *
+ * An entry is added by
+ *
+ * # echo 'initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete initialize_domain /usr/sbin/httpd' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^initialize_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, /usr/sbin/httpd will belong to
+ * "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may specify a domainname using "from" keyword.
+ * "initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" executed from "<kernel> /etc/rc.d/init.d/httpd"
+ * domain to belong to "<kernel> /usr/sbin/httpd" domain.
+ *
+ * You may add "no_" prefix to "initialize_domain".
+ * "initialize_domain /usr/sbin/httpd" and
+ * "no_initialize_domain /usr/sbin/httpd from <kernel> /etc/rc.d/init.d/httpd"
+ * will cause "/usr/sbin/httpd" to belong to "<kernel> /usr/sbin/httpd" domain
+ * unless executed from "<kernel> /etc/rc.d/init.d/httpd" domain.
+ */
 static LIST_HEAD(tomoyo_domain_initializer_list);
 static DECLARE_RWSEM(tomoyo_domain_initializer_list_lock);
 
@@ -135,7 +245,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
        saved_program = tomoyo_save_name(program);
        if (!saved_program)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_initializer_list_lock);
        list_for_each_entry(ptr, &tomoyo_domain_initializer_list, list) {
                if (ptr->is_not != is_not ||
@@ -161,7 +270,6 @@ static int tomoyo_update_domain_initializer_entry(const char *domainname,
        error = 0;
  out:
        up_write(&tomoyo_domain_initializer_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -193,13 +301,12 @@ bool tomoyo_read_domain_initializer_policy(struct tomoyo_io_buffer *head)
                        from = " from ";
                        domain = ptr->domainname->name;
                }
-               if (!tomoyo_io_printf(head,
-                                     "%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
-                                     "%s%s%s\n", no, ptr->program->name, from,
-                                     domain)) {
-                       done = false;
+               done = tomoyo_io_printf(head,
+                                       "%s" TOMOYO_KEYWORD_INITIALIZE_DOMAIN
+                                       "%s%s%s\n", no, ptr->program->name,
+                                       from, domain);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_domain_initializer_list_lock);
        return done;
@@ -273,7 +380,44 @@ static bool tomoyo_is_domain_initializer(const struct tomoyo_path_info *
        return flag;
 }
 
-/* The list for "struct tomoyo_domain_keeper_entry". */
+/*
+ * tomoyo_domain_keeper_list is used for holding list of domainnames which
+ * suppress domain transition. Normally, a domainname is monotonically
+ * getting longer. But sometimes, we want to suppress domain transition.
+ * It would be convenient for us that programs executed from a login session
+ * belong to the same domain. Thus, TOMOYO provides a way to suppress domain
+ * transition.
+ *
+ * An entry is added by
+ *
+ * # echo 'keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete keep_domain <kernel> /usr/sbin/sshd /bin/bash' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^keep_domain /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process which belongs to
+ * "<kernel> /usr/sbin/sshd /bin/bash" domain will remain in that domain,
+ * unless explicitly specified by "initialize_domain" or "no_keep_domain".
+ *
+ * You may specify a program using "from" keyword.
+ * "keep_domain /bin/pwd from <kernel> /usr/sbin/sshd /bin/bash"
+ * will cause "/bin/pwd" executed from "<kernel> /usr/sbin/sshd /bin/bash"
+ * domain to remain in "<kernel> /usr/sbin/sshd /bin/bash" domain.
+ *
+ * You may add "no_" prefix to "keep_domain".
+ * "keep_domain <kernel> /usr/sbin/sshd /bin/bash" and
+ * "no_keep_domain /usr/bin/passwd from <kernel> /usr/sbin/sshd /bin/bash" will
+ * cause "/usr/bin/passwd" to belong to
+ * "<kernel> /usr/sbin/sshd /bin/bash /usr/bin/passwd" domain, unless
+ * explicitly specified by "initialize_domain".
+ */
 static LIST_HEAD(tomoyo_domain_keeper_list);
 static DECLARE_RWSEM(tomoyo_domain_keeper_list_lock);
 
@@ -296,7 +440,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
        struct tomoyo_domain_keeper_entry *ptr;
        const struct tomoyo_path_info *saved_domainname;
        const struct tomoyo_path_info *saved_program = NULL;
-       static DEFINE_MUTEX(lock);
        int error = -ENOMEM;
        bool is_last_name = false;
 
@@ -315,7 +458,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
        saved_domainname = tomoyo_save_name(domainname);
        if (!saved_domainname)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_keeper_list_lock);
        list_for_each_entry(ptr, &tomoyo_domain_keeper_list, list) {
                if (ptr->is_not != is_not ||
@@ -341,7 +483,6 @@ static int tomoyo_update_domain_keeper_entry(const char *domainname,
        error = 0;
  out:
        up_write(&tomoyo_domain_keeper_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -394,13 +535,12 @@ bool tomoyo_read_domain_keeper_policy(struct tomoyo_io_buffer *head)
                        from = " from ";
                        program = ptr->program->name;
                }
-               if (!tomoyo_io_printf(head,
-                                     "%s" TOMOYO_KEYWORD_KEEP_DOMAIN
-                                     "%s%s%s\n", no, program, from,
-                                     ptr->domainname->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head,
+                                       "%s" TOMOYO_KEYWORD_KEEP_DOMAIN
+                                       "%s%s%s\n", no, program, from,
+                                       ptr->domainname->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_domain_keeper_list_lock);
        return done;
@@ -446,7 +586,36 @@ static bool tomoyo_is_domain_keeper(const struct tomoyo_path_info *domainname,
        return flag;
 }
 
-/* The list for "struct tomoyo_alias_entry". */
+/*
+ * tomoyo_alias_list is used for holding list of symlink's pathnames which are
+ * allowed to be passed to an execve() request. Normally, the domainname which
+ * the current process will belong to after execve() succeeds is calculated
+ * using dereferenced pathnames. But some programs behave differently depending
+ * on the name passed to argv[0]. For busybox, calculating domainname using
+ * dereferenced pathnames will cause all programs in the busybox to belong to
+ * the same domain. Thus, TOMOYO provides a way to allow use of symlink's
+ * pathname for checking execve()'s permission and calculating domainname which
+ * the current process will belong to after execve() succeeds.
+ *
+ * An entry is added by
+ *
+ * # echo 'alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete alias /bin/busybox /bin/cat' > \
+ *                            /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^alias /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if /bin/cat is a symlink to /bin/busybox and execution
+ * of /bin/cat is requested, permission is checked for /bin/cat rather than
+ * /bin/busybox and domainname which the current process will belong to after
+ * execve() succeeds is calculated using /bin/cat rather than /bin/busybox .
+ */
 static LIST_HEAD(tomoyo_alias_list);
 static DECLARE_RWSEM(tomoyo_alias_list_lock);
 
@@ -476,7 +645,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
        saved_aliased_name = tomoyo_save_name(aliased_name);
        if (!saved_original_name || !saved_aliased_name)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_alias_list_lock);
        list_for_each_entry(ptr, &tomoyo_alias_list, list) {
                if (ptr->original_name != saved_original_name ||
@@ -499,7 +667,6 @@ static int tomoyo_update_alias_entry(const char *original_name,
        error = 0;
  out:
        up_write(&tomoyo_alias_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -522,12 +689,11 @@ bool tomoyo_read_alias_policy(struct tomoyo_io_buffer *head)
                ptr = list_entry(pos, struct tomoyo_alias_entry, list);
                if (ptr->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
-                                     ptr->original_name->name,
-                                     ptr->aliased_name->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALIAS "%s %s\n",
+                                       ptr->original_name->name,
+                                       ptr->aliased_name->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_alias_list_lock);
        return done;
@@ -567,7 +733,6 @@ int tomoyo_delete_domain(char *domainname)
 
        name.name = domainname;
        tomoyo_fill_path_info(&name);
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_list_lock);
        /* Is there an active domain? */
        list_for_each_entry(domain, &tomoyo_domain_list, list) {
@@ -581,7 +746,6 @@ int tomoyo_delete_domain(char *domainname)
                break;
        }
        up_write(&tomoyo_domain_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return 0;
 }
 
@@ -600,7 +764,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
        struct tomoyo_domain_info *domain = NULL;
        const struct tomoyo_path_info *saved_domainname;
 
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_list_lock);
        domain = tomoyo_find_domain(domainname);
        if (domain)
@@ -619,7 +782,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
                    domain->domainname != saved_domainname)
                        continue;
                flag = false;
-               /***** CRITICAL SECTION START *****/
                read_lock(&tasklist_lock);
                for_each_process(p) {
                        if (tomoyo_real_domain(p) != domain)
@@ -628,7 +790,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
                        break;
                }
                read_unlock(&tasklist_lock);
-               /***** CRITICAL SECTION END *****/
                if (flag)
                        continue;
                list_for_each_entry(ptr, &domain->acl_info_list, list) {
@@ -651,7 +812,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char *
        }
  out:
        up_write(&tomoyo_domain_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return domain;
 }
 
@@ -739,7 +899,7 @@ int tomoyo_find_next_domain(struct linux_binprm *bprm,
        }
 
        /* Check execute permission. */
-       retval = tomoyo_check_exec_perm(old_domain, &r, tmp);
+       retval = tomoyo_check_exec_perm(old_domain, &r);
        if (retval < 0)
                goto out;
 
index 2316da8ec5bcaf13485aca9f93bdd0e01b171107..5ae3a571559f58192b7be8a94cada7886706c243 100644 (file)
 #include "realpath.h"
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
 
-/* Structure for "allow_read" keyword. */
+/*
+ * tomoyo_globally_readable_file_entry is a structure which is used for holding
+ * "allow_read" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_globally_readable_list .
+ *  (2) "filename" is a pathname which is allowed to be open()ed with O_RDONLY.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_globally_readable_file_entry {
        struct list_head list;
        const struct tomoyo_path_info *filename;
        bool is_deleted;
 };
 
-/* Structure for "file_pattern" keyword. */
+/*
+ * tomoyo_pattern_entry is a structure which is used for holding
+ * "file_pattern" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_pattern_list .
+ *  (2) "pattern" is a pathname pattern which is used for converting pathnames
+ *      to pathname patterns during learning mode.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_pattern_entry {
        struct list_head list;
        const struct tomoyo_path_info *pattern;
        bool is_deleted;
 };
 
-/* Structure for "deny_rewrite" keyword. */
+/*
+ * tomoyo_no_rewrite_entry is a structure which is used for holding
+ * "deny_rewrite" entries.
+ * It has following fields.
+ *
+ *  (1) "list" which is linked to tomoyo_no_rewrite_list .
+ *  (2) "pattern" is a pathname whose already existing content is by default
+ *      not permitted to be modified.
+ *  (3) "is_deleted" is a bool which is true if marked as deleted, false
+ *      otherwise.
+ */
 struct tomoyo_no_rewrite_entry {
        struct list_head list;
        const struct tomoyo_path_info *pattern;
@@ -141,7 +170,31 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
                                         struct tomoyo_domain_info *
                                         const domain, const bool is_delete);
 
-/* The list for "struct tomoyo_globally_readable_file_entry". */
+/*
+ * tomoyo_globally_readable_list is used for holding list of pathnames which
+ * are by default allowed to be open()ed for reading by any process.
+ *
+ * An entry is added by
+ *
+ * # echo 'allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete allow_read /lib/libc-2.5.so' > \
+ *                               /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^allow_read /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, any process is allowed to
+ * open("/lib/libc-2.5.so", O_RDONLY).
+ * One exception is, if the domain which current process belongs to is marked
+ * as "ignore_global_allow_read", current process can't do so unless explicitly
+ * given "allow_read /lib/libc-2.5.so" to the domain which current process
+ * belongs to.
+ */
 static LIST_HEAD(tomoyo_globally_readable_list);
 static DECLARE_RWSEM(tomoyo_globally_readable_list_lock);
 
@@ -166,7 +219,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
        saved_filename = tomoyo_save_name(filename);
        if (!saved_filename)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_globally_readable_list_lock);
        list_for_each_entry(ptr, &tomoyo_globally_readable_list, list) {
                if (ptr->filename != saved_filename)
@@ -187,7 +239,6 @@ static int tomoyo_update_globally_readable_entry(const char *filename,
        error = 0;
  out:
        up_write(&tomoyo_globally_readable_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -249,17 +300,44 @@ bool tomoyo_read_globally_readable_policy(struct tomoyo_io_buffer *head)
                                 list);
                if (ptr->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
-                                     ptr->filename->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, TOMOYO_KEYWORD_ALLOW_READ "%s\n",
+                                       ptr->filename->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_globally_readable_list_lock);
        return done;
 }
 
-/* The list for "struct tomoyo_pattern_entry". */
+/* tomoyo_pattern_list is used for holding list of pathnames which are used for
+ * converting pathnames to pathname patterns during learning mode.
+ *
+ * An entry is added by
+ *
+ * # echo 'file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete file_pattern /proc/\$/mounts' > \
+ *                             /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^file_pattern /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process which belongs to a domain which is in
+ * learning mode requested open("/proc/1/mounts", O_RDONLY),
+ * "allow_read /proc/\$/mounts" is automatically added to the domain which that
+ * process belongs to.
+ *
+ * It is not a desirable behavior that we have to use /proc/\$/ instead of
+ * /proc/self/ when current process needs to access only current process's
+ * information. As of now, LSM version of TOMOYO is using __d_path() for
+ * calculating pathname. Non LSM version of TOMOYO is using its own function
+ * which pretends as if /proc/self/ is not a symlink; so that we can forbid
+ * current process from accessing other process's information.
+ */
 static LIST_HEAD(tomoyo_pattern_list);
 static DECLARE_RWSEM(tomoyo_pattern_list_lock);
 
@@ -284,7 +362,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
        saved_pattern = tomoyo_save_name(pattern);
        if (!saved_pattern)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_pattern_list_lock);
        list_for_each_entry(ptr, &tomoyo_pattern_list, list) {
                if (saved_pattern != ptr->pattern)
@@ -305,7 +382,6 @@ static int tomoyo_update_file_pattern_entry(const char *pattern,
        error = 0;
  out:
        up_write(&tomoyo_pattern_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -373,17 +449,44 @@ bool tomoyo_read_file_pattern(struct tomoyo_io_buffer *head)
                ptr = list_entry(pos, struct tomoyo_pattern_entry, list);
                if (ptr->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN "%s\n",
-                                     ptr->pattern->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, TOMOYO_KEYWORD_FILE_PATTERN
+                                       "%s\n", ptr->pattern->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_pattern_list_lock);
        return done;
 }
 
-/* The list for "struct tomoyo_no_rewrite_entry". */
+/*
+ * tomoyo_no_rewrite_list is used for holding list of pathnames which are by
+ * default forbidden to modify already written content of a file.
+ *
+ * An entry is added by
+ *
+ * # echo 'deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and is deleted by
+ *
+ * # echo 'delete deny_rewrite /var/log/messages' > \
+ *                              /sys/kernel/security/tomoyo/exception_policy
+ *
+ * and all entries are retrieved by
+ *
+ * # grep ^deny_rewrite /sys/kernel/security/tomoyo/exception_policy
+ *
+ * In the example above, if a process requested to rewrite /var/log/messages,
+ * the process can't rewrite unless the domain which that process belongs to
+ * has "allow_rewrite /var/log/messages" entry.
+ *
+ * It is not a desirable behavior that we have to add "\040(deleted)" suffix
+ * when we want to allow rewriting already unlink()ed file. As of now,
+ * LSM version of TOMOYO is using __d_path() for calculating pathname.
+ * Non LSM version of TOMOYO is using its own function which doesn't append
+ * " (deleted)" suffix if the file is already unlink()ed; so that we don't
+ * need to worry whether the file is already unlink()ed or not.
+ */
 static LIST_HEAD(tomoyo_no_rewrite_list);
 static DECLARE_RWSEM(tomoyo_no_rewrite_list_lock);
 
@@ -407,7 +510,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
        saved_pattern = tomoyo_save_name(pattern);
        if (!saved_pattern)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_no_rewrite_list_lock);
        list_for_each_entry(ptr, &tomoyo_no_rewrite_list, list) {
                if (ptr->pattern != saved_pattern)
@@ -428,7 +530,6 @@ static int tomoyo_update_no_rewrite_entry(const char *pattern,
        error = 0;
  out:
        up_write(&tomoyo_no_rewrite_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -489,11 +590,10 @@ bool tomoyo_read_no_rewrite_policy(struct tomoyo_io_buffer *head)
                ptr = list_entry(pos, struct tomoyo_no_rewrite_entry, list);
                if (ptr->is_deleted)
                        continue;
-               if (!tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE "%s\n",
-                                     ptr->pattern->name)) {
-                       done = false;
+               done = tomoyo_io_printf(head, TOMOYO_KEYWORD_DENY_REWRITE
+                                       "%s\n", ptr->pattern->name);
+               if (!done)
                        break;
-               }
        }
        up_read(&tomoyo_no_rewrite_list_lock);
        return done;
@@ -745,7 +845,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
        saved_filename = tomoyo_save_name(filename);
        if (!saved_filename)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_acl_info_list_lock);
        if (is_delete)
                goto delete;
@@ -800,7 +899,6 @@ static int tomoyo_update_single_path_acl(const u8 type, const char *filename,
        }
  out:
        up_write(&tomoyo_domain_acl_info_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -836,7 +934,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
        saved_filename2 = tomoyo_save_name(filename2);
        if (!saved_filename1 || !saved_filename2)
                return -ENOMEM;
-       /***** EXCLUSIVE SECTION START *****/
        down_write(&tomoyo_domain_acl_info_list_lock);
        if (is_delete)
                goto delete;
@@ -884,7 +981,6 @@ static int tomoyo_update_double_path_acl(const u8 type, const char *filename1,
        }
  out:
        up_write(&tomoyo_domain_acl_info_list_lock);
-       /***** EXCLUSIVE SECTION END *****/
        return error;
 }
 
@@ -1025,13 +1121,11 @@ int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
  *
  * @domain:   Pointer to "struct tomoyo_domain_info".
  * @filename: Check permission for "execute".
- * @tmp:      Buffer for temporary use.
  *
  * Returns 0 on success, negativevalue otherwise.
  */
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-                          const struct tomoyo_path_info *filename,
-                          struct tomoyo_page_buffer *tmp)
+                          const struct tomoyo_path_info *filename)
 {
        const u8 mode = tomoyo_check_flags(domain, TOMOYO_MAC_FOR_FILE);
 
index 40927a84cb6e697a24671c36a3527c1645f850a1..5f2e3326337118c9252dc80ec10e53b3365c0a9c 100644 (file)
@@ -220,7 +220,6 @@ void *tomoyo_alloc_element(const unsigned int size)
                = roundup(size, max(sizeof(void *), sizeof(long)));
        if (word_aligned_size > PATH_MAX)
                return NULL;
-       /***** EXCLUSIVE SECTION START *****/
        mutex_lock(&lock);
        if (buf_used_len + word_aligned_size > PATH_MAX) {
                if (!tomoyo_quota_for_elements ||
@@ -251,7 +250,6 @@ void *tomoyo_alloc_element(const unsigned int size)
                }
        }
        mutex_unlock(&lock);
-       /***** EXCLUSIVE SECTION END *****/
        return ptr;
 }
 
@@ -267,7 +265,16 @@ static unsigned int tomoyo_quota_for_savename;
  */
 #define TOMOYO_MAX_HASH 256
 
-/* Structure for string data. */
+/*
+ * tomoyo_name_entry is a structure which is used for linking
+ * "struct tomoyo_path_info" into tomoyo_name_list.
+ *
+ * Since tomoyo_name_list manages a list of strings which are shared by
+ * multiple processes (whereas "struct tomoyo_path_info" inside
+ * "struct tomoyo_path_info_with_data" is not shared), a reference counter will
+ * be added to "struct tomoyo_name_entry" rather than "struct tomoyo_path_info"
+ * when TOMOYO starts supporting garbage collector.
+ */
 struct tomoyo_name_entry {
        struct list_head list;
        struct tomoyo_path_info entry;
@@ -281,10 +288,10 @@ struct tomoyo_free_memory_block_list {
 };
 
 /*
- * The list for "struct tomoyo_name_entry".
- *
- * This list is updated only inside tomoyo_save_name(), thus
- * no global mutex exists.
+ * tomoyo_name_list is used for holding string data used by TOMOYO.
+ * Since the same string data is likely to be used multiple times (e.g.
+ * "/lib/libc-2.5.so"), TOMOYO shares string data in the form of
+ * "const struct tomoyo_path_info *".
  */
 static struct list_head tomoyo_name_list[TOMOYO_MAX_HASH];
 
@@ -318,7 +325,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
                return NULL;
        }
        hash = full_name_hash((const unsigned char *) name, len - 1);
-       /***** EXCLUSIVE SECTION START *****/
        mutex_lock(&lock);
        list_for_each_entry(ptr, &tomoyo_name_list[hash % TOMOYO_MAX_HASH],
                             list) {
@@ -366,7 +372,6 @@ const struct tomoyo_path_info *tomoyo_save_name(const char *name)
        }
  out:
        mutex_unlock(&lock);
-       /***** EXCLUSIVE SECTION END *****/
        return ptr ? &ptr->entry : NULL;
 }
 
index e42be5c4f055b5ebb5258fdb3e6d041f9b3d2f91..3194d09fe0f4ccd5311b819e526a1b696a8ecc33 100644 (file)
@@ -262,6 +262,10 @@ static int tomoyo_dentry_open(struct file *f, const struct cred *cred)
        return tomoyo_check_open_permission(tomoyo_domain(), &f->f_path, flags);
 }
 
+/*
+ * tomoyo_security_ops is a "struct security_operations" which is used for
+ * registering TOMOYO.
+ */
 static struct security_operations tomoyo_security_ops = {
        .name                = "tomoyo",
        .cred_prepare        = tomoyo_cred_prepare,
index 41c6ebafb9c561ad203cc52cef886378e01309e7..0fd588a629cf915ca6be0a47dbc300292d908570 100644 (file)
@@ -17,13 +17,11 @@ struct path;
 struct inode;
 struct linux_binprm;
 struct pt_regs;
-struct tomoyo_page_buffer;
 
 int tomoyo_check_file_perm(struct tomoyo_domain_info *domain,
                           const char *filename, const u8 perm);
 int tomoyo_check_exec_perm(struct tomoyo_domain_info *domain,
-                          const struct tomoyo_path_info *filename,
-                          struct tomoyo_page_buffer *buf);
+                          const struct tomoyo_path_info *filename);
 int tomoyo_check_open_permission(struct tomoyo_domain_info *domain,
                                 struct path *path, const int flag);
 int tomoyo_check_1path_perm(struct tomoyo_domain_info *domain,
@@ -90,17 +88,10 @@ static inline struct tomoyo_domain_info *tomoyo_domain(void)
        return current_cred()->security;
 }
 
-/* Caller holds tasklist_lock spinlock. */
 static inline struct tomoyo_domain_info *tomoyo_real_domain(struct task_struct
                                                            *task)
 {
-       /***** CRITICAL SECTION START *****/
-       const struct cred *cred = get_task_cred(task);
-       struct tomoyo_domain_info *domain = cred->security;
-
-       put_cred(cred);
-       return domain;
-       /***** CRITICAL SECTION END *****/
+       return task_cred_xxx(task, security);
 }
 
 #endif /* !defined(_SECURITY_TOMOYO_TOMOYO_H) */