xen: lazy-mmu operations

[linux-2.6] / arch / i386 / xen / enlighten.c
diff --git a/arch/i386/xen/enlighten.c b/arch/i386/xen/enlighten.c

index 9550ae3b1fb123fb81c2011cced4d909108c9fb3..031dc1dcf8195f62274cc752c20e20d828aa202d 100644 (file)
--- a/arch/i386/xen/enlighten.c
+++ b/arch/i386/xen/enlighten.c
@@ -15,6 +15,7 @@
  #include <linux/init.h>
  #include <linux/smp.h>
  #include <linux/preempt.h>
+#include <linux/hardirq.h>
  #include <linux/percpu.h>
  #include <linux/delay.h>
  #include <linux/start_kernel.h>
@@ -24,6 +25,7 @@
  #include <linux/mm.h>
  #include <linux/page-flags.h>
  #include <linux/highmem.h>
+#include <linux/smp.h>
  
  #include <xen/interface/xen.h>
  #include <xen/interface/physdev.h>
@@ -40,6 +42,7 @@
  #include <asm/setup.h>
  #include <asm/desc.h>
  #include <asm/pgtable.h>
+#include <asm/tlbflush.h>
  
  #include "xen-ops.h"
  #include "mmu.h"
@@ -56,7 +59,7 @@ DEFINE_PER_CPU(unsigned long, xen_cr3);
  struct start_info *xen_start_info;
  EXPORT_SYMBOL_GPL(xen_start_info);
  
-static void xen_vcpu_setup(int cpu)
+void xen_vcpu_setup(int cpu)
  {
         per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
  }
@@ -106,11 +109,10 @@ static unsigned long xen_save_fl(void)
         struct vcpu_info *vcpu;
         unsigned long flags;
  
-       preempt_disable();
         vcpu = x86_read_percpu(xen_vcpu);
+
         /* flag has opposite sense of mask */
         flags = !vcpu->evtchn_upcall_mask;
-       preempt_enable();
  
         /* convert to IF type flag
            -0 -> 0x00000000
@@ -123,32 +125,35 @@ static void xen_restore_fl(unsigned long flags)
  {
         struct vcpu_info *vcpu;
  
-       preempt_disable();
-
         /* convert from IF type flag */
         flags = !(flags & X86_EFLAGS_IF);
+
+       /* There's a one instruction preempt window here.  We need to
+          make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
+       preempt_disable();
         vcpu = x86_read_percpu(xen_vcpu);
         vcpu->evtchn_upcall_mask = flags;
+       preempt_enable_no_resched();
  
-       if (flags == 0) {
-               /* Unmask then check (avoid races).  We're only protecting
-                  against updates by this CPU, so there's no need for
-                  anything stronger. */
-               barrier();
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
  
+       if (flags == 0) {
+               preempt_check_resched();
+               barrier(); /* unmask then check (avoid races) */
                 if (unlikely(vcpu->evtchn_upcall_pending))
                         force_evtchn_callback();
-               preempt_enable();
-       } else
-               preempt_enable_no_resched();
+       }
  }
  
  static void xen_irq_disable(void)
  {
-       struct vcpu_info *vcpu;
+       /* There's a one instruction preempt window here.  We need to
+          make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
         preempt_disable();
-       vcpu = x86_read_percpu(xen_vcpu);
-       vcpu->evtchn_upcall_mask = 1;
+       x86_read_percpu(xen_vcpu)->evtchn_upcall_mask = 1;
         preempt_enable_no_resched();
  }
  
@@ -156,18 +161,20 @@ static void xen_irq_enable(void)
  {
         struct vcpu_info *vcpu;
  
+       /* There's a one instruction preempt window here.  We need to
+          make sure we don't switch CPUs between getting the vcpu
+          pointer and updating the mask. */
         preempt_disable();
         vcpu = x86_read_percpu(xen_vcpu);
         vcpu->evtchn_upcall_mask = 0;
+       preempt_enable_no_resched();
  
-       /* Unmask then check (avoid races).  We're only protecting
-          against updates by this CPU, so there's no need for
-          anything stronger. */
-       barrier();
+       /* Doesn't matter if we get preempted here, because any
+          pending event will get dealt with anyway. */
  
+       barrier(); /* unmask then check (avoid races) */
         if (unlikely(vcpu->evtchn_upcall_pending))
                 force_evtchn_callback();
-       preempt_enable();
  }
  
  static void xen_safe_halt(void)
@@ -187,6 +194,8 @@ static void xen_halt(void)
  
  static void xen_set_lazy_mode(enum paravirt_lazy_mode mode)
  {
+       BUG_ON(preemptible());
+
         switch (mode) {
         case PARAVIRT_LAZY_NONE:
                 BUG_ON(x86_read_percpu(xen_lazy_mode) == PARAVIRT_LAZY_NONE);
@@ -291,9 +300,13 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum,
         xmaddr_t mach_lp = virt_to_machine(lp);
         u64 entry = (u64)high << 32 | low;
  
+       preempt_disable();
+
         xen_mc_flush();
         if (HYPERVISOR_update_descriptor(mach_lp.maddr, entry))
                 BUG();
+
+       preempt_enable();
  }
  
  static int cvt_gate_to_trap(int vector, u32 low, u32 high,
@@ -326,11 +339,13 @@ static DEFINE_PER_CPU(struct Xgt_desc_struct, idt_desc);
  static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
                                 u32 low, u32 high)
  {
-
-       int cpu = smp_processor_id();
         unsigned long p = (unsigned long)&dt[entrynum];
-       unsigned long start = per_cpu(idt_desc, cpu).address;
-       unsigned long end = start + per_cpu(idt_desc, cpu).size + 1;
+       unsigned long start, end;
+
+       preempt_disable();
+
+       start = __get_cpu_var(idt_desc).address;
+       end = start + __get_cpu_var(idt_desc).size + 1;
  
         xen_mc_flush();
  
@@ -345,25 +360,18 @@ static void xen_write_idt_entry(struct desc_struct *dt, int entrynum,
                         if (HYPERVISOR_set_trap_table(info))
                                 BUG();
         }
+
+       preempt_enable();
  }
  
-/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
-   hold a spinlock to protect the static traps[] array (static because
-   it avoids allocation, and saves stack space). */
-static void xen_load_idt(const struct Xgt_desc_struct *desc)
+static void xen_convert_trap_info(const struct Xgt_desc_struct *desc,
+                                 struct trap_info *traps)
  {
-       static DEFINE_SPINLOCK(lock);
-       static struct trap_info traps[257];
-
-       int cpu = smp_processor_id();
         unsigned in, out, count;
  
-       per_cpu(idt_desc, cpu) = *desc;
-
         count = (desc->size+1) / 8;
         BUG_ON(count > 256);
  
-       spin_lock(&lock);
         for (in = out = 0; in < count; in++) {
                 const u32 *entry = (u32 *)(desc->address + in * 8);
  
@@ -371,6 +379,28 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
                         out++;
         }
         traps[out].address = 0;
+}
+
+void xen_copy_trap_info(struct trap_info *traps)
+{
+       const struct Xgt_desc_struct *desc = &__get_cpu_var(idt_desc);
+
+       xen_convert_trap_info(desc, traps);
+}
+
+/* Load a new IDT into Xen.  In principle this can be per-CPU, so we
+   hold a spinlock to protect the static traps[] array (static because
+   it avoids allocation, and saves stack space). */
+static void xen_load_idt(const struct Xgt_desc_struct *desc)
+{
+       static DEFINE_SPINLOCK(lock);
+       static struct trap_info traps[257];
+
+       spin_lock(&lock);
+
+       __get_cpu_var(idt_desc) = *desc;
+
+       xen_convert_trap_info(desc, traps);
  
         xen_mc_flush();
         if (HYPERVISOR_set_trap_table(traps))
@@ -384,6 +414,8 @@ static void xen_load_idt(const struct Xgt_desc_struct *desc)
  static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
                                 u32 low, u32 high)
  {
+       preempt_disable();
+
         switch ((high >> 8) & 0xff) {
         case DESCTYPE_LDT:
         case DESCTYPE_TSS:
@@ -400,10 +432,12 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
         }
  
         }
+
+       preempt_enable();
  }
  
  static void xen_load_esp0(struct tss_struct *tss,
-                                  struct thread_struct *thread)
+                         struct thread_struct *thread)
  {
         struct multicall_space mcs = xen_mc_entry(0);
         MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->esp0);
@@ -428,25 +462,79 @@ static unsigned long xen_apic_read(unsigned long reg)
  {
         return 0;
  }
+
+static void xen_apic_write(unsigned long reg, unsigned long val)
+{
+       /* Warn to see if there are any stray references */
+       WARN_ON(1);
+}
  #endif
  
  static void xen_flush_tlb(void)
  {
-       struct mmuext_op op;
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
  
-       op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
-       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-               BUG();
+       op = mcs.args;
+       op->cmd = MMUEXT_TLB_FLUSH_LOCAL;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
  }
  
  static void xen_flush_tlb_single(unsigned long addr)
  {
-       struct mmuext_op op;
+       struct mmuext_op *op;
+       struct multicall_space mcs = xen_mc_entry(sizeof(*op));
  
-       op.cmd = MMUEXT_INVLPG_LOCAL;
-       op.arg1.linear_addr = addr & PAGE_MASK;
-       if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF))
-               BUG();
+       op = mcs.args;
+       op->cmd = MMUEXT_INVLPG_LOCAL;
+       op->arg1.linear_addr = addr & PAGE_MASK;
+       MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
+}
+
+static void xen_flush_tlb_others(const cpumask_t *cpus, struct mm_struct *mm,
+                                unsigned long va)
+{
+       struct {
+               struct mmuext_op op;
+               cpumask_t mask;
+       } *args;
+       cpumask_t cpumask = *cpus;
+       struct multicall_space mcs;
+
+       /*
+        * A couple of (to be removed) sanity checks:
+        *
+        * - current CPU must not be in mask
+        * - mask must exist :)
+        */
+       BUG_ON(cpus_empty(cpumask));
+       BUG_ON(cpu_isset(smp_processor_id(), cpumask));
+       BUG_ON(!mm);
+
+       /* If a CPU which we ran on has gone down, OK. */
+       cpus_and(cpumask, cpumask, cpu_online_map);
+       if (cpus_empty(cpumask))
+               return;
+
+       mcs = xen_mc_entry(sizeof(*args));
+       args = mcs.args;
+       args->mask = cpumask;
+       args->op.arg2.vcpumask = &args->mask;
+
+       if (va == TLB_FLUSH_ALL) {
+               args->op.cmd = MMUEXT_TLB_FLUSH_MULTI;
+       } else {
+               args->op.cmd = MMUEXT_INVLPG_MULTI;
+               args->op.arg1.linear_addr = va;
+       }
+
+       MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF);
+
+       xen_mc_issue(PARAVIRT_LAZY_MMU);
  }
  
  static unsigned long xen_read_cr2(void)
@@ -460,18 +548,6 @@ static void xen_write_cr4(unsigned long cr4)
         native_write_cr4(cr4 & ~X86_CR4_TSD);
  }
  
-/*
- * Page-directory addresses above 4GB do not fit into architectural %cr3.
- * When accessing %cr3, or equivalent field in vcpu_guest_context, guests
- * must use the following accessor macros to pack/unpack valid MFNs.
- *
- * Note that Xen is using the fact that the pagetable base is always
- * page-aligned, and putting the 12 MSB of the address into the 12 LSB
- * of cr3.
- */
-#define xen_pfn_to_cr3(pfn) (((unsigned)(pfn) << 12) | ((unsigned)(pfn) >> 20))
-#define xen_cr3_to_pfn(cr3) (((unsigned)(cr3) >> 12) | ((unsigned)(cr3) << 20))
-
  static unsigned long xen_read_cr3(void)
  {
         return x86_read_percpu(xen_cr3);
@@ -479,6 +555,8 @@ static unsigned long xen_read_cr3(void)
  
  static void xen_write_cr3(unsigned long cr3)
  {
+       BUG_ON(preemptible());
+
         if (cr3 == x86_read_percpu(xen_cr3)) {
                 /* just a simple tlb flush */
                 xen_flush_tlb();
@@ -683,7 +761,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
         .set_wallclock = xen_set_wallclock,
         .get_wallclock = xen_get_wallclock,
         .get_cpu_khz = xen_cpu_khz,
-       .sched_clock = xen_clocksource_read,
+       .sched_clock = xen_sched_clock,
  
         .cpuid = xen_cpuid,
  
@@ -740,8 +818,8 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
         .io_delay = xen_io_delay,
  
  #ifdef CONFIG_X86_LOCAL_APIC
-       .apic_write = paravirt_nop,
-       .apic_write_atomic = paravirt_nop,
+       .apic_write = xen_apic_write,
+       .apic_write_atomic = xen_apic_write,
         .apic_read = xen_apic_read,
         .setup_boot_clock = paravirt_nop,
         .setup_secondary_clock = paravirt_nop,
@@ -751,6 +829,7 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
         .flush_tlb_user = xen_flush_tlb,
         .flush_tlb_kernel = xen_flush_tlb,
         .flush_tlb_single = xen_flush_tlb_single,
+       .flush_tlb_others = xen_flush_tlb_others,
  
         .pte_update = paravirt_nop,
         .pte_update_defer = paravirt_nop,
@@ -796,6 +875,19 @@ static const struct paravirt_ops xen_paravirt_ops __initdata = {
         .set_lazy_mode = xen_set_lazy_mode,
  };
  
+#ifdef CONFIG_SMP
+static const struct smp_ops xen_smp_ops __initdata = {
+       .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
+       .smp_prepare_cpus = xen_smp_prepare_cpus,
+       .cpu_up = xen_cpu_up,
+       .smp_cpus_done = xen_smp_cpus_done,
+
+       .smp_send_stop = xen_smp_send_stop,
+       .smp_send_reschedule = xen_smp_send_reschedule,
+       .smp_call_function_mask = xen_smp_call_function_mask,
+};
+#endif /* CONFIG_SMP */
+
  /* First C function to be called on Xen boot */
  asmlinkage void __init xen_start_kernel(void)
  {
@@ -808,6 +900,9 @@ asmlinkage void __init xen_start_kernel(void)
  
         /* Install Xen paravirt ops */
         paravirt_ops = xen_paravirt_ops;
+#ifdef CONFIG_SMP
+       smp_ops = xen_smp_ops;
+#endif
  
         xen_setup_features();