Diffstat (limited to 'trunk/2.6.22/20047_xen3-patch-2.6.22.patch1')
-rw-r--r-- | trunk/2.6.22/20047_xen3-patch-2.6.22.patch1 | 7866
1 files changed, 7866 insertions, 0 deletions
diff --git a/trunk/2.6.22/20047_xen3-patch-2.6.22.patch1 b/trunk/2.6.22/20047_xen3-patch-2.6.22.patch1 new file mode 100644 index 0000000..df38df6 --- /dev/null +++ b/trunk/2.6.22/20047_xen3-patch-2.6.22.patch1 @@ -0,0 +1,7866 @@ +From: www.kernel.org +Subject: Update to 2.6.22 +Patch-mainline: 2.6.22 + +Automatically created from "patches.kernel.org/patch-2.6.22" by xen-port-patches.py + +Acked-by: jbeulich@novell.com + +Index: 10.3-2007-11-26/arch/i386/Kconfig +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/Kconfig 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/Kconfig 2007-10-22 13:58:56.000000000 +0200 +@@ -922,7 +922,6 @@ config HOTPLUG_CPU + + config COMPAT_VDSO + bool "Compat VDSO support" +- depends on !X86_XEN + default y + help + Map the VDSO to the predictable old-style address too. +@@ -1086,7 +1085,7 @@ config PCI + bool "PCI support" if !X86_VISWS + depends on !X86_VOYAGER + default y if X86_VISWS +- select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) ++ select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !X86_XEN) + help + Find out whether you have a PCI motherboard. PCI is the name of a + bus system, i.e. the way the CPU talks to the other stuff inside +Index: 10.3-2007-11-26/arch/i386/Kconfig.cpu +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/Kconfig.cpu 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/arch/i386/Kconfig.cpu 2007-10-22 13:58:56.000000000 +0200 +@@ -299,7 +299,7 @@ config X86_POPAD_OK + + config X86_CMPXCHG64 + bool +- depends on X86_PAE ++ depends on X86_PAE || X86_XEN + default y + + config X86_ALIGNMENT_16 +Index: 10.3-2007-11-26/arch/i386/kernel/Makefile +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/Makefile 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/Makefile 2007-10-22 13:58:56.000000000 +0200 +@@ -103,5 +103,4 @@ n-obj-xen := i8253.o i8259.o reboot.o sm + obj-y := $(call filterxen, $(obj-y), $(n-obj-xen)) + obj-y := $(call cherrypickxen, $(obj-y)) + extra-y := $(call cherrypickxen, $(extra-y)) +-%/head-xen.o %/head-xen.s: EXTRA_AFLAGS := + endif +Index: 10.3-2007-11-26/arch/i386/kernel/acpi/boot-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/acpi/boot-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/acpi/boot-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -624,8 +624,6 @@ static int __init acpi_parse_sbf(struct + static int __init acpi_parse_hpet(struct acpi_table_header *table) + { + struct acpi_table_hpet *hpet_tbl; +- struct resource *hpet_res; +- resource_size_t res_start; + + hpet_tbl = (struct acpi_table_hpet *)table; + if (!hpet_tbl) { +@@ -639,29 +637,10 @@ static int __init acpi_parse_hpet(struct + return -1; + } + +-#define HPET_RESOURCE_NAME_SIZE 9 +- hpet_res = alloc_bootmem(sizeof(*hpet_res) + HPET_RESOURCE_NAME_SIZE); +- if (hpet_res) { +- memset(hpet_res, 0, sizeof(*hpet_res)); +- hpet_res->name = (void *)&hpet_res[1]; +- hpet_res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; +- snprintf((char *)hpet_res->name, HPET_RESOURCE_NAME_SIZE, +- "HPET %u", hpet_tbl->sequence); +- hpet_res->end = (1 * 1024) - 1; +- } +- + hpet_address = hpet_tbl->address.address; + printk(KERN_INFO PREFIX "HPET id: %#x base: %#lx\n", + hpet_tbl->id, hpet_address); + +- res_start = hpet_address; +- +- if 
(hpet_res) { +- hpet_res->start = res_start; +- hpet_res->end += res_start; +- insert_resource(&iomem_resource, hpet_res); +- } +- + return 0; + } + #else +@@ -877,7 +856,7 @@ static void __init acpi_process_madt(voi + acpi_ioapic = 1; + + smp_found_config = 1; +- clustered_apic_check(); ++ setup_apic_routing(); + } + } + if (error == -EINVAL) { +Index: 10.3-2007-11-26/arch/i386/kernel/apic-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/apic-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/apic-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -19,7 +19,6 @@ + #include <linux/mm.h> + #include <linux/delay.h> + #include <linux/bootmem.h> +-#include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/mc146818rtc.h> + #include <linux/kernel_stat.h> +Index: 10.3-2007-11-26/arch/i386/kernel/cpu/common-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/cpu/common-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/cpu/common-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -22,16 +22,40 @@ + #define phys_pkg_id(a,b) a + #endif + #endif +-#include <asm/pda.h> + #include <asm/hypervisor.h> + + #include "cpu.h" + +-DEFINE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +-EXPORT_PER_CPU_SYMBOL(cpu_gdt_descr); ++DEFINE_PER_CPU(struct gdt_page, gdt_page) = { .gdt = { ++ [GDT_ENTRY_KERNEL_CS] = { 0x0000ffff, 0x00cf9a00 }, ++ [GDT_ENTRY_KERNEL_DS] = { 0x0000ffff, 0x00cf9200 }, ++ [GDT_ENTRY_DEFAULT_USER_CS] = { 0x0000ffff, 0x00cffa00 }, ++ [GDT_ENTRY_DEFAULT_USER_DS] = { 0x0000ffff, 0x00cff200 }, ++#ifndef CONFIG_XEN ++ /* ++ * Segments used for calling PnP BIOS have byte granularity. ++ * They code segments and data segments have fixed 64k limits, ++ * the transfer segment sizes are set at run time. ++ */ ++ [GDT_ENTRY_PNPBIOS_CS32] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ ++ [GDT_ENTRY_PNPBIOS_CS16] = { 0x0000ffff, 0x00009a00 },/* 16-bit code */ ++ [GDT_ENTRY_PNPBIOS_DS] = { 0x0000ffff, 0x00009200 }, /* 16-bit data */ ++ [GDT_ENTRY_PNPBIOS_TS1] = { 0x00000000, 0x00009200 },/* 16-bit data */ ++ [GDT_ENTRY_PNPBIOS_TS2] = { 0x00000000, 0x00009200 },/* 16-bit data */ ++ /* ++ * The APM segments have byte granularity and their bases ++ * are set at run time. All have 64k limits. ++ */ ++ [GDT_ENTRY_APMBIOS_BASE] = { 0x0000ffff, 0x00409a00 },/* 32-bit code */ ++ /* 16-bit code */ ++ [GDT_ENTRY_APMBIOS_BASE+1] = { 0x0000ffff, 0x00009a00 }, ++ [GDT_ENTRY_APMBIOS_BASE+2] = { 0x0000ffff, 0x00409200 }, /* data */ + +-struct i386_pda *_cpu_pda[NR_CPUS] __read_mostly; +-EXPORT_SYMBOL(_cpu_pda); ++ [GDT_ENTRY_ESPFIX_SS] = { 0x00000000, 0x00c09200 }, ++#endif ++ [GDT_ENTRY_PERCPU] = { 0x00000000, 0x00000000 }, ++} }; ++EXPORT_PER_CPU_SYMBOL_GPL(gdt_page); + + static int cachesize_override __cpuinitdata = -1; + static int disable_x86_fxsr __cpuinitdata; +@@ -373,7 +397,7 @@ __setup("serialnumber", x86_serial_nr_se + /* + * This does the hard work of actually picking apart the CPU stuff... + */ +-void __cpuinit identify_cpu(struct cpuinfo_x86 *c) ++static void __cpuinit identify_cpu(struct cpuinfo_x86 *c) + { + int i; + +@@ -484,15 +508,22 @@ void __cpuinit identify_cpu(struct cpuin + + /* Init Machine Check Exception if available. 
*/ + mcheck_init(c); ++} + +- if (c == &boot_cpu_data) +- sysenter_setup(); ++void __init identify_boot_cpu(void) ++{ ++ identify_cpu(&boot_cpu_data); ++ sysenter_setup(); + enable_sep_cpu(); ++ mtrr_bp_init(); ++} + +- if (c == &boot_cpu_data) +- mtrr_bp_init(); +- else +- mtrr_ap_init(); ++void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) ++{ ++ BUG_ON(c == &boot_cpu_data); ++ identify_cpu(c); ++ enable_sep_cpu(); ++ mtrr_ap_init(); + } + + #ifdef CONFIG_X86_HT +@@ -606,136 +637,47 @@ void __init early_cpu_init(void) + #endif + } + +-/* Make sure %gs is initialized properly in idle threads */ ++/* Make sure %fs is initialized properly in idle threads */ + struct pt_regs * __devinit idle_regs(struct pt_regs *regs) + { + memset(regs, 0, sizeof(struct pt_regs)); +- regs->xfs = __KERNEL_PDA; ++ regs->xfs = __KERNEL_PERCPU; + return regs; + } + +-static __cpuinit int alloc_gdt(int cpu) ++/* Current gdt points %fs at the "master" per-cpu area: after this, ++ * it's on the real one. */ ++void switch_to_new_gdt(void) + { +- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +- struct desc_struct *gdt; +- struct i386_pda *pda; +- +- gdt = (struct desc_struct *)cpu_gdt_descr->address; +- pda = cpu_pda(cpu); +- +- /* +- * This is a horrible hack to allocate the GDT. The problem +- * is that cpu_init() is called really early for the boot CPU +- * (and hence needs bootmem) but much later for the secondary +- * CPUs, when bootmem will have gone away +- */ +- if (NODE_DATA(0)->bdata->node_bootmem_map) { +- BUG_ON(gdt != NULL || pda != NULL); +- +- gdt = alloc_bootmem_pages(PAGE_SIZE); +- pda = alloc_bootmem(sizeof(*pda)); +- /* alloc_bootmem(_pages) panics on failure, so no check */ +- +- memset(gdt, 0, PAGE_SIZE); +- memset(pda, 0, sizeof(*pda)); +- } else { +- /* GDT and PDA might already have been allocated if +- this is a CPU hotplug re-insertion. */ +- if (gdt == NULL) +- gdt = (struct desc_struct *)get_zeroed_page(GFP_KERNEL); +- +- if (pda == NULL) +- pda = kmalloc_node(sizeof(*pda), GFP_KERNEL, cpu_to_node(cpu)); +- +- if (unlikely(!gdt || !pda)) { +- free_pages((unsigned long)gdt, 0); +- kfree(pda); +- return 0; +- } +- } +- +- cpu_gdt_descr->address = (unsigned long)gdt; +- cpu_pda(cpu) = pda; +- +- return 1; +-} +- +-/* Initial PDA used by boot CPU */ +-struct i386_pda boot_pda = { +- ._pda = &boot_pda, +- .cpu_number = 0, +- .pcurrent = &init_task, +-}; +- +-static inline void set_kernel_fs(void) +-{ +- /* Set %fs for this CPU's PDA. Memory clobber is to create a +- barrier with respect to any PDA operations, so the compiler +- doesn't move any before here. */ +- asm volatile ("mov %0, %%fs" : : "r" (__KERNEL_PDA) : "memory"); +-} +- +-/* Initialize the CPU's GDT and PDA. The boot CPU does this for +- itself, but secondaries find this done for them. */ +-__cpuinit int init_gdt(int cpu, struct task_struct *idle) +-{ +- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +- struct desc_struct *gdt; +- struct i386_pda *pda; +- +- /* For non-boot CPUs, the GDT and PDA should already have been +- allocated. 
*/ +- if (!alloc_gdt(cpu)) { +- printk(KERN_CRIT "CPU%d failed to allocate GDT or PDA\n", cpu); +- return 0; +- } +- +- gdt = (struct desc_struct *)cpu_gdt_descr->address; +- pda = cpu_pda(cpu); +- +- BUG_ON(gdt == NULL || pda == NULL); +- +- /* +- * Initialize the per-CPU GDT with the boot GDT, +- * and set up the GDT descriptor: +- */ +- memcpy(gdt, cpu_gdt_table, GDT_SIZE); +- cpu_gdt_descr->size = GDT_SIZE - 1; +- +- pack_descriptor((u32 *)&gdt[GDT_ENTRY_PDA].a, +- (u32 *)&gdt[GDT_ENTRY_PDA].b, +- (unsigned long)pda, sizeof(*pda) - 1, +- 0x80 | DESCTYPE_S | 0x2, 0); /* present read-write data segment */ +- +- memset(pda, 0, sizeof(*pda)); +- pda->_pda = pda; +- pda->cpu_number = cpu; +- pda->pcurrent = idle; +- +- return 1; +-} +- +-void __cpuinit cpu_set_gdt(int cpu) +-{ +- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); ++ struct Xgt_desc_struct gdt_descr; + unsigned long va, frames[16]; + int f; + +- for (va = cpu_gdt_descr->address, f = 0; +- va < cpu_gdt_descr->address + cpu_gdt_descr->size; ++ gdt_descr.address = (long)get_cpu_gdt_table(smp_processor_id()); ++ gdt_descr.size = GDT_SIZE - 1; ++ ++ for (va = gdt_descr.address, f = 0; ++ va < gdt_descr.address + gdt_descr.size; + va += PAGE_SIZE, f++) { + frames[f] = virt_to_mfn(va); + make_lowmem_page_readonly( + (void *)va, XENFEAT_writable_descriptor_tables); + } +- BUG_ON(HYPERVISOR_set_gdt(frames, cpu_gdt_descr->size / 8)); +- +- set_kernel_fs(); ++ if (HYPERVISOR_set_gdt(frames, gdt_descr.size / 8)) ++ BUG(); ++ asm("mov %0, %%fs" : : "r" (__KERNEL_PERCPU) : "memory"); + } + +-/* Common CPU init for both boot and secondary CPUs */ +-static void __cpuinit _cpu_init(int cpu, struct task_struct *curr) ++/* ++ * cpu_init() initializes state that is per-CPU. Some data is already ++ * initialized (naturally) in the bootstrap process, such as the GDT ++ * and IDT. We reload them nevertheless, this function acts as a ++ * 'CPU state barrier', nothing should get across. ++ */ ++void __cpuinit cpu_init(void) + { ++ int cpu = smp_processor_id(); ++ struct task_struct *curr = current; + #ifndef CONFIG_X86_NO_TSS + struct tss_struct * t = &per_cpu(init_tss, cpu); + #endif +@@ -757,6 +699,8 @@ static void __cpuinit _cpu_init(int cpu, + set_in_cr4(X86_CR4_TSD); + } + ++ switch_to_new_gdt(); ++ + /* + * Set up and load the per-CPU TSS and LDT + */ +@@ -794,38 +738,6 @@ static void __cpuinit _cpu_init(int cpu, + mxcsr_feature_mask_init(); + } + +-/* Entrypoint to initialize secondary CPU */ +-void __cpuinit secondary_cpu_init(void) +-{ +- int cpu = smp_processor_id(); +- struct task_struct *curr = current; +- +- _cpu_init(cpu, curr); +-} +- +-/* +- * cpu_init() initializes state that is per-CPU. Some data is already +- * initialized (naturally) in the bootstrap process, such as the GDT +- * and IDT. We reload them nevertheless, this function acts as a +- * 'CPU state barrier', nothing should get across. +- */ +-void __cpuinit cpu_init(void) +-{ +- int cpu = smp_processor_id(); +- struct task_struct *curr = current; +- +- /* Set up the real GDT and PDA, so we can transition from the +- boot versions. */ +- if (!init_gdt(cpu, curr)) { +- /* failed to allocate something; not much we can do... 
*/ +- for (;;) +- local_irq_enable(); +- } +- +- cpu_set_gdt(cpu); +- _cpu_init(cpu, curr); +-} +- + #ifdef CONFIG_HOTPLUG_CPU + void __cpuinit cpu_uninit(void) + { +Index: 10.3-2007-11-26/arch/i386/kernel/cpu/mtrr/main-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/cpu/mtrr/main-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -166,7 +166,7 @@ mtrr_del(int reg, unsigned long base, un + EXPORT_SYMBOL(mtrr_add); + EXPORT_SYMBOL(mtrr_del); + +-void __init mtrr_bp_init(void) ++__init void mtrr_bp_init(void) + { + } + +Index: 10.3-2007-11-26/arch/i386/kernel/e820-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/e820-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/e820-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -162,26 +162,27 @@ static struct resource standard_io_resou + + static int __init romsignature(const unsigned char *rom) + { ++ const unsigned short * const ptr = (const unsigned short *)rom; + unsigned short sig; + +- return probe_kernel_address((const unsigned short *)rom, sig) == 0 && +- sig == ROMSIGNATURE; ++ return probe_kernel_address(ptr, sig) == 0 && sig == ROMSIGNATURE; + } + +-static int __init romchecksum(unsigned char *rom, unsigned long length) ++static int __init romchecksum(const unsigned char *rom, unsigned long length) + { +- unsigned char sum; ++ unsigned char sum, c; + +- for (sum = 0; length; length--) +- sum += *rom++; +- return sum == 0; ++ for (sum = 0; length && probe_kernel_address(rom++, c) == 0; length--) ++ sum += c; ++ return !length && !sum; + } + + static void __init probe_roms(void) + { ++ const unsigned char *rom; + unsigned long start, length, upper; +- unsigned char *rom; +- int i; ++ unsigned char c; ++ int i; + + #ifdef CONFIG_XEN + /* Nothing to do if not running in dom0. 
*/ +@@ -198,8 +199,11 @@ static void __init probe_roms(void) + + video_rom_resource.start = start; + ++ if (probe_kernel_address(rom + 2, c) != 0) ++ continue; ++ + /* 0 < length <= 0x7f * 512, historically */ +- length = rom[2] * 512; ++ length = c * 512; + + /* if checksum okay, trust length byte */ + if (length && romchecksum(rom, length)) +@@ -233,8 +237,11 @@ static void __init probe_roms(void) + if (!romsignature(rom)) + continue; + ++ if (probe_kernel_address(rom + 2, c) != 0) ++ continue; ++ + /* 0 < length <= 0x7f * 512, historically */ +- length = rom[2] * 512; ++ length = c * 512; + + /* but accept any length that fits if checksum okay */ + if (!length || start + length > upper || !romchecksum(rom, length)) +@@ -249,7 +256,7 @@ static void __init probe_roms(void) + } + + #ifdef CONFIG_XEN +-static struct e820map machine_e820 __initdata; ++static struct e820map machine_e820; + #define e820 machine_e820 + #endif + +@@ -409,10 +416,8 @@ int __init sanitize_e820_map(struct e820 + ____________________33__ + ______________________4_ + */ +- printk("sanitize start\n"); + /* if there's only one memory region, don't bother */ + if (*pnr_map < 2) { +- printk("sanitize bail 0\n"); + return -1; + } + +@@ -421,7 +426,6 @@ int __init sanitize_e820_map(struct e820 + /* bail out if we find any unreasonable addresses in bios map */ + for (i=0; i<old_nr; i++) + if (biosmap[i].addr + biosmap[i].size < biosmap[i].addr) { +- printk("sanitize bail 1\n"); + return -1; + } + +@@ -517,7 +521,6 @@ int __init sanitize_e820_map(struct e820 + memcpy(biosmap, new_bios, new_nr*sizeof(struct e820entry)); + *pnr_map = new_nr; + +- printk("sanitize end\n"); + return 0; + } + +@@ -552,7 +555,6 @@ int __init copy_e820_map(struct e820entr + unsigned long long size = biosmap->size; + unsigned long long end = start + size; + unsigned long type = biosmap->type; +- printk("copy_e820_map() start: %016Lx size: %016Lx end: %016Lx type: %ld\n", start, size, end, type); + + /* Overflow in 64 bits? Ignore the memory map. */ + if (start > end) +@@ -564,17 +566,11 @@ int __init copy_e820_map(struct e820entr + * Not right. Fix it up. + */ + if (type == E820_RAM) { +- printk("copy_e820_map() type is E820_RAM\n"); + if (start < 0x100000ULL && end > 0xA0000ULL) { +- printk("copy_e820_map() lies in range...\n"); +- if (start < 0xA0000ULL) { +- printk("copy_e820_map() start < 0xA0000ULL\n"); ++ if (start < 0xA0000ULL) + add_memory_region(start, 0xA0000ULL-start, type); +- } +- if (end <= 0x100000ULL) { +- printk("copy_e820_map() end <= 0x100000ULL\n"); ++ if (end <= 0x100000ULL) + continue; +- } + start = 0x100000ULL; + size = end - start; + } +@@ -887,6 +883,33 @@ void __init limit_regions(unsigned long + print_memory_map("limit_regions endfunc"); + } + ++/* ++ * This function checks if any part of the range <start,end> is mapped ++ * with type. ++ */ ++int ++e820_any_mapped(u64 start, u64 end, unsigned type) ++{ ++ int i; ++#ifndef CONFIG_XEN ++ for (i = 0; i < e820.nr_map; i++) { ++ const struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; ++i) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif ++ if (type && ei->type != type) ++ continue; ++ if (ei->addr >= end || ei->addr + ei->size <= start) ++ continue; ++ return 1; ++ } ++ return 0; ++} ++EXPORT_SYMBOL_GPL(e820_any_mapped); ++ + /* + * This function checks if the entire range <start,end> is mapped with type. 
+ * +Index: 10.3-2007-11-26/arch/i386/kernel/entry-xen.S +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/entry-xen.S 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/entry-xen.S 2007-10-22 13:58:57.000000000 +0200 +@@ -15,7 +15,7 @@ + * I changed all the .align's to 4 (16 byte alignment), as that's faster + * on a 486. + * +- * Stack layout in 'ret_from_system_call': ++ * Stack layout in 'syscall_exit': + * ptrace needs to have all regs on the stack. + * if the order here is changed, it needs to be + * updated in fork.c:copy_process, signal.c:do_signal, +@@ -135,7 +135,7 @@ NMI_MASK = 0x80000000 + movl $(__USER_DS), %edx; \ + movl %edx, %ds; \ + movl %edx, %es; \ +- movl $(__KERNEL_PDA), %edx; \ ++ movl $(__KERNEL_PERCPU), %edx; \ + movl %edx, %fs + + #define RESTORE_INT_REGS \ +@@ -308,16 +308,12 @@ sysenter_past_esp: + pushl $(__USER_CS) + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET cs, 0*/ +-#ifndef CONFIG_COMPAT_VDSO + /* + * Push current_thread_info()->sysenter_return to the stack. + * A tiny bit of offset fixup is necessary - 4*4 means the 4 words + * pushed above; +8 corresponds to copy_thread's esp0 setting. + */ + pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) +-#else +- pushl $SYSENTER_RETURN +-#endif + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET eip, 0 + +@@ -345,7 +341,7 @@ sysenter_past_esp: + jae syscall_badsys + call *sys_call_table(,%eax,4) + movl %eax,PT_EAX(%esp) +- DISABLE_INTERRUPTS(CLBR_ECX|CLBR_EDX) ++ DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_OFF + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx +@@ -374,10 +370,6 @@ ENTRY(system_call) + CFI_ADJUST_CFA_OFFSET 4 + SAVE_ALL + GET_THREAD_INFO(%ebp) +- testl $TF_MASK,PT_EFLAGS(%esp) +- jz no_singlestep +- orl $_TIF_SINGLESTEP,TI_flags(%ebp) +-no_singlestep: + # system call tracing in operation / emulation + /* Note, _TIF_SECCOMP is bit number 8, and so it needs testw and not testb */ + testw $(_TIF_SYSCALL_EMU|_TIF_SYSCALL_TRACE|_TIF_SECCOMP|_TIF_SYSCALL_AUDIT),TI_flags(%ebp) +@@ -392,6 +384,10 @@ syscall_exit: + # setting need_resched or sigpending + # between sampling and the iret + TRACE_IRQS_OFF ++ testl $TF_MASK,PT_EFLAGS(%esp) # If tracing set singlestep flag on exit ++ jz no_singlestep ++ orl $_TIF_SINGLESTEP,TI_flags(%ebp) ++no_singlestep: + movl TI_flags(%ebp), %ecx + testw $_TIF_ALLWORK_MASK, %cx # current->work + jne syscall_exit_work +@@ -609,9 +605,7 @@ END(syscall_badsys) + #ifndef CONFIG_XEN + #define FIXUP_ESPFIX_STACK \ + /* since we are on a wrong stack, we cant make it a C code :( */ \ +- movl %fs:PDA_cpu, %ebx; \ +- PER_CPU(cpu_gdt_descr, %ebx); \ +- movl GDS_address(%ebx), %ebx; \ ++ PER_CPU(gdt_page, %ebx); \ + GET_DESC_BASE(GDT_ENTRY_ESPFIX_SS, %ebx, %eax, %ax, %al, %ah); \ + addl %esp, %eax; \ + pushl $__KERNEL_DS; \ +@@ -684,7 +678,7 @@ ENTRY(name) \ + SAVE_ALL; \ + TRACE_IRQS_OFF \ + movl %esp,%eax; \ +- call smp_/**/name; \ ++ call smp_##name; \ + jmp ret_from_intr; \ + CFI_ENDPROC; \ + ENDPROC(name) +@@ -692,10 +686,6 @@ ENDPROC(name) + /* The include is where all of the SMP etc. 
interrupts come from */ + #include "entry_arch.h" + +-/* This alternate entry is needed because we hijack the apic LVTT */ +-#if defined(CONFIG_VMI) && defined(CONFIG_X86_LOCAL_APIC) +-BUILD_INTERRUPT(apic_vmi_timer_interrupt,LOCAL_TIMER_VECTOR) +-#endif + #else + #define UNWIND_ESPFIX_STACK + #endif +@@ -738,7 +728,7 @@ error_code: + pushl %fs + CFI_ADJUST_CFA_OFFSET 4 + /*CFI_REL_OFFSET fs, 0*/ +- movl $(__KERNEL_PDA), %ecx ++ movl $(__KERNEL_PERCPU), %ecx + movl %ecx, %fs + UNWIND_ESPFIX_STACK + popl %ecx +Index: 10.3-2007-11-26/arch/i386/kernel/head-xen.S +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/head-xen.S 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/head-xen.S 2007-10-22 13:58:57.000000000 +0200 +@@ -37,7 +37,8 @@ ENTRY(startup_32) + /* Set up the stack pointer */ + movl $(init_thread_union+THREAD_SIZE),%esp + +- call setup_pda ++ movl %ss,%eax ++ movl %eax,%fs # gets reset once there's real percpu + + /* get vendor info */ + xorl %eax,%eax # call CPUID with 0 -> return vendor ID +@@ -64,55 +65,11 @@ ENTRY(startup_32) + xorl %eax,%eax # Clear GS + movl %eax,%gs + +- movl $(__KERNEL_PDA),%eax +- mov %eax,%fs +- + cld # gcc2 wants the direction flag cleared at all times + + pushl $0 # fake return address for unwinder + jmp start_kernel + +-/* +- * Point the GDT at this CPU's PDA. This will be +- * cpu_gdt_table and boot_pda. +- */ +-ENTRY(setup_pda) +- /* get the PDA pointer */ +- movl $boot_pda, %eax +- +- /* slot the PDA address into the GDT */ +- mov $cpu_gdt_table, %ecx +- mov %ax, (__KERNEL_PDA+0+2)(%ecx) /* base & 0x0000ffff */ +- shr $16, %eax +- mov %al, (__KERNEL_PDA+4+0)(%ecx) /* base & 0x00ff0000 */ +- mov %ah, (__KERNEL_PDA+4+3)(%ecx) /* base & 0xff000000 */ +- +- # %esi still points to start_info, and no registers +- # need to be preserved. +- +- movl XEN_START_mfn_list(%esi), %ebx +- movl $(cpu_gdt_table - __PAGE_OFFSET), %eax +- shrl $PAGE_SHIFT, %eax +- movl (%ebx,%eax,4), %ecx +- pushl %ecx # frame number for set_gdt below +- +- xorl %esi, %esi +- xorl %edx, %edx +- shldl $PAGE_SHIFT, %ecx, %edx +- shll $PAGE_SHIFT, %ecx +- orl $0x61, %ecx +- movl $cpu_gdt_table, %ebx +- movl $__HYPERVISOR_update_va_mapping, %eax +- int $0x82 +- +- movl $(PAGE_SIZE_asm / 8), %ecx +- movl %esp, %ebx +- movl $__HYPERVISOR_set_gdt, %eax +- int $0x82 +- +- popl %ecx +- ret +- + #define HYPERCALL_PAGE_OFFSET 0x1000 + .org HYPERCALL_PAGE_OFFSET + ENTRY(hypercall_page) +@@ -138,60 +95,6 @@ ENTRY(empty_zero_page) + */ + .data + +-/* +- * The Global Descriptor Table contains 28 quadwords, per-CPU. 
+- */ +- .section .data.page_aligned, "aw" +- .align PAGE_SIZE_asm +-ENTRY(cpu_gdt_table) +- .quad 0x0000000000000000 /* NULL descriptor */ +- .quad 0x0000000000000000 /* 0x0b reserved */ +- .quad 0x0000000000000000 /* 0x13 reserved */ +- .quad 0x0000000000000000 /* 0x1b reserved */ +- .quad 0x0000000000000000 /* 0x20 unused */ +- .quad 0x0000000000000000 /* 0x28 unused */ +- .quad 0x0000000000000000 /* 0x33 TLS entry 1 */ +- .quad 0x0000000000000000 /* 0x3b TLS entry 2 */ +- .quad 0x0000000000000000 /* 0x43 TLS entry 3 */ +- .quad 0x0000000000000000 /* 0x4b reserved */ +- .quad 0x0000000000000000 /* 0x53 reserved */ +- .quad 0x0000000000000000 /* 0x5b reserved */ +- +- .quad 0x00cf9a000000ffff /* 0x60 kernel 4GB code at 0x00000000 */ +- .quad 0x00cf92000000ffff /* 0x68 kernel 4GB data at 0x00000000 */ +- .quad 0x00cffa000000ffff /* 0x73 user 4GB code at 0x00000000 */ +- .quad 0x00cff2000000ffff /* 0x7b user 4GB data at 0x00000000 */ +- +- .quad 0x0000000000000000 /* 0x80 TSS descriptor */ +- .quad 0x0000000000000000 /* 0x88 LDT descriptor */ +- +- /* +- * Segments used for calling PnP BIOS have byte granularity. +- * They code segments and data segments have fixed 64k limits, +- * the transfer segment sizes are set at run time. +- */ +- .quad 0x0000000000000000 /* 0x90 32-bit code */ +- .quad 0x0000000000000000 /* 0x98 16-bit code */ +- .quad 0x0000000000000000 /* 0xa0 16-bit data */ +- .quad 0x0000000000000000 /* 0xa8 16-bit data */ +- .quad 0x0000000000000000 /* 0xb0 16-bit data */ +- +- /* +- * The APM segments have byte granularity and their bases +- * are set at run time. All have 64k limits. +- */ +- .quad 0x0000000000000000 /* 0xb8 APM CS code */ +- .quad 0x0000000000000000 /* 0xc0 APM CS 16 code (16 bit) */ +- .quad 0x0000000000000000 /* 0xc8 APM DS data */ +- +- .quad 0x0000000000000000 /* 0xd0 - ESPFIX SS */ +- .quad 0x00cf92000000ffff /* 0xd8 - PDA */ +- .quad 0x0000000000000000 /* 0xe0 - unused */ +- .quad 0x0000000000000000 /* 0xe8 - unused */ +- .quad 0x0000000000000000 /* 0xf0 - unused */ +- .quad 0x0000000000000000 /* 0xf8 - GDT entry 31: double-fault TSS */ +- .align PAGE_SIZE_asm +- + #if CONFIG_XEN_COMPAT <= 0x030002 + /* + * __xen_guest information +Index: 10.3-2007-11-26/arch/i386/kernel/io_apic-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/io_apic-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/io_apic-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -25,7 +25,6 @@ + #include <linux/init.h> + #include <linux/delay.h> + #include <linux/sched.h> +-#include <linux/smp_lock.h> + #include <linux/mc146818rtc.h> + #include <linux/compiler.h> + #include <linux/acpi.h> +@@ -35,6 +34,7 @@ + #include <linux/msi.h> + #include <linux/htirq.h> + #include <linux/freezer.h> ++#include <linux/kthread.h> + + #include <asm/io.h> + #include <asm/smp.h> +@@ -705,8 +705,6 @@ static int balanced_irq(void *unused) + unsigned long prev_balance_time = jiffies; + long time_remaining = balanced_irq_interval; + +- daemonize("kirqd"); +- + /* push everything to CPU 0 to give us a starting point. 
*/ + for (i = 0 ; i < NR_IRQS ; i++) { + irq_desc[i].pending_mask = cpumask_of_cpu(0); +@@ -766,10 +764,9 @@ static int __init balanced_irq_init(void + } + + printk(KERN_INFO "Starting balanced_irq\n"); +- if (kernel_thread(balanced_irq, NULL, CLONE_KERNEL) >= 0) ++ if (!IS_ERR(kthread_run(balanced_irq, NULL, "kirqd"))) + return 0; +- else +- printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); ++ printk(KERN_ERR "balanced_irq_init: failed to spawn balanced_irq"); + failed: + for_each_possible_cpu(i) { + kfree(irq_cpu_data[i].irq_delta); +@@ -1445,10 +1442,6 @@ static void __init setup_ExtINT_IRQ0_pin + enable_8259A_irq(0); + } + +-static inline void UNEXPECTED_IO_APIC(void) +-{ +-} +- + void __init print_IO_APIC(void) + { + int apic, i; +@@ -1488,34 +1481,12 @@ void __init print_IO_APIC(void) + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); + printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); + printk(KERN_DEBUG "....... : LTS : %X\n", reg_00.bits.LTS); +- if (reg_00.bits.ID >= get_physical_broadcast()) +- UNEXPECTED_IO_APIC(); +- if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", reg_01.raw); + printk(KERN_DEBUG "....... : max redirection entries: %04X\n", reg_01.bits.entries); +- if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ +- (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ +- (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ +- (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ +- (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ +- (reg_01.bits.entries != 0x2E) && +- (reg_01.bits.entries != 0x3F) +- ) +- UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); +- if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ +- (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ +- (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ +- (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ +- (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ +- ) +- UNEXPECTED_IO_APIC(); +- if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + + /* + * Some Intel chipsets with IO APIC VERSION of 0x1? don't have reg_02, +@@ -1525,8 +1496,6 @@ void __init print_IO_APIC(void) + if (reg_01.bits.version >= 0x10 && reg_02.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); +- if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + } + + /* +@@ -1538,8 +1507,6 @@ void __init print_IO_APIC(void) + reg_03.raw != reg_01.raw) { + printk(KERN_DEBUG ".... register #03: %08X\n", reg_03.raw); + printk(KERN_DEBUG "....... : Boot DT : %X\n", reg_03.bits.boot_DT); +- if (reg_03.bits.__reserved_1) +- UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... 
IRQ redirection table:\n"); +@@ -2679,19 +2646,19 @@ int arch_setup_msi_irq(struct pci_dev *d + if (irq < 0) + return irq; + +- set_irq_msi(irq, desc); + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + ++ set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, + "edge"); + +- return irq; ++ return 0; + } + + void arch_teardown_msi_irq(unsigned int irq) +Index: 10.3-2007-11-26/arch/i386/kernel/ioport-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/ioport-xen.c 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/arch/i386/kernel/ioport-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -12,10 +12,10 @@ + #include <linux/types.h> + #include <linux/ioport.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/stddef.h> + #include <linux/slab.h> + #include <linux/thread_info.h> ++#include <linux/syscalls.h> + #include <xen/interface/physdev.h> + + /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */ +Index: 10.3-2007-11-26/arch/i386/kernel/irq-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/irq-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/irq-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -24,6 +24,9 @@ + DEFINE_PER_CPU(irq_cpustat_t, irq_stat) ____cacheline_internodealigned_in_smp; + EXPORT_PER_CPU_SYMBOL(irq_stat); + ++DEFINE_PER_CPU(struct pt_regs *, irq_regs); ++EXPORT_PER_CPU_SYMBOL(irq_regs); ++ + /* + * 'what should we do if we get a hw irq event on an illegal vector'. + * each architecture has to answer this themselves. +Index: 10.3-2007-11-26/arch/i386/kernel/ldt-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/ldt-xen.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/ldt-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -10,7 +10,6 @@ + #include <linux/string.h> + #include <linux/mm.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/vmalloc.h> + #include <linux/slab.h> + +Index: 10.3-2007-11-26/arch/i386/kernel/microcode-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/microcode-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/microcode-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -135,7 +135,7 @@ static int __init microcode_dev_init (vo + return 0; + } + +-static void __exit microcode_dev_exit (void) ++static void microcode_dev_exit (void) + { + misc_deregister(µcode_dev); + } +Index: 10.3-2007-11-26/arch/i386/kernel/mpparse-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/mpparse-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/mpparse-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -18,7 +18,6 @@ + #include <linux/acpi.h> + #include <linux/delay.h> + #include <linux/bootmem.h> +-#include <linux/smp_lock.h> + #include <linux/kernel_stat.h> + #include <linux/mc146818rtc.h> + #include <linux/bitops.h> +@@ -484,7 +483,7 @@ static int __init smp_read_mpc(struct mp + } + ++mpc_record; + } +- clustered_apic_check(); ++ setup_apic_routing(); + if (!num_processors) + printk(KERN_ERR "SMP mptable: no processors registered!\n"); + return num_processors; +Index: 
10.3-2007-11-26/arch/i386/kernel/pci-dma-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/pci-dma-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/pci-dma-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,6 +13,7 @@ + #include <linux/pci.h> + #include <linux/module.h> + #include <linux/version.h> ++#include <linux/pci.h> + #include <asm/io.h> + #include <xen/balloon.h> + #include <xen/gnttab.h> +@@ -251,7 +252,7 @@ int dma_declare_coherent_memory(struct d + { + void __iomem *mem_base = NULL; + int pages = size >> PAGE_SHIFT; +- int bitmap_size = (pages + 31)/32; ++ int bitmap_size = BITS_TO_LONGS(pages) * sizeof(long); + + if ((flags & (DMA_MEMORY_MAP | DMA_MEMORY_IO)) == 0) + goto out; +@@ -324,6 +325,32 @@ void *dma_mark_declared_memory_occupied( + EXPORT_SYMBOL(dma_mark_declared_memory_occupied); + #endif /* ARCH_HAS_DMA_DECLARE_COHERENT_MEMORY */ + ++#if defined(CONFIG_PCI) && !defined(CONFIG_XEN) ++/* Many VIA bridges seem to corrupt data for DAC. Disable it here */ ++ ++int forbid_dac; ++EXPORT_SYMBOL(forbid_dac); ++ ++static __devinit void via_no_dac(struct pci_dev *dev) ++{ ++ if ((dev->class >> 8) == PCI_CLASS_BRIDGE_PCI && forbid_dac == 0) { ++ printk(KERN_INFO "PCI: VIA PCI bridge detected. Disabling DAC.\n"); ++ forbid_dac = 1; ++ } ++} ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_VIA, PCI_ANY_ID, via_no_dac); ++ ++static int check_iommu(char *s) ++{ ++ if (!strcmp(s, "usedac")) { ++ forbid_dac = -1; ++ return 1; ++ } ++ return 0; ++} ++__setup("iommu=", check_iommu); ++#endif ++ + dma_addr_t + dma_map_single(struct device *dev, void *ptr, size_t size, + enum dma_data_direction direction) +Index: 10.3-2007-11-26/arch/i386/kernel/process-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/process-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/process-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -21,7 +21,6 @@ + #include <linux/mm.h> + #include <linux/elfcore.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/stddef.h> + #include <linux/slab.h> + #include <linux/vmalloc.h> +@@ -39,6 +38,7 @@ + #include <linux/random.h> + #include <linux/personality.h> + #include <linux/tick.h> ++#include <linux/percpu.h> + + #include <asm/uaccess.h> + #include <asm/pgtable.h> +@@ -61,7 +61,6 @@ + + #include <asm/tlbflush.h> + #include <asm/cpu.h> +-#include <asm/pda.h> + + asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); + +@@ -70,6 +69,12 @@ static int hlt_counter; + unsigned long boot_option_idle_override = 0; + EXPORT_SYMBOL(boot_option_idle_override); + ++DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; ++EXPORT_PER_CPU_SYMBOL(current_task); ++ ++DEFINE_PER_CPU(int, cpu_number); ++EXPORT_PER_CPU_SYMBOL(cpu_number); ++ + /* + * Return saved PC of a blocked thread. 
+ */ +@@ -168,6 +173,7 @@ void cpu_idle(void) + if (__get_cpu_var(cpu_idle_state)) + __get_cpu_var(cpu_idle_state) = 0; + ++ check_pgt_cache(); + rmb(); + idle = xen_idle; /* no alternatives */ + +@@ -218,18 +224,19 @@ void __devinit select_idle_routine(const + { + } + +-static int __init idle_setup (char *str) ++static int __init idle_setup(char *str) + { +- if (!strncmp(str, "poll", 4)) { ++ if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; + } ++ else ++ return -1; + + boot_option_idle_override = 1; +- return 1; ++ return 0; + } +- +-__setup("idle=", idle_setup); ++early_param("idle", idle_setup); + + void show_regs(struct pt_regs * regs) + { +@@ -282,7 +289,7 @@ int kernel_thread(int (*fn)(void *), voi + + regs.xds = __USER_DS; + regs.xes = __USER_DS; +- regs.xfs = __KERNEL_PDA; ++ regs.xfs = __KERNEL_PERCPU; + regs.orig_eax = -1; + regs.eip = (unsigned long) kernel_thread_helper; + regs.xcs = __KERNEL_CS | get_kernel_rpl(); +@@ -555,7 +562,7 @@ struct task_struct fastcall * __switch_t + * multicall to indicate FPU task switch, rather than + * synchronously trapping to Xen. + */ +- if (prev_p->thread_info->status & TS_USEDFPU) { ++ if (task_thread_info(prev_p)->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; +@@ -645,7 +652,7 @@ struct task_struct fastcall * __switch_t + if (prev->gs | next->gs) + loadsegment(gs, next->gs); + +- write_pda(pcurrent, next_p); ++ x86_write_percpu(current_task, next_p); + + return prev_p; + } +Index: 10.3-2007-11-26/arch/i386/kernel/quirks-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/quirks-xen.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/quirks-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -3,12 +3,10 @@ + */ + #include <linux/pci.h> + #include <linux/irq.h> +-#include <asm/pci-direct.h> +-#include <asm/genapic.h> +-#include <asm/cpu.h> + + #if defined(CONFIG_X86_IO_APIC) && (defined(CONFIG_SMP) || defined(CONFIG_XEN)) && defined(CONFIG_PCI) +-static void __devinit verify_quirk_intel_irqbalance(struct pci_dev *dev) ++ ++static void __devinit quirk_intel_irqbalance(struct pci_dev *dev) + { + u8 config, rev; + u32 word; +@@ -16,7 +14,7 @@ static void __devinit verify_quirk_intel + /* BIOS may enable hardware IRQ balancing for + * E7520/E7320/E7525(revision ID 0x9 and below) + * based platforms. +- * For those platforms, make sure that the genapic is set to 'flat' ++ * Disable SW irqbalance/affinity on those platforms. + */ + pci_read_config_byte(dev, PCI_CLASS_REVISION, &rev); + if (rev > 0x9) +@@ -30,59 +28,19 @@ static void __devinit verify_quirk_intel + raw_pci_ops->read(0, 0, 0x40, 0x4c, 2, &word); + + if (!(word & (1 << 13))) { +-#ifndef CONFIG_XEN +-#ifdef CONFIG_X86_64 +- if (genapic != &apic_flat) +- panic("APIC mode must be flat on this system\n"); +-#elif defined(CONFIG_X86_GENERICARCH) +- if (genapic != &apic_default) +- panic("APIC mode must be default(flat) on this system. Use apic=default\n"); +-#endif +-#endif +- } +- +- /* put back the original value for config space*/ +- if (!(config & 0x2)) +- pci_write_config_byte(dev, 0xf4, config); +-} +- +-void __init quirk_intel_irqbalance(void) +-{ +- u8 config, rev; +- u32 word; +- +- /* BIOS may enable hardware IRQ balancing for +- * E7520/E7320/E7525(revision ID 0x9 and below) +- * based platforms. +- * Disable SW irqbalance/affinity on those platforms. 
+- */ +- rev = read_pci_config_byte(0, 0, 0, PCI_CLASS_REVISION); +- if (rev > 0x9) +- return; +- +- printk(KERN_INFO "Intel E7520/7320/7525 detected."); +- +- /* enable access to config space */ +- config = read_pci_config_byte(0, 0, 0, 0xf4); +- write_pci_config_byte(0, 0, 0, 0xf4, config|0x2); +- +- /* read xTPR register */ +- word = read_pci_config_16(0, 0, 0x40, 0x4c); +- +- if (!(word & (1 << 13))) { + struct xen_platform_op op; +- printk(KERN_INFO "Disabling irq balancing and affinity\n"); ++ printk(KERN_INFO "Intel E7520/7320/7525 detected. " ++ "Disabling irq balancing and affinity\n"); + op.cmd = XENPF_platform_quirk; + op.u.platform_quirk.quirk_id = QUIRK_NOIRQBALANCING; + (void)HYPERVISOR_platform_op(&op); + } + +- /* put back the original value for config space */ ++ /* put back the original value for config space*/ + if (!(config & 0x2)) +- write_pci_config_byte(0, 0, 0, 0xf4, config); ++ pci_write_config_byte(dev, 0xf4, config); + } +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, verify_quirk_intel_irqbalance); +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, verify_quirk_intel_irqbalance); +-DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, verify_quirk_intel_irqbalance); +- ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7320_MCH, quirk_intel_irqbalance); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7525_MCH, quirk_intel_irqbalance); ++DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_E7520_MCH, quirk_intel_irqbalance); + #endif +Index: 10.3-2007-11-26/arch/i386/kernel/smp-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/smp-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/smp-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,7 +13,6 @@ + #include <linux/mm.h> + #include <linux/delay.h> + #include <linux/spinlock.h> +-#include <linux/smp_lock.h> + #include <linux/kernel_stat.h> + #include <linux/mc146818rtc.h> + #include <linux/cache.h> +@@ -216,7 +215,6 @@ static cpumask_t flush_cpumask; + static struct mm_struct * flush_mm; + static unsigned long flush_va; + static DEFINE_SPINLOCK(tlbstate_lock); +-#define FLUSH_ALL 0xffffffff + + /* + * We cannot call mmdrop() because we are in interrupt context, +@@ -298,7 +296,7 @@ irqreturn_t smp_invalidate_interrupt(int + + if (flush_mm == per_cpu(cpu_tlbstate, cpu).active_mm) { + if (per_cpu(cpu_tlbstate, cpu).state == TLBSTATE_OK) { +- if (flush_va == FLUSH_ALL) ++ if (flush_va == TLB_FLUSH_ALL) + local_flush_tlb(); + else + __flush_tlb_one(flush_va); +@@ -314,9 +312,11 @@ out: + return IRQ_HANDLED; + } + +-static void flush_tlb_others(cpumask_t cpumask, struct mm_struct *mm, +- unsigned long va) ++void xen_flush_tlb_others(const cpumask_t *cpumaskp, struct mm_struct *mm, ++ unsigned long va) + { ++ cpumask_t cpumask = *cpumaskp; ++ + /* + * A couple of (to be removed) sanity checks: + * +@@ -327,10 +327,12 @@ static void flush_tlb_others(cpumask_t c + BUG_ON(cpu_isset(smp_processor_id(), cpumask)); + BUG_ON(!mm); + ++#ifdef CONFIG_HOTPLUG_CPU + /* If a CPU which we ran on has gone down, OK. 
*/ + cpus_and(cpumask, cpumask, cpu_online_map); +- if (cpus_empty(cpumask)) ++ if (unlikely(cpus_empty(cpumask))) + return; ++#endif + + /* + * i'm not happy about this global shared spinlock in the +@@ -341,17 +343,7 @@ static void flush_tlb_others(cpumask_t c + + flush_mm = mm; + flush_va = va; +-#if NR_CPUS <= BITS_PER_LONG +- atomic_set_mask(cpumask, &flush_cpumask); +-#else +- { +- int k; +- unsigned long *flush_mask = (unsigned long *)&flush_cpumask; +- unsigned long *cpu_mask = (unsigned long *)&cpumask; +- for (k = 0; k < BITS_TO_LONGS(NR_CPUS); ++k) +- atomic_set_mask(cpu_mask[k], &flush_mask[k]); +- } +-#endif ++ cpus_or(flush_cpumask, cpumask, flush_cpumask); + /* + * We have to send the IPI only to + * CPUs affected. +@@ -378,7 +370,7 @@ void flush_tlb_current_task(void) + + local_flush_tlb(); + if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, FLUSH_ALL); ++ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); + preempt_enable(); + } + +@@ -397,7 +389,7 @@ void flush_tlb_mm (struct mm_struct * mm + leave_mm(smp_processor_id()); + } + if (!cpus_empty(cpu_mask)) +- flush_tlb_others(cpu_mask, mm, FLUSH_ALL); ++ flush_tlb_others(cpu_mask, mm, TLB_FLUSH_ALL); + + preempt_enable(); + } +@@ -460,7 +452,7 @@ void flush_tlb_all(void) + * it goes straight through and wastes no time serializing + * anything. Worst case is that we lose a reschedule ... + */ +-void smp_send_reschedule(int cpu) ++void xen_smp_send_reschedule(int cpu) + { + WARN_ON(cpu_is_offline(cpu)); + send_IPI_mask(cpumask_of_cpu(cpu), RESCHEDULE_VECTOR); +@@ -492,36 +484,79 @@ void unlock_ipi_call_lock(void) + + static struct call_data_struct *call_data; + ++static void __smp_call_function(void (*func) (void *info), void *info, ++ int nonatomic, int wait) ++{ ++ struct call_data_struct data; ++ int cpus = num_online_cpus() - 1; ++ ++ if (!cpus) ++ return; ++ ++ data.func = func; ++ data.info = info; ++ atomic_set(&data.started, 0); ++ data.wait = wait; ++ if (wait) ++ atomic_set(&data.finished, 0); ++ ++ call_data = &data; ++ mb(); ++ ++ /* Send a message to all other CPUs and wait for them to respond */ ++ send_IPI_allbutself(CALL_FUNCTION_VECTOR); ++ ++ /* Wait for response */ ++ while (atomic_read(&data.started) != cpus) ++ cpu_relax(); ++ ++ if (wait) ++ while (atomic_read(&data.finished) != cpus) ++ cpu_relax(); ++} ++ ++ + /** +- * smp_call_function(): Run a function on all other CPUs. ++ * smp_call_function_mask(): Run a function on a set of other CPUs. ++ * @mask: The set of cpus to run on. Must not include the current cpu. + * @func: The function to run. This must be fast and non-blocking. + * @info: An arbitrary pointer to pass to the function. +- * @nonatomic: currently unused. + * @wait: If true, wait (atomically) until function has completed on other CPUs. + * +- * Returns 0 on success, else a negative status code. Does not return until +- * remote CPUs are nearly ready to execute <<func>> or are or have executed. ++ * Returns 0 on success, else a negative status code. ++ * ++ * If @wait is true, then returns once @func has returned; otherwise ++ * it returns just before the target cpu calls @func. + * + * You must not call this function with disabled interrupts or from a + * hardware interrupt handler or from a bottom half handler. 
+ */ +-int smp_call_function (void (*func) (void *info), void *info, int nonatomic, +- int wait) ++int ++xen_smp_call_function_mask(cpumask_t mask, ++ void (*func)(void *), void *info, ++ int wait) + { + struct call_data_struct data; ++ cpumask_t allbutself; + int cpus; + ++ /* Can deadlock when called with interrupts disabled */ ++ WARN_ON(irqs_disabled()); ++ + /* Holding any lock stops cpus from going down. */ + spin_lock(&call_lock); +- cpus = num_online_cpus() - 1; ++ ++ allbutself = cpu_online_map; ++ cpu_clear(smp_processor_id(), allbutself); ++ ++ cpus_and(mask, mask, allbutself); ++ cpus = cpus_weight(mask); ++ + if (!cpus) { + spin_unlock(&call_lock); + return 0; + } + +- /* Can deadlock when called with interrupts disabled */ +- WARN_ON(irqs_disabled()); +- + data.func = func; + data.info = info; + atomic_set(&data.started, 0); +@@ -531,9 +566,12 @@ int smp_call_function (void (*func) (voi + + call_data = &data; + mb(); +- +- /* Send a message to all other CPUs and wait for them to respond */ +- send_IPI_allbutself(CALL_FUNCTION_VECTOR); ++ ++ /* Send a message to other CPUs */ ++ if (cpus_equal(mask, allbutself)) ++ send_IPI_allbutself(CALL_FUNCTION_VECTOR); ++ else ++ send_IPI_mask(mask, CALL_FUNCTION_VECTOR); + + /* Wait for response */ + while (atomic_read(&data.started) != cpus) +@@ -546,15 +584,14 @@ int smp_call_function (void (*func) (voi + + return 0; + } +-EXPORT_SYMBOL(smp_call_function); + + static void stop_this_cpu (void * dummy) + { ++ local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); +- local_irq_disable(); + #if 0 + disable_local_APIC(); + #endif +@@ -567,15 +604,20 @@ static void stop_this_cpu (void * dummy) + * this function calls the 'stop' function on all other CPUs in the system. + */ + +-void smp_send_stop(void) ++void xen_smp_send_stop(void) + { +- smp_call_function(stop_this_cpu, NULL, 1, 0); ++ /* Don't deadlock on the call lock in panic */ ++ int nolock = !spin_trylock(&call_lock); ++ unsigned long flags; + +- local_irq_disable(); ++ local_irq_save(flags); ++ __smp_call_function(stop_this_cpu, NULL, 0, 0); ++ if (!nolock) ++ spin_unlock(&call_lock); + #if 0 + disable_local_APIC(); + #endif +- local_irq_enable(); ++ local_irq_restore(flags); + } + + /* +@@ -616,74 +658,3 @@ irqreturn_t smp_call_function_interrupt( + + return IRQ_HANDLED; + } +- +-/* +- * this function sends a 'generic call function' IPI to one other CPU +- * in the system. +- * +- * cpu is a standard Linux logical CPU number. +- */ +-static void +-__smp_call_function_single(int cpu, void (*func) (void *info), void *info, +- int nonatomic, int wait) +-{ +- struct call_data_struct data; +- int cpus = 1; +- +- data.func = func; +- data.info = info; +- atomic_set(&data.started, 0); +- data.wait = wait; +- if (wait) +- atomic_set(&data.finished, 0); +- +- call_data = &data; +- wmb(); +- /* Send a message to all other CPUs and wait for them to respond */ +- send_IPI_mask(cpumask_of_cpu(cpu), CALL_FUNCTION_VECTOR); +- +- /* Wait for response */ +- while (atomic_read(&data.started) != cpus) +- cpu_relax(); +- +- if (!wait) +- return; +- +- while (atomic_read(&data.finished) != cpus) +- cpu_relax(); +-} +- +-/* +- * smp_call_function_single - Run a function on another CPU +- * @func: The function to run. This must be fast and non-blocking. +- * @info: An arbitrary pointer to pass to the function. +- * @nonatomic: Currently unused. +- * @wait: If true, wait until function has completed on other CPUs. 
+- * +- * Retrurns 0 on success, else a negative status code. +- * +- * Does not return until the remote CPU is nearly ready to execute <func> +- * or is or has executed. +- */ +- +-int smp_call_function_single(int cpu, void (*func) (void *info), void *info, +- int nonatomic, int wait) +-{ +- /* prevent preemption and reschedule on another processor */ +- int me = get_cpu(); +- if (cpu == me) { +- WARN_ON(1); +- put_cpu(); +- return -EBUSY; +- } +- +- /* Can deadlock when called with interrupts disabled */ +- WARN_ON(irqs_disabled()); +- +- spin_lock_bh(&call_lock); +- __smp_call_function_single(cpu, func, info, nonatomic, wait); +- spin_unlock_bh(&call_lock); +- put_cpu(); +- return 0; +-} +-EXPORT_SYMBOL(smp_call_function_single); +Index: 10.3-2007-11-26/arch/i386/kernel/swiotlb.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/swiotlb.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/swiotlb.c 2007-10-22 13:58:57.000000000 +0200 +@@ -729,7 +729,6 @@ swiotlb_dma_supported (struct device *hw + return (mask >= ((1UL << dma_bits) - 1)); + } + +-EXPORT_SYMBOL(swiotlb_init); + EXPORT_SYMBOL(swiotlb_map_single); + EXPORT_SYMBOL(swiotlb_unmap_single); + EXPORT_SYMBOL(swiotlb_map_sg); +Index: 10.3-2007-11-26/arch/i386/kernel/time-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/time-xen.c 2007-12-06 17:32:21.000000000 +0100 ++++ 10.3-2007-11-26/arch/i386/kernel/time-xen.c 2007-12-06 17:32:30.000000000 +0100 +@@ -79,7 +79,6 @@ + #include <asm/i8253.h> + DEFINE_SPINLOCK(i8253_lock); + EXPORT_SYMBOL(i8253_lock); +-int pit_latch_buggy; /* extern */ + #else + volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; + #endif +@@ -593,7 +592,7 @@ irqreturn_t timer_interrupt(int irq, voi + return IRQ_HANDLED; + } + +-void mark_tsc_unstable(void) ++void mark_tsc_unstable(char *reason) + { + #ifndef CONFIG_XEN /* XXX Should tell the hypervisor about this fact. */ + tsc_unstable = 1; +@@ -812,7 +811,7 @@ static void setup_cpu0_timer_irq(void) + VIRQ_TIMER, + 0, + timer_interrupt, +- SA_INTERRUPT, ++ IRQF_DISABLED, + "timer0", + NULL); + BUG_ON(per_cpu(timer_irq, 0) < 0); +@@ -922,21 +921,21 @@ static void start_hz_timer(void) + cpu_clear(smp_processor_id(), nohz_cpu_mask); + } + +-void raw_safe_halt(void) ++void xen_safe_halt(void) + { + stop_hz_timer(); + /* Blocking includes an implicit local_irq_enable(). */ + HYPERVISOR_block(); + start_hz_timer(); + } +-EXPORT_SYMBOL(raw_safe_halt); ++EXPORT_SYMBOL(xen_safe_halt); + +-void halt(void) ++void xen_halt(void) + { + if (irqs_disabled()) + HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); + } +-EXPORT_SYMBOL(halt); ++EXPORT_SYMBOL(xen_halt); + + /* No locking required. Interrupts are disabled on all CPUs. 
*/ + void time_resume(void) +@@ -983,7 +982,7 @@ int local_setup_timer(unsigned int cpu) + irq = bind_virq_to_irqhandler(VIRQ_TIMER, + cpu, + timer_interrupt, +- SA_INTERRUPT, ++ IRQF_DISABLED, + timer_name[cpu], + NULL); + if (irq < 0) +Index: 10.3-2007-11-26/arch/i386/kernel/traps-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/kernel/traps-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/kernel/traps-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -52,7 +52,7 @@ + #include <asm/unwind.h> + #include <asm/smp.h> + #include <asm/arch_hooks.h> +-#include <asm/kdebug.h> ++#include <linux/kdebug.h> + #include <asm/stacktrace.h> + + #include <linux/module.h> +@@ -101,20 +101,6 @@ asmlinkage void machine_check(void); + + int kstack_depth_to_print = 24; + static unsigned int code_bytes = 64; +-ATOMIC_NOTIFIER_HEAD(i386die_chain); +- +-int register_die_notifier(struct notifier_block *nb) +-{ +- vmalloc_sync_all(); +- return atomic_notifier_chain_register(&i386die_chain, nb); +-} +-EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ +- +-int unregister_die_notifier(struct notifier_block *nb) +-{ +- return atomic_notifier_chain_unregister(&i386die_chain, nb); +-} +-EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ + + static inline int valid_stack_ptr(struct thread_info *tinfo, void *p) + { +@@ -325,7 +311,7 @@ void show_registers(struct pt_regs *regs + regs->xds & 0xffff, regs->xes & 0xffff, regs->xfs & 0xffff, gs, ss); + printk(KERN_EMERG "Process %.*s (pid: %d, ti=%p task=%p task.ti=%p)", + TASK_COMM_LEN, current->comm, current->pid, +- current_thread_info(), current, current->thread_info); ++ current_thread_info(), current, task_thread_info(current)); + /* + * When in-kernel, we also print out the stack and code at the + * time of the fault.. +@@ -482,8 +468,6 @@ static void __kprobes do_trap(int trapnr + siginfo_t *info) + { + struct task_struct *tsk = current; +- tsk->thread.error_code = error_code; +- tsk->thread.trap_no = trapnr; + + if (regs->eflags & VM_MASK) { + if (vm86) +@@ -495,6 +479,18 @@ static void __kprobes do_trap(int trapnr + goto kernel_trap; + + trap_signal: { ++ /* ++ * We want error_code and trap_no set for userspace faults and ++ * kernelspace faults which result in die(), but not ++ * kernelspace faults which are fixed up. die() gives the ++ * process no chance to handle the signal and notice the ++ * kernel fault information, so that won't result in polluting ++ * the information about previously queued, but not yet ++ * delivered, faults. See also do_general_protection below. 
++ */ ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = trapnr; ++ + if (info) + force_sig_info(signr, info, tsk); + else +@@ -503,8 +499,11 @@ static void __kprobes do_trap(int trapnr + } + + kernel_trap: { +- if (!fixup_exception(regs)) ++ if (!fixup_exception(regs)) { ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = trapnr; + die(str, regs, error_code); ++ } + return; + } + +@@ -578,9 +577,6 @@ DO_ERROR_INFO(32, SIGSEGV, "iret excepti + fastcall void __kprobes do_general_protection(struct pt_regs * regs, + long error_code) + { +- current->thread.error_code = error_code; +- current->thread.trap_no = 13; +- + if (regs->eflags & VM_MASK) + goto gp_in_vm86; + +@@ -599,6 +595,8 @@ gp_in_vm86: + + gp_in_kernel: + if (!fixup_exception(regs)) { ++ current->thread.error_code = error_code; ++ current->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; +@@ -987,9 +985,7 @@ fastcall void do_spurious_interrupt_bug( + fastcall unsigned long patch_espfix_desc(unsigned long uesp, + unsigned long kesp) + { +- int cpu = smp_processor_id(); +- struct Xgt_desc_struct *cpu_gdt_descr = &per_cpu(cpu_gdt_descr, cpu); +- struct desc_struct *gdt = (struct desc_struct *)cpu_gdt_descr->address; ++ struct desc_struct *gdt = __get_cpu_var(gdt_page).gdt; + unsigned long base = (kesp - uesp) & -THREAD_SIZE; + unsigned long new_kesp = kesp - base; + unsigned long lim_pages = (new_kesp | (THREAD_SIZE - 1)) >> PAGE_SHIFT; +Index: 10.3-2007-11-26/arch/i386/mm/fault-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/mm/fault-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/mm/fault-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -14,19 +14,20 @@ + #include <linux/mman.h> + #include <linux/mm.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/init.h> + #include <linux/tty.h> + #include <linux/vt_kern.h> /* For unblank_screen() */ + #include <linux/highmem.h> ++#include <linux/bootmem.h> /* for max_low_pfn */ ++#include <linux/vmalloc.h> + #include <linux/module.h> + #include <linux/kprobes.h> + #include <linux/uaccess.h> ++#include <linux/kdebug.h> + + #include <asm/system.h> + #include <asm/desc.h> +-#include <asm/kdebug.h> + #include <asm/segment.h> + + extern void die(const char *,struct pt_regs *,long); +@@ -259,25 +260,20 @@ static void dump_fault_path(unsigned lon + unsigned long page; + + page = read_cr3(); +- page = ((unsigned long *) __va(page))[address >> 22]; +- if (oops_may_print()) +- printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, +- machine_to_phys(page)); ++ page = ((unsigned long *) __va(page))[address >> PGDIR_SHIFT]; ++ printk(KERN_ALERT "*pde = ma %08lx pa %08lx\n", page, ++ machine_to_phys(page)); + /* + * We must not directly access the pte in the highpte + * case if the page table is located in highmem. + * And lets rather not kmap-atomic the pte, just in case + * it's allocated already. 
+ */ +-#ifdef CONFIG_HIGHPTE +- if ((page >> PAGE_SHIFT) >= highstart_pfn) +- return; +-#endif +- if ((page & 1) && oops_may_print()) { +- page &= PAGE_MASK; +- address &= 0x003ff000; +- page = machine_to_phys(page); +- page = ((unsigned long *) __va(page))[address >> PAGE_SHIFT]; ++ if ((machine_to_phys(page) >> PAGE_SHIFT) < max_low_pfn ++ && (page & _PAGE_PRESENT)) { ++ page = machine_to_phys(page & PAGE_MASK); ++ page = ((unsigned long *) __va(page))[(address >> PAGE_SHIFT) ++ & (PTRS_PER_PTE - 1)]; + printk(KERN_ALERT "*pte = ma %08lx pa %08lx\n", page, + machine_to_phys(page)); + } +@@ -581,6 +577,11 @@ bad_area: + bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & 4) { ++ /* ++ * It's possible to have interrupts off here. ++ */ ++ local_irq_enable(); ++ + /* + * Valid to do another page fault here because this one came + * from user space. +@@ -633,7 +634,7 @@ no_context: + bust_spinlocks(1); + + if (oops_may_print()) { +- #ifdef CONFIG_X86_PAE ++#ifdef CONFIG_X86_PAE + if (error_code & 16) { + pte_t *pte = lookup_address(address); + +@@ -642,7 +643,7 @@ no_context: + "NX-protected page - exploit attempt? " + "(uid: %d)\n", current->uid); + } +- #endif ++#endif + if (address < PAGE_SIZE) + printk(KERN_ALERT "BUG: unable to handle kernel NULL " + "pointer dereference"); +@@ -652,8 +653,8 @@ no_context: + printk(" at virtual address %08lx\n",address); + printk(KERN_ALERT " printing eip:\n"); + printk("%08lx\n", regs->eip); ++ dump_fault_path(address); + } +- dump_fault_path(address); + tsk->thread.cr2 = address; + tsk->thread.trap_no = 14; + tsk->thread.error_code = error_code; +@@ -694,7 +695,6 @@ do_sigbus: + force_sig_info_fault(SIGBUS, BUS_ADRERR, address, tsk); + } + +-#if !HAVE_SHARED_KERNEL_PMD + void vmalloc_sync_all(void) + { + /* +@@ -710,6 +710,9 @@ void vmalloc_sync_all(void) + static unsigned long start = TASK_SIZE; + unsigned long address; + ++ if (SHARED_KERNEL_PMD) ++ return; ++ + BUILD_BUG_ON(TASK_SIZE & ~PGDIR_MASK); + for (address = start; + address >= TASK_SIZE && address < hypervisor_virt_start; +@@ -742,4 +745,3 @@ void vmalloc_sync_all(void) + start = address + (1UL << PMD_SHIFT); + } + } +-#endif +Index: 10.3-2007-11-26/arch/i386/mm/highmem-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/mm/highmem-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/mm/highmem-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -26,7 +26,7 @@ void kunmap(struct page *page) + * However when holding an atomic kmap is is not legal to sleep, so atomic + * kmaps are appropriate for short, tight code paths only. + */ +-static void *__kmap_atomic(struct page *page, enum km_type type, pgprot_t prot) ++void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) + { + enum fixed_addresses idx; + unsigned long vaddr; +@@ -49,15 +49,7 @@ static void *__kmap_atomic(struct page * + + void *kmap_atomic(struct page *page, enum km_type type) + { +- return __kmap_atomic(page, type, kmap_prot); +-} +- +-/* Same as kmap_atomic but with PAGE_KERNEL_RO page protection. */ +-void *kmap_atomic_pte(struct page *page, enum km_type type) +-{ +- return __kmap_atomic(page, type, +- test_bit(PG_pinned, &page->flags) +- ? 
PAGE_KERNEL_RO : kmap_prot); ++ return kmap_atomic_prot(page, type, kmap_prot); + } + + void kunmap_atomic(void *kvaddr, enum km_type type) +@@ -80,6 +72,7 @@ void kunmap_atomic(void *kvaddr, enum km + #endif + } + ++ arch_flush_lazy_mmu_mode(); + pagefault_enable(); + } + +@@ -117,6 +110,5 @@ struct page *kmap_atomic_to_page(void *p + EXPORT_SYMBOL(kmap); + EXPORT_SYMBOL(kunmap); + EXPORT_SYMBOL(kmap_atomic); +-EXPORT_SYMBOL(kmap_atomic_pte); + EXPORT_SYMBOL(kunmap_atomic); + EXPORT_SYMBOL(kmap_atomic_to_page); +Index: 10.3-2007-11-26/arch/i386/mm/init-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/mm/init-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/mm/init-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -22,6 +22,7 @@ + #include <linux/init.h> + #include <linux/highmem.h> + #include <linux/pagemap.h> ++#include <linux/pfn.h> + #include <linux/poison.h> + #include <linux/bootmem.h> + #include <linux/slab.h> +@@ -67,17 +68,19 @@ static pmd_t * __init one_md_table_init( + pmd_t *pmd_table; + + #ifdef CONFIG_X86_PAE +- pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); +- paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); +- make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); +- set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); +- pud = pud_offset(pgd, 0); +- if (pmd_table != pmd_offset(pud, 0)) +- BUG(); +-#else ++ if (!(pgd_val(*pgd) & _PAGE_PRESENT)) { ++ pmd_table = (pmd_t *) alloc_bootmem_low_pages(PAGE_SIZE); ++ ++ paravirt_alloc_pd(__pa(pmd_table) >> PAGE_SHIFT); ++ make_lowmem_page_readonly(pmd_table, XENFEAT_writable_page_tables); ++ set_pgd(pgd, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); ++ pud = pud_offset(pgd, 0); ++ if (pmd_table != pmd_offset(pud, 0)) ++ BUG(); ++ } ++#endif + pud = pud_offset(pgd, 0); + pmd_table = pmd_offset(pud, 0); +-#endif + + return pmd_table; + } +@@ -88,16 +91,18 @@ static pmd_t * __init one_md_table_init( + */ + static pte_t * __init one_page_table_init(pmd_t *pmd) + { ++#if CONFIG_XEN_COMPAT <= 0x030002 + if (pmd_none(*pmd)) { ++#else ++ if (!(pmd_val(*pmd) & _PAGE_PRESENT)) { ++#endif + pte_t *page_table = (pte_t *) alloc_bootmem_low_pages(PAGE_SIZE); ++ + paravirt_alloc_pt(__pa(page_table) >> PAGE_SHIFT); + make_lowmem_page_readonly(page_table, + XENFEAT_writable_page_tables); + set_pmd(pmd, __pmd(__pa(page_table) | _PAGE_TABLE)); +- if (page_table != pte_offset_kernel(pmd, 0)) +- BUG(); +- +- return page_table; ++ BUG_ON(page_table != pte_offset_kernel(pmd, 0)); + } + + return pte_offset_kernel(pmd, 0); +@@ -117,7 +122,6 @@ static pte_t * __init one_page_table_ini + static void __init page_table_range_init (unsigned long start, unsigned long end, pgd_t *pgd_base) + { + pgd_t *pgd; +- pud_t *pud; + pmd_t *pmd; + int pgd_idx, pmd_idx; + unsigned long vaddr; +@@ -128,12 +132,10 @@ static void __init page_table_range_init + pgd = pgd_base + pgd_idx; + + for ( ; (pgd_idx < PTRS_PER_PGD) && (vaddr != end); pgd++, pgd_idx++) { +- if (pgd_none(*pgd)) +- one_md_table_init(pgd); +- pud = pud_offset(pgd, vaddr); +- pmd = pmd_offset(pud, vaddr); ++ pmd = one_md_table_init(pgd); ++ pmd = pmd + pmd_index(vaddr); + for (; (pmd_idx < PTRS_PER_PMD) && (vaddr != end); pmd++, pmd_idx++) { +- if (vaddr < hypervisor_virt_start && pmd_none(*pmd)) ++ if (vaddr < hypervisor_virt_start) + one_page_table_init(pmd); + + vaddr += PMD_SIZE; +@@ -196,24 +198,25 @@ static void __init kernel_physical_mappi + /* Map with big pages if possible, otherwise create normal 
page tables. */ + if (cpu_has_pse) { + unsigned int address2 = (pfn + PTRS_PER_PTE - 1) * PAGE_SIZE + PAGE_OFFSET + PAGE_SIZE-1; +- + if (is_kernel_text(address) || is_kernel_text(address2)) + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE_EXEC)); + else + set_pmd(pmd, pfn_pmd(pfn, PAGE_KERNEL_LARGE)); ++ + pfn += PTRS_PER_PTE; + } else { + pte = one_page_table_init(pmd); + +- pte += pte_ofs; +- for (; pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; pte++, pfn++, pte_ofs++) { +- /* XEN: Only map initial RAM allocation. */ +- if ((pfn >= max_ram_pfn) || pte_present(*pte)) +- continue; +- if (is_kernel_text(address)) +- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); +- else +- set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); ++ for (pte += pte_ofs; ++ pte_ofs < PTRS_PER_PTE && pfn < max_low_pfn; ++ pte++, pfn++, pte_ofs++, address += PAGE_SIZE) { ++ /* XEN: Only map initial RAM allocation. */ ++ if ((pfn >= max_ram_pfn) || pte_present(*pte)) ++ continue; ++ if (is_kernel_text(address)) ++ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL_EXEC)); ++ else ++ set_pte(pte, pfn_pte(pfn, PAGE_KERNEL)); + } + pte_ofs = 0; + } +@@ -383,18 +386,46 @@ extern void __init remap_numa_kva(void); + + pgd_t *swapper_pg_dir; + ++static void __init xen_pagetable_setup_start(pgd_t *base) ++{ ++ swapper_pg_dir = base; ++ init_mm.pgd = base; ++} ++ ++static void __init xen_pagetable_setup_done(pgd_t *base) ++{ ++} ++ ++/* ++ * Build a proper pagetable for the kernel mappings. Up until this ++ * point, we've been running on some set of pagetables constructed by ++ * the boot process. ++ * ++ * If we're booting on native hardware, this will be a pagetable ++ * constructed in arch/i386/kernel/head.S, and not running in PAE mode ++ * (even if we'll end up running in PAE). The root of the pagetable ++ * will be swapper_pg_dir. ++ * ++ * If we're booting paravirtualized under a hypervisor, then there are ++ * more options: we may already be running PAE, and the pagetable may ++ * or may not be based in swapper_pg_dir. In any case, ++ * paravirt_pagetable_setup_start() will set up swapper_pg_dir ++ * appropriately for the rest of the initialization to work. ++ * ++ * In general, pagetable_init() assumes that the pagetable may already ++ * be partially populated, and so it avoids stomping on any existing ++ * mappings. 
++ */ + static void __init pagetable_init (void) + { +- unsigned long vaddr; ++ unsigned long vaddr, end; + pgd_t *pgd_base = (pgd_t *)xen_start_info->pt_base; + +- swapper_pg_dir = pgd_base; +- init_mm.pgd = pgd_base; ++ xen_pagetable_setup_start(pgd_base); + + /* Enable PSE if available */ +- if (cpu_has_pse) { ++ if (cpu_has_pse) + set_in_cr4(X86_CR4_PSE); +- } + + /* Enable PGE if available */ + if (cpu_has_pge) { +@@ -411,9 +442,12 @@ static void __init pagetable_init (void) + * created - mappings will be set by set_fixmap(): + */ + vaddr = __fix_to_virt(__end_of_fixed_addresses - 1) & PMD_MASK; +- page_table_range_init(vaddr, hypervisor_virt_start, pgd_base); ++ end = (FIXADDR_TOP + PMD_SIZE - 1) & PMD_MASK; ++ page_table_range_init(vaddr, end, pgd_base); + + permanent_kmaps_init(pgd_base); ++ ++ xen_pagetable_setup_done(pgd_base); + } + + #if defined(CONFIG_SOFTWARE_SUSPEND) || defined(CONFIG_ACPI_SLEEP) +@@ -764,34 +798,29 @@ int remove_memory(u64 start, u64 size) + EXPORT_SYMBOL_GPL(remove_memory); + #endif + +-struct kmem_cache *pgd_cache; + struct kmem_cache *pmd_cache; + + void __init pgtable_cache_init(void) + { ++ size_t pgd_size = PTRS_PER_PGD*sizeof(pgd_t); ++ + if (PTRS_PER_PMD > 1) { + pmd_cache = kmem_cache_create("pmd", + PTRS_PER_PMD*sizeof(pmd_t), + PTRS_PER_PMD*sizeof(pmd_t), +- 0, ++ SLAB_PANIC, + pmd_ctor, + NULL); +- if (!pmd_cache) +- panic("pgtable_cache_init(): cannot create pmd cache"); ++ if (!SHARED_KERNEL_PMD) { ++ /* If we're in PAE mode and have a non-shared ++ kernel pmd, then the pgd size must be a ++ page size. This is because the pgd_list ++ links through the page structure, so there ++ can only be one pgd per page for this to ++ work. */ ++ pgd_size = PAGE_SIZE; ++ } + } +- pgd_cache = kmem_cache_create("pgd", +-#ifndef CONFIG_XEN +- PTRS_PER_PGD*sizeof(pgd_t), +- PTRS_PER_PGD*sizeof(pgd_t), +-#else +- PAGE_SIZE, +- PAGE_SIZE, +-#endif +- 0, +- pgd_ctor, +- PTRS_PER_PMD == 1 ? pgd_dtor : NULL); +- if (!pgd_cache) +- panic("pgtable_cache_init(): Cannot create pgd cache"); + } + + /* +@@ -825,13 +854,26 @@ static int noinline do_test_wp_bit(void) + + void mark_rodata_ro(void) + { +- unsigned long addr = (unsigned long)__start_rodata; +- +- for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) +- change_page_attr(virt_to_page(addr), 1, PAGE_KERNEL_RO); ++ unsigned long start = PFN_ALIGN(_text); ++ unsigned long size = PFN_ALIGN(_etext) - start; + +- printk("Write protecting the kernel read-only data: %uk\n", +- (__end_rodata - __start_rodata) >> 10); ++#ifndef CONFIG_KPROBES ++#ifdef CONFIG_HOTPLUG_CPU ++ /* It must still be possible to apply SMP alternatives. */ ++ if (num_possible_cpus() <= 1) ++#endif ++ { ++ change_page_attr(virt_to_page(start), ++ size >> PAGE_SHIFT, PAGE_KERNEL_RX); ++ printk("Write protecting the kernel text: %luk\n", size >> 10); ++ } ++#endif ++ start += size; ++ size = (unsigned long)__end_rodata - start; ++ change_page_attr(virt_to_page(start), ++ size >> PAGE_SHIFT, PAGE_KERNEL_RO); ++ printk("Write protecting the kernel read-only data: %luk\n", ++ size >> 10); + + /* + * change_page_attr() requires a global_flush_tlb() call after it. 
+@@ -854,7 +896,7 @@ void free_init_pages(char *what, unsigne + free_page(addr); + totalram_pages++; + } +- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); ++ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + } + + void free_initmem(void) +Index: 10.3-2007-11-26/arch/i386/mm/ioremap-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/mm/ioremap-xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/mm/ioremap-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,6 +13,7 @@ + #include <linux/slab.h> + #include <linux/module.h> + #include <linux/io.h> ++#include <linux/sched.h> + #include <asm/fixmap.h> + #include <asm/cacheflush.h> + #include <asm/tlbflush.h> +Index: 10.3-2007-11-26/arch/i386/mm/pgtable-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/i386/mm/pgtable-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/i386/mm/pgtable-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,6 +13,7 @@ + #include <linux/pagemap.h> + #include <linux/spinlock.h> + #include <linux/module.h> ++#include <linux/quicklist.h> + + #include <asm/system.h> + #include <asm/pgtable.h> +@@ -292,8 +293,6 @@ void pmd_ctor(void *pmd, struct kmem_cac + * against pageattr.c; it is the unique case in which a valid change + * of kernel pagetables can't be lazily synchronized by vmalloc faults. + * vmalloc faults work because attached pagetables are never freed. +- * The locking scheme was chosen on the basis of manfred's +- * recommendations and having no core impact whatsoever. + * -- wli + */ + DEFINE_SPINLOCK(pgd_lock); +@@ -319,37 +318,60 @@ static inline void pgd_list_del(pgd_t *p + set_page_private(next, (unsigned long)pprev); + } + +-void pgd_ctor(void *pgd, struct kmem_cache *cache, unsigned long unused) ++ ++ ++#if (PTRS_PER_PMD == 1) ++/* Non-PAE pgd constructor */ ++void pgd_ctor(void *pgd) + { + unsigned long flags; + +- if (PTRS_PER_PMD > 1) { +- if (HAVE_SHARED_KERNEL_PMD) +- clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, +- swapper_pg_dir + USER_PTRS_PER_PGD, +- KERNEL_PGD_PTRS); +- } else { +- spin_lock_irqsave(&pgd_lock, flags); ++ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); ++ ++ spin_lock_irqsave(&pgd_lock, flags); ++ ++ /* must happen under lock */ ++ clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, ++ swapper_pg_dir + USER_PTRS_PER_PGD, ++ KERNEL_PGD_PTRS); ++ ++ paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, ++ __pa(swapper_pg_dir) >> PAGE_SHIFT, ++ USER_PTRS_PER_PGD, ++ KERNEL_PGD_PTRS); ++ pgd_list_add(pgd); ++ spin_unlock_irqrestore(&pgd_lock, flags); ++} ++#else /* PTRS_PER_PMD > 1 */ ++/* PAE pgd constructor */ ++void pgd_ctor(void *pgd) ++{ ++ /* PAE, kernel PMD may be shared */ ++ ++ if (SHARED_KERNEL_PMD) { + clone_pgd_range((pgd_t *)pgd + USER_PTRS_PER_PGD, + swapper_pg_dir + USER_PTRS_PER_PGD, + KERNEL_PGD_PTRS); +- memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); +- +- /* must happen under lock */ +- paravirt_alloc_pd_clone(__pa(pgd) >> PAGE_SHIFT, +- __pa(swapper_pg_dir) >> PAGE_SHIFT, +- USER_PTRS_PER_PGD, PTRS_PER_PGD - USER_PTRS_PER_PGD); ++#ifndef CONFIG_XEN ++ } else { ++ unsigned long flags; + ++ memset(pgd, 0, USER_PTRS_PER_PGD*sizeof(pgd_t)); ++ spin_lock_irqsave(&pgd_lock, flags); + pgd_list_add(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); ++#endif + } + } ++#endif /* PTRS_PER_PMD */ + +-/* never called when PTRS_PER_PMD > 1 */ +-void pgd_dtor(void *pgd, struct 
kmem_cache *cache, unsigned long unused) ++void pgd_dtor(void *pgd) + { + unsigned long flags; /* can be called from interrupt context */ + ++ if (SHARED_KERNEL_PMD) ++ return; ++ + paravirt_release_pd(__pa(pgd) >> PAGE_SHIFT); + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); +@@ -358,11 +380,46 @@ void pgd_dtor(void *pgd, struct kmem_cac + pgd_test_and_unpin(pgd); + } + ++#define UNSHARED_PTRS_PER_PGD \ ++ (SHARED_KERNEL_PMD ? USER_PTRS_PER_PGD : PTRS_PER_PGD) ++ ++/* If we allocate a pmd for part of the kernel address space, then ++ make sure its initialized with the appropriate kernel mappings. ++ Otherwise use a cached zeroed pmd. */ ++static pmd_t *pmd_cache_alloc(int idx) ++{ ++ pmd_t *pmd; ++ ++ if (idx >= USER_PTRS_PER_PGD) { ++ pmd = (pmd_t *)__get_free_page(GFP_KERNEL); ++ ++#ifndef CONFIG_XEN ++ if (pmd) ++ memcpy(pmd, ++ (void *)pgd_page_vaddr(swapper_pg_dir[idx]), ++ sizeof(pmd_t) * PTRS_PER_PMD); ++#endif ++ } else ++ pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); ++ ++ return pmd; ++} ++ ++static void pmd_cache_free(pmd_t *pmd, int idx) ++{ ++ if (idx >= USER_PTRS_PER_PGD) { ++ make_lowmem_page_writable(pmd, XENFEAT_writable_page_tables); ++ memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); ++ free_page((unsigned long)pmd); ++ } else ++ kmem_cache_free(pmd_cache, pmd); ++} ++ + pgd_t *pgd_alloc(struct mm_struct *mm) + { + int i; +- pgd_t *pgd = kmem_cache_alloc(pgd_cache, GFP_KERNEL); +- pmd_t **pmd; ++ pgd_t *pgd = quicklist_alloc(0, GFP_KERNEL, pgd_ctor); ++ pmd_t **pmds = NULL; + unsigned long flags; + + pgd_test_and_unpin(pgd); +@@ -370,37 +427,40 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + if (PTRS_PER_PMD == 1 || !pgd) + return pgd; + +- if (HAVE_SHARED_KERNEL_PMD) { +- for (i = 0; i < USER_PTRS_PER_PGD; ++i) { +- pmd_t *pmd = kmem_cache_alloc(pmd_cache, GFP_KERNEL); +- if (!pmd) +- goto out_oom; +- paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); +- set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); ++#ifdef CONFIG_XEN ++ if (!SHARED_KERNEL_PMD) { ++ /* ++ * We can race save/restore (if we sleep during a GFP_KERNEL memory ++ * allocation). We therefore store virtual addresses of pmds as they ++ * do not change across save/restore, and poke the machine addresses ++ * into the pgdir under the pgd_lock. ++ */ ++ pmds = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); ++ if (!pmds) { ++ quicklist_free(0, pgd_dtor, pgd); ++ return NULL; + } +- return pgd; +- } +- +- /* +- * We can race save/restore (if we sleep during a GFP_KERNEL memory +- * allocation). We therefore store virtual addresses of pmds as they +- * do not change across save/restore, and poke the machine addresses +- * into the pgdir under the pgd_lock. +- */ +- pmd = kmalloc(PTRS_PER_PGD * sizeof(pmd_t *), GFP_KERNEL); +- if (!pmd) { +- kmem_cache_free(pgd_cache, pgd); +- return NULL; + } ++#endif + + /* Allocate pmds, remember virtual addresses. */ +- for (i = 0; i < PTRS_PER_PGD; ++i) { +- pmd[i] = kmem_cache_alloc(pmd_cache, GFP_KERNEL); +- if (!pmd[i]) ++ for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { ++ pmd_t *pmd = pmd_cache_alloc(i); ++ ++ if (!pmd) + goto out_oom; ++ + paravirt_alloc_pd(__pa(pmd) >> PAGE_SHIFT); ++ if (pmds) ++ pmds[i] = pmd; ++ else ++ set_pgd(&pgd[i], __pgd(1 + __pa(pmd))); + } + ++#ifdef CONFIG_XEN ++ if (SHARED_KERNEL_PMD) ++ return pgd; ++ + spin_lock_irqsave(&pgd_lock, flags); + + /* Protect against save/restore: move below 4GB under pgd_lock. 
*/ +@@ -419,40 +479,41 @@ pgd_t *pgd_alloc(struct mm_struct *mm) + pgd_t *kpgd = pgd_offset_k(v); + pud_t *kpud = pud_offset(kpgd, v); + pmd_t *kpmd = pmd_offset(kpud, v); +- memcpy(pmd[i], kpmd, PAGE_SIZE); ++ memcpy(pmds[i], kpmd, PAGE_SIZE); + make_lowmem_page_readonly( +- pmd[i], XENFEAT_writable_page_tables); ++ pmds[i], XENFEAT_writable_page_tables); + } + + /* It is safe to poke machine addresses of pmds under the pmd_lock. */ + for (i = 0; i < PTRS_PER_PGD; i++) +- set_pgd(&pgd[i], __pgd(1 + __pa(pmd[i]))); ++ set_pgd(&pgd[i], __pgd(1 + __pa(pmds[i]))); + + /* Ensure this pgd gets picked up and pinned on save/restore. */ + pgd_list_add(pgd); + + spin_unlock_irqrestore(&pgd_lock, flags); + +- kfree(pmd); ++ kfree(pmds); ++#endif + + return pgd; + + out_oom: +- if (HAVE_SHARED_KERNEL_PMD) { ++ if (!pmds) { + for (i--; i >= 0; i--) { + pgd_t pgdent = pgd[i]; + void* pmd = (void *)__va(pgd_val(pgdent)-1); + paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); +- kmem_cache_free(pmd_cache, pmd); ++ pmd_cache_free(pmd, i); + } + } else { + for (i--; i >= 0; i--) { +- paravirt_release_pd(__pa(pmd[i]) >> PAGE_SHIFT); +- kmem_cache_free(pmd_cache, pmd[i]); ++ paravirt_release_pd(__pa(pmds[i]) >> PAGE_SHIFT); ++ pmd_cache_free(pmds[i], i); + } +- kfree(pmd); ++ kfree(pmds); + } +- kmem_cache_free(pgd_cache, pgd); ++ quicklist_free(0, pgd_dtor, pgd); + return NULL; + } + +@@ -472,35 +533,31 @@ void pgd_free(pgd_t *pgd) + + /* in the PAE case user pgd entries are overwritten before usage */ + if (PTRS_PER_PMD > 1) { +- for (i = 0; i < USER_PTRS_PER_PGD; ++i) { +- pgd_t pgdent = pgd[i]; +- void* pmd = (void *)__va(pgd_val(pgdent)-1); +- paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); +- kmem_cache_free(pmd_cache, pmd); +- } +- +- if (!HAVE_SHARED_KERNEL_PMD) { ++ if (!SHARED_KERNEL_PMD) { + unsigned long flags; + spin_lock_irqsave(&pgd_lock, flags); + pgd_list_del(pgd); + spin_unlock_irqrestore(&pgd_lock, flags); ++ } + +- for (i = USER_PTRS_PER_PGD; i < PTRS_PER_PGD; i++) { +- pmd_t *pmd = (void *)__va(pgd_val(pgd[i])-1); +- make_lowmem_page_writable( +- pmd, XENFEAT_writable_page_tables); +- memset(pmd, 0, PTRS_PER_PMD*sizeof(pmd_t)); +- kmem_cache_free(pmd_cache, pmd); +- } +- +- if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) +- xen_destroy_contiguous_region( +- (unsigned long)pgd, 0); ++ for (i = 0; i < UNSHARED_PTRS_PER_PGD; ++i) { ++ pgd_t pgdent = pgd[i]; ++ void* pmd = (void *)__va(pgd_val(pgdent)-1); ++ paravirt_release_pd(__pa(pmd) >> PAGE_SHIFT); ++ pmd_cache_free(pmd, i); + } ++ ++ if (!xen_feature(XENFEAT_pae_pgdir_above_4gb)) ++ xen_destroy_contiguous_region((unsigned long)pgd, 0); + } + + /* in the non-PAE case, free_pgtables() clears user pgd entries */ +- kmem_cache_free(pgd_cache, pgd); ++ quicklist_free(0, pgd_dtor, pgd); ++} ++ ++void check_pgt_cache(void) ++{ ++ quicklist_trim(0, pgd_dtor, 25, 16); + } + + void make_lowmem_page_readonly(void *va, unsigned int feature) +@@ -719,13 +776,13 @@ void mm_pin_all(void) + spin_unlock_irqrestore(&pgd_lock, flags); + } + +-void _arch_dup_mmap(struct mm_struct *mm) ++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + if (!test_bit(PG_pinned, &virt_to_page(mm->pgd)->flags)) + mm_pin(mm); + } + +-void _arch_exit_mmap(struct mm_struct *mm) ++void arch_exit_mmap(struct mm_struct *mm) + { + struct task_struct *tsk = current; + +Index: 10.3-2007-11-26/arch/x86_64/Kconfig +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/Kconfig 2007-12-06 17:27:30.000000000 +0100 ++++ 
10.3-2007-11-26/arch/x86_64/Kconfig 2007-10-22 13:58:57.000000000 +0200 +@@ -599,7 +599,7 @@ config CRASH_DUMP + + config RELOCATABLE + bool "Build a relocatable kernel(EXPERIMENTAL)" +- depends on EXPERIMENTAL ++ depends on EXPERIMENTAL && !X86_64_XEN + help + Builds a relocatable kernel. This enables loading and running + a kernel binary from a different physical address than it has +@@ -736,7 +736,7 @@ menu "Bus options (PCI etc.)" + + config PCI + bool "PCI support" +- select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC) ++ select ARCH_SUPPORTS_MSI if (X86_LOCAL_APIC && X86_IO_APIC && !X86_64_XEN) + + # x86-64 doesn't support PCI BIOS access from long mode so always go direct. + config PCI_DIRECT +Index: 10.3-2007-11-26/arch/x86_64/ia32/ia32entry-xen.S +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/ia32/ia32entry-xen.S 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/ia32/ia32entry-xen.S 2007-10-22 13:58:57.000000000 +0200 +@@ -508,11 +508,7 @@ ia32_sys_call_table: + .quad sys_symlink + .quad sys_lstat + .quad sys_readlink /* 85 */ +-#ifdef CONFIG_IA32_AOUT + .quad sys_uselib +-#else +- .quad quiet_ni_syscall +-#endif + .quad sys_swapon + .quad sys_reboot + .quad compat_sys_old_readdir +@@ -651,7 +647,7 @@ ia32_sys_call_table: + .quad quiet_ni_syscall /* tux */ + .quad quiet_ni_syscall /* security */ + .quad sys_gettid +- .quad sys_readahead /* 225 */ ++ .quad sys32_readahead /* 225 */ + .quad sys_setxattr + .quad sys_lsetxattr + .quad sys_fsetxattr +@@ -676,7 +672,7 @@ ia32_sys_call_table: + .quad compat_sys_io_getevents + .quad compat_sys_io_submit + .quad sys_io_cancel +- .quad sys_fadvise64 /* 250 */ ++ .quad sys32_fadvise64 /* 250 */ + .quad quiet_ni_syscall /* free_huge_pages */ + .quad sys_exit_group + .quad sys32_lookup_dcookie +@@ -740,10 +736,14 @@ ia32_sys_call_table: + .quad compat_sys_set_robust_list + .quad compat_sys_get_robust_list + .quad sys_splice +- .quad sys_sync_file_range +- .quad sys_tee ++ .quad sys32_sync_file_range ++ .quad sys_tee /* 315 */ + .quad compat_sys_vmsplice + .quad compat_sys_move_pages + .quad sys_getcpu + .quad sys_epoll_pwait +-ia32_syscall_end: ++ .quad compat_sys_utimensat /* 320 */ ++ .quad compat_sys_signalfd ++ .quad compat_sys_timerfd ++ .quad sys_eventfd ++ia32_syscall_end: +Index: 10.3-2007-11-26/arch/x86_64/ia32/syscall32-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/ia32/syscall32-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/ia32/syscall32-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,6 +13,7 @@ + #include <asm/proto.h> + #include <asm/tlbflush.h> + #include <asm/ia32_unistd.h> ++#include <asm/vsyscall32.h> + + #ifdef USE_INT80 + extern unsigned char syscall32_int80[], syscall32_int80_end[]; +Index: 10.3-2007-11-26/arch/x86_64/kernel/apic-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/apic-xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/apic-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -19,7 +19,6 @@ + #include <linux/mm.h> + #include <linux/delay.h> + #include <linux/bootmem.h> +-#include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/mc146818rtc.h> + #include <linux/kernel_stat.h> +Index: 10.3-2007-11-26/arch/x86_64/kernel/e820-xen.c +=================================================================== +--- 
10.3-2007-11-26.orig/arch/x86_64/kernel/e820-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/e820-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -17,6 +17,8 @@ + #include <linux/kexec.h> + #include <linux/module.h> + #include <linux/mm.h> ++#include <linux/suspend.h> ++#include <linux/pfn.h> + + #include <asm/pgtable.h> + #include <asm/page.h> +@@ -28,7 +30,7 @@ + + struct e820map e820 __initdata; + #ifdef CONFIG_XEN +-struct e820map machine_e820 __initdata; ++struct e820map machine_e820; + #endif + + /* +@@ -105,17 +107,23 @@ static inline int bad_addr(unsigned long + return 0; + } + +-#ifndef CONFIG_XEN + /* + * This function checks if any part of the range <start,end> is mapped + * with type. + */ +-int __meminit ++int + e820_any_mapped(unsigned long start, unsigned long end, unsigned type) + { + int i; ++#ifndef CONFIG_XEN + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; ++#else ++ if (!is_initial_xendomain()) ++ return 0; ++ for (i = 0; i < machine_e820.nr_map; i++) { ++ const struct e820entry *ei = &machine_e820.map[i]; ++#endif + if (type && ei->type != type) + continue; + if (ei->addr >= end || ei->addr + ei->size <= start) +@@ -124,7 +132,7 @@ e820_any_mapped(unsigned long start, uns + } + return 0; + } +-#endif ++EXPORT_SYMBOL_GPL(e820_any_mapped); + + /* + * This function checks if the entire range <start,end> is mapped with type. +@@ -284,22 +292,6 @@ void __init e820_reserve_resources(struc + } + + #ifndef CONFIG_XEN +-/* Mark pages corresponding to given address range as nosave */ +-static void __init +-e820_mark_nosave_range(unsigned long start, unsigned long end) +-{ +- unsigned long pfn, max_pfn; +- +- if (start >= end) +- return; +- +- printk("Nosave address range: %016lx - %016lx\n", start, end); +- max_pfn = end >> PAGE_SHIFT; +- for (pfn = start >> PAGE_SHIFT; pfn < max_pfn; pfn++) +- if (pfn_valid(pfn)) +- SetPageNosave(pfn_to_page(pfn)); +-} +- + /* + * Find the ranges of physical addresses that do not correspond to + * e820 RAM areas and mark the corresponding pages as nosave for software +@@ -318,13 +310,13 @@ void __init e820_mark_nosave_regions(voi + struct e820entry *ei = &e820.map[i]; + + if (paddr < ei->addr) +- e820_mark_nosave_range(paddr, +- round_up(ei->addr, PAGE_SIZE)); ++ register_nosave_region(PFN_DOWN(paddr), ++ PFN_UP(ei->addr)); + + paddr = round_down(ei->addr + ei->size, PAGE_SIZE); + if (ei->type != E820_RAM) +- e820_mark_nosave_range(round_up(ei->addr, PAGE_SIZE), +- paddr); ++ register_nosave_region(PFN_UP(ei->addr), ++ PFN_DOWN(paddr)); + + if (paddr >= (end_pfn << PAGE_SHIFT)) + break; +Index: 10.3-2007-11-26/arch/x86_64/kernel/early_printk-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/early_printk-xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/early_printk-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -11,11 +11,10 @@ + + #ifdef __i386__ + #include <asm/setup.h> +-#define VGABASE (__ISA_IO_base + 0xb8000) + #else + #include <asm/bootsetup.h> +-#define VGABASE ((void __iomem *)0xffffffff800b8000UL) + #endif ++#define VGABASE (__ISA_IO_base + 0xb8000) + + #ifndef CONFIG_XEN + static int max_ypos = 25, max_xpos = 80; +@@ -93,9 +92,9 @@ static int early_serial_putc(unsigned ch + static void early_serial_write(struct console *con, const char *s, unsigned n) + { + while (*s && n-- > 0) { +- early_serial_putc(*s); + if (*s == '\n') + early_serial_putc('\r'); ++ 
early_serial_putc(*s); + s++; + } + } +@@ -205,7 +204,7 @@ static noinline long simnow(long cmd, lo + return ret; + } + +-void __init simnow_init(char *str) ++static void __init simnow_init(char *str) + { + char *fn = "klog"; + if (*str == '=') +@@ -277,22 +276,12 @@ static int __init setup_early_printk(cha + early_console = &simnow_console; + keep_early = 1; + } ++ ++ if (keep_early) ++ early_console->flags &= ~CON_BOOT; ++ else ++ early_console->flags |= CON_BOOT; + register_console(early_console); + return 0; + } +- + early_param("earlyprintk", setup_early_printk); +- +-void __init disable_early_printk(void) +-{ +- if (!early_console_initialized || !early_console) +- return; +- if (!keep_early) { +- printk("disabling early console\n"); +- unregister_console(early_console); +- early_console_initialized = 0; +- } else { +- printk("keeping early console\n"); +- } +-} +- +Index: 10.3-2007-11-26/arch/x86_64/kernel/entry-xen.S +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/entry-xen.S 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/entry-xen.S 2007-10-22 13:58:57.000000000 +0200 +@@ -1261,3 +1261,10 @@ ENTRY(call_softirq) + ret + CFI_ENDPROC + ENDPROC(call_softirq) ++ ++KPROBE_ENTRY(ignore_sysret) ++ CFI_STARTPROC ++ mov $-ENOSYS,%eax ++ sysret ++ CFI_ENDPROC ++ENDPROC(ignore_sysret) +Index: 10.3-2007-11-26/arch/x86_64/kernel/genapic-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/genapic-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/genapic-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -11,123 +11,57 @@ + #include <linux/threads.h> + #include <linux/cpumask.h> + #include <linux/string.h> ++#include <linux/module.h> + #include <linux/kernel.h> + #include <linux/ctype.h> + #include <linux/init.h> +-#include <linux/module.h> + + #include <asm/smp.h> + #include <asm/ipi.h> ++#include <asm/genapic.h> + +-#if defined(CONFIG_ACPI) ++#ifdef CONFIG_ACPI + #include <acpi/acpi_bus.h> + #endif + + /* which logical CPU number maps to which CPU (physical APIC ID) */ +-u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID }; ++u8 x86_cpu_to_apicid[NR_CPUS] __read_mostly ++ = { [0 ... NR_CPUS-1] = BAD_APICID }; + EXPORT_SYMBOL(x86_cpu_to_apicid); +-u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +-extern struct genapic apic_cluster; +-extern struct genapic apic_flat; +-extern struct genapic apic_physflat; ++u8 x86_cpu_to_log_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + + #ifndef CONFIG_XEN +-struct genapic *genapic = &apic_flat; +-struct genapic *genapic_force; ++struct genapic __read_mostly *genapic = &apic_flat; + #else + extern struct genapic apic_xen; +-struct genapic *genapic = &apic_xen; ++struct genapic __read_mostly *genapic = &apic_xen; + #endif + + + /* + * Check the APIC IDs in bios_cpu_apicid and choose the APIC mode. + */ +-void __init clustered_apic_check(void) ++void __init setup_apic_routing(void) + { + #ifndef CONFIG_XEN +- long i; +- u8 clusters, max_cluster; +- u8 id; +- u8 cluster_cnt[NUM_APIC_CLUSTERS]; +- int max_apic = 0; +- +- /* genapic selection can be forced because of certain quirks. 
+- */ +- if (genapic_force) { +- genapic = genapic_force; +- goto print; +- } +- +-#if defined(CONFIG_ACPI) ++#ifdef CONFIG_ACPI + /* +- * Some x86_64 machines use physical APIC mode regardless of how many +- * procs/clusters are present (x86_64 ES7000 is an example). ++ * Quirk: some x86_64 machines can only use physical APIC mode ++ * regardless of how many processors are present (x86_64 ES7000 ++ * is an example). + */ +- if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID) +- if (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL) { +- genapic = &apic_cluster; +- goto print; +- } +-#endif +- +- memset(cluster_cnt, 0, sizeof(cluster_cnt)); +- for (i = 0; i < NR_CPUS; i++) { +- id = bios_cpu_apicid[i]; +- if (id == BAD_APICID) +- continue; +- if (id > max_apic) +- max_apic = id; +- cluster_cnt[APIC_CLUSTERID(id)]++; +- } +- +- /* Don't use clustered mode on AMD platforms. */ +- if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { ++ if (acpi_gbl_FADT.header.revision > FADT2_REVISION_ID && ++ (acpi_gbl_FADT.flags & ACPI_FADT_APIC_PHYSICAL)) + genapic = &apic_physflat; +-#ifndef CONFIG_HOTPLUG_CPU +- /* In the CPU hotplug case we cannot use broadcast mode +- because that opens a race when a CPU is removed. +- Stay at physflat mode in this case. +- It is bad to do this unconditionally though. Once +- we have ACPI platform support for CPU hotplug +- we should detect hotplug capablity from ACPI tables and +- only do this when really needed. -AK */ +- if (max_apic <= 8) +- genapic = &apic_flat; +-#endif +- goto print; +- } +- +- clusters = 0; +- max_cluster = 0; +- +- for (i = 0; i < NUM_APIC_CLUSTERS; i++) { +- if (cluster_cnt[i] > 0) { +- ++clusters; +- if (cluster_cnt[i] > max_cluster) +- max_cluster = cluster_cnt[i]; +- } +- } ++ else ++#endif + +- /* +- * If we have clusters <= 1 and CPUs <= 8 in cluster 0, then flat mode, +- * else if max_cluster <= 4 and cluster_cnt[15] == 0, clustered logical +- * else physical mode. +- * (We don't use lowest priority delivery + HW APIC IRQ steering, so +- * can ignore the clustered logical case and go straight to physical.) +- */ +- if (clusters <= 1 && max_cluster <= 8 && cluster_cnt[0] == max_cluster) { +-#ifdef CONFIG_HOTPLUG_CPU +- /* Don't use APIC shortcuts in CPU hotplug to avoid races */ +- genapic = &apic_physflat; +-#else ++ if (cpus_weight(cpu_possible_map) <= 8) + genapic = &apic_flat; +-#endif +- } else +- genapic = &apic_cluster; ++ else ++ genapic = &apic_physflat; + +-print: + #else + /* hardcode to xen apic functions */ + genapic = &apic_xen; +@@ -135,7 +69,7 @@ print: + printk(KERN_INFO "Setting APIC routing to %s\n", genapic->name); + } + +-/* Same for both flat and clustered. */ ++/* Same for both flat and physical. 
*/ + + #ifdef CONFIG_XEN + extern void xen_send_IPI_shortcut(unsigned int shortcut, int vector, unsigned int dest); +Index: 10.3-2007-11-26/arch/x86_64/kernel/genapic_xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/genapic_xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/genapic_xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -21,9 +21,8 @@ + #include <asm/ipi.h> + #else + #include <asm/apic.h> +-#include <asm/apicdef.h> +-#include <asm/genapic.h> + #endif ++#include <asm/genapic.h> + #include <xen/evtchn.h> + + DECLARE_PER_CPU(int, ipi_to_irq[NR_IPIS]); +Index: 10.3-2007-11-26/arch/x86_64/kernel/head-xen.S +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/head-xen.S 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/head-xen.S 2007-10-22 13:58:57.000000000 +0200 +@@ -5,6 +5,7 @@ + * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> + * Copyright (C) 2000 Karsten Keil <kkeil@suse.de> + * Copyright (C) 2001,2002 Andi Kleen <ak@suse.de> ++ * Copyright (C) 2005 Eric Biederman <ebiederm@xmission.com> + * Jun Nakajima <jun.nakajima@intel.com> + * Modified for Xen + */ +@@ -37,18 +38,14 @@ ENTRY(_start) + pushq $0 # fake return address + jmp x86_64_start_kernel + +-ENTRY(stext) +-ENTRY(_stext) ++.balign PAGE_SIZE + +- $page = 0 + #define NEXT_PAGE(name) \ +- $page = $page + 1; \ +- .org $page * 0x1000; \ +- phys_##name = $page * 0x1000 + __PHYSICAL_START; \ ++ .balign PAGE_SIZE; \ ++ phys_##name = . - .bootstrap.text; \ + ENTRY(name) + + NEXT_PAGE(init_level4_pgt) +- /* This gets initialized in x86_64_start_kernel */ + .fill 512,8,0 + + /* +@@ -125,13 +122,13 @@ gdt: + + ENTRY(cpu_gdt_table) + .quad 0x0000000000000000 /* NULL descriptor */ ++ .quad 0x00cf9b000000ffff /* __KERNEL32_CS */ ++ .quad 0x00af9b000000ffff /* __KERNEL_CS */ ++ .quad 0x00cf93000000ffff /* __KERNEL_DS */ ++ .quad 0x00cffb000000ffff /* __USER32_CS */ ++ .quad 0x00cff3000000ffff /* __USER_DS, __USER32_DS */ ++ .quad 0x00affb000000ffff /* __USER_CS */ + .quad 0x0 /* unused */ +- .quad 0x00af9a000000ffff /* __KERNEL_CS */ +- .quad 0x00cf92000000ffff /* __KERNEL_DS */ +- .quad 0x00cffa000000ffff /* __USER32_CS */ +- .quad 0x00cff2000000ffff /* __USER_DS, __USER32_DS */ +- .quad 0x00affa000000ffff /* __USER_CS */ +- .quad 0x00cf9a000000ffff /* __KERNEL32_CS */ + .quad 0,0 /* TSS */ + .quad 0,0 /* LDT */ + .quad 0,0,0 /* three TLS descriptors */ +@@ -154,14 +151,11 @@ ENTRY(empty_zero_page) + * __xen_guest information + */ + .macro utoh value +- .if (\value) < 0 || (\value) >= 0x10 +- utoh (((\value)>>4)&0x0fffffffffffffff) +- .endif +- .if ((\value) & 0xf) < 10 +- .byte '0' + ((\value) & 0xf) +- .else +- .byte 'A' + ((\value) & 0xf) - 10 +- .endif ++ i = 64 ++ .rept 16 ++ i = i - 4 ++ .byte '0' + ((((\value) >> i) & 0xf) > 9) * ('0' - 'A' + 10) + (((\value) >> i) & 0xf) ++ .endr + .endm + + .section __xen_guest +Index: 10.3-2007-11-26/arch/x86_64/kernel/head64-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/head64-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/head64-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -22,13 +22,21 @@ + #include <asm/setup.h> + #include <asm/desc.h> + #include <asm/pgtable.h> ++#include <asm/tlbflush.h> + #include <asm/sections.h> + + unsigned long start_pfn; + ++#ifndef CONFIG_XEN ++static void __init 
zap_identity_mappings(void) ++{ ++ pgd_t *pgd = pgd_offset_k(0UL); ++ pgd_clear(pgd); ++ __flush_tlb(); ++} ++ + /* Don't add a printk in there. printk relies on the PDA which is not initialized + yet. */ +-#if 0 + static void __init clear_bss(void) + { + memset(__bss_start, 0, +@@ -37,7 +45,7 @@ static void __init clear_bss(void) + #endif + + #define NEW_CL_POINTER 0x228 /* Relative to real mode data */ +-#define OLD_CL_MAGIC_ADDR 0x90020 ++#define OLD_CL_MAGIC_ADDR 0x20 + #define OLD_CL_MAGIC 0xA33F + #define OLD_CL_BASE_ADDR 0x90000 + #define OLD_CL_OFFSET 0x90022 +@@ -45,18 +53,18 @@ static void __init clear_bss(void) + static void __init copy_bootdata(char *real_mode_data) + { + #ifndef CONFIG_XEN +- int new_data; ++ unsigned long new_data; + char * command_line; + + memcpy(x86_boot_params, real_mode_data, BOOT_PARAM_SIZE); +- new_data = *(int *) (x86_boot_params + NEW_CL_POINTER); ++ new_data = *(u32 *) (x86_boot_params + NEW_CL_POINTER); + if (!new_data) { +- if (OLD_CL_MAGIC != * (u16 *) OLD_CL_MAGIC_ADDR) { ++ if (OLD_CL_MAGIC != *(u16 *)(real_mode_data + OLD_CL_MAGIC_ADDR)) { + return; + } +- new_data = OLD_CL_BASE_ADDR + * (u16 *) OLD_CL_OFFSET; ++ new_data = __pa(real_mode_data) + *(u16 *)(real_mode_data + OLD_CL_OFFSET); + } +- command_line = (char *) ((u64)(new_data)); ++ command_line = __va(new_data); + memcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); + #else + int max_cmdline; +@@ -98,10 +106,13 @@ void __init x86_64_start_kernel(char * r + while ((1UL << machine_to_phys_order) < machine_to_phys_nr_ents ) + machine_to_phys_order++; + +-#if 0 ++#ifndef CONFIG_XEN + /* clear bss before set_intr_gate with early_idt_handler */ + clear_bss(); + ++ /* Make NULL pointers segfault */ ++ zap_identity_mappings(); ++ + for (i = 0; i < IDT_ENTRIES; i++) + set_intr_gate(i, early_idt_handler); + asm volatile("lidt %0" :: "m" (idt_descr)); +@@ -113,7 +124,7 @@ void __init x86_64_start_kernel(char * r + cpu_pda(i) = &boot_cpu_pda[i]; + + pda_init(0); +- copy_bootdata(real_mode_data); ++ copy_bootdata(__va(real_mode_data)); + #ifdef CONFIG_SMP + cpu_set(0, cpu_online_map); + #endif +Index: 10.3-2007-11-26/arch/x86_64/kernel/io_apic-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/io_apic-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/io_apic-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -25,7 +25,6 @@ + #include <linux/init.h> + #include <linux/delay.h> + #include <linux/sched.h> +-#include <linux/smp_lock.h> + #include <linux/pci.h> + #include <linux/mc146818rtc.h> + #include <linux/acpi.h> +@@ -897,10 +896,6 @@ static void __init setup_ExtINT_IRQ0_pin + enable_8259A_irq(0); + } + +-void __init UNEXPECTED_IO_APIC(void) +-{ +-} +- + void __apicdebuginit print_IO_APIC(void) + { + int apic, i; +@@ -936,40 +931,16 @@ void __apicdebuginit print_IO_APIC(void) + printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].mpc_apicid); + printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); + printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); +- if (reg_00.bits.__reserved_1 || reg_00.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG ".... register #01: %08X\n", *(int *)®_01); + printk(KERN_DEBUG "....... 
: max redirection entries: %04X\n", reg_01.bits.entries); +- if ( (reg_01.bits.entries != 0x0f) && /* older (Neptune) boards */ +- (reg_01.bits.entries != 0x17) && /* typical ISA+PCI boards */ +- (reg_01.bits.entries != 0x1b) && /* Compaq Proliant boards */ +- (reg_01.bits.entries != 0x1f) && /* dual Xeon boards */ +- (reg_01.bits.entries != 0x22) && /* bigger Xeon boards */ +- (reg_01.bits.entries != 0x2E) && +- (reg_01.bits.entries != 0x3F) && +- (reg_01.bits.entries != 0x03) +- ) +- UNEXPECTED_IO_APIC(); + + printk(KERN_DEBUG "....... : PRQ implemented: %X\n", reg_01.bits.PRQ); + printk(KERN_DEBUG "....... : IO APIC version: %04X\n", reg_01.bits.version); +- if ( (reg_01.bits.version != 0x01) && /* 82489DX IO-APICs */ +- (reg_01.bits.version != 0x02) && /* 82801BA IO-APICs (ICH2) */ +- (reg_01.bits.version != 0x10) && /* oldest IO-APICs */ +- (reg_01.bits.version != 0x11) && /* Pentium/Pro IO-APICs */ +- (reg_01.bits.version != 0x13) && /* Xeon IO-APICs */ +- (reg_01.bits.version != 0x20) /* Intel P64H (82806 AA) */ +- ) +- UNEXPECTED_IO_APIC(); +- if (reg_01.bits.__reserved_1 || reg_01.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + + if (reg_01.bits.version >= 0x10) { + printk(KERN_DEBUG ".... register #02: %08X\n", reg_02.raw); + printk(KERN_DEBUG "....... : arbitration: %02X\n", reg_02.bits.arbitration); +- if (reg_02.bits.__reserved_1 || reg_02.bits.__reserved_2) +- UNEXPECTED_IO_APIC(); + } + + printk(KERN_DEBUG ".... IRQ redirection table:\n"); +@@ -1408,8 +1379,7 @@ static void irq_complete_move(unsigned i + + vector = ~get_irq_regs()->orig_rax; + me = smp_processor_id(); +- if ((vector == cfg->vector) && +- cpu_isset(smp_processor_id(), cfg->domain)) { ++ if ((vector == cfg->vector) && cpu_isset(me, cfg->domain)) { + cpumask_t cleanup_mask; + + cpus_and(cleanup_mask, cfg->old_domain, cpu_online_map); +@@ -1444,7 +1414,7 @@ static void ack_apic_level(unsigned int + + /* + * We must acknowledge the irq before we move it or the acknowledge will +- * not propogate properly. ++ * not propagate properly. + */ + ack_APIC_irq(); + +@@ -1527,6 +1497,7 @@ static void ack_lapic_irq (unsigned int + static void end_lapic_irq (unsigned int i) { /* nothing */ } + + static struct hw_interrupt_type lapic_irq_type __read_mostly = { ++ .name = "local-APIC", + .typename = "local-APIC-edge", + .startup = NULL, /* startup_irq() not used for IRQ0 */ + .shutdown = NULL, /* shutdown_irq() not used for IRQ0 */ +@@ -1996,18 +1967,18 @@ int arch_setup_msi_irq(struct pci_dev *d + if (irq < 0) + return irq; + +- set_irq_msi(irq, desc); + ret = msi_compose_msg(dev, irq, &msg); + if (ret < 0) { + destroy_irq(irq); + return ret; + } + ++ set_irq_msi(irq, desc); + write_msi_msg(irq, &msg); + + set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge"); + +- return irq; ++ return 0; + } + + void arch_teardown_msi_irq(unsigned int irq) +Index: 10.3-2007-11-26/arch/x86_64/kernel/ioport-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/ioport-xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/ioport-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,10 +13,10 @@ + #include <linux/ioport.h> + #include <linux/mm.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/stddef.h> + #include <linux/slab.h> + #include <linux/thread_info.h> ++#include <linux/syscalls.h> + #include <xen/interface/physdev.h> + + /* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. 
*/ +Index: 10.3-2007-11-26/arch/x86_64/kernel/irq-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/irq-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/irq-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -32,7 +32,7 @@ atomic_t irq_err_count; + */ + static inline void stack_overflow_check(struct pt_regs *regs) + { +- u64 curbase = (u64) current->thread_info; ++ u64 curbase = (u64)task_stack_page(current); + static unsigned long warned = -60*HZ; + + if (regs->rsp >= curbase && regs->rsp <= curbase + THREAD_SIZE && +@@ -145,17 +145,43 @@ void fixup_irqs(cpumask_t map) + + for (irq = 0; irq < NR_IRQS; irq++) { + cpumask_t mask; ++ int break_affinity = 0; ++ int set_affinity = 1; ++ + if (irq == 2) + continue; + ++ /* interrupt's are disabled at this point */ ++ spin_lock(&irq_desc[irq].lock); ++ ++ if (!irq_has_action(irq) || ++ cpus_equal(irq_desc[irq].affinity, map)) { ++ spin_unlock(&irq_desc[irq].lock); ++ continue; ++ } ++ + cpus_and(mask, irq_desc[irq].affinity, map); +- if (any_online_cpu(mask) == NR_CPUS) { +- printk("Breaking affinity for irq %i\n", irq); ++ if (cpus_empty(mask)) { ++ break_affinity = 1; + mask = map; + } ++ ++ if (irq_desc[irq].chip->mask) ++ irq_desc[irq].chip->mask(irq); ++ + if (irq_desc[irq].chip->set_affinity) + irq_desc[irq].chip->set_affinity(irq, mask); +- else if (irq_desc[irq].action && !(warned++)) ++ else if (!(warned++)) ++ set_affinity = 0; ++ ++ if (irq_desc[irq].chip->unmask) ++ irq_desc[irq].chip->unmask(irq); ++ ++ spin_unlock(&irq_desc[irq].lock); ++ ++ if (break_affinity && set_affinity) ++ printk("Broke affinity for irq %i\n", irq); ++ else if (!set_affinity) + printk("Cannot set affinity for irq %i\n", irq); + } + +Index: 10.3-2007-11-26/arch/x86_64/kernel/ldt-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/ldt-xen.c 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/arch/x86_64/kernel/ldt-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -13,7 +13,6 @@ + #include <linux/string.h> + #include <linux/mm.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/vmalloc.h> + #include <linux/slab.h> + +Index: 10.3-2007-11-26/arch/x86_64/kernel/mpparse-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/mpparse-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/mpparse-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -17,7 +17,6 @@ + #include <linux/init.h> + #include <linux/delay.h> + #include <linux/bootmem.h> +-#include <linux/smp_lock.h> + #include <linux/kernel_stat.h> + #include <linux/mc146818rtc.h> + #include <linux/acpi.h> +@@ -307,7 +306,7 @@ static int __init smp_read_mpc(struct mp + } + } + } +- clustered_apic_check(); ++ setup_apic_routing(); + if (!num_processors) + printk(KERN_ERR "MPTABLE: no processors registered!\n"); + return num_processors; +Index: 10.3-2007-11-26/arch/x86_64/kernel/pci-swiotlb-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/pci-swiotlb-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/pci-swiotlb-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -16,7 +16,7 @@ EXPORT_SYMBOL(swiotlb); + + void swiotlb_init(void); + +-struct dma_mapping_ops swiotlb_dma_ops = { ++const struct dma_mapping_ops swiotlb_dma_ops = { + #if 0 + 
.mapping_error = swiotlb_dma_mapping_error, + .alloc_coherent = swiotlb_alloc_coherent, +Index: 10.3-2007-11-26/arch/x86_64/kernel/process-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/process-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/process-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -39,6 +39,7 @@ + #include <linux/random.h> + #include <linux/notifier.h> + #include <linux/kprobes.h> ++#include <linux/kdebug.h> + + #include <asm/uaccess.h> + #include <asm/pgtable.h> +@@ -49,7 +50,6 @@ + #include <asm/mmu_context.h> + #include <asm/pda.h> + #include <asm/prctl.h> +-#include <asm/kdebug.h> + #include <xen/interface/platform.h> + #include <xen/interface/physdev.h> + #include <xen/interface/vcpu.h> +@@ -232,16 +232,18 @@ void __cpuinit select_idle_routine(const + + static int __init idle_setup (char *str) + { +- if (!strncmp(str, "poll", 4)) { ++ if (!strcmp(str, "poll")) { + printk("using polling idle threads.\n"); + pm_idle = poll_idle; +- } ++ } else if (!strcmp(str, "mwait")) ++ force_mwait = 1; ++ else ++ return -1; + + boot_option_idle_override = 1; +- return 1; ++ return 0; + } +- +-__setup("idle=", idle_setup); ++early_param("idle", idle_setup); + + /* Prints also some state that isn't saved in the pt_regs */ + void __show_regs(struct pt_regs * regs) +@@ -532,7 +534,7 @@ __switch_to(struct task_struct *prev_p, + * The AMD workaround requires it to be after DS reload, or + * after DS has been cleared, which we do in __prepare_arch_switch. + */ +- if (prev_p->thread_info->status & TS_USEDFPU) { ++ if (task_thread_info(prev_p)->status & TS_USEDFPU) { + __save_init_fpu(prev_p); /* _not_ save_init_fpu() */ + mcl->op = __HYPERVISOR_fpu_taskswitch; + mcl->args[0] = 1; +Index: 10.3-2007-11-26/arch/x86_64/kernel/setup-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/setup-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/setup-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -123,6 +123,8 @@ int bootloader_type; + + unsigned long saved_video_mode; + ++int force_mwait __cpuinitdata; ++ + /* + * Early DMI memory + */ +@@ -256,10 +258,10 @@ static void discover_ebda(void) + * there is a real-mode segmented pointer pointing to the + * 4K EBDA area at 0x40E + */ +- ebda_addr = *(unsigned short *)EBDA_ADDR_POINTER; ++ ebda_addr = *(unsigned short *)__va(EBDA_ADDR_POINTER); + ebda_addr <<= 4; + +- ebda_size = *(unsigned short *)(unsigned long)ebda_addr; ++ ebda_size = *(unsigned short *)__va(ebda_addr); + + /* Round EBDA up to pages */ + if (ebda_size == 0) +@@ -434,15 +436,8 @@ void __init setup_arch(char **cmdline_p) + #endif + + #ifdef CONFIG_SMP +- /* +- * But first pinch a few for the stack/trampoline stuff +- * FIXME: Don't need the extra page at 4K, but need to fix +- * trampoline before removing it. 
(see the GDT stuff) +- */ +- reserve_bootmem_generic(PAGE_SIZE, PAGE_SIZE); +- + /* Reserve SMP trampoline */ +- reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, PAGE_SIZE); ++ reserve_bootmem_generic(SMP_TRAMPOLINE_BASE, 2*PAGE_SIZE); + #endif + #endif + +@@ -563,8 +558,6 @@ void __init setup_arch(char **cmdline_p) + early_quirks(); + #endif + +- zap_low_mappings(0); +- + /* + * set this early, so we dont allocate cpu0 + * if MADT list doesnt list BSP first +@@ -868,6 +861,10 @@ static void __cpuinit init_amd(struct cp + + /* RDTSC can be speculated around */ + clear_bit(X86_FEATURE_SYNC_RDTSC, &c->x86_capability); ++ ++ /* Family 10 doesn't support C states in MWAIT so don't use it */ ++ if (c->x86 == 0x10 && !force_mwait) ++ clear_bit(X86_FEATURE_MWAIT, &c->x86_capability); + } + + static void __cpuinit detect_ht(struct cpuinfo_x86 *c) +@@ -1150,9 +1147,7 @@ void __cpuinit identify_cpu(struct cpuin + #ifdef CONFIG_X86_MCE + mcheck_init(c); + #endif +- if (c == &boot_cpu_data) +- mtrr_bp_init(); +- else ++ if (c != &boot_cpu_data) + mtrr_ap_init(); + #ifdef CONFIG_NUMA + numa_add_cpu(smp_processor_id()); +@@ -1243,9 +1238,8 @@ static int show_cpuinfo(struct seq_file + "stc", + "100mhzsteps", + "hwpstate", +- NULL, /* tsc invariant mapped to constant_tsc */ +- NULL, +- /* nothing */ /* constant_tsc - moved to flags */ ++ "", /* tsc invariant mapped to constant_tsc */ ++ /* nothing */ + }; + + +Index: 10.3-2007-11-26/arch/x86_64/kernel/setup64-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/setup64-xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/setup64-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -113,9 +113,9 @@ void __init setup_per_cpu_areas(void) + if (!NODE_DATA(cpu_to_node(i))) { + printk("cpu with no node %d, num_online_nodes %d\n", + i, num_online_nodes()); +- ptr = alloc_bootmem(size); ++ ptr = alloc_bootmem_pages(size); + } else { +- ptr = alloc_bootmem_node(NODE_DATA(cpu_to_node(i)), size); ++ ptr = alloc_bootmem_pages_node(NODE_DATA(cpu_to_node(i)), size); + } + if (!ptr) + panic("Cannot allocate cpu data for CPU %d\n", i); +@@ -206,6 +206,8 @@ char boot_exception_stacks[(N_EXCEPTION_ + __attribute__((section(".bss.page_aligned"))); + #endif + ++extern asmlinkage void ignore_sysret(void); ++ + /* May not be marked __init: used by software suspend */ + void syscall_init(void) + { +@@ -217,6 +219,7 @@ void syscall_init(void) + */ + wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); + wrmsrl(MSR_LSTAR, system_call); ++ wrmsrl(MSR_CSTAR, ignore_sysret); + + /* Flags to clear on syscall */ + wrmsrl(MSR_SYSCALL_MASK, EF_TF|EF_DF|EF_IE|0x3000); +@@ -260,7 +263,6 @@ void __cpuinit cpu_init (void) + /* CPU 0 is initialised in head64.c */ + if (cpu != 0) { + pda_init(cpu); +- zap_low_mappings(cpu); + } + #ifndef CONFIG_X86_NO_TSS + else +Index: 10.3-2007-11-26/arch/x86_64/kernel/smp-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/smp-xen.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/smp-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -14,7 +14,6 @@ + #include <linux/mm.h> + #include <linux/delay.h> + #include <linux/spinlock.h> +-#include <linux/smp_lock.h> + #include <linux/smp.h> + #include <linux/kernel_stat.h> + #include <linux/mc146818rtc.h> +@@ -478,48 +477,40 @@ int smp_call_function (void (*func) (voi + } + EXPORT_SYMBOL(smp_call_function); + +-void 
smp_stop_cpu(void) ++static void stop_this_cpu(void *dummy) + { +- unsigned long flags; ++ local_irq_disable(); + /* + * Remove this CPU: + */ + cpu_clear(smp_processor_id(), cpu_online_map); +- local_irq_save(flags); + #ifndef CONFIG_XEN + disable_local_APIC(); + #endif +- local_irq_restore(flags); +-} +- +-static void smp_really_stop_cpu(void *dummy) +-{ +- smp_stop_cpu(); + for (;;) + halt(); + } + + void smp_send_stop(void) + { +- int nolock = 0; ++ int nolock; ++ unsigned long flags; ++ + #ifndef CONFIG_XEN + if (reboot_force) + return; + #endif ++ + /* Don't deadlock on the call lock in panic */ +- if (!spin_trylock(&call_lock)) { +- /* ignore locking because we have panicked anyways */ +- nolock = 1; +- } +- __smp_call_function(smp_really_stop_cpu, NULL, 0, 0); ++ nolock = !spin_trylock(&call_lock); ++ local_irq_save(flags); ++ __smp_call_function(stop_this_cpu, NULL, 0, 0); + if (!nolock) + spin_unlock(&call_lock); +- +- local_irq_disable(); + #ifndef CONFIG_XEN + disable_local_APIC(); + #endif +- local_irq_enable(); ++ local_irq_restore(flags); + } + + /* +Index: 10.3-2007-11-26/arch/x86_64/kernel/traps-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/traps-xen.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/traps-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -32,6 +32,7 @@ + #include <linux/unwind.h> + #include <linux/uaccess.h> + #include <linux/bug.h> ++#include <linux/kdebug.h> + + #include <asm/system.h> + #include <asm/io.h> +@@ -39,7 +40,6 @@ + #include <asm/debugreg.h> + #include <asm/desc.h> + #include <asm/i387.h> +-#include <asm/kdebug.h> + #include <asm/processor.h> + #include <asm/unwind.h> + #include <asm/smp.h> +@@ -71,22 +71,6 @@ asmlinkage void alignment_check(void); + asmlinkage void machine_check(void); + asmlinkage void spurious_interrupt_bug(void); + +-ATOMIC_NOTIFIER_HEAD(die_chain); +-EXPORT_SYMBOL(die_chain); +- +-int register_die_notifier(struct notifier_block *nb) +-{ +- vmalloc_sync_all(); +- return atomic_notifier_chain_register(&die_chain, nb); +-} +-EXPORT_SYMBOL(register_die_notifier); /* used modular by kdb */ +- +-int unregister_die_notifier(struct notifier_block *nb) +-{ +- return atomic_notifier_chain_unregister(&die_chain, nb); +-} +-EXPORT_SYMBOL(unregister_die_notifier); /* used modular by kdb */ +- + static inline void conditional_sti(struct pt_regs *regs) + { + if (regs->eflags & X86_EFLAGS_IF) +@@ -428,8 +412,7 @@ void show_registers(struct pt_regs *regs + const int cpu = smp_processor_id(); + struct task_struct *cur = cpu_pda(cpu)->pcurrent; + +- rsp = regs->rsp; +- ++ rsp = regs->rsp; + printk("CPU %d ", cpu); + __show_regs(regs); + printk("Process %s (pid: %d, threadinfo %p, task %p)\n", +@@ -440,7 +423,6 @@ void show_registers(struct pt_regs *regs + * time of the fault.. + */ + if (in_kernel) { +- + printk("Stack: "); + _show_stack(NULL, regs, (unsigned long*)rsp); + +@@ -485,13 +467,14 @@ static unsigned int die_nest_count; + + unsigned __kprobes long oops_begin(void) + { +- int cpu = smp_processor_id(); ++ int cpu; + unsigned long flags; + + oops_enter(); + + /* racy, but better than risking deadlock. */ + local_irq_save(flags); ++ cpu = smp_processor_id(); + if (!spin_trylock(&die_lock)) { + if (cpu == die_owner) + /* nested oops. 
should stop eventually */; +@@ -585,10 +568,20 @@ static void __kprobes do_trap(int trapnr + { + struct task_struct *tsk = current; + +- tsk->thread.error_code = error_code; +- tsk->thread.trap_no = trapnr; +- + if (user_mode(regs)) { ++ /* ++ * We want error_code and trap_no set for userspace ++ * faults and kernelspace faults which result in ++ * die(), but not kernelspace faults which are fixed ++ * up. die() gives the process no chance to handle ++ * the signal and notice the kernel fault information, ++ * so that won't result in polluting the information ++ * about previously queued, but not yet delivered, ++ * faults. See also do_general_protection below. ++ */ ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = trapnr; ++ + if (exception_trace && unhandled_signal(tsk, signr)) + printk(KERN_INFO + "%s[%d] trap %s rip:%lx rsp:%lx error:%lx\n", +@@ -609,8 +602,11 @@ static void __kprobes do_trap(int trapnr + fixup = search_exception_tables(regs->rip); + if (fixup) + regs->rip = fixup->fixup; +- else ++ else { ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = trapnr; + die(str, regs, error_code); ++ } + return; + } + } +@@ -686,10 +682,10 @@ asmlinkage void __kprobes do_general_pro + + conditional_sti(regs); + +- tsk->thread.error_code = error_code; +- tsk->thread.trap_no = 13; +- + if (user_mode(regs)) { ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = 13; ++ + if (exception_trace && unhandled_signal(tsk, SIGSEGV)) + printk(KERN_INFO + "%s[%d] general protection rip:%lx rsp:%lx error:%lx\n", +@@ -708,6 +704,9 @@ asmlinkage void __kprobes do_general_pro + regs->rip = fixup->fixup; + return; + } ++ ++ tsk->thread.error_code = error_code; ++ tsk->thread.trap_no = 13; + if (notify_die(DIE_GPF, "general protection fault", regs, + error_code, 13, SIGSEGV) == NOTIFY_STOP) + return; +Index: 10.3-2007-11-26/arch/x86_64/kernel/vsyscall-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/kernel/vsyscall-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/kernel/vsyscall-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -45,14 +45,34 @@ + + #define __vsyscall(nr) __attribute__ ((unused,__section__(".vsyscall_" #nr))) + #define __syscall_clobber "r11","rcx","memory" ++#define __pa_vsymbol(x) \ ++ ({unsigned long v; \ ++ extern char __vsyscall_0; \ ++ asm("" : "=r" (v) : "0" (x)); \ ++ ((v - VSYSCALL_FIRST_PAGE) + __pa_symbol(&__vsyscall_0)); }) + ++/* ++ * vsyscall_gtod_data contains data that is : ++ * - readonly from vsyscalls ++ * - writen by timer interrupt or systcl (/proc/sys/kernel/vsyscall64) ++ * Try to keep this structure as small as possible to avoid cache line ping pongs ++ */ + struct vsyscall_gtod_data_t { +- seqlock_t lock; +- int sysctl_enabled; +- struct timeval wall_time_tv; ++ seqlock_t lock; ++ ++ /* open coded 'struct timespec' */ ++ time_t wall_time_sec; ++ u32 wall_time_nsec; ++ ++ int sysctl_enabled; + struct timezone sys_tz; +- cycle_t offset_base; +- struct clocksource clock; ++ struct { /* extract of a clocksource struct */ ++ cycle_t (*vread)(void); ++ cycle_t cycle_last; ++ cycle_t mask; ++ u32 mult; ++ u32 shift; ++ } clock; + }; + int __vgetcpu_mode __section_vgetcpu_mode; + +@@ -68,9 +88,13 @@ void update_vsyscall(struct timespec *wa + + write_seqlock_irqsave(&vsyscall_gtod_data.lock, flags); + /* copy vsyscall data */ +- vsyscall_gtod_data.clock = *clock; +- vsyscall_gtod_data.wall_time_tv.tv_sec = wall_time->tv_sec; +- 
vsyscall_gtod_data.wall_time_tv.tv_usec = wall_time->tv_nsec/1000; ++ vsyscall_gtod_data.clock.vread = clock->vread; ++ vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; ++ vsyscall_gtod_data.clock.mask = clock->mask; ++ vsyscall_gtod_data.clock.mult = clock->mult; ++ vsyscall_gtod_data.clock.shift = clock->shift; ++ vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; ++ vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; + vsyscall_gtod_data.sys_tz = sys_tz; + write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); + } +@@ -105,7 +129,8 @@ static __always_inline long time_syscall + static __always_inline void do_vgettimeofday(struct timeval * tv) + { + cycle_t now, base, mask, cycle_delta; +- unsigned long seq, mult, shift, nsec_delta; ++ unsigned seq; ++ unsigned long mult, shift, nsec; + cycle_t (*vread)(void); + do { + seq = read_seqbegin(&__vsyscall_gtod_data.lock); +@@ -121,21 +146,20 @@ static __always_inline void do_vgettimeo + mult = __vsyscall_gtod_data.clock.mult; + shift = __vsyscall_gtod_data.clock.shift; + +- *tv = __vsyscall_gtod_data.wall_time_tv; +- ++ tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; ++ nsec = __vsyscall_gtod_data.wall_time_nsec; + } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); + + /* calculate interval: */ + cycle_delta = (now - base) & mask; + /* convert to nsecs: */ +- nsec_delta = (cycle_delta * mult) >> shift; ++ nsec += (cycle_delta * mult) >> shift; + +- /* convert to usecs and add to timespec: */ +- tv->tv_usec += nsec_delta / NSEC_PER_USEC; +- while (tv->tv_usec > USEC_PER_SEC) { ++ while (nsec >= NSEC_PER_SEC) { + tv->tv_sec += 1; +- tv->tv_usec -= USEC_PER_SEC; ++ nsec -= NSEC_PER_SEC; + } ++ tv->tv_usec = nsec / NSEC_PER_USEC; + } + + int __vsyscall(0) vgettimeofday(struct timeval * tv, struct timezone * tz) +@@ -151,11 +175,16 @@ int __vsyscall(0) vgettimeofday(struct t + * unlikely */ + time_t __vsyscall(1) vtime(time_t *t) + { ++ struct timeval tv; ++ time_t result; + if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) + return time_syscall(t); +- else if (t) +- *t = __vsyscall_gtod_data.wall_time_tv.tv_sec; +- return __vsyscall_gtod_data.wall_time_tv.tv_sec; ++ ++ vgettimeofday(&tv, 0); ++ result = tv.tv_sec; ++ if (t) ++ *t = result; ++ return result; + } + + /* Fast way to get current CPU and node. +@@ -224,10 +253,10 @@ static int vsyscall_sysctl_change(ctl_ta + return ret; + /* gcc has some trouble with __va(__pa()), so just do it this + way. 
*/ +- map1 = ioremap(__pa_symbol(&vsysc1), 2); ++ map1 = ioremap(__pa_vsymbol(&vsysc1), 2); + if (!map1) + return -ENOMEM; +- map2 = ioremap(__pa_symbol(&vsysc2), 2); ++ map2 = ioremap(__pa_vsymbol(&vsysc2), 2); + if (!map2) { + ret = -ENOMEM; + goto out; +@@ -301,7 +330,7 @@ static int __cpuinit + cpu_vsyscall_notifier(struct notifier_block *n, unsigned long action, void *arg) + { + long cpu = (long)arg; +- if (action == CPU_ONLINE) ++ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) + smp_call_function_single(cpu, cpu_vsyscall_init, NULL, 0, 1); + return NOTIFY_DONE; + } +Index: 10.3-2007-11-26/arch/x86_64/mm/fault-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/mm/fault-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/mm/fault-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -15,22 +15,22 @@ + #include <linux/mman.h> + #include <linux/mm.h> + #include <linux/smp.h> +-#include <linux/smp_lock.h> + #include <linux/interrupt.h> + #include <linux/init.h> + #include <linux/tty.h> + #include <linux/vt_kern.h> /* For unblank_screen() */ + #include <linux/compiler.h> ++#include <linux/vmalloc.h> + #include <linux/module.h> + #include <linux/kprobes.h> + #include <linux/uaccess.h> ++#include <linux/kdebug.h> + + #include <asm/system.h> + #include <asm/pgalloc.h> + #include <asm/smp.h> + #include <asm/tlbflush.h> + #include <asm/proto.h> +-#include <asm/kdebug.h> + #include <asm-generic/sections.h> + + /* Page fault error code bits */ +@@ -537,6 +537,12 @@ bad_area: + bad_area_nosemaphore: + /* User mode accesses just cause a SIGSEGV */ + if (error_code & PF_USER) { ++ ++ /* ++ * It's possible to have interrupts off here. ++ */ ++ local_irq_enable(); ++ + if (is_prefetch(regs, address, error_code)) + return; + +@@ -646,7 +652,7 @@ do_sigbus: + } + + DEFINE_SPINLOCK(pgd_lock); +-struct page *pgd_list; ++LIST_HEAD(pgd_list); + + void vmalloc_sync_all(void) + { +@@ -666,8 +672,7 @@ void vmalloc_sync_all(void) + if (pgd_none(*pgd_ref)) + continue; + spin_lock(&pgd_lock); +- for (page = pgd_list; page; +- page = (struct page *)page->index) { ++ list_for_each_entry(page, &pgd_list, lru) { + pgd_t *pgd; + pgd = (pgd_t *)page_address(page) + pgd_index(address); + if (pgd_none(*pgd)) +Index: 10.3-2007-11-26/arch/x86_64/mm/init-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/mm/init-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/mm/init-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -25,10 +25,12 @@ + #include <linux/bootmem.h> + #include <linux/proc_fs.h> + #include <linux/pci.h> ++#include <linux/pfn.h> + #include <linux/poison.h> + #include <linux/dma-mapping.h> + #include <linux/module.h> + #include <linux/memory_hotplug.h> ++#include <linux/nmi.h> + + #include <asm/processor.h> + #include <asm/system.h> +@@ -51,7 +53,7 @@ + #define Dprintk(x...) 
+ #endif + +-struct dma_mapping_ops* dma_ops; ++const struct dma_mapping_ops* dma_ops; + EXPORT_SYMBOL(dma_ops); + + #if CONFIG_XEN_COMPAT <= 0x030002 +@@ -189,6 +191,13 @@ void show_mem(void) + + for_each_online_pgdat(pgdat) { + for (i = 0; i < pgdat->node_spanned_pages; ++i) { ++ /* this loop can take a while with 256 GB and 4k pages ++ so update the NMI watchdog */ ++ if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) { ++ touch_nmi_watchdog(); ++ } ++ if (!pfn_valid(pgdat->node_start_pfn + i)) ++ continue; + page = pfn_to_page(pgdat->node_start_pfn + i); + total++; + if (PageReserved(page)) +@@ -374,7 +383,7 @@ __set_fixmap_user (enum fixed_addresses + set_pte_phys(address, phys, prot, SET_FIXMAP_USER); + } + +-unsigned long __initdata table_start, table_end; ++unsigned long __meminitdata table_start, table_end; + + static __meminit void *alloc_static_page(unsigned long *phys) + { +@@ -391,7 +400,7 @@ static __meminit void *alloc_static_page + start_pfn++; + memset((void *)va, 0, PAGE_SIZE); + return (void *)va; +-} ++} + + #define PTE_SIZE PAGE_SIZE + +@@ -432,28 +441,46 @@ static inline int make_readonly(unsigned + + #ifndef CONFIG_XEN + /* Must run before zap_low_mappings */ +-__init void *early_ioremap(unsigned long addr, unsigned long size) ++__meminit void *early_ioremap(unsigned long addr, unsigned long size) + { +- unsigned long map = round_down(addr, LARGE_PAGE_SIZE); +- +- /* actually usually some more */ +- if (size >= LARGE_PAGE_SIZE) { +- return NULL; ++ unsigned long vaddr; ++ pmd_t *pmd, *last_pmd; ++ int i, pmds; ++ ++ pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; ++ vaddr = __START_KERNEL_map; ++ pmd = level2_kernel_pgt; ++ last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1; ++ for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) { ++ for (i = 0; i < pmds; i++) { ++ if (pmd_present(pmd[i])) ++ goto next; ++ } ++ vaddr += addr & ~PMD_MASK; ++ addr &= PMD_MASK; ++ for (i = 0; i < pmds; i++, addr += PMD_SIZE) ++ set_pmd(pmd + i,__pmd(addr | _KERNPG_TABLE | _PAGE_PSE)); ++ __flush_tlb(); ++ return (void *)vaddr; ++ next: ++ ; + } +- set_pmd(temp_mappings[0].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); +- map += LARGE_PAGE_SIZE; +- set_pmd(temp_mappings[1].pmd, __pmd(map | _KERNPG_TABLE | _PAGE_PSE)); +- __flush_tlb(); +- return temp_mappings[0].address + (addr & (LARGE_PAGE_SIZE-1)); ++ printk("early_ioremap(0x%lx, %lu) failed\n", addr, size); ++ return NULL; + } + + /* To avoid virtual aliases later */ +-__init void early_iounmap(void *addr, unsigned long size) ++__meminit void early_iounmap(void *addr, unsigned long size) + { +- if ((void *)round_down((unsigned long)addr, LARGE_PAGE_SIZE) != temp_mappings[0].address) +- printk("early_iounmap: bad address %p\n", addr); +- set_pmd(temp_mappings[0].pmd, __pmd(0)); +- set_pmd(temp_mappings[1].pmd, __pmd(0)); ++ unsigned long vaddr; ++ pmd_t *pmd; ++ int i, pmds; ++ ++ vaddr = (unsigned long)addr; ++ pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE; ++ pmd = level2_kernel_pgt + pmd_index(vaddr); ++ for (i = 0; i < pmds; i++) ++ pmd_clear(pmd + i); + __flush_tlb(); + } + #endif +@@ -787,14 +814,6 @@ void __meminit init_memory_mapping(unsig + __flush_tlb_all(); + } + +-void __cpuinit zap_low_mappings(int cpu) +-{ +- /* this is not required for Xen */ +-#if 0 +- swap_low_mappings(); +-#endif +-} +- + #ifndef CONFIG_NUMA + void __init paging_init(void) + { +@@ -986,17 +1005,6 @@ void __init mem_init(void) + reservedpages << (PAGE_SHIFT-10), + datasize >> 10, + initsize >> 10); +- +-#ifndef CONFIG_XEN +-#ifdef CONFIG_SMP 
+- /* +- * Sync boot_level4_pgt mappings with the init_level4_pgt +- * except for the low identity mappings which are already zapped +- * in init_level4_pgt. This sync-up is essential for AP's bringup +- */ +- memcpy(boot_level4_pgt+1, init_level4_pgt+1, (PTRS_PER_PGD-1)*sizeof(pgd_t)); +-#endif +-#endif + } + + void free_init_pages(char *what, unsigned long begin, unsigned long end) +@@ -1006,7 +1014,7 @@ void free_init_pages(char *what, unsigne + if (begin >= end) + return; + +- printk(KERN_INFO "Freeing %s: %ldk freed\n", what, (end - begin) >> 10); ++ printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(virt_to_page(addr)); + init_page_count(virt_to_page(addr)); +@@ -1015,24 +1023,17 @@ void free_init_pages(char *what, unsigne + if (addr >= __START_KERNEL_map) { + /* make_readonly() reports all kernel addresses. */ + __make_page_writable(__va(__pa(addr))); +- if (HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { +- pgd_t *pgd = pgd_offset_k(addr); +- pud_t *pud = pud_offset(pgd, addr); +- pmd_t *pmd = pmd_offset(pud, addr); +- pte_t *pte = pte_offset_kernel(pmd, addr); +- +- xen_l1_entry_update(pte, __pte(0)); /* fallback */ +- } ++ change_page_attr_addr(addr, 1, __pgprot(0)); + } + free_page(addr); + totalram_pages++; + } ++ if (addr > __START_KERNEL_map) ++ global_flush_tlb(); + } + + void free_initmem(void) + { +- memset(__initdata_begin, POISON_FREE_INITDATA, +- __initdata_end - __initdata_begin); + free_init_pages("unused kernel memory", + (unsigned long)(&__init_begin), + (unsigned long)(&__init_end)); +@@ -1042,13 +1043,28 @@ void free_initmem(void) + + void mark_rodata_ro(void) + { +- unsigned long addr = (unsigned long)__start_rodata; ++ unsigned long start = (unsigned long)_stext, end; ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ /* It must still be possible to apply SMP alternatives. */ ++ if (num_possible_cpus() > 1) ++ start = (unsigned long)_etext; ++#endif ++ ++#ifdef CONFIG_KPROBES ++ start = (unsigned long)__start_rodata; ++#endif ++ ++ end = (unsigned long)__end_rodata; ++ start = (start + PAGE_SIZE - 1) & PAGE_MASK; ++ end &= PAGE_MASK; ++ if (end <= start) ++ return; + +- for (; addr < (unsigned long)__end_rodata; addr += PAGE_SIZE) +- change_page_attr_addr(addr, 1, PAGE_KERNEL_RO); ++ change_page_attr_addr(start, (end - start) >> PAGE_SHIFT, PAGE_KERNEL_RO); + +- printk ("Write protecting the kernel read-only data: %luk\n", +- (__end_rodata - __start_rodata) >> 10); ++ printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", ++ (end - start) >> 10); + + /* + * change_page_attr_addr() requires a global_flush_tlb() call after it. 
+@@ -1201,3 +1217,11 @@ int in_gate_area_no_task(unsigned long a + { + return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); + } ++ ++#ifndef CONFIG_XEN ++void *alloc_bootmem_high_node(pg_data_t *pgdat, unsigned long size) ++{ ++ return __alloc_bootmem_core(pgdat->bdata, size, ++ SMP_CACHE_BYTES, (4UL*1024*1024*1024), 0); ++} ++#endif +Index: 10.3-2007-11-26/arch/x86_64/mm/pageattr-xen.c +=================================================================== +--- 10.3-2007-11-26.orig/arch/x86_64/mm/pageattr-xen.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/arch/x86_64/mm/pageattr-xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -146,13 +146,13 @@ void mm_pin_all(void) + preempt_enable(); + } + +-void _arch_dup_mmap(struct mm_struct *mm) ++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) + { + if (!mm->context.pinned) + mm_pin(mm); + } + +-void _arch_exit_mmap(struct mm_struct *mm) ++void arch_exit_mmap(struct mm_struct *mm) + { + struct task_struct *tsk = current; + +@@ -268,10 +268,11 @@ static void flush_kernel_map(void *arg) + struct page *pg; + + /* When clflush is available always use it because it is +- much cheaper than WBINVD */ +- if (!cpu_has_clflush) ++ much cheaper than WBINVD. Disable clflush for now because ++ the high level code is not ready yet */ ++ if (1 || !cpu_has_clflush) + asm volatile("wbinvd" ::: "memory"); +- list_for_each_entry(pg, l, lru) { ++ else list_for_each_entry(pg, l, lru) { + void *adr = page_address(pg); + if (cpu_has_clflush) + cache_flush_page(adr); +@@ -385,16 +386,24 @@ __change_page_attr(unsigned long address + */ + int change_page_attr_addr(unsigned long address, int numpages, pgprot_t prot) + { +- int err = 0; ++ int err = 0, kernel_map = 0; + int i; + ++ if (address >= __START_KERNEL_map ++ && address < __START_KERNEL_map + KERNEL_TEXT_SIZE) { ++ address = (unsigned long)__va(__pa(address)); ++ kernel_map = 1; ++ } ++ + down_write(&init_mm.mmap_sem); + for (i = 0; i < numpages; i++, address += PAGE_SIZE) { + unsigned long pfn = __pa(address) >> PAGE_SHIFT; + +- err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); +- if (err) +- break; ++ if (!kernel_map || pte_present(pfn_pte(0, prot))) { ++ err = __change_page_attr(address, pfn, prot, PAGE_KERNEL); ++ if (err) ++ break; ++ } + /* Handle kernel mapping too which aliases part of the + * lowmem */ + if (__pa(address) < KERNEL_TEXT_SIZE) { +Index: 10.3-2007-11-26/drivers/char/tpm/tpm_xen.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/char/tpm/tpm_xen.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/drivers/char/tpm/tpm_xen.c 2007-10-22 13:58:57.000000000 +0200 +@@ -462,7 +462,7 @@ static int tpmif_connect(struct xenbus_d + tp->backend_id = domid; + + err = bind_listening_port_to_irqhandler( +- domid, tpmif_int, SA_SAMPLE_RANDOM, "tpmif", tp); ++ domid, tpmif_int, IRQF_SAMPLE_RANDOM, "tpmif", tp); + if (err <= 0) { + WPRINTK("bind_listening_port_to_irqhandler failed " + "(err=%d)\n", err); +Index: 10.3-2007-11-26/drivers/xen/blkfront/blkfront.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/blkfront/blkfront.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/blkfront/blkfront.c 2007-10-22 13:58:57.000000000 +0200 +@@ -236,7 +236,7 @@ static int setup_blkring(struct xenbus_d + info->ring_ref = err; + + err = bind_listening_port_to_irqhandler( +- dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); ++ 
dev->otherend_id, blkif_int, IRQF_SAMPLE_RANDOM, "blkif", info); + if (err <= 0) { + xenbus_dev_fatal(dev, err, + "bind_listening_port_to_irqhandler"); +Index: 10.3-2007-11-26/drivers/xen/char/mem.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/char/mem.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/char/mem.c 2007-10-22 13:58:57.000000000 +0200 +@@ -18,7 +18,6 @@ + #include <linux/raw.h> + #include <linux/tty.h> + #include <linux/capability.h> +-#include <linux/smp_lock.h> + #include <linux/ptrace.h> + #include <linux/device.h> + #include <asm/pgalloc.h> +Index: 10.3-2007-11-26/drivers/xen/core/hypervisor_sysfs.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/core/hypervisor_sysfs.c 2007-09-03 09:52:56.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/core/hypervisor_sysfs.c 2007-10-22 13:58:57.000000000 +0200 +@@ -49,7 +49,7 @@ static int __init hypervisor_subsys_init + if (!is_running_on_xen()) + return -ENODEV; + +- hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type; ++ hypervisor_subsys.kobj.ktype = &hyp_sysfs_kobj_type; + return 0; + } + +Index: 10.3-2007-11-26/drivers/xen/core/smpboot.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/core/smpboot.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/core/smpboot.c 2007-10-22 13:58:57.000000000 +0200 +@@ -127,7 +127,7 @@ static int xen_smp_intr_init(unsigned in + rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, + cpu, + smp_reschedule_interrupt, +- SA_INTERRUPT, ++ IRQF_DISABLED, + resched_name[cpu], + NULL); + if (rc < 0) +@@ -138,7 +138,7 @@ static int xen_smp_intr_init(unsigned in + rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, + cpu, + smp_call_function_interrupt, +- SA_INTERRUPT, ++ IRQF_DISABLED, + callfunc_name[cpu], + NULL); + if (rc < 0) +@@ -171,12 +171,7 @@ static void xen_smp_intr_exit(unsigned i + + void cpu_bringup(void) + { +-#ifdef __i386__ +- cpu_set_gdt(current_thread_info()->cpu); +- secondary_cpu_init(); +-#else + cpu_init(); +-#endif + touch_softlockup_watchdog(); + preempt_disable(); + local_irq_enable(); +@@ -194,8 +189,6 @@ static void cpu_initialize_context(unsig + struct task_struct *idle = idle_task(cpu); + #ifdef __x86_64__ + struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu]; +-#else +- struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu); + #endif + + if (cpu_test_and_set(cpu, cpu_initialized_map)) +@@ -218,16 +211,18 @@ static void cpu_initialize_context(unsig + + ctxt.ldt_ents = 0; + +- ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); +- ctxt.gdt_ents = gdt_descr->size / 8; +- + #ifdef __i386__ ++ ctxt.gdt_frames[0] = virt_to_mfn(get_cpu_gdt_table(cpu)); ++ ctxt.gdt_ents = GDT_SIZE / 8; ++ + ctxt.user_regs.cs = __KERNEL_CS; + ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); + + ctxt.kernel_ss = __KERNEL_DS; + ctxt.kernel_sp = idle->thread.esp0; + ++ ctxt.user_regs.fs = __KERNEL_PERCPU; ++ + ctxt.event_callback_cs = __KERNEL_CS; + ctxt.event_callback_eip = (unsigned long)hypervisor_callback; + ctxt.failsafe_callback_cs = __KERNEL_CS; +@@ -235,6 +230,9 @@ static void cpu_initialize_context(unsig + + ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); + #else /* __x86_64__ */ ++ ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); ++ ctxt.gdt_ents = gdt_descr->size / 8; ++ + ctxt.user_regs.cs = __KERNEL_CS; + ctxt.user_regs.esp = 
idle->thread.rsp0 - sizeof(struct pt_regs); + +@@ -259,9 +257,8 @@ void __init smp_prepare_cpus(unsigned in + struct task_struct *idle; + #ifdef __x86_64__ + struct desc_ptr *gdt_descr; +-#else +- struct Xgt_desc_struct *gdt_descr; + #endif ++ void *gdt_addr; + + boot_cpu_data.apicid = 0; + cpu_data[0] = boot_cpu_data; +@@ -308,14 +305,13 @@ void __init smp_prepare_cpus(unsigned in + } + gdt_descr->size = GDT_SIZE; + memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); ++ gdt_addr = (void *)gdt_descr->address; + #else +- if (unlikely(!init_gdt(cpu, idle))) +- continue; +- gdt_descr = &per_cpu(cpu_gdt_descr, cpu); ++ init_gdt(cpu); ++ gdt_addr = get_cpu_gdt_table(cpu); + #endif +- make_page_readonly( +- (void *)gdt_descr->address, +- XENFEAT_writable_descriptor_tables); ++ make_page_readonly(gdt_addr, ++ XENFEAT_writable_descriptor_tables); + + cpu_data[cpu] = boot_cpu_data; + cpu_data[cpu].apicid = cpu; +@@ -326,7 +322,9 @@ void __init smp_prepare_cpus(unsigned in + #ifdef __x86_64__ + cpu_pda(cpu)->pcurrent = idle; + cpu_pda(cpu)->cpunumber = cpu; +- clear_ti_thread_flag(idle->thread_info, TIF_FORK); ++ clear_ti_thread_flag(task_thread_info(idle), TIF_FORK); ++#else ++ per_cpu(current_task, cpu) = idle; + #endif + + irq_ctx_init(cpu); +@@ -351,8 +349,12 @@ void __init smp_prepare_cpus(unsigned in + #endif + } + +-void __devinit smp_prepare_boot_cpu(void) ++void __init smp_prepare_boot_cpu(void) + { ++#ifdef __i386__ ++ init_gdt(smp_processor_id()); ++ switch_to_new_gdt(); ++#endif + prefill_possible_map(); + } + +Index: 10.3-2007-11-26/drivers/xen/core/xen_sysfs.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/core/xen_sysfs.c 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/drivers/xen/core/xen_sysfs.c 2007-10-22 13:58:57.000000000 +0200 +@@ -28,12 +28,12 @@ HYPERVISOR_ATTR_RO(type); + + static int __init xen_sysfs_type_init(void) + { +- return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); ++ return sysfs_create_file(&hypervisor_subsys.kobj, &type_attr.attr); + } + + static void xen_sysfs_type_destroy(void) + { +- sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); ++ sysfs_remove_file(&hypervisor_subsys.kobj, &type_attr.attr); + } + + /* xen version attributes */ +@@ -89,13 +89,13 @@ static struct attribute_group version_gr + + static int __init xen_sysfs_version_init(void) + { +- return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ return sysfs_create_group(&hypervisor_subsys.kobj, + &version_group); + } + + static void xen_sysfs_version_destroy(void) + { +- sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group); ++ sysfs_remove_group(&hypervisor_subsys.kobj, &version_group); + } + + /* UUID */ +@@ -121,12 +121,12 @@ HYPERVISOR_ATTR_RO(uuid); + + static int __init xen_sysfs_uuid_init(void) + { +- return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); ++ return sysfs_create_file(&hypervisor_subsys.kobj, &uuid_attr.attr); + } + + static void xen_sysfs_uuid_destroy(void) + { +- sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); ++ sysfs_remove_file(&hypervisor_subsys.kobj, &uuid_attr.attr); + } + + /* xen compilation attributes */ +@@ -199,13 +199,13 @@ static struct attribute_group xen_compil + + int __init static xen_compilation_init(void) + { +- return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ return sysfs_create_group(&hypervisor_subsys.kobj, + &xen_compilation_group); + } + + static void xen_compilation_destroy(void) 
+ { +- sysfs_remove_group(&hypervisor_subsys.kset.kobj, ++ sysfs_remove_group(&hypervisor_subsys.kobj, + &xen_compilation_group); + } + +@@ -320,13 +320,13 @@ static struct attribute_group xen_proper + + static int __init xen_properties_init(void) + { +- return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ return sysfs_create_group(&hypervisor_subsys.kobj, + &xen_properties_group); + } + + static void xen_properties_destroy(void) + { +- sysfs_remove_group(&hypervisor_subsys.kset.kobj, ++ sysfs_remove_group(&hypervisor_subsys.kobj, + &xen_properties_group); + } + +Index: 10.3-2007-11-26/drivers/xen/netback/netback.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/netback/netback.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/netback/netback.c 2007-10-22 13:58:57.000000000 +0200 +@@ -156,7 +156,7 @@ static struct sk_buff *netbk_copy_skb(st + goto err; + + skb_reserve(nskb, 16 + NET_IP_ALIGN); +- headlen = nskb->end - nskb->data; ++ headlen = skb_end_pointer(nskb) - nskb->data; + if (headlen > skb_headlen(skb)) + headlen = skb_headlen(skb); + ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); +@@ -202,11 +202,15 @@ static struct sk_buff *netbk_copy_skb(st + len -= copy; + } + ++#ifdef NET_SKBUFF_DATA_USES_OFFSET ++ offset = 0; ++#else + offset = nskb->data - skb->data; ++#endif + +- nskb->h.raw = skb->h.raw + offset; +- nskb->nh.raw = skb->nh.raw + offset; +- nskb->mac.raw = skb->mac.raw + offset; ++ nskb->transport_header = skb->transport_header + offset; ++ nskb->network_header = skb->network_header + offset; ++ nskb->mac_header = skb->mac_header + offset; + + return nskb; + +@@ -1483,7 +1487,7 @@ static int __init netback_init(void) + (void)bind_virq_to_irqhandler(VIRQ_DEBUG, + 0, + netif_be_dbg, +- SA_SHIRQ, ++ IRQF_SHARED, + "net-be-dbg", + &netif_be_dbg); + #endif +Index: 10.3-2007-11-26/drivers/xen/netfront/netfront.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/netfront/netfront.c 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/netfront/netfront.c 2007-10-22 13:58:57.000000000 +0200 +@@ -533,7 +533,7 @@ static int setup_device(struct xenbus_de + memcpy(netdev->dev_addr, info->mac, ETH_ALEN); + + err = bind_listening_port_to_irqhandler( +- dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name, ++ dev->otherend_id, netif_int, IRQF_SAMPLE_RANDOM, netdev->name, + netdev); + if (err < 0) + goto fail; +Index: 10.3-2007-11-26/drivers/xen/pciback/xenbus.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/pciback/xenbus.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/pciback/xenbus.c 2007-10-22 13:58:57.000000000 +0200 +@@ -86,7 +86,7 @@ static int pciback_do_attach(struct pcib + + err = bind_interdomain_evtchn_to_irqhandler( + pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event, +- SA_SAMPLE_RANDOM, "pciback", pdev); ++ IRQF_SAMPLE_RANDOM, "pciback", pdev); + if (err < 0) { + xenbus_dev_fatal(pdev->xdev, err, + "Error binding event channel to IRQ"); +Index: 10.3-2007-11-26/drivers/xen/xenoprof/xenoprofile.c +=================================================================== +--- 10.3-2007-11-26.orig/drivers/xen/xenoprof/xenoprofile.c 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/drivers/xen/xenoprof/xenoprofile.c 2007-10-22 13:58:57.000000000 +0200 +@@ -219,7 +219,7 @@ static int bind_virq(void) + result = 
bind_virq_to_irqhandler(VIRQ_XENOPROF, + i, + xenoprof_ovf_interrupt, +- SA_INTERRUPT, ++ IRQF_DISABLED, + "xenoprof", + NULL); + +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/agp.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/agp.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/agp.h 2007-10-22 13:58:57.000000000 +0200 +@@ -13,8 +13,15 @@ + * data corruption on some CPUs. + */ + +-int map_page_into_agp(struct page *page); +-int unmap_page_from_agp(struct page *page); ++/* Caller's responsibility to call global_flush_tlb() for ++ * performance reasons */ ++#define map_page_into_agp(page) ( \ ++ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ ++ ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) ++#define unmap_page_from_agp(page) ( \ ++ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ ++ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ ++ change_page_attr(page, 1, PAGE_KERNEL)) + #define flush_agp_mappings() global_flush_tlb() + + /* Could use CLFLUSH here if the cpu supports it. But then it would +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/desc.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/desc.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/desc.h 2007-10-22 13:58:57.000000000 +0200 +@@ -11,23 +11,24 @@ + + #include <asm/mmu.h> + +-extern struct desc_struct cpu_gdt_table[GDT_ENTRIES]; +- + struct Xgt_desc_struct { + unsigned short size; + unsigned long address __attribute__((packed)); + unsigned short pad; + } __attribute__ ((packed)); + +-extern struct Xgt_desc_struct idt_descr; +-DECLARE_PER_CPU(struct Xgt_desc_struct, cpu_gdt_descr); +-extern struct Xgt_desc_struct early_gdt_descr; ++struct gdt_page ++{ ++ struct desc_struct gdt[GDT_ENTRIES]; ++} __attribute__((aligned(PAGE_SIZE))); ++DECLARE_PER_CPU(struct gdt_page, gdt_page); + + static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) + { +- return (struct desc_struct *)per_cpu(cpu_gdt_descr, cpu).address; ++ return per_cpu(gdt_page, cpu).gdt; + } + ++extern struct Xgt_desc_struct idt_descr; + extern struct desc_struct idt_table[]; + extern void set_intr_gate(unsigned int irq, void * addr); + +@@ -55,51 +56,32 @@ static inline void pack_gate(__u32 *a, _ + #define DESCTYPE_S 0x10 /* !system */ + + #ifndef CONFIG_XEN +-#define load_TR_desc() __asm__ __volatile__("ltr %w0"::"q" (GDT_ENTRY_TSS*8)) +- +-#define load_gdt(dtr) __asm__ __volatile("lgdt %0"::"m" (*dtr)) +-#define load_idt(dtr) __asm__ __volatile("lidt %0"::"m" (*dtr)) ++#define load_TR_desc() native_load_tr_desc() ++#define load_gdt(dtr) native_load_gdt(dtr) ++#define load_idt(dtr) native_load_idt(dtr) + #define load_tr(tr) __asm__ __volatile("ltr %0"::"m" (tr)) + #define load_ldt(ldt) __asm__ __volatile("lldt %0"::"m" (ldt)) + +-#define store_gdt(dtr) __asm__ ("sgdt %0":"=m" (*dtr)) +-#define store_idt(dtr) __asm__ ("sidt %0":"=m" (*dtr)) +-#define store_tr(tr) __asm__ ("str %0":"=m" (tr)) ++#define store_gdt(dtr) native_store_gdt(dtr) ++#define store_idt(dtr) native_store_idt(dtr) ++#define store_tr(tr) (tr = native_store_tr()) + #define store_ldt(ldt) __asm__ ("sldt %0":"=m" (ldt)) +-#endif + +-#if TLS_SIZE != 24 +-# error update this code. 
+-#endif +- +-static inline void load_TLS(struct thread_struct *t, unsigned int cpu) +-{ +-#define C(i) HYPERVISOR_update_descriptor(virt_to_machine(&get_cpu_gdt_table(cpu)[GDT_ENTRY_TLS_MIN + i]), *(u64 *)&t->tls_array[i]) +- C(0); C(1); C(2); +-#undef C +-} ++#define load_TLS(t, cpu) native_load_tls(t, cpu) ++#define set_ldt native_set_ldt + +-#ifndef CONFIG_XEN + #define write_ldt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + #define write_gdt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + #define write_idt_entry(dt, entry, a, b) write_dt_entry(dt, entry, a, b) + +-static inline void write_dt_entry(void *dt, int entry, __u32 entry_a, __u32 entry_b) ++static inline void write_dt_entry(struct desc_struct *dt, ++ int entry, u32 entry_low, u32 entry_high) + { +- __u32 *lp = (__u32 *)((char *)dt + entry*8); +- *lp = entry_a; +- *(lp+1) = entry_b; ++ dt[entry].a = entry_low; ++ dt[entry].b = entry_high; + } +-#define set_ldt native_set_ldt +-#else +-extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); +-extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); +-#define set_ldt(addr, entries) xen_set_ldt((unsigned long)(addr), entries) +-#endif + +-#ifndef CONFIG_XEN +-static inline fastcall void native_set_ldt(const void *addr, +- unsigned int entries) ++static inline void native_set_ldt(const void *addr, unsigned int entries) + { + if (likely(entries == 0)) + __asm__ __volatile__("lldt %w0"::"q" (0)); +@@ -114,6 +96,64 @@ static inline fastcall void native_set_l + __asm__ __volatile__("lldt %w0"::"q" (GDT_ENTRY_LDT*8)); + } + } ++ ++ ++static inline void native_load_tr_desc(void) ++{ ++ asm volatile("ltr %w0"::"q" (GDT_ENTRY_TSS*8)); ++} ++ ++static inline void native_load_gdt(const struct Xgt_desc_struct *dtr) ++{ ++ asm volatile("lgdt %0"::"m" (*dtr)); ++} ++ ++static inline void native_load_idt(const struct Xgt_desc_struct *dtr) ++{ ++ asm volatile("lidt %0"::"m" (*dtr)); ++} ++ ++static inline void native_store_gdt(struct Xgt_desc_struct *dtr) ++{ ++ asm ("sgdt %0":"=m" (*dtr)); ++} ++ ++static inline void native_store_idt(struct Xgt_desc_struct *dtr) ++{ ++ asm ("sidt %0":"=m" (*dtr)); ++} ++ ++static inline unsigned long native_store_tr(void) ++{ ++ unsigned long tr; ++ asm ("str %0":"=r" (tr)); ++ return tr; ++} ++ ++static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) ++{ ++ unsigned int i; ++ struct desc_struct *gdt = get_cpu_gdt_table(cpu); ++ ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; ++} ++#else ++#define load_TLS(t, cpu) xen_load_tls(t, cpu) ++#define set_ldt(addr, entries) xen_set_ldt((unsigned long)(addr), entries) ++ ++extern int write_ldt_entry(void *ldt, int entry, __u32 entry_a, __u32 entry_b); ++extern int write_gdt_entry(void *gdt, int entry, __u32 entry_a, __u32 entry_b); ++ ++static inline void xen_load_tls(struct thread_struct *t, unsigned int cpu) ++{ ++ unsigned int i; ++ struct desc_struct *gdt = get_cpu_gdt_table(cpu); ++ ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ HYPERVISOR_update_descriptor(virt_to_machine(&gdt[GDT_ENTRY_TLS_MIN + i]), ++ *(u64 *)&t->tls_array[i]); ++} + #endif + + #ifndef CONFIG_X86_NO_IDT +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/fixmap.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/fixmap.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/fixmap.h 2007-10-22 
13:58:57.000000000 +0200 +@@ -19,10 +19,8 @@ + * the start of the fixmap. + */ + extern unsigned long __FIXADDR_TOP; +-#ifdef CONFIG_COMPAT_VDSO +-#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) +-#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) +-#endif ++#define FIXADDR_USER_START __fix_to_virt(FIX_VDSO) ++#define FIXADDR_USER_END __fix_to_virt(FIX_VDSO - 1) + + #ifndef __ASSEMBLY__ + #include <linux/kernel.h> +@@ -85,6 +83,9 @@ enum fixed_addresses { + #ifdef CONFIG_PCI_MMCONFIG + FIX_PCIE_MCFG, + #endif ++#ifdef CONFIG_PARAVIRT ++ FIX_PARAVIRT_BOOTMAP, ++#endif + FIX_SHARED_INFO, + #define NR_FIX_ISAMAPS 256 + FIX_ISAMAP_END, +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/highmem.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/highmem.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/highmem.h 2007-10-22 13:58:57.000000000 +0200 +@@ -67,12 +67,18 @@ extern void FASTCALL(kunmap_high(struct + + void *kmap(struct page *page); + void kunmap(struct page *page); ++void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); + void *kmap_atomic(struct page *page, enum km_type type); + void *kmap_atomic_pte(struct page *page, enum km_type type); + void kunmap_atomic(void *kvaddr, enum km_type type); + void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); + struct page *kmap_atomic_to_page(void *ptr); + ++#define kmap_atomic_pte(page, type) \ ++ kmap_atomic_prot(page, type, \ ++ test_bit(PG_pinned, &(page)->flags) \ ++ ? PAGE_KERNEL_RO : kmap_prot) ++ + #define flush_cache_kmaps() do { } while (0) + + #endif /* __KERNEL__ */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/io.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/io.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/io.h 2007-10-22 13:58:57.000000000 +0200 +@@ -264,15 +264,18 @@ static inline void flush_write_buffers(v + + #endif /* __KERNEL__ */ + +-#define __SLOW_DOWN_IO "outb %%al,$0x80;" ++static inline void xen_io_delay(void) ++{ ++ asm volatile("outb %%al,$0x80" : : : "memory"); ++} + + static inline void slow_down_io(void) { +- __asm__ __volatile__( +- __SLOW_DOWN_IO ++ xen_io_delay(); + #ifdef REALLY_SLOW_IO +- __SLOW_DOWN_IO __SLOW_DOWN_IO __SLOW_DOWN_IO ++ xen_io_delay(); ++ xen_io_delay(); ++ xen_io_delay(); + #endif +- : : ); + } + + #ifdef CONFIG_X86_NUMAQ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/irqflags.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/irqflags.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/irqflags.h 2007-10-22 13:58:57.000000000 +0200 +@@ -11,6 +11,43 @@ + #define _ASM_IRQFLAGS_H + + #ifndef __ASSEMBLY__ ++#define xen_save_fl(void) (current_vcpu_info()->evtchn_upcall_mask) ++ ++#define xen_restore_fl(f) \ ++do { \ ++ vcpu_info_t *_vcpu; \ ++ barrier(); \ ++ _vcpu = current_vcpu_info(); \ ++ if ((_vcpu->evtchn_upcall_mask = (f)) == 0) { \ ++ barrier(); /* unmask then check (avoid races) */\ ++ if (unlikely(_vcpu->evtchn_upcall_pending)) \ ++ force_evtchn_callback(); \ ++ } \ ++} while (0) ++ ++#define xen_irq_disable() \ ++do { \ ++ current_vcpu_info()->evtchn_upcall_mask = 1; \ ++ barrier(); \ ++} while (0) ++ ++#define xen_irq_enable() \ ++do { \ ++ vcpu_info_t *_vcpu; \ ++ barrier(); \ ++ _vcpu = 
current_vcpu_info(); \ ++ _vcpu->evtchn_upcall_mask = 0; \ ++ barrier(); /* unmask then check (avoid races) */ \ ++ if (unlikely(_vcpu->evtchn_upcall_pending)) \ ++ force_evtchn_callback(); \ ++} while (0) ++ ++void xen_safe_halt(void); ++ ++void xen_halt(void); ++#endif /* __ASSEMBLY__ */ ++ ++#ifndef __ASSEMBLY__ + + /* + * The use of 'barrier' in the following reflects their use as local-lock +@@ -20,48 +57,31 @@ + * includes these barriers, for example. + */ + +-#define __raw_local_save_flags() (current_vcpu_info()->evtchn_upcall_mask) ++#define __raw_local_save_flags(void) xen_save_fl() + +-#define raw_local_irq_restore(x) \ +-do { \ +- vcpu_info_t *_vcpu; \ +- barrier(); \ +- _vcpu = current_vcpu_info(); \ +- if ((_vcpu->evtchn_upcall_mask = (x)) == 0) { \ +- barrier(); /* unmask then check (avoid races) */ \ +- if (unlikely(_vcpu->evtchn_upcall_pending)) \ +- force_evtchn_callback(); \ +- } \ +-} while (0) ++#define raw_local_irq_restore(flags) xen_restore_fl(flags) + +-#define raw_local_irq_disable() \ +-do { \ +- current_vcpu_info()->evtchn_upcall_mask = 1; \ +- barrier(); \ +-} while (0) ++#define raw_local_irq_disable() xen_irq_disable() + +-#define raw_local_irq_enable() \ +-do { \ +- vcpu_info_t *_vcpu; \ +- barrier(); \ +- _vcpu = current_vcpu_info(); \ +- _vcpu->evtchn_upcall_mask = 0; \ +- barrier(); /* unmask then check (avoid races) */ \ +- if (unlikely(_vcpu->evtchn_upcall_pending)) \ +- force_evtchn_callback(); \ +-} while (0) ++#define raw_local_irq_enable() xen_irq_enable() + + /* + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +-void raw_safe_halt(void); ++static inline void raw_safe_halt(void) ++{ ++ xen_safe_halt(); ++} + + /* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +-void halt(void); ++static inline void halt(void) ++{ ++ xen_halt(); ++} + + /* + * For spinlocks, etc: +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/mmu.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/mmu.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/mmu.h 2007-10-22 13:58:57.000000000 +0200 +@@ -18,12 +18,4 @@ typedef struct { + #endif + } mm_context_t; + +-/* mm/memory.c:exit_mmap hook */ +-extern void _arch_exit_mmap(struct mm_struct *mm); +-#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) +- +-/* kernel/fork.c:dup_mmap hook */ +-extern void _arch_dup_mmap(struct mm_struct *mm); +-#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm)) +- + #endif +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/mmu_context.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/mmu_context.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/mmu_context.h 2007-10-22 13:58:57.000000000 +0200 +@@ -6,6 +6,20 @@ + #include <asm/pgalloc.h> + #include <asm/tlbflush.h> + ++void arch_exit_mmap(struct mm_struct *mm); ++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); ++ ++void mm_pin(struct mm_struct *mm); ++void mm_unpin(struct mm_struct *mm); ++void mm_pin_all(void); ++ ++static inline void xen_activate_mm(struct mm_struct *prev, ++ struct mm_struct *next) ++{ ++ if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) ++ mm_pin(next); ++} ++ + /* + * Used for LDT copy/destruction. 
+ */ +@@ -37,10 +51,6 @@ static inline void __prepare_arch_switch + : : "r" (0) ); + } + +-extern void mm_pin(struct mm_struct *mm); +-extern void mm_unpin(struct mm_struct *mm); +-void mm_pin_all(void); +- + static inline void switch_mm(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) +@@ -97,11 +107,10 @@ static inline void switch_mm(struct mm_s + #define deactivate_mm(tsk, mm) \ + asm("movl %0,%%gs": :"r" (0)); + +-static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) +-{ +- if (!test_bit(PG_pinned, &virt_to_page(next->pgd)->flags)) +- mm_pin(next); +- switch_mm(prev, next, NULL); +-} ++#define activate_mm(prev, next) \ ++ do { \ ++ xen_activate_mm(prev, next); \ ++ switch_mm((prev),(next),NULL); \ ++ } while(0) + + #endif +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/page.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/page.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/page.h 2007-10-22 13:58:57.000000000 +0200 +@@ -66,6 +66,7 @@ + * These are used to make use of C type-checking.. + */ + extern int nx_enabled; ++ + #ifdef CONFIG_X86_PAE + extern unsigned long long __supported_pte_mask; + typedef struct { unsigned long pte_low, pte_high; } pte_t; +@@ -74,69 +75,117 @@ typedef struct { unsigned long long pgd; + typedef struct { unsigned long long pgprot; } pgprot_t; + #define pgprot_val(x) ((x).pgprot) + #include <asm/maddr.h> +-#define __pte(x) ({ unsigned long long _x = (x); \ +- if (_x & _PAGE_PRESENT) _x = pte_phys_to_machine(_x); \ +- ((pte_t) {(unsigned long)(_x), (unsigned long)(_x>>32)}); }) +-#define __pgd(x) ({ unsigned long long _x = (x); \ +- (pgd_t) {((_x) & _PAGE_PRESENT) ? pte_phys_to_machine(_x) : (_x)}; }) +-#define __pmd(x) ({ unsigned long long _x = (x); \ +- (pmd_t) {((_x) & _PAGE_PRESENT) ? 
pte_phys_to_machine(_x) : (_x)}; }) +-static inline unsigned long long pte_val_ma(pte_t x) +-{ +- return ((unsigned long long)x.pte_high << 32) | x.pte_low; +-} +-static inline unsigned long long pte_val(pte_t x) ++ ++static inline unsigned long long xen_pgd_val(pgd_t pgd) + { +- unsigned long long ret = pte_val_ma(x); +- if (x.pte_low & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); ++ unsigned long long ret = pgd.pgd; ++ if (ret & _PAGE_PRESENT) ++ ret = pte_machine_to_phys(ret); + return ret; + } +-static inline unsigned long long pmd_val(pmd_t x) ++ ++static inline unsigned long long xen_pmd_val(pmd_t pmd) + { +- unsigned long long ret = x.pmd; ++ unsigned long long ret = pmd.pmd; + #if CONFIG_XEN_COMPAT <= 0x030002 +- if (ret) ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; ++ if (ret) ++ ret = pte_machine_to_phys(ret) | _PAGE_PRESENT; + #else +- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); ++ if (ret & _PAGE_PRESENT) ++ ret = pte_machine_to_phys(ret); + #endif + return ret; + } +-static inline unsigned long long pgd_val(pgd_t x) ++ ++static inline unsigned long long pte_val_ma(pte_t pte) + { +- unsigned long long ret = x.pgd; +- if (ret & _PAGE_PRESENT) ret = pte_machine_to_phys(ret); ++ return ((unsigned long long)pte.pte_high << 32) | pte.pte_low; ++} ++static inline unsigned long long xen_pte_val(pte_t pte) ++{ ++ unsigned long long ret = pte_val_ma(pte); ++ if (pte.pte_low & _PAGE_PRESENT) ++ ret = pte_machine_to_phys(ret); + return ret; + } ++ ++static inline pgd_t xen_make_pgd(unsigned long long val) ++{ ++ if (val & _PAGE_PRESENT) ++ val = pte_phys_to_machine(val); ++ return (pgd_t) { val }; ++} ++ ++static inline pmd_t xen_make_pmd(unsigned long long val) ++{ ++ if (val & _PAGE_PRESENT) ++ val = pte_phys_to_machine(val); ++ return (pmd_t) { val }; ++} ++ ++static inline pte_t xen_make_pte(unsigned long long val) ++{ ++ if (val & _PAGE_PRESENT) ++ val = pte_phys_to_machine(val); ++ return (pte_t) { .pte_low = val, .pte_high = (val >> 32) } ; ++} ++ ++#define pmd_val(x) xen_pmd_val(x) ++#define __pmd(x) xen_make_pmd(x) ++ + #define HPAGE_SHIFT 21 + #include <asm-generic/pgtable-nopud.h> +-#else ++#else /* !CONFIG_X86_PAE */ + typedef struct { unsigned long pte_low; } pte_t; + typedef struct { unsigned long pgd; } pgd_t; + typedef struct { unsigned long pgprot; } pgprot_t; + #define pgprot_val(x) ((x).pgprot) +-#include <asm/maddr.h> + #define boot_pte_t pte_t /* or would you rather have a typedef */ +-#define pte_val(x) (((x).pte_low & _PAGE_PRESENT) ? \ +- machine_to_phys((x).pte_low) : \ +- (x).pte_low) +-#define pte_val_ma(x) ((x).pte_low) +-#define __pte(x) ({ unsigned long _x = (x); \ +- (pte_t) {((_x) & _PAGE_PRESENT) ? phys_to_machine(_x) : (_x)}; }) +-#define __pgd(x) ({ unsigned long _x = (x); \ +- (pgd_t) {((_x) & _PAGE_PRESENT) ? 
phys_to_machine(_x) : (_x)}; }) +-static inline unsigned long pgd_val(pgd_t x) ++#include <asm/maddr.h> ++ ++static inline unsigned long xen_pgd_val(pgd_t pgd) + { +- unsigned long ret = x.pgd; ++ unsigned long ret = pgd.pgd; + #if CONFIG_XEN_COMPAT <= 0x030002 +- if (ret) ret = machine_to_phys(ret) | _PAGE_PRESENT; ++ if (ret) ++ ret = machine_to_phys(ret) | _PAGE_PRESENT; + #else +- if (ret & _PAGE_PRESENT) ret = machine_to_phys(ret); ++ if (ret & _PAGE_PRESENT) ++ ret = machine_to_phys(ret); + #endif + return ret; + } ++ ++static inline unsigned long pte_val_ma(pte_t pte) ++{ ++ return pte.pte_low; ++} ++static inline unsigned long xen_pte_val(pte_t pte) ++{ ++ unsigned long ret = pte_val_ma(pte); ++ if (ret & _PAGE_PRESENT) ++ ret = machine_to_phys(ret); ++ return ret; ++} ++ ++static inline pgd_t xen_make_pgd(unsigned long val) ++{ ++ if (val & _PAGE_PRESENT) ++ val = phys_to_machine(val); ++ return (pgd_t) { val }; ++} ++ ++static inline pte_t xen_make_pte(unsigned long val) ++{ ++ if (val & _PAGE_PRESENT) ++ val = phys_to_machine(val); ++ return (pte_t) { .pte_low = val }; ++} ++ + #define HPAGE_SHIFT 22 + #include <asm-generic/pgtable-nopmd.h> +-#endif ++#endif /* CONFIG_X86_PAE */ ++ + #define PTE_MASK PHYSICAL_PAGE_MASK + + #ifdef CONFIG_HUGETLB_PAGE +@@ -148,6 +197,11 @@ static inline unsigned long pgd_val(pgd_ + + #define __pgprot(x) ((pgprot_t) { (x) } ) + ++#define pgd_val(x) xen_pgd_val(x) ++#define __pgd(x) xen_make_pgd(x) ++#define pte_val(x) xen_pte_val(x) ++#define __pte(x) xen_make_pte(x) ++ + #endif /* !__ASSEMBLY__ */ + + /* to align the pointer to the (next) page boundary */ +@@ -188,6 +242,7 @@ extern int page_is_ram(unsigned long pag + #define __PAGE_OFFSET ((unsigned long)CONFIG_PAGE_OFFSET) + #endif + ++ + #define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) + #define VMALLOC_RESERVE ((unsigned long)__VMALLOC_RESERVE) + #define MAXMEM (-__PAGE_OFFSET-__VMALLOC_RESERVE) +@@ -212,9 +267,7 @@ extern int page_is_ram(unsigned long pag + #include <asm-generic/memory_model.h> + #include <asm-generic/page.h> + +-#ifndef CONFIG_COMPAT_VDSO + #define __HAVE_ARCH_GATE_AREA 1 +-#endif + #endif /* __KERNEL__ */ + + #endif /* _I386_PAGE_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgalloc.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgalloc.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgalloc.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,7 +1,6 @@ + #ifndef _I386_PGALLOC_H + #define _I386_PGALLOC_H + +-#include <asm/fixmap.h> + #include <linux/threads.h> + #include <linux/mm.h> /* for struct page */ + #include <asm/io.h> /* for phys_to_virt and page_to_pseudophys */ +@@ -69,6 +68,4 @@ do { \ + #define pud_populate(mm, pmd, pte) BUG() + #endif + +-#define check_pgt_cache() do { } while (0) +- + #endif /* _I386_PGALLOC_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgtable-2level-defs.h 2007-12-06 17:27:30.000000000 +0100 ++++ /dev/null 1970-01-01 00:00:00.000000000 +0000 +@@ -1,20 +0,0 @@ +-#ifndef _I386_PGTABLE_2LEVEL_DEFS_H +-#define _I386_PGTABLE_2LEVEL_DEFS_H +- +-#define HAVE_SHARED_KERNEL_PMD 0 +- +-/* +- * traditional i386 two-level paging structure: +- */ +- +-#define PGDIR_SHIFT 22 +-#define PTRS_PER_PGD 1024 +- +-/* +- * the i386 is two-level, so we don't really have any +- * 
PMD directory physically. +- */ +- +-#define PTRS_PER_PTE 1024 +- +-#endif /* _I386_PGTABLE_2LEVEL_DEFS_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-2level.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgtable-2level.h 2007-10-22 13:54:57.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-2level.h 2007-10-22 13:58:57.000000000 +0200 +@@ -11,22 +11,43 @@ + * within a page table are directly modified. Thus, the following + * hook is made available. + */ +-#define set_pte(pteptr, pteval) (*(pteptr) = pteval) +- +-#define set_pte_at(_mm,addr,ptep,pteval) do { \ +- if (((_mm) != current->mm && (_mm) != &init_mm) || \ +- HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ +- set_pte((ptep), (pteval)); \ +-} while (0) +- +-#define set_pmd(pmdptr, pmdval) xen_l2_entry_update((pmdptr), (pmdval)) ++static inline void xen_set_pte(pte_t *ptep , pte_t pte) ++{ ++ *ptep = pte; ++} ++static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, ++ pte_t *ptep , pte_t pte) ++{ ++ if ((mm != current->mm && mm != &init_mm) || ++ HYPERVISOR_update_va_mapping(addr, pte, 0)) ++ xen_set_pte(ptep, pte); ++} ++static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) ++{ ++ xen_l2_entry_update(pmdp, pmd); ++} ++#define set_pte(pteptr, pteval) xen_set_pte(pteptr, pteval) ++#define set_pte_at(mm,addr,ptep,pteval) xen_set_pte_at(mm, addr, ptep, pteval) ++#define set_pmd(pmdptr, pmdval) xen_set_pmd(pmdptr, pmdval) + + #define set_pte_atomic(pteptr, pteval) set_pte(pteptr,pteval) + + #define pte_clear(mm,addr,xp) do { set_pte_at(mm, addr, xp, __pte(0)); } while (0) + #define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) + +-#define raw_ptep_get_and_clear(xp, pte) __pte_ma(xchg(&(xp)->pte_low, 0)) ++static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *xp) ++{ ++ xen_set_pte_at(mm, addr, xp, __pte(0)); ++} ++ ++#ifdef CONFIG_SMP ++static inline pte_t xen_ptep_get_and_clear(pte_t *xp, pte_t res) ++{ ++ return __pte_ma(xchg(&xp->pte_low, 0)); ++} ++#else ++#define xen_ptep_get_and_clear(xp, res) xen_local_ptep_get_and_clear(xp, res) ++#endif + + #define __HAVE_ARCH_PTEP_CLEAR_FLUSH + #define ptep_clear_flush(vma, addr, ptep) \ +@@ -91,6 +112,4 @@ static inline int pte_exec_kernel(pte_t + #define __pte_to_swp_entry(pte) ((swp_entry_t) { (pte).pte_low }) + #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + +-void vmalloc_sync_all(void); +- + #endif /* _I386_PGTABLE_2LEVEL_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-3level-defs.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,7 +1,7 @@ + #ifndef _I386_PGTABLE_3LEVEL_DEFS_H + #define _I386_PGTABLE_3LEVEL_DEFS_H + +-#define HAVE_SHARED_KERNEL_PMD 0 ++#define SHARED_KERNEL_PMD 0 + + /* + * PGDIR_SHIFT determines what a top-level page table entry can map +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-3level.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgtable-3level.h 2007-10-22 13:58:00.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable-3level.h 2007-10-22 13:58:57.000000000 +0200 +@@ -49,32 +49,40 @@ static 
inline int pte_exec_kernel(pte_t + * value and then use set_pte to update it. -ben + */ + +-static inline void set_pte(pte_t *ptep, pte_t pte) ++static inline void xen_set_pte(pte_t *ptep, pte_t pte) + { + ptep->pte_high = pte.pte_high; + smp_wmb(); + ptep->pte_low = pte.pte_low; + } +-#define set_pte_atomic(pteptr,pteval) \ +- set_64bit((unsigned long long *)(pteptr),pte_val_ma(pteval)) + +-#define set_pte_at(_mm,addr,ptep,pteval) do { \ +- if (((_mm) != current->mm && (_mm) != &init_mm) || \ +- HYPERVISOR_update_va_mapping((addr), (pteval), 0)) \ +- set_pte((ptep), (pteval)); \ +-} while (0) +- +-#define set_pmd(pmdptr,pmdval) \ +- xen_l2_entry_update((pmdptr), (pmdval)) +-#define set_pud(pudptr,pudval) \ +- xen_l3_entry_update((pudptr), (pudval)) ++static inline void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, ++ pte_t *ptep , pte_t pte) ++{ ++ if ((mm != current->mm && mm != &init_mm) || ++ HYPERVISOR_update_va_mapping(addr, pte, 0)) ++ xen_set_pte(ptep, pte); ++} ++ ++static inline void xen_set_pte_atomic(pte_t *ptep, pte_t pte) ++{ ++ set_64bit((unsigned long long *)(ptep),pte_val_ma(pte)); ++} ++static inline void xen_set_pmd(pmd_t *pmdp, pmd_t pmd) ++{ ++ xen_l2_entry_update(pmdp, pmd); ++} ++static inline void xen_set_pud(pud_t *pudp, pud_t pud) ++{ ++ xen_l3_entry_update(pudp, pud); ++} + + /* + * For PTEs and PDEs, we must clear the P-bit first when clearing a page table + * entry, so clear the bottom half first and enforce ordering with a compiler + * barrier. + */ +-static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) ++static inline void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + { + if ((mm != current->mm && mm != &init_mm) + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0)) { +@@ -84,7 +92,18 @@ static inline void pte_clear(struct mm_s + } + } + +-#define pmd_clear(xp) do { set_pmd(xp, __pmd(0)); } while (0) ++static inline void xen_pmd_clear(pmd_t *pmd) ++{ ++ xen_l2_entry_update(pmd, __pmd(0)); ++} ++ ++#define set_pte(ptep, pte) xen_set_pte(ptep, pte) ++#define set_pte_at(mm, addr, ptep, pte) xen_set_pte_at(mm, addr, ptep, pte) ++#define set_pte_atomic(ptep, pte) xen_set_pte_atomic(ptep, pte) ++#define set_pmd(pmdp, pmd) xen_set_pmd(pmdp, pmd) ++#define set_pud(pudp, pud) xen_set_pud(pudp, pud) ++#define pte_clear(mm, addr, ptep) xen_pte_clear(mm, addr, ptep) ++#define pmd_clear(pmd) xen_pmd_clear(pmd) + + /* + * Pentium-II erratum A13: in PAE mode we explicitly have to flush +@@ -105,7 +124,8 @@ static inline void pud_clear (pud_t * pu + #define pmd_offset(pud, address) ((pmd_t *) pud_page(*(pud)) + \ + pmd_index(address)) + +-static inline pte_t raw_ptep_get_and_clear(pte_t *ptep, pte_t res) ++#ifdef CONFIG_SMP ++static inline pte_t xen_ptep_get_and_clear(pte_t *ptep, pte_t res) + { + uint64_t val = pte_val_ma(res); + if (__cmpxchg64(ptep, val, 0) != val) { +@@ -116,6 +136,9 @@ static inline pte_t raw_ptep_get_and_cle + } + return res; + } ++#else ++#define xen_ptep_get_and_clear(xp, pte) xen_local_ptep_get_and_clear(xp, pte) ++#endif + + #define __HAVE_ARCH_PTEP_CLEAR_FLUSH + #define ptep_clear_flush(vma, addr, ptep) \ +@@ -160,13 +183,13 @@ extern unsigned long long __supported_pt + static inline pte_t pfn_pte(unsigned long page_nr, pgprot_t pgprot) + { + return __pte((((unsigned long long)page_nr << PAGE_SHIFT) | +- pgprot_val(pgprot)) & __supported_pte_mask); ++ pgprot_val(pgprot)) & __supported_pte_mask); + } + + static inline pmd_t pfn_pmd(unsigned long page_nr, pgprot_t pgprot) + { + 
return __pmd((((unsigned long long)page_nr << PAGE_SHIFT) | +- pgprot_val(pgprot)) & __supported_pte_mask); ++ pgprot_val(pgprot)) & __supported_pte_mask); + } + + /* +@@ -186,6 +209,4 @@ static inline pmd_t pfn_pmd(unsigned lon + + #define __pmd_free_tlb(tlb, x) do { } while (0) + +-void vmalloc_sync_all(void); +- + #endif /* _I386_PGTABLE_3LEVEL_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/pgtable.h 2007-10-22 14:08:56.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/pgtable.h 2007-10-22 14:09:14.000000000 +0200 +@@ -24,11 +24,11 @@ + #include <linux/slab.h> + #include <linux/list.h> + #include <linux/spinlock.h> ++#include <linux/sched.h> + + /* Is this pagetable pinned? */ + #define PG_pinned PG_arch_1 + +-struct mm_struct; + struct vm_area_struct; + + /* +@@ -38,17 +38,16 @@ struct vm_area_struct; + #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + extern unsigned long empty_zero_page[1024]; + extern pgd_t *swapper_pg_dir; +-extern struct kmem_cache *pgd_cache; + extern struct kmem_cache *pmd_cache; + extern spinlock_t pgd_lock; + extern struct page *pgd_list; ++void check_pgt_cache(void); + + void pmd_ctor(void *, struct kmem_cache *, unsigned long); +-void pgd_ctor(void *, struct kmem_cache *, unsigned long); +-void pgd_dtor(void *, struct kmem_cache *, unsigned long); + void pgtable_cache_init(void); + void paging_init(void); + ++ + /* + * The Linux x86 paging architecture is 'compile-time dual-mode', it + * implements both the traditional 2-level x86 page tables and the +@@ -162,6 +161,7 @@ void paging_init(void); + + extern unsigned long long __PAGE_KERNEL, __PAGE_KERNEL_EXEC; + #define __PAGE_KERNEL_RO (__PAGE_KERNEL & ~_PAGE_RW) ++#define __PAGE_KERNEL_RX (__PAGE_KERNEL_EXEC & ~_PAGE_RW) + #define __PAGE_KERNEL_NOCACHE (__PAGE_KERNEL | _PAGE_PCD) + #define __PAGE_KERNEL_LARGE (__PAGE_KERNEL | _PAGE_PSE) + #define __PAGE_KERNEL_LARGE_EXEC (__PAGE_KERNEL_EXEC | _PAGE_PSE) +@@ -169,6 +169,7 @@ extern unsigned long long __PAGE_KERNEL, + #define PAGE_KERNEL __pgprot(__PAGE_KERNEL) + #define PAGE_KERNEL_RO __pgprot(__PAGE_KERNEL_RO) + #define PAGE_KERNEL_EXEC __pgprot(__PAGE_KERNEL_EXEC) ++#define PAGE_KERNEL_RX __pgprot(__PAGE_KERNEL_RX) + #define PAGE_KERNEL_NOCACHE __pgprot(__PAGE_KERNEL_NOCACHE) + #define PAGE_KERNEL_LARGE __pgprot(__PAGE_KERNEL_LARGE) + #define PAGE_KERNEL_LARGE_EXEC __pgprot(__PAGE_KERNEL_LARGE_EXEC) +@@ -271,7 +272,13 @@ static inline pte_t pte_mkhuge(pte_t pte + */ + #define pte_update(mm, addr, ptep) do { } while (0) + #define pte_update_defer(mm, addr, ptep) do { } while (0) +-#define paravirt_map_pt_hook(slot, va, pfn) do { } while (0) ++ ++/* local pte updates need not use xchg for locking */ ++static inline pte_t xen_local_ptep_get_and_clear(pte_t *ptep, pte_t res) ++{ ++ xen_set_pte(ptep, __pte(0)); ++ return res; ++} + + /* + * We only update the dirty/accessed state if we set +@@ -282,17 +289,34 @@ static inline pte_t pte_mkhuge(pte_t pte + */ + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ +-do { \ +- if (dirty) \ ++({ \ ++ int __changed = !pte_same(*(ptep), entry); \ ++ if (__changed && (dirty)) \ + ptep_establish(vma, address, ptep, entry); \ +-} while (0) ++ __changed; \ ++}) + +-/* +- * We don't actually have these, but we want to advertise them so that +- * we can encompass the flush here. 
+- */ + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY ++#define ptep_test_and_clear_dirty(vma, addr, ptep) ({ \ ++ int __ret = 0; \ ++ if (pte_dirty(*(ptep))) \ ++ __ret = test_and_clear_bit(_PAGE_BIT_DIRTY, \ ++ &(ptep)->pte_low); \ ++ if (__ret) \ ++ pte_update((vma)->vm_mm, addr, ptep); \ ++ __ret; \ ++}) ++ + #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG ++#define ptep_test_and_clear_young(vma, addr, ptep) ({ \ ++ int __ret = 0; \ ++ if (pte_young(*(ptep))) \ ++ __ret = test_and_clear_bit(_PAGE_BIT_ACCESSED, \ ++ &(ptep)->pte_low); \ ++ if (__ret) \ ++ pte_update((vma)->vm_mm, addr, ptep); \ ++ __ret; \ ++}) + + /* + * Rules for using ptep_establish: the pte MUST be a user pte, and +@@ -319,7 +343,7 @@ do { \ + int __dirty = pte_dirty(__pte); \ + __pte = pte_mkclean(__pte); \ + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ +- ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ ++ (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + else if (__dirty) \ + (ptep)->pte_low = __pte.pte_low; \ + __dirty; \ +@@ -332,7 +356,7 @@ do { \ + int __young = pte_young(__pte); \ + __pte = pte_mkold(__pte); \ + if (test_bit(PG_pinned, &virt_to_page((vma)->vm_mm->pgd)->flags)) \ +- ptep_set_access_flags(vma, address, ptep, __pte, __young); \ ++ (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + else if (__young) \ + (ptep)->pte_low = __pte.pte_low; \ + __young; \ +@@ -345,7 +369,7 @@ static inline pte_t ptep_get_and_clear(s + if (!pte_none(pte) + && (mm != &init_mm + || HYPERVISOR_update_va_mapping(addr, __pte(0), 0))) { +- pte = raw_ptep_get_and_clear(ptep, pte); ++ pte = xen_ptep_get_and_clear(ptep, pte); + pte_update(mm, addr, ptep); + } + return pte; +@@ -487,24 +511,10 @@ extern pte_t *lookup_address(unsigned lo + #endif + + #if defined(CONFIG_HIGHPTE) +-#define pte_offset_map(dir, address) \ +-({ \ +- pte_t *__ptep; \ +- unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ +- __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE0); \ +- paravirt_map_pt_hook(KM_PTE0,__ptep, pfn); \ +- __ptep = __ptep + pte_index(address); \ +- __ptep; \ +-}) +-#define pte_offset_map_nested(dir, address) \ +-({ \ +- pte_t *__ptep; \ +- unsigned pfn = pmd_val(*(dir)) >> PAGE_SHIFT; \ +- __ptep = (pte_t *)kmap_atomic_pte(pfn_to_page(pfn),KM_PTE1); \ +- paravirt_map_pt_hook(KM_PTE1,__ptep, pfn); \ +- __ptep = __ptep + pte_index(address); \ +- __ptep; \ +-}) ++#define pte_offset_map(dir, address) \ ++ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE0) + pte_index(address)) ++#define pte_offset_map_nested(dir, address) \ ++ ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)),KM_PTE1) + pte_index(address)) + #define pte_unmap(pte) kunmap_atomic(pte, KM_PTE0) + #define pte_unmap_nested(pte) kunmap_atomic(pte, KM_PTE1) + #else +@@ -574,10 +584,6 @@ int touch_pte_range(struct mm_struct *mm + #define io_remap_pfn_range(vma,from,pfn,size,prot) \ + direct_remap_pfn_range(vma,from,pfn,size,prot,DOMID_IO) + +-#define MK_IOSPACE_PFN(space, pfn) (pfn) +-#define GET_IOSPACE(pfn) 0 +-#define GET_PFN(pfn) (pfn) +- + #include <asm-generic/pgtable.h> + + #endif /* _I386_PGTABLE_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/processor.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/processor.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/processor.h 2007-10-22 13:58:57.000000000 +0200 +@@ -21,6 +21,7 @@ + #include <asm/percpu.h> + #include 
<linux/cpumask.h> + #include <linux/init.h> ++#include <asm/processor-flags.h> + #include <xen/interface/physdev.h> + + /* flag for disabling the tsc */ +@@ -118,7 +119,8 @@ extern char ignore_fpu_irq; + + void __init cpu_detect(struct cpuinfo_x86 *c); + +-extern void identify_cpu(struct cpuinfo_x86 *); ++extern void identify_boot_cpu(void); ++extern void identify_secondary_cpu(struct cpuinfo_x86 *); + extern void print_cpu_info(struct cpuinfo_x86 *); + extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); + extern unsigned short num_cache_leaves; +@@ -129,29 +131,8 @@ extern void detect_ht(struct cpuinfo_x86 + static inline void detect_ht(struct cpuinfo_x86 *c) {} + #endif + +-/* +- * EFLAGS bits +- */ +-#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +-#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +-#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +-#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +-#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +-#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +-#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +-#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +-#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +-#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +-#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +-#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +-#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +-#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +-#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +-#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +-#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +- +-static inline fastcall void xen_cpuid(unsigned int *eax, unsigned int *ebx, +- unsigned int *ecx, unsigned int *edx) ++static inline void xen_cpuid(unsigned int *eax, unsigned int *ebx, ++ unsigned int *ecx, unsigned int *edx) + { + /* ecx is often an input as well as an output. 
*/ + __asm__(XEN_CPUID +@@ -165,21 +146,6 @@ static inline fastcall void xen_cpuid(un + #define load_cr3(pgdir) write_cr3(__pa(pgdir)) + + /* +- * Intel CPU features in CR4 +- */ +-#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +-#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +-#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +-#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +-#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +-#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +-#define X86_CR4_MCE 0x0040 /* Machine check enable */ +-#define X86_CR4_PGE 0x0080 /* enable global pages */ +-#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +-#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ +-#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ +- +-/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up +@@ -206,26 +172,6 @@ static inline void clear_in_cr4 (unsigne + } + + /* +- * NSC/Cyrix CPU configuration register indexes +- */ +- +-#define CX86_PCR0 0x20 +-#define CX86_GCR 0xb8 +-#define CX86_CCR0 0xc0 +-#define CX86_CCR1 0xc1 +-#define CX86_CCR2 0xc2 +-#define CX86_CCR3 0xc3 +-#define CX86_CCR4 0xe8 +-#define CX86_CCR5 0xe9 +-#define CX86_CCR6 0xea +-#define CX86_CCR7 0xeb +-#define CX86_PCR1 0xf0 +-#define CX86_DIR0 0xfe +-#define CX86_DIR1 0xff +-#define CX86_ARR_BASE 0xc4 +-#define CX86_RCR_BASE 0xdc +- +-/* + * NSC/Cyrix CPU indexed register access macros + */ + +@@ -351,7 +297,8 @@ typedef struct { + struct thread_struct; + + #ifndef CONFIG_X86_NO_TSS +-struct tss_struct { ++/* This is the TSS defined by the hardware. */ ++struct i386_hw_tss { + unsigned short back_link,__blh; + unsigned long esp0; + unsigned short ss0,__ss0h; +@@ -375,6 +322,11 @@ struct tss_struct { + unsigned short gs, __gsh; + unsigned short ldt, __ldth; + unsigned short trace, io_bitmap_base; ++} __attribute__((packed)); ++ ++struct tss_struct { ++ struct i386_hw_tss x86_tss; ++ + /* + * The extra 1 is there because the CPU will access an + * additional byte beyond the end of the IO permission +@@ -428,10 +380,11 @@ struct thread_struct { + }; + + #define INIT_THREAD { \ ++ .esp0 = sizeof(init_stack) + (long)&init_stack, \ + .vm86_info = NULL, \ + .sysenter_cs = __KERNEL_CS, \ + .io_bitmap_ptr = NULL, \ +- .fs = __KERNEL_PDA, \ ++ .fs = __KERNEL_PERCPU, \ + } + + /* +@@ -441,10 +394,12 @@ struct thread_struct { + * be within the limit. + */ + #define INIT_TSS { \ +- .esp0 = sizeof(init_stack) + (long)&init_stack, \ +- .ss0 = __KERNEL_DS, \ +- .ss1 = __KERNEL_CS, \ +- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ ++ .x86_tss = { \ ++ .esp0 = sizeof(init_stack) + (long)&init_stack, \ ++ .ss0 = __KERNEL_DS, \ ++ .ss1 = __KERNEL_CS, \ ++ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ ++ }, \ + .io_bitmap = { [ 0 ... 
IO_BITMAP_LONGS] = ~0 }, \ + } + +@@ -551,36 +506,31 @@ static inline void rep_nop(void) + + #define cpu_relax() rep_nop() + +-#define paravirt_enabled() 0 +-#define __cpuid xen_cpuid +- + #ifndef CONFIG_X86_NO_TSS +-static inline void __load_esp0(struct tss_struct *tss, struct thread_struct *thread) ++static inline void native_load_esp0(struct tss_struct *tss, struct thread_struct *thread) + { +- tss->esp0 = thread->esp0; ++ tss->x86_tss.esp0 = thread->esp0; + /* This can only happen when SEP is enabled, no need to test "SEP"arately */ +- if (unlikely(tss->ss1 != thread->sysenter_cs)) { +- tss->ss1 = thread->sysenter_cs; ++ if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { ++ tss->x86_tss.ss1 = thread->sysenter_cs; + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); + } + } +-#define load_esp0(tss, thread) \ +- __load_esp0(tss, thread) + #else +-#define load_esp0(tss, thread) \ ++#define xen_load_esp0(tss, thread) \ + HYPERVISOR_stack_switch(__KERNEL_DS, (thread)->esp0) + #endif + + +-/* +- * These special macros can be used to get or set a debugging register +- */ +-#define get_debugreg(var, register) \ +- (var) = HYPERVISOR_get_debugreg((register)) +-#define set_debugreg(value, register) \ +- HYPERVISOR_set_debugreg((register), (value)) ++static inline unsigned long xen_get_debugreg(int regno) ++{ ++ return HYPERVISOR_get_debugreg(regno); ++} + +-#define set_iopl_mask xen_set_iopl_mask ++static inline void xen_set_debugreg(int regno, unsigned long value) ++{ ++ HYPERVISOR_set_debugreg(regno, value); ++} + + /* + * Set IOPL bits in EFLAGS from given mask +@@ -595,6 +545,21 @@ static inline void xen_set_iopl_mask(uns + } + + ++#define paravirt_enabled() 0 ++#define __cpuid xen_cpuid ++ ++#define load_esp0 xen_load_esp0 ++ ++/* ++ * These special macros can be used to get or set a debugging register ++ */ ++#define get_debugreg(var, register) \ ++ (var) = xen_get_debugreg(register) ++#define set_debugreg(value, register) \ ++ xen_set_debugreg(register, value) ++ ++#define set_iopl_mask xen_set_iopl_mask ++ + /* + * Generic CPUID function + * clear %ecx since some cpus (Cyrix MII) do not set or clear %ecx +@@ -747,8 +712,14 @@ extern unsigned long boot_option_idle_ov + extern void enable_sep_cpu(void); + extern int sysenter_setup(void); + +-extern int init_gdt(int cpu, struct task_struct *idle); ++/* Defined in head.S */ ++extern struct Xgt_desc_struct early_gdt_descr; ++ + extern void cpu_set_gdt(int); +-extern void secondary_cpu_init(void); ++extern void switch_to_new_gdt(void); ++extern void cpu_init(void); ++extern void init_gdt(int cpu); ++ ++extern int force_mwait; + + #endif /* __ASM_I386_PROCESSOR_H */ +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/scatterlist.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/scatterlist.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/scatterlist.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,6 +1,8 @@ + #ifndef _I386_SCATTERLIST_H + #define _I386_SCATTERLIST_H + ++#include <asm/types.h> ++ + struct scatterlist { + struct page *page; + unsigned int offset; +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/segment.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/segment.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/segment.h 2007-10-22 13:58:57.000000000 +0200 +@@ -39,7 +39,7 @@ + * 25 - APM BIOS 
support + * + * 26 - ESPFIX small SS +- * 27 - PDA [ per-cpu private data area ] ++ * 27 - per-cpu [ offset to per-cpu data area ] + * 28 - unused + * 29 - unused + * 30 - unused +@@ -74,8 +74,12 @@ + #define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) + #define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) + +-#define GDT_ENTRY_PDA (GDT_ENTRY_KERNEL_BASE + 15) +-#define __KERNEL_PDA (GDT_ENTRY_PDA * 8) ++#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) ++#ifdef CONFIG_SMP ++#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) ++#else ++#define __KERNEL_PERCPU 0 ++#endif + + #define GDT_ENTRY_DOUBLEFAULT_TSS 31 + +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/smp.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/smp.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/smp.h 2007-10-22 13:58:57.000000000 +0200 +@@ -8,19 +8,15 @@ + #include <linux/kernel.h> + #include <linux/threads.h> + #include <linux/cpumask.h> +-#include <asm/pda.h> + #endif + +-#ifdef CONFIG_X86_LOCAL_APIC +-#ifndef __ASSEMBLY__ +-#include <asm/fixmap.h> ++#if defined(CONFIG_X86_LOCAL_APIC) && !defined(__ASSEMBLY__) + #include <asm/bitops.h> + #include <asm/mpspec.h> ++#include <asm/apic.h> + #ifdef CONFIG_X86_IO_APIC + #include <asm/io_apic.h> + #endif +-#include <asm/apic.h> +-#endif + #endif + + #define BAD_APICID 0xFFu +@@ -52,9 +48,76 @@ extern void cpu_exit_clear(void); + extern void cpu_uninit(void); + #endif + +-#ifndef CONFIG_PARAVIRT ++#ifndef CONFIG_XEN ++struct smp_ops ++{ ++ void (*smp_prepare_boot_cpu)(void); ++ void (*smp_prepare_cpus)(unsigned max_cpus); ++ int (*cpu_up)(unsigned cpu); ++ void (*smp_cpus_done)(unsigned max_cpus); ++ ++ void (*smp_send_stop)(void); ++ void (*smp_send_reschedule)(int cpu); ++ int (*smp_call_function_mask)(cpumask_t mask, ++ void (*func)(void *info), void *info, ++ int wait); ++}; ++ ++extern struct smp_ops smp_ops; ++ ++static inline void smp_prepare_boot_cpu(void) ++{ ++ smp_ops.smp_prepare_boot_cpu(); ++} ++static inline void smp_prepare_cpus(unsigned int max_cpus) ++{ ++ smp_ops.smp_prepare_cpus(max_cpus); ++} ++static inline int __cpu_up(unsigned int cpu) ++{ ++ return smp_ops.cpu_up(cpu); ++} ++static inline void smp_cpus_done(unsigned int max_cpus) ++{ ++ smp_ops.smp_cpus_done(max_cpus); ++} ++ ++static inline void smp_send_stop(void) ++{ ++ smp_ops.smp_send_stop(); ++} ++static inline void smp_send_reschedule(int cpu) ++{ ++ smp_ops.smp_send_reschedule(cpu); ++} ++static inline int smp_call_function_mask(cpumask_t mask, ++ void (*func) (void *info), void *info, ++ int wait) ++{ ++ return smp_ops.smp_call_function_mask(mask, func, info, wait); ++} ++ ++void native_smp_prepare_boot_cpu(void); ++void native_smp_prepare_cpus(unsigned int max_cpus); ++int native_cpu_up(unsigned int cpunum); ++void native_smp_cpus_done(unsigned int max_cpus); ++ + #define startup_ipi_hook(phys_apicid, start_eip, start_esp) \ + do { } while (0) ++ ++#else ++ ++ ++void xen_smp_send_stop(void); ++void xen_smp_send_reschedule(int cpu); ++int xen_smp_call_function_mask(cpumask_t mask, ++ void (*func) (void *info), void *info, ++ int wait); ++ ++#define smp_send_stop xen_smp_send_stop ++#define smp_send_reschedule xen_smp_send_reschedule ++#define smp_call_function_mask xen_smp_call_function_mask ++ + #endif + + /* +@@ -62,7 +125,8 @@ do { } while (0) + * from the initial startup. We map APIC_BASE very early in page_setup(), + * so this is correct in the x86 case. 
+ */ +-#define raw_smp_processor_id() (read_pda(cpu_number)) ++DECLARE_PER_CPU(int, cpu_number); ++#define raw_smp_processor_id() (x86_read_percpu(cpu_number)) + + extern cpumask_t cpu_possible_map; + #define cpu_callin_map cpu_possible_map +@@ -73,20 +137,6 @@ static inline int num_booting_cpus(void) + return cpus_weight(cpu_possible_map); + } + +-#ifdef CONFIG_X86_LOCAL_APIC +- +-#ifdef APIC_DEFINITION +-extern int hard_smp_processor_id(void); +-#else +-#include <mach_apicdef.h> +-static inline int hard_smp_processor_id(void) +-{ +- /* we don't want to mark this access volatile - bad code generation */ +- return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); +-} +-#endif +-#endif +- + extern int safe_smp_processor_id(void); + extern int __cpu_disable(void); + extern void __cpu_die(unsigned int cpu); +@@ -102,10 +152,31 @@ extern unsigned int num_processors; + + #define NO_PROC_ID 0xFF /* No processor magic marker */ + +-#endif ++#endif /* CONFIG_SMP */ + + #ifndef __ASSEMBLY__ + ++#ifdef CONFIG_X86_LOCAL_APIC ++ ++#ifdef APIC_DEFINITION ++extern int hard_smp_processor_id(void); ++#else ++#include <mach_apicdef.h> ++static inline int hard_smp_processor_id(void) ++{ ++ /* we don't want to mark this access volatile - bad code generation */ ++ return GET_APIC_ID(*(unsigned long *)(APIC_BASE+APIC_ID)); ++} ++#endif /* APIC_DEFINITION */ ++ ++#else /* CONFIG_X86_LOCAL_APIC */ ++ ++#ifndef CONFIG_SMP ++#define hard_smp_processor_id() 0 ++#endif ++ ++#endif /* CONFIG_X86_LOCAL_APIC */ ++ + extern u8 apicid_2_node[]; + + #ifdef CONFIG_X86_LOCAL_APIC +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/system.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/system.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/system.h 2007-10-22 13:58:57.000000000 +0200 +@@ -4,7 +4,7 @@ + #include <linux/kernel.h> + #include <asm/segment.h> + #include <asm/cpufeature.h> +-#include <linux/bitops.h> /* for LOCK_PREFIX */ ++#include <asm/cmpxchg.h> + #include <asm/synch_bitops.h> + #include <asm/hypervisor.h> + +@@ -90,308 +90,102 @@ __asm__ __volatile__ ("movw %%dx,%1\n\t" + #define savesegment(seg, value) \ + asm volatile("mov %%" #seg ",%0":"=rm" (value)) + +-#define read_cr0() ({ \ +- unsigned int __dummy; \ +- __asm__ __volatile__( \ +- "movl %%cr0,%0\n\t" \ +- :"=r" (__dummy)); \ +- __dummy; \ +-}) +-#define write_cr0(x) \ +- __asm__ __volatile__("movl %0,%%cr0": :"r" (x)) +- +-#define read_cr2() (current_vcpu_info()->arch.cr2) +-#define write_cr2(x) \ +- __asm__ __volatile__("movl %0,%%cr2": :"r" (x)) +- +-#define read_cr3() ({ \ +- unsigned int __dummy; \ +- __asm__ ( \ +- "movl %%cr3,%0\n\t" \ +- :"=r" (__dummy)); \ +- __dummy = xen_cr3_to_pfn(__dummy); \ +- mfn_to_pfn(__dummy) << PAGE_SHIFT; \ +-}) +-#define write_cr3(x) ({ \ +- unsigned int __dummy = pfn_to_mfn((x) >> PAGE_SHIFT); \ +- __dummy = xen_pfn_to_cr3(__dummy); \ +- __asm__ __volatile__("movl %0,%%cr3": :"r" (__dummy)); \ +-}) +-#define read_cr4() ({ \ +- unsigned int __dummy; \ +- __asm__( \ +- "movl %%cr4,%0\n\t" \ +- :"=r" (__dummy)); \ +- __dummy; \ +-}) +-#define read_cr4_safe() ({ \ +- unsigned int __dummy; \ +- /* This could fault if %cr4 does not exist */ \ +- __asm__("1: movl %%cr4, %0 \n" \ +- "2: \n" \ +- ".section __ex_table,\"a\" \n" \ +- ".long 1b,2b \n" \ +- ".previous \n" \ +- : "=r" (__dummy): "0" (0)); \ +- __dummy; \ +-}) +- +-#define write_cr4(x) \ +- __asm__ __volatile__("movl %0,%%cr4": :"r" (x)) +- 
+-#define wbinvd() \ +- __asm__ __volatile__ ("wbinvd": : :"memory") +- +-/* Clear the 'TS' bit */ +-#define clts() (HYPERVISOR_fpu_taskswitch(0)) +- +-/* Set the 'TS' bit */ +-#define stts() (HYPERVISOR_fpu_taskswitch(1)) +- +-#endif /* __KERNEL__ */ +- +-static inline unsigned long get_limit(unsigned long segment) ++static inline void xen_clts(void) + { +- unsigned long __limit; +- __asm__("lsll %1,%0" +- :"=r" (__limit):"r" (segment)); +- return __limit+1; ++ HYPERVISOR_fpu_taskswitch(0); + } + +-#define nop() __asm__ __volatile__ ("nop") +- +-#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +- +-#define tas(ptr) (xchg((ptr),1)) +- +-struct __xchg_dummy { unsigned long a[100]; }; +-#define __xg(x) ((struct __xchg_dummy *)(x)) ++static inline unsigned long xen_read_cr0(void) ++{ ++ unsigned long val; ++ asm volatile("movl %%cr0,%0\n\t" :"=r" (val)); ++ return val; ++} + ++static inline void xen_write_cr0(unsigned long val) ++{ ++ asm volatile("movl %0,%%cr0": :"r" (val)); ++} + +-#ifdef CONFIG_X86_CMPXCHG64 ++#define xen_read_cr2() (current_vcpu_info()->arch.cr2) + +-/* +- * The semantics of XCHGCMP8B are a bit strange, this is why +- * there is a loop and the loading of %%eax and %%edx has to +- * be inside. This inlines well in most cases, the cached +- * cost is around ~38 cycles. (in the future we might want +- * to do an SIMD/3DNOW!/MMX/FPU 64-bit store here, but that +- * might have an implicit FPU-save as a cost, so it's not +- * clear which path to go.) +- * +- * cmpxchg8b must be used with the lock prefix here to allow +- * the instruction to be executed atomically, see page 3-102 +- * of the instruction set reference 24319102.pdf. We need +- * the reader side to see the coherent 64bit value. +- */ +-static inline void __set_64bit (unsigned long long * ptr, +- unsigned int low, unsigned int high) ++static inline void xen_write_cr2(unsigned long val) + { +- __asm__ __volatile__ ( +- "\n1:\t" +- "movl (%0), %%eax\n\t" +- "movl 4(%0), %%edx\n\t" +- "lock cmpxchg8b (%0)\n\t" +- "jnz 1b" +- : /* no outputs */ +- : "D"(ptr), +- "b"(low), +- "c"(high) +- : "ax","dx","memory"); ++ asm volatile("movl %0,%%cr2": :"r" (val)); + } + +-static inline void __set_64bit_constant (unsigned long long *ptr, +- unsigned long long value) ++static inline unsigned long xen_read_cr3(void) + { +- __set_64bit(ptr,(unsigned int)(value), (unsigned int)((value)>>32ULL)); ++ unsigned long val; ++ asm volatile("movl %%cr3,%0\n\t" :"=r" (val)); ++ return mfn_to_pfn(xen_cr3_to_pfn(val)) << PAGE_SHIFT; + } +-#define ll_low(x) *(((unsigned int*)&(x))+0) +-#define ll_high(x) *(((unsigned int*)&(x))+1) + +-static inline void __set_64bit_var (unsigned long long *ptr, +- unsigned long long value) ++static inline void xen_write_cr3(unsigned long val) + { +- __set_64bit(ptr,ll_low(value), ll_high(value)); ++ val = xen_pfn_to_cr3(pfn_to_mfn(val >> PAGE_SHIFT)); ++ asm volatile("movl %0,%%cr3": :"r" (val)); + } + +-#define set_64bit(ptr,value) \ +-(__builtin_constant_p(value) ? \ +- __set_64bit_constant(ptr, value) : \ +- __set_64bit_var(ptr, value) ) +- +-#define _set_64bit(ptr,value) \ +-(__builtin_constant_p(value) ? 
\ +- __set_64bit(ptr, (unsigned int)(value), (unsigned int)((value)>>32ULL) ) : \ +- __set_64bit(ptr, ll_low(value), ll_high(value)) ) +- +-#endif +- +-/* +- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway +- * Note 2: xchg has side effect, so that attribute volatile is necessary, +- * but generally the primitive is invalid, *ptr is output argument. --ANK +- */ +-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) ++static inline unsigned long xen_read_cr4(void) + { +- switch (size) { +- case 1: +- __asm__ __volatile__("xchgb %b0,%1" +- :"=q" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- case 2: +- __asm__ __volatile__("xchgw %w0,%1" +- :"=r" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- case 4: +- __asm__ __volatile__("xchgl %0,%1" +- :"=r" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- } +- return x; ++ unsigned long val; ++ asm volatile("movl %%cr4,%0\n\t" :"=r" (val)); ++ return val; + } + +-/* +- * Atomic compare and exchange. Compare OLD with MEM, if identical, +- * store NEW in MEM. Return the initial value in MEM. Success is +- * indicated by comparing RETURN with OLD. +- */ +- +-#ifdef CONFIG_X86_CMPXCHG +-#define __HAVE_ARCH_CMPXCHG 1 +-#define cmpxchg(ptr,o,n)\ +- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ +- (unsigned long)(n),sizeof(*(ptr)))) +-#define sync_cmpxchg(ptr,o,n)\ +- ((__typeof__(*(ptr)))__sync_cmpxchg((ptr),(unsigned long)(o),\ +- (unsigned long)(n),sizeof(*(ptr)))) +-#endif ++static inline unsigned long xen_read_cr4_safe(void) ++{ ++ unsigned long val; ++ /* This could fault if %cr4 does not exist */ ++ asm("1: movl %%cr4, %0 \n" ++ "2: \n" ++ ".section __ex_table,\"a\" \n" ++ ".long 1b,2b \n" ++ ".previous \n" ++ : "=r" (val): "0" (0)); ++ return val; ++} + +-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, +- unsigned long new, int size) ++static inline void xen_write_cr4(unsigned long val) + { +- unsigned long prev; +- switch (size) { +- case 1: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" +- : "=a"(prev) +- : "q"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 2: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 4: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- } +- return old; ++ asm volatile("movl %0,%%cr4": :"r" (val)); + } + +-/* +- * Always use locked operations when touching memory shared with a +- * hypervisor, since the system may be SMP even if the guest kernel +- * isn't. +- */ +-static inline unsigned long __sync_cmpxchg(volatile void *ptr, +- unsigned long old, +- unsigned long new, int size) +-{ +- unsigned long prev; +- switch (size) { +- case 1: +- __asm__ __volatile__("lock; cmpxchgb %b1,%2" +- : "=a"(prev) +- : "q"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 2: +- __asm__ __volatile__("lock; cmpxchgw %w1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 4: +- __asm__ __volatile__("lock; cmpxchgl %1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- } +- return old; ++static inline void xen_wbinvd(void) ++{ ++ asm volatile("wbinvd": : :"memory"); + } + +-#ifndef CONFIG_X86_CMPXCHG +-/* +- * Building a kernel capable running on 80386. 
It may be necessary to +- * simulate the cmpxchg on the 80386 CPU. For that purpose we define +- * a function for each of the sizes we support. +- */ ++#define read_cr0() (xen_read_cr0()) ++#define write_cr0(x) (xen_write_cr0(x)) ++#define read_cr2() (xen_read_cr2()) ++#define write_cr2(x) (xen_write_cr2(x)) ++#define read_cr3() (xen_read_cr3()) ++#define write_cr3(x) (xen_write_cr3(x)) ++#define read_cr4() (xen_read_cr4()) ++#define read_cr4_safe() (xen_read_cr4_safe()) ++#define write_cr4(x) (xen_write_cr4(x)) ++#define wbinvd() (xen_wbinvd()) + +-extern unsigned long cmpxchg_386_u8(volatile void *, u8, u8); +-extern unsigned long cmpxchg_386_u16(volatile void *, u16, u16); +-extern unsigned long cmpxchg_386_u32(volatile void *, u32, u32); +- +-static inline unsigned long cmpxchg_386(volatile void *ptr, unsigned long old, +- unsigned long new, int size) +-{ +- switch (size) { +- case 1: +- return cmpxchg_386_u8(ptr, old, new); +- case 2: +- return cmpxchg_386_u16(ptr, old, new); +- case 4: +- return cmpxchg_386_u32(ptr, old, new); +- } +- return old; +-} +- +-#define cmpxchg(ptr,o,n) \ +-({ \ +- __typeof__(*(ptr)) __ret; \ +- if (likely(boot_cpu_data.x86 > 3)) \ +- __ret = __cmpxchg((ptr), (unsigned long)(o), \ +- (unsigned long)(n), sizeof(*(ptr))); \ +- else \ +- __ret = cmpxchg_386((ptr), (unsigned long)(o), \ +- (unsigned long)(n), sizeof(*(ptr))); \ +- __ret; \ +-}) +-#endif ++/* Clear the 'TS' bit */ ++#define clts() (xen_clts()) + +-#ifdef CONFIG_X86_CMPXCHG64 ++/* Set the 'TS' bit */ ++#define stts() (HYPERVISOR_fpu_taskswitch(1)) + +-static inline unsigned long long __cmpxchg64(volatile void *ptr, unsigned long long old, +- unsigned long long new) ++#endif /* __KERNEL__ */ ++ ++static inline unsigned long get_limit(unsigned long segment) + { +- unsigned long long prev; +- __asm__ __volatile__(LOCK_PREFIX "cmpxchg8b %3" +- : "=A"(prev) +- : "b"((unsigned long)new), +- "c"((unsigned long)(new >> 32)), +- "m"(*__xg(ptr)), +- "0"(old) +- : "memory"); +- return prev; +-} +- +-#define cmpxchg64(ptr,o,n)\ +- ((__typeof__(*(ptr)))__cmpxchg64((ptr),(unsigned long long)(o),\ +- (unsigned long long)(n))) ++ unsigned long __limit; ++ __asm__("lsll %1,%0" ++ :"=r" (__limit):"r" (segment)); ++ return __limit+1; ++} ++ ++#define nop() __asm__ __volatile__ ("nop") + +-#endif +- + /* + * Force strict CPU ordering. + * And yes, this is required on UP too when we're talking +Index: 10.3-2007-11-26/include/asm-i386/mach-xen/asm/tlbflush.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-i386/mach-xen/asm/tlbflush.h 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-i386/mach-xen/asm/tlbflush.h 2007-10-22 13:58:57.000000000 +0200 +@@ -24,13 +24,19 @@ + * - flush_tlb_range(vma, start, end) flushes a range of pages + * - flush_tlb_kernel_range(start, end) flushes a range of kernel pages + * - flush_tlb_pgtables(mm, start, end) flushes a range of page tables ++ * - flush_tlb_others(cpumask, mm, va) flushes a TLBs on other cpus + * + * ..but the i386 has somewhat limited tlb flushing capabilities, + * and page-granular flushes are available only on i486 and up. 
+ */ + ++#define TLB_FLUSH_ALL 0xffffffff ++ ++ + #ifndef CONFIG_SMP + ++#include <linux/sched.h> ++ + #define flush_tlb() __flush_tlb() + #define flush_tlb_all() __flush_tlb_all() + #define local_flush_tlb() __flush_tlb() +@@ -55,7 +61,12 @@ static inline void flush_tlb_range(struc + __flush_tlb(); + } + +-#else ++static inline void xen_flush_tlb_others(const cpumask_t *cpumask, ++ struct mm_struct *mm, unsigned long va) ++{ ++} ++ ++#else /* SMP */ + + #include <asm/smp.h> + +@@ -74,6 +85,9 @@ static inline void flush_tlb_range(struc + flush_tlb_mm(vma->vm_mm); + } + ++void xen_flush_tlb_others(const cpumask_t *cpumask, struct mm_struct *mm, ++ unsigned long va); ++ + #define TLBSTATE_OK 1 + #define TLBSTATE_LAZY 2 + +@@ -84,9 +98,10 @@ struct tlb_state + char __cacheline_padding[L1_CACHE_BYTES-8]; + }; + DECLARE_PER_CPU(struct tlb_state, cpu_tlbstate); ++#endif /* SMP */ + +- +-#endif ++#define flush_tlb_others(mask, mm, va) \ ++ xen_flush_tlb_others(&mask, mm, va) + + #define flush_tlb_kernel_range(start, end) flush_tlb_all() + +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/agp.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/agp.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/agp.h 2007-10-22 13:58:57.000000000 +0200 +@@ -11,8 +11,15 @@ + * with different cachability attributes for the same page. + */ + +-int map_page_into_agp(struct page *page); +-int unmap_page_from_agp(struct page *page); ++/* Caller's responsibility to call global_flush_tlb() for ++ * performance reasons */ ++#define map_page_into_agp(page) ( \ ++ xen_create_contiguous_region((unsigned long)page_address(page), 0, 32) \ ++ ?: change_page_attr(page, 1, PAGE_KERNEL_NOCACHE)) ++#define unmap_page_from_agp(page) ( \ ++ xen_destroy_contiguous_region((unsigned long)page_address(page), 0), \ ++ /* only a fallback: xen_destroy_contiguous_region uses PAGE_KERNEL */ \ ++ change_page_attr(page, 1, PAGE_KERNEL)) + #define flush_agp_mappings() global_flush_tlb() + + /* Could use CLFLUSH here if the cpu supports it. But then it would +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/desc.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/desc.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/desc.h 2007-10-22 13:58:57.000000000 +0200 +@@ -127,16 +127,6 @@ static inline void set_ldt_desc(unsigned + DESC_LDT, size * 8 - 1); + } + +-static inline void set_seg_base(unsigned cpu, int entry, void *base) +-{ +- struct desc_struct *d = &cpu_gdt(cpu)[entry]; +- u32 addr = (u32)(u64)base; +- BUG_ON((u64)base >> 32); +- d->base0 = addr & 0xffff; +- d->base1 = (addr >> 16) & 0xff; +- d->base2 = (addr >> 24) & 0xff; +-} +- + #define LDT_entry_a(info) \ + ((((info)->base_addr & 0x0000ffff) << 16) | ((info)->limit & 0x0ffff)) + /* Don't allow setting of the lm bit. It is useless anyways because +@@ -165,23 +155,13 @@ static inline void set_seg_base(unsigned + (info)->useable == 0 && \ + (info)->lm == 0) + +-#if TLS_SIZE != 24 +-# error update this code. 
+-#endif +- + static inline void load_TLS(struct thread_struct *t, unsigned int cpu) + { +-#if 0 ++ unsigned int i; + u64 *gdt = (u64 *)(cpu_gdt(cpu) + GDT_ENTRY_TLS_MIN); +- gdt[0] = t->tls_array[0]; +- gdt[1] = t->tls_array[1]; +- gdt[2] = t->tls_array[2]; +-#endif +-#define C(i) \ +- HYPERVISOR_update_descriptor(virt_to_machine(&cpu_gdt(cpu)[GDT_ENTRY_TLS_MIN + i]), t->tls_array[i]) + +- C(0); C(1); C(2); +-#undef C ++ for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) ++ HYPERVISOR_update_descriptor(virt_to_machine(&gdt[i]), t->tls_array[i]); + } + + /* +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/dma-mapping.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/dma-mapping.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/dma-mapping.h 2007-10-22 13:58:57.000000000 +0200 +@@ -51,7 +51,7 @@ struct dma_mapping_ops { + }; + + extern dma_addr_t bad_dma_address; +-extern struct dma_mapping_ops* dma_ops; ++extern const struct dma_mapping_ops* dma_ops; + extern int iommu_merge; + + #if 0 +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/fixmap.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/fixmap.h 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/fixmap.h 2007-10-22 13:58:57.000000000 +0200 +@@ -15,7 +15,6 @@ + #include <asm/apicdef.h> + #include <asm/page.h> + #include <asm/vsyscall.h> +-#include <asm/vsyscall32.h> + #include <asm/acpi.h> + + /* +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/irqflags.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/irqflags.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/irqflags.h 2007-10-22 13:58:57.000000000 +0200 +@@ -9,6 +9,7 @@ + */ + #ifndef _ASM_IRQFLAGS_H + #define _ASM_IRQFLAGS_H ++#include <asm/processor-flags.h> + + #ifndef __ASSEMBLY__ + /* +@@ -50,19 +51,19 @@ static inline void raw_local_irq_disable + { + unsigned long flags = __raw_local_save_flags(); + +- raw_local_irq_restore((flags & ~(1 << 9)) | (1 << 18)); ++ raw_local_irq_restore((flags & ~X86_EFLAGS_IF) | X86_EFLAGS_AC); + } + + static inline void raw_local_irq_enable(void) + { + unsigned long flags = __raw_local_save_flags(); + +- raw_local_irq_restore((flags | (1 << 9)) & ~(1 << 18)); ++ raw_local_irq_restore((flags | X86_EFLAGS_IF) & (~X86_EFLAGS_AC)); + } + + static inline int raw_irqs_disabled_flags(unsigned long flags) + { +- return !(flags & (1<<9)) || (flags & (1 << 18)); ++ return !(flags & X86_EFLAGS_IF) || (flags & X86_EFLAGS_AC); + } + + #else /* CONFIG_X86_VSMP */ +@@ -118,13 +119,21 @@ static inline int raw_irqs_disabled_flag + * Used in the idle loop; sti takes one instruction cycle + * to complete: + */ +-void raw_safe_halt(void); ++void xen_safe_halt(void); ++static inline void raw_safe_halt(void) ++{ ++ xen_safe_halt(); ++} + + /* + * Used when interrupts are already enabled or to + * shutdown the processor: + */ +-void halt(void); ++void xen_halt(void); ++static inline void halt(void) ++{ ++ xen_halt(); ++} + + #else /* __ASSEMBLY__: */ + # ifdef CONFIG_TRACE_IRQFLAGS +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/mmu.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/mmu.h 2007-12-06 17:27:30.000000000 
+0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/mmu.h 2007-10-22 13:58:57.000000000 +0200 +@@ -25,14 +25,6 @@ typedef struct { + #ifdef CONFIG_XEN + extern struct list_head mm_unpinned; + extern spinlock_t mm_unpinned_lock; +- +-/* mm/memory.c:exit_mmap hook */ +-extern void _arch_exit_mmap(struct mm_struct *mm); +-#define arch_exit_mmap(_mm) _arch_exit_mmap(_mm) +- +-/* kernel/fork.c:dup_mmap hook */ +-extern void _arch_dup_mmap(struct mm_struct *mm); +-#define arch_dup_mmap(mm, oldmm) ((void)(oldmm), _arch_dup_mmap(mm)) + #endif + + #endif +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/mmu_context.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/mmu_context.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/mmu_context.h 2007-10-22 13:58:57.000000000 +0200 +@@ -9,6 +9,9 @@ + #include <asm/pgtable.h> + #include <asm/tlbflush.h> + ++void arch_exit_mmap(struct mm_struct *mm); ++void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm); ++ + /* + * possibly do the LDT unload here? + */ +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/msr.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/msr.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/msr.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,7 +1,10 @@ + #ifndef X86_64_MSR_H + #define X86_64_MSR_H 1 + ++#include <asm/msr-index.h> ++ + #ifndef __ASSEMBLY__ ++#include <linux/errno.h> + /* + * Access to machine-specific registers (available on 586 and better only) + * Note: the rd* operations modify the parameters directly (without using +@@ -157,12 +160,11 @@ static inline unsigned int cpuid_edx(uns + return edx; + } + +-#define MSR_IA32_UCODE_WRITE 0x79 +-#define MSR_IA32_UCODE_REV 0x8b +- + #ifdef CONFIG_SMP + void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); + void wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); ++int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); ++int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); + #else /* CONFIG_SMP */ + static inline void rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) + { +@@ -172,269 +174,14 @@ static inline void wrmsr_on_cpu(unsigned + { + wrmsr(msr_no, l, h); + } +-#endif /* CONFIG_SMP */ +- +-#endif +- +-/* AMD/K8 specific MSRs */ +-#define MSR_EFER 0xc0000080 /* extended feature register */ +-#define MSR_STAR 0xc0000081 /* legacy mode SYSCALL target */ +-#define MSR_LSTAR 0xc0000082 /* long mode SYSCALL target */ +-#define MSR_CSTAR 0xc0000083 /* compatibility mode SYSCALL target */ +-#define MSR_SYSCALL_MASK 0xc0000084 /* EFLAGS mask for syscall */ +-#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ +-#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ +-#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow (or USER_GS from kernel) */ +-/* EFER bits: */ +-#define _EFER_SCE 0 /* SYSCALL/SYSRET */ +-#define _EFER_LME 8 /* Long mode enable */ +-#define _EFER_LMA 10 /* Long mode active (read-only) */ +-#define _EFER_NX 11 /* No execute enable */ +- +-#define EFER_SCE (1<<_EFER_SCE) +-#define EFER_LME (1<<_EFER_LME) +-#define EFER_LMA (1<<_EFER_LMA) +-#define EFER_NX (1<<_EFER_NX) +- +-/* Intel MSRs. 
Some also available on other CPUs */ +-#define MSR_IA32_TSC 0x10 +-#define MSR_IA32_PLATFORM_ID 0x17 +- +-#define MSR_IA32_PERFCTR0 0xc1 +-#define MSR_IA32_PERFCTR1 0xc2 +-#define MSR_FSB_FREQ 0xcd +- +-#define MSR_MTRRcap 0x0fe +-#define MSR_IA32_BBL_CR_CTL 0x119 +- +-#define MSR_IA32_SYSENTER_CS 0x174 +-#define MSR_IA32_SYSENTER_ESP 0x175 +-#define MSR_IA32_SYSENTER_EIP 0x176 +- +-#define MSR_IA32_MCG_CAP 0x179 +-#define MSR_IA32_MCG_STATUS 0x17a +-#define MSR_IA32_MCG_CTL 0x17b +- +-#define MSR_IA32_EVNTSEL0 0x186 +-#define MSR_IA32_EVNTSEL1 0x187 +- +-#define MSR_IA32_DEBUGCTLMSR 0x1d9 +-#define MSR_IA32_LASTBRANCHFROMIP 0x1db +-#define MSR_IA32_LASTBRANCHTOIP 0x1dc +-#define MSR_IA32_LASTINTFROMIP 0x1dd +-#define MSR_IA32_LASTINTTOIP 0x1de +- +-#define MSR_IA32_PEBS_ENABLE 0x3f1 +-#define MSR_IA32_DS_AREA 0x600 +-#define MSR_IA32_PERF_CAPABILITIES 0x345 +- +-#define MSR_MTRRfix64K_00000 0x250 +-#define MSR_MTRRfix16K_80000 0x258 +-#define MSR_MTRRfix16K_A0000 0x259 +-#define MSR_MTRRfix4K_C0000 0x268 +-#define MSR_MTRRfix4K_C8000 0x269 +-#define MSR_MTRRfix4K_D0000 0x26a +-#define MSR_MTRRfix4K_D8000 0x26b +-#define MSR_MTRRfix4K_E0000 0x26c +-#define MSR_MTRRfix4K_E8000 0x26d +-#define MSR_MTRRfix4K_F0000 0x26e +-#define MSR_MTRRfix4K_F8000 0x26f +-#define MSR_MTRRdefType 0x2ff +- +-#define MSR_IA32_MC0_CTL 0x400 +-#define MSR_IA32_MC0_STATUS 0x401 +-#define MSR_IA32_MC0_ADDR 0x402 +-#define MSR_IA32_MC0_MISC 0x403 +- +-#define MSR_P6_PERFCTR0 0xc1 +-#define MSR_P6_PERFCTR1 0xc2 +-#define MSR_P6_EVNTSEL0 0x186 +-#define MSR_P6_EVNTSEL1 0x187 +- +-/* K7/K8 MSRs. Not complete. See the architecture manual for a more complete list. */ +-#define MSR_K7_EVNTSEL0 0xC0010000 +-#define MSR_K7_PERFCTR0 0xC0010004 +-#define MSR_K7_EVNTSEL1 0xC0010001 +-#define MSR_K7_PERFCTR1 0xC0010005 +-#define MSR_K7_EVNTSEL2 0xC0010002 +-#define MSR_K7_PERFCTR2 0xC0010006 +-#define MSR_K7_EVNTSEL3 0xC0010003 +-#define MSR_K7_PERFCTR3 0xC0010007 +-#define MSR_K8_TOP_MEM1 0xC001001A +-#define MSR_K8_TOP_MEM2 0xC001001D +-#define MSR_K8_SYSCFG 0xC0010010 +-#define MSR_K8_HWCR 0xC0010015 +- +-/* K6 MSRs */ +-#define MSR_K6_EFER 0xC0000080 +-#define MSR_K6_STAR 0xC0000081 +-#define MSR_K6_WHCR 0xC0000082 +-#define MSR_K6_UWCCR 0xC0000085 +-#define MSR_K6_PSOR 0xC0000087 +-#define MSR_K6_PFIR 0xC0000088 +- +-/* Centaur-Hauls/IDT defined MSRs. */ +-#define MSR_IDT_FCR1 0x107 +-#define MSR_IDT_FCR2 0x108 +-#define MSR_IDT_FCR3 0x109 +-#define MSR_IDT_FCR4 0x10a +- +-#define MSR_IDT_MCR0 0x110 +-#define MSR_IDT_MCR1 0x111 +-#define MSR_IDT_MCR2 0x112 +-#define MSR_IDT_MCR3 0x113 +-#define MSR_IDT_MCR4 0x114 +-#define MSR_IDT_MCR5 0x115 +-#define MSR_IDT_MCR6 0x116 +-#define MSR_IDT_MCR7 0x117 +-#define MSR_IDT_MCR_CTRL 0x120 +- +-/* VIA Cyrix defined MSRs*/ +-#define MSR_VIA_FCR 0x1107 +-#define MSR_VIA_LONGHAUL 0x110a +-#define MSR_VIA_RNG 0x110b +-#define MSR_VIA_BCR2 0x1147 +- +-/* Intel defined MSRs. 
*/ +-#define MSR_IA32_P5_MC_ADDR 0 +-#define MSR_IA32_P5_MC_TYPE 1 +-#define MSR_IA32_PLATFORM_ID 0x17 +-#define MSR_IA32_EBL_CR_POWERON 0x2a +- +-#define MSR_IA32_APICBASE 0x1b +-#define MSR_IA32_APICBASE_BSP (1<<8) +-#define MSR_IA32_APICBASE_ENABLE (1<<11) +-#define MSR_IA32_APICBASE_BASE (0xfffff<<12) +- +-/* P4/Xeon+ specific */ +-#define MSR_IA32_MCG_EAX 0x180 +-#define MSR_IA32_MCG_EBX 0x181 +-#define MSR_IA32_MCG_ECX 0x182 +-#define MSR_IA32_MCG_EDX 0x183 +-#define MSR_IA32_MCG_ESI 0x184 +-#define MSR_IA32_MCG_EDI 0x185 +-#define MSR_IA32_MCG_EBP 0x186 +-#define MSR_IA32_MCG_ESP 0x187 +-#define MSR_IA32_MCG_EFLAGS 0x188 +-#define MSR_IA32_MCG_EIP 0x189 +-#define MSR_IA32_MCG_RESERVED 0x18A +- +-#define MSR_P6_EVNTSEL0 0x186 +-#define MSR_P6_EVNTSEL1 0x187 +- +-#define MSR_IA32_PERF_STATUS 0x198 +-#define MSR_IA32_PERF_CTL 0x199 +- +-#define MSR_IA32_MPERF 0xE7 +-#define MSR_IA32_APERF 0xE8 +- +-#define MSR_IA32_THERM_CONTROL 0x19a +-#define MSR_IA32_THERM_INTERRUPT 0x19b +-#define MSR_IA32_THERM_STATUS 0x19c +-#define MSR_IA32_MISC_ENABLE 0x1a0 +- +-#define MSR_IA32_DEBUGCTLMSR 0x1d9 +-#define MSR_IA32_LASTBRANCHFROMIP 0x1db +-#define MSR_IA32_LASTBRANCHTOIP 0x1dc +-#define MSR_IA32_LASTINTFROMIP 0x1dd +-#define MSR_IA32_LASTINTTOIP 0x1de +- +-#define MSR_IA32_MC0_CTL 0x400 +-#define MSR_IA32_MC0_STATUS 0x401 +-#define MSR_IA32_MC0_ADDR 0x402 +-#define MSR_IA32_MC0_MISC 0x403 +- +-/* Pentium IV performance counter MSRs */ +-#define MSR_P4_BPU_PERFCTR0 0x300 +-#define MSR_P4_BPU_PERFCTR1 0x301 +-#define MSR_P4_BPU_PERFCTR2 0x302 +-#define MSR_P4_BPU_PERFCTR3 0x303 +-#define MSR_P4_MS_PERFCTR0 0x304 +-#define MSR_P4_MS_PERFCTR1 0x305 +-#define MSR_P4_MS_PERFCTR2 0x306 +-#define MSR_P4_MS_PERFCTR3 0x307 +-#define MSR_P4_FLAME_PERFCTR0 0x308 +-#define MSR_P4_FLAME_PERFCTR1 0x309 +-#define MSR_P4_FLAME_PERFCTR2 0x30a +-#define MSR_P4_FLAME_PERFCTR3 0x30b +-#define MSR_P4_IQ_PERFCTR0 0x30c +-#define MSR_P4_IQ_PERFCTR1 0x30d +-#define MSR_P4_IQ_PERFCTR2 0x30e +-#define MSR_P4_IQ_PERFCTR3 0x30f +-#define MSR_P4_IQ_PERFCTR4 0x310 +-#define MSR_P4_IQ_PERFCTR5 0x311 +-#define MSR_P4_BPU_CCCR0 0x360 +-#define MSR_P4_BPU_CCCR1 0x361 +-#define MSR_P4_BPU_CCCR2 0x362 +-#define MSR_P4_BPU_CCCR3 0x363 +-#define MSR_P4_MS_CCCR0 0x364 +-#define MSR_P4_MS_CCCR1 0x365 +-#define MSR_P4_MS_CCCR2 0x366 +-#define MSR_P4_MS_CCCR3 0x367 +-#define MSR_P4_FLAME_CCCR0 0x368 +-#define MSR_P4_FLAME_CCCR1 0x369 +-#define MSR_P4_FLAME_CCCR2 0x36a +-#define MSR_P4_FLAME_CCCR3 0x36b +-#define MSR_P4_IQ_CCCR0 0x36c +-#define MSR_P4_IQ_CCCR1 0x36d +-#define MSR_P4_IQ_CCCR2 0x36e +-#define MSR_P4_IQ_CCCR3 0x36f +-#define MSR_P4_IQ_CCCR4 0x370 +-#define MSR_P4_IQ_CCCR5 0x371 +-#define MSR_P4_ALF_ESCR0 0x3ca +-#define MSR_P4_ALF_ESCR1 0x3cb +-#define MSR_P4_BPU_ESCR0 0x3b2 +-#define MSR_P4_BPU_ESCR1 0x3b3 +-#define MSR_P4_BSU_ESCR0 0x3a0 +-#define MSR_P4_BSU_ESCR1 0x3a1 +-#define MSR_P4_CRU_ESCR0 0x3b8 +-#define MSR_P4_CRU_ESCR1 0x3b9 +-#define MSR_P4_CRU_ESCR2 0x3cc +-#define MSR_P4_CRU_ESCR3 0x3cd +-#define MSR_P4_CRU_ESCR4 0x3e0 +-#define MSR_P4_CRU_ESCR5 0x3e1 +-#define MSR_P4_DAC_ESCR0 0x3a8 +-#define MSR_P4_DAC_ESCR1 0x3a9 +-#define MSR_P4_FIRM_ESCR0 0x3a4 +-#define MSR_P4_FIRM_ESCR1 0x3a5 +-#define MSR_P4_FLAME_ESCR0 0x3a6 +-#define MSR_P4_FLAME_ESCR1 0x3a7 +-#define MSR_P4_FSB_ESCR0 0x3a2 +-#define MSR_P4_FSB_ESCR1 0x3a3 +-#define MSR_P4_IQ_ESCR0 0x3ba +-#define MSR_P4_IQ_ESCR1 0x3bb +-#define MSR_P4_IS_ESCR0 0x3b4 +-#define MSR_P4_IS_ESCR1 0x3b5 +-#define MSR_P4_ITLB_ESCR0 0x3b6 +-#define MSR_P4_ITLB_ESCR1 0x3b7 
+-#define MSR_P4_IX_ESCR0 0x3c8 +-#define MSR_P4_IX_ESCR1 0x3c9 +-#define MSR_P4_MOB_ESCR0 0x3aa +-#define MSR_P4_MOB_ESCR1 0x3ab +-#define MSR_P4_MS_ESCR0 0x3c0 +-#define MSR_P4_MS_ESCR1 0x3c1 +-#define MSR_P4_PMH_ESCR0 0x3ac +-#define MSR_P4_PMH_ESCR1 0x3ad +-#define MSR_P4_RAT_ESCR0 0x3bc +-#define MSR_P4_RAT_ESCR1 0x3bd +-#define MSR_P4_SAAT_ESCR0 0x3ae +-#define MSR_P4_SAAT_ESCR1 0x3af +-#define MSR_P4_SSU_ESCR0 0x3be +-#define MSR_P4_SSU_ESCR1 0x3bf /* guess: not defined in manual */ +-#define MSR_P4_TBPU_ESCR0 0x3c2 +-#define MSR_P4_TBPU_ESCR1 0x3c3 +-#define MSR_P4_TC_ESCR0 0x3c4 +-#define MSR_P4_TC_ESCR1 0x3c5 +-#define MSR_P4_U2L_ESCR0 0x3b0 +-#define MSR_P4_U2L_ESCR1 0x3b1 +- +-/* Intel Core-based CPU performance counters */ +-#define MSR_CORE_PERF_FIXED_CTR0 0x309 +-#define MSR_CORE_PERF_FIXED_CTR1 0x30a +-#define MSR_CORE_PERF_FIXED_CTR2 0x30b +-#define MSR_CORE_PERF_FIXED_CTR_CTRL 0x38d +-#define MSR_CORE_PERF_GLOBAL_STATUS 0x38e +-#define MSR_CORE_PERF_GLOBAL_CTRL 0x38f +-#define MSR_CORE_PERF_GLOBAL_OVF_CTRL 0x390 +- +-#endif ++static inline int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) ++{ ++ return rdmsr_safe(msr_no, l, h); ++} ++static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) ++{ ++ return wrmsr_safe(msr_no, l, h); ++} ++#endif /* CONFIG_SMP */ ++#endif /* __ASSEMBLY__ */ ++#endif /* X86_64_MSR_H */ +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/nmi.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/nmi.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/nmi.h 2007-10-22 13:58:57.000000000 +0200 +@@ -96,4 +96,13 @@ extern int unknown_nmi_panic; + void __trigger_all_cpu_backtrace(void); + #define trigger_all_cpu_backtrace() __trigger_all_cpu_backtrace() + ++ ++void lapic_watchdog_stop(void); ++int lapic_watchdog_init(unsigned nmi_hz); ++int lapic_wd_event(unsigned nmi_hz); ++unsigned lapic_adjust_nmi_hz(unsigned hz); ++int lapic_watchdog_ok(void); ++void disable_lapic_nmi_watchdog(void); ++void enable_lapic_nmi_watchdog(void); ++ + #endif /* ASM_NMI_H */ +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/page.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/page.h 2007-05-31 14:39:08.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/page.h 2007-10-22 13:58:57.000000000 +0200 +@@ -7,6 +7,7 @@ + #include <linux/types.h> + #include <asm/bug.h> + #endif ++#include <linux/const.h> + #include <xen/interface/xen.h> + + /* +@@ -18,18 +19,14 @@ + + /* PAGE_SHIFT determines the page size */ + #define PAGE_SHIFT 12 +-#ifdef __ASSEMBLY__ +-#define PAGE_SIZE (0x1 << PAGE_SHIFT) +-#else +-#define PAGE_SIZE (1UL << PAGE_SHIFT) +-#endif ++#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) + #define PAGE_MASK (~(PAGE_SIZE-1)) + + /* See Documentation/x86_64/mm.txt for a description of the memory map. 
*/ + #define __PHYSICAL_MASK_SHIFT 46 +-#define __PHYSICAL_MASK ((1UL << __PHYSICAL_MASK_SHIFT) - 1) ++#define __PHYSICAL_MASK ((_AC(1,UL) << __PHYSICAL_MASK_SHIFT) - 1) + #define __VIRTUAL_MASK_SHIFT 48 +-#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) ++#define __VIRTUAL_MASK ((_AC(1,UL) << __VIRTUAL_MASK_SHIFT) - 1) + + #define PHYSICAL_PAGE_MASK (~(PAGE_SIZE-1) & __PHYSICAL_MASK) + +@@ -54,10 +51,10 @@ + #define N_EXCEPTION_STACKS 5 /* hw limit: 7 */ + + #define LARGE_PAGE_MASK (~(LARGE_PAGE_SIZE-1)) +-#define LARGE_PAGE_SIZE (1UL << PMD_SHIFT) ++#define LARGE_PAGE_SIZE (_AC(1,UL) << PMD_SHIFT) + + #define HPAGE_SHIFT PMD_SHIFT +-#define HPAGE_SIZE ((1UL) << HPAGE_SHIFT) ++#define HPAGE_SIZE (_AC(1,UL) << HPAGE_SHIFT) + #define HPAGE_MASK (~(HPAGE_SIZE - 1)) + #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) + +@@ -146,17 +143,23 @@ static inline pgd_t __pgd(unsigned long + + #define __pgprot(x) ((pgprot_t) { (x) } ) + +-#define __PHYSICAL_START ((unsigned long)CONFIG_PHYSICAL_START) +-#define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +-#define __START_KERNEL_map 0xffffffff80000000UL +-#define __PAGE_OFFSET 0xffff880000000000UL ++#endif /* !__ASSEMBLY__ */ + +-#else + #define __PHYSICAL_START CONFIG_PHYSICAL_START ++#define __KERNEL_ALIGN 0x200000 ++ ++/* ++ * Make sure kernel is aligned to 2MB address. Catching it at compile ++ * time is better. Change your config file and compile the kernel ++ * for a 2MB aligned address (CONFIG_PHYSICAL_START) ++ */ ++#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 ++#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" ++#endif ++ + #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) +-#define __START_KERNEL_map 0xffffffff80000000 +-#define __PAGE_OFFSET 0xffff880000000000 +-#endif /* !__ASSEMBLY__ */ ++#define __START_KERNEL_map _AC(0xffffffff80000000, UL) ++#define __PAGE_OFFSET _AC(0xffff880000000000, UL) + + #if CONFIG_XEN_COMPAT <= 0x030002 + #undef LOAD_OFFSET +@@ -166,20 +169,20 @@ static inline pgd_t __pgd(unsigned long + /* to align the pointer to the (next) page boundary */ + #define PAGE_ALIGN(addr) (((addr)+PAGE_SIZE-1)&PAGE_MASK) + +-#define KERNEL_TEXT_SIZE (40UL*1024*1024) +-#define KERNEL_TEXT_START 0xffffffff80000000UL ++#define KERNEL_TEXT_SIZE (40*1024*1024) ++#define KERNEL_TEXT_START _AC(0xffffffff80000000, UL) ++ ++#define PAGE_OFFSET __PAGE_OFFSET + +-#define PAGE_OFFSET ((unsigned long)__PAGE_OFFSET) ++#ifndef __ASSEMBLY__ ++static inline unsigned long __phys_addr(unsigned long x) ++{ ++ return x - (x >= __START_KERNEL_map ? __START_KERNEL_map : PAGE_OFFSET); ++} ++#endif + +-/* Note: __pa(&symbol_visible_to_c) should be always replaced with __pa_symbol. +- Otherwise you risk miscompilation. */ +-#define __pa(x) (((unsigned long)(x)>=__START_KERNEL_map)?(unsigned long)(x) - (unsigned long)__START_KERNEL_map:(unsigned long)(x) - PAGE_OFFSET) +-/* __pa_symbol should be used for C visible symbols. +- This seems to be the official gcc blessed way to do such arithmetic. 
*/ +-#define __pa_symbol(x) \ +- ({unsigned long v; \ +- asm("" : "=r" (v) : "0" (x)); \ +- __pa(v); }) ++#define __pa(x) __phys_addr((unsigned long)(x)) ++#define __pa_symbol(x) __phys_addr((unsigned long)(x)) + + #define __va(x) ((void *)((unsigned long)(x)+PAGE_OFFSET)) + #define __boot_va(x) __va(x) +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/pgalloc.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/pgalloc.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/pgalloc.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,7 +1,6 @@ + #ifndef _X86_64_PGALLOC_H + #define _X86_64_PGALLOC_H + +-#include <asm/fixmap.h> + #include <asm/pda.h> + #include <linux/threads.h> + #include <linux/mm.h> +@@ -100,24 +99,16 @@ static inline void pgd_list_add(pgd_t *p + struct page *page = virt_to_page(pgd); + + spin_lock(&pgd_lock); +- page->index = (pgoff_t)pgd_list; +- if (pgd_list) +- pgd_list->private = (unsigned long)&page->index; +- pgd_list = page; +- page->private = (unsigned long)&pgd_list; ++ list_add(&page->lru, &pgd_list); + spin_unlock(&pgd_lock); + } + + static inline void pgd_list_del(pgd_t *pgd) + { +- struct page *next, **pprev, *page = virt_to_page(pgd); ++ struct page *page = virt_to_page(pgd); + + spin_lock(&pgd_lock); +- next = (struct page *)page->index; +- pprev = (struct page **)page->private; +- *pprev = next; +- if (next) +- next->private = (unsigned long)pprev; ++ list_del(&page->lru); + spin_unlock(&pgd_lock); + } + +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/pgtable.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/pgtable.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/pgtable.h 2007-10-22 13:58:57.000000000 +0200 +@@ -1,12 +1,14 @@ + #ifndef _X86_64_PGTABLE_H + #define _X86_64_PGTABLE_H + ++#include <linux/const.h> ++#ifndef __ASSEMBLY__ ++ + /* + * This file contains the functions and defines necessary to modify and use + * the x86-64 page table tree. 
+ */ + #include <asm/processor.h> +-#include <asm/fixmap.h> + #include <asm/bitops.h> + #include <linux/threads.h> + #include <linux/sched.h> +@@ -35,11 +37,9 @@ extern void xen_init_pt(void); + #endif + + extern pud_t level3_kernel_pgt[512]; +-extern pud_t level3_physmem_pgt[512]; + extern pud_t level3_ident_pgt[512]; + extern pmd_t level2_kernel_pgt[512]; + extern pgd_t init_level4_pgt[]; +-extern pgd_t boot_level4_pgt[]; + extern unsigned long __supported_pte_mask; + + #define swapper_pg_dir init_level4_pgt +@@ -54,6 +54,8 @@ extern void clear_kernel_mapping(unsigne + extern unsigned long empty_zero_page[PAGE_SIZE/sizeof(unsigned long)]; + #define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) + ++#endif /* !__ASSEMBLY__ */ ++ + /* + * PGDIR_SHIFT determines what a top-level page table entry can map + */ +@@ -78,6 +80,8 @@ extern unsigned long empty_zero_page[PAG + */ + #define PTRS_PER_PTE 512 + ++#ifndef __ASSEMBLY__ ++ + #define pte_ERROR(e) \ + printk("%s:%d: bad pte %p(%016lx).\n", __FILE__, __LINE__, &(e), pte_val(e)) + #define pmd_ERROR(e) \ +@@ -116,22 +120,23 @@ static inline void pgd_clear (pgd_t * pg + + #define pte_pgprot(a) (__pgprot((a).pte & ~PHYSICAL_PAGE_MASK)) + +-#define PMD_SIZE (1UL << PMD_SHIFT) ++#endif /* !__ASSEMBLY__ */ ++ ++#define PMD_SIZE (_AC(1,UL) << PMD_SHIFT) + #define PMD_MASK (~(PMD_SIZE-1)) +-#define PUD_SIZE (1UL << PUD_SHIFT) ++#define PUD_SIZE (_AC(1,UL) << PUD_SHIFT) + #define PUD_MASK (~(PUD_SIZE-1)) +-#define PGDIR_SIZE (1UL << PGDIR_SHIFT) ++#define PGDIR_SIZE (_AC(1,UL) << PGDIR_SHIFT) + #define PGDIR_MASK (~(PGDIR_SIZE-1)) + + #define USER_PTRS_PER_PGD ((TASK_SIZE-1)/PGDIR_SIZE+1) + #define FIRST_USER_ADDRESS 0 + +-#ifndef __ASSEMBLY__ +-#define MAXMEM 0x3fffffffffffUL +-#define VMALLOC_START 0xffffc20000000000UL +-#define VMALLOC_END 0xffffe1ffffffffffUL +-#define MODULES_VADDR 0xffffffff88000000UL +-#define MODULES_END 0xfffffffffff00000UL ++#define MAXMEM _AC(0x3fffffffffff, UL) ++#define VMALLOC_START _AC(0xffffc20000000000, UL) ++#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) ++#define MODULES_VADDR _AC(0xffffffff88000000, UL) ++#define MODULES_END _AC(0xfffffffffff00000, UL) + #define MODULES_LEN (MODULES_END - MODULES_VADDR) + + #define _PAGE_BIT_PRESENT 0 +@@ -157,7 +162,7 @@ static inline void pgd_clear (pgd_t * pg + #define _PAGE_GLOBAL 0x100 /* Global TLB entry */ + + #define _PAGE_PROTNONE 0x080 /* If not present */ +-#define _PAGE_NX (1UL<<_PAGE_BIT_NX) ++#define _PAGE_NX (_AC(1,UL)<<_PAGE_BIT_NX) + + #if CONFIG_XEN_COMPAT <= 0x030002 + extern unsigned int __kernel_page_user; +@@ -228,6 +233,8 @@ extern unsigned int __kernel_page_user; + #define __S110 PAGE_SHARED_EXEC + #define __S111 PAGE_SHARED_EXEC + ++#ifndef __ASSEMBLY__ ++ + static inline unsigned long pgd_bad(pgd_t pgd) + { + return pgd_val(pgd) & ~(PTE_MASK | _KERNPG_TABLE | _PAGE_USER); +@@ -339,6 +346,20 @@ static inline pte_t pte_mkwrite(pte_t pt + static inline pte_t pte_mkhuge(pte_t pte) { __pte_val(pte) |= _PAGE_PSE; return pte; } + static inline pte_t pte_clrhuge(pte_t pte) { __pte_val(pte) &= ~_PAGE_PSE; return pte; } + ++static inline int ptep_test_and_clear_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) ++{ ++ if (!pte_dirty(*ptep)) ++ return 0; ++ return test_and_clear_bit(_PAGE_BIT_DIRTY, &ptep->pte); ++} ++ ++static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) ++{ ++ if (!pte_young(*ptep)) ++ return 0; ++ return test_and_clear_bit(_PAGE_BIT_ACCESSED, &ptep->pte); ++} ++ + static 
inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) + { + pte_t pte = *ptep; +@@ -464,18 +485,12 @@ static inline pte_t pte_modify(pte_t pte + * bit at the same time. */ + #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS + #define ptep_set_access_flags(vma, address, ptep, entry, dirty) \ +- do { \ +- if (dirty) \ +- ptep_establish(vma, address, ptep, entry); \ +- } while (0) +- +- +-/* +- * i386 says: We don't actually have these, but we want to advertise +- * them so that we can encompass the flush here. +- */ +-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY +-#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG ++({ \ ++ int __changed = !pte_same(*(ptep), entry); \ ++ if (__changed && (dirty)) \ ++ ptep_establish(vma, address, ptep, entry); \ ++ __changed; \ ++}) + + #define __HAVE_ARCH_PTEP_CLEAR_DIRTY_FLUSH + #define ptep_clear_flush_dirty(vma, address, ptep) \ +@@ -484,7 +499,7 @@ static inline pte_t pte_modify(pte_t pte + int __dirty = pte_dirty(__pte); \ + __pte = pte_mkclean(__pte); \ + if ((vma)->vm_mm->context.pinned) \ +- ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ ++ (void)ptep_set_access_flags(vma, address, ptep, __pte, __dirty); \ + else if (__dirty) \ + set_pte(ptep, __pte); \ + __dirty; \ +@@ -497,7 +512,7 @@ static inline pte_t pte_modify(pte_t pte + int __young = pte_young(__pte); \ + __pte = pte_mkold(__pte); \ + if ((vma)->vm_mm->context.pinned) \ +- ptep_set_access_flags(vma, address, ptep, __pte, __young); \ ++ (void)ptep_set_access_flags(vma, address, ptep, __pte, __young); \ + else if (__young) \ + set_pte(ptep, __pte); \ + __young; \ +@@ -511,10 +526,7 @@ static inline pte_t pte_modify(pte_t pte + #define __swp_entry_to_pte(x) ((pte_t) { (x).val }) + + extern spinlock_t pgd_lock; +-extern struct page *pgd_list; +-void vmalloc_sync_all(void); +- +-#endif /* !__ASSEMBLY__ */ ++extern struct list_head pgd_list; + + extern int kern_addr_valid(unsigned long addr); + +@@ -546,10 +558,6 @@ int touch_pte_range(struct mm_struct *mm + #define io_remap_pfn_range(vma, vaddr, pfn, size, prot) \ + direct_remap_pfn_range(vma,vaddr,pfn,size,prot,DOMID_IO) + +-#define MK_IOSPACE_PFN(space, pfn) (pfn) +-#define GET_IOSPACE(pfn) 0 +-#define GET_PFN(pfn) (pfn) +- + #define HAVE_ARCH_UNMAPPED_AREA + + #define pgtable_cache_init() do { } while (0) +@@ -563,11 +571,14 @@ int touch_pte_range(struct mm_struct *mm + #define kc_offset_to_vaddr(o) \ + (((o) & (1UL << (__VIRTUAL_MASK_SHIFT-1))) ? 
((o) | (~__VIRTUAL_MASK)) : (o)) + ++#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG ++#define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_DIRTY + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR + #define __HAVE_ARCH_PTEP_GET_AND_CLEAR_FULL + #define __HAVE_ARCH_PTEP_CLEAR_FLUSH + #define __HAVE_ARCH_PTEP_SET_WRPROTECT + #define __HAVE_ARCH_PTE_SAME + #include <asm-generic/pgtable.h> ++#endif /* !__ASSEMBLY__ */ + + #endif /* _X86_64_PGTABLE_H */ +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/processor.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/processor.h 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/processor.h 2007-10-22 13:58:57.000000000 +0200 +@@ -20,6 +20,7 @@ + #include <asm/percpu.h> + #include <linux/personality.h> + #include <linux/cpumask.h> ++#include <asm/processor-flags.h> + + #define TF_MASK 0x00000100 + #define IF_MASK 0x00000200 +@@ -103,42 +104,6 @@ extern unsigned int init_intel_cacheinfo + extern unsigned short num_cache_leaves; + + /* +- * EFLAGS bits +- */ +-#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ +-#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ +-#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ +-#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ +-#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ +-#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ +-#define X86_EFLAGS_IF 0x00000200 /* Interrupt Flag */ +-#define X86_EFLAGS_DF 0x00000400 /* Direction Flag */ +-#define X86_EFLAGS_OF 0x00000800 /* Overflow Flag */ +-#define X86_EFLAGS_IOPL 0x00003000 /* IOPL mask */ +-#define X86_EFLAGS_NT 0x00004000 /* Nested Task */ +-#define X86_EFLAGS_RF 0x00010000 /* Resume Flag */ +-#define X86_EFLAGS_VM 0x00020000 /* Virtual Mode */ +-#define X86_EFLAGS_AC 0x00040000 /* Alignment Check */ +-#define X86_EFLAGS_VIF 0x00080000 /* Virtual Interrupt Flag */ +-#define X86_EFLAGS_VIP 0x00100000 /* Virtual Interrupt Pending */ +-#define X86_EFLAGS_ID 0x00200000 /* CPUID detection flag */ +- +-/* +- * Intel CPU features in CR4 +- */ +-#define X86_CR4_VME 0x0001 /* enable vm86 extensions */ +-#define X86_CR4_PVI 0x0002 /* virtual interrupts flag enable */ +-#define X86_CR4_TSD 0x0004 /* disable time stamp at ipl 3 */ +-#define X86_CR4_DE 0x0008 /* enable debugging extensions */ +-#define X86_CR4_PSE 0x0010 /* enable page size extensions */ +-#define X86_CR4_PAE 0x0020 /* enable physical address extensions */ +-#define X86_CR4_MCE 0x0040 /* Machine check enable */ +-#define X86_CR4_PGE 0x0080 /* enable global pages */ +-#define X86_CR4_PCE 0x0100 /* enable performance counters at ipl 3 */ +-#define X86_CR4_OSFXSR 0x0200 /* enable fast FPU save and restore */ +-#define X86_CR4_OSXMMEXCPT 0x0400 /* enable unmasked SSE exceptions */ +- +-/* + * Save the cr4 feature set we're using (ie + * Pentium 4MB enable and PPro Global page + * enable), so that any CPU's that boot up +@@ -209,7 +174,7 @@ struct i387_fxsave_struct { + u32 mxcsr; + u32 mxcsr_mask; + u32 st_space[32]; /* 8*16 bytes for each FP-reg = 128 bytes */ +- u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 128 bytes */ ++ u32 xmm_space[64]; /* 16*16 bytes for each XMM-reg = 256 bytes */ + u32 padding[24]; + } __attribute__ ((aligned (16))); + +@@ -440,22 +405,6 @@ static inline void prefetchw(void *x) + #define cpu_relax() rep_nop() + + /* +- * NSC/Cyrix CPU configuration register indexes +- */ +-#define CX86_CCR0 0xc0 +-#define CX86_CCR1 0xc1 +-#define CX86_CCR2 0xc2 +-#define CX86_CCR3 0xc3 
+-#define CX86_CCR4 0xe8 +-#define CX86_CCR5 0xe9 +-#define CX86_CCR6 0xea +-#define CX86_CCR7 0xeb +-#define CX86_DIR0 0xfe +-#define CX86_DIR1 0xff +-#define CX86_ARR_BASE 0xc4 +-#define CX86_RCR_BASE 0xdc +- +-/* + * NSC/Cyrix CPU indexed register access macros + */ + +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/smp.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/smp.h 2007-10-22 13:58:46.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/smp.h 2007-10-22 13:58:57.000000000 +0200 +@@ -11,12 +11,11 @@ + extern int disable_apic; + + #ifdef CONFIG_X86_LOCAL_APIC +-#include <asm/fixmap.h> + #include <asm/mpspec.h> ++#include <asm/apic.h> + #ifdef CONFIG_X86_IO_APIC + #include <asm/io_apic.h> + #endif +-#include <asm/apic.h> + #include <asm/thread_info.h> + #endif + +@@ -41,7 +40,6 @@ extern void lock_ipi_call_lock(void); + extern void unlock_ipi_call_lock(void); + extern int smp_num_siblings; + extern void smp_send_reschedule(int cpu); +-void smp_stop_cpu(void); + + extern cpumask_t cpu_sibling_map[NR_CPUS]; + extern cpumask_t cpu_core_map[NR_CPUS]; +@@ -62,14 +60,6 @@ static inline int num_booting_cpus(void) + + #define raw_smp_processor_id() read_pda(cpunumber) + +-#ifdef CONFIG_X86_LOCAL_APIC +-static inline int hard_smp_processor_id(void) +-{ +- /* we don't want to mark this access volatile - bad code generation */ +- return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); +-} +-#endif +- + extern int __cpu_disable(void); + extern void __cpu_die(unsigned int cpu); + extern void prefill_possible_map(void); +@@ -78,6 +68,14 @@ extern unsigned __cpuinitdata disabled_c + + #define NO_PROC_ID 0xFF /* No processor magic marker */ + ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_X86_LOCAL_APIC ++static inline int hard_smp_processor_id(void) ++{ ++ /* we don't want to mark this access volatile - bad code generation */ ++ return GET_APIC_ID(*(unsigned int *)(APIC_BASE+APIC_ID)); ++} + #endif + + /* +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/system.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/system.h 2007-10-22 13:53:08.000000000 +0200 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/system.h 2007-10-22 13:58:57.000000000 +0200 +@@ -3,7 +3,7 @@ + + #include <linux/kernel.h> + #include <asm/segment.h> +-#include <asm/alternative.h> ++#include <asm/cmpxchg.h> + + #include <asm/synch_bitops.h> + #include <asm/hypervisor.h> +@@ -49,7 +49,7 @@ + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ + [ti_flags] "i" (offsetof(struct thread_info, flags)),\ + [tif_fork] "i" (TIF_FORK), \ +- [thread_info] "i" (offsetof(struct task_struct, thread_info)), \ ++ [thread_info] "i" (offsetof(struct task_struct, stack)), \ + [pda_pcurrent] "i" (offsetof(struct x8664_pda, pcurrent)) \ + : "memory", "cc" __EXTRA_CLOBBER) + +@@ -98,6 +98,12 @@ static inline void write_cr0(unsigned lo + machine_to_phys(__dummy); \ + }) + ++static inline void write_cr3(unsigned long val) ++{ ++ val = phys_to_machine(val); ++ asm volatile("movq %0,%%cr3" :: "r" (val) : "memory"); ++} ++ + static inline unsigned long read_cr4(void) + { + unsigned long cr4; +@@ -107,7 +113,7 @@ static inline unsigned long read_cr4(voi + + static inline void write_cr4(unsigned long val) + { +- asm volatile("movq %0,%%cr4" :: "r" (val)); ++ asm volatile("movq %0,%%cr4" :: "r" (val) : "memory"); + } + + #define stts() 
(HYPERVISOR_fpu_taskswitch(1)) +@@ -128,100 +134,6 @@ static inline void sched_cacheflush(void + + #define nop() __asm__ __volatile__ ("nop") + +-#define xchg(ptr,v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v),(ptr),sizeof(*(ptr)))) +- +-#define tas(ptr) (xchg((ptr),1)) +- +-#define __xg(x) ((volatile long *)(x)) +- +-static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) +-{ +- *ptr = val; +-} +- +-#define _set_64bit set_64bit +- +-/* +- * Note: no "lock" prefix even on SMP: xchg always implies lock anyway +- * Note 2: xchg has side effect, so that attribute volatile is necessary, +- * but generally the primitive is invalid, *ptr is output argument. --ANK +- */ +-static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int size) +-{ +- switch (size) { +- case 1: +- __asm__ __volatile__("xchgb %b0,%1" +- :"=q" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- case 2: +- __asm__ __volatile__("xchgw %w0,%1" +- :"=r" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- case 4: +- __asm__ __volatile__("xchgl %k0,%1" +- :"=r" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- case 8: +- __asm__ __volatile__("xchgq %0,%1" +- :"=r" (x) +- :"m" (*__xg(ptr)), "0" (x) +- :"memory"); +- break; +- } +- return x; +-} +- +-/* +- * Atomic compare and exchange. Compare OLD with MEM, if identical, +- * store NEW in MEM. Return the initial value in MEM. Success is +- * indicated by comparing RETURN with OLD. +- */ +- +-#define __HAVE_ARCH_CMPXCHG 1 +- +-static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, +- unsigned long new, int size) +-{ +- unsigned long prev; +- switch (size) { +- case 1: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgb %b1,%2" +- : "=a"(prev) +- : "q"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 2: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgw %w1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 4: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgl %k1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- case 8: +- __asm__ __volatile__(LOCK_PREFIX "cmpxchgq %1,%2" +- : "=a"(prev) +- : "r"(new), "m"(*__xg(ptr)), "0"(old) +- : "memory"); +- return prev; +- } +- return old; +-} +- +-#define cmpxchg(ptr,o,n)\ +- ((__typeof__(*(ptr)))__cmpxchg((ptr),(unsigned long)(o),\ +- (unsigned long)(n),sizeof(*(ptr)))) +- + #ifdef CONFIG_SMP + #define smp_mb() mb() + #define smp_rmb() rmb() +Index: 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/tlbflush.h +=================================================================== +--- 10.3-2007-11-26.orig/include/asm-x86_64/mach-xen/asm/tlbflush.h 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/include/asm-x86_64/mach-xen/asm/tlbflush.h 2007-10-22 13:58:57.000000000 +0200 +@@ -2,7 +2,9 @@ + #define _X8664_TLBFLUSH_H + + #include <linux/mm.h> ++#include <linux/sched.h> + #include <asm/processor.h> ++#include <asm/system.h> + + #define __flush_tlb() xen_tlb_flush() + +Index: 10.3-2007-11-26/mm/highmem.c +=================================================================== +--- 10.3-2007-11-26.orig/mm/highmem.c 2007-12-06 17:27:30.000000000 +0100 ++++ 10.3-2007-11-26/mm/highmem.c 2007-10-22 13:58:57.000000000 +0200 +@@ -158,17 +158,6 @@ start: + return vaddr; + } + +-#ifdef CONFIG_XEN +-void kmap_flush_unused(void) +-{ +- spin_lock(&kmap_lock); +- flush_all_zero_pkmaps(); +- spin_unlock(&kmap_lock); +-} +- +-EXPORT_SYMBOL(kmap_flush_unused); 
+-#endif +- + void fastcall *kmap_high(struct page *page) + { + unsigned long vaddr; +Index: 10.3-2007-11-26/net/core/dev.c +=================================================================== +--- 10.3-2007-11-26.orig/net/core/dev.c 2007-10-22 13:53:25.000000000 +0200 ++++ 10.3-2007-11-26/net/core/dev.c 2007-10-22 13:58:57.000000000 +0200 +@@ -1466,12 +1466,16 @@ out_kfree_skb: + inline int skb_checksum_setup(struct sk_buff *skb) + { + if (skb->proto_csum_blank) { ++ struct iphdr *iph; ++ + if (skb->protocol != htons(ETH_P_IP)) + goto out; +- skb->h.raw = (unsigned char *)skb->nh.iph + 4*skb->nh.iph->ihl; +- if (skb->h.raw >= skb->tail) ++ iph = ip_hdr(skb); ++ skb->transport_header = skb->network_header + 4 * iph->ihl; ++ if (skb->transport_header >= skb->tail) + goto out; +- switch (skb->nh.iph->protocol) { ++ skb->csum_start = skb_transport_header(skb) - skb->head; ++ switch (iph->protocol) { + case IPPROTO_TCP: + skb->csum_offset = offsetof(struct tcphdr, check); + break; +@@ -1482,10 +1486,10 @@ inline int skb_checksum_setup(struct sk_ + if (net_ratelimit()) + printk(KERN_ERR "Attempting to checksum a non-" + "TCP/UDP packet, dropping a protocol" +- " %d packet", skb->nh.iph->protocol); ++ " %d packet", iph->protocol); + goto out; + } +- if ((skb->h.raw + skb->csum_offset + 2) > skb->tail) ++ if ((skb->transport_header + skb->csum_offset + 2) > skb->tail) + goto out; + skb->ip_summed = CHECKSUM_PARTIAL; + skb->proto_csum_blank = 0; |
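
The recurring change in the hunks above — replacing literals such as 1UL << PMD_SHIFT with _AC(1,UL) << PMD_SHIFT, including <linux/const.h>, and moving the constants outside the #ifndef __ASSEMBLY__ guards — exists so the same header values can be consumed from assembly sources, where a UL suffix is not valid. The following is a minimal, self-contained userspace sketch of the _AC() helper as provided by <linux/const.h>; it is an illustration only, not part of the patch, and the PMD_SHIFT value is hard-coded here purely for the demo.

/* Sketch of the _AC()/__AC() mechanism from <linux/const.h>.
 * Build and run with: cc -o ac_demo ac_demo.c && ./ac_demo
 */
#include <stdio.h>

#ifdef __ASSEMBLY__
/* Assembler view: the suffix is dropped, "1" stays a bare number. */
#define _AC(X, Y)   X
#else
/* C view: paste value and suffix, so _AC(1,UL) expands to 1UL. */
#define __AC(X, Y)  (X##Y)
#define _AC(X, Y)   __AC(X, Y)
#endif

#define PMD_SHIFT   21                          /* x86-64 value, hard-coded for this demo */
#define PMD_SIZE    (_AC(1, UL) << PMD_SHIFT)   /* 2 MiB, as an unsigned long in C */

int main(void)
{
    printf("PMD_SIZE = %lu bytes\n", (unsigned long)PMD_SIZE);
    return 0;
}

The split in the patch follows the same logic: inline functions and prototypes, which only the C compiler can digest, stay inside #ifndef __ASSEMBLY__, while the page-size and address-space constants are left visible to both C and assembly.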