Diffstat (limited to 'trunk/2.6.22/20012_xen3-auto-xen-drivers.patch1')
-rw-r--r-- | trunk/2.6.22/20012_xen3-auto-xen-drivers.patch1 | 28404
1 file changed, 28404 insertions, 0 deletions
diff --git a/trunk/2.6.22/20012_xen3-auto-xen-drivers.patch1 b/trunk/2.6.22/20012_xen3-auto-xen-drivers.patch1
new file mode 100644
index 0000000..5b134b5
--- /dev/null
+++ b/trunk/2.6.22/20012_xen3-auto-xen-drivers.patch1
@@ -0,0 +1,28404 @@
+Subject: xen3 xen-drivers
+From: http://xenbits.xensource.com/xen-3.1-testing.hg (tip 15042)
+Patch-mainline: obsolete
+Acked-by: jbeulich@novell.com
+
+---
+ drivers/Makefile | 1
+ drivers/xen/Makefile | 20
+ drivers/xen/balloon/Makefile | 2
+ drivers/xen/balloon/balloon.c | 663 +++++++
+ drivers/xen/balloon/common.h | 58
+ drivers/xen/balloon/sysfs.c | 170 +
+ drivers/xen/blkback/Makefile | 3
+ drivers/xen/blkback/blkback.c | 614 ++++++
+ drivers/xen/blkback/common.h | 139 +
+ drivers/xen/blkback/interface.c | 181 ++
+ drivers/xen/blkback/vbd.c | 118 +
+ drivers/xen/blkback/xenbus.c | 533 +++++
+ drivers/xen/blkfront/Makefile | 5
+ drivers/xen/blkfront/blkfront.c | 902 ++++++++++
+ drivers/xen/blkfront/block.h | 142 +
+ drivers/xen/blkfront/vbd.c | 372 ++++
+ drivers/xen/blktap/Makefile | 5
+ drivers/xen/blktap/blktap.c | 1528 +++++++++++++++++
+ drivers/xen/blktap/common.h | 121 +
+ drivers/xen/blktap/interface.c | 174 +
+ drivers/xen/blktap/xenbus.c | 473 +++++
+ drivers/xen/char/Makefile | 2
+ drivers/xen/char/mem.c | 203 ++
+ drivers/xen/console/Makefile | 2
+ drivers/xen/console/console.c | 721 ++++++++
+ drivers/xen/console/xencons_ring.c | 143 +
+ drivers/xen/core/Makefile | 12
+ drivers/xen/core/cpu_hotplug.c | 172 +
+ drivers/xen/core/evtchn.c | 1015 +++++++++++
+ drivers/xen/core/features.c | 34
+ drivers/xen/core/gnttab.c | 631 +++++++
+ drivers/xen/core/hypervisor_sysfs.c | 59
+ drivers/xen/core/machine_kexec.c | 189 ++
+ drivers/xen/core/machine_reboot.c | 241 ++
+ drivers/xen/core/reboot.c | 249 ++
+ drivers/xen/core/smpboot.c | 452 +++++
+ drivers/xen/core/xen_proc.c | 23
+ drivers/xen/core/xen_sysfs.c | 378 ++++
+ drivers/xen/evtchn/Makefile | 2
+ drivers/xen/evtchn/evtchn.c | 469 +++++
+ drivers/xen/fbfront/Makefile | 2
+ drivers/xen/fbfront/xenfb.c | 752 ++++++++
+ drivers/xen/fbfront/xenkbd.c | 333 +++
+ drivers/xen/gntdev/Makefile | 1
+ drivers/xen/gntdev/gntdev.c | 973 ++++++++++
+ drivers/xen/netback/Makefile | 5
+ drivers/xen/netback/common.h | 157 +
+ drivers/xen/netback/interface.c | 336 +++
+ drivers/xen/netback/loopback.c | 320 +++
+ drivers/xen/netback/netback.c | 1496 ++++++++++++++++
+ drivers/xen/netback/xenbus.c | 448 +++++
+ drivers/xen/netfront/Makefile | 4
+ drivers/xen/netfront/netfront.c | 2133 ++++++++++++++++++++++++
+ drivers/xen/pciback/Makefile | 15
+ drivers/xen/pciback/conf_space.c | 426 ++++
+ drivers/xen/pciback/conf_space.h | 126 +
+ drivers/xen/pciback/conf_space_capability.c | 71
+ drivers/xen/pciback/conf_space_capability.h | 23
+ drivers/xen/pciback/conf_space_capability_pm.c | 128 +
+ drivers/xen/pciback/conf_space_capability_vpd.c | 42
+ drivers/xen/pciback/conf_space_header.c | 309 +++
+ drivers/xen/pciback/conf_space_quirks.c | 126 +
+ drivers/xen/pciback/conf_space_quirks.h | 35
+ drivers/xen/pciback/passthrough.c | 157 +
+ drivers/xen/pciback/pci_stub.c | 929 ++++++++++
+ drivers/xen/pciback/pciback.h | 93 +
+ drivers/xen/pciback/pciback_ops.c | 95 +
+ drivers/xen/pciback/slot.c | 151 +
+ drivers/xen/pciback/vpci.c | 204 ++
+ drivers/xen/pciback/xenbus.c | 454 +++++
+ drivers/xen/pcifront/Makefile | 7
+ drivers/xen/pcifront/pci.c | 46
+ drivers/xen/pcifront/pci_op.c | 268 +++
+ drivers/xen/pcifront/pcifront.h | 40
+ drivers/xen/pcifront/xenbus.c | 295 +++
+ drivers/xen/privcmd/Makefile | 2
+ drivers/xen/privcmd/privcmd.c | 284 +++
+ drivers/xen/tpmback/Makefile | 4
+ drivers/xen/tpmback/common.h | 85
+ drivers/xen/tpmback/interface.c | 167 +
+ drivers/xen/tpmback/tpmback.c | 944 ++++++++++
+ drivers/xen/tpmback/xenbus.c | 289 +++
+ drivers/xen/util.c | 70
+ drivers/xen/xenbus/Makefile | 9
+ drivers/xen/xenbus/xenbus_backend_client.c | 147 +
+ drivers/xen/xenbus/xenbus_client.c | 283 +++
+ drivers/xen/xenbus/xenbus_comms.c | 232 ++
+ drivers/xen/xenbus/xenbus_comms.h | 46
+ drivers/xen/xenbus/xenbus_dev.c | 404 ++++
+ drivers/xen/xenbus/xenbus_probe.c | 1086 ++++++++++++
+ drivers/xen/xenbus/xenbus_probe.h | 75
+ drivers/xen/xenbus/xenbus_probe_backend.c | 286 +++
+ drivers/xen/xenbus/xenbus_xs.c | 880 +++++++++
+ drivers/xen/xenoprof/xenoprofile.c | 500 +++++
+ 94 files changed, 28014 insertions(+)
+
+--- a/drivers/Makefile 2007-08-27 14:01:24.000000000 -0400
++++ b/drivers/Makefile 2007-08-27 14:01:25.000000000 -0400
+@@ -31,6 +31,7 @@ obj-y += base/ block/ misc/ mfd/ net/
+ obj-$(CONFIG_NUBUS) += nubus/
+ obj-$(CONFIG_ATM) += atm/
+ obj-y += macintosh/
++obj-$(CONFIG_XEN) += xen/
+ obj-$(CONFIG_IDE) += ide/
+ obj-$(CONFIG_FC4) += fc4/
+ obj-$(CONFIG_SCSI) += scsi/
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/Makefile 2007-08-27 14:01:25.000000000 -0400
+@@ -0,0 +1,20 @@
++obj-y += core/
++obj-y += console/
++obj-y += evtchn/
++obj-y += privcmd/
++obj-y += xenbus/
++obj-y += gntdev/
++obj-y += balloon/
++obj-y += char/
++
++obj-y += util.o
++obj-$(CONFIG_XEN_BLKDEV_BACKEND) += blkback/
++obj-$(CONFIG_XEN_BLKDEV_TAP) += blktap/
++obj-$(CONFIG_XEN_NETDEV_BACKEND) += netback/
++obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmback/
++obj-$(CONFIG_XEN_BLKDEV_FRONTEND) += blkfront/
++obj-$(CONFIG_XEN_NETDEV_FRONTEND) += netfront/
++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback/
++obj-$(CONFIG_XEN_PCIDEV_FRONTEND) += pcifront/
++obj-$(CONFIG_XEN_FRAMEBUFFER) += fbfront/
++obj-$(CONFIG_XEN_KEYBOARD) += fbfront/
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/balloon/Makefile 2007-08-27 14:01:25.000000000 -0400
+@@ -0,0 +1,2 @@
++
++obj-y := balloon.o sysfs.o
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/balloon/balloon.c 2007-08-27 14:02:09.000000000 -0400
+@@ -0,0 +1,663 @@
++/******************************************************************************
++ * balloon.c
++ *
++ * Xen balloon driver - enables returning/claiming memory to/from Xen.
++ *
++ * Copyright (c) 2003, B Dragovic
++ * Copyright (c) 2003-2004, M Williamson, K Fraser
++ * Copyright (c) 2005 Dan M. Smith, IBM Corporation
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/smp_lock.h> ++#include <linux/pagemap.h> ++#include <linux/bootmem.h> ++#include <linux/highmem.h> ++#include <linux/vmalloc.h> ++#include <linux/mutex.h> ++#include <xen/xen_proc.h> ++#include <asm/hypervisor.h> ++#include <xen/balloon.h> ++#include <xen/interface/memory.h> ++#include <asm/maddr.h> ++#include <asm/page.h> ++#include <asm/pgalloc.h> ++#include <asm/pgtable.h> ++#include <asm/uaccess.h> ++#include <asm/tlb.h> ++#include <linux/highmem.h> ++#include <linux/list.h> ++#include <xen/xenbus.h> ++#include "common.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++#ifdef CONFIG_PROC_FS ++static struct proc_dir_entry *balloon_pde; ++#endif ++ ++static DEFINE_MUTEX(balloon_mutex); ++ ++/* ++ * Protects atomic reservation decrease/increase against concurrent increases. ++ * Also protects non-atomic updates of current_pages and driver_pages, and ++ * balloon lists. ++ */ ++DEFINE_SPINLOCK(balloon_lock); ++ ++struct balloon_stats balloon_stats; ++ ++/* We increase/decrease in batches which fit in a page */ ++static unsigned long frame_list[PAGE_SIZE / sizeof(unsigned long)]; ++ ++/* VM /proc information for memory */ ++extern unsigned long totalram_pages; ++ ++/* List of ballooned pages, threaded through the mem_map array. */ ++static LIST_HEAD(ballooned_pages); ++ ++/* Main work function, always executed in process context. */ ++static void balloon_process(void *unused); ++static DECLARE_WORK(balloon_worker, balloon_process, NULL); ++static struct timer_list balloon_timer; ++ ++/* When ballooning out (allocating memory to return to Xen) we don't really ++ want the kernel to try too hard since that can trigger the oom killer. */ ++#define GFP_BALLOON \ ++ (GFP_HIGHUSER | __GFP_NOWARN | __GFP_NORETRY | __GFP_NOMEMALLOC) ++ ++#define PAGE_TO_LIST(p) (&(p)->lru) ++#define LIST_TO_PAGE(l) list_entry((l), struct page, lru) ++#define UNLIST_PAGE(p) \ ++ do { \ ++ list_del(PAGE_TO_LIST(p)); \ ++ PAGE_TO_LIST(p)->next = NULL; \ ++ PAGE_TO_LIST(p)->prev = NULL; \ ++ } while(0) ++ ++#define IPRINTK(fmt, args...) \ ++ printk(KERN_INFO "xen_mem: " fmt, ##args) ++#define WPRINTK(fmt, args...) \ ++ printk(KERN_WARNING "xen_mem: " fmt, ##args) ++ ++/* balloon_append: add the given page to the balloon. */ ++static void balloon_append(struct page *page) ++{ ++ /* Lowmem is re-populated first, so highmem pages go at list tail. */ ++ if (PageHighMem(page)) { ++ list_add_tail(PAGE_TO_LIST(page), &ballooned_pages); ++ bs.balloon_high++; ++ } else { ++ list_add(PAGE_TO_LIST(page), &ballooned_pages); ++ bs.balloon_low++; ++ } ++} ++ ++/* balloon_retrieve: rescue a page from the balloon, if it is not empty. 
*/ ++static struct page *balloon_retrieve(void) ++{ ++ struct page *page; ++ ++ if (list_empty(&ballooned_pages)) ++ return NULL; ++ ++ page = LIST_TO_PAGE(ballooned_pages.next); ++ UNLIST_PAGE(page); ++ ++ if (PageHighMem(page)) ++ bs.balloon_high--; ++ else ++ bs.balloon_low--; ++ ++ return page; ++} ++ ++static struct page *balloon_first_page(void) ++{ ++ if (list_empty(&ballooned_pages)) ++ return NULL; ++ return LIST_TO_PAGE(ballooned_pages.next); ++} ++ ++static struct page *balloon_next_page(struct page *page) ++{ ++ struct list_head *next = PAGE_TO_LIST(page)->next; ++ if (next == &ballooned_pages) ++ return NULL; ++ return LIST_TO_PAGE(next); ++} ++ ++static void balloon_alarm(unsigned long unused) ++{ ++ schedule_work(&balloon_worker); ++} ++ ++static unsigned long current_target(void) ++{ ++ unsigned long target = min(bs.target_pages, bs.hard_limit); ++ if (target > (bs.current_pages + bs.balloon_low + bs.balloon_high)) ++ target = bs.current_pages + bs.balloon_low + bs.balloon_high; ++ return target; ++} ++ ++static int increase_reservation(unsigned long nr_pages) ++{ ++ unsigned long pfn, i, flags; ++ struct page *page; ++ long rc; ++ struct xen_memory_reservation reservation = { ++ .address_bits = 0, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ ++ if (nr_pages > ARRAY_SIZE(frame_list)) ++ nr_pages = ARRAY_SIZE(frame_list); ++ ++ balloon_lock(flags); ++ ++ page = balloon_first_page(); ++ for (i = 0; i < nr_pages; i++) { ++ BUG_ON(page == NULL); ++ frame_list[i] = page_to_pfn(page);; ++ page = balloon_next_page(page); ++ } ++ ++ set_xen_guest_handle(reservation.extent_start, frame_list); ++ reservation.nr_extents = nr_pages; ++ rc = HYPERVISOR_memory_op( ++ XENMEM_populate_physmap, &reservation); ++ if (rc < nr_pages) { ++ if (rc > 0) { ++ int ret; ++ ++ /* We hit the Xen hard limit: reprobe. */ ++ reservation.nr_extents = rc; ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, ++ &reservation); ++ BUG_ON(ret != rc); ++ } ++ if (rc >= 0) ++ bs.hard_limit = (bs.current_pages + rc - ++ bs.driver_pages); ++ goto out; ++ } ++ ++ for (i = 0; i < nr_pages; i++) { ++ page = balloon_retrieve(); ++ BUG_ON(page == NULL); ++ ++ pfn = page_to_pfn(page); ++ BUG_ON(!xen_feature(XENFEAT_auto_translated_physmap) && ++ phys_to_machine_mapping_valid(pfn)); ++ ++ set_phys_to_machine(pfn, frame_list[i]); ++ ++#ifdef CONFIG_XEN ++ /* Link back into the page tables if not highmem. */ ++ if (pfn < max_low_pfn) { ++ int ret; ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)__va(pfn << PAGE_SHIFT), ++ pfn_pte_ma(frame_list[i], PAGE_KERNEL), ++ 0); ++ BUG_ON(ret); ++ } ++#endif ++ ++ /* Relinquish the page back to the allocator. 
*/ ++ ClearPageReserved(page); ++ init_page_count(page); ++ __free_page(page); ++ } ++ ++ bs.current_pages += nr_pages; ++ totalram_pages = bs.current_pages; ++ ++ out: ++ balloon_unlock(flags); ++ ++ return 0; ++} ++ ++static int decrease_reservation(unsigned long nr_pages) ++{ ++ unsigned long pfn, i, flags; ++ struct page *page; ++ void *v; ++ int need_sleep = 0; ++ int ret; ++ struct xen_memory_reservation reservation = { ++ .address_bits = 0, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ ++ if (nr_pages > ARRAY_SIZE(frame_list)) ++ nr_pages = ARRAY_SIZE(frame_list); ++ ++ for (i = 0; i < nr_pages; i++) { ++ if ((page = alloc_page(GFP_BALLOON)) == NULL) { ++ nr_pages = i; ++ need_sleep = 1; ++ break; ++ } ++ ++ pfn = page_to_pfn(page); ++ frame_list[i] = pfn_to_mfn(pfn); ++ ++ if (!PageHighMem(page)) { ++ v = phys_to_virt(pfn << PAGE_SHIFT); ++ scrub_pages(v, 1); ++#ifdef CONFIG_XEN ++ ret = HYPERVISOR_update_va_mapping( ++ (unsigned long)v, __pte_ma(0), 0); ++ BUG_ON(ret); ++#endif ++ } ++#ifdef CONFIG_XEN_SCRUB_PAGES ++ else { ++ v = kmap(page); ++ scrub_pages(v, 1); ++ kunmap(page); ++ } ++#endif ++ } ++ ++#ifdef CONFIG_XEN ++ /* Ensure that ballooned highmem pages don't have kmaps. */ ++ kmap_flush_unused(); ++ flush_tlb_all(); ++#endif ++ ++ balloon_lock(flags); ++ ++ /* No more mappings: invalidate P2M and add to balloon. */ ++ for (i = 0; i < nr_pages; i++) { ++ pfn = mfn_to_pfn(frame_list[i]); ++ set_phys_to_machine(pfn, INVALID_P2M_ENTRY); ++ balloon_append(pfn_to_page(pfn)); ++ } ++ ++ set_xen_guest_handle(reservation.extent_start, frame_list); ++ reservation.nr_extents = nr_pages; ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(ret != nr_pages); ++ ++ bs.current_pages -= nr_pages; ++ totalram_pages = bs.current_pages; ++ ++ balloon_unlock(flags); ++ ++ return need_sleep; ++} ++ ++/* ++ * We avoid multiple worker processes conflicting via the balloon mutex. ++ * We may of course race updates of the target counts (which are protected ++ * by the balloon lock), or with changes to the Xen hard limit, but we will ++ * recover from these in time. ++ */ ++static void balloon_process(void *unused) ++{ ++ int need_sleep = 0; ++ long credit; ++ ++ mutex_lock(&balloon_mutex); ++ ++ do { ++ credit = current_target() - bs.current_pages; ++ if (credit > 0) ++ need_sleep = (increase_reservation(credit) != 0); ++ if (credit < 0) ++ need_sleep = (decrease_reservation(-credit) != 0); ++ ++#ifndef CONFIG_PREEMPT ++ if (need_resched()) ++ schedule(); ++#endif ++ } while ((credit != 0) && !need_sleep); ++ ++ /* Schedule more work if there is some still to be done. */ ++ if (current_target() != bs.current_pages) ++ mod_timer(&balloon_timer, jiffies + HZ); ++ ++ mutex_unlock(&balloon_mutex); ++} ++ ++/* Resets the Xen limit, sets new target, and kicks off processing. */ ++void balloon_set_new_target(unsigned long target) ++{ ++ /* No need for lock. Not read-modify-write updates. 
*/ ++ bs.hard_limit = ~0UL; ++ bs.target_pages = target; ++ schedule_work(&balloon_worker); ++} ++ ++static struct xenbus_watch target_watch = ++{ ++ .node = "memory/target" ++}; ++ ++/* React to a change in the target key */ ++static void watch_target(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ unsigned long long new_target; ++ int err; ++ ++ err = xenbus_scanf(XBT_NIL, "memory", "target", "%llu", &new_target); ++ if (err != 1) { ++ /* This is ok (for domain0 at least) - so just return */ ++ return; ++ } ++ ++ /* The given memory/target value is in KiB, so it needs converting to ++ * pages. PAGE_SHIFT converts bytes to pages, hence PAGE_SHIFT - 10. ++ */ ++ balloon_set_new_target(new_target >> (PAGE_SHIFT - 10)); ++} ++ ++static int balloon_init_watcher(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ int err; ++ ++ err = register_xenbus_watch(&target_watch); ++ if (err) ++ printk(KERN_ERR "Failed to set balloon watcher\n"); ++ ++ return NOTIFY_DONE; ++} ++ ++#ifdef CONFIG_PROC_FS ++static int balloon_write(struct file *file, const char __user *buffer, ++ unsigned long count, void *data) ++{ ++ char memstring[64], *endchar; ++ unsigned long long target_bytes; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (count <= 1) ++ return -EBADMSG; /* runt */ ++ if (count > sizeof(memstring)) ++ return -EFBIG; /* too long */ ++ ++ if (copy_from_user(memstring, buffer, count)) ++ return -EFAULT; ++ memstring[sizeof(memstring)-1] = '\0'; ++ ++ target_bytes = memparse(memstring, &endchar); ++ balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ ++ return count; ++} ++ ++static int balloon_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = sprintf( ++ page, ++ "Current allocation: %8lu kB\n" ++ "Requested target: %8lu kB\n" ++ "Low-mem balloon: %8lu kB\n" ++ "High-mem balloon: %8lu kB\n" ++ "Driver pages: %8lu kB\n" ++ "Xen hard limit: ", ++ PAGES2KB(bs.current_pages), PAGES2KB(bs.target_pages), ++ PAGES2KB(bs.balloon_low), PAGES2KB(bs.balloon_high), ++ PAGES2KB(bs.driver_pages)); ++ ++ if (bs.hard_limit != ~0UL) ++ len += sprintf(page + len, "%8lu kB\n", ++ PAGES2KB(bs.hard_limit)); ++ else ++ len += sprintf(page + len, " ??? kB\n"); ++ ++ *eof = 1; ++ return len; ++} ++#endif ++ ++static struct notifier_block xenstore_notifier; ++ ++static int __init balloon_init(void) ++{ ++#if defined(CONFIG_X86) && defined(CONFIG_XEN) ++ unsigned long pfn; ++ struct page *page; ++#endif ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ IPRINTK("Initialising balloon driver.\n"); ++ ++#ifdef CONFIG_XEN ++ bs.current_pages = min(xen_start_info->nr_pages, max_pfn); ++ totalram_pages = bs.current_pages; ++#else ++ bs.current_pages = totalram_pages; ++#endif ++ bs.target_pages = bs.current_pages; ++ bs.balloon_low = 0; ++ bs.balloon_high = 0; ++ bs.driver_pages = 0UL; ++ bs.hard_limit = ~0UL; ++ ++ init_timer(&balloon_timer); ++ balloon_timer.data = 0; ++ balloon_timer.function = balloon_alarm; ++ ++#ifdef CONFIG_PROC_FS ++ if ((balloon_pde = create_xen_proc_entry("balloon", 0644)) == NULL) { ++ WPRINTK("Unable to create /proc/xen/balloon.\n"); ++ return -1; ++ } ++ ++ balloon_pde->read_proc = balloon_read; ++ balloon_pde->write_proc = balloon_write; ++#endif ++ balloon_sysfs_init(); ++ ++#if defined(CONFIG_X86) && defined(CONFIG_XEN) ++ /* Initialise the balloon with excess memory space. 
*/ ++ for (pfn = xen_start_info->nr_pages; pfn < max_pfn; pfn++) { ++ page = pfn_to_page(pfn); ++ if (!PageReserved(page)) ++ balloon_append(page); ++ } ++#endif ++ ++ target_watch.callback = watch_target; ++ xenstore_notifier.notifier_call = balloon_init_watcher; ++ ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++ ++subsys_initcall(balloon_init); ++ ++static void balloon_exit(void) ++{ ++ /* XXX - release balloon here */ ++ return; ++} ++ ++module_exit(balloon_exit); ++ ++void balloon_update_driver_allowance(long delta) ++{ ++ unsigned long flags; ++ ++ balloon_lock(flags); ++ bs.driver_pages += delta; ++ balloon_unlock(flags); ++} ++ ++#ifdef CONFIG_XEN ++static int dealloc_pte_fn( ++ pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) ++{ ++ unsigned long mfn = pte_mfn(*pte); ++ int ret; ++ struct xen_memory_reservation reservation = { ++ .nr_extents = 1, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ set_xen_guest_handle(reservation.extent_start, &mfn); ++ set_pte_at(&init_mm, addr, pte, __pte_ma(0)); ++ set_phys_to_machine(__pa(addr) >> PAGE_SHIFT, INVALID_P2M_ENTRY); ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, &reservation); ++ BUG_ON(ret != 1); ++ return 0; ++} ++#endif ++ ++struct page **alloc_empty_pages_and_pagevec(int nr_pages) ++{ ++ unsigned long vaddr, flags; ++ struct page *page, **pagevec; ++ int i, ret; ++ ++ pagevec = kmalloc(sizeof(page) * nr_pages, GFP_KERNEL); ++ if (pagevec == NULL) ++ return NULL; ++ ++ for (i = 0; i < nr_pages; i++) { ++ page = pagevec[i] = alloc_page(GFP_KERNEL); ++ if (page == NULL) ++ goto err; ++ ++ vaddr = (unsigned long)page_address(page); ++ ++ scrub_pages(vaddr, 1); ++ ++ balloon_lock(flags); ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) { ++ unsigned long gmfn = page_to_pfn(page); ++ struct xen_memory_reservation reservation = { ++ .nr_extents = 1, ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ set_xen_guest_handle(reservation.extent_start, &gmfn); ++ ret = HYPERVISOR_memory_op(XENMEM_decrease_reservation, ++ &reservation); ++ if (ret == 1) ++ ret = 0; /* success */ ++ } else { ++#ifdef CONFIG_XEN ++ ret = apply_to_page_range(&init_mm, vaddr, PAGE_SIZE, ++ dealloc_pte_fn, NULL); ++#else ++ /* Cannot handle non-auto translate mode. 
*/ ++ ret = 1; ++#endif ++ } ++ ++ if (ret != 0) { ++ balloon_unlock(flags); ++ __free_page(page); ++ goto err; ++ } ++ ++ totalram_pages = --bs.current_pages; ++ ++ balloon_unlock(flags); ++ } ++ ++ out: ++ schedule_work(&balloon_worker); ++#ifdef CONFIG_XEN ++ flush_tlb_all(); ++#endif ++ return pagevec; ++ ++ err: ++ balloon_lock(flags); ++ while (--i >= 0) ++ balloon_append(pagevec[i]); ++ balloon_unlock(flags); ++ kfree(pagevec); ++ pagevec = NULL; ++ goto out; ++} ++ ++void free_empty_pages_and_pagevec(struct page **pagevec, int nr_pages) ++{ ++ unsigned long flags; ++ int i; ++ ++ if (pagevec == NULL) ++ return; ++ ++ balloon_lock(flags); ++ for (i = 0; i < nr_pages; i++) { ++ BUG_ON(page_count(pagevec[i]) != 1); ++ balloon_append(pagevec[i]); ++ } ++ balloon_unlock(flags); ++ ++ kfree(pagevec); ++ ++ schedule_work(&balloon_worker); ++} ++ ++void balloon_release_driver_page(struct page *page) ++{ ++ unsigned long flags; ++ ++ balloon_lock(flags); ++ balloon_append(page); ++ bs.driver_pages--; ++ balloon_unlock(flags); ++ ++ schedule_work(&balloon_worker); ++} ++ ++EXPORT_SYMBOL_GPL(balloon_update_driver_allowance); ++EXPORT_SYMBOL_GPL(alloc_empty_pages_and_pagevec); ++EXPORT_SYMBOL_GPL(free_empty_pages_and_pagevec); ++EXPORT_SYMBOL_GPL(balloon_release_driver_page); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/balloon/common.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,58 @@ ++/****************************************************************************** ++ * balloon/common.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __XEN_BALLOON_COMMON_H__ ++#define __XEN_BALLOON_COMMON_H__ ++ ++#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) ++ ++struct balloon_stats { ++ /* We aim for 'current allocation' == 'target allocation'. */ ++ unsigned long current_pages; ++ unsigned long target_pages; ++ /* We may hit the hard limit in Xen. If we do then we remember it. */ ++ unsigned long hard_limit; ++ /* ++ * Drivers may alter the memory reservation independently, but they ++ * must inform the balloon driver so we avoid hitting the hard limit. 
++ */ ++ unsigned long driver_pages; ++ /* Number of pages in high- and low-memory balloons. */ ++ unsigned long balloon_low; ++ unsigned long balloon_high; ++}; ++ ++extern struct balloon_stats balloon_stats; ++#define bs balloon_stats ++ ++int balloon_sysfs_init(void); ++void balloon_sysfs_exit(void); ++ ++void balloon_set_new_target(unsigned long target); ++ ++#endif /* __XEN_BALLOON_COMMON_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/balloon/sysfs.c 2007-08-27 14:01:59.000000000 -0400 +@@ -0,0 +1,170 @@ ++/****************************************************************************** ++ * balloon/sysfs.c ++ * ++ * Xen balloon driver - sysfs interfaces. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/capability.h> ++#include <linux/errno.h> ++#include <linux/stat.h> ++#include <linux/string.h> ++#include <linux/sysdev.h> ++#include "common.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++#define BALLOON_CLASS_NAME "memory" ++ ++#define BALLOON_SHOW(name, format, args...) \ ++ static ssize_t show_##name(struct sys_device *dev, \ ++ char *buf) \ ++ { \ ++ return sprintf(buf, format, ##args); \ ++ } \ ++ static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) ++ ++BALLOON_SHOW(current_kb, "%lu\n", PAGES2KB(bs.current_pages)); ++BALLOON_SHOW(low_kb, "%lu\n", PAGES2KB(bs.balloon_low)); ++BALLOON_SHOW(high_kb, "%lu\n", PAGES2KB(bs.balloon_high)); ++BALLOON_SHOW(hard_limit_kb, ++ (bs.hard_limit!=~0UL) ? "%lu\n" : "???\n", ++ (bs.hard_limit!=~0UL) ? 
PAGES2KB(bs.hard_limit) : 0); ++BALLOON_SHOW(driver_kb, "%lu\n", PAGES2KB(bs.driver_pages)); ++ ++static ssize_t show_target_kb(struct sys_device *dev, char *buf) ++{ ++ return sprintf(buf, "%lu\n", PAGES2KB(bs.target_pages)); ++} ++ ++static ssize_t store_target_kb(struct sys_device *dev, ++ const char *buf, ++ size_t count) ++{ ++ char memstring[64], *endchar; ++ unsigned long long target_bytes; ++ ++ if (!capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ if (count <= 1) ++ return -EBADMSG; /* runt */ ++ if (count > sizeof(memstring)) ++ return -EFBIG; /* too long */ ++ strcpy(memstring, buf); ++ ++ target_bytes = memparse(memstring, &endchar); ++ balloon_set_new_target(target_bytes >> PAGE_SHIFT); ++ ++ return count; ++} ++ ++static SYSDEV_ATTR(target_kb, S_IRUGO | S_IWUSR, ++ show_target_kb, store_target_kb); ++ ++static struct sysdev_attribute *balloon_attrs[] = { ++ &attr_target_kb, ++}; ++ ++static struct attribute *balloon_info_attrs[] = { ++ &attr_current_kb.attr, ++ &attr_low_kb.attr, ++ &attr_high_kb.attr, ++ &attr_hard_limit_kb.attr, ++ &attr_driver_kb.attr, ++ NULL ++}; ++ ++static struct attribute_group balloon_info_group = { ++ .name = "info", ++ .attrs = balloon_info_attrs, ++}; ++ ++static struct sysdev_class balloon_sysdev_class = { ++ set_kset_name(BALLOON_CLASS_NAME), ++}; ++ ++static struct sys_device balloon_sysdev; ++ ++static int register_balloon(struct sys_device *sysdev) ++{ ++ int i, error; ++ ++ error = sysdev_class_register(&balloon_sysdev_class); ++ if (error) ++ return error; ++ ++ sysdev->id = 0; ++ sysdev->cls = &balloon_sysdev_class; ++ ++ error = sysdev_register(sysdev); ++ if (error) { ++ sysdev_class_unregister(&balloon_sysdev_class); ++ return error; ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) { ++ error = sysdev_create_file(sysdev, balloon_attrs[i]); ++ if (error) ++ goto fail; ++ } ++ ++ error = sysfs_create_group(&sysdev->kobj, &balloon_info_group); ++ if (error) ++ goto fail; ++ ++ return 0; ++ ++ fail: ++ while (--i >= 0) ++ sysdev_remove_file(sysdev, balloon_attrs[i]); ++ sysdev_unregister(sysdev); ++ sysdev_class_unregister(&balloon_sysdev_class); ++ return error; ++} ++ ++static void unregister_balloon(struct sys_device *sysdev) ++{ ++ int i; ++ ++ sysfs_remove_group(&sysdev->kobj, &balloon_info_group); ++ for (i = 0; i < ARRAY_SIZE(balloon_attrs); i++) ++ sysdev_remove_file(sysdev, balloon_attrs[i]); ++ sysdev_unregister(sysdev); ++ sysdev_class_unregister(&balloon_sysdev_class); ++} ++ ++int balloon_sysfs_init(void) ++{ ++ return register_balloon(&balloon_sysdev); ++} ++ ++void balloon_sysfs_exit(void) ++{ ++ unregister_balloon(&balloon_sysdev); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkback/Makefile 2007-08-27 14:01:47.000000000 -0400 +@@ -0,0 +1,3 @@ ++obj-$(CONFIG_XEN_BLKDEV_BACKEND) := blkbk.o ++ ++blkbk-y := blkback.o xenbus.o interface.o vbd.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkback/blkback.c 2007-08-27 14:02:10.000000000 -0400 +@@ -0,0 +1,614 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/main.c ++ * ++ * Back-end of the driver for virtual block devices. This portion of the ++ * driver exports a 'unified' block-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
A ++ * reference front-end implementation can be found in: ++ * arch/xen/drivers/blkif/frontend ++ * ++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand ++ * Copyright (c) 2005, Christopher Clark ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/kthread.h> ++#include <linux/list.h> ++#include <xen/balloon.h> ++#include <asm/hypervisor.h> ++#include "common.h" ++ ++/* ++ * These are rather arbitrary. They are fairly large because adjacent requests ++ * pulled from a communication ring are quite likely to end up being part of ++ * the same scatter/gather request at the disc. ++ * ++ * ** TRY INCREASING 'blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW ** ++ * ++ * This will increase the chances of being able to write whole tracks. ++ * 64 should be enough to keep us competitive with Linux. ++ */ ++static int blkif_reqs = 64; ++module_param_named(reqs, blkif_reqs, int, 0); ++MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate"); ++ ++/* Run-time switchable: /sys/module/blkback/parameters/ */ ++static unsigned int log_stats = 0; ++static unsigned int debug_lvl = 0; ++module_param(log_stats, int, 0644); ++module_param(debug_lvl, int, 0644); ++ ++/* ++ * Each outstanding request that we've passed to the lower device layers has a ++ * 'pending_req' allocated to it. Each buffer_head that completes decrements ++ * the pendcnt towards zero. When it hits zero, the specified domain has a ++ * response queued for it, with the saved 'id' passed back. 
++ */ ++typedef struct { ++ blkif_t *blkif; ++ u64 id; ++ int nr_pages; ++ atomic_t pendcnt; ++ unsigned short operation; ++ int status; ++ struct list_head free_list; ++} pending_req_t; ++ ++static pending_req_t *pending_reqs; ++static struct list_head pending_free; ++static DEFINE_SPINLOCK(pending_free_lock); ++static DECLARE_WAIT_QUEUE_HEAD(pending_free_wq); ++ ++#define BLKBACK_INVALID_HANDLE (~0) ++ ++static struct page **pending_pages; ++static grant_handle_t *pending_grant_handles; ++ ++static inline int vaddr_pagenr(pending_req_t *req, int seg) ++{ ++ return (req - pending_reqs) * BLKIF_MAX_SEGMENTS_PER_REQUEST + seg; ++} ++ ++static inline unsigned long vaddr(pending_req_t *req, int seg) ++{ ++ unsigned long pfn = page_to_pfn(pending_pages[vaddr_pagenr(req, seg)]); ++ return (unsigned long)pfn_to_kaddr(pfn); ++} ++ ++#define pending_handle(_req, _seg) \ ++ (pending_grant_handles[vaddr_pagenr(_req, _seg)]) ++ ++ ++static int do_block_io_op(blkif_t *blkif); ++static void dispatch_rw_block_io(blkif_t *blkif, ++ blkif_request_t *req, ++ pending_req_t *pending_req); ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st); ++ ++/****************************************************************** ++ * misc small helpers ++ */ ++static pending_req_t* alloc_req(void) ++{ ++ pending_req_t *req = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ if (!list_empty(&pending_free)) { ++ req = list_entry(pending_free.next, pending_req_t, free_list); ++ list_del(&req->free_list); ++ } ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ return req; ++} ++ ++static void free_req(pending_req_t *req) ++{ ++ unsigned long flags; ++ int was_empty; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ was_empty = list_empty(&pending_free); ++ list_add(&req->free_list, &pending_free); ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ if (was_empty) ++ wake_up(&pending_free_wq); ++} ++ ++static void unplug_queue(blkif_t *blkif) ++{ ++ if (blkif->plug == NULL) ++ return; ++ if (blkif->plug->unplug_fn) ++ blkif->plug->unplug_fn(blkif->plug); ++ blk_put_queue(blkif->plug); ++ blkif->plug = NULL; ++} ++ ++static void plug_queue(blkif_t *blkif, struct bio *bio) ++{ ++ request_queue_t *q = bdev_get_queue(bio->bi_bdev); ++ ++ if (q == blkif->plug) ++ return; ++ unplug_queue(blkif); ++ blk_get_queue(q); ++ blkif->plug = q; ++} ++ ++static void fast_flush_area(pending_req_t *req) ++{ ++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int i, invcount = 0; ++ grant_handle_t handle; ++ int ret; ++ ++ for (i = 0; i < req->nr_pages; i++) { ++ handle = pending_handle(req, i); ++ if (handle == BLKBACK_INVALID_HANDLE) ++ continue; ++ gnttab_set_unmap_op(&unmap[i], vaddr(req, i), GNTMAP_host_map, ++ handle); ++ pending_handle(req, i) = BLKBACK_INVALID_HANDLE; ++ invcount++; ++ } ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, unmap, invcount); ++ BUG_ON(ret); ++} ++ ++/****************************************************************** ++ * SCHEDULER FUNCTIONS ++ */ ++ ++static void print_stats(blkif_t *blkif) ++{ ++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d | br %4d\n", ++ current->comm, blkif->st_oo_req, ++ blkif->st_rd_req, blkif->st_wr_req, blkif->st_br_req); ++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ++ blkif->st_rd_req = 0; ++ blkif->st_wr_req = 0; ++ blkif->st_oo_req = 0; ++} ++ ++int blkif_schedule(void *arg) ++{ ++ blkif_t *blkif = arg; ++ ++ blkif_get(blkif); ++ ++ if 
(debug_lvl) ++ printk(KERN_DEBUG "%s: started\n", current->comm); ++ ++ while (!kthread_should_stop()) { ++ wait_event_interruptible( ++ blkif->wq, ++ blkif->waiting_reqs || kthread_should_stop()); ++ wait_event_interruptible( ++ pending_free_wq, ++ !list_empty(&pending_free) || kthread_should_stop()); ++ ++ blkif->waiting_reqs = 0; ++ smp_mb(); /* clear flag *before* checking for work */ ++ ++ if (do_block_io_op(blkif)) ++ blkif->waiting_reqs = 1; ++ unplug_queue(blkif); ++ ++ if (log_stats && time_after(jiffies, blkif->st_print)) ++ print_stats(blkif); ++ } ++ ++ if (log_stats) ++ print_stats(blkif); ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: exiting\n", current->comm); ++ ++ blkif->xenblkd = NULL; ++ blkif_put(blkif); ++ ++ return 0; ++} ++ ++/****************************************************************** ++ * COMPLETION CALLBACK -- Called as bh->b_end_io() ++ */ ++ ++static void __end_block_io_op(pending_req_t *pending_req, int error) ++{ ++ /* An error fails the entire request. */ ++ if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && ++ (error == -EOPNOTSUPP)) { ++ DPRINTK("blkback: write barrier op failed, not supported\n"); ++ blkback_barrier(XBT_NIL, pending_req->blkif->be, 0); ++ pending_req->status = BLKIF_RSP_EOPNOTSUPP; ++ } else if (error) { ++ DPRINTK("Buffer not up-to-date at end of operation, " ++ "error=%d\n", error); ++ pending_req->status = BLKIF_RSP_ERROR; ++ } ++ ++ if (atomic_dec_and_test(&pending_req->pendcnt)) { ++ fast_flush_area(pending_req); ++ make_response(pending_req->blkif, pending_req->id, ++ pending_req->operation, pending_req->status); ++ blkif_put(pending_req->blkif); ++ free_req(pending_req); ++ } ++} ++ ++static int end_block_io_op(struct bio *bio, unsigned int done, int error) ++{ ++ if (bio->bi_size != 0) ++ return 1; ++ __end_block_io_op(bio->bi_private, error); ++ bio_put(bio); ++ return error; ++} ++ ++ ++/****************************************************************************** ++ * NOTIFICATION FROM GUEST OS. ++ */ ++ ++static void blkif_notify_work(blkif_t *blkif) ++{ ++ blkif->waiting_reqs = 1; ++ wake_up(&blkif->wq); ++} ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ blkif_notify_work(dev_id); ++ return IRQ_HANDLED; ++} ++ ++ ++ ++/****************************************************************** ++ * DOWNWARD CALLS -- These interface with the block-device layer proper. ++ */ ++ ++static int do_block_io_op(blkif_t *blkif) ++{ ++ blkif_back_rings_t *blk_rings = &blkif->blk_rings; ++ blkif_request_t req; ++ pending_req_t *pending_req; ++ RING_IDX rc, rp; ++ int more_to_do = 0; ++ ++ rc = blk_rings->common.req_cons; ++ rp = blk_rings->common.sring->req_prod; ++ rmb(); /* Ensure we see queued requests up to 'rp'. 
*/ ++ ++ while ((rc != rp)) { ++ ++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) ++ break; ++ ++ pending_req = alloc_req(); ++ if (NULL == pending_req) { ++ blkif->st_oo_req++; ++ more_to_do = 1; ++ break; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.req_cons = ++rc; /* before make_response() */ ++ ++ switch (req.operation) { ++ case BLKIF_OP_READ: ++ blkif->st_rd_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ blkif->st_br_req++; ++ /* fall through */ ++ case BLKIF_OP_WRITE: ++ blkif->st_wr_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ default: ++ DPRINTK("error: unknown block io operation [%d]\n", ++ req.operation); ++ make_response(blkif, req.id, req.operation, ++ BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ break; ++ } ++ } ++ return more_to_do; ++} ++ ++static void dispatch_rw_block_io(blkif_t *blkif, ++ blkif_request_t *req, ++ pending_req_t *pending_req) ++{ ++ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); ++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ struct phys_req preq; ++ struct { ++ unsigned long buf; unsigned int nsec; ++ } seg[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ unsigned int nseg; ++ struct bio *bio = NULL, *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++ int ret, i, nbio = 0; ++ int operation; ++ ++ switch (req->operation) { ++ case BLKIF_OP_READ: ++ operation = READ; ++ break; ++ case BLKIF_OP_WRITE: ++ operation = WRITE; ++ break; ++ case BLKIF_OP_WRITE_BARRIER: ++ operation = WRITE_BARRIER; ++ break; ++ default: ++ operation = 0; /* make gcc happy */ ++ BUG(); ++ } ++ ++ /* Check that number of segments is sane. 
*/ ++ nseg = req->nr_segments; ++ if (unlikely(nseg == 0) || ++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) { ++ DPRINTK("Bad number of segments in request (%d)\n", nseg); ++ goto fail_response; ++ } ++ ++ preq.dev = req->handle; ++ preq.sector_number = req->sector_number; ++ preq.nr_sects = 0; ++ ++ pending_req->blkif = blkif; ++ pending_req->id = req->id; ++ pending_req->operation = req->operation; ++ pending_req->status = BLKIF_RSP_OKAY; ++ pending_req->nr_pages = nseg; ++ ++ for (i = 0; i < nseg; i++) { ++ uint32_t flags; ++ ++ seg[i].nsec = req->seg[i].last_sect - ++ req->seg[i].first_sect + 1; ++ ++ if ((req->seg[i].last_sect >= (PAGE_SIZE >> 9)) || ++ (req->seg[i].last_sect < req->seg[i].first_sect)) ++ goto fail_response; ++ preq.nr_sects += seg[i].nsec; ++ ++ flags = GNTMAP_host_map; ++ if (operation != READ) ++ flags |= GNTMAP_readonly; ++ gnttab_set_map_op(&map[i], vaddr(pending_req, i), flags, ++ req->seg[i].gref, blkif->domid); ++ } ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, nseg); ++ BUG_ON(ret); ++ ++ for (i = 0; i < nseg; i++) { ++ if (unlikely(map[i].status != 0)) { ++ DPRINTK("invalid buffer -- could not remap it\n"); ++ map[i].handle = BLKBACK_INVALID_HANDLE; ++ ret |= 1; ++ } ++ ++ pending_handle(pending_req, i) = map[i].handle; ++ ++ if (ret) ++ continue; ++ ++ set_phys_to_machine(__pa(vaddr( ++ pending_req, i)) >> PAGE_SHIFT, ++ FOREIGN_FRAME(map[i].dev_bus_addr >> PAGE_SHIFT)); ++ seg[i].buf = map[i].dev_bus_addr | ++ (req->seg[i].first_sect << 9); ++ } ++ ++ if (ret) ++ goto fail_flush; ++ ++ if (vbd_translate(&preq, blkif, operation) != 0) { ++ DPRINTK("access denied: %s of [%llu,%llu] on dev=%04x\n", ++ operation == READ ? "read" : "write", ++ preq.sector_number, ++ preq.sector_number + preq.nr_sects, preq.dev); ++ goto fail_flush; ++ } ++ ++ for (i = 0; i < nseg; i++) { ++ if (((int)preq.sector_number|(int)seg[i].nsec) & ++ ((bdev_hardsect_size(preq.bdev) >> 9) - 1)) { ++ DPRINTK("Misaligned I/O request from domain %d", ++ blkif->domid); ++ goto fail_put_bio; ++ } ++ ++ while ((bio == NULL) || ++ (bio_add_page(bio, ++ virt_to_page(vaddr(pending_req, i)), ++ seg[i].nsec << 9, ++ seg[i].buf & ~PAGE_MASK) == 0)) { ++ bio = biolist[nbio++] = bio_alloc(GFP_KERNEL, nseg-i); ++ if (unlikely(bio == NULL)) ++ goto fail_put_bio; ++ ++ bio->bi_bdev = preq.bdev; ++ bio->bi_private = pending_req; ++ bio->bi_end_io = end_block_io_op; ++ bio->bi_sector = preq.sector_number; ++ } ++ ++ preq.sector_number += seg[i].nsec; ++ } ++ ++ plug_queue(blkif, bio); ++ atomic_set(&pending_req->pendcnt, nbio); ++ blkif_get(blkif); ++ ++ for (i = 0; i < nbio; i++) ++ submit_bio(operation, biolist[i]); ++ ++ if (operation == READ) ++ blkif->st_rd_sect += preq.nr_sects; ++ else if (operation == WRITE) ++ blkif->st_wr_sect += preq.nr_sects; ++ ++ return; ++ ++ fail_put_bio: ++ for (i = 0; i < (nbio-1); i++) ++ bio_put(biolist[i]); ++ fail_flush: ++ fast_flush_area(pending_req); ++ fail_response: ++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); ++ free_req(pending_req); ++} ++ ++ ++ ++/****************************************************************** ++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING ++ */ ++ ++ ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st) ++{ ++ blkif_response_t resp; ++ unsigned long flags; ++ blkif_back_rings_t *blk_rings = &blkif->blk_rings; ++ int more_to_do = 0; ++ int notify; ++ ++ resp.id = id; ++ resp.operation = op; ++ resp.status = st; ++ ++ spin_lock_irqsave(&blkif->blk_ring_lock, 
flags); ++ /* Place on the response ring for the relevant domain. */ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, blk_rings->x86_32.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, blk_rings->x86_64.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.rsp_prod_pvt++; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); ++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { ++ /* ++ * Tail check for pending requests. Allows frontend to avoid ++ * notifications if requests are already in flight (lower ++ * overheads and promotes batching). ++ */ ++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); ++ ++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { ++ more_to_do = 1; ++ } ++ ++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); ++ ++ if (more_to_do) ++ blkif_notify_work(blkif); ++ if (notify) ++ notify_remote_via_irq(blkif->irq); ++} ++ ++static int __init blkif_init(void) ++{ ++ int i, mmap_pages; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ mmap_pages = blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST; ++ ++ pending_reqs = kmalloc(sizeof(pending_reqs[0]) * ++ blkif_reqs, GFP_KERNEL); ++ pending_grant_handles = kmalloc(sizeof(pending_grant_handles[0]) * ++ mmap_pages, GFP_KERNEL); ++ pending_pages = alloc_empty_pages_and_pagevec(mmap_pages); ++ ++ if (!pending_reqs || !pending_grant_handles || !pending_pages) ++ goto out_of_memory; ++ ++ for (i = 0; i < mmap_pages; i++) ++ pending_grant_handles[i] = BLKBACK_INVALID_HANDLE; ++ ++ blkif_interface_init(); ++ ++ memset(pending_reqs, 0, sizeof(pending_reqs)); ++ INIT_LIST_HEAD(&pending_free); ++ ++ for (i = 0; i < blkif_reqs; i++) ++ list_add_tail(&pending_reqs[i].free_list, &pending_free); ++ ++ blkif_xenbus_init(); ++ ++ return 0; ++ ++ out_of_memory: ++ kfree(pending_reqs); ++ kfree(pending_grant_handles); ++ free_empty_pages_and_pagevec(pending_pages, mmap_pages); ++ printk("%s: out of memory\n", __FUNCTION__); ++ return -ENOMEM; ++} ++ ++module_init(blkif_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkback/common.h 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,139 @@ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __BLKIF__BACKEND__COMMON_H__ ++#define __BLKIF__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <linux/blkdev.h> ++#include <linux/vmalloc.h> ++#include <linux/wait.h> ++#include <asm/io.h> ++#include <asm/setup.h> ++#include <asm/pgalloc.h> ++#include <xen/evtchn.h> ++#include <asm/hypervisor.h> ++#include <xen/blkif.h> ++#include <xen/gnttab.h> ++#include <xen/driver_util.h> ++#include <xen/xenbus.h> ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++ ++struct vbd { ++ blkif_vdev_t handle; /* what the domain refers to this vbd as */ ++ unsigned char readonly; /* Non-zero -> read-only */ ++ unsigned char type; /* VDISK_xxx */ ++ u32 pdevice; /* phys device that this vbd maps to */ ++ struct block_device *bdev; ++}; ++ ++struct backend_info; ++ ++typedef struct blkif_st { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ /* Physical parameters of the comms window. */ ++ unsigned int irq; ++ /* Comms information. */ ++ enum blkif_protocol blk_protocol; ++ blkif_back_rings_t blk_rings; ++ struct vm_struct *blk_ring_area; ++ /* The VBD attached to this interface. */ ++ struct vbd vbd; ++ /* Back pointer to the backend_info. */ ++ struct backend_info *be; ++ /* Private fields. */ ++ spinlock_t blk_ring_lock; ++ atomic_t refcnt; ++ ++ wait_queue_head_t wq; ++ struct task_struct *xenblkd; ++ unsigned int waiting_reqs; ++ request_queue_t *plug; ++ ++ /* statistics */ ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_br_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ ++ wait_queue_head_t waiting_to_free; ++ ++ grant_handle_t shmem_handle; ++ grant_ref_t shmem_ref; ++} blkif_t; ++ ++blkif_t *blkif_alloc(domid_t domid); ++void blkif_disconnect(blkif_t *blkif); ++void blkif_free(blkif_t *blkif); ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn); ++ ++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define blkif_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ wake_up(&(_b)->waiting_to_free);\ ++ } while (0) ++ ++/* Create a vbd. 
*/ ++int vbd_create(blkif_t *blkif, blkif_vdev_t vdevice, unsigned major, ++ unsigned minor, int readonly); ++void vbd_free(struct vbd *vbd); ++ ++unsigned long long vbd_size(struct vbd *vbd); ++unsigned int vbd_info(struct vbd *vbd); ++unsigned long vbd_secsize(struct vbd *vbd); ++ ++struct phys_req { ++ unsigned short dev; ++ unsigned short nr_sects; ++ struct block_device *bdev; ++ blkif_sector_t sector_number; ++}; ++ ++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation); ++ ++void blkif_interface_init(void); ++ ++void blkif_xenbus_init(void); ++ ++irqreturn_t blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); ++int blkif_schedule(void *arg); ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state); ++ ++#endif /* __BLKIF__BACKEND__COMMON_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkback/interface.c 2007-08-27 14:02:01.000000000 -0400 +@@ -0,0 +1,181 @@ ++/****************************************************************************** ++ * arch/xen/drivers/blkif/backend/interface.c ++ * ++ * Block-device interface management. ++ * ++ * Copyright (c) 2004, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include "common.h" ++#include <xen/evtchn.h> ++#include <linux/kthread.h> ++ ++static kmem_cache_t *blkif_cachep; ++ ++blkif_t *blkif_alloc(domid_t domid) ++{ ++ blkif_t *blkif; ++ ++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); ++ if (!blkif) ++ return ERR_PTR(-ENOMEM); ++ ++ memset(blkif, 0, sizeof(*blkif)); ++ blkif->domid = domid; ++ spin_lock_init(&blkif->blk_ring_lock); ++ atomic_set(&blkif->refcnt, 1); ++ init_waitqueue_head(&blkif->wq); ++ blkif->st_print = jiffies; ++ init_waitqueue_head(&blkif->waiting_to_free); ++ ++ return blkif; ++} ++ ++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, shared_page, blkif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return op.status; ++ } ++ ++ blkif->shmem_ref = shared_page; ++ blkif->shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_page(blkif_t *blkif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, blkif->shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int blkif_map(blkif_t *blkif, unsigned long shared_page, unsigned int evtchn) ++{ ++ int err; ++ ++ /* Already connected through? */ ++ if (blkif->irq) ++ return 0; ++ ++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) ++ return -ENOMEM; ++ ++ err = map_frontend_page(blkif, shared_page); ++ if (err) { ++ free_vm_area(blkif->blk_ring_area); ++ return err; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ { ++ blkif_sring_t *sring; ++ sring = (blkif_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_32: ++ { ++ blkif_x86_32_sring_t *sring_x86_32; ++ sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_64: ++ { ++ blkif_x86_64_sring_t *sring_x86_64; ++ sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ blkif->domid, evtchn, blkif_be_int, 0, "blkif-backend", blkif); ++ if (err < 0) ++ { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ return err; ++ } ++ blkif->irq = err; ++ ++ return 0; ++} ++ ++void blkif_disconnect(blkif_t *blkif) ++{ ++ if (blkif->xenblkd) { ++ kthread_stop(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ } ++ ++ atomic_dec(&blkif->refcnt); ++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); ++ atomic_inc(&blkif->refcnt); ++ ++ if (blkif->irq) { ++ unbind_from_irqhandler(blkif->irq, blkif); ++ blkif->irq = 0; ++ } ++ ++ if (blkif->blk_rings.common.sring) { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ } ++} ++ ++void blkif_free(blkif_t *blkif) ++{ ++ if (!atomic_dec_and_test(&blkif->refcnt)) ++ BUG(); ++ kmem_cache_free(blkif_cachep, blkif); ++} ++ ++void __init blkif_interface_init(void) ++{ ++ blkif_cachep = 
kmem_cache_create("blkif_cache", sizeof(blkif_t), ++ 0, 0, NULL, NULL); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkback/vbd.c 2007-08-27 14:01:47.000000000 -0400 +@@ -0,0 +1,118 @@ ++/****************************************************************************** ++ * blkback/vbd.c ++ * ++ * Routines for managing virtual block devices (VBDs). ++ * ++ * Copyright (c) 2003-2005, Keir Fraser & Steve Hand ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++ ++#define vbd_sz(_v) ((_v)->bdev->bd_part ? \ ++ (_v)->bdev->bd_part->nr_sects : (_v)->bdev->bd_disk->capacity) ++ ++unsigned long long vbd_size(struct vbd *vbd) ++{ ++ return vbd_sz(vbd); ++} ++ ++unsigned int vbd_info(struct vbd *vbd) ++{ ++ return vbd->type | (vbd->readonly?VDISK_READONLY:0); ++} ++ ++unsigned long vbd_secsize(struct vbd *vbd) ++{ ++ return bdev_hardsect_size(vbd->bdev); ++} ++ ++int vbd_create(blkif_t *blkif, blkif_vdev_t handle, unsigned major, ++ unsigned minor, int readonly) ++{ ++ struct vbd *vbd; ++ struct block_device *bdev; ++ ++ vbd = &blkif->vbd; ++ vbd->handle = handle; ++ vbd->readonly = readonly; ++ vbd->type = 0; ++ ++ vbd->pdevice = MKDEV(major, minor); ++ ++ bdev = open_by_devnum(vbd->pdevice, ++ vbd->readonly ? 
FMODE_READ : FMODE_WRITE);
++
++	if (IS_ERR(bdev)) {
++		DPRINTK("vbd_create: device %08x could not be opened.\n",
++			vbd->pdevice);
++		return -ENOENT;
++	}
++
++	vbd->bdev = bdev;
++
++	if (vbd->bdev->bd_disk == NULL) {
++		DPRINTK("vbd_create: device %08x doesn't exist.\n",
++			vbd->pdevice);
++		vbd_free(vbd);
++		return -ENOENT;
++	}
++
++	if (vbd->bdev->bd_disk->flags & GENHD_FL_CD)
++		vbd->type |= VDISK_CDROM;
++	if (vbd->bdev->bd_disk->flags & GENHD_FL_REMOVABLE)
++		vbd->type |= VDISK_REMOVABLE;
++
++	DPRINTK("Successful creation of handle=%04x (dom=%u)\n",
++		handle, blkif->domid);
++	return 0;
++}
++
++void vbd_free(struct vbd *vbd)
++{
++	if (vbd->bdev)
++		blkdev_put(vbd->bdev);
++	vbd->bdev = NULL;
++}
++
++int vbd_translate(struct phys_req *req, blkif_t *blkif, int operation)
++{
++	struct vbd *vbd = &blkif->vbd;
++	int rc = -EACCES;
++
++	if ((operation != READ) && vbd->readonly)
++		goto out;
++
++	if (unlikely((req->sector_number + req->nr_sects) > vbd_sz(vbd)))
++		goto out;
++
++	req->dev  = vbd->pdevice;
++	req->bdev = vbd->bdev;
++	rc = 0;
++
++ out:
++	return rc;
++}
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/blkback/xenbus.c	2007-08-27 14:02:09.000000000 -0400
+@@ -0,0 +1,533 @@
++/*  Xenbus code for blkif backend
++    Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au>
++    Copyright (C) 2005 XenSource Ltd
++
++    This program is free software; you can redistribute it and/or modify
++    it under the terms of the GNU General Public License as published by
++    the Free Software Foundation; either version 2 of the License, or
++    (at your option) any later version.
++
++    This program is distributed in the hope that it will be useful,
++    but WITHOUT ANY WARRANTY; without even the implied warranty of
++    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
++    GNU General Public License for more details.
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++
++#include <stdarg.h>
++#include <linux/module.h>
++#include <linux/kthread.h>
++#include "common.h"
++
++#undef DPRINTK
++#define DPRINTK(fmt, args...)				\
++	pr_debug("blkback/xenbus (%s:%d) " fmt ".\n",	\
++		 __FUNCTION__, __LINE__, ##args)
++
++struct backend_info
++{
++	struct xenbus_device *dev;
++	blkif_t *blkif;
++	struct xenbus_watch backend_watch;
++	unsigned major;
++	unsigned minor;
++	char *mode;
++};
++
++static void connect(struct backend_info *);
++static int connect_ring(struct backend_info *);
++static void backend_changed(struct xenbus_watch *, const char **,
++			    unsigned int);
++
++static int blkback_name(blkif_t *blkif, char *buf)
++{
++	char *devpath, *devname;
++	struct xenbus_device *dev = blkif->be->dev;
++
++	devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL);
++	if (IS_ERR(devpath))
++		return PTR_ERR(devpath);
++
++	if ((devname = strstr(devpath, "/dev/")) != NULL)
++		devname += strlen("/dev/");
++	else
++		devname = devpath;
++
++	snprintf(buf, TASK_COMM_LEN, "blkback.%d.%s", blkif->domid, devname);
++	kfree(devpath);
++
++	return 0;
++}
++
++static void update_blkif_status(blkif_t *blkif)
++{
++	int err;
++	char name[TASK_COMM_LEN];
++
++	/* Not ready to connect? */
++	if (!blkif->irq || !blkif->vbd.bdev)
++		return;
++
++	/* Already connected? */
++	if (blkif->be->dev->state == XenbusStateConnected)
++		return;
++
++	/* Attempt to connect: exit if we fail to. 
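Taken on its own, the access control in vbd_translate() is two comparisons: refuse writes to a read-only VBD, and refuse any transfer that runs past the end of the device. Below is a minimal userspace sketch of the same checks; struct vbd_model and its fields are invented stand-ins for the kernel's vbd/phys_req types, not part of the patch.

    /* Standalone model of the vbd_translate() checks: both failure
     * cases report -EACCES, exactly as the driver's rc does. */
    #include <stdio.h>

    #define READ   0
    #define WRITE  1
    #define EACCES 13

    struct vbd_model {
            int readonly;
            unsigned long long nr_sects;    /* vbd_sz() stand-in */
    };

    static int translate(unsigned long long sector_number,
                         unsigned nr_sects, int operation,
                         const struct vbd_model *vbd)
    {
            if (operation != READ && vbd->readonly)
                    return -EACCES;         /* write to read-only VBD */
            if (sector_number + nr_sects > vbd->nr_sects)
                    return -EACCES;         /* runs past end of device */
            return 0;
    }

    int main(void)
    {
            struct vbd_model vbd = { .readonly = 1, .nr_sects = 1000 };

            printf("%d\n", translate(0, 8, READ, &vbd));    /* 0: allowed */
            printf("%d\n", translate(0, 8, WRITE, &vbd));   /* -13: read-only */
            printf("%d\n", translate(996, 8, READ, &vbd));  /* -13: overrun */
            return 0;
    }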
*/ ++ connect(blkif->be); ++ if (blkif->be->dev->state != XenbusStateConnected) ++ return; ++ ++ err = blkback_name(blkif, name); ++ if (err) { ++ xenbus_dev_error(blkif->be->dev, err, "get blkback dev name"); ++ return; ++ } ++ ++ blkif->xenblkd = kthread_run(blkif_schedule, blkif, name); ++ if (IS_ERR(blkif->xenblkd)) { ++ err = PTR_ERR(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); ++ } ++} ++ ++ ++/**************************************************************** ++ * sysfs interface for VBD I/O requests ++ */ ++ ++#define VBD_SHOW(name, format, args...) \ ++ static ssize_t show_##name(struct device *_dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct xenbus_device *dev = to_xenbus_device(_dev); \ ++ struct backend_info *be = dev->dev.driver_data; \ ++ \ ++ return sprintf(buf, format, ##args); \ ++ } \ ++ DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) ++ ++VBD_SHOW(oo_req, "%d\n", be->blkif->st_oo_req); ++VBD_SHOW(rd_req, "%d\n", be->blkif->st_rd_req); ++VBD_SHOW(wr_req, "%d\n", be->blkif->st_wr_req); ++VBD_SHOW(br_req, "%d\n", be->blkif->st_br_req); ++VBD_SHOW(rd_sect, "%d\n", be->blkif->st_rd_sect); ++VBD_SHOW(wr_sect, "%d\n", be->blkif->st_wr_sect); ++ ++static struct attribute *vbdstat_attrs[] = { ++ &dev_attr_oo_req.attr, ++ &dev_attr_rd_req.attr, ++ &dev_attr_wr_req.attr, ++ &dev_attr_br_req.attr, ++ &dev_attr_rd_sect.attr, ++ &dev_attr_wr_sect.attr, ++ NULL ++}; ++ ++static struct attribute_group vbdstat_group = { ++ .name = "statistics", ++ .attrs = vbdstat_attrs, ++}; ++ ++VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); ++VBD_SHOW(mode, "%s\n", be->mode); ++ ++int xenvbd_sysfs_addif(struct xenbus_device *dev) ++{ ++ int error; ++ ++ error = device_create_file(&dev->dev, &dev_attr_physical_device); ++ if (error) ++ goto fail1; ++ ++ error = device_create_file(&dev->dev, &dev_attr_mode); ++ if (error) ++ goto fail2; ++ ++ error = sysfs_create_group(&dev->dev.kobj, &vbdstat_group); ++ if (error) ++ goto fail3; ++ ++ return 0; ++ ++fail3: sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++fail2: device_remove_file(&dev->dev, &dev_attr_mode); ++fail1: device_remove_file(&dev->dev, &dev_attr_physical_device); ++ return error; ++} ++ ++void xenvbd_sysfs_delif(struct xenbus_device *dev) ++{ ++ sysfs_remove_group(&dev->dev.kobj, &vbdstat_group); ++ device_remove_file(&dev->dev, &dev_attr_mode); ++ device_remove_file(&dev->dev, &dev_attr_physical_device); ++} ++ ++static int blkback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev->dev.driver_data; ++ ++ DPRINTK(""); ++ ++ if (be->backend_watch.node) { ++ unregister_xenbus_watch(&be->backend_watch); ++ kfree(be->backend_watch.node); ++ be->backend_watch.node = NULL; ++ } ++ ++ if (be->blkif) { ++ blkif_disconnect(be->blkif); ++ vbd_free(&be->blkif->vbd); ++ blkif_free(be->blkif); ++ be->blkif = NULL; ++ } ++ ++ if (be->major || be->minor) ++ xenvbd_sysfs_delif(dev); ++ ++ kfree(be); ++ dev->dev.driver_data = NULL; ++ return 0; ++} ++ ++int blkback_barrier(struct xenbus_transaction xbt, ++ struct backend_info *be, int state) ++{ ++ struct xenbus_device *dev = be->dev; ++ int err; ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-barrier", ++ "%d", state); ++ if (err) ++ xenbus_dev_fatal(dev, err, "writing feature-barrier"); ++ ++ return err; ++} ++ ++/** ++ * Entry point to this code when a new device is created. 
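The VBD_SHOW() macro above uses token pasting to stamp out one show_<name>() routine per statistic, which keeps the six sysfs attributes down to one line each. The pattern in isolation, with a made-up stats structure in place of the real blkif counters:

    /* Token-pasting accessor generation, as VBD_SHOW() does above. */
    #include <stdio.h>

    struct stats { int oo_req, rd_req, wr_req; };

    #define SHOW(name)                                      \
            static int show_##name(const struct stats *st)  \
            { return st->name; }

    SHOW(oo_req)
    SHOW(rd_req)
    SHOW(wr_req)

    int main(void)
    {
            struct stats st = { .oo_req = 1, .rd_req = 2, .wr_req = 3 };

            printf("oo=%d rd=%d wr=%d\n",
                   show_oo_req(&st), show_rd_req(&st), show_wr_req(&st));
            return 0;
    }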
Allocate the basic ++ * structures, and watch the store waiting for the hotplug scripts to tell us ++ * the device's physical major and minor numbers. Switch to InitWait. ++ */ ++static int blkback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ be->dev = dev; ++ dev->dev.driver_data = be; ++ ++ be->blkif = blkif_alloc(dev->otherend_id); ++ if (IS_ERR(be->blkif)) { ++ err = PTR_ERR(be->blkif); ++ be->blkif = NULL; ++ xenbus_dev_fatal(dev, err, "creating block interface"); ++ goto fail; ++ } ++ ++ /* setup back pointer */ ++ be->blkif->be = be; ++ ++ err = xenbus_watch_path2(dev, dev->nodename, "physical-device", ++ &be->backend_watch, backend_changed); ++ if (err) ++ goto fail; ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ return 0; ++ ++fail: ++ DPRINTK("failed"); ++ blkback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Callback received when the hotplug scripts have placed the physical-device ++ * node. Read it and the mode node, and create a vbd. If the frontend is ++ * ready, connect. ++ */ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ int err; ++ unsigned major; ++ unsigned minor; ++ struct backend_info *be ++ = container_of(watch, struct backend_info, backend_watch); ++ struct xenbus_device *dev = be->dev; ++ ++ DPRINTK(""); ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "physical-device", "%x:%x", ++ &major, &minor); ++ if (XENBUS_EXIST_ERR(err)) { ++ /* Since this watch will fire once immediately after it is ++ registered, we expect this. Ignore it, and wait for the ++ hotplug scripts. */ ++ return; ++ } ++ if (err != 2) { ++ xenbus_dev_fatal(dev, err, "reading physical-device"); ++ return; ++ } ++ ++ if ((be->major || be->minor) && ++ ((be->major != major) || (be->minor != minor))) { ++ printk(KERN_WARNING ++ "blkback: changing physical device (from %x:%x to " ++ "%x:%x) not supported.\n", be->major, be->minor, ++ major, minor); ++ return; ++ } ++ ++ be->mode = xenbus_read(XBT_NIL, dev->nodename, "mode", NULL); ++ if (IS_ERR(be->mode)) { ++ err = PTR_ERR(be->mode); ++ be->mode = NULL; ++ xenbus_dev_fatal(dev, err, "reading mode"); ++ return; ++ } ++ ++ if (be->major == 0 && be->minor == 0) { ++ /* Front end dir is a number, which is used as the handle. */ ++ ++ char *p = strrchr(dev->otherend, '/') + 1; ++ long handle = simple_strtoul(p, NULL, 0); ++ ++ be->major = major; ++ be->minor = minor; ++ ++ err = vbd_create(be->blkif, handle, major, minor, ++ (NULL == strchr(be->mode, 'w'))); ++ if (err) { ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating vbd structure"); ++ return; ++ } ++ ++ err = xenvbd_sysfs_addif(dev); ++ if (err) { ++ vbd_free(&be->blkif->vbd); ++ be->major = be->minor = 0; ++ xenbus_dev_fatal(dev, err, "creating sysfs entries"); ++ return; ++ } ++ ++ /* We're potentially connected now */ ++ update_blkif_status(be->blkif); ++ } ++} ++ ++ ++/** ++ * Callback received when the frontend's state changes. 
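backend_changed() above leans on two small parses: the hotplug scripts publish "physical-device" as hexadecimal major:minor, and the last path component of the frontend directory doubles as the VBD handle. Both in isolation, with invented sample strings:

    /* The two parses from backend_changed(): "%x:%x" for the
     * physical device, strrchr()+strtoul() for the handle. */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>

    int main(void)
    {
            const char *physdev  = "ca:10";   /* hypothetical node value */
            const char *otherend = "/local/domain/7/device/vbd/51712";
            unsigned major, minor;
            long handle;

            if (sscanf(physdev, "%x:%x", &major, &minor) != 2)
                    return 1;
            handle = strtoul(strrchr(otherend, '/') + 1, NULL, 0);

            printf("major=%#x minor=%#x handle=%ld\n", major, minor, handle);
            return 0;
    }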
++ */
++static void frontend_changed(struct xenbus_device *dev,
++			     enum xenbus_state frontend_state)
++{
++	struct backend_info *be = dev->dev.driver_data;
++	int err;
++
++	DPRINTK("%s", xenbus_strstate(frontend_state));
++
++	switch (frontend_state) {
++	case XenbusStateInitialising:
++		if (dev->state == XenbusStateClosed) {
++			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++			       __FUNCTION__, dev->nodename);
++			xenbus_switch_state(dev, XenbusStateInitWait);
++		}
++		break;
++
++	case XenbusStateInitialised:
++	case XenbusStateConnected:
++		/* Ensure we connect even when two watches fire in
++		   close succession and we miss the intermediate value
++		   of frontend_state. */
++		if (dev->state == XenbusStateConnected)
++			break;
++
++		err = connect_ring(be);
++		if (err)
++			break;
++		update_blkif_status(be->blkif);
++		break;
++
++	case XenbusStateClosing:
++		blkif_disconnect(be->blkif);
++		xenbus_switch_state(dev, XenbusStateClosing);
++		break;
++
++	case XenbusStateClosed:
++		xenbus_switch_state(dev, XenbusStateClosed);
++		if (xenbus_dev_is_online(dev))
++			break;
++		/* fall through if not online */
++	case XenbusStateUnknown:
++		device_unregister(&dev->dev);
++		break;
++
++	default:
++		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++				 frontend_state);
++		break;
++	}
++}
++
++
++/* ** Connection ** */
++
++
++/**
++ * Write the physical details regarding the block device to the store, and
++ * switch to Connected state.
++ */
++static void connect(struct backend_info *be)
++{
++	struct xenbus_transaction xbt;
++	int err;
++	struct xenbus_device *dev = be->dev;
++
++	DPRINTK("%s", dev->otherend);
++
++	/* Supply the information about the device the frontend needs */
++again:
++	err = xenbus_transaction_start(&xbt);
++	if (err) {
++		xenbus_dev_fatal(dev, err, "starting transaction");
++		return;
++	}
++
++	err = blkback_barrier(xbt, be, 1);
++	if (err)
++		goto abort;
++
++	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
++			    vbd_size(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/sectors",
++				 dev->nodename);
++		goto abort;
++	}
++
++	/* FIXME: use a typename instead */
++	err = xenbus_printf(xbt, dev->nodename, "info", "%u",
++			    vbd_info(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/info",
++				 dev->nodename);
++		goto abort;
++	}
++	err = xenbus_printf(xbt, dev->nodename, "sector-size", "%lu",
++			    vbd_secsize(&be->blkif->vbd));
++	if (err) {
++		xenbus_dev_fatal(dev, err, "writing %s/sector-size",
++				 dev->nodename);
++		goto abort;
++	}
++
++	err = xenbus_transaction_end(xbt, 0);
++	if (err == -EAGAIN)
++		goto again;
++	if (err)
++		xenbus_dev_fatal(dev, err, "ending transaction");
++
++	err = xenbus_switch_state(dev, XenbusStateConnected);
++	if (err)
++		xenbus_dev_fatal(dev, err, "switching to Connected state",
++				 dev->nodename);
++
++	return;
++ abort:
++	xenbus_transaction_end(xbt, 1);
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++	struct xenbus_device *dev = be->dev;
++	unsigned long ring_ref;
++	unsigned int evtchn;
++	char protocol[64] = "";
++	int err;
++
++	DPRINTK("%s", dev->otherend);
++
++	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu", &ring_ref,
++			    "event-channel", "%u", &evtchn, NULL);
++	if (err) {
++		xenbus_dev_fatal(dev, err,
++				 "reading %s/ring-ref and event-channel",
++				 dev->otherend);
++		return err;
++	}
++
++	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++			    "%63s", protocol, NULL);
++	if (err)
++		
strcpy(protocol, "unspecified, assuming native"); ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; ++ else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) ++ be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; ++ else { ++ xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); ++ return -1; ++ } ++ printk(KERN_INFO ++ "blkback: ring-ref %ld, event-channel %d, protocol %d (%s)\n", ++ ring_ref, evtchn, be->blkif->blk_protocol, protocol); ++ ++ /* Map the shared frame, irq etc. */ ++ err = blkif_map(be->blkif, ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u", ++ ring_ref, evtchn); ++ return err; ++ } ++ ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static struct xenbus_device_id blkback_ids[] = { ++ { "vbd" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver blkback = { ++ .name = "vbd", ++ .owner = THIS_MODULE, ++ .ids = blkback_ids, ++ .probe = blkback_probe, ++ .remove = blkback_remove, ++ .otherend_changed = frontend_changed ++}; ++ ++ ++void blkif_xenbus_init(void) ++{ ++ xenbus_register_backend(&blkback); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkfront/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,5 @@ ++ ++obj-$(CONFIG_XEN_BLKDEV_FRONTEND) := xenblk.o ++ ++xenblk-objs := blkfront.o vbd.o ++ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkfront/blkfront.c 2007-08-27 14:02:08.000000000 -0400 +@@ -0,0 +1,902 @@ ++/****************************************************************************** ++ * blkfront.c ++ * ++ * XenLinux virtual block-device driver. ++ * ++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand ++ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge ++ * Copyright (c) 2004, Christian Limpach ++ * Copyright (c) 2004, Andrew Warfield ++ * Copyright (c) 2005, Christopher Clark ++ * Copyright (c) 2005, XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/version.h> ++#include "block.h" ++#include <linux/cdrom.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <scsi/scsi.h> ++#include <xen/evtchn.h> ++#include <xen/xenbus.h> ++#include <xen/interface/grant_table.h> ++#include <xen/interface/io/protocols.h> ++#include <xen/gnttab.h> ++#include <asm/hypervisor.h> ++#include <asm/maddr.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++#define BLKIF_STATE_DISCONNECTED 0 ++#define BLKIF_STATE_CONNECTED 1 ++#define BLKIF_STATE_SUSPENDED 2 ++ ++#define MAXIMUM_OUTSTANDING_BLOCK_REQS \ ++ (BLKIF_MAX_SEGMENTS_PER_REQUEST * BLK_RING_SIZE) ++#define GRANT_INVALID_REF 0 ++ ++static void connect(struct blkfront_info *); ++static void blkfront_closing(struct xenbus_device *); ++static int blkfront_remove(struct xenbus_device *); ++static int talk_to_backend(struct xenbus_device *, struct blkfront_info *); ++static int setup_blkring(struct xenbus_device *, struct blkfront_info *); ++ ++static void kick_pending_request_queues(struct blkfront_info *); ++ ++static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs); ++static void blkif_restart_queue(void *arg); ++static void blkif_recover(struct blkfront_info *); ++static void blkif_completion(struct blk_shadow *); ++static void blkif_free(struct blkfront_info *, int); ++ ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures and the ring buffer for communication with the backend, and ++ * inform the backend of the appropriate details for those. Switch to ++ * Initialised state. ++ */ ++static int blkfront_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err, vdevice, i; ++ struct blkfront_info *info; ++ ++ /* FIXME: Use dynamic device id if this is not set. */ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, ++ "virtual-device", "%i", &vdevice); ++ if (err != 1) { ++ xenbus_dev_fatal(dev, err, "reading virtual-device"); ++ return err; ++ } ++ ++ info = kzalloc(sizeof(*info), GFP_KERNEL); ++ if (!info) { ++ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); ++ return -ENOMEM; ++ } ++ ++ info->xbdev = dev; ++ info->vdevice = vdevice; ++ info->connected = BLKIF_STATE_DISCONNECTED; ++ INIT_WORK(&info->work, blkif_restart_queue, (void *)info); ++ ++ for (i = 0; i < BLK_RING_SIZE; i++) ++ info->shadow[i].req.id = i+1; ++ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; ++ ++ /* Front end dir is a number, which is used as the id. */ ++ info->handle = simple_strtoul(strrchr(dev->nodename,'/')+1, NULL, 0); ++ dev->dev.driver_data = info; ++ ++ err = talk_to_backend(dev, info); ++ if (err) { ++ kfree(info); ++ dev->dev.driver_data = NULL; ++ return err; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * We are reconnecting to the backend, due to a suspend/resume, or a backend ++ * driver restart. We tear down our blkif structure and recreate it, but ++ * leave the device-layer structures intact so that this is transparent to the ++ * rest of the kernel. ++ */ ++static int blkfront_resume(struct xenbus_device *dev) ++{ ++ struct blkfront_info *info = dev->dev.driver_data; ++ int err; ++ ++ DPRINTK("blkfront_resume: %s\n", dev->nodename); ++ ++ blkif_free(info, info->connected == BLKIF_STATE_CONNECTED); ++ ++ err = talk_to_backend(dev, info); ++ if (info->connected == BLKIF_STATE_SUSPENDED && !err) ++ blkif_recover(info); ++ ++ return err; ++} ++ ++ ++/* Common code used when first setting up, and when resuming. 
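blkfront_probe() above threads every unused shadow entry onto a free list through its req.id field, with 0x0fffffff as the end-of-list sentinel; an id is handed out by popping the list head. A compact model of that scheme, assuming the 32-slot ring that __RING_SIZE() yields for 4 KiB pages:

    /* Index-chained free list as set up in blkfront_probe(). */
    #include <stdio.h>

    #define BLK_RING_SIZE 32   /* assumed: __RING_SIZE on 4 KiB pages */

    static unsigned long shadow_id[BLK_RING_SIZE];
    static unsigned long shadow_free;

    int main(void)
    {
            int i;

            for (i = 0; i < BLK_RING_SIZE; i++)
                    shadow_id[i] = i + 1;              /* chain to successor */
            shadow_id[BLK_RING_SIZE - 1] = 0x0fffffff; /* list terminator */
            shadow_free = 0;

            /* Pop two ids, as GET_ID_FROM_FREELIST() does per request. */
            for (i = 0; i < 2; i++) {
                    unsigned long id = shadow_free;
                    shadow_free = shadow_id[id];
                    printf("allocated id %lu\n", id);
            }
            return 0;
    }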
*/ ++static int talk_to_backend(struct xenbus_device *dev, ++ struct blkfront_info *info) ++{ ++ const char *message = NULL; ++ struct xenbus_transaction xbt; ++ int err; ++ ++ /* Create shared ring, alloc event channel. */ ++ err = setup_blkring(dev, info); ++ if (err) ++ goto out; ++ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ goto destroy_blkring; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, ++ "ring-ref","%u", info->ring_ref); ++ if (err) { ++ message = "writing ring-ref"; ++ goto abort_transaction; ++ } ++ err = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", ++ irq_to_evtchn_port(info->irq)); ++ if (err) { ++ message = "writing event-channel"; ++ goto abort_transaction; ++ } ++ err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", ++ XEN_IO_PROTO_ABI_NATIVE); ++ if (err) { ++ message = "writing protocol"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err) { ++ if (err == -EAGAIN) ++ goto again; ++ xenbus_dev_fatal(dev, err, "completing transaction"); ++ goto destroy_blkring; ++ } ++ ++ xenbus_switch_state(dev, XenbusStateInitialised); ++ ++ return 0; ++ ++ abort_transaction: ++ xenbus_transaction_end(xbt, 1); ++ if (message) ++ xenbus_dev_fatal(dev, err, "%s", message); ++ destroy_blkring: ++ blkif_free(info, 0); ++ out: ++ return err; ++} ++ ++ ++static int setup_blkring(struct xenbus_device *dev, ++ struct blkfront_info *info) ++{ ++ blkif_sring_t *sring; ++ int err; ++ ++ info->ring_ref = GRANT_INVALID_REF; ++ ++ sring = (blkif_sring_t *)__get_free_page(GFP_KERNEL); ++ if (!sring) { ++ xenbus_dev_fatal(dev, -ENOMEM, "allocating shared ring"); ++ return -ENOMEM; ++ } ++ SHARED_RING_INIT(sring); ++ FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE); ++ ++ err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring)); ++ if (err < 0) { ++ free_page((unsigned long)sring); ++ info->ring.sring = NULL; ++ goto fail; ++ } ++ info->ring_ref = err; ++ ++ err = bind_listening_port_to_irqhandler( ++ dev->otherend_id, blkif_int, SA_SAMPLE_RANDOM, "blkif", info); ++ if (err <= 0) { ++ xenbus_dev_fatal(dev, err, ++ "bind_listening_port_to_irqhandler"); ++ goto fail; ++ } ++ info->irq = err; ++ ++ return 0; ++fail: ++ blkif_free(info, 0); ++ return err; ++} ++ ++ ++/** ++ * Callback received when the backend's state changes. 
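talk_to_backend() above wraps its xenstore writes in a transaction and simply redoes the whole thing when xenbus_transaction_end() reports -EAGAIN, i.e. when a concurrent writer invalidated the transaction. The retry shape on its own, with a stub faking two conflicts before success in place of the real xenbus calls:

    /* The goto-again transaction retry used by talk_to_backend(). */
    #include <stdio.h>

    #define EAGAIN 11

    static int stub_transaction_end(void)
    {
            static int conflicts = 2;       /* pretend two collisions */
            return conflicts-- > 0 ? -EAGAIN : 0;
    }

    int main(void)
    {
            int err;
    again:
            /* ...start transaction, write ring-ref / event-channel... */
            err = stub_transaction_end();
            if (err == -EAGAIN) {
                    printf("conflict, retrying\n");
                    goto again;
            }
            printf("committed, err=%d\n", err);
            return 0;
    }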
++ */
++static void backend_changed(struct xenbus_device *dev,
++			    enum xenbus_state backend_state)
++{
++	struct blkfront_info *info = dev->dev.driver_data;
++	struct block_device *bd;
++
++	DPRINTK("blkfront:backend_changed.\n");
++
++	switch (backend_state) {
++	case XenbusStateInitialising:
++	case XenbusStateInitWait:
++	case XenbusStateInitialised:
++	case XenbusStateUnknown:
++	case XenbusStateClosed:
++		break;
++
++	case XenbusStateConnected:
++		connect(info);
++		break;
++
++	case XenbusStateClosing:
++		bd = bdget(info->dev);
++		if (bd == NULL)
++			xenbus_dev_fatal(dev, -ENODEV, "bdget failed");
++
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
++		down(&bd->bd_sem);
++#else
++		mutex_lock(&bd->bd_mutex);
++#endif
++		if (info->users > 0)
++			xenbus_dev_error(dev, -EBUSY,
++					 "Device in use; refusing to close");
++		else
++			blkfront_closing(dev);
++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
++		up(&bd->bd_sem);
++#else
++		mutex_unlock(&bd->bd_mutex);
++#endif
++		bdput(bd);
++		break;
++	}
++}
++
++
++/* ** Connection ** */
++
++
++/*
++ * Invoked when the backend is finally 'ready' (and has produced
++ * the details about the physical device - #sectors, size, etc).
++ */
++static void connect(struct blkfront_info *info)
++{
++	unsigned long long sectors;
++	unsigned long sector_size;
++	unsigned int binfo;
++	int err;
++
++	if ((info->connected == BLKIF_STATE_CONNECTED) ||
++	    (info->connected == BLKIF_STATE_SUSPENDED) )
++		return;
++
++	DPRINTK("blkfront.c:connect:%s.\n", info->xbdev->otherend);
++
++	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
++			    "sectors", "%Lu", &sectors,
++			    "info", "%u", &binfo,
++			    "sector-size", "%lu", &sector_size,
++			    NULL);
++	if (err) {
++		xenbus_dev_fatal(info->xbdev, err,
++				 "reading backend fields at %s",
++				 info->xbdev->otherend);
++		return;
++	}
++
++	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
++			    "feature-barrier", "%lu", &info->feature_barrier,
++			    NULL);
++	if (err)
++		info->feature_barrier = 0;
++
++	err = xlvbd_add(sectors, info->vdevice, binfo, sector_size, info);
++	if (err) {
++		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
++				 info->xbdev->otherend);
++		return;
++	}
++
++	(void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
++
++	/* Kick pending requests. */
++	spin_lock_irq(&blkif_io_lock);
++	info->connected = BLKIF_STATE_CONNECTED;
++	kick_pending_request_queues(info);
++	spin_unlock_irq(&blkif_io_lock);
++
++	add_disk(info->gd);
++}
++
++/**
++ * Handle the change of state of the backend to Closing. We must delete our
++ * device-layer structures now, to ensure that writes are flushed through to
++ * the backend. Once this is done, we can switch to Closed in
++ * acknowledgement.
++ */
++static void blkfront_closing(struct xenbus_device *dev)
++{
++	struct blkfront_info *info = dev->dev.driver_data;
++	unsigned long flags;
++
++	DPRINTK("blkfront_closing: %s removed\n", dev->nodename);
++
++	if (info->rq == NULL)
++		goto out;
++
++	spin_lock_irqsave(&blkif_io_lock, flags);
++	/* No more blkif_request(). */
++	blk_stop_queue(info->rq);
++	/* No more gnttab callback work. */
++	gnttab_cancel_free_callback(&info->callback);
++	spin_unlock_irqrestore(&blkif_io_lock, flags);
++
++	/* Flush gnttab callback work. Must be done with no locks held. 
*/ ++ flush_scheduled_work(); ++ ++ xlvbd_del(info); ++ ++ out: ++ xenbus_frontend_closed(dev); ++} ++ ++ ++static int blkfront_remove(struct xenbus_device *dev) ++{ ++ struct blkfront_info *info = dev->dev.driver_data; ++ ++ DPRINTK("blkfront_remove: %s removed\n", dev->nodename); ++ ++ blkif_free(info, 0); ++ ++ kfree(info); ++ ++ return 0; ++} ++ ++ ++static inline int GET_ID_FROM_FREELIST( ++ struct blkfront_info *info) ++{ ++ unsigned long free = info->shadow_free; ++ BUG_ON(free > BLK_RING_SIZE); ++ info->shadow_free = info->shadow[free].req.id; ++ info->shadow[free].req.id = 0x0fffffee; /* debug */ ++ return free; ++} ++ ++static inline void ADD_ID_TO_FREELIST( ++ struct blkfront_info *info, unsigned long id) ++{ ++ info->shadow[id].req.id = info->shadow_free; ++ info->shadow[id].request = 0; ++ info->shadow_free = id; ++} ++ ++static inline void flush_requests(struct blkfront_info *info) ++{ ++ int notify; ++ ++ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); ++ ++ if (notify) ++ notify_remote_via_irq(info->irq); ++} ++ ++static void kick_pending_request_queues(struct blkfront_info *info) ++{ ++ if (!RING_FULL(&info->ring)) { ++ /* Re-enable calldowns. */ ++ blk_start_queue(info->rq); ++ /* Kick things off immediately. */ ++ do_blkif_request(info->rq); ++ } ++} ++ ++static void blkif_restart_queue(void *arg) ++{ ++ struct blkfront_info *info = (struct blkfront_info *)arg; ++ spin_lock_irq(&blkif_io_lock); ++ if (info->connected == BLKIF_STATE_CONNECTED) ++ kick_pending_request_queues(info); ++ spin_unlock_irq(&blkif_io_lock); ++} ++ ++static void blkif_restart_queue_callback(void *arg) ++{ ++ struct blkfront_info *info = (struct blkfront_info *)arg; ++ schedule_work(&info->work); ++} ++ ++int blkif_open(struct inode *inode, struct file *filep) ++{ ++ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; ++ info->users++; ++ return 0; ++} ++ ++ ++int blkif_release(struct inode *inode, struct file *filep) ++{ ++ struct blkfront_info *info = inode->i_bdev->bd_disk->private_data; ++ info->users--; ++ if (info->users == 0) { ++ /* Check whether we have been instructed to close. We will ++ have ignored this request initially, as the device was ++ still mounted. 
*/ ++ struct xenbus_device * dev = info->xbdev; ++ enum xenbus_state state = xenbus_read_driver_state(dev->otherend); ++ ++ if (state == XenbusStateClosing) ++ blkfront_closing(dev); ++ } ++ return 0; ++} ++ ++ ++int blkif_ioctl(struct inode *inode, struct file *filep, ++ unsigned command, unsigned long argument) ++{ ++ int i; ++ ++ DPRINTK_IOCTL("command: 0x%x, argument: 0x%lx, dev: 0x%04x\n", ++ command, (long)argument, inode->i_rdev); ++ ++ switch (command) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) ++ case HDIO_GETGEO: { ++ struct block_device *bd = inode->i_bdev; ++ struct hd_geometry geo; ++ int ret; ++ ++ if (!argument) ++ return -EINVAL; ++ ++ geo.start = get_start_sect(bd); ++ ret = blkif_getgeo(bd, &geo); ++ if (ret) ++ return ret; ++ ++ if (copy_to_user((struct hd_geometry __user *)argument, &geo, ++ sizeof(geo))) ++ return -EFAULT; ++ ++ return 0; ++ } ++#endif ++ case CDROMMULTISESSION: ++ DPRINTK("FIXME: support multisession CDs later\n"); ++ for (i = 0; i < sizeof(struct cdrom_multisession); i++) ++ if (put_user(0, (char __user *)(argument + i))) ++ return -EFAULT; ++ return 0; ++ ++ default: ++ /*printk(KERN_ALERT "ioctl %08x not supported by Xen blkdev\n", ++ command);*/ ++ return -EINVAL; /* same return as native Linux */ ++ } ++ ++ return 0; ++} ++ ++ ++int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) ++{ ++ /* We don't have real geometry info, but let's at least return ++ values consistent with the size of the device */ ++ sector_t nsect = get_capacity(bd->bd_disk); ++ sector_t cylinders = nsect; ++ ++ hg->heads = 0xff; ++ hg->sectors = 0x3f; ++ sector_div(cylinders, hg->heads * hg->sectors); ++ hg->cylinders = cylinders; ++ if ((sector_t)(hg->cylinders + 1) * hg->heads * hg->sectors < nsect) ++ hg->cylinders = 0xffff; ++ return 0; ++} ++ ++ ++/* ++ * blkif_queue_request ++ * ++ * request block io ++ * ++ * id: for guest use only. ++ * operation: BLKIF_OP_{READ,WRITE,PROBE} ++ * buffer: buffer to read/write into. this should be a ++ * virtual address in the guest os. ++ */ ++static int blkif_queue_request(struct request *req) ++{ ++ struct blkfront_info *info = req->rq_disk->private_data; ++ unsigned long buffer_mfn; ++ blkif_request_t *ring_req; ++ struct bio *bio; ++ struct bio_vec *bvec; ++ int idx; ++ unsigned long id; ++ unsigned int fsect, lsect; ++ int ref; ++ grant_ref_t gref_head; ++ ++ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) ++ return 1; ++ ++ if (gnttab_alloc_grant_references( ++ BLKIF_MAX_SEGMENTS_PER_REQUEST, &gref_head) < 0) { ++ gnttab_request_free_callback( ++ &info->callback, ++ blkif_restart_queue_callback, ++ info, ++ BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ return 1; ++ } ++ ++ /* Fill out a communications ring structure. */ ++ ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); ++ id = GET_ID_FROM_FREELIST(info); ++ info->shadow[id].request = (unsigned long)req; ++ ++ ring_req->id = id; ++ ring_req->sector_number = (blkif_sector_t)req->sector; ++ ring_req->handle = info->handle; ++ ++ ring_req->operation = rq_data_dir(req) ? ++ BLKIF_OP_WRITE : BLKIF_OP_READ; ++ if (blk_barrier_rq(req)) ++ ring_req->operation = BLKIF_OP_WRITE_BARRIER; ++ ++ ring_req->nr_segments = 0; ++ rq_for_each_bio (bio, req) { ++ bio_for_each_segment (bvec, bio, idx) { ++ BUG_ON(ring_req->nr_segments ++ == BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ buffer_mfn = page_to_phys(bvec->bv_page) >> PAGE_SHIFT; ++ fsect = bvec->bv_offset >> 9; ++ lsect = fsect + (bvec->bv_len >> 9) - 1; ++ /* install a grant reference. 
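blkif_getgeo() above has no physical geometry to report, so it fabricates a CHS tuple that is merely consistent with the capacity: 255 heads, 63 sectors per track, and cylinders = capacity / (255 * 63), clamped to 0xffff when even that cannot cover the device. The same arithmetic standalone, with an arbitrarily chosen capacity:

    /* Synthetic CHS geometry, as computed by blkif_getgeo(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned long long nsect = 41943040ULL; /* 20 GiB in 512-byte sectors */
            unsigned heads = 0xff, sectors = 0x3f;
            unsigned long long cylinders = nsect / (heads * sectors);

            if ((cylinders + 1) * heads * sectors < nsect)
                    cylinders = 0xffff;     /* capacity too large to express */
            printf("C/H/S = %llu/%u/%u\n", cylinders, heads, sectors);
            return 0;
    }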
*/ ++ ref = gnttab_claim_grant_reference(&gref_head); ++ BUG_ON(ref == -ENOSPC); ++ ++ gnttab_grant_foreign_access_ref( ++ ref, ++ info->xbdev->otherend_id, ++ buffer_mfn, ++ rq_data_dir(req) ); ++ ++ info->shadow[id].frame[ring_req->nr_segments] = ++ mfn_to_pfn(buffer_mfn); ++ ++ ring_req->seg[ring_req->nr_segments] = ++ (struct blkif_request_segment) { ++ .gref = ref, ++ .first_sect = fsect, ++ .last_sect = lsect }; ++ ++ ring_req->nr_segments++; ++ } ++ } ++ ++ info->ring.req_prod_pvt++; ++ ++ /* Keep a private copy so we can reissue requests when recovering. */ ++ info->shadow[id].req = *ring_req; ++ ++ gnttab_free_grant_references(gref_head); ++ ++ return 0; ++} ++ ++/* ++ * do_blkif_request ++ * read a block; request is in a request queue ++ */ ++void do_blkif_request(request_queue_t *rq) ++{ ++ struct blkfront_info *info = NULL; ++ struct request *req; ++ int queued; ++ ++ DPRINTK("Entered do_blkif_request\n"); ++ ++ queued = 0; ++ ++ while ((req = elv_next_request(rq)) != NULL) { ++ info = req->rq_disk->private_data; ++ if (!blk_fs_request(req)) { ++ end_request(req, 0); ++ continue; ++ } ++ ++ if (RING_FULL(&info->ring)) ++ goto wait; ++ ++ DPRINTK("do_blk_req %p: cmd %p, sec %llx, " ++ "(%u/%li) buffer:%p [%s]\n", ++ req, req->cmd, (long long)req->sector, ++ req->current_nr_sectors, ++ req->nr_sectors, req->buffer, ++ rq_data_dir(req) ? "write" : "read"); ++ ++ ++ blkdev_dequeue_request(req); ++ if (blkif_queue_request(req)) { ++ blk_requeue_request(rq, req); ++ wait: ++ /* Avoid pointless unplugs. */ ++ blk_stop_queue(rq); ++ break; ++ } ++ ++ queued++; ++ } ++ ++ if (queued != 0) ++ flush_requests(info); ++} ++ ++ ++static irqreturn_t blkif_int(int irq, void *dev_id, struct pt_regs *ptregs) ++{ ++ struct request *req; ++ blkif_response_t *bret; ++ RING_IDX i, rp; ++ unsigned long flags; ++ struct blkfront_info *info = (struct blkfront_info *)dev_id; ++ int uptodate; ++ ++ spin_lock_irqsave(&blkif_io_lock, flags); ++ ++ if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { ++ spin_unlock_irqrestore(&blkif_io_lock, flags); ++ return IRQ_HANDLED; ++ } ++ ++ again: ++ rp = info->ring.sring->rsp_prod; ++ rmb(); /* Ensure we see queued responses up to 'rp'. 
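Each segment queued in blkif_queue_request() above lies within one page and is described to the backend as a first/last 512-byte sector within that page, derived from the bio_vec's byte offset and length. The conversion with example numbers:

    /* fsect/lsect derivation from blkif_queue_request(). */
    #include <stdio.h>

    int main(void)
    {
            unsigned bv_offset = 1024, bv_len = 2048; /* sample segment */
            unsigned fsect = bv_offset >> 9;
            unsigned lsect = fsect + (bv_len >> 9) - 1;

            /* 4 sectors starting at sector 2 of the page: fsect=2 lsect=5 */
            printf("fsect=%u lsect=%u\n", fsect, lsect);
            return 0;
    }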
*/ ++ ++ for (i = info->ring.rsp_cons; i != rp; i++) { ++ unsigned long id; ++ int ret; ++ ++ bret = RING_GET_RESPONSE(&info->ring, i); ++ id = bret->id; ++ req = (struct request *)info->shadow[id].request; ++ ++ blkif_completion(&info->shadow[id]); ++ ++ ADD_ID_TO_FREELIST(info, id); ++ ++ uptodate = (bret->status == BLKIF_RSP_OKAY); ++ switch (bret->operation) { ++ case BLKIF_OP_WRITE_BARRIER: ++ if (unlikely(bret->status == BLKIF_RSP_EOPNOTSUPP)) { ++ printk("blkfront: %s: write barrier op failed\n", ++ info->gd->disk_name); ++ uptodate = -EOPNOTSUPP; ++ info->feature_barrier = 0; ++ xlvbd_barrier(info); ++ } ++ /* fall through */ ++ case BLKIF_OP_READ: ++ case BLKIF_OP_WRITE: ++ if (unlikely(bret->status != BLKIF_RSP_OKAY)) ++ DPRINTK("Bad return from blkdev data " ++ "request: %x\n", bret->status); ++ ++ ret = end_that_request_first(req, uptodate, ++ req->hard_nr_sectors); ++ BUG_ON(ret); ++ end_that_request_last(req, uptodate); ++ break; ++ default: ++ BUG(); ++ } ++ } ++ ++ info->ring.rsp_cons = i; ++ ++ if (i != info->ring.req_prod_pvt) { ++ int more_to_do; ++ RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); ++ if (more_to_do) ++ goto again; ++ } else ++ info->ring.sring->rsp_event = i + 1; ++ ++ kick_pending_request_queues(info); ++ ++ spin_unlock_irqrestore(&blkif_io_lock, flags); ++ ++ return IRQ_HANDLED; ++} ++ ++static void blkif_free(struct blkfront_info *info, int suspend) ++{ ++ /* Prevent new requests being issued until we fix things up. */ ++ spin_lock_irq(&blkif_io_lock); ++ info->connected = suspend ? ++ BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; ++ /* No more blkif_request(). */ ++ if (info->rq) ++ blk_stop_queue(info->rq); ++ /* No more gnttab callback work. */ ++ gnttab_cancel_free_callback(&info->callback); ++ spin_unlock_irq(&blkif_io_lock); ++ ++ /* Flush gnttab callback work. Must be done with no locks held. */ ++ flush_scheduled_work(); ++ ++ /* Free resources associated with old device channel. */ ++ if (info->ring_ref != GRANT_INVALID_REF) { ++ gnttab_end_foreign_access(info->ring_ref, 0, ++ (unsigned long)info->ring.sring); ++ info->ring_ref = GRANT_INVALID_REF; ++ info->ring.sring = NULL; ++ } ++ if (info->irq) ++ unbind_from_irqhandler(info->irq, info); ++ info->irq = 0; ++} ++ ++static void blkif_completion(struct blk_shadow *s) ++{ ++ int i; ++ for (i = 0; i < s->req.nr_segments; i++) ++ gnttab_end_foreign_access(s->req.seg[i].gref, 0, 0UL); ++} ++ ++static void blkif_recover(struct blkfront_info *info) ++{ ++ int i; ++ blkif_request_t *req; ++ struct blk_shadow *copy; ++ int j; ++ ++ /* Stage 1: Make a safe copy of the shadow state. */ ++ copy = kmalloc(sizeof(info->shadow), GFP_KERNEL | __GFP_NOFAIL); ++ memcpy(copy, info->shadow, sizeof(info->shadow)); ++ ++ /* Stage 2: Set up free list. */ ++ memset(&info->shadow, 0, sizeof(info->shadow)); ++ for (i = 0; i < BLK_RING_SIZE; i++) ++ info->shadow[i].req.id = i+1; ++ info->shadow_free = info->ring.req_prod_pvt; ++ info->shadow[BLK_RING_SIZE-1].req.id = 0x0fffffff; ++ ++ /* Stage 3: Find pending requests and requeue them. */ ++ for (i = 0; i < BLK_RING_SIZE; i++) { ++ /* Not in use? */ ++ if (copy[i].request == 0) ++ continue; ++ ++ /* Grab a request slot and copy shadow state into it. */ ++ req = RING_GET_REQUEST( ++ &info->ring, info->ring.req_prod_pvt); ++ *req = copy[i].req; ++ ++ /* We get a new request id, and must reset the shadow state. 
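The consumer loop in blkif_int() above lets rsp_cons and the rsp_prod snapshot run freely and compares them directly; only when a slot is dereferenced does the ring macro mask the index down to the power-of-two ring size. A single-threaded model of that index arithmetic, again assuming a 32-slot ring:

    /* Free-running ring indices with masking on access. */
    #include <stdio.h>

    #define RING_SIZE 32                     /* must be a power of two */
    #define MASK(i)   ((i) & (RING_SIZE - 1))

    static int ring[RING_SIZE];

    int main(void)
    {
            unsigned rsp_cons = 30, rsp_prod = 35; /* producer has wrapped */
            unsigned i;

            for (i = 0; i < RING_SIZE; i++)
                    ring[i] = (int)i;
            /* the driver issues rmb() here, before reading the entries */
            for (i = rsp_cons; i != rsp_prod; i++)
                    printf("consume slot %u -> entry %d\n",
                           MASK(i), ring[MASK(i)]);
            return 0;
    }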
*/
++		req->id = GET_ID_FROM_FREELIST(info);
++		memcpy(&info->shadow[req->id], &copy[i], sizeof(copy[i]));
++
++		/* Rewrite any grant references invalidated by susp/resume. */
++		for (j = 0; j < req->nr_segments; j++)
++			gnttab_grant_foreign_access_ref(
++				req->seg[j].gref,
++				info->xbdev->otherend_id,
++				pfn_to_mfn(info->shadow[req->id].frame[j]),
++				rq_data_dir(
++					(struct request *)
++					info->shadow[req->id].request));
++		info->shadow[req->id].req = *req;
++
++		info->ring.req_prod_pvt++;
++	}
++
++	kfree(copy);
++
++	(void)xenbus_switch_state(info->xbdev, XenbusStateConnected);
++
++	spin_lock_irq(&blkif_io_lock);
++
++	/* Now safe for us to use the shared ring */
++	info->connected = BLKIF_STATE_CONNECTED;
++
++	/* Send off requeued requests */
++	flush_requests(info);
++
++	/* Kick any other new requests queued since we resumed */
++	kick_pending_request_queues(info);
++
++	spin_unlock_irq(&blkif_io_lock);
++}
++
++
++/* ** Driver Registration ** */
++
++
++static struct xenbus_device_id blkfront_ids[] = {
++	{ "vbd" },
++	{ "" }
++};
++
++
++static struct xenbus_driver blkfront = {
++	.name = "vbd",
++	.owner = THIS_MODULE,
++	.ids = blkfront_ids,
++	.probe = blkfront_probe,
++	.remove = blkfront_remove,
++	.resume = blkfront_resume,
++	.otherend_changed = backend_changed,
++};
++
++
++static int __init xlblk_init(void)
++{
++	if (!is_running_on_xen())
++		return -ENODEV;
++
++	return xenbus_register_frontend(&blkfront);
++}
++module_init(xlblk_init);
++
++
++static void xlblk_exit(void)
++{
++	return xenbus_unregister_driver(&blkfront);
++}
++module_exit(xlblk_exit);
++
++MODULE_LICENSE("Dual BSD/GPL");
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/blkfront/block.h	2007-08-27 14:02:08.000000000 -0400
+@@ -0,0 +1,142 @@
++/******************************************************************************
++ * block.h
++ *
++ * Shared definitions between all levels of XenLinux Virtual block devices.
++ *
++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand
++ * Modifications by Mark A. Williamson are (c) Intel Research Cambridge
++ * Copyright (c) 2004-2005, Christian Limpach
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */ ++ ++#ifndef __XEN_DRIVERS_BLOCK_H__ ++#define __XEN_DRIVERS_BLOCK_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/string.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/hdreg.h> ++#include <linux/blkdev.h> ++#include <linux/major.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/gnttab.h> ++#include <xen/interface/xen.h> ++#include <xen/interface/io/blkif.h> ++#include <xen/interface/io/ring.h> ++#include <asm/io.h> ++#include <asm/atomic.h> ++#include <asm/uaccess.h> ++ ++#define DPRINTK(_f, _a...) pr_debug(_f, ## _a) ++ ++#if 0 ++#define DPRINTK_IOCTL(_f, _a...) printk(KERN_ALERT _f, ## _a) ++#else ++#define DPRINTK_IOCTL(_f, _a...) ((void)0) ++#endif ++ ++struct xlbd_type_info ++{ ++ int partn_shift; ++ int disks_per_major; ++ char *devname; ++ char *diskname; ++}; ++ ++struct xlbd_major_info ++{ ++ int major; ++ int index; ++ int usage; ++ struct xlbd_type_info *type; ++}; ++ ++struct blk_shadow { ++ blkif_request_t req; ++ unsigned long request; ++ unsigned long frame[BLKIF_MAX_SEGMENTS_PER_REQUEST]; ++}; ++ ++#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) ++ ++/* ++ * We have one of these per vbd, whether ide, scsi or 'other'. They ++ * hang in private_data off the gendisk structure. We may end up ++ * putting all kinds of interesting stuff here :-) ++ */ ++struct blkfront_info ++{ ++ struct xenbus_device *xbdev; ++ dev_t dev; ++ struct gendisk *gd; ++ int vdevice; ++ blkif_vdev_t handle; ++ int connected; ++ int ring_ref; ++ blkif_front_ring_t ring; ++ unsigned int irq; ++ struct xlbd_major_info *mi; ++ request_queue_t *rq; ++ struct work_struct work; ++ struct gnttab_free_callback callback; ++ struct blk_shadow shadow[BLK_RING_SIZE]; ++ unsigned long shadow_free; ++ int feature_barrier; ++ ++ /** ++ * The number of people holding this device open. We won't allow a ++ * hot-unplug unless this is 0. ++ */ ++ int users; ++}; ++ ++extern spinlock_t blkif_io_lock; ++ ++extern int blkif_open(struct inode *inode, struct file *filep); ++extern int blkif_release(struct inode *inode, struct file *filep); ++extern int blkif_ioctl(struct inode *inode, struct file *filep, ++ unsigned command, unsigned long argument); ++extern int blkif_getgeo(struct block_device *, struct hd_geometry *); ++extern int blkif_check(dev_t dev); ++extern int blkif_revalidate(dev_t dev); ++extern void do_blkif_request (request_queue_t *rq); ++ ++/* Virtual block-device subsystem. */ ++/* Note that xlvbd_add doesn't call add_disk for you: you're expected ++ to call add_disk on info->gd once the disk is properly connected ++ up. */ ++int xlvbd_add(blkif_sector_t capacity, int device, ++ u16 vdisk_info, u16 sector_size, struct blkfront_info *info); ++void xlvbd_del(struct blkfront_info *info); ++int xlvbd_barrier(struct blkfront_info *info); ++ ++#endif /* __XEN_DRIVERS_BLOCK_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blkfront/vbd.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,372 @@ ++/****************************************************************************** ++ * vbd.c ++ * ++ * XenLinux virtual block-device driver (xvd). ++ * ++ * Copyright (c) 2003-2004, Keir Fraser & Steve Hand ++ * Modifications by Mark A. 
Williamson are (c) Intel Research Cambridge ++ * Copyright (c) 2004-2005, Christian Limpach ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "block.h" ++#include <linux/blkdev.h> ++#include <linux/list.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++#define BLKIF_MAJOR(dev) ((dev)>>8) ++#define BLKIF_MINOR(dev) ((dev) & 0xff) ++ ++/* ++ * For convenience we distinguish between ide, scsi and 'other' (i.e., ++ * potentially combinations of the two) in the naming scheme and in a few other ++ * places. ++ */ ++ ++#define NUM_IDE_MAJORS 10 ++#define NUM_SCSI_MAJORS 17 ++#define NUM_VBD_MAJORS 1 ++ ++static struct xlbd_type_info xlbd_ide_type = { ++ .partn_shift = 6, ++ .disks_per_major = 2, ++ .devname = "ide", ++ .diskname = "hd", ++}; ++ ++static struct xlbd_type_info xlbd_scsi_type = { ++ .partn_shift = 4, ++ .disks_per_major = 16, ++ .devname = "sd", ++ .diskname = "sd", ++}; ++ ++static struct xlbd_type_info xlbd_vbd_type = { ++ .partn_shift = 4, ++ .disks_per_major = 16, ++ .devname = "xvd", ++ .diskname = "xvd", ++}; ++ ++static struct xlbd_major_info *major_info[NUM_IDE_MAJORS + NUM_SCSI_MAJORS + ++ NUM_VBD_MAJORS]; ++ ++#define XLBD_MAJOR_IDE_START 0 ++#define XLBD_MAJOR_SCSI_START (NUM_IDE_MAJORS) ++#define XLBD_MAJOR_VBD_START (NUM_IDE_MAJORS + NUM_SCSI_MAJORS) ++ ++#define XLBD_MAJOR_IDE_RANGE XLBD_MAJOR_IDE_START ... XLBD_MAJOR_SCSI_START - 1 ++#define XLBD_MAJOR_SCSI_RANGE XLBD_MAJOR_SCSI_START ... XLBD_MAJOR_VBD_START - 1 ++#define XLBD_MAJOR_VBD_RANGE XLBD_MAJOR_VBD_START ... XLBD_MAJOR_VBD_START + NUM_VBD_MAJORS - 1 ++ ++/* Information about our VBDs. 
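The 16-bit vdevice number the frontend receives packs the Linux major in its high byte and the minor in its low byte, which is all the BLKIF_MAJOR()/BLKIF_MINOR() macros defined below unpack. For instance:

    /* vdevice packing per the BLKIF_MAJOR/BLKIF_MINOR macros. */
    #include <stdio.h>

    #define BLKIF_MAJOR(dev) ((dev) >> 8)
    #define BLKIF_MINOR(dev) ((dev) & 0xff)

    int main(void)
    {
            unsigned vdevice = (3 << 8) | 1; /* hda1: IDE0 major 3, minor 1 */

            printf("major=%u minor=%u\n",
                   BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice));
            return 0;
    }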
*/ ++#define MAX_VBDS 64 ++static LIST_HEAD(vbds_list); ++ ++static struct block_device_operations xlvbd_block_fops = ++{ ++ .owner = THIS_MODULE, ++ .open = blkif_open, ++ .release = blkif_release, ++ .ioctl = blkif_ioctl, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) ++ .getgeo = blkif_getgeo ++#endif ++}; ++ ++DEFINE_SPINLOCK(blkif_io_lock); ++ ++static struct xlbd_major_info * ++xlbd_alloc_major_info(int major, int minor, int index) ++{ ++ struct xlbd_major_info *ptr; ++ ++ ptr = kzalloc(sizeof(struct xlbd_major_info), GFP_KERNEL); ++ if (ptr == NULL) ++ return NULL; ++ ++ ptr->major = major; ++ ++ switch (index) { ++ case XLBD_MAJOR_IDE_RANGE: ++ ptr->type = &xlbd_ide_type; ++ ptr->index = index - XLBD_MAJOR_IDE_START; ++ break; ++ case XLBD_MAJOR_SCSI_RANGE: ++ ptr->type = &xlbd_scsi_type; ++ ptr->index = index - XLBD_MAJOR_SCSI_START; ++ break; ++ case XLBD_MAJOR_VBD_RANGE: ++ ptr->type = &xlbd_vbd_type; ++ ptr->index = index - XLBD_MAJOR_VBD_START; ++ break; ++ } ++ ++ if (register_blkdev(ptr->major, ptr->type->devname)) { ++ kfree(ptr); ++ return NULL; ++ } ++ ++ printk("xen-vbd: registered block device major %i\n", ptr->major); ++ major_info[index] = ptr; ++ return ptr; ++} ++ ++static struct xlbd_major_info * ++xlbd_get_major_info(int vdevice) ++{ ++ struct xlbd_major_info *mi; ++ int major, minor, index; ++ ++ major = BLKIF_MAJOR(vdevice); ++ minor = BLKIF_MINOR(vdevice); ++ ++ switch (major) { ++ case IDE0_MAJOR: index = 0; break; ++ case IDE1_MAJOR: index = 1; break; ++ case IDE2_MAJOR: index = 2; break; ++ case IDE3_MAJOR: index = 3; break; ++ case IDE4_MAJOR: index = 4; break; ++ case IDE5_MAJOR: index = 5; break; ++ case IDE6_MAJOR: index = 6; break; ++ case IDE7_MAJOR: index = 7; break; ++ case IDE8_MAJOR: index = 8; break; ++ case IDE9_MAJOR: index = 9; break; ++ case SCSI_DISK0_MAJOR: index = 10; break; ++ case SCSI_DISK1_MAJOR ... SCSI_DISK7_MAJOR: ++ index = 11 + major - SCSI_DISK1_MAJOR; ++ break; ++ case SCSI_DISK8_MAJOR ... SCSI_DISK15_MAJOR: ++ index = 18 + major - SCSI_DISK8_MAJOR; ++ break; ++ case SCSI_CDROM_MAJOR: index = 26; break; ++ default: index = 27; break; ++ } ++ ++ mi = ((major_info[index] != NULL) ? major_info[index] : ++ xlbd_alloc_major_info(major, minor, index)); ++ if (mi) ++ mi->usage++; ++ return mi; ++} ++ ++static void ++xlbd_put_major_info(struct xlbd_major_info *mi) ++{ ++ mi->usage--; ++ /* XXX: release major if 0 */ ++} ++ ++static int ++xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) ++{ ++ request_queue_t *rq; ++ ++ rq = blk_init_queue(do_blkif_request, &blkif_io_lock); ++ if (rq == NULL) ++ return -1; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ elevator_init(rq, "noop"); ++#else ++ elevator_init(rq, &elevator_noop); ++#endif ++ ++ /* Hard sector size and max sectors impersonate the equiv. hardware. */ ++ blk_queue_hardsect_size(rq, sector_size); ++ blk_queue_max_sectors(rq, 512); ++ ++ /* Each segment in a request is up to an aligned page in size. */ ++ blk_queue_segment_boundary(rq, PAGE_SIZE - 1); ++ blk_queue_max_segment_size(rq, PAGE_SIZE); ++ ++ /* Ensure a merged request will fit in a single I/O ring slot. */ ++ blk_queue_max_phys_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ blk_queue_max_hw_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST); ++ ++ /* Make sure buffer addresses are sector-aligned. 
*/ ++ blk_queue_dma_alignment(rq, 511); ++ ++ gd->queue = rq; ++ ++ return 0; ++} ++ ++static int ++xlvbd_alloc_gendisk(int minor, blkif_sector_t capacity, int vdevice, ++ u16 vdisk_info, u16 sector_size, ++ struct blkfront_info *info) ++{ ++ struct gendisk *gd; ++ struct xlbd_major_info *mi; ++ int nr_minors = 1; ++ int err = -ENODEV; ++ unsigned int offset; ++ ++ BUG_ON(info->gd != NULL); ++ BUG_ON(info->mi != NULL); ++ BUG_ON(info->rq != NULL); ++ ++ mi = xlbd_get_major_info(vdevice); ++ if (mi == NULL) ++ goto out; ++ info->mi = mi; ++ ++ if ((minor & ((1 << mi->type->partn_shift) - 1)) == 0) ++ nr_minors = 1 << mi->type->partn_shift; ++ ++ gd = alloc_disk(nr_minors); ++ if (gd == NULL) ++ goto out; ++ ++ offset = mi->index * mi->type->disks_per_major + ++ (minor >> mi->type->partn_shift); ++ if (nr_minors > 1) { ++ if (offset < 26) { ++ sprintf(gd->disk_name, "%s%c", ++ mi->type->diskname, 'a' + offset ); ++ } ++ else { ++ sprintf(gd->disk_name, "%s%c%c", ++ mi->type->diskname, ++ 'a' + ((offset/26)-1), 'a' + (offset%26) ); ++ } ++ } ++ else { ++ if (offset < 26) { ++ sprintf(gd->disk_name, "%s%c%d", ++ mi->type->diskname, ++ 'a' + offset, ++ minor & ((1 << mi->type->partn_shift) - 1)); ++ } ++ else { ++ sprintf(gd->disk_name, "%s%c%c%d", ++ mi->type->diskname, ++ 'a' + ((offset/26)-1), 'a' + (offset%26), ++ minor & ((1 << mi->type->partn_shift) - 1)); ++ } ++ } ++ ++ gd->major = mi->major; ++ gd->first_minor = minor; ++ gd->fops = &xlvbd_block_fops; ++ gd->private_data = info; ++ gd->driverfs_dev = &(info->xbdev->dev); ++ set_capacity(gd, capacity); ++ ++ if (xlvbd_init_blk_queue(gd, sector_size)) { ++ del_gendisk(gd); ++ goto out; ++ } ++ ++ info->rq = gd->queue; ++ info->gd = gd; ++ ++ if (info->feature_barrier) ++ xlvbd_barrier(info); ++ ++ if (vdisk_info & VDISK_READONLY) ++ set_disk_ro(gd, 1); ++ ++ if (vdisk_info & VDISK_REMOVABLE) ++ gd->flags |= GENHD_FL_REMOVABLE; ++ ++ if (vdisk_info & VDISK_CDROM) ++ gd->flags |= GENHD_FL_CD; ++ ++ return 0; ++ ++ out: ++ if (mi) ++ xlbd_put_major_info(mi); ++ info->mi = NULL; ++ return err; ++} ++ ++int ++xlvbd_add(blkif_sector_t capacity, int vdevice, u16 vdisk_info, ++ u16 sector_size, struct blkfront_info *info) ++{ ++ struct block_device *bd; ++ int err = 0; ++ ++ info->dev = MKDEV(BLKIF_MAJOR(vdevice), BLKIF_MINOR(vdevice)); ++ ++ bd = bdget(info->dev); ++ if (bd == NULL) ++ return -ENODEV; ++ ++ err = xlvbd_alloc_gendisk(BLKIF_MINOR(vdevice), capacity, vdevice, ++ vdisk_info, sector_size, info); ++ ++ bdput(bd); ++ return err; ++} ++ ++void ++xlvbd_del(struct blkfront_info *info) ++{ ++ if (info->mi == NULL) ++ return; ++ ++ BUG_ON(info->gd == NULL); ++ del_gendisk(info->gd); ++ put_disk(info->gd); ++ info->gd = NULL; ++ ++ xlbd_put_major_info(info->mi); ++ info->mi = NULL; ++ ++ BUG_ON(info->rq == NULL); ++ blk_cleanup_queue(info->rq); ++ info->rq = NULL; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) ++int ++xlvbd_barrier(struct blkfront_info *info) ++{ ++ int err; ++ ++ err = blk_queue_ordered(info->rq, ++ info->feature_barrier ? QUEUE_ORDERED_DRAIN : QUEUE_ORDERED_NONE, NULL); ++ if (err) ++ return err; ++ printk("blkfront: %s: barriers %s\n", ++ info->gd->disk_name, info->feature_barrier ? 
"enabled" : "disabled"); ++ return 0; ++} ++#else ++int ++xlvbd_barrier(struct blkfront_info *info) ++{ ++ printk("blkfront: %s: barriers disabled\n", info->gd->disk_name); ++ return -ENOSYS; ++} ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blktap/Makefile 2007-08-27 14:01:54.000000000 -0400 +@@ -0,0 +1,5 @@ ++LINUXINCLUDE += -I../xen/include/public/io ++ ++obj-$(CONFIG_XEN_BLKDEV_TAP) := xenblktap.o ++ ++xenblktap-y := xenbus.o interface.o blktap.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blktap/blktap.c 2007-08-27 14:02:10.000000000 -0400 +@@ -0,0 +1,1528 @@ ++/****************************************************************************** ++ * drivers/xen/blktap/blktap.c ++ * ++ * Back-end driver for user level virtual block devices. This portion of the ++ * driver exports a 'unified' block-device interface that can be accessed ++ * by any operating system that implements a compatible front end. Requests ++ * are remapped to a user-space memory region. ++ * ++ * Based on the blkback driver code. ++ * ++ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield ++ * ++ * Clean ups and fix ups: ++ * Copyright (c) 2006, Steven Rostedt - Red Hat, Inc. ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/spinlock.h> ++#include <linux/kthread.h> ++#include <linux/list.h> ++#include <asm/hypervisor.h> ++#include "common.h" ++#include <xen/balloon.h> ++#include <xen/driver_util.h> ++#include <linux/kernel.h> ++#include <linux/fs.h> ++#include <linux/mm.h> ++#include <linux/errno.h> ++#include <linux/major.h> ++#include <linux/gfp.h> ++#include <linux/poll.h> ++#include <asm/tlbflush.h> ++ ++#define MAX_TAP_DEV 256 /*the maximum number of tapdisk ring devices */ ++#define MAX_DEV_NAME 100 /*the max tapdisk ring device name e.g. blktap0 */ ++ ++/* ++ * The maximum number of requests that can be outstanding at any time ++ * is determined by ++ * ++ * [mmap_alloc * MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST] ++ * ++ * where mmap_alloc < MAX_DYNAMIC_MEM. 
++ * ++ * TODO: ++ * mmap_alloc is initialised to 2 and should be adjustable on the fly via ++ * sysfs. ++ */ ++#define BLK_RING_SIZE __RING_SIZE((blkif_sring_t *)0, PAGE_SIZE) ++#define MAX_DYNAMIC_MEM BLK_RING_SIZE ++#define MAX_PENDING_REQS BLK_RING_SIZE ++#define MMAP_PAGES (MAX_PENDING_REQS * BLKIF_MAX_SEGMENTS_PER_REQUEST) ++#define MMAP_VADDR(_start, _req,_seg) \ ++ (_start + \ ++ ((_req) * BLKIF_MAX_SEGMENTS_PER_REQUEST * PAGE_SIZE) + \ ++ ((_seg) * PAGE_SIZE)) ++static int blkif_reqs = MAX_PENDING_REQS; ++static int mmap_pages = MMAP_PAGES; ++ ++#define RING_PAGES 1 /* BLKTAP - immediately before the mmap area, we ++ * have a bunch of pages reserved for shared ++ * memory rings. ++ */ ++ ++/*Data struct handed back to userspace for tapdisk device to VBD mapping*/ ++typedef struct domid_translate { ++ unsigned short domid; ++ unsigned short busid; ++} domid_translate_t ; ++ ++/*Data struct associated with each of the tapdisk devices*/ ++typedef struct tap_blkif { ++ struct vm_area_struct *vma; /*Shared memory area */ ++ unsigned long rings_vstart; /*Kernel memory mapping */ ++ unsigned long user_vstart; /*User memory mapping */ ++ unsigned long dev_inuse; /*One process opens device at a time. */ ++ unsigned long dev_pending; /*In process of being opened */ ++ unsigned long ring_ok; /*make this ring->state */ ++ blkif_front_ring_t ufe_ring; /*Rings up to user space. */ ++ wait_queue_head_t wait; /*for poll */ ++ unsigned long mode; /*current switching mode */ ++ int minor; /*Minor number for tapdisk device */ ++ pid_t pid; /*tapdisk process id */ ++ enum { RUNNING, CLEANSHUTDOWN } status; /*Detect a clean userspace ++ shutdown */ ++ unsigned long *idx_map; /*Record the user ring id to kern ++ [req id, idx] tuple */ ++ blkif_t *blkif; /*Associate blkif with tapdev */ ++ struct domid_translate trans; /*Translation from domid to bus. */ ++} tap_blkif_t; ++ ++static struct tap_blkif *tapfds[MAX_TAP_DEV]; ++static int blktap_next_minor; ++ ++static int __init set_blkif_reqs(char *str) ++{ ++ get_option(&str, &blkif_reqs); ++ return 1; ++} ++__setup("blkif_reqs=", set_blkif_reqs); ++ ++/* Run-time switchable: /sys/module/blktap/parameters/ */ ++static unsigned int log_stats = 0; ++static unsigned int debug_lvl = 0; ++module_param(log_stats, int, 0644); ++module_param(debug_lvl, int, 0644); ++ ++/* ++ * Each outstanding request that we've passed to the lower device layers has a ++ * 'pending_req' allocated to it. Each buffer_head that completes decrements ++ * the pendcnt towards zero. When it hits zero, the specified domain has a ++ * response queued for it, with the saved 'id' passed back. 
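++ *
++ * In this tap variant the slot travels from alloc_req() out to the
++ * user-space ring and back: blktap_read_ufe_ring() queues the reply
++ * via make_response() and then hands the slot back with free_req().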
++ */ ++typedef struct { ++ blkif_t *blkif; ++ u64 id; ++ unsigned short mem_idx; ++ int nr_pages; ++ atomic_t pendcnt; ++ unsigned short operation; ++ int status; ++ struct list_head free_list; ++ int inuse; ++} pending_req_t; ++ ++static pending_req_t *pending_reqs[MAX_PENDING_REQS]; ++static struct list_head pending_free; ++static DEFINE_SPINLOCK(pending_free_lock); ++static DECLARE_WAIT_QUEUE_HEAD (pending_free_wq); ++static int alloc_pending_reqs; ++ ++typedef unsigned int PEND_RING_IDX; ++ ++static inline int MASK_PEND_IDX(int i) { ++ return (i & (MAX_PENDING_REQS-1)); ++} ++ ++static inline unsigned int RTN_PEND_IDX(pending_req_t *req, int idx) { ++ return (req - pending_reqs[idx]); ++} ++ ++#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) ++ ++#define BLKBACK_INVALID_HANDLE (~0) ++ ++static struct page **foreign_pages[MAX_DYNAMIC_MEM]; ++static inline unsigned long idx_to_kaddr( ++ unsigned int mmap_idx, unsigned int req_idx, unsigned int sg_idx) ++{ ++ unsigned int arr_idx = req_idx*BLKIF_MAX_SEGMENTS_PER_REQUEST + sg_idx; ++ unsigned long pfn = page_to_pfn(foreign_pages[mmap_idx][arr_idx]); ++ return (unsigned long)pfn_to_kaddr(pfn); ++} ++ ++static unsigned short mmap_alloc = 0; ++static unsigned short mmap_lock = 0; ++static unsigned short mmap_inuse = 0; ++ ++/****************************************************************** ++ * GRANT HANDLES ++ */ ++ ++/* When using grant tables to map a frame for device access then the ++ * handle returned must be used to unmap the frame. This is needed to ++ * drop the ref count on the frame. ++ */ ++struct grant_handle_pair ++{ ++ grant_handle_t kernel; ++ grant_handle_t user; ++}; ++#define INVALID_GRANT_HANDLE 0xFFFF ++ ++static struct grant_handle_pair ++ pending_grant_handles[MAX_DYNAMIC_MEM][MMAP_PAGES]; ++#define pending_handle(_id, _idx, _i) \ ++ (pending_grant_handles[_id][((_idx) * BLKIF_MAX_SEGMENTS_PER_REQUEST) \ ++ + (_i)]) ++ ++ ++static int blktap_read_ufe_ring(tap_blkif_t *info); /*local prototypes*/ ++ ++#define BLKTAP_MINOR 0 /*/dev/xen/blktap has a dynamic major */ ++#define BLKTAP_DEV_DIR "/dev/xen" ++ ++static int blktap_major; ++ ++/* blktap IOCTLs: */ ++#define BLKTAP_IOCTL_KICK_FE 1 ++#define BLKTAP_IOCTL_KICK_BE 2 /* currently unused */ ++#define BLKTAP_IOCTL_SETMODE 3 ++#define BLKTAP_IOCTL_SENDPID 4 ++#define BLKTAP_IOCTL_NEWINTF 5 ++#define BLKTAP_IOCTL_MINOR 6 ++#define BLKTAP_IOCTL_MAJOR 7 ++#define BLKTAP_QUERY_ALLOC_REQS 8 ++#define BLKTAP_IOCTL_FREEINTF 9 ++#define BLKTAP_IOCTL_PRINT_IDXS 100 ++ ++/* blktap switching modes: (Set with BLKTAP_IOCTL_SETMODE) */ ++#define BLKTAP_MODE_PASSTHROUGH 0x00000000 /* default */ ++#define BLKTAP_MODE_INTERCEPT_FE 0x00000001 ++#define BLKTAP_MODE_INTERCEPT_BE 0x00000002 /* unimp. */ ++ ++#define BLKTAP_MODE_INTERPOSE \ ++ (BLKTAP_MODE_INTERCEPT_FE | BLKTAP_MODE_INTERCEPT_BE) ++ ++ ++static inline int BLKTAP_MODE_VALID(unsigned long arg) ++{ ++ return ((arg == BLKTAP_MODE_PASSTHROUGH ) || ++ (arg == BLKTAP_MODE_INTERCEPT_FE) || ++ (arg == BLKTAP_MODE_INTERPOSE )); ++} ++ ++/* Requests passing through the tap to userspace are re-assigned an ID. ++ * We must record a mapping between the BE [IDX,ID] tuple and the userspace ++ * ring ID. 
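++ *
++ * The encoding, by example:
++ *
++ *	id = MAKE_ID(1, 5)		packs to 0x00010005
++ *	ID_TO_MIDX(id)			recovers the mmap batch index, 1
++ *	MASK_PEND_IDX(ID_TO_IDX(id))	recovers the pending slot, 5
++ *
++ * Despite the domid_t-flavoured parameter name, callers pass the
++ * mmap batch index as MAKE_ID()'s first argument.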
++ */ ++ ++static inline unsigned long MAKE_ID(domid_t fe_dom, PEND_RING_IDX idx) ++{ ++ return ((fe_dom << 16) | MASK_PEND_IDX(idx)); ++} ++ ++extern inline PEND_RING_IDX ID_TO_IDX(unsigned long id) ++{ ++ return (PEND_RING_IDX)(id & 0x0000ffff); ++} ++ ++extern inline int ID_TO_MIDX(unsigned long id) ++{ ++ return (int)(id >> 16); ++} ++ ++#define INVALID_REQ 0xdead0000 ++ ++/*TODO: Convert to a free list*/ ++static inline int GET_NEXT_REQ(unsigned long *idx_map) ++{ ++ int i; ++ for (i = 0; i < MAX_PENDING_REQS; i++) ++ if (idx_map[i] == INVALID_REQ) ++ return i; ++ ++ return INVALID_REQ; ++} ++ ++ ++#define BLKTAP_INVALID_HANDLE(_g) \ ++ (((_g->kernel) == INVALID_GRANT_HANDLE) && \ ++ ((_g->user) == INVALID_GRANT_HANDLE)) ++ ++#define BLKTAP_INVALIDATE_HANDLE(_g) do { \ ++ (_g)->kernel = INVALID_GRANT_HANDLE; (_g)->user = INVALID_GRANT_HANDLE; \ ++ } while(0) ++ ++ ++/****************************************************************** ++ * BLKTAP VM OPS ++ */ ++ ++static struct page *blktap_nopage(struct vm_area_struct *vma, ++ unsigned long address, ++ int *type) ++{ ++ /* ++ * if the page has not been mapped in by the driver then return ++ * NOPAGE_SIGBUS to the domain. ++ */ ++ ++ return NOPAGE_SIGBUS; ++} ++ ++struct vm_operations_struct blktap_vm_ops = { ++ nopage: blktap_nopage, ++}; ++ ++/****************************************************************** ++ * BLKTAP FILE OPS ++ */ ++ ++/*Function Declarations*/ ++static tap_blkif_t *get_next_free_dev(void); ++static int blktap_open(struct inode *inode, struct file *filp); ++static int blktap_release(struct inode *inode, struct file *filp); ++static int blktap_mmap(struct file *filp, struct vm_area_struct *vma); ++static int blktap_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg); ++static unsigned int blktap_poll(struct file *file, poll_table *wait); ++ ++static const struct file_operations blktap_fops = { ++ .owner = THIS_MODULE, ++ .poll = blktap_poll, ++ .ioctl = blktap_ioctl, ++ .open = blktap_open, ++ .release = blktap_release, ++ .mmap = blktap_mmap, ++}; ++ ++ ++static tap_blkif_t *get_next_free_dev(void) ++{ ++ struct class *class; ++ tap_blkif_t *info; ++ int minor; ++ ++ /* ++ * This is called only from the ioctl, which ++ * means we should always have interrupts enabled. ++ */ ++ BUG_ON(irqs_disabled()); ++ ++ spin_lock_irq(&pending_free_lock); ++ ++ /* tapfds[0] is always NULL */ ++ ++ for (minor = 1; minor < blktap_next_minor; minor++) { ++ info = tapfds[minor]; ++ /* we could have failed a previous attempt. */ ++ if (!info || ++ ((info->dev_inuse == 0) && ++ (info->dev_pending == 0)) ) { ++ info->dev_pending = 1; ++ goto found; ++ } ++ } ++ info = NULL; ++ minor = -1; ++ ++ /* ++ * We didn't find free device. If we can still allocate ++ * more, then we grab the next device minor that is ++ * available. This is done while we are still under ++ * the protection of the pending_free_lock. ++ */ ++ if (blktap_next_minor < MAX_TAP_DEV) ++ minor = blktap_next_minor++; ++found: ++ spin_unlock_irq(&pending_free_lock); ++ ++ if (!info && minor > 0) { ++ info = kzalloc(sizeof(*info), GFP_KERNEL); ++ if (unlikely(!info)) { ++ /* ++ * If we failed here, try to put back ++ * the next minor number. But if one ++ * was just taken, then we just lose this ++ * minor. We can try to allocate this ++ * minor again later. 
++ */ ++ spin_lock_irq(&pending_free_lock); ++ if (blktap_next_minor == minor+1) ++ blktap_next_minor--; ++ spin_unlock_irq(&pending_free_lock); ++ goto out; ++ } ++ ++ info->minor = minor; ++ /* ++ * Make sure that we have a minor before others can ++ * see us. ++ */ ++ wmb(); ++ tapfds[minor] = info; ++ ++ if ((class = get_xen_class()) != NULL) ++ class_device_create(class, NULL, ++ MKDEV(blktap_major, minor), NULL, ++ "blktap%d", minor); ++ } ++ ++out: ++ return info; ++} ++ ++int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif) ++{ ++ tap_blkif_t *info; ++ int i; ++ ++ for (i = 1; i < blktap_next_minor; i++) { ++ info = tapfds[i]; ++ if ( info && ++ (info->trans.domid == domid) && ++ (info->trans.busid == xenbus_id) ) { ++ info->blkif = blkif; ++ info->status = RUNNING; ++ return i; ++ } ++ } ++ return -1; ++} ++ ++void signal_tapdisk(int idx) ++{ ++ tap_blkif_t *info; ++ struct task_struct *ptask; ++ ++ info = tapfds[idx]; ++ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) ++ return; ++ ++ if (info->pid > 0) { ++ ptask = find_task_by_pid(info->pid); ++ if (ptask) ++ info->status = CLEANSHUTDOWN; ++ } ++ info->blkif = NULL; ++ ++ return; ++} ++ ++static int blktap_open(struct inode *inode, struct file *filp) ++{ ++ blkif_sring_t *sring; ++ int idx = iminor(inode) - BLKTAP_MINOR; ++ tap_blkif_t *info; ++ int i; ++ ++ /* ctrl device, treat differently */ ++ if (!idx) ++ return 0; ++ ++ info = tapfds[idx]; ++ ++ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) { ++ WPRINTK("Unable to open device /dev/xen/blktap%d\n", ++ idx); ++ return -ENODEV; ++ } ++ ++ DPRINTK("Opening device /dev/xen/blktap%d\n",idx); ++ ++ /*Only one process can access device at a time*/ ++ if (test_and_set_bit(0, &info->dev_inuse)) ++ return -EBUSY; ++ ++ info->dev_pending = 0; ++ ++ /* Allocate the fe ring. */ ++ sring = (blkif_sring_t *)get_zeroed_page(GFP_KERNEL); ++ if (sring == NULL) ++ goto fail_nomem; ++ ++ SetPageReserved(virt_to_page(sring)); ++ ++ SHARED_RING_INIT(sring); ++ FRONT_RING_INIT(&info->ufe_ring, sring, PAGE_SIZE); ++ ++ filp->private_data = info; ++ info->vma = NULL; ++ ++ info->idx_map = kmalloc(sizeof(unsigned long) * MAX_PENDING_REQS, ++ GFP_KERNEL); ++ ++ if (idx > 0) { ++ init_waitqueue_head(&info->wait); ++ for (i = 0; i < MAX_PENDING_REQS; i++) ++ info->idx_map[i] = INVALID_REQ; ++ } ++ ++ DPRINTK("Tap open: device /dev/xen/blktap%d\n",idx); ++ return 0; ++ ++ fail_nomem: ++ return -ENOMEM; ++} ++ ++static int blktap_release(struct inode *inode, struct file *filp) ++{ ++ tap_blkif_t *info = filp->private_data; ++ ++ /* check for control device */ ++ if (!info) ++ return 0; ++ ++ info->dev_inuse = 0; ++ DPRINTK("Freeing device [/dev/xen/blktap%d]\n",info->minor); ++ ++ /* Free the ring page. */ ++ ClearPageReserved(virt_to_page(info->ufe_ring.sring)); ++ free_page((unsigned long) info->ufe_ring.sring); ++ ++ /* Clear any active mappings and free foreign map table */ ++ if (info->vma) { ++ zap_page_range( ++ info->vma, info->vma->vm_start, ++ info->vma->vm_end - info->vma->vm_start, NULL); ++ info->vma = NULL; ++ } ++ ++ if ( (info->status != CLEANSHUTDOWN) && (info->blkif != NULL) ) { ++ if (info->blkif->xenblkd != NULL) { ++ kthread_stop(info->blkif->xenblkd); ++ info->blkif->xenblkd = NULL; ++ } ++ info->status = CLEANSHUTDOWN; ++ } ++ return 0; ++} ++ ++ ++/* Note on mmap: ++ * We need to map pages to user space in a way that will allow the block ++ * subsystem set up direct IO to them. 
This couldn't be done before, because ++ * there isn't really a sane way to translate a user virtual address down to a ++ * physical address when the page belongs to another domain. ++ * ++ * My first approach was to map the page in to kernel memory, add an entry ++ * for it in the physical frame list (using alloc_lomem_region as in blkback) ++ * and then attempt to map that page up to user space. This is disallowed ++ * by xen though, which realizes that we don't really own the machine frame ++ * underlying the physical page. ++ * ++ * The new approach is to provide explicit support for this in xen linux. ++ * The VMA now has a flag, VM_FOREIGN, to indicate that it contains pages ++ * mapped from other vms. vma->vm_private_data is set up as a mapping ++ * from pages to actual page structs. There is a new clause in get_user_pages ++ * that does the right thing for this sort of mapping. ++ */ ++static int blktap_mmap(struct file *filp, struct vm_area_struct *vma) ++{ ++ int size; ++ struct page **map; ++ int i; ++ tap_blkif_t *info = filp->private_data; ++ ++ if (info == NULL) { ++ WPRINTK("blktap: mmap, retrieving idx failed\n"); ++ return -ENOMEM; ++ } ++ ++ vma->vm_flags |= VM_RESERVED; ++ vma->vm_ops = &blktap_vm_ops; ++ ++ size = vma->vm_end - vma->vm_start; ++ if (size != ((mmap_pages + RING_PAGES) << PAGE_SHIFT)) { ++ WPRINTK("you _must_ map exactly %d pages!\n", ++ mmap_pages + RING_PAGES); ++ return -EAGAIN; ++ } ++ ++ size >>= PAGE_SHIFT; ++ info->rings_vstart = vma->vm_start; ++ info->user_vstart = info->rings_vstart + (RING_PAGES << PAGE_SHIFT); ++ ++ /* Map the ring pages to the start of the region and reserve it. */ ++ if (remap_pfn_range(vma, vma->vm_start, ++ __pa(info->ufe_ring.sring) >> PAGE_SHIFT, ++ PAGE_SIZE, vma->vm_page_prot)) { ++ WPRINTK("Mapping user ring failed!\n"); ++ goto fail; ++ } ++ ++ /* Mark this VM as containing foreign pages, and set up mappings. */ ++ map = kzalloc(((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) ++ * sizeof(struct page_struct*), ++ GFP_KERNEL); ++ if (map == NULL) { ++ WPRINTK("Couldn't alloc VM_FOREIGN map.\n"); ++ goto fail; ++ } ++ ++ for (i = 0; i < ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT); i++) ++ map[i] = NULL; ++ ++ vma->vm_private_data = map; ++ vma->vm_flags |= VM_FOREIGN; ++ ++ info->vma = vma; ++ info->ring_ok = 1; ++ return 0; ++ fail: ++ /* Clear any active mappings. */ ++ zap_page_range(vma, vma->vm_start, ++ vma->vm_end - vma->vm_start, NULL); ++ ++ return -ENOMEM; ++} ++ ++ ++static int blktap_ioctl(struct inode *inode, struct file *filp, ++ unsigned int cmd, unsigned long arg) ++{ ++ tap_blkif_t *info = filp->private_data; ++ ++ switch(cmd) { ++ case BLKTAP_IOCTL_KICK_FE: ++ { ++ /* There are fe messages to process. */ ++ return blktap_read_ufe_ring(info); ++ } ++ case BLKTAP_IOCTL_SETMODE: ++ { ++ if (info) { ++ if (BLKTAP_MODE_VALID(arg)) { ++ info->mode = arg; ++ /* XXX: may need to flush rings here. 
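++ * (No flush is performed in this version; the new mode
++ * simply applies to requests handled after the ioctl
++ * returns.)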
*/ ++ DPRINTK("blktap: set mode to %lx\n", ++ arg); ++ return 0; ++ } ++ } ++ return 0; ++ } ++ case BLKTAP_IOCTL_PRINT_IDXS: ++ { ++ if (info) { ++ printk("User Rings: \n-----------\n"); ++ printk("UF: rsp_cons: %2d, req_prod_prv: %2d " ++ "| req_prod: %2d, rsp_prod: %2d\n", ++ info->ufe_ring.rsp_cons, ++ info->ufe_ring.req_prod_pvt, ++ info->ufe_ring.sring->req_prod, ++ info->ufe_ring.sring->rsp_prod); ++ } ++ return 0; ++ } ++ case BLKTAP_IOCTL_SENDPID: ++ { ++ if (info) { ++ info->pid = (pid_t)arg; ++ DPRINTK("blktap: pid received %d\n", ++ info->pid); ++ } ++ return 0; ++ } ++ case BLKTAP_IOCTL_NEWINTF: ++ { ++ uint64_t val = (uint64_t)arg; ++ domid_translate_t *tr = (domid_translate_t *)&val; ++ ++ DPRINTK("NEWINTF Req for domid %d and bus id %d\n", ++ tr->domid, tr->busid); ++ info = get_next_free_dev(); ++ if (!info) { ++ WPRINTK("Error initialising /dev/xen/blktap - " ++ "No more devices\n"); ++ return -1; ++ } ++ info->trans.domid = tr->domid; ++ info->trans.busid = tr->busid; ++ return info->minor; ++ } ++ case BLKTAP_IOCTL_FREEINTF: ++ { ++ unsigned long dev = arg; ++ unsigned long flags; ++ ++ info = tapfds[dev]; ++ ++ if ((dev > MAX_TAP_DEV) || !info) ++ return 0; /* should this be an error? */ ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ if (info->dev_pending) ++ info->dev_pending = 0; ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ ++ return 0; ++ } ++ case BLKTAP_IOCTL_MINOR: ++ { ++ unsigned long dev = arg; ++ ++ info = tapfds[dev]; ++ ++ if ((dev > MAX_TAP_DEV) || !info) ++ return -EINVAL; ++ ++ return info->minor; ++ } ++ case BLKTAP_IOCTL_MAJOR: ++ return blktap_major; ++ ++ case BLKTAP_QUERY_ALLOC_REQS: ++ { ++ WPRINTK("BLKTAP_QUERY_ALLOC_REQS ioctl: %d/%d\n", ++ alloc_pending_reqs, blkif_reqs); ++ return (alloc_pending_reqs/blkif_reqs) * 100; ++ } ++ } ++ return -ENOIOCTLCMD; ++} ++ ++static unsigned int blktap_poll(struct file *filp, poll_table *wait) ++{ ++ tap_blkif_t *info = filp->private_data; ++ ++ /* do not work on the control device */ ++ if (!info) ++ return 0; ++ ++ poll_wait(filp, &info->wait, wait); ++ if (info->ufe_ring.req_prod_pvt != info->ufe_ring.sring->req_prod) { ++ RING_PUSH_REQUESTS(&info->ufe_ring); ++ return POLLIN | POLLRDNORM; ++ } ++ return 0; ++} ++ ++void blktap_kick_user(int idx) ++{ ++ tap_blkif_t *info; ++ ++ info = tapfds[idx]; ++ ++ if ((idx < 0) || (idx > MAX_TAP_DEV) || !info) ++ return; ++ ++ wake_up_interruptible(&info->wait); ++ ++ return; ++} ++ ++static int do_block_io_op(blkif_t *blkif); ++static void dispatch_rw_block_io(blkif_t *blkif, ++ blkif_request_t *req, ++ pending_req_t *pending_req); ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st); ++ ++/****************************************************************** ++ * misc small helpers ++ */ ++static int req_increase(void) ++{ ++ int i, j; ++ ++ if (mmap_alloc >= MAX_PENDING_REQS || mmap_lock) ++ return -EINVAL; ++ ++ pending_reqs[mmap_alloc] = kzalloc(sizeof(pending_req_t) ++ * blkif_reqs, GFP_KERNEL); ++ foreign_pages[mmap_alloc] = alloc_empty_pages_and_pagevec(mmap_pages); ++ ++ if (!pending_reqs[mmap_alloc] || !foreign_pages[mmap_alloc]) ++ goto out_of_memory; ++ ++ DPRINTK("%s: reqs=%d, pages=%d\n", ++ __FUNCTION__, blkif_reqs, mmap_pages); ++ ++ for (i = 0; i < MAX_PENDING_REQS; i++) { ++ list_add_tail(&pending_reqs[mmap_alloc][i].free_list, ++ &pending_free); ++ pending_reqs[mmap_alloc][i].mem_idx = mmap_alloc; ++ for (j = 0; j < BLKIF_MAX_SEGMENTS_PER_REQUEST; j++) ++ 
BLKTAP_INVALIDATE_HANDLE(&pending_handle(mmap_alloc, ++ i, j)); ++ } ++ ++ mmap_alloc++; ++ DPRINTK("# MMAPs increased to %d\n",mmap_alloc); ++ return 0; ++ ++ out_of_memory: ++ free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); ++ kfree(pending_reqs[mmap_alloc]); ++ WPRINTK("%s: out of memory\n", __FUNCTION__); ++ return -ENOMEM; ++} ++ ++static void mmap_req_del(int mmap) ++{ ++ BUG_ON(!spin_is_locked(&pending_free_lock)); ++ ++ kfree(pending_reqs[mmap]); ++ pending_reqs[mmap] = NULL; ++ ++ free_empty_pages_and_pagevec(foreign_pages[mmap_alloc], mmap_pages); ++ foreign_pages[mmap] = NULL; ++ ++ mmap_lock = 0; ++ DPRINTK("# MMAPs decreased to %d\n",mmap_alloc); ++ mmap_alloc--; ++} ++ ++static pending_req_t* alloc_req(void) ++{ ++ pending_req_t *req = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ ++ if (!list_empty(&pending_free)) { ++ req = list_entry(pending_free.next, pending_req_t, free_list); ++ list_del(&req->free_list); ++ } ++ ++ if (req) { ++ req->inuse = 1; ++ alloc_pending_reqs++; ++ } ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ ++ return req; ++} ++ ++static void free_req(pending_req_t *req) ++{ ++ unsigned long flags; ++ int was_empty; ++ ++ spin_lock_irqsave(&pending_free_lock, flags); ++ ++ alloc_pending_reqs--; ++ req->inuse = 0; ++ if (mmap_lock && (req->mem_idx == mmap_alloc-1)) { ++ mmap_inuse--; ++ if (mmap_inuse == 0) mmap_req_del(mmap_alloc-1); ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ return; ++ } ++ was_empty = list_empty(&pending_free); ++ list_add(&req->free_list, &pending_free); ++ ++ spin_unlock_irqrestore(&pending_free_lock, flags); ++ ++ if (was_empty) ++ wake_up(&pending_free_wq); ++} ++ ++static void fast_flush_area(pending_req_t *req, int k_idx, int u_idx, ++ int tapidx) ++{ ++ struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; ++ unsigned int i, invcount = 0; ++ struct grant_handle_pair *khandle; ++ uint64_t ptep; ++ int ret, mmap_idx; ++ unsigned long kvaddr, uvaddr; ++ tap_blkif_t *info; ++ ++ ++ info = tapfds[tapidx]; ++ ++ if ((tapidx < 0) || (tapidx > MAX_TAP_DEV) || !info) { ++ WPRINTK("fast_flush: Couldn't get info!\n"); ++ return; ++ } ++ ++ if (info->vma != NULL && ++ xen_feature(XENFEAT_auto_translated_physmap)) { ++ down_write(&info->vma->vm_mm->mmap_sem); ++ zap_page_range(info->vma, ++ MMAP_VADDR(info->user_vstart, u_idx, 0), ++ req->nr_pages << PAGE_SHIFT, NULL); ++ up_write(&info->vma->vm_mm->mmap_sem); ++ } ++ ++ mmap_idx = req->mem_idx; ++ ++ for (i = 0; i < req->nr_pages; i++) { ++ kvaddr = idx_to_kaddr(mmap_idx, k_idx, i); ++ uvaddr = MMAP_VADDR(info->user_vstart, u_idx, i); ++ ++ khandle = &pending_handle(mmap_idx, k_idx, i); ++ ++ if (khandle->kernel != INVALID_GRANT_HANDLE) { ++ gnttab_set_unmap_op(&unmap[invcount], ++ idx_to_kaddr(mmap_idx, k_idx, i), ++ GNTMAP_host_map, khandle->kernel); ++ invcount++; ++ } ++ ++ if (khandle->user != INVALID_GRANT_HANDLE) { ++ BUG_ON(xen_feature(XENFEAT_auto_translated_physmap)); ++ if (create_lookup_pte_addr( ++ info->vma->vm_mm, ++ MMAP_VADDR(info->user_vstart, u_idx, i), ++ &ptep) !=0) { ++ WPRINTK("Couldn't get a pte addr!\n"); ++ return; ++ } ++ ++ gnttab_set_unmap_op(&unmap[invcount], ptep, ++ GNTMAP_host_map ++ | GNTMAP_application_map ++ | GNTMAP_contains_pte, ++ khandle->user); ++ invcount++; ++ } ++ ++ BLKTAP_INVALIDATE_HANDLE(khandle); ++ } ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, unmap, invcount); ++ BUG_ON(ret); ++ ++ if (info->vma != NULL && 
!xen_feature(XENFEAT_auto_translated_physmap)) ++ zap_page_range(info->vma, ++ MMAP_VADDR(info->user_vstart, u_idx, 0), ++ req->nr_pages << PAGE_SHIFT, NULL); ++} ++ ++/****************************************************************** ++ * SCHEDULER FUNCTIONS ++ */ ++ ++static void print_stats(blkif_t *blkif) ++{ ++ printk(KERN_DEBUG "%s: oo %3d | rd %4d | wr %4d\n", ++ current->comm, blkif->st_oo_req, ++ blkif->st_rd_req, blkif->st_wr_req); ++ blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); ++ blkif->st_rd_req = 0; ++ blkif->st_wr_req = 0; ++ blkif->st_oo_req = 0; ++} ++ ++int tap_blkif_schedule(void *arg) ++{ ++ blkif_t *blkif = arg; ++ ++ blkif_get(blkif); ++ ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: started\n", current->comm); ++ ++ while (!kthread_should_stop()) { ++ wait_event_interruptible( ++ blkif->wq, ++ blkif->waiting_reqs || kthread_should_stop()); ++ wait_event_interruptible( ++ pending_free_wq, ++ !list_empty(&pending_free) || kthread_should_stop()); ++ ++ blkif->waiting_reqs = 0; ++ smp_mb(); /* clear flag *before* checking for work */ ++ ++ if (do_block_io_op(blkif)) ++ blkif->waiting_reqs = 1; ++ ++ if (log_stats && time_after(jiffies, blkif->st_print)) ++ print_stats(blkif); ++ } ++ ++ if (log_stats) ++ print_stats(blkif); ++ if (debug_lvl) ++ printk(KERN_DEBUG "%s: exiting\n", current->comm); ++ ++ blkif->xenblkd = NULL; ++ blkif_put(blkif); ++ ++ return 0; ++} ++ ++/****************************************************************** ++ * COMPLETION CALLBACK -- Called by user level ioctl() ++ */ ++ ++static int blktap_read_ufe_ring(tap_blkif_t *info) ++{ ++ /* This is called to read responses from the UFE ring. */ ++ RING_IDX i, j, rp; ++ blkif_response_t *resp; ++ blkif_t *blkif=NULL; ++ int pending_idx, usr_idx, mmap_idx; ++ pending_req_t *pending_req; ++ ++ if (!info) ++ return 0; ++ ++ /* We currently only forward packets in INTERCEPT_FE mode. */ ++ if (!(info->mode & BLKTAP_MODE_INTERCEPT_FE)) ++ return 0; ++ ++ /* for each outstanding message on the UFEring */ ++ rp = info->ufe_ring.sring->rsp_prod; ++ rmb(); ++ ++ for (i = info->ufe_ring.rsp_cons; i != rp; i++) { ++ blkif_response_t res; ++ resp = RING_GET_RESPONSE(&info->ufe_ring, i); ++ memcpy(&res, resp, sizeof(res)); ++ mb(); /* rsp_cons read by RING_FULL() in do_block_io_op(). 
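++ * The full barrier orders the copy out of the response slot
++ * before the rsp_cons update below: once do_block_io_op()
++ * observes the new rsp_cons it may reuse that slot for a
++ * fresh request.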
*/ ++ ++info->ufe_ring.rsp_cons; ++ ++ /*retrieve [usr_idx] to [mmap_idx,pending_idx] mapping*/ ++ usr_idx = (int)res.id; ++ pending_idx = MASK_PEND_IDX(ID_TO_IDX(info->idx_map[usr_idx])); ++ mmap_idx = ID_TO_MIDX(info->idx_map[usr_idx]); ++ ++ if ( (mmap_idx >= mmap_alloc) || ++ (ID_TO_IDX(info->idx_map[usr_idx]) >= MAX_PENDING_REQS) ) ++ WPRINTK("Incorrect req map" ++ "[%d], internal map [%d,%d (%d)]\n", ++ usr_idx, mmap_idx, ++ ID_TO_IDX(info->idx_map[usr_idx]), ++ MASK_PEND_IDX( ++ ID_TO_IDX(info->idx_map[usr_idx]))); ++ ++ pending_req = &pending_reqs[mmap_idx][pending_idx]; ++ blkif = pending_req->blkif; ++ ++ for (j = 0; j < pending_req->nr_pages; j++) { ++ ++ unsigned long kvaddr, uvaddr; ++ struct page **map = info->vma->vm_private_data; ++ struct page *pg; ++ int offset; ++ ++ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, j); ++ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, j); ++ ++ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ++ ClearPageReserved(pg); ++ offset = (uvaddr - info->vma->vm_start) ++ >> PAGE_SHIFT; ++ map[offset] = NULL; ++ } ++ fast_flush_area(pending_req, pending_idx, usr_idx, info->minor); ++ info->idx_map[usr_idx] = INVALID_REQ; ++ make_response(blkif, pending_req->id, res.operation, ++ res.status); ++ blkif_put(pending_req->blkif); ++ free_req(pending_req); ++ } ++ ++ return 0; ++} ++ ++ ++/****************************************************************************** ++ * NOTIFICATION FROM GUEST OS. ++ */ ++ ++static void blkif_notify_work(blkif_t *blkif) ++{ ++ blkif->waiting_reqs = 1; ++ wake_up(&blkif->wq); ++} ++ ++irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ blkif_notify_work(dev_id); ++ return IRQ_HANDLED; ++} ++ ++ ++ ++/****************************************************************** ++ * DOWNWARD CALLS -- These interface with the block-device layer proper. ++ */ ++static int print_dbug = 1; ++static int do_block_io_op(blkif_t *blkif) ++{ ++ blkif_back_rings_t *blk_rings = &blkif->blk_rings; ++ blkif_request_t req; ++ pending_req_t *pending_req; ++ RING_IDX rc, rp; ++ int more_to_do = 0; ++ tap_blkif_t *info; ++ ++ rc = blk_rings->common.req_cons; ++ rp = blk_rings->common.sring->req_prod; ++ rmb(); /* Ensure we see queued requests up to 'rp'. */ ++ ++ /*Check blkif has corresponding UE ring*/ ++ if (blkif->dev_num < 0) { ++ /*oops*/ ++ if (print_dbug) { ++ WPRINTK("Corresponding UE " ++ "ring does not exist!\n"); ++ print_dbug = 0; /*We only print this message once*/ ++ } ++ return 0; ++ } ++ ++ info = tapfds[blkif->dev_num]; ++ ++ if (blkif->dev_num > MAX_TAP_DEV || !info || !info->dev_inuse) { ++ if (print_dbug) { ++ WPRINTK("Can't get UE info!\n"); ++ print_dbug = 0; ++ } ++ return 0; ++ } ++ ++ while (rc != rp) { ++ ++ if (RING_FULL(&info->ufe_ring)) { ++ WPRINTK("RING_FULL! More to do\n"); ++ more_to_do = 1; ++ break; ++ } ++ ++ if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc)) { ++ WPRINTK("RING_REQUEST_CONS_OVERFLOW!" 
++ " More to do\n"); ++ more_to_do = 1; ++ break; ++ } ++ ++ pending_req = alloc_req(); ++ if (NULL == pending_req) { ++ blkif->st_oo_req++; ++ more_to_do = 1; ++ break; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), ++ sizeof(req)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ blkif_get_x86_32_req(&req, RING_GET_REQUEST(&blk_rings->x86_32, rc)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ blkif_get_x86_64_req(&req, RING_GET_REQUEST(&blk_rings->x86_64, rc)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.req_cons = ++rc; /* before make_response() */ ++ ++ switch (req.operation) { ++ case BLKIF_OP_READ: ++ blkif->st_rd_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ ++ case BLKIF_OP_WRITE: ++ blkif->st_wr_req++; ++ dispatch_rw_block_io(blkif, &req, pending_req); ++ break; ++ ++ default: ++ WPRINTK("unknown operation [%d]\n", ++ req.operation); ++ make_response(blkif, req.id, req.operation, ++ BLKIF_RSP_ERROR); ++ free_req(pending_req); ++ break; ++ } ++ } ++ ++ blktap_kick_user(blkif->dev_num); ++ ++ return more_to_do; ++} ++ ++static void dispatch_rw_block_io(blkif_t *blkif, ++ blkif_request_t *req, ++ pending_req_t *pending_req) ++{ ++ extern void ll_rw_block(int rw, int nr, struct buffer_head * bhs[]); ++ int op, operation = (req->operation == BLKIF_OP_WRITE) ? WRITE : READ; ++ struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST*2]; ++ unsigned int nseg; ++ int ret, i, nr_sects = 0; ++ tap_blkif_t *info; ++ uint64_t sector; ++ blkif_request_t *target; ++ int pending_idx = RTN_PEND_IDX(pending_req,pending_req->mem_idx); ++ int usr_idx; ++ uint16_t mmap_idx = pending_req->mem_idx; ++ ++ if (blkif->dev_num < 0 || blkif->dev_num > MAX_TAP_DEV) ++ goto fail_response; ++ ++ info = tapfds[blkif->dev_num]; ++ if (info == NULL) ++ goto fail_response; ++ ++ /* Check we have space on user ring - should never fail. */ ++ usr_idx = GET_NEXT_REQ(info->idx_map); ++ if (usr_idx == INVALID_REQ) { ++ BUG(); ++ goto fail_response; ++ } ++ ++ /* Check that number of segments is sane. */ ++ nseg = req->nr_segments; ++ if ( unlikely(nseg == 0) || ++ unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) ) { ++ WPRINTK("Bad number of segments in request (%d)\n", nseg); ++ goto fail_response; ++ } ++ ++ /* Make sure userspace is ready. */ ++ if (!info->ring_ok) { ++ WPRINTK("blktap: ring not ready for requests!\n"); ++ goto fail_response; ++ } ++ ++ if (RING_FULL(&info->ufe_ring)) { ++ WPRINTK("blktap: fe_ring is full, can't add " ++ "IO Request will be dropped. %d %d\n", ++ RING_SIZE(&info->ufe_ring), ++ RING_SIZE(&blkif->blk_rings.common)); ++ goto fail_response; ++ } ++ ++ pending_req->blkif = blkif; ++ pending_req->id = req->id; ++ pending_req->operation = operation; ++ pending_req->status = BLKIF_RSP_OKAY; ++ pending_req->nr_pages = nseg; ++ op = 0; ++ for (i = 0; i < nseg; i++) { ++ unsigned long uvaddr; ++ unsigned long kvaddr; ++ uint64_t ptep; ++ uint32_t flags; ++ ++ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); ++ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); ++ ++ sector = req->sector_number + ((PAGE_SIZE / 512) * i); ++ if( (blkif->sectors > 0) && (sector >= blkif->sectors) ) { ++ WPRINTK("BLKTAP: Sector request greater" ++ "than size\n"); ++ WPRINTK("BLKTAP: %s request sector" ++ "[%llu,%llu], Total [%llu]\n", ++ (req->operation == ++ BLKIF_OP_WRITE ? 
"WRITE" : "READ"), ++ (long long unsigned) sector, ++ (long long unsigned) sector>>9, ++ (long long unsigned) blkif->sectors); ++ } ++ ++ flags = GNTMAP_host_map; ++ if (operation == WRITE) ++ flags |= GNTMAP_readonly; ++ gnttab_set_map_op(&map[op], kvaddr, flags, ++ req->seg[i].gref, blkif->domid); ++ op++; ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ /* Now map it to user. */ ++ ret = create_lookup_pte_addr(info->vma->vm_mm, ++ uvaddr, &ptep); ++ if (ret) { ++ WPRINTK("Couldn't get a pte addr!\n"); ++ goto fail_flush; ++ } ++ ++ flags = GNTMAP_host_map | GNTMAP_application_map ++ | GNTMAP_contains_pte; ++ if (operation == WRITE) ++ flags |= GNTMAP_readonly; ++ gnttab_set_map_op(&map[op], ptep, flags, ++ req->seg[i].gref, blkif->domid); ++ op++; ++ } ++ ++ nr_sects += (req->seg[i].last_sect - ++ req->seg[i].first_sect + 1); ++ } ++ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, map, op); ++ BUG_ON(ret); ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ for (i = 0; i < (nseg*2); i+=2) { ++ unsigned long uvaddr; ++ unsigned long kvaddr; ++ unsigned long offset; ++ struct page *pg; ++ ++ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i/2); ++ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i/2); ++ ++ if (unlikely(map[i].status != 0)) { ++ WPRINTK("invalid kernel buffer -- " ++ "could not remap it\n"); ++ ret |= 1; ++ map[i].handle = INVALID_GRANT_HANDLE; ++ } ++ ++ if (unlikely(map[i+1].status != 0)) { ++ WPRINTK("invalid user buffer -- " ++ "could not remap it\n"); ++ ret |= 1; ++ map[i+1].handle = INVALID_GRANT_HANDLE; ++ } ++ ++ pending_handle(mmap_idx, pending_idx, i/2).kernel ++ = map[i].handle; ++ pending_handle(mmap_idx, pending_idx, i/2).user ++ = map[i+1].handle; ++ ++ if (ret) ++ continue; ++ ++ set_phys_to_machine(__pa(kvaddr) >> PAGE_SHIFT, ++ FOREIGN_FRAME(map[i].dev_bus_addr ++ >> PAGE_SHIFT)); ++ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; ++ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ++ ((struct page **)info->vma->vm_private_data)[offset] = ++ pg; ++ } ++ } else { ++ for (i = 0; i < nseg; i++) { ++ unsigned long uvaddr; ++ unsigned long kvaddr; ++ unsigned long offset; ++ struct page *pg; ++ ++ uvaddr = MMAP_VADDR(info->user_vstart, usr_idx, i); ++ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); ++ ++ if (unlikely(map[i].status != 0)) { ++ WPRINTK("invalid kernel buffer -- " ++ "could not remap it\n"); ++ ret |= 1; ++ map[i].handle = INVALID_GRANT_HANDLE; ++ } ++ ++ pending_handle(mmap_idx, pending_idx, i).kernel ++ = map[i].handle; ++ ++ if (ret) ++ continue; ++ ++ offset = (uvaddr - info->vma->vm_start) >> PAGE_SHIFT; ++ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ++ ((struct page **)info->vma->vm_private_data)[offset] = ++ pg; ++ } ++ } ++ ++ if (ret) ++ goto fail_flush; ++ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ down_write(&info->vma->vm_mm->mmap_sem); ++ /* Mark mapped pages as reserved: */ ++ for (i = 0; i < req->nr_segments; i++) { ++ unsigned long kvaddr; ++ struct page *pg; ++ ++ kvaddr = idx_to_kaddr(mmap_idx, pending_idx, i); ++ pg = pfn_to_page(__pa(kvaddr) >> PAGE_SHIFT); ++ SetPageReserved(pg); ++ if (xen_feature(XENFEAT_auto_translated_physmap)) { ++ ret = vm_insert_page(info->vma, ++ MMAP_VADDR(info->user_vstart, ++ usr_idx, i), pg); ++ if (ret) { ++ up_write(&info->vma->vm_mm->mmap_sem); ++ goto fail_flush; ++ } ++ } ++ } ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ up_write(&info->vma->vm_mm->mmap_sem); ++ ++ /*record [mmap_idx,pending_idx] to [usr_idx] mapping*/ ++ 
info->idx_map[usr_idx] = MAKE_ID(mmap_idx, pending_idx); ++ ++ blkif_get(blkif); ++ /* Finally, write the request message to the user ring. */ ++ target = RING_GET_REQUEST(&info->ufe_ring, ++ info->ufe_ring.req_prod_pvt); ++ memcpy(target, req, sizeof(*req)); ++ target->id = usr_idx; ++ wmb(); /* blktap_poll() reads req_prod_pvt asynchronously */ ++ info->ufe_ring.req_prod_pvt++; ++ ++ if (operation == READ) ++ blkif->st_rd_sect += nr_sects; ++ else if (operation == WRITE) ++ blkif->st_wr_sect += nr_sects; ++ ++ return; ++ ++ fail_flush: ++ WPRINTK("Reached Fail_flush\n"); ++ fast_flush_area(pending_req, pending_idx, usr_idx, blkif->dev_num); ++ fail_response: ++ make_response(blkif, req->id, req->operation, BLKIF_RSP_ERROR); ++ free_req(pending_req); ++} ++ ++ ++ ++/****************************************************************** ++ * MISCELLANEOUS SETUP / TEARDOWN / DEBUGGING ++ */ ++ ++ ++static void make_response(blkif_t *blkif, u64 id, ++ unsigned short op, int st) ++{ ++ blkif_response_t resp; ++ unsigned long flags; ++ blkif_back_rings_t *blk_rings = &blkif->blk_rings; ++ int more_to_do = 0; ++ int notify; ++ ++ resp.id = id; ++ resp.operation = op; ++ resp.status = st; ++ ++ spin_lock_irqsave(&blkif->blk_ring_lock, flags); ++ /* Place on the response ring for the relevant domain. */ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ memcpy(RING_GET_RESPONSE(&blk_rings->native, ++ blk_rings->native.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_32: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_32, ++ blk_rings->x86_32.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ case BLKIF_PROTOCOL_X86_64: ++ memcpy(RING_GET_RESPONSE(&blk_rings->x86_64, ++ blk_rings->x86_64.rsp_prod_pvt), ++ &resp, sizeof(resp)); ++ break; ++ default: ++ BUG(); ++ } ++ blk_rings->common.rsp_prod_pvt++; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); ++ ++ if (blk_rings->common.rsp_prod_pvt == blk_rings->common.req_cons) { ++ /* ++ * Tail check for pending requests. Allows frontend to avoid ++ * notifications if requests are already in flight (lower ++ * overheads and promotes batching). ++ */ ++ RING_FINAL_CHECK_FOR_REQUESTS(&blk_rings->common, more_to_do); ++ } else if (RING_HAS_UNCONSUMED_REQUESTS(&blk_rings->common)) { ++ more_to_do = 1; ++ } ++ ++ spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); ++ if (more_to_do) ++ blkif_notify_work(blkif); ++ if (notify) ++ notify_remote_via_irq(blkif->irq); ++} ++ ++static int __init blkif_init(void) ++{ ++ int i, ret; ++ struct class *class; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ INIT_LIST_HEAD(&pending_free); ++ for(i = 0; i < 2; i++) { ++ ret = req_increase(); ++ if (ret) ++ break; ++ } ++ if (i == 0) ++ return ret; ++ ++ tap_blkif_interface_init(); ++ ++ alloc_pending_reqs = 0; ++ ++ tap_blkif_xenbus_init(); ++ ++ /* Dynamically allocate a major for this device */ ++ ret = register_chrdev(0, "blktap", &blktap_fops); ++ ++ if (ret < 0) { ++ WPRINTK("Couldn't register /dev/xen/blktap\n"); ++ return -ENOMEM; ++ } ++ ++ blktap_major = ret; ++ ++ /* tapfds[0] is always NULL */ ++ blktap_next_minor++; ++ ++ DPRINTK("Created misc_dev [/dev/xen/blktap%d]\n",i); ++ ++ /* Make sure the xen class exists */ ++ if ((class = get_xen_class()) != NULL) { ++ /* ++ * This will allow udev to create the blktap ctrl device. ++ * We only want to create blktap0 first. We don't want ++ * to flood the sysfs system with needless blktap devices. 
++ * We only create the device when a request of a new device is ++ * made. ++ */ ++ class_device_create(class, NULL, ++ MKDEV(blktap_major, 0), NULL, ++ "blktap0"); ++ } else { ++ /* this is bad, but not fatal */ ++ WPRINTK("blktap: sysfs xen_class not created\n"); ++ } ++ ++ DPRINTK("Blktap device successfully created\n"); ++ ++ return 0; ++} ++ ++module_init(blkif_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blktap/common.h 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,121 @@ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __BLKIF__BACKEND__COMMON_H__ ++#define __BLKIF__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <linux/blkdev.h> ++#include <linux/vmalloc.h> ++#include <asm/io.h> ++#include <asm/setup.h> ++#include <asm/pgalloc.h> ++#include <xen/evtchn.h> ++#include <asm/hypervisor.h> ++#include <xen/blkif.h> ++#include <xen/gnttab.h> ++#include <xen/driver_util.h> ++ ++#define DPRINTK(_f, _a...) pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++ ++#define WPRINTK(fmt, args...) printk(KERN_WARNING "blk_tap: " fmt, ##args) ++ ++struct backend_info; ++ ++typedef struct blkif_st { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ /* Physical parameters of the comms window. */ ++ unsigned int irq; ++ /* Comms information. */ ++ enum blkif_protocol blk_protocol; ++ blkif_back_rings_t blk_rings; ++ struct vm_struct *blk_ring_area; ++ /* Back pointer to the backend_info. */ ++ struct backend_info *be; ++ /* Private fields. 
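++ * (the ring lock, reference count, xenblkd worker state and
++ * the per-device statistics below)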
*/ ++ spinlock_t blk_ring_lock; ++ atomic_t refcnt; ++ ++ wait_queue_head_t wq; ++ struct task_struct *xenblkd; ++ unsigned int waiting_reqs; ++ request_queue_t *plug; ++ ++ /* statistics */ ++ unsigned long st_print; ++ int st_rd_req; ++ int st_wr_req; ++ int st_oo_req; ++ int st_rd_sect; ++ int st_wr_sect; ++ ++ wait_queue_head_t waiting_to_free; ++ ++ grant_handle_t shmem_handle; ++ grant_ref_t shmem_ref; ++ ++ int dev_num; ++ uint64_t sectors; ++} blkif_t; ++ ++blkif_t *tap_alloc_blkif(domid_t domid); ++void tap_blkif_free(blkif_t *blkif); ++int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, ++ unsigned int evtchn); ++void tap_blkif_unmap(blkif_t *blkif); ++ ++#define blkif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define blkif_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ wake_up(&(_b)->waiting_to_free);\ ++ } while (0) ++ ++ ++struct phys_req { ++ unsigned short dev; ++ unsigned short nr_sects; ++ struct block_device *bdev; ++ blkif_sector_t sector_number; ++}; ++ ++void tap_blkif_interface_init(void); ++ ++void tap_blkif_xenbus_init(void); ++ ++irqreturn_t tap_blkif_be_int(int irq, void *dev_id, struct pt_regs *regs); ++int tap_blkif_schedule(void *arg); ++ ++int dom_to_devid(domid_t domid, int xenbus_id, blkif_t *blkif); ++void signal_tapdisk(int idx); ++ ++#endif /* __BLKIF__BACKEND__COMMON_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blktap/interface.c 2007-08-27 14:02:01.000000000 -0400 +@@ -0,0 +1,174 @@ ++/****************************************************************************** ++ * drivers/xen/blktap/interface.c ++ * ++ * Block-device interface management. ++ * ++ * Copyright (c) 2004, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ ++ */ ++ ++#include "common.h" ++#include <xen/evtchn.h> ++ ++static kmem_cache_t *blkif_cachep; ++ ++blkif_t *tap_alloc_blkif(domid_t domid) ++{ ++ blkif_t *blkif; ++ ++ blkif = kmem_cache_alloc(blkif_cachep, GFP_KERNEL); ++ if (!blkif) ++ return ERR_PTR(-ENOMEM); ++ ++ memset(blkif, 0, sizeof(*blkif)); ++ blkif->domid = domid; ++ spin_lock_init(&blkif->blk_ring_lock); ++ atomic_set(&blkif->refcnt, 1); ++ init_waitqueue_head(&blkif->wq); ++ blkif->st_print = jiffies; ++ init_waitqueue_head(&blkif->waiting_to_free); ++ ++ return blkif; ++} ++ ++static int map_frontend_page(blkif_t *blkif, unsigned long shared_page) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, shared_page, blkif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return op.status; ++ } ++ ++ blkif->shmem_ref = shared_page; ++ blkif->shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_page(blkif_t *blkif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)blkif->blk_ring_area->addr, ++ GNTMAP_host_map, blkif->shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int tap_blkif_map(blkif_t *blkif, unsigned long shared_page, ++ unsigned int evtchn) ++{ ++ int err; ++ ++ /* Already connected through? */ ++ if (blkif->irq) ++ return 0; ++ ++ if ( (blkif->blk_ring_area = alloc_vm_area(PAGE_SIZE)) == NULL ) ++ return -ENOMEM; ++ ++ err = map_frontend_page(blkif, shared_page); ++ if (err) { ++ free_vm_area(blkif->blk_ring_area); ++ return err; ++ } ++ ++ switch (blkif->blk_protocol) { ++ case BLKIF_PROTOCOL_NATIVE: ++ { ++ blkif_sring_t *sring; ++ sring = (blkif_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.native, sring, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_32: ++ { ++ blkif_x86_32_sring_t *sring_x86_32; ++ sring_x86_32 = (blkif_x86_32_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, PAGE_SIZE); ++ break; ++ } ++ case BLKIF_PROTOCOL_X86_64: ++ { ++ blkif_x86_64_sring_t *sring_x86_64; ++ sring_x86_64 = (blkif_x86_64_sring_t *)blkif->blk_ring_area->addr; ++ BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, PAGE_SIZE); ++ break; ++ } ++ default: ++ BUG(); ++ } ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ blkif->domid, evtchn, tap_blkif_be_int, ++ 0, "blkif-backend", blkif); ++ if (err < 0) { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ return err; ++ } ++ blkif->irq = err; ++ ++ return 0; ++} ++ ++void tap_blkif_unmap(blkif_t *blkif) ++{ ++ if (blkif->irq) { ++ unbind_from_irqhandler(blkif->irq, blkif); ++ blkif->irq = 0; ++ } ++ if (blkif->blk_rings.common.sring) { ++ unmap_frontend_page(blkif); ++ free_vm_area(blkif->blk_ring_area); ++ blkif->blk_rings.common.sring = NULL; ++ } ++} ++ ++void tap_blkif_free(blkif_t *blkif) ++{ ++ atomic_dec(&blkif->refcnt); ++ wait_event(blkif->waiting_to_free, atomic_read(&blkif->refcnt) == 0); ++ ++ tap_blkif_unmap(blkif); ++ kmem_cache_free(blkif_cachep, blkif); ++} ++ ++void __init tap_blkif_interface_init(void) ++{ ++ blkif_cachep = kmem_cache_create("blktapif_cache", sizeof(blkif_t), ++ 0, 0, NULL, NULL); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/blktap/xenbus.c 2007-08-27 
14:02:09.000000000 -0400 +@@ -0,0 +1,473 @@ ++/* drivers/xen/blktap/xenbus.c ++ * ++ * Xenbus code for blktap ++ * ++ * Copyright (c) 2004-2005, Andrew Warfield and Julian Chesterfield ++ * ++ * Based on the blkback xenbus code: ++ * ++ * Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> ++ * Copyright (C) 2005 XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <stdarg.h> ++#include <linux/module.h> ++#include <linux/kthread.h> ++#include <xen/xenbus.h> ++#include "common.h" ++ ++ ++struct backend_info ++{ ++ struct xenbus_device *dev; ++ blkif_t *blkif; ++ struct xenbus_watch backend_watch; ++ int xenbus_id; ++ int group_added; ++}; ++ ++ ++static void connect(struct backend_info *); ++static int connect_ring(struct backend_info *); ++static int blktap_remove(struct xenbus_device *dev); ++static int blktap_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id); ++static void tap_backend_changed(struct xenbus_watch *, const char **, ++ unsigned int); ++static void tap_frontend_changed(struct xenbus_device *dev, ++ enum xenbus_state frontend_state); ++ ++static int strsep_len(const char *str, char c, unsigned int len) ++{ ++ unsigned int i; ++ ++ for (i = 0; str[i]; i++) ++ if (str[i] == c) { ++ if (len == 0) ++ return i; ++ len--; ++ } ++ return (len == 0) ? 
i : -ERANGE; ++} ++ ++static long get_id(const char *str) ++{ ++ int len,end; ++ const char *ptr; ++ char *tptr, num[10]; ++ ++ len = strsep_len(str, '/', 2); ++ end = strlen(str); ++ if ( (len < 0) || (end < 0) ) return -1; ++ ++ ptr = str + len + 1; ++ strncpy(num,ptr,end - len); ++ tptr = num + (end - (len + 1)); ++ *tptr = '\0'; ++ DPRINTK("Get_id called for %s (%s)\n",str,num); ++ ++ return simple_strtol(num, NULL, 10); ++} ++ ++static int blktap_name(blkif_t *blkif, char *buf) ++{ ++ char *devpath, *devname; ++ struct xenbus_device *dev = blkif->be->dev; ++ ++ devpath = xenbus_read(XBT_NIL, dev->nodename, "dev", NULL); ++ if (IS_ERR(devpath)) ++ return PTR_ERR(devpath); ++ ++ if ((devname = strstr(devpath, "/dev/")) != NULL) ++ devname += strlen("/dev/"); ++ else ++ devname = devpath; ++ ++ snprintf(buf, TASK_COMM_LEN, "blktap.%d.%s", blkif->domid, devname); ++ kfree(devpath); ++ ++ return 0; ++} ++ ++/**************************************************************** ++ * sysfs interface for VBD I/O requests ++ */ ++ ++#define VBD_SHOW(name, format, args...) \ ++ static ssize_t show_##name(struct device *_dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct xenbus_device *dev = to_xenbus_device(_dev); \ ++ struct backend_info *be = dev->dev.driver_data; \ ++ \ ++ return sprintf(buf, format, ##args); \ ++ } \ ++ DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) ++ ++VBD_SHOW(tap_oo_req, "%d\n", be->blkif->st_oo_req); ++VBD_SHOW(tap_rd_req, "%d\n", be->blkif->st_rd_req); ++VBD_SHOW(tap_wr_req, "%d\n", be->blkif->st_wr_req); ++VBD_SHOW(tap_rd_sect, "%d\n", be->blkif->st_rd_sect); ++VBD_SHOW(tap_wr_sect, "%d\n", be->blkif->st_wr_sect); ++ ++static struct attribute *tapstat_attrs[] = { ++ &dev_attr_tap_oo_req.attr, ++ &dev_attr_tap_rd_req.attr, ++ &dev_attr_tap_wr_req.attr, ++ &dev_attr_tap_rd_sect.attr, ++ &dev_attr_tap_wr_sect.attr, ++ NULL ++}; ++ ++static struct attribute_group tapstat_group = { ++ .name = "statistics", ++ .attrs = tapstat_attrs, ++}; ++ ++int xentap_sysfs_addif(struct xenbus_device *dev) ++{ ++ int err; ++ struct backend_info *be = dev->dev.driver_data; ++ err = sysfs_create_group(&dev->dev.kobj, &tapstat_group); ++ if (!err) ++ be->group_added = 1; ++ return err; ++} ++ ++void xentap_sysfs_delif(struct xenbus_device *dev) ++{ ++ sysfs_remove_group(&dev->dev.kobj, &tapstat_group); ++} ++ ++static int blktap_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev->dev.driver_data; ++ ++ if (be->backend_watch.node) { ++ unregister_xenbus_watch(&be->backend_watch); ++ kfree(be->backend_watch.node); ++ be->backend_watch.node = NULL; ++ } ++ if (be->blkif) { ++ if (be->blkif->xenblkd) ++ kthread_stop(be->blkif->xenblkd); ++ signal_tapdisk(be->blkif->dev_num); ++ tap_blkif_free(be->blkif); ++ be->blkif = NULL; ++ } ++ if (be->group_added) ++ xentap_sysfs_delif(be->dev); ++ kfree(be); ++ dev->dev.driver_data = NULL; ++ return 0; ++} ++ ++static void tap_update_blkif_status(blkif_t *blkif) ++{ ++ int err; ++ char name[TASK_COMM_LEN]; ++ ++ /* Not ready to connect? */ ++ if(!blkif->irq || !blkif->sectors) { ++ return; ++ } ++ ++ /* Already connected? */ ++ if (blkif->be->dev->state == XenbusStateConnected) ++ return; ++ ++ /* Attempt to connect: exit if we fail to. 
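++ * connect() only switches the xenbus state, so on failure the
++ * device never reaches XenbusStateConnected and the check below
++ * bails out.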
*/ ++ connect(blkif->be); ++ if (blkif->be->dev->state != XenbusStateConnected) ++ return; ++ ++ err = blktap_name(blkif, name); ++ if (err) { ++ xenbus_dev_error(blkif->be->dev, err, "get blktap dev name"); ++ return; ++ } ++ ++ err = xentap_sysfs_addif(blkif->be->dev); ++ if (err) { ++ xenbus_dev_fatal(blkif->be->dev, err, ++ "creating sysfs entries"); ++ return; ++ } ++ ++ blkif->xenblkd = kthread_run(tap_blkif_schedule, blkif, name); ++ if (IS_ERR(blkif->xenblkd)) { ++ err = PTR_ERR(blkif->xenblkd); ++ blkif->xenblkd = NULL; ++ xenbus_dev_fatal(blkif->be->dev, err, "start xenblkd"); ++ WPRINTK("Error starting thread\n"); ++ } ++} ++ ++/** ++ * Entry point to this code when a new device is created. Allocate ++ * the basic structures, and watch the store waiting for the ++ * user-space program to tell us the physical device info. Switch to ++ * InitWait. ++ */ ++static int blktap_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ ++ be->dev = dev; ++ dev->dev.driver_data = be; ++ be->xenbus_id = get_id(dev->nodename); ++ ++ be->blkif = tap_alloc_blkif(dev->otherend_id); ++ if (IS_ERR(be->blkif)) { ++ err = PTR_ERR(be->blkif); ++ be->blkif = NULL; ++ xenbus_dev_fatal(dev, err, "creating block interface"); ++ goto fail; ++ } ++ ++ /* setup back pointer */ ++ be->blkif->be = be; ++ be->blkif->sectors = 0; ++ ++ /* set a watch on disk info, waiting for userspace to update details*/ ++ err = xenbus_watch_path2(dev, dev->nodename, "info", ++ &be->backend_watch, tap_backend_changed); ++ if (err) ++ goto fail; ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ return 0; ++ ++fail: ++ DPRINTK("blktap probe failed\n"); ++ blktap_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Callback received when the user space code has placed the device ++ * information in xenstore. ++ */ ++static void tap_backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ int err; ++ unsigned long info; ++ struct backend_info *be ++ = container_of(watch, struct backend_info, backend_watch); ++ struct xenbus_device *dev = be->dev; ++ ++ /** ++ * Check to see whether userspace code has opened the image ++ * and written sector ++ * and disk info to xenstore ++ */ ++ err = xenbus_gather(XBT_NIL, dev->nodename, "info", "%lu", &info, ++ NULL); ++ if (XENBUS_EXIST_ERR(err)) ++ return; ++ if (err) { ++ xenbus_dev_error(dev, err, "getting info"); ++ return; ++ } ++ ++ DPRINTK("Userspace update on disk info, %lu\n",info); ++ ++ err = xenbus_gather(XBT_NIL, dev->nodename, "sectors", "%llu", ++ &be->blkif->sectors, NULL); ++ ++ /* Associate tap dev with domid*/ ++ be->blkif->dev_num = dom_to_devid(be->blkif->domid, be->xenbus_id, ++ be->blkif); ++ DPRINTK("Thread started for domid [%d], connecting disk\n", ++ be->blkif->dev_num); ++ ++ tap_update_blkif_status(be->blkif); ++} ++ ++/** ++ * Callback received when the frontend's state changes. 
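++ * Drives the usual xenbus handshake: connect the ring once the
++ * frontend reaches Initialised/Connected, stop the I/O thread on
++ * Closing, and unregister the device once it is Closed (or Unknown)
++ * and no longer marked online.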
++ */
++static void tap_frontend_changed(struct xenbus_device *dev,
++			     enum xenbus_state frontend_state)
++{
++	struct backend_info *be = dev->dev.driver_data;
++	int err;
++
++	DPRINTK("\n");
++
++	switch (frontend_state) {
++	case XenbusStateInitialising:
++		if (dev->state == XenbusStateClosed) {
++			printk(KERN_INFO "%s: %s: prepare for reconnect\n",
++			       __FUNCTION__, dev->nodename);
++			xenbus_switch_state(dev, XenbusStateInitWait);
++		}
++		break;
++
++	case XenbusStateInitialised:
++	case XenbusStateConnected:
++		/* Ensure we connect even when two watches fire in
++		   close succession and we miss the intermediate value
++		   of frontend_state. */
++		if (dev->state == XenbusStateConnected)
++			break;
++
++		err = connect_ring(be);
++		if (err)
++			break;
++		tap_update_blkif_status(be->blkif);
++		break;
++
++	case XenbusStateClosing:
++		if (be->blkif->xenblkd) {
++			kthread_stop(be->blkif->xenblkd);
++			be->blkif->xenblkd = NULL;
++		}
++		xenbus_switch_state(dev, XenbusStateClosing);
++		break;
++
++	case XenbusStateClosed:
++		xenbus_switch_state(dev, XenbusStateClosed);
++		if (xenbus_dev_is_online(dev))
++			break;
++		/* fall through if not online */
++	case XenbusStateUnknown:
++		device_unregister(&dev->dev);
++		break;
++
++	default:
++		xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend",
++				 frontend_state);
++		break;
++	}
++}
++
++
++/**
++ * Switch to Connected state.
++ */
++static void connect(struct backend_info *be)
++{
++	int err;
++
++	struct xenbus_device *dev = be->dev;
++
++	err = xenbus_switch_state(dev, XenbusStateConnected);
++	if (err)
++		xenbus_dev_fatal(dev, err, "switching to Connected state",
++				 dev->nodename);
++
++	return;
++}
++
++
++static int connect_ring(struct backend_info *be)
++{
++	struct xenbus_device *dev = be->dev;
++	unsigned long ring_ref;
++	unsigned int evtchn;
++	char protocol[64];
++	int err;
++
++	DPRINTK("%s\n", dev->otherend);
++
++	err = xenbus_gather(XBT_NIL, dev->otherend, "ring-ref", "%lu",
++			    &ring_ref, "event-channel", "%u", &evtchn, NULL);
++	if (err) {
++		xenbus_dev_fatal(dev, err,
++				 "reading %s/ring-ref and event-channel",
++				 dev->otherend);
++		return err;
++	}
++
++	be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++	err = xenbus_gather(XBT_NIL, dev->otherend, "protocol",
++			    "%63s", protocol, NULL);
++	if (err)
++		strcpy(protocol, "unspecified, assuming native");
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE;
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32;
++	else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64))
++		be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64;
++	else {
++		xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol);
++		return -1;
++	}
++	printk(KERN_INFO
++	       "blktap: ring-ref %ld, event-channel %d, protocol %d (%s)\n",
++	       ring_ref, evtchn, be->blkif->blk_protocol, protocol);
++
++	/* Map the shared frame, irq etc.
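++	   tap_blkif_map() grant-maps the frontend's ring page and binds
++	   the interdomain event channel to the blkif.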
*/
++	err = tap_blkif_map(be->blkif, ring_ref, evtchn);
++	if (err) {
++		xenbus_dev_fatal(dev, err, "mapping ring-ref %lu port %u",
++				 ring_ref, evtchn);
++		return err;
++	}
++
++	return 0;
++}
++
++
++/* ** Driver Registration ** */
++
++
++static struct xenbus_device_id blktap_ids[] = {
++	{ "tap" },
++	{ "" }
++};
++
++
++static struct xenbus_driver blktap = {
++	.name = "tap",
++	.owner = THIS_MODULE,
++	.ids = blktap_ids,
++	.probe = blktap_probe,
++	.remove = blktap_remove,
++	.otherend_changed = tap_frontend_changed
++};
++
++
++void tap_blkif_xenbus_init(void)
++{
++	xenbus_register_backend(&blktap);
++}
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/char/Makefile 2007-08-27 14:01:25.000000000 -0400
+@@ -0,0 +1,2 @@
++
++obj-y := mem.o
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/char/mem.c 2007-08-27 14:02:01.000000000 -0400
+@@ -0,0 +1,203 @@
++/*
++ * Originally from linux/drivers/char/mem.c
++ *
++ * Copyright (C) 1991, 1992 Linus Torvalds
++ *
++ * Added devfs support.
++ * Jan-11-1998, C. Scott Ananian <cananian@alumni.princeton.edu>
++ * Shared /dev/zero mmapping support, Feb 2000, Kanoj Sarcar <kanoj@sgi.com>
++ */
++
++#include <linux/mm.h>
++#include <linux/miscdevice.h>
++#include <linux/slab.h>
++#include <linux/vmalloc.h>
++#include <linux/mman.h>
++#include <linux/random.h>
++#include <linux/init.h>
++#include <linux/raw.h>
++#include <linux/tty.h>
++#include <linux/capability.h>
++#include <linux/smp_lock.h>
++#include <linux/ptrace.h>
++#include <linux/device.h>
++#include <asm/pgalloc.h>
++#include <asm/uaccess.h>
++#include <asm/io.h>
++#include <asm/hypervisor.h>
++
++#ifndef ARCH_HAS_VALID_PHYS_ADDR_RANGE
++static inline int valid_phys_addr_range(unsigned long addr, size_t count)
++{
++	return 1;
++}
++#endif
++
++/*
++ * This function reads the *physical* memory. The f_pos points directly to the
++ * memory location.
++ */
++static ssize_t read_mem(struct file * file, char __user * buf,
++			size_t count, loff_t *ppos)
++{
++	unsigned long p = *ppos, ignored;
++	ssize_t read = 0, sz;
++	void __iomem *v;
++
++	if (!valid_phys_addr_range(p, count))
++		return -EFAULT;
++
++	while (count > 0) {
++		/*
++		 * Handle first page in case it's not aligned
++		 */
++		if (-p & (PAGE_SIZE - 1))
++			sz = -p & (PAGE_SIZE - 1);
++		else
++			sz = PAGE_SIZE;
++
++		sz = min_t(unsigned long, sz, count);
++
++		v = xlate_dev_mem_ptr(p, sz);
++		if (IS_ERR(v) || v == NULL) {
++			/*
++			 * Some programs (e.g., dmidecode) groove off into
++			 * weird RAM areas where no tables can possibly exist
++			 * (because Xen will have stomped on them!). These
++			 * programs get rather upset if we let them know that
++			 * Xen failed their access, so we fake out a read of
++			 * all zeroes.
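++			 * We therefore zero-fill the rest of the user
++			 * buffer and report the whole request as read.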
++ */ ++ if (clear_user(buf, count)) ++ return -EFAULT; ++ read += count; ++ break; ++ } ++ ++ ignored = copy_to_user(buf, v, sz); ++ xlate_dev_mem_ptr_unmap(v); ++ if (ignored) ++ return -EFAULT; ++ buf += sz; ++ p += sz; ++ count -= sz; ++ read += sz; ++ } ++ ++ *ppos += read; ++ return read; ++} ++ ++static ssize_t write_mem(struct file * file, const char __user * buf, ++ size_t count, loff_t *ppos) ++{ ++ unsigned long p = *ppos, ignored; ++ ssize_t written = 0, sz; ++ void __iomem *v; ++ ++ if (!valid_phys_addr_range(p, count)) ++ return -EFAULT; ++ ++ while (count > 0) { ++ /* ++ * Handle first page in case it's not aligned ++ */ ++ if (-p & (PAGE_SIZE - 1)) ++ sz = -p & (PAGE_SIZE - 1); ++ else ++ sz = PAGE_SIZE; ++ ++ sz = min_t(unsigned long, sz, count); ++ ++ v = xlate_dev_mem_ptr(p, sz); ++ if (v == NULL) ++ break; ++ if (IS_ERR(v)) { ++ if (written == 0) ++ return PTR_ERR(v); ++ break; ++ } ++ ++ ignored = copy_from_user(v, buf, sz); ++ xlate_dev_mem_ptr_unmap(v); ++ if (ignored) { ++ written += sz - ignored; ++ if (written) ++ break; ++ return -EFAULT; ++ } ++ buf += sz; ++ p += sz; ++ count -= sz; ++ written += sz; ++ } ++ ++ *ppos += written; ++ return written; ++} ++ ++#ifndef ARCH_HAS_DEV_MEM_MMAP_MEM ++static inline int uncached_access(struct file *file) ++{ ++ if (file->f_flags & O_SYNC) ++ return 1; ++ /* Xen sets correct MTRR type on non-RAM for us. */ ++ return 0; ++} ++ ++static int xen_mmap_mem(struct file * file, struct vm_area_struct * vma) ++{ ++ size_t size = vma->vm_end - vma->vm_start; ++ ++ if (uncached_access(file)) ++ vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); ++ ++ /* We want to return the real error code, not EAGAIN. */ ++ return direct_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, ++ size, vma->vm_page_prot, DOMID_IO); ++} ++#endif ++ ++/* ++ * The memory devices use the full 32/64 bits of the offset, and so we cannot ++ * check against negative addresses: they are ok. The return value is weird, ++ * though, in that case (0). ++ * ++ * also note that seeking relative to the "end of file" isn't supported: ++ * it has no meaning, so it returns -EINVAL. ++ */ ++static loff_t memory_lseek(struct file * file, loff_t offset, int orig) ++{ ++ loff_t ret; ++ ++ mutex_lock(&file->f_dentry->d_inode->i_mutex); ++ switch (orig) { ++ case 0: ++ file->f_pos = offset; ++ ret = file->f_pos; ++ force_successful_syscall_return(); ++ break; ++ case 1: ++ file->f_pos += offset; ++ ret = file->f_pos; ++ force_successful_syscall_return(); ++ break; ++ default: ++ ret = -EINVAL; ++ } ++ mutex_unlock(&file->f_dentry->d_inode->i_mutex); ++ return ret; ++} ++ ++static int open_mem(struct inode * inode, struct file * filp) ++{ ++ return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; ++} ++ ++const struct file_operations mem_fops = { ++ .llseek = memory_lseek, ++ .read = read_mem, ++ .write = write_mem, ++ .mmap = xen_mmap_mem, ++ .open = open_mem, ++}; +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/console/Makefile 2007-08-27 14:01:53.000000000 -0400 +@@ -0,0 +1,2 @@ ++ ++obj-y := console.o xencons_ring.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/console/console.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,721 @@ ++/****************************************************************************** ++ * console.c ++ * ++ * Virtual console driver. ++ * ++ * Copyright (c) 2002-2004, K A Fraser. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/errno.h> ++#include <linux/signal.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <linux/tty.h> ++#include <linux/tty_flip.h> ++#include <linux/serial.h> ++#include <linux/major.h> ++#include <linux/ptrace.h> ++#include <linux/ioport.h> ++#include <linux/mm.h> ++#include <linux/slab.h> ++#include <linux/init.h> ++#include <linux/console.h> ++#include <linux/bootmem.h> ++#include <linux/sysrq.h> ++#include <linux/screen_info.h> ++#include <linux/vt.h> ++#include <asm/io.h> ++#include <asm/irq.h> ++#include <asm/uaccess.h> ++#include <xen/interface/xen.h> ++#include <xen/interface/event_channel.h> ++#include <asm/hypervisor.h> ++#include <xen/evtchn.h> ++#include <xen/xenbus.h> ++#include <xen/xencons.h> ++ ++/* ++ * Modes: ++ * 'xencons=off' [XC_OFF]: Console is disabled. ++ * 'xencons=tty' [XC_TTY]: Console attached to '/dev/tty[0-9]+'. ++ * 'xencons=ttyS' [XC_SERIAL]: Console attached to '/dev/ttyS[0-9]+'. ++ * 'xencons=xvc' [XC_XVC]: Console attached to '/dev/xvc0'. ++ * default: DOM0 -> XC_SERIAL ; all others -> XC_TTY. ++ * ++ * NB. In mode XC_TTY, we create dummy consoles for tty2-63. This suppresses ++ * warnings from standard distro startup scripts. ++ */ ++static enum { ++ XC_OFF, XC_TTY, XC_SERIAL, XC_XVC ++} xc_mode; ++static int xc_num = -1; ++ ++/* /dev/xvc0 device number allocated by lanana.org. 
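++   (Major 204, minor 191; see the defines below.)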
*/ ++#define XEN_XVC_MAJOR 204 ++#define XEN_XVC_MINOR 191 ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static unsigned long sysrq_requested; ++extern int sysrq_enabled; ++#endif ++ ++void xencons_early_setup(void) ++{ ++ extern int console_use_vt; ++ ++ if (is_initial_xendomain()) { ++ xc_mode = XC_SERIAL; ++ } else { ++ xc_mode = XC_TTY; ++ console_use_vt = 0; ++ } ++} ++ ++static int __init xencons_setup(char *str) ++{ ++ char *q; ++ int n; ++ extern int console_use_vt; ++ ++ console_use_vt = 1; ++ if (!strncmp(str, "ttyS", 4)) { ++ xc_mode = XC_SERIAL; ++ str += 4; ++ } else if (!strncmp(str, "tty", 3)) { ++ xc_mode = XC_TTY; ++ str += 3; ++ console_use_vt = 0; ++ } else if (!strncmp(str, "xvc", 3)) { ++ xc_mode = XC_XVC; ++ str += 3; ++ } else if (!strncmp(str, "off", 3)) { ++ xc_mode = XC_OFF; ++ str += 3; ++ } ++ ++ n = simple_strtol(str, &q, 10); ++ if (q != str) ++ xc_num = n; ++ ++ return 1; ++} ++__setup("xencons=", xencons_setup); ++ ++/* The kernel and user-land drivers share a common transmit buffer. */ ++static unsigned int wbuf_size = 4096; ++#define WBUF_MASK(_i) ((_i)&(wbuf_size-1)) ++static char *wbuf; ++static unsigned int wc, wp; /* write_cons, write_prod */ ++ ++static int __init xencons_bufsz_setup(char *str) ++{ ++ unsigned int goal; ++ goal = simple_strtoul(str, NULL, 0); ++ if (goal) { ++ goal = roundup_pow_of_two(goal); ++ if (wbuf_size < goal) ++ wbuf_size = goal; ++ } ++ return 1; ++} ++__setup("xencons_bufsz=", xencons_bufsz_setup); ++ ++/* This lock protects accesses to the common transmit buffer. */ ++static DEFINE_SPINLOCK(xencons_lock); ++ ++/* Common transmit-kick routine. */ ++static void __xencons_tx_flush(void); ++ ++static struct tty_driver *xencons_driver; ++ ++/******************** Kernel console driver ********************************/ ++ ++static void kcons_write(struct console *c, const char *s, unsigned int count) ++{ ++ int i = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ ++ while (i < count) { ++ for (; i < count; i++) { ++ if ((wp - wc) >= (wbuf_size - 1)) ++ break; ++ if ((wbuf[WBUF_MASK(wp++)] = s[i]) == '\n') ++ wbuf[WBUF_MASK(wp++)] = '\r'; ++ } ++ ++ __xencons_tx_flush(); ++ } ++ ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static void kcons_write_dom0(struct console *c, const char *s, unsigned int count) ++{ ++ ++ while (count > 0) { ++ int rc; ++ rc = HYPERVISOR_console_io( CONSOLEIO_write, count, (char *)s); ++ if (rc <= 0) ++ break; ++ count -= rc; ++ s += rc; ++ } ++} ++ ++static struct tty_driver *kcons_device(struct console *c, int *index) ++{ ++ *index = 0; ++ return xencons_driver; ++} ++ ++static struct console kcons_info = { ++ .device = kcons_device, ++ .flags = CON_PRINTBUFFER | CON_ENABLED, ++ .index = -1, ++}; ++ ++static int __init xen_console_init(void) ++{ ++ if (!is_running_on_xen()) ++ goto out; ++ ++ if (is_initial_xendomain()) { ++ kcons_info.write = kcons_write_dom0; ++ } else { ++ if (!xen_start_info->console.domU.evtchn) ++ goto out; ++ kcons_info.write = kcons_write; ++ } ++ ++ switch (xc_mode) { ++ case XC_XVC: ++ strcpy(kcons_info.name, "xvc"); ++ if (xc_num == -1) ++ xc_num = 0; ++ break; ++ ++ case XC_SERIAL: ++ strcpy(kcons_info.name, "ttyS"); ++ if (xc_num == -1) ++ xc_num = 0; ++ break; ++ ++ case XC_TTY: ++ strcpy(kcons_info.name, "tty"); ++ if (xc_num == -1) ++ xc_num = 1; ++ break; ++ ++ default: ++ goto out; ++ } ++ ++ wbuf = alloc_bootmem(wbuf_size); ++ ++ register_console(&kcons_info); ++ ++ out: ++ return 0; ++} ++console_initcall(xen_console_init); ++ ++/*** Useful 
function for console debugging -- goes straight to Xen. ***/ ++asmlinkage int xprintk(const char *fmt, ...) ++{ ++ va_list args; ++ int printk_len; ++ static char printk_buf[1024]; ++ ++ /* Emit the output into the temporary buffer */ ++ va_start(args, fmt); ++ printk_len = vsnprintf(printk_buf, sizeof(printk_buf), fmt, args); ++ va_end(args); ++ ++ /* Send the processed output directly to Xen. */ ++ kcons_write_dom0(NULL, printk_buf, printk_len); ++ ++ return 0; ++} ++ ++/*** Forcibly flush console data before dying. ***/ ++void xencons_force_flush(void) ++{ ++ int sz; ++ ++ /* Emergency console is synchronous, so there's nothing to flush. */ ++ if (!is_running_on_xen() || ++ is_initial_xendomain() || ++ !xen_start_info->console.domU.evtchn) ++ return; ++ ++ /* Spin until console data is flushed through to the daemon. */ ++ while (wc != wp) { ++ int sent = 0; ++ if ((sz = wp - wc) == 0) ++ continue; ++ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); ++ if (sent > 0) ++ wc += sent; ++ } ++} ++ ++ ++void dom0_init_screen_info(const struct dom0_vga_console_info *info) ++{ ++ switch (info->video_type) { ++ case XEN_VGATYPE_TEXT_MODE_3: ++ screen_info.orig_video_mode = 3; ++ screen_info.orig_video_ega_bx = 3; ++ screen_info.orig_video_isVGA = 1; ++ screen_info.orig_video_lines = info->u.text_mode_3.rows; ++ screen_info.orig_video_cols = info->u.text_mode_3.columns; ++ screen_info.orig_x = info->u.text_mode_3.cursor_x; ++ screen_info.orig_y = info->u.text_mode_3.cursor_y; ++ screen_info.orig_video_points = ++ info->u.text_mode_3.font_height; ++ break; ++ case XEN_VGATYPE_VESA_LFB: ++ screen_info.orig_video_isVGA = VIDEO_TYPE_VLFB; ++ screen_info.lfb_width = info->u.vesa_lfb.width; ++ screen_info.lfb_height = info->u.vesa_lfb.height; ++ screen_info.lfb_depth = info->u.vesa_lfb.bits_per_pixel; ++ screen_info.lfb_base = info->u.vesa_lfb.lfb_base; ++ screen_info.lfb_size = info->u.vesa_lfb.lfb_size; ++ screen_info.lfb_linelength = info->u.vesa_lfb.bytes_per_line; ++ screen_info.red_size = info->u.vesa_lfb.red_size; ++ screen_info.red_pos = info->u.vesa_lfb.red_pos; ++ screen_info.green_size = info->u.vesa_lfb.green_size; ++ screen_info.green_pos = info->u.vesa_lfb.green_pos; ++ screen_info.blue_size = info->u.vesa_lfb.blue_size; ++ screen_info.blue_pos = info->u.vesa_lfb.blue_pos; ++ screen_info.rsvd_size = info->u.vesa_lfb.rsvd_size; ++ screen_info.rsvd_pos = info->u.vesa_lfb.rsvd_pos; ++ break; ++ } ++} ++ ++ ++/******************** User-space console driver (/dev/console) ************/ ++ ++#define DRV(_d) (_d) ++#define DUMMY_TTY(_tty) ((xc_mode == XC_TTY) && \ ++ ((_tty)->index != (xc_num - 1))) ++ ++static struct termios *xencons_termios[MAX_NR_CONSOLES]; ++static struct termios *xencons_termios_locked[MAX_NR_CONSOLES]; ++static struct tty_struct *xencons_tty; ++static int xencons_priv_irq; ++static char x_char; ++ ++void xencons_rx(char *buf, unsigned len, struct pt_regs *regs) ++{ ++ int i; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ if (xencons_tty == NULL) ++ goto out; ++ ++ for (i = 0; i < len; i++) { ++#ifdef CONFIG_MAGIC_SYSRQ ++ if (sysrq_enabled) { ++ if (buf[i] == '\x0f') { /* ^O */ ++ if (!sysrq_requested) { ++ sysrq_requested = jiffies; ++ continue; /* don't print sysrq key */ ++ } ++ sysrq_requested = 0; ++ } else if (sysrq_requested) { ++ unsigned long sysrq_timeout = ++ sysrq_requested + HZ*2; ++ sysrq_requested = 0; ++ if (time_before(jiffies, sysrq_timeout)) { ++ spin_unlock_irqrestore( ++ &xencons_lock, flags); ++ handle_sysrq( ++ buf[i], 
regs, xencons_tty); ++ spin_lock_irqsave( ++ &xencons_lock, flags); ++ continue; ++ } ++ } ++ } ++#endif ++ tty_insert_flip_char(xencons_tty, buf[i], 0); ++ } ++ tty_flip_buffer_push(xencons_tty); ++ ++ out: ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static void __xencons_tx_flush(void) ++{ ++ int sent, sz, work_done = 0; ++ ++ if (x_char) { ++ if (is_initial_xendomain()) ++ kcons_write_dom0(NULL, &x_char, 1); ++ else ++ while (x_char) ++ if (xencons_ring_send(&x_char, 1) == 1) ++ break; ++ x_char = 0; ++ work_done = 1; ++ } ++ ++ while (wc != wp) { ++ sz = wp - wc; ++ if (sz > (wbuf_size - WBUF_MASK(wc))) ++ sz = wbuf_size - WBUF_MASK(wc); ++ if (is_initial_xendomain()) { ++ kcons_write_dom0(NULL, &wbuf[WBUF_MASK(wc)], sz); ++ wc += sz; ++ } else { ++ sent = xencons_ring_send(&wbuf[WBUF_MASK(wc)], sz); ++ if (sent == 0) ++ break; ++ wc += sent; ++ } ++ work_done = 1; ++ } ++ ++ if (work_done && (xencons_tty != NULL)) { ++ wake_up_interruptible(&xencons_tty->write_wait); ++ if ((xencons_tty->flags & (1 << TTY_DO_WRITE_WAKEUP)) && ++ (xencons_tty->ldisc.write_wakeup != NULL)) ++ (xencons_tty->ldisc.write_wakeup)(xencons_tty); ++ } ++} ++ ++void xencons_tx(void) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ __xencons_tx_flush(); ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++/* Privileged receive callback and transmit kicker. */ ++static irqreturn_t xencons_priv_interrupt(int irq, void *dev_id, ++ struct pt_regs *regs) ++{ ++ static char rbuf[16]; ++ int l; ++ ++ while ((l = HYPERVISOR_console_io(CONSOLEIO_read, 16, rbuf)) > 0) ++ xencons_rx(rbuf, l, regs); ++ ++ xencons_tx(); ++ ++ return IRQ_HANDLED; ++} ++ ++static int xencons_write_room(struct tty_struct *tty) ++{ ++ return wbuf_size - (wp - wc); ++} ++ ++static int xencons_chars_in_buffer(struct tty_struct *tty) ++{ ++ return wp - wc; ++} ++ ++static void xencons_send_xchar(struct tty_struct *tty, char ch) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ x_char = ch; ++ __xencons_tx_flush(); ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static void xencons_throttle(struct tty_struct *tty) ++{ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ if (I_IXOFF(tty)) ++ xencons_send_xchar(tty, STOP_CHAR(tty)); ++} ++ ++static void xencons_unthrottle(struct tty_struct *tty) ++{ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ if (I_IXOFF(tty)) { ++ if (x_char != 0) ++ x_char = 0; ++ else ++ xencons_send_xchar(tty, START_CHAR(tty)); ++ } ++} ++ ++static void xencons_flush_buffer(struct tty_struct *tty) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ wc = wp = 0; ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static inline int __xencons_put_char(int ch) ++{ ++ char _ch = (char)ch; ++ if ((wp - wc) == wbuf_size) ++ return 0; ++ wbuf[WBUF_MASK(wp++)] = _ch; ++ return 1; ++} ++ ++static int xencons_write( ++ struct tty_struct *tty, ++ const unsigned char *buf, ++ int count) ++{ ++ int i; ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return count; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ ++ for (i = 0; i < count; i++) ++ if (!__xencons_put_char(buf[i])) ++ break; ++ ++ if (i != 0) ++ __xencons_tx_flush(); ++ ++ spin_unlock_irqrestore(&xencons_lock, flags); ++ ++ return i; ++} ++ ++static void xencons_put_char(struct tty_struct *tty, u_char ch) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ 
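/* Queue a single character; the tty layer follows up with a
++	   flush_chars() call, so no transmit kick is needed here. */
++	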
spin_lock_irqsave(&xencons_lock, flags); ++ (void)__xencons_put_char(ch); ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static void xencons_flush_chars(struct tty_struct *tty) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ __xencons_tx_flush(); ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static void xencons_wait_until_sent(struct tty_struct *tty, int timeout) ++{ ++ unsigned long orig_jiffies = jiffies; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ while (DRV(tty->driver)->chars_in_buffer(tty)) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ schedule_timeout(1); ++ if (signal_pending(current)) ++ break; ++ if (timeout && time_after(jiffies, orig_jiffies + timeout)) ++ break; ++ } ++ ++ set_current_state(TASK_RUNNING); ++} ++ ++static int xencons_open(struct tty_struct *tty, struct file *filp) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return 0; ++ ++ spin_lock_irqsave(&xencons_lock, flags); ++ tty->driver_data = NULL; ++ if (xencons_tty == NULL) ++ xencons_tty = tty; ++ __xencons_tx_flush(); ++ spin_unlock_irqrestore(&xencons_lock, flags); ++ ++ return 0; ++} ++ ++static void xencons_close(struct tty_struct *tty, struct file *filp) ++{ ++ unsigned long flags; ++ ++ if (DUMMY_TTY(tty)) ++ return; ++ ++ mutex_lock(&tty_mutex); ++ ++ if (tty->count != 1) { ++ mutex_unlock(&tty_mutex); ++ return; ++ } ++ ++ /* Prevent other threads from re-opening this tty. */ ++ set_bit(TTY_CLOSING, &tty->flags); ++ mutex_unlock(&tty_mutex); ++ ++ tty->closing = 1; ++ tty_wait_until_sent(tty, 0); ++ if (DRV(tty->driver)->flush_buffer != NULL) ++ DRV(tty->driver)->flush_buffer(tty); ++ if (tty->ldisc.flush_buffer != NULL) ++ tty->ldisc.flush_buffer(tty); ++ tty->closing = 0; ++ spin_lock_irqsave(&xencons_lock, flags); ++ xencons_tty = NULL; ++ spin_unlock_irqrestore(&xencons_lock, flags); ++} ++ ++static struct tty_operations xencons_ops = { ++ .open = xencons_open, ++ .close = xencons_close, ++ .write = xencons_write, ++ .write_room = xencons_write_room, ++ .put_char = xencons_put_char, ++ .flush_chars = xencons_flush_chars, ++ .chars_in_buffer = xencons_chars_in_buffer, ++ .send_xchar = xencons_send_xchar, ++ .flush_buffer = xencons_flush_buffer, ++ .throttle = xencons_throttle, ++ .unthrottle = xencons_unthrottle, ++ .wait_until_sent = xencons_wait_until_sent, ++}; ++ ++static int __init xencons_init(void) ++{ ++ int rc; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ if (xc_mode == XC_OFF) ++ return 0; ++ ++ if (!is_initial_xendomain()) { ++ rc = xencons_ring_init(); ++ if (rc) ++ return rc; ++ } ++ ++ xencons_driver = alloc_tty_driver((xc_mode == XC_TTY) ? 
++ MAX_NR_CONSOLES : 1); ++ if (xencons_driver == NULL) ++ return -ENOMEM; ++ ++ DRV(xencons_driver)->name = "xencons"; ++ DRV(xencons_driver)->major = TTY_MAJOR; ++ DRV(xencons_driver)->type = TTY_DRIVER_TYPE_SERIAL; ++ DRV(xencons_driver)->subtype = SERIAL_TYPE_NORMAL; ++ DRV(xencons_driver)->init_termios = tty_std_termios; ++ DRV(xencons_driver)->flags = ++ TTY_DRIVER_REAL_RAW | ++ TTY_DRIVER_RESET_TERMIOS; ++ DRV(xencons_driver)->termios = xencons_termios; ++ DRV(xencons_driver)->termios_locked = xencons_termios_locked; ++ ++ switch (xc_mode) { ++ case XC_XVC: ++ DRV(xencons_driver)->name = "xvc"; ++ DRV(xencons_driver)->major = XEN_XVC_MAJOR; ++ DRV(xencons_driver)->minor_start = XEN_XVC_MINOR; ++ DRV(xencons_driver)->name_base = xc_num; ++ break; ++ case XC_SERIAL: ++ DRV(xencons_driver)->name = "ttyS"; ++ DRV(xencons_driver)->minor_start = 64 + xc_num; ++ DRV(xencons_driver)->name_base = xc_num; ++ break; ++ default: ++ DRV(xencons_driver)->name = "tty"; ++ DRV(xencons_driver)->minor_start = 1; ++ DRV(xencons_driver)->name_base = 1; ++ break; ++ } ++ ++ tty_set_operations(xencons_driver, &xencons_ops); ++ ++ if ((rc = tty_register_driver(DRV(xencons_driver))) != 0) { ++ printk("WARNING: Failed to register Xen virtual " ++ "console driver as '%s%d'\n", ++ DRV(xencons_driver)->name, ++ DRV(xencons_driver)->name_base); ++ put_tty_driver(xencons_driver); ++ xencons_driver = NULL; ++ return rc; ++ } ++ ++ if (is_initial_xendomain()) { ++ xencons_priv_irq = bind_virq_to_irqhandler( ++ VIRQ_CONSOLE, ++ 0, ++ xencons_priv_interrupt, ++ 0, ++ "console", ++ NULL); ++ BUG_ON(xencons_priv_irq < 0); ++ } ++ ++ printk("Xen virtual console successfully installed as %s%d\n", ++ DRV(xencons_driver)->name, xc_num); ++ ++ return 0; ++} ++ ++module_init(xencons_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/console/xencons_ring.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,143 @@ ++/* ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/errno.h> ++#include <linux/signal.h> ++#include <linux/sched.h> ++#include <linux/interrupt.h> ++#include <linux/tty.h> ++#include <linux/tty_flip.h> ++#include <linux/serial.h> ++#include <linux/major.h> ++#include <linux/ptrace.h> ++#include <linux/ioport.h> ++#include <linux/mm.h> ++#include <linux/slab.h> ++ ++#include <asm/hypervisor.h> ++#include <xen/evtchn.h> ++#include <xen/xencons.h> ++#include <linux/wait.h> ++#include <linux/interrupt.h> ++#include <linux/sched.h> ++#include <linux/err.h> ++#include <xen/interface/io/console.h> ++ ++static int xencons_irq; ++ ++static inline struct xencons_interface *xencons_interface(void) ++{ ++ return mfn_to_virt(xen_start_info->console.domU.mfn); ++} ++ ++static inline void notify_daemon(void) ++{ ++ /* Use evtchn: this is called early, before irq is set up. */ ++ notify_remote_via_evtchn(xen_start_info->console.domU.evtchn); ++} ++ ++int xencons_ring_send(const char *data, unsigned len) ++{ ++ int sent = 0; ++ struct xencons_interface *intf = xencons_interface(); ++ XENCONS_RING_IDX cons, prod; ++ ++ cons = intf->out_cons; ++ prod = intf->out_prod; ++ mb(); ++ BUG_ON((prod - cons) > sizeof(intf->out)); ++ ++ while ((sent < len) && ((prod - cons) < sizeof(intf->out))) ++ intf->out[MASK_XENCONS_IDX(prod++, intf->out)] = data[sent++]; ++ ++ wmb(); ++ intf->out_prod = prod; ++ ++ notify_daemon(); ++ ++ return sent; ++} ++ ++static irqreturn_t handle_input(int irq, void *unused, struct pt_regs *regs) ++{ ++ struct xencons_interface *intf = xencons_interface(); ++ XENCONS_RING_IDX cons, prod; ++ ++ cons = intf->in_cons; ++ prod = intf->in_prod; ++ mb(); ++ BUG_ON((prod - cons) > sizeof(intf->in)); ++ ++ while (cons != prod) { ++ xencons_rx(intf->in+MASK_XENCONS_IDX(cons,intf->in), 1, regs); ++ cons++; ++ } ++ ++ mb(); ++ intf->in_cons = cons; ++ ++ notify_daemon(); ++ ++ xencons_tx(); ++ ++ return IRQ_HANDLED; ++} ++ ++int xencons_ring_init(void) ++{ ++ int irq; ++ ++ if (xencons_irq) ++ unbind_from_irqhandler(xencons_irq, NULL); ++ xencons_irq = 0; ++ ++ if (!is_running_on_xen() || ++ is_initial_xendomain() || ++ !xen_start_info->console.domU.evtchn) ++ return -ENODEV; ++ ++ irq = bind_caller_port_to_irqhandler( ++ xen_start_info->console.domU.evtchn, ++ handle_input, 0, "xencons", NULL); ++ if (irq < 0) { ++ printk(KERN_ERR "XEN console request irq failed %i\n", irq); ++ return irq; ++ } ++ ++ xencons_irq = irq; ++ ++ /* In case we have in-flight data after save/restore... */ ++ notify_daemon(); ++ ++ return 0; ++} ++ ++void xencons_resume(void) ++{ ++ (void)xencons_ring_init(); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/Makefile 2007-08-27 14:02:09.000000000 -0400 +@@ -0,0 +1,12 @@ ++# ++# Makefile for the linux kernel. 
++# ++ ++obj-y := evtchn.o gnttab.o features.o reboot.o machine_reboot.o ++ ++obj-$(CONFIG_PROC_FS) += xen_proc.o ++obj-$(CONFIG_SYSFS) += hypervisor_sysfs.o ++obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o ++obj-$(CONFIG_XEN_SYSFS) += xen_sysfs.o ++obj-$(CONFIG_XEN_SMPBOOT) += smpboot.o ++obj-$(CONFIG_KEXEC) += machine_kexec.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/cpu_hotplug.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,172 @@ ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/notifier.h> ++#include <linux/cpu.h> ++#include <xen/cpu_hotplug.h> ++#include <xen/xenbus.h> ++ ++/* ++ * Set of CPUs that remote admin software will allow us to bring online. ++ * Notified to us via xenbus. ++ */ ++static cpumask_t xenbus_allowed_cpumask; ++ ++/* Set of CPUs that local admin will allow us to bring online. */ ++static cpumask_t local_allowed_cpumask = CPU_MASK_ALL; ++ ++static int local_cpu_hotplug_request(void) ++{ ++ /* ++ * We assume a CPU hotplug request comes from local admin if it is made ++ * via a userspace process (i.e., one with a real mm_struct). ++ */ ++ return (current->mm != NULL); ++} ++ ++static void vcpu_hotplug(unsigned int cpu) ++{ ++ int err; ++ char dir[32], state[32]; ++ ++ if ((cpu >= NR_CPUS) || !cpu_possible(cpu)) ++ return; ++ ++ sprintf(dir, "cpu/%d", cpu); ++ err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); ++ if (err != 1) { ++ printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); ++ return; ++ } ++ ++ if (strcmp(state, "online") == 0) { ++ cpu_set(cpu, xenbus_allowed_cpumask); ++ (void)cpu_up(cpu); ++ } else if (strcmp(state, "offline") == 0) { ++ cpu_clear(cpu, xenbus_allowed_cpumask); ++ (void)cpu_down(cpu); ++ } else { ++ printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", ++ state, cpu); ++ } ++} ++ ++static void handle_vcpu_hotplug_event( ++ struct xenbus_watch *watch, const char **vec, unsigned int len) ++{ ++ int cpu; ++ char *cpustr; ++ const char *node = vec[XS_WATCH_PATH]; ++ ++ if ((cpustr = strstr(node, "cpu/")) != NULL) { ++ sscanf(cpustr, "cpu/%d", &cpu); ++ vcpu_hotplug(cpu); ++ } ++} ++ ++static int smpboot_cpu_notify(struct notifier_block *notifier, ++ unsigned long action, void *hcpu) ++{ ++ int cpu = (long)hcpu; ++ ++ /* ++ * We do this in a callback notifier rather than __cpu_disable() ++ * because local_cpu_hotplug_request() does not work in the latter ++ * as it's always executed from within a stopmachine kthread. 
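++	 * (A stopmachine kthread has no mm, so local_cpu_hotplug_request()
++	 * would mistake the request for a remote one.)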
++ */ ++ if ((action == CPU_DOWN_PREPARE) && local_cpu_hotplug_request()) ++ cpu_clear(cpu, local_allowed_cpumask); ++ ++ return NOTIFY_OK; ++} ++ ++static int setup_cpu_watcher(struct notifier_block *notifier, ++ unsigned long event, void *data) ++{ ++ int i; ++ ++ static struct xenbus_watch cpu_watch = { ++ .node = "cpu", ++ .callback = handle_vcpu_hotplug_event, ++ .flags = XBWF_new_thread }; ++ (void)register_xenbus_watch(&cpu_watch); ++ ++ if (!is_initial_xendomain()) { ++ for_each_possible_cpu(i) ++ vcpu_hotplug(i); ++ printk(KERN_INFO "Brought up %ld CPUs\n", ++ (long)num_online_cpus()); ++ } ++ ++ return NOTIFY_DONE; ++} ++ ++static int __init setup_vcpu_hotplug_event(void) ++{ ++ static struct notifier_block hotplug_cpu = { ++ .notifier_call = smpboot_cpu_notify }; ++ static struct notifier_block xsn_cpu = { ++ .notifier_call = setup_cpu_watcher }; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ register_cpu_notifier(&hotplug_cpu); ++ register_xenstore_notifier(&xsn_cpu); ++ ++ return 0; ++} ++ ++arch_initcall(setup_vcpu_hotplug_event); ++ ++int smp_suspend(void) ++{ ++ int cpu, err; ++ ++ for_each_online_cpu(cpu) { ++ if (cpu == 0) ++ continue; ++ err = cpu_down(cpu); ++ if (err) { ++ printk(KERN_CRIT "Failed to take all CPUs " ++ "down: %d.\n", err); ++ for_each_possible_cpu(cpu) ++ vcpu_hotplug(cpu); ++ return err; ++ } ++ } ++ ++ return 0; ++} ++ ++void smp_resume(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) ++ vcpu_hotplug(cpu); ++} ++ ++int cpu_up_check(unsigned int cpu) ++{ ++ int rc = 0; ++ ++ if (local_cpu_hotplug_request()) { ++ cpu_set(cpu, local_allowed_cpumask); ++ if (!cpu_isset(cpu, xenbus_allowed_cpumask)) { ++ printk("%s: attempt to bring up CPU %u disallowed by " ++ "remote admin.\n", __FUNCTION__, cpu); ++ rc = -EBUSY; ++ } ++ } else if (!cpu_isset(cpu, local_allowed_cpumask) || ++ !cpu_isset(cpu, xenbus_allowed_cpumask)) { ++ rc = -EBUSY; ++ } ++ ++ return rc; ++} ++ ++void init_xenbus_allowed_cpumask(void) ++{ ++ xenbus_allowed_cpumask = cpu_present_map; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/evtchn.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,1015 @@ ++/****************************************************************************** ++ * evtchn.c ++ * ++ * Communication via Xen event channels. ++ * ++ * Copyright (c) 2002-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/module.h> ++#include <linux/irq.h> ++#include <linux/interrupt.h> ++#include <linux/sched.h> ++#include <linux/kernel_stat.h> ++#include <linux/version.h> ++#include <asm/atomic.h> ++#include <asm/system.h> ++#include <asm/ptrace.h> ++#include <asm/synch_bitops.h> ++#include <xen/evtchn.h> ++#include <xen/interface/event_channel.h> ++#include <xen/interface/physdev.h> ++#include <asm/hypervisor.h> ++#include <linux/mc146818rtc.h> /* RTC_IRQ */ ++ ++/* ++ * This lock protects updates to the following mapping and reference-count ++ * arrays. The lock does not need to be acquired to read the mapping tables. ++ */ ++static DEFINE_SPINLOCK(irq_mapping_update_lock); ++ ++/* IRQ <-> event-channel mappings. */ ++static int evtchn_to_irq[NR_EVENT_CHANNELS] = { ++ [0 ... NR_EVENT_CHANNELS-1] = -1 }; ++ ++/* Packed IRQ information: binding type, sub-type index, and event channel. */ ++static u32 irq_info[NR_IRQS]; ++ ++/* Binding types. */ ++enum { ++ IRQT_UNBOUND, ++ IRQT_PIRQ, ++ IRQT_VIRQ, ++ IRQT_IPI, ++ IRQT_LOCAL_PORT, ++ IRQT_CALLER_PORT ++}; ++ ++/* Constructor for packed IRQ information. */ ++static inline u32 mk_irq_info(u32 type, u32 index, u32 evtchn) ++{ ++ return ((type << 24) | (index << 16) | evtchn); ++} ++ ++/* Convenient shorthand for packed representation of an unbound IRQ. */ ++#define IRQ_UNBOUND mk_irq_info(IRQT_UNBOUND, 0, 0) ++ ++/* ++ * Accessors for packed IRQ information. ++ */ ++ ++static inline unsigned int evtchn_from_irq(int irq) ++{ ++ return (u16)(irq_info[irq]); ++} ++ ++static inline unsigned int index_from_irq(int irq) ++{ ++ return (u8)(irq_info[irq] >> 16); ++} ++ ++static inline unsigned int type_from_irq(int irq) ++{ ++ return (u8)(irq_info[irq] >> 24); ++} ++ ++/* IRQ <-> VIRQ mapping. */ ++DEFINE_PER_CPU(int, virq_to_irq[NR_VIRQS]) = {[0 ... NR_VIRQS-1] = -1}; ++ ++/* IRQ <-> IPI mapping. */ ++#ifndef NR_IPIS ++#define NR_IPIS 1 ++#endif ++DEFINE_PER_CPU(int, ipi_to_irq[NR_IPIS]) = {[0 ... NR_IPIS-1] = -1}; ++ ++/* Reference counts for bindings to IRQs. */ ++static int irq_bindcount[NR_IRQS]; ++ ++/* Bitmap indicating which PIRQs require Xen to be notified on unmask. */ ++static DECLARE_BITMAP(pirq_needs_eoi, NR_PIRQS); ++ ++#ifdef CONFIG_SMP ++ ++static u8 cpu_evtchn[NR_EVENT_CHANNELS]; ++static unsigned long cpu_evtchn_mask[NR_CPUS][NR_EVENT_CHANNELS/BITS_PER_LONG]; ++ ++static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, ++ unsigned int idx) ++{ ++ return (sh->evtchn_pending[idx] & ++ cpu_evtchn_mask[cpu][idx] & ++ ~sh->evtchn_mask[idx]); ++} ++ ++static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) ++{ ++ int irq = evtchn_to_irq[chn]; ++ ++ BUG_ON(irq == -1); ++ set_native_irq_info(irq, cpumask_of_cpu(cpu)); ++ ++ clear_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu_evtchn[chn]]); ++ set_bit(chn, (unsigned long *)cpu_evtchn_mask[cpu]); ++ cpu_evtchn[chn] = cpu; ++} ++ ++static void init_evtchn_cpu_bindings(void) ++{ ++ int i; ++ ++ /* By default all event channels notify CPU#0. 
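++	   Point every irq's affinity hint at CPU0 and make CPU0's
++	   per-cpu mask the only one with bits set.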
*/ ++ for (i = 0; i < NR_IRQS; i++) ++ set_native_irq_info(i, cpumask_of_cpu(0)); ++ ++ memset(cpu_evtchn, 0, sizeof(cpu_evtchn)); ++ memset(cpu_evtchn_mask[0], ~0, sizeof(cpu_evtchn_mask[0])); ++} ++ ++static inline unsigned int cpu_from_evtchn(unsigned int evtchn) ++{ ++ return cpu_evtchn[evtchn]; ++} ++ ++#else ++ ++static inline unsigned long active_evtchns(unsigned int cpu, shared_info_t *sh, ++ unsigned int idx) ++{ ++ return (sh->evtchn_pending[idx] & ~sh->evtchn_mask[idx]); ++} ++ ++static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu) ++{ ++} ++ ++static void init_evtchn_cpu_bindings(void) ++{ ++} ++ ++static inline unsigned int cpu_from_evtchn(unsigned int evtchn) ++{ ++ return 0; ++} ++ ++#endif ++ ++/* Upcall to generic IRQ layer. */ ++#ifdef CONFIG_X86 ++extern fastcall unsigned int do_IRQ(struct pt_regs *regs); ++void __init xen_init_IRQ(void); ++void __init init_IRQ(void) ++{ ++ irq_ctx_init(0); ++ xen_init_IRQ(); ++} ++#if defined (__i386__) ++static inline void exit_idle(void) {} ++#define IRQ_REG orig_eax ++#elif defined (__x86_64__) ++#include <asm/idle.h> ++#define IRQ_REG orig_rax ++#endif ++#define do_IRQ(irq, regs) do { \ ++ (regs)->IRQ_REG = ~(irq); \ ++ do_IRQ((regs)); \ ++} while (0) ++#endif ++ ++/* Xen will never allocate port zero for any purpose. */ ++#define VALID_EVTCHN(chn) ((chn) != 0) ++ ++/* ++ * Force a proper event-channel callback from Xen after clearing the ++ * callback mask. We do this in a very simple manner, by making a call ++ * down into Xen. The pending flag will be checked by Xen on return. ++ */ ++void force_evtchn_callback(void) ++{ ++ (void)HYPERVISOR_xen_version(0, NULL); ++} ++/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ ++EXPORT_SYMBOL(force_evtchn_callback); ++ ++static DEFINE_PER_CPU(unsigned int, upcall_count) = { 0 }; ++ ++/* NB. Interrupts are disabled on entry. */ ++asmlinkage void evtchn_do_upcall(struct pt_regs *regs) ++{ ++ unsigned long l1, l2; ++ unsigned int l1i, l2i, port, count; ++ int irq, cpu = smp_processor_id(); ++ shared_info_t *s = HYPERVISOR_shared_info; ++ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; ++ ++ do { ++ /* Avoid a callback storm when we reenable delivery. */ ++ vcpu_info->evtchn_upcall_pending = 0; ++ ++ /* Nested invocations bail immediately. */ ++ if (unlikely(per_cpu(upcall_count, cpu)++)) ++ return; ++ ++#ifndef CONFIG_X86 /* No need for a barrier -- XCHG is a barrier on x86. */ ++ /* Clear master flag /before/ clearing selector flag. */ ++ rmb(); ++#endif ++ l1 = xchg(&vcpu_info->evtchn_pending_sel, 0); ++ while (l1 != 0) { ++ l1i = __ffs(l1); ++ l1 &= ~(1UL << l1i); ++ ++ while ((l2 = active_evtchns(cpu, s, l1i)) != 0) { ++ l2i = __ffs(l2); ++ ++ port = (l1i * BITS_PER_LONG) + l2i; ++ if ((irq = evtchn_to_irq[port]) != -1) ++ do_IRQ(irq, regs); ++ else { ++ exit_idle(); ++ evtchn_device_upcall(port); ++ } ++ } ++ } ++ ++ /* If there were nested callbacks then we have more to do. 
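++		   Each nested entry bumped upcall_count and bailed out
++		   early, so loop until a pass completes with no nesting.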
*/ ++ count = per_cpu(upcall_count, cpu); ++ per_cpu(upcall_count, cpu) = 0; ++ } while (unlikely(count != 1)); ++} ++ ++static int find_unbound_irq(void) ++{ ++ static int warned; ++ int dynirq, irq; ++ ++ for (dynirq = 0; dynirq < NR_DYNIRQS; dynirq++) { ++ irq = dynirq_to_irq(dynirq); ++ if (irq_bindcount[irq] == 0) ++ return irq; ++ } ++ ++ if (!warned) { ++ warned = 1; ++ printk(KERN_WARNING "No available IRQ to bind to: " ++ "increase NR_DYNIRQS.\n"); ++ } ++ ++ return -ENOSPC; ++} ++ ++static int bind_caller_port_to_irq(unsigned int caller_port) ++{ ++ int irq; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ if ((irq = evtchn_to_irq[caller_port]) == -1) { ++ if ((irq = find_unbound_irq()) < 0) ++ goto out; ++ ++ evtchn_to_irq[caller_port] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_CALLER_PORT, 0, caller_port); ++ } ++ ++ irq_bindcount[irq]++; ++ ++ out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++ ++static int bind_local_port_to_irq(unsigned int local_port) ++{ ++ int irq; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ BUG_ON(evtchn_to_irq[local_port] != -1); ++ ++ if ((irq = find_unbound_irq()) < 0) { ++ struct evtchn_close close = { .port = local_port }; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) ++ BUG(); ++ goto out; ++ } ++ ++ evtchn_to_irq[local_port] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_LOCAL_PORT, 0, local_port); ++ irq_bindcount[irq]++; ++ ++ out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++ ++static int bind_listening_port_to_irq(unsigned int remote_domain) ++{ ++ struct evtchn_alloc_unbound alloc_unbound; ++ int err; ++ ++ alloc_unbound.dom = DOMID_SELF; ++ alloc_unbound.remote_dom = remote_domain; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, ++ &alloc_unbound); ++ ++ return err ? : bind_local_port_to_irq(alloc_unbound.port); ++} ++ ++static int bind_interdomain_evtchn_to_irq(unsigned int remote_domain, ++ unsigned int remote_port) ++{ ++ struct evtchn_bind_interdomain bind_interdomain; ++ int err; ++ ++ bind_interdomain.remote_dom = remote_domain; ++ bind_interdomain.remote_port = remote_port; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, ++ &bind_interdomain); ++ ++ return err ? 
: bind_local_port_to_irq(bind_interdomain.local_port); ++} ++ ++static int bind_virq_to_irq(unsigned int virq, unsigned int cpu) ++{ ++ struct evtchn_bind_virq bind_virq; ++ int evtchn, irq; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) { ++ if ((irq = find_unbound_irq()) < 0) ++ goto out; ++ ++ bind_virq.virq = virq; ++ bind_virq.vcpu = cpu; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, ++ &bind_virq) != 0) ++ BUG(); ++ evtchn = bind_virq.port; ++ ++ evtchn_to_irq[evtchn] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); ++ ++ per_cpu(virq_to_irq, cpu)[virq] = irq; ++ ++ bind_evtchn_to_cpu(evtchn, cpu); ++ } ++ ++ irq_bindcount[irq]++; ++ ++ out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++ ++static int bind_ipi_to_irq(unsigned int ipi, unsigned int cpu) ++{ ++ struct evtchn_bind_ipi bind_ipi; ++ int evtchn, irq; ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) { ++ if ((irq = find_unbound_irq()) < 0) ++ goto out; ++ ++ bind_ipi.vcpu = cpu; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, ++ &bind_ipi) != 0) ++ BUG(); ++ evtchn = bind_ipi.port; ++ ++ evtchn_to_irq[evtchn] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); ++ ++ per_cpu(ipi_to_irq, cpu)[ipi] = irq; ++ ++ bind_evtchn_to_cpu(evtchn, cpu); ++ } ++ ++ irq_bindcount[irq]++; ++ ++ out: ++ spin_unlock(&irq_mapping_update_lock); ++ return irq; ++} ++ ++static void unbind_from_irq(unsigned int irq) ++{ ++ struct evtchn_close close; ++ int cpu, evtchn = evtchn_from_irq(irq); ++ ++ spin_lock(&irq_mapping_update_lock); ++ ++ if ((--irq_bindcount[irq] == 0) && VALID_EVTCHN(evtchn)) { ++ close.port = evtchn; ++ if ((type_from_irq(irq) != IRQT_CALLER_PORT) && ++ HYPERVISOR_event_channel_op(EVTCHNOP_close, &close)) ++ BUG(); ++ ++ switch (type_from_irq(irq)) { ++ case IRQT_VIRQ: ++ per_cpu(virq_to_irq, cpu_from_evtchn(evtchn)) ++ [index_from_irq(irq)] = -1; ++ break; ++ case IRQT_IPI: ++ per_cpu(ipi_to_irq, cpu_from_evtchn(evtchn)) ++ [index_from_irq(irq)] = -1; ++ break; ++ default: ++ break; ++ } ++ ++ /* Closed ports are implicitly re-bound to VCPU0. */ ++ bind_evtchn_to_cpu(evtchn, 0); ++ ++ evtchn_to_irq[evtchn] = -1; ++ irq_info[irq] = IRQ_UNBOUND; ++ ++ /* Zap stats across IRQ changes of use. 
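++		   The slot may be re-bound for a different purpose later,
++		   and stale counts would show up in /proc/interrupts.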
*/ ++ for_each_possible_cpu(cpu) ++ kstat_cpu(cpu).irqs[irq] = 0; ++ } ++ ++ spin_unlock(&irq_mapping_update_lock); ++} ++ ++int bind_caller_port_to_irqhandler( ++ unsigned int caller_port, ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_caller_port_to_irq(caller_port); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_caller_port_to_irqhandler); ++ ++int bind_listening_port_to_irqhandler( ++ unsigned int remote_domain, ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_listening_port_to_irq(remote_domain); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_listening_port_to_irqhandler); ++ ++int bind_interdomain_evtchn_to_irqhandler( ++ unsigned int remote_domain, ++ unsigned int remote_port, ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_interdomain_evtchn_to_irq(remote_domain, remote_port); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_interdomain_evtchn_to_irqhandler); ++ ++int bind_virq_to_irqhandler( ++ unsigned int virq, ++ unsigned int cpu, ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_virq_to_irq(virq, cpu); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_virq_to_irqhandler); ++ ++int bind_ipi_to_irqhandler( ++ unsigned int ipi, ++ unsigned int cpu, ++ irqreturn_t (*handler)(int, void *, struct pt_regs *), ++ unsigned long irqflags, ++ const char *devname, ++ void *dev_id) ++{ ++ int irq, retval; ++ ++ irq = bind_ipi_to_irq(ipi, cpu); ++ if (irq < 0) ++ return irq; ++ ++ retval = request_irq(irq, handler, irqflags, devname, dev_id); ++ if (retval != 0) { ++ unbind_from_irq(irq); ++ return retval; ++ } ++ ++ return irq; ++} ++EXPORT_SYMBOL_GPL(bind_ipi_to_irqhandler); ++ ++void unbind_from_irqhandler(unsigned int irq, void *dev_id) ++{ ++ free_irq(irq, dev_id); ++ unbind_from_irq(irq); ++} ++EXPORT_SYMBOL_GPL(unbind_from_irqhandler); ++ ++#ifdef CONFIG_SMP ++/* Rebind an evtchn so that it gets delivered to a specific cpu */ ++static void rebind_irq_to_cpu(unsigned irq, unsigned tcpu) ++{ ++ struct evtchn_bind_vcpu bind_vcpu; ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (!VALID_EVTCHN(evtchn)) ++ return; ++ ++ /* Send future instances of this interrupt to other vcpu. */ ++ bind_vcpu.port = evtchn; ++ bind_vcpu.vcpu = tcpu; ++ ++ /* ++ * If this fails, it usually just indicates that we're dealing with a ++ * virq or IPI channel, which don't actually need to be rebound. Ignore ++ * it, but don't do the xenlinux-level rebind in that case. 
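++	 * (Virq and IPI channels have their vcpu binding fixed when they
++	 * are bound, so the hypervisor refuses EVTCHNOP_bind_vcpu for them.)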
++ */ ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_vcpu, &bind_vcpu) >= 0) ++ bind_evtchn_to_cpu(evtchn, tcpu); ++} ++ ++static void set_affinity_irq(unsigned irq, cpumask_t dest) ++{ ++ unsigned tcpu = first_cpu(dest); ++ rebind_irq_to_cpu(irq, tcpu); ++} ++#endif ++ ++int resend_irq_on_evtchn(unsigned int irq) ++{ ++ int masked, evtchn = evtchn_from_irq(irq); ++ shared_info_t *s = HYPERVISOR_shared_info; ++ ++ if (!VALID_EVTCHN(evtchn)) ++ return 1; ++ ++ masked = synch_test_and_set_bit(evtchn, s->evtchn_mask); ++ synch_set_bit(evtchn, s->evtchn_pending); ++ if (!masked) ++ unmask_evtchn(evtchn); ++ ++ return 1; ++} ++ ++/* ++ * Interface to generic handling in irq.c ++ */ ++ ++static unsigned int startup_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ unmask_evtchn(evtchn); ++ return 0; ++} ++ ++static void shutdown_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ mask_evtchn(evtchn); ++} ++ ++static void enable_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ unmask_evtchn(evtchn); ++} ++ ++static void disable_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ mask_evtchn(evtchn); ++} ++ ++static void ack_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ move_native_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) { ++ mask_evtchn(evtchn); ++ clear_evtchn(evtchn); ++ } ++} ++ ++static void end_dynirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) ++ unmask_evtchn(evtchn); ++} ++ ++static struct hw_interrupt_type dynirq_type = { ++ .typename = "Dynamic-irq", ++ .startup = startup_dynirq, ++ .shutdown = shutdown_dynirq, ++ .enable = enable_dynirq, ++ .disable = disable_dynirq, ++ .ack = ack_dynirq, ++ .end = end_dynirq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_affinity_irq, ++#endif ++ .retrigger = resend_irq_on_evtchn, ++}; ++ ++static inline void pirq_unmask_notify(int pirq) ++{ ++ struct physdev_eoi eoi = { .irq = pirq }; ++ if (unlikely(test_bit(pirq, pirq_needs_eoi))) ++ (void)HYPERVISOR_physdev_op(PHYSDEVOP_eoi, &eoi); ++} ++ ++static inline void pirq_query_unmask(int pirq) ++{ ++ struct physdev_irq_status_query irq_status; ++ irq_status.irq = pirq; ++ (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status); ++ clear_bit(pirq, pirq_needs_eoi); ++ if (irq_status.flags & XENIRQSTAT_needs_eoi) ++ set_bit(pirq, pirq_needs_eoi); ++} ++ ++/* ++ * On startup, if there is no action associated with the IRQ then we are ++ * probing. In this case we should not share with others as it will confuse us. ++ */ ++#define probing_irq(_irq) (irq_desc[(_irq)].action == NULL) ++ ++static unsigned int startup_pirq(unsigned int irq) ++{ ++ struct evtchn_bind_pirq bind_pirq; ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ goto out; ++ ++ bind_pirq.pirq = irq; ++ /* NB. We are happy to share unless we are probing. */ ++ bind_pirq.flags = probing_irq(irq) ? 
0 : BIND_PIRQ__WILL_SHARE; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_pirq, &bind_pirq) != 0) { ++ if (!probing_irq(irq)) ++ printk(KERN_INFO "Failed to obtain physical IRQ %d\n", ++ irq); ++ return 0; ++ } ++ evtchn = bind_pirq.port; ++ ++ pirq_query_unmask(irq_to_pirq(irq)); ++ ++ evtchn_to_irq[evtchn] = irq; ++ bind_evtchn_to_cpu(evtchn, 0); ++ irq_info[irq] = mk_irq_info(IRQT_PIRQ, irq, evtchn); ++ ++ out: ++ unmask_evtchn(evtchn); ++ pirq_unmask_notify(irq_to_pirq(irq)); ++ ++ return 0; ++} ++ ++static void shutdown_pirq(unsigned int irq) ++{ ++ struct evtchn_close close; ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (!VALID_EVTCHN(evtchn)) ++ return; ++ ++ mask_evtchn(evtchn); ++ ++ close.port = evtchn; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_close, &close) != 0) ++ BUG(); ++ ++ bind_evtchn_to_cpu(evtchn, 0); ++ evtchn_to_irq[evtchn] = -1; ++ irq_info[irq] = IRQ_UNBOUND; ++} ++ ++static void enable_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) { ++ unmask_evtchn(evtchn); ++ pirq_unmask_notify(irq_to_pirq(irq)); ++ } ++} ++ ++static void disable_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ mask_evtchn(evtchn); ++} ++ ++static void ack_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ move_native_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) { ++ mask_evtchn(evtchn); ++ clear_evtchn(evtchn); ++ } ++} ++ ++static void end_pirq(unsigned int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn) && !(irq_desc[irq].status & IRQ_DISABLED)) { ++ unmask_evtchn(evtchn); ++ pirq_unmask_notify(irq_to_pirq(irq)); ++ } ++} ++ ++static struct hw_interrupt_type pirq_type = { ++ .typename = "Phys-irq", ++ .startup = startup_pirq, ++ .shutdown = shutdown_pirq, ++ .enable = enable_pirq, ++ .disable = disable_pirq, ++ .ack = ack_pirq, ++ .end = end_pirq, ++#ifdef CONFIG_SMP ++ .set_affinity = set_affinity_irq, ++#endif ++ .retrigger = resend_irq_on_evtchn, ++}; ++ ++int irq_ignore_unhandled(unsigned int irq) ++{ ++ struct physdev_irq_status_query irq_status = { .irq = irq }; ++ ++ if (!is_running_on_xen()) ++ return 0; ++ ++ (void)HYPERVISOR_physdev_op(PHYSDEVOP_irq_status_query, &irq_status); ++ return !!(irq_status.flags & XENIRQSTAT_shared); ++} ++ ++void notify_remote_via_irq(int irq) ++{ ++ int evtchn = evtchn_from_irq(irq); ++ ++ if (VALID_EVTCHN(evtchn)) ++ notify_remote_via_evtchn(evtchn); ++} ++EXPORT_SYMBOL_GPL(notify_remote_via_irq); ++ ++int irq_to_evtchn_port(int irq) ++{ ++ return evtchn_from_irq(irq); ++} ++EXPORT_SYMBOL_GPL(irq_to_evtchn_port); ++ ++void mask_evtchn(int port) ++{ ++ shared_info_t *s = HYPERVISOR_shared_info; ++ synch_set_bit(port, s->evtchn_mask); ++} ++EXPORT_SYMBOL_GPL(mask_evtchn); ++ ++void unmask_evtchn(int port) ++{ ++ shared_info_t *s = HYPERVISOR_shared_info; ++ unsigned int cpu = smp_processor_id(); ++ vcpu_info_t *vcpu_info = &s->vcpu_info[cpu]; ++ ++ BUG_ON(!irqs_disabled()); ++ ++ /* Slow path (hypercall) if this is a non-local port. */ ++ if (unlikely(cpu != cpu_from_evtchn(port))) { ++ struct evtchn_unmask unmask = { .port = port }; ++ (void)HYPERVISOR_event_channel_op(EVTCHNOP_unmask, &unmask); ++ return; ++ } ++ ++ synch_clear_bit(port, s->evtchn_mask); ++ ++ /* Did we miss an interrupt 'edge'? Re-fire if so. 
*/ ++ if (synch_test_bit(port, s->evtchn_pending) && ++ !synch_test_and_set_bit(port / BITS_PER_LONG, ++ &vcpu_info->evtchn_pending_sel)) ++ vcpu_info->evtchn_upcall_pending = 1; ++} ++EXPORT_SYMBOL_GPL(unmask_evtchn); ++ ++static void restore_cpu_virqs(int cpu) ++{ ++ struct evtchn_bind_virq bind_virq; ++ int virq, irq, evtchn; ++ ++ for (virq = 0; virq < NR_VIRQS; virq++) { ++ if ((irq = per_cpu(virq_to_irq, cpu)[virq]) == -1) ++ continue; ++ ++ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_VIRQ, virq, 0)); ++ ++ /* Get a new binding from Xen. */ ++ bind_virq.virq = virq; ++ bind_virq.vcpu = cpu; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, ++ &bind_virq) != 0) ++ BUG(); ++ evtchn = bind_virq.port; ++ ++ /* Record the new mapping. */ ++ evtchn_to_irq[evtchn] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_VIRQ, virq, evtchn); ++ bind_evtchn_to_cpu(evtchn, cpu); ++ ++ /* Ready for use. */ ++ unmask_evtchn(evtchn); ++ } ++} ++ ++static void restore_cpu_ipis(int cpu) ++{ ++ struct evtchn_bind_ipi bind_ipi; ++ int ipi, irq, evtchn; ++ ++ for (ipi = 0; ipi < NR_IPIS; ipi++) { ++ if ((irq = per_cpu(ipi_to_irq, cpu)[ipi]) == -1) ++ continue; ++ ++ BUG_ON(irq_info[irq] != mk_irq_info(IRQT_IPI, ipi, 0)); ++ ++ /* Get a new binding from Xen. */ ++ bind_ipi.vcpu = cpu; ++ if (HYPERVISOR_event_channel_op(EVTCHNOP_bind_ipi, ++ &bind_ipi) != 0) ++ BUG(); ++ evtchn = bind_ipi.port; ++ ++ /* Record the new mapping. */ ++ evtchn_to_irq[evtchn] = irq; ++ irq_info[irq] = mk_irq_info(IRQT_IPI, ipi, evtchn); ++ bind_evtchn_to_cpu(evtchn, cpu); ++ ++ /* Ready for use. */ ++ unmask_evtchn(evtchn); ++ ++ } ++} ++ ++void irq_resume(void) ++{ ++ int cpu, pirq, irq, evtchn; ++ ++ init_evtchn_cpu_bindings(); ++ ++ /* New event-channel space is not 'live' yet. */ ++ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) ++ mask_evtchn(evtchn); ++ ++ /* Check that no PIRQs are still bound. */ ++ for (pirq = 0; pirq < NR_PIRQS; pirq++) ++ BUG_ON(irq_info[pirq_to_irq(pirq)] != IRQ_UNBOUND); ++ ++ /* No IRQ <-> event-channel mappings. */ ++ for (irq = 0; irq < NR_IRQS; irq++) ++ irq_info[irq] &= ~0xFFFF; /* zap event-channel binding */ ++ for (evtchn = 0; evtchn < NR_EVENT_CHANNELS; evtchn++) ++ evtchn_to_irq[evtchn] = -1; ++ ++ for_each_possible_cpu(cpu) { ++ restore_cpu_virqs(cpu); ++ restore_cpu_ipis(cpu); ++ } ++ ++} ++ ++void __init xen_init_IRQ(void) ++{ ++ int i; ++ ++ init_evtchn_cpu_bindings(); ++ ++ /* No event channels are 'live' right now. */ ++ for (i = 0; i < NR_EVENT_CHANNELS; i++) ++ mask_evtchn(i); ++ ++ /* No IRQ -> event-channel mappings. */ ++ for (i = 0; i < NR_IRQS; i++) ++ irq_info[i] = IRQ_UNBOUND; ++ ++ /* Dynamic IRQ space is currently unbound. Zero the refcnts. */ ++ for (i = 0; i < NR_DYNIRQS; i++) { ++ irq_bindcount[dynirq_to_irq(i)] = 0; ++ ++ irq_desc[dynirq_to_irq(i)].status = IRQ_DISABLED; ++ irq_desc[dynirq_to_irq(i)].action = NULL; ++ irq_desc[dynirq_to_irq(i)].depth = 1; ++ irq_desc[dynirq_to_irq(i)].chip = &dynirq_type; ++ } ++ ++ /* Phys IRQ space is statically bound (1:1 mapping). Nail refcnts. */ ++ for (i = 0; i < NR_PIRQS; i++) { ++ irq_bindcount[pirq_to_irq(i)] = 1; ++ ++#ifdef RTC_IRQ ++ /* If not domain 0, force our RTC driver to fail its probe. 
*/ ++ if ((i == RTC_IRQ) && !is_initial_xendomain()) ++ continue; ++#endif ++ ++ irq_desc[pirq_to_irq(i)].status = IRQ_DISABLED; ++ irq_desc[pirq_to_irq(i)].action = NULL; ++ irq_desc[pirq_to_irq(i)].depth = 1; ++ irq_desc[pirq_to_irq(i)].chip = &pirq_type; ++ } ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/features.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,34 @@ ++/****************************************************************************** ++ * features.c ++ * ++ * Xen feature flags. ++ * ++ * Copyright (c) 2006, Ian Campbell, XenSource Inc. ++ */ ++#include <linux/types.h> ++#include <linux/cache.h> ++#include <linux/module.h> ++#include <asm/hypervisor.h> ++#include <xen/features.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++u8 xen_features[XENFEAT_NR_SUBMAPS * 32] __read_mostly; ++/* Not a GPL symbol: used in ubiquitous macros, so too restrictive. */ ++EXPORT_SYMBOL(xen_features); ++ ++void setup_xen_features(void) ++{ ++ xen_feature_info_t fi; ++ int i, j; ++ ++ for (i = 0; i < XENFEAT_NR_SUBMAPS; i++) { ++ fi.submap_idx = i; ++ if (HYPERVISOR_xen_version(XENVER_get_features, &fi) < 0) ++ break; ++ for (j=0; j<32; j++) ++ xen_features[i*32+j] = !!(fi.submap & 1<<j); ++ } ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/gnttab.c 2007-08-27 14:02:10.000000000 -0400 +@@ -0,0 +1,631 @@ ++/****************************************************************************** ++ * gnttab.c ++ * ++ * Granting foreign access to our memory reservation. ++ * ++ * Copyright (c) 2005-2006, Christopher Clark ++ * Copyright (c) 2004-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/sched.h> ++#include <linux/mm.h> ++#include <xen/interface/xen.h> ++#include <xen/gnttab.h> ++#include <asm/pgtable.h> ++#include <asm/uaccess.h> ++#include <asm/synch_bitops.h> ++#include <asm/io.h> ++#include <xen/interface/memory.h> ++#include <xen/driver_util.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++/* External tools reserve first few grant table entries. */ ++#define NR_RESERVED_ENTRIES 8 ++#define GNTTAB_LIST_END 0xffffffff ++#define GREFS_PER_GRANT_FRAME (PAGE_SIZE / sizeof(grant_entry_t)) ++ ++static grant_ref_t **gnttab_list; ++static unsigned int nr_grant_frames; ++static unsigned int boot_max_nr_grant_frames; ++static int gnttab_free_count; ++static grant_ref_t gnttab_free_head; ++static DEFINE_SPINLOCK(gnttab_list_lock); ++ ++static struct grant_entry *shared; ++ ++static struct gnttab_free_callback *gnttab_free_callback_list; ++ ++static int gnttab_expand(unsigned int req_entries); ++ ++#define RPP (PAGE_SIZE / sizeof(grant_ref_t)) ++#define gnttab_entry(entry) (gnttab_list[(entry) / RPP][(entry) % RPP]) ++ ++static int get_free_entries(int count) ++{ ++ unsigned long flags; ++ int ref, rc; ++ grant_ref_t head; ++ ++ spin_lock_irqsave(&gnttab_list_lock, flags); ++ ++ if ((gnttab_free_count < count) && ++ ((rc = gnttab_expand(count - gnttab_free_count)) < 0)) { ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++ return rc; ++ } ++ ++ ref = head = gnttab_free_head; ++ gnttab_free_count -= count; ++ while (count-- > 1) ++ head = gnttab_entry(head); ++ gnttab_free_head = gnttab_entry(head); ++ gnttab_entry(head) = GNTTAB_LIST_END; ++ ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++ ++ return ref; ++} ++ ++#define get_free_entry() get_free_entries(1) ++ ++static void do_free_callbacks(void) ++{ ++ struct gnttab_free_callback *callback, *next; ++ ++ callback = gnttab_free_callback_list; ++ gnttab_free_callback_list = NULL; ++ ++ while (callback != NULL) { ++ next = callback->next; ++ if (gnttab_free_count >= callback->count) { ++ callback->next = NULL; ++ callback->fn(callback->arg); ++ } else { ++ callback->next = gnttab_free_callback_list; ++ gnttab_free_callback_list = callback; ++ } ++ callback = next; ++ } ++} ++ ++static inline void check_free_callbacks(void) ++{ ++ if (unlikely(gnttab_free_callback_list)) ++ do_free_callbacks(); ++} ++ ++static void put_free_entry(grant_ref_t ref) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&gnttab_list_lock, flags); ++ gnttab_entry(ref) = gnttab_free_head; ++ gnttab_free_head = ref; ++ gnttab_free_count++; ++ check_free_callbacks(); ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++} ++ ++/* ++ * Public grant-issuing interface functions ++ */ ++ ++int gnttab_grant_foreign_access(domid_t domid, unsigned long frame, ++ int readonly) ++{ ++ int ref; ++ ++ if (unlikely((ref = get_free_entry()) < 0)) ++ return -ENOSPC; ++ ++ shared[ref].frame = frame; ++ shared[ref].domid = domid; ++ wmb(); ++ shared[ref].flags = GTF_permit_access | (readonly ? GTF_readonly : 0); ++ ++ return ref; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access); ++ ++void gnttab_grant_foreign_access_ref(grant_ref_t ref, domid_t domid, ++ unsigned long frame, int readonly) ++{ ++ shared[ref].frame = frame; ++ shared[ref].domid = domid; ++ wmb(); ++ shared[ref].flags = GTF_permit_access | (readonly ? 
GTF_readonly : 0); ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_access_ref); ++ ++ ++int gnttab_query_foreign_access(grant_ref_t ref) ++{ ++ u16 nflags; ++ ++ nflags = shared[ref].flags; ++ ++ return (nflags & (GTF_reading|GTF_writing)); ++} ++EXPORT_SYMBOL_GPL(gnttab_query_foreign_access); ++ ++int gnttab_end_foreign_access_ref(grant_ref_t ref, int readonly) ++{ ++ u16 flags, nflags; ++ ++ nflags = shared[ref].flags; ++ do { ++ if ((flags = nflags) & (GTF_reading|GTF_writing)) { ++ printk(KERN_ALERT "WARNING: g.e. still in use!\n"); ++ return 0; ++ } ++ } while ((nflags = synch_cmpxchg_subword(&shared[ref].flags, flags, 0)) != ++ flags); ++ ++ return 1; ++} ++EXPORT_SYMBOL_GPL(gnttab_end_foreign_access_ref); ++ ++void gnttab_end_foreign_access(grant_ref_t ref, int readonly, ++ unsigned long page) ++{ ++ if (gnttab_end_foreign_access_ref(ref, readonly)) { ++ put_free_entry(ref); ++ if (page != 0) ++ free_page(page); ++ } else { ++ /* XXX This needs to be fixed so that the ref and page are ++ placed on a list to be freed up later. */ ++ printk(KERN_WARNING ++ "WARNING: leaking g.e. and page still in use!\n"); ++ } ++} ++EXPORT_SYMBOL_GPL(gnttab_end_foreign_access); ++ ++int gnttab_grant_foreign_transfer(domid_t domid, unsigned long pfn) ++{ ++ int ref; ++ ++ if (unlikely((ref = get_free_entry()) < 0)) ++ return -ENOSPC; ++ gnttab_grant_foreign_transfer_ref(ref, domid, pfn); ++ ++ return ref; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer); ++ ++void gnttab_grant_foreign_transfer_ref(grant_ref_t ref, domid_t domid, ++ unsigned long pfn) ++{ ++ shared[ref].frame = pfn; ++ shared[ref].domid = domid; ++ wmb(); ++ shared[ref].flags = GTF_accept_transfer; ++} ++EXPORT_SYMBOL_GPL(gnttab_grant_foreign_transfer_ref); ++ ++unsigned long gnttab_end_foreign_transfer_ref(grant_ref_t ref) ++{ ++ unsigned long frame; ++ u16 flags; ++ ++ /* ++ * If a transfer is not even yet started, try to reclaim the grant ++ * reference and return failure (== 0). ++ */ ++ while (!((flags = shared[ref].flags) & GTF_transfer_committed)) { ++ if (synch_cmpxchg_subword(&shared[ref].flags, flags, 0) == flags) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* If a transfer is in progress then wait until it is completed. */ ++ while (!(flags & GTF_transfer_completed)) { ++ flags = shared[ref].flags; ++ cpu_relax(); ++ } ++ ++ /* Read the frame number /after/ reading completion status. 
*/ ++ rmb(); ++ frame = shared[ref].frame; ++ BUG_ON(frame == 0); ++ ++ return frame; ++} ++EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer_ref); ++ ++unsigned long gnttab_end_foreign_transfer(grant_ref_t ref) ++{ ++ unsigned long frame = gnttab_end_foreign_transfer_ref(ref); ++ put_free_entry(ref); ++ return frame; ++} ++EXPORT_SYMBOL_GPL(gnttab_end_foreign_transfer); ++ ++void gnttab_free_grant_reference(grant_ref_t ref) ++{ ++ put_free_entry(ref); ++} ++EXPORT_SYMBOL_GPL(gnttab_free_grant_reference); ++ ++void gnttab_free_grant_references(grant_ref_t head) ++{ ++ grant_ref_t ref; ++ unsigned long flags; ++ int count = 1; ++ if (head == GNTTAB_LIST_END) ++ return; ++ spin_lock_irqsave(&gnttab_list_lock, flags); ++ ref = head; ++ while (gnttab_entry(ref) != GNTTAB_LIST_END) { ++ ref = gnttab_entry(ref); ++ count++; ++ } ++ gnttab_entry(ref) = gnttab_free_head; ++ gnttab_free_head = head; ++ gnttab_free_count += count; ++ check_free_callbacks(); ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++} ++EXPORT_SYMBOL_GPL(gnttab_free_grant_references); ++ ++int gnttab_alloc_grant_references(u16 count, grant_ref_t *head) ++{ ++ int h = get_free_entries(count); ++ ++ if (h < 0) ++ return -ENOSPC; ++ ++ *head = h; ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(gnttab_alloc_grant_references); ++ ++int gnttab_empty_grant_references(const grant_ref_t *private_head) ++{ ++ return (*private_head == GNTTAB_LIST_END); ++} ++EXPORT_SYMBOL_GPL(gnttab_empty_grant_references); ++ ++int gnttab_claim_grant_reference(grant_ref_t *private_head) ++{ ++ grant_ref_t g = *private_head; ++ if (unlikely(g == GNTTAB_LIST_END)) ++ return -ENOSPC; ++ *private_head = gnttab_entry(g); ++ return g; ++} ++EXPORT_SYMBOL_GPL(gnttab_claim_grant_reference); ++ ++void gnttab_release_grant_reference(grant_ref_t *private_head, ++ grant_ref_t release) ++{ ++ gnttab_entry(release) = *private_head; ++ *private_head = release; ++} ++EXPORT_SYMBOL_GPL(gnttab_release_grant_reference); ++ ++void gnttab_request_free_callback(struct gnttab_free_callback *callback, ++ void (*fn)(void *), void *arg, u16 count) ++{ ++ unsigned long flags; ++ spin_lock_irqsave(&gnttab_list_lock, flags); ++ if (callback->next) ++ goto out; ++ callback->fn = fn; ++ callback->arg = arg; ++ callback->count = count; ++ callback->next = gnttab_free_callback_list; ++ gnttab_free_callback_list = callback; ++ check_free_callbacks(); ++out: ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++} ++EXPORT_SYMBOL_GPL(gnttab_request_free_callback); ++ ++void gnttab_cancel_free_callback(struct gnttab_free_callback *callback) ++{ ++ struct gnttab_free_callback **pcb; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&gnttab_list_lock, flags); ++ for (pcb = &gnttab_free_callback_list; *pcb; pcb = &(*pcb)->next) { ++ if (*pcb == callback) { ++ *pcb = callback->next; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&gnttab_list_lock, flags); ++} ++EXPORT_SYMBOL_GPL(gnttab_cancel_free_callback); ++ ++static int grow_gnttab_list(unsigned int more_frames) ++{ ++ unsigned int new_nr_grant_frames, extra_entries, i; ++ ++ new_nr_grant_frames = nr_grant_frames + more_frames; ++ extra_entries = more_frames * GREFS_PER_GRANT_FRAME; ++ ++ for (i = nr_grant_frames; i < new_nr_grant_frames; i++) ++ { ++ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_ATOMIC); ++ if (!gnttab_list[i]) ++ goto grow_nomem; ++ } ++ ++ ++ for (i = GREFS_PER_GRANT_FRAME * nr_grant_frames; ++ i < GREFS_PER_GRANT_FRAME * new_nr_grant_frames - 1; i++) ++ gnttab_entry(i) = i + 1; ++ ++ gnttab_entry(i) = gnttab_free_head; 
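++ /* The first of the newly added entries becomes the new free-list head. */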
++ gnttab_free_head = GREFS_PER_GRANT_FRAME * nr_grant_frames; ++ gnttab_free_count += extra_entries; ++ ++ nr_grant_frames = new_nr_grant_frames; ++ ++ check_free_callbacks(); ++ ++ return 0; ++ ++grow_nomem: ++ for ( ; i >= nr_grant_frames; i--) ++ free_page((unsigned long) gnttab_list[i]); ++ return -ENOMEM; ++} ++ ++static unsigned int __max_nr_grant_frames(void) ++{ ++ struct gnttab_query_size query; ++ int rc; ++ ++ query.dom = DOMID_SELF; ++ ++ rc = HYPERVISOR_grant_table_op(GNTTABOP_query_size, &query, 1); ++ if ((rc < 0) || (query.status != GNTST_okay)) ++ return 4; /* Legacy max supported number of frames */ ++ ++ return query.max_nr_frames; ++} ++ ++static inline unsigned int max_nr_grant_frames(void) ++{ ++ unsigned int xen_max = __max_nr_grant_frames(); ++ ++ if (xen_max > boot_max_nr_grant_frames) ++ return boot_max_nr_grant_frames; ++ return xen_max; ++} ++ ++#ifdef CONFIG_XEN ++ ++#ifndef __ia64__ ++static int map_pte_fn(pte_t *pte, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ unsigned long **frames = (unsigned long **)data; ++ ++ set_pte_at(&init_mm, addr, pte, pfn_pte_ma((*frames)[0], PAGE_KERNEL)); ++ (*frames)++; ++ return 0; ++} ++ ++static int unmap_pte_fn(pte_t *pte, struct page *pmd_page, ++ unsigned long addr, void *data) ++{ ++ ++ set_pte_at(&init_mm, addr, pte, __pte(0)); ++ return 0; ++} ++#endif ++ ++static int gnttab_map(unsigned int start_idx, unsigned int end_idx) ++{ ++ struct gnttab_setup_table setup; ++ unsigned long *frames; ++ unsigned int nr_gframes = end_idx + 1; ++ int rc; ++ ++ frames = kmalloc(nr_gframes * sizeof(unsigned long), GFP_ATOMIC); ++ if (!frames) ++ return -ENOMEM; ++ ++ setup.dom = DOMID_SELF; ++ setup.nr_frames = nr_gframes; ++ set_xen_guest_handle(setup.frame_list, frames); ++ ++ rc = HYPERVISOR_grant_table_op(GNTTABOP_setup_table, &setup, 1); ++ if (rc == -ENOSYS) { ++ kfree(frames); ++ return -ENOSYS; ++ } ++ ++ BUG_ON(rc || setup.status); ++ ++#ifndef __ia64__ ++ if (shared == NULL) { ++ struct vm_struct *area; ++ area = alloc_vm_area(PAGE_SIZE * max_nr_grant_frames()); ++ BUG_ON(area == NULL); ++ shared = area->addr; ++ } ++ rc = apply_to_page_range(&init_mm, (unsigned long)shared, ++ PAGE_SIZE * nr_gframes, ++ map_pte_fn, &frames); ++ BUG_ON(rc); ++ frames -= nr_gframes; /* adjust after map_pte_fn() */ ++#else ++ shared = __va(frames[0] << PAGE_SHIFT); ++#endif ++ ++ kfree(frames); ++ ++ return 0; ++} ++ ++int gnttab_resume(void) ++{ ++ if (max_nr_grant_frames() < nr_grant_frames) ++ return -ENOSYS; ++ return gnttab_map(0, nr_grant_frames - 1); ++} ++ ++int gnttab_suspend(void) ++{ ++#ifndef __ia64__ ++ apply_to_page_range(&init_mm, (unsigned long)shared, ++ PAGE_SIZE * nr_grant_frames, ++ unmap_pte_fn, NULL); ++#endif ++ return 0; ++} ++ ++#else /* !CONFIG_XEN */ ++ ++#include <platform-pci.h> ++ ++static unsigned long resume_frames; ++ ++static int gnttab_map(unsigned int start_idx, unsigned int end_idx) ++{ ++ struct xen_add_to_physmap xatp; ++ unsigned int i = end_idx; ++ ++ /* Loop backwards, so that the first hypercall has the largest index, ++ * ensuring that the table will grow only once. 
++ */
++ do {
++ xatp.domid = DOMID_SELF;
++ xatp.idx = i;
++ xatp.space = XENMAPSPACE_grant_table;
++ xatp.gpfn = (resume_frames >> PAGE_SHIFT) + i;
++ if (HYPERVISOR_memory_op(XENMEM_add_to_physmap, &xatp))
++ BUG();
++ } while (i-- > start_idx);
++
++ return 0;
++}
++
++int gnttab_resume(void)
++{
++ unsigned int max_nr_gframes, nr_gframes;
++
++ nr_gframes = nr_grant_frames;
++ max_nr_gframes = max_nr_grant_frames();
++ if (max_nr_gframes < nr_gframes)
++ return -ENOSYS;
++
++ if (!resume_frames) {
++ resume_frames = alloc_xen_mmio(PAGE_SIZE * max_nr_gframes);
++ shared = ioremap(resume_frames, PAGE_SIZE * max_nr_gframes);
++ if (shared == NULL) {
++ printk(KERN_ERR "Failed to ioremap gnttab shared frames\n");
++ return -1;
++ }
++ }
++
++ gnttab_map(0, nr_gframes - 1);
++
++ return 0;
++}
++
++#endif /* !CONFIG_XEN */
++
++static int gnttab_expand(unsigned int req_entries)
++{
++ int rc;
++ unsigned int cur, extra;
++
++ cur = nr_grant_frames;
++ extra = ((req_entries + (GREFS_PER_GRANT_FRAME-1)) /
++ GREFS_PER_GRANT_FRAME);
++ if (cur + extra > max_nr_grant_frames())
++ return -ENOSPC;
++
++ if ((rc = gnttab_map(cur, cur + extra - 1)) == 0)
++ rc = grow_gnttab_list(extra);
++
++ return rc;
++}
++
++int __devinit gnttab_init(void)
++{
++ int i;
++ unsigned int max_nr_glist_frames;
++ unsigned int nr_init_grefs;
++
++ if (!is_running_on_xen())
++ return -ENODEV;
++
++ nr_grant_frames = 1;
++ boot_max_nr_grant_frames = __max_nr_grant_frames();
++
++ /* Determine the maximum number of frames required for the
++ * grant reference free list on the current hypervisor.
++ */
++ max_nr_glist_frames = (boot_max_nr_grant_frames *
++ GREFS_PER_GRANT_FRAME /
++ (PAGE_SIZE / sizeof(grant_ref_t)));
++
++ gnttab_list = kmalloc(max_nr_glist_frames * sizeof(grant_ref_t *),
++ GFP_KERNEL);
++ if (gnttab_list == NULL)
++ return -ENOMEM;
++
++ for (i = 0; i < nr_grant_frames; i++) {
++ gnttab_list[i] = (grant_ref_t *)__get_free_page(GFP_KERNEL);
++ if (gnttab_list[i] == NULL)
++ goto ini_nomem;
++ }
++
++ if (gnttab_resume() < 0)
++ return -ENODEV;
++
++ nr_init_grefs = nr_grant_frames * GREFS_PER_GRANT_FRAME;
++
++ for (i = NR_RESERVED_ENTRIES; i < nr_init_grefs - 1; i++)
++ gnttab_entry(i) = i + 1;
++
++ gnttab_entry(nr_init_grefs - 1) = GNTTAB_LIST_END;
++ gnttab_free_count = nr_init_grefs - NR_RESERVED_ENTRIES;
++ gnttab_free_head = NR_RESERVED_ENTRIES;
++
++ return 0;
++
++ ini_nomem:
++ for (i--; i >= 0; i--)
++ free_page((unsigned long)gnttab_list[i]);
++ kfree(gnttab_list);
++ return -ENOMEM;
++}
++
++#ifdef CONFIG_XEN
++core_initcall(gnttab_init);
++#endif
+--- /dev/null 1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/core/hypervisor_sysfs.c 2007-08-27 14:02:04.000000000 -0400
+@@ -0,0 +1,59 @@
++/*
++ * copyright (c) 2006 IBM Corporation
++ * Authored by: Mike D. Day <ncmike@us.ibm.com>
++ *
++ * This program is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License version 2 as
++ * published by the Free Software Foundation.
++ */ ++ ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/kobject.h> ++#include <xen/hypervisor_sysfs.h> ++ ++decl_subsys(hypervisor, NULL, NULL); ++ ++static ssize_t hyp_sysfs_show(struct kobject *kobj, ++ struct attribute *attr, ++ char *buffer) ++{ ++ struct hyp_sysfs_attr *hyp_attr; ++ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); ++ if (hyp_attr->show) ++ return hyp_attr->show(hyp_attr, buffer); ++ return 0; ++} ++ ++static ssize_t hyp_sysfs_store(struct kobject *kobj, ++ struct attribute *attr, ++ const char *buffer, ++ size_t len) ++{ ++ struct hyp_sysfs_attr *hyp_attr; ++ hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); ++ if (hyp_attr->store) ++ return hyp_attr->store(hyp_attr, buffer, len); ++ return 0; ++} ++ ++struct sysfs_ops hyp_sysfs_ops = { ++ .show = hyp_sysfs_show, ++ .store = hyp_sysfs_store, ++}; ++ ++static struct kobj_type hyp_sysfs_kobj_type = { ++ .sysfs_ops = &hyp_sysfs_ops, ++}; ++ ++static int __init hypervisor_subsys_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ hypervisor_subsys.kset.kobj.ktype = &hyp_sysfs_kobj_type; ++ return subsystem_register(&hypervisor_subsys); ++} ++ ++device_initcall(hypervisor_subsys_init); ++EXPORT_SYMBOL_GPL(hypervisor_subsys); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/machine_kexec.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,189 @@ ++/* ++ * drivers/xen/core/machine_kexec.c ++ * handle transition of Linux booting another kernel ++ */ ++ ++#include <linux/kexec.h> ++#include <xen/interface/kexec.h> ++#include <linux/mm.h> ++#include <linux/bootmem.h> ++ ++extern void machine_kexec_setup_load_arg(xen_kexec_image_t *xki, ++ struct kimage *image); ++ ++int xen_max_nr_phys_cpus; ++struct resource xen_hypervisor_res; ++struct resource *xen_phys_cpus; ++ ++void xen_machine_kexec_setup_resources(void) ++{ ++ xen_kexec_range_t range; ++ struct resource *res; ++ int k = 0; ++ ++ if (!is_initial_xendomain()) ++ return; ++ ++ /* determine maximum number of physical cpus */ ++ ++ while (1) { ++ memset(&range, 0, sizeof(range)); ++ range.range = KEXEC_RANGE_MA_CPU; ++ range.nr = k; ++ ++ if(HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) ++ break; ++ ++ k++; ++ } ++ ++ if (k == 0) ++ return; ++ ++ xen_max_nr_phys_cpus = k; ++ ++ /* allocate xen_phys_cpus */ ++ ++ xen_phys_cpus = alloc_bootmem_low(k * sizeof(struct resource)); ++ BUG_ON(xen_phys_cpus == NULL); ++ ++ /* fill in xen_phys_cpus with per-cpu crash note information */ ++ ++ for (k = 0; k < xen_max_nr_phys_cpus; k++) { ++ memset(&range, 0, sizeof(range)); ++ range.range = KEXEC_RANGE_MA_CPU; ++ range.nr = k; ++ ++ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) ++ goto err; ++ ++ res = xen_phys_cpus + k; ++ ++ memset(res, 0, sizeof(*res)); ++ res->name = "Crash note"; ++ res->start = range.start; ++ res->end = range.start + range.size - 1; ++ res->flags = IORESOURCE_BUSY | IORESOURCE_MEM; ++ } ++ ++ /* fill in xen_hypervisor_res with hypervisor machine address range */ ++ ++ memset(&range, 0, sizeof(range)); ++ range.range = KEXEC_RANGE_MA_XEN; ++ ++ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range)) ++ goto err; ++ ++ xen_hypervisor_res.name = "Hypervisor code and data"; ++ xen_hypervisor_res.start = range.start; ++ xen_hypervisor_res.end = range.start + range.size - 1; ++ xen_hypervisor_res.flags = IORESOURCE_BUSY | IORESOURCE_MEM; ++ ++ /* fill in crashk_res if range is reserved by hypervisor */ ++ ++ memset(&range, 0, sizeof(range)); ++ 
range.range = KEXEC_RANGE_MA_CRASH;
++
++ if (HYPERVISOR_kexec_op(KEXEC_CMD_kexec_get_range, &range))
++ return;
++
++ if (range.size) {
++ crashk_res.start = range.start;
++ crashk_res.end = range.start + range.size - 1;
++ }
++
++ return;
++
++ err:
++ /*
++ * It isn't possible to free xen_phys_cpus this early in the
++ * boot. Failure at this stage is unexpected and the amount of
++ * memory is small, therefore we tolerate the potential leak.
++ */
++ xen_max_nr_phys_cpus = 0;
++ return;
++}
++
++void xen_machine_kexec_register_resources(struct resource *res)
++{
++ int k;
++
++ request_resource(res, &xen_hypervisor_res);
++
++ for (k = 0; k < xen_max_nr_phys_cpus; k++)
++ request_resource(&xen_hypervisor_res, xen_phys_cpus + k);
++
++}
++
++static void setup_load_arg(xen_kexec_image_t *xki, struct kimage *image)
++{
++ machine_kexec_setup_load_arg(xki, image);
++
++ xki->indirection_page = image->head;
++ xki->start_address = image->start;
++}
++
++/*
++ * Load the image into Xen so Xen can kdump itself.
++ * This might have been done in prepare, but prepare
++ * is currently called too early. It might make sense
++ * to move prepare, but for now, just add an extra hook.
++ */
++int xen_machine_kexec_load(struct kimage *image)
++{
++ xen_kexec_load_t xkl;
++
++ memset(&xkl, 0, sizeof(xkl));
++ xkl.type = image->type;
++ setup_load_arg(&xkl.image, image);
++ return HYPERVISOR_kexec_op(KEXEC_CMD_kexec_load, &xkl);
++}
++
++/*
++ * Unload the image that was stored by machine_kexec_load().
++ * This might have been done in machine_kexec_cleanup(), but it
++ * is called too late, and it's possible Xen could try to kdump
++ * using resources that have been freed.
++ */
++void xen_machine_kexec_unload(struct kimage *image)
++{
++ xen_kexec_load_t xkl;
++
++ memset(&xkl, 0, sizeof(xkl));
++ xkl.type = image->type;
++ HYPERVISOR_kexec_op(KEXEC_CMD_kexec_unload, &xkl);
++}
++
++/*
++ * Do not allocate memory (or fail in any way) in machine_kexec().
++ * We are past the point of no return, committed to rebooting now.
++ *
++ * This has the hypervisor move to the preferred reboot CPU,
++ * stop all CPUs and kexec. That is, it combines machine_shutdown()
++ * and machine_kexec() in Linux kexec terms.
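++ *
++ * On success the KEXEC_CMD_kexec hypercall does not return; the panic()
++ * below is reached only if the hypercall fails.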
++ */ ++NORET_TYPE void machine_kexec(struct kimage *image) ++{ ++ xen_kexec_exec_t xke; ++ ++ memset(&xke, 0, sizeof(xke)); ++ xke.type = image->type; ++ HYPERVISOR_kexec_op(KEXEC_CMD_kexec, &xke); ++ panic("KEXEC_CMD_kexec hypercall should not return\n"); ++} ++ ++void machine_shutdown(void) ++{ ++ /* do nothing */ ++} ++ ++ ++/* ++ * Local variables: ++ * c-file-style: "linux" ++ * indent-tabs-mode: t ++ * c-indent-level: 8 ++ * c-basic-offset: 8 ++ * tab-width: 8 ++ * End: ++ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/machine_reboot.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,241 @@ ++#include <linux/version.h> ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/unistd.h> ++#include <linux/module.h> ++#include <linux/reboot.h> ++#include <linux/sysrq.h> ++#include <linux/stringify.h> ++#include <linux/stop_machine.h> ++#include <asm/irq.h> ++#include <asm/mmu_context.h> ++#include <xen/evtchn.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <linux/cpu.h> ++#include <linux/kthread.h> ++#include <xen/gnttab.h> ++#include <xen/xencons.h> ++#include <xen/cpu_hotplug.h> ++#include <xen/interface/vcpu.h> ++ ++#if defined(__i386__) || defined(__x86_64__) ++ ++/* ++ * Power off function, if any ++ */ ++void (*pm_power_off)(void); ++EXPORT_SYMBOL(pm_power_off); ++ ++void machine_emergency_restart(void) ++{ ++ /* We really want to get pending console data out before we die. */ ++ xencons_force_flush(); ++ HYPERVISOR_shutdown(SHUTDOWN_reboot); ++} ++ ++void machine_restart(char * __unused) ++{ ++ machine_emergency_restart(); ++} ++ ++void machine_halt(void) ++{ ++ machine_power_off(); ++} ++ ++void machine_power_off(void) ++{ ++ /* We really want to get pending console data out before we die. 
*/ ++ xencons_force_flush(); ++ if (pm_power_off) ++ pm_power_off(); ++ HYPERVISOR_shutdown(SHUTDOWN_poweroff); ++} ++ ++int reboot_thru_bios = 0; /* for dmi_scan.c */ ++EXPORT_SYMBOL(machine_restart); ++EXPORT_SYMBOL(machine_halt); ++EXPORT_SYMBOL(machine_power_off); ++ ++static void pre_suspend(void) ++{ ++ HYPERVISOR_shared_info = (shared_info_t *)empty_zero_page; ++ HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO), ++ __pte_ma(0), 0); ++ ++ xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn); ++ xen_start_info->console.domU.mfn = ++ mfn_to_pfn(xen_start_info->console.domU.mfn); ++} ++ ++static void post_suspend(int suspend_cancelled) ++{ ++ int i, j, k, fpp; ++ unsigned long shinfo_mfn; ++ extern unsigned long max_pfn; ++ extern unsigned long *pfn_to_mfn_frame_list_list; ++ extern unsigned long *pfn_to_mfn_frame_list[]; ++ ++ if (suspend_cancelled) { ++ xen_start_info->store_mfn = ++ pfn_to_mfn(xen_start_info->store_mfn); ++ xen_start_info->console.domU.mfn = ++ pfn_to_mfn(xen_start_info->console.domU.mfn); ++ } else { ++#ifdef CONFIG_SMP ++ cpu_initialized_map = cpu_online_map; ++#endif ++ } ++ ++ shinfo_mfn = xen_start_info->shared_info >> PAGE_SHIFT; ++ HYPERVISOR_update_va_mapping(fix_to_virt(FIX_SHARED_INFO), ++ pfn_pte_ma(shinfo_mfn, PAGE_KERNEL), 0); ++ HYPERVISOR_shared_info = (shared_info_t *)fix_to_virt(FIX_SHARED_INFO); ++ ++ memset(empty_zero_page, 0, PAGE_SIZE); ++ ++ fpp = PAGE_SIZE/sizeof(unsigned long); ++ for (i = 0, j = 0, k = -1; i < max_pfn; i += fpp, j++) { ++ if ((j % fpp) == 0) { ++ k++; ++ pfn_to_mfn_frame_list_list[k] = ++ virt_to_mfn(pfn_to_mfn_frame_list[k]); ++ j = 0; ++ } ++ pfn_to_mfn_frame_list[k][j] = ++ virt_to_mfn(&phys_to_machine_mapping[i]); ++ } ++ HYPERVISOR_shared_info->arch.max_pfn = max_pfn; ++ HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ++ virt_to_mfn(pfn_to_mfn_frame_list_list); ++} ++ ++#else /* !(defined(__i386__) || defined(__x86_64__)) */ ++ ++#ifndef HAVE_XEN_PRE_SUSPEND ++#define xen_pre_suspend() ((void)0) ++#endif ++ ++#ifndef HAVE_XEN_POST_SUSPEND ++#define xen_post_suspend(x) ((void)0) ++#endif ++ ++#define switch_idle_mm() ((void)0) ++#define mm_pin_all() ((void)0) ++#define pre_suspend() xen_pre_suspend() ++#define post_suspend(x) xen_post_suspend(x) ++ ++#endif ++ ++static int take_machine_down(void *p_fast_suspend) ++{ ++ int fast_suspend = *(int *)p_fast_suspend; ++ int suspend_cancelled, err; ++ extern void time_resume(void); ++ ++ if (fast_suspend) { ++ BUG_ON(!irqs_disabled()); ++ } else { ++ BUG_ON(irqs_disabled()); ++ ++ for (;;) { ++ err = smp_suspend(); ++ if (err) ++ return err; ++ ++ xenbus_suspend(); ++ preempt_disable(); ++ ++ if (num_online_cpus() == 1) ++ break; ++ ++ preempt_enable(); ++ xenbus_suspend_cancel(); ++ } ++ ++ local_irq_disable(); ++ } ++ ++ mm_pin_all(); ++ gnttab_suspend(); ++ pre_suspend(); ++ ++ /* ++ * This hypercall returns 1 if suspend was cancelled or the domain was ++ * merely checkpointed, and 0 if it is resuming in a new domain. ++ */ ++ suspend_cancelled = HYPERVISOR_suspend(virt_to_mfn(xen_start_info)); ++ ++ post_suspend(suspend_cancelled); ++ gnttab_resume(); ++ if (!suspend_cancelled) { ++ irq_resume(); ++#ifdef __x86_64__ ++ /* ++ * Older versions of Xen do not save/restore the user %cr3. ++ * We do it here just in case, but there's no need if we are ++ * in fast-suspend mode as that implies a new enough Xen. 
++ */ ++ if (!fast_suspend) { ++ struct mmuext_op op; ++ op.cmd = MMUEXT_NEW_USER_BASEPTR; ++ op.arg1.mfn = pfn_to_mfn(__pa(__user_pgd( ++ current->active_mm->pgd)) >> PAGE_SHIFT); ++ if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) ++ BUG(); ++ } ++#endif ++ } ++ time_resume(); ++ ++ if (!fast_suspend) ++ local_irq_enable(); ++ ++ return suspend_cancelled; ++} ++ ++int __xen_suspend(int fast_suspend) ++{ ++ int err, suspend_cancelled; ++ ++ BUG_ON(smp_processor_id() != 0); ++ BUG_ON(in_interrupt()); ++ ++#if defined(__i386__) || defined(__x86_64__) ++ if (xen_feature(XENFEAT_auto_translated_physmap)) { ++ printk(KERN_WARNING "Cannot suspend in " ++ "auto_translated_physmap mode.\n"); ++ return -EOPNOTSUPP; ++ } ++#endif ++ ++ /* If we are definitely UP then 'slow mode' is actually faster. */ ++ if (num_possible_cpus() == 1) ++ fast_suspend = 0; ++ ++ if (fast_suspend) { ++ xenbus_suspend(); ++ err = stop_machine_run(take_machine_down, &fast_suspend, 0); ++ if (err < 0) ++ xenbus_suspend_cancel(); ++ } else { ++ err = take_machine_down(&fast_suspend); ++ } ++ ++ if (err < 0) ++ return err; ++ ++ suspend_cancelled = err; ++ if (!suspend_cancelled) { ++ xencons_resume(); ++ xenbus_resume(); ++ } else { ++ xenbus_suspend_cancel(); ++ } ++ ++ if (!fast_suspend) ++ smp_resume(); ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/reboot.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,249 @@ ++#define __KERNEL_SYSCALLS__ ++#include <linux/version.h> ++#include <linux/kernel.h> ++#include <linux/unistd.h> ++#include <linux/module.h> ++#include <linux/reboot.h> ++#include <linux/sysrq.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <linux/kthread.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++MODULE_LICENSE("Dual BSD/GPL"); ++ ++#define SHUTDOWN_INVALID -1 ++#define SHUTDOWN_POWEROFF 0 ++#define SHUTDOWN_SUSPEND 2 ++/* Code 3 is SHUTDOWN_CRASH, which we don't use because the domain can only ++ * report a crash, not be instructed to crash! ++ * HALT is the same as POWEROFF, as far as we're concerned. The tools use ++ * the distinction when we return the reason code to them. ++ */ ++#define SHUTDOWN_HALT 4 ++ ++/* Ignore multiple shutdown requests. */ ++static int shutting_down = SHUTDOWN_INVALID; ++ ++/* Can we leave APs online when we suspend? 
*/ ++static int fast_suspend; ++ ++static void __shutdown_handler(void *unused); ++static DECLARE_WORK(shutdown_work, __shutdown_handler, NULL); ++ ++int __xen_suspend(int fast_suspend); ++ ++static int shutdown_process(void *__unused) ++{ ++ static char *envp[] = { "HOME=/", "TERM=linux", ++ "PATH=/sbin:/usr/sbin:/bin:/usr/bin", NULL }; ++ static char *poweroff_argv[] = { "/sbin/poweroff", NULL }; ++ ++ extern asmlinkage long sys_reboot(int magic1, int magic2, ++ unsigned int cmd, void *arg); ++ ++ if ((shutting_down == SHUTDOWN_POWEROFF) || ++ (shutting_down == SHUTDOWN_HALT)) { ++ if (call_usermodehelper("/sbin/poweroff", poweroff_argv, ++ envp, 0) < 0) { ++#ifdef CONFIG_XEN ++ sys_reboot(LINUX_REBOOT_MAGIC1, ++ LINUX_REBOOT_MAGIC2, ++ LINUX_REBOOT_CMD_POWER_OFF, ++ NULL); ++#endif /* CONFIG_XEN */ ++ } ++ } ++ ++ shutting_down = SHUTDOWN_INVALID; /* could try again */ ++ ++ return 0; ++} ++ ++static int xen_suspend(void *__unused) ++{ ++ int err = __xen_suspend(fast_suspend); ++ if (err) ++ printk(KERN_ERR "Xen suspend failed (%d)\n", err); ++ shutting_down = SHUTDOWN_INVALID; ++ return 0; ++} ++ ++static int kthread_create_on_cpu(int (*f)(void *arg), ++ void *arg, ++ const char *name, ++ int cpu) ++{ ++ struct task_struct *p; ++ p = kthread_create(f, arg, name); ++ if (IS_ERR(p)) ++ return PTR_ERR(p); ++ kthread_bind(p, cpu); ++ wake_up_process(p); ++ return 0; ++} ++ ++static void __shutdown_handler(void *unused) ++{ ++ int err; ++ ++ if (shutting_down != SHUTDOWN_SUSPEND) ++ err = kernel_thread(shutdown_process, NULL, ++ CLONE_FS | CLONE_FILES); ++ else ++ err = kthread_create_on_cpu(xen_suspend, NULL, "suspend", 0); ++ ++ if (err < 0) { ++ printk(KERN_WARNING "Error creating shutdown process (%d): " ++ "retrying...\n", -err); ++ schedule_delayed_work(&shutdown_work, HZ/2); ++ } ++} ++ ++static void shutdown_handler(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ extern void ctrl_alt_del(void); ++ char *str; ++ struct xenbus_transaction xbt; ++ int err; ++ ++ if (shutting_down != SHUTDOWN_INVALID) ++ return; ++ ++ again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) ++ return; ++ ++ str = (char *)xenbus_read(xbt, "control", "shutdown", NULL); ++ /* Ignore read errors and empty reads. 
*/ ++ if (XENBUS_IS_ERR_READ(str)) { ++ xenbus_transaction_end(xbt, 1); ++ return; ++ } ++ ++ xenbus_write(xbt, "control", "shutdown", ""); ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err == -EAGAIN) { ++ kfree(str); ++ goto again; ++ } ++ ++ if (strcmp(str, "poweroff") == 0) ++ shutting_down = SHUTDOWN_POWEROFF; ++ else if (strcmp(str, "reboot") == 0) ++ ctrl_alt_del(); ++ else if (strcmp(str, "suspend") == 0) ++ shutting_down = SHUTDOWN_SUSPEND; ++ else if (strcmp(str, "halt") == 0) ++ shutting_down = SHUTDOWN_HALT; ++ else { ++ printk("Ignoring shutdown request: %s\n", str); ++ shutting_down = SHUTDOWN_INVALID; ++ } ++ ++ if (shutting_down != SHUTDOWN_INVALID) ++ schedule_work(&shutdown_work); ++ ++ kfree(str); ++} ++ ++static void sysrq_handler(struct xenbus_watch *watch, const char **vec, ++ unsigned int len) ++{ ++ char sysrq_key = '\0'; ++ struct xenbus_transaction xbt; ++ int err; ++ ++ again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) ++ return; ++ if (!xenbus_scanf(xbt, "control", "sysrq", "%c", &sysrq_key)) { ++ printk(KERN_ERR "Unable to read sysrq code in " ++ "control/sysrq\n"); ++ xenbus_transaction_end(xbt, 1); ++ return; ++ } ++ ++ if (sysrq_key != '\0') ++ xenbus_printf(xbt, "control", "sysrq", "%c", '\0'); ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err == -EAGAIN) ++ goto again; ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++ if (sysrq_key != '\0') ++ handle_sysrq(sysrq_key, NULL, NULL); ++#endif ++} ++ ++static struct xenbus_watch shutdown_watch = { ++ .node = "control/shutdown", ++ .callback = shutdown_handler ++}; ++ ++static struct xenbus_watch sysrq_watch = { ++ .node = "control/sysrq", ++ .callback = sysrq_handler ++}; ++ ++static int setup_shutdown_watcher(void) ++{ ++ int err; ++ ++ xenbus_scanf(XBT_NIL, "control", ++ "platform-feature-multiprocessor-suspend", ++ "%d", &fast_suspend); ++ ++ err = register_xenbus_watch(&shutdown_watch); ++ if (err) { ++ printk(KERN_ERR "Failed to set shutdown watcher\n"); ++ return err; ++ } ++ ++ err = register_xenbus_watch(&sysrq_watch); ++ if (err) { ++ printk(KERN_ERR "Failed to set sysrq watcher\n"); ++ return err; ++ } ++ ++ return 0; ++} ++ ++#ifdef CONFIG_XEN ++ ++static int shutdown_event(struct notifier_block *notifier, ++ unsigned long event, ++ void *data) ++{ ++ setup_shutdown_watcher(); ++ return NOTIFY_DONE; ++} ++ ++static int __init setup_shutdown_event(void) ++{ ++ static struct notifier_block xenstore_notifier = { ++ .notifier_call = shutdown_event ++ }; ++ register_xenstore_notifier(&xenstore_notifier); ++ ++ return 0; ++} ++ ++subsys_initcall(setup_shutdown_event); ++ ++#else /* !defined(CONFIG_XEN) */ ++ ++int xen_reboot_init(void) ++{ ++ return setup_shutdown_watcher(); ++} ++ ++#endif /* !defined(CONFIG_XEN) */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/smpboot.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,452 @@ ++/* ++ * Xen SMP booting functions ++ * ++ * See arch/i386/kernel/smpboot.c for copyright and credits for derived ++ * portions of this file. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/kernel.h> ++#include <linux/mm.h> ++#include <linux/sched.h> ++#include <linux/kernel_stat.h> ++#include <linux/smp_lock.h> ++#include <linux/irq.h> ++#include <linux/bootmem.h> ++#include <linux/notifier.h> ++#include <linux/cpu.h> ++#include <linux/percpu.h> ++#include <asm/desc.h> ++#include <asm/arch_hooks.h> ++#include <asm/pgalloc.h> ++#include <xen/evtchn.h> ++#include <xen/interface/vcpu.h> ++#include <xen/cpu_hotplug.h> ++#include <xen/xenbus.h> ++ ++extern irqreturn_t smp_reschedule_interrupt(int, void *, struct pt_regs *); ++extern irqreturn_t smp_call_function_interrupt(int, void *, struct pt_regs *); ++ ++extern int local_setup_timer(unsigned int cpu); ++extern void local_teardown_timer(unsigned int cpu); ++ ++extern void hypervisor_callback(void); ++extern void failsafe_callback(void); ++extern void system_call(void); ++extern void smp_trap_init(trap_info_t *); ++ ++/* Number of siblings per CPU package */ ++int smp_num_siblings = 1; ++int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ ++EXPORT_SYMBOL(phys_proc_id); ++int cpu_core_id[NR_CPUS]; /* Core ID of each logical CPU */ ++EXPORT_SYMBOL(cpu_core_id); ++ ++cpumask_t cpu_online_map; ++EXPORT_SYMBOL(cpu_online_map); ++cpumask_t cpu_possible_map; ++EXPORT_SYMBOL(cpu_possible_map); ++cpumask_t cpu_initialized_map; ++ ++struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; ++EXPORT_SYMBOL(cpu_data); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++DEFINE_PER_CPU(int, cpu_state) = { 0 }; ++#endif ++ ++static DEFINE_PER_CPU(int, resched_irq); ++static DEFINE_PER_CPU(int, callfunc_irq); ++static char resched_name[NR_CPUS][15]; ++static char callfunc_name[NR_CPUS][15]; ++ ++u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; ++ ++void *xquad_portio; ++ ++cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; ++cpumask_t cpu_core_map[NR_CPUS] __cacheline_aligned; ++EXPORT_SYMBOL(cpu_core_map); ++ ++#if defined(__i386__) ++u8 x86_cpu_to_apicid[NR_CPUS] = { [0 ... 
NR_CPUS-1] = 0xff }; ++EXPORT_SYMBOL(x86_cpu_to_apicid); ++#elif !defined(CONFIG_X86_IO_APIC) ++unsigned int maxcpus = NR_CPUS; ++#endif ++ ++void __init prefill_possible_map(void) ++{ ++ int i, rc; ++ ++ for_each_possible_cpu(i) ++ if (i != smp_processor_id()) ++ return; ++ ++ for (i = 0; i < NR_CPUS; i++) { ++ rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); ++ if (rc >= 0) ++ cpu_set(i, cpu_possible_map); ++ } ++} ++ ++void __init smp_alloc_memory(void) ++{ ++} ++ ++static inline void ++set_cpu_sibling_map(int cpu) ++{ ++ phys_proc_id[cpu] = cpu; ++ cpu_core_id[cpu] = 0; ++ ++ cpu_sibling_map[cpu] = cpumask_of_cpu(cpu); ++ cpu_core_map[cpu] = cpumask_of_cpu(cpu); ++ ++ cpu_data[cpu].booted_cores = 1; ++} ++ ++static void ++remove_siblinginfo(int cpu) ++{ ++ phys_proc_id[cpu] = BAD_APICID; ++ cpu_core_id[cpu] = BAD_APICID; ++ ++ cpus_clear(cpu_sibling_map[cpu]); ++ cpus_clear(cpu_core_map[cpu]); ++ ++ cpu_data[cpu].booted_cores = 0; ++} ++ ++static int xen_smp_intr_init(unsigned int cpu) ++{ ++ int rc; ++ ++ per_cpu(resched_irq, cpu) = per_cpu(callfunc_irq, cpu) = -1; ++ ++ sprintf(resched_name[cpu], "resched%d", cpu); ++ rc = bind_ipi_to_irqhandler(RESCHEDULE_VECTOR, ++ cpu, ++ smp_reschedule_interrupt, ++ SA_INTERRUPT, ++ resched_name[cpu], ++ NULL); ++ if (rc < 0) ++ goto fail; ++ per_cpu(resched_irq, cpu) = rc; ++ ++ sprintf(callfunc_name[cpu], "callfunc%d", cpu); ++ rc = bind_ipi_to_irqhandler(CALL_FUNCTION_VECTOR, ++ cpu, ++ smp_call_function_interrupt, ++ SA_INTERRUPT, ++ callfunc_name[cpu], ++ NULL); ++ if (rc < 0) ++ goto fail; ++ per_cpu(callfunc_irq, cpu) = rc; ++ ++ if ((cpu != 0) && ((rc = local_setup_timer(cpu)) != 0)) ++ goto fail; ++ ++ return 0; ++ ++ fail: ++ if (per_cpu(resched_irq, cpu) >= 0) ++ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); ++ if (per_cpu(callfunc_irq, cpu) >= 0) ++ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); ++ return rc; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void xen_smp_intr_exit(unsigned int cpu) ++{ ++ if (cpu != 0) ++ local_teardown_timer(cpu); ++ ++ unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); ++ unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); ++} ++#endif ++ ++void cpu_bringup(void) ++{ ++ cpu_init(); ++ touch_softlockup_watchdog(); ++ preempt_disable(); ++ local_irq_enable(); ++} ++ ++static void cpu_bringup_and_idle(void) ++{ ++ cpu_bringup(); ++ cpu_idle(); ++} ++ ++static void cpu_initialize_context(unsigned int cpu) ++{ ++ vcpu_guest_context_t ctxt; ++ struct task_struct *idle = idle_task(cpu); ++#ifdef __x86_64__ ++ struct desc_ptr *gdt_descr = &cpu_gdt_descr[cpu]; ++#else ++ struct Xgt_desc_struct *gdt_descr = &per_cpu(cpu_gdt_descr, cpu); ++#endif ++ ++ if (cpu_test_and_set(cpu, cpu_initialized_map)) ++ return; ++ ++ memset(&ctxt, 0, sizeof(ctxt)); ++ ++ ctxt.flags = VGCF_IN_KERNEL; ++ ctxt.user_regs.ds = __USER_DS; ++ ctxt.user_regs.es = __USER_DS; ++ ctxt.user_regs.fs = 0; ++ ctxt.user_regs.gs = 0; ++ ctxt.user_regs.ss = __KERNEL_DS; ++ ctxt.user_regs.eip = (unsigned long)cpu_bringup_and_idle; ++ ctxt.user_regs.eflags = X86_EFLAGS_IF | 0x1000; /* IOPL_RING1 */ ++ ++ memset(&ctxt.fpu_ctxt, 0, sizeof(ctxt.fpu_ctxt)); ++ ++ smp_trap_init(ctxt.trap_ctxt); ++ ++ ctxt.ldt_ents = 0; ++ ++ ctxt.gdt_frames[0] = virt_to_mfn(gdt_descr->address); ++ ctxt.gdt_ents = gdt_descr->size / 8; ++ ++#ifdef __i386__ ++ ctxt.user_regs.cs = __KERNEL_CS; ++ ctxt.user_regs.esp = idle->thread.esp0 - sizeof(struct pt_regs); ++ ++ ctxt.kernel_ss = __KERNEL_DS; ++ ctxt.kernel_sp = idle->thread.esp0; ++ ++ 
ctxt.event_callback_cs = __KERNEL_CS; ++ ctxt.event_callback_eip = (unsigned long)hypervisor_callback; ++ ctxt.failsafe_callback_cs = __KERNEL_CS; ++ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ++ ++ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir)); ++#else /* __x86_64__ */ ++ ctxt.user_regs.cs = __KERNEL_CS; ++ ctxt.user_regs.esp = idle->thread.rsp0 - sizeof(struct pt_regs); ++ ++ ctxt.kernel_ss = __KERNEL_DS; ++ ctxt.kernel_sp = idle->thread.rsp0; ++ ++ ctxt.event_callback_eip = (unsigned long)hypervisor_callback; ++ ctxt.failsafe_callback_eip = (unsigned long)failsafe_callback; ++ ctxt.syscall_callback_eip = (unsigned long)system_call; ++ ++ ctxt.ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(init_level4_pgt)); ++ ++ ctxt.gs_base_kernel = (unsigned long)(cpu_pda(cpu)); ++#endif ++ ++ BUG_ON(HYPERVISOR_vcpu_op(VCPUOP_initialise, cpu, &ctxt)); ++} ++ ++void __init smp_prepare_cpus(unsigned int max_cpus) ++{ ++ int cpu; ++ struct task_struct *idle; ++#ifdef __x86_64__ ++ struct desc_ptr *gdt_descr; ++#else ++ struct Xgt_desc_struct *gdt_descr; ++#endif ++ ++ boot_cpu_data.apicid = 0; ++ cpu_data[0] = boot_cpu_data; ++ ++ cpu_2_logical_apicid[0] = 0; ++ x86_cpu_to_apicid[0] = 0; ++ ++ current_thread_info()->cpu = 0; ++ ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ cpus_clear(cpu_sibling_map[cpu]); ++ cpus_clear(cpu_core_map[cpu]); ++ } ++ ++ set_cpu_sibling_map(0); ++ ++ if (xen_smp_intr_init(0)) ++ BUG(); ++ ++ cpu_initialized_map = cpumask_of_cpu(0); ++ ++ /* Restrict the possible_map according to max_cpus. */ ++ while ((num_possible_cpus() > 1) && (num_possible_cpus() > max_cpus)) { ++ for (cpu = NR_CPUS-1; !cpu_isset(cpu, cpu_possible_map); cpu--) ++ continue; ++ cpu_clear(cpu, cpu_possible_map); ++ } ++ ++ for_each_possible_cpu (cpu) { ++ if (cpu == 0) ++ continue; ++ ++#ifdef __x86_64__ ++ gdt_descr = &cpu_gdt_descr[cpu]; ++#else ++ gdt_descr = &per_cpu(cpu_gdt_descr, cpu); ++#endif ++ gdt_descr->address = get_zeroed_page(GFP_KERNEL); ++ if (unlikely(!gdt_descr->address)) { ++ printk(KERN_CRIT "CPU%d failed to allocate GDT\n", ++ cpu); ++ continue; ++ } ++ gdt_descr->size = GDT_SIZE; ++ memcpy((void *)gdt_descr->address, cpu_gdt_table, GDT_SIZE); ++ make_page_readonly( ++ (void *)gdt_descr->address, ++ XENFEAT_writable_descriptor_tables); ++ ++ cpu_data[cpu] = boot_cpu_data; ++ cpu_data[cpu].apicid = cpu; ++ ++ cpu_2_logical_apicid[cpu] = cpu; ++ x86_cpu_to_apicid[cpu] = cpu; ++ ++ idle = fork_idle(cpu); ++ if (IS_ERR(idle)) ++ panic("failed fork for CPU %d", cpu); ++ ++#ifdef __x86_64__ ++ cpu_pda(cpu)->pcurrent = idle; ++ cpu_pda(cpu)->cpunumber = cpu; ++ clear_ti_thread_flag(idle->thread_info, TIF_FORK); ++#endif ++ ++ irq_ctx_init(cpu); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ if (is_initial_xendomain()) ++ cpu_set(cpu, cpu_present_map); ++#else ++ cpu_set(cpu, cpu_present_map); ++#endif ++ } ++ ++ init_xenbus_allowed_cpumask(); ++ ++#ifdef CONFIG_X86_IO_APIC ++ /* ++ * Here we can be sure that there is an IO-APIC in the system. Let's ++ * go and set it up: ++ */ ++ if (!skip_ioapic_setup && nr_ioapics) ++ setup_IO_APIC(); ++#endif ++} ++ ++void __devinit smp_prepare_boot_cpu(void) ++{ ++ prefill_possible_map(); ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++ ++/* ++ * Initialize cpu_present_map late to skip SMP boot code in init/main.c. ++ * But do it early enough to catch critical for_each_present_cpu() loops ++ * in i386-specific code. 
++ */ ++static int __init initialize_cpu_present_map(void) ++{ ++ cpu_present_map = cpu_possible_map; ++ return 0; ++} ++core_initcall(initialize_cpu_present_map); ++ ++int __cpu_disable(void) ++{ ++ cpumask_t map = cpu_online_map; ++ int cpu = smp_processor_id(); ++ ++ if (cpu == 0) ++ return -EBUSY; ++ ++ remove_siblinginfo(cpu); ++ ++ cpu_clear(cpu, map); ++ fixup_irqs(map); ++ cpu_clear(cpu, cpu_online_map); ++ ++ return 0; ++} ++ ++void __cpu_die(unsigned int cpu) ++{ ++ while (HYPERVISOR_vcpu_op(VCPUOP_is_up, cpu, NULL)) { ++ current->state = TASK_UNINTERRUPTIBLE; ++ schedule_timeout(HZ/10); ++ } ++ ++ xen_smp_intr_exit(cpu); ++ ++ if (num_online_cpus() == 1) ++ alternatives_smp_switch(0); ++} ++ ++#else /* !CONFIG_HOTPLUG_CPU */ ++ ++int __cpu_disable(void) ++{ ++ return -ENOSYS; ++} ++ ++void __cpu_die(unsigned int cpu) ++{ ++ BUG(); ++} ++ ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __devinit __cpu_up(unsigned int cpu) ++{ ++ int rc; ++ ++ rc = cpu_up_check(cpu); ++ if (rc) ++ return rc; ++ ++ cpu_initialize_context(cpu); ++ ++ if (num_online_cpus() == 1) ++ alternatives_smp_switch(1); ++ ++ /* This must be done before setting cpu_online_map */ ++ set_cpu_sibling_map(cpu); ++ wmb(); ++ ++ rc = xen_smp_intr_init(cpu); ++ if (rc) { ++ remove_siblinginfo(cpu); ++ return rc; ++ } ++ ++ cpu_set(cpu, cpu_online_map); ++ ++ rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL); ++ BUG_ON(rc); ++ ++ return 0; ++} ++ ++void __init smp_cpus_done(unsigned int max_cpus) ++{ ++} ++ ++#ifndef CONFIG_X86_LOCAL_APIC ++int setup_profiling_timer(unsigned int multiplier) ++{ ++ return -EINVAL; ++} ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/xen_proc.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,23 @@ ++ ++#include <linux/module.h> ++#include <linux/proc_fs.h> ++#include <xen/xen_proc.h> ++ ++static struct proc_dir_entry *xen_base; ++ ++struct proc_dir_entry *create_xen_proc_entry(const char *name, mode_t mode) ++{ ++ if ( xen_base == NULL ) ++ if ( (xen_base = proc_mkdir("xen", &proc_root)) == NULL ) ++ panic("Couldn't create /proc/xen"); ++ return create_proc_entry(name, mode, xen_base); ++} ++ ++EXPORT_SYMBOL_GPL(create_xen_proc_entry); ++ ++void remove_xen_proc_entry(const char *name) ++{ ++ remove_proc_entry(name, xen_base); ++} ++ ++EXPORT_SYMBOL_GPL(remove_xen_proc_entry); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/core/xen_sysfs.c 2007-08-27 14:01:58.000000000 -0400 +@@ -0,0 +1,378 @@ ++/* ++ * copyright (c) 2006 IBM Corporation ++ * Authored by: Mike D. Day <ncmike@us.ibm.com> ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include <linux/err.h> ++#include <linux/kernel.h> ++#include <linux/module.h> ++#include <linux/init.h> ++#include <asm/hypervisor.h> ++#include <xen/features.h> ++#include <xen/hypervisor_sysfs.h> ++#include <xen/xenbus.h> ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR("Mike D. 
Day <ncmike@us.ibm.com>"); ++ ++static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ return sprintf(buffer, "xen\n"); ++} ++ ++HYPERVISOR_ATTR_RO(type); ++ ++static int __init xen_sysfs_type_init(void) ++{ ++ return sysfs_create_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); ++} ++ ++static void xen_sysfs_type_destroy(void) ++{ ++ sysfs_remove_file(&hypervisor_subsys.kset.kobj, &type_attr.attr); ++} ++ ++/* xen version attributes */ ++static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int version = HYPERVISOR_xen_version(XENVER_version, NULL); ++ if (version) ++ return sprintf(buffer, "%d\n", version >> 16); ++ return -ENODEV; ++} ++ ++HYPERVISOR_ATTR_RO(major); ++ ++static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int version = HYPERVISOR_xen_version(XENVER_version, NULL); ++ if (version) ++ return sprintf(buffer, "%d\n", version & 0xff); ++ return -ENODEV; ++} ++ ++HYPERVISOR_ATTR_RO(minor); ++ ++static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ char *extra; ++ ++ extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); ++ if (extra) { ++ ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", extra); ++ kfree(extra); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(extra); ++ ++static struct attribute *version_attrs[] = { ++ &major_attr.attr, ++ &minor_attr.attr, ++ &extra_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group version_group = { ++ .name = "version", ++ .attrs = version_attrs, ++}; ++ ++static int __init xen_sysfs_version_init(void) ++{ ++ return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ &version_group); ++} ++ ++static void xen_sysfs_version_destroy(void) ++{ ++ sysfs_remove_group(&hypervisor_subsys.kset.kobj, &version_group); ++} ++ ++/* UUID */ ++ ++static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ char *vm, *val; ++ int ret; ++ ++ vm = xenbus_read(XBT_NIL, "vm", "", NULL); ++ if (IS_ERR(vm)) ++ return PTR_ERR(vm); ++ val = xenbus_read(XBT_NIL, vm, "uuid", NULL); ++ kfree(vm); ++ if (IS_ERR(val)) ++ return PTR_ERR(val); ++ ret = sprintf(buffer, "%s\n", val); ++ kfree(val); ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(uuid); ++ ++static int __init xen_sysfs_uuid_init(void) ++{ ++ return sysfs_create_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); ++} ++ ++static void xen_sysfs_uuid_destroy(void) ++{ ++ sysfs_remove_file(&hypervisor_subsys.kset.kobj, &uuid_attr.attr); ++} ++ ++/* xen compilation attributes */ ++ ++static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ struct xen_compile_info *info; ++ ++ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); ++ if (info) { ++ ret = HYPERVISOR_xen_version(XENVER_compile_info, info); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", info->compiler); ++ kfree(info); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(compiler); ++ ++static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ struct xen_compile_info *info; ++ ++ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); ++ if (info) { ++ ret = HYPERVISOR_xen_version(XENVER_compile_info, info); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", info->compile_by); ++ kfree(info); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(compiled_by); ++ ++static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ struct 
xen_compile_info *info; ++ ++ info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); ++ if (info) { ++ ret = HYPERVISOR_xen_version(XENVER_compile_info, info); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", info->compile_date); ++ kfree(info); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(compile_date); ++ ++static struct attribute *xen_compile_attrs[] = { ++ &compiler_attr.attr, ++ &compiled_by_attr.attr, ++ &compile_date_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group xen_compilation_group = { ++ .name = "compilation", ++ .attrs = xen_compile_attrs, ++}; ++ ++int __init static xen_compilation_init(void) ++{ ++ return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ &xen_compilation_group); ++} ++ ++static void xen_compilation_destroy(void) ++{ ++ sysfs_remove_group(&hypervisor_subsys.kset.kobj, ++ &xen_compilation_group); ++} ++ ++/* xen properties info */ ++ ++static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ char *caps; ++ ++ caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL); ++ if (caps) { ++ ret = HYPERVISOR_xen_version(XENVER_capabilities, caps); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", caps); ++ kfree(caps); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(capabilities); ++ ++static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ char *cset; ++ ++ cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL); ++ if (cset) { ++ ret = HYPERVISOR_xen_version(XENVER_changeset, cset); ++ if (!ret) ++ ret = sprintf(buffer, "%s\n", cset); ++ kfree(cset); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(changeset); ++ ++static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret = -ENOMEM; ++ struct xen_platform_parameters *parms; ++ ++ parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL); ++ if (parms) { ++ ret = HYPERVISOR_xen_version(XENVER_platform_parameters, ++ parms); ++ if (!ret) ++ ret = sprintf(buffer, "%lx\n", parms->virt_start); ++ kfree(parms); ++ } ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(virtual_start); ++ ++static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ int ret; ++ ++ ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL); ++ if (ret > 0) ++ ret = sprintf(buffer, "%x\n", ret); ++ ++ return ret; ++} ++ ++HYPERVISOR_ATTR_RO(pagesize); ++ ++/* eventually there will be several more features to export */ ++static ssize_t xen_feature_show(int index, char *buffer) ++{ ++ int ret = -ENOMEM; ++ struct xen_feature_info *info; ++ ++ info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL); ++ if (info) { ++ info->submap_idx = index; ++ ret = HYPERVISOR_xen_version(XENVER_get_features, info); ++ if (!ret) ++ ret = sprintf(buffer, "%d\n", info->submap); ++ kfree(info); ++ } ++ ++ return ret; ++} ++ ++static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer) ++{ ++ return xen_feature_show(XENFEAT_writable_page_tables, buffer); ++} ++ ++HYPERVISOR_ATTR_RO(writable_pt); ++ ++static struct attribute *xen_properties_attrs[] = { ++ &capabilities_attr.attr, ++ &changeset_attr.attr, ++ &virtual_start_attr.attr, ++ &pagesize_attr.attr, ++ &writable_pt_attr.attr, ++ NULL ++}; ++ ++static struct attribute_group xen_properties_group = { ++ .name = "properties", ++ .attrs = xen_properties_attrs, ++}; ++ ++static int __init xen_properties_init(void) ++{ ++ return sysfs_create_group(&hypervisor_subsys.kset.kobj, ++ &xen_properties_group); ++} ++ ++static void 
xen_properties_destroy(void) ++{ ++ sysfs_remove_group(&hypervisor_subsys.kset.kobj, ++ &xen_properties_group); ++} ++ ++static int __init hyper_sysfs_init(void) ++{ ++ int ret; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ ret = xen_sysfs_type_init(); ++ if (ret) ++ goto out; ++ ret = xen_sysfs_version_init(); ++ if (ret) ++ goto version_out; ++ ret = xen_compilation_init(); ++ if (ret) ++ goto comp_out; ++ ret = xen_sysfs_uuid_init(); ++ if (ret) ++ goto uuid_out; ++ ret = xen_properties_init(); ++ if (!ret) ++ goto out; ++ ++ xen_sysfs_uuid_destroy(); ++uuid_out: ++ xen_compilation_destroy(); ++comp_out: ++ xen_sysfs_version_destroy(); ++version_out: ++ xen_sysfs_type_destroy(); ++out: ++ return ret; ++} ++ ++static void hyper_sysfs_exit(void) ++{ ++ xen_properties_destroy(); ++ xen_compilation_destroy(); ++ xen_sysfs_uuid_destroy(); ++ xen_sysfs_version_destroy(); ++ xen_sysfs_type_destroy(); ++ ++} ++ ++module_init(hyper_sysfs_init); ++module_exit(hyper_sysfs_exit); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/evtchn/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,2 @@ ++ ++obj-y := evtchn.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/evtchn/evtchn.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,469 @@ ++/****************************************************************************** ++ * evtchn.c ++ * ++ * Driver for receiving and demuxing event-channel signals. ++ * ++ * Copyright (c) 2004-2005, K A Fraser ++ * Multi-process extensions Copyright (c) 2004, Steven Smith ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/string.h> ++#include <linux/errno.h> ++#include <linux/fs.h> ++#include <linux/errno.h> ++#include <linux/miscdevice.h> ++#include <linux/major.h> ++#include <linux/proc_fs.h> ++#include <linux/stat.h> ++#include <linux/poll.h> ++#include <linux/irq.h> ++#include <linux/init.h> ++#include <linux/gfp.h> ++#include <linux/mutex.h> ++#include <xen/evtchn.h> ++#include <xen/public/evtchn.h> ++ ++struct per_user_data { ++ /* Notification ring, accessed via /dev/xen/evtchn. */ ++#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) ++#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) ++ evtchn_port_t *ring; ++ unsigned int ring_cons, ring_prod, ring_overflow; ++ struct mutex ring_cons_mutex; /* protect against concurrent readers */ ++ ++ /* Processes wait on this queue when ring is empty. */ ++ wait_queue_head_t evtchn_wait; ++ struct fasync_struct *evtchn_async_queue; ++}; ++ ++/* Who's bound to each port? */ ++static struct per_user_data *port_user[NR_EVENT_CHANNELS]; ++static spinlock_t port_user_lock; ++ ++void evtchn_device_upcall(int port) ++{ ++ struct per_user_data *u; ++ ++ spin_lock(&port_user_lock); ++ ++ mask_evtchn(port); ++ clear_evtchn(port); ++ ++ if ((u = port_user[port]) != NULL) { ++ if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { ++ u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; ++ if (u->ring_cons == u->ring_prod++) { ++ wake_up_interruptible(&u->evtchn_wait); ++ kill_fasync(&u->evtchn_async_queue, ++ SIGIO, POLL_IN); ++ } ++ } else { ++ u->ring_overflow = 1; ++ } ++ } ++ ++ spin_unlock(&port_user_lock); ++} ++ ++static ssize_t evtchn_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int rc; ++ unsigned int c, p, bytes1 = 0, bytes2 = 0; ++ struct per_user_data *u = file->private_data; ++ ++ /* Whole number of ports. */ ++ count &= ~(sizeof(evtchn_port_t)-1); ++ ++ if (count == 0) ++ return 0; ++ ++ if (count > PAGE_SIZE) ++ count = PAGE_SIZE; ++ ++ for (;;) { ++ mutex_lock(&u->ring_cons_mutex); ++ ++ rc = -EFBIG; ++ if (u->ring_overflow) ++ goto unlock_out; ++ ++ if ((c = u->ring_cons) != (p = u->ring_prod)) ++ break; ++ ++ mutex_unlock(&u->ring_cons_mutex); ++ ++ if (file->f_flags & O_NONBLOCK) ++ return -EAGAIN; ++ ++ rc = wait_event_interruptible( ++ u->evtchn_wait, u->ring_cons != u->ring_prod); ++ if (rc) ++ return rc; ++ } ++ ++ /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ ++ if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { ++ bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * ++ sizeof(evtchn_port_t); ++ bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); ++ } else { ++ bytes1 = (p - c) * sizeof(evtchn_port_t); ++ bytes2 = 0; ++ } ++ ++ /* Truncate chunks according to caller's maximum byte count. 
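A worked example of the wrap handling above, assuming the usual 4 KiB page and a 4-byte evtchn_port_t (so EVTCHN_RING_SIZE == 1024): with ring_cons c == 1020 and ring_prod p == 1030, (c ^ p) & EVTCHN_RING_SIZE is non-zero, so the ten pending ports straddle the wrap point and are copied as ring[1020..1023] (bytes1 == 16) followed by ring[0..5] (bytes2 == 24). The free-running counters work only because EVTCHN_RING_SIZE is a power of two and the producer never gets more than one full ring ahead of the consumer.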
*/ ++ if (bytes1 > count) { ++ bytes1 = count; ++ bytes2 = 0; ++ } else if ((bytes1 + bytes2) > count) { ++ bytes2 = count - bytes1; ++ } ++ ++ rc = -EFAULT; ++ if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || ++ ((bytes2 != 0) && ++ copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) ++ goto unlock_out; ++ ++ u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); ++ rc = bytes1 + bytes2; ++ ++ unlock_out: ++ mutex_unlock(&u->ring_cons_mutex); ++ return rc; ++} ++ ++static ssize_t evtchn_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ int rc, i; ++ evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); ++ struct per_user_data *u = file->private_data; ++ ++ if (kbuf == NULL) ++ return -ENOMEM; ++ ++ /* Whole number of ports. */ ++ count &= ~(sizeof(evtchn_port_t)-1); ++ ++ rc = 0; ++ if (count == 0) ++ goto out; ++ ++ if (count > PAGE_SIZE) ++ count = PAGE_SIZE; ++ ++ rc = -EFAULT; ++ if (copy_from_user(kbuf, buf, count) != 0) ++ goto out; ++ ++ spin_lock_irq(&port_user_lock); ++ for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) ++ if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) ++ unmask_evtchn(kbuf[i]); ++ spin_unlock_irq(&port_user_lock); ++ ++ rc = count; ++ ++ out: ++ free_page((unsigned long)kbuf); ++ return rc; ++} ++ ++static void evtchn_bind_to_user(struct per_user_data *u, int port) ++{ ++ spin_lock_irq(&port_user_lock); ++ BUG_ON(port_user[port] != NULL); ++ port_user[port] = u; ++ unmask_evtchn(port); ++ spin_unlock_irq(&port_user_lock); ++} ++ ++static int evtchn_ioctl(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long arg) ++{ ++ int rc; ++ struct per_user_data *u = file->private_data; ++ void __user *uarg = (void __user *) arg; ++ ++ switch (cmd) { ++ case IOCTL_EVTCHN_BIND_VIRQ: { ++ struct ioctl_evtchn_bind_virq bind; ++ struct evtchn_bind_virq bind_virq; ++ ++ rc = -EFAULT; ++ if (copy_from_user(&bind, uarg, sizeof(bind))) ++ break; ++ ++ bind_virq.virq = bind.virq; ++ bind_virq.vcpu = 0; ++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, ++ &bind_virq); ++ if (rc != 0) ++ break; ++ ++ rc = bind_virq.port; ++ evtchn_bind_to_user(u, rc); ++ break; ++ } ++ ++ case IOCTL_EVTCHN_BIND_INTERDOMAIN: { ++ struct ioctl_evtchn_bind_interdomain bind; ++ struct evtchn_bind_interdomain bind_interdomain; ++ ++ rc = -EFAULT; ++ if (copy_from_user(&bind, uarg, sizeof(bind))) ++ break; ++ ++ bind_interdomain.remote_dom = bind.remote_domain; ++ bind_interdomain.remote_port = bind.remote_port; ++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, ++ &bind_interdomain); ++ if (rc != 0) ++ break; ++ ++ rc = bind_interdomain.local_port; ++ evtchn_bind_to_user(u, rc); ++ break; ++ } ++ ++ case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { ++ struct ioctl_evtchn_bind_unbound_port bind; ++ struct evtchn_alloc_unbound alloc_unbound; ++ ++ rc = -EFAULT; ++ if (copy_from_user(&bind, uarg, sizeof(bind))) ++ break; ++ ++ alloc_unbound.dom = DOMID_SELF; ++ alloc_unbound.remote_dom = bind.remote_domain; ++ rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, ++ &alloc_unbound); ++ if (rc != 0) ++ break; ++ ++ rc = alloc_unbound.port; ++ evtchn_bind_to_user(u, rc); ++ break; ++ } ++ ++ case IOCTL_EVTCHN_UNBIND: { ++ struct ioctl_evtchn_unbind unbind; ++ struct evtchn_close close; ++ int ret; ++ ++ rc = -EFAULT; ++ if (copy_from_user(&unbind, uarg, sizeof(unbind))) ++ break; ++ ++ rc = -EINVAL; ++ if (unbind.port >= NR_EVENT_CHANNELS) ++ break; ++ ++ spin_lock_irq(&port_user_lock); ++ ++ rc 
= -ENOTCONN; ++ if (port_user[unbind.port] != u) { ++ spin_unlock_irq(&port_user_lock); ++ break; ++ } ++ ++ port_user[unbind.port] = NULL; ++ mask_evtchn(unbind.port); ++ ++ spin_unlock_irq(&port_user_lock); ++ ++ close.port = unbind.port; ++ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); ++ BUG_ON(ret); ++ ++ rc = 0; ++ break; ++ } ++ ++ case IOCTL_EVTCHN_NOTIFY: { ++ struct ioctl_evtchn_notify notify; ++ ++ rc = -EFAULT; ++ if (copy_from_user(&notify, uarg, sizeof(notify))) ++ break; ++ ++ if (notify.port >= NR_EVENT_CHANNELS) { ++ rc = -EINVAL; ++ } else if (port_user[notify.port] != u) { ++ rc = -ENOTCONN; ++ } else { ++ notify_remote_via_evtchn(notify.port); ++ rc = 0; ++ } ++ break; ++ } ++ ++ case IOCTL_EVTCHN_RESET: { ++ /* Initialise the ring to empty. Clear errors. */ ++ mutex_lock(&u->ring_cons_mutex); ++ spin_lock_irq(&port_user_lock); ++ u->ring_cons = u->ring_prod = u->ring_overflow = 0; ++ spin_unlock_irq(&port_user_lock); ++ mutex_unlock(&u->ring_cons_mutex); ++ rc = 0; ++ break; ++ } ++ ++ default: ++ rc = -ENOSYS; ++ break; ++ } ++ ++ return rc; ++} ++ ++static unsigned int evtchn_poll(struct file *file, poll_table *wait) ++{ ++ unsigned int mask = POLLOUT | POLLWRNORM; ++ struct per_user_data *u = file->private_data; ++ ++ poll_wait(file, &u->evtchn_wait, wait); ++ if (u->ring_cons != u->ring_prod) ++ mask |= POLLIN | POLLRDNORM; ++ if (u->ring_overflow) ++ mask = POLLERR; ++ return mask; ++} ++ ++static int evtchn_fasync(int fd, struct file *filp, int on) ++{ ++ struct per_user_data *u = filp->private_data; ++ return fasync_helper(fd, filp, on, &u->evtchn_async_queue); ++} ++ ++static int evtchn_open(struct inode *inode, struct file *filp) ++{ ++ struct per_user_data *u; ++ ++ if ((u = kmalloc(sizeof(*u), GFP_KERNEL)) == NULL) ++ return -ENOMEM; ++ ++ memset(u, 0, sizeof(*u)); ++ init_waitqueue_head(&u->evtchn_wait); ++ ++ u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); ++ if (u->ring == NULL) { ++ kfree(u); ++ return -ENOMEM; ++ } ++ ++ mutex_init(&u->ring_cons_mutex); ++ ++ filp->private_data = u; ++ ++ return 0; ++} ++ ++static int evtchn_release(struct inode *inode, struct file *filp) ++{ ++ int i; ++ struct per_user_data *u = filp->private_data; ++ struct evtchn_close close; ++ ++ spin_lock_irq(&port_user_lock); ++ ++ free_page((unsigned long)u->ring); ++ ++ for (i = 0; i < NR_EVENT_CHANNELS; i++) { ++ int ret; ++ if (port_user[i] != u) ++ continue; ++ ++ port_user[i] = NULL; ++ mask_evtchn(i); ++ ++ close.port = i; ++ ret = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close); ++ BUG_ON(ret); ++ } ++ ++ spin_unlock_irq(&port_user_lock); ++ ++ kfree(u); ++ ++ return 0; ++} ++ ++static const struct file_operations evtchn_fops = { ++ .owner = THIS_MODULE, ++ .read = evtchn_read, ++ .write = evtchn_write, ++ .ioctl = evtchn_ioctl, ++ .poll = evtchn_poll, ++ .fasync = evtchn_fasync, ++ .open = evtchn_open, ++ .release = evtchn_release, ++}; ++ ++static struct miscdevice evtchn_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "evtchn", ++ .fops = &evtchn_fops, ++}; ++ ++static int __init evtchn_init(void) ++{ ++ int err; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ spin_lock_init(&port_user_lock); ++ memset(port_user, 0, sizeof(port_user)); ++ ++ /* Create '/dev/misc/evtchn'.
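From user space the device is driven roughly as sketched here (a minimal outline, not part of the patch: the /dev/xen/evtchn path follows the per_user_data comment near the top of this file, the ioctl names and structs come from xen/public/evtchn.h, VIRQ_DOM_EXC is only an illustrative VIRQ, and error handling is omitted). Ports are delivered masked by evtchn_device_upcall(), and writing them back is what re-unmasks them:

	evtchn_port_t pending[8];
	struct ioctl_evtchn_bind_virq bind = { .virq = VIRQ_DOM_EXC };
	int fd = open("/dev/xen/evtchn", O_RDWR);
	int port = ioctl(fd, IOCTL_EVTCHN_BIND_VIRQ, &bind);
	for (;;) {
		ssize_t n = read(fd, pending, sizeof(pending));
		handle_ports(pending, n / sizeof(evtchn_port_t));
		write(fd, pending, n);
	}

handle_ports() is a placeholder; a real consumer would check return values, and can multiplex the fd with poll() or SIGIO, matching the evtchn_poll() and evtchn_fasync() hooks above.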
*/ ++ err = misc_register(&evtchn_miscdev); ++ if (err != 0) { ++ printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); ++ return err; ++ } ++ ++ printk("Event-channel device installed.\n"); ++ ++ return 0; ++} ++ ++static void evtchn_cleanup(void) ++{ ++ misc_deregister(&evtchn_miscdev); ++} ++ ++module_init(evtchn_init); ++module_exit(evtchn_cleanup); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/fbfront/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,2 @@ ++obj-$(CONFIG_XEN_FRAMEBUFFER) := xenfb.o ++obj-$(CONFIG_XEN_KEYBOARD) += xenkbd.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/fbfront/xenfb.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,752 @@ ++/* ++ * linux/drivers/video/xenfb.c -- Xen para-virtual frame buffer device ++ * ++ * Copyright (C) 2005-2006 Anthony Liguori <aliguori@us.ibm.com> ++ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com> ++ * ++ * Based on linux/drivers/video/q40fb.c ++ * ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file COPYING in the main directory of this archive for ++ * more details. ++ */ ++ ++/* ++ * TODO: ++ * ++ * Switch to grant tables when they become capable of dealing with the ++ * frame buffer. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/fb.h> ++#include <linux/module.h> ++#include <linux/vmalloc.h> ++#include <linux/mm.h> ++#include <linux/mutex.h> ++#include <asm/hypervisor.h> ++#include <xen/evtchn.h> ++#include <xen/interface/io/fbif.h> ++#include <xen/interface/io/protocols.h> ++#include <xen/xenbus.h> ++#include <linux/kthread.h> ++ ++struct xenfb_mapping ++{ ++ struct list_head link; ++ struct vm_area_struct *vma; ++ atomic_t map_refs; ++ int faults; ++ struct xenfb_info *info; ++}; ++ ++struct xenfb_info ++{ ++ struct task_struct *kthread; ++ wait_queue_head_t wq; ++ ++ unsigned char *fb; ++ struct fb_info *fb_info; ++ struct timer_list refresh; ++ int dirty; ++ int x1, y1, x2, y2; /* dirty rectangle, ++ protected by dirty_lock */ ++ spinlock_t dirty_lock; ++ struct mutex mm_lock; ++ int nr_pages; ++ struct page **pages; ++ struct list_head mappings; /* protected by mm_lock */ ++ ++ int irq; ++ struct xenfb_page *page; ++ unsigned long *mfns; ++ int update_wanted; /* XENFB_TYPE_UPDATE wanted */ ++ ++ struct xenbus_device *xbdev; ++}; ++ ++/* ++ * How the locks work together ++ * ++ * There are two locks: spinlock dirty_lock protecting the dirty ++ * rectangle, and mutex mm_lock protecting mappings. ++ * ++ * The problem is that dirty rectangle and mappings aren't ++ * independent: the dirty rectangle must cover all faulted pages in ++ * mappings. We need to prove that our locking maintains this ++ * invariant. ++ * ++ * There are several kinds of critical regions: ++ * ++ * 1. Holding only dirty_lock: xenfb_refresh(). May run in ++ * interrupts. Extends the dirty rectangle. Trivially preserves ++ * invariant. ++ * ++ * 2. Holding only mm_lock: xenfb_mmap() and xenfb_vm_close(). Touch ++ * only mappings. The former creates unfaulted pages. Preserves ++ * invariant. The latter removes pages. Preserves invariant. ++ * ++ * 3. Holding both locks: xenfb_vm_nopage(). Extends the dirty ++ * rectangle and updates mappings consistently. Preserves ++ * invariant. ++ * ++ * 4. The ugliest one: xenfb_update_screen(). Clear the dirty ++ * rectangle and update mappings consistently. 
++ * ++ * We can't simply hold both locks, because zap_page_range() cannot ++ * be called with a spinlock held. ++ * ++ * Therefore, we first clear the dirty rectangle with both locks ++ * held. Then we unlock dirty_lock and update the mappings. ++ * Critical regions that hold only dirty_lock may interfere with ++ * that. This can only be region 1: xenfb_refresh(). But that ++ * just extends the dirty rectangle, which can't harm the ++ * invariant. ++ * ++ * But FIXME: the invariant is too weak. It misses that the fault ++ * record in mappings must be consistent with the mapping of pages in ++ * the associated address space! do_no_page() updates the PTE after ++ * xenfb_vm_nopage() returns, i.e. outside the critical region. This ++ * allows the following race: ++ * ++ * X writes to some address in the Xen frame buffer ++ * Fault - call do_no_page() ++ * call xenfb_vm_nopage() ++ * grab mm_lock ++ * map->faults++; ++ * release mm_lock ++ * return back to do_no_page() ++ * (preempted, or SMP) ++ * Xen worker thread runs. ++ * grab mm_lock ++ * look at mappings ++ * find this mapping, zaps its pages (but page not in pte yet) ++ * clear map->faults ++ * releases mm_lock ++ * (back to X process) ++ * put page in X's pte ++ * ++ * Oh well, we won't be updating the writes to this page anytime soon. ++ */ ++ ++static int xenfb_fps = 20; ++static unsigned long xenfb_mem_len = XENFB_WIDTH * XENFB_HEIGHT * XENFB_DEPTH / 8; ++ ++static int xenfb_remove(struct xenbus_device *); ++static void xenfb_init_shared_page(struct xenfb_info *); ++static int xenfb_connect_backend(struct xenbus_device *, struct xenfb_info *); ++static void xenfb_disconnect_backend(struct xenfb_info *); ++ ++static void xenfb_do_update(struct xenfb_info *info, ++ int x, int y, int w, int h) ++{ ++ union xenfb_out_event event; ++ __u32 prod; ++ ++ event.type = XENFB_TYPE_UPDATE; ++ event.update.x = x; ++ event.update.y = y; ++ event.update.width = w; ++ event.update.height = h; ++ ++ prod = info->page->out_prod; ++ /* caller ensures !xenfb_queue_full() */ ++ mb(); /* ensure ring space available */ ++ XENFB_OUT_RING_REF(info->page, prod) = event; ++ wmb(); /* ensure ring contents visible */ ++ info->page->out_prod = prod + 1; ++ ++ notify_remote_via_irq(info->irq); ++} ++ ++static int xenfb_queue_full(struct xenfb_info *info) ++{ ++ __u32 cons, prod; ++ ++ prod = info->page->out_prod; ++ cons = info->page->out_cons; ++ return prod - cons == XENFB_OUT_RING_LEN; ++} ++ ++static void xenfb_update_screen(struct xenfb_info *info) ++{ ++ unsigned long flags; ++ int y1, y2, x1, x2; ++ struct xenfb_mapping *map; ++ ++ if (!info->update_wanted) ++ return; ++ if (xenfb_queue_full(info)) ++ return; ++ ++ mutex_lock(&info->mm_lock); ++ ++ spin_lock_irqsave(&info->dirty_lock, flags); ++ y1 = info->y1; ++ y2 = info->y2; ++ x1 = info->x1; ++ x2 = info->x2; ++ info->x1 = info->y1 = INT_MAX; ++ info->x2 = info->y2 = 0; ++ spin_unlock_irqrestore(&info->dirty_lock, flags); ++ ++ list_for_each_entry(map, &info->mappings, link) { ++ if (!map->faults) ++ continue; ++ zap_page_range(map->vma, map->vma->vm_start, ++ map->vma->vm_end - map->vma->vm_start, NULL); ++ map->faults = 0; ++ } ++ ++ mutex_unlock(&info->mm_lock); ++ ++ xenfb_do_update(info, x1, y1, x2 - x1, y2 - y1); ++} ++ ++static int xenfb_thread(void *data) ++{ ++ struct xenfb_info *info = data; ++ ++ while (!kthread_should_stop()) { ++ if (info->dirty) { ++ info->dirty = 0; ++ xenfb_update_screen(info); ++ } ++ wait_event_interruptible(info->wq, ++ kthread_should_stop() || info->dirty); ++
try_to_freeze(); ++ } ++ return 0; ++} ++ ++static int xenfb_setcolreg(unsigned regno, unsigned red, unsigned green, ++ unsigned blue, unsigned transp, ++ struct fb_info *info) ++{ ++ u32 v; ++ ++ if (regno > info->cmap.len) ++ return 1; ++ ++ red >>= (16 - info->var.red.length); ++ green >>= (16 - info->var.green.length); ++ blue >>= (16 - info->var.blue.length); ++ ++ v = (red << info->var.red.offset) | ++ (green << info->var.green.offset) | ++ (blue << info->var.blue.offset); ++ ++ /* FIXME is this sane? check against xxxfb_setcolreg()! */ ++ switch (info->var.bits_per_pixel) { ++ case 16: ++ case 24: ++ case 32: ++ ((u32 *)info->pseudo_palette)[regno] = v; ++ break; ++ } ++ ++ return 0; ++} ++ ++static void xenfb_timer(unsigned long data) ++{ ++ struct xenfb_info *info = (struct xenfb_info *)data; ++ info->dirty = 1; ++ wake_up(&info->wq); ++} ++ ++static void __xenfb_refresh(struct xenfb_info *info, ++ int x1, int y1, int w, int h) ++{ ++ int y2, x2; ++ ++ y2 = y1 + h; ++ x2 = x1 + w; ++ ++ if (info->y1 > y1) ++ info->y1 = y1; ++ if (info->y2 < y2) ++ info->y2 = y2; ++ if (info->x1 > x1) ++ info->x1 = x1; ++ if (info->x2 < x2) ++ info->x2 = x2; ++ ++ if (timer_pending(&info->refresh)) ++ return; ++ ++ mod_timer(&info->refresh, jiffies + HZ/xenfb_fps); ++} ++ ++static void xenfb_refresh(struct xenfb_info *info, ++ int x1, int y1, int w, int h) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&info->dirty_lock, flags); ++ __xenfb_refresh(info, x1, y1, w, h); ++ spin_unlock_irqrestore(&info->dirty_lock, flags); ++} ++ ++static void xenfb_fillrect(struct fb_info *p, const struct fb_fillrect *rect) ++{ ++ struct xenfb_info *info = p->par; ++ ++ cfb_fillrect(p, rect); ++ xenfb_refresh(info, rect->dx, rect->dy, rect->width, rect->height); ++} ++ ++static void xenfb_imageblit(struct fb_info *p, const struct fb_image *image) ++{ ++ struct xenfb_info *info = p->par; ++ ++ cfb_imageblit(p, image); ++ xenfb_refresh(info, image->dx, image->dy, image->width, image->height); ++} ++ ++static void xenfb_copyarea(struct fb_info *p, const struct fb_copyarea *area) ++{ ++ struct xenfb_info *info = p->par; ++ ++ cfb_copyarea(p, area); ++ xenfb_refresh(info, area->dx, area->dy, area->width, area->height); ++} ++ ++static void xenfb_vm_open(struct vm_area_struct *vma) ++{ ++ struct xenfb_mapping *map = vma->vm_private_data; ++ atomic_inc(&map->map_refs); ++} ++ ++static void xenfb_vm_close(struct vm_area_struct *vma) ++{ ++ struct xenfb_mapping *map = vma->vm_private_data; ++ struct xenfb_info *info = map->info; ++ ++ mutex_lock(&info->mm_lock); ++ if (atomic_dec_and_test(&map->map_refs)) { ++ list_del(&map->link); ++ kfree(map); ++ } ++ mutex_unlock(&info->mm_lock); ++} ++ ++static struct page *xenfb_vm_nopage(struct vm_area_struct *vma, ++ unsigned long vaddr, int *type) ++{ ++ struct xenfb_mapping *map = vma->vm_private_data; ++ struct xenfb_info *info = map->info; ++ int pgnr = (vaddr - vma->vm_start) >> PAGE_SHIFT; ++ unsigned long flags; ++ struct page *page; ++ int y1, y2; ++ ++ if (pgnr >= info->nr_pages) ++ return NOPAGE_SIGBUS; ++ ++ mutex_lock(&info->mm_lock); ++ spin_lock_irqsave(&info->dirty_lock, flags); ++ page = info->pages[pgnr]; ++ get_page(page); ++ map->faults++; ++ ++ y1 = pgnr * PAGE_SIZE / info->fb_info->fix.line_length; ++ y2 = (pgnr * PAGE_SIZE + PAGE_SIZE - 1) / info->fb_info->fix.line_length; ++ if (y2 > info->fb_info->var.yres) ++ y2 = info->fb_info->var.yres; ++ __xenfb_refresh(info, 0, y1, info->fb_info->var.xres, y2 - y1); ++ spin_unlock_irqrestore(&info->dirty_lock, 
flags); ++ mutex_unlock(&info->mm_lock); ++ ++ if (type) ++ *type = VM_FAULT_MINOR; ++ ++ return page; ++} ++ ++static struct vm_operations_struct xenfb_vm_ops = { ++ .open = xenfb_vm_open, ++ .close = xenfb_vm_close, ++ .nopage = xenfb_vm_nopage, ++}; ++ ++static int xenfb_mmap(struct fb_info *fb_info, struct vm_area_struct *vma) ++{ ++ struct xenfb_info *info = fb_info->par; ++ struct xenfb_mapping *map; ++ int map_pages; ++ ++ if (!(vma->vm_flags & VM_WRITE)) ++ return -EINVAL; ++ if (!(vma->vm_flags & VM_SHARED)) ++ return -EINVAL; ++ if (vma->vm_pgoff != 0) ++ return -EINVAL; ++ ++ map_pages = (vma->vm_end - vma->vm_start + PAGE_SIZE-1) >> PAGE_SHIFT; ++ if (map_pages > info->nr_pages) ++ return -EINVAL; ++ ++ map = kzalloc(sizeof(*map), GFP_KERNEL); ++ if (map == NULL) ++ return -ENOMEM; ++ ++ map->vma = vma; ++ map->faults = 0; ++ map->info = info; ++ atomic_set(&map->map_refs, 1); ++ ++ mutex_lock(&info->mm_lock); ++ list_add(&map->link, &info->mappings); ++ mutex_unlock(&info->mm_lock); ++ ++ vma->vm_ops = &xenfb_vm_ops; ++ vma->vm_flags |= (VM_DONTEXPAND | VM_RESERVED); ++ vma->vm_private_data = map; ++ ++ return 0; ++} ++ ++static struct fb_ops xenfb_fb_ops = { ++ .owner = THIS_MODULE, ++ .fb_setcolreg = xenfb_setcolreg, ++ .fb_fillrect = xenfb_fillrect, ++ .fb_copyarea = xenfb_copyarea, ++ .fb_imageblit = xenfb_imageblit, ++ .fb_mmap = xenfb_mmap, ++}; ++ ++static irqreturn_t xenfb_event_handler(int rq, void *dev_id, ++ struct pt_regs *regs) ++{ ++ /* ++ * No in events recognized, simply ignore them all. ++ * If you need to recognize some, see xenkbd's input_handler() ++ * for how to do that. ++ */ ++ struct xenfb_info *info = dev_id; ++ struct xenfb_page *page = info->page; ++ ++ if (page->in_cons != page->in_prod) { ++ info->page->in_cons = info->page->in_prod; ++ notify_remote_via_irq(info->irq); ++ } ++ return IRQ_HANDLED; ++} ++ ++static unsigned long vmalloc_to_mfn(void *address) ++{ ++ return pfn_to_mfn(vmalloc_to_pfn(address)); ++} ++ ++static int __devinit xenfb_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ struct xenfb_info *info; ++ struct fb_info *fb_info; ++ int ret; ++ ++ info = kzalloc(sizeof(*info), GFP_KERNEL); ++ if (info == NULL) { ++ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); ++ return -ENOMEM; ++ } ++ dev->dev.driver_data = info; ++ info->xbdev = dev; ++ info->irq = -1; ++ info->x1 = info->y1 = INT_MAX; ++ spin_lock_init(&info->dirty_lock); ++ mutex_init(&info->mm_lock); ++ init_waitqueue_head(&info->wq); ++ init_timer(&info->refresh); ++ info->refresh.function = xenfb_timer; ++ info->refresh.data = (unsigned long)info; ++ INIT_LIST_HEAD(&info->mappings); ++ ++ info->fb = vmalloc(xenfb_mem_len); ++ if (info->fb == NULL) ++ goto error_nomem; ++ memset(info->fb, 0, xenfb_mem_len); ++ ++ info->nr_pages = (xenfb_mem_len + PAGE_SIZE - 1) >> PAGE_SHIFT; ++ ++ info->pages = kmalloc(sizeof(struct page *) * info->nr_pages, ++ GFP_KERNEL); ++ if (info->pages == NULL) ++ goto error_nomem; ++ ++ info->mfns = vmalloc(sizeof(unsigned long) * info->nr_pages); ++ if (!info->mfns) ++ goto error_nomem; ++ ++ /* set up shared page */ ++ info->page = (void *)__get_free_page(GFP_KERNEL | __GFP_ZERO); ++ if (!info->page) ++ goto error_nomem; ++ ++ xenfb_init_shared_page(info); ++ ++ fb_info = framebuffer_alloc(sizeof(u32) * 256, NULL); ++ /* see fishy hackery below */ ++ if (fb_info == NULL) ++ goto error_nomem; ++ ++ /* FIXME fishy hackery */ ++ fb_info->pseudo_palette = fb_info->par; ++ fb_info->par = info; ++ /* /FIXME */ ++
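/*
 * What the "fishy hackery" above actually does: framebuffer_alloc()
 * was asked for sizeof(u32) * 256 bytes of driver-private space, and
 * fb_info->par points at that space. That space is handed over to
 * serve as the 256-entry pseudo_palette that xenfb_setcolreg() fills
 * in, and par is then repointed at our struct xenfb_info, so a single
 * allocation feeds both users.
 */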
fb_info->screen_base = info->fb; ++ ++ fb_info->fbops = &xenfb_fb_ops; ++ fb_info->var.xres_virtual = fb_info->var.xres = info->page->width; ++ fb_info->var.yres_virtual = fb_info->var.yres = info->page->height; ++ fb_info->var.bits_per_pixel = info->page->depth; ++ ++ fb_info->var.red = (struct fb_bitfield){16, 8, 0}; ++ fb_info->var.green = (struct fb_bitfield){8, 8, 0}; ++ fb_info->var.blue = (struct fb_bitfield){0, 8, 0}; ++ ++ fb_info->var.activate = FB_ACTIVATE_NOW; ++ fb_info->var.height = -1; ++ fb_info->var.width = -1; ++ fb_info->var.vmode = FB_VMODE_NONINTERLACED; ++ ++ fb_info->fix.visual = FB_VISUAL_TRUECOLOR; ++ fb_info->fix.line_length = info->page->line_length; ++ fb_info->fix.smem_start = 0; ++ fb_info->fix.smem_len = xenfb_mem_len; ++ strcpy(fb_info->fix.id, "xen"); ++ fb_info->fix.type = FB_TYPE_PACKED_PIXELS; ++ fb_info->fix.accel = FB_ACCEL_NONE; ++ ++ fb_info->flags = FBINFO_FLAG_DEFAULT; ++ ++ ret = fb_alloc_cmap(&fb_info->cmap, 256, 0); ++ if (ret < 0) { ++ framebuffer_release(fb_info); ++ xenbus_dev_fatal(dev, ret, "fb_alloc_cmap"); ++ goto error; ++ } ++ ++ ret = register_framebuffer(fb_info); ++ if (ret) { ++ fb_dealloc_cmap(&info->fb_info->cmap); ++ framebuffer_release(fb_info); ++ xenbus_dev_fatal(dev, ret, "register_framebuffer"); ++ goto error; ++ } ++ info->fb_info = fb_info; ++ ++ /* FIXME should this be delayed until backend XenbusStateConnected? */ ++ info->kthread = kthread_run(xenfb_thread, info, "xenfb thread"); ++ if (IS_ERR(info->kthread)) { ++ ret = PTR_ERR(info->kthread); ++ info->kthread = NULL; ++ xenbus_dev_fatal(dev, ret, "register_framebuffer"); ++ goto error; ++ } ++ ++ ret = xenfb_connect_backend(dev, info); ++ if (ret < 0) ++ goto error; ++ ++ return 0; ++ ++ error_nomem: ++ ret = -ENOMEM; ++ xenbus_dev_fatal(dev, ret, "allocating device memory"); ++ error: ++ xenfb_remove(dev); ++ return ret; ++} ++ ++static int xenfb_resume(struct xenbus_device *dev) ++{ ++ struct xenfb_info *info = dev->dev.driver_data; ++ ++ xenfb_disconnect_backend(info); ++ xenfb_init_shared_page(info); ++ return xenfb_connect_backend(dev, info); ++} ++ ++static int xenfb_remove(struct xenbus_device *dev) ++{ ++ struct xenfb_info *info = dev->dev.driver_data; ++ ++ del_timer(&info->refresh); ++ if (info->kthread) ++ kthread_stop(info->kthread); ++ xenfb_disconnect_backend(info); ++ if (info->fb_info) { ++ unregister_framebuffer(info->fb_info); ++ fb_dealloc_cmap(&info->fb_info->cmap); ++ framebuffer_release(info->fb_info); ++ } ++ free_page((unsigned long)info->page); ++ vfree(info->mfns); ++ kfree(info->pages); ++ vfree(info->fb); ++ kfree(info); ++ ++ return 0; ++} ++ ++static void xenfb_init_shared_page(struct xenfb_info *info) ++{ ++ int i; ++ ++ for (i = 0; i < info->nr_pages; i++) ++ info->pages[i] = vmalloc_to_page(info->fb + i * PAGE_SIZE); ++ ++ for (i = 0; i < info->nr_pages; i++) ++ info->mfns[i] = vmalloc_to_mfn(info->fb + i * PAGE_SIZE); ++ ++ info->page->pd[0] = vmalloc_to_mfn(info->mfns); ++ info->page->pd[1] = 0; ++ info->page->width = XENFB_WIDTH; ++ info->page->height = XENFB_HEIGHT; ++ info->page->depth = XENFB_DEPTH; ++ info->page->line_length = (info->page->depth / 8) * info->page->width; ++ info->page->mem_length = xenfb_mem_len; ++ info->page->in_cons = info->page->in_prod = 0; ++ info->page->out_cons = info->page->out_prod = 0; ++} ++ ++static int xenfb_connect_backend(struct xenbus_device *dev, ++ struct xenfb_info *info) ++{ ++ int ret; ++ struct xenbus_transaction xbt; ++ ++ ret = bind_listening_port_to_irqhandler( ++ dev->otherend_id, 
xenfb_event_handler, 0, "xenfb", info); ++ if (ret < 0) { ++ xenbus_dev_fatal(dev, ret, ++ "bind_listening_port_to_irqhandler"); ++ return ret; ++ } ++ info->irq = ret; ++ ++ again: ++ ret = xenbus_transaction_start(&xbt); ++ if (ret) { ++ xenbus_dev_fatal(dev, ret, "starting transaction"); ++ return ret; ++ } ++ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu", ++ virt_to_mfn(info->page)); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", ++ irq_to_evtchn_port(info->irq)); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_printf(xbt, dev->nodename, "protocol", "%s", ++ XEN_IO_PROTO_ABI_NATIVE); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_printf(xbt, dev->nodename, "feature-update", "1"); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_transaction_end(xbt, 0); ++ if (ret) { ++ if (ret == -EAGAIN) ++ goto again; ++ xenbus_dev_fatal(dev, ret, "completing transaction"); ++ return ret; ++ } ++ ++ xenbus_switch_state(dev, XenbusStateInitialised); ++ return 0; ++ ++ error_xenbus: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, ret, "writing xenstore"); ++ return ret; ++} ++ ++static void xenfb_disconnect_backend(struct xenfb_info *info) ++{ ++ if (info->irq >= 0) ++ unbind_from_irqhandler(info->irq, info); ++ info->irq = -1; ++} ++ ++static void xenfb_backend_changed(struct xenbus_device *dev, ++ enum xenbus_state backend_state) ++{ ++ struct xenfb_info *info = dev->dev.driver_data; ++ int val; ++ ++ switch (backend_state) { ++ case XenbusStateInitialising: ++ case XenbusStateInitialised: ++ case XenbusStateUnknown: ++ case XenbusStateClosed: ++ break; ++ ++ case XenbusStateInitWait: ++ InitWait: ++ xenbus_switch_state(dev, XenbusStateConnected); ++ break; ++ ++ case XenbusStateConnected: ++ /* ++ * Work around xenbus race condition: If backend goes ++ * through InitWait to Connected fast enough, we can ++ * get Connected twice here. ++ */ ++ if (dev->state != XenbusStateConnected) ++ goto InitWait; /* no InitWait seen yet, fudge it */ ++ ++ if (xenbus_scanf(XBT_NIL, info->xbdev->otherend, ++ "request-update", "%d", &val) < 0) ++ val = 0; ++ if (val) ++ info->update_wanted = 1; ++ break; ++ ++ case XenbusStateClosing: ++ // FIXME is this safe in any dev->state? ++ xenbus_frontend_closed(dev); ++ break; ++ } ++} ++ ++static struct xenbus_device_id xenfb_ids[] = { ++ { "vfb" }, ++ { "" } ++}; ++ ++static struct xenbus_driver xenfb = { ++ .name = "vfb", ++ .owner = THIS_MODULE, ++ .ids = xenfb_ids, ++ .probe = xenfb_probe, ++ .remove = xenfb_remove, ++ .resume = xenfb_resume, ++ .otherend_changed = xenfb_backend_changed, ++}; ++ ++static int __init xenfb_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ /* Nothing to do if running in dom0. 
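For reference, once the transaction in xenfb_connect_backend() above commits, the frontend's xenstore directory (dev->nodename; typically device/vfb/0 from the guest's point of view, though the exact path is chosen by the toolstack) looks roughly like:

	page-ref       = <mfn of the shared xenfb_page>
	event-channel  = <the port bound to the irqhandler>
	protocol       = <the XEN_IO_PROTO_ABI_NATIVE string>
	feature-update = 1

The backend maps the shared page and binds the event channel from these nodes, after which the xenfb_backend_changed() handler above drives the rest of the handshake.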
*/ ++ if (is_initial_xendomain()) ++ return -ENODEV; ++ ++ return xenbus_register_frontend(&xenfb); ++} ++ ++static void __exit xenfb_cleanup(void) ++{ ++ return xenbus_unregister_driver(&xenfb); ++} ++ ++module_init(xenfb_init); ++module_exit(xenfb_cleanup); ++ ++MODULE_LICENSE("GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/fbfront/xenkbd.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,333 @@ ++/* ++ * linux/drivers/input/keyboard/xenkbd.c -- Xen para-virtual input device ++ * ++ * Copyright (C) 2005 Anthony Liguori <aliguori@us.ibm.com> ++ * Copyright (C) 2006 Red Hat, Inc., Markus Armbruster <armbru@redhat.com> ++ * ++ * Based on linux/drivers/input/mouse/sermouse.c ++ * ++ * This file is subject to the terms and conditions of the GNU General Public ++ * License. See the file COPYING in the main directory of this archive for ++ * more details. ++ */ ++ ++/* ++ * TODO: ++ * ++ * Switch to grant tables together with xenfb.c. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/module.h> ++#include <linux/input.h> ++#include <asm/hypervisor.h> ++#include <xen/evtchn.h> ++#include <xen/interface/io/fbif.h> ++#include <xen/interface/io/kbdif.h> ++#include <xen/xenbus.h> ++ ++struct xenkbd_info ++{ ++ struct input_dev *kbd; ++ struct input_dev *ptr; ++ struct xenkbd_page *page; ++ int irq; ++ struct xenbus_device *xbdev; ++ char phys[32]; ++}; ++ ++static int xenkbd_remove(struct xenbus_device *); ++static int xenkbd_connect_backend(struct xenbus_device *, struct xenkbd_info *); ++static void xenkbd_disconnect_backend(struct xenkbd_info *); ++ ++/* ++ * Note: if you need to send out events, see xenfb_do_update() for how ++ * to do that. ++ */ ++ ++static irqreturn_t input_handler(int rq, void *dev_id, struct pt_regs *regs) ++{ ++ struct xenkbd_info *info = dev_id; ++ struct xenkbd_page *page = info->page; ++ __u32 cons, prod; ++ ++ prod = page->in_prod; ++ if (prod == page->out_cons) ++ return IRQ_HANDLED; ++ rmb(); /* ensure we see ring contents up to prod */ ++ for (cons = page->in_cons; cons != prod; cons++) { ++ union xenkbd_in_event *event; ++ struct input_dev *dev; ++ event = &XENKBD_IN_RING_REF(page, cons); ++ ++ dev = info->ptr; ++ switch (event->type) { ++ case XENKBD_TYPE_MOTION: ++ input_report_rel(dev, REL_X, event->motion.rel_x); ++ input_report_rel(dev, REL_Y, event->motion.rel_y); ++ break; ++ case XENKBD_TYPE_KEY: ++ dev = NULL; ++ if (test_bit(event->key.keycode, info->kbd->keybit)) ++ dev = info->kbd; ++ if (test_bit(event->key.keycode, info->ptr->keybit)) ++ dev = info->ptr; ++ if (dev) ++ input_report_key(dev, event->key.keycode, ++ event->key.pressed); ++ else ++ printk("xenkbd: unhandled keycode 0x%x\n", ++ event->key.keycode); ++ break; ++ case XENKBD_TYPE_POS: ++ input_report_abs(dev, ABS_X, event->pos.abs_x); ++ input_report_abs(dev, ABS_Y, event->pos.abs_y); ++ break; ++ } ++ if (dev) ++ input_sync(dev); ++ } ++ mb(); /* ensure we got ring contents */ ++ page->in_cons = cons; ++ notify_remote_via_irq(info->irq); ++ ++ return IRQ_HANDLED; ++} ++ ++int __devinit xenkbd_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int ret, i; ++ struct xenkbd_info *info; ++ struct input_dev *kbd, *ptr; ++ ++ info = kzalloc(sizeof(*info), GFP_KERNEL); ++ if (!info) { ++ xenbus_dev_fatal(dev, -ENOMEM, "allocating info structure"); ++ return -ENOMEM; ++ } ++ dev->dev.driver_data = info; ++ info->xbdev = dev; ++ snprintf(info->phys, sizeof(info->phys), "xenbus/%s", dev->nodename); ++ ++ info->page 
= (void *)__get_free_page(GFP_KERNEL); ++ if (!info->page) ++ goto error_nomem; ++ info->page->in_cons = info->page->in_prod = 0; ++ info->page->out_cons = info->page->out_prod = 0; ++ ++ /* keyboard */ ++ kbd = input_allocate_device(); ++ if (!kbd) ++ goto error_nomem; ++ kbd->name = "Xen Virtual Keyboard"; ++ kbd->phys = info->phys; ++ kbd->id.bustype = BUS_PCI; ++ kbd->id.vendor = 0x5853; ++ kbd->id.product = 0xffff; ++ kbd->evbit[0] = BIT(EV_KEY); ++ for (i = KEY_ESC; i < KEY_UNKNOWN; i++) ++ set_bit(i, kbd->keybit); ++ for (i = KEY_OK; i < KEY_MAX; i++) ++ set_bit(i, kbd->keybit); ++ ++ ret = input_register_device(kbd); ++ if (ret) { ++ input_free_device(kbd); ++ xenbus_dev_fatal(dev, ret, "input_register_device(kbd)"); ++ goto error; ++ } ++ info->kbd = kbd; ++ ++ /* pointing device */ ++ ptr = input_allocate_device(); ++ if (!ptr) ++ goto error_nomem; ++ ptr->name = "Xen Virtual Pointer"; ++ ptr->phys = info->phys; ++ ptr->id.bustype = BUS_PCI; ++ ptr->id.vendor = 0x5853; ++ ptr->id.product = 0xfffe; ++ ptr->evbit[0] = BIT(EV_KEY) | BIT(EV_REL) | BIT(EV_ABS); ++ for (i = BTN_LEFT; i <= BTN_TASK; i++) ++ set_bit(i, ptr->keybit); ++ ptr->relbit[0] = BIT(REL_X) | BIT(REL_Y); ++ input_set_abs_params(ptr, ABS_X, 0, XENFB_WIDTH, 0, 0); ++ input_set_abs_params(ptr, ABS_Y, 0, XENFB_HEIGHT, 0, 0); ++ ++ ret = input_register_device(ptr); ++ if (ret) { ++ input_free_device(ptr); ++ xenbus_dev_fatal(dev, ret, "input_register_device(ptr)"); ++ goto error; ++ } ++ info->ptr = ptr; ++ ++ ret = xenkbd_connect_backend(dev, info); ++ if (ret < 0) ++ goto error; ++ ++ return 0; ++ ++ error_nomem: ++ ret = -ENOMEM; ++ xenbus_dev_fatal(dev, ret, "allocating device memory"); ++ error: ++ xenkbd_remove(dev); ++ return ret; ++} ++ ++static int xenkbd_resume(struct xenbus_device *dev) ++{ ++ struct xenkbd_info *info = dev->dev.driver_data; ++ ++ xenkbd_disconnect_backend(info); ++ return xenkbd_connect_backend(dev, info); ++} ++ ++static int xenkbd_remove(struct xenbus_device *dev) ++{ ++ struct xenkbd_info *info = dev->dev.driver_data; ++ ++ xenkbd_disconnect_backend(info); ++ input_unregister_device(info->kbd); ++ input_unregister_device(info->ptr); ++ free_page((unsigned long)info->page); ++ kfree(info); ++ return 0; ++} ++ ++static int xenkbd_connect_backend(struct xenbus_device *dev, ++ struct xenkbd_info *info) ++{ ++ int ret; ++ struct xenbus_transaction xbt; ++ ++ ret = bind_listening_port_to_irqhandler( ++ dev->otherend_id, input_handler, 0, "xenkbd", info); ++ if (ret < 0) { ++ xenbus_dev_fatal(dev, ret, ++ "bind_listening_port_to_irqhandler"); ++ return ret; ++ } ++ info->irq = ret; ++ ++ again: ++ ret = xenbus_transaction_start(&xbt); ++ if (ret) { ++ xenbus_dev_fatal(dev, ret, "starting transaction"); ++ return ret; ++ } ++ ret = xenbus_printf(xbt, dev->nodename, "page-ref", "%lu", ++ virt_to_mfn(info->page)); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_printf(xbt, dev->nodename, "event-channel", "%u", ++ irq_to_evtchn_port(info->irq)); ++ if (ret) ++ goto error_xenbus; ++ ret = xenbus_transaction_end(xbt, 0); ++ if (ret) { ++ if (ret == -EAGAIN) ++ goto again; ++ xenbus_dev_fatal(dev, ret, "completing transaction"); ++ return ret; ++ } ++ ++ xenbus_switch_state(dev, XenbusStateInitialised); ++ return 0; ++ ++ error_xenbus: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, ret, "writing xenstore"); ++ return ret; ++} ++ ++static void xenkbd_disconnect_backend(struct xenkbd_info *info) ++{ ++ if (info->irq >= 0) ++ unbind_from_irqhandler(info->irq, info); ++ info->irq = -1; ++} 
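/*
 * The xenbus handshake both fbfront drivers follow, summarized from
 * the code above and below (backend states arrive via otherend_changed):
 *
 *   frontend probe:     write page-ref and event-channel, then switch
 *                       to XenbusStateInitialised
 *   backend InitWait:   frontend switches to Connected (xenkbd also
 *                       writes request-abs-pointer here if offered)
 *   backend Connected:  can arrive without an InitWait having been seen
 *                       if the backend moves fast enough; the dev->state
 *                       check below makes the transition idempotent
 *   backend Closing:    frontend acknowledges via xenbus_frontend_closed()
 */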
++ ++static void xenkbd_backend_changed(struct xenbus_device *dev, ++ enum xenbus_state backend_state) ++{ ++ struct xenkbd_info *info = dev->dev.driver_data; ++ int ret, val; ++ ++ switch (backend_state) { ++ case XenbusStateInitialising: ++ case XenbusStateInitialised: ++ case XenbusStateUnknown: ++ case XenbusStateClosed: ++ break; ++ ++ case XenbusStateInitWait: ++ InitWait: ++ ret = xenbus_scanf(XBT_NIL, info->xbdev->otherend, ++ "feature-abs-pointer", "%d", &val); ++ if (ret < 0) ++ val = 0; ++ if (val) { ++ ret = xenbus_printf(XBT_NIL, info->xbdev->nodename, ++ "request-abs-pointer", "1"); ++ if (ret) ++ ; /* FIXME */ ++ } ++ xenbus_switch_state(dev, XenbusStateConnected); ++ break; ++ ++ case XenbusStateConnected: ++ /* ++ * Work around xenbus race condition: If backend goes ++ * through InitWait to Connected fast enough, we can ++ * get Connected twice here. ++ */ ++ if (dev->state != XenbusStateConnected) ++ goto InitWait; /* no InitWait seen yet, fudge it */ ++ break; ++ ++ case XenbusStateClosing: ++ xenbus_frontend_closed(dev); ++ break; ++ } ++} ++ ++static struct xenbus_device_id xenkbd_ids[] = { ++ { "vkbd" }, ++ { "" } ++}; ++ ++static struct xenbus_driver xenkbd = { ++ .name = "vkbd", ++ .owner = THIS_MODULE, ++ .ids = xenkbd_ids, ++ .probe = xenkbd_probe, ++ .remove = xenkbd_remove, ++ .resume = xenkbd_resume, ++ .otherend_changed = xenkbd_backend_changed, ++}; ++ ++static int __init xenkbd_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ /* Nothing to do if running in dom0. */ ++ if (is_initial_xendomain()) ++ return -ENODEV; ++ ++ return xenbus_register_frontend(&xenkbd); ++} ++ ++static void __exit xenkbd_cleanup(void) ++{ ++ return xenbus_unregister_driver(&xenkbd); ++} ++ ++module_init(xenkbd_init); ++module_exit(xenkbd_cleanup); ++ ++MODULE_LICENSE("GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/gntdev/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1 @@ ++obj-y := gntdev.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/gntdev/gntdev.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,973 @@ ++/****************************************************************************** ++ * gntdev.c ++ * ++ * Device for accessing (in user-space) pages that have been granted by other ++ * domains. ++ * ++ * Copyright (c) 2006-2007, D G Murray. ++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License ++ * along with this program; if not, write to the Free Software ++ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++ */ ++ ++#include <asm/atomic.h> ++#include <linux/module.h> ++#include <linux/kernel.h> ++#include <linux/init.h> ++#include <linux/fs.h> ++#include <linux/device.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <asm/uaccess.h> ++#include <asm/io.h> ++#include <xen/gnttab.h> ++#include <asm/hypervisor.h> ++#include <xen/balloon.h> ++#include <xen/evtchn.h> ++#include <xen/driver_util.h> ++ ++#include <linux/types.h> ++#include <xen/public/gntdev.h> ++ ++ ++#define DRIVER_AUTHOR "Derek G. 
Murray <Derek.Murray@cl.cam.ac.uk>" ++#define DRIVER_DESC "User-space granted page access driver" ++ ++MODULE_LICENSE("GPL"); ++MODULE_AUTHOR(DRIVER_AUTHOR); ++MODULE_DESCRIPTION(DRIVER_DESC); ++ ++#define MAX_GRANTS 128 ++ ++/* A slot can be in one of three states: ++ * ++ * 0. GNTDEV_SLOT_INVALID: ++ * This slot is not associated with a grant reference, and is therefore free ++ * to be overwritten by a new grant reference. ++ * ++ * 1. GNTDEV_SLOT_NOT_YET_MAPPED: ++ * This slot is associated with a grant reference (via the ++ * IOCTL_GNTDEV_MAP_GRANT_REF ioctl), but it has not yet been mmap()-ed. ++ * ++ * 2. GNTDEV_SLOT_MAPPED: ++ * This slot is associated with a grant reference, and has been mmap()-ed. ++ */ ++typedef enum gntdev_slot_state { ++ GNTDEV_SLOT_INVALID = 0, ++ GNTDEV_SLOT_NOT_YET_MAPPED, ++ GNTDEV_SLOT_MAPPED ++} gntdev_slot_state_t; ++ ++#define GNTDEV_INVALID_HANDLE -1 ++#define GNTDEV_FREE_LIST_INVALID -1 ++/* Each opened instance of gntdev is associated with a list of grants, ++ * represented by an array of elements of the following type, ++ * gntdev_grant_info_t. ++ */ ++typedef struct gntdev_grant_info { ++ gntdev_slot_state_t state; ++ union { ++ uint32_t free_list_index; ++ struct { ++ domid_t domid; ++ grant_ref_t ref; ++ grant_handle_t kernel_handle; ++ grant_handle_t user_handle; ++ uint64_t dev_bus_addr; ++ } valid; ++ } u; ++} gntdev_grant_info_t; ++ ++/* Private data structure, which is stored in the file pointer for files ++ * associated with this device. ++ */ ++typedef struct gntdev_file_private_data { ++ ++ /* Array of grant information. */ ++ gntdev_grant_info_t grants[MAX_GRANTS]; ++ ++ /* Read/write semaphore used to protect the grants array. */ ++ struct rw_semaphore grants_sem; ++ ++ /* An array of indices of free slots in the grants array. ++ * N.B. An entry in this list may temporarily have the value ++ * GNTDEV_FREE_LIST_INVALID if the corresponding slot has been removed ++ * from the list by the contiguous allocator, but the list has not yet ++ * been compressed. However, this is not visible across invocations of ++ * the device. ++ */ ++ int32_t free_list[MAX_GRANTS]; ++ ++ /* The number of free slots in the grants array. */ ++ uint32_t free_list_size; ++ ++ /* Read/write semaphore used to protect the free list. */ ++ struct rw_semaphore free_list_sem; ++ ++ /* Index of the next slot after the most recent contiguous allocation, ++ * for use in a next-fit allocator. ++ */ ++ uint32_t next_fit_index; ++ ++ /* Used to map grants into the kernel, before mapping them into user ++ * space. ++ */ ++ struct page **foreign_pages; ++ ++} gntdev_file_private_data_t; ++ ++/* Module lifecycle operations. */ ++static int __init gntdev_init(void); ++static void __exit gntdev_exit(void); ++ ++module_init(gntdev_init); ++module_exit(gntdev_exit); ++ ++/* File operations. */ ++static int gntdev_open(struct inode *inode, struct file *flip); ++static int gntdev_release(struct inode *inode, struct file *flip); ++static int gntdev_mmap(struct file *flip, struct vm_area_struct *vma); ++static int gntdev_ioctl (struct inode *inode, struct file *flip, ++ unsigned int cmd, unsigned long arg); ++ ++static struct file_operations gntdev_fops = { ++ .owner = THIS_MODULE, ++ .open = gntdev_open, ++ .release = gntdev_release, ++ .mmap = gntdev_mmap, ++ .ioctl = gntdev_ioctl ++}; ++ ++/* VM operations. 
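Seen from user space, the slot/offset scheme described above works roughly as follows (a sketch under assumptions: the ioctl layout follows the usual xen/public/gntdev.h, where IOCTL_GNTDEV_MAP_GRANT_REF returns the mmap offset in an index field; the node is /dev/gntdev as created by gntdev_init() below; remote_domid and gref stand for the granting domain and its grant reference; one 4 KiB page; error handling omitted):

	struct ioctl_gntdev_map_grant_ref map = {
		.count = 1,
		.refs[0] = { .domid = remote_domid, .ref = gref },
	};
	int fd = open("/dev/gntdev", O_RDWR);
	ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &map);
	void *shared = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
	                    MAP_SHARED, fd, map.index);

map.index is the byte offset produced by add_grant_reference() below (slot_index << PAGE_SHIFT), so the vm_pgoff that gntdev_mmap() sees selects exactly the slots that were just populated.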
*/ ++static void gntdev_vma_close(struct vm_area_struct *vma); ++static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t *ptep, int is_fullmm); ++ ++static struct vm_operations_struct gntdev_vmops = { ++ .close = gntdev_vma_close, ++ .zap_pte = gntdev_clear_pte ++}; ++ ++/* Global variables. */ ++ ++/* The driver major number, for use when unregistering the driver. */ ++static int gntdev_major; ++ ++#define GNTDEV_NAME "gntdev" ++ ++/* Memory mapping functions ++ * ------------------------ ++ * ++ * Every granted page is mapped into both kernel and user space, and the two ++ * following functions return the respective virtual addresses of these pages. ++ * ++ * When shadow paging is disabled, the granted page is mapped directly into ++ * user space; when it is enabled, it is mapped into the kernel and remapped ++ * into user space using vm_insert_page() (see gntdev_mmap(), below). ++ */ ++ ++/* Returns the virtual address (in user space) of the @page_index'th page ++ * in the given VM area. ++ */ ++static inline unsigned long get_user_vaddr (struct vm_area_struct *vma, ++ int page_index) ++{ ++ return (unsigned long) vma->vm_start + (page_index << PAGE_SHIFT); ++} ++ ++/* Returns the virtual address (in kernel space) of the @slot_index'th page ++ * mapped by the gntdev instance that owns the given private data struct. ++ */ ++static inline unsigned long get_kernel_vaddr (gntdev_file_private_data_t *priv, ++ int slot_index) ++{ ++ unsigned long pfn; ++ void *kaddr; ++ pfn = page_to_pfn(priv->foreign_pages[slot_index]); ++ kaddr = pfn_to_kaddr(pfn); ++ return (unsigned long) kaddr; ++} ++ ++/* Helper functions. */ ++ ++/* Adds information about a grant reference to the list of grants in the file's ++ * private data structure. Returns non-zero on failure. On success, sets the ++ * value of *offset to the offset that should be mmap()-ed in order to map the ++ * grant reference. ++ */ ++static int add_grant_reference(struct file *flip, ++ struct ioctl_gntdev_grant_ref *op, ++ uint64_t *offset) ++{ ++ gntdev_file_private_data_t *private_data ++ = (gntdev_file_private_data_t *) flip->private_data; ++ ++ uint32_t slot_index; ++ ++ if (unlikely(private_data->free_list_size == 0)) { ++ return -ENOMEM; ++ } ++ ++ slot_index = private_data->free_list[--private_data->free_list_size]; ++ ++ /* Copy the grant information into file's private data. */ ++ private_data->grants[slot_index].state = GNTDEV_SLOT_NOT_YET_MAPPED; ++ private_data->grants[slot_index].u.valid.domid = op->domid; ++ private_data->grants[slot_index].u.valid.ref = op->ref; ++ ++ /* The offset is calculated as the index of the chosen entry in the ++ * file's private data's array of grant information. This is then ++ * shifted to give an offset into the virtual "file address space". ++ */ ++ *offset = slot_index << PAGE_SHIFT; ++ ++ return 0; ++} ++ ++/* Adds the @count grant references to the contiguous range in the slot array ++ * beginning at @first_slot. It is assumed that @first_slot was returned by a ++ * previous invocation of find_contiguous_free_range(), during the same ++ * invocation of the driver. ++ */ ++static int add_grant_references(struct file *flip, ++ int count, ++ struct ioctl_gntdev_grant_ref *ops, ++ uint32_t first_slot) ++{ ++ gntdev_file_private_data_t *private_data ++ = (gntdev_file_private_data_t *) flip->private_data; ++ int i; ++ ++ for (i = 0; i < count; ++i) { ++ ++ /* First, mark the slot's entry in the free list as invalid. 
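This tombstoning is why compress_free_list() below exists: the contiguous allocator consumes slots from the middle of the free list, and shifting the array down for every removal would cost a pass per slot, so entries are merely marked GNTDEV_FREE_LIST_INVALID here and the list is compacted in a single later pass.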
*/ ++ int free_list_index = ++ private_data->grants[first_slot+i].u.free_list_index; ++ private_data->free_list[free_list_index] = ++ GNTDEV_FREE_LIST_INVALID; ++ ++ /* Now, update the slot. */ ++ private_data->grants[first_slot+i].state = ++ GNTDEV_SLOT_NOT_YET_MAPPED; ++ private_data->grants[first_slot+i].u.valid.domid = ++ ops[i].domid; ++ private_data->grants[first_slot+i].u.valid.ref = ops[i].ref; ++ } ++ ++ return 0; ++} ++ ++/* Scans through the free list for @flip, removing entries that are marked as ++ * GNTDEV_SLOT_INVALID. This will reduce the recorded size of the free list to ++ * the number of valid entries. ++ */ ++static void compress_free_list(struct file *flip) ++{ ++ gntdev_file_private_data_t *private_data ++ = (gntdev_file_private_data_t *) flip->private_data; ++ int i, j = 0, old_size; ++ ++ old_size = private_data->free_list_size; ++ for (i = 0; i < old_size; ++i) { ++ if (private_data->free_list[i] != GNTDEV_FREE_LIST_INVALID) { ++ private_data->free_list[j] = ++ private_data->free_list[i]; ++ ++j; ++ } else { ++ --private_data->free_list_size; ++ } ++ } ++} ++ ++/* Searches the grant array in the private data of @flip for a range of ++ * @num_slots contiguous slots in the GNTDEV_SLOT_INVALID state. ++ * ++ * Returns the index of the first slot if a range is found, otherwise -ENOMEM. ++ */ ++static int find_contiguous_free_range(struct file *flip, ++ uint32_t num_slots) ++{ ++ gntdev_file_private_data_t *private_data ++ = (gntdev_file_private_data_t *) flip->private_data; ++ ++ int i; ++ int start_index = private_data->next_fit_index; ++ int range_start = 0, range_length; ++ ++ if (private_data->free_list_size < num_slots) { ++ return -ENOMEM; ++ } ++ ++ /* First search from the start_index to the end of the array. */ ++ range_length = 0; ++ for (i = start_index; i < MAX_GRANTS; ++i) { ++ if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { ++ if (range_length == 0) { ++ range_start = i; ++ } ++ ++range_length; ++ if (range_length == num_slots) { ++ return range_start; ++ } ++ } ++ } ++ ++ /* Now search from the start of the array to the start_index. */ ++ range_length = 0; ++ for (i = 0; i < start_index; ++i) { ++ if (private_data->grants[i].state == GNTDEV_SLOT_INVALID) { ++ if (range_length == 0) { ++ range_start = i; ++ } ++ ++range_length; ++ if (range_length == num_slots) { ++ return range_start; ++ } ++ } ++ } ++ ++ return -ENOMEM; ++} ++ ++/* Interface functions. */ ++ ++/* Initialises the driver. Called when the module is loaded. */ ++static int __init gntdev_init(void) ++{ ++ struct class *class; ++ struct class_device *device; ++ ++ if (!is_running_on_xen()) { ++ printk(KERN_ERR "You must be running Xen to use gntdev\n"); ++ return -ENODEV; ++ } ++ ++ gntdev_major = register_chrdev(0, GNTDEV_NAME, &gntdev_fops); ++ if (gntdev_major < 0) ++ { ++ printk(KERN_ERR "Could not register gntdev device\n"); ++ return -ENOMEM; ++ } ++ ++ /* Note that if the sysfs code fails, we will still initialise the ++ * device, and output the major number so that the device can be ++ * created manually using mknod. 
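Concretely, if the printk below reported major 254 (an example value only), the node would be created with: mknod /dev/gntdev c 254 0 — the minor is always 0, matching the MKDEV(gntdev_major, 0) used at class_device_create() time.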
++ */ ++ if ((class = get_xen_class()) == NULL) { ++ printk(KERN_ERR "Error setting up xen_class\n"); ++ printk(KERN_ERR "gntdev created with major number = %d\n", ++ gntdev_major); ++ return 0; ++ } ++ ++ device = class_device_create(class, NULL, MKDEV(gntdev_major, 0), ++ NULL, GNTDEV_NAME); ++ if (IS_ERR(device)) { ++ printk(KERN_ERR "Error creating gntdev device in xen_class\n"); ++ printk(KERN_ERR "gntdev created with major number = %d\n", ++ gntdev_major); ++ return 0; ++ } ++ ++ return 0; ++} ++ ++/* Cleans up and unregisters the driver. Called when the driver is unloaded. ++ */ ++static void __exit gntdev_exit(void) ++{ ++ struct class *class; ++ if ((class = get_xen_class()) != NULL) ++ class_device_destroy(class, MKDEV(gntdev_major, 0)); ++ unregister_chrdev(gntdev_major, GNTDEV_NAME); ++} ++ ++/* Called when the device is opened. */ ++static int gntdev_open(struct inode *inode, struct file *flip) ++{ ++ gntdev_file_private_data_t *private_data; ++ int i; ++ ++ try_module_get(THIS_MODULE); ++ ++ /* Allocate space for the per-instance private data. */ ++ private_data = kmalloc(sizeof(*private_data), GFP_KERNEL); ++ if (!private_data) ++ goto nomem_out; ++ ++ /* Allocate space for the kernel-mapping of granted pages. */ ++ private_data->foreign_pages = ++ alloc_empty_pages_and_pagevec(MAX_GRANTS); ++ if (!private_data->foreign_pages) ++ goto nomem_out2; ++ ++ /* Initialise the free-list, which contains all slots at first. ++ */ ++ for (i = 0; i < MAX_GRANTS; ++i) { ++ private_data->free_list[MAX_GRANTS - i - 1] = i; ++ private_data->grants[i].state = GNTDEV_SLOT_INVALID; ++ private_data->grants[i].u.free_list_index = MAX_GRANTS - i - 1; ++ } ++ private_data->free_list_size = MAX_GRANTS; ++ private_data->next_fit_index = 0; ++ ++ init_rwsem(&private_data->grants_sem); ++ init_rwsem(&private_data->free_list_sem); ++ ++ flip->private_data = private_data; ++ ++ return 0; ++ ++nomem_out2: ++ kfree(private_data); ++nomem_out: ++ return -ENOMEM; ++} ++ ++/* Called when the device is closed. ++ */ ++static int gntdev_release(struct inode *inode, struct file *flip) ++{ ++ if (flip->private_data) { ++ gntdev_file_private_data_t *private_data = ++ (gntdev_file_private_data_t *) flip->private_data; ++ if (private_data->foreign_pages) { ++ free_empty_pages_and_pagevec ++ (private_data->foreign_pages, MAX_GRANTS); ++ } ++ kfree(private_data); ++ } ++ module_put(THIS_MODULE); ++ return 0; ++} ++ ++/* Called when an attempt is made to mmap() the device. The private data from ++ * @flip contains the list of grant references that can be mapped. The vm_pgoff ++ * field of @vma contains the index into that list that refers to the grant ++ * reference that will be mapped. Only mappings that are a multiple of ++ * PAGE_SIZE are handled. 
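++ *
++ * A minimal sketch of the intended calling sequence from user space,
++ * assuming 'fd' is an open descriptor on this device (illustrative
++ * only; error handling omitted):
++ *
++ *     struct ioctl_gntdev_map_grant_ref op;
++ *     op.count = 1;
++ *     op.refs[0].domid = <granting domain id>;
++ *     op.refs[0].ref = <grant reference>;
++ *     ioctl(fd, IOCTL_GNTDEV_MAP_GRANT_REF, &op);
++ *     addr = mmap(NULL, PAGE_SIZE, PROT_READ | PROT_WRITE,
++ *                 MAP_SHARED, fd, op.index);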
++ */
++static int gntdev_mmap (struct file *flip, struct vm_area_struct *vma)
++{
++ struct gnttab_map_grant_ref op;
++ unsigned long slot_index = vma->vm_pgoff;
++ unsigned long kernel_vaddr, user_vaddr;
++ uint32_t size = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT;
++ uint64_t ptep;
++ int ret;
++ int flags;
++ int i;
++ struct page *page;
++ gntdev_file_private_data_t *private_data = flip->private_data;
++
++ if (unlikely(!private_data)) {
++ printk(KERN_ERR "File's private data is NULL.\n");
++ return -EINVAL;
++ }
++
++ if (unlikely((size == 0) || (size + slot_index) > MAX_GRANTS)) {
++ printk(KERN_ERR "Invalid number of pages or offset "
++ "(num_pages = %u, first_slot = %lu).\n",
++ size, slot_index);
++ return -ENXIO;
++ }
++
++ if ((vma->vm_flags & VM_WRITE) && !(vma->vm_flags & VM_SHARED)) {
++ printk(KERN_ERR "Writable mappings must be shared.\n");
++ return -EINVAL;
++ }
++
++ /* Slots must be in the NOT_YET_MAPPED state. */
++ down_write(&private_data->grants_sem);
++ for (i = 0; i < size; ++i) {
++ if (private_data->grants[slot_index + i].state !=
++ GNTDEV_SLOT_NOT_YET_MAPPED) {
++ printk(KERN_ERR "Slot (index = %lu) is in the wrong "
++ "state (%d).\n", slot_index + i,
++ private_data->grants[slot_index + i].state);
++ up_write(&private_data->grants_sem);
++ return -EINVAL;
++ }
++ }
++
++ /* Install the hook for unmapping. */
++ vma->vm_ops = &gntdev_vmops;
++
++ /* The VM area contains pages from another VM. */
++ vma->vm_flags |= VM_FOREIGN;
++ vma->vm_private_data = kzalloc(size * sizeof(struct page *),
++ GFP_KERNEL);
++ if (vma->vm_private_data == NULL) {
++ printk(KERN_ERR "Couldn't allocate mapping structure for VM "
++ "area.\n");
++ up_write(&private_data->grants_sem);
++ return -ENOMEM;
++ }
++
++ /* This flag prevents Bad PTE errors when the memory is unmapped. */
++ vma->vm_flags |= VM_RESERVED;
++
++ /* This flag prevents this VM area being copied on a fork(). A better
++ * behaviour might be to explicitly carry out the appropriate mappings
++ * on fork(), but I don't know if there's a hook for this.
++ */
++ vma->vm_flags |= VM_DONTCOPY;
++
++#ifdef CONFIG_X86
++ /* This flag ensures that the page tables are not unpinned before the
++ * VM area is unmapped. Therefore Xen still recognises the PTE as
++ * belonging to an L1 pagetable, and the grant unmap operation will
++ * succeed, even if the process does not exit cleanly.
++ */
++ vma->vm_mm->context.has_foreign_mappings = 1;
++#endif
++
++ for (i = 0; i < size; ++i) {
++
++ flags = GNTMAP_host_map;
++ if (!(vma->vm_flags & VM_WRITE))
++ flags |= GNTMAP_readonly;
++
++ kernel_vaddr = get_kernel_vaddr(private_data, slot_index + i);
++ user_vaddr = get_user_vaddr(vma, i);
++ page = pfn_to_page(__pa(kernel_vaddr) >> PAGE_SHIFT);
++
++ gnttab_set_map_op(&op, kernel_vaddr, flags,
++ private_data->grants[slot_index+i]
++ .u.valid.ref,
++ private_data->grants[slot_index+i]
++ .u.valid.domid);
++
++ /* Carry out the mapping of the grant reference. */
++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
++ &op, 1);
++ BUG_ON(ret);
++ if (op.status) {
++ printk(KERN_ERR "Error mapping the grant reference "
++ "into the kernel (%d). domid = %d; ref = %d\n",
++ op.status,
++ private_data->grants[slot_index+i]
++ .u.valid.domid,
++ private_data->grants[slot_index+i]
++ .u.valid.ref);
++ goto undo_map_out;
++ }
++
++ /* Store a reference to the page that will be mapped into user
++ * space.
++ */
++ ((struct page **) vma->vm_private_data)[i] = page;
++
++ /* Mark mapped page as reserved.
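++ * PG_reserved stops the core VM from treating this foreign frame as
++ * ordinary reclaimable memory while it is mapped into user space.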
*/ ++ SetPageReserved(page); ++ ++ /* Record the grant handle, for use in the unmap operation. */ ++ private_data->grants[slot_index+i].u.valid.kernel_handle = ++ op.handle; ++ private_data->grants[slot_index+i].u.valid.dev_bus_addr = ++ op.dev_bus_addr; ++ ++ private_data->grants[slot_index+i].state = GNTDEV_SLOT_MAPPED; ++ private_data->grants[slot_index+i].u.valid.user_handle = ++ GNTDEV_INVALID_HANDLE; ++ ++ /* Now perform the mapping to user space. */ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ ++ /* NOT USING SHADOW PAGE TABLES. */ ++ /* In this case, we map the grant(s) straight into user ++ * space. ++ */ ++ ++ /* Get the machine address of the PTE for the user ++ * page. ++ */ ++ if ((ret = create_lookup_pte_addr(vma->vm_mm, ++ vma->vm_start ++ + (i << PAGE_SHIFT), ++ &ptep))) ++ { ++ printk(KERN_ERR "Error obtaining PTE pointer " ++ "(%d).\n", ret); ++ goto undo_map_out; ++ } ++ ++ /* Configure the map operation. */ ++ ++ /* The reference is to be used by host CPUs. */ ++ flags = GNTMAP_host_map; ++ ++ /* Specifies a user space mapping. */ ++ flags |= GNTMAP_application_map; ++ ++ /* The map request contains the machine address of the ++ * PTE to update. ++ */ ++ flags |= GNTMAP_contains_pte; ++ ++ if (!(vma->vm_flags & VM_WRITE)) ++ flags |= GNTMAP_readonly; ++ ++ gnttab_set_map_op(&op, ptep, flags, ++ private_data->grants[slot_index+i] ++ .u.valid.ref, ++ private_data->grants[slot_index+i] ++ .u.valid.domid); ++ ++ /* Carry out the mapping of the grant reference. */ ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ &op, 1); ++ BUG_ON(ret); ++ if (op.status) { ++ printk(KERN_ERR "Error mapping the grant " ++ "reference into user space (%d). domid " ++ "= %d; ref = %d\n", op.status, ++ private_data->grants[slot_index+i].u ++ .valid.domid, ++ private_data->grants[slot_index+i].u ++ .valid.ref); ++ goto undo_map_out; ++ } ++ ++ /* Record the grant handle, for use in the unmap ++ * operation. ++ */ ++ private_data->grants[slot_index+i].u. ++ valid.user_handle = op.handle; ++ ++ /* Update p2m structure with the new mapping. */ ++ set_phys_to_machine(__pa(kernel_vaddr) >> PAGE_SHIFT, ++ FOREIGN_FRAME(private_data-> ++ grants[slot_index+i] ++ .u.valid.dev_bus_addr ++ >> PAGE_SHIFT)); ++ } else { ++ /* USING SHADOW PAGE TABLES. */ ++ /* In this case, we simply insert the page into the VM ++ * area. */ ++ ret = vm_insert_page(vma, user_vaddr, page); ++ } ++ ++ } ++ ++ up_write(&private_data->grants_sem); ++ return 0; ++ ++undo_map_out: ++ /* If we have a mapping failure, the unmapping will be taken care of ++ * by do_mmap_pgoff(), which will eventually call gntdev_clear_pte(). ++ * All we need to do here is free the vma_private_data. ++ */ ++ kfree(vma->vm_private_data); ++ ++ /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file ++ * to NULL on failure. However, we need this in gntdev_clear_pte() to ++ * unmap the grants. Therefore, we smuggle a reference to the file's ++ * private data in the VM area's private data pointer. ++ */ ++ vma->vm_private_data = private_data; ++ ++ up_write(&private_data->grants_sem); ++ ++ return -ENOMEM; ++} ++ ++static pte_t gntdev_clear_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t *ptep, int is_fullmm) ++{ ++ int slot_index, ret; ++ pte_t copy; ++ struct gnttab_unmap_grant_ref op; ++ gntdev_file_private_data_t *private_data; ++ ++ /* THIS IS VERY UNPLEASANT: do_mmap_pgoff() will set the vma->vm_file ++ * to NULL on failure. However, we need this in gntdev_clear_pte() to ++ * unmap the grants. 
Therefore, we smuggle a reference to the file's ++ * private data in the VM area's private data pointer. ++ */ ++ if (vma->vm_file) { ++ private_data = (gntdev_file_private_data_t *) ++ vma->vm_file->private_data; ++ } else if (vma->vm_private_data) { ++ private_data = (gntdev_file_private_data_t *) ++ vma->vm_private_data; ++ } else { ++ private_data = NULL; /* gcc warning */ ++ BUG(); ++ } ++ ++ /* Copy the existing value of the PTE for returning. */ ++ copy = *ptep; ++ ++ /* Calculate the grant relating to this PTE. */ ++ slot_index = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT); ++ ++ /* Only unmap grants if the slot has been mapped. This could be being ++ * called from a failing mmap(). ++ */ ++ if (private_data->grants[slot_index].state == GNTDEV_SLOT_MAPPED) { ++ ++ /* First, we clear the user space mapping, if it has been made. ++ */ ++ if (private_data->grants[slot_index].u.valid.user_handle != ++ GNTDEV_INVALID_HANDLE && ++ !xen_feature(XENFEAT_auto_translated_physmap)) { ++ /* NOT USING SHADOW PAGE TABLES. */ ++ gnttab_set_unmap_op(&op, virt_to_machine(ptep), ++ GNTMAP_contains_pte, ++ private_data->grants[slot_index] ++ .u.valid.user_handle); ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, &op, 1); ++ BUG_ON(ret); ++ if (op.status) ++ printk("User unmap grant status = %d\n", ++ op.status); ++ } else { ++ /* USING SHADOW PAGE TABLES. */ ++ pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); ++ } ++ ++ /* Finally, we unmap the grant from kernel space. */ ++ gnttab_set_unmap_op(&op, ++ get_kernel_vaddr(private_data, slot_index), ++ GNTMAP_host_map, ++ private_data->grants[slot_index].u.valid ++ .kernel_handle); ++ ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, ++ &op, 1); ++ BUG_ON(ret); ++ if (op.status) ++ printk("Kernel unmap grant status = %d\n", op.status); ++ ++ ++ /* Return slot to the not-yet-mapped state, so that it may be ++ * mapped again, or removed by a subsequent ioctl. ++ */ ++ private_data->grants[slot_index].state = ++ GNTDEV_SLOT_NOT_YET_MAPPED; ++ ++ /* Invalidate the physical to machine mapping for this page. */ ++ set_phys_to_machine(__pa(get_kernel_vaddr(private_data, ++ slot_index)) ++ >> PAGE_SHIFT, INVALID_P2M_ENTRY); ++ ++ } else { ++ pte_clear_full(vma->vm_mm, addr, ptep, is_fullmm); ++ } ++ ++ return copy; ++} ++ ++/* "Destructor" for a VM area. ++ */ ++static void gntdev_vma_close(struct vm_area_struct *vma) { ++ if (vma->vm_private_data) { ++ kfree(vma->vm_private_data); ++ } ++} ++ ++/* Called when an ioctl is made on the device. 
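++ * Three commands are handled: IOCTL_GNTDEV_MAP_GRANT_REF reserves
++ * slots for a batch of grant references and returns the offset at
++ * which they should be mmap()-ed; IOCTL_GNTDEV_UNMAP_GRANT_REF
++ * returns slots to the free list; IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR
++ * translates a mapped virtual address back into its device offset.
++ * Anything else is rejected with -ENOIOCTLCMD.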
++ */ ++static int gntdev_ioctl(struct inode *inode, struct file *flip, ++ unsigned int cmd, unsigned long arg) ++{ ++ int rc = 0; ++ gntdev_file_private_data_t *private_data = ++ (gntdev_file_private_data_t *) flip->private_data; ++ ++ switch (cmd) { ++ case IOCTL_GNTDEV_MAP_GRANT_REF: ++ { ++ struct ioctl_gntdev_map_grant_ref op; ++ down_write(&private_data->grants_sem); ++ down_write(&private_data->free_list_sem); ++ ++ if ((rc = copy_from_user(&op, (void __user *) arg, ++ sizeof(op)))) { ++ rc = -EFAULT; ++ goto map_out; ++ } ++ if (unlikely(op.count <= 0)) { ++ rc = -EINVAL; ++ goto map_out; ++ } ++ ++ if (op.count == 1) { ++ if ((rc = add_grant_reference(flip, &op.refs[0], ++ &op.index)) < 0) { ++ printk(KERN_ERR "Adding grant reference " ++ "failed (%d).\n", rc); ++ goto map_out; ++ } ++ } else { ++ struct ioctl_gntdev_grant_ref *refs, *u; ++ refs = kmalloc(op.count * sizeof(*refs), GFP_KERNEL); ++ if (!refs) { ++ rc = -ENOMEM; ++ goto map_out; ++ } ++ u = ((struct ioctl_gntdev_map_grant_ref *)arg)->refs; ++ if ((rc = copy_from_user(refs, ++ (void __user *)u, ++ sizeof(*refs) * op.count))) { ++ printk(KERN_ERR "Copying refs from user failed" ++ " (%d).\n", rc); ++ rc = -EINVAL; ++ goto map_out; ++ } ++ if ((rc = find_contiguous_free_range(flip, op.count)) ++ < 0) { ++ printk(KERN_ERR "Finding contiguous range " ++ "failed (%d).\n", rc); ++ kfree(refs); ++ goto map_out; ++ } ++ op.index = rc << PAGE_SHIFT; ++ if ((rc = add_grant_references(flip, op.count, ++ refs, rc))) { ++ printk(KERN_ERR "Adding grant references " ++ "failed (%d).\n", rc); ++ kfree(refs); ++ goto map_out; ++ } ++ compress_free_list(flip); ++ kfree(refs); ++ } ++ if ((rc = copy_to_user((void __user *) arg, ++ &op, ++ sizeof(op)))) { ++ printk(KERN_ERR "Copying result back to user failed " ++ "(%d)\n", rc); ++ rc = -EFAULT; ++ goto map_out; ++ } ++ map_out: ++ up_write(&private_data->grants_sem); ++ up_write(&private_data->free_list_sem); ++ return rc; ++ } ++ case IOCTL_GNTDEV_UNMAP_GRANT_REF: ++ { ++ struct ioctl_gntdev_unmap_grant_ref op; ++ int i, start_index; ++ ++ down_write(&private_data->grants_sem); ++ down_write(&private_data->free_list_sem); ++ ++ if ((rc = copy_from_user(&op, ++ (void __user *) arg, ++ sizeof(op)))) { ++ rc = -EFAULT; ++ goto unmap_out; ++ } ++ ++ start_index = op.index >> PAGE_SHIFT; ++ ++ /* First, check that all pages are in the NOT_YET_MAPPED ++ * state. ++ */ ++ for (i = 0; i < op.count; ++i) { ++ if (unlikely ++ (private_data->grants[start_index + i].state ++ != GNTDEV_SLOT_NOT_YET_MAPPED)) { ++ if (private_data->grants[start_index + i].state ++ == GNTDEV_SLOT_INVALID) { ++ printk(KERN_ERR ++ "Tried to remove an invalid " ++ "grant at offset 0x%x.", ++ (start_index + i) ++ << PAGE_SHIFT); ++ rc = -EINVAL; ++ } else { ++ printk(KERN_ERR ++ "Tried to remove a grant which " ++ "is currently mmap()-ed at " ++ "offset 0x%x.", ++ (start_index + i) ++ << PAGE_SHIFT); ++ rc = -EBUSY; ++ } ++ goto unmap_out; ++ } ++ } ++ ++ /* Unmap pages and add them to the free list. 
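++ * No grant-table operation is needed here: the loop above verified
++ * that every slot is NOT_YET_MAPPED, so any earlier mapping has
++ * already been torn down by gntdev_clear_pte(). The free list is
++ * then re-compressed to squeeze out entries that were poisoned by
++ * add_grant_references().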
++ */ ++ for (i = 0; i < op.count; ++i) { ++ private_data->grants[start_index+i].state = ++ GNTDEV_SLOT_INVALID; ++ private_data->grants[start_index+i].u.free_list_index = ++ private_data->free_list_size; ++ private_data->free_list[private_data->free_list_size] = ++ start_index + i; ++ ++private_data->free_list_size; ++ } ++ compress_free_list(flip); ++ ++ unmap_out: ++ up_write(&private_data->grants_sem); ++ up_write(&private_data->free_list_sem); ++ return rc; ++ } ++ case IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR: ++ { ++ struct ioctl_gntdev_get_offset_for_vaddr op; ++ struct vm_area_struct *vma; ++ unsigned long vaddr; ++ ++ if ((rc = copy_from_user(&op, ++ (void __user *) arg, ++ sizeof(op)))) { ++ rc = -EFAULT; ++ goto get_offset_out; ++ } ++ vaddr = (unsigned long)op.vaddr; ++ ++ down_read(¤t->mm->mmap_sem); ++ vma = find_vma(current->mm, vaddr); ++ if (vma == NULL) { ++ rc = -EFAULT; ++ goto get_offset_unlock_out; ++ } ++ if ((!vma->vm_ops) || (vma->vm_ops != &gntdev_vmops)) { ++ printk(KERN_ERR "The vaddr specified does not belong " ++ "to a gntdev instance: %#lx\n", vaddr); ++ rc = -EFAULT; ++ goto get_offset_unlock_out; ++ } ++ if (vma->vm_start != vaddr) { ++ printk(KERN_ERR "The vaddr specified in an " ++ "IOCTL_GNTDEV_GET_OFFSET_FOR_VADDR must be at " ++ "the start of the VM area. vma->vm_start = " ++ "%#lx; vaddr = %#lx\n", ++ vma->vm_start, vaddr); ++ rc = -EFAULT; ++ goto get_offset_unlock_out; ++ } ++ op.offset = vma->vm_pgoff << PAGE_SHIFT; ++ op.count = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; ++ up_read(¤t->mm->mmap_sem); ++ if ((rc = copy_to_user((void __user *) arg, ++ &op, ++ sizeof(op)))) { ++ rc = -EFAULT; ++ goto get_offset_out; ++ } ++ goto get_offset_out; ++ get_offset_unlock_out: ++ up_read(¤t->mm->mmap_sem); ++ get_offset_out: ++ return rc; ++ } ++ default: ++ return -ENOIOCTLCMD; ++ } ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,5 @@ ++obj-$(CONFIG_XEN_NETDEV_BACKEND) := netbk.o ++obj-$(CONFIG_XEN_NETDEV_LOOPBACK) += netloop.o ++ ++netbk-y := netback.o xenbus.o interface.o ++netloop-y := loopback.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/common.h 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,157 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/common.h ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. 
++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef __NETIF__BACKEND__COMMON_H__ ++#define __NETIF__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <linux/ip.h> ++#include <linux/in.h> ++#include <linux/netdevice.h> ++#include <linux/etherdevice.h> ++#include <linux/wait.h> ++#include <xen/evtchn.h> ++#include <xen/interface/io/netif.h> ++#include <asm/io.h> ++#include <asm/pgalloc.h> ++#include <xen/interface/grant_table.h> ++#include <xen/gnttab.h> ++#include <xen/driver_util.h> ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++#define IPRINTK(fmt, args...) \ ++ printk(KERN_INFO "xen_net: " fmt, ##args) ++#define WPRINTK(fmt, args...) \ ++ printk(KERN_WARNING "xen_net: " fmt, ##args) ++ ++typedef struct netif_st { ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ ++ u8 fe_dev_addr[6]; ++ ++ /* Physical parameters of the comms window. */ ++ grant_handle_t tx_shmem_handle; ++ grant_ref_t tx_shmem_ref; ++ grant_handle_t rx_shmem_handle; ++ grant_ref_t rx_shmem_ref; ++ unsigned int irq; ++ ++ /* The shared rings and indexes. */ ++ netif_tx_back_ring_t tx; ++ netif_rx_back_ring_t rx; ++ struct vm_struct *tx_comms_area; ++ struct vm_struct *rx_comms_area; ++ ++ /* Set of features that can be turned on in dev->features. */ ++ int features; ++ ++ /* Internal feature information. */ ++ int can_queue:1; /* can queue packets for receiver? */ ++ int copying_receiver:1; /* copy packets to receiver? */ ++ ++ /* Allow netif_be_start_xmit() to peek ahead in the rx request ring. */ ++ RING_IDX rx_req_cons_peek; ++ ++ /* Transmit shaping: allow 'credit_bytes' every 'credit_usec'. */ ++ unsigned long credit_bytes; ++ unsigned long credit_usec; ++ unsigned long remaining_credit; ++ struct timer_list credit_timeout; ++ ++ /* Enforce draining of the transmit queue. */ ++ struct timer_list tx_queue_timeout; ++ ++ /* Miscellaneous private stuff. */ ++ struct list_head list; /* scheduling list */ ++ atomic_t refcnt; ++ struct net_device *dev; ++ struct net_device_stats stats; ++ ++ unsigned int carrier; ++ ++ wait_queue_head_t waiting_to_free; ++} netif_t; ++ ++/* ++ * Implement our own carrier flag: the network stack's version causes delays ++ * when the carrier is re-enabled (in particular, dev_activate() may not ++ * immediately be called, which can cause packet loss; also the etherbridge ++ * can be rather lazy in activating its port). 
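++ * netif_schedulable() below checks this flag, together with
++ * netif_running(), before any work is scheduled for an interface.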
++ */ ++#define netback_carrier_on(netif) ((netif)->carrier = 1) ++#define netback_carrier_off(netif) ((netif)->carrier = 0) ++#define netback_carrier_ok(netif) ((netif)->carrier) ++ ++#define NET_TX_RING_SIZE __RING_SIZE((netif_tx_sring_t *)0, PAGE_SIZE) ++#define NET_RX_RING_SIZE __RING_SIZE((netif_rx_sring_t *)0, PAGE_SIZE) ++ ++void netif_disconnect(netif_t *netif); ++ ++netif_t *netif_alloc(domid_t domid, unsigned int handle); ++int netif_map(netif_t *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn); ++ ++#define netif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define netif_put(_b) \ ++ do { \ ++ if ( atomic_dec_and_test(&(_b)->refcnt) ) \ ++ wake_up(&(_b)->waiting_to_free); \ ++ } while (0) ++ ++void netif_xenbus_init(void); ++ ++#define netif_schedulable(netif) \ ++ (netif_running((netif)->dev) && netback_carrier_ok(netif)) ++ ++void netif_schedule_work(netif_t *netif); ++void netif_deschedule_work(netif_t *netif); ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev); ++struct net_device_stats *netif_be_get_stats(struct net_device *dev); ++irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs); ++ ++static inline int netbk_can_queue(struct net_device *dev) ++{ ++ netif_t *netif = netdev_priv(dev); ++ return netif->can_queue; ++} ++ ++static inline int netbk_can_sg(struct net_device *dev) ++{ ++ netif_t *netif = netdev_priv(dev); ++ return netif->features & NETIF_F_SG; ++} ++ ++#endif /* __NETIF__BACKEND__COMMON_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/interface.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,336 @@ ++/****************************************************************************** ++ * arch/xen/drivers/netif/backend/interface.c ++ * ++ * Network-device interface management. ++ * ++ * Copyright (c) 2004-2005, Keir Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++#include <linux/ethtool.h> ++#include <linux/rtnetlink.h> ++ ++/* ++ * Module parameter 'queue_length': ++ * ++ * Enables queuing in the network stack when a client has run out of receive ++ * descriptors. 
Although this feature can improve receive bandwidth by avoiding ++ * packet loss, it can also result in packets sitting in the 'tx_queue' for ++ * unbounded time. This is bad if those packets hold onto foreign resources. ++ * For example, consider a packet that holds onto resources belonging to the ++ * guest for which it is queued (e.g., packet received on vif1.0, destined for ++ * vif1.1 which is not activated in the guest): in this situation the guest ++ * will never be destroyed, unless vif1.1 is taken down. To avoid this, we ++ * run a timer (tx_queue_timeout) to drain the queue when the interface is ++ * blocked. ++ */ ++static unsigned long netbk_queue_length = 32; ++module_param_named(queue_length, netbk_queue_length, ulong, 0); ++ ++static void __netif_up(netif_t *netif) ++{ ++ enable_irq(netif->irq); ++ netif_schedule_work(netif); ++} ++ ++static void __netif_down(netif_t *netif) ++{ ++ disable_irq(netif->irq); ++ netif_deschedule_work(netif); ++} ++ ++static int net_open(struct net_device *dev) ++{ ++ netif_t *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) { ++ __netif_up(netif); ++ netif_start_queue(dev); ++ } ++ return 0; ++} ++ ++static int net_close(struct net_device *dev) ++{ ++ netif_t *netif = netdev_priv(dev); ++ if (netback_carrier_ok(netif)) ++ __netif_down(netif); ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++static int netbk_change_mtu(struct net_device *dev, int mtu) ++{ ++ int max = netbk_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN; ++ ++ if (mtu > max) ++ return -EINVAL; ++ dev->mtu = mtu; ++ return 0; ++} ++ ++static int netbk_set_sg(struct net_device *dev, u32 data) ++{ ++ if (data) { ++ netif_t *netif = netdev_priv(dev); ++ ++ if (!(netif->features & NETIF_F_SG)) ++ return -ENOSYS; ++ } ++ ++ return ethtool_op_set_sg(dev, data); ++} ++ ++static int netbk_set_tso(struct net_device *dev, u32 data) ++{ ++ if (data) { ++ netif_t *netif = netdev_priv(dev); ++ ++ if (!(netif->features & NETIF_F_TSO)) ++ return -ENOSYS; ++ } ++ ++ return ethtool_op_set_tso(dev, data); ++} ++ ++static struct ethtool_ops network_ethtool_ops = ++{ ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = ethtool_op_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = netbk_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = netbk_set_tso, ++ .get_link = ethtool_op_get_link, ++}; ++ ++netif_t *netif_alloc(domid_t domid, unsigned int handle) ++{ ++ int err = 0; ++ struct net_device *dev; ++ netif_t *netif; ++ char name[IFNAMSIZ] = {}; ++ ++ snprintf(name, IFNAMSIZ - 1, "vif%u.%u", domid, handle); ++ dev = alloc_netdev(sizeof(netif_t), name, ether_setup); ++ if (dev == NULL) { ++ DPRINTK("Could not create netif: out of memory\n"); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ netif = netdev_priv(dev); ++ memset(netif, 0, sizeof(*netif)); ++ netif->domid = domid; ++ netif->handle = handle; ++ atomic_set(&netif->refcnt, 1); ++ init_waitqueue_head(&netif->waiting_to_free); ++ netif->dev = dev; ++ ++ netback_carrier_off(netif); ++ ++ netif->credit_bytes = netif->remaining_credit = ~0UL; ++ netif->credit_usec = 0UL; ++ init_timer(&netif->credit_timeout); ++ /* Initialize 'expires' now: it's used to track the credit window. 
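++ * (The shaper grants 'credit_bytes' of traffic per 'credit_usec';
++ * the defaults set above - ~0UL bytes every 0 usec - effectively
++ * disable shaping until real values are configured.)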
*/ ++ netif->credit_timeout.expires = jiffies; ++ ++ init_timer(&netif->tx_queue_timeout); ++ ++ dev->hard_start_xmit = netif_be_start_xmit; ++ dev->get_stats = netif_be_get_stats; ++ dev->open = net_open; ++ dev->stop = net_close; ++ dev->change_mtu = netbk_change_mtu; ++ dev->features = NETIF_F_IP_CSUM; ++ ++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops); ++ ++ dev->tx_queue_len = netbk_queue_length; ++ ++ /* ++ * Initialise a dummy MAC address. We choose the numerically ++ * largest non-broadcast address to prevent the address getting ++ * stolen by an Ethernet bridge for STP purposes. ++ * (FE:FF:FF:FF:FF:FF) ++ */ ++ memset(dev->dev_addr, 0xFF, ETH_ALEN); ++ dev->dev_addr[0] &= ~0x01; ++ ++ rtnl_lock(); ++ err = register_netdevice(dev); ++ rtnl_unlock(); ++ if (err) { ++ DPRINTK("Could not register new net device %s: err=%d\n", ++ dev->name, err); ++ free_netdev(dev); ++ return ERR_PTR(err); ++ } ++ ++ DPRINTK("Successfully created netif\n"); ++ return netif; ++} ++ ++static int map_frontend_pages( ++ netif_t *netif, grant_ref_t tx_ring_ref, grant_ref_t rx_ring_ref) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, tx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Gnttab failure mapping tx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->tx_shmem_ref = tx_ring_ref; ++ netif->tx_shmem_handle = op.handle; ++ ++ gnttab_set_map_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, rx_ring_ref, netif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Gnttab failure mapping rx_ring_ref!\n"); ++ return op.status; ++ } ++ ++ netif->rx_shmem_ref = rx_ring_ref; ++ netif->rx_shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_pages(netif_t *netif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->tx_comms_area->addr, ++ GNTMAP_host_map, netif->tx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++ ++ gnttab_set_unmap_op(&op, (unsigned long)netif->rx_comms_area->addr, ++ GNTMAP_host_map, netif->rx_shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int netif_map(netif_t *netif, unsigned long tx_ring_ref, ++ unsigned long rx_ring_ref, unsigned int evtchn) ++{ ++ int err = -ENOMEM; ++ netif_tx_sring_t *txs; ++ netif_rx_sring_t *rxs; ++ ++ /* Already connected through? 
*/ ++ if (netif->irq) ++ return 0; ++ ++ netif->tx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->tx_comms_area == NULL) ++ return -ENOMEM; ++ netif->rx_comms_area = alloc_vm_area(PAGE_SIZE); ++ if (netif->rx_comms_area == NULL) ++ goto err_rx; ++ ++ err = map_frontend_pages(netif, tx_ring_ref, rx_ring_ref); ++ if (err) ++ goto err_map; ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ netif->domid, evtchn, netif_be_int, 0, ++ netif->dev->name, netif); ++ if (err < 0) ++ goto err_hypervisor; ++ netif->irq = err; ++ disable_irq(netif->irq); ++ ++ txs = (netif_tx_sring_t *)netif->tx_comms_area->addr; ++ BACK_RING_INIT(&netif->tx, txs, PAGE_SIZE); ++ ++ rxs = (netif_rx_sring_t *) ++ ((char *)netif->rx_comms_area->addr); ++ BACK_RING_INIT(&netif->rx, rxs, PAGE_SIZE); ++ ++ netif->rx_req_cons_peek = 0; ++ ++ netif_get(netif); ++ ++ rtnl_lock(); ++ netback_carrier_on(netif); ++ if (netif_running(netif->dev)) ++ __netif_up(netif); ++ rtnl_unlock(); ++ ++ return 0; ++err_hypervisor: ++ unmap_frontend_pages(netif); ++err_map: ++ free_vm_area(netif->rx_comms_area); ++err_rx: ++ free_vm_area(netif->tx_comms_area); ++ return err; ++} ++ ++void netif_disconnect(netif_t *netif) ++{ ++ if (netback_carrier_ok(netif)) { ++ rtnl_lock(); ++ netback_carrier_off(netif); ++ netif_carrier_off(netif->dev); /* discard queued packets */ ++ if (netif_running(netif->dev)) ++ __netif_down(netif); ++ rtnl_unlock(); ++ netif_put(netif); ++ } ++ ++ atomic_dec(&netif->refcnt); ++ wait_event(netif->waiting_to_free, atomic_read(&netif->refcnt) == 0); ++ ++ del_timer_sync(&netif->credit_timeout); ++ del_timer_sync(&netif->tx_queue_timeout); ++ ++ if (netif->irq) ++ unbind_from_irqhandler(netif->irq, netif); ++ ++ unregister_netdev(netif->dev); ++ ++ if (netif->tx.sring) { ++ unmap_frontend_pages(netif); ++ free_vm_area(netif->tx_comms_area); ++ free_vm_area(netif->rx_comms_area); ++ } ++ ++ free_netdev(netif->dev); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/loopback.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,320 @@ ++/****************************************************************************** ++ * netback/loopback.c ++ * ++ * A two-interface loopback device to emulate a local netfront-netback ++ * connection. This ensures that local packet delivery looks identical ++ * to inter-domain delivery. Most importantly, packets delivered locally ++ * originating from other domains will get *copied* when they traverse this ++ * driver. This prevents unbounded delays in socket-buffer queues from ++ * causing the netback driver to "seize up". ++ * ++ * This driver creates a symmetric pair of loopback interfaces with names ++ * vif0.0 and veth0. The intention is that 'vif0.0' is bound to an Ethernet ++ * bridge, just like a proper netback interface, while a local IP interface ++ * is configured on 'veth0'. ++ * ++ * As with a real netback interface, vif0.0 is configured with a suitable ++ * dummy MAC address. No default is provided for veth0: a reasonable strategy ++ * is to transfer eth0's MAC address to veth0, and give eth0 a dummy address ++ * (to avoid confusing the Etherbridge). 
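++ * For example, with the ip(8) tool (illustrative commands only):
++ *     ip link set veth0 address $(cat /sys/class/net/eth0/address)
++ *     ip link set eth0 address <some unused dummy MAC>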
++ * ++ * Copyright (c) 2005 K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/module.h> ++#include <linux/netdevice.h> ++#include <linux/inetdevice.h> ++#include <linux/etherdevice.h> ++#include <linux/skbuff.h> ++#include <linux/ethtool.h> ++#include <net/dst.h> ++#include <net/xfrm.h> /* secpath_reset() */ ++#include <asm/hypervisor.h> /* is_initial_xendomain() */ ++ ++static int nloopbacks = -1; ++module_param(nloopbacks, int, 0); ++MODULE_PARM_DESC(nloopbacks, "Number of netback-loopback devices to create"); ++ ++struct net_private { ++ struct net_device *loopback_dev; ++ struct net_device_stats stats; ++}; ++ ++static int loopback_open(struct net_device *dev) ++{ ++ struct net_private *np = netdev_priv(dev); ++ memset(&np->stats, 0, sizeof(np->stats)); ++ netif_start_queue(dev); ++ return 0; ++} ++ ++static int loopback_close(struct net_device *dev) ++{ ++ netif_stop_queue(dev); ++ return 0; ++} ++ ++#ifdef CONFIG_X86 ++static int is_foreign(unsigned long pfn) ++{ ++ /* NB. Play it safe for auto-translation mode. */ ++ return (xen_feature(XENFEAT_auto_translated_physmap) || ++ (phys_to_machine_mapping[pfn] & FOREIGN_FRAME_BIT)); ++} ++#else ++/* How to detect a foreign mapping? Play it safe. 
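++ * With no cheap architecture-specific test available, every frag is
++ * treated as foreign, so skb_remove_foreign_references() will copy
++ * all fragments unconditionally.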
*/ ++#define is_foreign(pfn) (1) ++#endif ++ ++static int skb_remove_foreign_references(struct sk_buff *skb) ++{ ++ struct page *page; ++ unsigned long pfn; ++ int i, off; ++ char *vaddr; ++ ++ BUG_ON(skb_shinfo(skb)->frag_list); ++ ++ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) { ++ pfn = page_to_pfn(skb_shinfo(skb)->frags[i].page); ++ if (!is_foreign(pfn)) ++ continue; ++ ++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(!page)) ++ return 0; ++ ++ vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]); ++ off = skb_shinfo(skb)->frags[i].page_offset; ++ memcpy(page_address(page) + off, ++ vaddr + off, ++ skb_shinfo(skb)->frags[i].size); ++ kunmap_skb_frag(vaddr); ++ ++ put_page(skb_shinfo(skb)->frags[i].page); ++ skb_shinfo(skb)->frags[i].page = page; ++ } ++ ++ return 1; ++} ++ ++static int loopback_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ struct net_private *np = netdev_priv(dev); ++ ++ if (!skb_remove_foreign_references(skb)) { ++ np->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ return 0; ++ } ++ ++ dst_release(skb->dst); ++ skb->dst = NULL; ++ ++ skb_orphan(skb); ++ ++ np->stats.tx_bytes += skb->len; ++ np->stats.tx_packets++; ++ ++ /* Switch to loopback context. */ ++ dev = np->loopback_dev; ++ np = netdev_priv(dev); ++ ++ np->stats.rx_bytes += skb->len; ++ np->stats.rx_packets++; ++ ++ if (skb->ip_summed == CHECKSUM_HW) { ++ /* Defer checksum calculation. */ ++ skb->proto_csum_blank = 1; ++ /* Must be a local packet: assert its integrity. */ ++ skb->proto_data_valid = 1; ++ } ++ ++ skb->ip_summed = skb->proto_data_valid ? ++ CHECKSUM_UNNECESSARY : CHECKSUM_NONE; ++ ++ skb->pkt_type = PACKET_HOST; /* overridden by eth_type_trans() */ ++ skb->protocol = eth_type_trans(skb, dev); ++ skb->dev = dev; ++ dev->last_rx = jiffies; ++ ++ /* Flush netfilter context: rx'ed skbuffs not expected to have any. */ ++ nf_reset(skb); ++ secpath_reset(skb); ++ ++ netif_rx(skb); ++ ++ return 0; ++} ++ ++static struct net_device_stats *loopback_get_stats(struct net_device *dev) ++{ ++ struct net_private *np = netdev_priv(dev); ++ return &np->stats; ++} ++ ++static struct ethtool_ops network_ethtool_ops = ++{ ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = ethtool_op_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = ethtool_op_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = ethtool_op_set_tso, ++ .get_link = ethtool_op_get_link, ++}; ++ ++/* ++ * Nothing to do here. Virtual interface is point-to-point and the ++ * physical interface is probably promiscuous anyway. ++ */ ++static void loopback_set_multicast_list(struct net_device *dev) ++{ ++} ++ ++static void loopback_construct(struct net_device *dev, struct net_device *lo) ++{ ++ struct net_private *np = netdev_priv(dev); ++ ++ np->loopback_dev = lo; ++ ++ dev->open = loopback_open; ++ dev->stop = loopback_close; ++ dev->hard_start_xmit = loopback_start_xmit; ++ dev->get_stats = loopback_get_stats; ++ dev->set_multicast_list = loopback_set_multicast_list; ++ dev->change_mtu = NULL; /* allow arbitrary mtu */ ++ ++ dev->tx_queue_len = 0; ++ ++ dev->features = (NETIF_F_HIGHDMA | ++ NETIF_F_LLTX | ++ NETIF_F_TSO | ++ NETIF_F_SG | ++ NETIF_F_IP_CSUM); ++ ++ SET_ETHTOOL_OPS(dev, &network_ethtool_ops); ++ ++ /* ++ * We do not set a jumbo MTU on the interface. Otherwise the network ++ * stack will try to send large packets that will get dropped by the ++ * Ethernet bridge (unless the physical Ethernet interface is ++ * configured to transfer jumbo packets). 
If a larger MTU is desired ++ * then the system administrator can specify it using the 'ifconfig' ++ * command. ++ */ ++ /*dev->mtu = 16*1024;*/ ++} ++ ++static int __init make_loopback(int i) ++{ ++ struct net_device *dev1, *dev2; ++ char dev_name[IFNAMSIZ]; ++ int err = -ENOMEM; ++ ++ sprintf(dev_name, "vif0.%d", i); ++ dev1 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup); ++ if (!dev1) ++ return err; ++ ++ sprintf(dev_name, "veth%d", i); ++ dev2 = alloc_netdev(sizeof(struct net_private), dev_name, ether_setup); ++ if (!dev2) ++ goto fail_netdev2; ++ ++ loopback_construct(dev1, dev2); ++ loopback_construct(dev2, dev1); ++ ++ /* ++ * Initialise a dummy MAC address for the 'dummy backend' interface. We ++ * choose the numerically largest non-broadcast address to prevent the ++ * address getting stolen by an Ethernet bridge for STP purposes. ++ */ ++ memset(dev1->dev_addr, 0xFF, ETH_ALEN); ++ dev1->dev_addr[0] &= ~0x01; ++ ++ if ((err = register_netdev(dev1)) != 0) ++ goto fail; ++ ++ if ((err = register_netdev(dev2)) != 0) { ++ unregister_netdev(dev1); ++ goto fail; ++ } ++ ++ return 0; ++ ++ fail: ++ free_netdev(dev2); ++ fail_netdev2: ++ free_netdev(dev1); ++ return err; ++} ++ ++static void __exit clean_loopback(int i) ++{ ++ struct net_device *dev1, *dev2; ++ char dev_name[IFNAMSIZ]; ++ ++ sprintf(dev_name, "vif0.%d", i); ++ dev1 = dev_get_by_name(dev_name); ++ sprintf(dev_name, "veth%d", i); ++ dev2 = dev_get_by_name(dev_name); ++ if (dev1 && dev2) { ++ unregister_netdev(dev2); ++ unregister_netdev(dev1); ++ free_netdev(dev2); ++ free_netdev(dev1); ++ } ++} ++ ++static int __init loopback_init(void) ++{ ++ int i, err = 0; ++ ++ if (nloopbacks == -1) ++ nloopbacks = is_initial_xendomain() ? 4 : 0; ++ ++ for (i = 0; i < nloopbacks; i++) ++ if ((err = make_loopback(i)) != 0) ++ break; ++ ++ return err; ++} ++ ++module_init(loopback_init); ++ ++static void __exit loopback_exit(void) ++{ ++ int i; ++ ++ for (i = nloopbacks; i-- > 0; ) ++ clean_loopback(i); ++} ++ ++module_exit(loopback_exit); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/netback.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,1496 @@ ++/****************************************************************************** ++ * drivers/xen/netback/netback.c ++ * ++ * Back-end of the driver for virtual network devices. This portion of the ++ * driver exports a 'unified' network-device interface that can be accessed ++ * by any operating system that implements a compatible front end. 
A ++ * reference front-end implementation can be found in: ++ * drivers/xen/netfront/netfront.c ++ * ++ * Copyright (c) 2002-2005, K A Fraser ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include "common.h" ++#include <xen/balloon.h> ++#include <xen/interface/memory.h> ++ ++/*define NETBE_DEBUG_INTERRUPT*/ ++ ++/* extra field used in struct page */ ++#define netif_page_index(pg) (*(long *)&(pg)->mapping) ++ ++struct netbk_rx_meta { ++ skb_frag_t frag; ++ int id; ++ int copy:1; ++}; ++ ++static void netif_idx_release(u16 pending_idx); ++static void netif_page_release(struct page *page); ++static void make_tx_response(netif_t *netif, ++ netif_tx_request_t *txp, ++ s8 st); ++static netif_rx_response_t *make_rx_response(netif_t *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags); ++ ++static void net_tx_action(unsigned long unused); ++static DECLARE_TASKLET(net_tx_tasklet, net_tx_action, 0); ++ ++static void net_rx_action(unsigned long unused); ++static DECLARE_TASKLET(net_rx_tasklet, net_rx_action, 0); ++ ++static struct timer_list net_timer; ++ ++#define MAX_PENDING_REQS 256 ++ ++static struct sk_buff_head rx_queue; ++ ++static struct page **mmap_pages; ++static inline unsigned long idx_to_kaddr(unsigned int idx) ++{ ++ return (unsigned long)pfn_to_kaddr(page_to_pfn(mmap_pages[idx])); ++} ++ ++#define PKT_PROT_LEN 64 ++ ++static struct pending_tx_info { ++ netif_tx_request_t req; ++ netif_t *netif; ++} pending_tx_info[MAX_PENDING_REQS]; ++static u16 pending_ring[MAX_PENDING_REQS]; ++typedef unsigned int PEND_RING_IDX; ++#define MASK_PEND_IDX(_i) ((_i)&(MAX_PENDING_REQS-1)) ++static PEND_RING_IDX pending_prod, pending_cons; ++#define NR_PENDING_REQS (MAX_PENDING_REQS - pending_prod + pending_cons) ++ ++/* Freed TX SKBs get batched on this ring before return to pending_ring. 
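++ * All of these rings depend on MAX_PENDING_REQS (256) being a power
++ * of two: producer/consumer indices increase without bound, and
++ * MASK_PEND_IDX() reduces them modulo the ring size, e.g.
++ * MASK_PEND_IDX(257) == 257 & 255 == 1.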
*/ ++static u16 dealloc_ring[MAX_PENDING_REQS]; ++static PEND_RING_IDX dealloc_prod, dealloc_cons; ++ ++static struct sk_buff_head tx_queue; ++ ++static grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; ++static gnttab_unmap_grant_ref_t tx_unmap_ops[MAX_PENDING_REQS]; ++static gnttab_map_grant_ref_t tx_map_ops[MAX_PENDING_REQS]; ++ ++static struct list_head net_schedule_list; ++static spinlock_t net_schedule_list_lock; ++ ++#define MAX_MFN_ALLOC 64 ++static unsigned long mfn_list[MAX_MFN_ALLOC]; ++static unsigned int alloc_index = 0; ++ ++static inline unsigned long alloc_mfn(void) ++{ ++ BUG_ON(alloc_index == 0); ++ return mfn_list[--alloc_index]; ++} ++ ++static int check_mfn(int nr) ++{ ++ struct xen_memory_reservation reservation = { ++ .extent_order = 0, ++ .domid = DOMID_SELF ++ }; ++ ++ if (likely(alloc_index >= nr)) ++ return 0; ++ ++ set_xen_guest_handle(reservation.extent_start, mfn_list + alloc_index); ++ reservation.nr_extents = MAX_MFN_ALLOC - alloc_index; ++ alloc_index += HYPERVISOR_memory_op(XENMEM_increase_reservation, ++ &reservation); ++ ++ return alloc_index >= nr ? 0 : -ENOMEM; ++} ++ ++static inline void maybe_schedule_tx_action(void) ++{ ++ smp_mb(); ++ if ((NR_PENDING_REQS < (MAX_PENDING_REQS/2)) && ++ !list_empty(&net_schedule_list)) ++ tasklet_schedule(&net_tx_tasklet); ++} ++ ++static struct sk_buff *netbk_copy_skb(struct sk_buff *skb) ++{ ++ struct skb_shared_info *ninfo; ++ struct sk_buff *nskb; ++ unsigned long offset; ++ int ret; ++ int len; ++ int headlen; ++ ++ BUG_ON(skb_shinfo(skb)->frag_list != NULL); ++ ++ nskb = alloc_skb(SKB_MAX_HEAD(0), GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(!nskb)) ++ goto err; ++ ++ skb_reserve(nskb, 16 + NET_IP_ALIGN); ++ headlen = nskb->end - nskb->data; ++ if (headlen > skb_headlen(skb)) ++ headlen = skb_headlen(skb); ++ ret = skb_copy_bits(skb, 0, __skb_put(nskb, headlen), headlen); ++ BUG_ON(ret); ++ ++ ninfo = skb_shinfo(nskb); ++ ninfo->gso_size = skb_shinfo(skb)->gso_size; ++ ninfo->gso_type = skb_shinfo(skb)->gso_type; ++ ++ offset = headlen; ++ len = skb->len - headlen; ++ ++ nskb->len = skb->len; ++ nskb->data_len = len; ++ nskb->truesize += len; ++ ++ while (len) { ++ struct page *page; ++ int copy; ++ int zero; ++ ++ if (unlikely(ninfo->nr_frags >= MAX_SKB_FRAGS)) { ++ dump_stack(); ++ goto err_free; ++ } ++ ++ copy = len >= PAGE_SIZE ? PAGE_SIZE : len; ++ zero = len >= PAGE_SIZE ? 
0 : __GFP_ZERO; ++ ++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN | zero); ++ if (unlikely(!page)) ++ goto err_free; ++ ++ ret = skb_copy_bits(skb, offset, page_address(page), copy); ++ BUG_ON(ret); ++ ++ ninfo->frags[ninfo->nr_frags].page = page; ++ ninfo->frags[ninfo->nr_frags].page_offset = 0; ++ ninfo->frags[ninfo->nr_frags].size = copy; ++ ninfo->nr_frags++; ++ ++ offset += copy; ++ len -= copy; ++ } ++ ++ offset = nskb->data - skb->data; ++ ++ nskb->h.raw = skb->h.raw + offset; ++ nskb->nh.raw = skb->nh.raw + offset; ++ nskb->mac.raw = skb->mac.raw + offset; ++ ++ return nskb; ++ ++ err_free: ++ kfree_skb(nskb); ++ err: ++ return NULL; ++} ++ ++static inline int netbk_max_required_rx_slots(netif_t *netif) ++{ ++ if (netif->features & (NETIF_F_SG|NETIF_F_TSO)) ++ return MAX_SKB_FRAGS + 2; /* header + extra_info + frags */ ++ return 1; /* all in one */ ++} ++ ++static inline int netbk_queue_full(netif_t *netif) ++{ ++ RING_IDX peek = netif->rx_req_cons_peek; ++ RING_IDX needed = netbk_max_required_rx_slots(netif); ++ ++ return ((netif->rx.sring->req_prod - peek) < needed) || ++ ((netif->rx.rsp_prod_pvt + NET_RX_RING_SIZE - peek) < needed); ++} ++ ++static void tx_queue_callback(unsigned long data) ++{ ++ netif_t *netif = (netif_t *)data; ++ if (netif_schedulable(netif)) ++ netif_wake_queue(netif->dev); ++} ++ ++int netif_be_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ netif_t *netif = netdev_priv(dev); ++ ++ BUG_ON(skb->dev != dev); ++ ++ /* Drop the packet if the target domain has no receive buffers. */ ++ if (unlikely(!netif_schedulable(netif) || netbk_queue_full(netif))) ++ goto drop; ++ ++ /* ++ * Copy the packet here if it's destined for a flipping interface ++ * but isn't flippable (e.g. extra references to data). ++ * XXX For now we also copy skbuffs whose head crosses a page ++ * boundary, because netbk_gop_skb can't handle them. ++ */ ++ if (!netif->copying_receiver || ++ ((skb_headlen(skb) + offset_in_page(skb->data)) >= PAGE_SIZE)) { ++ struct sk_buff *nskb = netbk_copy_skb(skb); ++ if ( unlikely(nskb == NULL) ) ++ goto drop; ++ /* Copy only the header fields we use in this driver. */ ++ nskb->dev = skb->dev; ++ nskb->ip_summed = skb->ip_summed; ++ nskb->proto_data_valid = skb->proto_data_valid; ++ dev_kfree_skb(skb); ++ skb = nskb; ++ } ++ ++ netif->rx_req_cons_peek += skb_shinfo(skb)->nr_frags + 1 + ++ !!skb_shinfo(skb)->gso_size; ++ netif_get(netif); ++ ++ if (netbk_can_queue(dev) && netbk_queue_full(netif)) { ++ netif->rx.sring->req_event = netif->rx_req_cons_peek + ++ netbk_max_required_rx_slots(netif); ++ mb(); /* request notification /then/ check & stop the queue */ ++ if (netbk_queue_full(netif)) { ++ netif_stop_queue(dev); ++ /* ++ * Schedule 500ms timeout to restart the queue, thus ++ * ensuring that an inactive queue will be drained. ++ * Packets will be immediately be dropped until more ++ * receive buffers become available (see ++ * netbk_queue_full() check above). 
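++ * (The 500ms comes from 'jiffies + HZ/2' below: HZ jiffies always
++ * correspond to one second, whatever the configured tick rate.)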
++ */ ++ netif->tx_queue_timeout.data = (unsigned long)netif; ++ netif->tx_queue_timeout.function = tx_queue_callback; ++ __mod_timer(&netif->tx_queue_timeout, jiffies + HZ/2); ++ } ++ } ++ ++ skb_queue_tail(&rx_queue, skb); ++ tasklet_schedule(&net_rx_tasklet); ++ ++ return 0; ++ ++ drop: ++ netif->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ return 0; ++} ++ ++#if 0 ++static void xen_network_done_notify(void) ++{ ++ static struct net_device *eth0_dev = NULL; ++ if (unlikely(eth0_dev == NULL)) ++ eth0_dev = __dev_get_by_name("eth0"); ++ netif_rx_schedule(eth0_dev); ++} ++/* ++ * Add following to poll() function in NAPI driver (Tigon3 is example): ++ * if ( xen_network_done() ) ++ * tg3_enable_ints(tp); ++ */ ++int xen_network_done(void) ++{ ++ return skb_queue_empty(&rx_queue); ++} ++#endif ++ ++struct netrx_pending_operations { ++ unsigned trans_prod, trans_cons; ++ unsigned mmu_prod, mmu_cons; ++ unsigned mcl_prod, mcl_cons; ++ unsigned copy_prod, copy_cons; ++ unsigned meta_prod, meta_cons; ++ mmu_update_t *mmu; ++ gnttab_transfer_t *trans; ++ gnttab_copy_t *copy; ++ multicall_entry_t *mcl; ++ struct netbk_rx_meta *meta; ++}; ++ ++/* Set up the grant operations for this fragment. If it's a flipping ++ interface, we also set up the unmap request from here. */ ++static u16 netbk_gop_frag(netif_t *netif, struct netbk_rx_meta *meta, ++ int i, struct netrx_pending_operations *npo, ++ struct page *page, unsigned long size, ++ unsigned long offset) ++{ ++ mmu_update_t *mmu; ++ gnttab_transfer_t *gop; ++ gnttab_copy_t *copy_gop; ++ multicall_entry_t *mcl; ++ netif_rx_request_t *req; ++ unsigned long old_mfn, new_mfn; ++ ++ old_mfn = virt_to_mfn(page_address(page)); ++ ++ req = RING_GET_REQUEST(&netif->rx, netif->rx.req_cons + i); ++ if (netif->copying_receiver) { ++ /* The fragment needs to be copied rather than ++ flipped. */ ++ meta->copy = 1; ++ copy_gop = npo->copy + npo->copy_prod++; ++ copy_gop->flags = GNTCOPY_dest_gref; ++ if (PageForeign(page)) { ++ struct pending_tx_info *src_pend = ++ &pending_tx_info[netif_page_index(page)]; ++ copy_gop->source.domid = src_pend->netif->domid; ++ copy_gop->source.u.ref = src_pend->req.gref; ++ copy_gop->flags |= GNTCOPY_source_gref; ++ } else { ++ copy_gop->source.domid = DOMID_SELF; ++ copy_gop->source.u.gmfn = old_mfn; ++ } ++ copy_gop->source.offset = offset; ++ copy_gop->dest.domid = netif->domid; ++ copy_gop->dest.offset = 0; ++ copy_gop->dest.u.ref = req->gref; ++ copy_gop->len = size; ++ } else { ++ meta->copy = 0; ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ new_mfn = alloc_mfn(); ++ ++ /* ++ * Set the new P2M table entry before ++ * reassigning the old data page. Heed the ++ * comment in pgtable-2level.h:pte_page(). 
:-) ++ */ ++ set_phys_to_machine(page_to_pfn(page), new_mfn); ++ ++ mcl = npo->mcl + npo->mcl_prod++; ++ MULTI_update_va_mapping(mcl, ++ (unsigned long)page_address(page), ++ pfn_pte_ma(new_mfn, PAGE_KERNEL), ++ 0); ++ ++ mmu = npo->mmu + npo->mmu_prod++; ++ mmu->ptr = ((maddr_t)new_mfn << PAGE_SHIFT) | ++ MMU_MACHPHYS_UPDATE; ++ mmu->val = page_to_pfn(page); ++ } ++ ++ gop = npo->trans + npo->trans_prod++; ++ gop->mfn = old_mfn; ++ gop->domid = netif->domid; ++ gop->ref = req->gref; ++ } ++ return req->id; ++} ++ ++static void netbk_gop_skb(struct sk_buff *skb, ++ struct netrx_pending_operations *npo) ++{ ++ netif_t *netif = netdev_priv(skb->dev); ++ int nr_frags = skb_shinfo(skb)->nr_frags; ++ int i; ++ int extra; ++ struct netbk_rx_meta *head_meta, *meta; ++ ++ head_meta = npo->meta + npo->meta_prod++; ++ head_meta->frag.page_offset = skb_shinfo(skb)->gso_type; ++ head_meta->frag.size = skb_shinfo(skb)->gso_size; ++ extra = !!head_meta->frag.size + 1; ++ ++ for (i = 0; i < nr_frags; i++) { ++ meta = npo->meta + npo->meta_prod++; ++ meta->frag = skb_shinfo(skb)->frags[i]; ++ meta->id = netbk_gop_frag(netif, meta, i + extra, npo, ++ meta->frag.page, ++ meta->frag.size, ++ meta->frag.page_offset); ++ } ++ ++ /* ++ * This must occur at the end to ensure that we don't trash skb_shinfo ++ * until we're done. We know that the head doesn't cross a page ++ * boundary because such packets get copied in netif_be_start_xmit. ++ */ ++ head_meta->id = netbk_gop_frag(netif, head_meta, 0, npo, ++ virt_to_page(skb->data), ++ skb_headlen(skb), ++ offset_in_page(skb->data)); ++ ++ netif->rx.req_cons += nr_frags + extra; ++} ++ ++static inline void netbk_free_pages(int nr_frags, struct netbk_rx_meta *meta) ++{ ++ int i; ++ ++ for (i = 0; i < nr_frags; i++) ++ put_page(meta[i].frag.page); ++} ++ ++/* This is a twin to netbk_gop_skb. Assume that netbk_gop_skb was ++ used to set up the operations on the top of ++ netrx_pending_operations, which have since been done. Check that ++ they didn't give any errors and advance over them. */ ++static int netbk_check_gop(int nr_frags, domid_t domid, ++ struct netrx_pending_operations *npo) ++{ ++ multicall_entry_t *mcl; ++ gnttab_transfer_t *gop; ++ gnttab_copy_t *copy_op; ++ int status = NETIF_RSP_OKAY; ++ int i; ++ ++ for (i = 0; i <= nr_frags; i++) { ++ if (npo->meta[npo->meta_cons + i].copy) { ++ copy_op = npo->copy + npo->copy_cons++; ++ if (copy_op->status != GNTST_okay) { ++ DPRINTK("Bad status %d from copy to DOM%d.\n", ++ copy_op->status, domid); ++ status = NETIF_RSP_ERROR; ++ } ++ } else { ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ mcl = npo->mcl + npo->mcl_cons++; ++ /* The update_va_mapping() must not fail. */ ++ BUG_ON(mcl->result != 0); ++ } ++ ++ gop = npo->trans + npo->trans_cons++; ++ /* Check the reassignment error code. */ ++ if (gop->status != 0) { ++ DPRINTK("Bad status %d from grant transfer to DOM%u\n", ++ gop->status, domid); ++ /* ++ * Page no longer belongs to us unless ++ * GNTST_bad_page, but that should be ++ * a fatal error anyway. ++ */ ++ BUG_ON(gop->status == GNTST_bad_page); ++ status = NETIF_RSP_ERROR; ++ } ++ } ++ } ++ ++ return status; ++} ++ ++static void netbk_add_frag_responses(netif_t *netif, int status, ++ struct netbk_rx_meta *meta, int nr_frags) ++{ ++ int i; ++ unsigned long offset; ++ ++ for (i = 0; i < nr_frags; i++) { ++ int id = meta[i].id; ++ int flags = (i == nr_frags - 1) ? 
0 : NETRXF_more_data; ++ ++ if (meta[i].copy) ++ offset = 0; ++ else ++ offset = meta[i].frag.page_offset; ++ make_rx_response(netif, id, status, offset, ++ meta[i].frag.size, flags); ++ } ++} ++ ++static void net_rx_action(unsigned long unused) ++{ ++ netif_t *netif = NULL; ++ s8 status; ++ u16 id, irq, flags; ++ netif_rx_response_t *resp; ++ multicall_entry_t *mcl; ++ struct sk_buff_head rxq; ++ struct sk_buff *skb; ++ int notify_nr = 0; ++ int ret; ++ int nr_frags; ++ int count; ++ unsigned long offset; ++ ++ /* ++ * Putting hundreds of bytes on the stack is considered rude. ++ * Static works because a tasklet can only be on one CPU at any time. ++ */ ++ static multicall_entry_t rx_mcl[NET_RX_RING_SIZE+3]; ++ static mmu_update_t rx_mmu[NET_RX_RING_SIZE]; ++ static gnttab_transfer_t grant_trans_op[NET_RX_RING_SIZE]; ++ static gnttab_copy_t grant_copy_op[NET_RX_RING_SIZE]; ++ static unsigned char rx_notify[NR_IRQS]; ++ static u16 notify_list[NET_RX_RING_SIZE]; ++ static struct netbk_rx_meta meta[NET_RX_RING_SIZE]; ++ ++ struct netrx_pending_operations npo = { ++ mmu: rx_mmu, ++ trans: grant_trans_op, ++ copy: grant_copy_op, ++ mcl: rx_mcl, ++ meta: meta}; ++ ++ skb_queue_head_init(&rxq); ++ ++ count = 0; ++ ++ while ((skb = skb_dequeue(&rx_queue)) != NULL) { ++ nr_frags = skb_shinfo(skb)->nr_frags; ++ *(int *)skb->cb = nr_frags; ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap) && ++ !((netif_t *)netdev_priv(skb->dev))->copying_receiver && ++ check_mfn(nr_frags + 1)) { ++ /* Memory squeeze? Back off for an arbitrary while. */ ++ if ( net_ratelimit() ) ++ WPRINTK("Memory squeeze in netback " ++ "driver.\n"); ++ mod_timer(&net_timer, jiffies + HZ); ++ skb_queue_head(&rx_queue, skb); ++ break; ++ } ++ ++ netbk_gop_skb(skb, &npo); ++ ++ count += nr_frags + 1; ++ ++ __skb_queue_tail(&rxq, skb); ++ ++ /* Filled the batch queue? */ ++ if (count + MAX_SKB_FRAGS >= NET_RX_RING_SIZE) ++ break; ++ } ++ ++ if (npo.mcl_prod && ++ !xen_feature(XENFEAT_auto_translated_physmap)) { ++ mcl = npo.mcl + npo.mcl_prod++; ++ ++ BUG_ON(mcl[-1].op != __HYPERVISOR_update_va_mapping); ++ mcl[-1].args[MULTI_UVMFLAGS_INDEX] = UVMF_TLB_FLUSH|UVMF_ALL; ++ ++ mcl->op = __HYPERVISOR_mmu_update; ++ mcl->args[0] = (unsigned long)rx_mmu; ++ mcl->args[1] = npo.mmu_prod; ++ mcl->args[2] = 0; ++ mcl->args[3] = DOMID_SELF; ++ } ++ ++ if (npo.trans_prod) { ++ mcl = npo.mcl + npo.mcl_prod++; ++ mcl->op = __HYPERVISOR_grant_table_op; ++ mcl->args[0] = GNTTABOP_transfer; ++ mcl->args[1] = (unsigned long)grant_trans_op; ++ mcl->args[2] = npo.trans_prod; ++ } ++ ++ if (npo.copy_prod) { ++ mcl = npo.mcl + npo.mcl_prod++; ++ mcl->op = __HYPERVISOR_grant_table_op; ++ mcl->args[0] = GNTTABOP_copy; ++ mcl->args[1] = (unsigned long)grant_copy_op; ++ mcl->args[2] = npo.copy_prod; ++ } ++ ++ /* Nothing to do? */ ++ if (!npo.mcl_prod) ++ return; ++ ++ BUG_ON(npo.copy_prod > NET_RX_RING_SIZE); ++ BUG_ON(npo.mmu_prod > NET_RX_RING_SIZE); ++ BUG_ON(npo.trans_prod > NET_RX_RING_SIZE); ++ BUG_ON(npo.mcl_prod > NET_RX_RING_SIZE+3); ++ BUG_ON(npo.meta_prod > NET_RX_RING_SIZE); ++ ++ ret = HYPERVISOR_multicall(npo.mcl, npo.mcl_prod); ++ BUG_ON(ret != 0); ++ ++ while ((skb = __skb_dequeue(&rxq)) != NULL) { ++ nr_frags = *(int *)skb->cb; ++ ++ netif = netdev_priv(skb->dev); ++ /* We can't rely on skb_release_data to release the ++ pages used by fragments for us, since it tries to ++ touch the pages in the fraglist. If we're in ++ flipping mode, that doesn't work. 
In copying mode, ++ we still have access to all of the pages, and so ++ it's safe to let release_data deal with it. */ ++ /* (Freeing the fragments is safe since we copy ++ non-linear skbs destined for flipping interfaces) */ ++ if (!netif->copying_receiver) { ++ atomic_set(&(skb_shinfo(skb)->dataref), 1); ++ skb_shinfo(skb)->frag_list = NULL; ++ skb_shinfo(skb)->nr_frags = 0; ++ netbk_free_pages(nr_frags, meta + npo.meta_cons + 1); ++ } ++ ++ netif->stats.tx_bytes += skb->len; ++ netif->stats.tx_packets++; ++ ++ status = netbk_check_gop(nr_frags, netif->domid, &npo); ++ ++ id = meta[npo.meta_cons].id; ++ flags = nr_frags ? NETRXF_more_data : 0; ++ ++ if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ ++ flags |= NETRXF_csum_blank | NETRXF_data_validated; ++ else if (skb->proto_data_valid) /* remote but checksummed? */ ++ flags |= NETRXF_data_validated; ++ ++ if (meta[npo.meta_cons].copy) ++ offset = 0; ++ else ++ offset = offset_in_page(skb->data); ++ resp = make_rx_response(netif, id, status, offset, ++ skb_headlen(skb), flags); ++ ++ if (meta[npo.meta_cons].frag.size) { ++ struct netif_extra_info *gso = ++ (struct netif_extra_info *) ++ RING_GET_RESPONSE(&netif->rx, ++ netif->rx.rsp_prod_pvt++); ++ ++ resp->flags |= NETRXF_extra_info; ++ ++ gso->u.gso.size = meta[npo.meta_cons].frag.size; ++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ++ gso->u.gso.pad = 0; ++ gso->u.gso.features = 0; ++ ++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO; ++ gso->flags = 0; ++ } ++ ++ netbk_add_frag_responses(netif, status, ++ meta + npo.meta_cons + 1, ++ nr_frags); ++ ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->rx, ret); ++ irq = netif->irq; ++ if (ret && !rx_notify[irq]) { ++ rx_notify[irq] = 1; ++ notify_list[notify_nr++] = irq; ++ } ++ ++ if (netif_queue_stopped(netif->dev) && ++ netif_schedulable(netif) && ++ !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ netif_put(netif); ++ dev_kfree_skb(skb); ++ npo.meta_cons += nr_frags + 1; ++ } ++ ++ while (notify_nr != 0) { ++ irq = notify_list[--notify_nr]; ++ rx_notify[irq] = 0; ++ notify_remote_via_irq(irq); ++ } ++ ++ /* More work to do? 
++	 */
++	if (!skb_queue_empty(&rx_queue) && !timer_pending(&net_timer))
++		tasklet_schedule(&net_rx_tasklet);
++#if 0
++	else
++		xen_network_done_notify();
++#endif
++}
++
++static void net_alarm(unsigned long unused)
++{
++	tasklet_schedule(&net_rx_tasklet);
++}
++
++struct net_device_stats *netif_be_get_stats(struct net_device *dev)
++{
++	netif_t *netif = netdev_priv(dev);
++	return &netif->stats;
++}
++
++static int __on_net_schedule_list(netif_t *netif)
++{
++	return netif->list.next != NULL;
++}
++
++static void remove_from_net_schedule_list(netif_t *netif)
++{
++	spin_lock_irq(&net_schedule_list_lock);
++	if (likely(__on_net_schedule_list(netif))) {
++		list_del(&netif->list);
++		netif->list.next = NULL;
++		netif_put(netif);
++	}
++	spin_unlock_irq(&net_schedule_list_lock);
++}
++
++static void add_to_net_schedule_list_tail(netif_t *netif)
++{
++	if (__on_net_schedule_list(netif))
++		return;
++
++	spin_lock_irq(&net_schedule_list_lock);
++	if (!__on_net_schedule_list(netif) &&
++	    likely(netif_schedulable(netif))) {
++		list_add_tail(&netif->list, &net_schedule_list);
++		netif_get(netif);
++	}
++	spin_unlock_irq(&net_schedule_list_lock);
++}
++
++/*
++ * Note on CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER:
++ * If this driver is pipelining transmit requests then we can be very
++ * aggressive in avoiding new-packet notifications -- frontend only needs to
++ * send a notification if there are no outstanding unreceived responses.
++ * If we may be buffering transmit requests for any reason then we must be
++ * rather more conservative and treat this as the final check for pending
++ * work.
++ */
++void netif_schedule_work(netif_t *netif)
++{
++	int more_to_do;
++
++#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER
++	more_to_do = RING_HAS_UNCONSUMED_REQUESTS(&netif->tx);
++#else
++	RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do);
++#endif
++
++	if (more_to_do) {
++		add_to_net_schedule_list_tail(netif);
++		maybe_schedule_tx_action();
++	}
++}
++
++void netif_deschedule_work(netif_t *netif)
++{
++	remove_from_net_schedule_list(netif);
++}
++
++
++static void tx_add_credit(netif_t *netif)
++{
++	unsigned long max_burst, max_credit;
++
++	/*
++	 * Allow a burst big enough to transmit a jumbo packet of up to 128kB.
++	 * Otherwise the interface can seize up due to insufficient credit.
++	 */
++	max_burst = RING_GET_REQUEST(&netif->tx, netif->tx.req_cons)->size;
++	max_burst = min(max_burst, 131072UL);
++	max_burst = max(max_burst, netif->credit_bytes);
++
++	/* Take care that adding a new chunk of credit doesn't wrap to zero. */
++	max_credit = netif->remaining_credit + netif->credit_bytes;
++	if (max_credit < netif->remaining_credit)
++		max_credit = ULONG_MAX; /* wrapped: clamp to ULONG_MAX */
++
++	netif->remaining_credit = min(max_credit, max_burst);
++}
++
++static void tx_credit_callback(unsigned long data)
++{
++	netif_t *netif = (netif_t *)data;
++	tx_add_credit(netif);
++	netif_schedule_work(netif);
++}
++
++static inline void net_tx_action_dealloc(void)
++{
++	gnttab_unmap_grant_ref_t *gop;
++	u16 pending_idx;
++	PEND_RING_IDX dc, dp;
++	netif_t *netif;
++	int ret;
++
++	dc = dealloc_cons;
++	dp = dealloc_prod;
++
++	/* Ensure we see all indexes enqueued by netif_idx_release().
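++	 * (A sketch of the pairing, using the names in this file:
++	 *  netif_idx_release() does
++	 *      dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx;
++	 *      smp_wmb();
++	 *      dealloc_prod++;
++	 *  so sampling dealloc_prod into 'dp' above and issuing smp_rmb()
++	 *  here guarantees every slot up to 'dp' is visible below.)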
*/ ++ smp_rmb(); ++ ++ /* ++ * Free up any grants we have finished using ++ */ ++ gop = tx_unmap_ops; ++ while (dc != dp) { ++ pending_idx = dealloc_ring[MASK_PEND_IDX(dc++)]; ++ gnttab_set_unmap_op(gop, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map, ++ grant_tx_handle[pending_idx]); ++ gop++; ++ } ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_unmap_grant_ref, tx_unmap_ops, gop - tx_unmap_ops); ++ BUG_ON(ret); ++ ++ while (dealloc_cons != dp) { ++ pending_idx = dealloc_ring[MASK_PEND_IDX(dealloc_cons++)]; ++ ++ netif = pending_tx_info[pending_idx].netif; ++ ++ make_tx_response(netif, &pending_tx_info[pending_idx].req, ++ NETIF_RSP_OKAY); ++ ++ pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx; ++ ++ netif_put(netif); ++ } ++} ++ ++static void netbk_tx_err(netif_t *netif, netif_tx_request_t *txp, RING_IDX end) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ ++ do { ++ make_tx_response(netif, txp, NETIF_RSP_ERROR); ++ if (cons >= end) ++ break; ++ txp = RING_GET_REQUEST(&netif->tx, cons++); ++ } while (1); ++ netif->tx.req_cons = cons; ++ netif_schedule_work(netif); ++ netif_put(netif); ++} ++ ++static int netbk_count_requests(netif_t *netif, netif_tx_request_t *first, ++ netif_tx_request_t *txp, int work_to_do) ++{ ++ RING_IDX cons = netif->tx.req_cons; ++ int frags = 0; ++ ++ if (!(first->flags & NETTXF_more_data)) ++ return 0; ++ ++ do { ++ if (frags >= work_to_do) { ++ DPRINTK("Need more frags\n"); ++ return -frags; ++ } ++ ++ if (unlikely(frags >= MAX_SKB_FRAGS)) { ++ DPRINTK("Too many frags\n"); ++ return -frags; ++ } ++ ++ memcpy(txp, RING_GET_REQUEST(&netif->tx, cons + frags), ++ sizeof(*txp)); ++ if (txp->size > first->size) { ++ DPRINTK("Frags galore\n"); ++ return -frags; ++ } ++ ++ first->size -= txp->size; ++ frags++; ++ ++ if (unlikely((txp->offset + txp->size) > PAGE_SIZE)) { ++ DPRINTK("txp->offset: %x, size: %u\n", ++ txp->offset, txp->size); ++ return -frags; ++ } ++ } while ((txp++)->flags & NETTXF_more_data); ++ ++ return frags; ++} ++ ++static gnttab_map_grant_ref_t *netbk_get_requests(netif_t *netif, ++ struct sk_buff *skb, ++ netif_tx_request_t *txp, ++ gnttab_map_grant_ref_t *mop) ++{ ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ skb_frag_t *frags = shinfo->frags; ++ unsigned long pending_idx = *((u16 *)skb->data); ++ int i, start; ++ ++ /* Skip first skb fragment if it is on same page as header fragment. */ ++ start = ((unsigned long)shinfo->frags[0].page == pending_idx); ++ ++ for (i = start; i < shinfo->nr_frags; i++, txp++) { ++ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons++)]; ++ ++ gnttab_set_map_op(mop++, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txp->gref, netif->domid); ++ ++ memcpy(&pending_tx_info[pending_idx].req, txp, sizeof(*txp)); ++ netif_get(netif); ++ pending_tx_info[pending_idx].netif = netif; ++ frags[i].page = (void *)pending_idx; ++ } ++ ++ return mop; ++} ++ ++static int netbk_tx_check_mop(struct sk_buff *skb, ++ gnttab_map_grant_ref_t **mopp) ++{ ++ gnttab_map_grant_ref_t *mop = *mopp; ++ int pending_idx = *((u16 *)skb->data); ++ netif_t *netif = pending_tx_info[pending_idx].netif; ++ netif_tx_request_t *txp; ++ struct skb_shared_info *shinfo = skb_shinfo(skb); ++ int nr_frags = shinfo->nr_frags; ++ int i, err, start; ++ ++ /* Check status of header. 
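++	 * (mop walks tx_map_ops in the order net_tx_action queued them:
++	 *  one op for the header page, then one per remaining fragment
++	 *  from netbk_get_requests(), so plain increments stay in step
++	 *  with the fragments examined below.)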
++	 */
++	err = mop->status;
++	if (unlikely(err)) {
++		txp = &pending_tx_info[pending_idx].req;
++		make_tx_response(netif, txp, NETIF_RSP_ERROR);
++		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
++		netif_put(netif);
++	} else {
++		set_phys_to_machine(
++			__pa(idx_to_kaddr(pending_idx)) >> PAGE_SHIFT,
++			FOREIGN_FRAME(mop->dev_bus_addr >> PAGE_SHIFT));
++		grant_tx_handle[pending_idx] = mop->handle;
++	}
++
++	/* Skip first skb fragment if it is on same page as header fragment. */
++	start = ((unsigned long)shinfo->frags[0].page == pending_idx);
++
++	for (i = start; i < nr_frags; i++) {
++		int j, newerr;
++
++		pending_idx = (unsigned long)shinfo->frags[i].page;
++
++		/* Check error status: if okay then remember grant handle. */
++		newerr = (++mop)->status;
++		if (likely(!newerr)) {
++			set_phys_to_machine(
++				__pa(idx_to_kaddr(pending_idx))>>PAGE_SHIFT,
++				FOREIGN_FRAME(mop->dev_bus_addr>>PAGE_SHIFT));
++			grant_tx_handle[pending_idx] = mop->handle;
++			/* Had a previous error? Invalidate this fragment. */
++			if (unlikely(err))
++				netif_idx_release(pending_idx);
++			continue;
++		}
++
++		/* Error on this fragment: respond to client with an error. */
++		txp = &pending_tx_info[pending_idx].req;
++		make_tx_response(netif, txp, NETIF_RSP_ERROR);
++		pending_ring[MASK_PEND_IDX(pending_prod++)] = pending_idx;
++		netif_put(netif);
++
++		/* Not the first error? Preceding frags already invalidated. */
++		if (err)
++			continue;
++
++		/* First error: invalidate header and preceding fragments. */
++		pending_idx = *((u16 *)skb->data);
++		netif_idx_release(pending_idx);
++		for (j = start; j < i; j++) {
++			pending_idx = (unsigned long)shinfo->frags[j].page;
++			netif_idx_release(pending_idx);
++		}
++
++		/* Remember the error: invalidate all subsequent fragments. */
++		err = newerr;
++	}
++
++	*mopp = mop + 1;
++	return err;
++}
++
++static void netbk_fill_frags(struct sk_buff *skb)
++{
++	struct skb_shared_info *shinfo = skb_shinfo(skb);
++	int nr_frags = shinfo->nr_frags;
++	int i;
++
++	for (i = 0; i < nr_frags; i++) {
++		skb_frag_t *frag = shinfo->frags + i;
++		netif_tx_request_t *txp;
++		unsigned long pending_idx;
++
++		pending_idx = (unsigned long)frag->page;
++		txp = &pending_tx_info[pending_idx].req;
++		frag->page = virt_to_page(idx_to_kaddr(pending_idx));
++		frag->size = txp->size;
++		frag->page_offset = txp->offset;
++
++		skb->len += txp->size;
++		skb->data_len += txp->size;
++		skb->truesize += txp->size;
++	}
++}
++
++int netbk_get_extras(netif_t *netif, struct netif_extra_info *extras,
++		     int work_to_do)
++{
++	struct netif_extra_info extra;
++	RING_IDX cons = netif->tx.req_cons;
++
++	do {
++		if (unlikely(work_to_do-- <= 0)) {
++			DPRINTK("Missing extra info\n");
++			return -EBADR;
++		}
++
++		memcpy(&extra, RING_GET_REQUEST(&netif->tx, cons),
++		       sizeof(extra));
++		if (unlikely(!extra.type ||
++			     extra.type >= XEN_NETIF_EXTRA_TYPE_MAX)) {
++			netif->tx.req_cons = ++cons;
++			DPRINTK("Invalid extra type: %d\n", extra.type);
++			return -EINVAL;
++		}
++
++		memcpy(&extras[extra.type - 1], &extra, sizeof(extra));
++		netif->tx.req_cons = ++cons;
++	} while (extra.flags & XEN_NETIF_EXTRA_FLAG_MORE);
++
++	return work_to_do;
++}
++
++static int netbk_set_skb_gso(struct sk_buff *skb, struct netif_extra_info *gso)
++{
++	if (!gso->u.gso.size) {
++		DPRINTK("GSO size must not be zero.\n");
++		return -EINVAL;
++	}
++
++	/* Currently only TCPv4 segmentation offload is supported.
*/ ++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) { ++ DPRINTK("Bad GSO type %d.\n", gso->u.gso.type); ++ return -EINVAL; ++ } ++ ++ skb_shinfo(skb)->gso_size = gso->u.gso.size; ++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4; ++ ++ /* Header must be checked, and gso_segs computed. */ ++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY; ++ skb_shinfo(skb)->gso_segs = 0; ++ ++ return 0; ++} ++ ++/* Called after netfront has transmitted */ ++static void net_tx_action(unsigned long unused) ++{ ++ struct list_head *ent; ++ struct sk_buff *skb; ++ netif_t *netif; ++ netif_tx_request_t txreq; ++ netif_tx_request_t txfrags[MAX_SKB_FRAGS]; ++ struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1]; ++ u16 pending_idx; ++ RING_IDX i; ++ gnttab_map_grant_ref_t *mop; ++ unsigned int data_len; ++ int ret, work_to_do; ++ ++ if (dealloc_cons != dealloc_prod) ++ net_tx_action_dealloc(); ++ ++ mop = tx_map_ops; ++ while (((NR_PENDING_REQS + MAX_SKB_FRAGS) < MAX_PENDING_REQS) && ++ !list_empty(&net_schedule_list)) { ++ /* Get a netif from the list with work to do. */ ++ ent = net_schedule_list.next; ++ netif = list_entry(ent, netif_t, list); ++ netif_get(netif); ++ remove_from_net_schedule_list(netif); ++ ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, work_to_do); ++ if (!work_to_do) { ++ netif_put(netif); ++ continue; ++ } ++ ++ i = netif->tx.req_cons; ++ rmb(); /* Ensure that we see the request before we copy it. */ ++ memcpy(&txreq, RING_GET_REQUEST(&netif->tx, i), sizeof(txreq)); ++ ++ /* Credit-based scheduling. */ ++ if (txreq.size > netif->remaining_credit) { ++ unsigned long now = jiffies; ++ unsigned long next_credit = ++ netif->credit_timeout.expires + ++ msecs_to_jiffies(netif->credit_usec / 1000); ++ ++ /* Timer could already be pending in rare cases. */ ++ if (timer_pending(&netif->credit_timeout)) { ++ netif_put(netif); ++ continue; ++ } ++ ++ /* Passed the point where we can replenish credit? */ ++ if (time_after_eq(now, next_credit)) { ++ netif->credit_timeout.expires = now; ++ tx_add_credit(netif); ++ } ++ ++ /* Still too big to send right now? Set a callback. */ ++ if (txreq.size > netif->remaining_credit) { ++ netif->credit_timeout.data = ++ (unsigned long)netif; ++ netif->credit_timeout.function = ++ tx_credit_callback; ++ __mod_timer(&netif->credit_timeout, ++ next_credit); ++ netif_put(netif); ++ continue; ++ } ++ } ++ netif->remaining_credit -= txreq.size; ++ ++ work_to_do--; ++ netif->tx.req_cons = ++i; ++ ++ memset(extras, 0, sizeof(extras)); ++ if (txreq.flags & NETTXF_extra_info) { ++ work_to_do = netbk_get_extras(netif, extras, ++ work_to_do); ++ i = netif->tx.req_cons; ++ if (unlikely(work_to_do < 0)) { ++ netbk_tx_err(netif, &txreq, i); ++ continue; ++ } ++ } ++ ++ ret = netbk_count_requests(netif, &txreq, txfrags, work_to_do); ++ if (unlikely(ret < 0)) { ++ netbk_tx_err(netif, &txreq, i - ret); ++ continue; ++ } ++ i += ret; ++ ++ if (unlikely(txreq.size < ETH_HLEN)) { ++ DPRINTK("Bad packet size: %d\n", txreq.size); ++ netbk_tx_err(netif, &txreq, i); ++ continue; ++ } ++ ++ /* No crossing a page as the payload mustn't fragment. */ ++ if (unlikely((txreq.offset + txreq.size) > PAGE_SIZE)) { ++ DPRINTK("txreq.offset: %x, size: %u, end: %lu\n", ++ txreq.offset, txreq.size, ++ (txreq.offset &~PAGE_MASK) + txreq.size); ++ netbk_tx_err(netif, &txreq, i); ++ continue; ++ } ++ ++ pending_idx = pending_ring[MASK_PEND_IDX(pending_cons)]; ++ ++ data_len = (txreq.size > PKT_PROT_LEN && ++ ret < MAX_SKB_FRAGS) ? 
++ PKT_PROT_LEN : txreq.size; ++ ++ skb = alloc_skb(data_len + 16 + NET_IP_ALIGN, ++ GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(skb == NULL)) { ++ DPRINTK("Can't allocate a skb in start_xmit.\n"); ++ netbk_tx_err(netif, &txreq, i); ++ break; ++ } ++ ++ /* Packets passed to netif_rx() must have some headroom. */ ++ skb_reserve(skb, 16 + NET_IP_ALIGN); ++ ++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) { ++ struct netif_extra_info *gso; ++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1]; ++ ++ if (netbk_set_skb_gso(skb, gso)) { ++ kfree_skb(skb); ++ netbk_tx_err(netif, &txreq, i); ++ continue; ++ } ++ } ++ ++ gnttab_set_map_op(mop, idx_to_kaddr(pending_idx), ++ GNTMAP_host_map | GNTMAP_readonly, ++ txreq.gref, netif->domid); ++ mop++; ++ ++ memcpy(&pending_tx_info[pending_idx].req, ++ &txreq, sizeof(txreq)); ++ pending_tx_info[pending_idx].netif = netif; ++ *((u16 *)skb->data) = pending_idx; ++ ++ __skb_put(skb, data_len); ++ ++ skb_shinfo(skb)->nr_frags = ret; ++ if (data_len < txreq.size) { ++ skb_shinfo(skb)->nr_frags++; ++ skb_shinfo(skb)->frags[0].page = ++ (void *)(unsigned long)pending_idx; ++ } else { ++ /* Discriminate from any valid pending_idx value. */ ++ skb_shinfo(skb)->frags[0].page = (void *)~0UL; ++ } ++ ++ __skb_queue_tail(&tx_queue, skb); ++ ++ pending_cons++; ++ ++ mop = netbk_get_requests(netif, skb, txfrags, mop); ++ ++ netif->tx.req_cons = i; ++ netif_schedule_work(netif); ++ ++ if ((mop - tx_map_ops) >= ARRAY_SIZE(tx_map_ops)) ++ break; ++ } ++ ++ if (mop == tx_map_ops) ++ return; ++ ++ ret = HYPERVISOR_grant_table_op( ++ GNTTABOP_map_grant_ref, tx_map_ops, mop - tx_map_ops); ++ BUG_ON(ret); ++ ++ mop = tx_map_ops; ++ while ((skb = __skb_dequeue(&tx_queue)) != NULL) { ++ netif_tx_request_t *txp; ++ ++ pending_idx = *((u16 *)skb->data); ++ netif = pending_tx_info[pending_idx].netif; ++ txp = &pending_tx_info[pending_idx].req; ++ ++ /* Check the remap error code. */ ++ if (unlikely(netbk_tx_check_mop(skb, &mop))) { ++ DPRINTK("netback grant failed.\n"); ++ skb_shinfo(skb)->nr_frags = 0; ++ kfree_skb(skb); ++ continue; ++ } ++ ++ data_len = skb->len; ++ memcpy(skb->data, ++ (void *)(idx_to_kaddr(pending_idx)|txp->offset), ++ data_len); ++ if (data_len < txp->size) { ++ /* Append the packet payload as a fragment. */ ++ txp->offset += data_len; ++ txp->size -= data_len; ++ } else { ++ /* Schedule a response immediately. */ ++ netif_idx_release(pending_idx); ++ } ++ ++ /* ++ * Old frontends do not assert data_validated but we ++ * can infer it from csum_blank so test both flags. ++ */ ++ if (txp->flags & (NETTXF_data_validated|NETTXF_csum_blank)) { ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ skb->proto_data_valid = 1; ++ } else { ++ skb->ip_summed = CHECKSUM_NONE; ++ skb->proto_data_valid = 0; ++ } ++ skb->proto_csum_blank = !!(txp->flags & NETTXF_csum_blank); ++ ++ netbk_fill_frags(skb); ++ ++ skb->dev = netif->dev; ++ skb->protocol = eth_type_trans(skb, skb->dev); ++ ++ netif->stats.rx_bytes += skb->len; ++ netif->stats.rx_packets++; ++ ++ netif_rx(skb); ++ netif->dev->last_rx = jiffies; ++ } ++} ++ ++static void netif_idx_release(u16 pending_idx) ++{ ++ static DEFINE_SPINLOCK(_lock); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&_lock, flags); ++ dealloc_ring[MASK_PEND_IDX(dealloc_prod)] = pending_idx; ++ /* Sync with net_tx_action_dealloc: insert idx /then/ incr producer. 
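++	 * (This is the producer half of the pairing sketched in
++	 *  net_tx_action_dealloc(): publish the ring slot, barrier, then
++	 *  advance dealloc_prod, so the consumer never reads a stale slot.)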
*/ ++ smp_wmb(); ++ dealloc_prod++; ++ spin_unlock_irqrestore(&_lock, flags); ++ ++ tasklet_schedule(&net_tx_tasklet); ++} ++ ++static void netif_page_release(struct page *page) ++{ ++ /* Ready for next use. */ ++ init_page_count(page); ++ ++ netif_idx_release(netif_page_index(page)); ++} ++ ++irqreturn_t netif_be_int(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ netif_t *netif = dev_id; ++ ++ add_to_net_schedule_list_tail(netif); ++ maybe_schedule_tx_action(); ++ ++ if (netif_schedulable(netif) && !netbk_queue_full(netif)) ++ netif_wake_queue(netif->dev); ++ ++ return IRQ_HANDLED; ++} ++ ++static void make_tx_response(netif_t *netif, ++ netif_tx_request_t *txp, ++ s8 st) ++{ ++ RING_IDX i = netif->tx.rsp_prod_pvt; ++ netif_tx_response_t *resp; ++ int notify; ++ ++ resp = RING_GET_RESPONSE(&netif->tx, i); ++ resp->id = txp->id; ++ resp->status = st; ++ ++ if (txp->flags & NETTXF_extra_info) ++ RING_GET_RESPONSE(&netif->tx, ++i)->status = NETIF_RSP_NULL; ++ ++ netif->tx.rsp_prod_pvt = ++i; ++ RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&netif->tx, notify); ++ if (notify) ++ notify_remote_via_irq(netif->irq); ++ ++#ifdef CONFIG_XEN_NETDEV_PIPELINED_TRANSMITTER ++ if (i == netif->tx.req_cons) { ++ int more_to_do; ++ RING_FINAL_CHECK_FOR_REQUESTS(&netif->tx, more_to_do); ++ if (more_to_do) ++ add_to_net_schedule_list_tail(netif); ++ } ++#endif ++} ++ ++static netif_rx_response_t *make_rx_response(netif_t *netif, ++ u16 id, ++ s8 st, ++ u16 offset, ++ u16 size, ++ u16 flags) ++{ ++ RING_IDX i = netif->rx.rsp_prod_pvt; ++ netif_rx_response_t *resp; ++ ++ resp = RING_GET_RESPONSE(&netif->rx, i); ++ resp->offset = offset; ++ resp->flags = flags; ++ resp->id = id; ++ resp->status = (s16)size; ++ if (st < 0) ++ resp->status = (s16)st; ++ ++ netif->rx.rsp_prod_pvt = ++i; ++ ++ return resp; ++} ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++static irqreturn_t netif_be_dbg(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct list_head *ent; ++ netif_t *netif; ++ int i = 0; ++ ++ printk(KERN_ALERT "netif_schedule_list:\n"); ++ spin_lock_irq(&net_schedule_list_lock); ++ ++ list_for_each (ent, &net_schedule_list) { ++ netif = list_entry(ent, netif_t, list); ++ printk(KERN_ALERT " %d: private(rx_req_cons=%08x " ++ "rx_resp_prod=%08x\n", ++ i, netif->rx.req_cons, netif->rx.rsp_prod_pvt); ++ printk(KERN_ALERT " tx_req_cons=%08x tx_resp_prod=%08x)\n", ++ netif->tx.req_cons, netif->tx.rsp_prod_pvt); ++ printk(KERN_ALERT " shared(rx_req_prod=%08x " ++ "rx_resp_prod=%08x\n", ++ netif->rx.sring->req_prod, netif->rx.sring->rsp_prod); ++ printk(KERN_ALERT " rx_event=%08x tx_req_prod=%08x\n", ++ netif->rx.sring->rsp_event, netif->tx.sring->req_prod); ++ printk(KERN_ALERT " tx_resp_prod=%08x, tx_event=%08x)\n", ++ netif->tx.sring->rsp_prod, netif->tx.sring->rsp_event); ++ i++; ++ } ++ ++ spin_unlock_irq(&net_schedule_list_lock); ++ printk(KERN_ALERT " ** End of netif_schedule_list **\n"); ++ ++ return IRQ_HANDLED; ++} ++#endif ++ ++static int __init netback_init(void) ++{ ++ int i; ++ struct page *page; ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ /* We can increase reservation by this much in net_rx_action(). 
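++	 * (The flipping receive path gives local pages away with
++	 *  GNTTABOP_transfer and replaces them via alloc_mfn(), so the
++	 *  balloon driver must let this driver grow its reservation by up
++	 *  to one rx ring's worth of pages at a time.)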
*/ ++ balloon_update_driver_allowance(NET_RX_RING_SIZE); ++ ++ skb_queue_head_init(&rx_queue); ++ skb_queue_head_init(&tx_queue); ++ ++ init_timer(&net_timer); ++ net_timer.data = 0; ++ net_timer.function = net_alarm; ++ ++ mmap_pages = alloc_empty_pages_and_pagevec(MAX_PENDING_REQS); ++ if (mmap_pages == NULL) { ++ printk("%s: out of memory\n", __FUNCTION__); ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < MAX_PENDING_REQS; i++) { ++ page = mmap_pages[i]; ++ SetPageForeign(page, netif_page_release); ++ netif_page_index(page) = i; ++ } ++ ++ pending_cons = 0; ++ pending_prod = MAX_PENDING_REQS; ++ for (i = 0; i < MAX_PENDING_REQS; i++) ++ pending_ring[i] = i; ++ ++ spin_lock_init(&net_schedule_list_lock); ++ INIT_LIST_HEAD(&net_schedule_list); ++ ++ netif_xenbus_init(); ++ ++#ifdef NETBE_DEBUG_INTERRUPT ++ (void)bind_virq_to_irqhandler(VIRQ_DEBUG, ++ 0, ++ netif_be_dbg, ++ SA_SHIRQ, ++ "net-be-dbg", ++ &netif_be_dbg); ++#endif ++ ++ return 0; ++} ++ ++module_init(netback_init); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netback/xenbus.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,448 @@ ++/* Xenbus code for netif backend ++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> ++ Copyright (C) 2005 XenSource Ltd ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. ++ ++ You should have received a copy of the GNU General Public License ++ along with this program; if not, write to the Free Software ++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA ++*/ ++ ++#include <stdarg.h> ++#include <linux/module.h> ++#include <xen/xenbus.h> ++#include "common.h" ++ ++#if 0 ++#undef DPRINTK ++#define DPRINTK(fmt, args...) \ ++ printk("netback/xenbus (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) ++#endif ++ ++struct backend_info { ++ struct xenbus_device *dev; ++ netif_t *netif; ++ enum xenbus_state frontend_state; ++}; ++ ++static int connect_rings(struct backend_info *); ++static void connect(struct backend_info *); ++static void backend_create_netif(struct backend_info *be); ++ ++static int netback_remove(struct xenbus_device *dev) ++{ ++ struct backend_info *be = dev->dev.driver_data; ++ ++ if (be->netif) { ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++ kfree(be); ++ dev->dev.driver_data = NULL; ++ return 0; ++} ++ ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures and switch to InitWait. 
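++ *
++ * The probe also advertises what this backend can do. A sketch of the
++ * nodes the transaction below writes into the device's xenstore
++ * directory (values exactly as in the code):
++ *
++ *   feature-sg         = 1
++ *   feature-gso-tcpv4  = 1
++ *   feature-rx-copy    = 1
++ *   feature-rx-flip    = 0   (not supported, except for old guests
++ *                             that do not grok the feature flag)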
++ */ ++static int netback_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ const char *message; ++ struct xenbus_transaction xbt; ++ int err; ++ struct backend_info *be = kzalloc(sizeof(struct backend_info), ++ GFP_KERNEL); ++ if (!be) { ++ xenbus_dev_fatal(dev, -ENOMEM, ++ "allocating backend structure"); ++ return -ENOMEM; ++ } ++ ++ be->dev = dev; ++ dev->dev.driver_data = be; ++ ++ do { ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ goto fail; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); ++ if (err) { ++ message = "writing feature-sg"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", ++ "%d", 1); ++ if (err) { ++ message = "writing feature-gso-tcpv4"; ++ goto abort_transaction; ++ } ++ ++ /* We support rx-copy path. */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-copy", "%d", 1); ++ if (err) { ++ message = "writing feature-rx-copy"; ++ goto abort_transaction; ++ } ++ ++ /* ++ * We don't support rx-flip path (except old guests who don't ++ * grok this feature flag). ++ */ ++ err = xenbus_printf(xbt, dev->nodename, ++ "feature-rx-flip", "%d", 0); ++ if (err) { ++ message = "writing feature-rx-flip"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ } while (err == -EAGAIN); ++ ++ if (err) { ++ xenbus_dev_fatal(dev, err, "completing transaction"); ++ goto fail; ++ } ++ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto fail; ++ ++ /* This kicks hotplug scripts, so do it immediately. */ ++ backend_create_netif(be); ++ ++ return 0; ++ ++abort_transaction: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, err, "%s", message); ++fail: ++ DPRINTK("failed"); ++ netback_remove(dev); ++ return err; ++} ++ ++ ++/** ++ * Handle the creation of the hotplug script environment. We add the script ++ * and vif variables to the environment, for the benefit of the vif-* hotplug ++ * scripts. ++ */ ++static int netback_uevent(struct xenbus_device *xdev, char **envp, ++ int num_envp, char *buffer, int buffer_size) ++{ ++ struct backend_info *be = xdev->dev.driver_data; ++ netif_t *netif = be->netif; ++ int i = 0, length = 0; ++ char *val; ++ ++ DPRINTK("netback_uevent"); ++ ++ val = xenbus_read(XBT_NIL, xdev->nodename, "script", NULL); ++ if (IS_ERR(val)) { ++ int err = PTR_ERR(val); ++ xenbus_dev_fatal(xdev, err, "reading script"); ++ return err; ++ } ++ else { ++ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, ++ &length, "script=%s", val); ++ kfree(val); ++ } ++ ++ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, ++ "vif=%s", netif->dev->name); ++ ++ envp[i] = NULL; ++ ++ return 0; ++} ++ ++ ++static void backend_create_netif(struct backend_info *be) ++{ ++ int err; ++ long handle; ++ struct xenbus_device *dev = be->dev; ++ ++ if (be->netif != NULL) ++ return; ++ ++ err = xenbus_scanf(XBT_NIL, dev->nodename, "handle", "%li", &handle); ++ if (err != 1) { ++ xenbus_dev_fatal(dev, err, "reading handle"); ++ return; ++ } ++ ++ be->netif = netif_alloc(dev->otherend_id, handle); ++ if (IS_ERR(be->netif)) { ++ err = PTR_ERR(be->netif); ++ be->netif = NULL; ++ xenbus_dev_fatal(dev, err, "creating interface"); ++ return; ++ } ++ ++ kobject_uevent(&dev->dev.kobj, KOBJ_ONLINE); ++} ++ ++ ++/** ++ * Callback received when the frontend's state changes. 
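++ *
++ * A sketch of how the switch below reacts:
++ *
++ *   Initialising: if we had reached Closed, tear the netif down and
++ *                 return to InitWait, ready for a reconnect
++ *   Connected:    create the netif if necessary and connect the rings
++ *   Closing:      follow the frontend to Closing
++ *   Closed:       follow to Closed; unregister once the device is no
++ *                 longer online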
++ */ ++static void frontend_changed(struct xenbus_device *dev, ++ enum xenbus_state frontend_state) ++{ ++ struct backend_info *be = dev->dev.driver_data; ++ ++ DPRINTK("%s", xenbus_strstate(frontend_state)); ++ ++ be->frontend_state = frontend_state; ++ ++ switch (frontend_state) { ++ case XenbusStateInitialising: ++ if (dev->state == XenbusStateClosed) { ++ printk(KERN_INFO "%s: %s: prepare for reconnect\n", ++ __FUNCTION__, dev->nodename); ++ if (be->netif) { ++ netif_disconnect(be->netif); ++ be->netif = NULL; ++ } ++ xenbus_switch_state(dev, XenbusStateInitWait); ++ } ++ break; ++ ++ case XenbusStateInitialised: ++ break; ++ ++ case XenbusStateConnected: ++ backend_create_netif(be); ++ if (be->netif) ++ connect(be); ++ break; ++ ++ case XenbusStateClosing: ++ xenbus_switch_state(dev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateClosed: ++ xenbus_switch_state(dev, XenbusStateClosed); ++ if (xenbus_dev_is_online(dev)) ++ break; ++ /* fall through if not online */ ++ case XenbusStateUnknown: ++ if (be->netif != NULL) ++ kobject_uevent(&dev->dev.kobj, KOBJ_OFFLINE); ++ device_unregister(&dev->dev); ++ break; ++ ++ default: ++ xenbus_dev_fatal(dev, -EINVAL, "saw state %d at frontend", ++ frontend_state); ++ break; ++ } ++} ++ ++ ++static void xen_net_read_rate(struct xenbus_device *dev, ++ unsigned long *bytes, unsigned long *usec) ++{ ++ char *s, *e; ++ unsigned long b, u; ++ char *ratestr; ++ ++ /* Default to unlimited bandwidth. */ ++ *bytes = ~0UL; ++ *usec = 0; ++ ++ ratestr = xenbus_read(XBT_NIL, dev->nodename, "rate", NULL); ++ if (IS_ERR(ratestr)) ++ return; ++ ++ s = ratestr; ++ b = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != ',')) ++ goto fail; ++ ++ s = e + 1; ++ u = simple_strtoul(s, &e, 10); ++ if ((s == e) || (*e != '\0')) ++ goto fail; ++ ++ *bytes = b; ++ *usec = u; ++ ++ kfree(ratestr); ++ return; ++ ++ fail: ++ WPRINTK("Failed to parse network rate limit. Traffic unlimited.\n"); ++ kfree(ratestr); ++} ++ ++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) ++{ ++ char *s, *e, *macstr; ++ int i; ++ ++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); ++ if (IS_ERR(macstr)) ++ return PTR_ERR(macstr); ++ ++ for (i = 0; i < ETH_ALEN; i++) { ++ mac[i] = simple_strtoul(s, &e, 16); ++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? 
'\0' : ':'))) { ++ kfree(macstr); ++ return -ENOENT; ++ } ++ s = e+1; ++ } ++ ++ kfree(macstr); ++ return 0; ++} ++ ++static void connect(struct backend_info *be) ++{ ++ int err; ++ struct xenbus_device *dev = be->dev; ++ ++ err = connect_rings(be); ++ if (err) ++ return; ++ ++ err = xen_net_read_mac(dev, be->netif->fe_dev_addr); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); ++ return; ++ } ++ ++ xen_net_read_rate(dev, &be->netif->credit_bytes, ++ &be->netif->credit_usec); ++ be->netif->remaining_credit = be->netif->credit_bytes; ++ ++ xenbus_switch_state(dev, XenbusStateConnected); ++ ++ netif_wake_queue(be->netif->dev); ++} ++ ++ ++static int connect_rings(struct backend_info *be) ++{ ++ struct xenbus_device *dev = be->dev; ++ unsigned long tx_ring_ref, rx_ring_ref; ++ unsigned int evtchn, rx_copy; ++ int err; ++ int val; ++ ++ DPRINTK(""); ++ ++ err = xenbus_gather(XBT_NIL, dev->otherend, ++ "tx-ring-ref", "%lu", &tx_ring_ref, ++ "rx-ring-ref", "%lu", &rx_ring_ref, ++ "event-channel", "%u", &evtchn, NULL); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "reading %s/ring-ref and event-channel", ++ dev->otherend); ++ return err; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, dev->otherend, "request-rx-copy", "%u", ++ &rx_copy); ++ if (err == -ENOENT) { ++ err = 0; ++ rx_copy = 0; ++ } ++ if (err < 0) { ++ xenbus_dev_fatal(dev, err, "reading %s/request-rx-copy", ++ dev->otherend); ++ return err; ++ } ++ be->netif->copying_receiver = !!rx_copy; ++ ++ if (be->netif->dev->tx_queue_len != 0) { ++ if (xenbus_scanf(XBT_NIL, dev->otherend, ++ "feature-rx-notify", "%d", &val) < 0) ++ val = 0; ++ if (val) ++ be->netif->can_queue = 1; ++ else ++ /* Must be non-zero for pfifo_fast to work. */ ++ be->netif->dev->tx_queue_len = 1; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-sg", "%d", &val) < 0) ++ val = 0; ++ if (val) { ++ be->netif->features |= NETIF_F_SG; ++ be->netif->dev->features |= NETIF_F_SG; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-gso-tcpv4", "%d", ++ &val) < 0) ++ val = 0; ++ if (val) { ++ be->netif->features |= NETIF_F_TSO; ++ be->netif->dev->features |= NETIF_F_TSO; ++ } ++ ++ if (xenbus_scanf(XBT_NIL, dev->otherend, "feature-no-csum-offload", ++ "%d", &val) < 0) ++ val = 0; ++ if (val) { ++ be->netif->features &= ~NETIF_F_IP_CSUM; ++ be->netif->dev->features &= ~NETIF_F_IP_CSUM; ++ } ++ ++ /* Map the shared frame, irq etc. 
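++	 * (netif_map(), over in interface.c, is expected to grant-map the
++	 *  two ring pages and bind the event channel to the backend's
++	 *  interrupt handler; a sketch only -- see that file for details.)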
*/ ++ err = netif_map(be->netif, tx_ring_ref, rx_ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_fatal(dev, err, ++ "mapping shared-frames %lu/%lu port %u", ++ tx_ring_ref, rx_ring_ref, evtchn); ++ return err; ++ } ++ return 0; ++} ++ ++ ++/* ** Driver Registration ** */ ++ ++ ++static struct xenbus_device_id netback_ids[] = { ++ { "vif" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver netback = { ++ .name = "vif", ++ .owner = THIS_MODULE, ++ .ids = netback_ids, ++ .probe = netback_probe, ++ .remove = netback_remove, ++ .uevent = netback_uevent, ++ .otherend_changed = frontend_changed, ++}; ++ ++ ++void netif_xenbus_init(void) ++{ ++ xenbus_register_backend(&netback); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netfront/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,4 @@ ++ ++obj-$(CONFIG_XEN_NETDEV_FRONTEND) := xennet.o ++ ++xennet-objs := netfront.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/netfront/netfront.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,2133 @@ ++/****************************************************************************** ++ * Virtual network driver for conversing with remote driver backends. ++ * ++ * Copyright (c) 2002-2005, K A Fraser ++ * Copyright (c) 2005, XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */
++
++#include <linux/module.h>
++#include <linux/version.h>
++#include <linux/kernel.h>
++#include <linux/sched.h>
++#include <linux/slab.h>
++#include <linux/string.h>
++#include <linux/errno.h>
++#include <linux/netdevice.h>
++#include <linux/inetdevice.h>
++#include <linux/etherdevice.h>
++#include <linux/skbuff.h>
++#include <linux/init.h>
++#include <linux/bitops.h>
++#include <linux/ethtool.h>
++#include <linux/in.h>
++#include <linux/if_ether.h>
++#include <linux/io.h>
++#include <linux/moduleparam.h>
++#include <net/sock.h>
++#include <net/pkt_sched.h>
++#include <net/arp.h>
++#include <net/route.h>
++#include <asm/uaccess.h>
++#include <xen/evtchn.h>
++#include <xen/xenbus.h>
++#include <xen/interface/io/netif.h>
++#include <xen/interface/memory.h>
++#include <xen/balloon.h>
++#include <asm/page.h>
++#include <asm/maddr.h>
++#include <xen/interface/grant_table.h>
++#include <xen/gnttab.h>
++
++#ifdef HAVE_XEN_PLATFORM_COMPAT_H
++#include <xen/platform-compat.h>
++#endif
++
++struct netfront_cb {
++	struct page *page;
++	unsigned offset;
++};
++
++#define NETFRONT_SKB_CB(skb)	((struct netfront_cb *)((skb)->cb))
++
++/*
++ * Mutually-exclusive module options to select receive data path:
++ *  rx_copy : Packets are copied by network backend into local memory
++ *  rx_flip : Page containing packet data is transferred to our ownership
++ * For fully-virtualised guests there is no option - copying must be used.
++ * For paravirtualised guests, flipping is the default.
++ */
++#ifdef CONFIG_XEN
++static int MODPARM_rx_copy = 0;
++module_param_named(rx_copy, MODPARM_rx_copy, bool, 0);
++MODULE_PARM_DESC(rx_copy, "Copy packets from network card (rather than flip)");
++static int MODPARM_rx_flip = 0;
++module_param_named(rx_flip, MODPARM_rx_flip, bool, 0);
++MODULE_PARM_DESC(rx_flip, "Flip packets from network card (rather than copy)");
++#else
++static const int MODPARM_rx_copy = 1;
++static const int MODPARM_rx_flip = 0;
++#endif
++
++#define RX_COPY_THRESHOLD 256
++
++/* If we don't have GSO, fake things up so that we never try to use it. */
++#if defined(NETIF_F_GSO)
++#define HAVE_GSO	1
++#define HAVE_TSO	1 /* TSO is a subset of GSO */
++static inline void dev_disable_gso_features(struct net_device *dev)
++{
++	/* Turn off all GSO bits except ROBUST. */
++	dev->features &= (1 << NETIF_F_GSO_SHIFT) - 1;
++	dev->features |= NETIF_F_GSO_ROBUST;
++}
++#elif defined(NETIF_F_TSO)
++#define HAVE_TSO	1
++
++/* Some older kernels cannot cope with incorrect checksums,
++ * particularly in netfilter. I'm not sure there is 100% correlation
++ * with the presence of NETIF_F_TSO but it appears to be a good first
++ * approximation.
++ */
++#define HAVE_NO_CSUM_OFFLOAD	1
++
++#define gso_size tso_size
++#define gso_segs tso_segs
++static inline void dev_disable_gso_features(struct net_device *dev)
++{
++	/* Turn off all TSO bits. */
++	dev->features &= ~NETIF_F_TSO;
++}
++static inline int skb_is_gso(const struct sk_buff *skb)
++{
++	return skb_shinfo(skb)->tso_size;
++}
++static inline int skb_gso_ok(struct sk_buff *skb, int features)
++{
++	return (features & NETIF_F_TSO);
++}
++
++static inline int netif_needs_gso(struct net_device *dev, struct sk_buff *skb)
++{
++	return skb_is_gso(skb) &&
++	       (!skb_gso_ok(skb, dev->features) ||
++		unlikely(skb->ip_summed != CHECKSUM_HW));
++}
++#else
++#define netif_needs_gso(dev, skb)	0
++#define dev_disable_gso_features(dev)	((void)0)
++#endif
++
++#define GRANT_INVALID_REF	0
++
++#define NET_TX_RING_SIZE __RING_SIZE((struct netif_tx_sring *)0, PAGE_SIZE)
++#define NET_RX_RING_SIZE __RING_SIZE((struct netif_rx_sring *)0, PAGE_SIZE)
++
++struct netfront_info {
++	struct list_head list;
++	struct net_device *netdev;
++
++	struct net_device_stats stats;
++
++	struct netif_tx_front_ring tx;
++	struct netif_rx_front_ring rx;
++
++	spinlock_t tx_lock;
++	spinlock_t rx_lock;
++
++	unsigned int irq;
++	unsigned int copying_receiver;
++	unsigned int carrier;
++
++	/* Receive-ring batched refills. */
++#define RX_MIN_TARGET 8
++#define RX_DFL_MIN_TARGET 64
++#define RX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
++	unsigned rx_min_target, rx_max_target, rx_target;
++	struct sk_buff_head rx_batch;
++
++	struct timer_list rx_refill_timer;
++
++	/*
++	 * {tx,rx}_skbs store outstanding skbuffs. The first entry in tx_skbs
++	 * is an index into a chain of free entries.
++	 */
++	struct sk_buff *tx_skbs[NET_TX_RING_SIZE+1];
++	struct sk_buff *rx_skbs[NET_RX_RING_SIZE];
++
++#define TX_MAX_TARGET min_t(int, NET_RX_RING_SIZE, 256)
++	grant_ref_t gref_tx_head;
++	grant_ref_t grant_tx_ref[NET_TX_RING_SIZE + 1];
++	grant_ref_t gref_rx_head;
++	grant_ref_t grant_rx_ref[NET_RX_RING_SIZE];
++
++	struct xenbus_device *xbdev;
++	int tx_ring_ref;
++	int rx_ring_ref;
++	u8 mac[ETH_ALEN];
++
++	unsigned long rx_pfn_array[NET_RX_RING_SIZE];
++	struct multicall_entry rx_mcl[NET_RX_RING_SIZE+1];
++	struct mmu_update rx_mmu[NET_RX_RING_SIZE];
++};
++
++struct netfront_rx_info {
++	struct netif_rx_response rx;
++	struct netif_extra_info extras[XEN_NETIF_EXTRA_TYPE_MAX - 1];
++};
++
++/*
++ * Implement our own carrier flag: the network stack's version causes delays
++ * when the carrier is re-enabled (in particular, dev_activate() may not
++ * immediately be called, which can cause packet loss).
++ */
++#define netfront_carrier_on(netif)	((netif)->carrier = 1)
++#define netfront_carrier_off(netif)	((netif)->carrier = 0)
++#define netfront_carrier_ok(netif)	((netif)->carrier)
++
++/*
++ * Access macros for acquiring and freeing slots in tx_skbs[].
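++ *
++ * tx_skbs[0] is the free-list head; a free entry stores the index of
++ * the next free entry cast to a pointer. An illustrative pop/push
++ * (the index values are invented for the example):
++ *
++ *   id = get_id_from_freelist(np->tx_skbs); // id = 3, say; head now
++ *                                           // points at old tx_skbs[3]
++ *   np->tx_skbs[id] = skb;                  // slot in use
++ *   ...
++ *   add_id_to_freelist(np->tx_skbs, id);    // push 3 back on the list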
++ */ ++ ++static inline void add_id_to_freelist(struct sk_buff **list, unsigned short id) ++{ ++ list[id] = list[0]; ++ list[0] = (void *)(unsigned long)id; ++} ++ ++static inline unsigned short get_id_from_freelist(struct sk_buff **list) ++{ ++ unsigned int id = (unsigned int)(unsigned long)list[0]; ++ list[0] = list[id]; ++ return id; ++} ++ ++static inline int xennet_rxidx(RING_IDX idx) ++{ ++ return idx & (NET_RX_RING_SIZE - 1); ++} ++ ++static inline struct sk_buff *xennet_get_rx_skb(struct netfront_info *np, ++ RING_IDX ri) ++{ ++ int i = xennet_rxidx(ri); ++ struct sk_buff *skb = np->rx_skbs[i]; ++ np->rx_skbs[i] = NULL; ++ return skb; ++} ++ ++static inline grant_ref_t xennet_get_rx_ref(struct netfront_info *np, ++ RING_IDX ri) ++{ ++ int i = xennet_rxidx(ri); ++ grant_ref_t ref = np->grant_rx_ref[i]; ++ np->grant_rx_ref[i] = GRANT_INVALID_REF; ++ return ref; ++} ++ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("netfront (%s:%d) " fmt, \ ++ __FUNCTION__, __LINE__, ##args) ++#define IPRINTK(fmt, args...) \ ++ printk(KERN_INFO "netfront: " fmt, ##args) ++#define WPRINTK(fmt, args...) \ ++ printk(KERN_WARNING "netfront: " fmt, ##args) ++ ++static int setup_device(struct xenbus_device *, struct netfront_info *); ++static struct net_device *create_netdev(struct xenbus_device *); ++ ++static void end_access(int, void *); ++static void netif_disconnect_backend(struct netfront_info *); ++ ++static int network_connect(struct net_device *); ++static void network_tx_buf_gc(struct net_device *); ++static void network_alloc_rx_buffers(struct net_device *); ++static int send_fake_arp(struct net_device *); ++ ++static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs); ++ ++#ifdef CONFIG_SYSFS ++static int xennet_sysfs_addif(struct net_device *netdev); ++static void xennet_sysfs_delif(struct net_device *netdev); ++#else /* !CONFIG_SYSFS */ ++#define xennet_sysfs_addif(dev) (0) ++#define xennet_sysfs_delif(dev) do { } while(0) ++#endif ++ ++static inline int xennet_can_sg(struct net_device *dev) ++{ ++ return dev->features & NETIF_F_SG; ++} ++ ++/** ++ * Entry point to this code when a new device is created. Allocate the basic ++ * structures and the ring buffers for communication with the backend, and ++ * inform the backend of the appropriate details for those. 
++ */ ++static int __devinit netfront_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err; ++ struct net_device *netdev; ++ struct netfront_info *info; ++ ++ netdev = create_netdev(dev); ++ if (IS_ERR(netdev)) { ++ err = PTR_ERR(netdev); ++ xenbus_dev_fatal(dev, err, "creating netdev"); ++ return err; ++ } ++ ++ info = netdev_priv(netdev); ++ dev->dev.driver_data = info; ++ ++ err = register_netdev(info->netdev); ++ if (err) { ++ printk(KERN_WARNING "%s: register_netdev err=%d\n", ++ __FUNCTION__, err); ++ goto fail; ++ } ++ ++ err = xennet_sysfs_addif(info->netdev); ++ if (err) { ++ unregister_netdev(info->netdev); ++ printk(KERN_WARNING "%s: add sysfs failed err=%d\n", ++ __FUNCTION__, err); ++ goto fail; ++ } ++ ++ return 0; ++ ++ fail: ++ free_netdev(netdev); ++ dev->dev.driver_data = NULL; ++ return err; ++} ++ ++static int __devexit netfront_remove(struct xenbus_device *dev) ++{ ++ struct netfront_info *info = dev->dev.driver_data; ++ ++ DPRINTK("%s\n", dev->nodename); ++ ++ netif_disconnect_backend(info); ++ ++ del_timer_sync(&info->rx_refill_timer); ++ ++ xennet_sysfs_delif(info->netdev); ++ ++ unregister_netdev(info->netdev); ++ ++ free_netdev(info->netdev); ++ ++ return 0; ++} ++ ++/** ++ * We are reconnecting to the backend, due to a suspend/resume, or a backend ++ * driver restart. We tear down our netif structure and recreate it, but ++ * leave the device-layer structures intact so that this is transparent to the ++ * rest of the kernel. ++ */ ++static int netfront_resume(struct xenbus_device *dev) ++{ ++ struct netfront_info *info = dev->dev.driver_data; ++ ++ DPRINTK("%s\n", dev->nodename); ++ ++ netif_disconnect_backend(info); ++ return 0; ++} ++ ++static int xen_net_read_mac(struct xenbus_device *dev, u8 mac[]) ++{ ++ char *s, *e, *macstr; ++ int i; ++ ++ macstr = s = xenbus_read(XBT_NIL, dev->nodename, "mac", NULL); ++ if (IS_ERR(macstr)) ++ return PTR_ERR(macstr); ++ ++ for (i = 0; i < ETH_ALEN; i++) { ++ mac[i] = simple_strtoul(s, &e, 16); ++ if ((s == e) || (*e != ((i == ETH_ALEN-1) ? '\0' : ':'))) { ++ kfree(macstr); ++ return -ENOENT; ++ } ++ s = e+1; ++ } ++ ++ kfree(macstr); ++ return 0; ++} ++ ++/* Common code used when first setting up, and when resuming. */ ++static int talk_to_backend(struct xenbus_device *dev, ++ struct netfront_info *info) ++{ ++ const char *message; ++ struct xenbus_transaction xbt; ++ int err; ++ ++ err = xen_net_read_mac(dev, info->mac); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "parsing %s/mac", dev->nodename); ++ goto out; ++ } ++ ++ /* Create shared ring, alloc event channel. 
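++	 * (setup_device() grants the two ring pages and binds an event
++	 *  channel; the transaction below then publishes tx-ring-ref,
++	 *  rx-ring-ref, event-channel, request-rx-copy and the feature-*
++	 *  flags that netback's connect_rings() reads on its side.)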
*/ ++ err = setup_device(dev, info); ++ if (err) ++ goto out; ++ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(dev, err, "starting transaction"); ++ goto destroy_ring; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "tx-ring-ref","%u", ++ info->tx_ring_ref); ++ if (err) { ++ message = "writing tx ring-ref"; ++ goto abort_transaction; ++ } ++ err = xenbus_printf(xbt, dev->nodename, "rx-ring-ref","%u", ++ info->rx_ring_ref); ++ if (err) { ++ message = "writing rx ring-ref"; ++ goto abort_transaction; ++ } ++ err = xenbus_printf(xbt, dev->nodename, ++ "event-channel", "%u", ++ irq_to_evtchn_port(info->irq)); ++ if (err) { ++ message = "writing event-channel"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "request-rx-copy", "%u", ++ info->copying_receiver); ++ if (err) { ++ message = "writing request-rx-copy"; ++ goto abort_transaction; ++ } ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-rx-notify", "%d", 1); ++ if (err) { ++ message = "writing feature-rx-notify"; ++ goto abort_transaction; ++ } ++ ++#ifdef HAVE_NO_CSUM_OFFLOAD ++ err = xenbus_printf(xbt, dev->nodename, "feature-no-csum-offload", "%d", 1); ++ if (err) { ++ message = "writing feature-no-csum-offload"; ++ goto abort_transaction; ++ } ++#endif ++ ++ err = xenbus_printf(xbt, dev->nodename, "feature-sg", "%d", 1); ++ if (err) { ++ message = "writing feature-sg"; ++ goto abort_transaction; ++ } ++ ++#ifdef HAVE_TSO ++ err = xenbus_printf(xbt, dev->nodename, "feature-gso-tcpv4", "%d", 1); ++ if (err) { ++ message = "writing feature-gso-tcpv4"; ++ goto abort_transaction; ++ } ++#endif ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err) { ++ if (err == -EAGAIN) ++ goto again; ++ xenbus_dev_fatal(dev, err, "completing transaction"); ++ goto destroy_ring; ++ } ++ ++ return 0; ++ ++ abort_transaction: ++ xenbus_transaction_end(xbt, 1); ++ xenbus_dev_fatal(dev, err, "%s", message); ++ destroy_ring: ++ netif_disconnect_backend(info); ++ out: ++ return err; ++} ++ ++static int setup_device(struct xenbus_device *dev, struct netfront_info *info) ++{ ++ struct netif_tx_sring *txs; ++ struct netif_rx_sring *rxs; ++ int err; ++ struct net_device *netdev = info->netdev; ++ ++ info->tx_ring_ref = GRANT_INVALID_REF; ++ info->rx_ring_ref = GRANT_INVALID_REF; ++ info->rx.sring = NULL; ++ info->tx.sring = NULL; ++ info->irq = 0; ++ ++ txs = (struct netif_tx_sring *)get_zeroed_page(GFP_KERNEL); ++ if (!txs) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(dev, err, "allocating tx ring page"); ++ goto fail; ++ } ++ SHARED_RING_INIT(txs); ++ FRONT_RING_INIT(&info->tx, txs, PAGE_SIZE); ++ ++ err = xenbus_grant_ring(dev, virt_to_mfn(txs)); ++ if (err < 0) { ++ free_page((unsigned long)txs); ++ goto fail; ++ } ++ info->tx_ring_ref = err; ++ ++ rxs = (struct netif_rx_sring *)get_zeroed_page(GFP_KERNEL); ++ if (!rxs) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(dev, err, "allocating rx ring page"); ++ goto fail; ++ } ++ SHARED_RING_INIT(rxs); ++ FRONT_RING_INIT(&info->rx, rxs, PAGE_SIZE); ++ ++ err = xenbus_grant_ring(dev, virt_to_mfn(rxs)); ++ if (err < 0) { ++ free_page((unsigned long)rxs); ++ goto fail; ++ } ++ info->rx_ring_ref = err; ++ ++ memcpy(netdev->dev_addr, info->mac, ETH_ALEN); ++ ++ err = bind_listening_port_to_irqhandler( ++ dev->otherend_id, netif_int, SA_SAMPLE_RANDOM, netdev->name, ++ netdev); ++ if (err < 0) ++ goto fail; ++ info->irq = err; ++ ++ return 0; ++ ++ fail: ++ return err; ++} ++ ++/** ++ * Callback received when the backend's state changes. 
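++ *
++ * A sketch of the handshake as handled by the switch below:
++ *
++ *   backend InitWait --> network_connect(); switch to Connected and
++ *                        send a fake ARP so bridges re-learn our MAC
++ *   backend Closing  --> xenbus_frontend_closed()
++ *   other states     --> ignored here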
++ */ ++static void backend_changed(struct xenbus_device *dev, ++ enum xenbus_state backend_state) ++{ ++ struct netfront_info *np = dev->dev.driver_data; ++ struct net_device *netdev = np->netdev; ++ ++ DPRINTK("%s\n", xenbus_strstate(backend_state)); ++ ++ switch (backend_state) { ++ case XenbusStateInitialising: ++ case XenbusStateInitialised: ++ case XenbusStateConnected: ++ case XenbusStateUnknown: ++ case XenbusStateClosed: ++ break; ++ ++ case XenbusStateInitWait: ++ if (dev->state != XenbusStateInitialising) ++ break; ++ if (network_connect(netdev) != 0) ++ break; ++ xenbus_switch_state(dev, XenbusStateConnected); ++ (void)send_fake_arp(netdev); ++ break; ++ ++ case XenbusStateClosing: ++ xenbus_frontend_closed(dev); ++ break; ++ } ++} ++ ++/** Send a packet on a net device to encourage switches to learn the ++ * MAC. We send a fake ARP request. ++ * ++ * @param dev device ++ * @return 0 on success, error code otherwise ++ */ ++static int send_fake_arp(struct net_device *dev) ++{ ++ struct sk_buff *skb; ++ u32 src_ip, dst_ip; ++ ++ dst_ip = INADDR_BROADCAST; ++ src_ip = inet_select_addr(dev, dst_ip, RT_SCOPE_LINK); ++ ++ /* No IP? Then nothing to do. */ ++ if (src_ip == 0) ++ return 0; ++ ++ skb = arp_create(ARPOP_REPLY, ETH_P_ARP, ++ dst_ip, dev, src_ip, ++ /*dst_hw*/ NULL, /*src_hw*/ NULL, ++ /*target_hw*/ dev->dev_addr); ++ if (skb == NULL) ++ return -ENOMEM; ++ ++ return dev_queue_xmit(skb); ++} ++ ++static inline int netfront_tx_slot_available(struct netfront_info *np) ++{ ++ return ((np->tx.req_prod_pvt - np->tx.rsp_cons) < ++ (TX_MAX_TARGET - MAX_SKB_FRAGS - 2)); ++} ++ ++static inline void network_maybe_wake_tx(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ ++ if (unlikely(netif_queue_stopped(dev)) && ++ netfront_tx_slot_available(np) && ++ likely(netif_running(dev))) ++ netif_wake_queue(dev); ++} ++ ++static int network_open(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ ++ memset(&np->stats, 0, sizeof(np->stats)); ++ ++ spin_lock_bh(&np->rx_lock); ++ if (netfront_carrier_ok(np)) { ++ network_alloc_rx_buffers(dev); ++ np->rx.sring->rsp_event = np->rx.rsp_cons + 1; ++ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) ++ netif_rx_schedule(dev); ++ } ++ spin_unlock_bh(&np->rx_lock); ++ ++ network_maybe_wake_tx(dev); ++ ++ return 0; ++} ++ ++static void network_tx_buf_gc(struct net_device *dev) ++{ ++ RING_IDX cons, prod; ++ unsigned short id; ++ struct netfront_info *np = netdev_priv(dev); ++ struct sk_buff *skb; ++ ++ BUG_ON(!netfront_carrier_ok(np)); ++ ++ do { ++ prod = np->tx.sring->rsp_prod; ++ rmb(); /* Ensure we see responses up to 'rp'. */ ++ ++ for (cons = np->tx.rsp_cons; cons != prod; cons++) { ++ struct netif_tx_response *txrsp; ++ ++ txrsp = RING_GET_RESPONSE(&np->tx, cons); ++ if (txrsp->status == NETIF_RSP_NULL) ++ continue; ++ ++ id = txrsp->id; ++ skb = np->tx_skbs[id]; ++ if (unlikely(gnttab_query_foreign_access( ++ np->grant_tx_ref[id]) != 0)) { ++ printk(KERN_ALERT "network_tx_buf_gc: warning " ++ "-- grant still in use by backend " ++ "domain.\n"); ++ BUG(); ++ } ++ gnttab_end_foreign_access_ref( ++ np->grant_tx_ref[id], GNTMAP_readonly); ++ gnttab_release_grant_reference( ++ &np->gref_tx_head, np->grant_tx_ref[id]); ++ np->grant_tx_ref[id] = GRANT_INVALID_REF; ++ add_id_to_freelist(np->tx_skbs, id); ++ dev_kfree_skb_irq(skb); ++ } ++ ++ np->tx.rsp_cons = prod; ++ ++ /* ++ * Set a new event, then check for race with update of tx_cons. 
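++		 * (sring->rsp_event is the backend's notify threshold: it
++		 * raises our interrupt once rsp_prod passes it, so placing
++		 * it half-way into the still-outstanding requests re-arms
++		 * notification without taking one interrupt per packet.)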
++ * Note that it is essential to schedule a callback, no matter ++ * how few buffers are pending. Even if there is space in the ++ * transmit ring, higher layers may be blocked because too much ++ * data is outstanding: in such cases notification from Xen is ++ * likely to be the only kick that we'll get. ++ */ ++ np->tx.sring->rsp_event = ++ prod + ((np->tx.sring->req_prod - prod) >> 1) + 1; ++ mb(); ++ } while ((cons == prod) && (prod != np->tx.sring->rsp_prod)); ++ ++ network_maybe_wake_tx(dev); ++} ++ ++static void rx_refill_timeout(unsigned long data) ++{ ++ struct net_device *dev = (struct net_device *)data; ++ netif_rx_schedule(dev); ++} ++ ++static void network_alloc_rx_buffers(struct net_device *dev) ++{ ++ unsigned short id; ++ struct netfront_info *np = netdev_priv(dev); ++ struct sk_buff *skb; ++ struct page *page; ++ int i, batch_target, notify; ++ RING_IDX req_prod = np->rx.req_prod_pvt; ++ struct xen_memory_reservation reservation; ++ grant_ref_t ref; ++ unsigned long pfn; ++ void *vaddr; ++ int nr_flips; ++ netif_rx_request_t *req; ++ ++ if (unlikely(!netfront_carrier_ok(np))) ++ return; ++ ++ /* ++ * Allocate skbuffs greedily, even though we batch updates to the ++ * receive ring. This creates a less bursty demand on the memory ++ * allocator, so should reduce the chance of failed allocation requests ++ * both for ourself and for other kernel subsystems. ++ */ ++ batch_target = np->rx_target - (req_prod - np->rx.rsp_cons); ++ for (i = skb_queue_len(&np->rx_batch); i < batch_target; i++) { ++ /* ++ * Allocate an skb and a page. Do not use __dev_alloc_skb as ++ * that will allocate page-sized buffers which is not ++ * necessary here. ++ * 16 bytes added as necessary headroom for netif_receive_skb. ++ */ ++ skb = alloc_skb(RX_COPY_THRESHOLD + 16 + NET_IP_ALIGN, ++ GFP_ATOMIC | __GFP_NOWARN); ++ if (unlikely(!skb)) ++ goto no_skb; ++ ++ page = alloc_page(GFP_ATOMIC | __GFP_NOWARN); ++ if (!page) { ++ kfree_skb(skb); ++no_skb: ++ /* Any skbuffs queued for refill? Force them out. */ ++ if (i != 0) ++ goto refill; ++ /* Could not allocate any skbuffs. Try again later. */ ++ mod_timer(&np->rx_refill_timer, ++ jiffies + (HZ/10)); ++ break; ++ } ++ ++ skb_reserve(skb, 16 + NET_IP_ALIGN); /* mimic dev_alloc_skb() */ ++ skb_shinfo(skb)->frags[0].page = page; ++ skb_shinfo(skb)->nr_frags = 1; ++ __skb_queue_tail(&np->rx_batch, skb); ++ } ++ ++ /* Is the batch large enough to be worthwhile? */ ++ if (i < (np->rx_target/2)) { ++ if (req_prod > np->rx.sring->req_prod) ++ goto push; ++ return; ++ } ++ ++ /* Adjust our fill target if we risked running out of buffers. 
*/
++ if (((req_prod - np->rx.sring->rsp_prod) < (np->rx_target / 4)) &&
++ ((np->rx_target *= 2) > np->rx_max_target))
++ np->rx_target = np->rx_max_target;
++
++ refill:
++ for (nr_flips = i = 0; ; i++) {
++ if ((skb = __skb_dequeue(&np->rx_batch)) == NULL)
++ break;
++
++ skb->dev = dev;
++
++ id = xennet_rxidx(req_prod + i);
++
++ BUG_ON(np->rx_skbs[id]);
++ np->rx_skbs[id] = skb;
++
++ ref = gnttab_claim_grant_reference(&np->gref_rx_head);
++ BUG_ON((signed short)ref < 0);
++ np->grant_rx_ref[id] = ref;
++
++ pfn = page_to_pfn(skb_shinfo(skb)->frags[0].page);
++ vaddr = page_address(skb_shinfo(skb)->frags[0].page);
++
++ req = RING_GET_REQUEST(&np->rx, req_prod + i);
++ if (!np->copying_receiver) {
++ gnttab_grant_foreign_transfer_ref(ref,
++ np->xbdev->otherend_id,
++ pfn);
++ np->rx_pfn_array[nr_flips] = pfn_to_mfn(pfn);
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ /* Remove this page before passing
++ * back to Xen. */
++ set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
++ MULTI_update_va_mapping(np->rx_mcl+i,
++ (unsigned long)vaddr,
++ __pte(0), 0);
++ }
++ nr_flips++;
++ } else {
++ gnttab_grant_foreign_access_ref(ref,
++ np->xbdev->otherend_id,
++ pfn_to_mfn(pfn),
++ 0);
++ }
++
++ req->id = id;
++ req->gref = ref;
++ }
++
++ if (nr_flips != 0) {
++ /* Tell the balloon driver what is going on. */
++ balloon_update_driver_allowance(i);
++
++ set_xen_guest_handle(reservation.extent_start,
++ np->rx_pfn_array);
++ reservation.nr_extents = nr_flips;
++ reservation.extent_order = 0;
++ reservation.address_bits = 0;
++ reservation.domid = DOMID_SELF;
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ /* After all PTEs have been zapped, flush the TLB. */
++ np->rx_mcl[i-1].args[MULTI_UVMFLAGS_INDEX] =
++ UVMF_TLB_FLUSH|UVMF_ALL;
++
++ /* Give away a batch of pages. */
++ np->rx_mcl[i].op = __HYPERVISOR_memory_op;
++ np->rx_mcl[i].args[0] = XENMEM_decrease_reservation;
++ np->rx_mcl[i].args[1] = (unsigned long)&reservation;
++
++ /* Zap PTEs and give away pages in one big
++ * multicall. */
++ (void)HYPERVISOR_multicall(np->rx_mcl, i+1);
++
++ /* Check return status of HYPERVISOR_memory_op(). */
++ if (unlikely(np->rx_mcl[i].result != i))
++ panic("Unable to reduce memory reservation\n");
++ } else {
++ if (HYPERVISOR_memory_op(XENMEM_decrease_reservation,
++ &reservation) != i)
++ panic("Unable to reduce memory reservation\n");
++ }
++ } else {
++ wmb();
++ }
++
++ /* Above is a suitable barrier to ensure backend will see requests. 
*/ ++ np->rx.req_prod_pvt = req_prod + i; ++ push: ++ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->rx, notify); ++ if (notify) ++ notify_remote_via_irq(np->irq); ++} ++ ++static void xennet_make_frags(struct sk_buff *skb, struct net_device *dev, ++ struct netif_tx_request *tx) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ char *data = skb->data; ++ unsigned long mfn; ++ RING_IDX prod = np->tx.req_prod_pvt; ++ int frags = skb_shinfo(skb)->nr_frags; ++ unsigned int offset = offset_in_page(data); ++ unsigned int len = skb_headlen(skb); ++ unsigned int id; ++ grant_ref_t ref; ++ int i; ++ ++ while (len > PAGE_SIZE - offset) { ++ tx->size = PAGE_SIZE - offset; ++ tx->flags |= NETTXF_more_data; ++ len -= tx->size; ++ data += tx->size; ++ offset = 0; ++ ++ id = get_id_from_freelist(np->tx_skbs); ++ np->tx_skbs[id] = skb_get(skb); ++ tx = RING_GET_REQUEST(&np->tx, prod++); ++ tx->id = id; ++ ref = gnttab_claim_grant_reference(&np->gref_tx_head); ++ BUG_ON((signed short)ref < 0); ++ ++ mfn = virt_to_mfn(data); ++ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, ++ mfn, GNTMAP_readonly); ++ ++ tx->gref = np->grant_tx_ref[id] = ref; ++ tx->offset = offset; ++ tx->size = len; ++ tx->flags = 0; ++ } ++ ++ for (i = 0; i < frags; i++) { ++ skb_frag_t *frag = skb_shinfo(skb)->frags + i; ++ ++ tx->flags |= NETTXF_more_data; ++ ++ id = get_id_from_freelist(np->tx_skbs); ++ np->tx_skbs[id] = skb_get(skb); ++ tx = RING_GET_REQUEST(&np->tx, prod++); ++ tx->id = id; ++ ref = gnttab_claim_grant_reference(&np->gref_tx_head); ++ BUG_ON((signed short)ref < 0); ++ ++ mfn = pfn_to_mfn(page_to_pfn(frag->page)); ++ gnttab_grant_foreign_access_ref(ref, np->xbdev->otherend_id, ++ mfn, GNTMAP_readonly); ++ ++ tx->gref = np->grant_tx_ref[id] = ref; ++ tx->offset = frag->page_offset; ++ tx->size = frag->size; ++ tx->flags = 0; ++ } ++ ++ np->tx.req_prod_pvt = prod; ++} ++ ++static int network_start_xmit(struct sk_buff *skb, struct net_device *dev) ++{ ++ unsigned short id; ++ struct netfront_info *np = netdev_priv(dev); ++ struct netif_tx_request *tx; ++ struct netif_extra_info *extra; ++ char *data = skb->data; ++ RING_IDX i; ++ grant_ref_t ref; ++ unsigned long mfn; ++ int notify; ++ int frags = skb_shinfo(skb)->nr_frags; ++ unsigned int offset = offset_in_page(data); ++ unsigned int len = skb_headlen(skb); ++ ++ frags += (offset + len + PAGE_SIZE - 1) / PAGE_SIZE; ++ if (unlikely(frags > MAX_SKB_FRAGS + 1)) { ++ printk(KERN_ALERT "xennet: skb rides the rocket: %d frags\n", ++ frags); ++ dump_stack(); ++ goto drop; ++ } ++ ++ spin_lock_irq(&np->tx_lock); ++ ++ if (unlikely(!netfront_carrier_ok(np) || ++ (frags > 1 && !xennet_can_sg(dev)) || ++ netif_needs_gso(dev, skb))) { ++ spin_unlock_irq(&np->tx_lock); ++ goto drop; ++ } ++ ++ i = np->tx.req_prod_pvt; ++ ++ id = get_id_from_freelist(np->tx_skbs); ++ np->tx_skbs[id] = skb; ++ ++ tx = RING_GET_REQUEST(&np->tx, i); ++ ++ tx->id = id; ++ ref = gnttab_claim_grant_reference(&np->gref_tx_head); ++ BUG_ON((signed short)ref < 0); ++ mfn = virt_to_mfn(data); ++ gnttab_grant_foreign_access_ref( ++ ref, np->xbdev->otherend_id, mfn, GNTMAP_readonly); ++ tx->gref = np->grant_tx_ref[id] = ref; ++ tx->offset = offset; ++ tx->size = len; ++ ++ tx->flags = 0; ++ extra = NULL; ++ ++ if (skb->ip_summed == CHECKSUM_HW) /* local packet? */ ++ tx->flags |= NETTXF_csum_blank | NETTXF_data_validated; ++#ifdef CONFIG_XEN ++ if (skb->proto_data_valid) /* remote but checksummed? 
*/ ++ tx->flags |= NETTXF_data_validated; ++#endif ++ ++#ifdef HAVE_TSO ++ if (skb_shinfo(skb)->gso_size) { ++ struct netif_extra_info *gso = (struct netif_extra_info *) ++ RING_GET_REQUEST(&np->tx, ++i); ++ ++ if (extra) ++ extra->flags |= XEN_NETIF_EXTRA_FLAG_MORE; ++ else ++ tx->flags |= NETTXF_extra_info; ++ ++ gso->u.gso.size = skb_shinfo(skb)->gso_size; ++ gso->u.gso.type = XEN_NETIF_GSO_TYPE_TCPV4; ++ gso->u.gso.pad = 0; ++ gso->u.gso.features = 0; ++ ++ gso->type = XEN_NETIF_EXTRA_TYPE_GSO; ++ gso->flags = 0; ++ extra = gso; ++ } ++#endif ++ ++ np->tx.req_prod_pvt = i + 1; ++ ++ xennet_make_frags(skb, dev, tx); ++ tx->size = skb->len; ++ ++ RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&np->tx, notify); ++ if (notify) ++ notify_remote_via_irq(np->irq); ++ ++ network_tx_buf_gc(dev); ++ ++ if (!netfront_tx_slot_available(np)) ++ netif_stop_queue(dev); ++ ++ spin_unlock_irq(&np->tx_lock); ++ ++ np->stats.tx_bytes += skb->len; ++ np->stats.tx_packets++; ++ ++ return 0; ++ ++ drop: ++ np->stats.tx_dropped++; ++ dev_kfree_skb(skb); ++ return 0; ++} ++ ++static irqreturn_t netif_int(int irq, void *dev_id, struct pt_regs *ptregs) ++{ ++ struct net_device *dev = dev_id; ++ struct netfront_info *np = netdev_priv(dev); ++ unsigned long flags; ++ ++ spin_lock_irqsave(&np->tx_lock, flags); ++ ++ if (likely(netfront_carrier_ok(np))) { ++ network_tx_buf_gc(dev); ++ /* Under tx_lock: protects access to rx shared-ring indexes. */ ++ if (RING_HAS_UNCONSUMED_RESPONSES(&np->rx)) ++ netif_rx_schedule(dev); ++ } ++ ++ spin_unlock_irqrestore(&np->tx_lock, flags); ++ ++ return IRQ_HANDLED; ++} ++ ++static void xennet_move_rx_slot(struct netfront_info *np, struct sk_buff *skb, ++ grant_ref_t ref) ++{ ++ int new = xennet_rxidx(np->rx.req_prod_pvt); ++ ++ BUG_ON(np->rx_skbs[new]); ++ np->rx_skbs[new] = skb; ++ np->grant_rx_ref[new] = ref; ++ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->id = new; ++ RING_GET_REQUEST(&np->rx, np->rx.req_prod_pvt)->gref = ref; ++ np->rx.req_prod_pvt++; ++} ++ ++int xennet_get_extras(struct netfront_info *np, ++ struct netif_extra_info *extras, RING_IDX rp) ++ ++{ ++ struct netif_extra_info *extra; ++ RING_IDX cons = np->rx.rsp_cons; ++ int err = 0; ++ ++ do { ++ struct sk_buff *skb; ++ grant_ref_t ref; ++ ++ if (unlikely(cons + 1 == rp)) { ++ if (net_ratelimit()) ++ WPRINTK("Missing extra info\n"); ++ err = -EBADR; ++ break; ++ } ++ ++ extra = (struct netif_extra_info *) ++ RING_GET_RESPONSE(&np->rx, ++cons); ++ ++ if (unlikely(!extra->type || ++ extra->type >= XEN_NETIF_EXTRA_TYPE_MAX)) { ++ if (net_ratelimit()) ++ WPRINTK("Invalid extra type: %d\n", ++ extra->type); ++ err = -EINVAL; ++ } else { ++ memcpy(&extras[extra->type - 1], extra, ++ sizeof(*extra)); ++ } ++ ++ skb = xennet_get_rx_skb(np, cons); ++ ref = xennet_get_rx_ref(np, cons); ++ xennet_move_rx_slot(np, skb, ref); ++ } while (extra->flags & XEN_NETIF_EXTRA_FLAG_MORE); ++ ++ np->rx.rsp_cons = cons; ++ return err; ++} ++ ++static int xennet_get_responses(struct netfront_info *np, ++ struct netfront_rx_info *rinfo, RING_IDX rp, ++ struct sk_buff_head *list, ++ int *pages_flipped_p) ++{ ++ int pages_flipped = *pages_flipped_p; ++ struct mmu_update *mmu; ++ struct multicall_entry *mcl; ++ struct netif_rx_response *rx = &rinfo->rx; ++ struct netif_extra_info *extras = rinfo->extras; ++ RING_IDX cons = np->rx.rsp_cons; ++ struct sk_buff *skb = xennet_get_rx_skb(np, cons); ++ grant_ref_t ref = xennet_get_rx_ref(np, cons); ++ int max = MAX_SKB_FRAGS + (rx->status <= RX_COPY_THRESHOLD); ++ int frags = 1; ++ int err = 0; ++ 
unsigned long ret;
++
++ if (rx->flags & NETRXF_extra_info) {
++ err = xennet_get_extras(np, extras, rp);
++ cons = np->rx.rsp_cons;
++ }
++
++ for (;;) {
++ unsigned long mfn;
++
++ if (unlikely(rx->status < 0 ||
++ rx->offset + rx->status > PAGE_SIZE)) {
++ if (net_ratelimit())
++ WPRINTK("rx->offset: %x, size: %u\n",
++ rx->offset, rx->status);
++ xennet_move_rx_slot(np, skb, ref);
++ err = -EINVAL;
++ goto next;
++ }
++
++ /*
++ * This definitely indicates a bug, either in this driver or in
++ * the backend driver. In future this should flag the bad
++ * situation to the system controller to reboot the backend.
++ */
++ if (ref == GRANT_INVALID_REF) {
++ if (net_ratelimit())
++ WPRINTK("Bad rx response id %d.\n", rx->id);
++ err = -EINVAL;
++ goto next;
++ }
++
++ if (!np->copying_receiver) {
++ /* Memory pressure, insufficient buffer
++ * headroom, ... */
++ if (!(mfn = gnttab_end_foreign_transfer_ref(ref))) {
++ if (net_ratelimit())
++ WPRINTK("Unfulfilled rx req "
++ "(id=%d, st=%d).\n",
++ rx->id, rx->status);
++ xennet_move_rx_slot(np, skb, ref);
++ err = -ENOMEM;
++ goto next;
++ }
++
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ /* Remap the page. */
++ struct page *page =
++ skb_shinfo(skb)->frags[0].page;
++ unsigned long pfn = page_to_pfn(page);
++ void *vaddr = page_address(page);
++
++ mcl = np->rx_mcl + pages_flipped;
++ mmu = np->rx_mmu + pages_flipped;
++
++ MULTI_update_va_mapping(mcl,
++ (unsigned long)vaddr,
++ pfn_pte_ma(mfn,
++ PAGE_KERNEL),
++ 0);
++ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT)
++ | MMU_MACHPHYS_UPDATE;
++ mmu->val = pfn;
++
++ set_phys_to_machine(pfn, mfn);
++ }
++ pages_flipped++;
++ } else {
++ ret = gnttab_end_foreign_access_ref(ref, 0);
++ BUG_ON(!ret);
++ }
++
++ gnttab_release_grant_reference(&np->gref_rx_head, ref);
++
++ __skb_queue_tail(list, skb);
++
++next:
++ if (!(rx->flags & NETRXF_more_data))
++ break;
++
++ if (cons + frags == rp) {
++ if (net_ratelimit())
++ WPRINTK("Need more frags\n");
++ err = -ENOENT;
++ break;
++ }
++
++ rx = RING_GET_RESPONSE(&np->rx, cons + frags);
++ skb = xennet_get_rx_skb(np, cons + frags);
++ ref = xennet_get_rx_ref(np, cons + frags);
++ frags++;
++ }
++
++ if (unlikely(frags > max)) {
++ if (net_ratelimit())
++ WPRINTK("Too many frags\n");
++ err = -E2BIG;
++ }
++
++ if (unlikely(err))
++ np->rx.rsp_cons = cons + frags;
++
++ *pages_flipped_p = pages_flipped;
++
++ return err;
++}
++
++static RING_IDX xennet_fill_frags(struct netfront_info *np,
++ struct sk_buff *skb,
++ struct sk_buff_head *list)
++{
++ struct skb_shared_info *shinfo = skb_shinfo(skb);
++ int nr_frags = shinfo->nr_frags;
++ RING_IDX cons = np->rx.rsp_cons;
++ skb_frag_t *frag = shinfo->frags + nr_frags;
++ struct sk_buff *nskb;
++
++ while ((nskb = __skb_dequeue(list))) {
++ struct netif_rx_response *rx =
++ RING_GET_RESPONSE(&np->rx, ++cons);
++
++ frag->page = skb_shinfo(nskb)->frags[0].page;
++ frag->page_offset = rx->offset;
++ frag->size = rx->status;
++
++ skb->data_len += rx->status;
++
++ skb_shinfo(nskb)->nr_frags = 0;
++ kfree_skb(nskb);
++
++ frag++;
++ nr_frags++;
++ }
++
++ shinfo->nr_frags = nr_frags;
++ return cons;
++}
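++
++/*
++ * GSO metadata arrives from the backend as a XEN_NETIF_EXTRA_TYPE_GSO
++ * extra-info slot on the rx ring (see netif_poll() below); the helper
++ * that follows copies it into the skb's shared info.
++ */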
++static int xennet_set_skb_gso(struct sk_buff *skb,
++ struct netif_extra_info *gso)
++{
++ if (!gso->u.gso.size) {
++ if (net_ratelimit())
++ WPRINTK("GSO size must not be zero.\n");
++ return -EINVAL;
++ }
++
++ /* Currently only TCPv4 S.O. is supported. */
++ if (gso->u.gso.type != XEN_NETIF_GSO_TYPE_TCPV4) {
++ if (net_ratelimit())
++ WPRINTK("Bad GSO type %d.\n", gso->u.gso.type);
++ return -EINVAL;
++ }
++
++#ifdef HAVE_TSO
++ skb_shinfo(skb)->gso_size = gso->u.gso.size;
++#ifdef HAVE_GSO
++ skb_shinfo(skb)->gso_type = SKB_GSO_TCPV4;
++
++ /* Header must be checked, and gso_segs computed. */
++ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
++#endif
++ skb_shinfo(skb)->gso_segs = 0;
++
++ return 0;
++#else
++ if (net_ratelimit())
++ WPRINTK("GSO unsupported by this kernel.\n");
++ return -EINVAL;
++#endif
++}
++
++static int netif_poll(struct net_device *dev, int *pbudget)
++{
++ struct netfront_info *np = netdev_priv(dev);
++ struct sk_buff *skb;
++ struct netfront_rx_info rinfo;
++ struct netif_rx_response *rx = &rinfo.rx;
++ struct netif_extra_info *extras = rinfo.extras;
++ RING_IDX i, rp;
++ struct multicall_entry *mcl;
++ int work_done, budget, more_to_do = 1;
++ struct sk_buff_head rxq;
++ struct sk_buff_head errq;
++ struct sk_buff_head tmpq;
++ unsigned long flags;
++ unsigned int len;
++ int pages_flipped = 0;
++ int err;
++
++ spin_lock(&np->rx_lock); /* no need for spin_lock_bh() in ->poll() */
++
++ if (unlikely(!netfront_carrier_ok(np))) {
++ spin_unlock(&np->rx_lock);
++ return 0;
++ }
++
++ skb_queue_head_init(&rxq);
++ skb_queue_head_init(&errq);
++ skb_queue_head_init(&tmpq);
++
++ if ((budget = *pbudget) > dev->quota)
++ budget = dev->quota;
++ rp = np->rx.sring->rsp_prod;
++ rmb(); /* Ensure we see queued responses up to 'rp'. */
++
++ i = np->rx.rsp_cons;
++ work_done = 0;
++ while ((i != rp) && (work_done < budget)) {
++ memcpy(rx, RING_GET_RESPONSE(&np->rx, i), sizeof(*rx));
++ memset(extras, 0, sizeof(rinfo.extras));
++
++ err = xennet_get_responses(np, &rinfo, rp, &tmpq,
++ &pages_flipped);
++
++ if (unlikely(err)) {
++err:
++ while ((skb = __skb_dequeue(&tmpq)))
++ __skb_queue_tail(&errq, skb);
++ np->stats.rx_errors++;
++ i = np->rx.rsp_cons;
++ continue;
++ }
++
++ skb = __skb_dequeue(&tmpq);
++
++ if (extras[XEN_NETIF_EXTRA_TYPE_GSO - 1].type) {
++ struct netif_extra_info *gso;
++ gso = &extras[XEN_NETIF_EXTRA_TYPE_GSO - 1];
++
++ if (unlikely(xennet_set_skb_gso(skb, gso))) {
++ __skb_queue_head(&tmpq, skb);
++ np->rx.rsp_cons += skb_queue_len(&tmpq);
++ goto err;
++ }
++ }
++
++ NETFRONT_SKB_CB(skb)->page = skb_shinfo(skb)->frags[0].page;
++ NETFRONT_SKB_CB(skb)->offset = rx->offset;
++
++ len = rx->status;
++ if (len > RX_COPY_THRESHOLD)
++ len = RX_COPY_THRESHOLD;
++ skb_put(skb, len);
++
++ if (rx->status > len) {
++ skb_shinfo(skb)->frags[0].page_offset =
++ rx->offset + len;
++ skb_shinfo(skb)->frags[0].size = rx->status - len;
++ skb->data_len = rx->status - len;
++ } else {
++ skb_shinfo(skb)->frags[0].page = NULL;
++ skb_shinfo(skb)->nr_frags = 0;
++ }
++
++ i = xennet_fill_frags(np, skb, &tmpq);
++
++ /*
++ * Truesize must approximate the size of true data plus
++ * any supervisor overheads. Adding hypervisor overheads
++ * has been shown to significantly reduce achievable
++ * bandwidth with the default receive buffer size. It is
++ * therefore not wise to account for it here.
++ *
++ * After alloc_skb(RX_COPY_THRESHOLD), truesize is set to
++ * RX_COPY_THRESHOLD + the supervisor overheads. Here, we
++ * add the size of the data pulled in xennet_fill_frags().
++ *
++ * We also adjust for any unused space in the main data
++ * area by subtracting (RX_COPY_THRESHOLD - len). This is
++ * especially important with drivers which split incoming
++ * packets into header and data, using only 66 bytes of
++ * the main data area (see the e1000 driver for example.)
++ * On such systems, without this last adjustment, our
++ * achievable receive throughput using the standard receive
++ * buffer size was cut by 25%(!!!).
++ */
++ skb->truesize += skb->data_len - (RX_COPY_THRESHOLD - len);
++ skb->len += skb->data_len;
++
++ /*
++ * Old backends do not assert data_validated but we
++ * can infer it from csum_blank so test both flags.
++ */
++ if (rx->flags & (NETRXF_data_validated|NETRXF_csum_blank))
++ skb->ip_summed = CHECKSUM_UNNECESSARY;
++ else
++ skb->ip_summed = CHECKSUM_NONE;
++#ifdef CONFIG_XEN
++ skb->proto_data_valid = (skb->ip_summed != CHECKSUM_NONE);
++ skb->proto_csum_blank = !!(rx->flags & NETRXF_csum_blank);
++#endif
++ np->stats.rx_packets++;
++ np->stats.rx_bytes += skb->len;
++
++ __skb_queue_tail(&rxq, skb);
++
++ np->rx.rsp_cons = ++i;
++ work_done++;
++ }
++
++ if (pages_flipped) {
++ /* Some pages are no longer absent... */
++ balloon_update_driver_allowance(-pages_flipped);
++
++ /* Do all the remapping work and M2P updates. */
++ if (!xen_feature(XENFEAT_auto_translated_physmap)) {
++ mcl = np->rx_mcl + pages_flipped;
++ mcl->op = __HYPERVISOR_mmu_update;
++ mcl->args[0] = (unsigned long)np->rx_mmu;
++ mcl->args[1] = pages_flipped;
++ mcl->args[2] = 0;
++ mcl->args[3] = DOMID_SELF;
++ (void)HYPERVISOR_multicall(np->rx_mcl,
++ pages_flipped + 1);
++ }
++ }
++
++ while ((skb = __skb_dequeue(&errq)))
++ kfree_skb(skb);
++
++ while ((skb = __skb_dequeue(&rxq)) != NULL) {
++ struct page *page = NETFRONT_SKB_CB(skb)->page;
++ void *vaddr = page_address(page);
++ unsigned offset = NETFRONT_SKB_CB(skb)->offset;
++
++ memcpy(skb->data, vaddr + offset, skb_headlen(skb));
++
++ if (page != skb_shinfo(skb)->frags[0].page)
++ __free_page(page);
++
++ /* Ethernet work: Delayed to here as it peeks the header. */
++ skb->protocol = eth_type_trans(skb, dev);
++
++ /* Pass it up. */
++ netif_receive_skb(skb);
++ dev->last_rx = jiffies;
++ }
++
++ /* If we get a callback with very few responses, reduce fill target. */
++ /* NB. Note exponential increase, linear decrease. 
*/ ++ if (((np->rx.req_prod_pvt - np->rx.sring->rsp_prod) > ++ ((3*np->rx_target) / 4)) && ++ (--np->rx_target < np->rx_min_target)) ++ np->rx_target = np->rx_min_target; ++ ++ network_alloc_rx_buffers(dev); ++ ++ *pbudget -= work_done; ++ dev->quota -= work_done; ++ ++ if (work_done < budget) { ++ local_irq_save(flags); ++ ++ RING_FINAL_CHECK_FOR_RESPONSES(&np->rx, more_to_do); ++ if (!more_to_do) ++ __netif_rx_complete(dev); ++ ++ local_irq_restore(flags); ++ } ++ ++ spin_unlock(&np->rx_lock); ++ ++ return more_to_do; ++} ++ ++static void netif_release_tx_bufs(struct netfront_info *np) ++{ ++ struct sk_buff *skb; ++ int i; ++ ++ for (i = 1; i <= NET_TX_RING_SIZE; i++) { ++ if ((unsigned long)np->tx_skbs[i] < PAGE_OFFSET) ++ continue; ++ ++ skb = np->tx_skbs[i]; ++ gnttab_end_foreign_access_ref( ++ np->grant_tx_ref[i], GNTMAP_readonly); ++ gnttab_release_grant_reference( ++ &np->gref_tx_head, np->grant_tx_ref[i]); ++ np->grant_tx_ref[i] = GRANT_INVALID_REF; ++ add_id_to_freelist(np->tx_skbs, i); ++ dev_kfree_skb_irq(skb); ++ } ++} ++ ++static void netif_release_rx_bufs(struct netfront_info *np) ++{ ++ struct mmu_update *mmu = np->rx_mmu; ++ struct multicall_entry *mcl = np->rx_mcl; ++ struct sk_buff_head free_list; ++ struct sk_buff *skb; ++ unsigned long mfn; ++ int xfer = 0, noxfer = 0, unused = 0; ++ int id, ref, rc; ++ ++ if (np->copying_receiver) { ++ WPRINTK("%s: fix me for copying receiver.\n", __FUNCTION__); ++ return; ++ } ++ ++ skb_queue_head_init(&free_list); ++ ++ spin_lock_bh(&np->rx_lock); ++ ++ for (id = 0; id < NET_RX_RING_SIZE; id++) { ++ if ((ref = np->grant_rx_ref[id]) == GRANT_INVALID_REF) { ++ unused++; ++ continue; ++ } ++ ++ skb = np->rx_skbs[id]; ++ mfn = gnttab_end_foreign_transfer_ref(ref); ++ gnttab_release_grant_reference(&np->gref_rx_head, ref); ++ np->grant_rx_ref[id] = GRANT_INVALID_REF; ++ add_id_to_freelist(np->rx_skbs, id); ++ ++ if (0 == mfn) { ++ struct page *page = skb_shinfo(skb)->frags[0].page; ++ balloon_release_driver_page(page); ++ skb_shinfo(skb)->nr_frags = 0; ++ dev_kfree_skb(skb); ++ noxfer++; ++ continue; ++ } ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ /* Remap the page. */ ++ struct page *page = skb_shinfo(skb)->frags[0].page; ++ unsigned long pfn = page_to_pfn(page); ++ void *vaddr = page_address(page); ++ ++ MULTI_update_va_mapping(mcl, (unsigned long)vaddr, ++ pfn_pte_ma(mfn, PAGE_KERNEL), ++ 0); ++ mcl++; ++ mmu->ptr = ((maddr_t)mfn << PAGE_SHIFT) ++ | MMU_MACHPHYS_UPDATE; ++ mmu->val = pfn; ++ mmu++; ++ ++ set_phys_to_machine(pfn, mfn); ++ } ++ __skb_queue_tail(&free_list, skb); ++ xfer++; ++ } ++ ++ IPRINTK("%s: %d xfer, %d noxfer, %d unused\n", ++ __FUNCTION__, xfer, noxfer, unused); ++ ++ if (xfer) { ++ /* Some pages are no longer absent... */ ++ balloon_update_driver_allowance(-xfer); ++ ++ if (!xen_feature(XENFEAT_auto_translated_physmap)) { ++ /* Do all the remapping work and M2P updates. 
*/ ++ mcl->op = __HYPERVISOR_mmu_update; ++ mcl->args[0] = (unsigned long)np->rx_mmu; ++ mcl->args[1] = mmu - np->rx_mmu; ++ mcl->args[2] = 0; ++ mcl->args[3] = DOMID_SELF; ++ mcl++; ++ rc = HYPERVISOR_multicall_check( ++ np->rx_mcl, mcl - np->rx_mcl, NULL); ++ BUG_ON(rc); ++ } ++ } ++ ++ while ((skb = __skb_dequeue(&free_list)) != NULL) ++ dev_kfree_skb(skb); ++ ++ spin_unlock_bh(&np->rx_lock); ++} ++ ++static int network_close(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ netif_stop_queue(np->netdev); ++ return 0; ++} ++ ++ ++static struct net_device_stats *network_get_stats(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ return &np->stats; ++} ++ ++static int xennet_change_mtu(struct net_device *dev, int mtu) ++{ ++ int max = xennet_can_sg(dev) ? 65535 - ETH_HLEN : ETH_DATA_LEN; ++ ++ if (mtu > max) ++ return -EINVAL; ++ dev->mtu = mtu; ++ return 0; ++} ++ ++static int xennet_set_sg(struct net_device *dev, u32 data) ++{ ++ if (data) { ++ struct netfront_info *np = netdev_priv(dev); ++ int val; ++ ++ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, "feature-sg", ++ "%d", &val) < 0) ++ val = 0; ++ if (!val) ++ return -ENOSYS; ++ } else if (dev->mtu > ETH_DATA_LEN) ++ dev->mtu = ETH_DATA_LEN; ++ ++ return ethtool_op_set_sg(dev, data); ++} ++ ++static int xennet_set_tso(struct net_device *dev, u32 data) ++{ ++#ifdef HAVE_TSO ++ if (data) { ++ struct netfront_info *np = netdev_priv(dev); ++ int val; ++ ++ if (xenbus_scanf(XBT_NIL, np->xbdev->otherend, ++ "feature-gso-tcpv4", "%d", &val) < 0) ++ val = 0; ++ if (!val) ++ return -ENOSYS; ++ } ++ ++ return ethtool_op_set_tso(dev, data); ++#else ++ return -ENOSYS; ++#endif ++} ++ ++static void xennet_set_features(struct net_device *dev) ++{ ++ dev_disable_gso_features(dev); ++ xennet_set_sg(dev, 0); ++ ++ /* We need checksum offload to enable scatter/gather and TSO. */ ++ if (!(dev->features & NETIF_F_IP_CSUM)) ++ return; ++ ++ if (xennet_set_sg(dev, 1)) ++ return; ++ ++ /* Before 2.6.9 TSO seems to be unreliable so do not enable it ++ * on older kernels. ++ */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,9) ++ xennet_set_tso(dev, 1); ++#endif ++ ++} ++ ++static int network_connect(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ int i, requeue_idx, err; ++ struct sk_buff *skb; ++ grant_ref_t ref; ++ netif_rx_request_t *req; ++ unsigned int feature_rx_copy, feature_rx_flip; ++ ++ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, ++ "feature-rx-copy", "%u", &feature_rx_copy); ++ if (err != 1) ++ feature_rx_copy = 0; ++ err = xenbus_scanf(XBT_NIL, np->xbdev->otherend, ++ "feature-rx-flip", "%u", &feature_rx_flip); ++ if (err != 1) ++ feature_rx_flip = 1; ++ ++ /* ++ * Copy packets on receive path if: ++ * (a) This was requested by user, and the backend supports it; or ++ * (b) Flipping was requested, but this is unsupported by the backend. ++ */ ++ np->copying_receiver = ((MODPARM_rx_copy && feature_rx_copy) || ++ (MODPARM_rx_flip && !feature_rx_flip)); ++ ++ err = talk_to_backend(np->xbdev, np); ++ if (err) ++ return err; ++ ++ xennet_set_features(dev); ++ ++ IPRINTK("device %s has %sing receive path.\n", ++ dev->name, np->copying_receiver ? "copy" : "flipp"); ++ ++ spin_lock_bh(&np->rx_lock); ++ spin_lock_irq(&np->tx_lock); ++ ++ /* ++ * Recovery procedure: ++ * NB. 
Freelist index entries are always going to be less than ++ * PAGE_OFFSET, whereas pointers to skbs will always be equal or ++ * greater than PAGE_OFFSET: we use this property to distinguish ++ * them. ++ */ ++ ++ /* Step 1: Discard all pending TX packet fragments. */ ++ netif_release_tx_bufs(np); ++ ++ /* Step 2: Rebuild the RX buffer freelist and the RX ring itself. */ ++ for (requeue_idx = 0, i = 0; i < NET_RX_RING_SIZE; i++) { ++ if (!np->rx_skbs[i]) ++ continue; ++ ++ skb = np->rx_skbs[requeue_idx] = xennet_get_rx_skb(np, i); ++ ref = np->grant_rx_ref[requeue_idx] = xennet_get_rx_ref(np, i); ++ req = RING_GET_REQUEST(&np->rx, requeue_idx); ++ ++ if (!np->copying_receiver) { ++ gnttab_grant_foreign_transfer_ref( ++ ref, np->xbdev->otherend_id, ++ page_to_pfn(skb_shinfo(skb)->frags->page)); ++ } else { ++ gnttab_grant_foreign_access_ref( ++ ref, np->xbdev->otherend_id, ++ pfn_to_mfn(page_to_pfn(skb_shinfo(skb)-> ++ frags->page)), ++ 0); ++ } ++ req->gref = ref; ++ req->id = requeue_idx; ++ ++ requeue_idx++; ++ } ++ ++ np->rx.req_prod_pvt = requeue_idx; ++ ++ /* ++ * Step 3: All public and private state should now be sane. Get ++ * ready to start sending and receiving packets and give the driver ++ * domain a kick because we've probably just requeued some ++ * packets. ++ */ ++ netfront_carrier_on(np); ++ notify_remote_via_irq(np->irq); ++ network_tx_buf_gc(dev); ++ network_alloc_rx_buffers(dev); ++ ++ spin_unlock_irq(&np->tx_lock); ++ spin_unlock_bh(&np->rx_lock); ++ ++ return 0; ++} ++ ++static void netif_uninit(struct net_device *dev) ++{ ++ struct netfront_info *np = netdev_priv(dev); ++ netif_release_tx_bufs(np); ++ netif_release_rx_bufs(np); ++ gnttab_free_grant_references(np->gref_tx_head); ++ gnttab_free_grant_references(np->gref_rx_head); ++} ++ ++static struct ethtool_ops network_ethtool_ops = ++{ ++ .get_tx_csum = ethtool_op_get_tx_csum, ++ .set_tx_csum = ethtool_op_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = xennet_set_sg, ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = xennet_set_tso, ++ .get_link = ethtool_op_get_link, ++}; ++ ++#ifdef CONFIG_SYSFS ++static ssize_t show_rxbuf_min(struct class_device *cd, char *buf) ++{ ++ struct net_device *netdev = container_of(cd, struct net_device, ++ class_dev); ++ struct netfront_info *info = netdev_priv(netdev); ++ ++ return sprintf(buf, "%u\n", info->rx_min_target); ++} ++ ++static ssize_t store_rxbuf_min(struct class_device *cd, ++ const char *buf, size_t len) ++{ ++ struct net_device *netdev = container_of(cd, struct net_device, ++ class_dev); ++ struct netfront_info *np = netdev_priv(netdev); ++ char *endp; ++ unsigned long target; ++ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ target = simple_strtoul(buf, &endp, 0); ++ if (endp == buf) ++ return -EBADMSG; ++ ++ if (target < RX_MIN_TARGET) ++ target = RX_MIN_TARGET; ++ if (target > RX_MAX_TARGET) ++ target = RX_MAX_TARGET; ++ ++ spin_lock_bh(&np->rx_lock); ++ if (target > np->rx_max_target) ++ np->rx_max_target = target; ++ np->rx_min_target = target; ++ if (target > np->rx_target) ++ np->rx_target = target; ++ ++ network_alloc_rx_buffers(netdev); ++ ++ spin_unlock_bh(&np->rx_lock); ++ return len; ++} ++ ++static ssize_t show_rxbuf_max(struct class_device *cd, char *buf) ++{ ++ struct net_device *netdev = container_of(cd, struct net_device, ++ class_dev); ++ struct netfront_info *info = netdev_priv(netdev); ++ ++ return sprintf(buf, "%u\n", info->rx_max_target); ++} ++ ++static ssize_t store_rxbuf_max(struct class_device *cd, ++ const char *buf, size_t 
len) ++{ ++ struct net_device *netdev = container_of(cd, struct net_device, ++ class_dev); ++ struct netfront_info *np = netdev_priv(netdev); ++ char *endp; ++ unsigned long target; ++ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ target = simple_strtoul(buf, &endp, 0); ++ if (endp == buf) ++ return -EBADMSG; ++ ++ if (target < RX_MIN_TARGET) ++ target = RX_MIN_TARGET; ++ if (target > RX_MAX_TARGET) ++ target = RX_MAX_TARGET; ++ ++ spin_lock_bh(&np->rx_lock); ++ if (target < np->rx_min_target) ++ np->rx_min_target = target; ++ np->rx_max_target = target; ++ if (target < np->rx_target) ++ np->rx_target = target; ++ ++ network_alloc_rx_buffers(netdev); ++ ++ spin_unlock_bh(&np->rx_lock); ++ return len; ++} ++ ++static ssize_t show_rxbuf_cur(struct class_device *cd, char *buf) ++{ ++ struct net_device *netdev = container_of(cd, struct net_device, ++ class_dev); ++ struct netfront_info *info = netdev_priv(netdev); ++ ++ return sprintf(buf, "%u\n", info->rx_target); ++} ++ ++static const struct class_device_attribute xennet_attrs[] = { ++ __ATTR(rxbuf_min, S_IRUGO|S_IWUSR, show_rxbuf_min, store_rxbuf_min), ++ __ATTR(rxbuf_max, S_IRUGO|S_IWUSR, show_rxbuf_max, store_rxbuf_max), ++ __ATTR(rxbuf_cur, S_IRUGO, show_rxbuf_cur, NULL), ++}; ++ ++static int xennet_sysfs_addif(struct net_device *netdev) ++{ ++ int i; ++ int error = 0; ++ ++ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) { ++ error = class_device_create_file(&netdev->class_dev, ++ &xennet_attrs[i]); ++ if (error) ++ goto fail; ++ } ++ return 0; ++ ++ fail: ++ while (--i >= 0) ++ class_device_remove_file(&netdev->class_dev, ++ &xennet_attrs[i]); ++ return error; ++} ++ ++static void xennet_sysfs_delif(struct net_device *netdev) ++{ ++ int i; ++ ++ for (i = 0; i < ARRAY_SIZE(xennet_attrs); i++) { ++ class_device_remove_file(&netdev->class_dev, ++ &xennet_attrs[i]); ++ } ++} ++ ++#endif /* CONFIG_SYSFS */ ++ ++ ++/* ++ * Nothing to do here. Virtual interface is point-to-point and the ++ * physical interface is probably promiscuous anyway. ++ */ ++static void network_set_multicast_list(struct net_device *dev) ++{ ++} ++ ++static struct net_device * __devinit create_netdev(struct xenbus_device *dev) ++{ ++ int i, err = 0; ++ struct net_device *netdev = NULL; ++ struct netfront_info *np = NULL; ++ ++ netdev = alloc_etherdev(sizeof(struct netfront_info)); ++ if (!netdev) { ++ printk(KERN_WARNING "%s> alloc_etherdev failed.\n", ++ __FUNCTION__); ++ return ERR_PTR(-ENOMEM); ++ } ++ ++ np = netdev_priv(netdev); ++ np->xbdev = dev; ++ ++ spin_lock_init(&np->tx_lock); ++ spin_lock_init(&np->rx_lock); ++ ++ skb_queue_head_init(&np->rx_batch); ++ np->rx_target = RX_DFL_MIN_TARGET; ++ np->rx_min_target = RX_DFL_MIN_TARGET; ++ np->rx_max_target = RX_MAX_TARGET; ++ ++ init_timer(&np->rx_refill_timer); ++ np->rx_refill_timer.data = (unsigned long)netdev; ++ np->rx_refill_timer.function = rx_refill_timeout; ++ ++ /* Initialise {tx,rx}_skbs as a free chain containing every entry. 
*/ ++ for (i = 0; i <= NET_TX_RING_SIZE; i++) { ++ np->tx_skbs[i] = (void *)((unsigned long) i+1); ++ np->grant_tx_ref[i] = GRANT_INVALID_REF; ++ } ++ ++ for (i = 0; i < NET_RX_RING_SIZE; i++) { ++ np->rx_skbs[i] = NULL; ++ np->grant_rx_ref[i] = GRANT_INVALID_REF; ++ } ++ ++ /* A grant for every tx ring slot */ ++ if (gnttab_alloc_grant_references(TX_MAX_TARGET, ++ &np->gref_tx_head) < 0) { ++ printk(KERN_ALERT "#### netfront can't alloc tx grant refs\n"); ++ err = -ENOMEM; ++ goto exit; ++ } ++ /* A grant for every rx ring slot */ ++ if (gnttab_alloc_grant_references(RX_MAX_TARGET, ++ &np->gref_rx_head) < 0) { ++ printk(KERN_ALERT "#### netfront can't alloc rx grant refs\n"); ++ err = -ENOMEM; ++ goto exit_free_tx; ++ } ++ ++ netdev->open = network_open; ++ netdev->hard_start_xmit = network_start_xmit; ++ netdev->stop = network_close; ++ netdev->get_stats = network_get_stats; ++ netdev->poll = netif_poll; ++ netdev->set_multicast_list = network_set_multicast_list; ++ netdev->uninit = netif_uninit; ++ netdev->change_mtu = xennet_change_mtu; ++ netdev->weight = 64; ++ netdev->features = NETIF_F_IP_CSUM; ++ ++ SET_ETHTOOL_OPS(netdev, &network_ethtool_ops); ++ SET_MODULE_OWNER(netdev); ++ SET_NETDEV_DEV(netdev, &dev->dev); ++ ++ np->netdev = netdev; ++ ++ netfront_carrier_off(np); ++ ++ return netdev; ++ ++ exit_free_tx: ++ gnttab_free_grant_references(np->gref_tx_head); ++ exit: ++ free_netdev(netdev); ++ return ERR_PTR(err); ++} ++ ++/* ++ * We use this notifier to send out a fake ARP reply to reset switches and ++ * router ARP caches when an IP interface is brought up on a VIF. ++ */ ++static int ++inetdev_notify(struct notifier_block *this, unsigned long event, void *ptr) ++{ ++ struct in_ifaddr *ifa = (struct in_ifaddr *)ptr; ++ struct net_device *dev = ifa->ifa_dev->dev; ++ ++ /* UP event and is it one of our devices? */ ++ if (event == NETDEV_UP && dev->open == network_open) ++ (void)send_fake_arp(dev); ++ ++ return NOTIFY_DONE; ++} ++ ++ ++static void netif_disconnect_backend(struct netfront_info *info) ++{ ++ /* Stop old i/f to prevent errors whilst we rebuild the state. 
*/ ++ spin_lock_bh(&info->rx_lock); ++ spin_lock_irq(&info->tx_lock); ++ netfront_carrier_off(info); ++ spin_unlock_irq(&info->tx_lock); ++ spin_unlock_bh(&info->rx_lock); ++ ++ if (info->irq) ++ unbind_from_irqhandler(info->irq, info->netdev); ++ info->irq = 0; ++ ++ end_access(info->tx_ring_ref, info->tx.sring); ++ end_access(info->rx_ring_ref, info->rx.sring); ++ info->tx_ring_ref = GRANT_INVALID_REF; ++ info->rx_ring_ref = GRANT_INVALID_REF; ++ info->tx.sring = NULL; ++ info->rx.sring = NULL; ++} ++ ++ ++static void end_access(int ref, void *page) ++{ ++ if (ref != GRANT_INVALID_REF) ++ gnttab_end_foreign_access(ref, 0, (unsigned long)page); ++} ++ ++ ++/* ** Driver registration ** */ ++ ++ ++static struct xenbus_device_id netfront_ids[] = { ++ { "vif" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver netfront = { ++ .name = "vif", ++ .owner = THIS_MODULE, ++ .ids = netfront_ids, ++ .probe = netfront_probe, ++ .remove = __devexit_p(netfront_remove), ++ .resume = netfront_resume, ++ .otherend_changed = backend_changed, ++}; ++ ++ ++static struct notifier_block notifier_inetdev = { ++ .notifier_call = inetdev_notify, ++ .next = NULL, ++ .priority = 0 ++}; ++ ++static int __init netif_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++#ifdef CONFIG_XEN ++ if (MODPARM_rx_flip && MODPARM_rx_copy) { ++ WPRINTK("Cannot specify both rx_copy and rx_flip.\n"); ++ return -EINVAL; ++ } ++ ++ if (!MODPARM_rx_flip && !MODPARM_rx_copy) ++ MODPARM_rx_flip = 1; /* Default is to flip. */ ++#endif ++ ++ if (is_initial_xendomain()) ++ return 0; ++ ++ IPRINTK("Initialising virtual ethernet driver.\n"); ++ ++ (void)register_inetaddr_notifier(¬ifier_inetdev); ++ ++ return xenbus_register_frontend(&netfront); ++} ++module_init(netif_init); ++ ++ ++static void __exit netif_exit(void) ++{ ++ if (is_initial_xendomain()) ++ return; ++ ++ unregister_inetaddr_notifier(¬ifier_inetdev); ++ ++ return xenbus_unregister_driver(&netfront); ++} ++module_exit(netif_exit); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,15 @@ ++obj-$(CONFIG_XEN_PCIDEV_BACKEND) += pciback.o ++ ++pciback-y := pci_stub.o pciback_ops.o xenbus.o ++pciback-y += conf_space.o conf_space_header.o \ ++ conf_space_capability.o \ ++ conf_space_capability_vpd.o \ ++ conf_space_capability_pm.o \ ++ conf_space_quirks.o ++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_VPCI) += vpci.o ++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_SLOT) += slot.o ++pciback-$(CONFIG_XEN_PCIDEV_BACKEND_PASS) += passthrough.o ++ ++ifeq ($(CONFIG_XEN_PCIDEV_BE_DEBUG),y) ++EXTRA_CFLAGS += -DDEBUG ++endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,426 @@ ++/* ++ * PCI Backend - Functions for creating a virtual configuration space for ++ * exported PCI Devices. ++ * It's dangerous to allow PCI Driver Domains to change their ++ * device's resources (memory, i/o ports, interrupts). We need to ++ * restrict changes to certain PCI Configuration registers: ++ * BARs, INTERRUPT_PIN, most registers in the header... 
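++ * Reads and writes that overlap an intercepted field are routed through
++ * the config_field handlers built up in this file; everything else falls
++ * through to the real hardware registers (always for reads, but for
++ * writes only when the device is marked permissive).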
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_quirks.h" ++ ++#define DEFINE_PCI_CONFIG(op,size,type) \ ++int pciback_##op##_config_##size \ ++(struct pci_dev *dev, int offset, type value, void *data) \ ++{ \ ++ return pci_##op##_config_##size (dev, offset, value); \ ++} ++ ++DEFINE_PCI_CONFIG(read, byte, u8 *) ++DEFINE_PCI_CONFIG(read, word, u16 *) ++DEFINE_PCI_CONFIG(read, dword, u32 *) ++ ++DEFINE_PCI_CONFIG(write, byte, u8) ++DEFINE_PCI_CONFIG(write, word, u16) ++DEFINE_PCI_CONFIG(write, dword, u32) ++ ++static int conf_space_read(struct pci_dev *dev, ++ struct config_field_entry *entry, int offset, ++ u32 * value) ++{ ++ int ret = 0; ++ struct config_field *field = entry->field; ++ ++ *value = 0; ++ ++ switch (field->size) { ++ case 1: ++ if (field->u.b.read) ++ ret = field->u.b.read(dev, offset, (u8 *) value, ++ entry->data); ++ break; ++ case 2: ++ if (field->u.w.read) ++ ret = field->u.w.read(dev, offset, (u16 *) value, ++ entry->data); ++ break; ++ case 4: ++ if (field->u.dw.read) ++ ret = field->u.dw.read(dev, offset, value, entry->data); ++ break; ++ } ++ return ret; ++} ++ ++static int conf_space_write(struct pci_dev *dev, ++ struct config_field_entry *entry, int offset, ++ u32 value) ++{ ++ int ret = 0; ++ struct config_field *field = entry->field; ++ ++ switch (field->size) { ++ case 1: ++ if (field->u.b.write) ++ ret = field->u.b.write(dev, offset, (u8) value, ++ entry->data); ++ break; ++ case 2: ++ if (field->u.w.write) ++ ret = field->u.w.write(dev, offset, (u16) value, ++ entry->data); ++ break; ++ case 4: ++ if (field->u.dw.write) ++ ret = field->u.dw.write(dev, offset, value, ++ entry->data); ++ break; ++ } ++ return ret; ++} ++ ++static inline u32 get_mask(int size) ++{ ++ if (size == 1) ++ return 0xff; ++ else if (size == 2) ++ return 0xffff; ++ else ++ return 0xffffffff; ++} ++ ++static inline int valid_request(int offset, int size) ++{ ++ /* Validate request (no un-aligned requests) */ ++ if ((size == 1 || size == 2 || size == 4) && (offset % size) == 0) ++ return 1; ++ return 0; ++} ++ ++static inline u32 merge_value(u32 val, u32 new_val, u32 new_val_mask, ++ int offset) ++{ ++ if (offset >= 0) { ++ new_val_mask <<= (offset * 8); ++ new_val <<= (offset * 8); ++ } else { ++ new_val_mask >>= (offset * -8); ++ new_val >>= (offset * -8); ++ } ++ val = (val & ~new_val_mask) | (new_val & new_val_mask); ++ ++ return val; ++} ++ ++static int pcibios_err_to_errno(int err) ++{ ++ switch (err) { ++ case PCIBIOS_SUCCESSFUL: ++ return XEN_PCI_ERR_success; ++ case PCIBIOS_DEVICE_NOT_FOUND: ++ return XEN_PCI_ERR_dev_not_found; ++ case PCIBIOS_BAD_REGISTER_NUMBER: ++ return XEN_PCI_ERR_invalid_offset; ++ case PCIBIOS_FUNC_NOT_SUPPORTED: ++ return XEN_PCI_ERR_not_implemented; ++ case PCIBIOS_SET_FAILED: ++ return XEN_PCI_ERR_access_denied; ++ } ++ return err; ++} ++ ++int pciback_config_read(struct pci_dev *dev, int offset, int size, ++ u32 * ret_val) ++{ ++ int err = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry; ++ struct config_field *field; ++ int req_start, req_end, field_start, field_end; ++ /* if read fails for any reason, return 0 (as if device didn't respond) */ ++ u32 value = 0, tmp_val; ++ ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x\n", ++ pci_name(dev), size, offset); ++ ++ if (!valid_request(offset, size)) { ++ err = 
XEN_PCI_ERR_invalid_offset;
++ goto out;
++ }
++
++ /* Get the real value first, then modify as appropriate */
++ switch (size) {
++ case 1:
++ err = pci_read_config_byte(dev, offset, (u8 *) & value);
++ break;
++ case 2:
++ err = pci_read_config_word(dev, offset, (u16 *) & value);
++ break;
++ case 4:
++ err = pci_read_config_dword(dev, offset, &value);
++ break;
++ }
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ err = conf_space_read(dev, cfg_entry, field_start,
++ &tmp_val);
++ if (err)
++ goto out;
++
++ value = merge_value(value, tmp_val,
++ get_mask(field->size),
++ field_start - req_start);
++ }
++ }
++
++ out:
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG "pciback: %s: read %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ *ret_val = value;
++ return pcibios_err_to_errno(err);
++}
++
++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value)
++{
++ int err = 0, handled = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ struct config_field *field;
++ u32 tmp_val;
++ int req_start, req_end, field_start, field_end;
++
++ if (unlikely(verbose_request))
++ printk(KERN_DEBUG
++ "pciback: %s: write request %d bytes at 0x%x = %x\n",
++ pci_name(dev), size, offset, value);
++
++ if (!valid_request(offset, size))
++ return XEN_PCI_ERR_invalid_offset;
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ req_start = offset;
++ req_end = offset + size;
++ field_start = OFFSET(cfg_entry);
++ field_end = OFFSET(cfg_entry) + field->size;
++
++ if ((req_start >= field_start && req_start < field_end)
++ || (req_end > field_start && req_end <= field_end)) {
++ tmp_val = 0;
++
++ err = pciback_config_read(dev, field_start,
++ field->size, &tmp_val);
++ if (err)
++ break;
++
++ tmp_val = merge_value(tmp_val, value, get_mask(size),
++ req_start - field_start);
++
++ err = conf_space_write(dev, cfg_entry, field_start,
++ tmp_val);
++
++ /* handled is set true here, but not every byte
++ * may have been written! Properly detecting if
++ * every byte is handled is unnecessary as the
++ * flag is used to detect devices that need
++ * special helpers to work correctly.
++ */
++ handled = 1;
++ }
++ }
++
++ if (!handled && !err) {
++ /* By default, anything not specifically handled above is
++ * read-only. The permissive flag changes this behavior so
++ * that anything not specifically handled above is writable.
++ * This means that some fields may still be read-only because
++ * they have entries in the config_field list that intercept
++ * the write and do nothing. */
++ if (dev_data->permissive) {
++ switch (size) {
++ case 1:
++ err = pci_write_config_byte(dev, offset,
++ (u8) value);
++ break;
++ case 2:
++ err = pci_write_config_word(dev, offset,
++ (u16) value);
++ break;
++ case 4:
++ err = pci_write_config_dword(dev, offset,
++ (u32) value);
++ break;
++ }
++ } else if (!dev_data->warned_on_write) {
++ dev_data->warned_on_write = 1;
++ dev_warn(&dev->dev, "Driver tried to write to a "
++ "read-only configuration space field at offset "
++ "0x%x, size %d. This may be harmless, but if "
++ "you have problems with your device:\n"
++ "1) see permissive attribute in sysfs\n"
++ "2) report problems to the xen-devel "
++ "mailing list along with details of your "
++ "device obtained from lspci.\n", offset, size);
++ }
++ }
++
++ return pcibios_err_to_errno(err);
++}
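++
++/*
++ * Worked example of the merge above: a 2-byte write of 0xbeef at offset 2
++ * overlapping a dword field at offset 0 reaches merge_value() with a byte
++ * offset of 2, so the 0xffff mask is shifted up to 0xffff0000 and only the
++ * top half of the field's current value is replaced.
++ */
++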
++void pciback_config_free_dyn_fields(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ struct config_field *field;
++
++ dev_dbg(&dev->dev,
++ "free-ing dynamically allocated virtual configuration space fields\n");
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->clean) {
++ field->clean(field);
++
++ if (cfg_entry->data)
++ kfree(cfg_entry->data);
++
++ list_del(&cfg_entry->list);
++ kfree(cfg_entry);
++ }
++
++ }
++}
++
++void pciback_config_reset_dev(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ struct config_field *field;
++
++ dev_dbg(&dev->dev, "resetting virtual configuration space\n");
++
++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) {
++ field = cfg_entry->field;
++
++ if (field->reset)
++ field->reset(dev, OFFSET(cfg_entry), cfg_entry->data);
++ }
++}
++
++void pciback_config_free_dev(struct pci_dev *dev)
++{
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry, *t;
++ struct config_field *field;
++
++ dev_dbg(&dev->dev, "free-ing virtual configuration space fields\n");
++
++ list_for_each_entry_safe(cfg_entry, t, &dev_data->config_fields, list) {
++ list_del(&cfg_entry->list);
++
++ field = cfg_entry->field;
++
++ if (field->release)
++ field->release(dev, OFFSET(cfg_entry), cfg_entry->data);
++
++ kfree(cfg_entry);
++ }
++}
++
++int pciback_config_add_field_offset(struct pci_dev *dev,
++ struct config_field *field,
++ unsigned int base_offset)
++{
++ int err = 0;
++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev);
++ struct config_field_entry *cfg_entry;
++ void *tmp;
++
++ cfg_entry = kmalloc(sizeof(*cfg_entry), GFP_KERNEL);
++ if (!cfg_entry) {
++ err = -ENOMEM;
++ goto out;
++ }
++
++ cfg_entry->data = NULL;
++ cfg_entry->field = field;
++ cfg_entry->base_offset = base_offset;
++
++ /* silently ignore duplicate fields */
++ err = pciback_field_is_dup(dev, OFFSET(cfg_entry));
++ if (err)
++ goto out;
++
++ if (field->init) {
++ tmp = field->init(dev, OFFSET(cfg_entry));
++
++ if (IS_ERR(tmp)) {
++ err = PTR_ERR(tmp);
++ goto out;
++ }
++
++ cfg_entry->data = tmp;
++ }
++
++ dev_dbg(&dev->dev, "added config field at offset 0x%02x\n",
++ OFFSET(cfg_entry));
++ list_add_tail(&cfg_entry->list, &dev_data->config_fields);
++
++ out:
++ if (err)
++ kfree(cfg_entry);
++
++ return err;
++}
++
++/* This sets up the device's virtual configuration space to keep track of
++ * certain registers (like the base address registers (BARs)) so that we can
++ * keep the client from manipulating them directly. 
++ */ ++int pciback_config_init_dev(struct pci_dev *dev) ++{ ++ int err = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ ++ dev_dbg(&dev->dev, "initializing virtual configuration space\n"); ++ ++ INIT_LIST_HEAD(&dev_data->config_fields); ++ ++ err = pciback_config_header_add_fields(dev); ++ if (err) ++ goto out; ++ ++ err = pciback_config_capability_add_fields(dev); ++ if (err) ++ goto out; ++ ++ err = pciback_config_quirks_init(dev); ++ ++ out: ++ return err; ++} ++ ++int pciback_config_init(void) ++{ ++ return pciback_config_capability_init(); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,126 @@ ++/* ++ * PCI Backend - Common data structures for overriding the configuration space ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#ifndef __XEN_PCIBACK_CONF_SPACE_H__ ++#define __XEN_PCIBACK_CONF_SPACE_H__ ++ ++#include <linux/list.h> ++#include <linux/err.h> ++ ++/* conf_field_init can return an errno in a ptr with ERR_PTR() */ ++typedef void *(*conf_field_init) (struct pci_dev * dev, int offset); ++typedef void (*conf_field_reset) (struct pci_dev * dev, int offset, void *data); ++typedef void (*conf_field_free) (struct pci_dev * dev, int offset, void *data); ++ ++typedef int (*conf_dword_write) (struct pci_dev * dev, int offset, u32 value, ++ void *data); ++typedef int (*conf_word_write) (struct pci_dev * dev, int offset, u16 value, ++ void *data); ++typedef int (*conf_byte_write) (struct pci_dev * dev, int offset, u8 value, ++ void *data); ++typedef int (*conf_dword_read) (struct pci_dev * dev, int offset, u32 * value, ++ void *data); ++typedef int (*conf_word_read) (struct pci_dev * dev, int offset, u16 * value, ++ void *data); ++typedef int (*conf_byte_read) (struct pci_dev * dev, int offset, u8 * value, ++ void *data); ++ ++/* These are the fields within the configuration space which we ++ * are interested in intercepting reads/writes to and changing their ++ * values. 
++ */ ++struct config_field { ++ unsigned int offset; ++ unsigned int size; ++ unsigned int mask; ++ conf_field_init init; ++ conf_field_reset reset; ++ conf_field_free release; ++ void (*clean) (struct config_field * field); ++ union { ++ struct { ++ conf_dword_write write; ++ conf_dword_read read; ++ } dw; ++ struct { ++ conf_word_write write; ++ conf_word_read read; ++ } w; ++ struct { ++ conf_byte_write write; ++ conf_byte_read read; ++ } b; ++ } u; ++ struct list_head list; ++}; ++ ++struct config_field_entry { ++ struct list_head list; ++ struct config_field *field; ++ unsigned int base_offset; ++ void *data; ++}; ++ ++#define OFFSET(cfg_entry) ((cfg_entry)->base_offset+(cfg_entry)->field->offset) ++ ++/* Add fields to a device - the add_fields macro expects to get a pointer to ++ * the first entry in an array (of which the ending is marked by size==0) ++ */ ++int pciback_config_add_field_offset(struct pci_dev *dev, ++ struct config_field *field, ++ unsigned int offset); ++ ++static inline int pciback_config_add_field(struct pci_dev *dev, ++ struct config_field *field) ++{ ++ return pciback_config_add_field_offset(dev, field, 0); ++} ++ ++static inline int pciback_config_add_fields(struct pci_dev *dev, ++ struct config_field *field) ++{ ++ int i, err = 0; ++ for (i = 0; field[i].size != 0; i++) { ++ err = pciback_config_add_field(dev, &field[i]); ++ if (err) ++ break; ++ } ++ return err; ++} ++ ++static inline int pciback_config_add_fields_offset(struct pci_dev *dev, ++ struct config_field *field, ++ unsigned int offset) ++{ ++ int i, err = 0; ++ for (i = 0; field[i].size != 0; i++) { ++ err = pciback_config_add_field_offset(dev, &field[i], offset); ++ if (err) ++ break; ++ } ++ return err; ++} ++ ++/* Read/Write the real configuration space */ ++int pciback_read_config_byte(struct pci_dev *dev, int offset, u8 * value, ++ void *data); ++int pciback_read_config_word(struct pci_dev *dev, int offset, u16 * value, ++ void *data); ++int pciback_read_config_dword(struct pci_dev *dev, int offset, u32 * value, ++ void *data); ++int pciback_write_config_byte(struct pci_dev *dev, int offset, u8 value, ++ void *data); ++int pciback_write_config_word(struct pci_dev *dev, int offset, u16 value, ++ void *data); ++int pciback_write_config_dword(struct pci_dev *dev, int offset, u32 value, ++ void *data); ++ ++int pciback_config_capability_init(void); ++ ++int pciback_config_header_add_fields(struct pci_dev *dev); ++int pciback_config_capability_add_fields(struct pci_dev *dev); ++ ++#endif /* __XEN_PCIBACK_CONF_SPACE_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_capability.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,71 @@ ++/* ++ * PCI Backend - Handles the virtual fields found on the capability lists ++ * in the configuration space. 
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_capability.h" ++ ++static LIST_HEAD(capabilities); ++ ++static struct config_field caplist_header[] = { ++ { ++ .offset = PCI_CAP_LIST_ID, ++ .size = 2, /* encompass PCI_CAP_LIST_ID & PCI_CAP_LIST_NEXT */ ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = NULL, ++ }, ++ { ++ .size = 0, ++ }, ++}; ++ ++static inline void register_capability(struct pciback_config_capability *cap) ++{ ++ list_add_tail(&cap->cap_list, &capabilities); ++} ++ ++int pciback_config_capability_add_fields(struct pci_dev *dev) ++{ ++ int err = 0; ++ struct pciback_config_capability *cap; ++ int cap_offset; ++ ++ list_for_each_entry(cap, &capabilities, cap_list) { ++ cap_offset = pci_find_capability(dev, cap->capability); ++ if (cap_offset) { ++ dev_dbg(&dev->dev, "Found capability 0x%x at 0x%x\n", ++ cap->capability, cap_offset); ++ ++ err = pciback_config_add_fields_offset(dev, ++ caplist_header, ++ cap_offset); ++ if (err) ++ goto out; ++ err = pciback_config_add_fields_offset(dev, ++ cap->fields, ++ cap_offset); ++ if (err) ++ goto out; ++ } ++ } ++ ++ out: ++ return err; ++} ++ ++extern struct pciback_config_capability pciback_config_capability_vpd; ++extern struct pciback_config_capability pciback_config_capability_pm; ++ ++int pciback_config_capability_init(void) ++{ ++ register_capability(&pciback_config_capability_vpd); ++ register_capability(&pciback_config_capability_pm); ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_capability.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,23 @@ ++/* ++ * PCI Backend - Data structures for special overlays for structures on ++ * the capability list. ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#ifndef __PCIBACK_CONFIG_CAPABILITY_H__ ++#define __PCIBACK_CONFIG_CAPABILITY_H__ ++ ++#include <linux/pci.h> ++#include <linux/list.h> ++ ++struct pciback_config_capability { ++ struct list_head cap_list; ++ ++ int capability; ++ ++ /* If the device has the capability found above, add these fields */ ++ struct config_field *fields; ++}; ++ ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_capability_pm.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,128 @@ ++/* ++ * PCI Backend - Configuration space overlay for power management ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/pci.h> ++#include "conf_space.h" ++#include "conf_space_capability.h" ++ ++static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, ++ void *data) ++{ ++ int err; ++ u16 real_value; ++ ++ err = pci_read_config_word(dev, offset, &real_value); ++ if (err) ++ goto out; ++ ++ *value = real_value & ~PCI_PM_CAP_PME_MASK; ++ ++ out: ++ return err; ++} ++ ++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. 
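++ * (That is, PME status may be cleared and the data-select field changed.)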
++ * Can't allow driver domain to enable PMEs - they're shared */ ++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) ++ ++static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, ++ void *data) ++{ ++ int err; ++ u16 old_value; ++ pci_power_t new_state, old_state; ++ ++ err = pci_read_config_word(dev, offset, &old_value); ++ if (err) ++ goto out; ++ ++ old_state = (pci_power_t)(old_value & PCI_PM_CTRL_STATE_MASK); ++ new_state = (pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); ++ ++ new_value &= PM_OK_BITS; ++ if ((old_value & PM_OK_BITS) != new_value) { ++ new_value = (old_value & ~PM_OK_BITS) | new_value; ++ err = pci_write_config_word(dev, offset, new_value); ++ if (err) ++ goto out; ++ } ++ ++ /* Let pci core handle the power management change */ ++ dev_dbg(&dev->dev, "set power state to %x\n", new_state); ++ err = pci_set_power_state(dev, new_state); ++ if (err) { ++ err = PCIBIOS_SET_FAILED; ++ goto out; ++ } ++ ++ /* ++ * Device may lose PCI config info on D3->D0 transition. This ++ * is a problem for some guests which will not reset BARs. Even ++ * those that have a go will be foiled by our BAR-write handler ++ * which will discard the write! Since Linux won't re-init ++ * the config space automatically in all cases, we do it here. ++ * Future: Should we re-initialise all first 64 bytes of config space? ++ */ ++ if (new_state == PCI_D0 && ++ (old_state == PCI_D3hot || old_state == PCI_D3cold) && ++ !(old_value & PCI_PM_CTRL_NO_SOFT_RESET)) ++ pci_restore_bars(dev); ++ ++ out: ++ return err; ++} ++ ++/* Ensure PMEs are disabled */ ++static void *pm_ctrl_init(struct pci_dev *dev, int offset) ++{ ++ int err; ++ u16 value; ++ ++ err = pci_read_config_word(dev, offset, &value); ++ if (err) ++ goto out; ++ ++ if (value & PCI_PM_CTRL_PME_ENABLE) { ++ value &= ~PCI_PM_CTRL_PME_ENABLE; ++ err = pci_write_config_word(dev, offset, value); ++ } ++ ++ out: ++ return ERR_PTR(err); ++} ++ ++static struct config_field caplist_pm[] = { ++ { ++ .offset = PCI_PM_PMC, ++ .size = 2, ++ .u.w.read = pm_caps_read, ++ }, ++ { ++ .offset = PCI_PM_CTRL, ++ .size = 2, ++ .init = pm_ctrl_init, ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = pm_ctrl_write, ++ }, ++ { ++ .offset = PCI_PM_PPB_EXTENSIONS, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ .offset = PCI_PM_DATA_REGISTER, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ .size = 0, ++ }, ++}; ++ ++struct pciback_config_capability pciback_config_capability_pm = { ++ .capability = PCI_CAP_ID_PM, ++ .fields = caplist_pm, ++}; +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_capability_vpd.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,42 @@ ++/* ++ * PCI Backend - Configuration space overlay for Vital Product Data ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/pci.h> ++#include "conf_space.h" ++#include "conf_space_capability.h" ++ ++static int vpd_address_write(struct pci_dev *dev, int offset, u16 value, ++ void *data) ++{ ++ /* Disallow writes to the vital product data */ ++ if (value & PCI_VPD_ADDR_F) ++ return PCIBIOS_SET_FAILED; ++ else ++ return pci_write_config_word(dev, offset, value); ++} ++ ++static struct config_field caplist_vpd[] = { ++ { ++ .offset = PCI_VPD_ADDR, ++ .size = 2, ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = vpd_address_write, ++ }, ++ { ++ .offset = PCI_VPD_DATA, ++ .size = 4, ++ .u.dw.read = pciback_read_config_dword, ++ .u.dw.write = NULL, ++ }, ++ { ++ 
.size = 0, ++ }, ++}; ++ ++struct pciback_config_capability pciback_config_capability_vpd = { ++ .capability = PCI_CAP_ID_VPD, ++ .fields = caplist_vpd, ++}; +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_header.c 2007-08-27 14:02:01.000000000 -0400 +@@ -0,0 +1,309 @@ ++/* ++ * PCI Backend - Handles the virtual fields in the configuration space headers. ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++ ++struct pci_bar_info { ++ u32 val; ++ u32 len_val; ++ int which; ++}; ++ ++#define is_enable_cmd(value) ((value)&(PCI_COMMAND_MEMORY|PCI_COMMAND_IO)) ++#define is_master_cmd(value) ((value)&PCI_COMMAND_MASTER) ++ ++static int command_write(struct pci_dev *dev, int offset, u16 value, void *data) ++{ ++ int err; ++ ++ if (!dev->is_enabled && is_enable_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: enable\n", ++ pci_name(dev)); ++ err = pci_enable_device(dev); ++ if (err) ++ return err; ++ } else if (dev->is_enabled && !is_enable_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: disable\n", ++ pci_name(dev)); ++ pci_disable_device(dev); ++ } ++ ++ if (!dev->is_busmaster && is_master_cmd(value)) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG "pciback: %s: set bus master\n", ++ pci_name(dev)); ++ pci_set_master(dev); ++ } ++ ++ if (value & PCI_COMMAND_INVALIDATE) { ++ if (unlikely(verbose_request)) ++ printk(KERN_DEBUG ++ "pciback: %s: enable memory-write-invalidate\n", ++ pci_name(dev)); ++ err = pci_set_mwi(dev); ++ if (err) { ++ printk(KERN_WARNING ++ "pciback: %s: cannot enable memory-write-invalidate (%d)\n", ++ pci_name(dev), err); ++ value &= ~PCI_COMMAND_INVALIDATE; ++ } ++ } ++ ++ return pci_write_config_word(dev, offset, value); ++} ++ ++static int rom_write(struct pci_dev *dev, int offset, u32 value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ /* A write to obtain the length must happen as a 32-bit write. ++ * This does not (yet) support writing individual bytes ++ */ ++ if (value == ~PCI_ROM_ADDRESS_ENABLE) ++ bar->which = 1; ++ else ++ bar->which = 0; ++ ++ /* Do we need to support enabling/disabling the rom address here? */ ++ ++ return 0; ++} ++ ++/* For the BARs, only allow writes which write ~0 or ++ * the correct resource information ++ * (Needed for when the driver probes the resource usage) ++ */ ++static int bar_write(struct pci_dev *dev, int offset, u32 value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ /* A write to obtain the length must happen as a 32-bit write. ++ * This does not (yet) support writing individual bytes ++ */ ++ if (value == ~0) ++ bar->which = 1; ++ else ++ bar->which = 0; ++ ++ return 0; ++} ++ ++static int bar_read(struct pci_dev *dev, int offset, u32 * value, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ if (unlikely(!bar)) { ++ printk(KERN_WARNING "pciback: driver data not found for %s\n", ++ pci_name(dev)); ++ return XEN_PCI_ERR_op_failed; ++ } ++ ++ *value = bar->which ? 
bar->len_val : bar->val; ++ ++ return 0; ++} ++ ++static inline void read_dev_bar(struct pci_dev *dev, ++ struct pci_bar_info *bar_info, int offset, ++ u32 len_mask) ++{ ++ pci_read_config_dword(dev, offset, &bar_info->val); ++ pci_write_config_dword(dev, offset, len_mask); ++ pci_read_config_dword(dev, offset, &bar_info->len_val); ++ pci_write_config_dword(dev, offset, bar_info->val); ++} ++ ++static void *bar_init(struct pci_dev *dev, int offset) ++{ ++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); ++ ++ if (!bar) ++ return ERR_PTR(-ENOMEM); ++ ++ read_dev_bar(dev, bar, offset, ~0); ++ bar->which = 0; ++ ++ return bar; ++} ++ ++static void *rom_init(struct pci_dev *dev, int offset) ++{ ++ struct pci_bar_info *bar = kmalloc(sizeof(*bar), GFP_KERNEL); ++ ++ if (!bar) ++ return ERR_PTR(-ENOMEM); ++ ++ read_dev_bar(dev, bar, offset, ~PCI_ROM_ADDRESS_ENABLE); ++ bar->which = 0; ++ ++ return bar; ++} ++ ++static void bar_reset(struct pci_dev *dev, int offset, void *data) ++{ ++ struct pci_bar_info *bar = data; ++ ++ bar->which = 0; ++} ++ ++static void bar_release(struct pci_dev *dev, int offset, void *data) ++{ ++ kfree(data); ++} ++ ++static int interrupt_read(struct pci_dev *dev, int offset, u8 * value, ++ void *data) ++{ ++ *value = (u8) dev->irq; ++ ++ return 0; ++} ++ ++static int bist_write(struct pci_dev *dev, int offset, u8 value, void *data) ++{ ++ u8 cur_value; ++ int err; ++ ++ err = pci_read_config_byte(dev, offset, &cur_value); ++ if (err) ++ goto out; ++ ++ if ((cur_value & ~PCI_BIST_START) == (value & ~PCI_BIST_START) ++ || value == PCI_BIST_START) ++ err = pci_write_config_byte(dev, offset, value); ++ ++ out: ++ return err; ++} ++ ++static struct config_field header_common[] = { ++ { ++ .offset = PCI_COMMAND, ++ .size = 2, ++ .u.w.read = pciback_read_config_word, ++ .u.w.write = command_write, ++ }, ++ { ++ .offset = PCI_INTERRUPT_LINE, ++ .size = 1, ++ .u.b.read = interrupt_read, ++ }, ++ { ++ .offset = PCI_INTERRUPT_PIN, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ /* Any side effects of letting driver domain control cache line? 
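Illustrative aside (hypothetical helper, not from the patch): read_dev_bar() above captures the val/len_val pair by writing all-ones to the BAR, reading back the address mask, and restoring the original value. Decoding a probed 32-bit memory BAR into a region size, as a frontend consuming len_val would, looks like this:

	#include <stdint.h>
	#include <stdio.h>

	// After writing ~0 to a memory BAR, the device returns its address
	// mask in the writable bits. Dropping the low attribute bits
	// (bit 0 = I/O flag, bits 1-3 = memory attributes) and negating
	// yields the region size.
	static uint32_t mem_bar_size(uint32_t len_val)
	{
		uint32_t mask = len_val & ~0xfu; // strip attribute bits
		return ~mask + 1;                // lowest writable bit = size
	}

	int main(void)
	{
		// Hypothetical probe result for a 64 KiB 32-bit BAR:
		uint32_t len_val = 0xffff0000u;
		printf("BAR size: %u bytes\n", mem_bar_size(len_val));
		return 0;
	}

This is also why bar_write() only latches which of the two values to serve: the backend never lets the guest move the real BAR, it just replays the sizing protocol.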
*/ ++ .offset = PCI_CACHE_LINE_SIZE, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ .u.b.write = pciback_write_config_byte, ++ }, ++ { ++ .offset = PCI_LATENCY_TIMER, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ }, ++ { ++ .offset = PCI_BIST, ++ .size = 1, ++ .u.b.read = pciback_read_config_byte, ++ .u.b.write = bist_write, ++ }, ++ { ++ .size = 0, ++ }, ++}; ++ ++#define CFG_FIELD_BAR(reg_offset) \ ++ { \ ++ .offset = reg_offset, \ ++ .size = 4, \ ++ .init = bar_init, \ ++ .reset = bar_reset, \ ++ .release = bar_release, \ ++ .u.dw.read = bar_read, \ ++ .u.dw.write = bar_write, \ ++ } ++ ++#define CFG_FIELD_ROM(reg_offset) \ ++ { \ ++ .offset = reg_offset, \ ++ .size = 4, \ ++ .init = rom_init, \ ++ .reset = bar_reset, \ ++ .release = bar_release, \ ++ .u.dw.read = bar_read, \ ++ .u.dw.write = rom_write, \ ++ } ++ ++static struct config_field header_0[] = { ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_2), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_3), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_4), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_5), ++ CFG_FIELD_ROM(PCI_ROM_ADDRESS), ++ { ++ .size = 0, ++ }, ++}; ++ ++static struct config_field header_1[] = { ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_0), ++ CFG_FIELD_BAR(PCI_BASE_ADDRESS_1), ++ CFG_FIELD_ROM(PCI_ROM_ADDRESS1), ++ { ++ .size = 0, ++ }, ++}; ++ ++int pciback_config_header_add_fields(struct pci_dev *dev) ++{ ++ int err; ++ ++ err = pciback_config_add_fields(dev, header_common); ++ if (err) ++ goto out; ++ ++ switch (dev->hdr_type) { ++ case PCI_HEADER_TYPE_NORMAL: ++ err = pciback_config_add_fields(dev, header_0); ++ break; ++ ++ case PCI_HEADER_TYPE_BRIDGE: ++ err = pciback_config_add_fields(dev, header_1); ++ break; ++ ++ default: ++ err = -EINVAL; ++ printk(KERN_ERR "pciback: %s: Unsupported header type %d!\n", ++ pci_name(dev), dev->hdr_type); ++ break; ++ } ++ ++ out: ++ return err; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_quirks.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,126 @@ ++/* ++ * PCI Backend - Handle special overlays for broken devices. 
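Illustrative aside (string tables stand in for the real config_field arrays; all names invented): pciback_config_header_add_fields() below layers a table common to all devices with one selected by the device's header type. A compact sketch of that dispatch:

	#include <stdio.h>

	enum hdr { HDR_NORMAL, HDR_BRIDGE };

	static const char *common[] = { "COMMAND", "INTERRUPT_LINE", NULL };
	static const char *type0[]  = { "BAR0-BAR5", "ROM", NULL };
	static const char *type1[]  = { "BAR0-BAR1", "ROM1", NULL };

	static int add_tables(enum hdr t)
	{
		const char **tbl;
		int i;

		for (i = 0; common[i]; i++)	// every device gets these
			printf("overlay %s\n", common[i]);

		switch (t) {
		case HDR_NORMAL: tbl = type0; break;
		case HDR_BRIDGE: tbl = type1; break;
		default: return -1;		// unsupported header type
		}
		for (i = 0; tbl[i]; i++)
			printf("overlay %s\n", tbl[i]);
		return 0;
	}

	int main(void)
	{
		return add_tables(HDR_NORMAL);
	}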
++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Author: Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/pci.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_quirks.h" ++ ++LIST_HEAD(pciback_quirks); ++ ++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *tmp_quirk; ++ ++ list_for_each_entry(tmp_quirk, &pciback_quirks, quirks_list) ++ if (pci_match_id(&tmp_quirk->devid, dev)) ++ goto out; ++ tmp_quirk = NULL; ++ printk(KERN_DEBUG ++ "quirk didn't match any device pciback knows about\n"); ++ out: ++ return tmp_quirk; ++} ++ ++static inline void register_quirk(struct pciback_config_quirk *quirk) ++{ ++ list_add_tail(&quirk->quirks_list, &pciback_quirks); ++} ++ ++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg) ++{ ++ int ret = 0; ++ struct pciback_dev_data *dev_data = pci_get_drvdata(dev); ++ struct config_field_entry *cfg_entry; ++ ++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { ++ if ( OFFSET(cfg_entry) == reg) { ++ ret = 1; ++ break; ++ } ++ } ++ return ret; ++} ++ ++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field ++ *field) ++{ ++ int err = 0; ++ ++ switch (field->size) { ++ case 1: ++ field->u.b.read = pciback_read_config_byte; ++ field->u.b.write = pciback_write_config_byte; ++ break; ++ case 2: ++ field->u.w.read = pciback_read_config_word; ++ field->u.w.write = pciback_write_config_word; ++ break; ++ case 4: ++ field->u.dw.read = pciback_read_config_dword; ++ field->u.dw.write = pciback_write_config_dword; ++ break; ++ default: ++ err = -EINVAL; ++ goto out; ++ } ++ ++ pciback_config_add_field(dev, field); ++ ++ out: ++ return err; ++} ++ ++int pciback_config_quirks_init(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *quirk; ++ int ret = 0; ++ ++ quirk = kzalloc(sizeof(*quirk), GFP_ATOMIC); ++ if (!quirk) { ++ ret = -ENOMEM; ++ goto out; ++ } ++ ++ quirk->devid.vendor = dev->vendor; ++ quirk->devid.device = dev->device; ++ quirk->devid.subvendor = dev->subsystem_vendor; ++ quirk->devid.subdevice = dev->subsystem_device; ++ quirk->devid.class = 0; ++ quirk->devid.class_mask = 0; ++ quirk->devid.driver_data = 0UL; ++ ++ quirk->pdev = dev; ++ ++ register_quirk(quirk); ++ out: ++ return ret; ++} ++ ++void pciback_config_field_free(struct config_field *field) ++{ ++ kfree(field); ++} ++ ++int pciback_config_quirk_release(struct pci_dev *dev) ++{ ++ struct pciback_config_quirk *quirk; ++ int ret = 0; ++ ++ quirk = pciback_find_quirk(dev); ++ if (!quirk) { ++ ret = -ENXIO; ++ goto out; ++ } ++ ++ list_del(&quirk->quirks_list); ++ kfree(quirk); ++ ++ out: ++ return ret; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/conf_space_quirks.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,35 @@ ++/* ++ * PCI Backend - Data structures for special overlays for broken devices. 
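Illustrative aside (hypothetical handlers, not the driver's API): pciback_config_quirks_add_field() above binds exactly one member of a union of accessors, chosen by the access width, so a field never pays for handler slots it cannot use. The same idiom in self-contained form:

	#include <stdint.h>
	#include <stdio.h>

	struct field {
		unsigned int size;
		union {
			uint8_t  (*rb)(void);
			uint16_t (*rw)(void);
			uint32_t (*rd)(void);
		} u;
	};

	static uint8_t  read8(void)  { return 0x11; }
	static uint16_t read16(void) { return 0x2222; }
	static uint32_t read32(void) { return 0x33333333; }

	// Select the handler flavour by access width, as the quirk
	// code does for sizes 1, 2 and 4.
	static int bind_handlers(struct field *f)
	{
		switch (f->size) {
		case 1: f->u.rb = read8;  break;
		case 2: f->u.rw = read16; break;
		case 4: f->u.rd = read32; break;
		default: return -1;	// only byte/word/dword accesses
		}
		return 0;
	}

	int main(void)
	{
		struct field f = { .size = 2 };
		if (bind_handlers(&f))
			return 1;
		printf("0x%x\n", f.u.rw());
		return 0;
	}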
++ * ++ * Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++ ++#ifndef __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ ++#define __XEN_PCIBACK_CONF_SPACE_QUIRKS_H__ ++ ++#include <linux/pci.h> ++#include <linux/list.h> ++ ++struct pciback_config_quirk { ++ struct list_head quirks_list; ++ struct pci_device_id devid; ++ struct pci_dev *pdev; ++}; ++ ++struct pciback_config_quirk *pciback_find_quirk(struct pci_dev *dev); ++ ++int pciback_config_quirks_add_field(struct pci_dev *dev, struct config_field ++ *field); ++ ++int pciback_config_quirks_remove_field(struct pci_dev *dev, int reg); ++ ++int pciback_config_quirks_init(struct pci_dev *dev); ++ ++void pciback_config_field_free(struct config_field *field); ++ ++int pciback_config_quirk_release(struct pci_dev *dev); ++ ++int pciback_field_is_dup(struct pci_dev *dev, unsigned int reg); ++ ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/passthrough.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,157 @@ ++/* ++ * PCI Backend - Provides restricted access to the real PCI bus topology ++ * to the frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/list.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++struct passthrough_dev_data { ++ /* Access to dev_list must be protected by lock */ ++ struct list_head dev_list; ++ spinlock_t lock; ++}; ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry; ++ struct pci_dev *dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) { ++ if (domain == (unsigned int)pci_domain_nr(dev_entry->dev->bus) ++ && bus == (unsigned int)dev_entry->dev->bus->number ++ && devfn == dev_entry->dev->devfn) { ++ dev = dev_entry->dev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ return dev; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry; ++ unsigned long flags; ++ ++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); ++ if (!dev_entry) ++ return -ENOMEM; ++ dev_entry->dev = dev; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ list_add_tail(&dev_entry->list, &dev_data->dev_list); ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ return 0; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *t; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&dev_data->lock, flags); ++ ++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { ++ if (dev_entry->dev == dev) { ++ list_del(&dev_entry->list); ++ found_dev = dev_entry->dev; ++ kfree(dev_entry); ++ } ++ } ++ ++ spin_unlock_irqrestore(&dev_data->lock, flags); ++ ++ if (found_dev) ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ struct passthrough_dev_data *dev_data; ++ ++ dev_data = kmalloc(sizeof(*dev_data), GFP_KERNEL); ++ if (!dev_data) ++ return -ENOMEM; ++ ++ spin_lock_init(&dev_data->lock); ++ ++ INIT_LIST_HEAD(&dev_data->dev_list); ++ ++ pdev->pci_dev_data = 
dev_data; ++ ++ return 0; ++} ++ ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb publish_root_cb) ++{ ++ int err = 0; ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *e; ++ struct pci_dev *dev; ++ int found; ++ unsigned int domain, bus; ++ ++ spin_lock(&dev_data->lock); ++ ++ list_for_each_entry(dev_entry, &dev_data->dev_list, list) { ++ /* Only publish this device as a root if none of its ++ * parent bridges are exported ++ */ ++ found = 0; ++ dev = dev_entry->dev->bus->self; ++ for (; !found && dev != NULL; dev = dev->bus->self) { ++ list_for_each_entry(e, &dev_data->dev_list, list) { ++ if (dev == e->dev) { ++ found = 1; ++ break; ++ } ++ } ++ } ++ ++ domain = (unsigned int)pci_domain_nr(dev_entry->dev->bus); ++ bus = (unsigned int)dev_entry->dev->bus->number; ++ ++ if (!found) { ++ err = publish_root_cb(pdev, domain, bus); ++ if (err) ++ break; ++ } ++ } ++ ++ spin_unlock(&dev_data->lock); ++ ++ return err; ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ struct passthrough_dev_data *dev_data = pdev->pci_dev_data; ++ struct pci_dev_entry *dev_entry, *t; ++ ++ list_for_each_entry_safe(dev_entry, t, &dev_data->dev_list, list) { ++ list_del(&dev_entry->list); ++ pcistub_put_pci_dev(dev_entry->dev); ++ kfree(dev_entry); ++ } ++ ++ kfree(dev_data); ++ pdev->pci_dev_data = NULL; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/pci_stub.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,929 @@ ++/* ++ * PCI Stub Driver - Grabs devices in backend to be exported later ++ * ++ * Ryan Wilson <hap9@epoch.ncsc.mil> ++ * Chris Bookholt <hap10@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/kref.h> ++#include <asm/atomic.h> ++#include "pciback.h" ++#include "conf_space.h" ++#include "conf_space_quirks.h" ++ ++static char *pci_devs_to_hide = NULL; ++module_param_named(hide, pci_devs_to_hide, charp, 0444); ++ ++struct pcistub_device_id { ++ struct list_head slot_list; ++ int domain; ++ unsigned char bus; ++ unsigned int devfn; ++}; ++static LIST_HEAD(pcistub_device_ids); ++static DEFINE_SPINLOCK(device_ids_lock); ++ ++struct pcistub_device { ++ struct kref kref; ++ struct list_head dev_list; ++ spinlock_t lock; ++ ++ struct pci_dev *dev; ++ struct pciback_device *pdev; /* non-NULL if struct pci_dev is in use */ ++}; ++ ++/* Access to pcistub_devices & seized_devices lists and the initialize_devices ++ * flag must be locked with pcistub_devices_lock ++ */ ++static DEFINE_SPINLOCK(pcistub_devices_lock); ++static LIST_HEAD(pcistub_devices); ++ ++/* wait for device_initcall before initializing our devices ++ * (see pcistub_init_devices_late) ++ */ ++static int initialize_devices = 0; ++static LIST_HEAD(seized_devices); ++ ++static struct pcistub_device *pcistub_device_alloc(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ ++ dev_dbg(&dev->dev, "pcistub_device_alloc\n"); ++ ++ psdev = kzalloc(sizeof(*psdev), GFP_ATOMIC); ++ if (!psdev) ++ return NULL; ++ ++ psdev->dev = pci_dev_get(dev); ++ if (!psdev->dev) { ++ kfree(psdev); ++ return NULL; ++ } ++ ++ kref_init(&psdev->kref); ++ spin_lock_init(&psdev->lock); ++ ++ return psdev; ++} ++ ++/* Don't call this directly as it's called by pcistub_device_put */ ++static void pcistub_device_release(struct kref *kref) ++{ ++ struct pcistub_device *psdev; ++ ++ psdev = container_of(kref, struct pcistub_device, 
kref); ++ ++ dev_dbg(&psdev->dev->dev, "pcistub_device_release\n"); ++ ++ /* Clean-up the device */ ++ pciback_reset_device(psdev->dev); ++ pciback_config_free_dyn_fields(psdev->dev); ++ pciback_config_free_dev(psdev->dev); ++ kfree(pci_get_drvdata(psdev->dev)); ++ pci_set_drvdata(psdev->dev, NULL); ++ ++ pci_dev_put(psdev->dev); ++ ++ kfree(psdev); ++} ++ ++static inline void pcistub_device_get(struct pcistub_device *psdev) ++{ ++ kref_get(&psdev->kref); ++} ++ ++static inline void pcistub_device_put(struct pcistub_device *psdev) ++{ ++ kref_put(&psdev->kref, pcistub_device_release); ++} ++ ++static struct pcistub_device *pcistub_device_find(int domain, int bus, ++ int slot, int func) ++{ ++ struct pcistub_device *psdev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev != NULL ++ && domain == pci_domain_nr(psdev->dev->bus) ++ && bus == psdev->dev->bus->number ++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) { ++ pcistub_device_get(psdev); ++ goto out; ++ } ++ } ++ ++ /* didn't find it */ ++ psdev = NULL; ++ ++ out: ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return psdev; ++} ++ ++static struct pci_dev *pcistub_device_get_pci_dev(struct pciback_device *pdev, ++ struct pcistub_device *psdev) ++{ ++ struct pci_dev *pci_dev = NULL; ++ unsigned long flags; ++ ++ pcistub_device_get(psdev); ++ ++ spin_lock_irqsave(&psdev->lock, flags); ++ if (!psdev->pdev) { ++ psdev->pdev = pdev; ++ pci_dev = psdev->dev; ++ } ++ spin_unlock_irqrestore(&psdev->lock, flags); ++ ++ if (!pci_dev) ++ pcistub_device_put(psdev); ++ ++ return pci_dev; ++} ++ ++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev, ++ int domain, int bus, ++ int slot, int func) ++{ ++ struct pcistub_device *psdev; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev != NULL ++ && domain == pci_domain_nr(psdev->dev->bus) ++ && bus == psdev->dev->bus->number ++ && PCI_DEVFN(slot, func) == psdev->dev->devfn) { ++ found_dev = pcistub_device_get_pci_dev(pdev, psdev); ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return found_dev; ++} ++ ++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev, ++ struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_dev = pcistub_device_get_pci_dev(pdev, psdev); ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return found_dev; ++} ++ ++void pcistub_put_pci_dev(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev, *found_psdev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_psdev = psdev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /* Cleanup our device ++ * (so it's ready for the next domain) ++ */ ++ pciback_reset_device(found_psdev->dev); ++ pciback_config_free_dyn_fields(found_psdev->dev); ++ pciback_config_reset_dev(found_psdev->dev); ++ ++ spin_lock_irqsave(&found_psdev->lock, flags); ++ found_psdev->pdev = NULL; ++ 
spin_unlock_irqrestore(&found_psdev->lock, flags); ++ ++ pcistub_device_put(found_psdev); ++} ++ ++static int __devinit pcistub_match_one(struct pci_dev *dev, ++ struct pcistub_device_id *pdev_id) ++{ ++ /* Match the specified device by domain, bus, slot, func and also if ++ * any of the device's parent bridges match. ++ */ ++ for (; dev != NULL; dev = dev->bus->self) { ++ if (pci_domain_nr(dev->bus) == pdev_id->domain ++ && dev->bus->number == pdev_id->bus ++ && dev->devfn == pdev_id->devfn) ++ return 1; ++ ++ /* Sometimes the topmost bridge links to itself. */ ++ if (dev == dev->bus->self) ++ break; ++ } ++ ++ return 0; ++} ++ ++static int __devinit pcistub_match(struct pci_dev *dev) ++{ ++ struct pcistub_device_id *pdev_id; ++ unsigned long flags; ++ int found = 0; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(pdev_id, &pcistub_device_ids, slot_list) { ++ if (pcistub_match_one(dev, pdev_id)) { ++ found = 1; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return found; ++} ++ ++static int __devinit pcistub_init_device(struct pci_dev *dev) ++{ ++ struct pciback_dev_data *dev_data; ++ int err = 0; ++ ++ dev_dbg(&dev->dev, "initializing...\n"); ++ ++ /* The PCI backend is not intended to be a module (or to work with ++ * removable PCI devices (yet)). If it were, pciback_config_free() ++ * would need to be called somewhere to free the memory allocated ++ * here and then to call kfree(pci_get_drvdata(psdev->dev)). ++ */ ++ dev_data = kzalloc(sizeof(*dev_data), GFP_ATOMIC); ++ if (!dev_data) { ++ err = -ENOMEM; ++ goto out; ++ } ++ pci_set_drvdata(dev, dev_data); ++ ++ dev_dbg(&dev->dev, "initializing config\n"); ++ err = pciback_config_init_dev(dev); ++ if (err) ++ goto out; ++ ++ /* HACK: Force device (& ACPI) to determine what IRQ it's on - we ++ * must do this here because pcibios_enable_device may specify ++ * the pci device's true irq (and possibly its other resources) ++ * if they differ from what's in the configuration space. ++ * This makes the assumption that the device's resources won't ++ * change after this point (otherwise this code may break!) ++ */ ++ dev_dbg(&dev->dev, "enabling device\n"); ++ err = pci_enable_device(dev); ++ if (err) ++ goto config_release; ++ ++ /* Now disable the device (this also ensures some private device ++ * data is set up before we export) ++ */ ++ dev_dbg(&dev->dev, "reset device\n"); ++ pciback_reset_device(dev); ++ ++ return 0; ++ ++ config_release: ++ pciback_config_free_dev(dev); ++ ++ out: ++ pci_set_drvdata(dev, NULL); ++ kfree(dev_data); ++ return err; ++} ++ ++/* ++ * Because some initialization still happens on ++ * devices during fs_initcall, we need to defer ++ * full initialization of our devices until ++ * device_initcall.
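Illustrative aside (hypothetical names; a fixed array stands in for the locked list): the seize-now/initialize-later flow that pcistub_seize() and pcistub_init_devices_late() implement, reduced to its skeleton:

	#include <stdio.h>

	#define MAX_SEIZED 8

	static const char *seized[MAX_SEIZED];
	static int nseized;
	static int late_init_done;

	// Devices probed before the subsystem is fully up are parked
	// on a list; anything probed later is initialized immediately.
	static void probe(const char *name)
	{
		if (late_init_done)
			printf("init %s immediately\n", name);
		else if (nseized < MAX_SEIZED)
			seized[nseized++] = name;	// defer
	}

	// Drain the parked devices once it is safe, as
	// pcistub_init_devices_late() does at device_initcall time.
	static void init_devices_late(void)
	{
		int i;
		for (i = 0; i < nseized; i++)
			printf("late init %s\n", seized[i]);
		nseized = 0;
		late_init_done = 1;
	}

	int main(void)
	{
		probe("0000:00:1f.0");	// arrives during early boot
		init_devices_late();	// device_initcall time
		probe("0000:01:00.0");	// arrives after boot
		return 0;
	}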
++ */ ++static int __init pcistub_init_devices_late(void) ++{ ++ struct pcistub_device *psdev; ++ unsigned long flags; ++ int err = 0; ++ ++ pr_debug("pciback: pcistub_init_devices_late\n"); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ while (!list_empty(&seized_devices)) { ++ psdev = container_of(seized_devices.next, ++ struct pcistub_device, dev_list); ++ list_del(&psdev->dev_list); ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ err = pcistub_init_device(psdev->dev); ++ if (err) { ++ dev_err(&psdev->dev->dev, ++ "error %d initializing device\n", err); ++ kfree(psdev); ++ psdev = NULL; ++ } ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (psdev) ++ list_add_tail(&psdev->dev_list, &pcistub_devices); ++ } ++ ++ initialize_devices = 1; ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ return 0; ++} ++ ++static int __devinit pcistub_seize(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev; ++ unsigned long flags; ++ int err = 0; ++ ++ psdev = pcistub_device_alloc(dev); ++ if (!psdev) ++ return -ENOMEM; ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (initialize_devices) { ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /* don't want irqs disabled when calling pcistub_init_device */ ++ err = pcistub_init_device(psdev->dev); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ if (!err) ++ list_add(&psdev->dev_list, &pcistub_devices); ++ } else { ++ dev_dbg(&dev->dev, "deferring initialization\n"); ++ list_add(&psdev->dev_list, &seized_devices); ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ if (err) ++ pcistub_device_put(psdev); ++ ++ return err; ++} ++ ++static int __devinit pcistub_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ int err = 0; ++ ++ dev_dbg(&dev->dev, "probing...\n"); ++ ++ if (pcistub_match(dev)) { ++ ++ if (dev->hdr_type != PCI_HEADER_TYPE_NORMAL ++ && dev->hdr_type != PCI_HEADER_TYPE_BRIDGE) { ++ dev_err(&dev->dev, "can't export pci devices that " ++ "don't have a normal (0) or bridge (1) " ++ "header type!\n"); ++ err = -ENODEV; ++ goto out; ++ } ++ ++ dev_info(&dev->dev, "seizing device\n"); ++ err = pcistub_seize(dev); ++ } else ++ /* Didn't find the device */ ++ err = -ENODEV; ++ ++ out: ++ return err; ++} ++ ++static void pcistub_remove(struct pci_dev *dev) ++{ ++ struct pcistub_device *psdev, *found_psdev = NULL; ++ unsigned long flags; ++ ++ dev_dbg(&dev->dev, "removing\n"); ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ ++ pciback_config_quirk_release(dev); ++ ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (psdev->dev == dev) { ++ found_psdev = psdev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ if (found_psdev) { ++ dev_dbg(&dev->dev, "found device to remove - in use? %p\n", ++ found_psdev->pdev); ++ ++ if (found_psdev->pdev) { ++ printk(KERN_WARNING "pciback: ****** removing device " ++ "%s while still in-use! 
******\n", ++ pci_name(found_psdev->dev)); ++ printk(KERN_WARNING "pciback: ****** driver domain may " ++ "still access this device's i/o resources!\n"); ++ printk(KERN_WARNING "pciback: ****** shutdown driver " ++ "domain before binding device\n"); ++ printk(KERN_WARNING "pciback: ****** to other drivers " ++ "or domains\n"); ++ ++ pciback_release_pci_dev(found_psdev->pdev, ++ found_psdev->dev); ++ } ++ ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ list_del(&found_psdev->dev_list); ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ ++ /* the final put for releasing from the list */ ++ pcistub_device_put(found_psdev); ++ } ++} ++ ++static struct pci_device_id pcistub_ids[] = { ++ { ++ .vendor = PCI_ANY_ID, ++ .device = PCI_ANY_ID, ++ .subvendor = PCI_ANY_ID, ++ .subdevice = PCI_ANY_ID, ++ }, ++ {0,}, ++}; ++ ++/* ++ * Note: There is no MODULE_DEVICE_TABLE entry here because this isn't ++ * for a normal device. I don't want it to be loaded automatically. ++ */ ++ ++static struct pci_driver pciback_pci_driver = { ++ .name = "pciback", ++ .id_table = pcistub_ids, ++ .probe = pcistub_probe, ++ .remove = pcistub_remove, ++}; ++ ++static inline int str_to_slot(const char *buf, int *domain, int *bus, ++ int *slot, int *func) ++{ ++ int err; ++ ++ err = sscanf(buf, " %x:%x:%x.%x", domain, bus, slot, func); ++ if (err == 4) ++ return 0; ++ else if (err < 0) ++ return -EINVAL; ++ ++ /* try again without domain */ ++ *domain = 0; ++ err = sscanf(buf, " %x:%x.%x", bus, slot, func); ++ if (err == 3) ++ return 0; ++ ++ return -EINVAL; ++} ++ ++static inline int str_to_quirk(const char *buf, int *domain, int *bus, int ++ *slot, int *func, int *reg, int *size, int *mask) ++{ ++ int err; ++ ++ err = ++ sscanf(buf, " %04x:%02x:%02x.%1x-%08x:%1x:%08x", domain, bus, slot, ++ func, reg, size, mask); ++ if (err == 7) ++ return 0; ++ return -EINVAL; ++} ++ ++static int pcistub_device_id_add(int domain, int bus, int slot, int func) ++{ ++ struct pcistub_device_id *pci_dev_id; ++ unsigned long flags; ++ ++ pci_dev_id = kmalloc(sizeof(*pci_dev_id), GFP_KERNEL); ++ if (!pci_dev_id) ++ return -ENOMEM; ++ ++ pci_dev_id->domain = domain; ++ pci_dev_id->bus = bus; ++ pci_dev_id->devfn = PCI_DEVFN(slot, func); ++ ++ pr_debug("pciback: wants to seize %04x:%02x:%02x.%01x\n", ++ domain, bus, slot, func); ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_add_tail(&pci_dev_id->slot_list, &pcistub_device_ids); ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return 0; ++} ++ ++static int pcistub_device_id_remove(int domain, int bus, int slot, int func) ++{ ++ struct pcistub_device_id *pci_dev_id, *t; ++ int devfn = PCI_DEVFN(slot, func); ++ int err = -ENOENT; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry_safe(pci_dev_id, t, &pcistub_device_ids, slot_list) { ++ ++ if (pci_dev_id->domain == domain ++ && pci_dev_id->bus == bus && pci_dev_id->devfn == devfn) { ++ /* Don't break; here because it's possible the same ++ * slot could be in the list more than once ++ */ ++ list_del(&pci_dev_id->slot_list); ++ kfree(pci_dev_id); ++ ++ err = 0; ++ ++ pr_debug("pciback: removed %04x:%02x:%02x.%01x from " ++ "seize list\n", domain, bus, slot, func); ++ } ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return err; ++} ++ ++static int pcistub_reg_add(int domain, int bus, int slot, int func, int reg, ++ int size, int mask) ++{ ++ int err = 0; ++ struct pcistub_device *psdev; ++ struct pci_dev *dev; ++ struct config_field *field; ++ ++ psdev = 
pcistub_device_find(domain, bus, slot, func); ++ if (!psdev || !psdev->dev) { ++ err = -ENODEV; ++ goto out; ++ } ++ dev = psdev->dev; ++ ++ field = kzalloc(sizeof(*field), GFP_ATOMIC); ++ if (!field) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ field->offset = reg; ++ field->size = size; ++ field->mask = mask; ++ field->init = NULL; ++ field->reset = NULL; ++ field->release = NULL; ++ field->clean = pciback_config_field_free; ++ ++ err = pciback_config_quirks_add_field(dev, field); ++ if (err) ++ kfree(field); ++ out: ++ return err; ++} ++ ++static ssize_t pcistub_slot_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ ++ err = pcistub_device_id_add(domain, bus, slot, func); ++ ++ out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++DRIVER_ATTR(new_slot, S_IWUSR, NULL, pcistub_slot_add); ++ ++static ssize_t pcistub_slot_remove(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ ++ err = pcistub_device_id_remove(domain, bus, slot, func); ++ ++ out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++DRIVER_ATTR(remove_slot, S_IWUSR, NULL, pcistub_slot_remove); ++ ++static ssize_t pcistub_slot_show(struct device_driver *drv, char *buf) ++{ ++ struct pcistub_device_id *pci_dev_id; ++ size_t count = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(pci_dev_id, &pcistub_device_ids, slot_list) { ++ if (count >= PAGE_SIZE) ++ break; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "%04x:%02x:%02x.%01x\n", ++ pci_dev_id->domain, pci_dev_id->bus, ++ PCI_SLOT(pci_dev_id->devfn), ++ PCI_FUNC(pci_dev_id->devfn)); ++ } ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return count; ++} ++ ++DRIVER_ATTR(slots, S_IRUSR, pcistub_slot_show, NULL); ++ ++static ssize_t pcistub_quirk_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func, reg, size, mask; ++ int err; ++ ++ err = str_to_quirk(buf, &domain, &bus, &slot, &func, ®, &size, ++ &mask); ++ if (err) ++ goto out; ++ ++ err = pcistub_reg_add(domain, bus, slot, func, reg, size, mask); ++ ++ out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++static ssize_t pcistub_quirk_show(struct device_driver *drv, char *buf) ++{ ++ int count = 0; ++ unsigned long flags; ++ extern struct list_head pciback_quirks; ++ struct pciback_config_quirk *quirk; ++ struct pciback_dev_data *dev_data; ++ struct config_field *field; ++ struct config_field_entry *cfg_entry; ++ ++ spin_lock_irqsave(&device_ids_lock, flags); ++ list_for_each_entry(quirk, &pciback_quirks, quirks_list) { ++ if (count >= PAGE_SIZE) ++ goto out; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "%02x:%02x.%01x\n\t%04x:%04x:%04x:%04x\n", ++ quirk->pdev->bus->number, ++ PCI_SLOT(quirk->pdev->devfn), ++ PCI_FUNC(quirk->pdev->devfn), ++ quirk->devid.vendor, quirk->devid.device, ++ quirk->devid.subvendor, ++ quirk->devid.subdevice); ++ ++ dev_data = pci_get_drvdata(quirk->pdev); ++ ++ list_for_each_entry(cfg_entry, &dev_data->config_fields, list) { ++ field = cfg_entry->field; ++ if (count >= PAGE_SIZE) ++ goto out; ++ ++ count += scnprintf(buf + count, PAGE_SIZE - count, ++ "\t\t%08x:%01x:%08x\n", ++ cfg_entry->base_offset + field->offset, ++ field->size, field->mask); ++ } ++ } ++ ++ 
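/*
 * Illustrative aside (not part of the original patch; the main() is a
 * made-up demo): the new_slot, remove_slot and quirks stores parse their
 * input with plain sscanf, trying the domain-qualified form first. A
 * user-space rendering of str_to_slot()'s two accepted formats:
 *
 *	#include <stdio.h>
 *
 *	static int str_to_slot(const char *buf, int *dom, int *bus,
 *			       int *slot, int *func)
 *	{
 *		if (sscanf(buf, " %x:%x:%x.%x", dom, bus, slot, func) == 4)
 *			return 0;
 *		*dom = 0;	// retry without a domain prefix
 *		if (sscanf(buf, " %x:%x.%x", bus, slot, func) == 3)
 *			return 0;
 *		return -1;
 *	}
 *
 *	int main(void)
 *	{
 *		int d, b, s, f;
 *		if (!str_to_slot("0000:00:1d.7", &d, &b, &s, &f))
 *			printf("%04x:%02x:%02x.%x\n", d, b, s, f);
 *		return 0;
 *	}
 */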
out: ++ spin_unlock_irqrestore(&device_ids_lock, flags); ++ ++ return count; ++} ++ ++DRIVER_ATTR(quirks, S_IRUSR | S_IWUSR, pcistub_quirk_show, pcistub_quirk_add); ++ ++static ssize_t permissive_add(struct device_driver *drv, const char *buf, ++ size_t count) ++{ ++ int domain, bus, slot, func; ++ int err; ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ err = str_to_slot(buf, &domain, &bus, &slot, &func); ++ if (err) ++ goto out; ++ psdev = pcistub_device_find(domain, bus, slot, func); ++ if (!psdev) { ++ err = -ENODEV; ++ goto out; ++ } ++ if (!psdev->dev) { ++ err = -ENODEV; ++ goto release; ++ } ++ dev_data = pci_get_drvdata(psdev->dev); ++ /* the driver data for a device should never be null at this point */ ++ if (!dev_data) { ++ err = -ENXIO; ++ goto release; ++ } ++ if (!dev_data->permissive) { ++ dev_data->permissive = 1; ++ /* Let user know that what they're doing could be unsafe */ ++ dev_warn(&psdev->dev->dev, ++ "enabling permissive mode configuration space accesses!\n"); ++ dev_warn(&psdev->dev->dev, ++ "permissive mode is potentially unsafe!\n"); ++ } ++ release: ++ pcistub_device_put(psdev); ++ out: ++ if (!err) ++ err = count; ++ return err; ++} ++ ++static ssize_t permissive_show(struct device_driver *drv, char *buf) ++{ ++ struct pcistub_device *psdev; ++ struct pciback_dev_data *dev_data; ++ size_t count = 0; ++ unsigned long flags; ++ spin_lock_irqsave(&pcistub_devices_lock, flags); ++ list_for_each_entry(psdev, &pcistub_devices, dev_list) { ++ if (count >= PAGE_SIZE) ++ break; ++ if (!psdev->dev) ++ continue; ++ dev_data = pci_get_drvdata(psdev->dev); ++ if (!dev_data || !dev_data->permissive) ++ continue; ++ count += ++ scnprintf(buf + count, PAGE_SIZE - count, "%s\n", ++ pci_name(psdev->dev)); ++ } ++ spin_unlock_irqrestore(&pcistub_devices_lock, flags); ++ return count; ++} ++ ++DRIVER_ATTR(permissive, S_IRUSR | S_IWUSR, permissive_show, permissive_add); ++ ++static void pcistub_exit(void) ++{ ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_new_slot); ++ driver_remove_file(&pciback_pci_driver.driver, ++ &driver_attr_remove_slot); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_slots); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_quirks); ++ driver_remove_file(&pciback_pci_driver.driver, &driver_attr_permissive); ++ ++ pci_unregister_driver(&pciback_pci_driver); ++} ++ ++static int __init pcistub_init(void) ++{ ++ int pos = 0; ++ int err = 0; ++ int domain, bus, slot, func; ++ int parsed; ++ ++ if (pci_devs_to_hide && *pci_devs_to_hide) { ++ do { ++ parsed = 0; ++ ++ err = sscanf(pci_devs_to_hide + pos, ++ " (%x:%x:%x.%x) %n", ++ &domain, &bus, &slot, &func, &parsed); ++ if (err != 4) { ++ domain = 0; ++ err = sscanf(pci_devs_to_hide + pos, ++ " (%x:%x.%x) %n", ++ &bus, &slot, &func, &parsed); ++ if (err != 3) ++ goto parse_error; ++ } ++ ++ err = pcistub_device_id_add(domain, bus, slot, func); ++ if (err) ++ goto out; ++ ++ /* if parsed<=0, we've reached the end of the string */ ++ pos += parsed; ++ } while (parsed > 0 && pci_devs_to_hide[pos]); ++ } ++ ++ /* If we're the first PCI Device Driver to register, we're the ++ * first one to get offered PCI devices as they become ++ * available (and thus we can be the first to grab them) ++ */ ++ err = pci_register_driver(&pciback_pci_driver); ++ if (err < 0) ++ goto out; ++ ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_new_slot); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ 
&driver_attr_remove_slot); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_slots); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_quirks); ++ if (!err) ++ err = driver_create_file(&pciback_pci_driver.driver, ++ &driver_attr_permissive); ++ ++ if (err) ++ pcistub_exit(); ++ ++ out: ++ return err; ++ ++ parse_error: ++ printk(KERN_ERR "pciback: Error parsing pci_devs_to_hide at \"%s\"\n", ++ pci_devs_to_hide + pos); ++ return -EINVAL; ++} ++ ++#ifndef MODULE ++/* ++ * fs_initcall happens before device_initcall ++ * so pciback *should* get called first (b/c we ++ * want to suck up any device before other drivers ++ * get a chance by being the first pci device ++ * driver to register) ++ */ ++fs_initcall(pcistub_init); ++#endif ++ ++static int __init pciback_init(void) ++{ ++ int err; ++ ++ err = pciback_config_init(); ++ if (err) ++ return err; ++ ++#ifdef MODULE ++ err = pcistub_init(); ++ if (err < 0) ++ return err; ++#endif ++ ++ pcistub_init_devices_late(); ++ err = pciback_xenbus_register(); ++ if (err) ++ pcistub_exit(); ++ ++ return err; ++} ++ ++static void __exit pciback_cleanup(void) ++{ ++ pciback_xenbus_unregister(); ++ pcistub_exit(); ++} ++ ++module_init(pciback_init); ++module_exit(pciback_cleanup); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/pciback.h 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,93 @@ ++/* ++ * PCI Backend Common Data Structures & Function Declarations ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#ifndef __XEN_PCIBACK_H__ ++#define __XEN_PCIBACK_H__ ++ ++#include <linux/pci.h> ++#include <linux/interrupt.h> ++#include <xen/xenbus.h> ++#include <linux/list.h> ++#include <linux/spinlock.h> ++#include <linux/workqueue.h> ++#include <asm/atomic.h> ++#include <xen/interface/io/pciif.h> ++ ++struct pci_dev_entry { ++ struct list_head list; ++ struct pci_dev *dev; ++}; ++ ++#define _PDEVF_op_active (0) ++#define PDEVF_op_active (1<<(_PDEVF_op_active)) ++ ++struct pciback_device { ++ void *pci_dev_data; ++ spinlock_t dev_lock; ++ ++ struct xenbus_device *xdev; ++ ++ struct xenbus_watch be_watch; ++ u8 be_watching; ++ ++ int evtchn_irq; ++ ++ struct vm_struct *sh_area; ++ struct xen_pci_sharedinfo *sh_info; ++ ++ unsigned long flags; ++ ++ struct work_struct op_work; ++}; ++ ++struct pciback_dev_data { ++ struct list_head config_fields; ++ int permissive; ++ int warned_on_write; ++}; ++ ++/* Get/Put PCI Devices that are hidden from the PCI Backend Domain */ ++struct pci_dev *pcistub_get_pci_dev_by_slot(struct pciback_device *pdev, ++ int domain, int bus, ++ int slot, int func); ++struct pci_dev *pcistub_get_pci_dev(struct pciback_device *pdev, ++ struct pci_dev *dev); ++void pcistub_put_pci_dev(struct pci_dev *dev); ++ ++/* Ensure a device is turned off or reset */ ++void pciback_reset_device(struct pci_dev *pdev); ++ ++/* Access a virtual configuration space for a PCI device */ ++int pciback_config_init(void); ++int pciback_config_init_dev(struct pci_dev *dev); ++void pciback_config_free_dyn_fields(struct pci_dev *dev); ++void pciback_config_reset_dev(struct pci_dev *dev); ++void pciback_config_free_dev(struct pci_dev *dev); ++int pciback_config_read(struct pci_dev *dev, int offset, int size, ++ u32 * ret_val); ++int pciback_config_write(struct pci_dev *dev, int offset, int size, u32 value); ++ ++/* Handle requests for specific devices from the frontend */ ++typedef int (*publish_pci_root_cb) (struct 
pciback_device * pdev, ++ unsigned int domain, unsigned int bus); ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev); ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev); ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn); ++int pciback_init_devices(struct pciback_device *pdev); ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb cb); ++void pciback_release_devices(struct pciback_device *pdev); ++ ++/* Handles events from front-end */ ++irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs); ++void pciback_do_op(void *data); ++ ++int pciback_xenbus_register(void); ++void pciback_xenbus_unregister(void); ++ ++extern int verbose_request; ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/pciback_ops.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,95 @@ ++/* ++ * PCI Backend Operations - respond to PCI requests from Frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <asm/bitops.h> ++#include <xen/evtchn.h> ++#include "pciback.h" ++ ++int verbose_request = 0; ++module_param(verbose_request, int, 0644); ++ ++/* Ensure a device is "turned off" and ready to be exported. ++ * (Also see pciback_config_reset to ensure virtual configuration space is ++ * ready to be re-exported) ++ */ ++void pciback_reset_device(struct pci_dev *dev) ++{ ++ u16 cmd; ++ ++ /* Disable devices (but not bridges) */ ++ if (dev->hdr_type == PCI_HEADER_TYPE_NORMAL) { ++ pci_disable_device(dev); ++ ++ pci_write_config_word(dev, PCI_COMMAND, 0); ++ ++ dev->is_enabled = 0; ++ dev->is_busmaster = 0; ++ } else { ++ pci_read_config_word(dev, PCI_COMMAND, &cmd); ++ if (cmd & (PCI_COMMAND_INVALIDATE)) { ++ cmd &= ~(PCI_COMMAND_INVALIDATE); ++ pci_write_config_word(dev, PCI_COMMAND, cmd); ++ ++ dev->is_busmaster = 0; ++ } ++ } ++} ++ ++static inline void test_and_schedule_op(struct pciback_device *pdev) ++{ ++ /* Check that frontend is requesting an operation and that we are not ++ * already processing a request */ ++ if (test_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags) ++ && !test_and_set_bit(_PDEVF_op_active, &pdev->flags)) ++ schedule_work(&pdev->op_work); ++} ++ ++/* Performing the configuration space reads/writes must not be done in atomic ++ * context because some of the pci_* functions can sleep (mostly due to ACPI ++ * use of semaphores). This function is intended to be called from a work ++ * queue in process context taking a struct pciback_device as a parameter */ ++void pciback_do_op(void *data) ++{ ++ struct pciback_device *pdev = data; ++ struct pci_dev *dev; ++ struct xen_pci_op *op = &pdev->sh_info->op; ++ ++ dev = pciback_get_pci_dev(pdev, op->domain, op->bus, op->devfn); ++ ++ if (dev == NULL) ++ op->err = XEN_PCI_ERR_dev_not_found; ++ else if (op->cmd == XEN_PCI_OP_conf_read) ++ op->err = pciback_config_read(dev, op->offset, op->size, ++ &op->value); ++ else if (op->cmd == XEN_PCI_OP_conf_write) ++ op->err = pciback_config_write(dev, op->offset, op->size, ++ op->value); ++ else ++ op->err = XEN_PCI_ERR_not_implemented; ++ ++ /* Tell the driver domain that we're done. */ ++ wmb(); ++ clear_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); ++ notify_remote_via_irq(pdev->evtchn_irq); ++ ++ /* Mark that we're done. 
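Illustrative aside (C11 atomics stand in for the kernel bitops and explicit barriers; single-threaded demo): the clear-then-recheck handshake that pciback_do_op() and test_and_schedule_op() implement, which keeps a request racing with the final clear_bit() from being lost:

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	static atomic_bool request_active;	// set by the other side
	static atomic_bool op_active;		// set while work is scheduled

	static void schedule_work(void) { printf("work scheduled\n"); }

	// Schedule only if a request is pending and nobody is already on it.
	static void test_and_schedule(void)
	{
		if (atomic_load(&request_active) &&
		    !atomic_exchange(&op_active, true))
			schedule_work();
	}

	static void do_op(void)
	{
		printf("handling request\n");
		atomic_store(&request_active, false);	// done, tell peer
		atomic_store(&op_active, false);	// allow rescheduling
		// Re-check: a request set between the two stores above
		// must not be lost (sequentially consistent atomics stand
		// in for the wmb/smp_mb pairs in the real code).
		test_and_schedule();
	}

	int main(void)
	{
		atomic_store(&request_active, true);
		test_and_schedule();
		do_op();
		return 0;
	}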
*/ ++ smp_mb__before_clear_bit(); /* /after/ clearing PCIF_active */ ++ clear_bit(_PDEVF_op_active, &pdev->flags); ++ smp_mb__after_clear_bit(); /* /before/ final check for work */ ++ ++ /* Check to see if the driver domain tried to start another request in ++ * between clearing _XEN_PCIF_active and clearing _PDEVF_op_active. */ ++ test_and_schedule_op(pdev); ++} ++ ++irqreturn_t pciback_handle_event(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ struct pciback_device *pdev = dev_id; ++ ++ test_and_schedule_op(pdev); ++ ++ return IRQ_HANDLED; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/slot.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,151 @@ ++/* ++ * PCI Backend - Provides a Virtual PCI bus (with real devices) ++ * to the frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> (vpci.c) ++ * Author: Tristan Gingold <tristan.gingold@bull.net>, from vpci.c ++ */ ++ ++#include <linux/list.h> ++#include <linux/slab.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++/* There are at most 32 slots in a pci bus. */ ++#define PCI_SLOT_MAX 32 ++ ++#define PCI_BUS_NBR 2 ++ ++struct slot_dev_data { ++ /* Access to dev_list must be protected by lock */ ++ struct pci_dev *slots[PCI_BUS_NBR][PCI_SLOT_MAX]; ++ spinlock_t lock; ++}; ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct pci_dev *dev = NULL; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if (domain != 0 || PCI_FUNC(devfn) != 0) ++ return NULL; ++ ++ if (PCI_SLOT(devfn) >= PCI_SLOT_MAX || bus >= PCI_BUS_NBR) ++ return NULL; ++ ++ spin_lock_irqsave(&slot_dev->lock, flags); ++ dev = slot_dev->slots[bus][PCI_SLOT(devfn)]; ++ spin_unlock_irqrestore(&slot_dev->lock, flags); ++ ++ return dev; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ int err = 0, slot, bus; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { ++ err = -EFAULT; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Can't export bridges on the virtual PCI bus"); ++ goto out; ++ } ++ ++ spin_lock_irqsave(&slot_dev->lock, flags); ++ ++ /* Assign to a new slot on the virtual PCI bus */ ++ for (bus = 0; bus < PCI_BUS_NBR; bus++) ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (slot_dev->slots[bus][slot] == NULL) { ++ printk(KERN_INFO ++ "pciback: slot: %s: assign to virtual slot %d, bus %d\n", ++ pci_name(dev), slot, bus); ++ slot_dev->slots[bus][slot] = dev; ++ goto unlock; ++ } ++ } ++ ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "No more space on root virtual PCI bus"); ++ ++ unlock: ++ spin_unlock_irqrestore(&slot_dev->lock, flags); ++ out: ++ return err; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ int slot, bus; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&slot_dev->lock, flags); ++ ++ for (bus = 0; bus < PCI_BUS_NBR; bus++) ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (slot_dev->slots[bus][slot] == dev) { ++ slot_dev->slots[bus][slot] = NULL; ++ found_dev = dev; ++ goto out; ++ } ++ } ++ ++ out: ++ spin_unlock_irqrestore(&slot_dev->lock, flags); ++ ++ if (found_dev) ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ int 
slot, bus; ++ struct slot_dev_data *slot_dev; ++ ++ slot_dev = kmalloc(sizeof(*slot_dev), GFP_KERNEL); ++ if (!slot_dev) ++ return -ENOMEM; ++ ++ spin_lock_init(&slot_dev->lock); ++ ++ for (bus = 0; bus < PCI_BUS_NBR; bus++) ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) ++ slot_dev->slots[bus][slot] = NULL; ++ ++ pdev->pci_dev_data = slot_dev; ++ ++ return 0; ++} ++ ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb publish_cb) ++{ ++ /* The Virtual PCI bus has only one root */ ++ return publish_cb(pdev, 0, 0); ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ int slot, bus; ++ struct slot_dev_data *slot_dev = pdev->pci_dev_data; ++ struct pci_dev *dev; ++ ++ for (bus = 0; bus < PCI_BUS_NBR; bus++) ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ dev = slot_dev->slots[bus][slot]; ++ if (dev != NULL) ++ pcistub_put_pci_dev(dev); ++ } ++ ++ kfree(slot_dev); ++ pdev->pci_dev_data = NULL; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/vpci.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,204 @@ ++/* ++ * PCI Backend - Provides a Virtual PCI bus (with real devices) ++ * to the frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++ ++#include <linux/list.h> ++#include <linux/slab.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pciback.h" ++ ++#define PCI_SLOT_MAX 32 ++ ++struct vpci_dev_data { ++ /* Access to dev_list must be protected by lock */ ++ struct list_head dev_list[PCI_SLOT_MAX]; ++ spinlock_t lock; ++}; ++ ++static inline struct list_head *list_first(struct list_head *head) ++{ ++ return head->next; ++} ++ ++struct pci_dev *pciback_get_pci_dev(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus, ++ unsigned int devfn) ++{ ++ struct pci_dev_entry *entry; ++ struct pci_dev *dev = NULL; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if (domain != 0 || bus != 0) ++ return NULL; ++ ++ if (PCI_SLOT(devfn) < PCI_SLOT_MAX) { ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ list_for_each_entry(entry, ++ &vpci_dev->dev_list[PCI_SLOT(devfn)], ++ list) { ++ if (PCI_FUNC(entry->dev->devfn) == PCI_FUNC(devfn)) { ++ dev = entry->dev; ++ break; ++ } ++ } ++ ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ } ++ return dev; ++} ++ ++static inline int match_slot(struct pci_dev *l, struct pci_dev *r) ++{ ++ if (pci_domain_nr(l->bus) == pci_domain_nr(r->bus) ++ && l->bus == r->bus && PCI_SLOT(l->devfn) == PCI_SLOT(r->devfn)) ++ return 1; ++ ++ return 0; ++} ++ ++int pciback_add_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ int err = 0, slot; ++ struct pci_dev_entry *t, *dev_entry; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ unsigned long flags; ++ ++ if ((dev->class >> 24) == PCI_BASE_CLASS_BRIDGE) { ++ err = -EFAULT; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Can't export bridges on the virtual PCI bus"); ++ goto out; ++ } ++ ++ dev_entry = kmalloc(sizeof(*dev_entry), GFP_KERNEL); ++ if (!dev_entry) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error adding entry to virtual PCI bus"); ++ goto out; ++ } ++ ++ dev_entry->dev = dev; ++ ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ /* Keep multi-function devices together on the virtual PCI bus */ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (!list_empty(&vpci_dev->dev_list[slot])) { ++ t = list_entry(list_first(&vpci_dev->dev_list[slot]), ++ struct pci_dev_entry, list); ++ ++ if (match_slot(dev, 
t->dev)) { ++ pr_info("pciback: vpci: %s: " ++ "assign to virtual slot %d func %d\n", ++ pci_name(dev), slot, ++ PCI_FUNC(dev->devfn)); ++ list_add_tail(&dev_entry->list, ++ &vpci_dev->dev_list[slot]); ++ goto unlock; ++ } ++ } ++ } ++ ++ /* Assign to a new slot on the virtual PCI bus */ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ if (list_empty(&vpci_dev->dev_list[slot])) { ++ printk(KERN_INFO ++ "pciback: vpci: %s: assign to virtual slot %d\n", ++ pci_name(dev), slot); ++ list_add_tail(&dev_entry->list, ++ &vpci_dev->dev_list[slot]); ++ goto unlock; ++ } ++ } ++ ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "No more space on root virtual PCI bus"); ++ ++ unlock: ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ out: ++ return err; ++} ++ ++void pciback_release_pci_dev(struct pciback_device *pdev, struct pci_dev *dev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ struct pci_dev *found_dev = NULL; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vpci_dev->lock, flags); ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ struct pci_dev_entry *e, *tmp; ++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], ++ list) { ++ if (e->dev == dev) { ++ list_del(&e->list); ++ found_dev = e->dev; ++ kfree(e); ++ goto out; ++ } ++ } ++ } ++ ++ out: ++ spin_unlock_irqrestore(&vpci_dev->lock, flags); ++ ++ if (found_dev) ++ pcistub_put_pci_dev(found_dev); ++} ++ ++int pciback_init_devices(struct pciback_device *pdev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev; ++ ++ vpci_dev = kmalloc(sizeof(*vpci_dev), GFP_KERNEL); ++ if (!vpci_dev) ++ return -ENOMEM; ++ ++ spin_lock_init(&vpci_dev->lock); ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ INIT_LIST_HEAD(&vpci_dev->dev_list[slot]); ++ } ++ ++ pdev->pci_dev_data = vpci_dev; ++ ++ return 0; ++} ++ ++int pciback_publish_pci_roots(struct pciback_device *pdev, ++ publish_pci_root_cb publish_cb) ++{ ++ /* The Virtual PCI bus has only one root */ ++ return publish_cb(pdev, 0, 0); ++} ++ ++void pciback_release_devices(struct pciback_device *pdev) ++{ ++ int slot; ++ struct vpci_dev_data *vpci_dev = pdev->pci_dev_data; ++ ++ for (slot = 0; slot < PCI_SLOT_MAX; slot++) { ++ struct pci_dev_entry *e, *tmp; ++ list_for_each_entry_safe(e, tmp, &vpci_dev->dev_list[slot], ++ list) { ++ list_del(&e->list); ++ pcistub_put_pci_dev(e->dev); ++ kfree(e); ++ } ++ } ++ ++ kfree(vpci_dev); ++ pdev->pci_dev_data = NULL; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pciback/xenbus.c 2007-08-27 14:02:01.000000000 -0400 +@@ -0,0 +1,454 @@ ++/* ++ * PCI Backend Xenbus Setup - handles setup with frontend and xend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/list.h> ++#include <linux/vmalloc.h> ++#include <xen/xenbus.h> ++#include <xen/evtchn.h> ++#include "pciback.h" ++ ++#define INVALID_EVTCHN_IRQ (-1) ++ ++static struct pciback_device *alloc_pdev(struct xenbus_device *xdev) ++{ ++ struct pciback_device *pdev; ++ ++ pdev = kzalloc(sizeof(struct pciback_device), GFP_KERNEL); ++ if (pdev == NULL) ++ goto out; ++ dev_dbg(&xdev->dev, "allocated pdev @ 0x%p\n", pdev); ++ ++ pdev->xdev = xdev; ++ xdev->dev.driver_data = pdev; ++ ++ spin_lock_init(&pdev->dev_lock); ++ ++ pdev->sh_area = NULL; ++ pdev->sh_info = NULL; ++ pdev->evtchn_irq = INVALID_EVTCHN_IRQ; ++ pdev->be_watching = 0; ++ ++ INIT_WORK(&pdev->op_work, pciback_do_op, pdev); ++ ++ if (pciback_init_devices(pdev)) { ++ kfree(pdev); ++ pdev = 
NULL; ++ } ++ out: ++ return pdev; ++} ++ ++static void free_pdev(struct pciback_device *pdev) ++{ ++ if (pdev->be_watching) ++ unregister_xenbus_watch(&pdev->be_watch); ++ ++ /* Ensure the guest can't trigger our handler before removing devices */ ++ if (pdev->evtchn_irq != INVALID_EVTCHN_IRQ) ++ unbind_from_irqhandler(pdev->evtchn_irq, pdev); ++ ++ /* If the driver domain started an op, make sure we complete it or ++ * delete it before releasing the shared memory */ ++ cancel_delayed_work(&pdev->op_work); ++ flush_scheduled_work(); ++ ++ if (pdev->sh_info) ++ xenbus_unmap_ring_vfree(pdev->xdev, pdev->sh_area); ++ ++ pciback_release_devices(pdev); ++ ++ pdev->xdev->dev.driver_data = NULL; ++ pdev->xdev = NULL; ++ ++ kfree(pdev); ++} ++ ++static int pciback_do_attach(struct pciback_device *pdev, int gnt_ref, ++ int remote_evtchn) ++{ ++ int err = 0; ++ struct vm_struct *area; ++ ++ dev_dbg(&pdev->xdev->dev, ++ "Attaching to frontend resources - gnt_ref=%d evtchn=%d\n", ++ gnt_ref, remote_evtchn); ++ ++ area = xenbus_map_ring_valloc(pdev->xdev, gnt_ref); ++ if (IS_ERR(area)) { ++ err = PTR_ERR(area); ++ goto out; ++ } ++ pdev->sh_area = area; ++ pdev->sh_info = area->addr; ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ pdev->xdev->otherend_id, remote_evtchn, pciback_handle_event, ++ SA_SAMPLE_RANDOM, "pciback", pdev); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error binding event channel to IRQ"); ++ goto out; ++ } ++ pdev->evtchn_irq = err; ++ err = 0; ++ ++ dev_dbg(&pdev->xdev->dev, "Attached!\n"); ++ out: ++ return err; ++} ++ ++static int pciback_attach(struct pciback_device *pdev) ++{ ++ int err = 0; ++ int gnt_ref, remote_evtchn; ++ char *magic = NULL; ++ ++ spin_lock(&pdev->dev_lock); ++ ++ /* Make sure we only do this setup once */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitialised) ++ goto out; ++ ++ /* Wait for frontend to state that it has published the configuration */ ++ if (xenbus_read_driver_state(pdev->xdev->otherend) != ++ XenbusStateInitialised) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "Reading frontend config\n"); ++ ++ err = xenbus_gather(XBT_NIL, pdev->xdev->otherend, ++ "pci-op-ref", "%u", &gnt_ref, ++ "event-channel", "%u", &remote_evtchn, ++ "magic", NULL, &magic, NULL); ++ if (err) { ++ /* If configuration didn't get read correctly, wait longer */ ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading configuration from frontend"); ++ goto out; ++ } ++ ++ if (magic == NULL || strcmp(magic, XEN_PCI_MAGIC) != 0) { ++ xenbus_dev_fatal(pdev->xdev, -EFAULT, ++ "version mismatch (%s/%s) with pcifront - " ++ "halting pciback", ++ magic, XEN_PCI_MAGIC); ++ goto out; ++ } ++ ++ err = pciback_do_attach(pdev, gnt_ref, remote_evtchn); ++ if (err) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "Connecting...\n"); ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); ++ if (err) ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching to connected state!"); ++ ++ dev_dbg(&pdev->xdev->dev, "Connected? 
%d\n", err); ++ out: ++ spin_unlock(&pdev->dev_lock); ++ ++ if (magic) ++ kfree(magic); ++ ++ return err; ++} ++ ++static void pciback_frontend_changed(struct xenbus_device *xdev, ++ enum xenbus_state fe_state) ++{ ++ struct pciback_device *pdev = xdev->dev.driver_data; ++ ++ dev_dbg(&xdev->dev, "fe state changed %d\n", fe_state); ++ ++ switch (fe_state) { ++ case XenbusStateInitialised: ++ pciback_attach(pdev); ++ break; ++ ++ case XenbusStateClosing: ++ xenbus_switch_state(xdev, XenbusStateClosing); ++ break; ++ ++ case XenbusStateUnknown: ++ case XenbusStateClosed: ++ dev_dbg(&xdev->dev, "frontend is gone! unregister device\n"); ++ device_unregister(&xdev->dev); ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++static int pciback_publish_pci_root(struct pciback_device *pdev, ++ unsigned int domain, unsigned int bus) ++{ ++ unsigned int d, b; ++ int i, root_num, len, err; ++ char str[64]; ++ ++ dev_dbg(&pdev->xdev->dev, "Publishing pci roots\n"); ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ "root_num", "%d", &root_num); ++ if (err == 0 || err == -ENOENT) ++ root_num = 0; ++ else if (err < 0) ++ goto out; ++ ++ /* Verify that we haven't already published this pci root */ ++ for (i = 0; i < root_num; i++) { ++ len = snprintf(str, sizeof(str), "root-%d", i); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, ++ str, "%x:%x", &d, &b); ++ if (err < 0) ++ goto out; ++ if (err != 2) { ++ err = -EINVAL; ++ goto out; ++ } ++ ++ if (d == domain && b == bus) { ++ err = 0; ++ goto out; ++ } ++ } ++ ++ len = snprintf(str, sizeof(str), "root-%d", root_num); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ dev_dbg(&pdev->xdev->dev, "writing root %d at %04x:%02x\n", ++ root_num, domain, bus); ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, str, ++ "%04x:%02x", domain, bus); ++ if (err) ++ goto out; ++ ++ err = xenbus_printf(XBT_NIL, pdev->xdev->nodename, ++ "root_num", "%d", (root_num + 1)); ++ ++ out: ++ return err; ++} ++ ++static int pciback_export_device(struct pciback_device *pdev, ++ int domain, int bus, int slot, int func) ++{ ++ struct pci_dev *dev; ++ int err = 0; ++ ++ dev_dbg(&pdev->xdev->dev, "exporting dom %x bus %x slot %x func %x\n", ++ domain, bus, slot, func); ++ ++ dev = pcistub_get_pci_dev_by_slot(pdev, domain, bus, slot, func); ++ if (!dev) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Couldn't locate PCI device " ++ "(%04x:%02x:%02x.%01x)! " ++ "perhaps already in-use?", ++ domain, bus, slot, func); ++ goto out; ++ } ++ ++ err = pciback_add_pci_dev(pdev, dev); ++ if (err) ++ goto out; ++ ++ /* TODO: It'd be nice to export a bridge and have all of its children ++ * get exported with it. This may be best done in xend (which will ++ * have to calculate resource usage anyway) but we probably want to ++ * put something in here to ensure that if a bridge gets given to a ++ * driver domain, that all devices under that bridge are not given ++ * to other driver domains (as he who controls the bridge can disable ++ * it and stop the other devices from working). 
++ */ ++ out: ++ return err; ++} ++ ++static int pciback_setup_backend(struct pciback_device *pdev) ++{ ++ /* Get configuration from xend (if available now) */ ++ int domain, bus, slot, func; ++ int err = 0; ++ int i, num_devs; ++ char dev_str[64]; ++ ++ spin_lock(&pdev->dev_lock); ++ ++ /* It's possible we could get the call to setup twice, so make sure ++ * we're not already connected. ++ */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitWait) ++ goto out; ++ ++ dev_dbg(&pdev->xdev->dev, "getting be setup\n"); ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, "num_devs", "%d", ++ &num_devs); ++ if (err != 1) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of devices"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_devs; i++) { ++ int l = snprintf(dev_str, sizeof(dev_str), "dev-%d", i); ++ if (unlikely(l >= (sizeof(dev_str) - 1))) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "String overflow while reading " ++ "configuration"); ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->nodename, dev_str, ++ "%x:%x:%x.%x", &domain, &bus, &slot, &func); ++ if (err < 0) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading device configuration"); ++ goto out; ++ } ++ if (err != 4) { ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error parsing pci device " ++ "configuration"); ++ goto out; ++ } ++ ++ err = pciback_export_device(pdev, domain, bus, slot, func); ++ if (err) ++ goto out; ++ } ++ ++ err = pciback_publish_pci_roots(pdev, pciback_publish_pci_root); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error while publishing PCI root buses " ++ "for frontend"); ++ goto out; ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateInitialised); ++ if (err) ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error switching to initialised state!"); ++ ++ out: ++ spin_unlock(&pdev->dev_lock); ++ ++ if (!err) ++ /* see if pcifront is already configured (if not, we'll wait) */ ++ pciback_attach(pdev); ++ ++ return err; ++} ++ ++static void pciback_be_watch(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ struct pciback_device *pdev = ++ container_of(watch, struct pciback_device, be_watch); ++ ++ switch (xenbus_read_driver_state(pdev->xdev->nodename)) { ++ case XenbusStateInitWait: ++ pciback_setup_backend(pdev); ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++static int pciback_xenbus_probe(struct xenbus_device *dev, ++ const struct xenbus_device_id *id) ++{ ++ int err = 0; ++ struct pciback_device *pdev = alloc_pdev(dev); ++ ++ if (pdev == NULL) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(dev, err, ++ "Error allocating pciback_device struct"); ++ goto out; ++ } ++ ++ /* wait for xend to configure us */ ++ err = xenbus_switch_state(dev, XenbusStateInitWait); ++ if (err) ++ goto out; ++ ++ /* watch the backend node for backend configuration information */ ++ err = xenbus_watch_path(dev, dev->nodename, &pdev->be_watch, ++ pciback_be_watch); ++ if (err) ++ goto out; ++ pdev->be_watching = 1; ++ ++ /* We need to force a call to our callback here in case ++ * xend already configured us! 
++ */ ++ pciback_be_watch(&pdev->be_watch, NULL, 0); ++ ++ out: ++ return err; ++} ++ ++static int pciback_xenbus_remove(struct xenbus_device *dev) ++{ ++ struct pciback_device *pdev = dev->dev.driver_data; ++ ++ if (pdev != NULL) ++ free_pdev(pdev); ++ ++ return 0; ++} ++ ++static struct xenbus_device_id xenpci_ids[] = { ++ {"pci"}, ++ {{0}}, ++}; ++ ++static struct xenbus_driver xenbus_pciback_driver = { ++ .name = "pciback", ++ .owner = THIS_MODULE, ++ .ids = xenpci_ids, ++ .probe = pciback_xenbus_probe, ++ .remove = pciback_xenbus_remove, ++ .otherend_changed = pciback_frontend_changed, ++}; ++ ++int __init pciback_xenbus_register(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ return xenbus_register_backend(&xenbus_pciback_driver); ++} ++ ++void __exit pciback_xenbus_unregister(void) ++{ ++ xenbus_unregister_driver(&xenbus_pciback_driver); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pcifront/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,7 @@ ++obj-y += pcifront.o ++ ++pcifront-y := pci_op.o xenbus.o pci.o ++ ++ifeq ($(CONFIG_XEN_PCIDEV_FE_DEBUG),y) ++EXTRA_CFLAGS += -DDEBUG ++endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pcifront/pci.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,46 @@ ++/* ++ * PCI Frontend Operations - ensure only one PCI frontend runs at a time ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include "pcifront.h" ++ ++DEFINE_SPINLOCK(pcifront_dev_lock); ++static struct pcifront_device *pcifront_dev = NULL; ++ ++int pcifront_connect(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ ++ spin_lock(&pcifront_dev_lock); ++ ++ if (!pcifront_dev) { ++ dev_info(&pdev->xdev->dev, "Installing PCI frontend\n"); ++ pcifront_dev = pdev; ++ } ++ else { ++ dev_err(&pdev->xdev->dev, "PCI frontend already installed!\n"); ++ err = -EEXIST; ++ } ++ ++ spin_unlock(&pcifront_dev_lock); ++ ++ return err; ++} ++ ++void pcifront_disconnect(struct pcifront_device *pdev) ++{ ++ spin_lock(&pcifront_dev_lock); ++ ++ if (pdev == pcifront_dev) { ++ dev_info(&pdev->xdev->dev, ++ "Disconnecting PCI Frontend Buses\n"); ++ pcifront_dev = NULL; ++ } ++ ++ spin_unlock(&pcifront_dev_lock); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pcifront/pci_op.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,268 @@ ++/* ++ * PCI Frontend Operations - Communicates with frontend ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/version.h> ++#include <linux/init.h> ++#include <linux/pci.h> ++#include <linux/spinlock.h> ++#include <linux/time.h> ++#include <xen/evtchn.h> ++#include "pcifront.h" ++ ++static int verbose_request = 0; ++module_param(verbose_request, int, 0644); ++ ++static int errno_to_pcibios_err(int errno) ++{ ++ switch (errno) { ++ case XEN_PCI_ERR_success: ++ return PCIBIOS_SUCCESSFUL; ++ ++ case XEN_PCI_ERR_dev_not_found: ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ case XEN_PCI_ERR_invalid_offset: ++ case XEN_PCI_ERR_op_failed: ++ return PCIBIOS_BAD_REGISTER_NUMBER; ++ ++ case XEN_PCI_ERR_not_implemented: ++ return PCIBIOS_FUNC_NOT_SUPPORTED; ++ ++ case XEN_PCI_ERR_access_denied: ++ return PCIBIOS_SET_FAILED; ++ } ++ return errno; ++} ++ ++static int do_pci_op(struct pcifront_device *pdev, struct xen_pci_op *op) ++{ ++ int err = 0; ++ struct xen_pci_op *active_op = &pdev->sh_info->op; ++ unsigned long 
irq_flags; ++ evtchn_port_t port = pdev->evtchn; ++ s64 ns, ns_timeout; ++ struct timeval tv; ++ ++ spin_lock_irqsave(&pdev->sh_info_lock, irq_flags); ++ ++ memcpy(active_op, op, sizeof(struct xen_pci_op)); ++ ++ /* Go */ ++ wmb(); ++ set_bit(_XEN_PCIF_active, (unsigned long *)&pdev->sh_info->flags); ++ notify_remote_via_evtchn(port); ++ ++ /* ++ * We set a poll timeout of 3 seconds but give up on return after ++ * 2 seconds. It is better to time out too late rather than too early ++ * (in the latter case we end up continually re-executing poll() with a ++ * timeout in the past). 1s difference gives plenty of slack for error. ++ */ ++ do_gettimeofday(&tv); ++ ns_timeout = timeval_to_ns(&tv) + 2 * (s64)NSEC_PER_SEC; ++ ++ clear_evtchn(port); ++ ++ while (test_bit(_XEN_PCIF_active, ++ (unsigned long *)&pdev->sh_info->flags)) { ++ if (HYPERVISOR_poll(&port, 1, jiffies + 3*HZ)) ++ BUG(); ++ clear_evtchn(port); ++ do_gettimeofday(&tv); ++ ns = timeval_to_ns(&tv); ++ if (ns > ns_timeout) { ++ dev_err(&pdev->xdev->dev, ++ "pciback not responding!!!\n"); ++ clear_bit(_XEN_PCIF_active, ++ (unsigned long *)&pdev->sh_info->flags); ++ err = XEN_PCI_ERR_dev_not_found; ++ goto out; ++ } ++ } ++ ++ memcpy(op, active_op, sizeof(struct xen_pci_op)); ++ ++ err = op->err; ++ out: ++ spin_unlock_irqrestore(&pdev->sh_info_lock, irq_flags); ++ return err; ++} ++ ++/* Access to this function is spinlocked in drivers/pci/access.c */ ++static int pcifront_bus_read(struct pci_bus *bus, unsigned int devfn, ++ int where, int size, u32 * val) ++{ ++ int err = 0; ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_conf_read, ++ .domain = pci_domain_nr(bus), ++ .bus = bus->number, ++ .devfn = devfn, ++ .offset = where, ++ .size = size, ++ }; ++ struct pcifront_sd *sd = bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ if (verbose_request) ++ dev_info(&pdev->xdev->dev, ++ "read dev=%04x:%02x:%02x.%01x - offset %x size %d\n", ++ pci_domain_nr(bus), bus->number, PCI_SLOT(devfn), ++ PCI_FUNC(devfn), where, size); ++ ++ err = do_pci_op(pdev, &op); ++ ++ if (likely(!err)) { ++ if (verbose_request) ++ dev_info(&pdev->xdev->dev, "read got back value %x\n", ++ op.value); ++ ++ *val = op.value; ++ } else if (err == -ENODEV) { ++ /* No device here, pretend that it just returned 0 */ ++ err = 0; ++ *val = 0; ++ } ++ ++ return errno_to_pcibios_err(err); ++} ++ ++/* Access to this function is spinlocked in drivers/pci/access.c */ ++static int pcifront_bus_write(struct pci_bus *bus, unsigned int devfn, ++ int where, int size, u32 val) ++{ ++ struct xen_pci_op op = { ++ .cmd = XEN_PCI_OP_conf_write, ++ .domain = pci_domain_nr(bus), ++ .bus = bus->number, ++ .devfn = devfn, ++ .offset = where, ++ .size = size, ++ .value = val, ++ }; ++ struct pcifront_sd *sd = bus->sysdata; ++ struct pcifront_device *pdev = pcifront_get_pdev(sd); ++ ++ if (verbose_request) ++ dev_info(&pdev->xdev->dev, ++ "write dev=%04x:%02x:%02x.%01x - " ++ "offset %x size %d val %x\n", ++ pci_domain_nr(bus), bus->number, ++ PCI_SLOT(devfn), PCI_FUNC(devfn), where, size, val); ++ ++ return errno_to_pcibios_err(do_pci_op(pdev, &op)); ++} ++ ++struct pci_ops pcifront_bus_ops = { ++ .read = pcifront_bus_read, ++ .write = pcifront_bus_write, ++}; ++ ++/* Claim resources for the PCI frontend as-is, backend won't allow changes */ ++static void pcifront_claim_resource(struct pci_dev *dev, void *data) ++{ ++ struct pcifront_device *pdev = data; ++ int i; ++ struct resource *r; ++ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ r = &dev->resource[i]; ++ 
++ if (!r->parent && r->start && r->flags) { ++ dev_dbg(&pdev->xdev->dev, "claiming resource %s/%d\n", ++ pci_name(dev), i); ++ pci_claim_resource(dev, i); ++ } ++ } ++} ++ ++int pcifront_scan_root(struct pcifront_device *pdev, ++ unsigned int domain, unsigned int bus) ++{ ++ struct pci_bus *b; ++ struct pcifront_sd *sd = NULL; ++ struct pci_bus_entry *bus_entry = NULL; ++ int err = 0; ++ ++#ifndef CONFIG_PCI_DOMAINS ++ if (domain != 0) { ++ dev_err(&pdev->xdev->dev, ++ "PCI Root in non-zero PCI Domain! domain=%d\n", domain); ++ dev_err(&pdev->xdev->dev, ++ "Please compile with CONFIG_PCI_DOMAINS\n"); ++ err = -EINVAL; ++ goto err_out; ++ } ++#endif ++ ++ dev_info(&pdev->xdev->dev, "Creating PCI Frontend Bus %04x:%02x\n", ++ domain, bus); ++ ++ bus_entry = kmalloc(sizeof(*bus_entry), GFP_KERNEL); ++ sd = kmalloc(sizeof(*sd), GFP_KERNEL); ++ if (!bus_entry || !sd) { ++ err = -ENOMEM; ++ goto err_out; ++ } ++ pcifront_init_sd(sd, domain, pdev); ++ ++ b = pci_scan_bus_parented(&pdev->xdev->dev, bus, ++ &pcifront_bus_ops, sd); ++ if (!b) { ++ dev_err(&pdev->xdev->dev, ++ "Error creating PCI Frontend Bus!\n"); ++ err = -ENOMEM; ++ goto err_out; ++ } ++ bus_entry->bus = b; ++ ++ list_add(&bus_entry->list, &pdev->root_buses); ++ ++ /* Claim resources before going "live" with our devices */ ++ pci_walk_bus(b, pcifront_claim_resource, pdev); ++ ++ pci_bus_add_devices(b); ++ ++ return 0; ++ ++ err_out: ++ kfree(bus_entry); ++ kfree(sd); ++ ++ return err; ++} ++ ++static void free_root_bus_devs(struct pci_bus *bus) ++{ ++ struct pci_dev *dev; ++ ++ while (!list_empty(&bus->devices)) { ++ dev = container_of(bus->devices.next, struct pci_dev, ++ bus_list); ++ dev_dbg(&dev->dev, "removing device\n"); ++ pci_remove_bus_device(dev); ++ } ++} ++ ++void pcifront_free_roots(struct pcifront_device *pdev) ++{ ++ struct pci_bus_entry *bus_entry, *t; ++ ++ dev_dbg(&pdev->xdev->dev, "cleaning up root buses\n"); ++ ++ list_for_each_entry_safe(bus_entry, t, &pdev->root_buses, list) { ++ list_del(&bus_entry->list); ++ ++ free_root_bus_devs(bus_entry->bus); ++ ++ kfree(bus_entry->bus->sysdata); ++ ++ device_unregister(bus_entry->bus->bridge); ++ pci_remove_bus(bus_entry->bus); ++ ++ kfree(bus_entry); ++ } ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pcifront/pcifront.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,40 @@ ++/* ++ * PCI Frontend - Common data structures & function declarations ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#ifndef __XEN_PCIFRONT_H__ ++#define __XEN_PCIFRONT_H__ ++ ++#include <linux/spinlock.h> ++#include <linux/pci.h> ++#include <xen/xenbus.h> ++#include <xen/interface/io/pciif.h> ++#include <xen/pcifront.h> ++ ++struct pci_bus_entry { ++ struct list_head list; ++ struct pci_bus *bus; ++}; ++ ++struct pcifront_device { ++ struct xenbus_device *xdev; ++ struct list_head root_buses; ++ spinlock_t dev_lock; ++ ++ int evtchn; ++ int gnt_ref; ++ ++ /* Lock this when doing any operations in sh_info */ ++ spinlock_t sh_info_lock; ++ struct xen_pci_sharedinfo *sh_info; ++}; ++ ++int pcifront_connect(struct pcifront_device *pdev); ++void pcifront_disconnect(struct pcifront_device *pdev); ++ ++int pcifront_scan_root(struct pcifront_device *pdev, ++ unsigned int domain, unsigned int bus); ++void pcifront_free_roots(struct pcifront_device *pdev); ++ ++#endif /* __XEN_PCIFRONT_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/pcifront/xenbus.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,295 @@ ++/* ++ * PCI Frontend Xenbus 
Setup - handles setup with backend (imports page/evtchn) ++ * ++ * Author: Ryan Wilson <hap9@epoch.ncsc.mil> ++ */ ++#include <linux/module.h> ++#include <linux/init.h> ++#include <linux/mm.h> ++#include <xen/xenbus.h> ++#include <xen/gnttab.h> ++#include "pcifront.h" ++ ++#define INVALID_GRANT_REF (0) ++#define INVALID_EVTCHN (-1) ++ ++static struct pcifront_device *alloc_pdev(struct xenbus_device *xdev) ++{ ++ struct pcifront_device *pdev; ++ ++ pdev = kmalloc(sizeof(struct pcifront_device), GFP_KERNEL); ++ if (pdev == NULL) ++ goto out; ++ ++ pdev->sh_info = ++ (struct xen_pci_sharedinfo *)__get_free_page(GFP_KERNEL); ++ if (pdev->sh_info == NULL) { ++ kfree(pdev); ++ pdev = NULL; ++ goto out; ++ } ++ pdev->sh_info->flags = 0; ++ ++ xdev->dev.driver_data = pdev; ++ pdev->xdev = xdev; ++ ++ INIT_LIST_HEAD(&pdev->root_buses); ++ ++ spin_lock_init(&pdev->dev_lock); ++ spin_lock_init(&pdev->sh_info_lock); ++ ++ pdev->evtchn = INVALID_EVTCHN; ++ pdev->gnt_ref = INVALID_GRANT_REF; ++ ++ dev_dbg(&xdev->dev, "Allocated pdev @ 0x%p pdev->sh_info @ 0x%p\n", ++ pdev, pdev->sh_info); ++ out: ++ return pdev; ++} ++ ++static void free_pdev(struct pcifront_device *pdev) ++{ ++ dev_dbg(&pdev->xdev->dev, "freeing pdev @ 0x%p\n", pdev); ++ ++ pcifront_free_roots(pdev); ++ ++ if (pdev->evtchn != INVALID_EVTCHN) ++ xenbus_free_evtchn(pdev->xdev, pdev->evtchn); ++ ++ if (pdev->gnt_ref != INVALID_GRANT_REF) ++ gnttab_end_foreign_access(pdev->gnt_ref, 0, ++ (unsigned long)pdev->sh_info); ++ ++ pdev->xdev->dev.driver_data = NULL; ++ ++ kfree(pdev); ++} ++ ++static int pcifront_publish_info(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ struct xenbus_transaction trans; ++ ++ err = xenbus_grant_ring(pdev->xdev, virt_to_mfn(pdev->sh_info)); ++ if (err < 0) ++ goto out; ++ ++ pdev->gnt_ref = err; ++ ++ err = xenbus_alloc_evtchn(pdev->xdev, &pdev->evtchn); ++ if (err) ++ goto out; ++ ++ do_publish: ++ err = xenbus_transaction_start(&trans); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error writing configuration for backend " ++ "(start transaction)"); ++ goto out; ++ } ++ ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "pci-op-ref", "%u", pdev->gnt_ref); ++ if (!err) ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "event-channel", "%u", pdev->evtchn); ++ if (!err) ++ err = xenbus_printf(trans, pdev->xdev->nodename, ++ "magic", XEN_PCI_MAGIC); ++ ++ if (err) { ++ xenbus_transaction_end(trans, 1); ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error writing configuration for backend"); ++ goto out; ++ } else { ++ err = xenbus_transaction_end(trans, 0); ++ if (err == -EAGAIN) ++ goto do_publish; ++ else if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error completing transaction " ++ "for backend"); ++ goto out; ++ } ++ } ++ ++ xenbus_switch_state(pdev->xdev, XenbusStateInitialised); ++ ++ dev_dbg(&pdev->xdev->dev, "publishing successful!\n"); ++ ++ out: ++ return err; ++} ++ ++static int pcifront_try_connect(struct pcifront_device *pdev) ++{ ++ int err = -EFAULT; ++ int i, num_roots, len; ++ char str[64]; ++ unsigned int domain, bus; ++ ++ spin_lock(&pdev->dev_lock); ++ ++ /* Only connect once */ ++ if (xenbus_read_driver_state(pdev->xdev->nodename) != ++ XenbusStateInitialised) ++ goto out; ++ ++ err = pcifront_connect(pdev); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error connecting PCI Frontend"); ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, ++ "root_num", "%d", &num_roots); ++ if (err == -ENOENT) { ++ xenbus_dev_error(pdev->xdev, err, ++ "No 
PCI Roots found, trying 0000:00"); ++ err = pcifront_scan_root(pdev, 0, 0); ++ num_roots = 0; ++ } else if (err != 1) { ++ if (err == 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading number of PCI roots"); ++ goto out; ++ } ++ ++ for (i = 0; i < num_roots; i++) { ++ len = snprintf(str, sizeof(str), "root-%d", i); ++ if (unlikely(len >= (sizeof(str) - 1))) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xenbus_scanf(XBT_NIL, pdev->xdev->otherend, str, ++ "%x:%x", &domain, &bus); ++ if (err != 2) { ++ if (err >= 0) ++ err = -EINVAL; ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error reading PCI root %d", i); ++ goto out; ++ } ++ ++ err = pcifront_scan_root(pdev, domain, bus); ++ if (err) { ++ xenbus_dev_fatal(pdev->xdev, err, ++ "Error scanning PCI root %04x:%02x", ++ domain, bus); ++ goto out; ++ } ++ } ++ ++ err = xenbus_switch_state(pdev->xdev, XenbusStateConnected); ++ if (err) ++ goto out; ++ ++ out: ++ spin_unlock(&pdev->dev_lock); ++ return err; ++} ++ ++static int pcifront_try_disconnect(struct pcifront_device *pdev) ++{ ++ int err = 0; ++ enum xenbus_state prev_state; ++ ++ spin_lock(&pdev->dev_lock); ++ ++ prev_state = xenbus_read_driver_state(pdev->xdev->nodename); ++ ++ if (prev_state < XenbusStateClosing) ++ err = xenbus_switch_state(pdev->xdev, XenbusStateClosing); ++ ++ if (!err && prev_state == XenbusStateConnected) ++ pcifront_disconnect(pdev); ++ ++ spin_unlock(&pdev->dev_lock); ++ ++ return err; ++} ++ ++static void pcifront_backend_changed(struct xenbus_device *xdev, ++ enum xenbus_state be_state) ++{ ++ struct pcifront_device *pdev = xdev->dev.driver_data; ++ ++ switch (be_state) { ++ case XenbusStateClosing: ++ dev_warn(&xdev->dev, "backend going away!\n"); ++ pcifront_try_disconnect(pdev); ++ break; ++ ++ case XenbusStateUnknown: ++ case XenbusStateClosed: ++ dev_warn(&xdev->dev, "backend went away!\n"); ++ pcifront_try_disconnect(pdev); ++ ++ device_unregister(&pdev->xdev->dev); ++ break; ++ ++ case XenbusStateConnected: ++ pcifront_try_connect(pdev); ++ break; ++ ++ default: ++ break; ++ } ++} ++ ++static int pcifront_xenbus_probe(struct xenbus_device *xdev, ++ const struct xenbus_device_id *id) ++{ ++ int err = 0; ++ struct pcifront_device *pdev = alloc_pdev(xdev); ++ ++ if (pdev == NULL) { ++ err = -ENOMEM; ++ xenbus_dev_fatal(xdev, err, ++ "Error allocating pcifront_device struct"); ++ goto out; ++ } ++ ++ err = pcifront_publish_info(pdev); ++ ++ out: ++ return err; ++} ++ ++static int pcifront_xenbus_remove(struct xenbus_device *xdev) ++{ ++ if (xdev->dev.driver_data) ++ free_pdev(xdev->dev.driver_data); ++ ++ return 0; ++} ++ ++static struct xenbus_device_id xenpci_ids[] = { ++ {"pci"}, ++ {{0}}, ++}; ++ ++static struct xenbus_driver xenbus_pcifront_driver = { ++ .name = "pcifront", ++ .owner = THIS_MODULE, ++ .ids = xenpci_ids, ++ .probe = pcifront_xenbus_probe, ++ .remove = pcifront_xenbus_remove, ++ .otherend_changed = pcifront_backend_changed, ++}; ++ ++static int __init pcifront_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ return xenbus_register_frontend(&xenbus_pcifront_driver); ++} ++ ++/* Initialize after the Xen PCI Frontend Stub is initialized */ ++subsys_initcall(pcifront_init); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/privcmd/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,2 @@ ++ ++obj-$(CONFIG_XEN_PRIVCMD) := privcmd.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/privcmd/privcmd.c 2007-08-27 14:02:05.000000000 -0400 +@@ -0,0 +1,284 @@ 
++/****************************************************************************** ++ * privcmd.c ++ * ++ * Interface to privileged domain-0 commands. ++ * ++ * Copyright (c) 2002-2004, K A Fraser, B Dragovic ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/string.h> ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/swap.h> ++#include <linux/smp_lock.h> ++#include <linux/highmem.h> ++#include <linux/pagemap.h> ++#include <linux/seq_file.h> ++#include <linux/kthread.h> ++#include <asm/hypervisor.h> ++ ++#include <asm/pgalloc.h> ++#include <asm/pgtable.h> ++#include <asm/uaccess.h> ++#include <asm/tlb.h> ++#include <asm/hypervisor.h> ++#include <xen/public/privcmd.h> ++#include <xen/interface/xen.h> ++#include <xen/xen_proc.h> ++ ++static struct proc_dir_entry *privcmd_intf; ++static struct proc_dir_entry *capabilities_intf; ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma); ++#endif ++ ++static int privcmd_ioctl(struct inode *inode, struct file *file, ++ unsigned int cmd, unsigned long data) ++{ ++ int ret = -ENOSYS; ++ void __user *udata = (void __user *) data; ++ ++ switch (cmd) { ++ case IOCTL_PRIVCMD_HYPERCALL: { ++ privcmd_hypercall_t hypercall; ++ ++ if (copy_from_user(&hypercall, udata, sizeof(hypercall))) ++ return -EFAULT; ++ ++#if defined(__i386__) ++ if (hypercall.op >= (PAGE_SIZE >> 5)) ++ break; ++ __asm__ __volatile__ ( ++ "pushl %%ebx; pushl %%ecx; pushl %%edx; " ++ "pushl %%esi; pushl %%edi; " ++ "movl 8(%%eax),%%ebx ;" ++ "movl 16(%%eax),%%ecx ;" ++ "movl 24(%%eax),%%edx ;" ++ "movl 32(%%eax),%%esi ;" ++ "movl 40(%%eax),%%edi ;" ++ "movl (%%eax),%%eax ;" ++ "shll $5,%%eax ;" ++ "addl $hypercall_page,%%eax ;" ++ "call *%%eax ;" ++ "popl %%edi; popl %%esi; popl %%edx; " ++ "popl %%ecx; popl %%ebx" ++ : "=a" (ret) : "0" (&hypercall) : "memory" ); ++#elif defined (__x86_64__) ++ if (hypercall.op < (PAGE_SIZE >> 5)) { ++ long ign1, ign2, ign3; ++ __asm__ __volatile__ ( ++ "movq %8,%%r10; movq %9,%%r8;" ++ "shll $5,%%eax ;" ++ "addq $hypercall_page,%%rax ;" ++ "call *%%rax" ++ : "=a" (ret), "=D" (ign1), ++ "=S" (ign2), "=d" (ign3) ++ : "0" ((unsigned int)hypercall.op), ++ "1" (hypercall.arg[0]), ++ "2" (hypercall.arg[1]), ++ "3" (hypercall.arg[2]), ++ "g" (hypercall.arg[3]), ++ "g" (hypercall.arg[4]) ++ : "r8", "r10", "memory" ); ++ } ++#elif defined (__ia64__) ++ ret = privcmd_hypercall(&hypercall); ++#endif ++ } ++ break; ++ ++ case IOCTL_PRIVCMD_MMAP: { ++ privcmd_mmap_t mmapcmd; ++ privcmd_mmap_entry_t msg; ++ privcmd_mmap_entry_t __user *p; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ unsigned long va; ++ int i, rc; ++ ++ if (!is_initial_xendomain()) ++ return -EPERM; ++ ++ if (copy_from_user(&mmapcmd, udata, sizeof(mmapcmd))) ++ return -EFAULT; ++ ++ p = mmapcmd.entry; ++ if (copy_from_user(&msg, p, sizeof(msg))) ++ return -EFAULT; ++ ++ down_read(&mm->mmap_sem); ++ ++ vma = find_vma(mm, msg.va); ++ rc = -EINVAL; ++ if (!vma || (msg.va != vma->vm_start) || ++ !privcmd_enforce_singleshot_mapping(vma)) ++ goto mmap_out; ++ ++ va = vma->vm_start; ++ ++ for (i = 0; i < mmapcmd.num; i++) { ++ rc = -EFAULT; ++ if (copy_from_user(&msg, p, sizeof(msg))) ++ goto mmap_out; ++ ++ /* Do not allow range to wrap the address space. 
*/ ++ rc = -EINVAL; ++ if ((msg.npages > (LONG_MAX >> PAGE_SHIFT)) || ++ ((unsigned long)(msg.npages << PAGE_SHIFT) >= -va)) ++ goto mmap_out; ++ ++ /* Range chunks must be contiguous in va space. */ ++ if ((msg.va != va) || ++ ((msg.va+(msg.npages<<PAGE_SHIFT)) > vma->vm_end)) ++ goto mmap_out; ++ ++ if ((rc = direct_remap_pfn_range( ++ vma, ++ msg.va & PAGE_MASK, ++ msg.mfn, ++ msg.npages << PAGE_SHIFT, ++ vma->vm_page_prot, ++ mmapcmd.dom)) < 0) ++ goto mmap_out; ++ ++ p++; ++ va += msg.npages << PAGE_SHIFT; ++ } ++ ++ rc = 0; ++ ++ mmap_out: ++ up_read(&mm->mmap_sem); ++ ret = rc; ++ } ++ break; ++ ++ case IOCTL_PRIVCMD_MMAPBATCH: { ++ privcmd_mmapbatch_t m; ++ struct mm_struct *mm = current->mm; ++ struct vm_area_struct *vma; ++ xen_pfn_t __user *p; ++ unsigned long addr, mfn, nr_pages; ++ int i; ++ ++ if (!is_initial_xendomain()) ++ return -EPERM; ++ ++ if (copy_from_user(&m, udata, sizeof(m))) ++ return -EFAULT; ++ ++ nr_pages = m.num; ++ if ((m.num <= 0) || (nr_pages > (LONG_MAX >> PAGE_SHIFT))) ++ return -EINVAL; ++ ++ down_read(&mm->mmap_sem); ++ ++ vma = find_vma(mm, m.addr); ++ if (!vma || ++ (m.addr != vma->vm_start) || ++ ((m.addr + (nr_pages << PAGE_SHIFT)) != vma->vm_end) || ++ !privcmd_enforce_singleshot_mapping(vma)) { ++ up_read(&mm->mmap_sem); ++ return -EINVAL; ++ } ++ ++ p = m.arr; ++ addr = m.addr; ++ for (i = 0; i < nr_pages; i++, addr += PAGE_SIZE, p++) { ++ if (get_user(mfn, p)) { ++ up_read(&mm->mmap_sem); ++ return -EFAULT; ++ } ++ ++ ret = direct_remap_pfn_range(vma, addr & PAGE_MASK, ++ mfn, PAGE_SIZE, ++ vma->vm_page_prot, m.dom); ++ if (ret < 0) ++ put_user(0xF0000000 | mfn, p); ++ } ++ ++ up_read(&mm->mmap_sem); ++ ret = 0; ++ } ++ break; ++ ++ default: ++ ret = -EINVAL; ++ break; ++ } ++ ++ return ret; ++} ++ ++#ifndef HAVE_ARCH_PRIVCMD_MMAP ++static struct page *privcmd_nopage(struct vm_area_struct *vma, ++ unsigned long address, ++ int *type) ++{ ++ return NOPAGE_SIGBUS; ++} ++ ++static struct vm_operations_struct privcmd_vm_ops = { ++ .nopage = privcmd_nopage ++}; ++ ++static int privcmd_mmap(struct file * file, struct vm_area_struct * vma) ++{ ++ /* Unsupported for auto-translate guests. */ ++ if (xen_feature(XENFEAT_auto_translated_physmap)) ++ return -ENOSYS; ++ ++ /* DONTCOPY is essential for Xen as copy_page_range is broken. 
*/ ++ vma->vm_flags |= VM_RESERVED | VM_IO | VM_DONTCOPY; ++ vma->vm_ops = &privcmd_vm_ops; ++ vma->vm_private_data = NULL; ++ ++ return 0; ++} ++ ++static int privcmd_enforce_singleshot_mapping(struct vm_area_struct *vma) ++{ ++ return (xchg(&vma->vm_private_data, (void *)1) == NULL); ++} ++#endif ++ ++static const struct file_operations privcmd_file_ops = { ++ .ioctl = privcmd_ioctl, ++ .mmap = privcmd_mmap, ++}; ++ ++static int capabilities_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len = 0; ++ *page = 0; ++ ++ if (is_initial_xendomain()) ++ len = sprintf( page, "control_d\n" ); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int __init privcmd_init(void) ++{ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ privcmd_intf = create_xen_proc_entry("privcmd", 0400); ++ if (privcmd_intf != NULL) ++ privcmd_intf->proc_fops = &privcmd_file_ops; ++ ++ capabilities_intf = create_xen_proc_entry("capabilities", 0400 ); ++ if (capabilities_intf != NULL) ++ capabilities_intf->read_proc = capabilities_read; ++ ++ return 0; ++} ++ ++__initcall(privcmd_init); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/tpmback/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,4 @@ ++ ++obj-$(CONFIG_XEN_TPMDEV_BACKEND) += tpmbk.o ++ ++tpmbk-y += tpmback.o interface.o xenbus.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/tpmback/common.h 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,85 @@ ++/****************************************************************************** ++ * drivers/xen/tpmback/common.h ++ */ ++ ++#ifndef __TPM__BACKEND__COMMON_H__ ++#define __TPM__BACKEND__COMMON_H__ ++ ++#include <linux/version.h> ++#include <linux/module.h> ++#include <linux/interrupt.h> ++#include <linux/slab.h> ++#include <xen/evtchn.h> ++#include <xen/driver_util.h> ++#include <xen/interface/grant_table.h> ++#include <xen/interface/io/tpmif.h> ++#include <asm/io.h> ++#include <asm/pgalloc.h> ++ ++#define DPRINTK(_f, _a...) \ ++ pr_debug("(file=%s, line=%d) " _f, \ ++ __FILE__ , __LINE__ , ## _a ) ++ ++struct backend_info; ++ ++typedef struct tpmif_st { ++ struct list_head tpmif_list; ++ /* Unique identifier for this interface. */ ++ domid_t domid; ++ unsigned int handle; ++ ++ /* Physical parameters of the comms window. */ ++ unsigned int irq; ++ ++ /* The shared rings and indexes. */ ++ tpmif_tx_interface_t *tx; ++ struct vm_struct *tx_area; ++ ++ /* Miscellaneous private stuff. 
*/ ++ enum { DISCONNECTED, DISCONNECTING, CONNECTED } status; ++ int active; ++ ++ struct tpmif_st *hash_next; ++ struct list_head list; /* scheduling list */ ++ atomic_t refcnt; ++ ++ struct backend_info *bi; ++ ++ grant_handle_t shmem_handle; ++ grant_ref_t shmem_ref; ++ struct page **mmap_pages; ++ ++ char devname[20]; ++} tpmif_t; ++ ++void tpmif_disconnect_complete(tpmif_t * tpmif); ++tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi); ++void tpmif_interface_init(void); ++void tpmif_interface_exit(void); ++void tpmif_schedule_work(tpmif_t * tpmif); ++void tpmif_deschedule_work(tpmif_t * tpmif); ++void tpmif_xenbus_init(void); ++void tpmif_xenbus_exit(void); ++int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn); ++irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs); ++ ++long int tpmback_get_instance(struct backend_info *bi); ++ ++int vtpm_release_packets(tpmif_t * tpmif, int send_msgs); ++ ++ ++#define tpmif_get(_b) (atomic_inc(&(_b)->refcnt)) ++#define tpmif_put(_b) \ ++ do { \ ++ if (atomic_dec_and_test(&(_b)->refcnt)) \ ++ tpmif_disconnect_complete(_b); \ ++ } while (0) ++ ++extern int num_frontends; ++ ++static inline unsigned long idx_to_kaddr(tpmif_t *t, unsigned int idx) ++{ ++ return (unsigned long)pfn_to_kaddr(page_to_pfn(t->mmap_pages[idx])); ++} ++ ++#endif /* __TPM__BACKEND__COMMON_H__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/tpmback/interface.c 2007-08-27 14:02:01.000000000 -0400 +@@ -0,0 +1,167 @@ ++ /***************************************************************************** ++ * drivers/xen/tpmback/interface.c ++ * ++ * Virtual TPM interface management. ++ * ++ * Copyright (c) 2005, IBM Corporation ++ * ++ * Author: Stefan Berger, stefanb@us.ibm.com ++ * ++ * This code has been derived from drivers/xen/netback/interface.c ++ * Copyright (c) 2004, Keir Fraser ++ */ ++ ++#include "common.h" ++#include <xen/balloon.h> ++#include <xen/gnttab.h> ++ ++static kmem_cache_t *tpmif_cachep; ++int num_frontends = 0; ++ ++LIST_HEAD(tpmif_list); ++ ++static tpmif_t *alloc_tpmif(domid_t domid, struct backend_info *bi) ++{ ++ tpmif_t *tpmif; ++ ++ tpmif = kmem_cache_alloc(tpmif_cachep, GFP_KERNEL); ++ if (tpmif == NULL) ++ goto out_of_memory; ++ ++ memset(tpmif, 0, sizeof (*tpmif)); ++ tpmif->domid = domid; ++ tpmif->status = DISCONNECTED; ++ tpmif->bi = bi; ++ snprintf(tpmif->devname, sizeof(tpmif->devname), "tpmif%d", domid); ++ atomic_set(&tpmif->refcnt, 1); ++ ++ tpmif->mmap_pages = alloc_empty_pages_and_pagevec(TPMIF_TX_RING_SIZE); ++ if (tpmif->mmap_pages == NULL) ++ goto out_of_memory; ++ ++ list_add(&tpmif->tpmif_list, &tpmif_list); ++ num_frontends++; ++ ++ return tpmif; ++ ++ out_of_memory: ++ if (tpmif != NULL) ++ kmem_cache_free(tpmif_cachep, tpmif); ++ printk("%s: out of memory\n", __FUNCTION__); ++ return ERR_PTR(-ENOMEM); ++} ++ ++static void free_tpmif(tpmif_t * tpmif) ++{ ++ num_frontends--; ++ list_del(&tpmif->tpmif_list); ++ free_empty_pages_and_pagevec(tpmif->mmap_pages, TPMIF_TX_RING_SIZE); ++ kmem_cache_free(tpmif_cachep, tpmif); ++} ++ ++tpmif_t *tpmif_find(domid_t domid, struct backend_info *bi) ++{ ++ tpmif_t *tpmif; ++ ++ list_for_each_entry(tpmif, &tpmif_list, tpmif_list) { ++ if (tpmif->bi == bi) { ++ if (tpmif->domid == domid) { ++ tpmif_get(tpmif); ++ return tpmif; ++ } else { ++ return ERR_PTR(-EEXIST); ++ } ++ } ++ } ++ ++ return alloc_tpmif(domid, bi); ++} ++ ++static int map_frontend_page(tpmif_t *tpmif, unsigned long shared_page) ++{ ++ struct 
gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)tpmif->tx_area->addr, ++ GNTMAP_host_map, shared_page, tpmif->domid); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return op.status; ++ } ++ ++ tpmif->shmem_ref = shared_page; ++ tpmif->shmem_handle = op.handle; ++ ++ return 0; ++} ++ ++static void unmap_frontend_page(tpmif_t *tpmif) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)tpmif->tx_area->addr, ++ GNTMAP_host_map, tpmif->shmem_handle); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++} ++ ++int tpmif_map(tpmif_t *tpmif, unsigned long shared_page, unsigned int evtchn) ++{ ++ int err; ++ ++ if (tpmif->irq) ++ return 0; ++ ++ if ((tpmif->tx_area = alloc_vm_area(PAGE_SIZE)) == NULL) ++ return -ENOMEM; ++ ++ err = map_frontend_page(tpmif, shared_page); ++ if (err) { ++ free_vm_area(tpmif->tx_area); ++ return err; ++ } ++ ++ tpmif->tx = (tpmif_tx_interface_t *)tpmif->tx_area->addr; ++ ++ err = bind_interdomain_evtchn_to_irqhandler( ++ tpmif->domid, evtchn, tpmif_be_int, 0, tpmif->devname, tpmif); ++ if (err < 0) { ++ unmap_frontend_page(tpmif); ++ free_vm_area(tpmif->tx_area); ++ return err; ++ } ++ tpmif->irq = err; ++ ++ tpmif->shmem_ref = shared_page; ++ tpmif->active = 1; ++ ++ return 0; ++} ++ ++void tpmif_disconnect_complete(tpmif_t *tpmif) ++{ ++ if (tpmif->irq) ++ unbind_from_irqhandler(tpmif->irq, tpmif); ++ ++ if (tpmif->tx) { ++ unmap_frontend_page(tpmif); ++ free_vm_area(tpmif->tx_area); ++ } ++ ++ free_tpmif(tpmif); ++} ++ ++void __init tpmif_interface_init(void) ++{ ++ tpmif_cachep = kmem_cache_create("tpmif_cache", sizeof (tpmif_t), ++ 0, 0, NULL, NULL); ++} ++ ++void __exit tpmif_interface_exit(void) ++{ ++ kmem_cache_destroy(tpmif_cachep); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/tpmback/tpmback.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,944 @@ ++/****************************************************************************** ++ * drivers/xen/tpmback/tpmback.c ++ * ++ * Copyright (c) 2005, IBM Corporation ++ * ++ * Author: Stefan Berger, stefanb@us.ibm.com ++ * Grant table support: Mahadevan Gomathisankaran ++ * ++ * This code has been derived from drivers/xen/netback/netback.c ++ * Copyright (c) 2002-2004, K A Fraser ++ * ++ */ ++ ++#include "common.h" ++#include <xen/evtchn.h> ++ ++#include <linux/types.h> ++#include <linux/list.h> ++#include <linux/miscdevice.h> ++#include <linux/poll.h> ++#include <asm/uaccess.h> ++#include <xen/xenbus.h> ++#include <xen/interface/grant_table.h> ++#include <xen/gnttab.h> ++ ++/* local data structures */ ++struct data_exchange { ++ struct list_head pending_pak; ++ struct list_head current_pak; ++ unsigned int copied_so_far; ++ u8 has_opener:1; ++ u8 aborted:1; ++ rwlock_t pak_lock; // protects all of the previous fields ++ wait_queue_head_t wait_queue; ++}; ++ ++struct vtpm_resp_hdr { ++ uint32_t instance_no; ++ uint16_t tag_no; ++ uint32_t len_no; ++ uint32_t ordinal_no; ++} __attribute__ ((packed)); ++ ++struct packet { ++ struct list_head next; ++ unsigned int data_len; ++ u8 *data_buffer; ++ tpmif_t *tpmif; ++ u32 tpm_instance; ++ u8 req_tag; ++ u32 last_read; ++ u8 flags; ++ struct timer_list processing_timer; ++}; ++ ++enum { ++ PACKET_FLAG_DISCARD_RESPONSE = 1, ++}; ++ ++/* local variables */ ++static struct data_exchange dataex; ++ ++/* local function prototypes */ ++static int 
_packet_write(struct packet *pak, ++ const char *data, size_t size, int userbuffer); ++static void processing_timeout(unsigned long ptr); ++static int packet_read_shmem(struct packet *pak, ++ tpmif_t * tpmif, ++ u32 offset, ++ char *buffer, int isuserbuffer, u32 left); ++static int vtpm_queue_packet(struct packet *pak); ++ ++/*************************************************************** ++ Buffer copying for user and kernel space buffers. ++***************************************************************/ ++static inline int copy_from_buffer(void *to, ++ const void *from, unsigned long size, ++ int isuserbuffer) ++{ ++ if (isuserbuffer) { ++ if (copy_from_user(to, (void __user *)from, size)) ++ return -EFAULT; ++ } else { ++ memcpy(to, from, size); ++ } ++ return 0; ++} ++ ++static inline int copy_to_buffer(void *to, ++ const void *from, unsigned long size, ++ int isuserbuffer) ++{ ++ if (isuserbuffer) { ++ if (copy_to_user((void __user *)to, from, size)) ++ return -EFAULT; ++ } else { ++ memcpy(to, from, size); ++ } ++ return 0; ++} ++ ++ ++static void dataex_init(struct data_exchange *dataex) ++{ ++ INIT_LIST_HEAD(&dataex->pending_pak); ++ INIT_LIST_HEAD(&dataex->current_pak); ++ dataex->has_opener = 0; ++ rwlock_init(&dataex->pak_lock); ++ init_waitqueue_head(&dataex->wait_queue); ++} ++ ++/*************************************************************** ++ Packet-related functions ++***************************************************************/ ++ ++static struct packet *packet_find_instance(struct list_head *head, ++ u32 tpm_instance) ++{ ++ struct packet *pak; ++ struct list_head *p; ++ ++ /* ++ * traverse the list of packets and return the first ++ * one with the given instance number ++ */ ++ list_for_each(p, head) { ++ pak = list_entry(p, struct packet, next); ++ ++ if (pak->tpm_instance == tpm_instance) { ++ return pak; ++ } ++ } ++ return NULL; ++} ++ ++static struct packet *packet_find_packet(struct list_head *head, void *packet) ++{ ++ struct packet *pak; ++ struct list_head *p; ++ ++ /* ++ * traverse the list of packets and return the first ++ * one that matches the given packet pointer ++ */ ++ list_for_each(p, head) { ++ pak = list_entry(p, struct packet, next); ++ ++ if (pak == packet) { ++ return pak; ++ } ++ } ++ return NULL; ++} ++ ++static struct packet *packet_alloc(tpmif_t * tpmif, ++ u32 size, u8 req_tag, u8 flags) ++{ ++ struct packet *pak = NULL; ++ pak = kzalloc(sizeof (struct packet), GFP_ATOMIC); ++ if (NULL != pak) { ++ if (tpmif) { ++ pak->tpmif = tpmif; ++ pak->tpm_instance = tpmback_get_instance(tpmif->bi); ++ tpmif_get(tpmif); ++ } ++ pak->data_len = size; ++ pak->req_tag = req_tag; ++ pak->last_read = 0; ++ pak->flags = flags; ++ ++ /* ++ * cannot do tpmif_get(tpmif); bad things happen ++ * on the last tpmif_put() ++ */ ++ init_timer(&pak->processing_timer); ++ pak->processing_timer.function = processing_timeout; ++ pak->processing_timer.data = (unsigned long)pak; ++ } ++ return pak; ++} ++ ++static void inline packet_reset(struct packet *pak) ++{ ++ pak->last_read = 0; ++} ++ ++static void packet_free(struct packet *pak) ++{ ++ if (timer_pending(&pak->processing_timer)) { ++ BUG(); ++ } ++ ++ if (pak->tpmif) ++ tpmif_put(pak->tpmif); ++ kfree(pak->data_buffer); ++ /* ++ * cannot do tpmif_put(pak->tpmif); bad things happen ++ * on the last tpmif_put() ++ */ ++ kfree(pak); ++} ++ ++ ++/* ++ * Write data to the shared memory and send it to the FE. 
++ */ ++static int packet_write(struct packet *pak, ++ const char *data, size_t size, int isuserbuffer) ++{ ++ int rc = 0; ++ ++ if (0 != (pak->flags & PACKET_FLAG_DISCARD_RESPONSE)) { ++ /* Don't send a response to this packet. Just acknowledge it. */ ++ rc = size; ++ } else { ++ rc = _packet_write(pak, data, size, isuserbuffer); ++ } ++ ++ return rc; ++} ++ ++int _packet_write(struct packet *pak, ++ const char *data, size_t size, int isuserbuffer) ++{ ++ /* ++ * Write into the shared memory pages directly ++ * and send it to the front end. ++ */ ++ tpmif_t *tpmif = pak->tpmif; ++ grant_handle_t handle; ++ int rc = 0; ++ unsigned int i = 0; ++ unsigned int offset = 0; ++ ++ if (tpmif == NULL) { ++ return -EFAULT; ++ } ++ ++ if (tpmif->status == DISCONNECTED) { ++ return size; ++ } ++ ++ while (offset < size && i < TPMIF_TX_RING_SIZE) { ++ unsigned int tocopy; ++ struct gnttab_map_grant_ref map_op; ++ struct gnttab_unmap_grant_ref unmap_op; ++ tpmif_tx_request_t *tx; ++ ++ tx = &tpmif->tx->ring[i].req; ++ ++ if (0 == tx->addr) { ++ DPRINTK("ERROR: Buffer for outgoing packet NULL?! i=%d\n", i); ++ return 0; ++ } ++ ++ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), ++ GNTMAP_host_map, tx->ref, tpmif->domid); ++ ++ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ &map_op, 1))) { ++ BUG(); ++ } ++ ++ handle = map_op.handle; ++ ++ if (map_op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return 0; ++ } ++ ++ tocopy = min_t(size_t, size - offset, PAGE_SIZE); ++ ++ if (copy_from_buffer((void *)(idx_to_kaddr(tpmif, i) | ++ (tx->addr & ~PAGE_MASK)), ++ &data[offset], tocopy, isuserbuffer)) { ++ tpmif_put(tpmif); ++ return -EFAULT; ++ } ++ tx->size = tocopy; ++ ++ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), ++ GNTMAP_host_map, handle); ++ ++ if (unlikely ++ (HYPERVISOR_grant_table_op ++ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) { ++ BUG(); ++ } ++ ++ offset += tocopy; ++ i++; ++ } ++ ++ rc = offset; ++ DPRINTK("Notifying frontend via irq %d\n", tpmif->irq); ++ notify_remote_via_irq(tpmif->irq); ++ ++ return rc; ++} ++ ++/* ++ * Read data from the shared memory and copy it directly into the ++ * provided buffer. Advance the last_read indicator which tells ++ * how many bytes have already been read. ++ */ ++static int packet_read(struct packet *pak, size_t numbytes, ++ char *buffer, size_t buffersize, int isuserbuffer) ++{ ++ tpmif_t *tpmif = pak->tpmif; ++ ++ /* ++ * Read 'numbytes' of data from the buffer. The first 4 ++ * bytes are the instance number in network byte order, ++ * after that come the data from the shared memory buffer. ++ */ ++ u32 to_copy; ++ u32 offset = 0; ++ u32 room_left = buffersize; ++ ++ if (pak->last_read < 4) { ++ /* ++ * copy the instance number into the buffer ++ */ ++ u32 instance_no = htonl(pak->tpm_instance); ++ u32 last_read = pak->last_read; ++ ++ to_copy = min_t(size_t, 4 - last_read, numbytes); ++ ++ if (copy_to_buffer(&buffer[0], ++ &(((u8 *) & instance_no)[last_read]), ++ to_copy, isuserbuffer)) { ++ return -EFAULT; ++ } ++ ++ pak->last_read += to_copy; ++ offset += to_copy; ++ room_left -= to_copy; ++ } ++ ++ /* ++ * If the packet has a data buffer appended, read from it... 
++ */ ++ ++ if (room_left > 0) { ++ if (pak->data_buffer) { ++ u32 to_copy = min_t(u32, pak->data_len - offset, room_left); ++ u32 last_read = pak->last_read - 4; ++ ++ if (copy_to_buffer(&buffer[offset], ++ &pak->data_buffer[last_read], ++ to_copy, isuserbuffer)) { ++ return -EFAULT; ++ } ++ pak->last_read += to_copy; ++ offset += to_copy; ++ } else { ++ offset = packet_read_shmem(pak, ++ tpmif, ++ offset, ++ buffer, ++ isuserbuffer, room_left); ++ } ++ } ++ return offset; ++} ++ ++static int packet_read_shmem(struct packet *pak, ++ tpmif_t * tpmif, ++ u32 offset, char *buffer, int isuserbuffer, ++ u32 room_left) ++{ ++ u32 last_read = pak->last_read - 4; ++ u32 i = (last_read / PAGE_SIZE); ++ u32 pg_offset = last_read & (PAGE_SIZE - 1); ++ u32 to_copy; ++ grant_handle_t handle; ++ ++ tpmif_tx_request_t *tx; ++ ++ tx = &tpmif->tx->ring[0].req; ++ /* ++ * Start copying data at the page with index 'index' ++ * and within that page at offset 'offset'. ++ * Copy a maximum of 'room_left' bytes. ++ */ ++ to_copy = min_t(u32, PAGE_SIZE - pg_offset, room_left); ++ while (to_copy > 0) { ++ void *src; ++ struct gnttab_map_grant_ref map_op; ++ struct gnttab_unmap_grant_ref unmap_op; ++ ++ tx = &tpmif->tx->ring[i].req; ++ ++ gnttab_set_map_op(&map_op, idx_to_kaddr(tpmif, i), ++ GNTMAP_host_map, tx->ref, tpmif->domid); ++ ++ if (unlikely(HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, ++ &map_op, 1))) { ++ BUG(); ++ } ++ ++ if (map_op.status) { ++ DPRINTK(" Grant table operation failure !\n"); ++ return -EFAULT; ++ } ++ ++ handle = map_op.handle; ++ ++ if (to_copy > tx->size) { ++ /* ++ * User requests more than what's available ++ */ ++ to_copy = min_t(u32, tx->size, to_copy); ++ } ++ ++ DPRINTK("Copying from mapped memory at %08lx\n", ++ (unsigned long)(idx_to_kaddr(tpmif, i) | ++ (tx->addr & ~PAGE_MASK))); ++ ++ src = (void *)(idx_to_kaddr(tpmif, i) | ++ ((tx->addr & ~PAGE_MASK) + pg_offset)); ++ if (copy_to_buffer(&buffer[offset], ++ src, to_copy, isuserbuffer)) { ++ return -EFAULT; ++ } ++ ++ DPRINTK("Data from TPM-FE of domain %d are %d %d %d %d\n", ++ tpmif->domid, buffer[offset], buffer[offset + 1], ++ buffer[offset + 2], buffer[offset + 3]); ++ ++ gnttab_set_unmap_op(&unmap_op, idx_to_kaddr(tpmif, i), ++ GNTMAP_host_map, handle); ++ ++ if (unlikely ++ (HYPERVISOR_grant_table_op ++ (GNTTABOP_unmap_grant_ref, &unmap_op, 1))) { ++ BUG(); ++ } ++ ++ offset += to_copy; ++ pg_offset = 0; ++ last_read += to_copy; ++ room_left -= to_copy; ++ ++ to_copy = min_t(u32, PAGE_SIZE, room_left); ++ i++; ++ } /* while (to_copy > 0) */ ++ /* ++ * Adjust the last_read pointer ++ */ ++ pak->last_read = last_read + 4; ++ return offset; ++} ++ ++/* ============================================================ ++ * The file layer for reading data from this device ++ * ============================================================ ++ */ ++static int vtpm_op_open(struct inode *inode, struct file *f) ++{ ++ int rc = 0; ++ unsigned long flags; ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ if (dataex.has_opener == 0) { ++ dataex.has_opener = 1; ++ } else { ++ rc = -EPERM; ++ } ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ return rc; ++} ++ ++static ssize_t vtpm_op_read(struct file *file, ++ char __user * data, size_t size, loff_t * offset) ++{ ++ int ret_size = -ENODATA; ++ struct packet *pak = NULL; ++ unsigned long flags; ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ if (dataex.aborted) { ++ dataex.aborted = 0; ++ dataex.copied_so_far = 0; ++ write_unlock_irqrestore(&dataex.pak_lock, flags); 
++ return -EIO; ++ } ++ ++ if (list_empty(&dataex.pending_pak)) { ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ wait_event_interruptible(dataex.wait_queue, ++ !list_empty(&dataex.pending_pak)); ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ dataex.copied_so_far = 0; ++ } ++ ++ if (!list_empty(&dataex.pending_pak)) { ++ unsigned int left; ++ ++ pak = list_entry(dataex.pending_pak.next, struct packet, next); ++ left = pak->data_len - dataex.copied_so_far; ++ list_del(&pak->next); ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ ++ DPRINTK("size given by app: %d, available: %d\n", size, left); ++ ++ ret_size = min_t(size_t, size, left); ++ ++ ret_size = packet_read(pak, ret_size, data, size, 1); ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ ++ if (ret_size < 0) { ++ del_singleshot_timer_sync(&pak->processing_timer); ++ packet_free(pak); ++ dataex.copied_so_far = 0; ++ } else { ++ DPRINTK("Copied %d bytes to user buffer\n", ret_size); ++ ++ dataex.copied_so_far += ret_size; ++ if (dataex.copied_so_far >= pak->data_len + 4) { ++ DPRINTK("All data from this packet given to app.\n"); ++ /* All data given to app */ ++ ++ del_singleshot_timer_sync(&pak-> ++ processing_timer); ++ list_add_tail(&pak->next, &dataex.current_pak); ++ /* ++ * The more frontends that are handled at the same time, ++ * the more time we give the TPM to process the request. ++ */ ++ mod_timer(&pak->processing_timer, ++ jiffies + (num_frontends * 60 * HZ)); ++ dataex.copied_so_far = 0; ++ } else { ++ list_add(&pak->next, &dataex.pending_pak); ++ } ++ } ++ } ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ ++ DPRINTK("Returning result from read to app: %d\n", ret_size); ++ ++ return ret_size; ++} ++ ++/* ++ * Write operation - only works after a previous read operation! ++ */ ++static ssize_t vtpm_op_write(struct file *file, ++ const char __user * data, size_t size, ++ loff_t * offset) ++{ ++ struct packet *pak; ++ int rc = 0; ++ unsigned int off = 4; ++ unsigned long flags; ++ struct vtpm_resp_hdr vrh; ++ ++ /* ++ * Minimum required packet size is: ++ * 4 bytes for instance number ++ * 2 bytes for tag ++ * 4 bytes for paramSize ++ * 4 bytes for the ordinal ++ * sum: 14 bytes ++ */ ++ if (size < sizeof (vrh)) ++ return -EFAULT; ++ ++ if (copy_from_user(&vrh, data, sizeof (vrh))) ++ return -EFAULT; ++ ++ /* malformed packet? */ ++ if ((off + ntohl(vrh.len_no)) != size) ++ return -EFAULT; ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ pak = packet_find_instance(&dataex.current_pak, ++ ntohl(vrh.instance_no)); ++ ++ if (pak == NULL) { ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ DPRINTK(KERN_ALERT "No associated packet! (inst=%d)\n", ++ ntohl(vrh.instance_no)); ++ return -EFAULT; ++ } ++ ++ del_singleshot_timer_sync(&pak->processing_timer); ++ list_del(&pak->next); ++ ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ ++ /* ++ * The first 'offset' bytes must be the instance number - skip them. 
++ */ ++ size -= off; ++ ++ rc = packet_write(pak, &data[off], size, 1); ++ ++ if (rc > 0) { ++ /* I neglected the first 4 bytes */ ++ rc += off; ++ } ++ packet_free(pak); ++ return rc; ++} ++ ++static int vtpm_op_release(struct inode *inode, struct file *file) ++{ ++ unsigned long flags; ++ ++ vtpm_release_packets(NULL, 1); ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ dataex.has_opener = 0; ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ return 0; ++} ++ ++static unsigned int vtpm_op_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ unsigned int flags = POLLOUT | POLLWRNORM; ++ ++ poll_wait(file, &dataex.wait_queue, pts); ++ if (!list_empty(&dataex.pending_pak)) { ++ flags |= POLLIN | POLLRDNORM; ++ } ++ return flags; ++} ++ ++static const struct file_operations vtpm_ops = { ++ .owner = THIS_MODULE, ++ .llseek = no_llseek, ++ .open = vtpm_op_open, ++ .read = vtpm_op_read, ++ .write = vtpm_op_write, ++ .release = vtpm_op_release, ++ .poll = vtpm_op_poll, ++}; ++ ++static struct miscdevice vtpms_miscdevice = { ++ .minor = 225, ++ .name = "vtpm", ++ .fops = &vtpm_ops, ++}; ++ ++/*************************************************************** ++ Utility functions ++***************************************************************/ ++ ++static int tpm_send_fail_message(struct packet *pak, u8 req_tag) ++{ ++ int rc; ++ static const unsigned char tpm_error_message_fail[] = { ++ 0x00, 0x00, ++ 0x00, 0x00, 0x00, 0x0a, ++ 0x00, 0x00, 0x00, 0x09 /* TPM_FAIL */ ++ }; ++ unsigned char buffer[sizeof (tpm_error_message_fail)]; ++ ++ memcpy(buffer, tpm_error_message_fail, ++ sizeof (tpm_error_message_fail)); ++ /* ++ * Insert the right response tag depending on the given tag ++ * All response tags are '+3' to the request tag. ++ */ ++ buffer[1] = req_tag + 3; ++ ++ /* ++ * Write the data to shared memory and notify the front-end ++ */ ++ rc = packet_write(pak, buffer, sizeof (buffer), 0); ++ ++ return rc; ++} ++ ++static int _vtpm_release_packets(struct list_head *head, ++ tpmif_t * tpmif, int send_msgs) ++{ ++ int aborted = 0; ++ int c = 0; ++ struct packet *pak; ++ struct list_head *pos, *tmp; ++ ++ list_for_each_safe(pos, tmp, head) { ++ pak = list_entry(pos, struct packet, next); ++ c += 1; ++ ++ if (tpmif == NULL || pak->tpmif == tpmif) { ++ int can_send = 0; ++ ++ del_singleshot_timer_sync(&pak->processing_timer); ++ list_del(&pak->next); ++ ++ if (pak->tpmif && pak->tpmif->status == CONNECTED) { ++ can_send = 1; ++ } ++ ++ if (send_msgs && can_send) { ++ tpm_send_fail_message(pak, pak->req_tag); ++ } ++ packet_free(pak); ++ if (c == 1) ++ aborted = 1; ++ } ++ } ++ return aborted; ++} ++ ++int vtpm_release_packets(tpmif_t * tpmif, int send_msgs) ++{ ++ unsigned long flags; ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ ++ dataex.aborted = _vtpm_release_packets(&dataex.pending_pak, ++ tpmif, ++ send_msgs); ++ _vtpm_release_packets(&dataex.current_pak, tpmif, send_msgs); ++ ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ return 0; ++} ++ ++static int vtpm_queue_packet(struct packet *pak) ++{ ++ int rc = 0; ++ ++ if (dataex.has_opener) { ++ unsigned long flags; ++ ++ write_lock_irqsave(&dataex.pak_lock, flags); ++ list_add_tail(&pak->next, &dataex.pending_pak); ++ /* give the TPM some time to pick up the request */ ++ mod_timer(&pak->processing_timer, jiffies + (30 * HZ)); ++ write_unlock_irqrestore(&dataex.pak_lock, flags); ++ ++ wake_up_interruptible(&dataex.wait_queue); ++ } else { ++ rc = -EFAULT; ++ } ++ return rc; ++} ++ ++static int vtpm_receive(tpmif_t 
* tpmif, u32 size)
++{
++	int rc = 0;
++	unsigned char buffer[10];
++	__be32 *native_size;
++	struct packet *pak = packet_alloc(tpmif, size, 0, 0);
++
++	if (!pak)
++		return -ENOMEM;
++	/*
++	 * Read 10 bytes from the received buffer to test its
++	 * content for validity.
++	 */
++	if (sizeof (buffer) != packet_read(pak,
++					   sizeof (buffer), buffer,
++					   sizeof (buffer), 0)) {
++		goto failexit;
++	}
++	/*
++	 * Reset the packet read pointer so we can read all its
++	 * contents again.
++	 */
++	packet_reset(pak);
++
++	native_size = (__force __be32 *) (&buffer[4 + 2]);
++	/*
++	 * Verify that the size of the packet is correct
++	 * as indicated and that there's actually someone reading packets.
++	 * The minimum size of the packet is '10' for tag, size indicator
++	 * and ordinal.
++	 */
++	if (size < 10 ||
++	    be32_to_cpu(*native_size) != size ||
++	    0 == dataex.has_opener || tpmif->status != CONNECTED) {
++		rc = -EINVAL;
++		goto failexit;
++	} else {
++		rc = vtpm_queue_packet(pak);
++		if (rc < 0)
++			goto failexit;
++	}
++	return 0;
++
++ failexit:
++	if (pak) {
++		tpm_send_fail_message(pak, buffer[4 + 1]);
++		packet_free(pak);
++	}
++	return rc;
++}
++
++/*
++ * Timeout function that gets invoked when a packet has not been processed
++ * during the timeout period.
++ * The packet must be on a list when this function is invoked. This
++ * also means that once it's taken off a list, the timer must be
++ * destroyed as well.
++ */
++static void processing_timeout(unsigned long ptr)
++{
++	struct packet *pak = (struct packet *)ptr;
++	unsigned long flags;
++
++	write_lock_irqsave(&dataex.pak_lock, flags);
++	/*
++	 * Search the lists to see whether the packet
++	 * is still queued.
++	 */
++	if (pak == packet_find_packet(&dataex.pending_pak, pak) ||
++	    pak == packet_find_packet(&dataex.current_pak, pak)) {
++		if ((pak->flags & PACKET_FLAG_DISCARD_RESPONSE) == 0) {
++			tpm_send_fail_message(pak, pak->req_tag);
++		}
++		/* discard future responses */
++		pak->flags |= PACKET_FLAG_DISCARD_RESPONSE;
++	}
++
++	write_unlock_irqrestore(&dataex.pak_lock, flags);
++}
++
++static void tpm_tx_action(unsigned long unused);
++static DECLARE_TASKLET(tpm_tx_tasklet, tpm_tx_action, 0);
++
++static struct list_head tpm_schedule_list;
++static spinlock_t tpm_schedule_list_lock;
++
++static inline void maybe_schedule_tx_action(void)
++{
++	smp_mb();
++	tasklet_schedule(&tpm_tx_tasklet);
++}
++
++static inline int __on_tpm_schedule_list(tpmif_t * tpmif)
++{
++	return tpmif->list.next != NULL;
++}
++
++static void remove_from_tpm_schedule_list(tpmif_t * tpmif)
++{
++	spin_lock_irq(&tpm_schedule_list_lock);
++	if (likely(__on_tpm_schedule_list(tpmif))) {
++		list_del(&tpmif->list);
++		tpmif->list.next = NULL;
++		tpmif_put(tpmif);
++	}
++	spin_unlock_irq(&tpm_schedule_list_lock);
++}
++
++static void add_to_tpm_schedule_list_tail(tpmif_t * tpmif)
++{
++	if (__on_tpm_schedule_list(tpmif))
++		return;
++
++	spin_lock_irq(&tpm_schedule_list_lock);
++	if (!__on_tpm_schedule_list(tpmif) && tpmif->active) {
++		list_add_tail(&tpmif->list, &tpm_schedule_list);
++		tpmif_get(tpmif);
++	}
++	spin_unlock_irq(&tpm_schedule_list_lock);
++}
++
++void tpmif_schedule_work(tpmif_t * tpmif)
++{
++	add_to_tpm_schedule_list_tail(tpmif);
++	maybe_schedule_tx_action();
++}
++
++void tpmif_deschedule_work(tpmif_t * tpmif)
++{
++	remove_from_tpm_schedule_list(tpmif);
++}
++
++static void tpm_tx_action(unsigned long unused)
++{
++	struct list_head *ent;
++	tpmif_t *tpmif;
++	tpmif_tx_request_t *tx;
++
++	DPRINTK("%s: Getting data from front-end(s)!\n",
__FUNCTION__); ++ ++ while (!list_empty(&tpm_schedule_list)) { ++ /* Get a tpmif from the list with work to do. */ ++ ent = tpm_schedule_list.next; ++ tpmif = list_entry(ent, tpmif_t, list); ++ tpmif_get(tpmif); ++ remove_from_tpm_schedule_list(tpmif); ++ ++ tx = &tpmif->tx->ring[0].req; ++ ++ /* pass it up */ ++ vtpm_receive(tpmif, tx->size); ++ ++ tpmif_put(tpmif); ++ } ++} ++ ++irqreturn_t tpmif_be_int(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ tpmif_t *tpmif = (tpmif_t *) dev_id; ++ ++ add_to_tpm_schedule_list_tail(tpmif); ++ maybe_schedule_tx_action(); ++ return IRQ_HANDLED; ++} ++ ++static int __init tpmback_init(void) ++{ ++ int rc; ++ ++ if ((rc = misc_register(&vtpms_miscdevice)) != 0) { ++ printk(KERN_ALERT ++ "Could not register misc device for TPM BE.\n"); ++ return rc; ++ } ++ ++ dataex_init(&dataex); ++ ++ spin_lock_init(&tpm_schedule_list_lock); ++ INIT_LIST_HEAD(&tpm_schedule_list); ++ ++ tpmif_interface_init(); ++ tpmif_xenbus_init(); ++ ++ printk(KERN_ALERT "Successfully initialized TPM backend driver.\n"); ++ ++ return 0; ++} ++ ++module_init(tpmback_init); ++ ++void __exit tpmback_exit(void) ++{ ++ vtpm_release_packets(NULL, 0); ++ tpmif_xenbus_exit(); ++ tpmif_interface_exit(); ++ misc_deregister(&vtpms_miscdevice); ++} ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/tpmback/xenbus.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,289 @@ ++/* Xenbus code for tpmif backend ++ Copyright (C) 2005 IBM Corporation ++ Copyright (C) 2005 Rusty Russell <rusty@rustcorp.com.au> ++ ++ This program is free software; you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 2 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++
++    You should have received a copy of the GNU General Public License
++    along with this program; if not, write to the Free Software
++    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
++*/
++#include <stdarg.h>
++#include <linux/module.h>
++#include <xen/xenbus.h>
++#include "common.h"
++
++struct backend_info
++{
++	struct xenbus_device *dev;
++
++	/* our communications channel */
++	tpmif_t *tpmif;
++
++	long int frontend_id;
++	long int instance; // instance of TPM
++	u8 is_instance_set; // whether instance number has been set
++
++	/* watch front end for changes */
++	struct xenbus_watch backend_watch;
++};
++
++static void maybe_connect(struct backend_info *be);
++static void connect(struct backend_info *be);
++static int connect_ring(struct backend_info *be);
++static void backend_changed(struct xenbus_watch *watch,
++			    const char **vec, unsigned int len);
++static void frontend_changed(struct xenbus_device *dev,
++			     enum xenbus_state frontend_state);
++
++long int tpmback_get_instance(struct backend_info *bi)
++{
++	long int res = -1;
++	if (bi && bi->is_instance_set)
++		res = bi->instance;
++	return res;
++}
++
++static int tpmback_remove(struct xenbus_device *dev)
++{
++	struct backend_info *be = dev->dev.driver_data;
++
++	if (!be) return 0;
++
++	if (be->backend_watch.node) {
++		unregister_xenbus_watch(&be->backend_watch);
++		kfree(be->backend_watch.node);
++		be->backend_watch.node = NULL;
++	}
++	if (be->tpmif) {
++		be->tpmif->bi = NULL;
++		vtpm_release_packets(be->tpmif, 0);
++		tpmif_put(be->tpmif);
++		be->tpmif = NULL;
++	}
++	kfree(be);
++	dev->dev.driver_data = NULL;
++	return 0;
++}
++
++static int tpmback_probe(struct xenbus_device *dev,
++			 const struct xenbus_device_id *id)
++{
++	int err;
++	struct backend_info *be = kzalloc(sizeof(struct backend_info),
++					  GFP_KERNEL);
++
++	if (!be) {
++		xenbus_dev_fatal(dev, -ENOMEM,
++				 "allocating backend structure");
++		return -ENOMEM;
++	}
++
++	be->is_instance_set = 0;
++	be->dev = dev;
++	dev->dev.driver_data = be;
++
++	err = xenbus_watch_path2(dev, dev->nodename,
++				 "instance", &be->backend_watch,
++				 backend_changed);
++	if (err) {
++		goto fail;
++	}
++
++	err = xenbus_switch_state(dev, XenbusStateInitWait);
++	if (err) {
++		goto fail;
++	}
++	return 0;
++fail:
++	tpmback_remove(dev);
++	return err;
++}
++
++
++static void backend_changed(struct xenbus_watch *watch,
++			    const char **vec, unsigned int len)
++{
++	int err;
++	long instance;
++	struct backend_info *be
++		= container_of(watch, struct backend_info, backend_watch);
++	struct xenbus_device *dev = be->dev;
++
++	err = xenbus_scanf(XBT_NIL, dev->nodename,
++			   "instance", "%li", &instance);
++	if (XENBUS_EXIST_ERR(err)) {
++		return;
++	}
++
++	if (err != 1) {
++		xenbus_dev_fatal(dev, err, "reading instance");
++		return;
++	}
++
++	if (be->is_instance_set == 0) {
++		be->instance = instance;
++		be->is_instance_set = 1;
++	}
++}
++
++
++static void frontend_changed(struct xenbus_device *dev,
++			     enum xenbus_state frontend_state)
++{
++	struct backend_info *be = dev->dev.driver_data;
++	int err;
++
++	switch (frontend_state) {
++	case XenbusStateInitialising:
++	case XenbusStateInitialised:
++		break;
++
++	case XenbusStateConnected:
++		err = connect_ring(be);
++		if (err) {
++			return;
++		}
++		maybe_connect(be);
++		break;
++
++	case XenbusStateClosing:
++		be->instance = -1;
++		xenbus_switch_state(dev, XenbusStateClosing);
++		break;
++
++	case XenbusStateUnknown: /* keep it here */
++	case XenbusStateClosed:
++		xenbus_switch_state(dev, XenbusStateClosed);
++
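++		/*
++		 * The frontend is gone: unregister the device and free
++		 * the backend.  tpmback_remove() is safe to run again
++		 * from the resulting remove callback, since it clears
++		 * driver_data and checks for NULL on entry.
++		 */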
device_unregister(&be->dev->dev); ++ tpmback_remove(dev); ++ break; ++ ++ default: ++ xenbus_dev_fatal(dev, -EINVAL, ++ "saw state %d at frontend", ++ frontend_state); ++ break; ++ } ++} ++ ++ ++ ++static void maybe_connect(struct backend_info *be) ++{ ++ if (be->tpmif == NULL || be->tpmif->status == CONNECTED) ++ return; ++ ++ connect(be); ++} ++ ++ ++static void connect(struct backend_info *be) ++{ ++ struct xenbus_transaction xbt; ++ int err; ++ struct xenbus_device *dev = be->dev; ++ unsigned long ready = 1; ++ ++again: ++ err = xenbus_transaction_start(&xbt); ++ if (err) { ++ xenbus_dev_fatal(be->dev, err, "starting transaction"); ++ return; ++ } ++ ++ err = xenbus_printf(xbt, be->dev->nodename, ++ "ready", "%lu", ready); ++ if (err) { ++ xenbus_dev_fatal(be->dev, err, "writing 'ready'"); ++ goto abort; ++ } ++ ++ err = xenbus_transaction_end(xbt, 0); ++ if (err == -EAGAIN) ++ goto again; ++ if (err) ++ xenbus_dev_fatal(be->dev, err, "end of transaction"); ++ ++ err = xenbus_switch_state(dev, XenbusStateConnected); ++ if (!err) ++ be->tpmif->status = CONNECTED; ++ return; ++abort: ++ xenbus_transaction_end(xbt, 1); ++} ++ ++ ++static int connect_ring(struct backend_info *be) ++{ ++ struct xenbus_device *dev = be->dev; ++ unsigned long ring_ref; ++ unsigned int evtchn; ++ int err; ++ ++ err = xenbus_gather(XBT_NIL, dev->otherend, ++ "ring-ref", "%lu", &ring_ref, ++ "event-channel", "%u", &evtchn, NULL); ++ if (err) { ++ xenbus_dev_error(dev, err, ++ "reading %s/ring-ref and event-channel", ++ dev->otherend); ++ return err; ++ } ++ ++ if (!be->tpmif) { ++ be->tpmif = tpmif_find(dev->otherend_id, be); ++ if (IS_ERR(be->tpmif)) { ++ err = PTR_ERR(be->tpmif); ++ be->tpmif = NULL; ++ xenbus_dev_fatal(dev,err,"creating vtpm interface"); ++ return err; ++ } ++ } ++ ++ if (be->tpmif != NULL) { ++ err = tpmif_map(be->tpmif, ring_ref, evtchn); ++ if (err) { ++ xenbus_dev_error(dev, err, ++ "mapping shared-frame %lu port %u", ++ ring_ref, evtchn); ++ return err; ++ } ++ } ++ return 0; ++} ++ ++ ++static struct xenbus_device_id tpmback_ids[] = { ++ { "vtpm" }, ++ { "" } ++}; ++ ++ ++static struct xenbus_driver tpmback = { ++ .name = "vtpm", ++ .owner = THIS_MODULE, ++ .ids = tpmback_ids, ++ .probe = tpmback_probe, ++ .remove = tpmback_remove, ++ .otherend_changed = frontend_changed, ++}; ++ ++ ++void tpmif_xenbus_init(void) ++{ ++ xenbus_register_backend(&tpmback); ++} ++ ++void tpmif_xenbus_exit(void) ++{ ++ xenbus_unregister_driver(&tpmback); ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/util.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,70 @@ ++#include <linux/mm.h> ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/vmalloc.h> ++#include <asm/uaccess.h> ++#include <xen/driver_util.h> ++ ++struct class *get_xen_class(void) ++{ ++ static struct class *xen_class; ++ ++ if (xen_class) ++ return xen_class; ++ ++ xen_class = class_create(THIS_MODULE, "xen"); ++ if (IS_ERR(xen_class)) { ++ printk("Failed to create xen sysfs class.\n"); ++ xen_class = NULL; ++ } ++ ++ return xen_class; ++} ++EXPORT_SYMBOL_GPL(get_xen_class); ++ ++/* Todo: merge ia64 ('auto-translate physmap') versions of these functions. */ ++#ifndef __ia64__ ++ ++static int f(pte_t *pte, struct page *pmd_page, unsigned long addr, void *data) ++{ ++ /* apply_to_page_range() does all the hard work. 
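++	   The callback body is deliberately empty: the walk is run purely
++	   for its side effect of constructing the page tables.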
*/ ++ return 0; ++} ++ ++struct vm_struct *alloc_vm_area(unsigned long size) ++{ ++ struct vm_struct *area; ++ ++ area = get_vm_area(size, VM_IOREMAP); ++ if (area == NULL) ++ return NULL; ++ ++ /* ++ * This ensures that page tables are constructed for this region ++ * of kernel virtual address space and mapped into init_mm. ++ */ ++ if (apply_to_page_range(&init_mm, (unsigned long)area->addr, ++ area->size, f, NULL)) { ++ free_vm_area(area); ++ return NULL; ++ } ++ ++ /* Map page directories into every address space. */ ++#ifdef CONFIG_X86 ++ vmalloc_sync_all(); ++#endif ++ ++ return area; ++} ++EXPORT_SYMBOL_GPL(alloc_vm_area); ++ ++void free_vm_area(struct vm_struct *area) ++{ ++ struct vm_struct *ret; ++ ret = remove_vm_area(area->addr); ++ BUG_ON(ret != area); ++ kfree(area); ++} ++EXPORT_SYMBOL_GPL(free_vm_area); ++ ++#endif /* !__ia64__ */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/Makefile 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,9 @@ ++obj-y += xenbus_client.o xenbus_comms.o xenbus_xs.o xenbus_probe.o ++obj-$(CONFIG_XEN_BACKEND) += xenbus_be.o ++ ++xenbus_be-objs = ++xenbus_be-objs += xenbus_backend_client.o ++ ++xenbus-$(CONFIG_XEN_BACKEND) += xenbus_probe_backend.o ++obj-y += $(xenbus-y) $(xenbus-m) ++obj-$(CONFIG_XEN_XENBUS_DEV) += xenbus_dev.o +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_backend_client.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,147 @@ ++/****************************************************************************** ++ * Backend-client-facing interface for the Xenbus driver. In other words, the ++ * interface between the Xenbus and the device-specific code in the backend ++ * driver. ++ * ++ * Copyright (C) 2005-2006 XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. 
++ */ ++ ++#include <linux/err.h> ++#include <xen/gnttab.h> ++#include <xen/xenbus.h> ++#include <xen/driver_util.h> ++ ++/* Based on Rusty Russell's skeleton driver's map_page */ ++struct vm_struct *xenbus_map_ring_valloc(struct xenbus_device *dev, int gnt_ref) ++{ ++ struct gnttab_map_grant_ref op; ++ struct vm_struct *area; ++ ++ area = alloc_vm_area(PAGE_SIZE); ++ if (!area) ++ return ERR_PTR(-ENOMEM); ++ ++ gnttab_set_map_op(&op, (unsigned long)area->addr, GNTMAP_host_map, ++ gnt_ref, dev->otherend_id); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status != GNTST_okay) { ++ free_vm_area(area); ++ xenbus_dev_fatal(dev, op.status, ++ "mapping in shared page %d from domain %d", ++ gnt_ref, dev->otherend_id); ++ BUG_ON(!IS_ERR(ERR_PTR(op.status))); ++ return ERR_PTR(op.status); ++ } ++ ++ /* Stuff the handle in an unused field */ ++ area->phys_addr = (unsigned long)op.handle; ++ ++ return area; ++} ++EXPORT_SYMBOL_GPL(xenbus_map_ring_valloc); ++ ++ ++int xenbus_map_ring(struct xenbus_device *dev, int gnt_ref, ++ grant_handle_t *handle, void *vaddr) ++{ ++ struct gnttab_map_grant_ref op; ++ ++ gnttab_set_map_op(&op, (unsigned long)vaddr, GNTMAP_host_map, ++ gnt_ref, dev->otherend_id); ++ if (HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status != GNTST_okay) { ++ xenbus_dev_fatal(dev, op.status, ++ "mapping in shared page %d from domain %d", ++ gnt_ref, dev->otherend_id); ++ } else ++ *handle = op.handle; ++ ++ return op.status; ++} ++EXPORT_SYMBOL_GPL(xenbus_map_ring); ++ ++ ++/* Based on Rusty Russell's skeleton driver's unmap_page */ ++int xenbus_unmap_ring_vfree(struct xenbus_device *dev, struct vm_struct *area) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)area->addr, GNTMAP_host_map, ++ (grant_handle_t)area->phys_addr); ++ ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status == GNTST_okay) ++ free_vm_area(area); ++ else ++ xenbus_dev_error(dev, op.status, ++ "unmapping page at handle %d error %d", ++ (int16_t)area->phys_addr, op.status); ++ ++ return op.status; ++} ++EXPORT_SYMBOL_GPL(xenbus_unmap_ring_vfree); ++ ++ ++int xenbus_unmap_ring(struct xenbus_device *dev, ++ grant_handle_t handle, void *vaddr) ++{ ++ struct gnttab_unmap_grant_ref op; ++ ++ gnttab_set_unmap_op(&op, (unsigned long)vaddr, GNTMAP_host_map, ++ handle); ++ if (HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref, &op, 1)) ++ BUG(); ++ ++ if (op.status != GNTST_okay) ++ xenbus_dev_error(dev, op.status, ++ "unmapping page at handle %d error %d", ++ handle, op.status); ++ ++ return op.status; ++} ++EXPORT_SYMBOL_GPL(xenbus_unmap_ring); ++ ++int xenbus_dev_is_online(struct xenbus_device *dev) ++{ ++ int rc, val; ++ ++ rc = xenbus_scanf(XBT_NIL, dev->nodename, "online", "%d", &val); ++ if (rc != 1) ++ val = 0; /* no online node present */ ++ ++ return val; ++} ++EXPORT_SYMBOL_GPL(xenbus_dev_is_online); ++ ++MODULE_LICENSE("Dual BSD/GPL"); +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_client.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,283 @@ ++/****************************************************************************** ++ * Client-facing interface for the Xenbus driver. In other words, the ++ * interface between the Xenbus and the device-specific code, be it the ++ * frontend or the backend of that driver. 
++ * ++ * Copyright (C) 2005 XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <xen/evtchn.h> ++#include <xen/gnttab.h> ++#include <xen/xenbus.h> ++#include <xen/driver_util.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("xenbus_client (%s:%d) " fmt ".\n", __FUNCTION__, __LINE__, ##args) ++ ++const char *xenbus_strstate(enum xenbus_state state) ++{ ++ static const char *const name[] = { ++ [ XenbusStateUnknown ] = "Unknown", ++ [ XenbusStateInitialising ] = "Initialising", ++ [ XenbusStateInitWait ] = "InitWait", ++ [ XenbusStateInitialised ] = "Initialised", ++ [ XenbusStateConnected ] = "Connected", ++ [ XenbusStateClosing ] = "Closing", ++ [ XenbusStateClosed ] = "Closed", ++ }; ++ return (state < ARRAY_SIZE(name)) ? name[state] : "INVALID"; ++} ++EXPORT_SYMBOL_GPL(xenbus_strstate); ++ ++int xenbus_watch_path(struct xenbus_device *dev, const char *path, ++ struct xenbus_watch *watch, ++ void (*callback)(struct xenbus_watch *, ++ const char **, unsigned int)) ++{ ++ int err; ++ ++ watch->node = path; ++ watch->callback = callback; ++ ++ err = register_xenbus_watch(watch); ++ ++ if (err) { ++ watch->node = NULL; ++ watch->callback = NULL; ++ xenbus_dev_fatal(dev, err, "adding watch on %s", path); ++ } ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(xenbus_watch_path); ++ ++ ++int xenbus_watch_path2(struct xenbus_device *dev, const char *path, ++ const char *path2, struct xenbus_watch *watch, ++ void (*callback)(struct xenbus_watch *, ++ const char **, unsigned int)) ++{ ++ int err; ++ char *state = kasprintf(GFP_KERNEL, "%s/%s", path, path2); ++ if (!state) { ++ xenbus_dev_fatal(dev, -ENOMEM, "allocating path for watch"); ++ return -ENOMEM; ++ } ++ err = xenbus_watch_path(dev, state, watch, callback); ++ ++ if (err) ++ kfree(state); ++ return err; ++} ++EXPORT_SYMBOL_GPL(xenbus_watch_path2); ++ ++ ++int xenbus_switch_state(struct xenbus_device *dev, enum xenbus_state state) ++{ ++ /* We check whether the state is currently set to the given value, and ++ if not, then the state is set. 
We don't want to unconditionally
++	   write the given state, because we don't want to fire watches
++	   unnecessarily.  Furthermore, if the node has gone, we don't write
++	   to it, as the device will be tearing down, and we don't want to
++	   resurrect that directory.
++
++	   Note that, because of this cached value of our state, this function
++	   will not work inside a Xenstore transaction (something it was
++	   trying to do in the past) because dev->state would not get reset if
++	   the transaction was aborted.
++
++	*/
++
++	int current_state;
++	int err;
++
++	if (state == dev->state)
++		return 0;
++
++	err = xenbus_scanf(XBT_NIL, dev->nodename, "state", "%d",
++			   &current_state);
++	if (err != 1)
++		return 0;
++
++	err = xenbus_printf(XBT_NIL, dev->nodename, "state", "%d", state);
++	if (err) {
++		if (state != XenbusStateClosing) /* Avoid looping */
++			xenbus_dev_fatal(dev, err, "writing new state");
++		return err;
++	}
++
++	dev->state = state;
++
++	return 0;
++}
++EXPORT_SYMBOL_GPL(xenbus_switch_state);
++
++int xenbus_frontend_closed(struct xenbus_device *dev)
++{
++	xenbus_switch_state(dev, XenbusStateClosed);
++	complete(&dev->down);
++	return 0;
++}
++EXPORT_SYMBOL_GPL(xenbus_frontend_closed);
++
++/**
++ * Return the path to the error node for the given device, or NULL on
++ * failure.  If the value returned is non-NULL, the caller is responsible
++ * for freeing it with kfree().
++ */
++static char *error_path(struct xenbus_device *dev)
++{
++	return kasprintf(GFP_KERNEL, "error/%s", dev->nodename);
++}
++
++
++void _dev_error(struct xenbus_device *dev, int err, const char *fmt,
++		va_list ap)
++{
++	int ret;
++	unsigned int len;
++	char *printf_buffer = NULL, *path_buffer = NULL;
++
++#define PRINTF_BUFFER_SIZE 4096
++	printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL);
++	if (printf_buffer == NULL)
++		goto fail;
++
++	len = sprintf(printf_buffer, "%i ", -err);
++	ret = vsnprintf(printf_buffer+len, PRINTF_BUFFER_SIZE-len, fmt, ap);
++
++	BUG_ON(len + ret > PRINTF_BUFFER_SIZE-1);
++
++	dev_err(&dev->dev, "%s\n", printf_buffer);
++
++	path_buffer = error_path(dev);
++
++	if (path_buffer == NULL) {
++		printk("xenbus: failed to write error node for %s (%s)\n",
++		       dev->nodename, printf_buffer);
++		goto fail;
++	}
++
++	if (xenbus_write(XBT_NIL, path_buffer, "error", printf_buffer) != 0) {
++		printk("xenbus: failed to write error node for %s (%s)\n",
++		       dev->nodename, printf_buffer);
++		goto fail;
++	}
++
++fail:
++	if (printf_buffer)
++		kfree(printf_buffer);
++	if (path_buffer)
++		kfree(path_buffer);
++}
++
++
++void xenbus_dev_error(struct xenbus_device *dev, int err, const char *fmt,
++		      ...)
++{
++	va_list ap;
++
++	va_start(ap, fmt);
++	_dev_error(dev, err, fmt, ap);
++	va_end(ap);
++}
++EXPORT_SYMBOL_GPL(xenbus_dev_error);
++
++
++void xenbus_dev_fatal(struct xenbus_device *dev, int err, const char *fmt,
++		      ...)
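++/*
++ * Like xenbus_dev_error(), but the device is also switched to
++ * XenbusStateClosing: a fatal error begins teardown of the connection
++ * (see e.g. the "writing 'ready'" failure path in tpmback above).
++ */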
++{
++	va_list ap;
++
++	va_start(ap, fmt);
++	_dev_error(dev, err, fmt, ap);
++	va_end(ap);
++
++	xenbus_switch_state(dev, XenbusStateClosing);
++}
++EXPORT_SYMBOL_GPL(xenbus_dev_fatal);
++
++
++int xenbus_grant_ring(struct xenbus_device *dev, unsigned long ring_mfn)
++{
++	int err = gnttab_grant_foreign_access(dev->otherend_id, ring_mfn, 0);
++	if (err < 0)
++		xenbus_dev_fatal(dev, err, "granting access to ring page");
++	return err;
++}
++EXPORT_SYMBOL_GPL(xenbus_grant_ring);
++
++
++int xenbus_alloc_evtchn(struct xenbus_device *dev, int *port)
++{
++	struct evtchn_alloc_unbound alloc_unbound;
++	int err;
++
++	alloc_unbound.dom = DOMID_SELF;
++	alloc_unbound.remote_dom = dev->otherend_id;
++
++	err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound,
++					  &alloc_unbound);
++	if (err)
++		xenbus_dev_fatal(dev, err, "allocating event channel");
++	else
++		*port = alloc_unbound.port;
++
++	return err;
++}
++EXPORT_SYMBOL_GPL(xenbus_alloc_evtchn);
++
++
++int xenbus_free_evtchn(struct xenbus_device *dev, int port)
++{
++	struct evtchn_close close;
++	int err;
++
++	close.port = port;
++
++	err = HYPERVISOR_event_channel_op(EVTCHNOP_close, &close);
++	if (err)
++		xenbus_dev_error(dev, err, "freeing event channel %d", port);
++
++	return err;
++}
++EXPORT_SYMBOL_GPL(xenbus_free_evtchn);
++
++
++enum xenbus_state xenbus_read_driver_state(const char *path)
++{
++	enum xenbus_state result;
++	int err = xenbus_gather(XBT_NIL, path, "state", "%d", &result, NULL);
++	if (err)
++		result = XenbusStateUnknown;
++
++	return result;
++}
++EXPORT_SYMBOL_GPL(xenbus_read_driver_state);
+--- /dev/null	1970-01-01 00:00:00.000000000 +0000
++++ b/drivers/xen/xenbus/xenbus_comms.c	2007-08-27 14:02:03.000000000 -0400
+@@ -0,0 +1,232 @@
++/******************************************************************************
++ * xenbus_comms.c
++ *
++ * Low level code to talk to Xen Store: ringbuffer and event channel.
++ *
++ * Copyright (C) 2005 Rusty Russell, IBM Corporation
++ *
++ * This program is free software; you can redistribute it and/or
++ * modify it under the terms of the GNU General Public License version 2
++ * as published by the Free Software Foundation; or, when distributed
++ * separately from the Linux kernel or incorporated into other
++ * software packages, subject to the following license:
++ *
++ * Permission is hereby granted, free of charge, to any person obtaining a copy
++ * of this source file (the "Software"), to deal in the Software without
++ * restriction, including without limitation the rights to use, copy, modify,
++ * merge, publish, distribute, sublicense, and/or sell copies of the Software,
++ * and to permit persons to whom the Software is furnished to do so, subject to
++ * the following conditions:
++ *
++ * The above copyright notice and this permission notice shall be included in
++ * all copies or substantial portions of the Software.
++ *
++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
++ * IN THE SOFTWARE.
++ */ ++ ++#include <linux/wait.h> ++#include <linux/interrupt.h> ++#include <linux/sched.h> ++#include <linux/err.h> ++#include <linux/ptrace.h> ++#include <xen/evtchn.h> ++#include <xen/xenbus.h> ++ ++#include <asm/hypervisor.h> ++ ++#include "xenbus_comms.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++static int xenbus_irq; ++ ++extern void xenbus_probe(void *); ++extern int xenstored_ready; ++static DECLARE_WORK(probe_work, xenbus_probe, NULL); ++ ++static DECLARE_WAIT_QUEUE_HEAD(xb_waitq); ++ ++static irqreturn_t wake_waiting(int irq, void *unused, struct pt_regs *regs) ++{ ++ if (unlikely(xenstored_ready == 0)) { ++ xenstored_ready = 1; ++ schedule_work(&probe_work); ++ } ++ ++ wake_up(&xb_waitq); ++ return IRQ_HANDLED; ++} ++ ++static int check_indexes(XENSTORE_RING_IDX cons, XENSTORE_RING_IDX prod) ++{ ++ return ((prod - cons) <= XENSTORE_RING_SIZE); ++} ++ ++static void *get_output_chunk(XENSTORE_RING_IDX cons, ++ XENSTORE_RING_IDX prod, ++ char *buf, uint32_t *len) ++{ ++ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(prod); ++ if ((XENSTORE_RING_SIZE - (prod - cons)) < *len) ++ *len = XENSTORE_RING_SIZE - (prod - cons); ++ return buf + MASK_XENSTORE_IDX(prod); ++} ++ ++static const void *get_input_chunk(XENSTORE_RING_IDX cons, ++ XENSTORE_RING_IDX prod, ++ const char *buf, uint32_t *len) ++{ ++ *len = XENSTORE_RING_SIZE - MASK_XENSTORE_IDX(cons); ++ if ((prod - cons) < *len) ++ *len = prod - cons; ++ return buf + MASK_XENSTORE_IDX(cons); ++} ++ ++int xb_write(const void *data, unsigned len) ++{ ++ struct xenstore_domain_interface *intf = xen_store_interface; ++ XENSTORE_RING_IDX cons, prod; ++ int rc; ++ ++ while (len != 0) { ++ void *dst; ++ unsigned int avail; ++ ++ rc = wait_event_interruptible( ++ xb_waitq, ++ (intf->req_prod - intf->req_cons) != ++ XENSTORE_RING_SIZE); ++ if (rc < 0) ++ return rc; ++ ++ /* Read indexes, then verify. */ ++ cons = intf->req_cons; ++ prod = intf->req_prod; ++ if (!check_indexes(cons, prod)) { ++ intf->req_cons = intf->req_prod = 0; ++ return -EIO; ++ } ++ ++ dst = get_output_chunk(cons, prod, intf->req, &avail); ++ if (avail == 0) ++ continue; ++ if (avail > len) ++ avail = len; ++ ++ /* Must write data /after/ reading the consumer index. */ ++ mb(); ++ ++ memcpy(dst, data, avail); ++ data += avail; ++ len -= avail; ++ ++ /* Other side must not see new producer until data is there. */ ++ wmb(); ++ intf->req_prod += avail; ++ ++ /* Implies mb(): other side will see the updated producer. */ ++ notify_remote_via_evtchn(xen_store_evtchn); ++ } ++ ++ return 0; ++} ++ ++int xb_data_to_read(void) ++{ ++ struct xenstore_domain_interface *intf = xen_store_interface; ++ return (intf->rsp_cons != intf->rsp_prod); ++} ++ ++int xb_wait_for_data_to_read(void) ++{ ++ return wait_event_interruptible(xb_waitq, xb_data_to_read()); ++} ++ ++int xb_read(void *data, unsigned len) ++{ ++ struct xenstore_domain_interface *intf = xen_store_interface; ++ XENSTORE_RING_IDX cons, prod; ++ int rc; ++ ++ while (len != 0) { ++ unsigned int avail; ++ const char *src; ++ ++ rc = xb_wait_for_data_to_read(); ++ if (rc < 0) ++ return rc; ++ ++ /* Read indexes, then verify. */ ++ cons = intf->rsp_cons; ++ prod = intf->rsp_prod; ++ if (!check_indexes(cons, prod)) { ++ intf->rsp_cons = intf->rsp_prod = 0; ++ return -EIO; ++ } ++ ++ src = get_input_chunk(cons, prod, intf->rsp, &avail); ++ if (avail == 0) ++ continue; ++ if (avail > len) ++ avail = len; ++ ++ /* Must read data /after/ reading the producer index. 
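++		   The rmb() pairs with the remote end's write barrier before
++		   it advances rsp_prod, mirroring the wmb() in xb_write().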
*/ ++ rmb(); ++ ++ memcpy(data, src, avail); ++ data += avail; ++ len -= avail; ++ ++ /* Other side must not see free space until we've copied out */ ++ mb(); ++ intf->rsp_cons += avail; ++ ++ pr_debug("Finished read of %i bytes (%i to go)\n", avail, len); ++ ++ /* Implies mb(): other side will see the updated consumer. */ ++ notify_remote_via_evtchn(xen_store_evtchn); ++ } ++ ++ return 0; ++} ++ ++/* Set up interrupt handler off store event channel. */ ++int xb_init_comms(void) ++{ ++ struct xenstore_domain_interface *intf = xen_store_interface; ++ int err; ++ ++ if (intf->req_prod != intf->req_cons) ++ printk(KERN_ERR "XENBUS request ring is not quiescent " ++ "(%08x:%08x)!\n", intf->req_cons, intf->req_prod); ++ ++ if (intf->rsp_prod != intf->rsp_cons) { ++ printk(KERN_WARNING "XENBUS response ring is not quiescent " ++ "(%08x:%08x): fixing up\n", ++ intf->rsp_cons, intf->rsp_prod); ++ intf->rsp_cons = intf->rsp_prod; ++ } ++ ++ if (xenbus_irq) ++ unbind_from_irqhandler(xenbus_irq, &xb_waitq); ++ ++ err = bind_caller_port_to_irqhandler( ++ xen_store_evtchn, wake_waiting, ++ 0, "xenbus", &xb_waitq); ++ if (err <= 0) { ++ printk(KERN_ERR "XENBUS request irq failed %i\n", err); ++ return err; ++ } ++ ++ xenbus_irq = err; ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_comms.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,46 @@ ++/* ++ * Private include for xenbus communications. ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef _XENBUS_COMMS_H ++#define _XENBUS_COMMS_H ++ ++int xs_init(void); ++int xb_init_comms(void); ++ ++/* Low level routines. 
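++   xb_read()/xb_write() move raw bytes over the shared ring; message
++   framing on top of them lives in the xs layer (xenbus_xs.c).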
*/ ++int xb_write(const void *data, unsigned len); ++int xb_read(void *data, unsigned len); ++int xb_data_to_read(void); ++int xb_wait_for_data_to_read(void); ++int xs_input_avail(void); ++extern struct xenstore_domain_interface *xen_store_interface; ++extern int xen_store_evtchn; ++ ++#endif /* _XENBUS_COMMS_H */ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_dev.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,404 @@ ++/* ++ * xenbus_dev.c ++ * ++ * Driver giving user-space access to the kernel's xenbus connection ++ * to xenstore. ++ * ++ * Copyright (c) 2005, Christian Limpach ++ * Copyright (c) 2005, Rusty Russell, IBM Corporation ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/kernel.h> ++#include <linux/errno.h> ++#include <linux/uio.h> ++#include <linux/notifier.h> ++#include <linux/wait.h> ++#include <linux/fs.h> ++#include <linux/poll.h> ++#include <linux/mutex.h> ++ ++#include "xenbus_comms.h" ++ ++#include <asm/uaccess.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/xen_proc.h> ++#include <asm/hypervisor.h> ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++struct xenbus_dev_transaction { ++ struct list_head list; ++ struct xenbus_transaction handle; ++}; ++ ++struct read_buffer { ++ struct list_head list; ++ unsigned int cons; ++ unsigned int len; ++ char msg[]; ++}; ++ ++struct xenbus_dev_data { ++ /* In-progress transaction. */ ++ struct list_head transactions; ++ ++ /* Active watches. */ ++ struct list_head watches; ++ ++ /* Partial request. */ ++ unsigned int len; ++ union { ++ struct xsd_sockmsg msg; ++ char buffer[PAGE_SIZE]; ++ } u; ++ ++ /* Response queue. 
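++	   Replies and watch events are buffered here until the process
++	   reads them back via xenbus_dev_read().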
*/ ++ struct list_head read_buffers; ++ wait_queue_head_t read_waitq; ++ ++ struct mutex reply_mutex; ++}; ++ ++static struct proc_dir_entry *xenbus_dev_intf; ++ ++static ssize_t xenbus_dev_read(struct file *filp, ++ char __user *ubuf, ++ size_t len, loff_t *ppos) ++{ ++ struct xenbus_dev_data *u = filp->private_data; ++ struct read_buffer *rb; ++ int i, ret; ++ ++ mutex_lock(&u->reply_mutex); ++ while (list_empty(&u->read_buffers)) { ++ mutex_unlock(&u->reply_mutex); ++ ret = wait_event_interruptible(u->read_waitq, ++ !list_empty(&u->read_buffers)); ++ if (ret) ++ return ret; ++ mutex_lock(&u->reply_mutex); ++ } ++ ++ rb = list_entry(u->read_buffers.next, struct read_buffer, list); ++ for (i = 0; i < len;) { ++ put_user(rb->msg[rb->cons], ubuf + i); ++ i++; ++ rb->cons++; ++ if (rb->cons == rb->len) { ++ list_del(&rb->list); ++ kfree(rb); ++ if (list_empty(&u->read_buffers)) ++ break; ++ rb = list_entry(u->read_buffers.next, ++ struct read_buffer, list); ++ } ++ } ++ mutex_unlock(&u->reply_mutex); ++ ++ return i; ++} ++ ++static void queue_reply(struct xenbus_dev_data *u, ++ char *data, unsigned int len) ++{ ++ struct read_buffer *rb; ++ ++ if (len == 0) ++ return; ++ ++ rb = kmalloc(sizeof(*rb) + len, GFP_KERNEL); ++ BUG_ON(rb == NULL); ++ ++ rb->cons = 0; ++ rb->len = len; ++ ++ memcpy(rb->msg, data, len); ++ ++ list_add_tail(&rb->list, &u->read_buffers); ++ ++ wake_up(&u->read_waitq); ++} ++ ++struct watch_adapter ++{ ++ struct list_head list; ++ struct xenbus_watch watch; ++ struct xenbus_dev_data *dev_data; ++ char *token; ++}; ++ ++static void free_watch_adapter (struct watch_adapter *watch) ++{ ++ kfree(watch->watch.node); ++ kfree(watch->token); ++ kfree(watch); ++} ++ ++static void watch_fired(struct xenbus_watch *watch, ++ const char **vec, ++ unsigned int len) ++{ ++ struct watch_adapter *adap = ++ container_of(watch, struct watch_adapter, watch); ++ struct xsd_sockmsg hdr; ++ const char *path, *token; ++ int path_len, tok_len, body_len; ++ ++ path = vec[XS_WATCH_PATH]; ++ token = adap->token; ++ ++ path_len = strlen(path) + 1; ++ tok_len = strlen(token) + 1; ++ body_len = path_len + tok_len; ++ ++ hdr.type = XS_WATCH_EVENT; ++ hdr.len = body_len; ++ ++ mutex_lock(&adap->dev_data->reply_mutex); ++ queue_reply(adap->dev_data, (char *)&hdr, sizeof(hdr)); ++ queue_reply(adap->dev_data, (char *)path, path_len); ++ queue_reply(adap->dev_data, (char *)token, tok_len); ++ mutex_unlock(&adap->dev_data->reply_mutex); ++} ++ ++static LIST_HEAD(watch_list); ++ ++static ssize_t xenbus_dev_write(struct file *filp, ++ const char __user *ubuf, ++ size_t len, loff_t *ppos) ++{ ++ struct xenbus_dev_data *u = filp->private_data; ++ struct xenbus_dev_transaction *trans = NULL; ++ uint32_t msg_type; ++ void *reply; ++ char *path, *token; ++ struct watch_adapter *watch, *tmp_watch; ++ int err, rc = len; ++ ++ if ((len + u->len) > sizeof(u->u.buffer)) { ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (copy_from_user(u->u.buffer + u->len, ubuf, len) != 0) { ++ rc = -EFAULT; ++ goto out; ++ } ++ ++ u->len += len; ++ if ((u->len < sizeof(u->u.msg)) || ++ (u->len < (sizeof(u->u.msg) + u->u.msg.len))) ++ return rc; ++ ++ msg_type = u->u.msg.type; ++ ++ switch (msg_type) { ++ case XS_TRANSACTION_START: ++ case XS_TRANSACTION_END: ++ case XS_DIRECTORY: ++ case XS_READ: ++ case XS_GET_PERMS: ++ case XS_RELEASE: ++ case XS_GET_DOMAIN_PATH: ++ case XS_WRITE: ++ case XS_MKDIR: ++ case XS_RM: ++ case XS_SET_PERMS: ++ if (msg_type == XS_TRANSACTION_START) { ++ trans = kmalloc(sizeof(*trans), GFP_KERNEL); ++ if 
(!trans) { ++ rc = -ENOMEM; ++ goto out; ++ } ++ } ++ ++ reply = xenbus_dev_request_and_reply(&u->u.msg); ++ if (IS_ERR(reply)) { ++ kfree(trans); ++ rc = PTR_ERR(reply); ++ goto out; ++ } ++ ++ if (msg_type == XS_TRANSACTION_START) { ++ trans->handle.id = simple_strtoul(reply, NULL, 0); ++ list_add(&trans->list, &u->transactions); ++ } else if (msg_type == XS_TRANSACTION_END) { ++ list_for_each_entry(trans, &u->transactions, list) ++ if (trans->handle.id == u->u.msg.tx_id) ++ break; ++ BUG_ON(&trans->list == &u->transactions); ++ list_del(&trans->list); ++ kfree(trans); ++ } ++ mutex_lock(&u->reply_mutex); ++ queue_reply(u, (char *)&u->u.msg, sizeof(u->u.msg)); ++ queue_reply(u, (char *)reply, u->u.msg.len); ++ mutex_unlock(&u->reply_mutex); ++ kfree(reply); ++ break; ++ ++ case XS_WATCH: ++ case XS_UNWATCH: { ++ static const char *XS_RESP = "OK"; ++ struct xsd_sockmsg hdr; ++ ++ path = u->u.buffer + sizeof(u->u.msg); ++ token = memchr(path, 0, u->u.msg.len); ++ if (token == NULL) { ++ rc = -EILSEQ; ++ goto out; ++ } ++ token++; ++ ++ if (msg_type == XS_WATCH) { ++ watch = kmalloc(sizeof(*watch), GFP_KERNEL); ++ watch->watch.node = kmalloc(strlen(path)+1, ++ GFP_KERNEL); ++ strcpy((char *)watch->watch.node, path); ++ watch->watch.callback = watch_fired; ++ watch->token = kmalloc(strlen(token)+1, GFP_KERNEL); ++ strcpy(watch->token, token); ++ watch->dev_data = u; ++ ++ err = register_xenbus_watch(&watch->watch); ++ if (err) { ++ free_watch_adapter(watch); ++ rc = err; ++ goto out; ++ } ++ ++ list_add(&watch->list, &u->watches); ++ } else { ++ list_for_each_entry_safe(watch, tmp_watch, ++ &u->watches, list) { ++ if (!strcmp(watch->token, token) && ++ !strcmp(watch->watch.node, path)) ++ { ++ unregister_xenbus_watch(&watch->watch); ++ list_del(&watch->list); ++ free_watch_adapter(watch); ++ break; ++ } ++ } ++ } ++ ++ hdr.type = msg_type; ++ hdr.len = strlen(XS_RESP) + 1; ++ mutex_lock(&u->reply_mutex); ++ queue_reply(u, (char *)&hdr, sizeof(hdr)); ++ queue_reply(u, (char *)XS_RESP, hdr.len); ++ mutex_unlock(&u->reply_mutex); ++ break; ++ } ++ ++ default: ++ rc = -EINVAL; ++ break; ++ } ++ ++ out: ++ u->len = 0; ++ return rc; ++} ++ ++static int xenbus_dev_open(struct inode *inode, struct file *filp) ++{ ++ struct xenbus_dev_data *u; ++ ++ if (xen_store_evtchn == 0) ++ return -ENOENT; ++ ++ nonseekable_open(inode, filp); ++ ++ u = kzalloc(sizeof(*u), GFP_KERNEL); ++ if (u == NULL) ++ return -ENOMEM; ++ ++ INIT_LIST_HEAD(&u->transactions); ++ INIT_LIST_HEAD(&u->watches); ++ INIT_LIST_HEAD(&u->read_buffers); ++ init_waitqueue_head(&u->read_waitq); ++ ++ mutex_init(&u->reply_mutex); ++ ++ filp->private_data = u; ++ ++ return 0; ++} ++ ++static int xenbus_dev_release(struct inode *inode, struct file *filp) ++{ ++ struct xenbus_dev_data *u = filp->private_data; ++ struct xenbus_dev_transaction *trans, *tmp; ++ struct watch_adapter *watch, *tmp_watch; ++ ++ list_for_each_entry_safe(trans, tmp, &u->transactions, list) { ++ xenbus_transaction_end(trans->handle, 1); ++ list_del(&trans->list); ++ kfree(trans); ++ } ++ ++ list_for_each_entry_safe(watch, tmp_watch, &u->watches, list) { ++ unregister_xenbus_watch(&watch->watch); ++ list_del(&watch->list); ++ free_watch_adapter(watch); ++ } ++ ++ kfree(u); ++ ++ return 0; ++} ++ ++static unsigned int xenbus_dev_poll(struct file *file, poll_table *wait) ++{ ++ struct xenbus_dev_data *u = file->private_data; ++ ++ poll_wait(file, &u->read_waitq, wait); ++ if (!list_empty(&u->read_buffers)) ++ return POLLIN | POLLRDNORM; ++ return 0; ++} ++ ++static const 
struct file_operations xenbus_dev_file_ops = { ++ .read = xenbus_dev_read, ++ .write = xenbus_dev_write, ++ .open = xenbus_dev_open, ++ .release = xenbus_dev_release, ++ .poll = xenbus_dev_poll, ++}; ++ ++int xenbus_dev_init(void) ++{ ++ xenbus_dev_intf = create_xen_proc_entry("xenbus", 0400); ++ if (xenbus_dev_intf) ++ xenbus_dev_intf->proc_fops = &xenbus_dev_file_ops; ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_probe.c 2007-08-27 14:02:08.000000000 -0400 +@@ -0,0 +1,1086 @@ ++/****************************************************************************** ++ * Talks to Xen Store to figure out what devices we have. ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard ++ * Copyright (C) 2005, 2006 XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __FUNCTION__, __LINE__, ##args) ++ ++#include <linux/kernel.h> ++#include <linux/err.h> ++#include <linux/string.h> ++#include <linux/ctype.h> ++#include <linux/fcntl.h> ++#include <linux/mm.h> ++#include <linux/notifier.h> ++#include <linux/kthread.h> ++#include <linux/mutex.h> ++ ++#include <asm/io.h> ++#include <asm/page.h> ++#include <asm/maddr.h> ++#include <asm/pgtable.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/xen_proc.h> ++#include <xen/evtchn.h> ++#include <xen/features.h> ++#include <xen/hvm.h> ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++int xen_store_evtchn; ++struct xenstore_domain_interface *xen_store_interface; ++static unsigned long xen_store_mfn; ++ ++extern struct mutex xenwatch_mutex; ++ ++static ATOMIC_NOTIFIER_HEAD(xenstore_chain); ++ ++static void wait_for_devices(struct xenbus_driver *xendrv); ++ ++static int xenbus_probe_frontend(const char *type, const char *name); ++ ++static void xenbus_dev_shutdown(struct device *_dev); ++ ++/* If something in array of ids matches this device, return it. 
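++   Matching is on the devicetype string, i.e. the <type> component of
++   the device/<type>/<id> node that the device was probed from.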
*/ ++static const struct xenbus_device_id * ++match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) ++{ ++ for (; *arr->devicetype != '\0'; arr++) { ++ if (!strcmp(arr->devicetype, dev->devicetype)) ++ return arr; ++ } ++ return NULL; ++} ++ ++int xenbus_match(struct device *_dev, struct device_driver *_drv) ++{ ++ struct xenbus_driver *drv = to_xenbus_driver(_drv); ++ ++ if (!drv->ids) ++ return 0; ++ ++ return match_device(drv->ids, to_xenbus_device(_dev)) != NULL; ++} ++ ++/* device/<type>/<id> => <type>-<id> */ ++static int frontend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) ++{ ++ nodename = strchr(nodename, '/'); ++ if (!nodename || strlen(nodename + 1) >= BUS_ID_SIZE) { ++ printk(KERN_WARNING "XENBUS: bad frontend %s\n", nodename); ++ return -EINVAL; ++ } ++ ++ strlcpy(bus_id, nodename + 1, BUS_ID_SIZE); ++ if (!strchr(bus_id, '/')) { ++ printk(KERN_WARNING "XENBUS: bus_id %s no slash\n", bus_id); ++ return -EINVAL; ++ } ++ *strchr(bus_id, '/') = '-'; ++ return 0; ++} ++ ++ ++static void free_otherend_details(struct xenbus_device *dev) ++{ ++ kfree(dev->otherend); ++ dev->otherend = NULL; ++} ++ ++ ++static void free_otherend_watch(struct xenbus_device *dev) ++{ ++ if (dev->otherend_watch.node) { ++ unregister_xenbus_watch(&dev->otherend_watch); ++ kfree(dev->otherend_watch.node); ++ dev->otherend_watch.node = NULL; ++ } ++} ++ ++ ++int read_otherend_details(struct xenbus_device *xendev, ++ char *id_node, char *path_node) ++{ ++ int err = xenbus_gather(XBT_NIL, xendev->nodename, ++ id_node, "%i", &xendev->otherend_id, ++ path_node, NULL, &xendev->otherend, ++ NULL); ++ if (err) { ++ xenbus_dev_fatal(xendev, err, ++ "reading other end details from %s", ++ xendev->nodename); ++ return err; ++ } ++ if (strlen(xendev->otherend) == 0 || ++ !xenbus_exists(XBT_NIL, xendev->otherend, "")) { ++ xenbus_dev_fatal(xendev, -ENOENT, ++ "unable to read other end from %s. " ++ "missing or inaccessible.", ++ xendev->nodename); ++ free_otherend_details(xendev); ++ return -ENOENT; ++ } ++ ++ return 0; ++} ++ ++ ++static int read_backend_details(struct xenbus_device *xendev) ++{ ++ return read_otherend_details(xendev, "backend-id", "backend"); ++} ++ ++ ++/* Bus type for frontend drivers. */ ++static struct xen_bus_type xenbus_frontend = { ++ .root = "device", ++ .levels = 2, /* device/type/<id> */ ++ .get_bus_id = frontend_bus_id, ++ .probe = xenbus_probe_frontend, ++ .bus = { ++ .name = "xen", ++ .match = xenbus_match, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,16) ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++ .shutdown = xenbus_dev_shutdown, ++#endif ++ }, ++ .dev = { ++ .bus_id = "xen", ++ }, ++}; ++ ++static void otherend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ struct xenbus_device *dev = ++ container_of(watch, struct xenbus_device, otherend_watch); ++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); ++ enum xenbus_state state; ++ ++ /* Protect us against watches firing on old details when the otherend ++ details change, say immediately after a resume. */ ++ if (!dev->otherend || ++ strncmp(dev->otherend, vec[XS_WATCH_PATH], ++ strlen(dev->otherend))) { ++ DPRINTK("Ignoring watch at %s", vec[XS_WATCH_PATH]); ++ return; ++ } ++ ++ state = xenbus_read_driver_state(dev->otherend); ++ ++ DPRINTK("state is %d (%s), %s, %s", state, xenbus_strstate(state), ++ dev->otherend_watch.node, vec[XS_WATCH_PATH]); ++ ++ /* ++ * Ignore xenbus transitions during shutdown. 
This prevents us doing ++ * work that can fail e.g., when the rootfs is gone. ++ */ ++ if (system_state > SYSTEM_RUNNING) { ++ struct xen_bus_type *bus = bus; ++ bus = container_of(dev->dev.bus, struct xen_bus_type, bus); ++ /* If we're frontend, drive the state machine to Closed. */ ++ /* This should cause the backend to release our resources. */ ++ if ((bus == &xenbus_frontend) && (state == XenbusStateClosing)) ++ xenbus_frontend_closed(dev); ++ return; ++ } ++ ++ if (drv->otherend_changed) ++ drv->otherend_changed(dev, state); ++} ++ ++ ++static int talk_to_otherend(struct xenbus_device *dev) ++{ ++ struct xenbus_driver *drv = to_xenbus_driver(dev->dev.driver); ++ ++ free_otherend_watch(dev); ++ free_otherend_details(dev); ++ ++ return drv->read_otherend_details(dev); ++} ++ ++ ++static int watch_otherend(struct xenbus_device *dev) ++{ ++ return xenbus_watch_path2(dev, dev->otherend, "state", ++ &dev->otherend_watch, otherend_changed); ++} ++ ++ ++int xenbus_dev_probe(struct device *_dev) ++{ ++ struct xenbus_device *dev = to_xenbus_device(_dev); ++ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); ++ const struct xenbus_device_id *id; ++ int err; ++ ++ DPRINTK("%s", dev->nodename); ++ ++ if (!drv->probe) { ++ err = -ENODEV; ++ goto fail; ++ } ++ ++ id = match_device(drv->ids, dev); ++ if (!id) { ++ err = -ENODEV; ++ goto fail; ++ } ++ ++ err = talk_to_otherend(dev); ++ if (err) { ++ printk(KERN_WARNING ++ "xenbus_probe: talk_to_otherend on %s failed.\n", ++ dev->nodename); ++ return err; ++ } ++ ++ err = drv->probe(dev, id); ++ if (err) ++ goto fail; ++ ++ err = watch_otherend(dev); ++ if (err) { ++ printk(KERN_WARNING ++ "xenbus_probe: watch_otherend on %s failed.\n", ++ dev->nodename); ++ return err; ++ } ++ ++ return 0; ++fail: ++ xenbus_dev_error(dev, err, "xenbus_dev_probe on %s", dev->nodename); ++ xenbus_switch_state(dev, XenbusStateClosed); ++ return -ENODEV; ++} ++ ++int xenbus_dev_remove(struct device *_dev) ++{ ++ struct xenbus_device *dev = to_xenbus_device(_dev); ++ struct xenbus_driver *drv = to_xenbus_driver(_dev->driver); ++ ++ DPRINTK("%s", dev->nodename); ++ ++ free_otherend_watch(dev); ++ free_otherend_details(dev); ++ ++ if (drv->remove) ++ drv->remove(dev); ++ ++ xenbus_switch_state(dev, XenbusStateClosed); ++ return 0; ++} ++ ++static void xenbus_dev_shutdown(struct device *_dev) ++{ ++ struct xenbus_device *dev = to_xenbus_device(_dev); ++ unsigned long timeout = 5*HZ; ++ ++ DPRINTK("%s", dev->nodename); ++ ++ get_device(&dev->dev); ++ if (dev->state != XenbusStateConnected) { ++ printk("%s: %s: %s != Connected, skipping\n", __FUNCTION__, ++ dev->nodename, xenbus_strstate(dev->state)); ++ goto out; ++ } ++ xenbus_switch_state(dev, XenbusStateClosing); ++ timeout = wait_for_completion_timeout(&dev->down, timeout); ++ if (!timeout) ++ printk("%s: %s timeout closing device\n", __FUNCTION__, dev->nodename); ++ out: ++ put_device(&dev->dev); ++} ++ ++int xenbus_register_driver_common(struct xenbus_driver *drv, ++ struct xen_bus_type *bus) ++{ ++ int ret; ++ ++ if (bus->error) ++ return bus->error; ++ ++ drv->driver.name = drv->name; ++ drv->driver.bus = &bus->bus; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ drv->driver.owner = drv->owner; ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) ++ drv->driver.probe = xenbus_dev_probe; ++ drv->driver.remove = xenbus_dev_remove; ++ drv->driver.shutdown = xenbus_dev_shutdown; ++#endif ++ ++ mutex_lock(&xenwatch_mutex); ++ ret = driver_register(&drv->driver); ++ mutex_unlock(&xenwatch_mutex); ++ 
return ret; ++} ++ ++int xenbus_register_frontend(struct xenbus_driver *drv) ++{ ++ int ret; ++ ++ drv->read_otherend_details = read_backend_details; ++ ++ ret = xenbus_register_driver_common(drv, &xenbus_frontend); ++ if (ret) ++ return ret; ++ ++ /* If this driver is loaded as a module wait for devices to attach. */ ++ wait_for_devices(drv); ++ ++ return 0; ++} ++EXPORT_SYMBOL_GPL(xenbus_register_frontend); ++ ++void xenbus_unregister_driver(struct xenbus_driver *drv) ++{ ++ driver_unregister(&drv->driver); ++} ++EXPORT_SYMBOL_GPL(xenbus_unregister_driver); ++ ++struct xb_find_info ++{ ++ struct xenbus_device *dev; ++ const char *nodename; ++}; ++ ++static int cmp_dev(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct xb_find_info *info = data; ++ ++ if (!strcmp(xendev->nodename, info->nodename)) { ++ info->dev = xendev; ++ get_device(dev); ++ return 1; ++ } ++ return 0; ++} ++ ++struct xenbus_device *xenbus_device_find(const char *nodename, ++ struct bus_type *bus) ++{ ++ struct xb_find_info info = { .dev = NULL, .nodename = nodename }; ++ ++ bus_for_each_dev(bus, NULL, &info, cmp_dev); ++ return info.dev; ++} ++ ++static int cleanup_dev(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct xb_find_info *info = data; ++ int len = strlen(info->nodename); ++ ++ DPRINTK("%s", info->nodename); ++ ++ /* Match the info->nodename path, or any subdirectory of that path. */ ++ if (strncmp(xendev->nodename, info->nodename, len)) ++ return 0; ++ ++ /* If the node name is longer, ensure it really is a subdirectory. */ ++ if ((strlen(xendev->nodename) > len) && (xendev->nodename[len] != '/')) ++ return 0; ++ ++ info->dev = xendev; ++ get_device(dev); ++ return 1; ++} ++ ++static void xenbus_cleanup_devices(const char *path, struct bus_type *bus) ++{ ++ struct xb_find_info info = { .nodename = path }; ++ ++ do { ++ info.dev = NULL; ++ bus_for_each_dev(bus, NULL, &info, cleanup_dev); ++ if (info.dev) { ++ device_unregister(&info.dev->dev); ++ put_device(&info.dev->dev); ++ } ++ } while (info.dev); ++} ++ ++static void xenbus_dev_release(struct device *dev) ++{ ++ if (dev) ++ kfree(to_xenbus_device(dev)); ++} ++ ++static ssize_t xendev_show_nodename(struct device *dev, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) ++ struct device_attribute *attr, ++#endif ++ char *buf) ++{ ++ return sprintf(buf, "%s\n", to_xenbus_device(dev)->nodename); ++} ++DEVICE_ATTR(nodename, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_nodename, NULL); ++ ++static ssize_t xendev_show_devtype(struct device *dev, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13) ++ struct device_attribute *attr, ++#endif ++ char *buf) ++{ ++ return sprintf(buf, "%s\n", to_xenbus_device(dev)->devicetype); ++} ++DEVICE_ATTR(devtype, S_IRUSR | S_IRGRP | S_IROTH, xendev_show_devtype, NULL); ++ ++ ++int xenbus_probe_node(struct xen_bus_type *bus, ++ const char *type, ++ const char *nodename) ++{ ++ int err; ++ struct xenbus_device *xendev; ++ size_t stringlen; ++ char *tmpstring; ++ ++ enum xenbus_state state = xenbus_read_driver_state(nodename); ++ ++ if (bus->error) ++ return bus->error; ++ ++ if (state != XenbusStateInitialising) { ++ /* Device is not new, so ignore it. This can happen if a ++ device is going away after switching to Closed. 
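++		   Only a node still in Initialising represents a new
++		   device that needs enumerating.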
*/
++		return 0;
++	}
++
++	stringlen = strlen(nodename) + 1 + strlen(type) + 1;
++	xendev = kzalloc(sizeof(*xendev) + stringlen, GFP_KERNEL);
++	if (!xendev)
++		return -ENOMEM;
++
++	xendev->state = XenbusStateInitialising;
++
++	/* Copy the strings into the extra space. */
++
++	tmpstring = (char *)(xendev + 1);
++	strcpy(tmpstring, nodename);
++	xendev->nodename = tmpstring;
++
++	tmpstring += strlen(tmpstring) + 1;
++	strcpy(tmpstring, type);
++	xendev->devicetype = tmpstring;
++	init_completion(&xendev->down);
++
++	xendev->dev.parent = &bus->dev;
++	xendev->dev.bus = &bus->bus;
++	xendev->dev.release = xenbus_dev_release;
++
++	err = bus->get_bus_id(xendev->dev.bus_id, xendev->nodename);
++	if (err)
++		goto fail;
++
++	/* Register with generic device framework. */
++	err = device_register(&xendev->dev);
++	if (err)
++		goto fail;
++
++	err = device_create_file(&xendev->dev, &dev_attr_nodename);
++	if (err)
++		goto unregister;
++	err = device_create_file(&xendev->dev, &dev_attr_devtype);
++	if (err)
++		goto unregister;
++
++	return 0;
++unregister:
++	device_remove_file(&xendev->dev, &dev_attr_nodename);
++	device_remove_file(&xendev->dev, &dev_attr_devtype);
++	device_unregister(&xendev->dev);
++	/* the release() callback frees xendev; don't fall through and
++	 * kfree() it a second time */
++	return err;
++fail:
++	kfree(xendev);
++	return err;
++}
++
++/* device/<typename>/<name> */
++static int xenbus_probe_frontend(const char *type, const char *name)
++{
++	char *nodename;
++	int err;
++
++	nodename = kasprintf(GFP_KERNEL, "%s/%s/%s",
++			     xenbus_frontend.root, type, name);
++	if (!nodename)
++		return -ENOMEM;
++
++	DPRINTK("%s", nodename);
++
++	err = xenbus_probe_node(&xenbus_frontend, type, nodename);
++	kfree(nodename);
++	return err;
++}
++
++static int xenbus_probe_device_type(struct xen_bus_type *bus, const char *type)
++{
++	int err = 0;
++	char **dir;
++	unsigned int dir_n = 0;
++	int i;
++
++	dir = xenbus_directory(XBT_NIL, bus->root, type, &dir_n);
++	if (IS_ERR(dir))
++		return PTR_ERR(dir);
++
++	for (i = 0; i < dir_n; i++) {
++		err = bus->probe(type, dir[i]);
++		if (err)
++			break;
++	}
++	kfree(dir);
++	return err;
++}
++
++int xenbus_probe_devices(struct xen_bus_type *bus)
++{
++	int err = 0;
++	char **dir;
++	unsigned int i, dir_n;
++
++	if (bus->error)
++		return bus->error;
++
++	dir = xenbus_directory(XBT_NIL, bus->root, "", &dir_n);
++	if (IS_ERR(dir))
++		return PTR_ERR(dir);
++
++	for (i = 0; i < dir_n; i++) {
++		err = xenbus_probe_device_type(bus, dir[i]);
++		if (err)
++			break;
++	}
++	kfree(dir);
++	return err;
++}
++
++static unsigned int char_count(const char *str, char c)
++{
++	unsigned int i, ret = 0;
++
++	for (i = 0; str[i]; i++)
++		if (str[i] == c)
++			ret++;
++	return ret;
++}
++
++static int strsep_len(const char *str, char c, unsigned int len)
++{
++	unsigned int i;
++
++	for (i = 0; str[i]; i++)
++		if (str[i] == c) {
++			if (len == 0)
++				return i;
++			len--;
++		}
++	return (len == 0) ? i : -ERANGE;
++}
++
++void dev_changed(const char *node, struct xen_bus_type *bus)
++{
++	int exists, rootlen;
++	struct xenbus_device *dev;
++	char type[BUS_ID_SIZE];
++	const char *p, *root;
++
++	if (bus->error || char_count(node, '/') < 2)
++		return;
++
++	exists = xenbus_exists(XBT_NIL, node, "");
++	if (!exists) {
++		xenbus_cleanup_devices(node, &bus->bus);
++		return;
++	}
++
++	/* backend/<type>/... or device/<type>/...
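++	   E.g. a frontend node "device/vif/0/state" yields type "vif"
++	   and, with the frontend bus's two levels, root "device/vif/0".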
*/ ++ p = strchr(node, '/') + 1; ++ snprintf(type, BUS_ID_SIZE, "%.*s", (int)strcspn(p, "/"), p); ++ type[BUS_ID_SIZE-1] = '\0'; ++ ++ rootlen = strsep_len(node, '/', bus->levels); ++ if (rootlen < 0) ++ return; ++ root = kasprintf(GFP_KERNEL, "%.*s", rootlen, node); ++ if (!root) ++ return; ++ ++ dev = xenbus_device_find(root, &bus->bus); ++ if (!dev) ++ xenbus_probe_node(bus, type, root); ++ else ++ put_device(&dev->dev); ++ ++ kfree(root); ++} ++ ++static void frontend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ dev_changed(vec[XS_WATCH_PATH], &xenbus_frontend); ++} ++ ++/* We watch for devices appearing and vanishing. */ ++static struct xenbus_watch fe_watch = { ++ .node = "device", ++ .callback = frontend_changed, ++}; ++ ++static int suspend_dev(struct device *dev, void *data) ++{ ++ int err = 0; ++ struct xenbus_driver *drv; ++ struct xenbus_device *xdev; ++ ++ DPRINTK(""); ++ ++ if (dev->driver == NULL) ++ return 0; ++ drv = to_xenbus_driver(dev->driver); ++ xdev = container_of(dev, struct xenbus_device, dev); ++ if (drv->suspend) ++ err = drv->suspend(xdev); ++ if (err) ++ printk(KERN_WARNING ++ "xenbus: suspend %s failed: %i\n", dev->bus_id, err); ++ return 0; ++} ++ ++static int suspend_cancel_dev(struct device *dev, void *data) ++{ ++ int err = 0; ++ struct xenbus_driver *drv; ++ struct xenbus_device *xdev; ++ ++ DPRINTK(""); ++ ++ if (dev->driver == NULL) ++ return 0; ++ drv = to_xenbus_driver(dev->driver); ++ xdev = container_of(dev, struct xenbus_device, dev); ++ if (drv->suspend_cancel) ++ err = drv->suspend_cancel(xdev); ++ if (err) ++ printk(KERN_WARNING ++ "xenbus: suspend_cancel %s failed: %i\n", ++ dev->bus_id, err); ++ return 0; ++} ++ ++static int resume_dev(struct device *dev, void *data) ++{ ++ int err; ++ struct xenbus_driver *drv; ++ struct xenbus_device *xdev; ++ ++ DPRINTK(""); ++ ++ if (dev->driver == NULL) ++ return 0; ++ ++ drv = to_xenbus_driver(dev->driver); ++ xdev = container_of(dev, struct xenbus_device, dev); ++ ++ err = talk_to_otherend(xdev); ++ if (err) { ++ printk(KERN_WARNING ++ "xenbus: resume (talk_to_otherend) %s failed: %i\n", ++ dev->bus_id, err); ++ return err; ++ } ++ ++ xdev->state = XenbusStateInitialising; ++ ++ if (drv->resume) { ++ err = drv->resume(xdev); ++ if (err) { ++ printk(KERN_WARNING ++ "xenbus: resume %s failed: %i\n", ++ dev->bus_id, err); ++ return err; ++ } ++ } ++ ++ err = watch_otherend(xdev); ++ if (err) { ++ printk(KERN_WARNING ++ "xenbus_probe: resume (watch_otherend) %s failed: " ++ "%d.\n", dev->bus_id, err); ++ return err; ++ } ++ ++ return 0; ++} ++ ++void xenbus_suspend(void) ++{ ++ DPRINTK(""); ++ ++ if (!xenbus_frontend.error) ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); ++ xenbus_backend_suspend(suspend_dev); ++ xs_suspend(); ++} ++EXPORT_SYMBOL_GPL(xenbus_suspend); ++ ++void xenbus_resume(void) ++{ ++ xb_init_comms(); ++ xs_resume(); ++ if (!xenbus_frontend.error) ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); ++ xenbus_backend_resume(resume_dev); ++} ++EXPORT_SYMBOL_GPL(xenbus_resume); ++ ++void xenbus_suspend_cancel(void) ++{ ++ xs_suspend_cancel(); ++ if (!xenbus_frontend.error) ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev); ++ xenbus_backend_resume(suspend_cancel_dev); ++} ++EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); ++ ++/* A flag to determine if xenstored is 'ready' (i.e. 
has started) */ ++int xenstored_ready = 0; ++ ++ ++int register_xenstore_notifier(struct notifier_block *nb) ++{ ++ int ret = 0; ++ ++ if (xenstored_ready > 0) ++ ret = nb->notifier_call(nb, 0, NULL); ++ else ++ atomic_notifier_chain_register(&xenstore_chain, nb); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(register_xenstore_notifier); ++ ++void unregister_xenstore_notifier(struct notifier_block *nb) ++{ ++ atomic_notifier_chain_unregister(&xenstore_chain, nb); ++} ++EXPORT_SYMBOL_GPL(unregister_xenstore_notifier); ++ ++ ++void xenbus_probe(void *unused) ++{ ++ BUG_ON((xenstored_ready <= 0)); ++ ++ /* Enumerate devices in xenstore and watch for changes. */ ++ xenbus_probe_devices(&xenbus_frontend); ++ register_xenbus_watch(&fe_watch); ++ xenbus_backend_probe_and_watch(); ++ ++ /* Notify others that xenstore is up */ ++ atomic_notifier_call_chain(&xenstore_chain, 0, NULL); ++} ++ ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) ++static struct file_operations xsd_kva_fops; ++static struct proc_dir_entry *xsd_kva_intf; ++static struct proc_dir_entry *xsd_port_intf; ++ ++static int xsd_kva_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ size_t size = vma->vm_end - vma->vm_start; ++ ++ if ((size > PAGE_SIZE) || (vma->vm_pgoff != 0)) ++ return -EINVAL; ++ ++ if (remap_pfn_range(vma, vma->vm_start, mfn_to_pfn(xen_store_mfn), ++ size, vma->vm_page_prot)) ++ return -EAGAIN; ++ ++ return 0; ++} ++ ++static int xsd_kva_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = sprintf(page, "0x%p", xen_store_interface); ++ *eof = 1; ++ return len; ++} ++ ++static int xsd_port_read(char *page, char **start, off_t off, ++ int count, int *eof, void *data) ++{ ++ int len; ++ ++ len = sprintf(page, "%d", xen_store_evtchn); ++ *eof = 1; ++ return len; ++} ++#endif ++ ++static int xenbus_probe_init(void) ++{ ++ int err = 0; ++ unsigned long page = 0; ++ ++ DPRINTK(""); ++ ++ if (!is_running_on_xen()) ++ return -ENODEV; ++ ++ /* Register ourselves with the kernel bus subsystem */ ++ xenbus_frontend.error = bus_register(&xenbus_frontend.bus); ++ if (xenbus_frontend.error) ++ printk(KERN_WARNING ++ "XENBUS: Error registering frontend bus: %i\n", ++ xenbus_frontend.error); ++ xenbus_backend_bus_register(); ++ ++ /* ++ * Domain0 doesn't have a store_evtchn or store_mfn yet. ++ */ ++ if (is_initial_xendomain()) { ++ struct evtchn_alloc_unbound alloc_unbound; ++ ++ /* Allocate page. 
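++	   This page becomes the xenstore ring; it and the event
++	   channel allocated below are published under /proc/xen
++	   (xsd_kva, xsd_port) for the userspace xenstored to pick up.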
*/ ++ page = get_zeroed_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ xen_store_mfn = xen_start_info->store_mfn = ++ pfn_to_mfn(virt_to_phys((void *)page) >> ++ PAGE_SHIFT); ++ ++ /* Next allocate a local port which xenstored can bind to */ ++ alloc_unbound.dom = DOMID_SELF; ++ alloc_unbound.remote_dom = 0; ++ ++ err = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, ++ &alloc_unbound); ++ if (err == -ENOSYS) ++ goto err; ++ BUG_ON(err); ++ xen_store_evtchn = xen_start_info->store_evtchn = ++ alloc_unbound.port; ++ ++#if defined(CONFIG_PROC_FS) && defined(CONFIG_XEN_PRIVILEGED_GUEST) ++ /* And finally publish the above info in /proc/xen */ ++ xsd_kva_intf = create_xen_proc_entry("xsd_kva", 0600); ++ if (xsd_kva_intf) { ++ memcpy(&xsd_kva_fops, xsd_kva_intf->proc_fops, ++ sizeof(xsd_kva_fops)); ++ xsd_kva_fops.mmap = xsd_kva_mmap; ++ xsd_kva_intf->proc_fops = &xsd_kva_fops; ++ xsd_kva_intf->read_proc = xsd_kva_read; ++ } ++ xsd_port_intf = create_xen_proc_entry("xsd_port", 0400); ++ if (xsd_port_intf) ++ xsd_port_intf->read_proc = xsd_port_read; ++#endif ++ xen_store_interface = mfn_to_virt(xen_store_mfn); ++ } else { ++ xenstored_ready = 1; ++#ifdef CONFIG_XEN ++ xen_store_evtchn = xen_start_info->store_evtchn; ++ xen_store_mfn = xen_start_info->store_mfn; ++ xen_store_interface = mfn_to_virt(xen_store_mfn); ++#else ++ xen_store_evtchn = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN); ++ xen_store_mfn = hvm_get_parameter(HVM_PARAM_STORE_PFN); ++ xen_store_interface = ioremap(xen_store_mfn << PAGE_SHIFT, ++ PAGE_SIZE); ++#endif ++ } ++ ++ ++ xenbus_dev_init(); ++ ++ /* Initialize the interface to xenstore. */ ++ err = xs_init(); ++ if (err) { ++ printk(KERN_WARNING ++ "XENBUS: Error initializing xenstore comms: %i\n", err); ++ goto err; ++ } ++ ++ /* Register ourselves with the kernel device subsystem */ ++ if (!xenbus_frontend.error) { ++ xenbus_frontend.error = device_register(&xenbus_frontend.dev); ++ if (xenbus_frontend.error) { ++ bus_unregister(&xenbus_frontend.bus); ++ printk(KERN_WARNING ++ "XENBUS: Error registering frontend device: %i\n", ++ xenbus_frontend.error); ++ } ++ } ++ xenbus_backend_device_register(); ++ ++ if (!is_initial_xendomain()) ++ xenbus_probe(NULL); ++ ++ return 0; ++ ++ err: ++ if (page) ++ free_page(page); ++ ++ /* ++ * Do not unregister the xenbus front/backend buses here. The buses ++ * must exist because front/backend drivers will use them when they are ++ * registered. ++ */ ++ ++ return err; ++} ++ ++#ifdef CONFIG_XEN ++postcore_initcall(xenbus_probe_init); ++MODULE_LICENSE("Dual BSD/GPL"); ++#else ++int xenbus_init(void) ++{ ++ return xenbus_probe_init(); ++} ++#endif ++ ++static int is_disconnected_device(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ ++ /* ++ * A device with no driver will never connect. We care only about ++ * devices which should currently be in the process of connecting. ++ */ ++ if (!dev->driver) ++ return 0; ++ ++ /* Is this search limited to a particular driver? 
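++	   (A NULL drv matches any driver, as when boot_wait_for_devices()
++	   scans every frontend device.)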
*/ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ return (xendev->state != XenbusStateConnected); ++} ++ ++static int exists_disconnected_device(struct device_driver *drv) ++{ ++ if (xenbus_frontend.error) ++ return xenbus_frontend.error; ++ return bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ is_disconnected_device); ++} ++ ++static int print_device_status(struct device *dev, void *data) ++{ ++ struct xenbus_device *xendev = to_xenbus_device(dev); ++ struct device_driver *drv = data; ++ ++ /* Is this operation limited to a particular driver? */ ++ if (drv && (dev->driver != drv)) ++ return 0; ++ ++ if (!dev->driver) { ++ /* Information only: is this too noisy? */ ++ printk(KERN_INFO "XENBUS: Device with no driver: %s\n", ++ xendev->nodename); ++ } else if (xendev->state != XenbusStateConnected) { ++ printk(KERN_WARNING "XENBUS: Timeout connecting " ++ "to device: %s (state %d)\n", ++ xendev->nodename, xendev->state); ++ } ++ ++ return 0; ++} ++ ++/* We only wait for device setup after most initcalls have run. */ ++static int ready_to_wait_for_devices; ++ ++/* ++ * On a 10 second timeout, wait for all devices currently configured. We need ++ * to do this to guarantee that the filesystems and / or network devices ++ * needed for boot are available, before we can allow the boot to proceed. ++ * ++ * This needs to be on a late_initcall, to happen after the frontend device ++ * drivers have been initialised, but before the root fs is mounted. ++ * ++ * A possible improvement here would be to have the tools add a per-device ++ * flag to the store entry, indicating whether it is needed at boot time. ++ * This would allow people who knew what they were doing to accelerate their ++ * boot slightly, but of course needs tools or manual intervention to set up ++ * those flags correctly. ++ */ ++static void wait_for_devices(struct xenbus_driver *xendrv) ++{ ++ unsigned long timeout = jiffies + 10*HZ; ++ struct device_driver *drv = xendrv ? &xendrv->driver : NULL; ++ ++ if (!ready_to_wait_for_devices || !is_running_on_xen()) ++ return; ++ ++ while (exists_disconnected_device(drv)) { ++ if (time_after(jiffies, timeout)) ++ break; ++ schedule_timeout_interruptible(HZ/10); ++ } ++ ++ bus_for_each_dev(&xenbus_frontend.bus, NULL, drv, ++ print_device_status); ++} ++ ++#ifndef MODULE ++static int __init boot_wait_for_devices(void) ++{ ++ if (!xenbus_frontend.error) { ++ ready_to_wait_for_devices = 1; ++ wait_for_devices(NULL); ++ } ++ return 0; ++} ++ ++late_initcall(boot_wait_for_devices); ++#endif +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_probe.h 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,75 @@ ++/****************************************************************************** ++ * xenbus_probe.h ++ * ++ * Talks to Xen Store to figure out what devices we have. ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * Copyright (C) 2005 XenSource Ltd. 
++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#ifndef _XENBUS_PROBE_H ++#define _XENBUS_PROBE_H ++ ++#if defined(CONFIG_XEN_BACKEND) || defined(CONFIG_XEN_BACKEND_MODULE) ++extern void xenbus_backend_suspend(int (*fn)(struct device *, void *)); ++extern void xenbus_backend_resume(int (*fn)(struct device *, void *)); ++extern void xenbus_backend_probe_and_watch(void); ++extern void xenbus_backend_bus_register(void); ++extern void xenbus_backend_device_register(void); ++#else ++static inline void xenbus_backend_suspend(int (*fn)(struct device *, void *)) {} ++static inline void xenbus_backend_resume(int (*fn)(struct device *, void *)) {} ++static inline void xenbus_backend_probe_and_watch(void) {} ++static inline void xenbus_backend_bus_register(void) {} ++static inline void xenbus_backend_device_register(void) {} ++#endif ++ ++struct xen_bus_type ++{ ++ char *root; ++ int error; ++ unsigned int levels; ++ int (*get_bus_id)(char bus_id[BUS_ID_SIZE], const char *nodename); ++ int (*probe)(const char *type, const char *dir); ++ struct bus_type bus; ++ struct device dev; ++}; ++ ++extern int xenbus_match(struct device *_dev, struct device_driver *_drv); ++extern int xenbus_dev_probe(struct device *_dev); ++extern int xenbus_dev_remove(struct device *_dev); ++extern int xenbus_register_driver_common(struct xenbus_driver *drv, ++ struct xen_bus_type *bus); ++extern int xenbus_probe_node(struct xen_bus_type *bus, ++ const char *type, ++ const char *nodename); ++extern int xenbus_probe_devices(struct xen_bus_type *bus); ++ ++extern void dev_changed(const char *node, struct xen_bus_type *bus); ++ ++#endif ++ +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_probe_backend.c 2007-08-27 14:01:25.000000000 -0400 +@@ -0,0 +1,286 @@ ++/****************************************************************************** ++ * Talks to Xen Store to figure out what devices we have (backend half). 
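++ * Backend nodes live under backend/<type>/<frontend-domid>/<id>.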
++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * Copyright (C) 2005 Mike Wray, Hewlett-Packard ++ * Copyright (C) 2005, 2006 XenSource Ltd ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#define DPRINTK(fmt, args...) \ ++ pr_debug("xenbus_probe (%s:%d) " fmt ".\n", \ ++ __FUNCTION__, __LINE__, ##args) ++ ++#include <linux/kernel.h> ++#include <linux/err.h> ++#include <linux/string.h> ++#include <linux/ctype.h> ++#include <linux/fcntl.h> ++#include <linux/mm.h> ++#include <linux/notifier.h> ++#include <linux/kthread.h> ++ ++#include <asm/io.h> ++#include <asm/page.h> ++#include <asm/maddr.h> ++#include <asm/pgtable.h> ++#include <asm/hypervisor.h> ++#include <xen/xenbus.h> ++#include <xen/xen_proc.h> ++#include <xen/evtchn.h> ++#include <xen/features.h> ++#include <xen/hvm.h> ++ ++#include "xenbus_comms.h" ++#include "xenbus_probe.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++static int xenbus_uevent_backend(struct device *dev, char **envp, ++ int num_envp, char *buffer, int buffer_size); ++static int xenbus_probe_backend(const char *type, const char *domid); ++ ++extern int read_otherend_details(struct xenbus_device *xendev, ++ char *id_node, char *path_node); ++ ++static int read_frontend_details(struct xenbus_device *xendev) ++{ ++ return read_otherend_details(xendev, "frontend-id", "frontend"); ++} ++ ++/* backend/<type>/<fe-uuid>/<id> => <type>-<fe-domid>-<id> */ ++static int backend_bus_id(char bus_id[BUS_ID_SIZE], const char *nodename) ++{ ++ int domid, err; ++ const char *devid, *type, *frontend; ++ unsigned int typelen; ++ ++ type = strchr(nodename, '/'); ++ if (!type) ++ return -EINVAL; ++ type++; ++ typelen = strcspn(type, "/"); ++ if (!typelen || type[typelen] != '/') ++ return -EINVAL; ++ ++ devid = strrchr(nodename, '/') + 1; ++ ++ err = xenbus_gather(XBT_NIL, nodename, "frontend-id", "%i", &domid, ++ "frontend", NULL, &frontend, ++ NULL); ++ if (err) ++ return err; ++ if (strlen(frontend) == 0) ++ err = -ERANGE; ++ if (!err && !xenbus_exists(XBT_NIL, frontend, "")) ++ err = -ENOENT; ++ kfree(frontend); ++ ++ if (err) ++ return err; ++ ++ if (snprintf(bus_id, 
BUS_ID_SIZE, ++ "%.*s-%i-%s", typelen, type, domid, devid) >= BUS_ID_SIZE) ++ return -ENOSPC; ++ return 0; ++} ++ ++static struct xen_bus_type xenbus_backend = { ++ .root = "backend", ++ .levels = 3, /* backend/type/<frontend>/<id> */ ++ .get_bus_id = backend_bus_id, ++ .probe = xenbus_probe_backend, ++ .bus = { ++ .name = "xen-backend", ++ .match = xenbus_match, ++ .probe = xenbus_dev_probe, ++ .remove = xenbus_dev_remove, ++// .shutdown = xenbus_dev_shutdown, ++ .uevent = xenbus_uevent_backend, ++ }, ++ .dev = { ++ .bus_id = "xen-backend", ++ }, ++}; ++ ++static int xenbus_uevent_backend(struct device *dev, char **envp, ++ int num_envp, char *buffer, int buffer_size) ++{ ++ struct xenbus_device *xdev; ++ struct xenbus_driver *drv; ++ int i = 0; ++ int length = 0; ++ ++ DPRINTK(""); ++ ++ if (dev == NULL) ++ return -ENODEV; ++ ++ xdev = to_xenbus_device(dev); ++ if (xdev == NULL) ++ return -ENODEV; ++ ++ /* stuff we want to pass to /sbin/hotplug */ ++ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, ++ "XENBUS_TYPE=%s", xdev->devicetype); ++ ++ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, ++ "XENBUS_PATH=%s", xdev->nodename); ++ ++ add_uevent_var(envp, num_envp, &i, buffer, buffer_size, &length, ++ "XENBUS_BASE_PATH=%s", xenbus_backend.root); ++ ++ /* terminate, set to next free slot, shrink available space */ ++ envp[i] = NULL; ++ envp = &envp[i]; ++ num_envp -= i; ++ buffer = &buffer[length]; ++ buffer_size -= length; ++ ++ if (dev->driver) { ++ drv = to_xenbus_driver(dev->driver); ++ if (drv && drv->uevent) ++ return drv->uevent(xdev, envp, num_envp, buffer, ++ buffer_size); ++ } ++ ++ return 0; ++} ++ ++int xenbus_register_backend(struct xenbus_driver *drv) ++{ ++ drv->read_otherend_details = read_frontend_details; ++ ++ return xenbus_register_driver_common(drv, &xenbus_backend); ++} ++EXPORT_SYMBOL_GPL(xenbus_register_backend); ++ ++/* backend/<typename>/<frontend-uuid>/<name> */ ++static int xenbus_probe_backend_unit(const char *dir, ++ const char *type, ++ const char *name) ++{ ++ char *nodename; ++ int err; ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s", dir, name); ++ if (!nodename) ++ return -ENOMEM; ++ ++ DPRINTK("%s\n", nodename); ++ ++ err = xenbus_probe_node(&xenbus_backend, type, nodename); ++ kfree(nodename); ++ return err; ++} ++ ++/* backend/<typename>/<frontend-domid> */ ++static int xenbus_probe_backend(const char *type, const char *domid) ++{ ++ char *nodename; ++ int err = 0; ++ char **dir; ++ unsigned int i, dir_n = 0; ++ ++ DPRINTK(""); ++ ++ nodename = kasprintf(GFP_KERNEL, "%s/%s/%s", xenbus_backend.root, type, domid); ++ if (!nodename) ++ return -ENOMEM; ++ ++ dir = xenbus_directory(XBT_NIL, nodename, "", &dir_n); ++ if (IS_ERR(dir)) { ++ kfree(nodename); ++ return PTR_ERR(dir); ++ } ++ ++ for (i = 0; i < dir_n; i++) { ++ err = xenbus_probe_backend_unit(nodename, type, dir[i]); ++ if (err) ++ break; ++ } ++ kfree(dir); ++ kfree(nodename); ++ return err; ++} ++ ++static void backend_changed(struct xenbus_watch *watch, ++ const char **vec, unsigned int len) ++{ ++ DPRINTK(""); ++ ++ dev_changed(vec[XS_WATCH_PATH], &xenbus_backend); ++} ++ ++static struct xenbus_watch be_watch = { ++ .node = "backend", ++ .callback = backend_changed, ++}; ++ ++void xenbus_backend_suspend(int (*fn)(struct device *, void *)) ++{ ++ DPRINTK(""); ++ if (!xenbus_backend.error) ++ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++} ++ ++void xenbus_backend_resume(int (*fn)(struct device *, void *)) ++{ ++ DPRINTK(""); ++ if 
(!xenbus_backend.error) ++ bus_for_each_dev(&xenbus_backend.bus, NULL, NULL, fn); ++} ++ ++void xenbus_backend_probe_and_watch(void) ++{ ++ xenbus_probe_devices(&xenbus_backend); ++ register_xenbus_watch(&be_watch); ++} ++ ++void xenbus_backend_bus_register(void) ++{ ++ xenbus_backend.error = bus_register(&xenbus_backend.bus); ++ if (xenbus_backend.error) ++ printk(KERN_WARNING ++ "XENBUS: Error registering backend bus: %i\n", ++ xenbus_backend.error); ++} ++ ++void xenbus_backend_device_register(void) ++{ ++ if (xenbus_backend.error) ++ return; ++ ++ xenbus_backend.error = device_register(&xenbus_backend.dev); ++ if (xenbus_backend.error) { ++ bus_unregister(&xenbus_backend.bus); ++ printk(KERN_WARNING ++ "XENBUS: Error registering backend device: %i\n", ++ xenbus_backend.error); ++ } ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenbus/xenbus_xs.c 2007-08-27 14:02:10.000000000 -0400 +@@ -0,0 +1,880 @@ ++/****************************************************************************** ++ * xenbus_xs.c ++ * ++ * This is the kernel equivalent of the "xs" library. We don't need everything ++ * and we use xenbus_comms for communication. ++ * ++ * Copyright (C) 2005 Rusty Russell, IBM Corporation ++ * ++ * This program is free software; you can redistribute it and/or ++ * modify it under the terms of the GNU General Public License version 2 ++ * as published by the Free Software Foundation; or, when distributed ++ * separately from the Linux kernel or incorporated into other ++ * software packages, subject to the following license: ++ * ++ * Permission is hereby granted, free of charge, to any person obtaining a copy ++ * of this source file (the "Software"), to deal in the Software without ++ * restriction, including without limitation the rights to use, copy, modify, ++ * merge, publish, distribute, sublicense, and/or sell copies of the Software, ++ * and to permit persons to whom the Software is furnished to do so, subject to ++ * the following conditions: ++ * ++ * The above copyright notice and this permission notice shall be included in ++ * all copies or substantial portions of the Software. ++ * ++ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR ++ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, ++ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE ++ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER ++ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING ++ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS ++ * IN THE SOFTWARE. ++ */ ++ ++#include <linux/unistd.h> ++#include <linux/errno.h> ++#include <linux/types.h> ++#include <linux/uio.h> ++#include <linux/kernel.h> ++#include <linux/string.h> ++#include <linux/err.h> ++#include <linux/slab.h> ++#include <linux/fcntl.h> ++#include <linux/kthread.h> ++#include <linux/rwsem.h> ++#include <linux/module.h> ++#include <linux/mutex.h> ++#include <xen/xenbus.h> ++#include "xenbus_comms.h" ++ ++#ifdef HAVE_XEN_PLATFORM_COMPAT_H ++#include <xen/platform-compat.h> ++#endif ++ ++struct xs_stored_msg { ++ struct list_head list; ++ ++ struct xsd_sockmsg hdr; ++ ++ union { ++ /* Queued replies. */ ++ struct { ++ char *body; ++ } reply; ++ ++ /* Queued watch events. */ ++ struct { ++ struct xenbus_watch *handle; ++ char **vec; ++ unsigned int vec_size; ++ } watch; ++ } u; ++}; ++ ++struct xs_handle { ++ /* A list of replies. 
Currently only one will ever be outstanding. */ ++ struct list_head reply_list; ++ spinlock_t reply_lock; ++ wait_queue_head_t reply_waitq; ++ ++ /* ++ * Mutex ordering: transaction_mutex -> watch_mutex -> request_mutex. ++ * response_mutex is never taken simultaneously with the other three. ++ */ ++ ++ /* One request at a time. */ ++ struct mutex request_mutex; ++ ++ /* Protect xenbus reader thread against save/restore. */ ++ struct mutex response_mutex; ++ ++ /* Protect transactions against save/restore. */ ++ struct rw_semaphore transaction_mutex; ++ ++ /* Protect watch (de)register against save/restore. */ ++ struct rw_semaphore watch_mutex; ++}; ++ ++static struct xs_handle xs_state; ++ ++/* List of registered watches, and a lock to protect it. */ ++static LIST_HEAD(watches); ++static DEFINE_SPINLOCK(watches_lock); ++ ++/* List of pending watch callback events, and a lock to protect it. */ ++static LIST_HEAD(watch_events); ++static DEFINE_SPINLOCK(watch_events_lock); ++ ++/* ++ * Details of the xenwatch callback kernel thread. The thread waits on the ++ * watch_events_waitq for work to do (queued on watch_events list). When it ++ * wakes up it acquires the xenwatch_mutex before reading the list and ++ * carrying out work. ++ */ ++static pid_t xenwatch_pid; ++/* static */ DEFINE_MUTEX(xenwatch_mutex); ++static DECLARE_WAIT_QUEUE_HEAD(watch_events_waitq); ++ ++static int get_error(const char *errorstring) ++{ ++ unsigned int i; ++ ++ for (i = 0; strcmp(errorstring, xsd_errors[i].errstring) != 0; i++) { ++ if (i == ARRAY_SIZE(xsd_errors) - 1) { ++ printk(KERN_WARNING ++ "XENBUS xen store gave: unknown error %s", ++ errorstring); ++ return EINVAL; ++ } ++ } ++ return xsd_errors[i].errnum; ++} ++ ++static void *read_reply(enum xsd_sockmsg_type *type, unsigned int *len) ++{ ++ struct xs_stored_msg *msg; ++ char *body; ++ ++ spin_lock(&xs_state.reply_lock); ++ ++ while (list_empty(&xs_state.reply_list)) { ++ spin_unlock(&xs_state.reply_lock); ++ /* XXX FIXME: Avoid synchronous wait for response here. */ ++ wait_event(xs_state.reply_waitq, ++ !list_empty(&xs_state.reply_list)); ++ spin_lock(&xs_state.reply_lock); ++ } ++ ++ msg = list_entry(xs_state.reply_list.next, ++ struct xs_stored_msg, list); ++ list_del(&msg->list); ++ ++ spin_unlock(&xs_state.reply_lock); ++ ++ *type = msg->hdr.type; ++ if (len) ++ *len = msg->hdr.len; ++ body = msg->u.reply.body; ++ ++ kfree(msg); ++ ++ return body; ++} ++ ++void *xenbus_dev_request_and_reply(struct xsd_sockmsg *msg) ++{ ++ void *ret; ++ struct xsd_sockmsg req_msg = *msg; ++ int err; ++ ++ if (req_msg.type == XS_TRANSACTION_START) ++ down_read(&xs_state.transaction_mutex); ++ ++ mutex_lock(&xs_state.request_mutex); ++ ++ err = xb_write(msg, sizeof(*msg) + msg->len); ++ if (err) { ++ msg->type = XS_ERROR; ++ ret = ERR_PTR(err); ++ } else ++ ret = read_reply(&msg->type, &msg->len); ++ ++ mutex_unlock(&xs_state.request_mutex); ++ ++ if ((req_msg.type == XS_TRANSACTION_END) || ++ ((req_msg.type == XS_TRANSACTION_START) && ++ (msg->type == XS_ERROR))) ++ up_read(&xs_state.transaction_mutex); ++ ++ return ret; ++} ++ ++/* Send message to xs, get kmalloc'ed reply. ERR_PTR() on error. 
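++ * The caller owns (and must kfree()) the reply. A sketch of a
++ * typical call, mirroring xs_watch() further below:
++ *
++ *	struct kvec iov[2] = { { (void *)path,  strlen(path)  + 1 },
++ *			       { (void *)token, strlen(token) + 1 } };
++ *	char *rep = xs_talkv(XBT_NIL, XS_WATCH, iov, ARRAY_SIZE(iov), NULL);
++ *	if (IS_ERR(rep))
++ *		return PTR_ERR(rep);
++ *	kfree(rep);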
*/
++static void *xs_talkv(struct xenbus_transaction t,
++		      enum xsd_sockmsg_type type,
++		      const struct kvec *iovec,
++		      unsigned int num_vecs,
++		      unsigned int *len)
++{
++	struct xsd_sockmsg msg;
++	void *ret = NULL;
++	unsigned int i;
++	int err;
++
++	msg.tx_id = t.id;
++	msg.req_id = 0;
++	msg.type = type;
++	msg.len = 0;
++	for (i = 0; i < num_vecs; i++)
++		msg.len += iovec[i].iov_len;
++
++	mutex_lock(&xs_state.request_mutex);
++
++	err = xb_write(&msg, sizeof(msg));
++	if (err) {
++		mutex_unlock(&xs_state.request_mutex);
++		return ERR_PTR(err);
++	}
++
++	for (i = 0; i < num_vecs; i++) {
++		err = xb_write(iovec[i].iov_base, iovec[i].iov_len);
++		if (err) {
++			mutex_unlock(&xs_state.request_mutex);
++			return ERR_PTR(err);
++		}
++	}
++
++	ret = read_reply(&msg.type, len);
++
++	mutex_unlock(&xs_state.request_mutex);
++
++	if (IS_ERR(ret))
++		return ret;
++
++	if (msg.type == XS_ERROR) {
++		err = get_error(ret);
++		kfree(ret);
++		return ERR_PTR(-err);
++	}
++
++	if (msg.type != type) {
++		if (printk_ratelimit())
++			printk(KERN_WARNING
++			       "XENBUS unexpected type [%d], expected [%d]\n",
++			       msg.type, type);
++		kfree(ret);
++		return ERR_PTR(-EINVAL);
++	}
++	return ret;
++}
++
++/* Simplified version of xs_talkv: single message. */
++static void *xs_single(struct xenbus_transaction t,
++		       enum xsd_sockmsg_type type,
++		       const char *string,
++		       unsigned int *len)
++{
++	struct kvec iovec;
++
++	iovec.iov_base = (void *)string;
++	iovec.iov_len = strlen(string) + 1;
++	return xs_talkv(t, type, &iovec, 1, len);
++}
++
++/* Many commands only need an ack, don't care what it says. */
++static int xs_error(char *reply)
++{
++	if (IS_ERR(reply))
++		return PTR_ERR(reply);
++	kfree(reply);
++	return 0;
++}
++
++static unsigned int count_strings(const char *strings, unsigned int len)
++{
++	unsigned int num;
++	const char *p;
++
++	for (p = strings, num = 0; p < strings + len; p += strlen(p) + 1)
++		num++;
++
++	return num;
++}
++
++/* Return the path to dir with /name appended. Buffer must be kfree()'ed. */
++static char *join(const char *dir, const char *name)
++{
++	char *buffer;
++
++	if (strlen(name) == 0)
++		buffer = kasprintf(GFP_KERNEL, "%s", dir);
++	else
++		buffer = kasprintf(GFP_KERNEL, "%s/%s", dir, name);
++	return (!buffer) ? ERR_PTR(-ENOMEM) : buffer;
++}
++
++static char **split(char *strings, unsigned int len, unsigned int *num)
++{
++	char *p, **ret;
++
++	/* Count the strings. */
++	*num = count_strings(strings, len);
++
++	/* Transfer to one big alloc for easy freeing. */
++	ret = kmalloc(*num * sizeof(char *) + len, GFP_KERNEL);
++	if (!ret) {
++		kfree(strings);
++		return ERR_PTR(-ENOMEM);
++	}
++	memcpy(&ret[*num], strings, len);
++	kfree(strings);
++
++	strings = (char *)&ret[*num];
++	for (p = strings, *num = 0; p < strings + len; p += strlen(p) + 1)
++		ret[(*num)++] = p;
++
++	return ret;
++}
++
++char **xenbus_directory(struct xenbus_transaction t,
++			const char *dir, const char *node, unsigned int *num)
++{
++	char *strings, *path;
++	unsigned int len;
++
++	path = join(dir, node);
++	if (IS_ERR(path))
++		return (char **)path;
++
++	strings = xs_single(t, XS_DIRECTORY, path, &len);
++	kfree(path);
++	if (IS_ERR(strings))
++		return (char **)strings;
++
++	return split(strings, len, num);
++}
++EXPORT_SYMBOL_GPL(xenbus_directory);
++
++/* Check if a path exists. Return 1 if it does.
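++ * (Zero is returned both when the path is absent and on any error.)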
*/ ++int xenbus_exists(struct xenbus_transaction t, ++ const char *dir, const char *node) ++{ ++ char **d; ++ int dir_n; ++ ++ d = xenbus_directory(t, dir, node, &dir_n); ++ if (IS_ERR(d)) ++ return 0; ++ kfree(d); ++ return 1; ++} ++EXPORT_SYMBOL_GPL(xenbus_exists); ++ ++/* Get the value of a single file. ++ * Returns a kmalloced value: call free() on it after use. ++ * len indicates length in bytes. ++ */ ++void *xenbus_read(struct xenbus_transaction t, ++ const char *dir, const char *node, unsigned int *len) ++{ ++ char *path; ++ void *ret; ++ ++ path = join(dir, node); ++ if (IS_ERR(path)) ++ return (void *)path; ++ ++ ret = xs_single(t, XS_READ, path, len); ++ kfree(path); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_read); ++ ++/* Write the value of a single file. ++ * Returns -err on failure. ++ */ ++int xenbus_write(struct xenbus_transaction t, ++ const char *dir, const char *node, const char *string) ++{ ++ const char *path; ++ struct kvec iovec[2]; ++ int ret; ++ ++ path = join(dir, node); ++ if (IS_ERR(path)) ++ return PTR_ERR(path); ++ ++ iovec[0].iov_base = (void *)path; ++ iovec[0].iov_len = strlen(path) + 1; ++ iovec[1].iov_base = (void *)string; ++ iovec[1].iov_len = strlen(string); ++ ++ ret = xs_error(xs_talkv(t, XS_WRITE, iovec, ARRAY_SIZE(iovec), NULL)); ++ kfree(path); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_write); ++ ++/* Create a new directory. */ ++int xenbus_mkdir(struct xenbus_transaction t, ++ const char *dir, const char *node) ++{ ++ char *path; ++ int ret; ++ ++ path = join(dir, node); ++ if (IS_ERR(path)) ++ return PTR_ERR(path); ++ ++ ret = xs_error(xs_single(t, XS_MKDIR, path, NULL)); ++ kfree(path); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_mkdir); ++ ++/* Destroy a file or directory (directories must be empty). */ ++int xenbus_rm(struct xenbus_transaction t, const char *dir, const char *node) ++{ ++ char *path; ++ int ret; ++ ++ path = join(dir, node); ++ if (IS_ERR(path)) ++ return PTR_ERR(path); ++ ++ ret = xs_error(xs_single(t, XS_RM, path, NULL)); ++ kfree(path); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_rm); ++ ++/* Start a transaction: changes by others will not be seen during this ++ * transaction, and changes will not be visible to others until end. ++ */ ++int xenbus_transaction_start(struct xenbus_transaction *t) ++{ ++ char *id_str; ++ ++ down_read(&xs_state.transaction_mutex); ++ ++ id_str = xs_single(XBT_NIL, XS_TRANSACTION_START, "", NULL); ++ if (IS_ERR(id_str)) { ++ up_read(&xs_state.transaction_mutex); ++ return PTR_ERR(id_str); ++ } ++ ++ t->id = simple_strtoul(id_str, NULL, 0); ++ kfree(id_str); ++ return 0; ++} ++EXPORT_SYMBOL_GPL(xenbus_transaction_start); ++ ++/* End a transaction. ++ * If abandon is true, transaction is discarded instead of committed. ++ */ ++int xenbus_transaction_end(struct xenbus_transaction t, int abort) ++{ ++ char abortstr[2]; ++ int err; ++ ++ if (abort) ++ strcpy(abortstr, "F"); ++ else ++ strcpy(abortstr, "T"); ++ ++ err = xs_error(xs_single(t, XS_TRANSACTION_END, abortstr, NULL)); ++ ++ up_read(&xs_state.transaction_mutex); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(xenbus_transaction_end); ++ ++/* Single read and scanf: returns -errno or num scanned. */ ++int xenbus_scanf(struct xenbus_transaction t, ++ const char *dir, const char *node, const char *fmt, ...) 
++{ ++ va_list ap; ++ int ret; ++ char *val; ++ ++ val = xenbus_read(t, dir, node, NULL); ++ if (IS_ERR(val)) ++ return PTR_ERR(val); ++ ++ va_start(ap, fmt); ++ ret = vsscanf(val, fmt, ap); ++ va_end(ap); ++ kfree(val); ++ /* Distinctive errno. */ ++ if (ret == 0) ++ return -ERANGE; ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_scanf); ++ ++/* Single printf and write: returns -errno or 0. */ ++int xenbus_printf(struct xenbus_transaction t, ++ const char *dir, const char *node, const char *fmt, ...) ++{ ++ va_list ap; ++ int ret; ++#define PRINTF_BUFFER_SIZE 4096 ++ char *printf_buffer; ++ ++ printf_buffer = kmalloc(PRINTF_BUFFER_SIZE, GFP_KERNEL); ++ if (printf_buffer == NULL) ++ return -ENOMEM; ++ ++ va_start(ap, fmt); ++ ret = vsnprintf(printf_buffer, PRINTF_BUFFER_SIZE, fmt, ap); ++ va_end(ap); ++ ++ BUG_ON(ret > PRINTF_BUFFER_SIZE-1); ++ ret = xenbus_write(t, dir, node, printf_buffer); ++ ++ kfree(printf_buffer); ++ ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_printf); ++ ++/* Takes tuples of names, scanf-style args, and void **, NULL terminated. */ ++int xenbus_gather(struct xenbus_transaction t, const char *dir, ...) ++{ ++ va_list ap; ++ const char *name; ++ int ret = 0; ++ ++ va_start(ap, dir); ++ while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { ++ const char *fmt = va_arg(ap, char *); ++ void *result = va_arg(ap, void *); ++ char *p; ++ ++ p = xenbus_read(t, dir, name, NULL); ++ if (IS_ERR(p)) { ++ ret = PTR_ERR(p); ++ break; ++ } ++ if (fmt) { ++ if (sscanf(p, fmt, result) == 0) ++ ret = -EINVAL; ++ kfree(p); ++ } else ++ *(char **)result = p; ++ } ++ va_end(ap); ++ return ret; ++} ++EXPORT_SYMBOL_GPL(xenbus_gather); ++ ++static int xs_watch(const char *path, const char *token) ++{ ++ struct kvec iov[2]; ++ ++ iov[0].iov_base = (void *)path; ++ iov[0].iov_len = strlen(path) + 1; ++ iov[1].iov_base = (void *)token; ++ iov[1].iov_len = strlen(token) + 1; ++ ++ return xs_error(xs_talkv(XBT_NIL, XS_WATCH, iov, ++ ARRAY_SIZE(iov), NULL)); ++} ++ ++static int xs_unwatch(const char *path, const char *token) ++{ ++ struct kvec iov[2]; ++ ++ iov[0].iov_base = (char *)path; ++ iov[0].iov_len = strlen(path) + 1; ++ iov[1].iov_base = (char *)token; ++ iov[1].iov_len = strlen(token) + 1; ++ ++ return xs_error(xs_talkv(XBT_NIL, XS_UNWATCH, iov, ++ ARRAY_SIZE(iov), NULL)); ++} ++ ++static struct xenbus_watch *find_watch(const char *token) ++{ ++ struct xenbus_watch *i, *cmp; ++ ++ cmp = (void *)simple_strtoul(token, NULL, 16); ++ ++ list_for_each_entry(i, &watches, list) ++ if (i == cmp) ++ return i; ++ ++ return NULL; ++} ++ ++/* Register callback to watch this node. */ ++int register_xenbus_watch(struct xenbus_watch *watch) ++{ ++ /* Pointer in ascii is the token. */ ++ char token[sizeof(watch) * 2 + 1]; ++ int err; ++ ++ sprintf(token, "%lX", (long)watch); ++ ++ down_read(&xs_state.watch_mutex); ++ ++ spin_lock(&watches_lock); ++ BUG_ON(find_watch(token)); ++ list_add(&watch->list, &watches); ++ spin_unlock(&watches_lock); ++ ++ err = xs_watch(watch->node, token); ++ ++ /* Ignore errors due to multiple registration. 
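++	   (xenstored replies EEXIST when the same path/token pair is
++	   already being watched.)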
*/ ++ if ((err != 0) && (err != -EEXIST)) { ++ spin_lock(&watches_lock); ++ list_del(&watch->list); ++ spin_unlock(&watches_lock); ++ } ++ ++ up_read(&xs_state.watch_mutex); ++ ++ return err; ++} ++EXPORT_SYMBOL_GPL(register_xenbus_watch); ++ ++void unregister_xenbus_watch(struct xenbus_watch *watch) ++{ ++ struct xs_stored_msg *msg, *tmp; ++ char token[sizeof(watch) * 2 + 1]; ++ int err; ++ ++ sprintf(token, "%lX", (long)watch); ++ ++ down_read(&xs_state.watch_mutex); ++ ++ spin_lock(&watches_lock); ++ BUG_ON(!find_watch(token)); ++ list_del(&watch->list); ++ spin_unlock(&watches_lock); ++ ++ err = xs_unwatch(watch->node, token); ++ if (err) ++ printk(KERN_WARNING ++ "XENBUS Failed to release watch %s: %i\n", ++ watch->node, err); ++ ++ up_read(&xs_state.watch_mutex); ++ ++ /* Cancel pending watch events. */ ++ spin_lock(&watch_events_lock); ++ list_for_each_entry_safe(msg, tmp, &watch_events, list) { ++ if (msg->u.watch.handle != watch) ++ continue; ++ list_del(&msg->list); ++ kfree(msg->u.watch.vec); ++ kfree(msg); ++ } ++ spin_unlock(&watch_events_lock); ++ ++ /* Flush any currently-executing callback, unless we are it. :-) */ ++ if (current->pid != xenwatch_pid) { ++ mutex_lock(&xenwatch_mutex); ++ mutex_unlock(&xenwatch_mutex); ++ } ++} ++EXPORT_SYMBOL_GPL(unregister_xenbus_watch); ++ ++void xs_suspend(void) ++{ ++ down_write(&xs_state.transaction_mutex); ++ down_write(&xs_state.watch_mutex); ++ mutex_lock(&xs_state.request_mutex); ++ mutex_lock(&xs_state.response_mutex); ++} ++ ++void xs_resume(void) ++{ ++ struct xenbus_watch *watch; ++ char token[sizeof(watch) * 2 + 1]; ++ ++ mutex_unlock(&xs_state.response_mutex); ++ mutex_unlock(&xs_state.request_mutex); ++ up_write(&xs_state.transaction_mutex); ++ ++ /* No need for watches_lock: the watch_mutex is sufficient. */ ++ list_for_each_entry(watch, &watches, list) { ++ sprintf(token, "%lX", (long)watch); ++ xs_watch(watch->node, token); ++ } ++ ++ up_write(&xs_state.watch_mutex); ++} ++ ++void xs_suspend_cancel(void) ++{ ++ mutex_unlock(&xs_state.response_mutex); ++ mutex_unlock(&xs_state.request_mutex); ++ up_write(&xs_state.watch_mutex); ++ up_write(&xs_state.transaction_mutex); ++} ++ ++static int xenwatch_handle_callback(void *data) ++{ ++ struct xs_stored_msg *msg = data; ++ ++ msg->u.watch.handle->callback(msg->u.watch.handle, ++ (const char **)msg->u.watch.vec, ++ msg->u.watch.vec_size); ++ ++ kfree(msg->u.watch.vec); ++ kfree(msg); ++ ++ /* Kill this kthread if we were spawned just for this callback. */ ++ if (current->pid != xenwatch_pid) ++ do_exit(0); ++ ++ return 0; ++} ++ ++static int xenwatch_thread(void *unused) ++{ ++ struct list_head *ent; ++ struct xs_stored_msg *msg; ++ ++ for (;;) { ++ wait_event_interruptible(watch_events_waitq, ++ !list_empty(&watch_events)); ++ ++ if (kthread_should_stop()) ++ break; ++ ++ mutex_lock(&xenwatch_mutex); ++ ++ spin_lock(&watch_events_lock); ++ ent = watch_events.next; ++ if (ent != &watch_events) ++ list_del(ent); ++ spin_unlock(&watch_events_lock); ++ ++ if (ent != &watch_events) { ++ msg = list_entry(ent, struct xs_stored_msg, list); ++ if (msg->u.watch.handle->flags & XBWF_new_thread) ++ kthread_run(xenwatch_handle_callback, ++ msg, "xenwatch_cb"); ++ else ++ xenwatch_handle_callback(msg); ++ } ++ ++ mutex_unlock(&xenwatch_mutex); ++ } ++ ++ return 0; ++} ++ ++static int process_msg(void) ++{ ++ struct xs_stored_msg *msg; ++ char *body; ++ int err; ++ ++ /* ++ * We must disallow save/restore while reading a xenstore message. 
++ * A partial read across s/r leaves us out of sync with xenstored. ++ */ ++ for (;;) { ++ err = xb_wait_for_data_to_read(); ++ if (err) ++ return err; ++ mutex_lock(&xs_state.response_mutex); ++ if (xb_data_to_read()) ++ break; ++ /* We raced with save/restore: pending data 'disappeared'. */ ++ mutex_unlock(&xs_state.response_mutex); ++ } ++ ++ ++ msg = kmalloc(sizeof(*msg), GFP_KERNEL); ++ if (msg == NULL) { ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xb_read(&msg->hdr, sizeof(msg->hdr)); ++ if (err) { ++ kfree(msg); ++ goto out; ++ } ++ ++ body = kmalloc(msg->hdr.len + 1, GFP_KERNEL); ++ if (body == NULL) { ++ kfree(msg); ++ err = -ENOMEM; ++ goto out; ++ } ++ ++ err = xb_read(body, msg->hdr.len); ++ if (err) { ++ kfree(body); ++ kfree(msg); ++ goto out; ++ } ++ body[msg->hdr.len] = '\0'; ++ ++ if (msg->hdr.type == XS_WATCH_EVENT) { ++ msg->u.watch.vec = split(body, msg->hdr.len, ++ &msg->u.watch.vec_size); ++ if (IS_ERR(msg->u.watch.vec)) { ++ kfree(msg); ++ err = PTR_ERR(msg->u.watch.vec); ++ goto out; ++ } ++ ++ spin_lock(&watches_lock); ++ msg->u.watch.handle = find_watch( ++ msg->u.watch.vec[XS_WATCH_TOKEN]); ++ if (msg->u.watch.handle != NULL) { ++ spin_lock(&watch_events_lock); ++ list_add_tail(&msg->list, &watch_events); ++ wake_up(&watch_events_waitq); ++ spin_unlock(&watch_events_lock); ++ } else { ++ kfree(msg->u.watch.vec); ++ kfree(msg); ++ } ++ spin_unlock(&watches_lock); ++ } else { ++ msg->u.reply.body = body; ++ spin_lock(&xs_state.reply_lock); ++ list_add_tail(&msg->list, &xs_state.reply_list); ++ spin_unlock(&xs_state.reply_lock); ++ wake_up(&xs_state.reply_waitq); ++ } ++ ++ out: ++ mutex_unlock(&xs_state.response_mutex); ++ return err; ++} ++ ++static int xenbus_thread(void *unused) ++{ ++ int err; ++ ++ for (;;) { ++ err = process_msg(); ++ if (err) ++ printk(KERN_WARNING "XENBUS error %d while reading " ++ "message\n", err); ++ if (kthread_should_stop()) ++ break; ++ } ++ ++ return 0; ++} ++ ++int xs_init(void) ++{ ++ int err; ++ struct task_struct *task; ++ ++ INIT_LIST_HEAD(&xs_state.reply_list); ++ spin_lock_init(&xs_state.reply_lock); ++ init_waitqueue_head(&xs_state.reply_waitq); ++ ++ mutex_init(&xs_state.request_mutex); ++ mutex_init(&xs_state.response_mutex); ++ init_rwsem(&xs_state.transaction_mutex); ++ init_rwsem(&xs_state.watch_mutex); ++ ++ /* Initialize the shared memory rings to talk to xenstored */ ++ err = xb_init_comms(); ++ if (err) ++ return err; ++ ++ task = kthread_run(xenwatch_thread, NULL, "xenwatch"); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ xenwatch_pid = task->pid; ++ ++ task = kthread_run(xenbus_thread, NULL, "xenbus"); ++ if (IS_ERR(task)) ++ return PTR_ERR(task); ++ ++ return 0; ++} +--- /dev/null 1970-01-01 00:00:00.000000000 +0000 ++++ b/drivers/xen/xenoprof/xenoprofile.c 2007-08-27 14:02:03.000000000 -0400 +@@ -0,0 +1,500 @@ ++/** ++ * @file xenoprofile.c ++ * ++ * @remark Copyright 2002 OProfile authors ++ * @remark Read the file COPYING ++ * ++ * @author John Levon <levon@movementarian.org> ++ * ++ * Modified by Aravind Menon and Jose Renato Santos for Xen ++ * These modifications are: ++ * Copyright (C) 2005 Hewlett-Packard Co. ++ * ++ * Separated out arch-generic part ++ * Copyright (c) 2006 Isaku Yamahata <yamahata at valinux co jp> ++ * VA Linux Systems Japan K.K. 
++ */ ++ ++#include <linux/init.h> ++#include <linux/notifier.h> ++#include <linux/smp.h> ++#include <linux/oprofile.h> ++#include <linux/sysdev.h> ++#include <linux/slab.h> ++#include <linux/interrupt.h> ++#include <linux/vmalloc.h> ++#include <asm/pgtable.h> ++#include <xen/evtchn.h> ++#include <xen/xenoprof.h> ++#include <xen/driver_util.h> ++#include <xen/interface/xen.h> ++#include <xen/interface/xenoprof.h> ++#include "../../../drivers/oprofile/cpu_buffer.h" ++#include "../../../drivers/oprofile/event_buffer.h" ++ ++#define MAX_XENOPROF_SAMPLES 16 ++ ++/* sample buffers shared with Xen */ ++xenoprof_buf_t * xenoprof_buf[MAX_VIRT_CPUS]; ++/* Shared buffer area */ ++struct xenoprof_shared_buffer shared_buffer; ++ ++/* Passive sample buffers shared with Xen */ ++xenoprof_buf_t *p_xenoprof_buf[MAX_OPROF_DOMAINS][MAX_VIRT_CPUS]; ++/* Passive shared buffer area */ ++struct xenoprof_shared_buffer p_shared_buffer[MAX_OPROF_DOMAINS]; ++ ++static int xenoprof_start(void); ++static void xenoprof_stop(void); ++ ++static int xenoprof_enabled = 0; ++static int xenoprof_is_primary = 0; ++static int active_defined; ++ ++/* Number of buffers in shared area (one per VCPU) */ ++int nbuf; ++/* Mappings of VIRQ_XENOPROF to irq number (per cpu) */ ++int ovf_irq[NR_CPUS]; ++/* cpu model type string - copied from Xen memory space on XENOPROF_init command */ ++char cpu_type[XENOPROF_CPU_TYPE_SIZE]; ++ ++#ifdef CONFIG_PM ++ ++static int xenoprof_suspend(struct sys_device * dev, pm_message_t state) ++{ ++ if (xenoprof_enabled == 1) ++ xenoprof_stop(); ++ return 0; ++} ++ ++ ++static int xenoprof_resume(struct sys_device * dev) ++{ ++ if (xenoprof_enabled == 1) ++ xenoprof_start(); ++ return 0; ++} ++ ++ ++static struct sysdev_class oprofile_sysclass = { ++ set_kset_name("oprofile"), ++ .resume = xenoprof_resume, ++ .suspend = xenoprof_suspend ++}; ++ ++ ++static struct sys_device device_oprofile = { ++ .id = 0, ++ .cls = &oprofile_sysclass, ++}; ++ ++ ++static int __init init_driverfs(void) ++{ ++ int error; ++ if (!(error = sysdev_class_register(&oprofile_sysclass))) ++ error = sysdev_register(&device_oprofile); ++ return error; ++} ++ ++ ++static void exit_driverfs(void) ++{ ++ sysdev_unregister(&device_oprofile); ++ sysdev_class_unregister(&oprofile_sysclass); ++} ++ ++#else ++#define init_driverfs() do { } while (0) ++#define exit_driverfs() do { } while (0) ++#endif /* CONFIG_PM */ ++ ++unsigned long long oprofile_samples = 0; ++unsigned long long p_oprofile_samples = 0; ++ ++unsigned int pdomains; ++struct xenoprof_passive passive_domains[MAX_OPROF_DOMAINS]; ++ ++static void xenoprof_add_pc(xenoprof_buf_t *buf, int is_passive) ++{ ++ int head, tail, size; ++ ++ head = buf->event_head; ++ tail = buf->event_tail; ++ size = buf->event_size; ++ ++ if (tail > head) { ++ while (tail < size) { ++ oprofile_add_pc(buf->event_log[tail].eip, ++ buf->event_log[tail].mode, ++ buf->event_log[tail].event); ++ if (!is_passive) ++ oprofile_samples++; ++ else ++ p_oprofile_samples++; ++ tail++; ++ } ++ tail = 0; ++ } ++ while (tail < head) { ++ oprofile_add_pc(buf->event_log[tail].eip, ++ buf->event_log[tail].mode, ++ buf->event_log[tail].event); ++ if (!is_passive) ++ oprofile_samples++; ++ else ++ p_oprofile_samples++; ++ tail++; ++ } ++ ++ buf->event_tail = tail; ++} ++ ++static void xenoprof_handle_passive(void) ++{ ++ int i, j; ++ int flag_domain, flag_switch = 0; ++ ++ for (i = 0; i < pdomains; i++) { ++ flag_domain = 0; ++ for (j = 0; j < passive_domains[i].nbuf; j++) { ++ xenoprof_buf_t *buf = 
p_xenoprof_buf[i][j]; ++ if (buf->event_head == buf->event_tail) ++ continue; ++ if (!flag_domain) { ++ if (!oprofile_add_domain_switch(passive_domains[i]. ++ domain_id)) ++ goto done; ++ flag_domain = 1; ++ } ++ xenoprof_add_pc(buf, 1); ++ flag_switch = 1; ++ } ++ } ++done: ++ if (flag_switch) ++ oprofile_add_domain_switch(COORDINATOR_DOMAIN); ++} ++ ++static irqreturn_t ++xenoprof_ovf_interrupt(int irq, void * dev_id, struct pt_regs * regs) ++{ ++ struct xenoprof_buf * buf; ++ int cpu; ++ static unsigned long flag; ++ ++ cpu = smp_processor_id(); ++ buf = xenoprof_buf[cpu]; ++ ++ xenoprof_add_pc(buf, 0); ++ ++ if (xenoprof_is_primary && !test_and_set_bit(0, &flag)) { ++ xenoprof_handle_passive(); ++ smp_mb__before_clear_bit(); ++ clear_bit(0, &flag); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++ ++static void unbind_virq(void) ++{ ++ int i; ++ ++ for_each_online_cpu(i) { ++ if (ovf_irq[i] >= 0) { ++ unbind_from_irqhandler(ovf_irq[i], NULL); ++ ovf_irq[i] = -1; ++ } ++ } ++} ++ ++ ++static int bind_virq(void) ++{ ++ int i, result; ++ ++ for_each_online_cpu(i) { ++ result = bind_virq_to_irqhandler(VIRQ_XENOPROF, ++ i, ++ xenoprof_ovf_interrupt, ++ SA_INTERRUPT, ++ "xenoprof", ++ NULL); ++ ++ if (result < 0) { ++ unbind_virq(); ++ return result; ++ } ++ ++ ovf_irq[i] = result; ++ } ++ ++ return 0; ++} ++ ++ ++static void unmap_passive_list(void) ++{ ++ int i; ++ for (i = 0; i < pdomains; i++) ++ xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[i]); ++ pdomains = 0; ++} ++ ++ ++static int map_xenoprof_buffer(int max_samples) ++{ ++ struct xenoprof_get_buffer get_buffer; ++ struct xenoprof_buf *buf; ++ int ret, i; ++ ++ if ( shared_buffer.buffer ) ++ return 0; ++ ++ get_buffer.max_samples = max_samples; ++ ret = xenoprof_arch_map_shared_buffer(&get_buffer, &shared_buffer); ++ if (ret) ++ return ret; ++ nbuf = get_buffer.nbuf; ++ ++ for (i=0; i< nbuf; i++) { ++ buf = (struct xenoprof_buf*) ++ &shared_buffer.buffer[i * get_buffer.bufsize]; ++ BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS); ++ xenoprof_buf[buf->vcpu_id] = buf; ++ } ++ ++ return 0; ++} ++ ++ ++static int xenoprof_setup(void) ++{ ++ int ret; ++ ++ if ( (ret = map_xenoprof_buffer(MAX_XENOPROF_SAMPLES)) ) ++ return ret; ++ ++ if ( (ret = bind_virq()) ) ++ return ret; ++ ++ if (xenoprof_is_primary) { ++ /* Define dom0 as an active domain if not done yet */ ++ if (!active_defined) { ++ domid_t domid; ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL); ++ if (ret) ++ goto err; ++ domid = 0; ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid); ++ if (ret) ++ goto err; ++ active_defined = 1; ++ } ++ ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_reserve_counters, NULL); ++ if (ret) ++ goto err; ++ xenoprof_arch_counter(); ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_setup_events, NULL); ++ ++ if (ret) ++ goto err; ++ } ++ ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_enable_virq, NULL); ++ if (ret) ++ goto err; ++ ++ xenoprof_enabled = 1; ++ return 0; ++ err: ++ unbind_virq(); ++ return ret; ++} ++ ++ ++static void xenoprof_shutdown(void) ++{ ++ xenoprof_enabled = 0; ++ ++ HYPERVISOR_xenoprof_op(XENOPROF_disable_virq, NULL); ++ ++ if (xenoprof_is_primary) { ++ HYPERVISOR_xenoprof_op(XENOPROF_release_counters, NULL); ++ active_defined = 0; ++ } ++ ++ unbind_virq(); ++ ++ xenoprof_arch_unmap_shared_buffer(&shared_buffer); ++ if (xenoprof_is_primary) ++ unmap_passive_list(); ++} ++ ++ ++static int xenoprof_start(void) ++{ ++ int ret = 0; ++ ++ if (xenoprof_is_primary) ++ ret = HYPERVISOR_xenoprof_op(XENOPROF_start, NULL); ++ if (!ret) ++ 
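++	/* start the arch-level counters only once the hypervisor call
++	 * above (issued when this domain is the xenoprof primary) has
++	 * succeeded */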
xenoprof_arch_start();
++	return ret;
++}
++
++
++static void xenoprof_stop(void)
++{
++	if (xenoprof_is_primary)
++		HYPERVISOR_xenoprof_op(XENOPROF_stop, NULL);
++	xenoprof_arch_stop();
++}
++
++
++static int xenoprof_set_active(int * active_domains,
++			       unsigned int adomains)
++{
++	int ret = 0;
++	int i;
++	int set_dom0 = 0;
++	domid_t domid;
++
++	if (!xenoprof_is_primary)
++		return 0;
++
++	if (adomains > MAX_OPROF_DOMAINS)
++		return -E2BIG;
++
++	ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
++	if (ret)
++		return ret;
++
++	for (i=0; i<adomains; i++) {
++		domid = active_domains[i];
++		if (domid != active_domains[i]) {
++			ret = -EINVAL;
++			goto out;
++		}
++		ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
++		if (ret)
++			goto out;
++		if (active_domains[i] == 0)
++			set_dom0 = 1;
++	}
++	/* dom0 must always be active but may not be in the list */
++	if (!set_dom0) {
++		domid = 0;
++		ret = HYPERVISOR_xenoprof_op(XENOPROF_set_active, &domid);
++	}
++
++out:
++	if (ret)
++		HYPERVISOR_xenoprof_op(XENOPROF_reset_active_list, NULL);
++	active_defined = !ret;
++	return ret;
++}
++
++static int xenoprof_set_passive(int * p_domains,
++				unsigned int pdoms)
++{
++	int ret;
++	int i, j;
++	struct xenoprof_buf *buf;
++
++	if (!xenoprof_is_primary)
++		return 0;
++
++	if (pdoms > MAX_OPROF_DOMAINS)
++		return -E2BIG;
++
++	ret = HYPERVISOR_xenoprof_op(XENOPROF_reset_passive_list, NULL);
++	if (ret)
++		return ret;
++	unmap_passive_list();
++
++	for (i = 0; i < pdoms; i++) {
++		passive_domains[i].domain_id = p_domains[i];
++		passive_domains[i].max_samples = 2048;
++		ret = xenoprof_arch_set_passive(&passive_domains[i],
++						&p_shared_buffer[i]);
++		if (ret)
++			goto out;
++		for (j = 0; j < passive_domains[i].nbuf; j++) {
++			buf = (struct xenoprof_buf *)
++				&p_shared_buffer[i].buffer[j * passive_domains[i].bufsize];
++			BUG_ON(buf->vcpu_id >= MAX_VIRT_CPUS);
++			p_xenoprof_buf[i][buf->vcpu_id] = buf;
++		}
++	}
++
++	pdomains = pdoms;
++	return 0;
++
++out:
++	/* unmap only the buffers successfully mapped before the failure */
++	for (j = 0; j < i; j++)
++		xenoprof_arch_unmap_shared_buffer(&p_shared_buffer[j]);
++
++	return ret;
++}
++
++struct oprofile_operations xenoprof_ops = {
++#ifdef HAVE_XENOPROF_CREATE_FILES
++	.create_files = xenoprof_create_files,
++#endif
++	.set_active = xenoprof_set_active,
++	.set_passive = xenoprof_set_passive,
++	.setup = xenoprof_setup,
++	.shutdown = xenoprof_shutdown,
++	.start = xenoprof_start,
++	.stop = xenoprof_stop
++};
++
++
++/* in order to get driverfs right */
++static int using_xenoprof;
++
++int __init xenoprofile_init(struct oprofile_operations * ops)
++{
++	struct xenoprof_init init;
++	int ret, i;
++
++	ret = HYPERVISOR_xenoprof_op(XENOPROF_init, &init);
++	if (!ret) {
++		xenoprof_arch_init_counter(&init);
++		xenoprof_is_primary = init.is_primary;
++
++		/* cpu_type is detected by Xen */
++		cpu_type[XENOPROF_CPU_TYPE_SIZE-1] = 0;
++		strncpy(cpu_type, init.cpu_type, XENOPROF_CPU_TYPE_SIZE - 1);
++		xenoprof_ops.cpu_type = cpu_type;
++
++		init_driverfs();
++		using_xenoprof = 1;
++		*ops = xenoprof_ops;
++
++		for (i=0; i<NR_CPUS; i++)
++			ovf_irq[i] = -1;
++
++		active_defined = 0;
++	}
++	printk(KERN_INFO "%s: ret %d, events %d, xenoprof_is_primary %d\n",
++	       __func__, ret, init.num_events, xenoprof_is_primary);
++	return ret;
++}
++
++
++void xenoprofile_exit(void)
++{
++	if (using_xenoprof)
++		exit_driverfs();
++
++	xenoprof_arch_unmap_shared_buffer(&shared_buffer);
++	if (xenoprof_is_primary) {
++		unmap_passive_list();
++		HYPERVISOR_xenoprof_op(XENOPROF_shutdown, NULL);
++	}
++}