diff options
Diffstat (limited to 'arch/powerpc/platforms/pseries')
39 files changed, 13492 insertions, 0 deletions
diff --git a/arch/powerpc/platforms/pseries/Kconfig b/arch/powerpc/platforms/pseries/Kconfig new file mode 100644 index 00000000..71af4c5d --- /dev/null +++ b/arch/powerpc/platforms/pseries/Kconfig @@ -0,0 +1,120 @@ +config PPC_PSERIES + depends on PPC64 && PPC_BOOK3S + bool "IBM pSeries & new (POWER5-based) iSeries" + select MPIC + select PCI_MSI + select PPC_XICS + select PPC_ICP_NATIVE + select PPC_ICP_HV + select PPC_ICS_RTAS + select PPC_I8259 + select PPC_RTAS + select PPC_RTAS_DAEMON + select RTAS_ERROR_LOGGING + select PPC_UDBG_16550 + select PPC_NATIVE + select PPC_PCI_CHOICE if EXPERT + default y + +config PPC_SPLPAR + depends on PPC_PSERIES + bool "Support for shared-processor logical partitions" + default n + help + Enabling this option will make the kernel run more efficiently + on logically-partitioned pSeries systems which use shared + processors, that is, which share physical processors between + two or more partitions. + +config EEH + bool "PCI Extended Error Handling (EEH)" if EXPERT + depends on PPC_PSERIES && PCI + default y if !EXPERT + +config PSERIES_MSI + bool + depends on PCI_MSI && EEH + default y + +config PSERIES_ENERGY + tristate "pSeries energy management capabilities driver" + depends on PPC_PSERIES + default y + help + Provides interface to platform energy management capabilities + on supported PSERIES platforms. + Provides: /sys/devices/system/cpu/pseries_(de)activation_hint_list + and /sys/devices/system/cpu/cpuN/pseries_(de)activation_hint + +config SCANLOG + tristate "Scanlog dump interface" + depends on RTAS_PROC && PPC_PSERIES + +config IO_EVENT_IRQ + bool "IO Event Interrupt support" + depends on PPC_PSERIES + default y + help + Select this option, if you want to enable support for IO Event + interrupts. IO event interrupt is a mechanism provided by RTAS + to return information about hardware error and non-error events + which may need OS attention. RTAS returns events for multiple + event types and scopes. Device drivers can register their handlers + to receive events. + + This option will only enable the IO event platform code. You + will still need to enable or compile the actual drivers + that use this infrastruture to handle IO event interrupts. + + Say Y if you are unsure. + +config LPARCFG + bool "LPAR Configuration Data" + depends on PPC_PSERIES || PPC_ISERIES + help + Provide system capacity information via human readable + <key word>=<value> pairs through a /proc/ppc64/lparcfg interface. + +config PPC_PSERIES_DEBUG + depends on PPC_PSERIES && PPC_EARLY_DEBUG + bool "Enable extra debug logging in platforms/pseries" + help + Say Y here if you want the pseries core to produce a bunch of + debug messages to the system log. Select this if you are having a + problem with the pseries core and want to see more of what is + going on. This does not enable debugging in lpar.c, which must + be manually done due to its verbosity. + default y + +config PPC_SMLPAR + bool "Support for shared-memory logical partitions" + depends on PPC_PSERIES + select LPARCFG + default n + help + Select this option to enable shared memory partition support. + With this option a system running in an LPAR can be given more + memory than physically available and will allow firmware to + balance memory across many LPARs. + +config CMM + tristate "Collaborative memory management" + depends on PPC_SMLPAR + default y + help + Select this option, if you want to enable the kernel interface + to reduce the memory size of the system. This is accomplished + by allocating pages of memory and put them "on hold". This only + makes sense for a system running in an LPAR where the unused pages + will be reused for other LPARs. The interface allows firmware to + balance memory across many LPARs. + +config DTL + bool "Dispatch Trace Log" + depends on PPC_SPLPAR && DEBUG_FS + help + SPLPAR machines can log hypervisor preempt & dispatch events to a + kernel buffer. Saying Y here will enable logging these events, + which are accessible through a debugfs file. + + Say N if you are unsure. diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile new file mode 100644 index 00000000..3556e402 --- /dev/null +++ b/arch/powerpc/platforms/pseries/Makefile @@ -0,0 +1,28 @@ +ccflags-$(CONFIG_PPC64) := -mno-minimal-toc +ccflags-$(CONFIG_PPC_PSERIES_DEBUG) += -DDEBUG + +obj-y := lpar.o hvCall.o nvram.o reconfig.o \ + setup.o iommu.o event_sources.o ras.o \ + firmware.o power.o dlpar.o mobility.o +obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SCANLOG) += scanlog.o +obj-$(CONFIG_EEH) += eeh.o eeh_cache.o eeh_driver.o eeh_event.o eeh_sysfs.o +obj-$(CONFIG_KEXEC) += kexec.o +obj-$(CONFIG_PCI) += pci.o pci_dlpar.o +obj-$(CONFIG_PSERIES_MSI) += msi.o +obj-$(CONFIG_PSERIES_ENERGY) += pseries_energy.o + +obj-$(CONFIG_HOTPLUG_CPU) += hotplug-cpu.o +obj-$(CONFIG_MEMORY_HOTPLUG) += hotplug-memory.o + +obj-$(CONFIG_HVC_CONSOLE) += hvconsole.o +obj-$(CONFIG_HVCS) += hvcserver.o +obj-$(CONFIG_HCALL_STATS) += hvCall_inst.o +obj-$(CONFIG_PHYP_DUMP) += phyp_dump.o +obj-$(CONFIG_CMM) += cmm.o +obj-$(CONFIG_DTL) += dtl.o +obj-$(CONFIG_IO_EVENT_IRQ) += io_event_irq.o + +ifeq ($(CONFIG_PPC_PSERIES),y) +obj-$(CONFIG_SUSPEND) += suspend.o +endif diff --git a/arch/powerpc/platforms/pseries/cmm.c b/arch/powerpc/platforms/pseries/cmm.c new file mode 100644 index 00000000..3cafc306 --- /dev/null +++ b/arch/powerpc/platforms/pseries/cmm.c @@ -0,0 +1,741 @@ +/* + * Collaborative memory management interface. + * + * Copyright (C) 2008 IBM Corporation + * Author(s): Brian King (brking@linux.vnet.ibm.com), + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + */ + +#include <linux/ctype.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/fs.h> +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kthread.h> +#include <linux/module.h> +#include <linux/oom.h> +#include <linux/reboot.h> +#include <linux/sched.h> +#include <linux/stringify.h> +#include <linux/swap.h> +#include <linux/sysdev.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <asm/mmu.h> +#include <asm/pgalloc.h> +#include <asm/uaccess.h> +#include <linux/memory.h> + +#include "plpar_wrappers.h" + +#define CMM_DRIVER_VERSION "1.0.0" +#define CMM_DEFAULT_DELAY 1 +#define CMM_HOTPLUG_DELAY 5 +#define CMM_DEBUG 0 +#define CMM_DISABLE 0 +#define CMM_OOM_KB 1024 +#define CMM_MIN_MEM_MB 256 +#define KB2PAGES(_p) ((_p)>>(PAGE_SHIFT-10)) +#define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) +/* + * The priority level tries to ensure that this notifier is called as + * late as possible to reduce thrashing in the shared memory pool. + */ +#define CMM_MEM_HOTPLUG_PRI 1 +#define CMM_MEM_ISOLATE_PRI 15 + +static unsigned int delay = CMM_DEFAULT_DELAY; +static unsigned int hotplug_delay = CMM_HOTPLUG_DELAY; +static unsigned int oom_kb = CMM_OOM_KB; +static unsigned int cmm_debug = CMM_DEBUG; +static unsigned int cmm_disabled = CMM_DISABLE; +static unsigned long min_mem_mb = CMM_MIN_MEM_MB; +static struct sys_device cmm_sysdev; + +MODULE_AUTHOR("Brian King <brking@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("IBM System p Collaborative Memory Manager"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(CMM_DRIVER_VERSION); + +module_param_named(delay, delay, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(delay, "Delay (in seconds) between polls to query hypervisor paging requests. " + "[Default=" __stringify(CMM_DEFAULT_DELAY) "]"); +module_param_named(hotplug_delay, hotplug_delay, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(delay, "Delay (in seconds) after memory hotplug remove " + "before loaning resumes. " + "[Default=" __stringify(CMM_HOTPLUG_DELAY) "]"); +module_param_named(oom_kb, oom_kb, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(oom_kb, "Amount of memory in kb to free on OOM. " + "[Default=" __stringify(CMM_OOM_KB) "]"); +module_param_named(min_mem_mb, min_mem_mb, ulong, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(min_mem_mb, "Minimum amount of memory (in MB) to not balloon. " + "[Default=" __stringify(CMM_MIN_MEM_MB) "]"); +module_param_named(debug, cmm_debug, uint, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(debug, "Enable module debugging logging. Set to 1 to enable. " + "[Default=" __stringify(CMM_DEBUG) "]"); + +#define CMM_NR_PAGES ((PAGE_SIZE - sizeof(void *) - sizeof(unsigned long)) / sizeof(unsigned long)) + +#define cmm_dbg(...) if (cmm_debug) { printk(KERN_INFO "cmm: "__VA_ARGS__); } + +struct cmm_page_array { + struct cmm_page_array *next; + unsigned long index; + unsigned long page[CMM_NR_PAGES]; +}; + +static unsigned long loaned_pages; +static unsigned long loaned_pages_target; +static unsigned long oom_freed_pages; + +static struct cmm_page_array *cmm_page_list; +static DEFINE_SPINLOCK(cmm_lock); + +static DEFINE_MUTEX(hotplug_mutex); +static int hotplug_occurred; /* protected by the hotplug mutex */ + +static struct task_struct *cmm_thread_ptr; + +/** + * cmm_alloc_pages - Allocate pages and mark them as loaned + * @nr: number of pages to allocate + * + * Return value: + * number of pages requested to be allocated which were not + **/ +static long cmm_alloc_pages(long nr) +{ + struct cmm_page_array *pa, *npa; + unsigned long addr; + long rc; + + cmm_dbg("Begin request for %ld pages\n", nr); + + while (nr) { + /* Exit if a hotplug operation is in progress or occurred */ + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + mutex_unlock(&hotplug_mutex); + break; + } + mutex_unlock(&hotplug_mutex); + } else { + break; + } + + addr = __get_free_page(GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!addr) + break; + spin_lock(&cmm_lock); + pa = cmm_page_list; + if (!pa || pa->index >= CMM_NR_PAGES) { + /* Need a new page for the page list. */ + spin_unlock(&cmm_lock); + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!npa) { + pr_info("%s: Can not allocate new page list\n", __func__); + free_page(addr); + break; + } + spin_lock(&cmm_lock); + pa = cmm_page_list; + + if (!pa || pa->index >= CMM_NR_PAGES) { + npa->next = pa; + npa->index = 0; + pa = npa; + cmm_page_list = pa; + } else + free_page((unsigned long) npa); + } + + if ((rc = plpar_page_set_loaned(__pa(addr)))) { + pr_err("%s: Can not set page to loaned. rc=%ld\n", __func__, rc); + spin_unlock(&cmm_lock); + free_page(addr); + break; + } + + pa->page[pa->index++] = addr; + loaned_pages++; + totalram_pages--; + spin_unlock(&cmm_lock); + nr--; + } + + cmm_dbg("End request with %ld pages unfulfilled\n", nr); + return nr; +} + +/** + * cmm_free_pages - Free pages and mark them as active + * @nr: number of pages to free + * + * Return value: + * number of pages requested to be freed which were not + **/ +static long cmm_free_pages(long nr) +{ + struct cmm_page_array *pa; + unsigned long addr; + + cmm_dbg("Begin free of %ld pages.\n", nr); + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (nr) { + if (!pa || pa->index <= 0) + break; + addr = pa->page[--pa->index]; + + if (pa->index == 0) { + pa = pa->next; + free_page((unsigned long) cmm_page_list); + cmm_page_list = pa; + } + + plpar_page_set_active(__pa(addr)); + free_page(addr); + loaned_pages--; + nr--; + totalram_pages++; + } + spin_unlock(&cmm_lock); + cmm_dbg("End request with %ld pages unfulfilled\n", nr); + return nr; +} + +/** + * cmm_oom_notify - OOM notifier + * @self: notifier block struct + * @dummy: not used + * @parm: returned - number of pages freed + * + * Return value: + * NOTIFY_OK + **/ +static int cmm_oom_notify(struct notifier_block *self, + unsigned long dummy, void *parm) +{ + unsigned long *freed = parm; + long nr = KB2PAGES(oom_kb); + + cmm_dbg("OOM processing started\n"); + nr = cmm_free_pages(nr); + loaned_pages_target = loaned_pages; + *freed += KB2PAGES(oom_kb) - nr; + oom_freed_pages += KB2PAGES(oom_kb) - nr; + cmm_dbg("OOM processing complete\n"); + return NOTIFY_OK; +} + +/** + * cmm_get_mpp - Read memory performance parameters + * + * Makes hcall to query the current page loan request from the hypervisor. + * + * Return value: + * nothing + **/ +static void cmm_get_mpp(void) +{ + int rc; + struct hvcall_mpp_data mpp_data; + signed long active_pages_target, page_loan_request, target; + signed long total_pages = totalram_pages + loaned_pages; + signed long min_mem_pages = (min_mem_mb * 1024 * 1024) / PAGE_SIZE; + + rc = h_get_mpp(&mpp_data); + + if (rc != H_SUCCESS) + return; + + page_loan_request = div_s64((s64)mpp_data.loan_request, PAGE_SIZE); + target = page_loan_request + (signed long)loaned_pages; + + if (target < 0 || total_pages < min_mem_pages) + target = 0; + + if (target > oom_freed_pages) + target -= oom_freed_pages; + else + target = 0; + + active_pages_target = total_pages - target; + + if (min_mem_pages > active_pages_target) + target = total_pages - min_mem_pages; + + if (target < 0) + target = 0; + + loaned_pages_target = target; + + cmm_dbg("delta = %ld, loaned = %lu, target = %lu, oom = %lu, totalram = %lu\n", + page_loan_request, loaned_pages, loaned_pages_target, + oom_freed_pages, totalram_pages); +} + +static struct notifier_block cmm_oom_nb = { + .notifier_call = cmm_oom_notify +}; + +/** + * cmm_thread - CMM task thread + * @dummy: not used + * + * Return value: + * 0 + **/ +static int cmm_thread(void *dummy) +{ + unsigned long timeleft; + + while (1) { + timeleft = msleep_interruptible(delay * 1000); + + if (kthread_should_stop() || timeleft) + break; + + if (mutex_trylock(&hotplug_mutex)) { + if (hotplug_occurred) { + hotplug_occurred = 0; + mutex_unlock(&hotplug_mutex); + cmm_dbg("Hotplug operation has occurred, " + "loaning activity suspended " + "for %d seconds.\n", + hotplug_delay); + timeleft = msleep_interruptible(hotplug_delay * + 1000); + if (kthread_should_stop() || timeleft) + break; + continue; + } + mutex_unlock(&hotplug_mutex); + } else { + cmm_dbg("Hotplug operation in progress, activity " + "suspended\n"); + continue; + } + + cmm_get_mpp(); + + if (loaned_pages_target > loaned_pages) { + if (cmm_alloc_pages(loaned_pages_target - loaned_pages)) + loaned_pages_target = loaned_pages; + } else if (loaned_pages_target < loaned_pages) + cmm_free_pages(loaned_pages - loaned_pages_target); + } + return 0; +} + +#define CMM_SHOW(name, format, args...) \ + static ssize_t show_##name(struct sys_device *dev, \ + struct sysdev_attribute *attr, \ + char *buf) \ + { \ + return sprintf(buf, format, ##args); \ + } \ + static SYSDEV_ATTR(name, S_IRUGO, show_##name, NULL) + +CMM_SHOW(loaned_kb, "%lu\n", PAGES2KB(loaned_pages)); +CMM_SHOW(loaned_target_kb, "%lu\n", PAGES2KB(loaned_pages_target)); + +static ssize_t show_oom_pages(struct sys_device *dev, + struct sysdev_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", PAGES2KB(oom_freed_pages)); +} + +static ssize_t store_oom_pages(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + unsigned long val = simple_strtoul (buf, NULL, 10); + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (val != 0) + return -EBADMSG; + + oom_freed_pages = 0; + return count; +} + +static SYSDEV_ATTR(oom_freed_kb, S_IWUSR| S_IRUGO, + show_oom_pages, store_oom_pages); + +static struct sysdev_attribute *cmm_attrs[] = { + &attr_loaned_kb, + &attr_loaned_target_kb, + &attr_oom_freed_kb, +}; + +static struct sysdev_class cmm_sysdev_class = { + .name = "cmm", +}; + +/** + * cmm_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_sysfs_register(struct sys_device *sysdev) +{ + int i, rc; + + if ((rc = sysdev_class_register(&cmm_sysdev_class))) + return rc; + + sysdev->id = 0; + sysdev->cls = &cmm_sysdev_class; + + if ((rc = sysdev_register(sysdev))) + goto class_unregister; + + for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) { + if ((rc = sysdev_create_file(sysdev, cmm_attrs[i]))) + goto fail; + } + + return 0; + +fail: + while (--i >= 0) + sysdev_remove_file(sysdev, cmm_attrs[i]); + sysdev_unregister(sysdev); +class_unregister: + sysdev_class_unregister(&cmm_sysdev_class); + return rc; +} + +/** + * cmm_unregister_sysfs - Unregister from sysfs + * + **/ +static void cmm_unregister_sysfs(struct sys_device *sysdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(cmm_attrs); i++) + sysdev_remove_file(sysdev, cmm_attrs[i]); + sysdev_unregister(sysdev); + sysdev_class_unregister(&cmm_sysdev_class); +} + +/** + * cmm_reboot_notifier - Make sure pages are not still marked as "loaned" + * + **/ +static int cmm_reboot_notifier(struct notifier_block *nb, + unsigned long action, void *unused) +{ + if (action == SYS_RESTART) { + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + cmm_thread_ptr = NULL; + cmm_free_pages(loaned_pages); + } + return NOTIFY_DONE; +} + +static struct notifier_block cmm_reboot_nb = { + .notifier_call = cmm_reboot_notifier, +}; + +/** + * cmm_count_pages - Count the number of pages loaned in a particular range. + * + * @arg: memory_isolate_notify structure with address range and count + * + * Return value: + * 0 on success + **/ +static unsigned long cmm_count_pages(void *arg) +{ + struct memory_isolate_notify *marg = arg; + struct cmm_page_array *pa; + unsigned long start = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end = start + (marg->nr_pages << PAGE_SHIFT); + unsigned long idx; + + spin_lock(&cmm_lock); + pa = cmm_page_list; + while (pa) { + if ((unsigned long)pa >= start && (unsigned long)pa < end) + marg->pages_found++; + for (idx = 0; idx < pa->index; idx++) + if (pa->page[idx] >= start && pa->page[idx] < end) + marg->pages_found++; + pa = pa->next; + } + spin_unlock(&cmm_lock); + return 0; +} + +/** + * cmm_memory_isolate_cb - Handle memory isolation notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_isolate_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + **/ +static int cmm_memory_isolate_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + if (action == MEM_ISOLATE_COUNT) + ret = cmm_count_pages(arg); + + return notifier_from_errno(ret); +} + +static struct notifier_block cmm_mem_isolate_nb = { + .notifier_call = cmm_memory_isolate_cb, + .priority = CMM_MEM_ISOLATE_PRI +}; + +/** + * cmm_mem_going_offline - Unloan pages where memory is to be removed + * @arg: memory_notify structure with page range to be offlined + * + * Return value: + * 0 on success + **/ +static int cmm_mem_going_offline(void *arg) +{ + struct memory_notify *marg = arg; + unsigned long start_page = (unsigned long)pfn_to_kaddr(marg->start_pfn); + unsigned long end_page = start_page + (marg->nr_pages << PAGE_SHIFT); + struct cmm_page_array *pa_curr, *pa_last, *npa; + unsigned long idx; + unsigned long freed = 0; + + cmm_dbg("Memory going offline, searching 0x%lx (%ld pages).\n", + start_page, marg->nr_pages); + spin_lock(&cmm_lock); + + /* Search the page list for pages in the range to be offlined */ + pa_last = pa_curr = cmm_page_list; + while (pa_curr) { + for (idx = (pa_curr->index - 1); (idx + 1) > 0; idx--) { + if ((pa_curr->page[idx] < start_page) || + (pa_curr->page[idx] >= end_page)) + continue; + + plpar_page_set_active(__pa(pa_curr->page[idx])); + free_page(pa_curr->page[idx]); + freed++; + loaned_pages--; + totalram_pages++; + pa_curr->page[idx] = pa_last->page[--pa_last->index]; + if (pa_last->index == 0) { + if (pa_curr == pa_last) + pa_curr = pa_last->next; + pa_last = pa_last->next; + free_page((unsigned long)cmm_page_list); + cmm_page_list = pa_last; + continue; + } + } + pa_curr = pa_curr->next; + } + + /* Search for page list structures in the range to be offlined */ + pa_last = NULL; + pa_curr = cmm_page_list; + while (pa_curr) { + if (((unsigned long)pa_curr >= start_page) && + ((unsigned long)pa_curr < end_page)) { + npa = (struct cmm_page_array *)__get_free_page( + GFP_NOIO | __GFP_NOWARN | + __GFP_NORETRY | __GFP_NOMEMALLOC); + if (!npa) { + spin_unlock(&cmm_lock); + cmm_dbg("Failed to allocate memory for list " + "management. Memory hotplug " + "failed.\n"); + return ENOMEM; + } + memcpy(npa, pa_curr, PAGE_SIZE); + if (pa_curr == cmm_page_list) + cmm_page_list = npa; + if (pa_last) + pa_last->next = npa; + free_page((unsigned long) pa_curr); + freed++; + pa_curr = npa; + } + + pa_last = pa_curr; + pa_curr = pa_curr->next; + } + + spin_unlock(&cmm_lock); + cmm_dbg("Released %ld pages in the search range.\n", freed); + + return 0; +} + +/** + * cmm_memory_cb - Handle memory hotplug notifier calls + * @self: notifier block struct + * @action: action to take + * @arg: struct memory_notify data for handler + * + * Return value: + * NOTIFY_OK or notifier error based on subfunction return value + * + **/ +static int cmm_memory_cb(struct notifier_block *self, + unsigned long action, void *arg) +{ + int ret = 0; + + switch (action) { + case MEM_GOING_OFFLINE: + mutex_lock(&hotplug_mutex); + hotplug_occurred = 1; + ret = cmm_mem_going_offline(arg); + break; + case MEM_OFFLINE: + case MEM_CANCEL_OFFLINE: + mutex_unlock(&hotplug_mutex); + cmm_dbg("Memory offline operation complete.\n"); + break; + case MEM_GOING_ONLINE: + case MEM_ONLINE: + case MEM_CANCEL_ONLINE: + break; + } + + return notifier_from_errno(ret); +} + +static struct notifier_block cmm_mem_nb = { + .notifier_call = cmm_memory_cb, + .priority = CMM_MEM_HOTPLUG_PRI +}; + +/** + * cmm_init - Module initialization + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_init(void) +{ + int rc = -ENOMEM; + + if (!firmware_has_feature(FW_FEATURE_CMO)) + return -EOPNOTSUPP; + + if ((rc = register_oom_notifier(&cmm_oom_nb)) < 0) + return rc; + + if ((rc = register_reboot_notifier(&cmm_reboot_nb))) + goto out_oom_notifier; + + if ((rc = cmm_sysfs_register(&cmm_sysdev))) + goto out_reboot_notifier; + + if (register_memory_notifier(&cmm_mem_nb) || + register_memory_isolate_notifier(&cmm_mem_isolate_nb)) + goto out_unregister_notifier; + + if (cmm_disabled) + return rc; + + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (IS_ERR(cmm_thread_ptr)) { + rc = PTR_ERR(cmm_thread_ptr); + goto out_unregister_notifier; + } + + return rc; + +out_unregister_notifier: + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); + cmm_unregister_sysfs(&cmm_sysdev); +out_reboot_notifier: + unregister_reboot_notifier(&cmm_reboot_nb); +out_oom_notifier: + unregister_oom_notifier(&cmm_oom_nb); + return rc; +} + +/** + * cmm_exit - Module exit + * + * Return value: + * nothing + **/ +static void cmm_exit(void) +{ + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + unregister_oom_notifier(&cmm_oom_nb); + unregister_reboot_notifier(&cmm_reboot_nb); + unregister_memory_notifier(&cmm_mem_nb); + unregister_memory_isolate_notifier(&cmm_mem_isolate_nb); + cmm_free_pages(loaned_pages); + cmm_unregister_sysfs(&cmm_sysdev); +} + +/** + * cmm_set_disable - Disable/Enable CMM + * + * Return value: + * 0 on success / other on failure + **/ +static int cmm_set_disable(const char *val, struct kernel_param *kp) +{ + int disable = simple_strtoul(val, NULL, 10); + + if (disable != 0 && disable != 1) + return -EINVAL; + + if (disable && !cmm_disabled) { + if (cmm_thread_ptr) + kthread_stop(cmm_thread_ptr); + cmm_thread_ptr = NULL; + cmm_free_pages(loaned_pages); + } else if (!disable && cmm_disabled) { + cmm_thread_ptr = kthread_run(cmm_thread, NULL, "cmmthread"); + if (IS_ERR(cmm_thread_ptr)) + return PTR_ERR(cmm_thread_ptr); + } + + cmm_disabled = disable; + return 0; +} + +module_param_call(disable, cmm_set_disable, param_get_uint, + &cmm_disabled, S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(disable, "Disable CMM. Set to 1 to disable. " + "[Default=" __stringify(CMM_DISABLE) "]"); + +module_init(cmm_init); +module_exit(cmm_exit); diff --git a/arch/powerpc/platforms/pseries/dlpar.c b/arch/powerpc/platforms/pseries/dlpar.c new file mode 100644 index 00000000..82766e5a --- /dev/null +++ b/arch/powerpc/platforms/pseries/dlpar.c @@ -0,0 +1,567 @@ +/* + * Support for dynamic reconfiguration for PCI, Memory, and CPU + * Hotplug and Dynamic Logical Partitioning on RPA platforms. + * + * Copyright (C) 2009 Nathan Fontenot + * Copyright (C) 2009 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kref.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/spinlock.h> +#include <linux/cpu.h> +#include <linux/slab.h> +#include "offline_states.h" + +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> +#include <asm/pSeries_reconfig.h> + +struct cc_workarea { + u32 drc_index; + u32 zero; + u32 name_offset; + u32 prop_length; + u32 prop_offset; +}; + +void dlpar_free_cc_property(struct property *prop) +{ + kfree(prop->name); + kfree(prop->value); + kfree(prop); +} + +static struct property *dlpar_parse_cc_property(struct cc_workarea *ccwa) +{ + struct property *prop; + char *name; + char *value; + + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return NULL; + + name = (char *)ccwa + ccwa->name_offset; + prop->name = kstrdup(name, GFP_KERNEL); + + prop->length = ccwa->prop_length; + value = (char *)ccwa + ccwa->prop_offset; + prop->value = kmemdup(value, prop->length, GFP_KERNEL); + if (!prop->value) { + dlpar_free_cc_property(prop); + return NULL; + } + + return prop; +} + +static struct device_node *dlpar_parse_cc_node(struct cc_workarea *ccwa) +{ + struct device_node *dn; + char *name; + + dn = kzalloc(sizeof(*dn), GFP_KERNEL); + if (!dn) + return NULL; + + /* The configure connector reported name does not contain a + * preceding '/', so we allocate a buffer large enough to + * prepend this to the full_name. + */ + name = (char *)ccwa + ccwa->name_offset; + dn->full_name = kasprintf(GFP_KERNEL, "/%s", name); + if (!dn->full_name) { + kfree(dn); + return NULL; + } + + return dn; +} + +static void dlpar_free_one_cc_node(struct device_node *dn) +{ + struct property *prop; + + while (dn->properties) { + prop = dn->properties; + dn->properties = prop->next; + dlpar_free_cc_property(prop); + } + + kfree(dn->full_name); + kfree(dn); +} + +void dlpar_free_cc_nodes(struct device_node *dn) +{ + if (dn->child) + dlpar_free_cc_nodes(dn->child); + + if (dn->sibling) + dlpar_free_cc_nodes(dn->sibling); + + dlpar_free_one_cc_node(dn); +} + +#define COMPLETE 0 +#define NEXT_SIBLING 1 +#define NEXT_CHILD 2 +#define NEXT_PROPERTY 3 +#define PREV_PARENT 4 +#define MORE_MEMORY 5 +#define CALL_AGAIN -2 +#define ERR_CFG_USE -9003 + +struct device_node *dlpar_configure_connector(u32 drc_index) +{ + struct device_node *dn; + struct device_node *first_dn = NULL; + struct device_node *last_dn = NULL; + struct property *property; + struct property *last_property = NULL; + struct cc_workarea *ccwa; + char *data_buf; + int cc_token; + int rc = -1; + + cc_token = rtas_token("ibm,configure-connector"); + if (cc_token == RTAS_UNKNOWN_SERVICE) + return NULL; + + data_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!data_buf) + return NULL; + + ccwa = (struct cc_workarea *)&data_buf[0]; + ccwa->drc_index = drc_index; + ccwa->zero = 0; + + do { + /* Since we release the rtas_data_buf lock between configure + * connector calls we want to re-populate the rtas_data_buffer + * with the contents of the previous call. + */ + spin_lock(&rtas_data_buf_lock); + + memcpy(rtas_data_buf, data_buf, RTAS_DATA_BUF_SIZE); + rc = rtas_call(cc_token, 2, 1, NULL, rtas_data_buf, NULL); + memcpy(data_buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); + + spin_unlock(&rtas_data_buf_lock); + + switch (rc) { + case COMPLETE: + break; + + case NEXT_SIBLING: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + dn->parent = last_dn->parent; + last_dn->sibling = dn; + last_dn = dn; + break; + + case NEXT_CHILD: + dn = dlpar_parse_cc_node(ccwa); + if (!dn) + goto cc_error; + + if (!first_dn) + first_dn = dn; + else { + dn->parent = last_dn; + if (last_dn) + last_dn->child = dn; + } + + last_dn = dn; + break; + + case NEXT_PROPERTY: + property = dlpar_parse_cc_property(ccwa); + if (!property) + goto cc_error; + + if (!last_dn->properties) + last_dn->properties = property; + else + last_property->next = property; + + last_property = property; + break; + + case PREV_PARENT: + last_dn = last_dn->parent; + break; + + case CALL_AGAIN: + break; + + case MORE_MEMORY: + case ERR_CFG_USE: + default: + printk(KERN_ERR "Unexpected Error (%d) " + "returned from configure-connector\n", rc); + goto cc_error; + } + } while (rc); + +cc_error: + kfree(data_buf); + + if (rc) { + if (first_dn) + dlpar_free_cc_nodes(first_dn); + + return NULL; + } + + return first_dn; +} + +static struct device_node *derive_parent(const char *path) +{ + struct device_node *parent; + char *last_slash; + + last_slash = strrchr(path, '/'); + if (last_slash == path) { + parent = of_find_node_by_path("/"); + } else { + char *parent_path; + int parent_path_len = last_slash - path + 1; + parent_path = kmalloc(parent_path_len, GFP_KERNEL); + if (!parent_path) + return NULL; + + strlcpy(parent_path, path, parent_path_len); + parent = of_find_node_by_path(parent_path); + kfree(parent_path); + } + + return parent; +} + +int dlpar_attach_node(struct device_node *dn) +{ +#ifdef CONFIG_PROC_DEVICETREE + struct proc_dir_entry *ent; +#endif + int rc; + + of_node_set_flag(dn, OF_DYNAMIC); + kref_init(&dn->kref); + dn->parent = derive_parent(dn->full_name); + if (!dn->parent) + return -ENOMEM; + + rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_ADD, dn); + if (rc == NOTIFY_BAD) { + printk(KERN_ERR "Failed to add device node %s\n", + dn->full_name); + return -ENOMEM; /* For now, safe to assume kmalloc failure */ + } + + of_attach_node(dn); + +#ifdef CONFIG_PROC_DEVICETREE + ent = proc_mkdir(strrchr(dn->full_name, '/') + 1, dn->parent->pde); + if (ent) + proc_device_tree_add_node(dn, ent); +#endif + + of_node_put(dn->parent); + return 0; +} + +int dlpar_detach_node(struct device_node *dn) +{ +#ifdef CONFIG_PROC_DEVICETREE + struct device_node *parent = dn->parent; + struct property *prop = dn->properties; + + while (prop) { + remove_proc_entry(prop->name, dn->pde); + prop = prop->next; + } + + if (dn->pde) + remove_proc_entry(dn->pde->name, parent->pde); +#endif + + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_REMOVE, dn); + of_detach_node(dn); + of_node_put(dn); /* Must decrement the refcount */ + + return 0; +} + +#define DR_ENTITY_SENSE 9003 +#define DR_ENTITY_PRESENT 1 +#define DR_ENTITY_UNUSABLE 2 +#define ALLOCATION_STATE 9003 +#define ALLOC_UNUSABLE 0 +#define ALLOC_USABLE 1 +#define ISOLATION_STATE 9001 +#define ISOLATE 0 +#define UNISOLATE 1 + +int dlpar_acquire_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_UNUSABLE) + return -1; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_USABLE); + if (rc) + return rc; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + if (rc) { + rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + return rc; + } + + return 0; +} + +int dlpar_release_drc(u32 drc_index) +{ + int dr_status, rc; + + rc = rtas_call(rtas_token("get-sensor-state"), 2, 2, &dr_status, + DR_ENTITY_SENSE, drc_index); + if (rc || dr_status != DR_ENTITY_PRESENT) + return -1; + + rc = rtas_set_indicator(ISOLATION_STATE, drc_index, ISOLATE); + if (rc) + return rc; + + rc = rtas_set_indicator(ALLOCATION_STATE, drc_index, ALLOC_UNUSABLE); + if (rc) { + rtas_set_indicator(ISOLATION_STATE, drc_index, UNISOLATE); + return rc; + } + + return 0; +} + +#ifdef CONFIG_ARCH_CPU_PROBE_RELEASE + +static int dlpar_online_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + BUG_ON(get_cpu_current_state(cpu) + != CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_up(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to online " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_probe(const char *buf, size_t count) +{ + struct device_node *dn; + unsigned long drc_index; + char *cpu_name; + int rc; + + cpu_hotplug_driver_lock(); + rc = strict_strtoul(buf, 0, &drc_index); + if (rc) { + rc = -EINVAL; + goto out; + } + + dn = dlpar_configure_connector(drc_index); + if (!dn) { + rc = -EINVAL; + goto out; + } + + /* configure-connector reports cpus as living in the base + * directory of the device tree. CPUs actually live in the + * cpus directory so we need to fixup the full_name. + */ + cpu_name = kasprintf(GFP_KERNEL, "/cpus%s", dn->full_name); + if (!cpu_name) { + dlpar_free_cc_nodes(dn); + rc = -ENOMEM; + goto out; + } + + kfree(dn->full_name); + dn->full_name = cpu_name; + + rc = dlpar_acquire_drc(drc_index); + if (rc) { + dlpar_free_cc_nodes(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_attach_node(dn); + if (rc) { + dlpar_release_drc(drc_index); + dlpar_free_cc_nodes(dn); + goto out; + } + + rc = dlpar_online_cpu(dn); +out: + cpu_hotplug_driver_unlock(); + + return rc ? rc : count; +} + +static int dlpar_offline_cpu(struct device_node *dn) +{ + int rc = 0; + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(dn, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return -EINVAL; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + + if (get_cpu_current_state(cpu) == CPU_STATE_OFFLINE) + break; + + if (get_cpu_current_state(cpu) == CPU_STATE_ONLINE) { + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + cpu_maps_update_done(); + rc = cpu_down(cpu); + if (rc) + goto out; + cpu_maps_update_begin(); + break; + + } + + /* + * The cpu is in CPU_STATE_INACTIVE. + * Upgrade it's state to CPU_STATE_OFFLINE. + */ + set_preferred_offline_state(cpu, CPU_STATE_OFFLINE); + BUG_ON(plpar_hcall_norets(H_PROD, intserv[i]) + != H_SUCCESS); + __cpu_die(cpu); + break; + } + if (cpu == num_possible_cpus()) + printk(KERN_WARNING "Could not find cpu to offline " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); + +out: + return rc; + +} + +static ssize_t dlpar_cpu_release(const char *buf, size_t count) +{ + struct device_node *dn; + const u32 *drc_index; + int rc; + + dn = of_find_node_by_path(buf); + if (!dn) + return -EINVAL; + + drc_index = of_get_property(dn, "ibm,my-drc-index", NULL); + if (!drc_index) { + of_node_put(dn); + return -EINVAL; + } + + cpu_hotplug_driver_lock(); + rc = dlpar_offline_cpu(dn); + if (rc) { + of_node_put(dn); + rc = -EINVAL; + goto out; + } + + rc = dlpar_release_drc(*drc_index); + if (rc) { + of_node_put(dn); + goto out; + } + + rc = dlpar_detach_node(dn); + if (rc) { + dlpar_acquire_drc(*drc_index); + goto out; + } + + of_node_put(dn); +out: + cpu_hotplug_driver_unlock(); + return rc ? rc : count; +} + +static int __init pseries_dlpar_init(void) +{ + ppc_md.cpu_probe = dlpar_cpu_probe; + ppc_md.cpu_release = dlpar_cpu_release; + + return 0; +} +machine_device_initcall(pseries, pseries_dlpar_init); + +#endif /* CONFIG_ARCH_CPU_PROBE_RELEASE */ diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c new file mode 100644 index 00000000..0e865637 --- /dev/null +++ b/arch/powerpc/platforms/pseries/dtl.c @@ -0,0 +1,396 @@ +/* + * Virtual Processor Dispatch Trace Log + * + * (C) Copyright IBM Corporation 2009 + * + * Author: Jeremy Kerr <jk@ozlabs.org> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2, or (at your option) + * any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/init.h> +#include <linux/slab.h> +#include <linux/debugfs.h> +#include <linux/spinlock.h> +#include <asm/smp.h> +#include <asm/system.h> +#include <asm/uaccess.h> +#include <asm/firmware.h> +#include <asm/lppaca.h> + +#include "plpar_wrappers.h" + +struct dtl { + struct dtl_entry *buf; + struct dentry *file; + int cpu; + int buf_entries; + u64 last_idx; + spinlock_t lock; +}; +static DEFINE_PER_CPU(struct dtl, cpu_dtl); + +/* + * Dispatch trace log event mask: + * 0x7: 0x1: voluntary virtual processor waits + * 0x2: time-slice preempts + * 0x4: virtual partition memory page faults + */ +static u8 dtl_event_mask = 0x7; + + +/* + * Size of per-cpu log buffers. Firmware requires that the buffer does + * not cross a 4k boundary. + */ +static int dtl_buf_entries = N_DISPATCH_LOG; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +struct dtl_ring { + u64 write_index; + struct dtl_entry *write_ptr; + struct dtl_entry *buf; + struct dtl_entry *buf_end; + u8 saved_dtl_mask; +}; + +static DEFINE_PER_CPU(struct dtl_ring, dtl_rings); + +static atomic_t dtl_count; + +/* + * The cpu accounting code controls the DTL ring buffer, and we get + * given entries as they are processed. + */ +static void consume_dtle(struct dtl_entry *dtle, u64 index) +{ + struct dtl_ring *dtlr = &__get_cpu_var(dtl_rings); + struct dtl_entry *wp = dtlr->write_ptr; + struct lppaca *vpa = local_paca->lppaca_ptr; + + if (!wp) + return; + + *wp = *dtle; + barrier(); + + /* check for hypervisor ring buffer overflow, ignore this entry if so */ + if (index + N_DISPATCH_LOG < vpa->dtl_idx) + return; + + ++wp; + if (wp == dtlr->buf_end) + wp = dtlr->buf; + dtlr->write_ptr = wp; + + /* incrementing write_index makes the new entry visible */ + smp_wmb(); + ++dtlr->write_index; +} + +static int dtl_start(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->buf = dtl->buf; + dtlr->buf_end = dtl->buf + dtl->buf_entries; + dtlr->write_index = 0; + + /* setting write_ptr enables logging into our buffer */ + smp_wmb(); + dtlr->write_ptr = dtl->buf; + + /* enable event logging */ + dtlr->saved_dtl_mask = lppaca_of(dtl->cpu).dtl_enable_mask; + lppaca_of(dtl->cpu).dtl_enable_mask |= dtl_event_mask; + + dtl_consumer = consume_dtle; + atomic_inc(&dtl_count); + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + struct dtl_ring *dtlr = &per_cpu(dtl_rings, dtl->cpu); + + dtlr->write_ptr = NULL; + smp_wmb(); + + dtlr->buf = NULL; + + /* restore dtl_enable_mask */ + lppaca_of(dtl->cpu).dtl_enable_mask = dtlr->saved_dtl_mask; + + if (atomic_dec_and_test(&dtl_count)) + dtl_consumer = NULL; +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return per_cpu(dtl_rings, dtl->cpu).write_index; +} + +#else /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_start(struct dtl *dtl) +{ + unsigned long addr; + int ret, hwcpu; + + /* Register our dtl buffer with the hypervisor. The HV expects the + * buffer size to be passed in the second word of the buffer */ + ((u32 *)dtl->buf)[1] = DISPATCH_LOG_BYTES; + + hwcpu = get_hard_smp_processor_id(dtl->cpu); + addr = __pa(dtl->buf); + ret = register_dtl(hwcpu, addr); + if (ret) { + printk(KERN_WARNING "%s: DTL registration for cpu %d (hw %d) " + "failed with %d\n", __func__, dtl->cpu, hwcpu, ret); + return -EIO; + } + + /* set our initial buffer indices */ + lppaca_of(dtl->cpu).dtl_idx = 0; + + /* ensure that our updates to the lppaca fields have occurred before + * we actually enable the logging */ + smp_wmb(); + + /* enable event logging */ + lppaca_of(dtl->cpu).dtl_enable_mask = dtl_event_mask; + + return 0; +} + +static void dtl_stop(struct dtl *dtl) +{ + int hwcpu = get_hard_smp_processor_id(dtl->cpu); + + lppaca_of(dtl->cpu).dtl_enable_mask = 0x0; + + unregister_dtl(hwcpu); +} + +static u64 dtl_current_index(struct dtl *dtl) +{ + return lppaca_of(dtl->cpu).dtl_idx; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int dtl_enable(struct dtl *dtl) +{ + long int n_entries; + long int rc; + struct dtl_entry *buf = NULL; + + if (!dtl_cache) + return -ENOMEM; + + /* only allow one reader */ + if (dtl->buf) + return -EBUSY; + + n_entries = dtl_buf_entries; + buf = kmem_cache_alloc_node(dtl_cache, GFP_KERNEL, cpu_to_node(dtl->cpu)); + if (!buf) { + printk(KERN_WARNING "%s: buffer alloc failed for cpu %d\n", + __func__, dtl->cpu); + return -ENOMEM; + } + + spin_lock(&dtl->lock); + rc = -EBUSY; + if (!dtl->buf) { + /* store the original allocation size for use during read */ + dtl->buf_entries = n_entries; + dtl->buf = buf; + dtl->last_idx = 0; + rc = dtl_start(dtl); + if (rc) + dtl->buf = NULL; + } + spin_unlock(&dtl->lock); + + if (rc) + kmem_cache_free(dtl_cache, buf); + return rc; +} + +static void dtl_disable(struct dtl *dtl) +{ + spin_lock(&dtl->lock); + dtl_stop(dtl); + kmem_cache_free(dtl_cache, dtl->buf); + dtl->buf = NULL; + dtl->buf_entries = 0; + spin_unlock(&dtl->lock); +} + +/* file interface */ + +static int dtl_file_open(struct inode *inode, struct file *filp) +{ + struct dtl *dtl = inode->i_private; + int rc; + + rc = dtl_enable(dtl); + if (rc) + return rc; + + filp->private_data = dtl; + return 0; +} + +static int dtl_file_release(struct inode *inode, struct file *filp) +{ + struct dtl *dtl = inode->i_private; + dtl_disable(dtl); + return 0; +} + +static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + long int rc, n_read, n_req, read_size; + struct dtl *dtl; + u64 cur_idx, last_idx, i; + + if ((len % sizeof(struct dtl_entry)) != 0) + return -EINVAL; + + dtl = filp->private_data; + + /* requested number of entries to read */ + n_req = len / sizeof(struct dtl_entry); + + /* actual number of entries read */ + n_read = 0; + + spin_lock(&dtl->lock); + + cur_idx = dtl_current_index(dtl); + last_idx = dtl->last_idx; + + if (last_idx + dtl->buf_entries <= cur_idx) + last_idx = cur_idx - dtl->buf_entries + 1; + + if (last_idx + n_req > cur_idx) + n_req = cur_idx - last_idx; + + if (n_req > 0) + dtl->last_idx = last_idx + n_req; + + spin_unlock(&dtl->lock); + + if (n_req <= 0) + return 0; + + i = last_idx % dtl->buf_entries; + + /* read the tail of the buffer if we've wrapped */ + if (i + n_req > dtl->buf_entries) { + read_size = dtl->buf_entries - i; + + rc = copy_to_user(buf, &dtl->buf[i], + read_size * sizeof(struct dtl_entry)); + if (rc) + return -EFAULT; + + i = 0; + n_req -= read_size; + n_read += read_size; + buf += read_size * sizeof(struct dtl_entry); + } + + /* .. and now the head */ + rc = copy_to_user(buf, &dtl->buf[i], n_req * sizeof(struct dtl_entry)); + if (rc) + return -EFAULT; + + n_read += n_req; + + return n_read * sizeof(struct dtl_entry); +} + +static const struct file_operations dtl_fops = { + .open = dtl_file_open, + .release = dtl_file_release, + .read = dtl_file_read, + .llseek = no_llseek, +}; + +static struct dentry *dtl_dir; + +static int dtl_setup_file(struct dtl *dtl) +{ + char name[10]; + + sprintf(name, "cpu-%d", dtl->cpu); + + dtl->file = debugfs_create_file(name, 0400, dtl_dir, dtl, &dtl_fops); + if (!dtl->file) + return -ENOMEM; + + return 0; +} + +static int dtl_init(void) +{ + struct dentry *event_mask_file, *buf_entries_file; + int rc, i; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return -ENODEV; + + /* set up common debugfs structure */ + + rc = -ENOMEM; + dtl_dir = debugfs_create_dir("dtl", powerpc_debugfs_root); + if (!dtl_dir) { + printk(KERN_WARNING "%s: can't create dtl root dir\n", + __func__); + goto err; + } + + event_mask_file = debugfs_create_x8("dtl_event_mask", 0600, + dtl_dir, &dtl_event_mask); + buf_entries_file = debugfs_create_u32("dtl_buf_entries", 0400, + dtl_dir, &dtl_buf_entries); + + if (!event_mask_file || !buf_entries_file) { + printk(KERN_WARNING "%s: can't create dtl files\n", __func__); + goto err_remove_dir; + } + + /* set up the per-cpu log structures */ + for_each_possible_cpu(i) { + struct dtl *dtl = &per_cpu(cpu_dtl, i); + spin_lock_init(&dtl->lock); + dtl->cpu = i; + + rc = dtl_setup_file(dtl); + if (rc) + goto err_remove_dir; + } + + return 0; + +err_remove_dir: + debugfs_remove_recursive(dtl_dir); +err: + return rc; +} +arch_initcall(dtl_init); diff --git a/arch/powerpc/platforms/pseries/eeh.c b/arch/powerpc/platforms/pseries/eeh.c new file mode 100644 index 00000000..36087045 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh.c @@ -0,0 +1,1344 @@ +/* + * eeh.c + * Copyright IBM Corporation 2001, 2005, 2006 + * Copyright Dave Engebretsen & Todd Inglett 2001 + * Copyright Linas Vepstas 2005, 2006 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Please address comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ + +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/proc_fs.h> +#include <linux/rbtree.h> +#include <linux/seq_file.h> +#include <linux/spinlock.h> +#include <linux/of.h> + +#include <asm/atomic.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/io.h> +#include <asm/machdep.h> +#include <asm/ppc-pci.h> +#include <asm/rtas.h> + + +/** Overview: + * EEH, or "Extended Error Handling" is a PCI bridge technology for + * dealing with PCI bus errors that can't be dealt with within the + * usual PCI framework, except by check-stopping the CPU. Systems + * that are designed for high-availability/reliability cannot afford + * to crash due to a "mere" PCI error, thus the need for EEH. + * An EEH-capable bridge operates by converting a detected error + * into a "slot freeze", taking the PCI adapter off-line, making + * the slot behave, from the OS'es point of view, as if the slot + * were "empty": all reads return 0xff's and all writes are silently + * ignored. EEH slot isolation events can be triggered by parity + * errors on the address or data busses (e.g. during posted writes), + * which in turn might be caused by low voltage on the bus, dust, + * vibration, humidity, radioactivity or plain-old failed hardware. + * + * Note, however, that one of the leading causes of EEH slot + * freeze events are buggy device drivers, buggy device microcode, + * or buggy device hardware. This is because any attempt by the + * device to bus-master data to a memory address that is not + * assigned to the device will trigger a slot freeze. (The idea + * is to prevent devices-gone-wild from corrupting system memory). + * Buggy hardware/drivers will have a miserable time co-existing + * with EEH. + * + * Ideally, a PCI device driver, when suspecting that an isolation + * event has occurred (e.g. by reading 0xff's), will then ask EEH + * whether this is the case, and then take appropriate steps to + * reset the PCI slot, the PCI device, and then resume operations. + * However, until that day, the checking is done here, with the + * eeh_check_failure() routine embedded in the MMIO macros. If + * the slot is found to be isolated, an "EEH Event" is synthesized + * and sent out for processing. + */ + +/* If a device driver keeps reading an MMIO register in an interrupt + * handler after a slot isolation event, it might be broken. + * This sets the threshold for how many read attempts we allow + * before printing an error message. + */ +#define EEH_MAX_FAILS 2100000 + +/* Time to wait for a PCI slot to report status, in milliseconds */ +#define PCI_BUS_RESET_WAIT_MSEC (60*1000) + +/* RTAS tokens */ +static int ibm_set_eeh_option; +static int ibm_set_slot_reset; +static int ibm_read_slot_reset_state; +static int ibm_read_slot_reset_state2; +static int ibm_slot_error_detail; +static int ibm_get_config_addr_info; +static int ibm_get_config_addr_info2; +static int ibm_configure_bridge; +static int ibm_configure_pe; + +int eeh_subsystem_enabled; +EXPORT_SYMBOL(eeh_subsystem_enabled); + +/* Lock to avoid races due to multiple reports of an error */ +static DEFINE_RAW_SPINLOCK(confirm_error_lock); + +/* Buffer for reporting slot-error-detail rtas calls. Its here + * in BSS, and not dynamically alloced, so that it ends up in + * RMO where RTAS can access it. + */ +static unsigned char slot_errbuf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(slot_errbuf_lock); +static int eeh_error_buf_size; + +/* Buffer for reporting pci register dumps. Its here in BSS, and + * not dynamically alloced, so that it ends up in RMO where RTAS + * can access it. + */ +#define EEH_PCI_REGS_LOG_LEN 4096 +static unsigned char pci_regs_buf[EEH_PCI_REGS_LOG_LEN]; + +/* System monitoring statistics */ +static unsigned long no_device; +static unsigned long no_dn; +static unsigned long no_cfg_addr; +static unsigned long ignored_check; +static unsigned long total_mmio_ffs; +static unsigned long false_positives; +static unsigned long slot_resets; + +#define IS_BRIDGE(class_code) (((class_code)<<16) == PCI_BASE_CLASS_BRIDGE) + +/* --------------------------------------------------------------- */ +/* Below lies the EEH event infrastructure */ + +static void rtas_slot_error_detail(struct pci_dn *pdn, int severity, + char *driver_log, size_t loglen) +{ + int config_addr; + unsigned long flags; + int rc; + + /* Log the error with the rtas logger */ + spin_lock_irqsave(&slot_errbuf_lock, flags); + memset(slot_errbuf, 0, eeh_error_buf_size); + + /* Use PE configuration address, if present */ + config_addr = pdn->eeh_config_addr; + if (pdn->eeh_pe_config_addr) + config_addr = pdn->eeh_pe_config_addr; + + rc = rtas_call(ibm_slot_error_detail, + 8, 1, NULL, config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), + virt_to_phys(driver_log), loglen, + virt_to_phys(slot_errbuf), + eeh_error_buf_size, + severity); + + if (rc == 0) + log_error(slot_errbuf, ERR_TYPE_RTAS_LOG, 0); + spin_unlock_irqrestore(&slot_errbuf_lock, flags); +} + +/** + * gather_pci_data - copy assorted PCI config space registers to buff + * @pdn: device to report data for + * @buf: point to buffer in which to log + * @len: amount of room in buffer + * + * This routine captures assorted PCI configuration space data, + * and puts them into a buffer for RTAS error logging. + */ +static size_t gather_pci_data(struct pci_dn *pdn, char * buf, size_t len) +{ + struct pci_dev *dev = pdn->pcidev; + u32 cfg; + int cap, i; + int n = 0; + + n += scnprintf(buf+n, len-n, "%s\n", pdn->node->full_name); + printk(KERN_WARNING "EEH: of node=%s\n", pdn->node->full_name); + + rtas_read_config(pdn, PCI_VENDOR_ID, 4, &cfg); + n += scnprintf(buf+n, len-n, "dev/vend:%08x\n", cfg); + printk(KERN_WARNING "EEH: PCI device/vendor: %08x\n", cfg); + + rtas_read_config(pdn, PCI_COMMAND, 4, &cfg); + n += scnprintf(buf+n, len-n, "cmd/stat:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI cmd/status register: %08x\n", cfg); + + if (!dev) { + printk(KERN_WARNING "EEH: no PCI device for this of node\n"); + return n; + } + + /* Gather bridge-specific registers */ + if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { + rtas_read_config(pdn, PCI_SEC_STATUS, 2, &cfg); + n += scnprintf(buf+n, len-n, "sec stat:%x\n", cfg); + printk(KERN_WARNING "EEH: Bridge secondary status: %04x\n", cfg); + + rtas_read_config(pdn, PCI_BRIDGE_CONTROL, 2, &cfg); + n += scnprintf(buf+n, len-n, "brdg ctl:%x\n", cfg); + printk(KERN_WARNING "EEH: Bridge control: %04x\n", cfg); + } + + /* Dump out the PCI-X command and status regs */ + cap = pci_find_capability(dev, PCI_CAP_ID_PCIX); + if (cap) { + rtas_read_config(pdn, cap, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-cmd:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI-X cmd: %08x\n", cfg); + + rtas_read_config(pdn, cap+4, 4, &cfg); + n += scnprintf(buf+n, len-n, "pcix-stat:%x\n", cfg); + printk(KERN_WARNING "EEH: PCI-X status: %08x\n", cfg); + } + + /* If PCI-E capable, dump PCI-E cap 10, and the AER */ + cap = pci_find_capability(dev, PCI_CAP_ID_EXP); + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e cap10:\n"); + printk(KERN_WARNING + "EEH: PCI-E capabilities and status follow:\n"); + + for (i=0; i<=8; i++) { + rtas_read_config(pdn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + printk(KERN_WARNING "EEH: PCI-E %02x: %08x\n", i, cfg); + } + + cap = pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ERR); + if (cap) { + n += scnprintf(buf+n, len-n, "pci-e AER:\n"); + printk(KERN_WARNING + "EEH: PCI-E AER capability register set follows:\n"); + + for (i=0; i<14; i++) { + rtas_read_config(pdn, cap+4*i, 4, &cfg); + n += scnprintf(buf+n, len-n, "%02x:%x\n", 4*i, cfg); + printk(KERN_WARNING "EEH: PCI-E AER %02x: %08x\n", i, cfg); + } + } + } + + /* Gather status on devices under the bridge */ + if (dev->class >> 16 == PCI_BASE_CLASS_BRIDGE) { + struct device_node *dn; + + for_each_child_of_node(pdn->node, dn) { + pdn = PCI_DN(dn); + if (pdn) + n += gather_pci_data(pdn, buf+n, len-n); + } + } + + return n; +} + +void eeh_slot_error_detail(struct pci_dn *pdn, int severity) +{ + size_t loglen = 0; + pci_regs_buf[0] = 0; + + rtas_pci_enable(pdn, EEH_THAW_MMIO); + rtas_configure_bridge(pdn); + eeh_restore_bars(pdn); + loglen = gather_pci_data(pdn, pci_regs_buf, EEH_PCI_REGS_LOG_LEN); + + rtas_slot_error_detail(pdn, severity, pci_regs_buf, loglen); +} + +/** + * read_slot_reset_state - Read the reset state of a device node's slot + * @dn: device node to read + * @rets: array to return results in + */ +static int read_slot_reset_state(struct pci_dn *pdn, int rets[]) +{ + int token, outputs; + int config_addr; + + if (ibm_read_slot_reset_state2 != RTAS_UNKNOWN_SERVICE) { + token = ibm_read_slot_reset_state2; + outputs = 4; + } else { + token = ibm_read_slot_reset_state; + rets[2] = 0; /* fake PE Unavailable info */ + outputs = 3; + } + + /* Use PE configuration address, if present */ + config_addr = pdn->eeh_config_addr; + if (pdn->eeh_pe_config_addr) + config_addr = pdn->eeh_pe_config_addr; + + return rtas_call(token, 3, outputs, rets, config_addr, + BUID_HI(pdn->phb->buid), BUID_LO(pdn->phb->buid)); +} + +/** + * eeh_wait_for_slot_status - returns error status of slot + * @pdn pci device node + * @max_wait_msecs maximum number to millisecs to wait + * + * Return negative value if a permanent error, else return + * Partition Endpoint (PE) status value. + * + * If @max_wait_msecs is positive, then this routine will + * sleep until a valid status can be obtained, or until + * the max allowed wait time is exceeded, in which case + * a -2 is returned. + */ +int +eeh_wait_for_slot_status(struct pci_dn *pdn, int max_wait_msecs) +{ + int rc; + int rets[3]; + int mwait; + + while (1) { + rc = read_slot_reset_state(pdn, rets); + if (rc) return rc; + if (rets[1] == 0) return -1; /* EEH is not supported */ + + if (rets[0] != 5) return rets[0]; /* return actual status */ + + if (rets[2] == 0) return -1; /* permanently unavailable */ + + if (max_wait_msecs <= 0) break; + + mwait = rets[2]; + if (mwait <= 0) { + printk (KERN_WARNING + "EEH: Firmware returned bad wait value=%d\n", mwait); + mwait = 1000; + } else if (mwait > 300*1000) { + printk (KERN_WARNING + "EEH: Firmware is taking too long, time=%d\n", mwait); + mwait = 300*1000; + } + max_wait_msecs -= mwait; + msleep (mwait); + } + + printk(KERN_WARNING "EEH: Timed out waiting for slot status\n"); + return -2; +} + +/** + * eeh_token_to_phys - convert EEH address token to phys address + * @token i/o token, should be address in the form 0xA.... + */ +static inline unsigned long eeh_token_to_phys(unsigned long token) +{ + pte_t *ptep; + unsigned long pa; + + ptep = find_linux_pte(init_mm.pgd, token); + if (!ptep) + return token; + pa = pte_pfn(*ptep) << PAGE_SHIFT; + + return pa | (token & (PAGE_SIZE-1)); +} + +/** + * Return the "partitionable endpoint" (pe) under which this device lies + */ +struct device_node * find_device_pe(struct device_node *dn) +{ + while ((dn->parent) && PCI_DN(dn->parent) && + (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + dn = dn->parent; + } + return dn; +} + +/** Mark all devices that are children of this device as failed. + * Mark the device driver too, so that it can see the failure + * immediately; this is critical, since some drivers poll + * status registers in interrupts ... If a driver is polling, + * and the slot is frozen, then the driver can deadlock in + * an interrupt context, which is bad. + */ + +static void __eeh_mark_slot(struct device_node *parent, int mode_flag) +{ + struct device_node *dn; + + for_each_child_of_node(parent, dn) { + if (PCI_DN(dn)) { + /* Mark the pci device driver too */ + struct pci_dev *dev = PCI_DN(dn)->pcidev; + + PCI_DN(dn)->eeh_mode |= mode_flag; + + if (dev && dev->driver) + dev->error_state = pci_channel_io_frozen; + + __eeh_mark_slot(dn, mode_flag); + } + } +} + +void eeh_mark_slot (struct device_node *dn, int mode_flag) +{ + struct pci_dev *dev; + dn = find_device_pe (dn); + + /* Back up one, since config addrs might be shared */ + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent; + + PCI_DN(dn)->eeh_mode |= mode_flag; + + /* Mark the pci device too */ + dev = PCI_DN(dn)->pcidev; + if (dev) + dev->error_state = pci_channel_io_frozen; + + __eeh_mark_slot(dn, mode_flag); +} + +static void __eeh_clear_slot(struct device_node *parent, int mode_flag) +{ + struct device_node *dn; + + for_each_child_of_node(parent, dn) { + if (PCI_DN(dn)) { + PCI_DN(dn)->eeh_mode &= ~mode_flag; + PCI_DN(dn)->eeh_check_count = 0; + __eeh_clear_slot(dn, mode_flag); + } + } +} + +void eeh_clear_slot (struct device_node *dn, int mode_flag) +{ + unsigned long flags; + raw_spin_lock_irqsave(&confirm_error_lock, flags); + + dn = find_device_pe (dn); + + /* Back up one, since config addrs might be shared */ + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent; + + PCI_DN(dn)->eeh_mode &= ~mode_flag; + PCI_DN(dn)->eeh_check_count = 0; + __eeh_clear_slot(dn, mode_flag); + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); +} + +void __eeh_set_pe_freset(struct device_node *parent, unsigned int *freset) +{ + struct device_node *dn; + + for_each_child_of_node(parent, dn) { + if (PCI_DN(dn)) { + + struct pci_dev *dev = PCI_DN(dn)->pcidev; + + if (dev && dev->driver) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); + } + } +} + +void eeh_set_pe_freset(struct device_node *dn, unsigned int *freset) +{ + struct pci_dev *dev; + dn = find_device_pe(dn); + + /* Back up one, since config addrs might be shared */ + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent; + + dev = PCI_DN(dn)->pcidev; + if (dev) + *freset |= dev->needs_freset; + + __eeh_set_pe_freset(dn, freset); +} + +/** + * eeh_dn_check_failure - check if all 1's data is due to EEH slot freeze + * @dn device node + * @dev pci device, if known + * + * Check for an EEH failure for the given device node. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze. This routine + * will query firmware for the EEH status. + * + * Returns 0 if there has not been an EEH error; otherwise returns + * a non-zero value and queues up a slot isolation event notification. + * + * It is safe to call this routine in an interrupt context. + */ +int eeh_dn_check_failure(struct device_node *dn, struct pci_dev *dev) +{ + int ret; + int rets[3]; + unsigned long flags; + struct pci_dn *pdn; + int rc = 0; + const char *location; + + total_mmio_ffs++; + + if (!eeh_subsystem_enabled) + return 0; + + if (!dn) { + no_dn++; + return 0; + } + dn = find_device_pe(dn); + pdn = PCI_DN(dn); + + /* Access to IO BARs might get this far and still not want checking. */ + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { + ignored_check++; + pr_debug("EEH: Ignored check (%x) for %s %s\n", + pdn->eeh_mode, eeh_pci_name(dev), dn->full_name); + return 0; + } + + if (!pdn->eeh_config_addr && !pdn->eeh_pe_config_addr) { + no_cfg_addr++; + return 0; + } + + /* If we already have a pending isolation event for this + * slot, we know it's bad already, we don't need to check. + * Do this checking under a lock; as multiple PCI devices + * in one slot might report errors simultaneously, and we + * only want one error recovery routine running. + */ + raw_spin_lock_irqsave(&confirm_error_lock, flags); + rc = 1; + if (pdn->eeh_mode & EEH_MODE_ISOLATED) { + pdn->eeh_check_count ++; + if (pdn->eeh_check_count % EEH_MAX_FAILS == 0) { + location = of_get_property(dn, "ibm,loc-code", NULL); + printk (KERN_ERR "EEH: %d reads ignored for recovering device at " + "location=%s driver=%s pci addr=%s\n", + pdn->eeh_check_count, location, + dev->driver->name, eeh_pci_name(dev)); + printk (KERN_ERR "EEH: Might be infinite loop in %s driver\n", + dev->driver->name); + dump_stack(); + } + goto dn_unlock; + } + + /* + * Now test for an EEH failure. This is VERY expensive. + * Note that the eeh_config_addr may be a parent device + * in the case of a device behind a bridge, or it may be + * function zero of a multi-function device. + * In any case they must share a common PHB. + */ + ret = read_slot_reset_state(pdn, rets); + + /* If the call to firmware failed, punt */ + if (ret != 0) { + printk(KERN_WARNING "EEH: read_slot_reset_state() failed; rc=%d dn=%s\n", + ret, dn->full_name); + false_positives++; + pdn->eeh_false_positives ++; + rc = 0; + goto dn_unlock; + } + + /* Note that config-io to empty slots may fail; + * they are empty when they don't have children. */ + if ((rets[0] == 5) && (rets[2] == 0) && (dn->child == NULL)) { + false_positives++; + pdn->eeh_false_positives ++; + rc = 0; + goto dn_unlock; + } + + /* If EEH is not supported on this device, punt. */ + if (rets[1] != 1) { + printk(KERN_WARNING "EEH: event on unsupported device, rc=%d dn=%s\n", + ret, dn->full_name); + false_positives++; + pdn->eeh_false_positives ++; + rc = 0; + goto dn_unlock; + } + + /* If not the kind of error we know about, punt. */ + if (rets[0] != 1 && rets[0] != 2 && rets[0] != 4 && rets[0] != 5) { + false_positives++; + pdn->eeh_false_positives ++; + rc = 0; + goto dn_unlock; + } + + slot_resets++; + + /* Avoid repeated reports of this failure, including problems + * with other functions on this device, and functions under + * bridges. */ + eeh_mark_slot (dn, EEH_MODE_ISOLATED); + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); + + eeh_send_failure_event (dn, dev); + + /* Most EEH events are due to device driver bugs. Having + * a stack trace will help the device-driver authors figure + * out what happened. So print that out. */ + dump_stack(); + return 1; + +dn_unlock: + raw_spin_unlock_irqrestore(&confirm_error_lock, flags); + return rc; +} + +EXPORT_SYMBOL_GPL(eeh_dn_check_failure); + +/** + * eeh_check_failure - check if all 1's data is due to EEH slot freeze + * @token i/o token, should be address in the form 0xA.... + * @val value, should be all 1's (XXX why do we need this arg??) + * + * Check for an EEH failure at the given token address. Call this + * routine if the result of a read was all 0xff's and you want to + * find out if this is due to an EEH slot freeze event. This routine + * will query firmware for the EEH status. + * + * Note this routine is safe to call in an interrupt context. + */ +unsigned long eeh_check_failure(const volatile void __iomem *token, unsigned long val) +{ + unsigned long addr; + struct pci_dev *dev; + struct device_node *dn; + + /* Finding the phys addr + pci device; this is pretty quick. */ + addr = eeh_token_to_phys((unsigned long __force) token); + dev = pci_get_device_by_addr(addr); + if (!dev) { + no_device++; + return val; + } + + dn = pci_device_to_OF_node(dev); + eeh_dn_check_failure (dn, dev); + + pci_dev_put(dev); + return val; +} + +EXPORT_SYMBOL(eeh_check_failure); + +/* ------------------------------------------------------------- */ +/* The code below deals with error recovery */ + +/** + * rtas_pci_enable - enable MMIO or DMA transfers for this slot + * @pdn pci device node + */ + +int +rtas_pci_enable(struct pci_dn *pdn, int function) +{ + int config_addr; + int rc; + + /* Use PE configuration address, if present */ + config_addr = pdn->eeh_config_addr; + if (pdn->eeh_pe_config_addr) + config_addr = pdn->eeh_pe_config_addr; + + rc = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), + function); + + if (rc) + printk(KERN_WARNING "EEH: Unexpected state change %d, err=%d dn=%s\n", + function, rc, pdn->node->full_name); + + rc = eeh_wait_for_slot_status (pdn, PCI_BUS_RESET_WAIT_MSEC); + if ((rc == 4) && (function == EEH_THAW_MMIO)) + return 0; + + return rc; +} + +/** + * rtas_pci_slot_reset - raises/lowers the pci #RST line + * @pdn pci device node + * @state: 1/0 to raise/lower the #RST + * + * Clear the EEH-frozen condition on a slot. This routine + * asserts the PCI #RST line if the 'state' argument is '1', + * and drops the #RST line if 'state is '0'. This routine is + * safe to call in an interrupt context. + * + */ + +static void +rtas_pci_slot_reset(struct pci_dn *pdn, int state) +{ + int config_addr; + int rc; + + BUG_ON (pdn==NULL); + + if (!pdn->phb) { + printk (KERN_WARNING "EEH: in slot reset, device node %s has no phb\n", + pdn->node->full_name); + return; + } + + /* Use PE configuration address, if present */ + config_addr = pdn->eeh_config_addr; + if (pdn->eeh_pe_config_addr) + config_addr = pdn->eeh_pe_config_addr; + + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), + state); + + /* Fundamental-reset not supported on this PE, try hot-reset */ + if (rc == -8 && state == 3) { + rc = rtas_call(ibm_set_slot_reset, 4, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid), 1); + if (rc) + printk(KERN_WARNING + "EEH: Unable to reset the failed slot," + " #RST=%d dn=%s\n", + rc, pdn->node->full_name); + } +} + +/** + * pcibios_set_pcie_slot_reset - Set PCI-E reset state + * @dev: pci device struct + * @state: reset state to enter + * + * Return value: + * 0 if success + **/ +int pcibios_set_pcie_reset_state(struct pci_dev *dev, enum pcie_reset_state state) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + struct pci_dn *pdn = PCI_DN(dn); + + switch (state) { + case pcie_deassert_reset: + rtas_pci_slot_reset(pdn, 0); + break; + case pcie_hot_reset: + rtas_pci_slot_reset(pdn, 1); + break; + case pcie_warm_reset: + rtas_pci_slot_reset(pdn, 3); + break; + default: + return -EINVAL; + }; + + return 0; +} + +/** + * rtas_set_slot_reset -- assert the pci #RST line for 1/4 second + * @pdn: pci device node to be reset. + */ + +static void __rtas_set_slot_reset(struct pci_dn *pdn) +{ + unsigned int freset = 0; + + /* Determine type of EEH reset required for + * Partitionable Endpoint, a hot-reset (1) + * or a fundamental reset (3). + * A fundamental reset required by any device under + * Partitionable Endpoint trumps hot-reset. + */ + eeh_set_pe_freset(pdn->node, &freset); + + if (freset) + rtas_pci_slot_reset(pdn, 3); + else + rtas_pci_slot_reset(pdn, 1); + + /* The PCI bus requires that the reset be held high for at least + * a 100 milliseconds. We wait a bit longer 'just in case'. */ + +#define PCI_BUS_RST_HOLD_TIME_MSEC 250 + msleep (PCI_BUS_RST_HOLD_TIME_MSEC); + + /* We might get hit with another EEH freeze as soon as the + * pci slot reset line is dropped. Make sure we don't miss + * these, and clear the flag now. */ + eeh_clear_slot (pdn->node, EEH_MODE_ISOLATED); + + rtas_pci_slot_reset (pdn, 0); + + /* After a PCI slot has been reset, the PCI Express spec requires + * a 1.5 second idle time for the bus to stabilize, before starting + * up traffic. */ +#define PCI_BUS_SETTLE_TIME_MSEC 1800 + msleep (PCI_BUS_SETTLE_TIME_MSEC); +} + +int rtas_set_slot_reset(struct pci_dn *pdn) +{ + int i, rc; + + /* Take three shots at resetting the bus */ + for (i=0; i<3; i++) { + __rtas_set_slot_reset(pdn); + + rc = eeh_wait_for_slot_status(pdn, PCI_BUS_RESET_WAIT_MSEC); + if (rc == 0) + return 0; + + if (rc < 0) { + printk(KERN_ERR "EEH: unrecoverable slot failure %s\n", + pdn->node->full_name); + return -1; + } + printk(KERN_ERR "EEH: bus reset %d failed on slot %s, rc=%d\n", + i+1, pdn->node->full_name, rc); + } + + return -1; +} + +/* ------------------------------------------------------- */ +/** Save and restore of PCI BARs + * + * Although firmware will set up BARs during boot, it doesn't + * set up device BAR's after a device reset, although it will, + * if requested, set up bridge configuration. Thus, we need to + * configure the PCI devices ourselves. + */ + +/** + * __restore_bars - Restore the Base Address Registers + * @pdn: pci device node + * + * Loads the PCI configuration space base address registers, + * the expansion ROM base address, the latency timer, and etc. + * from the saved values in the device node. + */ +static inline void __restore_bars (struct pci_dn *pdn) +{ + int i; + u32 cmd; + + if (NULL==pdn->phb) return; + for (i=4; i<10; i++) { + rtas_write_config(pdn, i*4, 4, pdn->config_space[i]); + } + + /* 12 == Expansion ROM Address */ + rtas_write_config(pdn, 12*4, 4, pdn->config_space[12]); + +#define BYTE_SWAP(OFF) (8*((OFF)/4)+3-(OFF)) +#define SAVED_BYTE(OFF) (((u8 *)(pdn->config_space))[BYTE_SWAP(OFF)]) + + rtas_write_config (pdn, PCI_CACHE_LINE_SIZE, 1, + SAVED_BYTE(PCI_CACHE_LINE_SIZE)); + + rtas_write_config (pdn, PCI_LATENCY_TIMER, 1, + SAVED_BYTE(PCI_LATENCY_TIMER)); + + /* max latency, min grant, interrupt pin and line */ + rtas_write_config(pdn, 15*4, 4, pdn->config_space[15]); + + /* Restore PERR & SERR bits, some devices require it, + don't touch the other command bits */ + rtas_read_config(pdn, PCI_COMMAND, 4, &cmd); + if (pdn->config_space[1] & PCI_COMMAND_PARITY) + cmd |= PCI_COMMAND_PARITY; + else + cmd &= ~PCI_COMMAND_PARITY; + if (pdn->config_space[1] & PCI_COMMAND_SERR) + cmd |= PCI_COMMAND_SERR; + else + cmd &= ~PCI_COMMAND_SERR; + rtas_write_config(pdn, PCI_COMMAND, 4, cmd); +} + +/** + * eeh_restore_bars - restore the PCI config space info + * + * This routine performs a recursive walk to the children + * of this device as well. + */ +void eeh_restore_bars(struct pci_dn *pdn) +{ + struct device_node *dn; + if (!pdn) + return; + + if ((pdn->eeh_mode & EEH_MODE_SUPPORTED) && !IS_BRIDGE(pdn->class_code)) + __restore_bars (pdn); + + for_each_child_of_node(pdn->node, dn) + eeh_restore_bars (PCI_DN(dn)); +} + +/** + * eeh_save_bars - save device bars + * + * Save the values of the device bars. Unlike the restore + * routine, this routine is *not* recursive. This is because + * PCI devices are added individually; but, for the restore, + * an entire slot is reset at a time. + */ +static void eeh_save_bars(struct pci_dn *pdn) +{ + int i; + + if (!pdn ) + return; + + for (i = 0; i < 16; i++) + rtas_read_config(pdn, i * 4, 4, &pdn->config_space[i]); +} + +void +rtas_configure_bridge(struct pci_dn *pdn) +{ + int config_addr; + int rc; + int token; + + /* Use PE configuration address, if present */ + config_addr = pdn->eeh_config_addr; + if (pdn->eeh_pe_config_addr) + config_addr = pdn->eeh_pe_config_addr; + + /* Use new configure-pe function, if supported */ + if (ibm_configure_pe != RTAS_UNKNOWN_SERVICE) + token = ibm_configure_pe; + else + token = ibm_configure_bridge; + + rc = rtas_call(token, 3, 1, NULL, + config_addr, + BUID_HI(pdn->phb->buid), + BUID_LO(pdn->phb->buid)); + if (rc) { + printk (KERN_WARNING "EEH: Unable to configure device bridge (%d) for %s\n", + rc, pdn->node->full_name); + } +} + +/* ------------------------------------------------------------- */ +/* The code below deals with enabling EEH for devices during the + * early boot sequence. EEH must be enabled before any PCI probing + * can be done. + */ + +#define EEH_ENABLE 1 + +struct eeh_early_enable_info { + unsigned int buid_hi; + unsigned int buid_lo; +}; + +static int get_pe_addr (int config_addr, + struct eeh_early_enable_info *info) +{ + unsigned int rets[3]; + int ret; + + /* Use latest config-addr token on power6 */ + if (ibm_get_config_addr_info2 != RTAS_UNKNOWN_SERVICE) { + /* Make sure we have a PE in hand */ + ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, + config_addr, info->buid_hi, info->buid_lo, 1); + if (ret || (rets[0]==0)) + return 0; + + ret = rtas_call (ibm_get_config_addr_info2, 4, 2, rets, + config_addr, info->buid_hi, info->buid_lo, 0); + if (ret) + return 0; + return rets[0]; + } + + /* Use older config-addr token on power5 */ + if (ibm_get_config_addr_info != RTAS_UNKNOWN_SERVICE) { + ret = rtas_call (ibm_get_config_addr_info, 4, 2, rets, + config_addr, info->buid_hi, info->buid_lo, 0); + if (ret) + return 0; + return rets[0]; + } + return 0; +} + +/* Enable eeh for the given device node. */ +static void *early_enable_eeh(struct device_node *dn, void *data) +{ + unsigned int rets[3]; + struct eeh_early_enable_info *info = data; + int ret; + const u32 *class_code = of_get_property(dn, "class-code", NULL); + const u32 *vendor_id = of_get_property(dn, "vendor-id", NULL); + const u32 *device_id = of_get_property(dn, "device-id", NULL); + const u32 *regs; + int enable; + struct pci_dn *pdn = PCI_DN(dn); + + pdn->class_code = 0; + pdn->eeh_mode = 0; + pdn->eeh_check_count = 0; + pdn->eeh_freeze_count = 0; + pdn->eeh_false_positives = 0; + + if (!of_device_is_available(dn)) + return NULL; + + /* Ignore bad nodes. */ + if (!class_code || !vendor_id || !device_id) + return NULL; + + /* There is nothing to check on PCI to ISA bridges */ + if (dn->type && !strcmp(dn->type, "isa")) { + pdn->eeh_mode |= EEH_MODE_NOCHECK; + return NULL; + } + pdn->class_code = *class_code; + + /* Ok... see if this device supports EEH. Some do, some don't, + * and the only way to find out is to check each and every one. */ + regs = of_get_property(dn, "reg", NULL); + if (regs) { + /* First register entry is addr (00BBSS00) */ + /* Try to enable eeh */ + ret = rtas_call(ibm_set_eeh_option, 4, 1, NULL, + regs[0], info->buid_hi, info->buid_lo, + EEH_ENABLE); + + enable = 0; + if (ret == 0) { + pdn->eeh_config_addr = regs[0]; + + /* If the newer, better, ibm,get-config-addr-info is supported, + * then use that instead. */ + pdn->eeh_pe_config_addr = get_pe_addr(pdn->eeh_config_addr, info); + + /* Some older systems (Power4) allow the + * ibm,set-eeh-option call to succeed even on nodes + * where EEH is not supported. Verify support + * explicitly. */ + ret = read_slot_reset_state(pdn, rets); + if ((ret == 0) && (rets[1] == 1)) + enable = 1; + } + + if (enable) { + eeh_subsystem_enabled = 1; + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + + pr_debug("EEH: %s: eeh enabled, config=%x pe_config=%x\n", + dn->full_name, pdn->eeh_config_addr, + pdn->eeh_pe_config_addr); + } else { + + /* This device doesn't support EEH, but it may have an + * EEH parent, in which case we mark it as supported. */ + if (dn->parent && PCI_DN(dn->parent) + && (PCI_DN(dn->parent)->eeh_mode & EEH_MODE_SUPPORTED)) { + /* Parent supports EEH. */ + pdn->eeh_mode |= EEH_MODE_SUPPORTED; + pdn->eeh_config_addr = PCI_DN(dn->parent)->eeh_config_addr; + return NULL; + } + } + } else { + printk(KERN_WARNING "EEH: %s: unable to get reg property.\n", + dn->full_name); + } + + eeh_save_bars(pdn); + return NULL; +} + +/* + * Initialize EEH by trying to enable it for all of the adapters in the system. + * As a side effect we can determine here if eeh is supported at all. + * Note that we leave EEH on so failed config cycles won't cause a machine + * check. If a user turns off EEH for a particular adapter they are really + * telling Linux to ignore errors. Some hardware (e.g. POWER5) won't + * grant access to a slot if EEH isn't enabled, and so we always enable + * EEH for all slots/all devices. + * + * The eeh-force-off option disables EEH checking globally, for all slots. + * Even if force-off is set, the EEH hardware is still enabled, so that + * newer systems can boot. + */ +void __init eeh_init(void) +{ + struct device_node *phb, *np; + struct eeh_early_enable_info info; + + raw_spin_lock_init(&confirm_error_lock); + spin_lock_init(&slot_errbuf_lock); + + np = of_find_node_by_path("/rtas"); + if (np == NULL) + return; + + ibm_set_eeh_option = rtas_token("ibm,set-eeh-option"); + ibm_set_slot_reset = rtas_token("ibm,set-slot-reset"); + ibm_read_slot_reset_state2 = rtas_token("ibm,read-slot-reset-state2"); + ibm_read_slot_reset_state = rtas_token("ibm,read-slot-reset-state"); + ibm_slot_error_detail = rtas_token("ibm,slot-error-detail"); + ibm_get_config_addr_info = rtas_token("ibm,get-config-addr-info"); + ibm_get_config_addr_info2 = rtas_token("ibm,get-config-addr-info2"); + ibm_configure_bridge = rtas_token ("ibm,configure-bridge"); + ibm_configure_pe = rtas_token("ibm,configure-pe"); + + if (ibm_set_eeh_option == RTAS_UNKNOWN_SERVICE) + return; + + eeh_error_buf_size = rtas_token("rtas-error-log-max"); + if (eeh_error_buf_size == RTAS_UNKNOWN_SERVICE) { + eeh_error_buf_size = 1024; + } + if (eeh_error_buf_size > RTAS_ERROR_LOG_MAX) { + printk(KERN_WARNING "EEH: rtas-error-log-max is bigger than allocated " + "buffer ! (%d vs %d)", eeh_error_buf_size, RTAS_ERROR_LOG_MAX); + eeh_error_buf_size = RTAS_ERROR_LOG_MAX; + } + + /* Enable EEH for all adapters. Note that eeh requires buid's */ + for (phb = of_find_node_by_name(NULL, "pci"); phb; + phb = of_find_node_by_name(phb, "pci")) { + unsigned long buid; + + buid = get_phb_buid(phb); + if (buid == 0 || PCI_DN(phb) == NULL) + continue; + + info.buid_lo = BUID_LO(buid); + info.buid_hi = BUID_HI(buid); + traverse_pci_devices(phb, early_enable_eeh, &info); + } + + if (eeh_subsystem_enabled) + printk(KERN_INFO "EEH: PCI Enhanced I/O Error Handling Enabled\n"); + else + printk(KERN_WARNING "EEH: No capable adapters found\n"); +} + +/** + * eeh_add_device_early - enable EEH for the indicated device_node + * @dn: device node for which to set up EEH + * + * This routine must be used to perform EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + * This routine must be called before any i/o is performed to the + * adapter (inluding any config-space i/o). + * Whether this actually enables EEH or not for this device depends + * on the CEC architecture, type of the device, on earlier boot + * command-line arguments & etc. + */ +static void eeh_add_device_early(struct device_node *dn) +{ + struct pci_controller *phb; + struct eeh_early_enable_info info; + + if (!dn || !PCI_DN(dn)) + return; + phb = PCI_DN(dn)->phb; + + /* USB Bus children of PCI devices will not have BUID's */ + if (NULL == phb || 0 == phb->buid) + return; + + info.buid_hi = BUID_HI(phb->buid); + info.buid_lo = BUID_LO(phb->buid); + early_enable_eeh(dn, &info); +} + +void eeh_add_device_tree_early(struct device_node *dn) +{ + struct device_node *sib; + + for_each_child_of_node(dn, sib) + eeh_add_device_tree_early(sib); + eeh_add_device_early(dn); +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_early); + +/** + * eeh_add_device_late - perform EEH initialization for the indicated pci device + * @dev: pci device for which to set up EEH + * + * This routine must be used to complete EEH initialization for PCI + * devices that were added after system boot (e.g. hotplug, dlpar). + */ +static void eeh_add_device_late(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + + if (!dev || !eeh_subsystem_enabled) + return; + + pr_debug("EEH: Adding device %s\n", pci_name(dev)); + + dn = pci_device_to_OF_node(dev); + pdn = PCI_DN(dn); + if (pdn->pcidev == dev) { + pr_debug("EEH: Already referenced !\n"); + return; + } + WARN_ON(pdn->pcidev); + + pci_dev_get (dev); + pdn->pcidev = dev; + + pci_addr_cache_insert_device(dev); + eeh_sysfs_add_device(dev); +} + +void eeh_add_device_tree_late(struct pci_bus *bus) +{ + struct pci_dev *dev; + + list_for_each_entry(dev, &bus->devices, bus_list) { + eeh_add_device_late(dev); + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + struct pci_bus *subbus = dev->subordinate; + if (subbus) + eeh_add_device_tree_late(subbus); + } + } +} +EXPORT_SYMBOL_GPL(eeh_add_device_tree_late); + +/** + * eeh_remove_device - undo EEH setup for the indicated pci device + * @dev: pci device to be removed + * + * This routine should be called when a device is removed from + * a running system (e.g. by hotplug or dlpar). It unregisters + * the PCI device from the EEH subsystem. I/O errors affecting + * this device will no longer be detected after this call; thus, + * i/o errors affecting this slot may leave this device unusable. + */ +static void eeh_remove_device(struct pci_dev *dev) +{ + struct device_node *dn; + if (!dev || !eeh_subsystem_enabled) + return; + + /* Unregister the device with the EEH/PCI address search system */ + pr_debug("EEH: Removing device %s\n", pci_name(dev)); + + dn = pci_device_to_OF_node(dev); + if (PCI_DN(dn)->pcidev == NULL) { + pr_debug("EEH: Not referenced !\n"); + return; + } + PCI_DN(dn)->pcidev = NULL; + pci_dev_put (dev); + + pci_addr_cache_remove_device(dev); + eeh_sysfs_remove_device(dev); +} + +void eeh_remove_bus_device(struct pci_dev *dev) +{ + struct pci_bus *bus = dev->subordinate; + struct pci_dev *child, *tmp; + + eeh_remove_device(dev); + + if (bus && dev->hdr_type == PCI_HEADER_TYPE_BRIDGE) { + list_for_each_entry_safe(child, tmp, &bus->devices, bus_list) + eeh_remove_bus_device(child); + } +} +EXPORT_SYMBOL_GPL(eeh_remove_bus_device); + +static int proc_eeh_show(struct seq_file *m, void *v) +{ + if (0 == eeh_subsystem_enabled) { + seq_printf(m, "EEH Subsystem is globally disabled\n"); + seq_printf(m, "eeh_total_mmio_ffs=%ld\n", total_mmio_ffs); + } else { + seq_printf(m, "EEH Subsystem is enabled\n"); + seq_printf(m, + "no device=%ld\n" + "no device node=%ld\n" + "no config address=%ld\n" + "check not wanted=%ld\n" + "eeh_total_mmio_ffs=%ld\n" + "eeh_false_positives=%ld\n" + "eeh_slot_resets=%ld\n", + no_device, no_dn, no_cfg_addr, + ignored_check, total_mmio_ffs, + false_positives, + slot_resets); + } + + return 0; +} + +static int proc_eeh_open(struct inode *inode, struct file *file) +{ + return single_open(file, proc_eeh_show, NULL); +} + +static const struct file_operations proc_eeh_operations = { + .open = proc_eeh_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init eeh_init_proc(void) +{ + if (machine_is(pseries)) + proc_create("powerpc/eeh", 0, NULL, &proc_eeh_operations); + return 0; +} +__initcall(eeh_init_proc); diff --git a/arch/powerpc/platforms/pseries/eeh_cache.c b/arch/powerpc/platforms/pseries/eeh_cache.c new file mode 100644 index 00000000..8ed0d2d0 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_cache.c @@ -0,0 +1,308 @@ +/* + * eeh_cache.c + * PCI address cache; allows the lookup of PCI devices based on I/O address + * + * Copyright IBM Corporation 2004 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2004 + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/list.h> +#include <linux/pci.h> +#include <linux/rbtree.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <asm/atomic.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> + + +/** + * The pci address cache subsystem. This subsystem places + * PCI device address resources into a red-black tree, sorted + * according to the address range, so that given only an i/o + * address, the corresponding PCI device can be **quickly** + * found. It is safe to perform an address lookup in an interrupt + * context; this ability is an important feature. + * + * Currently, the only customer of this code is the EEH subsystem; + * thus, this code has been somewhat tailored to suit EEH better. + * In particular, the cache does *not* hold the addresses of devices + * for which EEH is not enabled. + * + * (Implementation Note: The RB tree seems to be better/faster + * than any hash algo I could think of for this problem, even + * with the penalty of slow pointer chases for d-cache misses). + */ +struct pci_io_addr_range +{ + struct rb_node rb_node; + unsigned long addr_lo; + unsigned long addr_hi; + struct pci_dev *pcidev; + unsigned int flags; +}; + +static struct pci_io_addr_cache +{ + struct rb_root rb_root; + spinlock_t piar_lock; +} pci_io_addr_cache_root; + +static inline struct pci_dev *__pci_get_device_by_addr(unsigned long addr) +{ + struct rb_node *n = pci_io_addr_cache_root.rb_root.rb_node; + + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (addr < piar->addr_lo) { + n = n->rb_left; + } else { + if (addr > piar->addr_hi) { + n = n->rb_right; + } else { + pci_dev_get(piar->pcidev); + return piar->pcidev; + } + } + } + + return NULL; +} + +/** + * pci_get_device_by_addr - Get device, given only address + * @addr: mmio (PIO) phys address or i/o port number + * + * Given an mmio phys address, or a port number, find a pci device + * that implements this address. Be sure to pci_dev_put the device + * when finished. I/O port numbers are assumed to be offset + * from zero (that is, they do *not* have pci_io_addr added in). + * It is safe to call this function within an interrupt. + */ +struct pci_dev *pci_get_device_by_addr(unsigned long addr) +{ + struct pci_dev *dev; + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + dev = __pci_get_device_by_addr(addr); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); + return dev; +} + +#ifdef DEBUG +/* + * Handy-dandy debug print routine, does nothing more + * than print out the contents of our addr cache. + */ +static void pci_addr_cache_print(struct pci_io_addr_cache *cache) +{ + struct rb_node *n; + int cnt = 0; + + n = rb_first(&cache->rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + printk(KERN_DEBUG "PCI: %s addr range %d [%lx-%lx]: %s\n", + (piar->flags & IORESOURCE_IO) ? "i/o" : "mem", cnt, + piar->addr_lo, piar->addr_hi, pci_name(piar->pcidev)); + cnt++; + n = rb_next(n); + } +} +#endif + +/* Insert address range into the rb tree. */ +static struct pci_io_addr_range * +pci_addr_cache_insert(struct pci_dev *dev, unsigned long alo, + unsigned long ahi, unsigned int flags) +{ + struct rb_node **p = &pci_io_addr_cache_root.rb_root.rb_node; + struct rb_node *parent = NULL; + struct pci_io_addr_range *piar; + + /* Walk tree, find a place to insert into tree */ + while (*p) { + parent = *p; + piar = rb_entry(parent, struct pci_io_addr_range, rb_node); + if (ahi < piar->addr_lo) { + p = &parent->rb_left; + } else if (alo > piar->addr_hi) { + p = &parent->rb_right; + } else { + if (dev != piar->pcidev || + alo != piar->addr_lo || ahi != piar->addr_hi) { + printk(KERN_WARNING "PIAR: overlapping address range\n"); + } + return piar; + } + } + piar = kmalloc(sizeof(struct pci_io_addr_range), GFP_ATOMIC); + if (!piar) + return NULL; + + pci_dev_get(dev); + piar->addr_lo = alo; + piar->addr_hi = ahi; + piar->pcidev = dev; + piar->flags = flags; + +#ifdef DEBUG + printk(KERN_DEBUG "PIAR: insert range=[%lx:%lx] dev=%s\n", + alo, ahi, pci_name (dev)); +#endif + + rb_link_node(&piar->rb_node, parent, p); + rb_insert_color(&piar->rb_node, &pci_io_addr_cache_root.rb_root); + + return piar; +} + +static void __pci_addr_cache_insert_device(struct pci_dev *dev) +{ + struct device_node *dn; + struct pci_dn *pdn; + int i; + + dn = pci_device_to_OF_node(dev); + if (!dn) { + printk(KERN_WARNING "PCI: no pci dn found for dev=%s\n", pci_name(dev)); + return; + } + + /* Skip any devices for which EEH is not enabled. */ + pdn = PCI_DN(dn); + if (!(pdn->eeh_mode & EEH_MODE_SUPPORTED) || + pdn->eeh_mode & EEH_MODE_NOCHECK) { +#ifdef DEBUG + printk(KERN_INFO "PCI: skip building address cache for=%s - %s\n", + pci_name(dev), pdn->node->full_name); +#endif + return; + } + + /* Walk resources on this device, poke them into the tree */ + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + unsigned long start = pci_resource_start(dev,i); + unsigned long end = pci_resource_end(dev,i); + unsigned int flags = pci_resource_flags(dev,i); + + /* We are interested only bus addresses, not dma or other stuff */ + if (0 == (flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if (start == 0 || ~start == 0 || end == 0 || ~end == 0) + continue; + pci_addr_cache_insert(dev, start, end, flags); + } +} + +/** + * pci_addr_cache_insert_device - Add a device to the address cache + * @dev: PCI device whose I/O addresses we are interested in. + * + * In order to support the fast lookup of devices based on addresses, + * we maintain a cache of devices that can be quickly searched. + * This routine adds a device to that cache. + */ +void pci_addr_cache_insert_device(struct pci_dev *dev) +{ + unsigned long flags; + + /* Ignore PCI bridges */ + if ((dev->class >> 16) == PCI_BASE_CLASS_BRIDGE) + return; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_insert_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +static inline void __pci_addr_cache_remove_device(struct pci_dev *dev) +{ + struct rb_node *n; + +restart: + n = rb_first(&pci_io_addr_cache_root.rb_root); + while (n) { + struct pci_io_addr_range *piar; + piar = rb_entry(n, struct pci_io_addr_range, rb_node); + + if (piar->pcidev == dev) { + rb_erase(n, &pci_io_addr_cache_root.rb_root); + pci_dev_put(piar->pcidev); + kfree(piar); + goto restart; + } + n = rb_next(n); + } +} + +/** + * pci_addr_cache_remove_device - remove pci device from addr cache + * @dev: device to remove + * + * Remove a device from the addr-cache tree. + * This is potentially expensive, since it will walk + * the tree multiple times (once per resource). + * But so what; device removal doesn't need to be that fast. + */ +void pci_addr_cache_remove_device(struct pci_dev *dev) +{ + unsigned long flags; + + spin_lock_irqsave(&pci_io_addr_cache_root.piar_lock, flags); + __pci_addr_cache_remove_device(dev); + spin_unlock_irqrestore(&pci_io_addr_cache_root.piar_lock, flags); +} + +/** + * pci_addr_cache_build - Build a cache of I/O addresses + * + * Build a cache of pci i/o addresses. This cache will be used to + * find the pci device that corresponds to a given address. + * This routine scans all pci busses to build the cache. + * Must be run late in boot process, after the pci controllers + * have been scanned for devices (after all device resources are known). + */ +void __init pci_addr_cache_build(void) +{ + struct device_node *dn; + struct pci_dev *dev = NULL; + + spin_lock_init(&pci_io_addr_cache_root.piar_lock); + + for_each_pci_dev(dev) { + pci_addr_cache_insert_device(dev); + + dn = pci_device_to_OF_node(dev); + if (!dn) + continue; + pci_dev_get(dev); /* matching put is in eeh_remove_device() */ + PCI_DN(dn)->pcidev = dev; + + eeh_sysfs_add_device(dev); + } + +#ifdef DEBUG + /* Verify tree built up above, echo back the list of addrs. */ + pci_addr_cache_print(&pci_io_addr_cache_root); +#endif +} + diff --git a/arch/powerpc/platforms/pseries/eeh_driver.c b/arch/powerpc/platforms/pseries/eeh_driver.c new file mode 100644 index 00000000..1b6cb105 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_driver.c @@ -0,0 +1,511 @@ +/* + * PCI Error Recovery Driver for RPA-compliant PPC64 platform. + * Copyright IBM Corp. 2004 2005 + * Copyright Linas Vepstas <linas@linas.org> 2004, 2005 + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/delay.h> +#include <linux/interrupt.h> +#include <linux/irq.h> +#include <linux/pci.h> +#include <asm/eeh.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/rtas.h> + + +static inline const char * pcid_name (struct pci_dev *pdev) +{ + if (pdev && pdev->dev.driver) + return pdev->dev.driver->name; + return ""; +} + +#if 0 +static void print_device_node_tree(struct pci_dn *pdn, int dent) +{ + int i; + struct device_node *pc; + + if (!pdn) + return; + for (i = 0; i < dent; i++) + printk(" "); + printk("dn=%s mode=%x \tcfg_addr=%x pe_addr=%x \tfull=%s\n", + pdn->node->name, pdn->eeh_mode, pdn->eeh_config_addr, + pdn->eeh_pe_config_addr, pdn->node->full_name); + dent += 3; + pc = pdn->node->child; + while (pc) { + print_device_node_tree(PCI_DN(pc), dent); + pc = pc->sibling; + } +} +#endif + +/** + * eeh_disable_irq - disable interrupt for the recovering device + */ +static void eeh_disable_irq(struct pci_dev *dev) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + + /* Don't disable MSI and MSI-X interrupts. They are + * effectively disabled by the DMA Stopped state + * when an EEH error occurs. + */ + if (dev->msi_enabled || dev->msix_enabled) + return; + + if (!irq_has_action(dev->irq)) + return; + + PCI_DN(dn)->eeh_mode |= EEH_MODE_IRQ_DISABLED; + disable_irq_nosync(dev->irq); +} + +/** + * eeh_enable_irq - enable interrupt for the recovering device + */ +static void eeh_enable_irq(struct pci_dev *dev) +{ + struct device_node *dn = pci_device_to_OF_node(dev); + + if ((PCI_DN(dn)->eeh_mode) & EEH_MODE_IRQ_DISABLED) { + PCI_DN(dn)->eeh_mode &= ~EEH_MODE_IRQ_DISABLED; + enable_irq(dev->irq); + } +} + +/* ------------------------------------------------------- */ +/** + * eeh_report_error - report pci error to each device driver + * + * Report an EEH error to each device driver, collect up and + * merge the device driver responses. Cumulative response + * passed back in "userdata". + */ + +static int eeh_report_error(struct pci_dev *dev, void *userdata) +{ + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver = dev->driver; + + dev->error_state = pci_channel_io_frozen; + + if (!driver) + return 0; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) + return 0; + + rc = driver->err_handler->error_detected (dev, pci_channel_io_frozen); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + return 0; +} + +/** + * eeh_report_mmio_enabled - tell drivers that MMIO has been enabled + * + * Tells each device driver that IO ports, MMIO and config space I/O + * are now enabled. Collects up and merges the device driver responses. + * Cumulative response passed back in "userdata". + */ + +static int eeh_report_mmio_enabled(struct pci_dev *dev, void *userdata) +{ + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver = dev->driver; + + if (!driver || + !driver->err_handler || + !driver->err_handler->mmio_enabled) + return 0; + + rc = driver->err_handler->mmio_enabled (dev); + + /* A driver that needs a reset trumps all others */ + if (rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + if (*res == PCI_ERS_RESULT_NONE) *res = rc; + + return 0; +} + +/** + * eeh_report_reset - tell device that slot has been reset + */ + +static int eeh_report_reset(struct pci_dev *dev, void *userdata) +{ + enum pci_ers_result rc, *res = userdata; + struct pci_driver *driver = dev->driver; + + if (!driver) + return 0; + + dev->error_state = pci_channel_io_normal; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->slot_reset) + return 0; + + rc = driver->err_handler->slot_reset(dev); + if ((*res == PCI_ERS_RESULT_NONE) || + (*res == PCI_ERS_RESULT_RECOVERED)) *res = rc; + if (*res == PCI_ERS_RESULT_DISCONNECT && + rc == PCI_ERS_RESULT_NEED_RESET) *res = rc; + + return 0; +} + +/** + * eeh_report_resume - tell device to resume normal operations + */ + +static int eeh_report_resume(struct pci_dev *dev, void *userdata) +{ + struct pci_driver *driver = dev->driver; + + dev->error_state = pci_channel_io_normal; + + if (!driver) + return 0; + + eeh_enable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->resume) + return 0; + + driver->err_handler->resume(dev); + + return 0; +} + +/** + * eeh_report_failure - tell device driver that device is dead. + * + * This informs the device driver that the device is permanently + * dead, and that no further recovery attempts will be made on it. + */ + +static int eeh_report_failure(struct pci_dev *dev, void *userdata) +{ + struct pci_driver *driver = dev->driver; + + dev->error_state = pci_channel_io_perm_failure; + + if (!driver) + return 0; + + eeh_disable_irq(dev); + + if (!driver->err_handler || + !driver->err_handler->error_detected) + return 0; + + driver->err_handler->error_detected(dev, pci_channel_io_perm_failure); + + return 0; +} + +/* ------------------------------------------------------- */ +/** + * handle_eeh_events -- reset a PCI device after hard lockup. + * + * pSeries systems will isolate a PCI slot if the PCI-Host + * bridge detects address or data parity errors, DMA's + * occurring to wild addresses (which usually happen due to + * bugs in device drivers or in PCI adapter firmware). + * Slot isolations also occur if #SERR, #PERR or other misc + * PCI-related errors are detected. + * + * Recovery process consists of unplugging the device driver + * (which generated hotplug events to userspace), then issuing + * a PCI #RST to the device, then reconfiguring the PCI config + * space for all bridges & devices under this slot, and then + * finally restarting the device drivers (which cause a second + * set of hotplug events to go out to userspace). + */ + +/** + * eeh_reset_device() -- perform actual reset of a pci slot + * @bus: pointer to the pci bus structure corresponding + * to the isolated slot. A non-null value will + * cause all devices under the bus to be removed + * and then re-added. + * @pe_dn: pointer to a "Partionable Endpoint" device node. + * This is the top-level structure on which pci + * bus resets can be performed. + */ + +static int eeh_reset_device (struct pci_dn *pe_dn, struct pci_bus *bus) +{ + struct device_node *dn; + int cnt, rc; + + /* pcibios will clear the counter; save the value */ + cnt = pe_dn->eeh_freeze_count; + + if (bus) + pcibios_remove_pci_devices(bus); + + /* Reset the pci controller. (Asserts RST#; resets config space). + * Reconfigure bridges and devices. Don't try to bring the system + * up if the reset failed for some reason. */ + rc = rtas_set_slot_reset(pe_dn); + if (rc) + return rc; + + /* Walk over all functions on this device. */ + dn = pe_dn->node; + if (!pcibios_find_pci_bus(dn) && PCI_DN(dn->parent)) + dn = dn->parent->child; + + while (dn) { + struct pci_dn *ppe = PCI_DN(dn); + /* On Power4, always true because eeh_pe_config_addr=0 */ + if (pe_dn->eeh_pe_config_addr == ppe->eeh_pe_config_addr) { + rtas_configure_bridge(ppe); + eeh_restore_bars(ppe); + } + dn = dn->sibling; + } + + /* Give the system 5 seconds to finish running the user-space + * hotplug shutdown scripts, e.g. ifdown for ethernet. Yes, + * this is a hack, but if we don't do this, and try to bring + * the device up before the scripts have taken it down, + * potentially weird things happen. + */ + if (bus) { + ssleep (5); + pcibios_add_pci_devices(bus); + } + pe_dn->eeh_freeze_count = cnt; + + return 0; +} + +/* The longest amount of time to wait for a pci device + * to come back on line, in seconds. + */ +#define MAX_WAIT_FOR_RECOVERY 150 + +struct pci_dn * handle_eeh_events (struct eeh_event *event) +{ + struct device_node *frozen_dn; + struct pci_dn *frozen_pdn; + struct pci_bus *frozen_bus; + int rc = 0; + enum pci_ers_result result = PCI_ERS_RESULT_NONE; + const char *location, *pci_str, *drv_str, *bus_pci_str, *bus_drv_str; + + frozen_dn = find_device_pe(event->dn); + if (!frozen_dn) { + + location = of_get_property(event->dn, "ibm,loc-code", NULL); + location = location ? location : "unknown"; + printk(KERN_ERR "EEH: Error: Cannot find partition endpoint " + "for location=%s pci addr=%s\n", + location, eeh_pci_name(event->dev)); + return NULL; + } + + frozen_bus = pcibios_find_pci_bus(frozen_dn); + location = of_get_property(frozen_dn, "ibm,loc-code", NULL); + location = location ? location : "unknown"; + + /* There are two different styles for coming up with the PE. + * In the old style, it was the highest EEH-capable device + * which was always an EADS pci bridge. In the new style, + * there might not be any EADS bridges, and even when there are, + * the firmware marks them as "EEH incapable". So another + * two-step is needed to find the pci bus.. */ + if (!frozen_bus) + frozen_bus = pcibios_find_pci_bus (frozen_dn->parent); + + if (!frozen_bus) { + printk(KERN_ERR "EEH: Cannot find PCI bus " + "for location=%s dn=%s\n", + location, frozen_dn->full_name); + return NULL; + } + + frozen_pdn = PCI_DN(frozen_dn); + frozen_pdn->eeh_freeze_count++; + + pci_str = eeh_pci_name(event->dev); + drv_str = pcid_name(event->dev); + + if (frozen_pdn->eeh_freeze_count > EEH_MAX_ALLOWED_FREEZES) + goto excess_failures; + + printk(KERN_WARNING + "EEH: This PCI device has failed %d times in the last hour:\n", + frozen_pdn->eeh_freeze_count); + + if (frozen_pdn->pcidev) { + bus_pci_str = pci_name(frozen_pdn->pcidev); + bus_drv_str = pcid_name(frozen_pdn->pcidev); + printk(KERN_WARNING + "EEH: Bus location=%s driver=%s pci addr=%s\n", + location, bus_drv_str, bus_pci_str); + } + + printk(KERN_WARNING + "EEH: Device location=%s driver=%s pci addr=%s\n", + location, drv_str, pci_str); + + /* Walk the various device drivers attached to this slot through + * a reset sequence, giving each an opportunity to do what it needs + * to accomplish the reset. Each child gets a report of the + * status ... if any child can't handle the reset, then the entire + * slot is dlpar removed and added. + */ + pci_walk_bus(frozen_bus, eeh_report_error, &result); + + /* Get the current PCI slot state. This can take a long time, + * sometimes over 3 seconds for certain systems. */ + rc = eeh_wait_for_slot_status (frozen_pdn, MAX_WAIT_FOR_RECOVERY*1000); + if (rc < 0) { + printk(KERN_WARNING "EEH: Permanent failure\n"); + goto hard_fail; + } + + /* Since rtas may enable MMIO when posting the error log, + * don't post the error log until after all dev drivers + * have been informed. + */ + eeh_slot_error_detail(frozen_pdn, EEH_LOG_TEMP_FAILURE); + + /* If all device drivers were EEH-unaware, then shut + * down all of the device drivers, and hope they + * go down willingly, without panicing the system. + */ + if (result == PCI_ERS_RESULT_NONE) { + rc = eeh_reset_device(frozen_pdn, frozen_bus); + if (rc) { + printk(KERN_WARNING "EEH: Unable to reset, rc=%d\n", rc); + goto hard_fail; + } + } + + /* If all devices reported they can proceed, then re-enable MMIO */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + rc = rtas_pci_enable(frozen_pdn, EEH_THAW_MMIO); + + if (rc < 0) + goto hard_fail; + if (rc) { + result = PCI_ERS_RESULT_NEED_RESET; + } else { + result = PCI_ERS_RESULT_NONE; + pci_walk_bus(frozen_bus, eeh_report_mmio_enabled, &result); + } + } + + /* If all devices reported they can proceed, then re-enable DMA */ + if (result == PCI_ERS_RESULT_CAN_RECOVER) { + rc = rtas_pci_enable(frozen_pdn, EEH_THAW_DMA); + + if (rc < 0) + goto hard_fail; + if (rc) + result = PCI_ERS_RESULT_NEED_RESET; + else + result = PCI_ERS_RESULT_RECOVERED; + } + + /* If any device has a hard failure, then shut off everything. */ + if (result == PCI_ERS_RESULT_DISCONNECT) { + printk(KERN_WARNING "EEH: Device driver gave up\n"); + goto hard_fail; + } + + /* If any device called out for a reset, then reset the slot */ + if (result == PCI_ERS_RESULT_NEED_RESET) { + rc = eeh_reset_device(frozen_pdn, NULL); + if (rc) { + printk(KERN_WARNING "EEH: Cannot reset, rc=%d\n", rc); + goto hard_fail; + } + result = PCI_ERS_RESULT_NONE; + pci_walk_bus(frozen_bus, eeh_report_reset, &result); + } + + /* All devices should claim they have recovered by now. */ + if ((result != PCI_ERS_RESULT_RECOVERED) && + (result != PCI_ERS_RESULT_NONE)) { + printk(KERN_WARNING "EEH: Not recovered\n"); + goto hard_fail; + } + + /* Tell all device drivers that they can resume operations */ + pci_walk_bus(frozen_bus, eeh_report_resume, NULL); + + return frozen_pdn; + +excess_failures: + /* + * About 90% of all real-life EEH failures in the field + * are due to poorly seated PCI cards. Only 10% or so are + * due to actual, failed cards. + */ + printk(KERN_ERR + "EEH: PCI device at location=%s driver=%s pci addr=%s\n" + "has failed %d times in the last hour " + "and has been permanently disabled.\n" + "Please try reseating this device or replacing it.\n", + location, drv_str, pci_str, frozen_pdn->eeh_freeze_count); + goto perm_error; + +hard_fail: + printk(KERN_ERR + "EEH: Unable to recover from failure of PCI device " + "at location=%s driver=%s pci addr=%s\n" + "Please try reseating this device or replacing it.\n", + location, drv_str, pci_str); + +perm_error: + eeh_slot_error_detail(frozen_pdn, EEH_LOG_PERM_FAILURE); + + /* Notify all devices that they're about to go down. */ + pci_walk_bus(frozen_bus, eeh_report_failure, NULL); + + /* Shut down the device drivers for good. */ + pcibios_remove_pci_devices(frozen_bus); + + return NULL; +} + +/* ---------- end of file ---------- */ diff --git a/arch/powerpc/platforms/pseries/eeh_event.c b/arch/powerpc/platforms/pseries/eeh_event.c new file mode 100644 index 00000000..2ec500c1 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_event.c @@ -0,0 +1,157 @@ +/* + * eeh_event.c + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * Copyright (c) 2005 Linas Vepstas <linas@linas.org> + */ + +#include <linux/delay.h> +#include <linux/list.h> +#include <linux/mutex.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/workqueue.h> +#include <asm/eeh_event.h> +#include <asm/ppc-pci.h> + +/** Overview: + * EEH error states may be detected within exception handlers; + * however, the recovery processing needs to occur asynchronously + * in a normal kernel context and not an interrupt context. + * This pair of routines creates an event and queues it onto a + * work-queue, where a worker thread can drive recovery. + */ + +/* EEH event workqueue setup. */ +static DEFINE_SPINLOCK(eeh_eventlist_lock); +LIST_HEAD(eeh_eventlist); +static void eeh_thread_launcher(struct work_struct *); +DECLARE_WORK(eeh_event_wq, eeh_thread_launcher); + +/* Serialize reset sequences for a given pci device */ +DEFINE_MUTEX(eeh_event_mutex); + +/** + * eeh_event_handler - dispatch EEH events. + * @dummy - unused + * + * The detection of a frozen slot can occur inside an interrupt, + * where it can be hard to do anything about it. The goal of this + * routine is to pull these detection events out of the context + * of the interrupt handler, and re-dispatch them for processing + * at a later time in a normal context. + */ +static int eeh_event_handler(void * dummy) +{ + unsigned long flags; + struct eeh_event *event; + struct pci_dn *pdn; + + daemonize ("eehd"); + set_current_state(TASK_INTERRUPTIBLE); + + spin_lock_irqsave(&eeh_eventlist_lock, flags); + event = NULL; + + /* Unqueue the event, get ready to process. */ + if (!list_empty(&eeh_eventlist)) { + event = list_entry(eeh_eventlist.next, struct eeh_event, list); + list_del(&event->list); + } + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + if (event == NULL) + return 0; + + /* Serialize processing of EEH events */ + mutex_lock(&eeh_event_mutex); + eeh_mark_slot(event->dn, EEH_MODE_RECOVERING); + + printk(KERN_INFO "EEH: Detected PCI bus error on device %s\n", + eeh_pci_name(event->dev)); + + pdn = handle_eeh_events(event); + + eeh_clear_slot(event->dn, EEH_MODE_RECOVERING); + pci_dev_put(event->dev); + kfree(event); + mutex_unlock(&eeh_event_mutex); + + /* If there are no new errors after an hour, clear the counter. */ + if (pdn && pdn->eeh_freeze_count>0) { + msleep_interruptible (3600*1000); + if (pdn->eeh_freeze_count>0) + pdn->eeh_freeze_count--; + } + + return 0; +} + +/** + * eeh_thread_launcher + * @dummy - unused + */ +static void eeh_thread_launcher(struct work_struct *dummy) +{ + if (kernel_thread(eeh_event_handler, NULL, CLONE_KERNEL) < 0) + printk(KERN_ERR "Failed to start EEH daemon\n"); +} + +/** + * eeh_send_failure_event - generate a PCI error event + * @dev pci device + * + * This routine can be called within an interrupt context; + * the actual event will be delivered in a normal context + * (from a workqueue). + */ +int eeh_send_failure_event (struct device_node *dn, + struct pci_dev *dev) +{ + unsigned long flags; + struct eeh_event *event; + const char *location; + + if (!mem_init_done) { + printk(KERN_ERR "EEH: event during early boot not handled\n"); + location = of_get_property(dn, "ibm,loc-code", NULL); + printk(KERN_ERR "EEH: device node = %s\n", dn->full_name); + printk(KERN_ERR "EEH: PCI location = %s\n", location); + return 1; + } + event = kmalloc(sizeof(*event), GFP_ATOMIC); + if (event == NULL) { + printk (KERN_ERR "EEH: out of memory, event not handled\n"); + return 1; + } + + if (dev) + pci_dev_get(dev); + + event->dn = dn; + event->dev = dev; + + /* We may or may not be called in an interrupt context */ + spin_lock_irqsave(&eeh_eventlist_lock, flags); + list_add(&event->list, &eeh_eventlist); + spin_unlock_irqrestore(&eeh_eventlist_lock, flags); + + schedule_work(&eeh_event_wq); + + return 0; +} + +/********************** END OF FILE ******************************/ diff --git a/arch/powerpc/platforms/pseries/eeh_sysfs.c b/arch/powerpc/platforms/pseries/eeh_sysfs.c new file mode 100644 index 00000000..23982c78 --- /dev/null +++ b/arch/powerpc/platforms/pseries/eeh_sysfs.c @@ -0,0 +1,86 @@ +/* + * Sysfs entries for PCI Error Recovery for PAPR-compliant platform. + * Copyright IBM Corporation 2007 + * Copyright Linas Vepstas <linas@austin.ibm.com> 2007 + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + * + * Send comments and feedback to Linas Vepstas <linas@austin.ibm.com> + */ +#include <linux/pci.h> +#include <asm/ppc-pci.h> +#include <asm/pci-bridge.h> + +/** + * EEH_SHOW_ATTR -- create sysfs entry for eeh statistic + * @_name: name of file in sysfs directory + * @_memb: name of member in struct pci_dn to access + * @_format: printf format for display + * + * All of the attributes look very similar, so just + * auto-gen a cut-n-paste routine to display them. + */ +#define EEH_SHOW_ATTR(_name,_memb,_format) \ +static ssize_t eeh_show_##_name(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct pci_dev *pdev = to_pci_dev(dev); \ + struct device_node *dn = pci_device_to_OF_node(pdev); \ + struct pci_dn *pdn; \ + \ + if (!dn || PCI_DN(dn) == NULL) \ + return 0; \ + \ + pdn = PCI_DN(dn); \ + return sprintf(buf, _format "\n", pdn->_memb); \ +} \ +static DEVICE_ATTR(_name, S_IRUGO, eeh_show_##_name, NULL); + + +EEH_SHOW_ATTR(eeh_mode, eeh_mode, "0x%x"); +EEH_SHOW_ATTR(eeh_config_addr, eeh_config_addr, "0x%x"); +EEH_SHOW_ATTR(eeh_pe_config_addr, eeh_pe_config_addr, "0x%x"); +EEH_SHOW_ATTR(eeh_check_count, eeh_check_count, "%d"); +EEH_SHOW_ATTR(eeh_freeze_count, eeh_freeze_count, "%d"); +EEH_SHOW_ATTR(eeh_false_positives, eeh_false_positives, "%d"); + +void eeh_sysfs_add_device(struct pci_dev *pdev) +{ + int rc=0; + + rc += device_create_file(&pdev->dev, &dev_attr_eeh_mode); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_config_addr); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_check_count); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_false_positives); + rc += device_create_file(&pdev->dev, &dev_attr_eeh_freeze_count); + + if (rc) + printk(KERN_WARNING "EEH: Unable to create sysfs entries\n"); +} + +void eeh_sysfs_remove_device(struct pci_dev *pdev) +{ + device_remove_file(&pdev->dev, &dev_attr_eeh_mode); + device_remove_file(&pdev->dev, &dev_attr_eeh_config_addr); + device_remove_file(&pdev->dev, &dev_attr_eeh_pe_config_addr); + device_remove_file(&pdev->dev, &dev_attr_eeh_check_count); + device_remove_file(&pdev->dev, &dev_attr_eeh_false_positives); + device_remove_file(&pdev->dev, &dev_attr_eeh_freeze_count); +} + diff --git a/arch/powerpc/platforms/pseries/event_sources.c b/arch/powerpc/platforms/pseries/event_sources.c new file mode 100644 index 00000000..2605c310 --- /dev/null +++ b/arch/powerpc/platforms/pseries/event_sources.c @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2001 Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <asm/prom.h> + +#include "pseries.h" + +void request_event_sources_irqs(struct device_node *np, + irq_handler_t handler, + const char *name) +{ + int i, index, count = 0; + struct of_irq oirq; + const u32 *opicprop; + unsigned int opicplen; + unsigned int virqs[16]; + + /* Check for obsolete "open-pic-interrupt" property. If present, then + * map those interrupts using the default interrupt host and default + * trigger + */ + opicprop = of_get_property(np, "open-pic-interrupt", &opicplen); + if (opicprop) { + opicplen /= sizeof(u32); + for (i = 0; i < opicplen; i++) { + if (count > 15) + break; + virqs[count] = irq_create_mapping(NULL, *(opicprop++)); + if (virqs[count] == NO_IRQ) { + pr_err("event-sources: Unable to allocate " + "interrupt number for %s\n", + np->full_name); + WARN_ON(1); + } + else + count++; + + } + } + /* Else use normal interrupt tree parsing */ + else { + /* First try to do a proper OF tree parsing */ + for (index = 0; of_irq_map_one(np, index, &oirq) == 0; + index++) { + if (count > 15) + break; + virqs[count] = irq_create_of_mapping(oirq.controller, + oirq.specifier, + oirq.size); + if (virqs[count] == NO_IRQ) { + pr_err("event-sources: Unable to allocate " + "interrupt number for %s\n", + np->full_name); + WARN_ON(1); + } + else + count++; + } + } + + /* Now request them */ + for (i = 0; i < count; i++) { + if (request_irq(virqs[i], handler, 0, name, NULL)) { + pr_err("event-sources: Unable to request interrupt " + "%d for %s\n", virqs[i], np->full_name); + WARN_ON(1); + return; + } + } +} + diff --git a/arch/powerpc/platforms/pseries/firmware.c b/arch/powerpc/platforms/pseries/firmware.c new file mode 100644 index 00000000..0b0eff0c --- /dev/null +++ b/arch/powerpc/platforms/pseries/firmware.c @@ -0,0 +1,87 @@ +/* + * pSeries firmware setup code. + * + * Portions from arch/powerpc/platforms/pseries/setup.c: + * Copyright (C) 1995 Linus Torvalds + * Adapted from 'alpha' version by Gary Thomas + * Modified by Cort Dougan (cort@cs.nmt.edu) + * Modified by PPC64 Team, IBM Corp + * + * Portions from arch/powerpc/kernel/firmware.c + * Copyright (C) 2001 Ben. Herrenschmidt (benh@kernel.crashing.org) + * Modifications for ppc64: + * Copyright (C) 2003 Dave Engebretsen <engebret@us.ibm.com> + * Copyright (C) 2005 Stephen Rothwell, IBM Corporation + * + * Copyright 2006 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <asm/firmware.h> +#include <asm/prom.h> +#include <asm/udbg.h> + +#include "pseries.h" + +typedef struct { + unsigned long val; + char * name; +} firmware_feature_t; + +static __initdata firmware_feature_t +firmware_features_table[FIRMWARE_MAX_FEATURES] = { + {FW_FEATURE_PFT, "hcall-pft"}, + {FW_FEATURE_TCE, "hcall-tce"}, + {FW_FEATURE_SPRG0, "hcall-sprg0"}, + {FW_FEATURE_DABR, "hcall-dabr"}, + {FW_FEATURE_COPY, "hcall-copy"}, + {FW_FEATURE_ASR, "hcall-asr"}, + {FW_FEATURE_DEBUG, "hcall-debug"}, + {FW_FEATURE_PERF, "hcall-perf"}, + {FW_FEATURE_DUMP, "hcall-dump"}, + {FW_FEATURE_INTERRUPT, "hcall-interrupt"}, + {FW_FEATURE_MIGRATE, "hcall-migrate"}, + {FW_FEATURE_PERFMON, "hcall-perfmon"}, + {FW_FEATURE_CRQ, "hcall-crq"}, + {FW_FEATURE_VIO, "hcall-vio"}, + {FW_FEATURE_RDMA, "hcall-rdma"}, + {FW_FEATURE_LLAN, "hcall-lLAN"}, + {FW_FEATURE_BULK_REMOVE, "hcall-bulk"}, + {FW_FEATURE_XDABR, "hcall-xdabr"}, + {FW_FEATURE_MULTITCE, "hcall-multi-tce"}, + {FW_FEATURE_SPLPAR, "hcall-splpar"}, + {FW_FEATURE_VPHN, "hcall-vphn"}, +}; + +/* Build up the firmware features bitmask using the contents of + * device-tree/ibm,hypertas-functions. Ultimately this functionality may + * be moved into prom.c prom_init(). + */ +void __init fw_feature_init(const char *hypertas, unsigned long len) +{ + const char *s; + int i; + + pr_debug(" -> fw_feature_init()\n"); + + for (s = hypertas; s < hypertas + len; s += strlen(s) + 1) { + for (i = 0; i < FIRMWARE_MAX_FEATURES; i++) { + /* check value against table of strings */ + if (!firmware_features_table[i].name || + strcmp(firmware_features_table[i].name, s)) + continue; + + /* we have a match */ + powerpc_firmware_features |= + firmware_features_table[i].val; + break; + } + } + + pr_debug(" <- fw_feature_init()\n"); +} diff --git a/arch/powerpc/platforms/pseries/hotplug-cpu.c b/arch/powerpc/platforms/pseries/hotplug-cpu.c new file mode 100644 index 00000000..46f13a3c --- /dev/null +++ b/arch/powerpc/platforms/pseries/hotplug-cpu.c @@ -0,0 +1,418 @@ +/* + * pseries CPU Hotplug infrastructure. + * + * Split out from arch/powerpc/platforms/pseries/setup.c + * arch/powerpc/kernel/rtas.c, and arch/powerpc/platforms/pseries/smp.c + * + * Peter Bergner, IBM March 2001. + * Copyright (C) 2001 IBM. + * Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com + * Plus various changes from other IBM teams... + * + * Copyright (C) 2006 Michael Ellerman, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/cpu.h> +#include <asm/system.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/firmware.h> +#include <asm/machdep.h> +#include <asm/vdso_datapage.h> +#include <asm/pSeries_reconfig.h> +#include <asm/xics.h> +#include "plpar_wrappers.h" +#include "offline_states.h" + +/* This version can't take the spinlock, because it never returns */ +static struct rtas_args rtas_stop_self_args = { + .token = RTAS_UNKNOWN_SERVICE, + .nargs = 0, + .nret = 1, + .rets = &rtas_stop_self_args.args[0], +}; + +static DEFINE_PER_CPU(enum cpu_state_vals, preferred_offline_state) = + CPU_STATE_OFFLINE; +static DEFINE_PER_CPU(enum cpu_state_vals, current_state) = CPU_STATE_OFFLINE; + +static enum cpu_state_vals default_offline_state = CPU_STATE_OFFLINE; + +static int cede_offline_enabled __read_mostly = 1; + +/* + * Enable/disable cede_offline when available. + */ +static int __init setup_cede_offline(char *str) +{ + if (!strcmp(str, "off")) + cede_offline_enabled = 0; + else if (!strcmp(str, "on")) + cede_offline_enabled = 1; + else + return 0; + return 1; +} + +__setup("cede_offline=", setup_cede_offline); + +enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return per_cpu(current_state, cpu); +} + +void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(current_state, cpu) = state; +} + +enum cpu_state_vals get_preferred_offline_state(int cpu) +{ + return per_cpu(preferred_offline_state, cpu); +} + +void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ + per_cpu(preferred_offline_state, cpu) = state; +} + +void set_default_offline_state(int cpu) +{ + per_cpu(preferred_offline_state, cpu) = default_offline_state; +} + +static void rtas_stop_self(void) +{ + struct rtas_args *args = &rtas_stop_self_args; + + local_irq_disable(); + + BUG_ON(args->token == RTAS_UNKNOWN_SERVICE); + + printk("cpu %u (hwid %u) Ready to die...\n", + smp_processor_id(), hard_smp_processor_id()); + enter_rtas(__pa(args)); + + panic("Alas, I survived.\n"); +} + +static void pseries_mach_cpu_die(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned int hwcpu = hard_smp_processor_id(); + u8 cede_latency_hint = 0; + + local_irq_disable(); + idle_task_exit(); + xics_teardown_cpu(); + + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + set_cpu_current_state(cpu, CPU_STATE_INACTIVE); + if (ppc_md.suspend_disable_cpu) + ppc_md.suspend_disable_cpu(); + + cede_latency_hint = 2; + + get_lppaca()->idle = 1; + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 1; + + while (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + extended_cede_processor(cede_latency_hint); + } + + if (!get_lppaca()->shared_proc) + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; + + if (get_preferred_offline_state(cpu) == CPU_STATE_ONLINE) { + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + + /* + * Call to start_secondary_resume() will not return. + * Kernel stack will be reset and start_secondary() + * will be called to continue the online operation. + */ + start_secondary_resume(); + } + } + + /* Requested state is CPU_STATE_OFFLINE at this point */ + WARN_ON(get_preferred_offline_state(cpu) != CPU_STATE_OFFLINE); + + set_cpu_current_state(cpu, CPU_STATE_OFFLINE); + unregister_slb_shadow(hwcpu, __pa(get_slb_shadow())); + rtas_stop_self(); + + /* Should never get here... */ + BUG(); + for(;;); +} + +static int pseries_cpu_disable(void) +{ + int cpu = smp_processor_id(); + + set_cpu_online(cpu, false); + vdso_data->processorCount--; + + /*fix boot_cpuid here*/ + if (cpu == boot_cpuid) + boot_cpuid = cpumask_any(cpu_online_mask); + + /* FIXME: abstract this to not be platform specific later on */ + xics_migrate_irqs_away(); + return 0; +} + +/* + * pseries_cpu_die: Wait for the cpu to die. + * @cpu: logical processor id of the CPU whose death we're awaiting. + * + * This function is called from the context of the thread which is performing + * the cpu-offline. Here we wait for long enough to allow the cpu in question + * to self-destroy so that the cpu-offline thread can send the CPU_DEAD + * notifications. + * + * OTOH, pseries_mach_cpu_die() is called by the @cpu when it wants to + * self-destruct. + */ +static void pseries_cpu_die(unsigned int cpu) +{ + int tries; + int cpu_status = 1; + unsigned int pcpu = get_hard_smp_processor_id(cpu); + + if (get_preferred_offline_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 1; + for (tries = 0; tries < 5000; tries++) { + if (get_cpu_current_state(cpu) == CPU_STATE_INACTIVE) { + cpu_status = 0; + break; + } + msleep(1); + } + } else if (get_preferred_offline_state(cpu) == CPU_STATE_OFFLINE) { + + for (tries = 0; tries < 25; tries++) { + cpu_status = smp_query_cpu_stopped(pcpu); + if (cpu_status == QCSS_STOPPED || + cpu_status == QCSS_HARDWARE_ERROR) + break; + cpu_relax(); + } + } + + if (cpu_status != 0) { + printk("Querying DEAD? cpu %i (%i) shows %i\n", + cpu, pcpu, cpu_status); + } + + /* Isolation and deallocation are definitely done by + * drslot_chrp_cpu. If they were not they would be + * done here. Change isolate state to Isolate and + * change allocation-state to Unusable. + */ + paca[cpu].cpu_start = 0; +} + +/* + * Update cpu_present_mask and paca(s) for a new cpu node. The wrinkle + * here is that a cpu device node may represent up to two logical cpus + * in the SMT case. We must honor the assumption in other code that + * the logical ids for sibling SMT threads x and y are adjacent, such + * that x^1 == y and y^1 == x. + */ +static int pseries_add_processor(struct device_node *np) +{ + unsigned int cpu; + cpumask_var_t candidate_mask, tmp; + int err = -ENOSPC, len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return 0; + + zalloc_cpumask_var(&candidate_mask, GFP_KERNEL); + zalloc_cpumask_var(&tmp, GFP_KERNEL); + + nthreads = len / sizeof(u32); + for (i = 0; i < nthreads; i++) + cpumask_set_cpu(i, tmp); + + cpu_maps_update_begin(); + + BUG_ON(!cpumask_subset(cpu_present_mask, cpu_possible_mask)); + + /* Get a bitmap of unoccupied slots. */ + cpumask_xor(candidate_mask, cpu_possible_mask, cpu_present_mask); + if (cpumask_empty(candidate_mask)) { + /* If we get here, it most likely means that NR_CPUS is + * less than the partition's max processors setting. + */ + printk(KERN_ERR "Cannot add cpu %s; this system configuration" + " supports %d logical cpus.\n", np->full_name, + cpumask_weight(cpu_possible_mask)); + goto out_unlock; + } + + while (!cpumask_empty(tmp)) + if (cpumask_subset(tmp, candidate_mask)) + /* Found a range where we can insert the new cpu(s) */ + break; + else + cpumask_shift_left(tmp, tmp, nthreads); + + if (cpumask_empty(tmp)) { + printk(KERN_ERR "Unable to find space in cpu_present_mask for" + " processor %s with %d thread(s)\n", np->name, + nthreads); + goto out_unlock; + } + + for_each_cpu(cpu, tmp) { + BUG_ON(cpu_present(cpu)); + set_cpu_present(cpu, true); + set_hard_smp_processor_id(cpu, *intserv++); + } + err = 0; +out_unlock: + cpu_maps_update_done(); + free_cpumask_var(candidate_mask); + free_cpumask_var(tmp); + return err; +} + +/* + * Update the present map for a cpu node which is going away, and set + * the hard id in the paca(s) to -1 to be consistent with boot time + * convention for non-present cpus. + */ +static void pseries_remove_processor(struct device_node *np) +{ + unsigned int cpu; + int len, nthreads, i; + const u32 *intserv; + + intserv = of_get_property(np, "ibm,ppc-interrupt-server#s", &len); + if (!intserv) + return; + + nthreads = len / sizeof(u32); + + cpu_maps_update_begin(); + for (i = 0; i < nthreads; i++) { + for_each_present_cpu(cpu) { + if (get_hard_smp_processor_id(cpu) != intserv[i]) + continue; + BUG_ON(cpu_online(cpu)); + set_cpu_present(cpu, false); + set_hard_smp_processor_id(cpu, -1); + break; + } + if (cpu >= nr_cpu_ids) + printk(KERN_WARNING "Could not find cpu to remove " + "with physical id 0x%x\n", intserv[i]); + } + cpu_maps_update_done(); +} + +static int pseries_smp_notifier(struct notifier_block *nb, + unsigned long action, void *node) +{ + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + if (pseries_add_processor(node)) + err = NOTIFY_BAD; + break; + case PSERIES_RECONFIG_REMOVE: + pseries_remove_processor(node); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pseries_smp_nb = { + .notifier_call = pseries_smp_notifier, +}; + +#define MAX_CEDE_LATENCY_LEVELS 4 +#define CEDE_LATENCY_PARAM_LENGTH 10 +#define CEDE_LATENCY_PARAM_MAX_LENGTH \ + (MAX_CEDE_LATENCY_LEVELS * CEDE_LATENCY_PARAM_LENGTH * sizeof(char)) +#define CEDE_LATENCY_TOKEN 45 + +static char cede_parameters[CEDE_LATENCY_PARAM_MAX_LENGTH]; + +static int parse_cede_parameters(void) +{ + memset(cede_parameters, 0, CEDE_LATENCY_PARAM_MAX_LENGTH); + return rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CEDE_LATENCY_TOKEN, + __pa(cede_parameters), + CEDE_LATENCY_PARAM_MAX_LENGTH); +} + +static int __init pseries_cpu_hotplug_init(void) +{ + struct device_node *np; + const char *typep; + int cpu; + int qcss_tok; + + for_each_node_by_name(np, "interrupt-controller") { + typep = of_get_property(np, "compatible", NULL); + if (strstr(typep, "open-pic")) { + of_node_put(np); + + printk(KERN_INFO "CPU Hotplug not supported on " + "systems using MPIC\n"); + return 0; + } + } + + rtas_stop_self_args.token = rtas_token("stop-self"); + qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (rtas_stop_self_args.token == RTAS_UNKNOWN_SERVICE || + qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk(KERN_INFO "CPU Hotplug not supported by firmware " + "- disabling.\n"); + return 0; + } + + ppc_md.cpu_die = pseries_mach_cpu_die; + smp_ops->cpu_disable = pseries_cpu_disable; + smp_ops->cpu_die = pseries_cpu_die; + + /* Processors can be added/removed only on LPAR */ + if (firmware_has_feature(FW_FEATURE_LPAR)) { + pSeries_reconfig_notifier_register(&pseries_smp_nb); + cpu_maps_update_begin(); + if (cede_offline_enabled && parse_cede_parameters() == 0) { + default_offline_state = CPU_STATE_INACTIVE; + for_each_online_cpu(cpu) + set_default_offline_state(cpu); + } + cpu_maps_update_done(); + } + + return 0; +} +arch_initcall(pseries_cpu_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c new file mode 100644 index 00000000..9d6a8eff --- /dev/null +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -0,0 +1,242 @@ +/* + * pseries Memory Hotplug infrastructure. + * + * Copyright (C) 2008 Badari Pulavarty, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/of.h> +#include <linux/memblock.h> +#include <linux/vmalloc.h> +#include <linux/memory.h> + +#include <asm/firmware.h> +#include <asm/machdep.h> +#include <asm/pSeries_reconfig.h> +#include <asm/sparsemem.h> + +static unsigned long get_memblock_size(void) +{ + struct device_node *np; + unsigned int memblock_size = MIN_MEMORY_BLOCK_SIZE; + struct resource r; + + np = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (np) { + const __be64 *size; + + size = of_get_property(np, "ibm,lmb-size", NULL); + if (size) + memblock_size = be64_to_cpup(size); + of_node_put(np); + } else if (machine_is(pseries)) { + /* This fallback really only applies to pseries */ + unsigned int memzero_size = 0; + + np = of_find_node_by_path("/memory@0"); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memzero_size = resource_size(&r); + of_node_put(np); + } + + if (memzero_size) { + /* We now know the size of memory@0, use this to find + * the first memoryblock and get its size. + */ + char buf[64]; + + sprintf(buf, "/memory@%x", memzero_size); + np = of_find_node_by_path(buf); + if (np) { + if (!of_address_to_resource(np, 0, &r)) + memblock_size = resource_size(&r); + of_node_put(np); + } + } + } + return memblock_size; +} + +/* WARNING: This is going to override the generic definition whenever + * pseries is built-in regardless of what platform is active at boot + * time. This is fine for now as this is the only "option" and it + * should work everywhere. If not, we'll have to turn this into a + * ppc_md. callback + */ +unsigned long memory_block_size_bytes(void) +{ + return get_memblock_size(); +} + +static int pseries_remove_memblock(unsigned long base, unsigned int memblock_size) +{ + unsigned long start, start_pfn; + struct zone *zone; + int ret; + + start_pfn = base >> PAGE_SHIFT; + + if (!pfn_valid(start_pfn)) { + memblock_remove(base, memblock_size); + return 0; + } + + zone = page_zone(pfn_to_page(start_pfn)); + + /* + * Remove section mappings and sysfs entries for the + * section of the memory we are removing. + * + * NOTE: Ideally, this should be done in generic code like + * remove_memory(). But remove_memory() gets called by writing + * to sysfs "state" file and we can't remove sysfs entries + * while writing to it. So we have to defer it to here. + */ + ret = __remove_pages(zone, start_pfn, memblock_size >> PAGE_SHIFT); + if (ret) + return ret; + + /* + * Update memory regions for memory remove + */ + memblock_remove(base, memblock_size); + + /* + * Remove htab bolted mappings for this section of memory + */ + start = (unsigned long)__va(base); + ret = remove_section_mapping(start, start + memblock_size); + + /* Ensure all vmalloc mappings are flushed in case they also + * hit that section of memory + */ + vm_unmap_aliases(); + + return ret; +} + +static int pseries_remove_memory(struct device_node *np) +{ + const char *type; + const unsigned int *regs; + unsigned long base; + unsigned int lmb_size; + int ret = -EINVAL; + + /* + * Check to see if we are actually removing memory + */ + type = of_get_property(np, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * Find the bae address and size of the memblock + */ + regs = of_get_property(np, "reg", NULL); + if (!regs) + return ret; + + base = *(unsigned long *)regs; + lmb_size = regs[3]; + + ret = pseries_remove_memblock(base, lmb_size); + return ret; +} + +static int pseries_add_memory(struct device_node *np) +{ + const char *type; + const unsigned int *regs; + unsigned long base; + unsigned int lmb_size; + int ret = -EINVAL; + + /* + * Check to see if we are actually adding memory + */ + type = of_get_property(np, "device_type", NULL); + if (type == NULL || strcmp(type, "memory") != 0) + return 0; + + /* + * Find the base and size of the memblock + */ + regs = of_get_property(np, "reg", NULL); + if (!regs) + return ret; + + base = *(unsigned long *)regs; + lmb_size = regs[3]; + + /* + * Update memory region to represent the memory add + */ + ret = memblock_add(base, lmb_size); + return (ret < 0) ? -EINVAL : 0; +} + +static int pseries_drconf_memory(unsigned long *base, unsigned int action) +{ + unsigned long memblock_size; + int rc; + + memblock_size = get_memblock_size(); + if (!memblock_size) + return -EINVAL; + + if (action == PSERIES_DRCONF_MEM_ADD) { + rc = memblock_add(*base, memblock_size); + rc = (rc < 0) ? -EINVAL : 0; + } else if (action == PSERIES_DRCONF_MEM_REMOVE) { + rc = pseries_remove_memblock(*base, memblock_size); + } else { + rc = -EINVAL; + } + + return rc; +} + +static int pseries_memory_notifier(struct notifier_block *nb, + unsigned long action, void *node) +{ + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + if (pseries_add_memory(node)) + err = NOTIFY_BAD; + break; + case PSERIES_RECONFIG_REMOVE: + if (pseries_remove_memory(node)) + err = NOTIFY_BAD; + break; + case PSERIES_DRCONF_MEM_ADD: + case PSERIES_DRCONF_MEM_REMOVE: + if (pseries_drconf_memory(node, action)) + err = NOTIFY_BAD; + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pseries_mem_nb = { + .notifier_call = pseries_memory_notifier, +}; + +static int __init pseries_memory_hotplug_init(void) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + pSeries_reconfig_notifier_register(&pseries_mem_nb); + + return 0; +} +machine_device_initcall(pseries, pseries_memory_hotplug_init); diff --git a/arch/powerpc/platforms/pseries/hvCall.S b/arch/powerpc/platforms/pseries/hvCall.S new file mode 100644 index 00000000..fd05fdee --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvCall.S @@ -0,0 +1,269 @@ +/* + * This file contains the generic code to perform a call to the + * pSeries LPAR hypervisor. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include <asm/hvcall.h> +#include <asm/processor.h> +#include <asm/ppc_asm.h> +#include <asm/asm-offsets.h> +#include <asm/ptrace.h> + +#define STK_PARM(i) (48 + ((i)-3)*8) + +#ifdef CONFIG_TRACEPOINTS + + .section ".toc","aw" + + .globl hcall_tracepoint_refcount +hcall_tracepoint_refcount: + .llong 0 + + .section ".text" + +/* + * precall must preserve all registers. use unused STK_PARM() + * areas to save snapshots and opcode. We branch around this + * in early init (eg when populating the MMU hashtable) by using an + * unconditional cpu feature. + */ +#define HCALL_INST_PRECALL(FIRST_REG) \ +BEGIN_FTR_SECTION; \ + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,hcall_tracepoint_refcount@toc(r2); \ + cmpdi r12,0; \ + beq+ 1f; \ + mflr r0; \ + std r3,STK_PARM(r3)(r1); \ + std r4,STK_PARM(r4)(r1); \ + std r5,STK_PARM(r5)(r1); \ + std r6,STK_PARM(r6)(r1); \ + std r7,STK_PARM(r7)(r1); \ + std r8,STK_PARM(r8)(r1); \ + std r9,STK_PARM(r9)(r1); \ + std r10,STK_PARM(r10)(r1); \ + std r0,16(r1); \ + addi r4,r1,STK_PARM(FIRST_REG); \ + stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + bl .__trace_hcall_entry; \ + addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,16(r1); \ + ld r3,STK_PARM(r3)(r1); \ + ld r4,STK_PARM(r4)(r1); \ + ld r5,STK_PARM(r5)(r1); \ + ld r6,STK_PARM(r6)(r1); \ + ld r7,STK_PARM(r7)(r1); \ + ld r8,STK_PARM(r8)(r1); \ + ld r9,STK_PARM(r9)(r1); \ + ld r10,STK_PARM(r10)(r1); \ + mtlr r0; \ +1: + +/* + * postcall is performed immediately before function return which + * allows liberal use of volatile registers. We branch around this + * in early init (eg when populating the MMU hashtable) by using an + * unconditional cpu feature. + */ +#define __HCALL_INST_POSTCALL \ +BEGIN_FTR_SECTION; \ + b 1f; \ +END_FTR_SECTION(0, 1); \ + ld r12,hcall_tracepoint_refcount@toc(r2); \ + cmpdi r12,0; \ + beq+ 1f; \ + mflr r0; \ + ld r6,STK_PARM(r3)(r1); \ + std r3,STK_PARM(r3)(r1); \ + mr r4,r3; \ + mr r3,r6; \ + std r0,16(r1); \ + stdu r1,-STACK_FRAME_OVERHEAD(r1); \ + bl .__trace_hcall_exit; \ + addi r1,r1,STACK_FRAME_OVERHEAD; \ + ld r0,16(r1); \ + ld r3,STK_PARM(r3)(r1); \ + mtlr r0; \ +1: + +#define HCALL_INST_POSTCALL_NORETS \ + li r5,0; \ + __HCALL_INST_POSTCALL + +#define HCALL_INST_POSTCALL(BUFREG) \ + mr r5,BUFREG; \ + __HCALL_INST_POSTCALL + +#else +#define HCALL_INST_PRECALL(FIRST_ARG) +#define HCALL_INST_POSTCALL_NORETS +#define HCALL_INST_POSTCALL(BUFREG) +#endif + + .text + +_GLOBAL(plpar_hcall_norets) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + HCALL_INST_PRECALL(r4) + + HVSC /* invoke the hypervisor */ + + HCALL_INST_POSTCALL_NORETS + + lwz r0,8(r1) + mtcrf 0xff,r0 + blr /* return r3 = status */ + +_GLOBAL(plpar_hcall) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + HCALL_INST_PRECALL(r5) + + std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + + HVSC /* invoke the hypervisor */ + + ld r12,STK_PARM(r4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + + HCALL_INST_POSTCALL(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +/* + * plpar_hcall_raw can be called in real mode. kexec/kdump need some + * hypervisor calls to be executed in real mode. So plpar_hcall_raw + * does not access the per cpu hypervisor call statistics variables, + * since these variables may not be present in the RMO region. + */ +_GLOBAL(plpar_hcall_raw) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + + HVSC /* invoke the hypervisor */ + + ld r12,STK_PARM(r4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +_GLOBAL(plpar_hcall9) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + HCALL_INST_PRECALL(r5) + + std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */ + + HVSC /* invoke the hypervisor */ + + mr r0,r12 + ld r12,STK_PARM(r4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + std r8, 32(r12) + std r9, 40(r12) + std r10,48(r12) + std r11,56(r12) + std r0, 64(r12) + + HCALL_INST_POSTCALL(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ + +/* See plpar_hcall_raw to see why this is needed */ +_GLOBAL(plpar_hcall9_raw) + HMT_MEDIUM + + mfcr r0 + stw r0,8(r1) + + std r4,STK_PARM(r4)(r1) /* Save ret buffer */ + + mr r4,r5 + mr r5,r6 + mr r6,r7 + mr r7,r8 + mr r8,r9 + mr r9,r10 + ld r10,STK_PARM(r11)(r1) /* put arg7 in R10 */ + ld r11,STK_PARM(r12)(r1) /* put arg8 in R11 */ + ld r12,STK_PARM(r13)(r1) /* put arg9 in R12 */ + + HVSC /* invoke the hypervisor */ + + mr r0,r12 + ld r12,STK_PARM(r4)(r1) + std r4, 0(r12) + std r5, 8(r12) + std r6, 16(r12) + std r7, 24(r12) + std r8, 32(r12) + std r9, 40(r12) + std r10,48(r12) + std r11,56(r12) + std r0, 64(r12) + + lwz r0,8(r1) + mtcrf 0xff,r0 + + blr /* return r3 = status */ diff --git a/arch/powerpc/platforms/pseries/hvCall_inst.c b/arch/powerpc/platforms/pseries/hvCall_inst.c new file mode 100644 index 00000000..c9311cfd --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvCall_inst.c @@ -0,0 +1,165 @@ +/* + * Copyright (C) 2006 Mike Kravetz IBM Corporation + * + * Hypervisor Call Instrumentation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/percpu.h> +#include <linux/debugfs.h> +#include <linux/seq_file.h> +#include <linux/cpumask.h> +#include <asm/hvcall.h> +#include <asm/firmware.h> +#include <asm/cputable.h> +#include <asm/trace.h> + +DEFINE_PER_CPU(struct hcall_stats[HCALL_STAT_ARRAY_SIZE], hcall_stats); + +/* + * Routines for displaying the statistics in debugfs + */ +static void *hc_start(struct seq_file *m, loff_t *pos) +{ + if ((int)*pos < (HCALL_STAT_ARRAY_SIZE-1)) + return (void *)(unsigned long)(*pos + 1); + + return NULL; +} + +static void *hc_next(struct seq_file *m, void *p, loff_t * pos) +{ + ++*pos; + + return hc_start(m, pos); +} + +static void hc_stop(struct seq_file *m, void *p) +{ +} + +static int hc_show(struct seq_file *m, void *p) +{ + unsigned long h_num = (unsigned long)p; + struct hcall_stats *hs = m->private; + + if (hs[h_num].num_calls) { + if (cpu_has_feature(CPU_FTR_PURR)) + seq_printf(m, "%lu %lu %lu %lu\n", h_num<<2, + hs[h_num].num_calls, + hs[h_num].tb_total, + hs[h_num].purr_total); + else + seq_printf(m, "%lu %lu %lu\n", h_num<<2, + hs[h_num].num_calls, + hs[h_num].tb_total); + } + + return 0; +} + +static const struct seq_operations hcall_inst_seq_ops = { + .start = hc_start, + .next = hc_next, + .stop = hc_stop, + .show = hc_show +}; + +static int hcall_inst_seq_open(struct inode *inode, struct file *file) +{ + int rc; + struct seq_file *seq; + + rc = seq_open(file, &hcall_inst_seq_ops); + seq = file->private_data; + seq->private = file->f_path.dentry->d_inode->i_private; + + return rc; +} + +static const struct file_operations hcall_inst_seq_fops = { + .open = hcall_inst_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + +#define HCALL_ROOT_DIR "hcall_inst" +#define CPU_NAME_BUF_SIZE 32 + + +static void probe_hcall_entry(void *ignored, unsigned long opcode, unsigned long *args) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = &__get_cpu_var(hcall_stats)[opcode / 4]; + h->tb_start = mftb(); + h->purr_start = mfspr(SPRN_PURR); +} + +static void probe_hcall_exit(void *ignored, unsigned long opcode, unsigned long retval, + unsigned long *retbuf) +{ + struct hcall_stats *h; + + if (opcode > MAX_HCALL_OPCODE) + return; + + h = &__get_cpu_var(hcall_stats)[opcode / 4]; + h->num_calls++; + h->tb_total += mftb() - h->tb_start; + h->purr_total += mfspr(SPRN_PURR) - h->purr_start; +} + +static int __init hcall_inst_init(void) +{ + struct dentry *hcall_root; + struct dentry *hcall_file; + char cpu_name_buf[CPU_NAME_BUF_SIZE]; + int cpu; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + if (register_trace_hcall_entry(probe_hcall_entry, NULL)) + return -EINVAL; + + if (register_trace_hcall_exit(probe_hcall_exit, NULL)) { + unregister_trace_hcall_entry(probe_hcall_entry, NULL); + return -EINVAL; + } + + hcall_root = debugfs_create_dir(HCALL_ROOT_DIR, NULL); + if (!hcall_root) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + snprintf(cpu_name_buf, CPU_NAME_BUF_SIZE, "cpu%d", cpu); + hcall_file = debugfs_create_file(cpu_name_buf, S_IRUGO, + hcall_root, + per_cpu(hcall_stats, cpu), + &hcall_inst_seq_fops); + if (!hcall_file) + return -ENOMEM; + } + + return 0; +} +__initcall(hcall_inst_init); diff --git a/arch/powerpc/platforms/pseries/hvconsole.c b/arch/powerpc/platforms/pseries/hvconsole.c new file mode 100644 index 00000000..041e87ca --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvconsole.c @@ -0,0 +1,80 @@ +/* + * hvconsole.c + * Copyright (C) 2004 Hollis Blanchard, IBM Corporation + * Copyright (C) 2004 IBM Corporation + * + * Additional Author(s): + * Ryan S. Arnold <rsa@us.ibm.com> + * + * LPAR console support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <asm/hvcall.h> +#include <asm/hvconsole.h> +#include "plpar_wrappers.h" + +/** + * hvc_get_chars - retrieve characters from firmware for denoted vterm adatper + * @vtermno: The vtermno or unit_address of the adapter from which to fetch the + * data. + * @buf: The character buffer into which to put the character data fetched from + * firmware. + * @count: not used? + */ +int hvc_get_chars(uint32_t vtermno, char *buf, int count) +{ + unsigned long got; + + if (plpar_get_term_char(vtermno, &got, buf) == H_SUCCESS) + return got; + + return 0; +} + +EXPORT_SYMBOL(hvc_get_chars); + + +/** + * hvc_put_chars: send characters to firmware for denoted vterm adapter + * @vtermno: The vtermno or unit_address of the adapter from which the data + * originated. + * @buf: The character buffer that contains the character data to send to + * firmware. + * @count: Send this number of characters. + */ +int hvc_put_chars(uint32_t vtermno, const char *buf, int count) +{ + unsigned long *lbuf = (unsigned long *) buf; + long ret; + + + /* hcall will ret H_PARAMETER if 'count' exceeds firmware max.*/ + if (count > MAX_VIO_PUT_CHARS) + count = MAX_VIO_PUT_CHARS; + + ret = plpar_hcall_norets(H_PUT_TERM_CHAR, vtermno, count, lbuf[0], + lbuf[1]); + if (ret == H_SUCCESS) + return count; + if (ret == H_BUSY) + return -EAGAIN; + return -EIO; +} + +EXPORT_SYMBOL(hvc_put_chars); diff --git a/arch/powerpc/platforms/pseries/hvcserver.c b/arch/powerpc/platforms/pseries/hvcserver.c new file mode 100644 index 00000000..fcf4b4cb --- /dev/null +++ b/arch/powerpc/platforms/pseries/hvcserver.c @@ -0,0 +1,251 @@ +/* + * hvcserver.c + * Copyright (C) 2004 Ryan S Arnold, IBM Corporation + * + * PPC64 virtual I/O console server support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kernel.h> +#include <linux/list.h> +#include <linux/module.h> +#include <linux/slab.h> + +#include <asm/hvcall.h> +#include <asm/hvcserver.h> +#include <asm/io.h> + +#define HVCS_ARCH_VERSION "1.0.0" + +MODULE_AUTHOR("Ryan S. Arnold <rsa@us.ibm.com>"); +MODULE_DESCRIPTION("IBM hvcs ppc64 API"); +MODULE_LICENSE("GPL"); +MODULE_VERSION(HVCS_ARCH_VERSION); + +/* + * Convert arch specific return codes into relevant errnos. The hvcs + * functions aren't performance sensitive, so this conversion isn't an + * issue. + */ +static int hvcs_convert(long to_convert) +{ + switch (to_convert) { + case H_SUCCESS: + return 0; + case H_PARAMETER: + return -EINVAL; + case H_HARDWARE: + return -EIO; + case H_BUSY: + case H_LONG_BUSY_ORDER_1_MSEC: + case H_LONG_BUSY_ORDER_10_MSEC: + case H_LONG_BUSY_ORDER_100_MSEC: + case H_LONG_BUSY_ORDER_1_SEC: + case H_LONG_BUSY_ORDER_10_SEC: + case H_LONG_BUSY_ORDER_100_SEC: + return -EBUSY; + case H_FUNCTION: /* fall through */ + default: + return -EPERM; + } +} + +/** + * hvcs_free_partner_info - free pi allocated by hvcs_get_partner_info + * @head: list_head pointer for an allocated list of partner info structs to + * free. + * + * This function is used to free the partner info list that was returned by + * calling hvcs_get_partner_info(). + */ +int hvcs_free_partner_info(struct list_head *head) +{ + struct hvcs_partner_info *pi; + struct list_head *element; + + if (!head) + return -EINVAL; + + while (!list_empty(head)) { + element = head->next; + pi = list_entry(element, struct hvcs_partner_info, node); + list_del(element); + kfree(pi); + } + + return 0; +} +EXPORT_SYMBOL(hvcs_free_partner_info); + +/* Helper function for hvcs_get_partner_info */ +static int hvcs_next_partner(uint32_t unit_address, + unsigned long last_p_partition_ID, + unsigned long last_p_unit_address, unsigned long *pi_buff) + +{ + long retval; + retval = plpar_hcall_norets(H_VTERM_PARTNER_INFO, unit_address, + last_p_partition_ID, + last_p_unit_address, virt_to_phys(pi_buff)); + return hvcs_convert(retval); +} + +/** + * hvcs_get_partner_info - Get all of the partner info for a vty-server adapter + * @unit_address: The unit_address of the vty-server adapter for which this + * function is fetching partner info. + * @head: An initialized list_head pointer to an empty list to use to return the + * list of partner info fetched from the hypervisor to the caller. + * @pi_buff: A page sized buffer pre-allocated prior to calling this function + * that is to be used to be used by firmware as an iterator to keep track + * of the partner info retrieval. + * + * This function returns non-zero on success, or if there is no partner info. + * + * The pi_buff is pre-allocated prior to calling this function because this + * function may be called with a spin_lock held and kmalloc of a page is not + * recommended as GFP_ATOMIC. + * + * The first long of this buffer is used to store a partner unit address. The + * second long is used to store a partner partition ID and starting at + * pi_buff[2] is the 79 character Converged Location Code (diff size than the + * unsigned longs, hence the casting mumbo jumbo you see later). + * + * Invocation of this function should always be followed by an invocation of + * hvcs_free_partner_info() using a pointer to the SAME list head instance + * that was passed as a parameter to this function. + */ +int hvcs_get_partner_info(uint32_t unit_address, struct list_head *head, + unsigned long *pi_buff) +{ + /* + * Dealt with as longs because of the hcall interface even though the + * values are uint32_t. + */ + unsigned long last_p_partition_ID; + unsigned long last_p_unit_address; + struct hvcs_partner_info *next_partner_info = NULL; + int more = 1; + int retval; + + memset(pi_buff, 0x00, PAGE_SIZE); + /* invalid parameters */ + if (!head || !pi_buff) + return -EINVAL; + + last_p_partition_ID = last_p_unit_address = ~0UL; + INIT_LIST_HEAD(head); + + do { + retval = hvcs_next_partner(unit_address, last_p_partition_ID, + last_p_unit_address, pi_buff); + if (retval) { + /* + * Don't indicate that we've failed if we have + * any list elements. + */ + if (!list_empty(head)) + return 0; + return retval; + } + + last_p_partition_ID = pi_buff[0]; + last_p_unit_address = pi_buff[1]; + + /* This indicates that there are no further partners */ + if (last_p_partition_ID == ~0UL + && last_p_unit_address == ~0UL) + break; + + /* This is a very small struct and will be freed soon in + * hvcs_free_partner_info(). */ + next_partner_info = kmalloc(sizeof(struct hvcs_partner_info), + GFP_ATOMIC); + + if (!next_partner_info) { + printk(KERN_WARNING "HVCONSOLE: kmalloc() failed to" + " allocate partner info struct.\n"); + hvcs_free_partner_info(head); + return -ENOMEM; + } + + next_partner_info->unit_address + = (unsigned int)last_p_unit_address; + next_partner_info->partition_ID + = (unsigned int)last_p_partition_ID; + + /* copy the Null-term char too */ + strncpy(&next_partner_info->location_code[0], + (char *)&pi_buff[2], + strlen((char *)&pi_buff[2]) + 1); + + list_add_tail(&(next_partner_info->node), head); + next_partner_info = NULL; + + } while (more); + + return 0; +} +EXPORT_SYMBOL(hvcs_get_partner_info); + +/** + * hvcs_register_connection - establish a connection between this vty-server and + * a vty. + * @unit_address: The unit address of the vty-server adapter that is to be + * establish a connection. + * @p_partition_ID: The partition ID of the vty adapter that is to be connected. + * @p_unit_address: The unit address of the vty adapter to which the vty-server + * is to be connected. + * + * If this function is called once and -EINVAL is returned it may + * indicate that the partner info needs to be refreshed for the + * target unit address at which point the caller must invoke + * hvcs_get_partner_info() and then call this function again. If, + * for a second time, -EINVAL is returned then it indicates that + * there is probably already a partner connection registered to a + * different vty-server adapter. It is also possible that a second + * -EINVAL may indicate that one of the parms is not valid, for + * instance if the link was removed between the vty-server adapter + * and the vty adapter that you are trying to open. Don't shoot the + * messenger. Firmware implemented it this way. + */ +int hvcs_register_connection( uint32_t unit_address, + uint32_t p_partition_ID, uint32_t p_unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_REGISTER_VTERM, unit_address, + p_partition_ID, p_unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_register_connection); + +/** + * hvcs_free_connection - free the connection between a vty-server and vty + * @unit_address: The unit address of the vty-server that is to have its + * connection severed. + * + * This function is used to free the partner connection between a vty-server + * adapter and a vty adapter. + * + * If -EBUSY is returned continue to call this function until 0 is returned. + */ +int hvcs_free_connection(uint32_t unit_address) +{ + long retval; + retval = plpar_hcall_norets(H_FREE_VTERM, unit_address); + return hvcs_convert(retval); +} +EXPORT_SYMBOL(hvcs_free_connection); diff --git a/arch/powerpc/platforms/pseries/io_event_irq.c b/arch/powerpc/platforms/pseries/io_event_irq.c new file mode 100644 index 00000000..c829e606 --- /dev/null +++ b/arch/powerpc/platforms/pseries/io_event_irq.c @@ -0,0 +1,231 @@ +/* + * Copyright 2010 2011 Mark Nelson and Tseng-Hui (Frank) Lin, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/irq.h> +#include <linux/interrupt.h> +#include <linux/of.h> +#include <linux/list.h> +#include <linux/notifier.h> + +#include <asm/machdep.h> +#include <asm/rtas.h> +#include <asm/irq.h> +#include <asm/io_event_irq.h> + +#include "pseries.h" + +/* + * IO event interrupt is a mechanism provided by RTAS to return + * information about hardware error and non-error events. Device + * drivers can register their event handlers to receive events. + * Device drivers are expected to use atomic_notifier_chain_register() + * and atomic_notifier_chain_unregister() to register and unregister + * their event handlers. Since multiple IO event types and scopes + * share an IO event interrupt, the event handlers are called one + * by one until the IO event is claimed by one of the handlers. + * The event handlers are expected to return NOTIFY_OK if the + * event is handled by the event handler or NOTIFY_DONE if the + * event does not belong to the handler. + * + * Usage: + * + * Notifier function: + * #include <asm/io_event_irq.h> + * int event_handler(struct notifier_block *nb, unsigned long val, void *data) { + * p = (struct pseries_io_event_sect_data *) data; + * if (! is_my_event(p->scope, p->event_type)) return NOTIFY_DONE; + * : + * : + * return NOTIFY_OK; + * } + * struct notifier_block event_nb = { + * .notifier_call = event_handler, + * } + * + * Registration: + * atomic_notifier_chain_register(&pseries_ioei_notifier_list, &event_nb); + * + * Unregistration: + * atomic_notifier_chain_unregister(&pseries_ioei_notifier_list, &event_nb); + */ + +ATOMIC_NOTIFIER_HEAD(pseries_ioei_notifier_list); +EXPORT_SYMBOL_GPL(pseries_ioei_notifier_list); + +static int ioei_check_exception_token; + +/* pSeries event log format */ + +/* Two bytes ASCII section IDs */ +#define PSERIES_ELOG_SECT_ID_PRIV_HDR (('P' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_HDR (('U' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_PRIMARY_SRC (('P' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_EXTENDED_UH (('E' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FAILING_MTMS (('M' << 8) | 'T') +#define PSERIES_ELOG_SECT_ID_SECONDARY_SRC (('S' << 8) | 'S') +#define PSERIES_ELOG_SECT_ID_DUMP_LOCATOR (('D' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_FW_ERROR (('S' << 8) | 'W') +#define PSERIES_ELOG_SECT_ID_IMPACT_PART_ID (('L' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_LOGIC_RESOURCE_ID (('L' << 8) | 'R') +#define PSERIES_ELOG_SECT_ID_HMC_ID (('H' << 8) | 'M') +#define PSERIES_ELOG_SECT_ID_EPOW (('E' << 8) | 'P') +#define PSERIES_ELOG_SECT_ID_IO_EVENT (('I' << 8) | 'E') +#define PSERIES_ELOG_SECT_ID_MANUFACT_INFO (('M' << 8) | 'I') +#define PSERIES_ELOG_SECT_ID_CALL_HOME (('C' << 8) | 'H') +#define PSERIES_ELOG_SECT_ID_USER_DEF (('U' << 8) | 'D') + +/* Vendor specific Platform Event Log Format, Version 6, section header */ +struct pseries_elog_section { + uint16_t id; /* 0x00 2-byte ASCII section ID */ + uint16_t length; /* 0x02 Section length in bytes */ + uint8_t version; /* 0x04 Section version */ + uint8_t subtype; /* 0x05 Section subtype */ + uint16_t creator_component; /* 0x06 Creator component ID */ + uint8_t data[]; /* 0x08 Start of section data */ +}; + +static char ioei_rtas_buf[RTAS_DATA_BUF_SIZE] __cacheline_aligned; + +/** + * Find data portion of a specific section in RTAS extended event log. + * @elog: RTAS error/event log. + * @sect_id: secsion ID. + * + * Return: + * pointer to the section data of the specified section + * NULL if not found + */ +static struct pseries_elog_section *find_xelog_section(struct rtas_error_log *elog, + uint16_t sect_id) +{ + struct rtas_ext_event_log_v6 *xelog = + (struct rtas_ext_event_log_v6 *) elog->buffer; + struct pseries_elog_section *sect; + unsigned char *p, *log_end; + + /* Check that we understand the format */ + if (elog->extended_log_length < sizeof(struct rtas_ext_event_log_v6) || + xelog->log_format != RTAS_V6EXT_LOG_FORMAT_EVENT_LOG || + xelog->company_id != RTAS_V6EXT_COMPANY_ID_IBM) + return NULL; + + log_end = elog->buffer + elog->extended_log_length; + p = xelog->vendor_log; + while (p < log_end) { + sect = (struct pseries_elog_section *)p; + if (sect->id == sect_id) + return sect; + p += sect->length; + } + return NULL; +} + +/** + * Find the data portion of an IO Event section from event log. + * @elog: RTAS error/event log. + * + * Return: + * pointer to a valid IO event section data. NULL if not found. + */ +static struct pseries_io_event * ioei_find_event(struct rtas_error_log *elog) +{ + struct pseries_elog_section *sect; + + /* We should only ever get called for io-event interrupts, but if + * we do get called for another type then something went wrong so + * make some noise about it. + * RTAS_TYPE_IO only exists in extended event log version 6 or later. + * No need to check event log version. + */ + if (unlikely(elog->type != RTAS_TYPE_IO)) { + printk_once(KERN_WARNING "io_event_irq: Unexpected event type %d", + elog->type); + return NULL; + } + + sect = find_xelog_section(elog, PSERIES_ELOG_SECT_ID_IO_EVENT); + if (unlikely(!sect)) { + printk_once(KERN_WARNING "io_event_irq: RTAS extended event " + "log does not contain an IO Event section. " + "Could be a bug in system firmware!\n"); + return NULL; + } + return (struct pseries_io_event *) §->data; +} + +/* + * PAPR: + * - check-exception returns the first found error or event and clear that + * error or event so it is reported once. + * - Each interrupt returns one event. If a plateform chooses to report + * multiple events through a single interrupt, it must ensure that the + * interrupt remains asserted until check-exception has been used to + * process all out-standing events for that interrupt. + * + * Implementation notes: + * - Events must be processed in the order they are returned. Hence, + * sequential in nature. + * - The owner of an event is determined by combinations of scope, + * event type, and sub-type. There is no easy way to pre-sort clients + * by scope or event type alone. For example, Torrent ISR route change + * event is reported with scope 0x00 (Not Applicatable) rather than + * 0x3B (Torrent-hub). It is better to let the clients to identify + * who owns the the event. + */ + +static irqreturn_t ioei_interrupt(int irq, void *dev_id) +{ + struct pseries_io_event *event; + int rtas_rc; + + for (;;) { + rtas_rc = rtas_call(ioei_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_IO_EVENTS, 1 /* Time Critical */, + __pa(ioei_rtas_buf), + RTAS_DATA_BUF_SIZE); + if (rtas_rc != 0) + break; + + event = ioei_find_event((struct rtas_error_log *)ioei_rtas_buf); + if (!event) + continue; + + atomic_notifier_call_chain(&pseries_ioei_notifier_list, + 0, event); + } + return IRQ_HANDLED; +} + +static int __init ioei_init(void) +{ + struct device_node *np; + + ioei_check_exception_token = rtas_token("check-exception"); + if (ioei_check_exception_token == RTAS_UNKNOWN_SERVICE) { + pr_warning("IO Event IRQ not supported on this system !\n"); + return -ENODEV; + } + np = of_find_node_by_path("/event-sources/ibm,io-events"); + if (np) { + request_event_sources_irqs(np, ioei_interrupt, "IO_EVENT"); + of_node_put(np); + } else { + pr_err("io_event_irq: No ibm,io-events on system! " + "IO Event interrupt disabled.\n"); + return -ENODEV; + } + return 0; +} +machine_subsys_initcall(pseries, ioei_init); + diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c new file mode 100644 index 00000000..01faab94 --- /dev/null +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -0,0 +1,1217 @@ +/* + * Copyright (C) 2001 Mike Corrigan & Dave Engebretsen, IBM Corporation + * + * Rewrite, cleanup: + * + * Copyright (C) 2004 Olof Johansson <olof@lixom.net>, IBM Corporation + * Copyright (C) 2006 Olof Johansson <olof@lixom.net> + * + * Dynamic DMA mapping support, pSeries-specific parts, both SMP and LPAR. + * + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/types.h> +#include <linux/slab.h> +#include <linux/mm.h> +#include <linux/spinlock.h> +#include <linux/string.h> +#include <linux/pci.h> +#include <linux/dma-mapping.h> +#include <linux/crash_dump.h> +#include <linux/memory.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/iommu.h> +#include <asm/pci-bridge.h> +#include <asm/machdep.h> +#include <asm/abs_addr.h> +#include <asm/pSeries_reconfig.h> +#include <asm/firmware.h> +#include <asm/tce.h> +#include <asm/ppc-pci.h> +#include <asm/udbg.h> +#include <asm/mmzone.h> + +#include "plpar_wrappers.h" + + +static int tce_build_pSeries(struct iommu_table *tbl, long index, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + u64 proto_tce; + u64 *tcep; + u64 rpn; + + proto_tce = TCE_PCI_READ; // Read allowed + + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + tcep = ((u64 *)tbl->it_base) + index; + + while (npages--) { + /* can't move this out since we might cross MEMBLOCK boundary */ + rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + *tcep = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + + uaddr += TCE_PAGE_SIZE; + tcep++; + } + return 0; +} + + +static void tce_free_pSeries(struct iommu_table *tbl, long index, long npages) +{ + u64 *tcep; + + tcep = ((u64 *)tbl->it_base) + index; + + while (npages--) + *(tcep++) = 0; +} + +static unsigned long tce_get_pseries(struct iommu_table *tbl, long index) +{ + u64 *tcep; + + tcep = ((u64 *)tbl->it_base) + index; + + return *tcep; +} + +static void tce_free_pSeriesLP(struct iommu_table*, long, long); +static void tce_freemulti_pSeriesLP(struct iommu_table*, long, long); + +static int tce_build_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + u64 rc = 0; + u64 proto_tce, tce; + u64 rpn; + int ret = 0; + long tcenum_start = tcenum, npages_start = npages; + + rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + proto_tce = TCE_PCI_READ; + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + while (npages--) { + tce = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, tce); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_free_pSeriesLP(tbl, tcenum_start, + (npages_start - (npages + 1))); + break; + } + + if (rc && printk_ratelimit()) { + printk("tce_build_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\ttcenum = 0x%llx\n", (u64)tcenum); + printk("\ttce val = 0x%llx\n", tce ); + show_stack(current, (unsigned long *)__get_SP()); + } + + tcenum++; + rpn++; + } + return ret; +} + +static DEFINE_PER_CPU(u64 *, tce_page); + +static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, + long npages, unsigned long uaddr, + enum dma_data_direction direction, + struct dma_attrs *attrs) +{ + u64 rc = 0; + u64 proto_tce; + u64 *tcep; + u64 rpn; + long l, limit; + long tcenum_start = tcenum, npages_start = npages; + int ret = 0; + + if (npages == 1) { + return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + direction, attrs); + } + + tcep = __get_cpu_var(tce_page); + + /* This is safe to do since interrupts are off when we're called + * from iommu_alloc{,_sg}() + */ + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + /* If allocation fails, fall back to the loop implementation */ + if (!tcep) { + return tce_build_pSeriesLP(tbl, tcenum, npages, uaddr, + direction, attrs); + } + __get_cpu_var(tce_page) = tcep; + } + + rpn = (virt_to_abs(uaddr)) >> TCE_SHIFT; + proto_tce = TCE_PCI_READ; + if (direction != DMA_TO_DEVICE) + proto_tce |= TCE_PCI_WRITE; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, npages, 4096/TCE_ENTRY_SIZE); + + for (l = 0; l < limit; l++) { + tcep[l] = proto_tce | (rpn & TCE_RPN_MASK) << TCE_RPN_SHIFT; + rpn++; + } + + rc = plpar_tce_put_indirect((u64)tbl->it_index, + (u64)tcenum << 12, + (u64)virt_to_abs(tcep), + limit); + + npages -= limit; + tcenum += limit; + } while (npages > 0 && !rc); + + if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { + ret = (int)rc; + tce_freemulti_pSeriesLP(tbl, tcenum_start, + (npages_start - (npages + limit))); + return ret; + } + + if (rc && printk_ratelimit()) { + printk("tce_buildmulti_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tnpages = 0x%llx\n", (u64)npages); + printk("\ttce[0] val = 0x%llx\n", tcep[0]); + show_stack(current, (unsigned long *)__get_SP()); + } + return ret; +} + +static void tce_free_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) +{ + u64 rc; + + while (npages--) { + rc = plpar_tce_put((u64)tbl->it_index, (u64)tcenum << 12, 0); + + if (rc && printk_ratelimit()) { + printk("tce_free_pSeriesLP: plpar_tce_put failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\ttcenum = 0x%llx\n", (u64)tcenum); + show_stack(current, (unsigned long *)__get_SP()); + } + + tcenum++; + } +} + + +static void tce_freemulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages) +{ + u64 rc; + + rc = plpar_tce_stuff((u64)tbl->it_index, (u64)tcenum << 12, 0, npages); + + if (rc && printk_ratelimit()) { + printk("tce_freemulti_pSeriesLP: plpar_tce_stuff failed\n"); + printk("\trc = %lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\tnpages = 0x%llx\n", (u64)npages); + show_stack(current, (unsigned long *)__get_SP()); + } +} + +static unsigned long tce_get_pSeriesLP(struct iommu_table *tbl, long tcenum) +{ + u64 rc; + unsigned long tce_ret; + + rc = plpar_tce_get((u64)tbl->it_index, (u64)tcenum << 12, &tce_ret); + + if (rc && printk_ratelimit()) { + printk("tce_get_pSeriesLP: plpar_tce_get failed. rc=%lld\n", rc); + printk("\tindex = 0x%llx\n", (u64)tbl->it_index); + printk("\ttcenum = 0x%llx\n", (u64)tcenum); + show_stack(current, (unsigned long *)__get_SP()); + } + + return tce_ret; +} + +/* this is compatible with cells for the device tree property */ +struct dynamic_dma_window_prop { + __be32 liobn; /* tce table number */ + __be64 dma_base; /* address hi,lo */ + __be32 tce_shift; /* ilog2(tce_page_size) */ + __be32 window_shift; /* ilog2(tce_window_size) */ +}; + +struct direct_window { + struct device_node *device; + const struct dynamic_dma_window_prop *prop; + struct list_head list; +}; + +/* Dynamic DMA Window support */ +struct ddw_query_response { + u32 windows_available; + u32 largest_available_block; + u32 page_size; + u32 migration_capable; +}; + +struct ddw_create_response { + u32 liobn; + u32 addr_hi; + u32 addr_lo; +}; + +static LIST_HEAD(direct_window_list); +/* prevents races between memory on/offline and window creation */ +static DEFINE_SPINLOCK(direct_window_list_lock); +/* protects initializing window twice for same device */ +static DEFINE_MUTEX(direct_window_init_mutex); +#define DIRECT64_PROPNAME "linux,direct64-ddr-window-info" + +static int tce_clearrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + int rc; + u64 tce_size, num_tce, dma_offset, next; + u32 tce_shift; + long limit; + + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 512); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + rc = plpar_tce_stuff((u64)be32_to_cpu(maprange->liobn), + dma_offset, + 0, limit); + num_tce -= limit; + } while (num_tce > 0 && !rc); + + return rc; +} + +static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, + unsigned long num_pfn, const void *arg) +{ + const struct dynamic_dma_window_prop *maprange = arg; + u64 *tcep, tce_size, num_tce, dma_offset, next, proto_tce, liobn; + u32 tce_shift; + u64 rc = 0; + long l, limit; + + local_irq_disable(); /* to protect tcep and the page behind it */ + tcep = __get_cpu_var(tce_page); + + if (!tcep) { + tcep = (u64 *)__get_free_page(GFP_ATOMIC); + if (!tcep) { + local_irq_enable(); + return -ENOMEM; + } + __get_cpu_var(tce_page) = tcep; + } + + proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; + + liobn = (u64)be32_to_cpu(maprange->liobn); + tce_shift = be32_to_cpu(maprange->tce_shift); + tce_size = 1ULL << tce_shift; + next = start_pfn << PAGE_SHIFT; + num_tce = num_pfn << PAGE_SHIFT; + + /* round back to the beginning of the tce page size */ + num_tce += next & (tce_size - 1); + next &= ~(tce_size - 1); + + /* covert to number of tces */ + num_tce |= tce_size - 1; + num_tce >>= tce_shift; + + /* We can map max one pageful of TCEs at a time */ + do { + /* + * Set up the page with TCE data, looping through and setting + * the values. + */ + limit = min_t(long, num_tce, 4096/TCE_ENTRY_SIZE); + dma_offset = next + be64_to_cpu(maprange->dma_base); + + for (l = 0; l < limit; l++) { + tcep[l] = proto_tce | next; + next += tce_size; + } + + rc = plpar_tce_put_indirect(liobn, + dma_offset, + (u64)virt_to_abs(tcep), + limit); + + num_tce -= limit; + } while (num_tce > 0 && !rc); + + /* error cleanup: caller will clear whole range */ + + local_irq_enable(); + return rc; +} + +static int tce_setrange_multi_pSeriesLP_walk(unsigned long start_pfn, + unsigned long num_pfn, void *arg) +{ + return tce_setrange_multi_pSeriesLP(start_pfn, num_pfn, arg); +} + + +#ifdef CONFIG_PCI +static void iommu_table_setparms(struct pci_controller *phb, + struct device_node *dn, + struct iommu_table *tbl) +{ + struct device_node *node; + const unsigned long *basep; + const u32 *sizep; + + node = phb->dn; + + basep = of_get_property(node, "linux,tce-base", NULL); + sizep = of_get_property(node, "linux,tce-size", NULL); + if (basep == NULL || sizep == NULL) { + printk(KERN_ERR "PCI_DMA: iommu_table_setparms: %s has " + "missing tce entries !\n", dn->full_name); + return; + } + + tbl->it_base = (unsigned long)__va(*basep); + + if (!is_kdump_kernel()) + memset((void *)tbl->it_base, 0, *sizep); + + tbl->it_busno = phb->bus->number; + + /* Units of tce entries */ + tbl->it_offset = phb->dma_window_base_cur >> IOMMU_PAGE_SHIFT; + + /* Test if we are going over 2GB of DMA space */ + if (phb->dma_window_base_cur + phb->dma_window_size > 0x80000000ul) { + udbg_printf("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + panic("PCI_DMA: Unexpected number of IOAs under this PHB.\n"); + } + + phb->dma_window_base_cur += phb->dma_window_size; + + /* Set the tce table size - measured in entries */ + tbl->it_size = phb->dma_window_size >> IOMMU_PAGE_SHIFT; + + tbl->it_index = 0; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; +} + +/* + * iommu_table_setparms_lpar + * + * Function: On pSeries LPAR systems, return TCE table info, given a pci bus. + */ +static void iommu_table_setparms_lpar(struct pci_controller *phb, + struct device_node *dn, + struct iommu_table *tbl, + const void *dma_window) +{ + unsigned long offset, size; + + of_parse_dma_window(dn, dma_window, &tbl->it_index, &offset, &size); + + tbl->it_busno = phb->bus->number; + tbl->it_base = 0; + tbl->it_blocksize = 16; + tbl->it_type = TCE_PCI; + tbl->it_offset = offset >> IOMMU_PAGE_SHIFT; + tbl->it_size = size >> IOMMU_PAGE_SHIFT; +} + +static void pci_dma_bus_setup_pSeries(struct pci_bus *bus) +{ + struct device_node *dn; + struct iommu_table *tbl; + struct device_node *isa_dn, *isa_dn_orig; + struct device_node *tmp; + struct pci_dn *pci; + int children; + + dn = pci_bus_to_OF_node(bus); + + pr_debug("pci_dma_bus_setup_pSeries: setting up bus %s\n", dn->full_name); + + if (bus->self) { + /* This is not a root bus, any setup will be done for the + * device-side of the bridge in iommu_dev_setup_pSeries(). + */ + return; + } + pci = PCI_DN(dn); + + /* Check if the ISA bus on the system is under + * this PHB. + */ + isa_dn = isa_dn_orig = of_find_node_by_type(NULL, "isa"); + + while (isa_dn && isa_dn != dn) + isa_dn = isa_dn->parent; + + if (isa_dn_orig) + of_node_put(isa_dn_orig); + + /* Count number of direct PCI children of the PHB. */ + for (children = 0, tmp = dn->child; tmp; tmp = tmp->sibling) + children++; + + pr_debug("Children: %d\n", children); + + /* Calculate amount of DMA window per slot. Each window must be + * a power of two (due to pci_alloc_consistent requirements). + * + * Keep 256MB aside for PHBs with ISA. + */ + + if (!isa_dn) { + /* No ISA/IDE - just set window size and return */ + pci->phb->dma_window_size = 0x80000000ul; /* To be divided */ + + while (pci->phb->dma_window_size * children > 0x80000000ul) + pci->phb->dma_window_size >>= 1; + pr_debug("No ISA/IDE, window size is 0x%llx\n", + pci->phb->dma_window_size); + pci->phb->dma_window_base_cur = 0; + + return; + } + + /* If we have ISA, then we probably have an IDE + * controller too. Allocate a 128MB table but + * skip the first 128MB to avoid stepping on ISA + * space. + */ + pci->phb->dma_window_size = 0x8000000ul; + pci->phb->dma_window_base_cur = 0x8000000ul; + + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + pci->phb->node); + + iommu_table_setparms(pci->phb, dn, tbl); + pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + + /* Divide the rest (1.75GB) among the children */ + pci->phb->dma_window_size = 0x80000000ul; + while (pci->phb->dma_window_size * children > 0x70000000ul) + pci->phb->dma_window_size >>= 1; + + pr_debug("ISA/IDE, window size is 0x%llx\n", pci->phb->dma_window_size); +} + + +static void pci_dma_bus_setup_pSeriesLP(struct pci_bus *bus) +{ + struct iommu_table *tbl; + struct device_node *dn, *pdn; + struct pci_dn *ppci; + const void *dma_window = NULL; + + dn = pci_bus_to_OF_node(bus); + + pr_debug("pci_dma_bus_setup_pSeriesLP: setting up bus %s\n", + dn->full_name); + + /* Find nearest ibm,dma-window, walking up the device tree */ + for (pdn = dn; pdn != NULL; pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window != NULL) + break; + } + + if (dma_window == NULL) { + pr_debug(" no ibm,dma-window property !\n"); + return; + } + + ppci = PCI_DN(pdn); + + pr_debug(" parent is %s, iommu_table: 0x%p\n", + pdn->full_name, ppci->iommu_table); + + if (!ppci->iommu_table) { + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + ppci->phb->node); + iommu_table_setparms_lpar(ppci->phb, pdn, tbl, dma_window); + ppci->iommu_table = iommu_init_table(tbl, ppci->phb->node); + pr_debug(" created table: %p\n", ppci->iommu_table); + } +} + + +static void pci_dma_dev_setup_pSeries(struct pci_dev *dev) +{ + struct device_node *dn; + struct iommu_table *tbl; + + pr_debug("pci_dma_dev_setup_pSeries: %s\n", pci_name(dev)); + + dn = dev->dev.of_node; + + /* If we're the direct child of a root bus, then we need to allocate + * an iommu table ourselves. The bus setup code should have setup + * the window sizes already. + */ + if (!dev->bus->self) { + struct pci_controller *phb = PCI_DN(dn)->phb; + + pr_debug(" --> first child, no bridge. Allocating iommu table.\n"); + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + phb->node); + iommu_table_setparms(phb, dn, tbl); + PCI_DN(dn)->iommu_table = iommu_init_table(tbl, phb->node); + set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + return; + } + + /* If this device is further down the bus tree, search upwards until + * an already allocated iommu table is found and use that. + */ + + while (dn && PCI_DN(dn) && PCI_DN(dn)->iommu_table == NULL) + dn = dn->parent; + + if (dn && PCI_DN(dn)) + set_iommu_table_base(&dev->dev, PCI_DN(dn)->iommu_table); + else + printk(KERN_WARNING "iommu: Device %s has no iommu table\n", + pci_name(dev)); +} + +static int __read_mostly disable_ddw; + +static int __init disable_ddw_setup(char *str) +{ + disable_ddw = 1; + printk(KERN_INFO "ppc iommu: disabling ddw.\n"); + + return 0; +} + +early_param("disable_ddw", disable_ddw_setup); + +static void remove_ddw(struct device_node *np) +{ + struct dynamic_dma_window_prop *dwp; + struct property *win64; + const u32 *ddw_avail; + u64 liobn; + int len, ret; + + ddw_avail = of_get_property(np, "ibm,ddw-applicable", &len); + win64 = of_find_property(np, DIRECT64_PROPNAME, NULL); + if (!win64) + return; + + if (!ddw_avail || len < 3 * sizeof(u32) || win64->length < sizeof(*dwp)) + goto delprop; + + dwp = win64->value; + liobn = (u64)be32_to_cpu(dwp->liobn); + + /* clear the whole window, note the arg is in kernel pages */ + ret = tce_clearrange_multi_pSeriesLP(0, + 1ULL << (be32_to_cpu(dwp->window_shift) - PAGE_SHIFT), dwp); + if (ret) + pr_warning("%s failed to clear tces in window.\n", + np->full_name); + else + pr_debug("%s successfully cleared tces in window.\n", + np->full_name); + + ret = rtas_call(ddw_avail[2], 1, 1, NULL, liobn); + if (ret) + pr_warning("%s: failed to remove direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddw_avail[2], liobn); + else + pr_debug("%s: successfully removed direct window: rtas returned " + "%d to ibm,remove-pe-dma-window(%x) %llx\n", + np->full_name, ret, ddw_avail[2], liobn); + +delprop: + ret = prom_remove_property(np, win64); + if (ret) + pr_warning("%s: failed to remove direct window property: %d\n", + np->full_name, ret); +} + +static u64 find_existing_ddw(struct device_node *pdn) +{ + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + u64 dma_addr = 0; + + spin_lock(&direct_window_list_lock); + /* check if we already created a window and dupe that config if so */ + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == pdn) { + direct64 = window->prop; + dma_addr = direct64->dma_base; + break; + } + } + spin_unlock(&direct_window_list_lock); + + return dma_addr; +} + +static int find_existing_ddw_windows(void) +{ + int len; + struct device_node *pdn; + struct direct_window *window; + const struct dynamic_dma_window_prop *direct64; + + if (!firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + for_each_node_with_property(pdn, DIRECT64_PROPNAME) { + direct64 = of_get_property(pdn, DIRECT64_PROPNAME, &len); + if (!direct64) + continue; + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window || len < sizeof(struct dynamic_dma_window_prop)) { + kfree(window); + remove_ddw(pdn); + continue; + } + + window->device = pdn; + window->prop = direct64; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + } + + return 0; +} +machine_arch_initcall(pseries, find_existing_ddw_windows); + +static int query_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_query_response *query) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + ret = rtas_call(ddw_avail[0], 3, 5, (u32 *)query, + cfg_addr, BUID_HI(buid), BUID_LO(buid)); + dev_info(&dev->dev, "ibm,query-pe-dma-windows(%x) %x %x %x" + " returned %d\n", ddw_avail[0], cfg_addr, BUID_HI(buid), + BUID_LO(buid), ret); + return ret; +} + +static int create_ddw(struct pci_dev *dev, const u32 *ddw_avail, + struct ddw_create_response *create, int page_shift, + int window_shift) +{ + struct device_node *dn; + struct pci_dn *pcidn; + u32 cfg_addr; + u64 buid; + int ret; + + /* + * Get the config address and phb buid of the PE window. + * Rely on eeh to retrieve this for us. + * Retrieve them from the pci device, not the node with the + * dma-window property + */ + dn = pci_device_to_OF_node(dev); + pcidn = PCI_DN(dn); + cfg_addr = pcidn->eeh_config_addr; + if (pcidn->eeh_pe_config_addr) + cfg_addr = pcidn->eeh_pe_config_addr; + buid = pcidn->phb->buid; + + do { + /* extra outputs are LIOBN and dma-addr (hi, lo) */ + ret = rtas_call(ddw_avail[1], 5, 4, (u32 *)create, cfg_addr, + BUID_HI(buid), BUID_LO(buid), page_shift, window_shift); + } while (rtas_busy_delay(ret)); + dev_info(&dev->dev, + "ibm,create-pe-dma-window(%x) %x %x %x %x %x returned %d " + "(liobn = 0x%x starting addr = %x %x)\n", ddw_avail[1], + cfg_addr, BUID_HI(buid), BUID_LO(buid), page_shift, + window_shift, ret, create->liobn, create->addr_hi, create->addr_lo); + + return ret; +} + +/* + * If the PE supports dynamic dma windows, and there is space for a table + * that can map all pages in a linear offset, then setup such a table, + * and record the dma-offset in the struct device. + * + * dev: the pci device we are checking + * pdn: the parent pe node with the ibm,dma_window property + * Future: also check if we can remap the base window for our base page size + * + * returns the dma offset for use by dma_set_mask + */ +static u64 enable_ddw(struct pci_dev *dev, struct device_node *pdn) +{ + int len, ret; + struct ddw_query_response query; + struct ddw_create_response create; + int page_shift; + u64 dma_addr, max_addr; + struct device_node *dn; + const u32 *uninitialized_var(ddw_avail); + struct direct_window *window; + struct property *win64; + struct dynamic_dma_window_prop *ddwprop; + + mutex_lock(&direct_window_init_mutex); + + dma_addr = find_existing_ddw(pdn); + if (dma_addr != 0) + goto out_unlock; + + /* + * the ibm,ddw-applicable property holds the tokens for: + * ibm,query-pe-dma-window + * ibm,create-pe-dma-window + * ibm,remove-pe-dma-window + * for the given node in that order. + * the property is actually in the parent, not the PE + */ + ddw_avail = of_get_property(pdn, "ibm,ddw-applicable", &len); + if (!ddw_avail || len < 3 * sizeof(u32)) + goto out_unlock; + + /* + * Query if there is a second window of size to map the + * whole partition. Query returns number of windows, largest + * block assigned to PE (partition endpoint), and two bitmasks + * of page sizes: supported and supported for migrate-dma. + */ + dn = pci_device_to_OF_node(dev); + ret = query_ddw(dev, ddw_avail, &query); + if (ret != 0) + goto out_unlock; + + if (query.windows_available == 0) { + /* + * no additional windows are available for this device. + * We might be able to reallocate the existing window, + * trading in for a larger page size. + */ + dev_dbg(&dev->dev, "no free dynamic windows"); + goto out_unlock; + } + if (query.page_size & 4) { + page_shift = 24; /* 16MB */ + } else if (query.page_size & 2) { + page_shift = 16; /* 64kB */ + } else if (query.page_size & 1) { + page_shift = 12; /* 4kB */ + } else { + dev_dbg(&dev->dev, "no supported direct page size in mask %x", + query.page_size); + goto out_unlock; + } + /* verify the window * number of ptes will map the partition */ + /* check largest block * page size > max memory hotplug addr */ + max_addr = memory_hotplug_max(); + if (query.largest_available_block < (max_addr >> page_shift)) { + dev_dbg(&dev->dev, "can't map partiton max 0x%llx with %u " + "%llu-sized pages\n", max_addr, query.largest_available_block, + 1ULL << page_shift); + goto out_unlock; + } + len = order_base_2(max_addr); + win64 = kzalloc(sizeof(struct property), GFP_KERNEL); + if (!win64) { + dev_info(&dev->dev, + "couldn't allocate property for 64bit dma window\n"); + goto out_unlock; + } + win64->name = kstrdup(DIRECT64_PROPNAME, GFP_KERNEL); + win64->value = ddwprop = kmalloc(sizeof(*ddwprop), GFP_KERNEL); + win64->length = sizeof(*ddwprop); + if (!win64->name || !win64->value) { + dev_info(&dev->dev, + "couldn't allocate property name and value\n"); + goto out_free_prop; + } + + ret = create_ddw(dev, ddw_avail, &create, page_shift, len); + if (ret != 0) + goto out_free_prop; + + ddwprop->liobn = cpu_to_be32(create.liobn); + ddwprop->dma_base = cpu_to_be64(of_read_number(&create.addr_hi, 2)); + ddwprop->tce_shift = cpu_to_be32(page_shift); + ddwprop->window_shift = cpu_to_be32(len); + + dev_dbg(&dev->dev, "created tce table LIOBN 0x%x for %s\n", + create.liobn, dn->full_name); + + window = kzalloc(sizeof(*window), GFP_KERNEL); + if (!window) + goto out_clear_window; + + ret = walk_system_ram_range(0, memblock_end_of_DRAM() >> PAGE_SHIFT, + win64->value, tce_setrange_multi_pSeriesLP_walk); + if (ret) { + dev_info(&dev->dev, "failed to map direct window for %s: %d\n", + dn->full_name, ret); + goto out_clear_window; + } + + ret = prom_add_property(pdn, win64); + if (ret) { + dev_err(&dev->dev, "unable to add dma window property for %s: %d", + pdn->full_name, ret); + goto out_clear_window; + } + + window->device = pdn; + window->prop = ddwprop; + spin_lock(&direct_window_list_lock); + list_add(&window->list, &direct_window_list); + spin_unlock(&direct_window_list_lock); + + dma_addr = of_read_number(&create.addr_hi, 2); + goto out_unlock; + +out_clear_window: + remove_ddw(pdn); + +out_free_prop: + kfree(win64->name); + kfree(win64->value); + kfree(win64); + +out_unlock: + mutex_unlock(&direct_window_init_mutex); + return dma_addr; +} + +static void pci_dma_dev_setup_pSeriesLP(struct pci_dev *dev) +{ + struct device_node *pdn, *dn; + struct iommu_table *tbl; + const void *dma_window = NULL; + struct pci_dn *pci; + + pr_debug("pci_dma_dev_setup_pSeriesLP: %s\n", pci_name(dev)); + + /* dev setup for LPAR is a little tricky, since the device tree might + * contain the dma-window properties per-device and not necessarily + * for the bus. So we need to search upwards in the tree until we + * either hit a dma-window property, OR find a parent with a table + * already allocated. + */ + dn = pci_device_to_OF_node(dev); + pr_debug(" node is %s\n", dn->full_name); + + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + + if (!pdn || !PCI_DN(pdn)) { + printk(KERN_WARNING "pci_dma_dev_setup_pSeriesLP: " + "no DMA window found for pci dev=%s dn=%s\n", + pci_name(dev), dn? dn->full_name : "<null>"); + return; + } + pr_debug(" parent is %s\n", pdn->full_name); + + pci = PCI_DN(pdn); + if (!pci->iommu_table) { + tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, + pci->phb->node); + iommu_table_setparms_lpar(pci->phb, pdn, tbl, dma_window); + pci->iommu_table = iommu_init_table(tbl, pci->phb->node); + pr_debug(" created table: %p\n", pci->iommu_table); + } else { + pr_debug(" found DMA window, table: %p\n", pci->iommu_table); + } + + set_iommu_table_base(&dev->dev, pci->iommu_table); +} + +static int dma_set_mask_pSeriesLP(struct device *dev, u64 dma_mask) +{ + bool ddw_enabled = false; + struct device_node *pdn, *dn; + struct pci_dev *pdev; + const void *dma_window = NULL; + u64 dma_offset; + + if (!dev->dma_mask) + return -EIO; + + if (!dev_is_pci(dev)) + goto check_mask; + + pdev = to_pci_dev(dev); + + /* only attempt to use a new window if 64-bit DMA is requested */ + if (!disable_ddw && dma_mask == DMA_BIT_MASK(64)) { + dn = pci_device_to_OF_node(pdev); + dev_dbg(dev, "node is %s\n", dn->full_name); + + /* + * the device tree might contain the dma-window properties + * per-device and not necessarily for the bus. So we need to + * search upwards in the tree until we either hit a dma-window + * property, OR find a parent with a table already allocated. + */ + for (pdn = dn; pdn && PCI_DN(pdn) && !PCI_DN(pdn)->iommu_table; + pdn = pdn->parent) { + dma_window = of_get_property(pdn, "ibm,dma-window", NULL); + if (dma_window) + break; + } + if (pdn && PCI_DN(pdn)) { + dma_offset = enable_ddw(pdev, pdn); + if (dma_offset != 0) { + dev_info(dev, "Using 64-bit direct DMA at offset %llx\n", dma_offset); + set_dma_offset(dev, dma_offset); + set_dma_ops(dev, &dma_direct_ops); + ddw_enabled = true; + } + } + } + + /* fall back on iommu ops, restore table pointer with ops */ + if (!ddw_enabled && get_dma_ops(dev) != &dma_iommu_ops) { + dev_info(dev, "Restoring 32-bit DMA via iommu\n"); + set_dma_ops(dev, &dma_iommu_ops); + pci_dma_dev_setup_pSeriesLP(pdev); + } + +check_mask: + if (!dma_supported(dev, dma_mask)) + return -EIO; + + *dev->dma_mask = dma_mask; + return 0; +} + +#else /* CONFIG_PCI */ +#define pci_dma_bus_setup_pSeries NULL +#define pci_dma_dev_setup_pSeries NULL +#define pci_dma_bus_setup_pSeriesLP NULL +#define pci_dma_dev_setup_pSeriesLP NULL +#define dma_set_mask_pSeriesLP NULL +#endif /* !CONFIG_PCI */ + +static int iommu_mem_notifier(struct notifier_block *nb, unsigned long action, + void *data) +{ + struct direct_window *window; + struct memory_notify *arg = data; + int ret = 0; + + switch (action) { + case MEM_GOING_ONLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_setrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + ret |= tce_clearrange_multi_pSeriesLP(arg->start_pfn, + arg->nr_pages, window->prop); + /* XXX log error */ + } + spin_unlock(&direct_window_list_lock); + break; + default: + break; + } + if (ret && action != MEM_CANCEL_ONLINE) + return NOTIFY_BAD; + + return NOTIFY_OK; +} + +static struct notifier_block iommu_mem_nb = { + .notifier_call = iommu_mem_notifier, +}; + +static int iommu_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) +{ + int err = NOTIFY_OK; + struct device_node *np = node; + struct pci_dn *pci = PCI_DN(np); + struct direct_window *window; + + switch (action) { + case PSERIES_RECONFIG_REMOVE: + if (pci && pci->iommu_table) + iommu_free_table(pci->iommu_table, np->full_name); + + spin_lock(&direct_window_list_lock); + list_for_each_entry(window, &direct_window_list, list) { + if (window->device == np) { + list_del(&window->list); + kfree(window); + break; + } + } + spin_unlock(&direct_window_list_lock); + + /* + * Because the notifier runs after isolation of the + * slot, we are guaranteed any DMA window has already + * been revoked and the TCEs have been marked invalid, + * so we don't need a call to remove_ddw(np). However, + * if an additional notifier action is added before the + * isolate call, we should update this code for + * completeness with such a call. + */ + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block iommu_reconfig_nb = { + .notifier_call = iommu_reconfig_notifier, +}; + +/* These are called very early. */ +void iommu_init_early_pSeries(void) +{ + if (of_chosen && of_get_property(of_chosen, "linux,iommu-off", NULL)) + return; + + if (firmware_has_feature(FW_FEATURE_LPAR)) { + if (firmware_has_feature(FW_FEATURE_MULTITCE)) { + ppc_md.tce_build = tce_buildmulti_pSeriesLP; + ppc_md.tce_free = tce_freemulti_pSeriesLP; + } else { + ppc_md.tce_build = tce_build_pSeriesLP; + ppc_md.tce_free = tce_free_pSeriesLP; + } + ppc_md.tce_get = tce_get_pSeriesLP; + ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeriesLP; + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeriesLP; + ppc_md.dma_set_mask = dma_set_mask_pSeriesLP; + } else { + ppc_md.tce_build = tce_build_pSeries; + ppc_md.tce_free = tce_free_pSeries; + ppc_md.tce_get = tce_get_pseries; + ppc_md.pci_dma_bus_setup = pci_dma_bus_setup_pSeries; + ppc_md.pci_dma_dev_setup = pci_dma_dev_setup_pSeries; + } + + + pSeries_reconfig_notifier_register(&iommu_reconfig_nb); + register_memory_notifier(&iommu_mem_nb); + + set_pci_dma_ops(&dma_iommu_ops); +} + +static int __init disable_multitce(char *str) +{ + if (strcmp(str, "off") == 0 && + firmware_has_feature(FW_FEATURE_LPAR) && + firmware_has_feature(FW_FEATURE_MULTITCE)) { + printk(KERN_INFO "Disabling MULTITCE firmware feature\n"); + ppc_md.tce_build = tce_build_pSeriesLP; + ppc_md.tce_free = tce_free_pSeriesLP; + powerpc_firmware_features &= ~FW_FEATURE_MULTITCE; + } + return 1; +} + +__setup("multitce=", disable_multitce); diff --git a/arch/powerpc/platforms/pseries/kexec.c b/arch/powerpc/platforms/pseries/kexec.c new file mode 100644 index 00000000..1118cb79 --- /dev/null +++ b/arch/powerpc/platforms/pseries/kexec.c @@ -0,0 +1,77 @@ +/* + * Copyright 2006 Michael Ellerman, IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include <linux/kernel.h> +#include <linux/interrupt.h> + +#include <asm/machdep.h> +#include <asm/page.h> +#include <asm/firmware.h> +#include <asm/kexec.h> +#include <asm/mpic.h> +#include <asm/xics.h> +#include <asm/smp.h> + +#include "pseries.h" +#include "plpar_wrappers.h" + +static void pseries_kexec_cpu_down(int crash_shutdown, int secondary) +{ + /* Don't risk a hypervisor call if we're crashing */ + if (firmware_has_feature(FW_FEATURE_SPLPAR) && !crash_shutdown) { + unsigned long addr; + int ret; + + if (get_lppaca()->dtl_enable_mask) { + ret = unregister_dtl(hard_smp_processor_id()); + if (ret) { + pr_err("WARNING: DTL deregistration for cpu " + "%d (hw %d) failed with %d\n", + smp_processor_id(), + hard_smp_processor_id(), ret); + } + } + + addr = __pa(get_slb_shadow()); + if (unregister_slb_shadow(hard_smp_processor_id(), addr)) + printk("SLB shadow buffer deregistration of " + "cpu %u (hw_cpu_id %d) failed\n", + smp_processor_id(), + hard_smp_processor_id()); + + addr = __pa(get_lppaca()); + if (unregister_vpa(hard_smp_processor_id(), addr)) { + printk("VPA deregistration of cpu %u (hw_cpu_id %d) " + "failed\n", smp_processor_id(), + hard_smp_processor_id()); + } + } +} + +static void pseries_kexec_cpu_down_mpic(int crash_shutdown, int secondary) +{ + pseries_kexec_cpu_down(crash_shutdown, secondary); + mpic_teardown_this_cpu(secondary); +} + +void __init setup_kexec_cpu_down_mpic(void) +{ + ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_mpic; +} + +static void pseries_kexec_cpu_down_xics(int crash_shutdown, int secondary) +{ + pseries_kexec_cpu_down(crash_shutdown, secondary); + xics_kexec_teardown_cpu(secondary); +} + +void __init setup_kexec_cpu_down_xics(void) +{ + ppc_md.kexec_cpu_down = pseries_kexec_cpu_down_xics; +} diff --git a/arch/powerpc/platforms/pseries/lpar.c b/arch/powerpc/platforms/pseries/lpar.c new file mode 100644 index 00000000..81e30d96 --- /dev/null +++ b/arch/powerpc/platforms/pseries/lpar.c @@ -0,0 +1,821 @@ +/* + * pSeries_lpar.c + * Copyright (C) 2001 Todd Inglett, IBM Corporation + * + * pSeries LPAR support. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Enables debugging of low-level hash table routines - careful! */ +#undef DEBUG + +#include <linux/kernel.h> +#include <linux/dma-mapping.h> +#include <linux/console.h> +#include <asm/processor.h> +#include <asm/mmu.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/machdep.h> +#include <asm/abs_addr.h> +#include <asm/mmu_context.h> +#include <asm/iommu.h> +#include <asm/tlbflush.h> +#include <asm/tlb.h> +#include <asm/prom.h> +#include <asm/cputable.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/trace.h> + +#include "plpar_wrappers.h" +#include "pseries.h" + + +/* in hvCall.S */ +EXPORT_SYMBOL(plpar_hcall); +EXPORT_SYMBOL(plpar_hcall9); +EXPORT_SYMBOL(plpar_hcall_norets); + +extern void pSeries_find_serial_port(void); + + +static int vtermno; /* virtual terminal# for udbg */ + +#define __ALIGNED__ __attribute__((__aligned__(sizeof(long)))) +static void udbg_hvsi_putc(char c) +{ + /* packet's seqno isn't used anyways */ + uint8_t packet[] __ALIGNED__ = { 0xff, 5, 0, 0, c }; + int rc; + + if (c == '\n') + udbg_hvsi_putc('\r'); + + do { + rc = plpar_put_term_char(vtermno, sizeof(packet), packet); + } while (rc == H_BUSY); +} + +static long hvsi_udbg_buf_len; +static uint8_t hvsi_udbg_buf[256]; + +static int udbg_hvsi_getc_poll(void) +{ + unsigned char ch; + int rc, i; + + if (hvsi_udbg_buf_len == 0) { + rc = plpar_get_term_char(vtermno, &hvsi_udbg_buf_len, hvsi_udbg_buf); + if (rc != H_SUCCESS || hvsi_udbg_buf[0] != 0xff) { + /* bad read or non-data packet */ + hvsi_udbg_buf_len = 0; + } else { + /* remove the packet header */ + for (i = 4; i < hvsi_udbg_buf_len; i++) + hvsi_udbg_buf[i-4] = hvsi_udbg_buf[i]; + hvsi_udbg_buf_len -= 4; + } + } + + if (hvsi_udbg_buf_len <= 0 || hvsi_udbg_buf_len > 256) { + /* no data ready */ + hvsi_udbg_buf_len = 0; + return -1; + } + + ch = hvsi_udbg_buf[0]; + /* shift remaining data down */ + for (i = 1; i < hvsi_udbg_buf_len; i++) { + hvsi_udbg_buf[i-1] = hvsi_udbg_buf[i]; + } + hvsi_udbg_buf_len--; + + return ch; +} + +static int udbg_hvsi_getc(void) +{ + int ch; + for (;;) { + ch = udbg_hvsi_getc_poll(); + if (ch == -1) { + /* This shouldn't be needed...but... */ + volatile unsigned long delay; + for (delay=0; delay < 2000000; delay++) + ; + } else { + return ch; + } + } +} + +static void udbg_putcLP(char c) +{ + char buf[16]; + unsigned long rc; + + if (c == '\n') + udbg_putcLP('\r'); + + buf[0] = c; + do { + rc = plpar_put_term_char(vtermno, 1, buf); + } while(rc == H_BUSY); +} + +/* Buffered chars getc */ +static long inbuflen; +static long inbuf[2]; /* must be 2 longs */ + +static int udbg_getc_pollLP(void) +{ + /* The interface is tricky because it may return up to 16 chars. + * We save them statically for future calls to udbg_getc(). + */ + char ch, *buf = (char *)inbuf; + int i; + long rc; + if (inbuflen == 0) { + /* get some more chars. */ + inbuflen = 0; + rc = plpar_get_term_char(vtermno, &inbuflen, buf); + if (rc != H_SUCCESS) + inbuflen = 0; /* otherwise inbuflen is garbage */ + } + if (inbuflen <= 0 || inbuflen > 16) { + /* Catch error case as well as other oddities (corruption) */ + inbuflen = 0; + return -1; + } + ch = buf[0]; + for (i = 1; i < inbuflen; i++) /* shuffle them down. */ + buf[i-1] = buf[i]; + inbuflen--; + return ch; +} + +static int udbg_getcLP(void) +{ + int ch; + for (;;) { + ch = udbg_getc_pollLP(); + if (ch == -1) { + /* This shouldn't be needed...but... */ + volatile unsigned long delay; + for (delay=0; delay < 2000000; delay++) + ; + } else { + return ch; + } + } +} + +/* call this from early_init() for a working debug console on + * vterm capable LPAR machines + */ +void __init udbg_init_debug_lpar(void) +{ + vtermno = 0; + udbg_putc = udbg_putcLP; + udbg_getc = udbg_getcLP; + udbg_getc_poll = udbg_getc_pollLP; + + register_early_udbg_console(); +} + +/* returns 0 if couldn't find or use /chosen/stdout as console */ +void __init find_udbg_vterm(void) +{ + struct device_node *stdout_node; + const u32 *termno; + const char *name; + + /* find the boot console from /chosen/stdout */ + if (!of_chosen) + return; + name = of_get_property(of_chosen, "linux,stdout-path", NULL); + if (name == NULL) + return; + stdout_node = of_find_node_by_path(name); + if (!stdout_node) + return; + name = of_get_property(stdout_node, "name", NULL); + if (!name) { + printk(KERN_WARNING "stdout node missing 'name' property!\n"); + goto out; + } + + /* Check if it's a virtual terminal */ + if (strncmp(name, "vty", 3) != 0) + goto out; + termno = of_get_property(stdout_node, "reg", NULL); + if (termno == NULL) + goto out; + vtermno = termno[0]; + + if (of_device_is_compatible(stdout_node, "hvterm1")) { + udbg_putc = udbg_putcLP; + udbg_getc = udbg_getcLP; + udbg_getc_poll = udbg_getc_pollLP; + add_preferred_console("hvc", termno[0] & 0xff, NULL); + } else if (of_device_is_compatible(stdout_node, "hvterm-protocol")) { + vtermno = termno[0]; + udbg_putc = udbg_hvsi_putc; + udbg_getc = udbg_hvsi_getc; + udbg_getc_poll = udbg_hvsi_getc_poll; + add_preferred_console("hvsi", termno[0] & 0xff, NULL); + } +out: + of_node_put(stdout_node); +} + +void vpa_init(int cpu) +{ + int hwcpu = get_hard_smp_processor_id(cpu); + unsigned long addr; + long ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + + if (cpu_has_feature(CPU_FTR_ALTIVEC)) + lppaca_of(cpu).vmxregs_in_use = 1; + + addr = __pa(&lppaca_of(cpu)); + ret = register_vpa(hwcpu, addr); + + if (ret) { + printk(KERN_ERR "WARNING: vpa_init: VPA registration for " + "cpu %d (hw %d) of area %lx returns %ld\n", + cpu, hwcpu, addr, ret); + return; + } + /* + * PAPR says this feature is SLB-Buffer but firmware never + * reports that. All SPLPAR support SLB shadow buffer. + */ + addr = __pa(&slb_shadow[cpu]); + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + ret = register_slb_shadow(hwcpu, addr); + if (ret) + printk(KERN_ERR + "WARNING: vpa_init: SLB shadow buffer " + "registration for cpu %d (hw %d) of area %lx " + "returns %ld\n", cpu, hwcpu, addr, ret); + } + + /* + * Register dispatch trace log, if one has been allocated. + */ + pp = &paca[cpu]; + dtl = pp->dispatch_log; + if (dtl) { + pp->dtl_ridx = 0; + pp->dtl_curr = dtl; + lppaca_of(cpu).dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hwcpu, __pa(dtl)); + if (ret) + pr_warn("DTL registration failed for cpu %d (%ld)\n", + cpu, ret); + lppaca_of(cpu).dtl_enable_mask = 2; + } +} + +static long pSeries_lpar_hpte_insert(unsigned long hpte_group, + unsigned long va, unsigned long pa, + unsigned long rflags, unsigned long vflags, + int psize, int ssize) +{ + unsigned long lpar_rc; + unsigned long flags; + unsigned long slot; + unsigned long hpte_v, hpte_r; + + if (!(vflags & HPTE_V_BOLTED)) + pr_devel("hpte_insert(group=%lx, va=%016lx, pa=%016lx, " + "rflags=%lx, vflags=%lx, psize=%d)\n", + hpte_group, va, pa, rflags, vflags, psize); + + hpte_v = hpte_encode_v(va, psize, ssize) | vflags | HPTE_V_VALID; + hpte_r = hpte_encode_r(pa, psize) | rflags; + + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" hpte_v=%016lx, hpte_r=%016lx\n", hpte_v, hpte_r); + + /* Now fill in the actual HPTE */ + /* Set CEC cookie to 0 */ + /* Zero page = 0 */ + /* I-cache Invalidate = 0 */ + /* I-cache synchronize = 0 */ + /* Exact = 0 */ + flags = 0; + + /* Make pHyp happy */ + if ((rflags & _PAGE_NO_CACHE) & !(rflags & _PAGE_WRITETHRU)) + hpte_r &= ~_PAGE_COHERENT; + if (firmware_has_feature(FW_FEATURE_XCMO) && !(hpte_r & HPTE_R_N)) + flags |= H_COALESCE_CAND; + + lpar_rc = plpar_pte_enter(flags, hpte_group, hpte_v, hpte_r, &slot); + if (unlikely(lpar_rc == H_PTEG_FULL)) { + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" full\n"); + return -1; + } + + /* + * Since we try and ioremap PHBs we don't own, the pte insert + * will fail. However we must catch the failure in hash_page + * or we will loop forever, so return -2 in this case. + */ + if (unlikely(lpar_rc != H_SUCCESS)) { + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" lpar err %lu\n", lpar_rc); + return -2; + } + if (!(vflags & HPTE_V_BOLTED)) + pr_devel(" -> slot: %lu\n", slot & 7); + + /* Because of iSeries, we have to pass down the secondary + * bucket bit here as well + */ + return (slot & 7) | (!!(vflags & HPTE_V_SECONDARY) << 3); +} + +static DEFINE_SPINLOCK(pSeries_lpar_tlbie_lock); + +static long pSeries_lpar_hpte_remove(unsigned long hpte_group) +{ + unsigned long slot_offset; + unsigned long lpar_rc; + int i; + unsigned long dummy1, dummy2; + + /* pick a random slot to start at */ + slot_offset = mftb() & 0x7; + + for (i = 0; i < HPTES_PER_GROUP; i++) { + + /* don't remove a bolted entry */ + lpar_rc = plpar_pte_remove(H_ANDCOND, hpte_group + slot_offset, + (0x1UL << 4), &dummy1, &dummy2); + if (lpar_rc == H_SUCCESS) + return i; + BUG_ON(lpar_rc != H_NOT_FOUND); + + slot_offset++; + slot_offset &= 0x7; + } + + return -1; +} + +static void pSeries_lpar_hptab_clear(void) +{ + unsigned long size_bytes = 1UL << ppc64_pft_size; + unsigned long hpte_count = size_bytes >> 4; + struct { + unsigned long pteh; + unsigned long ptel; + } ptes[4]; + long lpar_rc; + unsigned long i, j; + + /* Read in batches of 4, + * invalidate only valid entries not in the VRMA + * hpte_count will be a multiple of 4 + */ + for (i = 0; i < hpte_count; i += 4) { + lpar_rc = plpar_pte_read_4_raw(0, i, (void *)ptes); + if (lpar_rc != H_SUCCESS) + continue; + for (j = 0; j < 4; j++){ + if ((ptes[j].pteh & HPTE_V_VRMA_MASK) == + HPTE_V_VRMA_MASK) + continue; + if (ptes[j].pteh & HPTE_V_VALID) + plpar_pte_remove_raw(0, i + j, 0, + &(ptes[j].pteh), &(ptes[j].ptel)); + } + } +} + +/* + * This computes the AVPN and B fields of the first dword of a HPTE, + * for use when we want to match an existing PTE. The bottom 7 bits + * of the returned value are zero. + */ +static inline unsigned long hpte_encode_avpn(unsigned long va, int psize, + int ssize) +{ + unsigned long v; + + v = (va >> 23) & ~(mmu_psize_defs[psize].avpnm); + v <<= HPTE_V_AVPN_SHIFT; + v |= ((unsigned long) ssize) << HPTE_V_SSIZE_SHIFT; + return v; +} + +/* + * NOTE: for updatepp ops we are fortunate that the linux "newpp" bits and + * the low 3 bits of flags happen to line up. So no transform is needed. + * We can probably optimize here and assume the high bits of newpp are + * already zero. For now I am paranoid. + */ +static long pSeries_lpar_hpte_updatepp(unsigned long slot, + unsigned long newpp, + unsigned long va, + int psize, int ssize, int local) +{ + unsigned long lpar_rc; + unsigned long flags = (newpp & 7) | H_AVPN; + unsigned long want_v; + + want_v = hpte_encode_avpn(va, psize, ssize); + + pr_devel(" update: avpnv=%016lx, hash=%016lx, f=%lx, psize: %d ...", + want_v, slot, flags, psize); + + lpar_rc = plpar_pte_protect(flags, slot, want_v); + + if (lpar_rc == H_NOT_FOUND) { + pr_devel("not found !\n"); + return -1; + } + + pr_devel("ok\n"); + + BUG_ON(lpar_rc != H_SUCCESS); + + return 0; +} + +static unsigned long pSeries_lpar_hpte_getword0(unsigned long slot) +{ + unsigned long dword0; + unsigned long lpar_rc; + unsigned long dummy_word1; + unsigned long flags; + + /* Read 1 pte at a time */ + /* Do not need RPN to logical page translation */ + /* No cross CEC PFT access */ + flags = 0; + + lpar_rc = plpar_pte_read(flags, slot, &dword0, &dummy_word1); + + BUG_ON(lpar_rc != H_SUCCESS); + + return dword0; +} + +static long pSeries_lpar_hpte_find(unsigned long va, int psize, int ssize) +{ + unsigned long hash; + unsigned long i; + long slot; + unsigned long want_v, hpte_v; + + hash = hpt_hash(va, mmu_psize_defs[psize].shift, ssize); + want_v = hpte_encode_avpn(va, psize, ssize); + + /* Bolted entries are always in the primary group */ + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + for (i = 0; i < HPTES_PER_GROUP; i++) { + hpte_v = pSeries_lpar_hpte_getword0(slot); + + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + /* HPTE matches */ + return slot; + ++slot; + } + + return -1; +} + +static void pSeries_lpar_hpte_updateboltedpp(unsigned long newpp, + unsigned long ea, + int psize, int ssize) +{ + unsigned long lpar_rc, slot, vsid, va, flags; + + vsid = get_kernel_vsid(ea, ssize); + va = hpt_va(ea, vsid, ssize); + + slot = pSeries_lpar_hpte_find(va, psize, ssize); + BUG_ON(slot == -1); + + flags = newpp & 7; + lpar_rc = plpar_pte_protect(flags, slot, 0); + + BUG_ON(lpar_rc != H_SUCCESS); +} + +static void pSeries_lpar_hpte_invalidate(unsigned long slot, unsigned long va, + int psize, int ssize, int local) +{ + unsigned long want_v; + unsigned long lpar_rc; + unsigned long dummy1, dummy2; + + pr_devel(" inval : slot=%lx, va=%016lx, psize: %d, local: %d\n", + slot, va, psize, local); + + want_v = hpte_encode_avpn(va, psize, ssize); + lpar_rc = plpar_pte_remove(H_AVPN, slot, want_v, &dummy1, &dummy2); + if (lpar_rc == H_NOT_FOUND) + return; + + BUG_ON(lpar_rc != H_SUCCESS); +} + +static void pSeries_lpar_hpte_removebolted(unsigned long ea, + int psize, int ssize) +{ + unsigned long slot, vsid, va; + + vsid = get_kernel_vsid(ea, ssize); + va = hpt_va(ea, vsid, ssize); + + slot = pSeries_lpar_hpte_find(va, psize, ssize); + BUG_ON(slot == -1); + + pSeries_lpar_hpte_invalidate(slot, va, psize, ssize, 0); +} + +/* Flag bits for H_BULK_REMOVE */ +#define HBR_REQUEST 0x4000000000000000UL +#define HBR_RESPONSE 0x8000000000000000UL +#define HBR_END 0xc000000000000000UL +#define HBR_AVPN 0x0200000000000000UL +#define HBR_ANDCOND 0x0100000000000000UL + +/* + * Take a spinlock around flushes to avoid bouncing the hypervisor tlbie + * lock. + */ +static void pSeries_lpar_flush_hash_range(unsigned long number, int local) +{ + unsigned long i, pix, rc; + unsigned long flags = 0; + struct ppc64_tlb_batch *batch = &__get_cpu_var(ppc64_tlb_batch); + int lock_tlbie = !mmu_has_feature(MMU_FTR_LOCKLESS_TLBIE); + unsigned long param[9]; + unsigned long va; + unsigned long hash, index, shift, hidx, slot; + real_pte_t pte; + int psize, ssize; + + if (lock_tlbie) + spin_lock_irqsave(&pSeries_lpar_tlbie_lock, flags); + + psize = batch->psize; + ssize = batch->ssize; + pix = 0; + for (i = 0; i < number; i++) { + va = batch->vaddr[i]; + pte = batch->pte[i]; + pte_iterate_hashed_subpages(pte, psize, va, index, shift) { + hash = hpt_hash(va, shift, ssize); + hidx = __rpte_to_hidx(pte, index); + if (hidx & _PTEIDX_SECONDARY) + hash = ~hash; + slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; + slot += hidx & _PTEIDX_GROUP_IX; + if (!firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + pSeries_lpar_hpte_invalidate(slot, va, psize, + ssize, local); + } else { + param[pix] = HBR_REQUEST | HBR_AVPN | slot; + param[pix+1] = hpte_encode_avpn(va, psize, + ssize); + pix += 2; + if (pix == 8) { + rc = plpar_hcall9(H_BULK_REMOVE, param, + param[0], param[1], param[2], + param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + pix = 0; + } + } + } pte_iterate_hashed_end(); + } + if (pix) { + param[pix] = HBR_END; + rc = plpar_hcall9(H_BULK_REMOVE, param, param[0], param[1], + param[2], param[3], param[4], param[5], + param[6], param[7]); + BUG_ON(rc != H_SUCCESS); + } + + if (lock_tlbie) + spin_unlock_irqrestore(&pSeries_lpar_tlbie_lock, flags); +} + +static int __init disable_bulk_remove(char *str) +{ + if (strcmp(str, "off") == 0 && + firmware_has_feature(FW_FEATURE_BULK_REMOVE)) { + printk(KERN_INFO "Disabling BULK_REMOVE firmware feature"); + powerpc_firmware_features &= ~FW_FEATURE_BULK_REMOVE; + } + return 1; +} + +__setup("bulk_remove=", disable_bulk_remove); + +void __init hpte_init_lpar(void) +{ + ppc_md.hpte_invalidate = pSeries_lpar_hpte_invalidate; + ppc_md.hpte_updatepp = pSeries_lpar_hpte_updatepp; + ppc_md.hpte_updateboltedpp = pSeries_lpar_hpte_updateboltedpp; + ppc_md.hpte_insert = pSeries_lpar_hpte_insert; + ppc_md.hpte_remove = pSeries_lpar_hpte_remove; + ppc_md.hpte_removebolted = pSeries_lpar_hpte_removebolted; + ppc_md.flush_hash_range = pSeries_lpar_flush_hash_range; + ppc_md.hpte_clear_all = pSeries_lpar_hptab_clear; +} + +#ifdef CONFIG_PPC_SMLPAR +#define CMO_FREE_HINT_DEFAULT 1 +static int cmo_free_hint_flag = CMO_FREE_HINT_DEFAULT; + +static int __init cmo_free_hint(char *str) +{ + char *parm; + parm = strstrip(str); + + if (strcasecmp(parm, "no") == 0 || strcasecmp(parm, "off") == 0) { + printk(KERN_INFO "cmo_free_hint: CMO free page hinting is not active.\n"); + cmo_free_hint_flag = 0; + return 1; + } + + cmo_free_hint_flag = 1; + printk(KERN_INFO "cmo_free_hint: CMO free page hinting is active.\n"); + + if (strcasecmp(parm, "yes") == 0 || strcasecmp(parm, "on") == 0) + return 1; + + return 0; +} + +__setup("cmo_free_hint=", cmo_free_hint); + +static void pSeries_set_page_state(struct page *page, int order, + unsigned long state) +{ + int i, j; + unsigned long cmo_page_sz, addr; + + cmo_page_sz = cmo_get_page_size(); + addr = __pa((unsigned long)page_address(page)); + + for (i = 0; i < (1 << order); i++, addr += PAGE_SIZE) { + for (j = 0; j < PAGE_SIZE; j += cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, state, addr + j, 0); + } +} + +void arch_free_page(struct page *page, int order) +{ + if (!cmo_free_hint_flag || !firmware_has_feature(FW_FEATURE_CMO)) + return; + + pSeries_set_page_state(page, order, H_PAGE_SET_UNUSED); +} +EXPORT_SYMBOL(arch_free_page); + +#endif + +#ifdef CONFIG_TRACEPOINTS +/* + * We optimise our hcall path by placing hcall_tracepoint_refcount + * directly in the TOC so we can check if the hcall tracepoints are + * enabled via a single load. + */ + +/* NB: reg/unreg are called while guarded with the tracepoints_mutex */ +extern long hcall_tracepoint_refcount; + +/* + * Since the tracing code might execute hcalls we need to guard against + * recursion. One example of this are spinlocks calling H_YIELD on + * shared processor partitions. + */ +static DEFINE_PER_CPU(unsigned int, hcall_trace_depth); + +void hcall_tracepoint_regfunc(void) +{ + hcall_tracepoint_refcount++; +} + +void hcall_tracepoint_unregfunc(void) +{ + hcall_tracepoint_refcount--; +} + +void __trace_hcall_entry(unsigned long opcode, unsigned long *args) +{ + unsigned long flags; + unsigned int *depth; + + local_irq_save(flags); + + depth = &__get_cpu_var(hcall_trace_depth); + + if (*depth) + goto out; + + (*depth)++; + preempt_disable(); + trace_hcall_entry(opcode, args); + (*depth)--; + +out: + local_irq_restore(flags); +} + +void __trace_hcall_exit(long opcode, unsigned long retval, + unsigned long *retbuf) +{ + unsigned long flags; + unsigned int *depth; + + local_irq_save(flags); + + depth = &__get_cpu_var(hcall_trace_depth); + + if (*depth) + goto out; + + (*depth)++; + trace_hcall_exit(opcode, retval, retbuf); + preempt_enable(); + (*depth)--; + +out: + local_irq_restore(flags); +} +#endif + +/** + * h_get_mpp + * H_GET_MPP hcall returns info in 7 parms + */ +int h_get_mpp(struct hvcall_mpp_data *mpp_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9(H_GET_MPP, retbuf); + + mpp_data->entitled_mem = retbuf[0]; + mpp_data->mapped_mem = retbuf[1]; + + mpp_data->group_num = (retbuf[2] >> 2 * 8) & 0xffff; + mpp_data->pool_num = retbuf[2] & 0xffff; + + mpp_data->mem_weight = (retbuf[3] >> 7 * 8) & 0xff; + mpp_data->unallocated_mem_weight = (retbuf[3] >> 6 * 8) & 0xff; + mpp_data->unallocated_entitlement = retbuf[3] & 0xffffffffffff; + + mpp_data->pool_size = retbuf[4]; + mpp_data->loan_request = retbuf[5]; + mpp_data->backing_mem = retbuf[6]; + + return rc; +} +EXPORT_SYMBOL(h_get_mpp); + +int h_get_mpp_x(struct hvcall_mpp_x_data *mpp_x_data) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE] = { 0 }; + + rc = plpar_hcall9(H_GET_MPP_X, retbuf); + + mpp_x_data->coalesced_bytes = retbuf[0]; + mpp_x_data->pool_coalesced_bytes = retbuf[1]; + mpp_x_data->pool_purr_cycles = retbuf[2]; + mpp_x_data->pool_spurr_cycles = retbuf[3]; + + return rc; +} diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c new file mode 100644 index 00000000..3e7f651e --- /dev/null +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -0,0 +1,362 @@ +/* + * Support for Partition Mobility/Migration + * + * Copyright (C) 2010 Nathan Fontenot + * Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kobject.h> +#include <linux/smp.h> +#include <linux/completion.h> +#include <linux/device.h> +#include <linux/delay.h> +#include <linux/slab.h> + +#include <asm/rtas.h> +#include "pseries.h" + +static struct kobject *mobility_kobj; + +struct update_props_workarea { + u32 phandle; + u32 state; + u64 reserved; + u32 nprops; +}; + +#define NODE_ACTION_MASK 0xff000000 +#define NODE_COUNT_MASK 0x00ffffff + +#define DELETE_DT_NODE 0x01000000 +#define UPDATE_DT_NODE 0x02000000 +#define ADD_DT_NODE 0x03000000 + +static int mobility_rtas_call(int token, char *buf) +{ + int rc; + + spin_lock(&rtas_data_buf_lock); + + memcpy(rtas_data_buf, buf, RTAS_DATA_BUF_SIZE); + rc = rtas_call(token, 2, 1, NULL, rtas_data_buf, 1); + memcpy(buf, rtas_data_buf, RTAS_DATA_BUF_SIZE); + + spin_unlock(&rtas_data_buf_lock); + return rc; +} + +static int delete_dt_node(u32 phandle) +{ + struct device_node *dn; + + dn = of_find_node_by_phandle(phandle); + if (!dn) + return -ENOENT; + + dlpar_detach_node(dn); + return 0; +} + +static int update_dt_property(struct device_node *dn, struct property **prop, + const char *name, u32 vd, char *value) +{ + struct property *new_prop = *prop; + struct property *old_prop; + int more = 0; + + /* A negative 'vd' value indicates that only part of the new property + * value is contained in the buffer and we need to call + * ibm,update-properties again to get the rest of the value. + * + * A negative value is also the two's compliment of the actual value. + */ + if (vd & 0x80000000) { + vd = ~vd + 1; + more = 1; + } + + if (new_prop) { + /* partial property fixup */ + char *new_data = kzalloc(new_prop->length + vd, GFP_KERNEL); + if (!new_data) + return -ENOMEM; + + memcpy(new_data, new_prop->value, new_prop->length); + memcpy(new_data + new_prop->length, value, vd); + + kfree(new_prop->value); + new_prop->value = new_data; + new_prop->length += vd; + } else { + new_prop = kzalloc(sizeof(*new_prop), GFP_KERNEL); + if (!new_prop) + return -ENOMEM; + + new_prop->name = kstrdup(name, GFP_KERNEL); + if (!new_prop->name) { + kfree(new_prop); + return -ENOMEM; + } + + new_prop->length = vd; + new_prop->value = kzalloc(new_prop->length, GFP_KERNEL); + if (!new_prop->value) { + kfree(new_prop->name); + kfree(new_prop); + return -ENOMEM; + } + + memcpy(new_prop->value, value, vd); + *prop = new_prop; + } + + if (!more) { + old_prop = of_find_property(dn, new_prop->name, NULL); + if (old_prop) + prom_update_property(dn, new_prop, old_prop); + else + prom_add_property(dn, new_prop); + + new_prop = NULL; + } + + return 0; +} + +static int update_dt_node(u32 phandle) +{ + struct update_props_workarea *upwa; + struct device_node *dn; + struct property *prop = NULL; + int i, rc; + char *prop_data; + char *rtas_buf; + int update_properties_token; + + update_properties_token = rtas_token("ibm,update-properties"); + if (update_properties_token == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!rtas_buf) + return -ENOMEM; + + dn = of_find_node_by_phandle(phandle); + if (!dn) { + kfree(rtas_buf); + return -ENOENT; + } + + upwa = (struct update_props_workarea *)&rtas_buf[0]; + upwa->phandle = phandle; + + do { + rc = mobility_rtas_call(update_properties_token, rtas_buf); + if (rc < 0) + break; + + prop_data = rtas_buf + sizeof(*upwa); + + for (i = 0; i < upwa->nprops; i++) { + char *prop_name; + u32 vd; + + prop_name = prop_data + 1; + prop_data += strlen(prop_name) + 1; + vd = *prop_data++; + + switch (vd) { + case 0x00000000: + /* name only property, nothing to do */ + break; + + case 0x80000000: + prop = of_find_property(dn, prop_name, NULL); + prom_remove_property(dn, prop); + prop = NULL; + break; + + default: + rc = update_dt_property(dn, &prop, prop_name, + vd, prop_data); + if (rc) { + printk(KERN_ERR "Could not update %s" + " property\n", prop_name); + } + + prop_data += vd; + } + } + } while (rc == 1); + + of_node_put(dn); + kfree(rtas_buf); + return 0; +} + +static int add_dt_node(u32 parent_phandle, u32 drc_index) +{ + struct device_node *dn; + struct device_node *parent_dn; + int rc; + + dn = dlpar_configure_connector(drc_index); + if (!dn) + return -ENOENT; + + parent_dn = of_find_node_by_phandle(parent_phandle); + if (!parent_dn) { + dlpar_free_cc_nodes(dn); + return -ENOENT; + } + + dn->parent = parent_dn; + rc = dlpar_attach_node(dn); + if (rc) + dlpar_free_cc_nodes(dn); + + of_node_put(parent_dn); + return rc; +} + +static int pseries_devicetree_update(void) +{ + char *rtas_buf; + u32 *data; + int update_nodes_token; + int rc; + + update_nodes_token = rtas_token("ibm,update-nodes"); + if (update_nodes_token == RTAS_UNKNOWN_SERVICE) + return -EINVAL; + + rtas_buf = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!rtas_buf) + return -ENOMEM; + + do { + rc = mobility_rtas_call(update_nodes_token, rtas_buf); + if (rc && rc != 1) + break; + + data = (u32 *)rtas_buf + 4; + while (*data & NODE_ACTION_MASK) { + int i; + u32 action = *data & NODE_ACTION_MASK; + int node_count = *data & NODE_COUNT_MASK; + + data++; + + for (i = 0; i < node_count; i++) { + u32 phandle = *data++; + u32 drc_index; + + switch (action) { + case DELETE_DT_NODE: + delete_dt_node(phandle); + break; + case UPDATE_DT_NODE: + update_dt_node(phandle); + break; + case ADD_DT_NODE: + drc_index = *data++; + add_dt_node(phandle, drc_index); + break; + } + } + } + } while (rc == 1); + + kfree(rtas_buf); + return rc; +} + +void post_mobility_fixup(void) +{ + int rc; + int activate_fw_token; + + rc = pseries_devicetree_update(); + if (rc) { + printk(KERN_ERR "Initial post-mobility device tree update " + "failed: %d\n", rc); + return; + } + + activate_fw_token = rtas_token("ibm,activate-firmware"); + if (activate_fw_token == RTAS_UNKNOWN_SERVICE) { + printk(KERN_ERR "Could not make post-mobility " + "activate-fw call.\n"); + return; + } + + rc = rtas_call(activate_fw_token, 0, 1, NULL); + if (!rc) { + rc = pseries_devicetree_update(); + if (rc) + printk(KERN_ERR "Secondary post-mobility device tree " + "update failed: %d\n", rc); + } else { + printk(KERN_ERR "Post-mobility activate-fw failed: %d\n", rc); + return; + } + + return; +} + +static ssize_t migrate_store(struct class *class, struct class_attribute *attr, + const char *buf, size_t count) +{ + struct rtas_args args; + u64 streamid; + int rc; + + rc = strict_strtoull(buf, 0, &streamid); + if (rc) + return rc; + + memset(&args, 0, sizeof(args)); + args.token = rtas_token("ibm,suspend-me"); + args.nargs = 2; + args.nret = 1; + + args.args[0] = streamid >> 32 ; + args.args[1] = streamid & 0xffffffff; + args.rets = &args.args[args.nargs]; + + do { + args.rets[0] = 0; + rc = rtas_ibm_suspend_me(&args); + if (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE) + ssleep(1); + } while (!rc && args.rets[0] == RTAS_NOT_SUSPENDABLE); + + if (rc) + return rc; + else if (args.rets[0]) + return args.rets[0]; + + post_mobility_fixup(); + return count; +} + +static CLASS_ATTR(migration, S_IWUSR, NULL, migrate_store); + +static int __init mobility_sysfs_init(void) +{ + int rc; + + mobility_kobj = kobject_create_and_add("mobility", kernel_kobj); + if (!mobility_kobj) + return -ENOMEM; + + rc = sysfs_create_file(mobility_kobj, &class_attr_migration.attr); + + return rc; +} +device_initcall(mobility_sysfs_init); diff --git a/arch/powerpc/platforms/pseries/msi.c b/arch/powerpc/platforms/pseries/msi.c new file mode 100644 index 00000000..38d24e7e --- /dev/null +++ b/arch/powerpc/platforms/pseries/msi.c @@ -0,0 +1,491 @@ +/* + * Copyright 2006 Jake Moilanen <moilanen@austin.ibm.com>, IBM Corp. + * Copyright 2006-2007 Michael Ellerman, IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; version 2 of the + * License. + * + */ + +#include <linux/device.h> +#include <linux/irq.h> +#include <linux/msi.h> + +#include <asm/rtas.h> +#include <asm/hw_irq.h> +#include <asm/ppc-pci.h> + +static int query_token, change_token; + +#define RTAS_QUERY_FN 0 +#define RTAS_CHANGE_FN 1 +#define RTAS_RESET_FN 2 +#define RTAS_CHANGE_MSI_FN 3 +#define RTAS_CHANGE_MSIX_FN 4 + +static struct pci_dn *get_pdn(struct pci_dev *pdev) +{ + struct device_node *dn; + struct pci_dn *pdn; + + dn = pci_device_to_OF_node(pdev); + if (!dn) { + dev_dbg(&pdev->dev, "rtas_msi: No OF device node\n"); + return NULL; + } + + pdn = PCI_DN(dn); + if (!pdn) { + dev_dbg(&pdev->dev, "rtas_msi: No PCI DN\n"); + return NULL; + } + + return pdn; +} + +/* RTAS Helpers */ + +static int rtas_change_msi(struct pci_dn *pdn, u32 func, u32 num_irqs) +{ + u32 addr, seq_num, rtas_ret[3]; + unsigned long buid; + int rc; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + buid = pdn->phb->buid; + + seq_num = 1; + do { + if (func == RTAS_CHANGE_MSI_FN || func == RTAS_CHANGE_MSIX_FN) + rc = rtas_call(change_token, 6, 4, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), + func, num_irqs, seq_num); + else + rc = rtas_call(change_token, 6, 3, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), + func, num_irqs, seq_num); + + seq_num = rtas_ret[1]; + } while (rtas_busy_delay(rc)); + + /* + * If the RTAS call succeeded, return the number of irqs allocated. + * If not, make sure we return a negative error code. + */ + if (rc == 0) + rc = rtas_ret[0]; + else if (rc > 0) + rc = -rc; + + pr_debug("rtas_msi: ibm,change_msi(func=%d,num=%d), got %d rc = %d\n", + func, num_irqs, rtas_ret[0], rc); + + return rc; +} + +static void rtas_disable_msi(struct pci_dev *pdev) +{ + struct pci_dn *pdn; + + pdn = get_pdn(pdev); + if (!pdn) + return; + + /* + * disabling MSI with the explicit interface also disables MSI-X + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, 0) != 0) { + /* + * may have failed because explicit interface is not + * present + */ + if (rtas_change_msi(pdn, RTAS_CHANGE_FN, 0) != 0) { + pr_debug("rtas_msi: Setting MSIs to 0 failed!\n"); + } + } +} + +static int rtas_query_irq_number(struct pci_dn *pdn, int offset) +{ + u32 addr, rtas_ret[2]; + unsigned long buid; + int rc; + + addr = rtas_config_addr(pdn->busno, pdn->devfn, 0); + buid = pdn->phb->buid; + + do { + rc = rtas_call(query_token, 4, 3, rtas_ret, addr, + BUID_HI(buid), BUID_LO(buid), offset); + } while (rtas_busy_delay(rc)); + + if (rc) { + pr_debug("rtas_msi: error (%d) querying source number\n", rc); + return rc; + } + + return rtas_ret[0]; +} + +static void rtas_teardown_msi_irqs(struct pci_dev *pdev) +{ + struct msi_desc *entry; + + list_for_each_entry(entry, &pdev->msi_list, list) { + if (entry->irq == NO_IRQ) + continue; + + irq_set_msi_desc(entry->irq, NULL); + irq_dispose_mapping(entry->irq); + } + + rtas_disable_msi(pdev); +} + +static int check_req(struct pci_dev *pdev, int nvec, char *prop_name) +{ + struct device_node *dn; + struct pci_dn *pdn; + const u32 *req_msi; + + pdn = get_pdn(pdev); + if (!pdn) + return -ENODEV; + + dn = pdn->node; + + req_msi = of_get_property(dn, prop_name, NULL); + if (!req_msi) { + pr_debug("rtas_msi: No %s on %s\n", prop_name, dn->full_name); + return -ENOENT; + } + + if (*req_msi < nvec) { + pr_debug("rtas_msi: %s requests < %d MSIs\n", prop_name, nvec); + + if (*req_msi == 0) /* Be paranoid */ + return -ENOSPC; + + return *req_msi; + } + + return 0; +} + +static int check_req_msi(struct pci_dev *pdev, int nvec) +{ + return check_req(pdev, nvec, "ibm,req#msi"); +} + +static int check_req_msix(struct pci_dev *pdev, int nvec) +{ + return check_req(pdev, nvec, "ibm,req#msi-x"); +} + +/* Quota calculation */ + +static struct device_node *find_pe_total_msi(struct pci_dev *dev, int *total) +{ + struct device_node *dn; + const u32 *p; + + dn = of_node_get(pci_device_to_OF_node(dev)); + while (dn) { + p = of_get_property(dn, "ibm,pe-total-#msi", NULL); + if (p) { + pr_debug("rtas_msi: found prop on dn %s\n", + dn->full_name); + *total = *p; + return dn; + } + + dn = of_get_next_parent(dn); + } + + return NULL; +} + +static struct device_node *find_pe_dn(struct pci_dev *dev, int *total) +{ + struct device_node *dn; + + /* Found our PE and assume 8 at that point. */ + + dn = pci_device_to_OF_node(dev); + if (!dn) + return NULL; + + dn = find_device_pe(dn); + if (!dn) + return NULL; + + /* We actually want the parent */ + dn = of_get_parent(dn); + if (!dn) + return NULL; + + /* Hardcode of 8 for old firmwares */ + *total = 8; + pr_debug("rtas_msi: using PE dn %s\n", dn->full_name); + + return dn; +} + +struct msi_counts { + struct device_node *requestor; + int num_devices; + int request; + int quota; + int spare; + int over_quota; +}; + +static void *count_non_bridge_devices(struct device_node *dn, void *data) +{ + struct msi_counts *counts = data; + const u32 *p; + u32 class; + + pr_debug("rtas_msi: counting %s\n", dn->full_name); + + p = of_get_property(dn, "class-code", NULL); + class = p ? *p : 0; + + if ((class >> 8) != PCI_CLASS_BRIDGE_PCI) + counts->num_devices++; + + return NULL; +} + +static void *count_spare_msis(struct device_node *dn, void *data) +{ + struct msi_counts *counts = data; + const u32 *p; + int req; + + if (dn == counts->requestor) + req = counts->request; + else { + /* We don't know if a driver will try to use MSI or MSI-X, + * so we just have to punt and use the larger of the two. */ + req = 0; + p = of_get_property(dn, "ibm,req#msi", NULL); + if (p) + req = *p; + + p = of_get_property(dn, "ibm,req#msi-x", NULL); + if (p) + req = max(req, (int)*p); + } + + if (req < counts->quota) + counts->spare += counts->quota - req; + else if (req > counts->quota) + counts->over_quota++; + + return NULL; +} + +static int msi_quota_for_device(struct pci_dev *dev, int request) +{ + struct device_node *pe_dn; + struct msi_counts counts; + int total; + + pr_debug("rtas_msi: calc quota for %s, request %d\n", pci_name(dev), + request); + + pe_dn = find_pe_total_msi(dev, &total); + if (!pe_dn) + pe_dn = find_pe_dn(dev, &total); + + if (!pe_dn) { + pr_err("rtas_msi: couldn't find PE for %s\n", pci_name(dev)); + goto out; + } + + pr_debug("rtas_msi: found PE %s\n", pe_dn->full_name); + + memset(&counts, 0, sizeof(struct msi_counts)); + + /* Work out how many devices we have below this PE */ + traverse_pci_devices(pe_dn, count_non_bridge_devices, &counts); + + if (counts.num_devices == 0) { + pr_err("rtas_msi: found 0 devices under PE for %s\n", + pci_name(dev)); + goto out; + } + + counts.quota = total / counts.num_devices; + if (request <= counts.quota) + goto out; + + /* else, we have some more calculating to do */ + counts.requestor = pci_device_to_OF_node(dev); + counts.request = request; + traverse_pci_devices(pe_dn, count_spare_msis, &counts); + + /* If the quota isn't an integer multiple of the total, we can + * use the remainder as spare MSIs for anyone that wants them. */ + counts.spare += total % counts.num_devices; + + /* Divide any spare by the number of over-quota requestors */ + if (counts.over_quota) + counts.quota += counts.spare / counts.over_quota; + + /* And finally clamp the request to the possibly adjusted quota */ + request = min(counts.quota, request); + + pr_debug("rtas_msi: request clamped to quota %d\n", request); +out: + of_node_put(pe_dn); + + return request; +} + +static int rtas_msi_check_device(struct pci_dev *pdev, int nvec, int type) +{ + int quota, rc; + + if (type == PCI_CAP_ID_MSIX) + rc = check_req_msix(pdev, nvec); + else + rc = check_req_msi(pdev, nvec); + + if (rc) + return rc; + + quota = msi_quota_for_device(pdev, nvec); + + if (quota && quota < nvec) + return quota; + + return 0; +} + +static int check_msix_entries(struct pci_dev *pdev) +{ + struct msi_desc *entry; + int expected; + + /* There's no way for us to express to firmware that we want + * a discontiguous, or non-zero based, range of MSI-X entries. + * So we must reject such requests. */ + + expected = 0; + list_for_each_entry(entry, &pdev->msi_list, list) { + if (entry->msi_attrib.entry_nr != expected) { + pr_debug("rtas_msi: bad MSI-X entries.\n"); + return -EINVAL; + } + expected++; + } + + return 0; +} + +static int rtas_setup_msi_irqs(struct pci_dev *pdev, int nvec, int type) +{ + struct pci_dn *pdn; + int hwirq, virq, i, rc; + struct msi_desc *entry; + struct msi_msg msg; + + pdn = get_pdn(pdev); + if (!pdn) + return -ENODEV; + + if (type == PCI_CAP_ID_MSIX && check_msix_entries(pdev)) + return -EINVAL; + + /* + * Try the new more explicit firmware interface, if that fails fall + * back to the old interface. The old interface is known to never + * return MSI-Xs. + */ + if (type == PCI_CAP_ID_MSI) { + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSI_FN, nvec); + + if (rc < 0) { + pr_debug("rtas_msi: trying the old firmware call.\n"); + rc = rtas_change_msi(pdn, RTAS_CHANGE_FN, nvec); + } + } else + rc = rtas_change_msi(pdn, RTAS_CHANGE_MSIX_FN, nvec); + + if (rc != nvec) { + pr_debug("rtas_msi: rtas_change_msi() failed\n"); + return rc; + } + + i = 0; + list_for_each_entry(entry, &pdev->msi_list, list) { + hwirq = rtas_query_irq_number(pdn, i++); + if (hwirq < 0) { + pr_debug("rtas_msi: error (%d) getting hwirq\n", rc); + return hwirq; + } + + virq = irq_create_mapping(NULL, hwirq); + + if (virq == NO_IRQ) { + pr_debug("rtas_msi: Failed mapping hwirq %d\n", hwirq); + return -ENOSPC; + } + + dev_dbg(&pdev->dev, "rtas_msi: allocated virq %d\n", virq); + irq_set_msi_desc(virq, entry); + + /* Read config space back so we can restore after reset */ + read_msi_msg(virq, &msg); + entry->msg = msg; + } + + return 0; +} + +static void rtas_msi_pci_irq_fixup(struct pci_dev *pdev) +{ + /* No LSI -> leave MSIs (if any) configured */ + if (pdev->irq == NO_IRQ) { + dev_dbg(&pdev->dev, "rtas_msi: no LSI, nothing to do.\n"); + return; + } + + /* No MSI -> MSIs can't have been assigned by fw, leave LSI */ + if (check_req_msi(pdev, 1) && check_req_msix(pdev, 1)) { + dev_dbg(&pdev->dev, "rtas_msi: no req#msi/x, nothing to do.\n"); + return; + } + + dev_dbg(&pdev->dev, "rtas_msi: disabling existing MSI.\n"); + rtas_disable_msi(pdev); +} + +static int rtas_msi_init(void) +{ + query_token = rtas_token("ibm,query-interrupt-source-number"); + change_token = rtas_token("ibm,change-msi"); + + if ((query_token == RTAS_UNKNOWN_SERVICE) || + (change_token == RTAS_UNKNOWN_SERVICE)) { + pr_debug("rtas_msi: no RTAS tokens, no MSI support.\n"); + return -1; + } + + pr_debug("rtas_msi: Registering RTAS MSI callbacks.\n"); + + WARN_ON(ppc_md.setup_msi_irqs); + ppc_md.setup_msi_irqs = rtas_setup_msi_irqs; + ppc_md.teardown_msi_irqs = rtas_teardown_msi_irqs; + ppc_md.msi_check_device = rtas_msi_check_device; + + WARN_ON(ppc_md.pci_irq_fixup); + ppc_md.pci_irq_fixup = rtas_msi_pci_irq_fixup; + + return 0; +} +arch_initcall(rtas_msi_init); diff --git a/arch/powerpc/platforms/pseries/nvram.c b/arch/powerpc/platforms/pseries/nvram.c new file mode 100644 index 00000000..00cc3a09 --- /dev/null +++ b/arch/powerpc/platforms/pseries/nvram.c @@ -0,0 +1,516 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * /dev/nvram driver for PPC64 + * + * This perhaps should live in drivers/char + */ + + +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/slab.h> +#include <linux/kmsg_dump.h> +#include <asm/uaccess.h> +#include <asm/nvram.h> +#include <asm/rtas.h> +#include <asm/prom.h> +#include <asm/machdep.h> + +/* Max bytes to read/write in one go */ +#define NVRW_CNT 0x20 + +static unsigned int nvram_size; +static int nvram_fetch, nvram_store; +static char nvram_buf[NVRW_CNT]; /* assume this is in the first 4GB */ +static DEFINE_SPINLOCK(nvram_lock); + +struct err_log_info { + int error_type; + unsigned int seq_num; +}; + +struct nvram_os_partition { + const char *name; + int req_size; /* desired size, in bytes */ + int min_size; /* minimum acceptable size (0 means req_size) */ + long size; /* size of data portion (excluding err_log_info) */ + long index; /* offset of data portion of partition */ +}; + +static struct nvram_os_partition rtas_log_partition = { + .name = "ibm,rtas-log", + .req_size = 2079, + .min_size = 1055, + .index = -1 +}; + +static struct nvram_os_partition oops_log_partition = { + .name = "lnx,oops-log", + .req_size = 4000, + .min_size = 2000, + .index = -1 +}; + +static const char *pseries_nvram_os_partitions[] = { + "ibm,rtas-log", + "lnx,oops-log", + NULL +}; + +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len); + +static struct kmsg_dumper nvram_kmsg_dumper = { + .dump = oops_to_nvram +}; + +/* See clobbering_unread_rtas_event() */ +#define NVRAM_RTAS_READ_TIMEOUT 5 /* seconds */ +static unsigned long last_unread_rtas_event; /* timestamp */ + +/* We preallocate oops_buf during init to avoid kmalloc during oops/panic. */ +static char *oops_buf; + +static ssize_t pSeries_nvram_read(char *buf, size_t count, loff_t *index) +{ + unsigned int i; + unsigned long len; + int done; + unsigned long flags; + char *p = buf; + + + if (nvram_size == 0 || nvram_fetch == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + if (*index >= nvram_size) + return 0; + + i = *index; + if (i + count > nvram_size) + count = nvram_size - i; + + spin_lock_irqsave(&nvram_lock, flags); + + for (; count != 0; count -= len) { + len = count; + if (len > NVRW_CNT) + len = NVRW_CNT; + + if ((rtas_call(nvram_fetch, 3, 2, &done, i, __pa(nvram_buf), + len) != 0) || len != done) { + spin_unlock_irqrestore(&nvram_lock, flags); + return -EIO; + } + + memcpy(p, nvram_buf, len); + + p += len; + i += len; + } + + spin_unlock_irqrestore(&nvram_lock, flags); + + *index = i; + return p - buf; +} + +static ssize_t pSeries_nvram_write(char *buf, size_t count, loff_t *index) +{ + unsigned int i; + unsigned long len; + int done; + unsigned long flags; + const char *p = buf; + + if (nvram_size == 0 || nvram_store == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + if (*index >= nvram_size) + return 0; + + i = *index; + if (i + count > nvram_size) + count = nvram_size - i; + + spin_lock_irqsave(&nvram_lock, flags); + + for (; count != 0; count -= len) { + len = count; + if (len > NVRW_CNT) + len = NVRW_CNT; + + memcpy(nvram_buf, p, len); + + if ((rtas_call(nvram_store, 3, 2, &done, i, __pa(nvram_buf), + len) != 0) || len != done) { + spin_unlock_irqrestore(&nvram_lock, flags); + return -EIO; + } + + p += len; + i += len; + } + spin_unlock_irqrestore(&nvram_lock, flags); + + *index = i; + return p - buf; +} + +static ssize_t pSeries_nvram_get_size(void) +{ + return nvram_size ? nvram_size : -ENODEV; +} + + +/* nvram_write_os_partition, nvram_write_error_log + * + * We need to buffer the error logs into nvram to ensure that we have + * the failure information to decode. If we have a severe error there + * is no way to guarantee that the OS or the machine is in a state to + * get back to user land and write the error to disk. For example if + * the SCSI device driver causes a Machine Check by writing to a bad + * IO address, there is no way of guaranteeing that the device driver + * is in any state that is would also be able to write the error data + * captured to disk, thus we buffer it in NVRAM for analysis on the + * next boot. + * + * In NVRAM the partition containing the error log buffer will looks like: + * Header (in bytes): + * +-----------+----------+--------+------------+------------------+ + * | signature | checksum | length | name | data | + * |0 |1 |2 3|4 15|16 length-1| + * +-----------+----------+--------+------------+------------------+ + * + * The 'data' section would look like (in bytes): + * +--------------+------------+-----------------------------------+ + * | event_logged | sequence # | error log | + * |0 3|4 7|8 error_log_size-1| + * +--------------+------------+-----------------------------------+ + * + * event_logged: 0 if event has not been logged to syslog, 1 if it has + * sequence #: The unique sequence # for each event. (until it wraps) + * error log: The error log from event_scan + */ +int nvram_write_os_partition(struct nvram_os_partition *part, char * buff, + int length, unsigned int err_type, unsigned int error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (part->index == -1) { + return -ESPIPE; + } + + if (length > part->size) { + length = part->size; + } + + info.error_type = err_type; + info.seq_num = error_log_cnt; + + tmp_index = part->index; + + rc = ppc_md.nvram_write((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + rc = ppc_md.nvram_write(buff, length, &tmp_index); + if (rc <= 0) { + pr_err("%s: Failed nvram_write (%d)\n", __FUNCTION__, rc); + return rc; + } + + return 0; +} + +int nvram_write_error_log(char * buff, int length, + unsigned int err_type, unsigned int error_log_cnt) +{ + int rc = nvram_write_os_partition(&rtas_log_partition, buff, length, + err_type, error_log_cnt); + if (!rc) + last_unread_rtas_event = get_seconds(); + return rc; +} + +/* nvram_read_error_log + * + * Reads nvram for error log for at most 'length' + */ +int nvram_read_error_log(char * buff, int length, + unsigned int * err_type, unsigned int * error_log_cnt) +{ + int rc; + loff_t tmp_index; + struct err_log_info info; + + if (rtas_log_partition.index == -1) + return -1; + + if (length > rtas_log_partition.size) + length = rtas_log_partition.size; + + tmp_index = rtas_log_partition.index; + + rc = ppc_md.nvram_read((char *)&info, sizeof(struct err_log_info), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + return rc; + } + + rc = ppc_md.nvram_read(buff, length, &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_read_error_log: Failed nvram_read (%d)\n", rc); + return rc; + } + + *error_log_cnt = info.seq_num; + *err_type = info.error_type; + + return 0; +} + +/* This doesn't actually zero anything, but it sets the event_logged + * word to tell that this event is safely in syslog. + */ +int nvram_clear_error_log(void) +{ + loff_t tmp_index; + int clear_word = ERR_FLAG_ALREADY_LOGGED; + int rc; + + if (rtas_log_partition.index == -1) + return -1; + + tmp_index = rtas_log_partition.index; + + rc = ppc_md.nvram_write((char *)&clear_word, sizeof(int), &tmp_index); + if (rc <= 0) { + printk(KERN_ERR "nvram_clear_error_log: Failed nvram_write (%d)\n", rc); + return rc; + } + last_unread_rtas_event = 0; + + return 0; +} + +/* pseries_nvram_init_os_partition + * + * This sets up a partition with an "OS" signature. + * + * The general strategy is the following: + * 1.) If a partition with the indicated name already exists... + * - If it's large enough, use it. + * - Otherwise, recycle it and keep going. + * 2.) Search for a free partition that is large enough. + * 3.) If there's not a free partition large enough, recycle any obsolete + * OS partitions and try again. + * 4.) Will first try getting a chunk that will satisfy the requested size. + * 5.) If a chunk of the requested size cannot be allocated, then try finding + * a chunk that will satisfy the minum needed. + * + * Returns 0 on success, else -1. + */ +static int __init pseries_nvram_init_os_partition(struct nvram_os_partition + *part) +{ + loff_t p; + int size; + + /* Scan nvram for partitions */ + nvram_scan_partitions(); + + /* Look for ours */ + p = nvram_find_partition(part->name, NVRAM_SIG_OS, &size); + + /* Found one but too small, remove it */ + if (p && size < part->min_size) { + pr_info("nvram: Found too small %s partition," + " removing it...\n", part->name); + nvram_remove_partition(part->name, NVRAM_SIG_OS, NULL); + p = 0; + } + + /* Create one if we didn't find */ + if (!p) { + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + if (p == -ENOSPC) { + pr_info("nvram: No room to create %s partition, " + "deleting any obsolete OS partitions...\n", + part->name); + nvram_remove_partition(NULL, NVRAM_SIG_OS, + pseries_nvram_os_partitions); + p = nvram_create_partition(part->name, NVRAM_SIG_OS, + part->req_size, part->min_size); + } + } + + if (p <= 0) { + pr_err("nvram: Failed to find or create %s" + " partition, err %d\n", part->name, (int)p); + return -1; + } + + part->index = p; + part->size = nvram_get_partition_size(p) - sizeof(struct err_log_info); + + return 0; +} + +static void __init nvram_init_oops_partition(int rtas_partition_exists) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&oops_log_partition); + if (rc != 0) { + if (!rtas_partition_exists) + return; + pr_notice("nvram: Using %s partition to log both" + " RTAS errors and oops/panic reports\n", + rtas_log_partition.name); + memcpy(&oops_log_partition, &rtas_log_partition, + sizeof(rtas_log_partition)); + } + oops_buf = kmalloc(oops_log_partition.size, GFP_KERNEL); + rc = kmsg_dump_register(&nvram_kmsg_dumper); + if (rc != 0) { + pr_err("nvram: kmsg_dump_register() failed; returned %d\n", rc); + kfree(oops_buf); + return; + } +} + +static int __init pseries_nvram_init_log_partitions(void) +{ + int rc; + + rc = pseries_nvram_init_os_partition(&rtas_log_partition); + nvram_init_oops_partition(rc == 0); + return 0; +} +machine_arch_initcall(pseries, pseries_nvram_init_log_partitions); + +int __init pSeries_nvram_init(void) +{ + struct device_node *nvram; + const unsigned int *nbytes_p; + unsigned int proplen; + + nvram = of_find_node_by_type(NULL, "nvram"); + if (nvram == NULL) + return -ENODEV; + + nbytes_p = of_get_property(nvram, "#bytes", &proplen); + if (nbytes_p == NULL || proplen != sizeof(unsigned int)) { + of_node_put(nvram); + return -EIO; + } + + nvram_size = *nbytes_p; + + nvram_fetch = rtas_token("nvram-fetch"); + nvram_store = rtas_token("nvram-store"); + printk(KERN_INFO "PPC64 nvram contains %d bytes\n", nvram_size); + of_node_put(nvram); + + ppc_md.nvram_read = pSeries_nvram_read; + ppc_md.nvram_write = pSeries_nvram_write; + ppc_md.nvram_size = pSeries_nvram_get_size; + + return 0; +} + +/* + * Try to capture the last capture_len bytes of the printk buffer. Return + * the amount actually captured. + */ +static size_t capture_last_msgs(const char *old_msgs, size_t old_len, + const char *new_msgs, size_t new_len, + char *captured, size_t capture_len) +{ + if (new_len >= capture_len) { + memcpy(captured, new_msgs + (new_len - capture_len), + capture_len); + return capture_len; + } else { + /* Grab the end of old_msgs. */ + size_t old_tail_len = min(old_len, capture_len - new_len); + memcpy(captured, old_msgs + (old_len - old_tail_len), + old_tail_len); + memcpy(captured + old_tail_len, new_msgs, new_len); + return old_tail_len + new_len; + } +} + +/* + * Are we using the ibm,rtas-log for oops/panic reports? And if so, + * would logging this oops/panic overwrite an RTAS event that rtas_errd + * hasn't had a chance to read and process? Return 1 if so, else 0. + * + * We assume that if rtas_errd hasn't read the RTAS event in + * NVRAM_RTAS_READ_TIMEOUT seconds, it's probably not going to. + */ +static int clobbering_unread_rtas_event(void) +{ + return (oops_log_partition.index == rtas_log_partition.index + && last_unread_rtas_event + && get_seconds() - last_unread_rtas_event <= + NVRAM_RTAS_READ_TIMEOUT); +} + +/* our kmsg_dump callback */ +static void oops_to_nvram(struct kmsg_dumper *dumper, + enum kmsg_dump_reason reason, + const char *old_msgs, unsigned long old_len, + const char *new_msgs, unsigned long new_len) +{ + static unsigned int oops_count = 0; + static bool panicking = false; + size_t text_len; + + switch (reason) { + case KMSG_DUMP_RESTART: + case KMSG_DUMP_HALT: + case KMSG_DUMP_POWEROFF: + /* These are almost always orderly shutdowns. */ + return; + case KMSG_DUMP_OOPS: + case KMSG_DUMP_KEXEC: + break; + case KMSG_DUMP_PANIC: + panicking = true; + break; + case KMSG_DUMP_EMERG: + if (panicking) + /* Panic report already captured. */ + return; + break; + default: + pr_err("%s: ignoring unrecognized KMSG_DUMP_* reason %d\n", + __FUNCTION__, (int) reason); + return; + } + + if (clobbering_unread_rtas_event()) + return; + + text_len = capture_last_msgs(old_msgs, old_len, new_msgs, new_len, + oops_buf, oops_log_partition.size); + (void) nvram_write_os_partition(&oops_log_partition, oops_buf, + (int) text_len, ERR_TYPE_KERNEL_PANIC, ++oops_count); +} diff --git a/arch/powerpc/platforms/pseries/offline_states.h b/arch/powerpc/platforms/pseries/offline_states.h new file mode 100644 index 00000000..08672d91 --- /dev/null +++ b/arch/powerpc/platforms/pseries/offline_states.h @@ -0,0 +1,37 @@ +#ifndef _OFFLINE_STATES_H_ +#define _OFFLINE_STATES_H_ + +/* Cpu offline states go here */ +enum cpu_state_vals { + CPU_STATE_OFFLINE, + CPU_STATE_INACTIVE, + CPU_STATE_ONLINE, + CPU_MAX_OFFLINE_STATES +}; + +#ifdef CONFIG_HOTPLUG_CPU +extern enum cpu_state_vals get_cpu_current_state(int cpu); +extern void set_cpu_current_state(int cpu, enum cpu_state_vals state); +extern void set_preferred_offline_state(int cpu, enum cpu_state_vals state); +extern void set_default_offline_state(int cpu); +#else +static inline enum cpu_state_vals get_cpu_current_state(int cpu) +{ + return CPU_STATE_ONLINE; +} + +static inline void set_cpu_current_state(int cpu, enum cpu_state_vals state) +{ +} + +static inline void set_preferred_offline_state(int cpu, enum cpu_state_vals state) +{ +} + +static inline void set_default_offline_state(int cpu) +{ +} +#endif + +extern enum cpu_state_vals get_preferred_offline_state(int cpu); +#endif diff --git a/arch/powerpc/platforms/pseries/pci.c b/arch/powerpc/platforms/pseries/pci.c new file mode 100644 index 00000000..2c6ded29 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pci.c @@ -0,0 +1,109 @@ +/* + * Copyright (C) 2001 Dave Engebretsen, IBM Corporation + * Copyright (C) 2003 Anton Blanchard <anton@au.ibm.com>, IBM + * + * pSeries specific routines for PCI. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/kernel.h> +#include <linux/pci.h> +#include <linux/string.h> + +#include <asm/eeh.h> +#include <asm/pci-bridge.h> +#include <asm/prom.h> +#include <asm/ppc-pci.h> + +#if 0 +void pcibios_name_device(struct pci_dev *dev) +{ + struct device_node *dn; + + /* + * Add IBM loc code (slot) as a prefix to the device names for service + */ + dn = pci_device_to_OF_node(dev); + if (dn) { + const char *loc_code = of_get_property(dn, "ibm,loc-code", 0); + if (loc_code) { + int loc_len = strlen(loc_code); + if (loc_len < sizeof(dev->dev.name)) { + memmove(dev->dev.name+loc_len+1, dev->dev.name, + sizeof(dev->dev.name)-loc_len-1); + memcpy(dev->dev.name, loc_code, loc_len); + dev->dev.name[loc_len] = ' '; + dev->dev.name[sizeof(dev->dev.name)-1] = '\0'; + } + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pcibios_name_device); +#endif + +static void __init pSeries_request_regions(void) +{ + if (!isa_io_base) + return; + + request_region(0x20,0x20,"pic1"); + request_region(0xa0,0x20,"pic2"); + request_region(0x00,0x20,"dma1"); + request_region(0x40,0x20,"timer"); + request_region(0x80,0x10,"dma page reg"); + request_region(0xc0,0x20,"dma2"); +} + +void __init pSeries_final_fixup(void) +{ + pSeries_request_regions(); + + pci_addr_cache_build(); +} + +/* + * Assume the winbond 82c105 is the IDE controller on a + * p610/p615/p630. We should probably be more careful in case + * someone tries to plug in a similar adapter. + */ +static void fixup_winbond_82c105(struct pci_dev* dev) +{ + int i; + unsigned int reg; + + if (!machine_is(pseries)) + return; + + printk("Using INTC for W82c105 IDE controller.\n"); + pci_read_config_dword(dev, 0x40, ®); + /* Enable LEGIRQ to use INTC instead of ISA interrupts */ + pci_write_config_dword(dev, 0x40, reg | (1<<11)); + + for (i = 0; i < DEVICE_COUNT_RESOURCE; ++i) { + /* zap the 2nd function of the winbond chip */ + if (dev->resource[i].flags & IORESOURCE_IO + && dev->bus->number == 0 && dev->devfn == 0x81) + dev->resource[i].flags &= ~IORESOURCE_IO; + if (dev->resource[i].start == 0 && dev->resource[i].end) { + dev->resource[i].flags = 0; + dev->resource[i].end = 0; + } + } +} +DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_WINBOND, PCI_DEVICE_ID_WINBOND_82C105, + fixup_winbond_82c105); diff --git a/arch/powerpc/platforms/pseries/pci_dlpar.c b/arch/powerpc/platforms/pseries/pci_dlpar.c new file mode 100644 index 00000000..3bf4488a --- /dev/null +++ b/arch/powerpc/platforms/pseries/pci_dlpar.c @@ -0,0 +1,208 @@ +/* + * PCI Dynamic LPAR, PCI Hot Plug and PCI EEH recovery code + * for RPA-compliant PPC64 platform. + * Copyright (C) 2003 Linda Xie <lxie@us.ibm.com> + * Copyright (C) 2005 International Business Machines + * + * Updates, 2005, John Rose <johnrose@austin.ibm.com> + * Updates, 2005, Linas Vepstas <linas@austin.ibm.com> + * + * All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or (at + * your option) any later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#include <linux/pci.h> +#include <asm/pci-bridge.h> +#include <asm/ppc-pci.h> +#include <asm/firmware.h> +#include <asm/eeh.h> + +static struct pci_bus * +find_bus_among_children(struct pci_bus *bus, + struct device_node *dn) +{ + struct pci_bus *child = NULL; + struct list_head *tmp; + struct device_node *busdn; + + busdn = pci_bus_to_OF_node(bus); + if (busdn == dn) + return bus; + + list_for_each(tmp, &bus->children) { + child = find_bus_among_children(pci_bus_b(tmp), dn); + if (child) + break; + }; + return child; +} + +struct pci_bus * +pcibios_find_pci_bus(struct device_node *dn) +{ + struct pci_dn *pdn = dn->data; + + if (!pdn || !pdn->phb || !pdn->phb->bus) + return NULL; + + return find_bus_among_children(pdn->phb->bus, dn); +} +EXPORT_SYMBOL_GPL(pcibios_find_pci_bus); + +/** + * pcibios_remove_pci_devices - remove all devices under this bus + * + * Remove all of the PCI devices under this bus both from the + * linux pci device tree, and from the powerpc EEH address cache. + */ +void pcibios_remove_pci_devices(struct pci_bus *bus) +{ + struct pci_dev *dev, *tmp; + struct pci_bus *child_bus; + + /* First go down child busses */ + list_for_each_entry(child_bus, &bus->children, node) + pcibios_remove_pci_devices(child_bus); + + pr_debug("PCI: Removing devices on bus %04x:%02x\n", + pci_domain_nr(bus), bus->number); + list_for_each_entry_safe(dev, tmp, &bus->devices, bus_list) { + pr_debug(" * Removing %s...\n", pci_name(dev)); + eeh_remove_bus_device(dev); + pci_remove_bus_device(dev); + } +} +EXPORT_SYMBOL_GPL(pcibios_remove_pci_devices); + +/** + * pcibios_add_pci_devices - adds new pci devices to bus + * + * This routine will find and fixup new pci devices under + * the indicated bus. This routine presumes that there + * might already be some devices under this bridge, so + * it carefully tries to add only new devices. (And that + * is how this routine differs from other, similar pcibios + * routines.) + */ +void pcibios_add_pci_devices(struct pci_bus * bus) +{ + int slotno, num, mode, pass, max; + struct pci_dev *dev; + struct device_node *dn = pci_bus_to_OF_node(bus); + + eeh_add_device_tree_early(dn); + + mode = PCI_PROBE_NORMAL; + if (ppc_md.pci_probe_mode) + mode = ppc_md.pci_probe_mode(bus); + + if (mode == PCI_PROBE_DEVTREE) { + /* use ofdt-based probe */ + of_rescan_bus(dn, bus); + } else if (mode == PCI_PROBE_NORMAL) { + /* use legacy probe */ + slotno = PCI_SLOT(PCI_DN(dn->child)->devfn); + num = pci_scan_slot(bus, PCI_DEVFN(slotno, 0)); + if (!num) + return; + pcibios_setup_bus_devices(bus); + max = bus->secondary; + for (pass=0; pass < 2; pass++) + list_for_each_entry(dev, &bus->devices, bus_list) { + if (dev->hdr_type == PCI_HEADER_TYPE_BRIDGE || + dev->hdr_type == PCI_HEADER_TYPE_CARDBUS) + max = pci_scan_bridge(bus, dev, max, pass); + } + } + pcibios_finish_adding_to_bus(bus); +} +EXPORT_SYMBOL_GPL(pcibios_add_pci_devices); + +struct pci_controller * __devinit init_phb_dynamic(struct device_node *dn) +{ + struct pci_controller *phb; + + pr_debug("PCI: Initializing new hotplug PHB %s\n", dn->full_name); + + phb = pcibios_alloc_controller(dn); + if (!phb) + return NULL; + rtas_setup_phb(phb); + pci_process_bridge_OF_ranges(phb, dn, 0); + + pci_devs_phb_init_dynamic(phb); + + if (dn->child) + eeh_add_device_tree_early(dn); + + pcibios_scan_phb(phb); + pcibios_finish_adding_to_bus(phb->bus); + + return phb; +} +EXPORT_SYMBOL_GPL(init_phb_dynamic); + +/* RPA-specific bits for removing PHBs */ +int remove_phb_dynamic(struct pci_controller *phb) +{ + struct pci_bus *b = phb->bus; + struct resource *res; + int rc, i; + + pr_debug("PCI: Removing PHB %04x:%02x...\n", + pci_domain_nr(b), b->number); + + /* We cannot to remove a root bus that has children */ + if (!(list_empty(&b->children) && list_empty(&b->devices))) + return -EBUSY; + + /* We -know- there aren't any child devices anymore at this stage + * and thus, we can safely unmap the IO space as it's not in use + */ + res = &phb->io_resource; + if (res->flags & IORESOURCE_IO) { + rc = pcibios_unmap_io_space(b); + if (rc) { + printk(KERN_ERR "%s: failed to unmap IO on bus %s\n", + __func__, b->name); + return 1; + } + } + + /* Unregister the bridge device from sysfs and remove the PCI bus */ + device_unregister(b->bridge); + phb->bus = NULL; + pci_remove_bus(b); + + /* Now release the IO resource */ + if (res->flags & IORESOURCE_IO) + release_resource(res); + + /* Release memory resources */ + for (i = 0; i < 3; ++i) { + res = &phb->mem_resources[i]; + if (!(res->flags & IORESOURCE_MEM)) + continue; + release_resource(res); + } + + /* Free pci_controller data structure */ + pcibios_free_controller(phb); + + return 0; +} +EXPORT_SYMBOL_GPL(remove_phb_dynamic); diff --git a/arch/powerpc/platforms/pseries/phyp_dump.c b/arch/powerpc/platforms/pseries/phyp_dump.c new file mode 100644 index 00000000..6e7742da --- /dev/null +++ b/arch/powerpc/platforms/pseries/phyp_dump.c @@ -0,0 +1,513 @@ +/* + * Hypervisor-assisted dump + * + * Linas Vepstas, Manish Ahuja 2008 + * Copyright 2008 IBM Corp. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include <linux/gfp.h> +#include <linux/init.h> +#include <linux/kobject.h> +#include <linux/mm.h> +#include <linux/of.h> +#include <linux/pfn.h> +#include <linux/swap.h> +#include <linux/sysfs.h> + +#include <asm/page.h> +#include <asm/phyp_dump.h> +#include <asm/machdep.h> +#include <asm/prom.h> +#include <asm/rtas.h> + +/* Variables, used to communicate data between early boot and late boot */ +static struct phyp_dump phyp_dump_vars; +struct phyp_dump *phyp_dump_info = &phyp_dump_vars; + +static int ibm_configure_kernel_dump; +/* ------------------------------------------------- */ +/* RTAS interfaces to declare the dump regions */ + +struct dump_section { + u32 dump_flags; + u16 source_type; + u16 error_flags; + u64 source_address; + u64 source_length; + u64 length_copied; + u64 destination_address; +}; + +struct phyp_dump_header { + u32 version; + u16 num_of_sections; + u16 status; + + u32 first_offset_section; + u32 dump_disk_section; + u64 block_num_dd; + u64 num_of_blocks_dd; + u32 offset_dd; + u32 maxtime_to_auto; + /* No dump disk path string used */ + + struct dump_section cpu_data; + struct dump_section hpte_data; + struct dump_section kernel_data; +}; + +/* The dump header *must be* in low memory, so .bss it */ +static struct phyp_dump_header phdr; + +#define NUM_DUMP_SECTIONS 3 +#define DUMP_HEADER_VERSION 0x1 +#define DUMP_REQUEST_FLAG 0x1 +#define DUMP_SOURCE_CPU 0x0001 +#define DUMP_SOURCE_HPTE 0x0002 +#define DUMP_SOURCE_RMO 0x0011 +#define DUMP_ERROR_FLAG 0x2000 +#define DUMP_TRIGGERED 0x4000 +#define DUMP_PERFORMED 0x8000 + + +/** + * init_dump_header() - initialize the header declaring a dump + * Returns: length of dump save area. + * + * When the hypervisor saves crashed state, it needs to put + * it somewhere. The dump header tells the hypervisor where + * the data can be saved. + */ +static unsigned long init_dump_header(struct phyp_dump_header *ph) +{ + unsigned long addr_offset = 0; + + /* Set up the dump header */ + ph->version = DUMP_HEADER_VERSION; + ph->num_of_sections = NUM_DUMP_SECTIONS; + ph->status = 0; + + ph->first_offset_section = + (u32)offsetof(struct phyp_dump_header, cpu_data); + ph->dump_disk_section = 0; + ph->block_num_dd = 0; + ph->num_of_blocks_dd = 0; + ph->offset_dd = 0; + + ph->maxtime_to_auto = 0; /* disabled */ + + /* The first two sections are mandatory */ + ph->cpu_data.dump_flags = DUMP_REQUEST_FLAG; + ph->cpu_data.source_type = DUMP_SOURCE_CPU; + ph->cpu_data.source_address = 0; + ph->cpu_data.source_length = phyp_dump_info->cpu_state_size; + ph->cpu_data.destination_address = addr_offset; + addr_offset += phyp_dump_info->cpu_state_size; + + ph->hpte_data.dump_flags = DUMP_REQUEST_FLAG; + ph->hpte_data.source_type = DUMP_SOURCE_HPTE; + ph->hpte_data.source_address = 0; + ph->hpte_data.source_length = phyp_dump_info->hpte_region_size; + ph->hpte_data.destination_address = addr_offset; + addr_offset += phyp_dump_info->hpte_region_size; + + /* This section describes the low kernel region */ + ph->kernel_data.dump_flags = DUMP_REQUEST_FLAG; + ph->kernel_data.source_type = DUMP_SOURCE_RMO; + ph->kernel_data.source_address = PHYP_DUMP_RMR_START; + ph->kernel_data.source_length = PHYP_DUMP_RMR_END; + ph->kernel_data.destination_address = addr_offset; + addr_offset += ph->kernel_data.source_length; + + return addr_offset; +} + +static void print_dump_header(const struct phyp_dump_header *ph) +{ +#ifdef DEBUG + if (ph == NULL) + return; + + printk(KERN_INFO "dump header:\n"); + /* setup some ph->sections required */ + printk(KERN_INFO "version = %d\n", ph->version); + printk(KERN_INFO "Sections = %d\n", ph->num_of_sections); + printk(KERN_INFO "Status = 0x%x\n", ph->status); + + /* No ph->disk, so all should be set to 0 */ + printk(KERN_INFO "Offset to first section 0x%x\n", + ph->first_offset_section); + printk(KERN_INFO "dump disk sections should be zero\n"); + printk(KERN_INFO "dump disk section = %d\n", ph->dump_disk_section); + printk(KERN_INFO "block num = %lld\n", ph->block_num_dd); + printk(KERN_INFO "number of blocks = %lld\n", ph->num_of_blocks_dd); + printk(KERN_INFO "dump disk offset = %d\n", ph->offset_dd); + printk(KERN_INFO "Max auto time= %d\n", ph->maxtime_to_auto); + + /*set cpu state and hpte states as well scratch pad area */ + printk(KERN_INFO " CPU AREA\n"); + printk(KERN_INFO "cpu dump_flags =%d\n", ph->cpu_data.dump_flags); + printk(KERN_INFO "cpu source_type =%d\n", ph->cpu_data.source_type); + printk(KERN_INFO "cpu error_flags =%d\n", ph->cpu_data.error_flags); + printk(KERN_INFO "cpu source_address =%llx\n", + ph->cpu_data.source_address); + printk(KERN_INFO "cpu source_length =%llx\n", + ph->cpu_data.source_length); + printk(KERN_INFO "cpu length_copied =%llx\n", + ph->cpu_data.length_copied); + + printk(KERN_INFO " HPTE AREA\n"); + printk(KERN_INFO "HPTE dump_flags =%d\n", ph->hpte_data.dump_flags); + printk(KERN_INFO "HPTE source_type =%d\n", ph->hpte_data.source_type); + printk(KERN_INFO "HPTE error_flags =%d\n", ph->hpte_data.error_flags); + printk(KERN_INFO "HPTE source_address =%llx\n", + ph->hpte_data.source_address); + printk(KERN_INFO "HPTE source_length =%llx\n", + ph->hpte_data.source_length); + printk(KERN_INFO "HPTE length_copied =%llx\n", + ph->hpte_data.length_copied); + + printk(KERN_INFO " SRSD AREA\n"); + printk(KERN_INFO "SRSD dump_flags =%d\n", ph->kernel_data.dump_flags); + printk(KERN_INFO "SRSD source_type =%d\n", ph->kernel_data.source_type); + printk(KERN_INFO "SRSD error_flags =%d\n", ph->kernel_data.error_flags); + printk(KERN_INFO "SRSD source_address =%llx\n", + ph->kernel_data.source_address); + printk(KERN_INFO "SRSD source_length =%llx\n", + ph->kernel_data.source_length); + printk(KERN_INFO "SRSD length_copied =%llx\n", + ph->kernel_data.length_copied); +#endif +} + +static ssize_t show_phyp_dump_active(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + + /* create filesystem entry so kdump is phyp-dump aware */ + return sprintf(buf, "%lx\n", phyp_dump_info->phyp_dump_at_boot); +} + +static struct kobj_attribute pdl = __ATTR(phyp_dump_active, 0600, + show_phyp_dump_active, + NULL); + +static void register_dump_area(struct phyp_dump_header *ph, unsigned long addr) +{ + int rc; + + /* Add addr value if not initialized before */ + if (ph->cpu_data.destination_address == 0) { + ph->cpu_data.destination_address += addr; + ph->hpte_data.destination_address += addr; + ph->kernel_data.destination_address += addr; + } + + /* ToDo Invalidate kdump and free memory range. */ + + do { + rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, + 1, ph, sizeof(struct phyp_dump_header)); + } while (rtas_busy_delay(rc)); + + if (rc) { + printk(KERN_ERR "phyp-dump: unexpected error (%d) on " + "register\n", rc); + print_dump_header(ph); + return; + } + + rc = sysfs_create_file(kernel_kobj, &pdl.attr); + if (rc) + printk(KERN_ERR "phyp-dump: unable to create sysfs" + " file (%d)\n", rc); +} + +static +void invalidate_last_dump(struct phyp_dump_header *ph, unsigned long addr) +{ + int rc; + + /* Add addr value if not initialized before */ + if (ph->cpu_data.destination_address == 0) { + ph->cpu_data.destination_address += addr; + ph->hpte_data.destination_address += addr; + ph->kernel_data.destination_address += addr; + } + + do { + rc = rtas_call(ibm_configure_kernel_dump, 3, 1, NULL, + 2, ph, sizeof(struct phyp_dump_header)); + } while (rtas_busy_delay(rc)); + + if (rc) { + printk(KERN_ERR "phyp-dump: unexpected error (%d) " + "on invalidate\n", rc); + print_dump_header(ph); + } +} + +/* ------------------------------------------------- */ +/** + * release_memory_range -- release memory previously memblock_reserved + * @start_pfn: starting physical frame number + * @nr_pages: number of pages to free. + * + * This routine will release memory that had been previously + * memblock_reserved in early boot. The released memory becomes + * available for genreal use. + */ +static void release_memory_range(unsigned long start_pfn, + unsigned long nr_pages) +{ + struct page *rpage; + unsigned long end_pfn; + long i; + + end_pfn = start_pfn + nr_pages; + + for (i = start_pfn; i <= end_pfn; i++) { + rpage = pfn_to_page(i); + if (PageReserved(rpage)) { + ClearPageReserved(rpage); + init_page_count(rpage); + __free_page(rpage); + totalram_pages++; + } + } +} + +/** + * track_freed_range -- Counts the range being freed. + * Once the counter goes to zero, it re-registers dump for + * future use. + */ +static void +track_freed_range(unsigned long addr, unsigned long length) +{ + static unsigned long scratch_area_size, reserved_area_size; + + if (addr < phyp_dump_info->init_reserve_start) + return; + + if ((addr >= phyp_dump_info->init_reserve_start) && + (addr <= phyp_dump_info->init_reserve_start + + phyp_dump_info->init_reserve_size)) + reserved_area_size += length; + + if ((addr >= phyp_dump_info->reserved_scratch_addr) && + (addr <= phyp_dump_info->reserved_scratch_addr + + phyp_dump_info->reserved_scratch_size)) + scratch_area_size += length; + + if ((reserved_area_size == phyp_dump_info->init_reserve_size) && + (scratch_area_size == phyp_dump_info->reserved_scratch_size)) { + + invalidate_last_dump(&phdr, + phyp_dump_info->reserved_scratch_addr); + register_dump_area(&phdr, + phyp_dump_info->reserved_scratch_addr); + } +} + +/* ------------------------------------------------- */ +/** + * sysfs_release_region -- sysfs interface to release memory range. + * + * Usage: + * "echo <start addr> <length> > /sys/kernel/release_region" + * + * Example: + * "echo 0x40000000 0x10000000 > /sys/kernel/release_region" + * + * will release 256MB starting at 1GB. + */ +static ssize_t store_release_region(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t count) +{ + unsigned long start_addr, length, end_addr; + unsigned long start_pfn, nr_pages; + ssize_t ret; + + ret = sscanf(buf, "%lx %lx", &start_addr, &length); + if (ret != 2) + return -EINVAL; + + track_freed_range(start_addr, length); + + /* Range-check - don't free any reserved memory that + * wasn't reserved for phyp-dump */ + if (start_addr < phyp_dump_info->init_reserve_start) + start_addr = phyp_dump_info->init_reserve_start; + + end_addr = phyp_dump_info->init_reserve_start + + phyp_dump_info->init_reserve_size; + if (start_addr+length > end_addr) + length = end_addr - start_addr; + + /* Release the region of memory assed in by user */ + start_pfn = PFN_DOWN(start_addr); + nr_pages = PFN_DOWN(length); + release_memory_range(start_pfn, nr_pages); + + return count; +} + +static ssize_t show_release_region(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + u64 second_addr_range; + + /* total reserved size - start of scratch area */ + second_addr_range = phyp_dump_info->init_reserve_size - + phyp_dump_info->reserved_scratch_size; + return sprintf(buf, "CPU:0x%llx-0x%llx: HPTE:0x%llx-0x%llx:" + " DUMP:0x%llx-0x%llx, 0x%lx-0x%llx:\n", + phdr.cpu_data.destination_address, + phdr.cpu_data.length_copied, + phdr.hpte_data.destination_address, + phdr.hpte_data.length_copied, + phdr.kernel_data.destination_address, + phdr.kernel_data.length_copied, + phyp_dump_info->init_reserve_start, + second_addr_range); +} + +static struct kobj_attribute rr = __ATTR(release_region, 0600, + show_release_region, + store_release_region); + +static int __init phyp_dump_setup(void) +{ + struct device_node *rtas; + const struct phyp_dump_header *dump_header = NULL; + unsigned long dump_area_start; + unsigned long dump_area_length; + int header_len = 0; + int rc; + + /* If no memory was reserved in early boot, there is nothing to do */ + if (phyp_dump_info->init_reserve_size == 0) + return 0; + + /* Return if phyp dump not supported */ + if (!phyp_dump_info->phyp_dump_configured) + return -ENOSYS; + + /* Is there dump data waiting for us? If there isn't, + * then register a new dump area, and release all of + * the rest of the reserved ram. + * + * The /rtas/ibm,kernel-dump rtas node is present only + * if there is dump data waiting for us. + */ + rtas = of_find_node_by_path("/rtas"); + if (rtas) { + dump_header = of_get_property(rtas, "ibm,kernel-dump", + &header_len); + of_node_put(rtas); + } + + ibm_configure_kernel_dump = rtas_token("ibm,configure-kernel-dump"); + + print_dump_header(dump_header); + dump_area_length = init_dump_header(&phdr); + /* align down */ + dump_area_start = phyp_dump_info->init_reserve_start & PAGE_MASK; + + if (dump_header == NULL) { + register_dump_area(&phdr, dump_area_start); + return 0; + } + + /* re-register the dump area, if old dump was invalid */ + if ((dump_header) && (dump_header->status & DUMP_ERROR_FLAG)) { + invalidate_last_dump(&phdr, dump_area_start); + register_dump_area(&phdr, dump_area_start); + return 0; + } + + if (dump_header) { + phyp_dump_info->reserved_scratch_addr = + dump_header->cpu_data.destination_address; + phyp_dump_info->reserved_scratch_size = + dump_header->cpu_data.source_length + + dump_header->hpte_data.source_length + + dump_header->kernel_data.source_length; + } + + /* Should we create a dump_subsys, analogous to s390/ipl.c ? */ + rc = sysfs_create_file(kernel_kobj, &rr.attr); + if (rc) + printk(KERN_ERR "phyp-dump: unable to create sysfs file (%d)\n", + rc); + + /* ToDo: re-register the dump area, for next time. */ + return 0; +} +machine_subsys_initcall(pseries, phyp_dump_setup); + +int __init early_init_dt_scan_phyp_dump(unsigned long node, + const char *uname, int depth, void *data) +{ + const unsigned int *sizes; + + phyp_dump_info->phyp_dump_configured = 0; + phyp_dump_info->phyp_dump_is_active = 0; + + if (depth != 1 || strcmp(uname, "rtas") != 0) + return 0; + + if (of_get_flat_dt_prop(node, "ibm,configure-kernel-dump", NULL)) + phyp_dump_info->phyp_dump_configured++; + + if (of_get_flat_dt_prop(node, "ibm,dump-kernel", NULL)) + phyp_dump_info->phyp_dump_is_active++; + + sizes = of_get_flat_dt_prop(node, "ibm,configure-kernel-dump-sizes", + NULL); + if (!sizes) + return 0; + + if (sizes[0] == 1) + phyp_dump_info->cpu_state_size = *((unsigned long *)&sizes[1]); + + if (sizes[3] == 2) + phyp_dump_info->hpte_region_size = + *((unsigned long *)&sizes[4]); + return 1; +} + +/* Look for phyp_dump= cmdline option */ +static int __init early_phyp_dump_enabled(char *p) +{ + phyp_dump_info->phyp_dump_at_boot = 1; + + if (!p) + return 0; + + if (strncmp(p, "1", 1) == 0) + phyp_dump_info->phyp_dump_at_boot = 1; + else if (strncmp(p, "0", 1) == 0) + phyp_dump_info->phyp_dump_at_boot = 0; + + return 0; +} +early_param("phyp_dump", early_phyp_dump_enabled); + +/* Look for phyp_dump_reserve_size= cmdline option */ +static int __init early_phyp_dump_reserve_size(char *p) +{ + if (p) + phyp_dump_info->reserve_bootvar = memparse(p, &p); + + return 0; +} +early_param("phyp_dump_reserve_size", early_phyp_dump_reserve_size); diff --git a/arch/powerpc/platforms/pseries/plpar_wrappers.h b/arch/powerpc/platforms/pseries/plpar_wrappers.h new file mode 100644 index 00000000..a6921aec --- /dev/null +++ b/arch/powerpc/platforms/pseries/plpar_wrappers.h @@ -0,0 +1,273 @@ +#ifndef _PSERIES_PLPAR_WRAPPERS_H +#define _PSERIES_PLPAR_WRAPPERS_H + +#include <asm/hvcall.h> +#include <asm/page.h> + +/* Get state of physical CPU from query_cpu_stopped */ +int smp_query_cpu_stopped(unsigned int pcpu); +#define QCSS_STOPPED 0 +#define QCSS_STOPPING 1 +#define QCSS_NOT_STOPPED 2 +#define QCSS_HARDWARE_ERROR -1 +#define QCSS_HARDWARE_BUSY -2 + +static inline long poll_pending(void) +{ + return plpar_hcall_norets(H_POLL_PENDING); +} + +static inline u8 get_cede_latency_hint(void) +{ + return get_lppaca()->gpr5_dword.fields.cede_latency_hint; +} + +static inline void set_cede_latency_hint(u8 latency_hint) +{ + get_lppaca()->gpr5_dword.fields.cede_latency_hint = latency_hint; +} + +static inline long cede_processor(void) +{ + return plpar_hcall_norets(H_CEDE); +} + +static inline long extended_cede_processor(unsigned long latency_hint) +{ + long rc; + u8 old_latency_hint = get_cede_latency_hint(); + + set_cede_latency_hint(latency_hint); + rc = cede_processor(); + set_cede_latency_hint(old_latency_hint); + + return rc; +} + +static inline long vpa_call(unsigned long flags, unsigned long cpu, + unsigned long vpa) +{ + /* flags are in bits 16-18 (counting from most significant bit) */ + flags = flags << (63 - 18); + + return plpar_hcall_norets(H_REGISTER_VPA, flags, cpu, vpa); +} + +static inline long unregister_vpa(unsigned long cpu, unsigned long vpa) +{ + return vpa_call(0x5, cpu, vpa); +} + +static inline long register_vpa(unsigned long cpu, unsigned long vpa) +{ + return vpa_call(0x1, cpu, vpa); +} + +static inline long unregister_slb_shadow(unsigned long cpu, unsigned long vpa) +{ + return vpa_call(0x7, cpu, vpa); +} + +static inline long register_slb_shadow(unsigned long cpu, unsigned long vpa) +{ + return vpa_call(0x3, cpu, vpa); +} + +static inline long unregister_dtl(unsigned long cpu) +{ + return vpa_call(0x6, cpu, 0); +} + +static inline long register_dtl(unsigned long cpu, unsigned long vpa) +{ + return vpa_call(0x2, cpu, vpa); +} + +static inline long plpar_page_set_loaned(unsigned long vpa) +{ + unsigned long cmo_page_sz = cmo_get_page_size(); + long rc = 0; + int i; + + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) + rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, vpa + i, 0); + + for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, + vpa + i - cmo_page_sz, 0); + + return rc; +} + +static inline long plpar_page_set_active(unsigned long vpa) +{ + unsigned long cmo_page_sz = cmo_get_page_size(); + long rc = 0; + int i; + + for (i = 0; !rc && i < PAGE_SIZE; i += cmo_page_sz) + rc = plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_ACTIVE, vpa + i, 0); + + for (i -= cmo_page_sz; rc && i != 0; i -= cmo_page_sz) + plpar_hcall_norets(H_PAGE_INIT, H_PAGE_SET_LOANED, + vpa + i - cmo_page_sz, 0); + + return rc; +} + +extern void vpa_init(int cpu); + +static inline long plpar_pte_enter(unsigned long flags, + unsigned long hpte_group, unsigned long hpte_v, + unsigned long hpte_r, unsigned long *slot) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_ENTER, retbuf, flags, hpte_group, hpte_v, hpte_r); + + *slot = retbuf[0]; + + return rc; +} + +static inline long plpar_pte_remove(unsigned long flags, unsigned long ptex, + unsigned long avpn, unsigned long *old_pteh_ret, + unsigned long *old_ptel_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_REMOVE, retbuf, flags, ptex, avpn); + + *old_pteh_ret = retbuf[0]; + *old_ptel_ret = retbuf[1]; + + return rc; +} + +/* plpar_pte_remove_raw can be called in real mode. It calls plpar_hcall_raw */ +static inline long plpar_pte_remove_raw(unsigned long flags, unsigned long ptex, + unsigned long avpn, unsigned long *old_pteh_ret, + unsigned long *old_ptel_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall_raw(H_REMOVE, retbuf, flags, ptex, avpn); + + *old_pteh_ret = retbuf[0]; + *old_ptel_ret = retbuf[1]; + + return rc; +} + +static inline long plpar_pte_read(unsigned long flags, unsigned long ptex, + unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_READ, retbuf, flags, ptex); + + *old_pteh_ret = retbuf[0]; + *old_ptel_ret = retbuf[1]; + + return rc; +} + +/* plpar_pte_read_raw can be called in real mode. It calls plpar_hcall_raw */ +static inline long plpar_pte_read_raw(unsigned long flags, unsigned long ptex, + unsigned long *old_pteh_ret, unsigned long *old_ptel_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall_raw(H_READ, retbuf, flags, ptex); + + *old_pteh_ret = retbuf[0]; + *old_ptel_ret = retbuf[1]; + + return rc; +} + +/* + * plpar_pte_read_4_raw can be called in real mode. + * ptes must be 8*sizeof(unsigned long) + */ +static inline long plpar_pte_read_4_raw(unsigned long flags, unsigned long ptex, + unsigned long *ptes) + +{ + long rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + + rc = plpar_hcall9_raw(H_READ, retbuf, flags | H_READ_4, ptex); + + memcpy(ptes, retbuf, 8*sizeof(unsigned long)); + + return rc; +} + +static inline long plpar_pte_protect(unsigned long flags, unsigned long ptex, + unsigned long avpn) +{ + return plpar_hcall_norets(H_PROTECT, flags, ptex, avpn); +} + +static inline long plpar_tce_get(unsigned long liobn, unsigned long ioba, + unsigned long *tce_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_GET_TCE, retbuf, liobn, ioba); + + *tce_ret = retbuf[0]; + + return rc; +} + +static inline long plpar_tce_put(unsigned long liobn, unsigned long ioba, + unsigned long tceval) +{ + return plpar_hcall_norets(H_PUT_TCE, liobn, ioba, tceval); +} + +static inline long plpar_tce_put_indirect(unsigned long liobn, + unsigned long ioba, unsigned long page, unsigned long count) +{ + return plpar_hcall_norets(H_PUT_TCE_INDIRECT, liobn, ioba, page, count); +} + +static inline long plpar_tce_stuff(unsigned long liobn, unsigned long ioba, + unsigned long tceval, unsigned long count) +{ + return plpar_hcall_norets(H_STUFF_TCE, liobn, ioba, tceval, count); +} + +static inline long plpar_get_term_char(unsigned long termno, + unsigned long *len_ret, char *buf_ret) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + unsigned long *lbuf = (unsigned long *)buf_ret; /* TODO: alignment? */ + + rc = plpar_hcall(H_GET_TERM_CHAR, retbuf, termno); + + *len_ret = retbuf[0]; + lbuf[0] = retbuf[1]; + lbuf[1] = retbuf[2]; + + return rc; +} + +static inline long plpar_put_term_char(unsigned long termno, unsigned long len, + const char *buffer) +{ + unsigned long *lbuf = (unsigned long *)buffer; /* TODO: alignment? */ + return plpar_hcall_norets(H_PUT_TERM_CHAR, termno, len, lbuf[0], + lbuf[1]); +} + +#endif /* _PSERIES_PLPAR_WRAPPERS_H */ diff --git a/arch/powerpc/platforms/pseries/power.c b/arch/powerpc/platforms/pseries/power.c new file mode 100644 index 00000000..6d626623 --- /dev/null +++ b/arch/powerpc/platforms/pseries/power.c @@ -0,0 +1,81 @@ +/* + * Interface for power-management for ppc64 compliant platform + * + * Manish Ahuja <mahuja@us.ibm.com> + * + * Feb 2007 + * + * Copyright (C) 2007 IBM Corporation. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; version 2 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/kobject.h> +#include <linux/string.h> +#include <linux/errno.h> +#include <linux/init.h> + +unsigned long rtas_poweron_auto; /* default and normal state is 0 */ + +static ssize_t auto_poweron_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%lu\n", rtas_poweron_auto); +} + +static ssize_t auto_poweron_store(struct kobject *kobj, + struct kobj_attribute *attr, + const char *buf, size_t n) +{ + int ret; + unsigned long ups_restart; + ret = sscanf(buf, "%lu", &ups_restart); + + if ((ret == 1) && ((ups_restart == 1) || (ups_restart == 0))){ + rtas_poweron_auto = ups_restart; + return n; + } + return -EINVAL; +} + +static struct kobj_attribute auto_poweron_attr = + __ATTR(auto_poweron, 0644, auto_poweron_show, auto_poweron_store); + +#ifndef CONFIG_PM +struct kobject *power_kobj; + +static struct attribute *g[] = { + &auto_poweron_attr.attr, + NULL, +}; + +static struct attribute_group attr_group = { + .attrs = g, +}; + +static int __init pm_init(void) +{ + power_kobj = kobject_create_and_add("power", NULL); + if (!power_kobj) + return -ENOMEM; + return sysfs_create_group(power_kobj, &attr_group); +} +core_initcall(pm_init); +#else +static int __init apo_pm_init(void) +{ + return (sysfs_create_file(power_kobj, &auto_poweron_attr.attr)); +} +__initcall(apo_pm_init); +#endif diff --git a/arch/powerpc/platforms/pseries/pseries.h b/arch/powerpc/platforms/pseries/pseries.h new file mode 100644 index 00000000..e9f6d285 --- /dev/null +++ b/arch/powerpc/platforms/pseries/pseries.h @@ -0,0 +1,59 @@ +/* + * Copyright 2006 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#ifndef _PSERIES_PSERIES_H +#define _PSERIES_PSERIES_H + +#include <linux/interrupt.h> + +struct device_node; + +extern void request_event_sources_irqs(struct device_node *np, + irq_handler_t handler, const char *name); + +#include <linux/of.h> + +extern void __init fw_feature_init(const char *hypertas, unsigned long len); + +struct pt_regs; + +extern int pSeries_system_reset_exception(struct pt_regs *regs); +extern int pSeries_machine_check_exception(struct pt_regs *regs); + +#ifdef CONFIG_SMP +extern void smp_init_pseries_mpic(void); +extern void smp_init_pseries_xics(void); +#else +static inline void smp_init_pseries_mpic(void) { }; +static inline void smp_init_pseries_xics(void) { }; +#endif + +#ifdef CONFIG_KEXEC +extern void setup_kexec_cpu_down_xics(void); +extern void setup_kexec_cpu_down_mpic(void); +#else +static inline void setup_kexec_cpu_down_xics(void) { } +static inline void setup_kexec_cpu_down_mpic(void) { } +#endif + +extern void pSeries_final_fixup(void); + +/* Poweron flag used for enabling auto ups restart */ +extern unsigned long rtas_poweron_auto; + +extern void find_udbg_vterm(void); + +/* Dynamic logical Partitioning/Mobility */ +extern void dlpar_free_cc_nodes(struct device_node *); +extern void dlpar_free_cc_property(struct property *); +extern struct device_node *dlpar_configure_connector(u32); +extern int dlpar_attach_node(struct device_node *); +extern int dlpar_detach_node(struct device_node *); + +#endif /* _PSERIES_PSERIES_H */ diff --git a/arch/powerpc/platforms/pseries/pseries_energy.c b/arch/powerpc/platforms/pseries/pseries_energy.c new file mode 100644 index 00000000..c8b3c69f --- /dev/null +++ b/arch/powerpc/platforms/pseries/pseries_energy.c @@ -0,0 +1,326 @@ +/* + * POWER platform energy management driver + * Copyright (C) 2010 IBM Corporation + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * version 2 as published by the Free Software Foundation. + * + * This pseries platform device driver provides access to + * platform energy management capabilities. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/init.h> +#include <linux/seq_file.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> +#include <linux/of.h> +#include <asm/cputhreads.h> +#include <asm/page.h> +#include <asm/hvcall.h> + + +#define MODULE_VERS "1.0" +#define MODULE_NAME "pseries_energy" + +/* Driver flags */ + +static int sysfs_entries; + +/* Helper routines */ + +/* + * Routine to detect firmware support for hcall + * return 1 if H_BEST_ENERGY is supported + * else return 0 + */ + +static int check_for_h_best_energy(void) +{ + struct device_node *rtas = NULL; + const char *hypertas, *s; + int length; + int rc = 0; + + rtas = of_find_node_by_path("/rtas"); + if (!rtas) + return 0; + + hypertas = of_get_property(rtas, "ibm,hypertas-functions", &length); + if (!hypertas) { + of_node_put(rtas); + return 0; + } + + /* hypertas will have list of strings with hcall names */ + for (s = hypertas; s < hypertas + length; s += strlen(s) + 1) { + if (!strncmp("hcall-best-energy-1", s, 19)) { + rc = 1; /* Found the string */ + break; + } + } + of_node_put(rtas); + return rc; +} + +/* Helper Routines to convert between drc_index to cpu numbers */ + +static u32 cpu_to_drc_index(int cpu) +{ + struct device_node *dn = NULL; + const int *indexes; + int i; + int rc = 1; + u32 ret = 0; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* Convert logical cpu number to core number */ + i = cpu_core_index_of_thread(cpu); + /* + * The first element indexes[0] is the number of drc_indexes + * returned in the list. Hence i+1 will get the drc_index + * corresponding to core number i. + */ + WARN_ON(i > indexes[0]); + ret = indexes[i + 1]; + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "cpu_to_drc_index(%d) failed", cpu); + return ret; +} + +static int drc_index_to_cpu(u32 drc_index) +{ + struct device_node *dn = NULL; + const int *indexes; + int i, cpu = 0; + int rc = 1; + + dn = of_find_node_by_path("/cpus"); + if (dn == NULL) + goto err; + indexes = of_get_property(dn, "ibm,drc-indexes", NULL); + if (indexes == NULL) + goto err_of_node_put; + /* + * First element in the array is the number of drc_indexes + * returned. Search through the list to find the matching + * drc_index and get the core number + */ + for (i = 0; i < indexes[0]; i++) { + if (indexes[i + 1] == drc_index) + break; + } + /* Convert core number to logical cpu number */ + cpu = cpu_first_thread_of_core(i); + rc = 0; + +err_of_node_put: + of_node_put(dn); +err: + if (rc) + printk(KERN_WARNING "drc_index_to_cpu(%d) failed", drc_index); + return cpu; +} + +/* + * pseries hypervisor call H_BEST_ENERGY provides hints to OS on + * preferred logical cpus to activate or deactivate for optimized + * energy consumption. + */ + +#define FLAGS_MODE1 0x004E200000080E01 +#define FLAGS_MODE2 0x004E200000080401 +#define FLAGS_ACTIVATE 0x100 + +static ssize_t get_best_energy_list(char *page, int activate) +{ + int rc, cnt, i, cpu; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + u32 *buf_page; + char *s = page; + + buf_page = (u32 *) get_zeroed_page(GFP_KERNEL); + if (!buf_page) + return -ENOMEM; + + flags = FLAGS_MODE1; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, 0, __pa(buf_page), + 0, 0, 0, 0, 0, 0); + if (rc != H_SUCCESS) { + free_page((unsigned long) buf_page); + return -EINVAL; + } + + cnt = retbuf[0]; + for (i = 0; i < cnt; i++) { + cpu = drc_index_to_cpu(buf_page[2*i+1]); + if ((cpu_online(cpu) && !activate) || + (!cpu_online(cpu) && activate)) + s += sprintf(s, "%d,", cpu); + } + if (s > page) { /* Something to show */ + s--; /* Suppress last comma */ + s += sprintf(s, "\n"); + } + + free_page((unsigned long) buf_page); + return s-page; +} + +static ssize_t get_best_energy_data(struct sys_device *dev, + char *page, int activate) +{ + int rc; + unsigned long retbuf[PLPAR_HCALL9_BUFSIZE]; + unsigned long flags = 0; + + flags = FLAGS_MODE2; + if (activate) + flags |= FLAGS_ACTIVATE; + + rc = plpar_hcall9(H_BEST_ENERGY, retbuf, flags, + cpu_to_drc_index(dev->id), + 0, 0, 0, 0, 0, 0, 0); + + if (rc != H_SUCCESS) + return -EINVAL; + + return sprintf(page, "%lu\n", retbuf[1] >> 32); +} + +/* Wrapper functions */ + +static ssize_t cpu_activate_hint_list_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) +{ + return get_best_energy_list(page, 1); +} + +static ssize_t cpu_deactivate_hint_list_show(struct sysdev_class *class, + struct sysdev_class_attribute *attr, char *page) +{ + return get_best_energy_list(page, 0); +} + +static ssize_t percpu_activate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 1); +} + +static ssize_t percpu_deactivate_hint_show(struct sys_device *dev, + struct sysdev_attribute *attr, char *page) +{ + return get_best_energy_data(dev, page, 0); +} + +/* + * Create sysfs interface: + * /sys/devices/system/cpu/pseries_activate_hint_list + * /sys/devices/system/cpu/pseries_deactivate_hint_list + * Comma separated list of cpus to activate or deactivate + * /sys/devices/system/cpu/cpuN/pseries_activate_hint + * /sys/devices/system/cpu/cpuN/pseries_deactivate_hint + * Per-cpu value of the hint + */ + +struct sysdev_class_attribute attr_cpu_activate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_activate_hint_list, 0444, + cpu_activate_hint_list_show, NULL); + +struct sysdev_class_attribute attr_cpu_deactivate_hint_list = + _SYSDEV_CLASS_ATTR(pseries_deactivate_hint_list, 0444, + cpu_deactivate_hint_list_show, NULL); + +struct sysdev_attribute attr_percpu_activate_hint = + _SYSDEV_ATTR(pseries_activate_hint, 0444, + percpu_activate_hint_show, NULL); + +struct sysdev_attribute attr_percpu_deactivate_hint = + _SYSDEV_ATTR(pseries_deactivate_hint, 0444, + percpu_deactivate_hint_show, NULL); + +static int __init pseries_energy_init(void) +{ + int cpu, err; + struct sys_device *cpu_sys_dev; + + if (!check_for_h_best_energy()) { + printk(KERN_INFO "Hypercall H_BEST_ENERGY not supported\n"); + return 0; + } + /* Create the sysfs files */ + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + if (!err) + err = sysfs_create_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + if (err) + return err; + for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + if (err) + break; + err = sysfs_create_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + if (err) + break; + } + + if (err) + return err; + + sysfs_entries = 1; /* Removed entries on cleanup */ + return 0; + +} + +static void __exit pseries_energy_cleanup(void) +{ + int cpu; + struct sys_device *cpu_sys_dev; + + if (!sysfs_entries) + return; + + /* Remove the sysfs files */ + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_activate_hint_list.attr); + + sysfs_remove_file(&cpu_sysdev_class.kset.kobj, + &attr_cpu_deactivate_hint_list.attr); + + for_each_possible_cpu(cpu) { + cpu_sys_dev = get_cpu_sysdev(cpu); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_activate_hint.attr); + sysfs_remove_file(&cpu_sys_dev->kobj, + &attr_percpu_deactivate_hint.attr); + } +} + +module_init(pseries_energy_init); +module_exit(pseries_energy_cleanup); +MODULE_DESCRIPTION("Driver for pSeries platform energy management"); +MODULE_AUTHOR("Vaidyanathan Srinivasan"); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/ras.c b/arch/powerpc/platforms/pseries/ras.c new file mode 100644 index 00000000..086d2ae4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/ras.c @@ -0,0 +1,347 @@ +/* + * Copyright (C) 2001 Dave Engebretsen IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +/* Change Activity: + * 2001/09/21 : engebret : Created with minimal EPOW and HW exception support. + * End Change Activity + */ + +#include <linux/errno.h> +#include <linux/threads.h> +#include <linux/kernel_stat.h> +#include <linux/signal.h> +#include <linux/sched.h> +#include <linux/ioport.h> +#include <linux/interrupt.h> +#include <linux/timex.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/random.h> +#include <linux/sysrq.h> +#include <linux/bitops.h> + +#include <asm/uaccess.h> +#include <asm/system.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/irq.h> +#include <asm/cache.h> +#include <asm/prom.h> +#include <asm/ptrace.h> +#include <asm/machdep.h> +#include <asm/rtas.h> +#include <asm/udbg.h> +#include <asm/firmware.h> + +#include "pseries.h" + +static unsigned char ras_log_buf[RTAS_ERROR_LOG_MAX]; +static DEFINE_SPINLOCK(ras_log_buf_lock); + +static char global_mce_data_buf[RTAS_ERROR_LOG_MAX]; +static DEFINE_PER_CPU(__u64, mce_data_buf); + +static int ras_get_sensor_state_token; +static int ras_check_exception_token; + +#define EPOW_SENSOR_TOKEN 9 +#define EPOW_SENSOR_INDEX 0 + +static irqreturn_t ras_epow_interrupt(int irq, void *dev_id); +static irqreturn_t ras_error_interrupt(int irq, void *dev_id); + + +/* + * Initialize handlers for the set of interrupts caused by hardware errors + * and power system events. + */ +static int __init init_ras_IRQ(void) +{ + struct device_node *np; + + ras_get_sensor_state_token = rtas_token("get-sensor-state"); + ras_check_exception_token = rtas_token("check-exception"); + + /* Internal Errors */ + np = of_find_node_by_path("/event-sources/internal-errors"); + if (np != NULL) { + request_event_sources_irqs(np, ras_error_interrupt, + "RAS_ERROR"); + of_node_put(np); + } + + /* EPOW Events */ + np = of_find_node_by_path("/event-sources/epow-events"); + if (np != NULL) { + request_event_sources_irqs(np, ras_epow_interrupt, "RAS_EPOW"); + of_node_put(np); + } + + return 0; +} +__initcall(init_ras_IRQ); + +/* + * Handle power subsystem events (EPOW). + * + * Presently we just log the event has occurred. This should be fixed + * to examine the type of power failure and take appropriate action where + * the time horizon permits something useful to be done. + */ +static irqreturn_t ras_epow_interrupt(int irq, void *dev_id) +{ + int status = 0xdeadbeef; + int state = 0; + int critical; + + status = rtas_call(ras_get_sensor_state_token, 2, 2, &state, + EPOW_SENSOR_TOKEN, EPOW_SENSOR_INDEX); + + if (state > 3) + critical = 1; /* Time Critical */ + else + critical = 0; + + spin_lock(&ras_log_buf_lock); + + status = rtas_call(ras_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_EPOW_WARNING | RTAS_POWERMGM_EVENTS, + critical, __pa(&ras_log_buf), + rtas_get_error_log_max()); + + udbg_printf("EPOW <0x%lx 0x%x 0x%x>\n", + *((unsigned long *)&ras_log_buf), status, state); + printk(KERN_WARNING "EPOW <0x%lx 0x%x 0x%x>\n", + *((unsigned long *)&ras_log_buf), status, state); + + /* format and print the extended information */ + log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, 0); + + spin_unlock(&ras_log_buf_lock); + return IRQ_HANDLED; +} + +/* + * Handle hardware error interrupts. + * + * RTAS check-exception is called to collect data on the exception. If + * the error is deemed recoverable, we log a warning and return. + * For nonrecoverable errors, an error is logged and we stop all processing + * as quickly as possible in order to prevent propagation of the failure. + */ +static irqreturn_t ras_error_interrupt(int irq, void *dev_id) +{ + struct rtas_error_log *rtas_elog; + int status = 0xdeadbeef; + int fatal; + + spin_lock(&ras_log_buf_lock); + + status = rtas_call(ras_check_exception_token, 6, 1, NULL, + RTAS_VECTOR_EXTERNAL_INTERRUPT, + virq_to_hw(irq), + RTAS_INTERNAL_ERROR, 1 /*Time Critical */, + __pa(&ras_log_buf), + rtas_get_error_log_max()); + + rtas_elog = (struct rtas_error_log *)ras_log_buf; + + if ((status == 0) && (rtas_elog->severity >= RTAS_SEVERITY_ERROR_SYNC)) + fatal = 1; + else + fatal = 0; + + /* format and print the extended information */ + log_error(ras_log_buf, ERR_TYPE_RTAS_LOG, fatal); + + if (fatal) { + udbg_printf("Fatal HW Error <0x%lx 0x%x>\n", + *((unsigned long *)&ras_log_buf), status); + printk(KERN_EMERG "Error: Fatal hardware error <0x%lx 0x%x>\n", + *((unsigned long *)&ras_log_buf), status); + +#ifndef DEBUG_RTAS_POWER_OFF + /* Don't actually power off when debugging so we can test + * without actually failing while injecting errors. + * Error data will not be logged to syslog. + */ + ppc_md.power_off(); +#endif + } else { + udbg_printf("Recoverable HW Error <0x%lx 0x%x>\n", + *((unsigned long *)&ras_log_buf), status); + printk(KERN_WARNING + "Warning: Recoverable hardware error <0x%lx 0x%x>\n", + *((unsigned long *)&ras_log_buf), status); + } + + spin_unlock(&ras_log_buf_lock); + return IRQ_HANDLED; +} + +/* + * Some versions of FWNMI place the buffer inside the 4kB page starting at + * 0x7000. Other versions place it inside the rtas buffer. We check both. + */ +#define VALID_FWNMI_BUFFER(A) \ + ((((A) >= 0x7000) && ((A) < 0x7ff0)) || \ + (((A) >= rtas.base) && ((A) < (rtas.base + rtas.size - 16)))) + +/* + * Get the error information for errors coming through the + * FWNMI vectors. The pt_regs' r3 will be updated to reflect + * the actual r3 if possible, and a ptr to the error log entry + * will be returned if found. + * + * If the RTAS error is not of the extended type, then we put it in a per + * cpu 64bit buffer. If it is the extended type we use global_mce_data_buf. + * + * The global_mce_data_buf does not have any locks or protection around it, + * if a second machine check comes in, or a system reset is done + * before we have logged the error, then we will get corruption in the + * error log. This is preferable over holding off on calling + * ibm,nmi-interlock which would result in us checkstopping if a + * second machine check did come in. + */ +static struct rtas_error_log *fwnmi_get_errinfo(struct pt_regs *regs) +{ + unsigned long *savep; + struct rtas_error_log *h, *errhdr = NULL; + + if (!VALID_FWNMI_BUFFER(regs->gpr[3])) { + printk(KERN_ERR "FWNMI: corrupt r3 0x%016lx\n", regs->gpr[3]); + return NULL; + } + + savep = __va(regs->gpr[3]); + regs->gpr[3] = savep[0]; /* restore original r3 */ + + /* If it isn't an extended log we can use the per cpu 64bit buffer */ + h = (struct rtas_error_log *)&savep[1]; + if (!h->extended) { + memcpy(&__get_cpu_var(mce_data_buf), h, sizeof(__u64)); + errhdr = (struct rtas_error_log *)&__get_cpu_var(mce_data_buf); + } else { + int len; + + len = max_t(int, 8+h->extended_log_length, RTAS_ERROR_LOG_MAX); + memset(global_mce_data_buf, 0, RTAS_ERROR_LOG_MAX); + memcpy(global_mce_data_buf, h, len); + errhdr = (struct rtas_error_log *)global_mce_data_buf; + } + + return errhdr; +} + +/* Call this when done with the data returned by FWNMI_get_errinfo. + * It will release the saved data area for other CPUs in the + * partition to receive FWNMI errors. + */ +static void fwnmi_release_errinfo(void) +{ + int ret = rtas_call(rtas_token("ibm,nmi-interlock"), 0, 1, NULL); + if (ret != 0) + printk(KERN_ERR "FWNMI: nmi-interlock failed: %d\n", ret); +} + +int pSeries_system_reset_exception(struct pt_regs *regs) +{ + if (fwnmi_active) { + struct rtas_error_log *errhdr = fwnmi_get_errinfo(regs); + if (errhdr) { + /* XXX Should look at FWNMI information */ + } + fwnmi_release_errinfo(); + } + return 0; /* need to perform reset */ +} + +/* + * See if we can recover from a machine check exception. + * This is only called on power4 (or above) and only via + * the Firmware Non-Maskable Interrupts (fwnmi) handler + * which provides the error analysis for us. + * + * Return 1 if corrected (or delivered a signal). + * Return 0 if there is nothing we can do. + */ +static int recover_mce(struct pt_regs *regs, struct rtas_error_log *err) +{ + int recovered = 0; + + if (!(regs->msr & MSR_RI)) { + /* If MSR_RI isn't set, we cannot recover */ + recovered = 0; + + } else if (err->disposition == RTAS_DISP_FULLY_RECOVERED) { + /* Platform corrected itself */ + recovered = 1; + + } else if (err->disposition == RTAS_DISP_LIMITED_RECOVERY) { + /* Platform corrected itself but could be degraded */ + printk(KERN_ERR "MCE: limited recovery, system may " + "be degraded\n"); + recovered = 1; + + } else if (user_mode(regs) && !is_global_init(current) && + err->severity == RTAS_SEVERITY_ERROR_SYNC) { + + /* + * If we received a synchronous error when in userspace + * kill the task. Firmware may report details of the fail + * asynchronously, so we can't rely on the target and type + * fields being valid here. + */ + printk(KERN_ERR "MCE: uncorrectable error, killing task " + "%s:%d\n", current->comm, current->pid); + + _exception(SIGBUS, regs, BUS_MCEERR_AR, regs->nip); + recovered = 1; + } + + log_error((char *)err, ERR_TYPE_RTAS_LOG, 0); + + return recovered; +} + +/* + * Handle a machine check. + * + * Note that on Power 4 and beyond Firmware Non-Maskable Interrupts (fwnmi) + * should be present. If so the handler which called us tells us if the + * error was recovered (never true if RI=0). + * + * On hardware prior to Power 4 these exceptions were asynchronous which + * means we can't tell exactly where it occurred and so we can't recover. + */ +int pSeries_machine_check_exception(struct pt_regs *regs) +{ + struct rtas_error_log *errp; + + if (fwnmi_active) { + errp = fwnmi_get_errinfo(regs); + fwnmi_release_errinfo(); + if (errp && recover_mce(regs, errp)) + return 1; + } + + return 0; +} diff --git a/arch/powerpc/platforms/pseries/reconfig.c b/arch/powerpc/platforms/pseries/reconfig.c new file mode 100644 index 00000000..1de2cbb9 --- /dev/null +++ b/arch/powerpc/platforms/pseries/reconfig.c @@ -0,0 +1,560 @@ +/* + * pSeries_reconfig.c - support for dynamic reconfiguration (including PCI + * Hotplug and Dynamic Logical Partitioning on RPA platforms). + * + * Copyright (C) 2005 Nathan Lynch + * Copyright (C) 2005 IBM Corporation + * + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + */ + +#include <linux/kernel.h> +#include <linux/kref.h> +#include <linux/notifier.h> +#include <linux/proc_fs.h> +#include <linux/slab.h> + +#include <asm/prom.h> +#include <asm/machdep.h> +#include <asm/uaccess.h> +#include <asm/pSeries_reconfig.h> +#include <asm/mmu.h> + + + +/* + * Routines for "runtime" addition and removal of device tree nodes. + */ +#ifdef CONFIG_PROC_DEVICETREE +/* + * Add a node to /proc/device-tree. + */ +static void add_node_proc_entries(struct device_node *np) +{ + struct proc_dir_entry *ent; + + ent = proc_mkdir(strrchr(np->full_name, '/') + 1, np->parent->pde); + if (ent) + proc_device_tree_add_node(np, ent); +} + +static void remove_node_proc_entries(struct device_node *np) +{ + struct property *pp = np->properties; + struct device_node *parent = np->parent; + + while (pp) { + remove_proc_entry(pp->name, np->pde); + pp = pp->next; + } + if (np->pde) + remove_proc_entry(np->pde->name, parent->pde); +} +#else /* !CONFIG_PROC_DEVICETREE */ +static void add_node_proc_entries(struct device_node *np) +{ + return; +} + +static void remove_node_proc_entries(struct device_node *np) +{ + return; +} +#endif /* CONFIG_PROC_DEVICETREE */ + +/** + * derive_parent - basically like dirname(1) + * @path: the full_name of a node to be added to the tree + * + * Returns the node which should be the parent of the node + * described by path. E.g., for path = "/foo/bar", returns + * the node with full_name = "/foo". + */ +static struct device_node *derive_parent(const char *path) +{ + struct device_node *parent = NULL; + char *parent_path = "/"; + size_t parent_path_len = strrchr(path, '/') - path + 1; + + /* reject if path is "/" */ + if (!strcmp(path, "/")) + return ERR_PTR(-EINVAL); + + if (strrchr(path, '/') != path) { + parent_path = kmalloc(parent_path_len, GFP_KERNEL); + if (!parent_path) + return ERR_PTR(-ENOMEM); + strlcpy(parent_path, path, parent_path_len); + } + parent = of_find_node_by_path(parent_path); + if (!parent) + return ERR_PTR(-EINVAL); + if (strcmp(parent_path, "/")) + kfree(parent_path); + return parent; +} + +BLOCKING_NOTIFIER_HEAD(pSeries_reconfig_chain); + +int pSeries_reconfig_notifier_register(struct notifier_block *nb) +{ + return blocking_notifier_chain_register(&pSeries_reconfig_chain, nb); +} + +void pSeries_reconfig_notifier_unregister(struct notifier_block *nb) +{ + blocking_notifier_chain_unregister(&pSeries_reconfig_chain, nb); +} + +static int pSeries_reconfig_add_node(const char *path, struct property *proplist) +{ + struct device_node *np; + int err = -ENOMEM; + + np = kzalloc(sizeof(*np), GFP_KERNEL); + if (!np) + goto out_err; + + np->full_name = kstrdup(path, GFP_KERNEL); + if (!np->full_name) + goto out_err; + + np->properties = proplist; + of_node_set_flag(np, OF_DYNAMIC); + kref_init(&np->kref); + + np->parent = derive_parent(path); + if (IS_ERR(np->parent)) { + err = PTR_ERR(np->parent); + goto out_err; + } + + err = blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_ADD, np); + if (err == NOTIFY_BAD) { + printk(KERN_ERR "Failed to add device node %s\n", path); + err = -ENOMEM; /* For now, safe to assume kmalloc failure */ + goto out_err; + } + + of_attach_node(np); + + add_node_proc_entries(np); + + of_node_put(np->parent); + + return 0; + +out_err: + if (np) { + of_node_put(np->parent); + kfree(np->full_name); + kfree(np); + } + return err; +} + +static int pSeries_reconfig_remove_node(struct device_node *np) +{ + struct device_node *parent, *child; + + parent = of_get_parent(np); + if (!parent) + return -EINVAL; + + if ((child = of_get_next_child(np, NULL))) { + of_node_put(child); + of_node_put(parent); + return -EBUSY; + } + + remove_node_proc_entries(np); + + blocking_notifier_call_chain(&pSeries_reconfig_chain, + PSERIES_RECONFIG_REMOVE, np); + of_detach_node(np); + + of_node_put(parent); + of_node_put(np); /* Must decrement the refcount */ + return 0; +} + +/* + * /proc/powerpc/ofdt - yucky binary interface for adding and removing + * OF device nodes. Should be deprecated as soon as we get an + * in-kernel wrapper for the RTAS ibm,configure-connector call. + */ + +static void release_prop_list(const struct property *prop) +{ + struct property *next; + for (; prop; prop = next) { + next = prop->next; + kfree(prop->name); + kfree(prop->value); + kfree(prop); + } + +} + +/** + * parse_next_property - process the next property from raw input buffer + * @buf: input buffer, must be nul-terminated + * @end: end of the input buffer + 1, for validation + * @name: return value; set to property name in buf + * @length: return value; set to length of value + * @value: return value; set to the property value in buf + * + * Note that the caller must make copies of the name and value returned, + * this function does no allocation or copying of the data. Return value + * is set to the next name in buf, or NULL on error. + */ +static char * parse_next_property(char *buf, char *end, char **name, int *length, + unsigned char **value) +{ + char *tmp; + + *name = buf; + + tmp = strchr(buf, ' '); + if (!tmp) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + *tmp = '\0'; + + if (++tmp >= end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + + /* now we're on the length */ + *length = -1; + *length = simple_strtoul(tmp, &tmp, 10); + if (*length == -1) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + if (*tmp != ' ' || ++tmp >= end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + + /* now we're on the value */ + *value = tmp; + tmp += *length; + if (tmp > end) { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + else if (tmp < end && *tmp != ' ' && *tmp != '\0') { + printk(KERN_ERR "property parse failed in %s at line %d\n", + __func__, __LINE__); + return NULL; + } + tmp++; + + /* and now we should be on the next name, or the end */ + return tmp; +} + +static struct property *new_property(const char *name, const int length, + const unsigned char *value, struct property *last) +{ + struct property *new = kzalloc(sizeof(*new), GFP_KERNEL); + + if (!new) + return NULL; + + if (!(new->name = kmalloc(strlen(name) + 1, GFP_KERNEL))) + goto cleanup; + if (!(new->value = kmalloc(length + 1, GFP_KERNEL))) + goto cleanup; + + strcpy(new->name, name); + memcpy(new->value, value, length); + *(((char *)new->value) + length) = 0; + new->length = length; + new->next = last; + return new; + +cleanup: + kfree(new->name); + kfree(new->value); + kfree(new); + return NULL; +} + +static int do_add_node(char *buf, size_t bufsize) +{ + char *path, *end, *name; + struct device_node *np; + struct property *prop = NULL; + unsigned char* value; + int length, rv = 0; + + end = buf + bufsize; + path = buf; + buf = strchr(buf, ' '); + if (!buf) + return -EINVAL; + *buf = '\0'; + buf++; + + if ((np = of_find_node_by_path(path))) { + of_node_put(np); + return -EINVAL; + } + + /* rv = build_prop_list(tmp, bufsize - (tmp - buf), &proplist); */ + while (buf < end && + (buf = parse_next_property(buf, end, &name, &length, &value))) { + struct property *last = prop; + + prop = new_property(name, length, value, last); + if (!prop) { + rv = -ENOMEM; + prop = last; + goto out; + } + } + if (!buf) { + rv = -EINVAL; + goto out; + } + + rv = pSeries_reconfig_add_node(path, prop); + +out: + if (rv) + release_prop_list(prop); + return rv; +} + +static int do_remove_node(char *buf) +{ + struct device_node *node; + int rv = -ENODEV; + + if ((node = of_find_node_by_path(buf))) + rv = pSeries_reconfig_remove_node(node); + + of_node_put(node); + return rv; +} + +static char *parse_node(char *buf, size_t bufsize, struct device_node **npp) +{ + char *handle_str; + phandle handle; + *npp = NULL; + + handle_str = buf; + + buf = strchr(buf, ' '); + if (!buf) + return NULL; + *buf = '\0'; + buf++; + + handle = simple_strtoul(handle_str, NULL, 0); + + *npp = of_find_node_by_phandle(handle); + return buf; +} + +static int do_add_property(char *buf, size_t bufsize) +{ + struct property *prop = NULL; + struct device_node *np; + unsigned char *value; + char *name, *end; + int length; + end = buf + bufsize; + buf = parse_node(buf, bufsize, &np); + + if (!np) + return -ENODEV; + + if (parse_next_property(buf, end, &name, &length, &value) == NULL) + return -EINVAL; + + prop = new_property(name, length, value, NULL); + if (!prop) + return -ENOMEM; + + prom_add_property(np, prop); + + return 0; +} + +static int do_remove_property(char *buf, size_t bufsize) +{ + struct device_node *np; + char *tmp; + struct property *prop; + buf = parse_node(buf, bufsize, &np); + + if (!np) + return -ENODEV; + + tmp = strchr(buf,' '); + if (tmp) + *tmp = '\0'; + + if (strlen(buf) == 0) + return -EINVAL; + + prop = of_find_property(np, buf, NULL); + + return prom_remove_property(np, prop); +} + +static int do_update_property(char *buf, size_t bufsize) +{ + struct device_node *np; + unsigned char *value; + char *name, *end, *next_prop; + int rc, length; + struct property *newprop, *oldprop; + buf = parse_node(buf, bufsize, &np); + end = buf + bufsize; + + if (!np) + return -ENODEV; + + next_prop = parse_next_property(buf, end, &name, &length, &value); + if (!next_prop) + return -EINVAL; + + newprop = new_property(name, length, value, NULL); + if (!newprop) + return -ENOMEM; + + if (!strcmp(name, "slb-size") || !strcmp(name, "ibm,slb-size")) + slb_set_size(*(int *)value); + + oldprop = of_find_property(np, name,NULL); + if (!oldprop) { + if (strlen(name)) + return prom_add_property(np, newprop); + return -ENODEV; + } + + rc = prom_update_property(np, newprop, oldprop); + if (rc) + return rc; + + /* For memory under the ibm,dynamic-reconfiguration-memory node + * of the device tree, adding and removing memory is just an update + * to the ibm,dynamic-memory property instead of adding/removing a + * memory node in the device tree. For these cases we still need to + * involve the notifier chain. + */ + if (!strcmp(name, "ibm,dynamic-memory")) { + int action; + + next_prop = parse_next_property(next_prop, end, &name, + &length, &value); + if (!next_prop) + return -EINVAL; + + if (!strcmp(name, "add")) + action = PSERIES_DRCONF_MEM_ADD; + else + action = PSERIES_DRCONF_MEM_REMOVE; + + rc = blocking_notifier_call_chain(&pSeries_reconfig_chain, + action, value); + if (rc == NOTIFY_BAD) { + rc = prom_update_property(np, oldprop, newprop); + return -ENOMEM; + } + } + + return 0; +} + +/** + * ofdt_write - perform operations on the Open Firmware device tree + * + * @file: not used + * @buf: command and arguments + * @count: size of the command buffer + * @off: not used + * + * Operations supported at this time are addition and removal of + * whole nodes along with their properties. Operations on individual + * properties are not implemented (yet). + */ +static ssize_t ofdt_write(struct file *file, const char __user *buf, size_t count, + loff_t *off) +{ + int rv = 0; + char *kbuf; + char *tmp; + + if (!(kbuf = kmalloc(count + 1, GFP_KERNEL))) { + rv = -ENOMEM; + goto out; + } + if (copy_from_user(kbuf, buf, count)) { + rv = -EFAULT; + goto out; + } + + kbuf[count] = '\0'; + + tmp = strchr(kbuf, ' '); + if (!tmp) { + rv = -EINVAL; + goto out; + } + *tmp = '\0'; + tmp++; + + if (!strcmp(kbuf, "add_node")) + rv = do_add_node(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "remove_node")) + rv = do_remove_node(tmp); + else if (!strcmp(kbuf, "add_property")) + rv = do_add_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "remove_property")) + rv = do_remove_property(tmp, count - (tmp - kbuf)); + else if (!strcmp(kbuf, "update_property")) + rv = do_update_property(tmp, count - (tmp - kbuf)); + else + rv = -EINVAL; +out: + kfree(kbuf); + return rv ? rv : count; +} + +static const struct file_operations ofdt_fops = { + .write = ofdt_write, + .llseek = noop_llseek, +}; + +/* create /proc/powerpc/ofdt write-only by root */ +static int proc_ppc64_create_ofdt(void) +{ + struct proc_dir_entry *ent; + + if (!machine_is(pseries)) + return 0; + + ent = proc_create("powerpc/ofdt", S_IWUSR, NULL, &ofdt_fops); + if (ent) + ent->size = 0; + + return 0; +} +__initcall(proc_ppc64_create_ofdt); diff --git a/arch/powerpc/platforms/pseries/scanlog.c b/arch/powerpc/platforms/pseries/scanlog.c new file mode 100644 index 00000000..55445729 --- /dev/null +++ b/arch/powerpc/platforms/pseries/scanlog.c @@ -0,0 +1,214 @@ +/* + * c 2001 PPC 64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * scan-log-data driver for PPC64 Todd Inglett <tinglett@vnet.ibm.com> + * + * When ppc64 hardware fails the service processor dumps internal state + * of the system. After a reboot the operating system can access a dump + * of this data using this driver. A dump exists if the device-tree + * /chosen/ibm,scan-log-data property exists. + * + * This driver exports /proc/powerpc/scan-log-dump which can be read. + * The driver supports only sequential reads. + * + * The driver looks at a write to the driver for the single word "reset". + * If given, the driver will reset the scanlog so the platform can free it. + */ + +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/proc_fs.h> +#include <linux/init.h> +#include <linux/delay.h> +#include <linux/slab.h> +#include <asm/uaccess.h> +#include <asm/rtas.h> +#include <asm/prom.h> + +#define MODULE_VERS "1.0" +#define MODULE_NAME "scanlog" + +/* Status returns from ibm,scan-log-dump */ +#define SCANLOG_COMPLETE 0 +#define SCANLOG_HWERROR -1 +#define SCANLOG_CONTINUE 1 + + +static unsigned int ibm_scan_log_dump; /* RTAS token */ +static struct proc_dir_entry *proc_ppc64_scan_log_dump; /* The proc file */ + +static ssize_t scanlog_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct inode * inode = file->f_path.dentry->d_inode; + struct proc_dir_entry *dp; + unsigned int *data; + int status; + unsigned long len, off; + unsigned int wait_time; + + dp = PDE(inode); + data = (unsigned int *)dp->data; + + if (count > RTAS_DATA_BUF_SIZE) + count = RTAS_DATA_BUF_SIZE; + + if (count < 1024) { + /* This is the min supported by this RTAS call. Rather + * than do all the buffering we insist the user code handle + * larger reads. As long as cp works... :) + */ + printk(KERN_ERR "scanlog: cannot perform a small read (%ld)\n", count); + return -EINVAL; + } + + if (!access_ok(VERIFY_WRITE, buf, count)) + return -EFAULT; + + for (;;) { + wait_time = 500; /* default wait if no data */ + spin_lock(&rtas_data_buf_lock); + memcpy(rtas_data_buf, data, RTAS_DATA_BUF_SIZE); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, + (u32) __pa(rtas_data_buf), (u32) count); + memcpy(data, rtas_data_buf, RTAS_DATA_BUF_SIZE); + spin_unlock(&rtas_data_buf_lock); + + pr_debug("scanlog: status=%d, data[0]=%x, data[1]=%x, " \ + "data[2]=%x\n", status, data[0], data[1], data[2]); + switch (status) { + case SCANLOG_COMPLETE: + pr_debug("scanlog: hit eof\n"); + return 0; + case SCANLOG_HWERROR: + pr_debug("scanlog: hardware error reading data\n"); + return -EIO; + case SCANLOG_CONTINUE: + /* We may or may not have data yet */ + len = data[1]; + off = data[2]; + if (len > 0) { + if (copy_to_user(buf, ((char *)data)+off, len)) + return -EFAULT; + return len; + } + /* Break to sleep default time */ + break; + default: + /* Assume extended busy */ + wait_time = rtas_busy_delay_time(status); + if (!wait_time) { + printk(KERN_ERR "scanlog: unknown error " \ + "from rtas: %d\n", status); + return -EIO; + } + } + /* Apparently no data yet. Wait and try again. */ + msleep_interruptible(wait_time); + } + /*NOTREACHED*/ +} + +static ssize_t scanlog_write(struct file * file, const char __user * buf, + size_t count, loff_t *ppos) +{ + char stkbuf[20]; + int status; + + if (count > 19) count = 19; + if (copy_from_user (stkbuf, buf, count)) { + return -EFAULT; + } + stkbuf[count] = 0; + + if (buf) { + if (strncmp(stkbuf, "reset", 5) == 0) { + pr_debug("scanlog: reset scanlog\n"); + status = rtas_call(ibm_scan_log_dump, 2, 1, NULL, 0, 0); + pr_debug("scanlog: rtas returns %d\n", status); + } + } + return count; +} + +static int scanlog_open(struct inode * inode, struct file * file) +{ + struct proc_dir_entry *dp = PDE(inode); + unsigned int *data = (unsigned int *)dp->data; + + if (data[0] != 0) { + /* This imperfect test stops a second copy of the + * data (or a reset while data is being copied) + */ + return -EBUSY; + } + + data[0] = 0; /* re-init so we restart the scan */ + + return 0; +} + +static int scanlog_release(struct inode * inode, struct file * file) +{ + struct proc_dir_entry *dp = PDE(inode); + unsigned int *data = (unsigned int *)dp->data; + + data[0] = 0; + + return 0; +} + +const struct file_operations scanlog_fops = { + .owner = THIS_MODULE, + .read = scanlog_read, + .write = scanlog_write, + .open = scanlog_open, + .release = scanlog_release, + .llseek = noop_llseek, +}; + +static int __init scanlog_init(void) +{ + struct proc_dir_entry *ent; + void *data; + int err = -ENOMEM; + + ibm_scan_log_dump = rtas_token("ibm,scan-log-dump"); + if (ibm_scan_log_dump == RTAS_UNKNOWN_SERVICE) + return -ENODEV; + + /* Ideally we could allocate a buffer < 4G */ + data = kzalloc(RTAS_DATA_BUF_SIZE, GFP_KERNEL); + if (!data) + goto err; + + ent = proc_create_data("powerpc/rtas/scan-log-dump", S_IRUSR, NULL, + &scanlog_fops, data); + if (!ent) + goto err; + + proc_ppc64_scan_log_dump = ent; + + return 0; +err: + kfree(data); + return err; +} + +static void __exit scanlog_cleanup(void) +{ + if (proc_ppc64_scan_log_dump) { + kfree(proc_ppc64_scan_log_dump->data); + remove_proc_entry("scan-log-dump", proc_ppc64_scan_log_dump->parent); + } +} + +module_init(scanlog_init); +module_exit(scanlog_cleanup); +MODULE_LICENSE("GPL"); diff --git a/arch/powerpc/platforms/pseries/setup.c b/arch/powerpc/platforms/pseries/setup.c new file mode 100644 index 00000000..593accee --- /dev/null +++ b/arch/powerpc/platforms/pseries/setup.c @@ -0,0 +1,720 @@ +/* + * 64-bit pSeries and RS/6000 setup code. + * + * Copyright (C) 1995 Linus Torvalds + * Adapted from 'alpha' version by Gary Thomas + * Modified by Cort Dougan (cort@cs.nmt.edu) + * Modified by PPC64 Team, IBM Corp + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +/* + * bootup setup stuff.. + */ + +#include <linux/cpu.h> +#include <linux/errno.h> +#include <linux/sched.h> +#include <linux/kernel.h> +#include <linux/mm.h> +#include <linux/stddef.h> +#include <linux/unistd.h> +#include <linux/user.h> +#include <linux/tty.h> +#include <linux/major.h> +#include <linux/interrupt.h> +#include <linux/reboot.h> +#include <linux/init.h> +#include <linux/ioport.h> +#include <linux/console.h> +#include <linux/pci.h> +#include <linux/utsname.h> +#include <linux/adb.h> +#include <linux/module.h> +#include <linux/delay.h> +#include <linux/irq.h> +#include <linux/seq_file.h> +#include <linux/root_dev.h> + +#include <asm/mmu.h> +#include <asm/processor.h> +#include <asm/io.h> +#include <asm/pgtable.h> +#include <asm/prom.h> +#include <asm/rtas.h> +#include <asm/pci-bridge.h> +#include <asm/iommu.h> +#include <asm/dma.h> +#include <asm/machdep.h> +#include <asm/irq.h> +#include <asm/time.h> +#include <asm/nvram.h> +#include <asm/pmc.h> +#include <asm/mpic.h> +#include <asm/xics.h> +#include <asm/ppc-pci.h> +#include <asm/i8259.h> +#include <asm/udbg.h> +#include <asm/smp.h> +#include <asm/firmware.h> +#include <asm/eeh.h> +#include <asm/pSeries_reconfig.h> + +#include "plpar_wrappers.h" +#include "pseries.h" + +int CMO_PrPSP = -1; +int CMO_SecPSP = -1; +unsigned long CMO_PageSize = (ASM_CONST(1) << IOMMU_PAGE_SHIFT); +EXPORT_SYMBOL(CMO_PageSize); + +int fwnmi_active; /* TRUE if an FWNMI handler is present */ + +static void pseries_shared_idle_sleep(void); +static void pseries_dedicated_idle_sleep(void); + +static struct device_node *pSeries_mpic_node; + +static void pSeries_show_cpuinfo(struct seq_file *m) +{ + struct device_node *root; + const char *model = ""; + + root = of_find_node_by_path("/"); + if (root) + model = of_get_property(root, "model", NULL); + seq_printf(m, "machine\t\t: CHRP %s\n", model); + of_node_put(root); +} + +/* Initialize firmware assisted non-maskable interrupts if + * the firmware supports this feature. + */ +static void __init fwnmi_init(void) +{ + unsigned long system_reset_addr, machine_check_addr; + + int ibm_nmi_register = rtas_token("ibm,nmi-register"); + if (ibm_nmi_register == RTAS_UNKNOWN_SERVICE) + return; + + /* If the kernel's not linked at zero we point the firmware at low + * addresses anyway, and use a trampoline to get to the real code. */ + system_reset_addr = __pa(system_reset_fwnmi) - PHYSICAL_START; + machine_check_addr = __pa(machine_check_fwnmi) - PHYSICAL_START; + + if (0 == rtas_call(ibm_nmi_register, 2, 1, NULL, system_reset_addr, + machine_check_addr)) + fwnmi_active = 1; +} + +static void pseries_8259_cascade(unsigned int irq, struct irq_desc *desc) +{ + struct irq_chip *chip = irq_desc_get_chip(desc); + unsigned int cascade_irq = i8259_irq(); + + if (cascade_irq != NO_IRQ) + generic_handle_irq(cascade_irq); + + chip->irq_eoi(&desc->irq_data); +} + +static void __init pseries_setup_i8259_cascade(void) +{ + struct device_node *np, *old, *found = NULL; + unsigned int cascade; + const u32 *addrp; + unsigned long intack = 0; + int naddr; + + for_each_node_by_type(np, "interrupt-controller") { + if (of_device_is_compatible(np, "chrp,iic")) { + found = np; + break; + } + } + + if (found == NULL) { + printk(KERN_DEBUG "pic: no ISA interrupt controller\n"); + return; + } + + cascade = irq_of_parse_and_map(found, 0); + if (cascade == NO_IRQ) { + printk(KERN_ERR "pic: failed to map cascade interrupt"); + return; + } + pr_debug("pic: cascade mapped to irq %d\n", cascade); + + for (old = of_node_get(found); old != NULL ; old = np) { + np = of_get_parent(old); + of_node_put(old); + if (np == NULL) + break; + if (strcmp(np->name, "pci") != 0) + continue; + addrp = of_get_property(np, "8259-interrupt-acknowledge", NULL); + if (addrp == NULL) + continue; + naddr = of_n_addr_cells(np); + intack = addrp[naddr-1]; + if (naddr > 1) + intack |= ((unsigned long)addrp[naddr-2]) << 32; + } + if (intack) + printk(KERN_DEBUG "pic: PCI 8259 intack at 0x%016lx\n", intack); + i8259_init(found, intack); + of_node_put(found); + irq_set_chained_handler(cascade, pseries_8259_cascade); +} + +static void __init pseries_mpic_init_IRQ(void) +{ + struct device_node *np; + const unsigned int *opprop; + unsigned long openpic_addr = 0; + int naddr, n, i, opplen; + struct mpic *mpic; + + np = of_find_node_by_path("/"); + naddr = of_n_addr_cells(np); + opprop = of_get_property(np, "platform-open-pic", &opplen); + if (opprop != 0) { + openpic_addr = of_read_number(opprop, naddr); + printk(KERN_DEBUG "OpenPIC addr: %lx\n", openpic_addr); + } + of_node_put(np); + + BUG_ON(openpic_addr == 0); + + /* Setup the openpic driver */ + mpic = mpic_alloc(pSeries_mpic_node, openpic_addr, + MPIC_PRIMARY, + 16, 250, /* isu size, irq count */ + " MPIC "); + BUG_ON(mpic == NULL); + + /* Add ISUs */ + opplen /= sizeof(u32); + for (n = 0, i = naddr; i < opplen; i += naddr, n++) { + unsigned long isuaddr = of_read_number(opprop + i, naddr); + mpic_assign_isu(mpic, n, isuaddr); + } + + /* Setup top-level get_irq */ + ppc_md.get_irq = mpic_get_irq; + + /* All ISUs are setup, complete initialization */ + mpic_init(mpic); + + /* Look for cascade */ + pseries_setup_i8259_cascade(); +} + +static void __init pseries_xics_init_IRQ(void) +{ + xics_init(); + pseries_setup_i8259_cascade(); +} + +static void pseries_lpar_enable_pmcs(void) +{ + unsigned long set, reset; + + set = 1UL << 63; + reset = 0; + plpar_hcall_norets(H_PERFMON, set, reset); +} + +static void __init pseries_discover_pic(void) +{ + struct device_node *np; + const char *typep; + + for (np = NULL; (np = of_find_node_by_name(np, + "interrupt-controller"));) { + typep = of_get_property(np, "compatible", NULL); + if (strstr(typep, "open-pic")) { + pSeries_mpic_node = of_node_get(np); + ppc_md.init_IRQ = pseries_mpic_init_IRQ; + setup_kexec_cpu_down_mpic(); + smp_init_pseries_mpic(); + return; + } else if (strstr(typep, "ppc-xicp")) { + ppc_md.init_IRQ = pseries_xics_init_IRQ; + setup_kexec_cpu_down_xics(); + smp_init_pseries_xics(); + return; + } + } + printk(KERN_ERR "pSeries_discover_pic: failed to recognize" + " interrupt-controller\n"); +} + +static int pci_dn_reconfig_notifier(struct notifier_block *nb, unsigned long action, void *node) +{ + struct device_node *np = node; + struct pci_dn *pci = NULL; + int err = NOTIFY_OK; + + switch (action) { + case PSERIES_RECONFIG_ADD: + pci = np->parent->data; + if (pci) + update_dn_pci_info(np, pci->phb); + break; + default: + err = NOTIFY_DONE; + break; + } + return err; +} + +static struct notifier_block pci_dn_reconfig_nb = { + .notifier_call = pci_dn_reconfig_notifier, +}; + +struct kmem_cache *dtl_cache; + +#ifdef CONFIG_VIRT_CPU_ACCOUNTING +/* + * Allocate space for the dispatch trace log for all possible cpus + * and register the buffers with the hypervisor. This is used for + * computing time stolen by the hypervisor. + */ +static int alloc_dispatch_logs(void) +{ + int cpu, ret; + struct paca_struct *pp; + struct dtl_entry *dtl; + + if (!firmware_has_feature(FW_FEATURE_SPLPAR)) + return 0; + + if (!dtl_cache) + return 0; + + for_each_possible_cpu(cpu) { + pp = &paca[cpu]; + dtl = kmem_cache_alloc(dtl_cache, GFP_KERNEL); + if (!dtl) { + pr_warn("Failed to allocate dispatch trace log for cpu %d\n", + cpu); + pr_warn("Stolen time statistics will be unreliable\n"); + break; + } + + pp->dtl_ridx = 0; + pp->dispatch_log = dtl; + pp->dispatch_log_end = dtl + N_DISPATCH_LOG; + pp->dtl_curr = dtl; + } + + /* Register the DTL for the current (boot) cpu */ + dtl = get_paca()->dispatch_log; + get_paca()->dtl_ridx = 0; + get_paca()->dtl_curr = dtl; + get_paca()->lppaca_ptr->dtl_idx = 0; + + /* hypervisor reads buffer length from this field */ + dtl->enqueue_to_dispatch_time = DISPATCH_LOG_BYTES; + ret = register_dtl(hard_smp_processor_id(), __pa(dtl)); + if (ret) + pr_warn("DTL registration failed for boot cpu %d (%d)\n", + smp_processor_id(), ret); + get_paca()->lppaca_ptr->dtl_enable_mask = 2; + + return 0; +} +#else /* !CONFIG_VIRT_CPU_ACCOUNTING */ +static inline int alloc_dispatch_logs(void) +{ + return 0; +} +#endif /* CONFIG_VIRT_CPU_ACCOUNTING */ + +static int alloc_dispatch_log_kmem_cache(void) +{ + dtl_cache = kmem_cache_create("dtl", DISPATCH_LOG_BYTES, + DISPATCH_LOG_BYTES, 0, NULL); + if (!dtl_cache) { + pr_warn("Failed to create dispatch trace log buffer cache\n"); + pr_warn("Stolen time statistics will be unreliable\n"); + return 0; + } + + return alloc_dispatch_logs(); +} +early_initcall(alloc_dispatch_log_kmem_cache); + +static void __init pSeries_setup_arch(void) +{ + /* Discover PIC type and setup ppc_md accordingly */ + pseries_discover_pic(); + + /* openpic global configuration register (64-bit format). */ + /* openpic Interrupt Source Unit pointer (64-bit format). */ + /* python0 facility area (mmio) (64-bit format) REAL address. */ + + /* init to some ~sane value until calibrate_delay() runs */ + loops_per_jiffy = 50000000; + + fwnmi_init(); + + /* Find and initialize PCI host bridges */ + init_pci_config_tokens(); + find_and_init_phbs(); + pSeries_reconfig_notifier_register(&pci_dn_reconfig_nb); + eeh_init(); + + pSeries_nvram_init(); + + /* Choose an idle loop */ + if (firmware_has_feature(FW_FEATURE_SPLPAR)) { + vpa_init(boot_cpuid); + if (get_lppaca()->shared_proc) { + printk(KERN_DEBUG "Using shared processor idle loop\n"); + ppc_md.power_save = pseries_shared_idle_sleep; + } else { + printk(KERN_DEBUG "Using dedicated idle loop\n"); + ppc_md.power_save = pseries_dedicated_idle_sleep; + } + } else { + printk(KERN_DEBUG "Using default idle loop\n"); + } + + if (firmware_has_feature(FW_FEATURE_LPAR)) + ppc_md.enable_pmcs = pseries_lpar_enable_pmcs; + else + ppc_md.enable_pmcs = power4_enable_pmcs; +} + +static int __init pSeries_init_panel(void) +{ + /* Manually leave the kernel version on the panel. */ + ppc_md.progress("Linux ppc64\n", 0); + ppc_md.progress(init_utsname()->version, 0); + + return 0; +} +machine_arch_initcall(pseries, pSeries_init_panel); + +static int pseries_set_dabr(unsigned long dabr) +{ + return plpar_hcall_norets(H_SET_DABR, dabr); +} + +static int pseries_set_xdabr(unsigned long dabr) +{ + /* We want to catch accesses from kernel and userspace */ + return plpar_hcall_norets(H_SET_XDABR, dabr, + H_DABRX_KERNEL | H_DABRX_USER); +} + +#define CMO_CHARACTERISTICS_TOKEN 44 +#define CMO_MAXLENGTH 1026 + +void pSeries_coalesce_init(void) +{ + struct hvcall_mpp_x_data mpp_x_data; + + if (firmware_has_feature(FW_FEATURE_CMO) && !h_get_mpp_x(&mpp_x_data)) + powerpc_firmware_features |= FW_FEATURE_XCMO; + else + powerpc_firmware_features &= ~FW_FEATURE_XCMO; +} + +/** + * fw_cmo_feature_init - FW_FEATURE_CMO is not stored in ibm,hypertas-functions, + * handle that here. (Stolen from parse_system_parameter_string) + */ +void pSeries_cmo_feature_init(void) +{ + char *ptr, *key, *value, *end; + int call_status; + int page_order = IOMMU_PAGE_SHIFT; + + pr_debug(" -> fw_cmo_feature_init()\n"); + spin_lock(&rtas_data_buf_lock); + memset(rtas_data_buf, 0, RTAS_DATA_BUF_SIZE); + call_status = rtas_call(rtas_token("ibm,get-system-parameter"), 3, 1, + NULL, + CMO_CHARACTERISTICS_TOKEN, + __pa(rtas_data_buf), + RTAS_DATA_BUF_SIZE); + + if (call_status != 0) { + spin_unlock(&rtas_data_buf_lock); + pr_debug("CMO not available\n"); + pr_debug(" <- fw_cmo_feature_init()\n"); + return; + } + + end = rtas_data_buf + CMO_MAXLENGTH - 2; + ptr = rtas_data_buf + 2; /* step over strlen value */ + key = value = ptr; + + while (*ptr && (ptr <= end)) { + /* Separate the key and value by replacing '=' with '\0' and + * point the value at the string after the '=' + */ + if (ptr[0] == '=') { + ptr[0] = '\0'; + value = ptr + 1; + } else if (ptr[0] == '\0' || ptr[0] == ',') { + /* Terminate the string containing the key/value pair */ + ptr[0] = '\0'; + + if (key == value) { + pr_debug("Malformed key/value pair\n"); + /* Never found a '=', end processing */ + break; + } + + if (0 == strcmp(key, "CMOPageSize")) + page_order = simple_strtol(value, NULL, 10); + else if (0 == strcmp(key, "PrPSP")) + CMO_PrPSP = simple_strtol(value, NULL, 10); + else if (0 == strcmp(key, "SecPSP")) + CMO_SecPSP = simple_strtol(value, NULL, 10); + value = key = ptr + 1; + } + ptr++; + } + + /* Page size is returned as the power of 2 of the page size, + * convert to the page size in bytes before returning + */ + CMO_PageSize = 1 << page_order; + pr_debug("CMO_PageSize = %lu\n", CMO_PageSize); + + if (CMO_PrPSP != -1 || CMO_SecPSP != -1) { + pr_info("CMO enabled\n"); + pr_debug("CMO enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, + CMO_SecPSP); + powerpc_firmware_features |= FW_FEATURE_CMO; + pSeries_coalesce_init(); + } else + pr_debug("CMO not enabled, PrPSP=%d, SecPSP=%d\n", CMO_PrPSP, + CMO_SecPSP); + spin_unlock(&rtas_data_buf_lock); + pr_debug(" <- fw_cmo_feature_init()\n"); +} + +/* + * Early initialization. Relocation is on but do not reference unbolted pages + */ +static void __init pSeries_init_early(void) +{ + pr_debug(" -> pSeries_init_early()\n"); + + if (firmware_has_feature(FW_FEATURE_LPAR)) + find_udbg_vterm(); + + if (firmware_has_feature(FW_FEATURE_DABR)) + ppc_md.set_dabr = pseries_set_dabr; + else if (firmware_has_feature(FW_FEATURE_XDABR)) + ppc_md.set_dabr = pseries_set_xdabr; + + pSeries_cmo_feature_init(); + iommu_init_early_pSeries(); + + pr_debug(" <- pSeries_init_early()\n"); +} + +/* + * Called very early, MMU is off, device-tree isn't unflattened + */ + +static int __init pSeries_probe_hypertas(unsigned long node, + const char *uname, int depth, + void *data) +{ + const char *hypertas; + unsigned long len; + + if (depth != 1 || + (strcmp(uname, "rtas") != 0 && strcmp(uname, "rtas@0") != 0)) + return 0; + + hypertas = of_get_flat_dt_prop(node, "ibm,hypertas-functions", &len); + if (!hypertas) + return 1; + + powerpc_firmware_features |= FW_FEATURE_LPAR; + fw_feature_init(hypertas, len); + + return 1; +} + +static int __init pSeries_probe(void) +{ + unsigned long root = of_get_flat_dt_root(); + char *dtype = of_get_flat_dt_prop(root, "device_type", NULL); + + if (dtype == NULL) + return 0; + if (strcmp(dtype, "chrp")) + return 0; + + /* Cell blades firmware claims to be chrp while it's not. Until this + * is fixed, we need to avoid those here. + */ + if (of_flat_dt_is_compatible(root, "IBM,CPBW-1.0") || + of_flat_dt_is_compatible(root, "IBM,CBEA")) + return 0; + + pr_debug("pSeries detected, looking for LPAR capability...\n"); + + /* Now try to figure out if we are running on LPAR */ + of_scan_flat_dt(pSeries_probe_hypertas, NULL); + + if (firmware_has_feature(FW_FEATURE_LPAR)) + hpte_init_lpar(); + else + hpte_init_native(); + + pr_debug("Machine is%s LPAR !\n", + (powerpc_firmware_features & FW_FEATURE_LPAR) ? "" : " not"); + + return 1; +} + + +DECLARE_PER_CPU(long, smt_snooze_delay); + +static void pseries_dedicated_idle_sleep(void) +{ + unsigned int cpu = smp_processor_id(); + unsigned long start_snooze; + unsigned long in_purr, out_purr; + long snooze = __get_cpu_var(smt_snooze_delay); + + /* + * Indicate to the HV that we are idle. Now would be + * a good time to find other work to dispatch. + */ + get_lppaca()->idle = 1; + get_lppaca()->donate_dedicated_cpu = 1; + in_purr = mfspr(SPRN_PURR); + + /* + * We come in with interrupts disabled, and need_resched() + * has been checked recently. If we should poll for a little + * while, do so. + */ + if (snooze) { + start_snooze = get_tb() + snooze * tb_ticks_per_usec; + local_irq_enable(); + set_thread_flag(TIF_POLLING_NRFLAG); + + while ((snooze < 0) || (get_tb() < start_snooze)) { + if (need_resched() || cpu_is_offline(cpu)) + goto out; + ppc64_runlatch_off(); + HMT_low(); + HMT_very_low(); + } + + HMT_medium(); + clear_thread_flag(TIF_POLLING_NRFLAG); + smp_mb(); + local_irq_disable(); + if (need_resched() || cpu_is_offline(cpu)) + goto out; + } + + cede_processor(); + +out: + HMT_medium(); + out_purr = mfspr(SPRN_PURR); + get_lppaca()->wait_state_cycles += out_purr - in_purr; + get_lppaca()->donate_dedicated_cpu = 0; + get_lppaca()->idle = 0; +} + +static void pseries_shared_idle_sleep(void) +{ + /* + * Indicate to the HV that we are idle. Now would be + * a good time to find other work to dispatch. + */ + get_lppaca()->idle = 1; + + /* + * Yield the processor to the hypervisor. We return if + * an external interrupt occurs (which are driven prior + * to returning here) or if a prod occurs from another + * processor. When returning here, external interrupts + * are enabled. + */ + cede_processor(); + + get_lppaca()->idle = 0; +} + +static int pSeries_pci_probe_mode(struct pci_bus *bus) +{ + if (firmware_has_feature(FW_FEATURE_LPAR)) + return PCI_PROBE_DEVTREE; + return PCI_PROBE_NORMAL; +} + +/** + * pSeries_power_off - tell firmware about how to power off the system. + * + * This function calls either the power-off rtas token in normal cases + * or the ibm,power-off-ups token (if present & requested) in case of + * a power failure. If power-off token is used, power on will only be + * possible with power button press. If ibm,power-off-ups token is used + * it will allow auto poweron after power is restored. + */ +static void pSeries_power_off(void) +{ + int rc; + int rtas_poweroff_ups_token = rtas_token("ibm,power-off-ups"); + + if (rtas_flash_term_hook) + rtas_flash_term_hook(SYS_POWER_OFF); + + if (rtas_poweron_auto == 0 || + rtas_poweroff_ups_token == RTAS_UNKNOWN_SERVICE) { + rc = rtas_call(rtas_token("power-off"), 2, 1, NULL, -1, -1); + printk(KERN_INFO "RTAS power-off returned %d\n", rc); + } else { + rc = rtas_call(rtas_poweroff_ups_token, 0, 1, NULL); + printk(KERN_INFO "RTAS ibm,power-off-ups returned %d\n", rc); + } + for (;;); +} + +#ifndef CONFIG_PCI +void pSeries_final_fixup(void) { } +#endif + +define_machine(pseries) { + .name = "pSeries", + .probe = pSeries_probe, + .setup_arch = pSeries_setup_arch, + .init_early = pSeries_init_early, + .show_cpuinfo = pSeries_show_cpuinfo, + .log_error = pSeries_log_error, + .pcibios_fixup = pSeries_final_fixup, + .pci_probe_mode = pSeries_pci_probe_mode, + .restart = rtas_restart, + .power_off = pSeries_power_off, + .halt = rtas_halt, + .panic = rtas_os_term, + .get_boot_time = rtas_get_boot_time, + .get_rtc_time = rtas_get_rtc_time, + .set_rtc_time = rtas_set_rtc_time, + .calibrate_decr = generic_calibrate_decr, + .progress = rtas_progress, + .system_reset_exception = pSeries_system_reset_exception, + .machine_check_exception = pSeries_machine_check_exception, +}; diff --git a/arch/powerpc/platforms/pseries/smp.c b/arch/powerpc/platforms/pseries/smp.c new file mode 100644 index 00000000..fbffd7e4 --- /dev/null +++ b/arch/powerpc/platforms/pseries/smp.c @@ -0,0 +1,260 @@ +/* + * SMP support for pSeries machines. + * + * Dave Engebretsen, Peter Bergner, and + * Mike Corrigan {engebret|bergner|mikec}@us.ibm.com + * + * Plus various changes from other IBM teams... + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/smp.h> +#include <linux/interrupt.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/spinlock.h> +#include <linux/cache.h> +#include <linux/err.h> +#include <linux/sysdev.h> +#include <linux/cpu.h> + +#include <asm/ptrace.h> +#include <asm/atomic.h> +#include <asm/irq.h> +#include <asm/page.h> +#include <asm/pgtable.h> +#include <asm/io.h> +#include <asm/prom.h> +#include <asm/smp.h> +#include <asm/paca.h> +#include <asm/machdep.h> +#include <asm/cputable.h> +#include <asm/firmware.h> +#include <asm/system.h> +#include <asm/rtas.h> +#include <asm/pSeries_reconfig.h> +#include <asm/mpic.h> +#include <asm/vdso_datapage.h> +#include <asm/cputhreads.h> +#include <asm/mpic.h> +#include <asm/xics.h> + +#include "plpar_wrappers.h" +#include "pseries.h" +#include "offline_states.h" + + +/* + * The Primary thread of each non-boot processor was started from the OF client + * interface by prom_hold_cpus and is spinning on secondary_hold_spinloop. + */ +static cpumask_var_t of_spin_mask; + +/* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ +int smp_query_cpu_stopped(unsigned int pcpu) +{ + int cpu_status, status; + int qcss_tok = rtas_token("query-cpu-stopped-state"); + + if (qcss_tok == RTAS_UNKNOWN_SERVICE) { + printk_once(KERN_INFO + "Firmware doesn't support query-cpu-stopped-state\n"); + return QCSS_HARDWARE_ERROR; + } + + status = rtas_call(qcss_tok, 1, 2, &cpu_status, pcpu); + if (status != 0) { + printk(KERN_ERR + "RTAS query-cpu-stopped-state failed: %i\n", status); + return status; + } + + return cpu_status; +} + +/** + * smp_startup_cpu() - start the given cpu + * + * At boot time, there is nothing to do for primary threads which were + * started from Open Firmware. For anything else, call RTAS with the + * appropriate start location. + * + * Returns: + * 0 - failure + * 1 - success + */ +static inline int __devinit smp_startup_cpu(unsigned int lcpu) +{ + int status; + unsigned long start_here = __pa((u32)*((unsigned long *) + generic_secondary_smp_init)); + unsigned int pcpu; + int start_cpu; + + if (cpumask_test_cpu(lcpu, of_spin_mask)) + /* Already started by OF and sitting in spin loop */ + return 1; + + pcpu = get_hard_smp_processor_id(lcpu); + + /* Check to see if the CPU out of FW already for kexec */ + if (smp_query_cpu_stopped(pcpu) == QCSS_NOT_STOPPED){ + cpumask_set_cpu(lcpu, of_spin_mask); + return 1; + } + + /* Fixup atomic count: it exited inside IRQ handler. */ + task_thread_info(paca[lcpu].__current)->preempt_count = 0; +#ifdef CONFIG_HOTPLUG_CPU + if (get_cpu_current_state(lcpu) == CPU_STATE_INACTIVE) + goto out; +#endif + /* + * If the RTAS start-cpu token does not exist then presume the + * cpu is already spinning. + */ + start_cpu = rtas_token("start-cpu"); + if (start_cpu == RTAS_UNKNOWN_SERVICE) + return 1; + + status = rtas_call(start_cpu, 3, 1, NULL, pcpu, start_here, pcpu); + if (status != 0) { + printk(KERN_ERR "start-cpu failed: %i\n", status); + return 0; + } + +#ifdef CONFIG_HOTPLUG_CPU +out: +#endif + return 1; +} + +static void __devinit smp_xics_setup_cpu(int cpu) +{ + if (cpu != boot_cpuid) + xics_setup_cpu(); + + if (firmware_has_feature(FW_FEATURE_SPLPAR)) + vpa_init(cpu); + + cpumask_clear_cpu(cpu, of_spin_mask); +#ifdef CONFIG_HOTPLUG_CPU + set_cpu_current_state(cpu, CPU_STATE_ONLINE); + set_default_offline_state(cpu); +#endif +} + +static int __devinit smp_pSeries_kick_cpu(int nr) +{ + BUG_ON(nr < 0 || nr >= NR_CPUS); + + if (!smp_startup_cpu(nr)) + return -ENOENT; + + /* + * The processor is currently spinning, waiting for the + * cpu_start field to become non-zero After we set cpu_start, + * the processor will continue on to secondary_start + */ + paca[nr].cpu_start = 1; +#ifdef CONFIG_HOTPLUG_CPU + set_preferred_offline_state(nr, CPU_STATE_ONLINE); + + if (get_cpu_current_state(nr) == CPU_STATE_INACTIVE) { + long rc; + unsigned long hcpuid; + + hcpuid = get_hard_smp_processor_id(nr); + rc = plpar_hcall_norets(H_PROD, hcpuid); + if (rc != H_SUCCESS) + printk(KERN_ERR "Error: Prod to wake up processor %d " + "Ret= %ld\n", nr, rc); + } +#endif + + return 0; +} + +static int smp_pSeries_cpu_bootable(unsigned int nr) +{ + /* Special case - we inhibit secondary thread startup + * during boot if the user requests it. + */ + if (system_state < SYSTEM_RUNNING && cpu_has_feature(CPU_FTR_SMT)) { + if (!smt_enabled_at_boot && cpu_thread_in_core(nr) != 0) + return 0; + if (smt_enabled_at_boot + && cpu_thread_in_core(nr) >= smt_enabled_at_boot) + return 0; + } + + return 1; +} + +static struct smp_ops_t pSeries_mpic_smp_ops = { + .message_pass = smp_mpic_message_pass, + .probe = smp_mpic_probe, + .kick_cpu = smp_pSeries_kick_cpu, + .setup_cpu = smp_mpic_setup_cpu, +}; + +static struct smp_ops_t pSeries_xics_smp_ops = { + .message_pass = smp_muxed_ipi_message_pass, + .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ + .probe = xics_smp_probe, + .kick_cpu = smp_pSeries_kick_cpu, + .setup_cpu = smp_xics_setup_cpu, + .cpu_bootable = smp_pSeries_cpu_bootable, +}; + +/* This is called very early */ +static void __init smp_init_pseries(void) +{ + int i; + + pr_debug(" -> smp_init_pSeries()\n"); + + alloc_bootmem_cpumask_var(&of_spin_mask); + + /* Mark threads which are still spinning in hold loops. */ + if (cpu_has_feature(CPU_FTR_SMT)) { + for_each_present_cpu(i) { + if (cpu_thread_in_core(i) == 0) + cpumask_set_cpu(i, of_spin_mask); + } + } else { + cpumask_copy(of_spin_mask, cpu_present_mask); + } + + cpumask_clear_cpu(boot_cpuid, of_spin_mask); + + /* Non-lpar has additional take/give timebase */ + if (rtas_token("freeze-time-base") != RTAS_UNKNOWN_SERVICE) { + smp_ops->give_timebase = rtas_give_timebase; + smp_ops->take_timebase = rtas_take_timebase; + } + + pr_debug(" <- smp_init_pSeries()\n"); +} + +void __init smp_init_pseries_mpic(void) +{ + smp_ops = &pSeries_mpic_smp_ops; + + smp_init_pseries(); +} + +void __init smp_init_pseries_xics(void) +{ + smp_ops = &pSeries_xics_smp_ops; + + smp_init_pseries(); +} diff --git a/arch/powerpc/platforms/pseries/suspend.c b/arch/powerpc/platforms/pseries/suspend.c new file mode 100644 index 00000000..a8ca289f --- /dev/null +++ b/arch/powerpc/platforms/pseries/suspend.c @@ -0,0 +1,214 @@ +/* + * Copyright (C) 2010 Brian King IBM Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include <linux/delay.h> +#include <linux/suspend.h> +#include <asm/firmware.h> +#include <asm/hvcall.h> +#include <asm/machdep.h> +#include <asm/mmu.h> +#include <asm/rtas.h> + +static u64 stream_id; +static struct sys_device suspend_sysdev; +static DECLARE_COMPLETION(suspend_work); +static struct rtas_suspend_me_data suspend_data; +static atomic_t suspending; + +/** + * pseries_suspend_begin - First phase of hibernation + * + * Check to ensure we are in a valid state to hibernate + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_begin(suspend_state_t state) +{ + long vasi_state, rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + /* Make sure the state is valid */ + rc = plpar_hcall(H_VASI_STATE, retbuf, stream_id); + + vasi_state = retbuf[0]; + + if (rc) { + pr_err("pseries_suspend_begin: vasi_state returned %ld\n",rc); + return rc; + } else if (vasi_state == H_VASI_ENABLED) { + return -EAGAIN; + } else if (vasi_state != H_VASI_SUSPENDING) { + pr_err("pseries_suspend_begin: vasi_state returned state %ld\n", + vasi_state); + return -EIO; + } + + return 0; +} + +/** + * pseries_suspend_cpu - Suspend a single CPU + * + * Makes the H_JOIN call to suspend the CPU + * + **/ +static int pseries_suspend_cpu(void) +{ + if (atomic_read(&suspending)) + return rtas_suspend_cpu(&suspend_data); + return 0; +} + +/** + * pseries_suspend_enter - Final phase of hibernation + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_enter(suspend_state_t state) +{ + int rc = rtas_suspend_last_cpu(&suspend_data); + + atomic_set(&suspending, 0); + atomic_set(&suspend_data.done, 1); + return rc; +} + +/** + * pseries_prepare_late - Prepare to suspend all other CPUs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_prepare_late(void) +{ + atomic_set(&suspending, 1); + atomic_set(&suspend_data.working, 0); + atomic_set(&suspend_data.done, 0); + atomic_set(&suspend_data.error, 0); + suspend_data.complete = &suspend_work; + INIT_COMPLETION(suspend_work); + return 0; +} + +/** + * store_hibernate - Initiate partition hibernation + * @classdev: sysdev class struct + * @attr: class device attribute struct + * @buf: buffer + * @count: buffer size + * + * Write the stream ID received from the HMC to this file + * to trigger hibernating the partition + * + * Return value: + * number of bytes printed to buffer / other on failure + **/ +static ssize_t store_hibernate(struct sysdev_class *classdev, + struct sysdev_class_attribute *attr, + const char *buf, size_t count) +{ + int rc; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + stream_id = simple_strtoul(buf, NULL, 16); + + do { + rc = pseries_suspend_begin(PM_SUSPEND_MEM); + if (rc == -EAGAIN) + ssleep(1); + } while (rc == -EAGAIN); + + if (!rc) + rc = pm_suspend(PM_SUSPEND_MEM); + + stream_id = 0; + + if (!rc) + rc = count; + return rc; +} + +static SYSDEV_CLASS_ATTR(hibernate, S_IWUSR, NULL, store_hibernate); + +static struct sysdev_class suspend_sysdev_class = { + .name = "power", +}; + +static const struct platform_suspend_ops pseries_suspend_ops = { + .valid = suspend_valid_only_mem, + .begin = pseries_suspend_begin, + .prepare_late = pseries_prepare_late, + .enter = pseries_suspend_enter, +}; + +/** + * pseries_suspend_sysfs_register - Register with sysfs + * + * Return value: + * 0 on success / other on failure + **/ +static int pseries_suspend_sysfs_register(struct sys_device *sysdev) +{ + int rc; + + if ((rc = sysdev_class_register(&suspend_sysdev_class))) + return rc; + + sysdev->id = 0; + sysdev->cls = &suspend_sysdev_class; + + if ((rc = sysdev_class_create_file(&suspend_sysdev_class, &attr_hibernate))) + goto class_unregister; + + return 0; + +class_unregister: + sysdev_class_unregister(&suspend_sysdev_class); + return rc; +} + +/** + * pseries_suspend_init - initcall for pSeries suspend + * + * Return value: + * 0 on success / other on failure + **/ +static int __init pseries_suspend_init(void) +{ + int rc; + + if (!machine_is(pseries) || !firmware_has_feature(FW_FEATURE_LPAR)) + return 0; + + suspend_data.token = rtas_token("ibm,suspend-me"); + if (suspend_data.token == RTAS_UNKNOWN_SERVICE) + return 0; + + if ((rc = pseries_suspend_sysfs_register(&suspend_sysdev))) + return rc; + + ppc_md.suspend_disable_cpu = pseries_suspend_cpu; + suspend_set_ops(&pseries_suspend_ops); + return 0; +} + +__initcall(pseries_suspend_init); |