* Version 2. See the file COPYING for more details.
*/
+#include <linux/capability.h>
#include <linux/mm.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/fs.h>
#include <linux/kexec.h>
-#include <linux/spinlock.h>
+#include <linux/mutex.h>
#include <linux/list.h>
#include <linux/highmem.h>
#include <linux/syscalls.h>
#include <linux/reboot.h>
-#include <linux/syscalls.h>
#include <linux/ioport.h>
#include <linux/hardirq.h>
+#include <linux/elf.h>
+#include <linux/elfcore.h>
+#include <linux/utsrelease.h>
+#include <linux/utsname.h>
+#include <linux/numa.h>
+#include <linux/suspend.h>
+#include <linux/device.h>
+#include <linux/freezer.h>
+#include <linux/pm.h>
+#include <linux/cpu.h>
+#include <linux/console.h>
+#include <linux/vmalloc.h>
#include <asm/page.h>
#include <asm/uaccess.h>
#include <asm/io.h>
#include <asm/system.h>
-#include <asm/semaphore.h>
+#include <asm/sections.h>
+
+/* Per cpu memory for storing cpu states in case of system crash. */
+note_buf_t* crash_notes;
+
+/* vmcoreinfo stuff */
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
+size_t vmcoreinfo_size;
+size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
/* Location of the reserved area for the crash kernel */
struct resource crashk_res = {
int kexec_should_crash(struct task_struct *p)
{
- if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
+ if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
return 1;
return 0;
}
*
* The code for the transition from the current kernel to the
* the new kernel is placed in the control_code_buffer, whose size
- * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single
+ * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single
* page of memory is necessary, but some architectures require more.
* Because this memory must be identity mapped in the transition from
* virtual to physical addresses it must live in the range
*/
#define KIMAGE_NO_DEST (-1UL)
-static int kimage_is_destination_range(
- struct kimage *image, unsigned long start, unsigned long end);
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start, unsigned long end);
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long dest);
static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments, struct kexec_segment __user *segments)
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
size_t segment_bytes;
struct kimage *image;
/* Allocate a controlling structure */
result = -ENOMEM;
- image = kmalloc(sizeof(*image), GFP_KERNEL);
- if (!image) {
+ image = kzalloc(sizeof(*image), GFP_KERNEL);
+ if (!image)
goto out;
- }
- memset(image, 0, sizeof(*image));
+
image->head = 0;
image->entry = &image->head;
image->last_entry = &image->head;
result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
+
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
* easy explanation as one segment stops on another.
*/
result = -EINVAL;
- for(i = 0; i < nr_segments; i++) {
+ for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
unsigned long j;
+
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz;
- for(j = 0; j < i; j++) {
+ for (j = 0; j < i; j++) {
unsigned long pstart, pend;
pstart = image->segment[j].mem;
pend = pstart + image->segment[j].memsz;
* later on.
*/
result = -EINVAL;
- for(i = 0; i < nr_segments; i++) {
+ for (i = 0; i < nr_segments; i++) {
if (image->segment[i].bufsz > image->segment[i].memsz)
goto out;
}
-
result = 0;
- out:
- if (result == 0) {
+out:
+ if (result == 0)
*rimage = image;
- } else {
+ else
kfree(image);
- }
+
return result;
}
static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments, struct kexec_segment __user *segments)
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
int result;
struct kimage *image;
/* Allocate and initialize a controlling structure */
image = NULL;
result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result) {
+ if (result)
goto out;
- }
+
*rimage = image;
/*
*/
result = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_CODE_SIZE));
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
goto out;
}
+ image->swap_page = kimage_alloc_control_pages(image, 0);
+ if (!image->swap_page) {
+ printk(KERN_ERR "Could not allocate swap buffer\n");
+ goto out;
+ }
+
result = 0;
out:
- if (result == 0) {
+ if (result == 0)
*rimage = image;
- } else {
+ else
kfree(image);
- }
+
return result;
}
static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
- unsigned long nr_segments, struct kexec_segment *segments)
+ unsigned long nr_segments,
+ struct kexec_segment __user *segments)
{
int result;
struct kimage *image;
/* Allocate and initialize a controlling structure */
result = do_kimage_alloc(&image, entry, nr_segments, segments);
- if (result) {
+ if (result)
goto out;
- }
/* Enable the special crash kernel control page
* allocation policy.
result = -EADDRNOTAVAIL;
for (i = 0; i < nr_segments; i++) {
unsigned long mstart, mend;
+
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz - 1;
/* Ensure we are within the crash kernel limits */
goto out;
}
-
/*
* Find a location for the control code buffer, and add
* the vector of segments so that it's pages will also be
*/
result = -ENOMEM;
image->control_code_page = kimage_alloc_control_pages(image,
- get_order(KEXEC_CONTROL_CODE_SIZE));
+ get_order(KEXEC_CONTROL_PAGE_SIZE));
if (!image->control_code_page) {
printk(KERN_ERR "Could not allocate control_code_buffer\n");
goto out;
}
result = 0;
- out:
- if (result == 0) {
+out:
+ if (result == 0)
*rimage = image;
- } else {
+ else
kfree(image);
- }
+
return result;
}
-static int kimage_is_destination_range(
- struct kimage *image, unsigned long start, unsigned long end)
+static int kimage_is_destination_range(struct kimage *image,
+ unsigned long start,
+ unsigned long end)
{
unsigned long i;
for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
+
mstart = image->segment[i].mem;
- mend = mstart + image->segment[i].memsz;
- if ((end > mstart) && (start < mend)) {
+ mend = mstart + image->segment[i].memsz;
+ if ((end > mstart) && (start < mend))
return 1;
- }
}
+
return 0;
}
-static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
+static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
{
struct page *pages;
+
pages = alloc_pages(gfp_mask, order);
if (pages) {
unsigned int count, i;
pages->mapping = NULL;
- pages->private = order;
+ set_page_private(pages, order);
count = 1 << order;
- for(i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
SetPageReserved(pages + i);
- }
}
+
return pages;
}
static void kimage_free_pages(struct page *page)
{
unsigned int order, count, i;
- order = page->private;
+
+ order = page_private(page);
count = 1 << order;
- for(i = 0; i < count; i++) {
+ for (i = 0; i < count; i++)
ClearPageReserved(page + i);
- }
__free_pages(page, order);
}
static void kimage_free_page_list(struct list_head *list)
{
struct list_head *pos, *next;
+
list_for_each_safe(pos, next, list) {
struct page *page;
page = list_entry(pos, struct page, lru);
list_del(&page->lru);
-
kimage_free_pages(page);
}
}
-static struct page *kimage_alloc_normal_control_pages(
- struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
+ unsigned int order)
{
/* Control pages are special, they are the intermediaries
* that are needed while we copy the rest of the pages
*/
do {
unsigned long pfn, epfn, addr, eaddr;
+
pages = kimage_alloc_pages(GFP_KERNEL, order);
if (!pages)
break;
addr = pfn << PAGE_SHIFT;
eaddr = epfn << PAGE_SHIFT;
if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
- kimage_is_destination_range(image, addr, eaddr))
- {
+ kimage_is_destination_range(image, addr, eaddr)) {
list_add(&pages->lru, &extra_pages);
pages = NULL;
}
- } while(!pages);
+ } while (!pages);
+
if (pages) {
/* Remember the allocated page... */
list_add(&pages->lru, &image->control_pages);
* For now it is simpler to just free the pages.
*/
kimage_free_page_list(&extra_pages);
- return pages;
+ return pages;
}
-static struct page *kimage_alloc_crash_control_pages(
- struct kimage *image, unsigned int order)
+static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
+ unsigned int order)
{
/* Control pages are special, they are the intermediaries
* that are needed while we copy the rest of the pages
*/
unsigned long hole_start, hole_end, size;
struct page *pages;
+
pages = NULL;
size = (1 << order) << PAGE_SHIFT;
hole_start = (image->control_page + (size - 1)) & ~(size - 1);
hole_end = hole_start + size - 1;
- while(hole_end <= crashk_res.end) {
+ while (hole_end <= crashk_res.end) {
unsigned long i;
- if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
+
+ if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
break;
- }
- if (hole_end > crashk_res.end) {
+ if (hole_end > crashk_res.end)
break;
- }
/* See if I overlap any of the segments */
- for(i = 0; i < image->nr_segments; i++) {
+ for (i = 0; i < image->nr_segments; i++) {
unsigned long mstart, mend;
+
mstart = image->segment[i].mem;
mend = mstart + image->segment[i].memsz - 1;
if ((hole_end >= mstart) && (hole_start <= mend)) {
break;
}
}
- if (pages) {
+ if (pages)
image->control_page = hole_end;
- }
+
return pages;
}
-struct page *kimage_alloc_control_pages(
- struct kimage *image, unsigned int order)
+struct page *kimage_alloc_control_pages(struct kimage *image,
+ unsigned int order)
{
struct page *pages = NULL;
- switch(image->type) {
+
+ switch (image->type) {
case KEXEC_TYPE_DEFAULT:
pages = kimage_alloc_normal_control_pages(image, order);
break;
pages = kimage_alloc_crash_control_pages(image, order);
break;
}
+
return pages;
}
static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
{
- if (*image->entry != 0) {
+ if (*image->entry != 0)
image->entry++;
- }
+
if (image->entry == image->last_entry) {
kimage_entry_t *ind_page;
struct page *page;
+
page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
- if (!page) {
+ if (!page)
return -ENOMEM;
- }
+
ind_page = page_address(page);
*image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
image->entry = ind_page;
- image->last_entry =
- ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
+ image->last_entry = ind_page +
+ ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
}
*image->entry = entry;
image->entry++;
*image->entry = 0;
+
return 0;
}
-static int kimage_set_destination(
- struct kimage *image, unsigned long destination)
+static int kimage_set_destination(struct kimage *image,
+ unsigned long destination)
{
int result;
destination &= PAGE_MASK;
result = kimage_add_entry(image, destination | IND_DESTINATION);
- if (result == 0) {
+ if (result == 0)
image->destination = destination;
- }
+
return result;
}
page &= PAGE_MASK;
result = kimage_add_entry(image, page | IND_SOURCE);
- if (result == 0) {
+ if (result == 0)
image->destination += PAGE_SIZE;
- }
+
return result;
}
kimage_free_page_list(&image->unuseable_pages);
}
-static int kimage_terminate(struct kimage *image)
+static void kimage_terminate(struct kimage *image)
{
- if (*image->entry != 0) {
+ if (*image->entry != 0)
image->entry++;
- }
+
*image->entry = IND_DONE;
- return 0;
}
#define for_each_kimage_entry(image, ptr, entry) \
if (!image)
return;
+
kimage_free_extra_pages(image);
for_each_kimage_entry(image, ptr, entry) {
if (entry & IND_INDIRECTION) {
/* Free the previous indirection page */
- if (ind & IND_INDIRECTION) {
+ if (ind & IND_INDIRECTION)
kimage_free_entry(ind);
- }
/* Save this indirection page until we are
* done with it.
*/
ind = entry;
}
- else if (entry & IND_SOURCE) {
+ else if (entry & IND_SOURCE)
kimage_free_entry(entry);
- }
}
/* Free the final indirection page */
- if (ind & IND_INDIRECTION) {
+ if (ind & IND_INDIRECTION)
kimage_free_entry(ind);
- }
/* Handle any machine specific cleanup */
machine_kexec_cleanup(image);
kfree(image);
}
-static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
+static kimage_entry_t *kimage_dst_used(struct kimage *image,
+ unsigned long page)
{
kimage_entry_t *ptr, entry;
unsigned long destination = 0;
for_each_kimage_entry(image, ptr, entry) {
- if (entry & IND_DESTINATION) {
+ if (entry & IND_DESTINATION)
destination = entry & PAGE_MASK;
- }
else if (entry & IND_SOURCE) {
- if (page == destination) {
+ if (page == destination)
return ptr;
- }
destination += PAGE_SIZE;
}
}
- return 0;
+
+ return NULL;
}
-static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
+static struct page *kimage_alloc_page(struct kimage *image,
+ gfp_t gfp_mask,
+ unsigned long destination)
{
/*
* Here we implement safeguards to ensure that a source page
/* Allocate a page, if we run out of memory give up */
page = kimage_alloc_pages(gfp_mask, 0);
- if (!page) {
- return 0;
- }
+ if (!page)
+ return NULL;
/* If the page cannot be used file it away */
- if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
+ if (page_to_pfn(page) >
+ (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
list_add(&page->lru, &image->unuseable_pages);
continue;
}
break;
/* If the page is not a destination page use it */
- if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
+ if (!kimage_is_destination_range(image, addr,
+ addr + PAGE_SIZE))
break;
/*
*old = addr | (*old & ~PAGE_MASK);
/* The old page I have found cannot be a
- * destination page, so return it.
+ * destination page, so return it if its
+ * gfp_flags honor the ones passed in.
*/
+ if (!(gfp_mask & __GFP_HIGHMEM) &&
+ PageHighMem(old_page)) {
+ kimage_free_pages(old_page);
+ continue;
+ }
addr = old_addr;
page = old_page;
break;
list_add(&page->lru, &image->dest_pages);
}
}
+
return page;
}
static int kimage_load_normal_segment(struct kimage *image,
- struct kexec_segment *segment)
+ struct kexec_segment *segment)
{
unsigned long maddr;
unsigned long ubytes, mbytes;
int result;
- unsigned char *buf;
+ unsigned char __user *buf;
result = 0;
buf = segment->buf;
maddr = segment->mem;
result = kimage_set_destination(image, maddr);
- if (result < 0) {
+ if (result < 0)
goto out;
- }
- while(mbytes) {
+
+ while (mbytes) {
struct page *page;
char *ptr;
size_t uchunk, mchunk;
+
page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
- if (page == 0) {
+ if (!page) {
result = -ENOMEM;
goto out;
}
- result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
- if (result < 0) {
+ result = kimage_add_page(image, page_to_pfn(page)
+ << PAGE_SHIFT);
+ if (result < 0)
goto out;
- }
+
ptr = kmap(page);
/* Start with a clear page */
memset(ptr, 0, PAGE_SIZE);
ptr += maddr & ~PAGE_MASK;
mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes) {
+ if (mchunk > mbytes)
mchunk = mbytes;
- }
+
uchunk = mchunk;
- if (uchunk > ubytes) {
+ if (uchunk > ubytes)
uchunk = ubytes;
- }
+
result = copy_from_user(ptr, buf, uchunk);
kunmap(page);
if (result) {
buf += mchunk;
mbytes -= mchunk;
}
- out:
+out:
return result;
}
static int kimage_load_crash_segment(struct kimage *image,
- struct kexec_segment *segment)
+ struct kexec_segment *segment)
{
/* For crash dumps kernels we simply copy the data from
* user space to it's destination.
unsigned long maddr;
unsigned long ubytes, mbytes;
int result;
- unsigned char *buf;
+ unsigned char __user *buf;
result = 0;
buf = segment->buf;
ubytes = segment->bufsz;
mbytes = segment->memsz;
maddr = segment->mem;
- while(mbytes) {
+ while (mbytes) {
struct page *page;
char *ptr;
size_t uchunk, mchunk;
+
page = pfn_to_page(maddr >> PAGE_SHIFT);
- if (page == 0) {
+ if (!page) {
result = -ENOMEM;
goto out;
}
ptr = kmap(page);
ptr += maddr & ~PAGE_MASK;
mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
- if (mchunk > mbytes) {
+ if (mchunk > mbytes)
mchunk = mbytes;
- }
+
uchunk = mchunk;
if (uchunk > ubytes) {
uchunk = ubytes;
memset(ptr + uchunk, 0, mchunk - uchunk);
}
result = copy_from_user(ptr, buf, uchunk);
+ kexec_flush_icache_page(page);
kunmap(page);
if (result) {
result = (result < 0) ? result : -EIO;
buf += mchunk;
mbytes -= mchunk;
}
- out:
+out:
return result;
}
static int kimage_load_segment(struct kimage *image,
- struct kexec_segment *segment)
+ struct kexec_segment *segment)
{
int result = -ENOMEM;
- switch(image->type) {
+
+ switch (image->type) {
case KEXEC_TYPE_DEFAULT:
result = kimage_load_normal_segment(image, segment);
break;
result = kimage_load_crash_segment(image, segment);
break;
}
+
return result;
}
* kexec does not sync, or unmount filesystems so if you need
* that to happen you need to do that yourself.
*/
-struct kimage *kexec_image = NULL;
-static struct kimage *kexec_crash_image = NULL;
-/*
- * A home grown binary mutex.
- * Nothing can wait so this mutex is safe to use
- * in interrupt context :)
- */
-static int kexec_lock = 0;
+struct kimage *kexec_image;
+struct kimage *kexec_crash_image;
-asmlinkage long sys_kexec_load(unsigned long entry,
- unsigned long nr_segments, struct kexec_segment __user *segments,
- unsigned long flags)
+static DEFINE_MUTEX(kexec_mutex);
+
+SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
+ struct kexec_segment __user *, segments, unsigned long, flags)
{
struct kimage **dest_image, *image;
- int locked;
int result;
/* We only trust the superuser with rebooting the system. */
/* Verify we are on the appropriate architecture */
if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
- {
return -EINVAL;
- }
/* Put an artificial cap on the number
* of segments passed to kexec_load.
*
* KISS: always take the mutex.
*/
- locked = xchg(&kexec_lock, 1);
- if (locked) {
+ if (!mutex_trylock(&kexec_mutex))
return -EBUSY;
- }
+
dest_image = &kexec_image;
- if (flags & KEXEC_ON_CRASH) {
+ if (flags & KEXEC_ON_CRASH)
dest_image = &kexec_crash_image;
- }
if (nr_segments > 0) {
unsigned long i;
+
/* Loading another kernel to reboot into */
- if ((flags & KEXEC_ON_CRASH) == 0) {
- result = kimage_normal_alloc(&image, entry, nr_segments, segments);
- }
+ if ((flags & KEXEC_ON_CRASH) == 0)
+ result = kimage_normal_alloc(&image, entry,
+ nr_segments, segments);
/* Loading another kernel to switch to if this one crashes */
else if (flags & KEXEC_ON_CRASH) {
/* Free any current crash dump kernel before
* we corrupt it.
*/
kimage_free(xchg(&kexec_crash_image, NULL));
- result = kimage_crash_alloc(&image, entry, nr_segments, segments);
+ result = kimage_crash_alloc(&image, entry,
+ nr_segments, segments);
}
- if (result) {
+ if (result)
goto out;
- }
+
+ if (flags & KEXEC_PRESERVE_CONTEXT)
+ image->preserve_context = 1;
result = machine_kexec_prepare(image);
- if (result) {
+ if (result)
goto out;
- }
- for(i = 0; i < nr_segments; i++) {
+
+ for (i = 0; i < nr_segments; i++) {
result = kimage_load_segment(image, &image->segment[i]);
- if (result) {
+ if (result)
goto out;
- }
- }
- result = kimage_terminate(image);
- if (result) {
- goto out;
}
+ kimage_terminate(image);
}
/* Install the new kernel, and Uninstall the old */
image = xchg(dest_image, image);
- out:
- xchg(&kexec_lock, 0); /* Release the mutex */
+out:
+ mutex_unlock(&kexec_mutex);
kimage_free(image);
+
return result;
}
#ifdef CONFIG_COMPAT
asmlinkage long compat_sys_kexec_load(unsigned long entry,
- unsigned long nr_segments, struct compat_kexec_segment __user *segments,
- unsigned long flags)
+ unsigned long nr_segments,
+ struct compat_kexec_segment __user *segments,
+ unsigned long flags)
{
struct compat_kexec_segment in;
struct kexec_segment out, __user *ksegments;
/* Don't allow clients that don't understand the native
* architecture to do anything.
*/
- if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
+ if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
return -EINVAL;
- }
- if (nr_segments > KEXEC_SEGMENT_MAX) {
+ if (nr_segments > KEXEC_SEGMENT_MAX)
return -EINVAL;
- }
ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
for (i=0; i < nr_segments; i++) {
result = copy_from_user(&in, &segments[i], sizeof(in));
- if (result) {
+ if (result)
return -EFAULT;
- }
out.buf = compat_ptr(in.buf);
out.bufsz = in.bufsz;
out.memsz = in.memsz;
result = copy_to_user(&ksegments[i], &out, sizeof(out));
- if (result) {
+ if (result)
return -EFAULT;
- }
}
return sys_kexec_load(entry, nr_segments, ksegments, flags);
void crash_kexec(struct pt_regs *regs)
{
- struct kimage *image;
- int locked;
-
-
- /* Take the kexec_lock here to prevent sys_kexec_load
+ /* Take the kexec_mutex here to prevent sys_kexec_load
* running on one cpu from replacing the crash kernel
* we are using after a panic on a different cpu.
*
* of memory the xchg(&kexec_crash_image) would be
* sufficient. But since I reuse the memory...
*/
- locked = xchg(&kexec_lock, 1);
- if (!locked) {
- image = xchg(&kexec_crash_image, NULL);
- if (image) {
- machine_crash_shutdown(regs);
- machine_kexec(image);
+ if (mutex_trylock(&kexec_mutex)) {
+ if (kexec_crash_image) {
+ struct pt_regs fixed_regs;
+ crash_setup_regs(&fixed_regs, regs);
+ crash_save_vmcoreinfo();
+ machine_crash_shutdown(&fixed_regs);
+ machine_kexec(kexec_crash_image);
}
- xchg(&kexec_lock, 0);
+ mutex_unlock(&kexec_mutex);
}
}
+
+/*
+ * Append one ELF note (header, name, descriptor) at buf and return the
+ * position just past it.
+ *
+ * The note header is copied first, followed by the NUL-terminated name and
+ * then data_len bytes of data.  After each part the cursor advances by that
+ * part's size rounded up to a whole number of 32-bit words.  No bounds
+ * checking is done here: the caller must supply a buffer large enough for
+ * all three parts.
+ */
+static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
+			    size_t data_len)
+{
+	struct elf_note note;
+
+	note.n_namesz = strlen(name) + 1;	/* include the trailing NUL */
+	note.n_descsz = data_len;
+	note.n_type   = type;
+	memcpy(buf, &note, sizeof(note));
+	buf += (sizeof(note) + 3)/4;
+	memcpy(buf, name, note.n_namesz);
+	buf += (note.n_namesz + 3)/4;
+	memcpy(buf, data, note.n_descsz);
+	buf += (note.n_descsz + 3)/4;
+
+	return buf;
+}
+
+/*
+ * Terminate a note list by writing an all-zero ELF note header (empty name,
+ * empty descriptor, type 0) at buf.
+ */
+static void final_note(u32 *buf)
+{
+	struct elf_note note;
+
+	note.n_namesz = 0;
+	note.n_descsz = 0;
+	note.n_type   = 0;
+	memcpy(buf, &note, sizeof(note));
+}
+
+/*
+ * Record the register state in regs as an NT_PRSTATUS ELF note in the
+ * crash_notes buffer belonging to cpu.
+ *
+ * Does nothing when cpu is outside [0, nr_cpu_ids) or when no per-cpu
+ * buffer is available.
+ */
+void crash_save_cpu(struct pt_regs *regs, int cpu)
+{
+ struct elf_prstatus prstatus;
+ u32 *buf;
+
+ if ((cpu < 0) || (cpu >= nr_cpu_ids))
+ return;
+
+ /* Using ELF notes here is opportunistic.
+ * I need a well defined structure format
+ * for the data I pass, and I need tags
+ * on the data to indicate what information I have
+ * squirrelled away. ELF notes happen to provide
+ * all of that, so there is no need to invent something new.
+ */
+ buf = (u32*)per_cpu_ptr(crash_notes, cpu);
+ if (!buf)
+ return;
+ /* Only the pid and the registers are filled in; the rest stays zero. */
+ memset(&prstatus, 0, sizeof(prstatus));
+ prstatus.pr_pid = current->pid;
+ elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
+ buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
+ &prstatus, sizeof(prstatus));
+ /* Mark the end of this CPU's note list with an empty note. */
+ final_note(buf);
+}
+
+/*
+ * Allocate the per-cpu crash_notes buffers (one note_buf_t per CPU).
+ *
+ * Returns 0 on success, or -ENOMEM (after logging a message) when the
+ * per-cpu allocation fails.
+ */
+static int __init crash_notes_memory_init(void)
+{
+ /* Allocate memory for saving cpu registers. */
+ crash_notes = alloc_percpu(note_buf_t);
+ if (!crash_notes) {
+ printk("Kexec: Memory allocation for saving cpu register"
+ " states failed\n");
+ return -ENOMEM;
+ }
+ return 0;
+}
+module_init(crash_notes_memory_init)
+
+
+/*
+ * parsing the "crashkernel" commandline
+ *
+ * this code is intended to be called from architecture specific code
+ */
+
+
+/*
+ * This function parses command lines in the format
+ *
+ * crashkernel=ramsize-range:size[,...][@offset]
+ *
+ * The function returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_mem(char *cmdline,
+ unsigned long long system_ram,
+ unsigned long long *crash_size,
+ unsigned long long *crash_base)
+{
+ char *cur = cmdline, *tmp;
+
+ /* for each entry of the comma-separated list */
+ do {
+ /* an omitted range end means "no upper bound" */
+ unsigned long long start, end = ULLONG_MAX, size;
+
+ /* get the start of the range */
+ start = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("crashkernel: Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (*cur != '-') {
+ pr_warning("crashkernel: '-' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* if no ':' is here, then we read the end */
+ if (*cur != ':') {
+ end = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("crashkernel: Memory "
+ "value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (end <= start) {
+ pr_warning("crashkernel: end <= start\n");
+ return -EINVAL;
+ }
+ }
+
+ if (*cur != ':') {
+ pr_warning("crashkernel: ':' expected\n");
+ return -EINVAL;
+ }
+ cur++;
+
+ /* the size to reserve when system_ram falls in [start, end) */
+ size = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("Memory value expected\n");
+ return -EINVAL;
+ }
+ cur = tmp;
+ if (size >= system_ram) {
+ pr_warning("crashkernel: invalid size\n");
+ return -EINVAL;
+ }
+
+ /* match ? */
+ if (system_ram >= start && system_ram < end) {
+ *crash_size = size;
+ break;
+ }
+ } while (*cur++ == ',');
+
+ /*
+ * *crash_size is only written above when a range matched, so this
+ * test relies on the caller having initialised it (parse_crashkernel
+ * sets it to 0 before calling).
+ */
+ if (*crash_size > 0) {
+ /* skip any remaining list entries up to '@', a space or the end */
+ while (*cur && *cur != ' ' && *cur != '@')
+ cur++;
+ if (*cur == '@') {
+ cur++;
+ *crash_base = memparse(cur, &tmp);
+ if (cur == tmp) {
+ pr_warning("Memory value expected "
+ "after '@'\n");
+ return -EINVAL;
+ }
+ }
+ }
+
+ return 0;
+}
+
+/*
+ * This function parses "simple" (old) crashkernel command lines like
+ *
+ * crashkernel=size[@offset]
+ *
+ * It returns 0 on success and -EINVAL on failure.
+ */
+static int __init parse_crashkernel_simple(char 		*cmdline,
+					   unsigned long long 	*crash_size,
+					   unsigned long long 	*crash_base)
+{
+	char *end = cmdline;
+
+	/* The size comes first; memparse leaves end unchanged if nothing parsed. */
+	*crash_size = memparse(cmdline, &end);
+	if (end == cmdline) {
+		pr_warning("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* Without an "@offset" suffix the base is left as the caller set it. */
+	if (*end != '@')
+		return 0;
+
+	*crash_base = memparse(end + 1, &end);
+	return 0;
+}
+
+/*
+ * This function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ */
+int __init parse_crashkernel(char 		 *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	char *p, *ck_cmdline = NULL;
+	char *first_colon, *first_space;
+
+	BUG_ON(!crash_size || !crash_base);
+	/* Both outputs start at 0 so "nothing matched" is distinguishable. */
+	*crash_size = 0;
+	*crash_base = 0;
+
+	/* find crashkernel and use the last one if there are more */
+	for (p = strstr(cmdline, "crashkernel="); p;
+	     p = strstr(p + 1, "crashkernel="))
+		ck_cmdline = p;
+
+	if (!ck_cmdline)
+		return -EINVAL;
+
+	/* step over the "crashkernel=" prefix itself */
+	ck_cmdline += sizeof("crashkernel=") - 1;
+
+	/*
+	 * if the commandline contains a ':' before the end of this
+	 * parameter (the first space), then that's the extended
+	 * syntax -- if not, it must be the classic syntax
+	 */
+	first_colon = strchr(ck_cmdline, ':');
+	first_space = strchr(ck_cmdline, ' ');
+	if (first_colon && (!first_space || first_colon < first_space))
+		return parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base);
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+
+
+/*
+ * Stamp the collected vmcoreinfo text with the crash time and wrap it in an
+ * ELF note stored in vmcoreinfo_note.  Does nothing if no vmcoreinfo text
+ * has been recorded.
+ */
+void crash_save_vmcoreinfo(void)
+{
+	u32 *end;
+
+	if (vmcoreinfo_size == 0)
+		return;
+
+	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+
+	/* vmcoreinfo_size is read only after the CRASHTIME line was added. */
+	end = append_elf_note((u32 *)vmcoreinfo_note, VMCOREINFO_NOTE_NAME, 0,
+			      vmcoreinfo_data, vmcoreinfo_size);
+	final_note(end);
+}
+
+/*
+ * Format a string printf-style and append it to vmcoreinfo_data.
+ *
+ * The text is formatted into a 0x50-byte stack buffer, so longer output is
+ * truncated to what fits there.  It is truncated further if vmcoreinfo_data
+ * has less room left than that.  No terminating NUL is stored.
+ */
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	va_list args;
+	char buf[0x50];
+	size_t len, room;
+	int r;
+
+	va_start(args, fmt);
+	r = vsnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	if (r <= 0)
+		return;
+
+	/*
+	 * vsnprintf returns the length the output would have had, which can
+	 * exceed what was actually stored in buf; never copy more than buf
+	 * really holds.
+	 */
+	len = r;
+	if (len >= sizeof(buf))
+		len = sizeof(buf) - 1;
+
+	/* Clamp to the space remaining in vmcoreinfo_data. */
+	room = vmcoreinfo_max_size - vmcoreinfo_size;
+	if (len > room)
+		len = room;
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, len);
+
+	vmcoreinfo_size += len;
+}
+
+/*
+ * Hook for architecture-specific vmcoreinfo entries, called at the end of
+ * crash_save_vmcoreinfo_init().  An empty default implementation is
+ * provided here; it is declared weak so architecture code may override it.
+ */
+void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+{}
+
+/*
+ * Return the address of vmcoreinfo_note translated with __pa().  Declared
+ * weak, so another definition may replace this one at link time.
+ */
+unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+{
+ return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+/*
+ * Initcall that records the kernel release, page size, and a fixed list of
+ * symbols, structure sizes, member offsets, array lengths and constants
+ * through the VMCOREINFO_* macros, then lets the architecture add its own
+ * entries.  Always returns 0.
+ *
+ * NOTE(review): the VMCOREINFO_* macros are defined outside this file;
+ * presumably each one appends a line to vmcoreinfo_data via
+ * vmcoreinfo_append_str() -- confirm against their definitions.
+ */
+static int __init crash_save_vmcoreinfo_init(void)
+{
+ VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+ VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+ VMCOREINFO_SYMBOL(init_uts_ns);
+ VMCOREINFO_SYMBOL(node_online_map);
+ VMCOREINFO_SYMBOL(swapper_pg_dir);
+ VMCOREINFO_SYMBOL(_stext);
+ VMCOREINFO_SYMBOL(vmlist);
+
+ /* memory-model specific symbols */
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+ VMCOREINFO_SYMBOL(mem_map);
+ VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+ VMCOREINFO_SYMBOL(mem_section);
+ VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+ VMCOREINFO_STRUCT_SIZE(mem_section);
+ VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+ VMCOREINFO_STRUCT_SIZE(page);
+ VMCOREINFO_STRUCT_SIZE(pglist_data);
+ VMCOREINFO_STRUCT_SIZE(zone);
+ VMCOREINFO_STRUCT_SIZE(free_area);
+ VMCOREINFO_STRUCT_SIZE(list_head);
+ VMCOREINFO_SIZE(nodemask_t);
+ VMCOREINFO_OFFSET(page, flags);
+ VMCOREINFO_OFFSET(page, _count);
+ VMCOREINFO_OFFSET(page, mapping);
+ VMCOREINFO_OFFSET(page, lru);
+ VMCOREINFO_OFFSET(pglist_data, node_zones);
+ VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+ VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+ VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+ VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+ VMCOREINFO_OFFSET(pglist_data, node_id);
+ VMCOREINFO_OFFSET(zone, free_area);
+ VMCOREINFO_OFFSET(zone, vm_stat);
+ VMCOREINFO_OFFSET(zone, spanned_pages);
+ VMCOREINFO_OFFSET(free_area, free_list);
+ VMCOREINFO_OFFSET(list_head, next);
+ VMCOREINFO_OFFSET(list_head, prev);
+ VMCOREINFO_OFFSET(vm_struct, addr);
+ VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+ /* NOTE(review): defined elsewhere; presumably records log buffer symbols -- confirm. */
+ log_buf_kexec_setup();
+ VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+ VMCOREINFO_NUMBER(NR_FREE_PAGES);
+ VMCOREINFO_NUMBER(PG_lru);
+ VMCOREINFO_NUMBER(PG_private);
+ VMCOREINFO_NUMBER(PG_swapcache);
+
+ /* give the architecture a chance to add its own entries */
+ arch_crash_save_vmcoreinfo();
+
+ return 0;
+}
+
+module_init(crash_save_vmcoreinfo_init)
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable. If nothing was preloaded return an error.
+ */
+int kernel_kexec(void)
+{
+ int error = 0;
+
+ /* Never block here: report -EBUSY if another kexec operation holds the mutex. */
+ if (!mutex_trylock(&kexec_mutex))
+ return -EBUSY;
+ /* Nothing has been loaded to jump into. */
+ if (!kexec_image) {
+ error = -EINVAL;
+ goto Unlock;
+ }
+
+#ifdef CONFIG_KEXEC_JUMP
+ /*
+ * Preserve-context path: quiesce the system step by step.  Each
+ * failure jumps to the label in the resume block below that undoes
+ * the steps already taken.
+ */
+ if (kexec_image->preserve_context) {
+ mutex_lock(&pm_mutex);
+ pm_prepare_console();
+ error = freeze_processes();
+ if (error) {
+ error = -EBUSY;
+ goto Restore_console;
+ }
+ suspend_console();
+ error = dpm_suspend_start(PMSG_FREEZE);
+ if (error)
+ goto Resume_console;
+ /* At this point, dpm_suspend_start() has been called,
+ * but *not* dpm_suspend_noirq(). We *must* call
+ * dpm_suspend_noirq() now. Otherwise, drivers for
+ * some devices (e.g. interrupt controllers) become
+ * desynchronized with the actual state of the
+ * hardware at resume time, and evil weirdness ensues.
+ */
+ error = dpm_suspend_noirq(PMSG_FREEZE);
+ if (error)
+ goto Resume_devices;
+ error = disable_nonboot_cpus();
+ if (error)
+ goto Enable_cpus;
+ local_irq_disable();
+ /* Suspend system devices */
+ error = sysdev_suspend(PMSG_FREEZE);
+ if (error)
+ goto Enable_irqs;
+ } else
+#endif
+ /* Plain kexec: prepare for restart and shut the machine down first. */
+ {
+ kernel_restart_prepare(NULL);
+ printk(KERN_EMERG "Starting new kernel\n");
+ machine_shutdown();
+ }
+
+ machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+ /*
+ * Resume sequence, in the reverse order of the suspend steps above.
+ * The labels are also the targets of the error gotos above.
+ * NOTE(review): presumably machine_kexec() returns here only for a
+ * preserve_context image -- confirm against the arch implementation.
+ */
+ if (kexec_image->preserve_context) {
+ sysdev_resume();
+ Enable_irqs:
+ local_irq_enable();
+ Enable_cpus:
+ enable_nonboot_cpus();
+ dpm_resume_noirq(PMSG_RESTORE);
+ Resume_devices:
+ dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+ resume_console();
+ thaw_processes();
+ Restore_console:
+ pm_restore_console();
+ mutex_unlock(&pm_mutex);
+ }
+#endif
+
+ Unlock:
+ mutex_unlock(&kexec_mutex);
+ return error;
+}