X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Fkexec.c;h=474a84715eaca2936b51ad5a6c46fb4774751c5b;hb=3374cd1abd478f767aaedf2c21d109596ff0fe72;hp=a0411b3bd54a58d0df0c7ee3036b9a6c37871933;hpb=50cccc699ed849d31c9e3f7643db33edade20e4e;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/kexec.c b/kernel/kexec.c index a0411b3..474a847 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -6,23 +6,48 @@ * Version 2. See the file COPYING for more details. */ +#include #include #include #include #include #include -#include +#include #include #include #include #include -#include #include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + #include #include #include #include -#include +#include + +/* Per cpu memory for storing cpu states in case of system crash. */ +note_buf_t __percpu *crash_notes; + +/* vmcoreinfo stuff */ +static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES]; +u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4]; +size_t vmcoreinfo_size; +size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data); /* Location of the reserved area for the crash kernel */ struct resource crashk_res = { @@ -32,6 +57,13 @@ struct resource crashk_res = { .flags = IORESOURCE_BUSY | IORESOURCE_MEM }; +int kexec_should_crash(struct task_struct *p) +{ + if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops) + return 1; + return 0; +} + /* * When kexec transitions to the new kernel there is a one-to-one * mapping between physical and virtual addresses. On processors @@ -48,7 +80,7 @@ struct resource crashk_res = { * * The code for the transition from the current kernel to the * the new kernel is placed in the control_code_buffer, whose size - * is given by KEXEC_CONTROL_CODE_SIZE. In the best case only a single + * is given by KEXEC_CONTROL_PAGE_SIZE. In the best case only a single * page of memory is necessary, but some architectures require more. 
* Because this memory must be identity mapped in the transition from * virtual to physical addresses it must live in the range @@ -78,12 +110,15 @@ struct resource crashk_res = { */ #define KIMAGE_NO_DEST (-1UL) -static int kimage_is_destination_range( - struct kimage *image, unsigned long start, unsigned long end); -static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest); +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, unsigned long end); +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long dest); static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { size_t segment_bytes; struct kimage *image; @@ -92,11 +127,10 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, /* Allocate a controlling structure */ result = -ENOMEM; - image = kmalloc(sizeof(*image), GFP_KERNEL); - if (!image) { + image = kzalloc(sizeof(*image), GFP_KERNEL); + if (!image) goto out; - } - memset(image, 0, sizeof(*image)); + image->head = 0; image->entry = &image->head; image->last_entry = &image->head; @@ -136,6 +170,7 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; + mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK)) @@ -150,12 +185,13 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * easy explanation as one segment stops on another. 
*/ result = -EINVAL; - for(i = 0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; unsigned long j; + mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz; - for(j = 0; j < i; j++) { + for (j = 0; j < i; j++) { unsigned long pstart, pend; pstart = image->segment[j].mem; pend = pstart + image->segment[j].memsz; @@ -171,25 +207,25 @@ static int do_kimage_alloc(struct kimage **rimage, unsigned long entry, * later on. */ result = -EINVAL; - for(i = 0; i < nr_segments; i++) { + for (i = 0; i < nr_segments; i++) { if (image->segment[i].bufsz > image->segment[i].memsz) goto out; } - result = 0; - out: - if (result == 0) { +out: + if (result == 0) *rimage = image; - } else { + else kfree(image); - } + return result; } static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, struct kexec_segment __user *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { int result; struct kimage *image; @@ -197,9 +233,9 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, /* Allocate and initialize a controlling structure */ image = NULL; result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) { + if (result) goto out; - } + *rimage = image; /* @@ -209,24 +245,31 @@ static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; } + image->swap_page = kimage_alloc_control_pages(image, 0); + if (!image->swap_page) { + printk(KERN_ERR "Could not allocate swap buffer\n"); + goto out; + } + result = 0; out: - if (result == 0) { + if (result == 0) *rimage = image; - } else { + else kfree(image); - } + return result; } static int 
kimage_crash_alloc(struct kimage **rimage, unsigned long entry, - unsigned long nr_segments, struct kexec_segment *segments) + unsigned long nr_segments, + struct kexec_segment __user *segments) { int result; struct kimage *image; @@ -241,9 +284,8 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, /* Allocate and initialize a controlling structure */ result = do_kimage_alloc(&image, entry, nr_segments, segments); - if (result) { + if (result) goto out; - } /* Enable the special crash kernel control page * allocation policy. @@ -263,6 +305,7 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, result = -EADDRNOTAVAIL; for (i = 0; i < nr_segments; i++) { unsigned long mstart, mend; + mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; /* Ensure we are within the crash kernel limits */ @@ -270,7 +313,6 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, goto out; } - /* * Find a location for the control code buffer, and add * the vector of segments so that it's pages will also be @@ -278,80 +320,83 @@ static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry, */ result = -ENOMEM; image->control_code_page = kimage_alloc_control_pages(image, - get_order(KEXEC_CONTROL_CODE_SIZE)); + get_order(KEXEC_CONTROL_PAGE_SIZE)); if (!image->control_code_page) { printk(KERN_ERR "Could not allocate control_code_buffer\n"); goto out; } result = 0; - out: - if (result == 0) { +out: + if (result == 0) *rimage = image; - } else { + else kfree(image); - } + return result; } -static int kimage_is_destination_range( - struct kimage *image, unsigned long start, unsigned long end) +static int kimage_is_destination_range(struct kimage *image, + unsigned long start, + unsigned long end) { unsigned long i; for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; + mstart = image->segment[i].mem; - mend = mstart + image->segment[i].memsz; - if ((end > mstart) && 
(start < mend)) { + mend = mstart + image->segment[i].memsz; + if ((end > mstart) && (start < mend)) return 1; - } } + return 0; } -static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order) +static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order) { struct page *pages; + pages = alloc_pages(gfp_mask, order); if (pages) { unsigned int count, i; pages->mapping = NULL; - pages->private = order; + set_page_private(pages, order); count = 1 << order; - for(i = 0; i < count; i++) { + for (i = 0; i < count; i++) SetPageReserved(pages + i); - } } + return pages; } static void kimage_free_pages(struct page *page) { unsigned int order, count, i; - order = page->private; + + order = page_private(page); count = 1 << order; - for(i = 0; i < count; i++) { + for (i = 0; i < count; i++) ClearPageReserved(page + i); - } __free_pages(page, order); } static void kimage_free_page_list(struct list_head *list) { struct list_head *pos, *next; + list_for_each_safe(pos, next, list) { struct page *page; page = list_entry(pos, struct page, lru); list_del(&page->lru); - kimage_free_pages(page); } } -static struct page *kimage_alloc_normal_control_pages( - struct kimage *image, unsigned int order) +static struct page *kimage_alloc_normal_control_pages(struct kimage *image, + unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages @@ -378,6 +423,7 @@ static struct page *kimage_alloc_normal_control_pages( */ do { unsigned long pfn, epfn, addr, eaddr; + pages = kimage_alloc_pages(GFP_KERNEL, order); if (!pages) break; @@ -386,12 +432,12 @@ static struct page *kimage_alloc_normal_control_pages( addr = pfn << PAGE_SHIFT; eaddr = epfn << PAGE_SHIFT; if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) || - kimage_is_destination_range(image, addr, eaddr)) - { + kimage_is_destination_range(image, addr, eaddr)) { list_add(&pages->lru, &extra_pages); pages = NULL; } - } 
while(!pages); + } while (!pages); + if (pages) { /* Remember the allocated page... */ list_add(&pages->lru, &image->control_pages); @@ -411,12 +457,12 @@ static struct page *kimage_alloc_normal_control_pages( * For now it is simpler to just free the pages. */ kimage_free_page_list(&extra_pages); - return pages; + return pages; } -static struct page *kimage_alloc_crash_control_pages( - struct kimage *image, unsigned int order) +static struct page *kimage_alloc_crash_control_pages(struct kimage *image, + unsigned int order) { /* Control pages are special, they are the intermediaries * that are needed while we copy the rest of the pages @@ -441,21 +487,22 @@ static struct page *kimage_alloc_crash_control_pages( */ unsigned long hole_start, hole_end, size; struct page *pages; + pages = NULL; size = (1 << order) << PAGE_SHIFT; hole_start = (image->control_page + (size - 1)) & ~(size - 1); hole_end = hole_start + size - 1; - while(hole_end <= crashk_res.end) { + while (hole_end <= crashk_res.end) { unsigned long i; - if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) { + + if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) break; - } - if (hole_end > crashk_res.end) { + if (hole_end > crashk_res.end) break; - } /* See if I overlap any of the segments */ - for(i = 0; i < image->nr_segments; i++) { + for (i = 0; i < image->nr_segments; i++) { unsigned long mstart, mend; + mstart = image->segment[i].mem; mend = mstart + image->segment[i].memsz - 1; if ((hole_end >= mstart) && (hole_start <= mend)) { @@ -471,18 +518,19 @@ static struct page *kimage_alloc_crash_control_pages( break; } } - if (pages) { + if (pages) image->control_page = hole_end; - } + return pages; } -struct page *kimage_alloc_control_pages( - struct kimage *image, unsigned int order) +struct page *kimage_alloc_control_pages(struct kimage *image, + unsigned int order) { struct page *pages = NULL; - switch(image->type) { + + switch (image->type) { case KEXEC_TYPE_DEFAULT: pages = kimage_alloc_normal_control_pages(image, 
order); break; @@ -490,43 +538,46 @@ struct page *kimage_alloc_control_pages( pages = kimage_alloc_crash_control_pages(image, order); break; } + return pages; } static int kimage_add_entry(struct kimage *image, kimage_entry_t entry) { - if (*image->entry != 0) { + if (*image->entry != 0) image->entry++; - } + if (image->entry == image->last_entry) { kimage_entry_t *ind_page; struct page *page; + page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST); - if (!page) { + if (!page) return -ENOMEM; - } + ind_page = page_address(page); *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION; image->entry = ind_page; - image->last_entry = - ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); + image->last_entry = ind_page + + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1); } *image->entry = entry; image->entry++; *image->entry = 0; + return 0; } -static int kimage_set_destination( - struct kimage *image, unsigned long destination) +static int kimage_set_destination(struct kimage *image, + unsigned long destination) { int result; destination &= PAGE_MASK; result = kimage_add_entry(image, destination | IND_DESTINATION); - if (result == 0) { + if (result == 0) image->destination = destination; - } + return result; } @@ -537,9 +588,9 @@ static int kimage_add_page(struct kimage *image, unsigned long page) page &= PAGE_MASK; result = kimage_add_entry(image, page | IND_SOURCE); - if (result == 0) { + if (result == 0) image->destination += PAGE_SIZE; - } + return result; } @@ -553,13 +604,12 @@ static void kimage_free_extra_pages(struct kimage *image) kimage_free_page_list(&image->unuseable_pages); } -static int kimage_terminate(struct kimage *image) +static void kimage_terminate(struct kimage *image) { - if (*image->entry != 0) { + if (*image->entry != 0) image->entry++; - } + *image->entry = IND_DONE; - return 0; } #define for_each_kimage_entry(image, ptr, entry) \ @@ -582,26 +632,24 @@ static void kimage_free(struct kimage *image) if (!image) return; + 
kimage_free_extra_pages(image); for_each_kimage_entry(image, ptr, entry) { if (entry & IND_INDIRECTION) { /* Free the previous indirection page */ - if (ind & IND_INDIRECTION) { + if (ind & IND_INDIRECTION) kimage_free_entry(ind); - } /* Save this indirection page until we are * done with it. */ ind = entry; } - else if (entry & IND_SOURCE) { + else if (entry & IND_SOURCE) kimage_free_entry(entry); - } } /* Free the final indirection page */ - if (ind & IND_INDIRECTION) { + if (ind & IND_INDIRECTION) kimage_free_entry(ind); - } /* Handle any machine specific cleanup */ machine_kexec_cleanup(image); @@ -611,26 +659,28 @@ static void kimage_free(struct kimage *image) kfree(image); } -static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page) +static kimage_entry_t *kimage_dst_used(struct kimage *image, + unsigned long page) { kimage_entry_t *ptr, entry; unsigned long destination = 0; for_each_kimage_entry(image, ptr, entry) { - if (entry & IND_DESTINATION) { + if (entry & IND_DESTINATION) destination = entry & PAGE_MASK; - } else if (entry & IND_SOURCE) { - if (page == destination) { + if (page == destination) return ptr; - } destination += PAGE_SIZE; } } - return 0; + + return NULL; } -static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination) +static struct page *kimage_alloc_page(struct kimage *image, + gfp_t gfp_mask, + unsigned long destination) { /* * Here we implement safeguards to ensure that a source page @@ -670,11 +720,11 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas /* Allocate a page, if we run out of memory give up */ page = kimage_alloc_pages(gfp_mask, 0); - if (!page) { - return 0; - } + if (!page) + return NULL; /* If the page cannot be used file it away */ - if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { + if (page_to_pfn(page) > + (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) { list_add(&page->lru, 
&image->unuseable_pages); continue; } @@ -685,7 +735,8 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas break; /* If the page is not a destination page use it */ - if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE)) + if (!kimage_is_destination_range(image, addr, + addr + PAGE_SIZE)) break; /* @@ -705,8 +756,14 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas *old = addr | (*old & ~PAGE_MASK); /* The old page I have found cannot be a - * destination page, so return it. + * destination page, so return it if it's + * gfp_flags honor the ones passed in. */ + if (!(gfp_mask & __GFP_HIGHMEM) && + PageHighMem(old_page)) { + kimage_free_pages(old_page); + continue; + } addr = old_addr; page = old_page; break; @@ -718,16 +775,17 @@ static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mas list_add(&page->lru, &image->dest_pages); } } + return page; } static int kimage_load_normal_segment(struct kimage *image, - struct kexec_segment *segment) + struct kexec_segment *segment) { unsigned long maddr; unsigned long ubytes, mbytes; int result; - unsigned char *buf; + unsigned char __user *buf; result = 0; buf = segment->buf; @@ -736,34 +794,36 @@ static int kimage_load_normal_segment(struct kimage *image, maddr = segment->mem; result = kimage_set_destination(image, maddr); - if (result < 0) { + if (result < 0) goto out; - } - while(mbytes) { + + while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; + page = kimage_alloc_page(image, GFP_HIGHUSER, maddr); - if (page == 0) { + if (!page) { result = -ENOMEM; goto out; } - result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT); - if (result < 0) { + result = kimage_add_page(image, page_to_pfn(page) + << PAGE_SHIFT); + if (result < 0) goto out; - } + ptr = kmap(page); /* Start with a clear page */ memset(ptr, 0, PAGE_SIZE); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk 
> mbytes) { + if (mchunk > mbytes) mchunk = mbytes; - } + uchunk = mchunk; - if (uchunk > ubytes) { + if (uchunk > ubytes) uchunk = ubytes; - } + result = copy_from_user(ptr, buf, uchunk); kunmap(page); if (result) { @@ -775,12 +835,12 @@ static int kimage_load_normal_segment(struct kimage *image, buf += mchunk; mbytes -= mchunk; } - out: +out: return result; } static int kimage_load_crash_segment(struct kimage *image, - struct kexec_segment *segment) + struct kexec_segment *segment) { /* For crash dumps kernels we simply copy the data from * user space to it's destination. @@ -789,28 +849,29 @@ static int kimage_load_crash_segment(struct kimage *image, unsigned long maddr; unsigned long ubytes, mbytes; int result; - unsigned char *buf; + unsigned char __user *buf; result = 0; buf = segment->buf; ubytes = segment->bufsz; mbytes = segment->memsz; maddr = segment->mem; - while(mbytes) { + while (mbytes) { struct page *page; char *ptr; size_t uchunk, mchunk; + page = pfn_to_page(maddr >> PAGE_SHIFT); - if (page == 0) { + if (!page) { result = -ENOMEM; goto out; } ptr = kmap(page); ptr += maddr & ~PAGE_MASK; mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK); - if (mchunk > mbytes) { + if (mchunk > mbytes) mchunk = mbytes; - } + uchunk = mchunk; if (uchunk > ubytes) { uchunk = ubytes; @@ -818,6 +879,7 @@ static int kimage_load_crash_segment(struct kimage *image, memset(ptr + uchunk, 0, mchunk - uchunk); } result = copy_from_user(ptr, buf, uchunk); + kexec_flush_icache_page(page); kunmap(page); if (result) { result = (result < 0) ? 
result : -EIO; @@ -828,15 +890,16 @@ static int kimage_load_crash_segment(struct kimage *image, buf += mchunk; mbytes -= mchunk; } - out: +out: return result; } static int kimage_load_segment(struct kimage *image, - struct kexec_segment *segment) + struct kexec_segment *segment) { int result = -ENOMEM; - switch(image->type) { + + switch (image->type) { case KEXEC_TYPE_DEFAULT: result = kimage_load_normal_segment(image, segment); break; @@ -844,6 +907,7 @@ static int kimage_load_segment(struct kimage *image, result = kimage_load_crash_segment(image, segment); break; } + return result; } @@ -867,21 +931,15 @@ static int kimage_load_segment(struct kimage *image, * kexec does not sync, or unmount filesystems so if you need * that to happen you need to do that yourself. */ -struct kimage *kexec_image = NULL; -static struct kimage *kexec_crash_image = NULL; -/* - * A home grown binary mutex. - * Nothing can wait so this mutex is safe to use - * in interrupt context :) - */ -static int kexec_lock = 0; +struct kimage *kexec_image; +struct kimage *kexec_crash_image; + +static DEFINE_MUTEX(kexec_mutex); -asmlinkage long sys_kexec_load(unsigned long entry, - unsigned long nr_segments, struct kexec_segment __user *segments, - unsigned long flags) +SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments, + struct kexec_segment __user *, segments, unsigned long, flags) { struct kimage **dest_image, *image; - int locked; int result; /* We only trust the superuser with rebooting the system. */ @@ -898,9 +956,7 @@ asmlinkage long sys_kexec_load(unsigned long entry, /* Verify we are on the appropriate architecture */ if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) && ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT)) - { return -EINVAL; - } /* Put an artificial cap on the number * of segments passed to kexec_load. @@ -919,59 +975,59 @@ asmlinkage long sys_kexec_load(unsigned long entry, * * KISS: always take the mutex. 
*/ - locked = xchg(&kexec_lock, 1); - if (locked) { + if (!mutex_trylock(&kexec_mutex)) return -EBUSY; - } + dest_image = &kexec_image; - if (flags & KEXEC_ON_CRASH) { + if (flags & KEXEC_ON_CRASH) dest_image = &kexec_crash_image; - } if (nr_segments > 0) { unsigned long i; + /* Loading another kernel to reboot into */ - if ((flags & KEXEC_ON_CRASH) == 0) { - result = kimage_normal_alloc(&image, entry, nr_segments, segments); - } + if ((flags & KEXEC_ON_CRASH) == 0) + result = kimage_normal_alloc(&image, entry, + nr_segments, segments); /* Loading another kernel to switch to if this one crashes */ else if (flags & KEXEC_ON_CRASH) { /* Free any current crash dump kernel before * we corrupt it. */ kimage_free(xchg(&kexec_crash_image, NULL)); - result = kimage_crash_alloc(&image, entry, nr_segments, segments); + result = kimage_crash_alloc(&image, entry, + nr_segments, segments); } - if (result) { + if (result) goto out; - } + + if (flags & KEXEC_PRESERVE_CONTEXT) + image->preserve_context = 1; result = machine_kexec_prepare(image); - if (result) { + if (result) goto out; - } - for(i = 0; i < nr_segments; i++) { + + for (i = 0; i < nr_segments; i++) { result = kimage_load_segment(image, &image->segment[i]); - if (result) { + if (result) goto out; - } - } - result = kimage_terminate(image); - if (result) { - goto out; } + kimage_terminate(image); } /* Install the new kernel, and Uninstall the old */ image = xchg(dest_image, image); - out: - xchg(&kexec_lock, 0); /* Release the mutex */ +out: + mutex_unlock(&kexec_mutex); kimage_free(image); + return result; } #ifdef CONFIG_COMPAT asmlinkage long compat_sys_kexec_load(unsigned long entry, - unsigned long nr_segments, struct compat_kexec_segment __user *segments, - unsigned long flags) + unsigned long nr_segments, + struct compat_kexec_segment __user *segments, + unsigned long flags) { struct compat_kexec_segment in; struct kexec_segment out, __user *ksegments; @@ -980,20 +1036,17 @@ asmlinkage long 
compat_sys_kexec_load(unsigned long entry, /* Don't allow clients that don't understand the native * architecture to do anything. */ - if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) { + if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) return -EINVAL; - } - if (nr_segments > KEXEC_SEGMENT_MAX) { + if (nr_segments > KEXEC_SEGMENT_MAX) return -EINVAL; - } ksegments = compat_alloc_user_space(nr_segments * sizeof(out)); for (i=0; i < nr_segments; i++) { result = copy_from_user(&in, &segments[i], sizeof(in)); - if (result) { + if (result) return -EFAULT; - } out.buf = compat_ptr(in.buf); out.bufsz = in.bufsz; @@ -1001,22 +1054,17 @@ asmlinkage long compat_sys_kexec_load(unsigned long entry, out.memsz = in.memsz; result = copy_to_user(&ksegments[i], &out, sizeof(out)); - if (result) { + if (result) return -EFAULT; - } } return sys_kexec_load(entry, nr_segments, ksegments, flags); } #endif -void crash_kexec(void) +void crash_kexec(struct pt_regs *regs) { - struct kimage *image; - int locked; - - - /* Take the kexec_lock here to prevent sys_kexec_load + /* Take the kexec_mutex here to prevent sys_kexec_load * running on one cpu from replacing the crash kernel * we are using after a panic on a different cpu. * @@ -1024,13 +1072,494 @@ void crash_kexec(void) * of memory the xchg(&kexec_crash_image) would be * sufficient. But since I reuse the memory... 
*/ - locked = xchg(&kexec_lock, 1); - if (!locked) { - image = xchg(&kexec_crash_image, NULL); - if (image) { - machine_crash_shutdown(); - machine_kexec(image); + if (mutex_trylock(&kexec_mutex)) { + if (kexec_crash_image) { + struct pt_regs fixed_regs; + + kmsg_dump(KMSG_DUMP_KEXEC); + + crash_setup_regs(&fixed_regs, regs); + crash_save_vmcoreinfo(); + machine_crash_shutdown(&fixed_regs); + machine_kexec(kexec_crash_image); } - xchg(&kexec_lock, 0); + mutex_unlock(&kexec_mutex); } } + +size_t crash_get_memory_size(void) +{ + size_t size; + mutex_lock(&kexec_mutex); + size = crashk_res.end - crashk_res.start + 1; + mutex_unlock(&kexec_mutex); + return size; +} + +static void free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT)); + init_page_count(pfn_to_page(addr >> PAGE_SHIFT)); + free_page((unsigned long)__va(addr)); + totalram_pages++; + } +} + +int crash_shrink_memory(unsigned long new_size) +{ + int ret = 0; + unsigned long start, end; + + mutex_lock(&kexec_mutex); + + if (kexec_crash_image) { + ret = -ENOENT; + goto unlock; + } + start = crashk_res.start; + end = crashk_res.end; + + if (new_size >= end - start + 1) { + ret = -EINVAL; + if (new_size == end - start + 1) + ret = 0; + goto unlock; + } + + start = roundup(start, PAGE_SIZE); + end = roundup(start + new_size, PAGE_SIZE); + + free_reserved_phys_range(end, crashk_res.end); + + if (start == end) + release_resource(&crashk_res); + crashk_res.end = end - 1; + +unlock: + mutex_unlock(&kexec_mutex); + return ret; +} + +static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data, + size_t data_len) +{ + struct elf_note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = data_len; + note.n_type = type; + memcpy(buf, &note, sizeof(note)); + buf += (sizeof(note) + 3)/4; + memcpy(buf, name, note.n_namesz); + buf += (note.n_namesz + 3)/4; + 
memcpy(buf, data, note.n_descsz); + buf += (note.n_descsz + 3)/4; + + return buf; +} + +static void final_note(u32 *buf) +{ + struct elf_note note; + + note.n_namesz = 0; + note.n_descsz = 0; + note.n_type = 0; + memcpy(buf, &note, sizeof(note)); +} + +void crash_save_cpu(struct pt_regs *regs, int cpu) +{ + struct elf_prstatus prstatus; + u32 *buf; + + if ((cpu < 0) || (cpu >= nr_cpu_ids)) + return; + + /* Using ELF notes here is opportunistic. + * I need a well defined structure format + * for the data I pass, and I need tags + * on the data to indicate what information I have + * squirrelled away. ELF notes happen to provide + * all of that, so there is no need to invent something new. + */ + buf = (u32*)per_cpu_ptr(crash_notes, cpu); + if (!buf) + return; + memset(&prstatus, 0, sizeof(prstatus)); + prstatus.pr_pid = current->pid; + elf_core_copy_kernel_regs(&prstatus.pr_reg, regs); + buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS, + &prstatus, sizeof(prstatus)); + final_note(buf); +} + +static int __init crash_notes_memory_init(void) +{ + /* Allocate memory for saving cpu registers. */ + crash_notes = alloc_percpu(note_buf_t); + if (!crash_notes) { + printk("Kexec: Memory allocation for saving cpu register" + " states failed\n"); + return -ENOMEM; + } + return 0; +} +module_init(crash_notes_memory_init) + + +/* + * parsing the "crashkernel" commandline + * + * this code is intended to be called from architecture specific code + */ + + +/* + * This function parses command lines in the format + * + * crashkernel=ramsize-range:size[,...][@offset] + * + * The function returns 0 on success and -EINVAL on failure. 
+ */ +static int __init parse_crashkernel_mem(char *cmdline, + unsigned long long system_ram, + unsigned long long *crash_size, + unsigned long long *crash_base) +{ + char *cur = cmdline, *tmp; + + /* for each entry of the comma-separated list */ + do { + unsigned long long start, end = ULLONG_MAX, size; + + /* get the start of the range */ + start = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (*cur != '-') { + pr_warning("crashkernel: '-' expected\n"); + return -EINVAL; + } + cur++; + + /* if no ':' is here, than we read the end */ + if (*cur != ':') { + end = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("crashkernel: Memory " + "value expected\n"); + return -EINVAL; + } + cur = tmp; + if (end <= start) { + pr_warning("crashkernel: end <= start\n"); + return -EINVAL; + } + } + + if (*cur != ':') { + pr_warning("crashkernel: ':' expected\n"); + return -EINVAL; + } + cur++; + + size = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("Memory value expected\n"); + return -EINVAL; + } + cur = tmp; + if (size >= system_ram) { + pr_warning("crashkernel: invalid size\n"); + return -EINVAL; + } + + /* match ? */ + if (system_ram >= start && system_ram < end) { + *crash_size = size; + break; + } + } while (*cur++ == ','); + + if (*crash_size > 0) { + while (*cur && *cur != ' ' && *cur != '@') + cur++; + if (*cur == '@') { + cur++; + *crash_base = memparse(cur, &tmp); + if (cur == tmp) { + pr_warning("Memory value expected " + "after '@'\n"); + return -EINVAL; + } + } + } + + return 0; +} + +/* + * That function parses "simple" (old) crashkernel command lines like + * + * crashkernel=size[@offset] + * + * It returns 0 on success and -EINVAL on failure. 
+ */
+static int __init parse_crashkernel_simple(char *cmdline,
+					   unsigned long long *crash_size,
+					   unsigned long long *crash_base)
+{
+	char *cur = cmdline;
+
+	*crash_size = memparse(cmdline, &cur);
+	/* cur still pointing at cmdline means no size value was consumed. */
+	if (cmdline == cur) {
+		pr_warning("crashkernel: memory value expected\n");
+		return -EINVAL;
+	}
+
+	/* Optional "@base" suffix; *crash_base is left untouched without it. */
+	if (*cur == '@')
+		*crash_base = memparse(cur+1, &cur);
+
+	return 0;
+}
+
+/*
+ * That function is the entry point for command line parsing and should be
+ * called from the arch-specific code.
+ *
+ * @cmdline:    command line to search for "crashkernel=" options
+ * @system_ram: passed through to parse_crashkernel_mem() for the extended
+ *              syntax
+ * @crash_size: output, reset to 0 before parsing; must not be NULL
+ * @crash_base: output, reset to 0 before parsing; must not be NULL
+ *
+ * When "crashkernel=" occurs more than once, only the last occurrence is
+ * parsed.
+ *
+ * Returns -EINVAL when no "crashkernel=" option is present, otherwise the
+ * return value of the syntax-specific parser.
+ */
+int __init parse_crashkernel(char *cmdline,
+			     unsigned long long system_ram,
+			     unsigned long long *crash_size,
+			     unsigned long long *crash_base)
+{
+	char *p = cmdline, *ck_cmdline = NULL;
+	char *first_colon, *first_space;
+
+	BUG_ON(!crash_size || !crash_base);
+	*crash_size = 0;
+	*crash_base = 0;
+
+	/* find crashkernel and use the last one if there are more */
+	p = strstr(p, "crashkernel=");
+	while (p) {
+		ck_cmdline = p;
+		p = strstr(p+1, "crashkernel=");
+	}
+
+	if (!ck_cmdline)
+		return -EINVAL;
+
+	ck_cmdline += 12; /* strlen("crashkernel=") */
+
+	/*
+	 * if the commandline contains a ':', then that's the extended
+	 * syntax -- if not, it must be the classic syntax
+	 *
+	 * Only a ':' that appears before the first space counts, so that a
+	 * colon belonging to a later option does not select the extended
+	 * parser.
+	 */
+	first_colon = strchr(ck_cmdline, ':');
+	first_space = strchr(ck_cmdline, ' ');
+	if (first_colon && (!first_space || first_colon < first_space))
+		return parse_crashkernel_mem(ck_cmdline, system_ram,
+				crash_size, crash_base);
+
+	return parse_crashkernel_simple(ck_cmdline, crash_size, crash_base);
+}
+
+
+
+/*
+ * Append a CRASHTIME entry to the collected vmcoreinfo text and write that
+ * text into vmcoreinfo_note via append_elf_note()/final_note().
+ *
+ * Does nothing when no vmcoreinfo has been collected (vmcoreinfo_size == 0).
+ */
+void crash_save_vmcoreinfo(void)
+{
+	u32 *buf;
+
+	if (!vmcoreinfo_size)
+		return;
+
+	vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
+
+	buf = (u32 *)vmcoreinfo_note;
+
+	buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
+			      vmcoreinfo_size);
+
+	final_note(buf);
+}
+
+/*
+ * Format one printf-style entry and append it to vmcoreinfo_data.
+ *
+ * @fmt: printf-style format string, followed by its arguments
+ *
+ * The entry is formatted into an 0x50-byte stack buffer, so a longer entry
+ * is truncated to what fits there.  It is truncated further if the space
+ * remaining in vmcoreinfo_data (vmcoreinfo_max_size - vmcoreinfo_size) is
+ * smaller.  No terminating NUL is copied into vmcoreinfo_data.
+ */
+void vmcoreinfo_append_str(const char *fmt, ...)
+{
+	va_list args;
+	char buf[0x50];
+	size_t len, room;
+
+	va_start(args, fmt);
+	/*
+	 * vscnprintf() returns the number of characters actually stored in
+	 * buf.  vsnprintf() would return the untruncated length, which can
+	 * exceed sizeof(buf) and make the memcpy() below read past the end
+	 * of buf.
+	 */
+	len = vscnprintf(buf, sizeof(buf), fmt, args);
+	va_end(args);
+
+	/* Never write beyond the end of vmcoreinfo_data. */
+	room = vmcoreinfo_max_size - vmcoreinfo_size;
+	if (len > room)
+		len = room;
+
+	memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, len);
+
+	vmcoreinfo_size += len;
+}
+
+/*
+ * provide an empty default implementation here -- architecture
+ * code may override this
+ */
+void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
+{}
+
+/*
+ * Weak default: return the result of __pa() applied to the address of
+ * vmcoreinfo_note.
+ */
+unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
+{
+	return __pa((unsigned long)(char *)&vmcoreinfo_note);
+}
+
+/*
+ * Registered with module_init() below.  Invokes the VMCOREINFO_* macros for
+ * the listed symbols, structure sizes, member offsets, array lengths and
+ * constants, then calls arch_crash_save_vmcoreinfo().
+ *
+ * NOTE(review): the VMCOREINFO_* macros are defined outside this file;
+ * presumably each one appends an entry through vmcoreinfo_append_str() --
+ * confirm against their definitions.
+ *
+ * Always returns 0.
+ */
+static int __init crash_save_vmcoreinfo_init(void)
+{
+	VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
+	VMCOREINFO_PAGESIZE(PAGE_SIZE);
+
+	VMCOREINFO_SYMBOL(init_uts_ns);
+	VMCOREINFO_SYMBOL(node_online_map);
+	VMCOREINFO_SYMBOL(swapper_pg_dir);
+	VMCOREINFO_SYMBOL(_stext);
+	VMCOREINFO_SYMBOL(vmlist);
+
+#ifndef CONFIG_NEED_MULTIPLE_NODES
+	VMCOREINFO_SYMBOL(mem_map);
+	VMCOREINFO_SYMBOL(contig_page_data);
+#endif
+#ifdef CONFIG_SPARSEMEM
+	VMCOREINFO_SYMBOL(mem_section);
+	VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
+	VMCOREINFO_STRUCT_SIZE(mem_section);
+	VMCOREINFO_OFFSET(mem_section, section_mem_map);
+#endif
+	VMCOREINFO_STRUCT_SIZE(page);
+	VMCOREINFO_STRUCT_SIZE(pglist_data);
+	VMCOREINFO_STRUCT_SIZE(zone);
+	VMCOREINFO_STRUCT_SIZE(free_area);
+	VMCOREINFO_STRUCT_SIZE(list_head);
+	VMCOREINFO_SIZE(nodemask_t);
+	VMCOREINFO_OFFSET(page, flags);
+	VMCOREINFO_OFFSET(page, _count);
+	VMCOREINFO_OFFSET(page, mapping);
+	VMCOREINFO_OFFSET(page, lru);
+	VMCOREINFO_OFFSET(pglist_data, node_zones);
+	VMCOREINFO_OFFSET(pglist_data, nr_zones);
+#ifdef CONFIG_FLAT_NODE_MEM_MAP
+	VMCOREINFO_OFFSET(pglist_data, node_mem_map);
+#endif
+	VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
+	VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
+	VMCOREINFO_OFFSET(pglist_data, node_id);
+	VMCOREINFO_OFFSET(zone, free_area);
+	VMCOREINFO_OFFSET(zone, vm_stat);
+	VMCOREINFO_OFFSET(zone, spanned_pages);
+	VMCOREINFO_OFFSET(free_area, free_list);
+	VMCOREINFO_OFFSET(list_head, next);
+	VMCOREINFO_OFFSET(list_head, prev);
+	VMCOREINFO_OFFSET(vm_struct, addr);
+	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	log_buf_kexec_setup();
+	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
+	VMCOREINFO_NUMBER(NR_FREE_PAGES);
+	VMCOREINFO_NUMBER(PG_lru);
+	VMCOREINFO_NUMBER(PG_private);
+	VMCOREINFO_NUMBER(PG_swapcache);
+
+	arch_crash_save_vmcoreinfo();
+
+	return 0;
+}
+
+module_init(crash_save_vmcoreinfo_init)
+
+/*
+ * Move into place and start executing a preloaded standalone
+ * executable.  If nothing was preloaded return an error.
+ *
+ * kexec_mutex is taken with mutex_trylock() and released before returning.
+ *
+ * With CONFIG_KEXEC_JUMP and kexec_image->preserve_context set, processes
+ * are frozen and devices suspended step by step before machine_kexec().
+ * Each step that fails jumps to the label that undoes only the steps
+ * already completed.  When machine_kexec() returns, the same labels are
+ * walked top to bottom to resume everything.
+ *
+ * Returns -EBUSY if kexec_mutex could not be taken or freeze_processes()
+ * failed, -EINVAL if no image is loaded, the error from a failed suspend
+ * step, or 0.
+ */
+int kernel_kexec(void)
+{
+	int error = 0;
+
+	if (!mutex_trylock(&kexec_mutex))
+		return -EBUSY;
+	if (!kexec_image) {
+		error = -EINVAL;
+		goto Unlock;
+	}
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		mutex_lock(&pm_mutex);
+		pm_prepare_console();
+		error = freeze_processes();
+		if (error) {
+			error = -EBUSY;
+			goto Restore_console;
+		}
+		suspend_console();
+		error = dpm_suspend_start(PMSG_FREEZE);
+		if (error)
+			goto Resume_console;
+		/* At this point, dpm_suspend_start() has been called,
+		 * but *not* dpm_suspend_noirq(). We *must* call
+		 * dpm_suspend_noirq() now.  Otherwise, drivers for
+		 * some devices (e.g. interrupt controllers) become
+		 * desynchronized with the actual state of the
+		 * hardware at resume time, and evil weirdness ensues.
+		 */
+		error = dpm_suspend_noirq(PMSG_FREEZE);
+		if (error)
+			goto Resume_devices;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Enable_cpus;
+		local_irq_disable();
+		/* Suspend system devices */
+		error = sysdev_suspend(PMSG_FREEZE);
+		if (error)
+			goto Enable_irqs;
+	} else
+#endif
+	{
+		kernel_restart_prepare(NULL);
+		printk(KERN_EMERG "Starting new kernel\n");
+		machine_shutdown();
+	}
+
+	machine_kexec(kexec_image);
+
+#ifdef CONFIG_KEXEC_JUMP
+	if (kexec_image->preserve_context) {
+		sysdev_resume();
+ Enable_irqs:
+		local_irq_enable();
+ Enable_cpus:
+		enable_nonboot_cpus();
+		dpm_resume_noirq(PMSG_RESTORE);
+ Resume_devices:
+		dpm_resume_end(PMSG_RESTORE);
+ Resume_console:
+		resume_console();
+		thaw_processes();
+ Restore_console:
+		pm_restore_console();
+		mutex_unlock(&pm_mutex);
+	}
+#endif
+
+ Unlock:
+	mutex_unlock(&kexec_mutex);
+	return error;
+}