SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/capability.h>
  10 #include <linux/mm.h>
  11 #include <linux/file.h>
  12 #include <linux/slab.h>
  13 #include <linux/fs.h>
  14 #include <linux/kexec.h>
  15 #include <linux/mutex.h>
  16 #include <linux/list.h>
  17 #include <linux/highmem.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/reboot.h>
  20 #include <linux/ioport.h>
  21 #include <linux/hardirq.h>
  22 #include <linux/elf.h>
  23 #include <linux/elfcore.h>
  24 #include <generated/utsrelease.h>
  25 #include <linux/utsname.h>
  26 #include <linux/numa.h>
  27 #include <linux/suspend.h>
  28 #include <linux/device.h>
  29 #include <linux/freezer.h>
  30 #include <linux/pm.h>
  31 #include <linux/cpu.h>
  32 #include <linux/console.h>
  33 #include <linux/vmalloc.h>
  34 #include <linux/swap.h>
  35
  36 #include <asm/page.h>
  37 #include <asm/uaccess.h>
  38 #include <asm/io.h>
  39 #include <asm/system.h>
  40 #include <asm/sections.h>
  41
  42 /* Per cpu memory for storing cpu states in case of system crash. */
  43 note_buf_t* crash_notes;
  44
  45 /* vmcoreinfo stuff */
  46 static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
  47 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
  48 size_t vmcoreinfo_size;
  49 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
  50
  51 /* Location of the reserved area for the crash kernel */
  52 struct resource crashk_res = {
  53         .name  = "Crash kernel",
  54         .start = 0,
  55         .end   = 0,
  56         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  57 };
  58
  59 int kexec_should_crash(struct task_struct *p)
  60 {
  61         if (in_interrupt() || !p->pid || is_global_init(p) || panic_on_oops)
  62                 return 1;
  63         return 0;
  64 }
  65
  66 /*
  67  * When kexec transitions to the new kernel there is a one-to-one
  68  * mapping between physical and virtual addresses.  On processors
  69  * where you can disable the MMU this is trivial, and easy.  For
  70  * others it is still a simple predictable page table to setup.
  71  *
  72  * In that environment kexec copies the new kernel to its final
  73  * resting place.  This means I can only support memory whose
  74  * physical address can fit in an unsigned long.  In particular
  75  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  76  * If the assembly stub has more restrictive requirements
  77  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  78  * defined more restrictively in <asm/kexec.h>.
  79  *
   80  * The code for the transition from the current kernel to the
   81  * new kernel is placed in the control_code_buffer, whose size
  82  * is given by KEXEC_CONTROL_PAGE_SIZE.  In the best case only a single
  83  * page of memory is necessary, but some architectures require more.
  84  * Because this memory must be identity mapped in the transition from
  85  * virtual to physical addresses it must live in the range
  86  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  87  * modifiable.
  88  *
  89  * The assembly stub in the control code buffer is passed a linked list
  90  * of descriptor pages detailing the source pages of the new kernel,
  91  * and the destination addresses of those source pages.  As this data
  92  * structure is not used in the context of the current OS, it must
  93  * be self-contained.
  94  *
  95  * The code has been made to work with highmem pages and will use a
  96  * destination page in its final resting place (if it happens
  97  * to allocate it).  The end product of this is that most of the
  98  * physical address space, and most of RAM can be used.
  99  *
 100  * Future directions include:
 101  *  - allocating a page table with the control code buffer identity
 102  *    mapped, to simplify machine_kexec and make kexec_on_panic more
 103  *    reliable.
 104  */
 105
 106 /*
 107  * KIMAGE_NO_DEST is an impossible destination address..., for
 108  * allocating pages whose destination address we do not care about.
 109  */
 110 #define KIMAGE_NO_DEST (-1UL)
 111
 112 static int kimage_is_destination_range(struct kimage *image,
 113                                        unsigned long start, unsigned long end);
 114 static struct page *kimage_alloc_page(struct kimage *image,
 115                                        gfp_t gfp_mask,
 116                                        unsigned long dest);
 117
 118 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 119                             unsigned long nr_segments,
 120                             struct kexec_segment __user *segments)
 121 {
 122         size_t segment_bytes;
 123         struct kimage *image;
 124         unsigned long i;
 125         int result;
 126
 127         /* Allocate a controlling structure */
 128         result = -ENOMEM;
 129         image = kzalloc(sizeof(*image), GFP_KERNEL);
 130         if (!image)
 131                 goto out;
 132
 133         image->head = 0;
 134         image->entry = &image->head;
 135         image->last_entry = &image->head;
 136         image->control_page = ~0; /* By default this does not apply */
 137         image->start = entry;
 138         image->type = KEXEC_TYPE_DEFAULT;
 139
 140         /* Initialize the list of control pages */
 141         INIT_LIST_HEAD(&image->control_pages);
 142
 143         /* Initialize the list of destination pages */
 144         INIT_LIST_HEAD(&image->dest_pages);
 145
 146         /* Initialize the list of unuseable pages */
 147         INIT_LIST_HEAD(&image->unuseable_pages);
 148
 149         /* Read in the segments */
 150         image->nr_segments = nr_segments;
 151         segment_bytes = nr_segments * sizeof(*segments);
 152         result = copy_from_user(image->segment, segments, segment_bytes);
 153         if (result)
 154                 goto out;
 155
 156         /*
 157          * Verify we have good destination addresses.  The caller is
 158          * responsible for making certain we don't attempt to load
 159          * the new image into invalid or reserved areas of RAM.  This
 160          * just verifies it is an address we can use.
 161          *
 162          * Since the kernel does everything in page size chunks ensure
 163          * the destination addreses are page aligned.  Too many
 164          * special cases crop of when we don't do this.  The most
 165          * insidious is getting overlapping destination addresses
 166          * simply because addresses are changed to page size
 167          * granularity.
 168          */
 169         result = -EADDRNOTAVAIL;
 170         for (i = 0; i < nr_segments; i++) {
 171                 unsigned long mstart, mend;
 172
 173                 mstart = image->segment[i].mem;
 174                 mend   = mstart + image->segment[i].memsz;
 175                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 176                         goto out;
 177                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 178                         goto out;
 179         }
 180
 181         /* Verify our destination addresses do not overlap.
 182          * If we alloed overlapping destination addresses
 183          * through very weird things can happen with no
 184          * easy explanation as one segment stops on another.
 185          */
 186         result = -EINVAL;
 187         for (i = 0; i < nr_segments; i++) {
 188                 unsigned long mstart, mend;
 189                 unsigned long j;
 190
 191                 mstart = image->segment[i].mem;
 192                 mend   = mstart + image->segment[i].memsz;
 193                 for (j = 0; j < i; j++) {
 194                         unsigned long pstart, pend;
 195                         pstart = image->segment[j].mem;
 196                         pend   = pstart + image->segment[j].memsz;
 197                         /* Do the segments overlap ? */
 198                         if ((mend > pstart) && (mstart < pend))
 199                                 goto out;
 200                 }
 201         }
 202
 203         /* Ensure our buffer sizes are strictly less than
 204          * our memory sizes.  This should always be the case,
 205          * and it is easier to check up front than to be surprised
 206          * later on.
 207          */
 208         result = -EINVAL;
 209         for (i = 0; i < nr_segments; i++) {
 210                 if (image->segment[i].bufsz > image->segment[i].memsz)
 211                         goto out;
 212         }
 213
 214         result = 0;
 215 out:
 216         if (result == 0)
 217                 *rimage = image;
 218         else
 219                 kfree(image);
 220
 221         return result;
 222
 223 }
 224
 225 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 226                                 unsigned long nr_segments,
 227                                 struct kexec_segment __user *segments)
 228 {
 229         int result;
 230         struct kimage *image;
 231
 232         /* Allocate and initialize a controlling structure */
 233         image = NULL;
 234         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 235         if (result)
 236                 goto out;
 237
 238         *rimage = image;
 239
 240         /*
 241          * Find a location for the control code buffer, and add it
 242          * the vector of segments so that it's pages will also be
 243          * counted as destination pages.
 244          */
 245         result = -ENOMEM;
 246         image->control_code_page = kimage_alloc_control_pages(image,
 247                                            get_order(KEXEC_CONTROL_PAGE_SIZE));
 248         if (!image->control_code_page) {
 249                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 250                 goto out;
 251         }
 252
 253         image->swap_page = kimage_alloc_control_pages(image, 0);
 254         if (!image->swap_page) {
 255                 printk(KERN_ERR "Could not allocate swap buffer\n");
 256                 goto out;
 257         }
 258
 259         result = 0;
 260  out:
 261         if (result == 0)
 262                 *rimage = image;
 263         else
 264                 kfree(image);
 265
 266         return result;
 267 }
 268
 269 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 270                                 unsigned long nr_segments,
 271                                 struct kexec_segment __user *segments)
 272 {
 273         int result;
 274         struct kimage *image;
 275         unsigned long i;
 276
 277         image = NULL;
 278         /* Verify we have a valid entry point */
 279         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 280                 result = -EADDRNOTAVAIL;
 281                 goto out;
 282         }
 283
 284         /* Allocate and initialize a controlling structure */
 285         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 286         if (result)
 287                 goto out;
 288
 289         /* Enable the special crash kernel control page
 290          * allocation policy.
 291          */
 292         image->control_page = crashk_res.start;
 293         image->type = KEXEC_TYPE_CRASH;
 294
 295         /*
 296          * Verify we have good destination addresses.  Normally
 297          * the caller is responsible for making certain we don't
 298          * attempt to load the new image into invalid or reserved
 299          * areas of RAM.  But crash kernels are preloaded into a
 300          * reserved area of ram.  We must ensure the addresses
 301          * are in the reserved area otherwise preloading the
 302          * kernel could corrupt things.
 303          */
 304         result = -EADDRNOTAVAIL;
 305         for (i = 0; i < nr_segments; i++) {
 306                 unsigned long mstart, mend;
 307
 308                 mstart = image->segment[i].mem;
 309                 mend = mstart + image->segment[i].memsz - 1;
 310                 /* Ensure we are within the crash kernel limits */
 311                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 312                         goto out;
 313         }
 314
 315         /*
 316          * Find a location for the control code buffer, and add
 317          * the vector of segments so that it's pages will also be
 318          * counted as destination pages.
 319          */
 320         result = -ENOMEM;
 321         image->control_code_page = kimage_alloc_control_pages(image,
 322                                            get_order(KEXEC_CONTROL_PAGE_SIZE));
 323         if (!image->control_code_page) {
 324                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 325                 goto out;
 326         }
 327
 328         result = 0;
 329 out:
 330         if (result == 0)
 331                 *rimage = image;
 332         else
 333                 kfree(image);
 334
 335         return result;
 336 }
 337
 338 static int kimage_is_destination_range(struct kimage *image,
 339                                         unsigned long start,
 340                                         unsigned long end)
 341 {
 342         unsigned long i;
 343
 344         for (i = 0; i < image->nr_segments; i++) {
 345                 unsigned long mstart, mend;
 346
 347                 mstart = image->segment[i].mem;
 348                 mend = mstart + image->segment[i].memsz;
 349                 if ((end > mstart) && (start < mend))
 350                         return 1;
 351         }
 352
 353         return 0;
 354 }
 355
 356 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 357 {
 358         struct page *pages;
 359
 360         pages = alloc_pages(gfp_mask, order);
 361         if (pages) {
 362                 unsigned int count, i;
 363                 pages->mapping = NULL;
 364                 set_page_private(pages, order);
 365                 count = 1 << order;
 366                 for (i = 0; i < count; i++)
 367                         SetPageReserved(pages + i);
 368         }
 369
 370         return pages;
 371 }
 372
 373 static void kimage_free_pages(struct page *page)
 374 {
 375         unsigned int order, count, i;
 376
 377         order = page_private(page);
 378         count = 1 << order;
 379         for (i = 0; i < count; i++)
 380                 ClearPageReserved(page + i);
 381         __free_pages(page, order);
 382 }
 383
 384 static void kimage_free_page_list(struct list_head *list)
 385 {
 386         struct list_head *pos, *next;
 387
 388         list_for_each_safe(pos, next, list) {
 389                 struct page *page;
 390
 391                 page = list_entry(pos, struct page, lru);
 392                 list_del(&page->lru);
 393                 kimage_free_pages(page);
 394         }
 395 }
 396
 397 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 398                                                         unsigned int order)
 399 {
 400         /* Control pages are special, they are the intermediaries
 401          * that are needed while we copy the rest of the pages
 402          * to their final resting place.  As such they must
 403          * not conflict with either the destination addresses
 404          * or memory the kernel is already using.
 405          *
 406          * The only case where we really need more than one of
 407          * these are for architectures where we cannot disable
 408          * the MMU and must instead generate an identity mapped
 409          * page table for all of the memory.
 410          *
 411          * At worst this runs in O(N) of the image size.
 412          */
 413         struct list_head extra_pages;
 414         struct page *pages;
 415         unsigned int count;
 416
 417         count = 1 << order;
 418         INIT_LIST_HEAD(&extra_pages);
 419
 420         /* Loop while I can allocate a page and the page allocated
 421          * is a destination page.
 422          */
 423         do {
 424                 unsigned long pfn, epfn, addr, eaddr;
 425
 426                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 427                 if (!pages)
 428                         break;
 429                 pfn   = page_to_pfn(pages);
 430                 epfn  = pfn + count;
 431                 addr  = pfn << PAGE_SHIFT;
 432                 eaddr = epfn << PAGE_SHIFT;
 433                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 434                               kimage_is_destination_range(image, addr, eaddr)) {
 435                         list_add(&pages->lru, &extra_pages);
 436                         pages = NULL;
 437                 }
 438         } while (!pages);
 439
 440         if (pages) {
 441                 /* Remember the allocated page... */
 442                 list_add(&pages->lru, &image->control_pages);
 443
 444                 /* Because the page is already in it's destination
 445                  * location we will never allocate another page at
 446                  * that address.  Therefore kimage_alloc_pages
 447                  * will not return it (again) and we don't need
 448                  * to give it an entry in image->segment[].
 449                  */
 450         }
 451         /* Deal with the destination pages I have inadvertently allocated.
 452          *
 453          * Ideally I would convert multi-page allocations into single
 454          * page allocations, and add everyting to image->dest_pages.
 455          *
 456          * For now it is simpler to just free the pages.
 457          */
 458         kimage_free_page_list(&extra_pages);
 459
 460         return pages;
 461 }
 462
 463 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 464                                                       unsigned int order)
 465 {
 466         /* Control pages are special, they are the intermediaries
 467          * that are needed while we copy the rest of the pages
 468          * to their final resting place.  As such they must
 469          * not conflict with either the destination addresses
 470          * or memory the kernel is already using.
 471          *
 472          * Control pages are also the only pags we must allocate
 473          * when loading a crash kernel.  All of the other pages
 474          * are specified by the segments and we just memcpy
 475          * into them directly.
 476          *
 477          * The only case where we really need more than one of
 478          * these are for architectures where we cannot disable
 479          * the MMU and must instead generate an identity mapped
 480          * page table for all of the memory.
 481          *
 482          * Given the low demand this implements a very simple
 483          * allocator that finds the first hole of the appropriate
 484          * size in the reserved memory region, and allocates all
 485          * of the memory up to and including the hole.
 486          */
 487         unsigned long hole_start, hole_end, size;
 488         struct page *pages;
 489
 490         pages = NULL;
 491         size = (1 << order) << PAGE_SHIFT;
 492         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 493         hole_end   = hole_start + size - 1;
 494         while (hole_end <= crashk_res.end) {
 495                 unsigned long i;
 496
 497                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 498                         break;
 499                 if (hole_end > crashk_res.end)
 500                         break;
 501                 /* See if I overlap any of the segments */
 502                 for (i = 0; i < image->nr_segments; i++) {
 503                         unsigned long mstart, mend;
 504
 505                         mstart = image->segment[i].mem;
 506                         mend   = mstart + image->segment[i].memsz - 1;
 507                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 508                                 /* Advance the hole to the end of the segment */
 509                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 510                                 hole_end   = hole_start + size - 1;
 511                                 break;
 512                         }
 513                 }
 514                 /* If I don't overlap any segments I have found my hole! */
 515                 if (i == image->nr_segments) {
 516                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 517                         break;
 518                 }
 519         }
 520         if (pages)
 521                 image->control_page = hole_end;
 522
 523         return pages;
 524 }
 525
 526
 527 struct page *kimage_alloc_control_pages(struct kimage *image,
 528                                          unsigned int order)
 529 {
 530         struct page *pages = NULL;
 531
 532         switch (image->type) {
 533         case KEXEC_TYPE_DEFAULT:
 534                 pages = kimage_alloc_normal_control_pages(image, order);
 535                 break;
 536         case KEXEC_TYPE_CRASH:
 537                 pages = kimage_alloc_crash_control_pages(image, order);
 538                 break;
 539         }
 540
 541         return pages;
 542 }
 543
 544 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 545 {
 546         if (*image->entry != 0)
 547                 image->entry++;
 548
 549         if (image->entry == image->last_entry) {
 550                 kimage_entry_t *ind_page;
 551                 struct page *page;
 552
 553                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 554                 if (!page)
 555                         return -ENOMEM;
 556
 557                 ind_page = page_address(page);
 558                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 559                 image->entry = ind_page;
 560                 image->last_entry = ind_page +
 561                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 562         }
 563         *image->entry = entry;
 564         image->entry++;
 565         *image->entry = 0;
 566
 567         return 0;
 568 }
 569
 570 static int kimage_set_destination(struct kimage *image,
 571                                    unsigned long destination)
 572 {
 573         int result;
 574
 575         destination &= PAGE_MASK;
 576         result = kimage_add_entry(image, destination | IND_DESTINATION);
 577         if (result == 0)
 578                 image->destination = destination;
 579
 580         return result;
 581 }
 582
 583
 584 static int kimage_add_page(struct kimage *image, unsigned long page)
 585 {
 586         int result;
 587
 588         page &= PAGE_MASK;
 589         result = kimage_add_entry(image, page | IND_SOURCE);
 590         if (result == 0)
 591                 image->destination += PAGE_SIZE;
 592
 593         return result;
 594 }
 595
 596
 597 static void kimage_free_extra_pages(struct kimage *image)
 598 {
 599         /* Walk through and free any extra destination pages I may have */
 600         kimage_free_page_list(&image->dest_pages);
 601
 602         /* Walk through and free any unuseable pages I have cached */
 603         kimage_free_page_list(&image->unuseable_pages);
 604
 605 }
 606 static void kimage_terminate(struct kimage *image)
 607 {
 608         if (*image->entry != 0)
 609                 image->entry++;
 610
 611         *image->entry = IND_DONE;
 612 }
 613
 614 #define for_each_kimage_entry(image, ptr, entry) \
 615         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 616                 ptr = (entry & IND_INDIRECTION)? \
 617                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 618
 619 static void kimage_free_entry(kimage_entry_t entry)
 620 {
 621         struct page *page;
 622
 623         page = pfn_to_page(entry >> PAGE_SHIFT);
 624         kimage_free_pages(page);
 625 }
 626
 627 static void kimage_free(struct kimage *image)
 628 {
 629         kimage_entry_t *ptr, entry;
 630         kimage_entry_t ind = 0;
 631
 632         if (!image)
 633                 return;
 634
 635         kimage_free_extra_pages(image);
 636         for_each_kimage_entry(image, ptr, entry) {
 637                 if (entry & IND_INDIRECTION) {
 638                         /* Free the previous indirection page */
 639                         if (ind & IND_INDIRECTION)
 640                                 kimage_free_entry(ind);
 641                         /* Save this indirection page until we are
 642                          * done with it.
 643                          */
 644                         ind = entry;
 645                 }
 646                 else if (entry & IND_SOURCE)
 647                         kimage_free_entry(entry);
 648         }
 649         /* Free the final indirection page */
 650         if (ind & IND_INDIRECTION)
 651                 kimage_free_entry(ind);
 652
 653         /* Handle any machine specific cleanup */
 654         machine_kexec_cleanup(image);
 655
 656         /* Free the kexec control pages... */
 657         kimage_free_page_list(&image->control_pages);
 658         kfree(image);
 659 }
 660
 661 static kimage_entry_t *kimage_dst_used(struct kimage *image,
 662                                         unsigned long page)
 663 {
 664         kimage_entry_t *ptr, entry;
 665         unsigned long destination = 0;
 666
 667         for_each_kimage_entry(image, ptr, entry) {
 668                 if (entry & IND_DESTINATION)
 669                         destination = entry & PAGE_MASK;
 670                 else if (entry & IND_SOURCE) {
 671                         if (page == destination)
 672                                 return ptr;
 673                         destination += PAGE_SIZE;
 674                 }
 675         }
 676
 677         return NULL;
 678 }
 679
 680 static struct page *kimage_alloc_page(struct kimage *image,
 681                                         gfp_t gfp_mask,
 682                                         unsigned long destination)
 683 {
 684         /*
 685          * Here we implement safeguards to ensure that a source page
 686          * is not copied to its destination page before the data on
 687          * the destination page is no longer useful.
 688          *
 689          * To do this we maintain the invariant that a source page is
 690          * either its own destination page, or it is not a
 691          * destination page at all.
 692          *
 693          * That is slightly stronger than required, but the proof
 694          * that no problems will not occur is trivial, and the
 695          * implementation is simply to verify.
 696          *
 697          * When allocating all pages normally this algorithm will run
 698          * in O(N) time, but in the worst case it will run in O(N^2)
 699          * time.   If the runtime is a problem the data structures can
 700          * be fixed.
 701          */
 702         struct page *page;
 703         unsigned long addr;
 704
 705         /*
 706          * Walk through the list of destination pages, and see if I
 707          * have a match.
 708          */
 709         list_for_each_entry(page, &image->dest_pages, lru) {
 710                 addr = page_to_pfn(page) << PAGE_SHIFT;
 711                 if (addr == destination) {
 712                         list_del(&page->lru);
 713                         return page;
 714                 }
 715         }
 716         page = NULL;
 717         while (1) {
 718                 kimage_entry_t *old;
 719
 720                 /* Allocate a page, if we run out of memory give up */
 721                 page = kimage_alloc_pages(gfp_mask, 0);
 722                 if (!page)
 723                         return NULL;
 724                 /* If the page cannot be used file it away */
 725                 if (page_to_pfn(page) >
 726                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 727                         list_add(&page->lru, &image->unuseable_pages);
 728                         continue;
 729                 }
 730                 addr = page_to_pfn(page) << PAGE_SHIFT;
 731
 732                 /* If it is the destination page we want use it */
 733                 if (addr == destination)
 734                         break;
 735
 736                 /* If the page is not a destination page use it */
 737                 if (!kimage_is_destination_range(image, addr,
 738                                                   addr + PAGE_SIZE))
 739                         break;
 740
 741                 /*
 742                  * I know that the page is someones destination page.
 743                  * See if there is already a source page for this
 744                  * destination page.  And if so swap the source pages.
 745                  */
 746                 old = kimage_dst_used(image, addr);
 747                 if (old) {
 748                         /* If so move it */
 749                         unsigned long old_addr;
 750                         struct page *old_page;
 751
 752                         old_addr = *old & PAGE_MASK;
 753                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 754                         copy_highpage(page, old_page);
 755                         *old = addr | (*old & ~PAGE_MASK);
 756
 757                         /* The old page I have found cannot be a
 758                          * destination page, so return it if it's
 759                          * gfp_flags honor the ones passed in.
 760                          */
 761                         if (!(gfp_mask & __GFP_HIGHMEM) &&
 762                             PageHighMem(old_page)) {
 763                                 kimage_free_pages(old_page);
 764                                 continue;
 765                         }
 766                         addr = old_addr;
 767                         page = old_page;
 768                         break;
 769                 }
 770                 else {
 771                         /* Place the page on the destination list I
 772                          * will use it later.
 773                          */
 774                         list_add(&page->lru, &image->dest_pages);
 775                 }
 776         }
 777
 778         return page;
 779 }
 780
 781 static int kimage_load_normal_segment(struct kimage *image,
 782                                          struct kexec_segment *segment)
 783 {
 784         unsigned long maddr;
 785         unsigned long ubytes, mbytes;
 786         int result;
 787         unsigned char __user *buf;
 788
 789         result = 0;
 790         buf = segment->buf;
 791         ubytes = segment->bufsz;
 792         mbytes = segment->memsz;
 793         maddr = segment->mem;
 794
 795         result = kimage_set_destination(image, maddr);
 796         if (result < 0)
 797                 goto out;
 798
 799         while (mbytes) {
 800                 struct page *page;
 801                 char *ptr;
 802                 size_t uchunk, mchunk;
 803
 804                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 805                 if (!page) {
 806                         result  = -ENOMEM;
 807                         goto out;
 808                 }
 809                 result = kimage_add_page(image, page_to_pfn(page)
 810                                                                 << PAGE_SHIFT);
 811                 if (result < 0)
 812                         goto out;
 813
 814                 ptr = kmap(page);
 815                 /* Start with a clear page */
 816                 memset(ptr, 0, PAGE_SIZE);
 817                 ptr += maddr & ~PAGE_MASK;
 818                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 819                 if (mchunk > mbytes)
 820                         mchunk = mbytes;
 821
 822                 uchunk = mchunk;
 823                 if (uchunk > ubytes)
 824                         uchunk = ubytes;
 825
 826                 result = copy_from_user(ptr, buf, uchunk);
 827                 kunmap(page);
 828                 if (result) {
 829                         result = (result < 0) ? result : -EIO;
 830                         goto out;
 831                 }
 832                 ubytes -= uchunk;
 833                 maddr  += mchunk;
 834                 buf    += mchunk;
 835                 mbytes -= mchunk;
 836         }
 837 out:
 838         return result;
 839 }
 840
 841 static int kimage_load_crash_segment(struct kimage *image,
 842                                         struct kexec_segment *segment)
 843 {
 844         /* For crash dumps kernels we simply copy the data from
 845          * user space to it's destination.
 846          * We do things a page at a time for the sake of kmap.
 847          */
 848         unsigned long maddr;
 849         unsigned long ubytes, mbytes;
 850         int result;
 851         unsigned char __user *buf;
 852
 853         result = 0;
 854         buf = segment->buf;
 855         ubytes = segment->bufsz;
 856         mbytes = segment->memsz;
 857         maddr = segment->mem;
 858         while (mbytes) {
 859                 struct page *page;
 860                 char *ptr;
 861                 size_t uchunk, mchunk;
 862
 863                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 864                 if (!page) {
 865                         result  = -ENOMEM;
 866                         goto out;
 867                 }
 868                 ptr = kmap(page);
 869                 ptr += maddr & ~PAGE_MASK;
 870                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 871                 if (mchunk > mbytes)
 872                         mchunk = mbytes;
 873
 874                 uchunk = mchunk;
 875                 if (uchunk > ubytes) {
 876                         uchunk = ubytes;
 877                         /* Zero the trailing part of the page */
 878                         memset(ptr + uchunk, 0, mchunk - uchunk);
 879                 }
 880                 result = copy_from_user(ptr, buf, uchunk);
 881                 kexec_flush_icache_page(page);
 882                 kunmap(page);
 883                 if (result) {
 884                         result = (result < 0) ? result : -EIO;
 885                         goto out;
 886                 }
 887                 ubytes -= uchunk;
 888                 maddr  += mchunk;
 889                 buf    += mchunk;
 890                 mbytes -= mchunk;
 891         }
 892 out:
 893         return result;
 894 }
 895
 896 static int kimage_load_segment(struct kimage *image,
 897                                 struct kexec_segment *segment)
 898 {
 899         int result = -ENOMEM;
 900
 901         switch (image->type) {
 902         case KEXEC_TYPE_DEFAULT:
 903                 result = kimage_load_normal_segment(image, segment);
 904                 break;
 905         case KEXEC_TYPE_CRASH:
 906                 result = kimage_load_crash_segment(image, segment);
 907                 break;
 908         }
 909
 910         return result;
 911 }
 912
 913 /*
 914  * Exec Kernel system call: for obvious reasons only root may call it.
 915  *
 916  * This call breaks up into three pieces.
 917  * - A generic part which loads the new kernel from the current
 918  *   address space, and very carefully places the data in the
 919  *   allocated pages.
 920  *
 921  * - A generic part that interacts with the kernel and tells all of
 922  *   the devices to shut down.  Preventing on-going dmas, and placing
 923  *   the devices in a consistent state so a later kernel can
 924  *   reinitialize them.
 925  *
 926  * - A machine specific part that includes the syscall number
 927  *   and the copies the image to it's final destination.  And
 928  *   jumps into the image at entry.
 929  *
 930  * kexec does not sync, or unmount filesystems so if you need
 931  * that to happen you need to do that yourself.
 932  */
 933 struct kimage *kexec_image;
 934 struct kimage *kexec_crash_image;
 935
 936 static DEFINE_MUTEX(kexec_mutex);
 937
 938 SYSCALL_DEFINE4(kexec_load, unsigned long, entry, unsigned long, nr_segments,
 939                 struct kexec_segment __user *, segments, unsigned long, flags)
 940 {
 941         struct kimage **dest_image, *image;
 942         int result;
 943
 944         /* We only trust the superuser with rebooting the system. */
 945         if (!capable(CAP_SYS_BOOT))
 946                 return -EPERM;
 947
 948         /*
 949          * Verify we have a legal set of flags
 950          * This leaves us room for future extensions.
 951          */
 952         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 953                 return -EINVAL;
 954
 955         /* Verify we are on the appropriate architecture */
 956         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 957                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 958                 return -EINVAL;
 959
 960         /* Put an artificial cap on the number
 961          * of segments passed to kexec_load.
 962          */
 963         if (nr_segments > KEXEC_SEGMENT_MAX)
 964                 return -EINVAL;
 965
 966         image = NULL;
 967         result = 0;
 968
 969         /* Because we write directly to the reserved memory
 970          * region when loading crash kernels we need a mutex here to
 971          * prevent multiple crash  kernels from attempting to load
 972          * simultaneously, and to prevent a crash kernel from loading
 973          * over the top of a in use crash kernel.
 974          *
 975          * KISS: always take the mutex.
 976          */
 977         if (!mutex_trylock(&kexec_mutex))
 978                 return -EBUSY;
 979
 980         dest_image = &kexec_image;
 981         if (flags & KEXEC_ON_CRASH)
 982                 dest_image = &kexec_crash_image;
 983         if (nr_segments > 0) {
 984                 unsigned long i;
 985
 986                 /* Loading another kernel to reboot into */
 987                 if ((flags & KEXEC_ON_CRASH) == 0)
 988                         result = kimage_normal_alloc(&image, entry,
 989                                                         nr_segments, segments);
 990                 /* Loading another kernel to switch to if this one crashes */
 991                 else if (flags & KEXEC_ON_CRASH) {
 992                         /* Free any current crash dump kernel before
 993                          * we corrupt it.
 994                          */
 995                         kimage_free(xchg(&kexec_crash_image, NULL));
 996                         result = kimage_crash_alloc(&image, entry,
 997                                                      nr_segments, segments);
 998                 }
 999                 if (result)
1000                         goto out;
1001
1002                 if (flags & KEXEC_PRESERVE_CONTEXT)
1003                         image->preserve_context = 1;
1004                 result = machine_kexec_prepare(image);
1005                 if (result)
1006                         goto out;
1007
1008                 for (i = 0; i < nr_segments; i++) {
1009                         result = kimage_load_segment(image, &image->segment[i]);
1010                         if (result)
1011                                 goto out;
1012                 }
1013                 kimage_terminate(image);
1014         }
1015         /* Install the new kernel, and  Uninstall the old */
1016         image = xchg(dest_image, image);
1017
1018 out:
1019         mutex_unlock(&kexec_mutex);
1020         kimage_free(image);
1021
1022         return result;
1023 }
1024
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for kexec_load.
 *
 * Every compat_kexec_segment is read from user space, converted to a
 * native kexec_segment and written to a user-space area obtained from
 * compat_alloc_user_space(); sys_kexec_load() is then called on the
 * converted array.
 *
 * Returns -EINVAL for KEXEC_ARCH_DEFAULT or too many segments, -EFAULT
 * when a segment cannot be copied, otherwise the sys_kexec_load() result.
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
				unsigned long nr_segments,
				struct compat_kexec_segment __user *segments,
				unsigned long flags)
{
	struct compat_kexec_segment in;
	struct kexec_segment out, __user *ksegments;
	unsigned long i;

	/*
	 * Don't allow clients that don't understand the native
	 * architecture to do anything.
	 */
	if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
		return -EINVAL;

	if (nr_segments > KEXEC_SEGMENT_MAX)
		return -EINVAL;

	ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
	for (i = 0; i < nr_segments; i++) {
		if (copy_from_user(&in, &segments[i], sizeof(in)))
			return -EFAULT;

		out.buf   = compat_ptr(in.buf);
		out.bufsz = in.bufsz;
		out.mem   = in.mem;
		out.memsz = in.memsz;

		if (copy_to_user(&ksegments[i], &out, sizeof(out)))
			return -EFAULT;
	}

	return sys_kexec_load(entry, nr_segments, ksegments, flags);
}
#endif
1063
1064 void crash_kexec(struct pt_regs *regs)
1065 {
1066         /* Take the kexec_mutex here to prevent sys_kexec_load
1067          * running on one cpu from replacing the crash kernel
1068          * we are using after a panic on a different cpu.
1069          *
1070          * If the crash kernel was not located in a fixed area
1071          * of memory the xchg(&kexec_crash_image) would be
1072          * sufficient.  But since I reuse the memory...
1073          */
1074         if (mutex_trylock(&kexec_mutex)) {
1075                 if (kexec_crash_image) {
1076                         struct pt_regs fixed_regs;
1077                         crash_setup_regs(&fixed_regs, regs);
1078                         crash_save_vmcoreinfo();
1079                         machine_crash_shutdown(&fixed_regs);
1080                         machine_kexec(kexec_crash_image);
1081                 }
1082                 mutex_unlock(&kexec_mutex);
1083         }
1084 }
1085
1086 size_t crash_get_memory_size(void)
1087 {
1088         size_t size;
1089         mutex_lock(&kexec_mutex);
1090         size = crashk_res.end - crashk_res.start + 1;
1091         mutex_unlock(&kexec_mutex);
1092         return size;
1093 }
1094
1095 static void free_reserved_phys_range(unsigned long begin, unsigned long end)
1096 {
1097         unsigned long addr;
1098
1099         for (addr = begin; addr < end; addr += PAGE_SIZE) {
1100                 ClearPageReserved(pfn_to_page(addr >> PAGE_SHIFT));
1101                 init_page_count(pfn_to_page(addr >> PAGE_SHIFT));
1102                 free_page((unsigned long)__va(addr));
1103                 totalram_pages++;
1104         }
1105 }
1106
1107 int crash_shrink_memory(unsigned long new_size)
1108 {
1109         int ret = 0;
1110         unsigned long start, end;
1111
1112         mutex_lock(&kexec_mutex);
1113
1114         if (kexec_crash_image) {
1115                 ret = -ENOENT;
1116                 goto unlock;
1117         }
1118         start = crashk_res.start;
1119         end = crashk_res.end;
1120
1121         if (new_size >= end - start + 1) {
1122                 ret = -EINVAL;
1123                 if (new_size == end - start + 1)
1124                         ret = 0;
1125                 goto unlock;
1126         }
1127
1128         start = roundup(start, PAGE_SIZE);
1129         end = roundup(start + new_size, PAGE_SIZE);
1130
1131         free_reserved_phys_range(end, crashk_res.end);
1132
1133         if (start == end) {
1134                 crashk_res.end = end;
1135                 release_resource(&crashk_res);
1136         } else
1137                 crashk_res.end = end - 1;
1138
1139 unlock:
1140         mutex_unlock(&kexec_mutex);
1141         return ret;
1142 }
1143
1144 static u32 *append_elf_note(u32 *buf, char *name, unsigned type, void *data,
1145                             size_t data_len)
1146 {
1147         struct elf_note note;
1148
1149         note.n_namesz = strlen(name) + 1;
1150         note.n_descsz = data_len;
1151         note.n_type   = type;
1152         memcpy(buf, &note, sizeof(note));
1153         buf += (sizeof(note) + 3)/4;
1154         memcpy(buf, name, note.n_namesz);
1155         buf += (note.n_namesz + 3)/4;
1156         memcpy(buf, data, note.n_descsz);
1157         buf += (note.n_descsz + 3)/4;
1158
1159         return buf;
1160 }
1161
1162 static void final_note(u32 *buf)
1163 {
1164         struct elf_note note;
1165
1166         note.n_namesz = 0;
1167         note.n_descsz = 0;
1168         note.n_type   = 0;
1169         memcpy(buf, &note, sizeof(note));
1170 }
1171
1172 void crash_save_cpu(struct pt_regs *regs, int cpu)
1173 {
1174         struct elf_prstatus prstatus;
1175         u32 *buf;
1176
1177         if ((cpu < 0) || (cpu >= nr_cpu_ids))
1178                 return;
1179
1180         /* Using ELF notes here is opportunistic.
1181          * I need a well defined structure format
1182          * for the data I pass, and I need tags
1183          * on the data to indicate what information I have
1184          * squirrelled away.  ELF notes happen to provide
1185          * all of that, so there is no need to invent something new.
1186          */
1187         buf = (u32*)per_cpu_ptr(crash_notes, cpu);
1188         if (!buf)
1189                 return;
1190         memset(&prstatus, 0, sizeof(prstatus));
1191         prstatus.pr_pid = current->pid;
1192         elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
1193         buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
1194                               &prstatus, sizeof(prstatus));
1195         final_note(buf);
1196 }
1197
1198 static int __init crash_notes_memory_init(void)
1199 {
1200         /* Allocate memory for saving cpu registers. */
1201         crash_notes = alloc_percpu(note_buf_t);
1202         if (!crash_notes) {
1203                 printk("Kexec: Memory allocation for saving cpu register"
1204                 " states failed\n");
1205                 return -ENOMEM;
1206         }
1207         return 0;
1208 }
1209 module_init(crash_notes_memory_init)
1210
1211
1212 /*
1213  * parsing the "crashkernel" commandline
1214  *
1215  * this code is intended to be called from architecture specific code
1216  */
1217
1218
1219 /*
1220  * This function parses command lines in the format
1221  *
1222  *   crashkernel=ramsize-range:size[,...][@offset]
1223  *
1224  * The function returns 0 on success and -EINVAL on failure.
1225  */
1226 static int __init parse_crashkernel_mem(char                    *cmdline,
1227                                         unsigned long long      system_ram,
1228                                         unsigned long long      *crash_size,
1229                                         unsigned long long      *crash_base)
1230 {
1231         char *cur = cmdline, *tmp;
1232
1233         /* for each entry of the comma-separated list */
1234         do {
1235                 unsigned long long start, end = ULLONG_MAX, size;
1236
1237                 /* get the start of the range */
1238                 start = memparse(cur, &tmp);
1239                 if (cur == tmp) {
1240                         pr_warning("crashkernel: Memory value expected\n");
1241                         return -EINVAL;
1242                 }
1243                 cur = tmp;
1244                 if (*cur != '-') {
1245                         pr_warning("crashkernel: '-' expected\n");
1246                         return -EINVAL;
1247                 }
1248                 cur++;
1249
1250                 /* if no ':' is here, than we read the end */
1251                 if (*cur != ':') {
1252                         end = memparse(cur, &tmp);
1253                         if (cur == tmp) {
1254                                 pr_warning("crashkernel: Memory "
1255                                                 "value expected\n");
1256                                 return -EINVAL;
1257                         }
1258                         cur = tmp;
1259                         if (end <= start) {
1260                                 pr_warning("crashkernel: end <= start\n");
1261                                 return -EINVAL;
1262                         }
1263                 }
1264
1265                 if (*cur != ':') {
1266                         pr_warning("crashkernel: ':' expected\n");
1267                         return -EINVAL;
1268                 }
1269                 cur++;
1270
1271                 size = memparse(cur, &tmp);
1272                 if (cur == tmp) {
1273                         pr_warning("Memory value expected\n");
1274                         return -EINVAL;
1275                 }
1276                 cur = tmp;
1277                 if (size >= system_ram) {
1278                         pr_warning("crashkernel: invalid size\n");
1279                         return -EINVAL;
1280                 }
1281
1282                 /* match ? */
1283                 if (system_ram >= start && system_ram < end) {
1284                         *crash_size = size;
1285                         break;
1286                 }
1287         } while (*cur++ == ',');
1288
1289         if (*crash_size > 0) {
1290                 while (*cur && *cur != ' ' && *cur != '@')
1291                         cur++;
1292                 if (*cur == '@') {
1293                         cur++;
1294                         *crash_base = memparse(cur, &tmp);
1295                         if (cur == tmp) {
1296                                 pr_warning("Memory value expected "
1297                                                 "after '@'\n");
1298                                 return -EINVAL;
1299                         }
1300                 }
1301         }
1302
1303         return 0;
1304 }
1305
1306 /*
1307  * That function parses "simple" (old) crashkernel command lines like
1308  *
1309  *      crashkernel=size[@offset]
1310  *
1311  * It returns 0 on success and -EINVAL on failure.
1312  */
1313 static int __init parse_crashkernel_simple(char                 *cmdline,
1314                                            unsigned long long   *crash_size,
1315                                            unsigned long long   *crash_base)
1316 {
1317         char *cur = cmdline;
1318
1319         *crash_size = memparse(cmdline, &cur);
1320         if (cmdline == cur) {
1321                 pr_warning("crashkernel: memory value expected\n");
1322                 return -EINVAL;
1323         }
1324
1325         if (*cur == '@')
1326                 *crash_base = memparse(cur+1, &cur);
1327
1328         return 0;
1329 }
1330
1331 /*
1332  * That function is the entry point for command line parsing and should be
1333  * called from the arch-specific code.
1334  */
1335 int __init parse_crashkernel(char                *cmdline,
1336                              unsigned long long system_ram,
1337                              unsigned long long *crash_size,
1338                              unsigned long long *crash_base)
1339 {
1340         char    *p = cmdline, *ck_cmdline = NULL;
1341         char    *first_colon, *first_space;
1342
1343         BUG_ON(!crash_size || !crash_base);
1344         *crash_size = 0;
1345         *crash_base = 0;
1346
1347         /* find crashkernel and use the last one if there are more */
1348         p = strstr(p, "crashkernel=");
1349         while (p) {
1350                 ck_cmdline = p;
1351                 p = strstr(p+1, "crashkernel=");
1352         }
1353
1354         if (!ck_cmdline)
1355                 return -EINVAL;
1356
1357         ck_cmdline += 12; /* strlen("crashkernel=") */
1358
1359         /*
1360          * if the commandline contains a ':', then that's the extended
1361          * syntax -- if not, it must be the classic syntax
1362          */
1363         first_colon = strchr(ck_cmdline, ':');
1364         first_space = strchr(ck_cmdline, ' ');
1365         if (first_colon && (!first_space || first_colon < first_space))
1366                 return parse_crashkernel_mem(ck_cmdline, system_ram,
1367                                 crash_size, crash_base);
1368         else
1369                 return parse_crashkernel_simple(ck_cmdline, crash_size,
1370                                 crash_base);
1371
1372         return 0;
1373 }
1374
1375
1376
1377 void crash_save_vmcoreinfo(void)
1378 {
1379         u32 *buf;
1380
1381         if (!vmcoreinfo_size)
1382                 return;
1383
1384         vmcoreinfo_append_str("CRASHTIME=%ld", get_seconds());
1385
1386         buf = (u32 *)vmcoreinfo_note;
1387
1388         buf = append_elf_note(buf, VMCOREINFO_NOTE_NAME, 0, vmcoreinfo_data,
1389                               vmcoreinfo_size);
1390
1391         final_note(buf);
1392 }
1393
1394 void vmcoreinfo_append_str(const char *fmt, ...)
1395 {
1396         va_list args;
1397         char buf[0x50];
1398         int r;
1399
1400         va_start(args, fmt);
1401         r = vsnprintf(buf, sizeof(buf), fmt, args);
1402         va_end(args);
1403
1404         if (r + vmcoreinfo_size > vmcoreinfo_max_size)
1405                 r = vmcoreinfo_max_size - vmcoreinfo_size;
1406
1407         memcpy(&vmcoreinfo_data[vmcoreinfo_size], buf, r);
1408
1409         vmcoreinfo_size += r;
1410 }
1411
/*
 * Empty default implementation.  The symbol is weak, so architecture code
 * may override it with its own version.
 */
void __attribute__ ((weak)) arch_crash_save_vmcoreinfo(void)
{
	/* nothing to add by default */
}
1418
1419 unsigned long __attribute__ ((weak)) paddr_vmcoreinfo_note(void)
1420 {
1421         return __pa((unsigned long)(char *)&vmcoreinfo_note);
1422 }
1423
1424 static int __init crash_save_vmcoreinfo_init(void)
1425 {
1426         VMCOREINFO_OSRELEASE(init_uts_ns.name.release);
1427         VMCOREINFO_PAGESIZE(PAGE_SIZE);
1428
1429         VMCOREINFO_SYMBOL(init_uts_ns);
1430         VMCOREINFO_SYMBOL(node_online_map);
1431         VMCOREINFO_SYMBOL(swapper_pg_dir);
1432         VMCOREINFO_SYMBOL(_stext);
1433         VMCOREINFO_SYMBOL(vmlist);
1434
1435 #ifndef CONFIG_NEED_MULTIPLE_NODES
1436         VMCOREINFO_SYMBOL(mem_map);
1437         VMCOREINFO_SYMBOL(contig_page_data);
1438 #endif
1439 #ifdef CONFIG_SPARSEMEM
1440         VMCOREINFO_SYMBOL(mem_section);
1441         VMCOREINFO_LENGTH(mem_section, NR_SECTION_ROOTS);
1442         VMCOREINFO_STRUCT_SIZE(mem_section);
1443         VMCOREINFO_OFFSET(mem_section, section_mem_map);
1444 #endif
1445         VMCOREINFO_STRUCT_SIZE(page);
1446         VMCOREINFO_STRUCT_SIZE(pglist_data);
1447         VMCOREINFO_STRUCT_SIZE(zone);
1448         VMCOREINFO_STRUCT_SIZE(free_area);
1449         VMCOREINFO_STRUCT_SIZE(list_head);
1450         VMCOREINFO_SIZE(nodemask_t);
1451         VMCOREINFO_OFFSET(page, flags);
1452         VMCOREINFO_OFFSET(page, _count);
1453         VMCOREINFO_OFFSET(page, mapping);
1454         VMCOREINFO_OFFSET(page, lru);
1455         VMCOREINFO_OFFSET(pglist_data, node_zones);
1456         VMCOREINFO_OFFSET(pglist_data, nr_zones);
1457 #ifdef CONFIG_FLAT_NODE_MEM_MAP
1458         VMCOREINFO_OFFSET(pglist_data, node_mem_map);
1459 #endif
1460         VMCOREINFO_OFFSET(pglist_data, node_start_pfn);
1461         VMCOREINFO_OFFSET(pglist_data, node_spanned_pages);
1462         VMCOREINFO_OFFSET(pglist_data, node_id);
1463         VMCOREINFO_OFFSET(zone, free_area);
1464         VMCOREINFO_OFFSET(zone, vm_stat);
1465         VMCOREINFO_OFFSET(zone, spanned_pages);
1466         VMCOREINFO_OFFSET(free_area, free_list);
1467         VMCOREINFO_OFFSET(list_head, next);
1468         VMCOREINFO_OFFSET(list_head, prev);
1469         VMCOREINFO_OFFSET(vm_struct, addr);
1470         VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
1471         log_buf_kexec_setup();
1472         VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
1473         VMCOREINFO_NUMBER(NR_FREE_PAGES);
1474         VMCOREINFO_NUMBER(PG_lru);
1475         VMCOREINFO_NUMBER(PG_private);
1476         VMCOREINFO_NUMBER(PG_swapcache);
1477
1478         arch_crash_save_vmcoreinfo();
1479
1480         return 0;
1481 }
1482
1483 module_init(crash_save_vmcoreinfo_init)
1484
1485 /*
1486  * Move into place and start executing a preloaded standalone
1487  * executable.  If nothing was preloaded return an error.
1488  */
1489 int kernel_kexec(void)
1490 {
1491         int error = 0;
1492
1493         if (!mutex_trylock(&kexec_mutex))
1494                 return -EBUSY;
1495         if (!kexec_image) {
1496                 error = -EINVAL;
1497                 goto Unlock;
1498         }
1499
1500 #ifdef CONFIG_KEXEC_JUMP
1501         if (kexec_image->preserve_context) {
1502                 mutex_lock(&pm_mutex);
1503                 pm_prepare_console();
1504                 error = freeze_processes();
1505                 if (error) {
1506                         error = -EBUSY;
1507                         goto Restore_console;
1508                 }
1509                 suspend_console();
1510                 error = dpm_suspend_start(PMSG_FREEZE);
1511                 if (error)
1512                         goto Resume_console;
1513                 /* At this point, dpm_suspend_start() has been called,
1514                  * but *not* dpm_suspend_noirq(). We *must* call
1515                  * dpm_suspend_noirq() now.  Otherwise, drivers for
1516                  * some devices (e.g. interrupt controllers) become
1517                  * desynchronized with the actual state of the
1518                  * hardware at resume time, and evil weirdness ensues.
1519                  */
1520                 error = dpm_suspend_noirq(PMSG_FREEZE);
1521                 if (error)
1522                         goto Resume_devices;
1523                 error = disable_nonboot_cpus();
1524                 if (error)
1525                         goto Enable_cpus;
1526                 local_irq_disable();
1527                 /* Suspend system devices */
1528                 error = sysdev_suspend(PMSG_FREEZE);
1529                 if (error)
1530                         goto Enable_irqs;
1531         } else
1532 #endif
1533         {
1534                 kernel_restart_prepare(NULL);
1535                 printk(KERN_EMERG "Starting new kernel\n");
1536                 machine_shutdown();
1537         }
1538
1539         machine_kexec(kexec_image);
1540
1541 #ifdef CONFIG_KEXEC_JUMP
1542         if (kexec_image->preserve_context) {
1543                 sysdev_resume();
1544  Enable_irqs:
1545                 local_irq_enable();
1546  Enable_cpus:
1547                 enable_nonboot_cpus();
1548                 dpm_resume_noirq(PMSG_RESTORE);
1549  Resume_devices:
1550                 dpm_resume_end(PMSG_RESTORE);
1551  Resume_console:
1552                 resume_console();
1553                 thaw_processes();
1554  Restore_console:
1555                 pm_restore_console();
1556                 mutex_unlock(&pm_mutex);
1557         }
1558 #endif
1559
1560  Unlock:
1561         mutex_unlock(&kexec_mutex);
1562         return error;
1563 }