SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/mm.h>
  10 #include <linux/file.h>
  11 #include <linux/slab.h>
  12 #include <linux/fs.h>
  13 #include <linux/kexec.h>
  14 #include <linux/spinlock.h>
  15 #include <linux/list.h>
  16 #include <linux/highmem.h>
  17 #include <linux/syscalls.h>
  18 #include <linux/reboot.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/ioport.h>
  21 #include <asm/page.h>
  22 #include <asm/uaccess.h>
  23 #include <asm/io.h>
  24 #include <asm/system.h>
  25 #include <asm/semaphore.h>
  26
  27 /* Location of the reserved area for the crash kernel */
  28 struct resource crashk_res = {
  29         .name  = "Crash kernel",
  30         .start = 0,
  31         .end   = 0,
  32         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  33 };
  34
  35 /*
  36  * When kexec transitions to the new kernel there is a one-to-one
  37  * mapping between physical and virtual addresses.  On processors
  38  * where you can disable the MMU this is trivial, and easy.  For
  39  * others it is still a simple predictable page table to setup.
  40  *
  41  * In that environment kexec copies the new kernel to its final
  42  * resting place.  This means I can only support memory whose
  43  * physical address can fit in an unsigned long.  In particular
  44  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  45  * If the assembly stub has more restrictive requirements
  46  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  47  * defined more restrictively in <asm/kexec.h>.
  48  *
  49  * The code for the transition from the current kernel to the
  50  * new kernel is placed in the control_code_buffer, whose size
  51  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  52  * page of memory is necessary, but some architectures require more.
  53  * Because this memory must be identity mapped in the transition from
  54  * virtual to physical addresses it must live in the range
  55  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  56  * modifiable.
  57  *
  58  * The assembly stub in the control code buffer is passed a linked list
  59  * of descriptor pages detailing the source pages of the new kernel,
  60  * and the destination addresses of those source pages.  As this data
  61  * structure is not used in the context of the current OS, it must
  62  * be self-contained.
  63  *
  64  * The code has been made to work with highmem pages and will use a
  65  * destination page in its final resting place (if it happens
  66  * to allocate it).  The end product of this is that most of the
  67  * physical address space, and most of RAM can be used.
  68  *
  69  * Future directions include:
  70  *  - allocating a page table with the control code buffer identity
  71  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  72  *    reliable.
  73  */
  74
  75 /*
  76  * KIMAGE_NO_DEST is an impossible destination address, used when
  77  * allocating pages whose destination address we do not care about.
  78  */
  79 #define KIMAGE_NO_DEST (-1UL)
  80
  81 static int kimage_is_destination_range(
  82         struct kimage *image, unsigned long start, unsigned long end);
  83 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
  84
  85 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
  86         unsigned long nr_segments, struct kexec_segment __user *segments)
  87 {
  88         size_t segment_bytes;
  89         struct kimage *image;
  90         unsigned long i;
  91         int result;
  92
  93         /* Allocate a controlling structure */
  94         result = -ENOMEM;
  95         image = kmalloc(sizeof(*image), GFP_KERNEL);
  96         if (!image) {
  97                 goto out;
  98         }
  99         memset(image, 0, sizeof(*image));
 100         image->head = 0;
 101         image->entry = &image->head;
 102         image->last_entry = &image->head;
 103         image->control_page = ~0; /* By default this does not apply */
 104         image->start = entry;
 105         image->type = KEXEC_TYPE_DEFAULT;
 106
 107         /* Initialize the list of control pages */
 108         INIT_LIST_HEAD(&image->control_pages);
 109
 110         /* Initialize the list of destination pages */
 111         INIT_LIST_HEAD(&image->dest_pages);
 112
 113         /* Initialize the list of unuseable pages */
 114         INIT_LIST_HEAD(&image->unuseable_pages);
 115
 116         /* Read in the segments */
 117         image->nr_segments = nr_segments;
 118         segment_bytes = nr_segments * sizeof(*segments);
 119         result = copy_from_user(image->segment, segments, segment_bytes);
 120         if (result)
 121                 goto out;
 122
 123         /*
 124          * Verify we have good destination addresses.  The caller is
 125          * responsible for making certain we don't attempt to load
 126          * the new image into invalid or reserved areas of RAM.  This
 127          * just verifies it is an address we can use.
 128          *
 129          * Since the kernel does everything in page size chunks ensure
 130          * the destination addreses are page aligned.  Too many
 131          * special cases crop of when we don't do this.  The most
 132          * insidious is getting overlapping destination addresses
 133          * simply because addresses are changed to page size
 134          * granularity.
 135          */
 136         result = -EADDRNOTAVAIL;
 137         for (i = 0; i < nr_segments; i++) {
 138                 unsigned long mstart, mend;
 139                 mstart = image->segment[i].mem;
 140                 mend   = mstart + image->segment[i].memsz;
 141                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 142                         goto out;
 143                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 144                         goto out;
 145         }
 146
 147         /* Verify our destination addresses do not overlap.
 148          * If we alloed overlapping destination addresses
 149          * through very weird things can happen with no
 150          * easy explanation as one segment stops on another.
 151          */
 152         result = -EINVAL;
 153         for(i = 0; i < nr_segments; i++) {
 154                 unsigned long mstart, mend;
 155                 unsigned long j;
 156                 mstart = image->segment[i].mem;
 157                 mend   = mstart + image->segment[i].memsz;
 158                 for(j = 0; j < i; j++) {
 159                         unsigned long pstart, pend;
 160                         pstart = image->segment[j].mem;
 161                         pend   = pstart + image->segment[j].memsz;
 162                         /* Do the segments overlap ? */
 163                         if ((mend > pstart) && (mstart < pend))
 164                                 goto out;
 165                 }
 166         }
 167
 168         /* Ensure our buffer sizes are strictly less than
 169          * our memory sizes.  This should always be the case,
 170          * and it is easier to check up front than to be surprised
 171          * later on.
 172          */
 173         result = -EINVAL;
 174         for(i = 0; i < nr_segments; i++) {
 175                 if (image->segment[i].bufsz > image->segment[i].memsz)
 176                         goto out;
 177         }
 178
 179
 180         result = 0;
 181  out:
 182         if (result == 0) {
 183                 *rimage = image;
 184         } else {
 185                 kfree(image);
 186         }
 187         return result;
 188
 189 }
 190
 191 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 192         unsigned long nr_segments, struct kexec_segment __user *segments)
 193 {
 194         int result;
 195         struct kimage *image;
 196
 197         /* Allocate and initialize a controlling structure */
 198         image = NULL;
 199         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 200         if (result) {
 201                 goto out;
 202         }
 203         *rimage = image;
 204
 205         /*
 206          * Find a location for the control code buffer, and add it
 207          * the vector of segments so that it's pages will also be
 208          * counted as destination pages.
 209          */
 210         result = -ENOMEM;
 211         image->control_code_page = kimage_alloc_control_pages(image,
 212                 get_order(KEXEC_CONTROL_CODE_SIZE));
 213         if (!image->control_code_page) {
 214                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 215                 goto out;
 216         }
 217
 218         result = 0;
 219  out:
 220         if (result == 0) {
 221                 *rimage = image;
 222         } else {
 223                 kfree(image);
 224         }
 225         return result;
 226 }
 227
 228 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 229         unsigned long nr_segments, struct kexec_segment *segments)
 230 {
 231         int result;
 232         struct kimage *image;
 233         unsigned long i;
 234
 235         image = NULL;
 236         /* Verify we have a valid entry point */
 237         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 238                 result = -EADDRNOTAVAIL;
 239                 goto out;
 240         }
 241
 242         /* Allocate and initialize a controlling structure */
 243         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 244         if (result) {
 245                 goto out;
 246         }
 247
 248         /* Enable the special crash kernel control page
 249          * allocation policy.
 250          */
 251         image->control_page = crashk_res.start;
 252         image->type = KEXEC_TYPE_CRASH;
 253
 254         /*
 255          * Verify we have good destination addresses.  Normally
 256          * the caller is responsible for making certain we don't
 257          * attempt to load the new image into invalid or reserved
 258          * areas of RAM.  But crash kernels are preloaded into a
 259          * reserved area of ram.  We must ensure the addresses
 260          * are in the reserved area otherwise preloading the
 261          * kernel could corrupt things.
 262          */
 263         result = -EADDRNOTAVAIL;
 264         for (i = 0; i < nr_segments; i++) {
 265                 unsigned long mstart, mend;
 266                 mstart = image->segment[i].mem;
 267                 mend = mstart + image->segment[i].memsz;
 268                 /* Ensure we are within the crash kernel limits */
 269                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 270                         goto out;
 271         }
 272
 273
 274         /*
 275          * Find a location for the control code buffer, and add
 276          * the vector of segments so that it's pages will also be
 277          * counted as destination pages.
 278          */
 279         result = -ENOMEM;
 280         image->control_code_page = kimage_alloc_control_pages(image,
 281                 get_order(KEXEC_CONTROL_CODE_SIZE));
 282         if (!image->control_code_page) {
 283                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 284                 goto out;
 285         }
 286
 287         result = 0;
 288  out:
 289         if (result == 0) {
 290                 *rimage = image;
 291         } else {
 292                 kfree(image);
 293         }
 294         return result;
 295 }
 296
 297 static int kimage_is_destination_range(
 298         struct kimage *image, unsigned long start, unsigned long end)
 299 {
 300         unsigned long i;
 301
 302         for (i = 0; i < image->nr_segments; i++) {
 303                 unsigned long mstart, mend;
 304                 mstart = image->segment[i].mem;
 305                 mend   = mstart + image->segment[i].memsz;
 306                 if ((end > mstart) && (start < mend)) {
 307                         return 1;
 308                 }
 309         }
 310         return 0;
 311 }
 312
 313 static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
 314 {
 315         struct page *pages;
 316         pages = alloc_pages(gfp_mask, order);
 317         if (pages) {
 318                 unsigned int count, i;
 319                 pages->mapping = NULL;
 320                 pages->private = order;
 321                 count = 1 << order;
 322                 for(i = 0; i < count; i++) {
 323                         SetPageReserved(pages + i);
 324                 }
 325         }
 326         return pages;
 327 }
 328
 329 static void kimage_free_pages(struct page *page)
 330 {
 331         unsigned int order, count, i;
 332         order = page->private;
 333         count = 1 << order;
 334         for(i = 0; i < count; i++) {
 335                 ClearPageReserved(page + i);
 336         }
 337         __free_pages(page, order);
 338 }
 339
 340 static void kimage_free_page_list(struct list_head *list)
 341 {
 342         struct list_head *pos, *next;
 343         list_for_each_safe(pos, next, list) {
 344                 struct page *page;
 345
 346                 page = list_entry(pos, struct page, lru);
 347                 list_del(&page->lru);
 348
 349                 kimage_free_pages(page);
 350         }
 351 }
 352
 353 static struct page *kimage_alloc_normal_control_pages(
 354         struct kimage *image, unsigned int order)
 355 {
 356         /* Control pages are special, they are the intermediaries
 357          * that are needed while we copy the rest of the pages
 358          * to their final resting place.  As such they must
 359          * not conflict with either the destination addresses
 360          * or memory the kernel is already using.
 361          *
 362          * The only case where we really need more than one of
 363          * these are for architectures where we cannot disable
 364          * the MMU and must instead generate an identity mapped
 365          * page table for all of the memory.
 366          *
 367          * At worst this runs in O(N) of the image size.
 368          */
 369         struct list_head extra_pages;
 370         struct page *pages;
 371         unsigned int count;
 372
 373         count = 1 << order;
 374         INIT_LIST_HEAD(&extra_pages);
 375
 376         /* Loop while I can allocate a page and the page allocated
 377          * is a destination page.
 378          */
 379         do {
 380                 unsigned long pfn, epfn, addr, eaddr;
 381                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 382                 if (!pages)
 383                         break;
 384                 pfn   = page_to_pfn(pages);
 385                 epfn  = pfn + count;
 386                 addr  = pfn << PAGE_SHIFT;
 387                 eaddr = epfn << PAGE_SHIFT;
 388                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 389                         kimage_is_destination_range(image, addr, eaddr))
 390                 {
 391                         list_add(&pages->lru, &extra_pages);
 392                         pages = NULL;
 393                 }
 394         } while(!pages);
 395         if (pages) {
 396                 /* Remember the allocated page... */
 397                 list_add(&pages->lru, &image->control_pages);
 398
 399                 /* Because the page is already in it's destination
 400                  * location we will never allocate another page at
 401                  * that address.  Therefore kimage_alloc_pages
 402                  * will not return it (again) and we don't need
 403                  * to give it an entry in image->segment[].
 404                  */
 405         }
 406         /* Deal with the destination pages I have inadvertently allocated.
 407          *
 408          * Ideally I would convert multi-page allocations into single
 409          * page allocations, and add everyting to image->dest_pages.
 410          *
 411          * For now it is simpler to just free the pages.
 412          */
 413         kimage_free_page_list(&extra_pages);
 414         return pages;
 415
 416 }
 417
 418 static struct page *kimage_alloc_crash_control_pages(
 419         struct kimage *image, unsigned int order)
 420 {
 421         /* Control pages are special, they are the intermediaries
 422          * that are needed while we copy the rest of the pages
 423          * to their final resting place.  As such they must
 424          * not conflict with either the destination addresses
 425          * or memory the kernel is already using.
 426          *
 427          * Control pages are also the only pags we must allocate
 428          * when loading a crash kernel.  All of the other pages
 429          * are specified by the segments and we just memcpy
 430          * into them directly.
 431          *
 432          * The only case where we really need more than one of
 433          * these are for architectures where we cannot disable
 434          * the MMU and must instead generate an identity mapped
 435          * page table for all of the memory.
 436          *
 437          * Given the low demand this implements a very simple
 438          * allocator that finds the first hole of the appropriate
 439          * size in the reserved memory region, and allocates all
 440          * of the memory up to and including the hole.
 441          */
 442         unsigned long hole_start, hole_end, size;
 443         struct page *pages;
 444         pages = NULL;
 445         size = (1 << order) << PAGE_SHIFT;
 446         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 447         hole_end   = hole_start + size - 1;
 448         while(hole_end <= crashk_res.end) {
 449                 unsigned long i;
 450                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
 451                         break;
 452                 }
 453                 if (hole_end > crashk_res.end) {
 454                         break;
 455                 }
 456                 /* See if I overlap any of the segments */
 457                 for(i = 0; i < image->nr_segments; i++) {
 458                         unsigned long mstart, mend;
 459                         mstart = image->segment[i].mem;
 460                         mend   = mstart + image->segment[i].memsz - 1;
 461                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 462                                 /* Advance the hole to the end of the segment */
 463                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 464                                 hole_end   = hole_start + size - 1;
 465                                 break;
 466                         }
 467                 }
 468                 /* If I don't overlap any segments I have found my hole! */
 469                 if (i == image->nr_segments) {
 470                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 471                         break;
 472                 }
 473         }
 474         if (pages) {
 475                 image->control_page = hole_end;
 476         }
 477         return pages;
 478 }
 479
 480
 481 struct page *kimage_alloc_control_pages(
 482         struct kimage *image, unsigned int order)
 483 {
 484         struct page *pages = NULL;
 485         switch(image->type) {
 486         case KEXEC_TYPE_DEFAULT:
 487                 pages = kimage_alloc_normal_control_pages(image, order);
 488                 break;
 489         case KEXEC_TYPE_CRASH:
 490                 pages = kimage_alloc_crash_control_pages(image, order);
 491                 break;
 492         }
 493         return pages;
 494 }
 495
 496 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 497 {
 498         if (*image->entry != 0) {
 499                 image->entry++;
 500         }
 501         if (image->entry == image->last_entry) {
 502                 kimage_entry_t *ind_page;
 503                 struct page *page;
 504                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 505                 if (!page) {
 506                         return -ENOMEM;
 507                 }
 508                 ind_page = page_address(page);
 509                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 510                 image->entry = ind_page;
 511                 image->last_entry =
 512                         ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 513         }
 514         *image->entry = entry;
 515         image->entry++;
 516         *image->entry = 0;
 517         return 0;
 518 }
 519
 520 static int kimage_set_destination(
 521         struct kimage *image, unsigned long destination)
 522 {
 523         int result;
 524
 525         destination &= PAGE_MASK;
 526         result = kimage_add_entry(image, destination | IND_DESTINATION);
 527         if (result == 0) {
 528                 image->destination = destination;
 529         }
 530         return result;
 531 }
 532
 533
 534 static int kimage_add_page(struct kimage *image, unsigned long page)
 535 {
 536         int result;
 537
 538         page &= PAGE_MASK;
 539         result = kimage_add_entry(image, page | IND_SOURCE);
 540         if (result == 0) {
 541                 image->destination += PAGE_SIZE;
 542         }
 543         return result;
 544 }
 545
 546
 547 static void kimage_free_extra_pages(struct kimage *image)
 548 {
 549         /* Walk through and free any extra destination pages I may have */
 550         kimage_free_page_list(&image->dest_pages);
 551
 552         /* Walk through and free any unuseable pages I have cached */
 553         kimage_free_page_list(&image->unuseable_pages);
 554
 555 }
 556 static int kimage_terminate(struct kimage *image)
 557 {
 558         if (*image->entry != 0) {
 559                 image->entry++;
 560         }
 561         *image->entry = IND_DONE;
 562         return 0;
 563 }
 564
 565 #define for_each_kimage_entry(image, ptr, entry) \
 566         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 567                 ptr = (entry & IND_INDIRECTION)? \
 568                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 569
 570 static void kimage_free_entry(kimage_entry_t entry)
 571 {
 572         struct page *page;
 573
 574         page = pfn_to_page(entry >> PAGE_SHIFT);
 575         kimage_free_pages(page);
 576 }
 577
 578 static void kimage_free(struct kimage *image)
 579 {
 580         kimage_entry_t *ptr, entry;
 581         kimage_entry_t ind = 0;
 582
 583         if (!image)
 584                 return;
 585         kimage_free_extra_pages(image);
 586         for_each_kimage_entry(image, ptr, entry) {
 587                 if (entry & IND_INDIRECTION) {
 588                         /* Free the previous indirection page */
 589                         if (ind & IND_INDIRECTION) {
 590                                 kimage_free_entry(ind);
 591                         }
 592                         /* Save this indirection page until we are
 593                          * done with it.
 594                          */
 595                         ind = entry;
 596                 }
 597                 else if (entry & IND_SOURCE) {
 598                         kimage_free_entry(entry);
 599                 }
 600         }
 601         /* Free the final indirection page */
 602         if (ind & IND_INDIRECTION) {
 603                 kimage_free_entry(ind);
 604         }
 605
 606         /* Handle any machine specific cleanup */
 607         machine_kexec_cleanup(image);
 608
 609         /* Free the kexec control pages... */
 610         kimage_free_page_list(&image->control_pages);
 611         kfree(image);
 612 }
 613
 614 static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
 615 {
 616         kimage_entry_t *ptr, entry;
 617         unsigned long destination = 0;
 618
 619         for_each_kimage_entry(image, ptr, entry) {
 620                 if (entry & IND_DESTINATION) {
 621                         destination = entry & PAGE_MASK;
 622                 }
 623                 else if (entry & IND_SOURCE) {
 624                         if (page == destination) {
 625                                 return ptr;
 626                         }
 627                         destination += PAGE_SIZE;
 628                 }
 629         }
 630         return 0;
 631 }
 632
 633 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
 634 {
 635         /*
 636          * Here we implement safeguards to ensure that a source page
 637          * is not copied to its destination page before the data on
 638          * the destination page is no longer useful.
 639          *
 640          * To do this we maintain the invariant that a source page is
 641          * either its own destination page, or it is not a
 642          * destination page at all.
 643          *
 644          * That is slightly stronger than required, but the proof
 645          * that no problems will not occur is trivial, and the
 646          * implementation is simply to verify.
 647          *
 648          * When allocating all pages normally this algorithm will run
 649          * in O(N) time, but in the worst case it will run in O(N^2)
 650          * time.   If the runtime is a problem the data structures can
 651          * be fixed.
 652          */
 653         struct page *page;
 654         unsigned long addr;
 655
 656         /*
 657          * Walk through the list of destination pages, and see if I
 658          * have a match.
 659          */
 660         list_for_each_entry(page, &image->dest_pages, lru) {
 661                 addr = page_to_pfn(page) << PAGE_SHIFT;
 662                 if (addr == destination) {
 663                         list_del(&page->lru);
 664                         return page;
 665                 }
 666         }
 667         page = NULL;
 668         while (1) {
 669                 kimage_entry_t *old;
 670
 671                 /* Allocate a page, if we run out of memory give up */
 672                 page = kimage_alloc_pages(gfp_mask, 0);
 673                 if (!page) {
 674                         return 0;
 675                 }
 676                 /* If the page cannot be used file it away */
 677                 if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 678                         list_add(&page->lru, &image->unuseable_pages);
 679                         continue;
 680                 }
 681                 addr = page_to_pfn(page) << PAGE_SHIFT;
 682
 683                 /* If it is the destination page we want use it */
 684                 if (addr == destination)
 685                         break;
 686
 687                 /* If the page is not a destination page use it */
 688                 if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
 689                         break;
 690
 691                 /*
 692                  * I know that the page is someones destination page.
 693                  * See if there is already a source page for this
 694                  * destination page.  And if so swap the source pages.
 695                  */
 696                 old = kimage_dst_used(image, addr);
 697                 if (old) {
 698                         /* If so move it */
 699                         unsigned long old_addr;
 700                         struct page *old_page;
 701
 702                         old_addr = *old & PAGE_MASK;
 703                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 704                         copy_highpage(page, old_page);
 705                         *old = addr | (*old & ~PAGE_MASK);
 706
 707                         /* The old page I have found cannot be a
 708                          * destination page, so return it.
 709                          */
 710                         addr = old_addr;
 711                         page = old_page;
 712                         break;
 713                 }
 714                 else {
 715                         /* Place the page on the destination list I
 716                          * will use it later.
 717                          */
 718                         list_add(&page->lru, &image->dest_pages);
 719                 }
 720         }
 721         return page;
 722 }
 723
 724 static int kimage_load_normal_segment(struct kimage *image,
 725         struct kexec_segment *segment)
 726 {
 727         unsigned long maddr;
 728         unsigned long ubytes, mbytes;
 729         int result;
 730         unsigned char *buf;
 731
 732         result = 0;
 733         buf = segment->buf;
 734         ubytes = segment->bufsz;
 735         mbytes = segment->memsz;
 736         maddr = segment->mem;
 737
 738         result = kimage_set_destination(image, maddr);
 739         if (result < 0) {
 740                 goto out;
 741         }
 742         while(mbytes) {
 743                 struct page *page;
 744                 char *ptr;
 745                 size_t uchunk, mchunk;
 746                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 747                 if (page == 0) {
 748                         result  = -ENOMEM;
 749                         goto out;
 750                 }
 751                 result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
 752                 if (result < 0) {
 753                         goto out;
 754                 }
 755                 ptr = kmap(page);
 756                 /* Start with a clear page */
 757                 memset(ptr, 0, PAGE_SIZE);
 758                 ptr += maddr & ~PAGE_MASK;
 759                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 760                 if (mchunk > mbytes) {
 761                         mchunk = mbytes;
 762                 }
 763                 uchunk = mchunk;
 764                 if (uchunk > ubytes) {
 765                         uchunk = ubytes;
 766                 }
 767                 result = copy_from_user(ptr, buf, uchunk);
 768                 kunmap(page);
 769                 if (result) {
 770                         result = (result < 0) ? result : -EIO;
 771                         goto out;
 772                 }
 773                 ubytes -= uchunk;
 774                 maddr  += mchunk;
 775                 buf    += mchunk;
 776                 mbytes -= mchunk;
 777         }
 778  out:
 779         return result;
 780 }
 781
 782 static int kimage_load_crash_segment(struct kimage *image,
 783         struct kexec_segment *segment)
 784 {
 785         /* For crash dumps kernels we simply copy the data from
 786          * user space to it's destination.
 787          * We do things a page at a time for the sake of kmap.
 788          */
 789         unsigned long maddr;
 790         unsigned long ubytes, mbytes;
 791         int result;
 792         unsigned char *buf;
 793
 794         result = 0;
 795         buf = segment->buf;
 796         ubytes = segment->bufsz;
 797         mbytes = segment->memsz;
 798         maddr = segment->mem;
 799         while(mbytes) {
 800                 struct page *page;
 801                 char *ptr;
 802                 size_t uchunk, mchunk;
 803                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 804                 if (page == 0) {
 805                         result  = -ENOMEM;
 806                         goto out;
 807                 }
 808                 ptr = kmap(page);
 809                 ptr += maddr & ~PAGE_MASK;
 810                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 811                 if (mchunk > mbytes) {
 812                         mchunk = mbytes;
 813                 }
 814                 uchunk = mchunk;
 815                 if (uchunk > ubytes) {
 816                         uchunk = ubytes;
 817                         /* Zero the trailing part of the page */
 818                         memset(ptr + uchunk, 0, mchunk - uchunk);
 819                 }
 820                 result = copy_from_user(ptr, buf, uchunk);
 821                 kunmap(page);
 822                 if (result) {
 823                         result = (result < 0) ? result : -EIO;
 824                         goto out;
 825                 }
 826                 ubytes -= uchunk;
 827                 maddr  += mchunk;
 828                 buf    += mchunk;
 829                 mbytes -= mchunk;
 830         }
 831  out:
 832         return result;
 833 }
 834
 835 static int kimage_load_segment(struct kimage *image,
 836         struct kexec_segment *segment)
 837 {
 838         int result = -ENOMEM;
 839         switch(image->type) {
 840         case KEXEC_TYPE_DEFAULT:
 841                 result = kimage_load_normal_segment(image, segment);
 842                 break;
 843         case KEXEC_TYPE_CRASH:
 844                 result = kimage_load_crash_segment(image, segment);
 845                 break;
 846         }
 847         return result;
 848 }
 849
 850 /*
 851  * Exec Kernel system call: for obvious reasons only root may call it.
 852  *
 853  * This call breaks up into three pieces.
 854  * - A generic part which loads the new kernel from the current
 855  *   address space, and very carefully places the data in the
 856  *   allocated pages.
 857  *
 858  * - A generic part that interacts with the kernel and tells all of
 859  *   the devices to shut down.  Preventing on-going dmas, and placing
 860  *   the devices in a consistent state so a later kernel can
 861  *   reinitialize them.
 862  *
 863  * - A machine specific part that includes the syscall number
 864  *   and then copies the image to its final destination, and
 865  *   jumps into the image at entry.
 866  *
 867  * kexec does not sync, or unmount filesystems so if you need
 868  * that to happen you need to do that yourself.
 869  */
 870 struct kimage *kexec_image = NULL;
 871 static struct kimage *kexec_crash_image = NULL;
 872 /*
 873  * A home grown binary mutex.
 874  * Nothing can wait so this mutex is safe to use
 875  * in interrupt context :)
 876  */
 877 static int kexec_lock = 0;
 878
 879 asmlinkage long sys_kexec_load(unsigned long entry,
 880         unsigned long nr_segments, struct kexec_segment __user *segments,
 881         unsigned long flags)
 882 {
 883         struct kimage **dest_image, *image;
 884         int locked;
 885         int result;
 886
 887         /* We only trust the superuser with rebooting the system. */
 888         if (!capable(CAP_SYS_BOOT))
 889                 return -EPERM;
 890
 891         /*
 892          * Verify we have a legal set of flags
 893          * This leaves us room for future extensions.
 894          */
 895         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 896                 return -EINVAL;
 897
 898         /* Verify we are on the appropriate architecture */
 899         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 900                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 901         {
 902                 return -EINVAL;
 903         }
 904
 905         /* Put an artificial cap on the number
 906          * of segments passed to kexec_load.
 907          */
 908         if (nr_segments > KEXEC_SEGMENT_MAX)
 909                 return -EINVAL;
 910
 911         image = NULL;
 912         result = 0;
 913
 914         /* Because we write directly to the reserved memory
 915          * region when loading crash kernels we need a mutex here to
 916          * prevent multiple crash  kernels from attempting to load
 917          * simultaneously, and to prevent a crash kernel from loading
 918          * over the top of a in use crash kernel.
 919          *
 920          * KISS: always take the mutex.
 921          */
 922         locked = xchg(&kexec_lock, 1);
 923         if (locked) {
 924                 return -EBUSY;
 925         }
 926         dest_image = &kexec_image;
 927         if (flags & KEXEC_ON_CRASH) {
 928                 dest_image = &kexec_crash_image;
 929         }
 930         if (nr_segments > 0) {
 931                 unsigned long i;
 932                 /* Loading another kernel to reboot into */
 933                 if ((flags & KEXEC_ON_CRASH) == 0) {
 934                         result = kimage_normal_alloc(&image, entry, nr_segments, segments);
 935                 }
 936                 /* Loading another kernel to switch to if this one crashes */
 937                 else if (flags & KEXEC_ON_CRASH) {
 938                         /* Free any current crash dump kernel before
 939                          * we corrupt it.
 940                          */
 941                         kimage_free(xchg(&kexec_crash_image, NULL));
 942                         result = kimage_crash_alloc(&image, entry, nr_segments, segments);
 943                 }
 944                 if (result) {
 945                         goto out;
 946                 }
 947                 result = machine_kexec_prepare(image);
 948                 if (result) {
 949                         goto out;
 950                 }
 951                 for(i = 0; i < nr_segments; i++) {
 952                         result = kimage_load_segment(image, &image->segment[i]);
 953                         if (result) {
 954                                 goto out;
 955                         }
 956                 }
 957                 result = kimage_terminate(image);
 958                 if (result) {
 959                         goto out;
 960                 }
 961         }
 962         /* Install the new kernel, and  Uninstall the old */
 963         image = xchg(dest_image, image);
 964
 965  out:
 966         xchg(&kexec_lock, 0); /* Release the mutex */
 967         kimage_free(image);
 968         return result;
 969 }
 970
 971 #ifdef CONFIG_COMPAT
 972 asmlinkage long compat_sys_kexec_load(unsigned long entry,
 973         unsigned long nr_segments, struct compat_kexec_segment __user *segments,
 974         unsigned long flags)
 975 {
 976         struct compat_kexec_segment in;
 977         struct kexec_segment out, __user *ksegments;
 978         unsigned long i, result;
 979
 980         /* Don't allow clients that don't understand the native
 981          * architecture to do anything.
 982          */
 983         if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
 984                 return -EINVAL;
 985         }
 986
 987         if (nr_segments > KEXEC_SEGMENT_MAX) {
 988                 return -EINVAL;
 989         }
 990
 991         ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
 992         for (i=0; i < nr_segments; i++) {
 993                 result = copy_from_user(&in, &segments[i], sizeof(in));
 994                 if (result) {
 995                         return -EFAULT;
 996                 }
 997
 998                 out.buf   = compat_ptr(in.buf);
 999                 out.bufsz = in.bufsz;
1000                 out.mem   = in.mem;
1001                 out.memsz = in.memsz;
1002
1003                 result = copy_to_user(&ksegments[i], &out, sizeof(out));
1004                 if (result) {
1005                         return -EFAULT;
1006                 }
1007         }
1008
1009         return sys_kexec_load(entry, nr_segments, ksegments, flags);
1010 }
1011 #endif
1012
1013 void crash_kexec(void)
1014 {
1015         struct kimage *image;
1016         int locked;
1017
1018
1019         /* Take the kexec_lock here to prevent sys_kexec_load
1020          * running on one cpu from replacing the crash kernel
1021          * we are using after a panic on a different cpu.
1022          *
1023          * If the crash kernel was not located in a fixed area
1024          * of memory the xchg(&kexec_crash_image) would be
1025          * sufficient.  But since I reuse the memory...
1026          */
1027         locked = xchg(&kexec_lock, 1);
1028         if (!locked) {
1029                 image = xchg(&kexec_crash_image, NULL);
1030                 if (image) {
1031                         machine_crash_shutdown();
1032                         machine_kexec(image);
1033                 }
1034                 xchg(&kexec_lock, 0);
1035         }
1036 }