SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/mm.h>
  10 #include <linux/file.h>
  11 #include <linux/slab.h>
  12 #include <linux/fs.h>
  13 #include <linux/kexec.h>
  14 #include <linux/spinlock.h>
  15 #include <linux/list.h>
  16 #include <linux/highmem.h>
  17 #include <linux/syscalls.h>
  18 #include <linux/reboot.h>
  19 #include <linux/syscalls.h>
  20 #include <linux/ioport.h>
  21 #include <linux/hardirq.h>
  22
  23 #include <asm/page.h>
  24 #include <asm/uaccess.h>
  25 #include <asm/io.h>
  26 #include <asm/system.h>
  27 #include <asm/semaphore.h>
  28
  29 /* Location of the reserved area for the crash kernel */
  30 struct resource crashk_res = {
  31         .name  = "Crash kernel",
  32         .start = 0,
  33         .end   = 0,
  34         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  35 };
  36
  37 int kexec_should_crash(struct task_struct *p)
  38 {
  39         if (in_interrupt() || !p->pid || p->pid == 1 || panic_on_oops)
  40                 return 1;
  41         return 0;
  42 }
  43
  44 /*
  45  * When kexec transitions to the new kernel there is a one-to-one
  46  * mapping between physical and virtual addresses.  On processors
  47  * where you can disable the MMU this is trivial, and easy.  For
  48  * others it is still a simple predictable page table to setup.
  49  *
  50  * In that environment kexec copies the new kernel to its final
  51  * resting place.  This means I can only support memory whose
  52  * physical address can fit in an unsigned long.  In particular
  53  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  54  * If the assembly stub has more restrictive requirements
  55  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  56  * defined more restrictively in <asm/kexec.h>.
  57  *
  58  * The code for the transition from the current kernel to the
  59  * new kernel is placed in the control_code_buffer, whose size
  60  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  61  * page of memory is necessary, but some architectures require more.
  62  * Because this memory must be identity mapped in the transition from
  63  * virtual to physical addresses it must live in the range
  64  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  65  * modifiable.
  66  *
  67  * The assembly stub in the control code buffer is passed a linked list
  68  * of descriptor pages detailing the source pages of the new kernel,
  69  * and the destination addresses of those source pages.  As this data
  70  * structure is not used in the context of the current OS, it must
  71  * be self-contained.
  72  *
  73  * The code has been made to work with highmem pages and will use a
  74  * destination page in its final resting place (if it happens
  75  * to allocate it).  The end product of this is that most of the
  76  * physical address space, and most of RAM can be used.
  77  *
  78  * Future directions include:
  79  *  - allocating a page table with the control code buffer identity
  80  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  81  *    reliable.
  82  */
  83
  84 /*
  85  * KIMAGE_NO_DEST is an impossible destination address..., for
  86  * allocating pages whose destination address we do not care about.
  87  */
  88 #define KIMAGE_NO_DEST (-1UL)
  89
  90 static int kimage_is_destination_range(
  91         struct kimage *image, unsigned long start, unsigned long end);
  92 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long dest);
  93
  94 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
  95         unsigned long nr_segments, struct kexec_segment __user *segments)
  96 {
  97         size_t segment_bytes;
  98         struct kimage *image;
  99         unsigned long i;
 100         int result;
 101
 102         /* Allocate a controlling structure */
 103         result = -ENOMEM;
 104         image = kmalloc(sizeof(*image), GFP_KERNEL);
 105         if (!image) {
 106                 goto out;
 107         }
 108         memset(image, 0, sizeof(*image));
 109         image->head = 0;
 110         image->entry = &image->head;
 111         image->last_entry = &image->head;
 112         image->control_page = ~0; /* By default this does not apply */
 113         image->start = entry;
 114         image->type = KEXEC_TYPE_DEFAULT;
 115
 116         /* Initialize the list of control pages */
 117         INIT_LIST_HEAD(&image->control_pages);
 118
 119         /* Initialize the list of destination pages */
 120         INIT_LIST_HEAD(&image->dest_pages);
 121
 122         /* Initialize the list of unuseable pages */
 123         INIT_LIST_HEAD(&image->unuseable_pages);
 124
 125         /* Read in the segments */
 126         image->nr_segments = nr_segments;
 127         segment_bytes = nr_segments * sizeof(*segments);
 128         result = copy_from_user(image->segment, segments, segment_bytes);
 129         if (result)
 130                 goto out;
 131
 132         /*
 133          * Verify we have good destination addresses.  The caller is
 134          * responsible for making certain we don't attempt to load
 135          * the new image into invalid or reserved areas of RAM.  This
 136          * just verifies it is an address we can use.
 137          *
 138          * Since the kernel does everything in page size chunks ensure
 139          * the destination addreses are page aligned.  Too many
 140          * special cases crop of when we don't do this.  The most
 141          * insidious is getting overlapping destination addresses
 142          * simply because addresses are changed to page size
 143          * granularity.
 144          */
 145         result = -EADDRNOTAVAIL;
 146         for (i = 0; i < nr_segments; i++) {
 147                 unsigned long mstart, mend;
 148                 mstart = image->segment[i].mem;
 149                 mend   = mstart + image->segment[i].memsz;
 150                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 151                         goto out;
 152                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 153                         goto out;
 154         }
 155
 156         /* Verify our destination addresses do not overlap.
 157          * If we alloed overlapping destination addresses
 158          * through very weird things can happen with no
 159          * easy explanation as one segment stops on another.
 160          */
 161         result = -EINVAL;
 162         for(i = 0; i < nr_segments; i++) {
 163                 unsigned long mstart, mend;
 164                 unsigned long j;
 165                 mstart = image->segment[i].mem;
 166                 mend   = mstart + image->segment[i].memsz;
 167                 for(j = 0; j < i; j++) {
 168                         unsigned long pstart, pend;
 169                         pstart = image->segment[j].mem;
 170                         pend   = pstart + image->segment[j].memsz;
 171                         /* Do the segments overlap ? */
 172                         if ((mend > pstart) && (mstart < pend))
 173                                 goto out;
 174                 }
 175         }
 176
 177         /* Ensure our buffer sizes are strictly less than
 178          * our memory sizes.  This should always be the case,
 179          * and it is easier to check up front than to be surprised
 180          * later on.
 181          */
 182         result = -EINVAL;
 183         for(i = 0; i < nr_segments; i++) {
 184                 if (image->segment[i].bufsz > image->segment[i].memsz)
 185                         goto out;
 186         }
 187
 188
 189         result = 0;
 190  out:
 191         if (result == 0) {
 192                 *rimage = image;
 193         } else {
 194                 kfree(image);
 195         }
 196         return result;
 197
 198 }
 199
 200 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 201         unsigned long nr_segments, struct kexec_segment __user *segments)
 202 {
 203         int result;
 204         struct kimage *image;
 205
 206         /* Allocate and initialize a controlling structure */
 207         image = NULL;
 208         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 209         if (result) {
 210                 goto out;
 211         }
 212         *rimage = image;
 213
 214         /*
 215          * Find a location for the control code buffer, and add it
 216          * the vector of segments so that it's pages will also be
 217          * counted as destination pages.
 218          */
 219         result = -ENOMEM;
 220         image->control_code_page = kimage_alloc_control_pages(image,
 221                 get_order(KEXEC_CONTROL_CODE_SIZE));
 222         if (!image->control_code_page) {
 223                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 224                 goto out;
 225         }
 226
 227         result = 0;
 228  out:
 229         if (result == 0) {
 230                 *rimage = image;
 231         } else {
 232                 kfree(image);
 233         }
 234         return result;
 235 }
 236
 237 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 238         unsigned long nr_segments, struct kexec_segment *segments)
 239 {
 240         int result;
 241         struct kimage *image;
 242         unsigned long i;
 243
 244         image = NULL;
 245         /* Verify we have a valid entry point */
 246         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 247                 result = -EADDRNOTAVAIL;
 248                 goto out;
 249         }
 250
 251         /* Allocate and initialize a controlling structure */
 252         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 253         if (result) {
 254                 goto out;
 255         }
 256
 257         /* Enable the special crash kernel control page
 258          * allocation policy.
 259          */
 260         image->control_page = crashk_res.start;
 261         image->type = KEXEC_TYPE_CRASH;
 262
 263         /*
 264          * Verify we have good destination addresses.  Normally
 265          * the caller is responsible for making certain we don't
 266          * attempt to load the new image into invalid or reserved
 267          * areas of RAM.  But crash kernels are preloaded into a
 268          * reserved area of ram.  We must ensure the addresses
 269          * are in the reserved area otherwise preloading the
 270          * kernel could corrupt things.
 271          */
 272         result = -EADDRNOTAVAIL;
 273         for (i = 0; i < nr_segments; i++) {
 274                 unsigned long mstart, mend;
 275                 mstart = image->segment[i].mem;
 276                 mend = mstart + image->segment[i].memsz - 1;
 277                 /* Ensure we are within the crash kernel limits */
 278                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 279                         goto out;
 280         }
 281
 282
 283         /*
 284          * Find a location for the control code buffer, and add
 285          * the vector of segments so that it's pages will also be
 286          * counted as destination pages.
 287          */
 288         result = -ENOMEM;
 289         image->control_code_page = kimage_alloc_control_pages(image,
 290                 get_order(KEXEC_CONTROL_CODE_SIZE));
 291         if (!image->control_code_page) {
 292                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 293                 goto out;
 294         }
 295
 296         result = 0;
 297  out:
 298         if (result == 0) {
 299                 *rimage = image;
 300         } else {
 301                 kfree(image);
 302         }
 303         return result;
 304 }
 305
 306 static int kimage_is_destination_range(
 307         struct kimage *image, unsigned long start, unsigned long end)
 308 {
 309         unsigned long i;
 310
 311         for (i = 0; i < image->nr_segments; i++) {
 312                 unsigned long mstart, mend;
 313                 mstart = image->segment[i].mem;
 314                 mend   = mstart + image->segment[i].memsz;
 315                 if ((end > mstart) && (start < mend)) {
 316                         return 1;
 317                 }
 318         }
 319         return 0;
 320 }
 321
 322 static struct page *kimage_alloc_pages(unsigned int gfp_mask, unsigned int order)
 323 {
 324         struct page *pages;
 325         pages = alloc_pages(gfp_mask, order);
 326         if (pages) {
 327                 unsigned int count, i;
 328                 pages->mapping = NULL;
 329                 pages->private = order;
 330                 count = 1 << order;
 331                 for(i = 0; i < count; i++) {
 332                         SetPageReserved(pages + i);
 333                 }
 334         }
 335         return pages;
 336 }
 337
 338 static void kimage_free_pages(struct page *page)
 339 {
 340         unsigned int order, count, i;
 341         order = page->private;
 342         count = 1 << order;
 343         for(i = 0; i < count; i++) {
 344                 ClearPageReserved(page + i);
 345         }
 346         __free_pages(page, order);
 347 }
 348
 349 static void kimage_free_page_list(struct list_head *list)
 350 {
 351         struct list_head *pos, *next;
 352         list_for_each_safe(pos, next, list) {
 353                 struct page *page;
 354
 355                 page = list_entry(pos, struct page, lru);
 356                 list_del(&page->lru);
 357
 358                 kimage_free_pages(page);
 359         }
 360 }
 361
 362 static struct page *kimage_alloc_normal_control_pages(
 363         struct kimage *image, unsigned int order)
 364 {
 365         /* Control pages are special, they are the intermediaries
 366          * that are needed while we copy the rest of the pages
 367          * to their final resting place.  As such they must
 368          * not conflict with either the destination addresses
 369          * or memory the kernel is already using.
 370          *
 371          * The only case where we really need more than one of
 372          * these are for architectures where we cannot disable
 373          * the MMU and must instead generate an identity mapped
 374          * page table for all of the memory.
 375          *
 376          * At worst this runs in O(N) of the image size.
 377          */
 378         struct list_head extra_pages;
 379         struct page *pages;
 380         unsigned int count;
 381
 382         count = 1 << order;
 383         INIT_LIST_HEAD(&extra_pages);
 384
 385         /* Loop while I can allocate a page and the page allocated
 386          * is a destination page.
 387          */
 388         do {
 389                 unsigned long pfn, epfn, addr, eaddr;
 390                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 391                 if (!pages)
 392                         break;
 393                 pfn   = page_to_pfn(pages);
 394                 epfn  = pfn + count;
 395                 addr  = pfn << PAGE_SHIFT;
 396                 eaddr = epfn << PAGE_SHIFT;
 397                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 398                         kimage_is_destination_range(image, addr, eaddr))
 399                 {
 400                         list_add(&pages->lru, &extra_pages);
 401                         pages = NULL;
 402                 }
 403         } while(!pages);
 404         if (pages) {
 405                 /* Remember the allocated page... */
 406                 list_add(&pages->lru, &image->control_pages);
 407
 408                 /* Because the page is already in it's destination
 409                  * location we will never allocate another page at
 410                  * that address.  Therefore kimage_alloc_pages
 411                  * will not return it (again) and we don't need
 412                  * to give it an entry in image->segment[].
 413                  */
 414         }
 415         /* Deal with the destination pages I have inadvertently allocated.
 416          *
 417          * Ideally I would convert multi-page allocations into single
 418          * page allocations, and add everyting to image->dest_pages.
 419          *
 420          * For now it is simpler to just free the pages.
 421          */
 422         kimage_free_page_list(&extra_pages);
 423         return pages;
 424
 425 }
 426
 427 static struct page *kimage_alloc_crash_control_pages(
 428         struct kimage *image, unsigned int order)
 429 {
 430         /* Control pages are special, they are the intermediaries
 431          * that are needed while we copy the rest of the pages
 432          * to their final resting place.  As such they must
 433          * not conflict with either the destination addresses
 434          * or memory the kernel is already using.
 435          *
 436          * Control pages are also the only pags we must allocate
 437          * when loading a crash kernel.  All of the other pages
 438          * are specified by the segments and we just memcpy
 439          * into them directly.
 440          *
 441          * The only case where we really need more than one of
 442          * these are for architectures where we cannot disable
 443          * the MMU and must instead generate an identity mapped
 444          * page table for all of the memory.
 445          *
 446          * Given the low demand this implements a very simple
 447          * allocator that finds the first hole of the appropriate
 448          * size in the reserved memory region, and allocates all
 449          * of the memory up to and including the hole.
 450          */
 451         unsigned long hole_start, hole_end, size;
 452         struct page *pages;
 453         pages = NULL;
 454         size = (1 << order) << PAGE_SHIFT;
 455         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 456         hole_end   = hole_start + size - 1;
 457         while(hole_end <= crashk_res.end) {
 458                 unsigned long i;
 459                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT) {
 460                         break;
 461                 }
 462                 if (hole_end > crashk_res.end) {
 463                         break;
 464                 }
 465                 /* See if I overlap any of the segments */
 466                 for(i = 0; i < image->nr_segments; i++) {
 467                         unsigned long mstart, mend;
 468                         mstart = image->segment[i].mem;
 469                         mend   = mstart + image->segment[i].memsz - 1;
 470                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 471                                 /* Advance the hole to the end of the segment */
 472                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 473                                 hole_end   = hole_start + size - 1;
 474                                 break;
 475                         }
 476                 }
 477                 /* If I don't overlap any segments I have found my hole! */
 478                 if (i == image->nr_segments) {
 479                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 480                         break;
 481                 }
 482         }
 483         if (pages) {
 484                 image->control_page = hole_end;
 485         }
 486         return pages;
 487 }
 488
 489
 490 struct page *kimage_alloc_control_pages(
 491         struct kimage *image, unsigned int order)
 492 {
 493         struct page *pages = NULL;
 494         switch(image->type) {
 495         case KEXEC_TYPE_DEFAULT:
 496                 pages = kimage_alloc_normal_control_pages(image, order);
 497                 break;
 498         case KEXEC_TYPE_CRASH:
 499                 pages = kimage_alloc_crash_control_pages(image, order);
 500                 break;
 501         }
 502         return pages;
 503 }
 504
 505 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 506 {
 507         if (*image->entry != 0) {
 508                 image->entry++;
 509         }
 510         if (image->entry == image->last_entry) {
 511                 kimage_entry_t *ind_page;
 512                 struct page *page;
 513                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 514                 if (!page) {
 515                         return -ENOMEM;
 516                 }
 517                 ind_page = page_address(page);
 518                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 519                 image->entry = ind_page;
 520                 image->last_entry =
 521                         ind_page + ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 522         }
 523         *image->entry = entry;
 524         image->entry++;
 525         *image->entry = 0;
 526         return 0;
 527 }
 528
 529 static int kimage_set_destination(
 530         struct kimage *image, unsigned long destination)
 531 {
 532         int result;
 533
 534         destination &= PAGE_MASK;
 535         result = kimage_add_entry(image, destination | IND_DESTINATION);
 536         if (result == 0) {
 537                 image->destination = destination;
 538         }
 539         return result;
 540 }
 541
 542
 543 static int kimage_add_page(struct kimage *image, unsigned long page)
 544 {
 545         int result;
 546
 547         page &= PAGE_MASK;
 548         result = kimage_add_entry(image, page | IND_SOURCE);
 549         if (result == 0) {
 550                 image->destination += PAGE_SIZE;
 551         }
 552         return result;
 553 }
 554
 555
 556 static void kimage_free_extra_pages(struct kimage *image)
 557 {
 558         /* Walk through and free any extra destination pages I may have */
 559         kimage_free_page_list(&image->dest_pages);
 560
 561         /* Walk through and free any unuseable pages I have cached */
 562         kimage_free_page_list(&image->unuseable_pages);
 563
 564 }
 565 static int kimage_terminate(struct kimage *image)
 566 {
 567         if (*image->entry != 0) {
 568                 image->entry++;
 569         }
 570         *image->entry = IND_DONE;
 571         return 0;
 572 }
 573
 574 #define for_each_kimage_entry(image, ptr, entry) \
 575         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 576                 ptr = (entry & IND_INDIRECTION)? \
 577                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 578
 579 static void kimage_free_entry(kimage_entry_t entry)
 580 {
 581         struct page *page;
 582
 583         page = pfn_to_page(entry >> PAGE_SHIFT);
 584         kimage_free_pages(page);
 585 }
 586
 587 static void kimage_free(struct kimage *image)
 588 {
 589         kimage_entry_t *ptr, entry;
 590         kimage_entry_t ind = 0;
 591
 592         if (!image)
 593                 return;
 594         kimage_free_extra_pages(image);
 595         for_each_kimage_entry(image, ptr, entry) {
 596                 if (entry & IND_INDIRECTION) {
 597                         /* Free the previous indirection page */
 598                         if (ind & IND_INDIRECTION) {
 599                                 kimage_free_entry(ind);
 600                         }
 601                         /* Save this indirection page until we are
 602                          * done with it.
 603                          */
 604                         ind = entry;
 605                 }
 606                 else if (entry & IND_SOURCE) {
 607                         kimage_free_entry(entry);
 608                 }
 609         }
 610         /* Free the final indirection page */
 611         if (ind & IND_INDIRECTION) {
 612                 kimage_free_entry(ind);
 613         }
 614
 615         /* Handle any machine specific cleanup */
 616         machine_kexec_cleanup(image);
 617
 618         /* Free the kexec control pages... */
 619         kimage_free_page_list(&image->control_pages);
 620         kfree(image);
 621 }
 622
 623 static kimage_entry_t *kimage_dst_used(struct kimage *image, unsigned long page)
 624 {
 625         kimage_entry_t *ptr, entry;
 626         unsigned long destination = 0;
 627
 628         for_each_kimage_entry(image, ptr, entry) {
 629                 if (entry & IND_DESTINATION) {
 630                         destination = entry & PAGE_MASK;
 631                 }
 632                 else if (entry & IND_SOURCE) {
 633                         if (page == destination) {
 634                                 return ptr;
 635                         }
 636                         destination += PAGE_SIZE;
 637                 }
 638         }
 639         return 0;
 640 }
 641
 642 static struct page *kimage_alloc_page(struct kimage *image, unsigned int gfp_mask, unsigned long destination)
 643 {
 644         /*
 645          * Here we implement safeguards to ensure that a source page
 646          * is not copied to its destination page before the data on
 647          * the destination page is no longer useful.
 648          *
 649          * To do this we maintain the invariant that a source page is
 650          * either its own destination page, or it is not a
 651          * destination page at all.
 652          *
 653          * That is slightly stronger than required, but the proof
 654          * that no problems will not occur is trivial, and the
 655          * implementation is simply to verify.
 656          *
 657          * When allocating all pages normally this algorithm will run
 658          * in O(N) time, but in the worst case it will run in O(N^2)
 659          * time.   If the runtime is a problem the data structures can
 660          * be fixed.
 661          */
 662         struct page *page;
 663         unsigned long addr;
 664
 665         /*
 666          * Walk through the list of destination pages, and see if I
 667          * have a match.
 668          */
 669         list_for_each_entry(page, &image->dest_pages, lru) {
 670                 addr = page_to_pfn(page) << PAGE_SHIFT;
 671                 if (addr == destination) {
 672                         list_del(&page->lru);
 673                         return page;
 674                 }
 675         }
 676         page = NULL;
 677         while (1) {
 678                 kimage_entry_t *old;
 679
 680                 /* Allocate a page, if we run out of memory give up */
 681                 page = kimage_alloc_pages(gfp_mask, 0);
 682                 if (!page) {
 683                         return 0;
 684                 }
 685                 /* If the page cannot be used file it away */
 686                 if (page_to_pfn(page) > (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 687                         list_add(&page->lru, &image->unuseable_pages);
 688                         continue;
 689                 }
 690                 addr = page_to_pfn(page) << PAGE_SHIFT;
 691
 692                 /* If it is the destination page we want use it */
 693                 if (addr == destination)
 694                         break;
 695
 696                 /* If the page is not a destination page use it */
 697                 if (!kimage_is_destination_range(image, addr, addr + PAGE_SIZE))
 698                         break;
 699
 700                 /*
 701                  * I know that the page is someones destination page.
 702                  * See if there is already a source page for this
 703                  * destination page.  And if so swap the source pages.
 704                  */
 705                 old = kimage_dst_used(image, addr);
 706                 if (old) {
 707                         /* If so move it */
 708                         unsigned long old_addr;
 709                         struct page *old_page;
 710
 711                         old_addr = *old & PAGE_MASK;
 712                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 713                         copy_highpage(page, old_page);
 714                         *old = addr | (*old & ~PAGE_MASK);
 715
 716                         /* The old page I have found cannot be a
 717                          * destination page, so return it.
 718                          */
 719                         addr = old_addr;
 720                         page = old_page;
 721                         break;
 722                 }
 723                 else {
 724                         /* Place the page on the destination list I
 725                          * will use it later.
 726                          */
 727                         list_add(&page->lru, &image->dest_pages);
 728                 }
 729         }
 730         return page;
 731 }
 732
 733 static int kimage_load_normal_segment(struct kimage *image,
 734         struct kexec_segment *segment)
 735 {
 736         unsigned long maddr;
 737         unsigned long ubytes, mbytes;
 738         int result;
 739         unsigned char *buf;
 740
 741         result = 0;
 742         buf = segment->buf;
 743         ubytes = segment->bufsz;
 744         mbytes = segment->memsz;
 745         maddr = segment->mem;
 746
 747         result = kimage_set_destination(image, maddr);
 748         if (result < 0) {
 749                 goto out;
 750         }
 751         while(mbytes) {
 752                 struct page *page;
 753                 char *ptr;
 754                 size_t uchunk, mchunk;
 755                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 756                 if (page == 0) {
 757                         result  = -ENOMEM;
 758                         goto out;
 759                 }
 760                 result = kimage_add_page(image, page_to_pfn(page) << PAGE_SHIFT);
 761                 if (result < 0) {
 762                         goto out;
 763                 }
 764                 ptr = kmap(page);
 765                 /* Start with a clear page */
 766                 memset(ptr, 0, PAGE_SIZE);
 767                 ptr += maddr & ~PAGE_MASK;
 768                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 769                 if (mchunk > mbytes) {
 770                         mchunk = mbytes;
 771                 }
 772                 uchunk = mchunk;
 773                 if (uchunk > ubytes) {
 774                         uchunk = ubytes;
 775                 }
 776                 result = copy_from_user(ptr, buf, uchunk);
 777                 kunmap(page);
 778                 if (result) {
 779                         result = (result < 0) ? result : -EIO;
 780                         goto out;
 781                 }
 782                 ubytes -= uchunk;
 783                 maddr  += mchunk;
 784                 buf    += mchunk;
 785                 mbytes -= mchunk;
 786         }
 787  out:
 788         return result;
 789 }
 790
 791 static int kimage_load_crash_segment(struct kimage *image,
 792         struct kexec_segment *segment)
 793 {
 794         /* For crash dumps kernels we simply copy the data from
 795          * user space to it's destination.
 796          * We do things a page at a time for the sake of kmap.
 797          */
 798         unsigned long maddr;
 799         unsigned long ubytes, mbytes;
 800         int result;
 801         unsigned char *buf;
 802
 803         result = 0;
 804         buf = segment->buf;
 805         ubytes = segment->bufsz;
 806         mbytes = segment->memsz;
 807         maddr = segment->mem;
 808         while(mbytes) {
 809                 struct page *page;
 810                 char *ptr;
 811                 size_t uchunk, mchunk;
 812                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 813                 if (page == 0) {
 814                         result  = -ENOMEM;
 815                         goto out;
 816                 }
 817                 ptr = kmap(page);
 818                 ptr += maddr & ~PAGE_MASK;
 819                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 820                 if (mchunk > mbytes) {
 821                         mchunk = mbytes;
 822                 }
 823                 uchunk = mchunk;
 824                 if (uchunk > ubytes) {
 825                         uchunk = ubytes;
 826                         /* Zero the trailing part of the page */
 827                         memset(ptr + uchunk, 0, mchunk - uchunk);
 828                 }
 829                 result = copy_from_user(ptr, buf, uchunk);
 830                 kunmap(page);
 831                 if (result) {
 832                         result = (result < 0) ? result : -EIO;
 833                         goto out;
 834                 }
 835                 ubytes -= uchunk;
 836                 maddr  += mchunk;
 837                 buf    += mchunk;
 838                 mbytes -= mchunk;
 839         }
 840  out:
 841         return result;
 842 }
 843
 844 static int kimage_load_segment(struct kimage *image,
 845         struct kexec_segment *segment)
 846 {
 847         int result = -ENOMEM;
 848         switch(image->type) {
 849         case KEXEC_TYPE_DEFAULT:
 850                 result = kimage_load_normal_segment(image, segment);
 851                 break;
 852         case KEXEC_TYPE_CRASH:
 853                 result = kimage_load_crash_segment(image, segment);
 854                 break;
 855         }
 856         return result;
 857 }
 858
/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number,
 *   and then copies the image to its final destination and
 *   jumps into the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
 879 struct kimage *kexec_image = NULL;
 880 static struct kimage *kexec_crash_image = NULL;
 881 /*
 882  * A home grown binary mutex.
 883  * Nothing can wait so this mutex is safe to use
 884  * in interrupt context :)
 885  */
 886 static int kexec_lock = 0;
 887
 888 asmlinkage long sys_kexec_load(unsigned long entry,
 889         unsigned long nr_segments, struct kexec_segment __user *segments,
 890         unsigned long flags)
 891 {
 892         struct kimage **dest_image, *image;
 893         int locked;
 894         int result;
 895
 896         /* We only trust the superuser with rebooting the system. */
 897         if (!capable(CAP_SYS_BOOT))
 898                 return -EPERM;
 899
 900         /*
 901          * Verify we have a legal set of flags
 902          * This leaves us room for future extensions.
 903          */
 904         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 905                 return -EINVAL;
 906
 907         /* Verify we are on the appropriate architecture */
 908         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 909                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 910         {
 911                 return -EINVAL;
 912         }
 913
 914         /* Put an artificial cap on the number
 915          * of segments passed to kexec_load.
 916          */
 917         if (nr_segments > KEXEC_SEGMENT_MAX)
 918                 return -EINVAL;
 919
 920         image = NULL;
 921         result = 0;
 922
 923         /* Because we write directly to the reserved memory
 924          * region when loading crash kernels we need a mutex here to
 925          * prevent multiple crash  kernels from attempting to load
 926          * simultaneously, and to prevent a crash kernel from loading
 927          * over the top of a in use crash kernel.
 928          *
 929          * KISS: always take the mutex.
 930          */
 931         locked = xchg(&kexec_lock, 1);
 932         if (locked) {
 933                 return -EBUSY;
 934         }
 935         dest_image = &kexec_image;
 936         if (flags & KEXEC_ON_CRASH) {
 937                 dest_image = &kexec_crash_image;
 938         }
 939         if (nr_segments > 0) {
 940                 unsigned long i;
 941                 /* Loading another kernel to reboot into */
 942                 if ((flags & KEXEC_ON_CRASH) == 0) {
 943                         result = kimage_normal_alloc(&image, entry, nr_segments, segments);
 944                 }
 945                 /* Loading another kernel to switch to if this one crashes */
 946                 else if (flags & KEXEC_ON_CRASH) {
 947                         /* Free any current crash dump kernel before
 948                          * we corrupt it.
 949                          */
 950                         kimage_free(xchg(&kexec_crash_image, NULL));
 951                         result = kimage_crash_alloc(&image, entry, nr_segments, segments);
 952                 }
 953                 if (result) {
 954                         goto out;
 955                 }
 956                 result = machine_kexec_prepare(image);
 957                 if (result) {
 958                         goto out;
 959                 }
 960                 for(i = 0; i < nr_segments; i++) {
 961                         result = kimage_load_segment(image, &image->segment[i]);
 962                         if (result) {
 963                                 goto out;
 964                         }
 965                 }
 966                 result = kimage_terminate(image);
 967                 if (result) {
 968                         goto out;
 969                 }
 970         }
 971         /* Install the new kernel, and  Uninstall the old */
 972         image = xchg(dest_image, image);
 973
 974  out:
 975         xchg(&kexec_lock, 0); /* Release the mutex */
 976         kimage_free(image);
 977         return result;
 978 }
 979
 980 #ifdef CONFIG_COMPAT
 981 asmlinkage long compat_sys_kexec_load(unsigned long entry,
 982         unsigned long nr_segments, struct compat_kexec_segment __user *segments,
 983         unsigned long flags)
 984 {
 985         struct compat_kexec_segment in;
 986         struct kexec_segment out, __user *ksegments;
 987         unsigned long i, result;
 988
 989         /* Don't allow clients that don't understand the native
 990          * architecture to do anything.
 991          */
 992         if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT) {
 993                 return -EINVAL;
 994         }
 995
 996         if (nr_segments > KEXEC_SEGMENT_MAX) {
 997                 return -EINVAL;
 998         }
 999
1000         ksegments = compat_alloc_user_space(nr_segments * sizeof(out));
1001         for (i=0; i < nr_segments; i++) {
1002                 result = copy_from_user(&in, &segments[i], sizeof(in));
1003                 if (result) {
1004                         return -EFAULT;
1005                 }
1006
1007                 out.buf   = compat_ptr(in.buf);
1008                 out.bufsz = in.bufsz;
1009                 out.mem   = in.mem;
1010                 out.memsz = in.memsz;
1011
1012                 result = copy_to_user(&ksegments[i], &out, sizeof(out));
1013                 if (result) {
1014                         return -EFAULT;
1015                 }
1016         }
1017
1018         return sys_kexec_load(entry, nr_segments, ksegments, flags);
1019 }
1020 #endif
1021
1022 void crash_kexec(struct pt_regs *regs)
1023 {
1024         struct kimage *image;
1025         int locked;
1026
1027
1028         /* Take the kexec_lock here to prevent sys_kexec_load
1029          * running on one cpu from replacing the crash kernel
1030          * we are using after a panic on a different cpu.
1031          *
1032          * If the crash kernel was not located in a fixed area
1033          * of memory the xchg(&kexec_crash_image) would be
1034          * sufficient.  But since I reuse the memory...
1035          */
1036         locked = xchg(&kexec_lock, 1);
1037         if (!locked) {
1038                 image = xchg(&kexec_crash_image, NULL);
1039                 if (image) {
1040                         machine_crash_shutdown(regs);
1041                         machine_kexec(image);
1042                 }
1043                 xchg(&kexec_lock, 0);
1044         }
1045 }