SAFE public projects git trees. - safe/jmp/linux-2.6/blob - kernel/kexec.c

   1 /*
   2  * kexec.c - kexec system call
   3  * Copyright (C) 2002-2004 Eric Biederman  <ebiederm@xmission.com>
   4  *
   5  * This source code is licensed under the GNU General Public License,
   6  * Version 2.  See the file COPYING for more details.
   7  */
   8
   9 #include <linux/capability.h>
  10 #include <linux/mm.h>
  11 #include <linux/file.h>
  12 #include <linux/slab.h>
  13 #include <linux/fs.h>
  14 #include <linux/kexec.h>
  15 #include <linux/spinlock.h>
  16 #include <linux/list.h>
  17 #include <linux/highmem.h>
  18 #include <linux/syscalls.h>
  19 #include <linux/reboot.h>
  20 #include <linux/syscalls.h>
  21 #include <linux/ioport.h>
  22 #include <linux/hardirq.h>
  23
  24 #include <asm/page.h>
  25 #include <asm/uaccess.h>
  26 #include <asm/io.h>
  27 #include <asm/system.h>
  28 #include <asm/semaphore.h>
  29
  30 /* Per cpu memory for storing cpu states in case of system crash. */
  31 note_buf_t* crash_notes;
  32
  33 /* Location of the reserved area for the crash kernel */
  34 struct resource crashk_res = {
  35         .name  = "Crash kernel",
  36         .start = 0,
  37         .end   = 0,
  38         .flags = IORESOURCE_BUSY | IORESOURCE_MEM
  39 };
  40
  41 int kexec_should_crash(struct task_struct *p)
  42 {
  43         if (in_interrupt() || !p->pid || is_init(p) || panic_on_oops)
  44                 return 1;
  45         return 0;
  46 }
  47
  48 /*
  49  * When kexec transitions to the new kernel there is a one-to-one
  50  * mapping between physical and virtual addresses.  On processors
  51  * where you can disable the MMU this is trivial, and easy.  For
  52  * others it is still a simple predictable page table to setup.
  53  *
  54  * In that environment kexec copies the new kernel to its final
  55  * resting place.  This means I can only support memory whose
  56  * physical address can fit in an unsigned long.  In particular
  57  * addresses where (pfn << PAGE_SHIFT) > ULONG_MAX cannot be handled.
  58  * If the assembly stub has more restrictive requirements
  59  * KEXEC_SOURCE_MEMORY_LIMIT and KEXEC_DEST_MEMORY_LIMIT can be
  60  * defined more restrictively in <asm/kexec.h>.
  61  *
  62  * The code for the transition from the current kernel to the
  63  * new kernel is placed in the control_code_buffer, whose size
  64  * is given by KEXEC_CONTROL_CODE_SIZE.  In the best case only a single
  65  * page of memory is necessary, but some architectures require more.
  66  * Because this memory must be identity mapped in the transition from
  67  * virtual to physical addresses it must live in the range
  68  * 0 - TASK_SIZE, as only the user space mappings are arbitrarily
  69  * modifiable.
  70  *
  71  * The assembly stub in the control code buffer is passed a linked list
  72  * of descriptor pages detailing the source pages of the new kernel,
  73  * and the destination addresses of those source pages.  As this data
  74  * structure is not used in the context of the current OS, it must
  75  * be self-contained.
  76  *
  77  * The code has been made to work with highmem pages and will use a
  78  * destination page in its final resting place (if it happens
  79  * to allocate it).  The end product of this is that most of the
  80  * physical address space, and most of RAM can be used.
  81  *
  82  * Future directions include:
  83  *  - allocating a page table with the control code buffer identity
  84  *    mapped, to simplify machine_kexec and make kexec_on_panic more
  85  *    reliable.
  86  */
  87
  88 /*
  89  * KIMAGE_NO_DEST is an impossible destination address..., for
  90  * allocating pages whose destination address we do not care about.
  91  */
  92 #define KIMAGE_NO_DEST (-1UL)
  93
  94 static int kimage_is_destination_range(struct kimage *image,
  95                                        unsigned long start, unsigned long end);
  96 static struct page *kimage_alloc_page(struct kimage *image,
  97                                        gfp_t gfp_mask,
  98                                        unsigned long dest);
  99
 100 static int do_kimage_alloc(struct kimage **rimage, unsigned long entry,
 101                             unsigned long nr_segments,
 102                             struct kexec_segment __user *segments)
 103 {
 104         size_t segment_bytes;
 105         struct kimage *image;
 106         unsigned long i;
 107         int result;
 108
 109         /* Allocate a controlling structure */
 110         result = -ENOMEM;
 111         image = kzalloc(sizeof(*image), GFP_KERNEL);
 112         if (!image)
 113                 goto out;
 114
 115         image->head = 0;
 116         image->entry = &image->head;
 117         image->last_entry = &image->head;
 118         image->control_page = ~0; /* By default this does not apply */
 119         image->start = entry;
 120         image->type = KEXEC_TYPE_DEFAULT;
 121
 122         /* Initialize the list of control pages */
 123         INIT_LIST_HEAD(&image->control_pages);
 124
 125         /* Initialize the list of destination pages */
 126         INIT_LIST_HEAD(&image->dest_pages);
 127
 128         /* Initialize the list of unuseable pages */
 129         INIT_LIST_HEAD(&image->unuseable_pages);
 130
 131         /* Read in the segments */
 132         image->nr_segments = nr_segments;
 133         segment_bytes = nr_segments * sizeof(*segments);
 134         result = copy_from_user(image->segment, segments, segment_bytes);
 135         if (result)
 136                 goto out;
 137
 138         /*
 139          * Verify we have good destination addresses.  The caller is
 140          * responsible for making certain we don't attempt to load
 141          * the new image into invalid or reserved areas of RAM.  This
 142          * just verifies it is an address we can use.
 143          *
 144          * Since the kernel does everything in page size chunks ensure
 145          * the destination addreses are page aligned.  Too many
 146          * special cases crop of when we don't do this.  The most
 147          * insidious is getting overlapping destination addresses
 148          * simply because addresses are changed to page size
 149          * granularity.
 150          */
 151         result = -EADDRNOTAVAIL;
 152         for (i = 0; i < nr_segments; i++) {
 153                 unsigned long mstart, mend;
 154
 155                 mstart = image->segment[i].mem;
 156                 mend   = mstart + image->segment[i].memsz;
 157                 if ((mstart & ~PAGE_MASK) || (mend & ~PAGE_MASK))
 158                         goto out;
 159                 if (mend >= KEXEC_DESTINATION_MEMORY_LIMIT)
 160                         goto out;
 161         }
 162
 163         /* Verify our destination addresses do not overlap.
 164          * If we alloed overlapping destination addresses
 165          * through very weird things can happen with no
 166          * easy explanation as one segment stops on another.
 167          */
 168         result = -EINVAL;
 169         for (i = 0; i < nr_segments; i++) {
 170                 unsigned long mstart, mend;
 171                 unsigned long j;
 172
 173                 mstart = image->segment[i].mem;
 174                 mend   = mstart + image->segment[i].memsz;
 175                 for (j = 0; j < i; j++) {
 176                         unsigned long pstart, pend;
 177                         pstart = image->segment[j].mem;
 178                         pend   = pstart + image->segment[j].memsz;
 179                         /* Do the segments overlap ? */
 180                         if ((mend > pstart) && (mstart < pend))
 181                                 goto out;
 182                 }
 183         }
 184
 185         /* Ensure our buffer sizes are strictly less than
 186          * our memory sizes.  This should always be the case,
 187          * and it is easier to check up front than to be surprised
 188          * later on.
 189          */
 190         result = -EINVAL;
 191         for (i = 0; i < nr_segments; i++) {
 192                 if (image->segment[i].bufsz > image->segment[i].memsz)
 193                         goto out;
 194         }
 195
 196         result = 0;
 197 out:
 198         if (result == 0)
 199                 *rimage = image;
 200         else
 201                 kfree(image);
 202
 203         return result;
 204
 205 }
 206
 207 static int kimage_normal_alloc(struct kimage **rimage, unsigned long entry,
 208                                 unsigned long nr_segments,
 209                                 struct kexec_segment __user *segments)
 210 {
 211         int result;
 212         struct kimage *image;
 213
 214         /* Allocate and initialize a controlling structure */
 215         image = NULL;
 216         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 217         if (result)
 218                 goto out;
 219
 220         *rimage = image;
 221
 222         /*
 223          * Find a location for the control code buffer, and add it
 224          * the vector of segments so that it's pages will also be
 225          * counted as destination pages.
 226          */
 227         result = -ENOMEM;
 228         image->control_code_page = kimage_alloc_control_pages(image,
 229                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 230         if (!image->control_code_page) {
 231                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 232                 goto out;
 233         }
 234
 235         result = 0;
 236  out:
 237         if (result == 0)
 238                 *rimage = image;
 239         else
 240                 kfree(image);
 241
 242         return result;
 243 }
 244
 245 static int kimage_crash_alloc(struct kimage **rimage, unsigned long entry,
 246                                 unsigned long nr_segments,
 247                                 struct kexec_segment __user *segments)
 248 {
 249         int result;
 250         struct kimage *image;
 251         unsigned long i;
 252
 253         image = NULL;
 254         /* Verify we have a valid entry point */
 255         if ((entry < crashk_res.start) || (entry > crashk_res.end)) {
 256                 result = -EADDRNOTAVAIL;
 257                 goto out;
 258         }
 259
 260         /* Allocate and initialize a controlling structure */
 261         result = do_kimage_alloc(&image, entry, nr_segments, segments);
 262         if (result)
 263                 goto out;
 264
 265         /* Enable the special crash kernel control page
 266          * allocation policy.
 267          */
 268         image->control_page = crashk_res.start;
 269         image->type = KEXEC_TYPE_CRASH;
 270
 271         /*
 272          * Verify we have good destination addresses.  Normally
 273          * the caller is responsible for making certain we don't
 274          * attempt to load the new image into invalid or reserved
 275          * areas of RAM.  But crash kernels are preloaded into a
 276          * reserved area of ram.  We must ensure the addresses
 277          * are in the reserved area otherwise preloading the
 278          * kernel could corrupt things.
 279          */
 280         result = -EADDRNOTAVAIL;
 281         for (i = 0; i < nr_segments; i++) {
 282                 unsigned long mstart, mend;
 283
 284                 mstart = image->segment[i].mem;
 285                 mend = mstart + image->segment[i].memsz - 1;
 286                 /* Ensure we are within the crash kernel limits */
 287                 if ((mstart < crashk_res.start) || (mend > crashk_res.end))
 288                         goto out;
 289         }
 290
 291         /*
 292          * Find a location for the control code buffer, and add
 293          * the vector of segments so that it's pages will also be
 294          * counted as destination pages.
 295          */
 296         result = -ENOMEM;
 297         image->control_code_page = kimage_alloc_control_pages(image,
 298                                            get_order(KEXEC_CONTROL_CODE_SIZE));
 299         if (!image->control_code_page) {
 300                 printk(KERN_ERR "Could not allocate control_code_buffer\n");
 301                 goto out;
 302         }
 303
 304         result = 0;
 305 out:
 306         if (result == 0)
 307                 *rimage = image;
 308         else
 309                 kfree(image);
 310
 311         return result;
 312 }
 313
 314 static int kimage_is_destination_range(struct kimage *image,
 315                                         unsigned long start,
 316                                         unsigned long end)
 317 {
 318         unsigned long i;
 319
 320         for (i = 0; i < image->nr_segments; i++) {
 321                 unsigned long mstart, mend;
 322
 323                 mstart = image->segment[i].mem;
 324                 mend = mstart + image->segment[i].memsz;
 325                 if ((end > mstart) && (start < mend))
 326                         return 1;
 327         }
 328
 329         return 0;
 330 }
 331
 332 static struct page *kimage_alloc_pages(gfp_t gfp_mask, unsigned int order)
 333 {
 334         struct page *pages;
 335
 336         pages = alloc_pages(gfp_mask, order);
 337         if (pages) {
 338                 unsigned int count, i;
 339                 pages->mapping = NULL;
 340                 set_page_private(pages, order);
 341                 count = 1 << order;
 342                 for (i = 0; i < count; i++)
 343                         SetPageReserved(pages + i);
 344         }
 345
 346         return pages;
 347 }
 348
 349 static void kimage_free_pages(struct page *page)
 350 {
 351         unsigned int order, count, i;
 352
 353         order = page_private(page);
 354         count = 1 << order;
 355         for (i = 0; i < count; i++)
 356                 ClearPageReserved(page + i);
 357         __free_pages(page, order);
 358 }
 359
 360 static void kimage_free_page_list(struct list_head *list)
 361 {
 362         struct list_head *pos, *next;
 363
 364         list_for_each_safe(pos, next, list) {
 365                 struct page *page;
 366
 367                 page = list_entry(pos, struct page, lru);
 368                 list_del(&page->lru);
 369                 kimage_free_pages(page);
 370         }
 371 }
 372
 373 static struct page *kimage_alloc_normal_control_pages(struct kimage *image,
 374                                                         unsigned int order)
 375 {
 376         /* Control pages are special, they are the intermediaries
 377          * that are needed while we copy the rest of the pages
 378          * to their final resting place.  As such they must
 379          * not conflict with either the destination addresses
 380          * or memory the kernel is already using.
 381          *
 382          * The only case where we really need more than one of
 383          * these are for architectures where we cannot disable
 384          * the MMU and must instead generate an identity mapped
 385          * page table for all of the memory.
 386          *
 387          * At worst this runs in O(N) of the image size.
 388          */
 389         struct list_head extra_pages;
 390         struct page *pages;
 391         unsigned int count;
 392
 393         count = 1 << order;
 394         INIT_LIST_HEAD(&extra_pages);
 395
 396         /* Loop while I can allocate a page and the page allocated
 397          * is a destination page.
 398          */
 399         do {
 400                 unsigned long pfn, epfn, addr, eaddr;
 401
 402                 pages = kimage_alloc_pages(GFP_KERNEL, order);
 403                 if (!pages)
 404                         break;
 405                 pfn   = page_to_pfn(pages);
 406                 epfn  = pfn + count;
 407                 addr  = pfn << PAGE_SHIFT;
 408                 eaddr = epfn << PAGE_SHIFT;
 409                 if ((epfn >= (KEXEC_CONTROL_MEMORY_LIMIT >> PAGE_SHIFT)) ||
 410                               kimage_is_destination_range(image, addr, eaddr)) {
 411                         list_add(&pages->lru, &extra_pages);
 412                         pages = NULL;
 413                 }
 414         } while (!pages);
 415
 416         if (pages) {
 417                 /* Remember the allocated page... */
 418                 list_add(&pages->lru, &image->control_pages);
 419
 420                 /* Because the page is already in it's destination
 421                  * location we will never allocate another page at
 422                  * that address.  Therefore kimage_alloc_pages
 423                  * will not return it (again) and we don't need
 424                  * to give it an entry in image->segment[].
 425                  */
 426         }
 427         /* Deal with the destination pages I have inadvertently allocated.
 428          *
 429          * Ideally I would convert multi-page allocations into single
 430          * page allocations, and add everyting to image->dest_pages.
 431          *
 432          * For now it is simpler to just free the pages.
 433          */
 434         kimage_free_page_list(&extra_pages);
 435
 436         return pages;
 437 }
 438
 439 static struct page *kimage_alloc_crash_control_pages(struct kimage *image,
 440                                                       unsigned int order)
 441 {
 442         /* Control pages are special, they are the intermediaries
 443          * that are needed while we copy the rest of the pages
 444          * to their final resting place.  As such they must
 445          * not conflict with either the destination addresses
 446          * or memory the kernel is already using.
 447          *
 448          * Control pages are also the only pags we must allocate
 449          * when loading a crash kernel.  All of the other pages
 450          * are specified by the segments and we just memcpy
 451          * into them directly.
 452          *
 453          * The only case where we really need more than one of
 454          * these are for architectures where we cannot disable
 455          * the MMU and must instead generate an identity mapped
 456          * page table for all of the memory.
 457          *
 458          * Given the low demand this implements a very simple
 459          * allocator that finds the first hole of the appropriate
 460          * size in the reserved memory region, and allocates all
 461          * of the memory up to and including the hole.
 462          */
 463         unsigned long hole_start, hole_end, size;
 464         struct page *pages;
 465
 466         pages = NULL;
 467         size = (1 << order) << PAGE_SHIFT;
 468         hole_start = (image->control_page + (size - 1)) & ~(size - 1);
 469         hole_end   = hole_start + size - 1;
 470         while (hole_end <= crashk_res.end) {
 471                 unsigned long i;
 472
 473                 if (hole_end > KEXEC_CONTROL_MEMORY_LIMIT)
 474                         break;
 475                 if (hole_end > crashk_res.end)
 476                         break;
 477                 /* See if I overlap any of the segments */
 478                 for (i = 0; i < image->nr_segments; i++) {
 479                         unsigned long mstart, mend;
 480
 481                         mstart = image->segment[i].mem;
 482                         mend   = mstart + image->segment[i].memsz - 1;
 483                         if ((hole_end >= mstart) && (hole_start <= mend)) {
 484                                 /* Advance the hole to the end of the segment */
 485                                 hole_start = (mend + (size - 1)) & ~(size - 1);
 486                                 hole_end   = hole_start + size - 1;
 487                                 break;
 488                         }
 489                 }
 490                 /* If I don't overlap any segments I have found my hole! */
 491                 if (i == image->nr_segments) {
 492                         pages = pfn_to_page(hole_start >> PAGE_SHIFT);
 493                         break;
 494                 }
 495         }
 496         if (pages)
 497                 image->control_page = hole_end;
 498
 499         return pages;
 500 }
 501
 502
 503 struct page *kimage_alloc_control_pages(struct kimage *image,
 504                                          unsigned int order)
 505 {
 506         struct page *pages = NULL;
 507
 508         switch (image->type) {
 509         case KEXEC_TYPE_DEFAULT:
 510                 pages = kimage_alloc_normal_control_pages(image, order);
 511                 break;
 512         case KEXEC_TYPE_CRASH:
 513                 pages = kimage_alloc_crash_control_pages(image, order);
 514                 break;
 515         }
 516
 517         return pages;
 518 }
 519
 520 static int kimage_add_entry(struct kimage *image, kimage_entry_t entry)
 521 {
 522         if (*image->entry != 0)
 523                 image->entry++;
 524
 525         if (image->entry == image->last_entry) {
 526                 kimage_entry_t *ind_page;
 527                 struct page *page;
 528
 529                 page = kimage_alloc_page(image, GFP_KERNEL, KIMAGE_NO_DEST);
 530                 if (!page)
 531                         return -ENOMEM;
 532
 533                 ind_page = page_address(page);
 534                 *image->entry = virt_to_phys(ind_page) | IND_INDIRECTION;
 535                 image->entry = ind_page;
 536                 image->last_entry = ind_page +
 537                                       ((PAGE_SIZE/sizeof(kimage_entry_t)) - 1);
 538         }
 539         *image->entry = entry;
 540         image->entry++;
 541         *image->entry = 0;
 542
 543         return 0;
 544 }
 545
 546 static int kimage_set_destination(struct kimage *image,
 547                                    unsigned long destination)
 548 {
 549         int result;
 550
 551         destination &= PAGE_MASK;
 552         result = kimage_add_entry(image, destination | IND_DESTINATION);
 553         if (result == 0)
 554                 image->destination = destination;
 555
 556         return result;
 557 }
 558
 559
 560 static int kimage_add_page(struct kimage *image, unsigned long page)
 561 {
 562         int result;
 563
 564         page &= PAGE_MASK;
 565         result = kimage_add_entry(image, page | IND_SOURCE);
 566         if (result == 0)
 567                 image->destination += PAGE_SIZE;
 568
 569         return result;
 570 }
 571
 572
 573 static void kimage_free_extra_pages(struct kimage *image)
 574 {
 575         /* Walk through and free any extra destination pages I may have */
 576         kimage_free_page_list(&image->dest_pages);
 577
 578         /* Walk through and free any unuseable pages I have cached */
 579         kimage_free_page_list(&image->unuseable_pages);
 580
 581 }
 582 static int kimage_terminate(struct kimage *image)
 583 {
 584         if (*image->entry != 0)
 585                 image->entry++;
 586
 587         *image->entry = IND_DONE;
 588
 589         return 0;
 590 }
 591
 592 #define for_each_kimage_entry(image, ptr, entry) \
 593         for (ptr = &image->head; (entry = *ptr) && !(entry & IND_DONE); \
 594                 ptr = (entry & IND_INDIRECTION)? \
 595                         phys_to_virt((entry & PAGE_MASK)): ptr +1)
 596
 597 static void kimage_free_entry(kimage_entry_t entry)
 598 {
 599         struct page *page;
 600
 601         page = pfn_to_page(entry >> PAGE_SHIFT);
 602         kimage_free_pages(page);
 603 }
 604
 605 static void kimage_free(struct kimage *image)
 606 {
 607         kimage_entry_t *ptr, entry;
 608         kimage_entry_t ind = 0;
 609
 610         if (!image)
 611                 return;
 612
 613         kimage_free_extra_pages(image);
 614         for_each_kimage_entry(image, ptr, entry) {
 615                 if (entry & IND_INDIRECTION) {
 616                         /* Free the previous indirection page */
 617                         if (ind & IND_INDIRECTION)
 618                                 kimage_free_entry(ind);
 619                         /* Save this indirection page until we are
 620                          * done with it.
 621                          */
 622                         ind = entry;
 623                 }
 624                 else if (entry & IND_SOURCE)
 625                         kimage_free_entry(entry);
 626         }
 627         /* Free the final indirection page */
 628         if (ind & IND_INDIRECTION)
 629                 kimage_free_entry(ind);
 630
 631         /* Handle any machine specific cleanup */
 632         machine_kexec_cleanup(image);
 633
 634         /* Free the kexec control pages... */
 635         kimage_free_page_list(&image->control_pages);
 636         kfree(image);
 637 }
 638
 639 static kimage_entry_t *kimage_dst_used(struct kimage *image,
 640                                         unsigned long page)
 641 {
 642         kimage_entry_t *ptr, entry;
 643         unsigned long destination = 0;
 644
 645         for_each_kimage_entry(image, ptr, entry) {
 646                 if (entry & IND_DESTINATION)
 647                         destination = entry & PAGE_MASK;
 648                 else if (entry & IND_SOURCE) {
 649                         if (page == destination)
 650                                 return ptr;
 651                         destination += PAGE_SIZE;
 652                 }
 653         }
 654
 655         return NULL;
 656 }
 657
 658 static struct page *kimage_alloc_page(struct kimage *image,
 659                                         gfp_t gfp_mask,
 660                                         unsigned long destination)
 661 {
 662         /*
 663          * Here we implement safeguards to ensure that a source page
 664          * is not copied to its destination page before the data on
 665          * the destination page is no longer useful.
 666          *
 667          * To do this we maintain the invariant that a source page is
 668          * either its own destination page, or it is not a
 669          * destination page at all.
 670          *
 671          * That is slightly stronger than required, but the proof
 672          * that no problems will not occur is trivial, and the
 673          * implementation is simply to verify.
 674          *
 675          * When allocating all pages normally this algorithm will run
 676          * in O(N) time, but in the worst case it will run in O(N^2)
 677          * time.   If the runtime is a problem the data structures can
 678          * be fixed.
 679          */
 680         struct page *page;
 681         unsigned long addr;
 682
 683         /*
 684          * Walk through the list of destination pages, and see if I
 685          * have a match.
 686          */
 687         list_for_each_entry(page, &image->dest_pages, lru) {
 688                 addr = page_to_pfn(page) << PAGE_SHIFT;
 689                 if (addr == destination) {
 690                         list_del(&page->lru);
 691                         return page;
 692                 }
 693         }
 694         page = NULL;
 695         while (1) {
 696                 kimage_entry_t *old;
 697
 698                 /* Allocate a page, if we run out of memory give up */
 699                 page = kimage_alloc_pages(gfp_mask, 0);
 700                 if (!page)
 701                         return NULL;
 702                 /* If the page cannot be used file it away */
 703                 if (page_to_pfn(page) >
 704                                 (KEXEC_SOURCE_MEMORY_LIMIT >> PAGE_SHIFT)) {
 705                         list_add(&page->lru, &image->unuseable_pages);
 706                         continue;
 707                 }
 708                 addr = page_to_pfn(page) << PAGE_SHIFT;
 709
 710                 /* If it is the destination page we want use it */
 711                 if (addr == destination)
 712                         break;
 713
 714                 /* If the page is not a destination page use it */
 715                 if (!kimage_is_destination_range(image, addr,
 716                                                   addr + PAGE_SIZE))
 717                         break;
 718
 719                 /*
 720                  * I know that the page is someones destination page.
 721                  * See if there is already a source page for this
 722                  * destination page.  And if so swap the source pages.
 723                  */
 724                 old = kimage_dst_used(image, addr);
 725                 if (old) {
 726                         /* If so move it */
 727                         unsigned long old_addr;
 728                         struct page *old_page;
 729
 730                         old_addr = *old & PAGE_MASK;
 731                         old_page = pfn_to_page(old_addr >> PAGE_SHIFT);
 732                         copy_highpage(page, old_page);
 733                         *old = addr | (*old & ~PAGE_MASK);
 734
 735                         /* The old page I have found cannot be a
 736                          * destination page, so return it.
 737                          */
 738                         addr = old_addr;
 739                         page = old_page;
 740                         break;
 741                 }
 742                 else {
 743                         /* Place the page on the destination list I
 744                          * will use it later.
 745                          */
 746                         list_add(&page->lru, &image->dest_pages);
 747                 }
 748         }
 749
 750         return page;
 751 }
 752
 753 static int kimage_load_normal_segment(struct kimage *image,
 754                                          struct kexec_segment *segment)
 755 {
 756         unsigned long maddr;
 757         unsigned long ubytes, mbytes;
 758         int result;
 759         unsigned char __user *buf;
 760
 761         result = 0;
 762         buf = segment->buf;
 763         ubytes = segment->bufsz;
 764         mbytes = segment->memsz;
 765         maddr = segment->mem;
 766
 767         result = kimage_set_destination(image, maddr);
 768         if (result < 0)
 769                 goto out;
 770
 771         while (mbytes) {
 772                 struct page *page;
 773                 char *ptr;
 774                 size_t uchunk, mchunk;
 775
 776                 page = kimage_alloc_page(image, GFP_HIGHUSER, maddr);
 777                 if (page == 0) {
 778                         result  = -ENOMEM;
 779                         goto out;
 780                 }
 781                 result = kimage_add_page(image, page_to_pfn(page)
 782                                                                 << PAGE_SHIFT);
 783                 if (result < 0)
 784                         goto out;
 785
 786                 ptr = kmap(page);
 787                 /* Start with a clear page */
 788                 memset(ptr, 0, PAGE_SIZE);
 789                 ptr += maddr & ~PAGE_MASK;
 790                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 791                 if (mchunk > mbytes)
 792                         mchunk = mbytes;
 793
 794                 uchunk = mchunk;
 795                 if (uchunk > ubytes)
 796                         uchunk = ubytes;
 797
 798                 result = copy_from_user(ptr, buf, uchunk);
 799                 kunmap(page);
 800                 if (result) {
 801                         result = (result < 0) ? result : -EIO;
 802                         goto out;
 803                 }
 804                 ubytes -= uchunk;
 805                 maddr  += mchunk;
 806                 buf    += mchunk;
 807                 mbytes -= mchunk;
 808         }
 809 out:
 810         return result;
 811 }
 812
 813 static int kimage_load_crash_segment(struct kimage *image,
 814                                         struct kexec_segment *segment)
 815 {
 816         /* For crash dumps kernels we simply copy the data from
 817          * user space to it's destination.
 818          * We do things a page at a time for the sake of kmap.
 819          */
 820         unsigned long maddr;
 821         unsigned long ubytes, mbytes;
 822         int result;
 823         unsigned char __user *buf;
 824
 825         result = 0;
 826         buf = segment->buf;
 827         ubytes = segment->bufsz;
 828         mbytes = segment->memsz;
 829         maddr = segment->mem;
 830         while (mbytes) {
 831                 struct page *page;
 832                 char *ptr;
 833                 size_t uchunk, mchunk;
 834
 835                 page = pfn_to_page(maddr >> PAGE_SHIFT);
 836                 if (page == 0) {
 837                         result  = -ENOMEM;
 838                         goto out;
 839                 }
 840                 ptr = kmap(page);
 841                 ptr += maddr & ~PAGE_MASK;
 842                 mchunk = PAGE_SIZE - (maddr & ~PAGE_MASK);
 843                 if (mchunk > mbytes)
 844                         mchunk = mbytes;
 845
 846                 uchunk = mchunk;
 847                 if (uchunk > ubytes) {
 848                         uchunk = ubytes;
 849                         /* Zero the trailing part of the page */
 850                         memset(ptr + uchunk, 0, mchunk - uchunk);
 851                 }
 852                 result = copy_from_user(ptr, buf, uchunk);
 853                 kunmap(page);
 854                 if (result) {
 855                         result = (result < 0) ? result : -EIO;
 856                         goto out;
 857                 }
 858                 ubytes -= uchunk;
 859                 maddr  += mchunk;
 860                 buf    += mchunk;
 861                 mbytes -= mchunk;
 862         }
 863 out:
 864         return result;
 865 }
 866
 867 static int kimage_load_segment(struct kimage *image,
 868                                 struct kexec_segment *segment)
 869 {
 870         int result = -ENOMEM;
 871
 872         switch (image->type) {
 873         case KEXEC_TYPE_DEFAULT:
 874                 result = kimage_load_normal_segment(image, segment);
 875                 break;
 876         case KEXEC_TYPE_CRASH:
 877                 result = kimage_load_crash_segment(image, segment);
 878                 break;
 879         }
 880
 881         return result;
 882 }
 883
/*
 * Exec Kernel system call: for obvious reasons only root may call it.
 *
 * This call breaks up into three pieces.
 * - A generic part which loads the new kernel from the current
 *   address space, and very carefully places the data in the
 *   allocated pages.
 *
 * - A generic part that interacts with the kernel and tells all of
 *   the devices to shut down, preventing on-going DMAs and placing
 *   the devices in a consistent state so a later kernel can
 *   reinitialize them.
 *
 * - A machine specific part that includes the syscall number,
 *   copies the image to its final destination, and jumps into
 *   the image at entry.
 *
 * kexec does not sync or unmount filesystems, so if you need
 * that to happen you need to do that yourself.
 */
 904 struct kimage *kexec_image;
 905 struct kimage *kexec_crash_image;
 906 /*
 907  * A home grown binary mutex.
 908  * Nothing can wait so this mutex is safe to use
 909  * in interrupt context :)
 910  */
 911 static int kexec_lock;
 912
 913 asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
 914                                 struct kexec_segment __user *segments,
 915                                 unsigned long flags)
 916 {
 917         struct kimage **dest_image, *image;
 918         int locked;
 919         int result;
 920
 921         /* We only trust the superuser with rebooting the system. */
 922         if (!capable(CAP_SYS_BOOT))
 923                 return -EPERM;
 924
 925         /*
 926          * Verify we have a legal set of flags
 927          * This leaves us room for future extensions.
 928          */
 929         if ((flags & KEXEC_FLAGS) != (flags & ~KEXEC_ARCH_MASK))
 930                 return -EINVAL;
 931
 932         /* Verify we are on the appropriate architecture */
 933         if (((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH) &&
 934                 ((flags & KEXEC_ARCH_MASK) != KEXEC_ARCH_DEFAULT))
 935                 return -EINVAL;
 936
 937         /* Put an artificial cap on the number
 938          * of segments passed to kexec_load.
 939          */
 940         if (nr_segments > KEXEC_SEGMENT_MAX)
 941                 return -EINVAL;
 942
 943         image = NULL;
 944         result = 0;
 945
 946         /* Because we write directly to the reserved memory
 947          * region when loading crash kernels we need a mutex here to
 948          * prevent multiple crash  kernels from attempting to load
 949          * simultaneously, and to prevent a crash kernel from loading
 950          * over the top of a in use crash kernel.
 951          *
 952          * KISS: always take the mutex.
 953          */
 954         locked = xchg(&kexec_lock, 1);
 955         if (locked)
 956                 return -EBUSY;
 957
 958         dest_image = &kexec_image;
 959         if (flags & KEXEC_ON_CRASH)
 960                 dest_image = &kexec_crash_image;
 961         if (nr_segments > 0) {
 962                 unsigned long i;
 963
 964                 /* Loading another kernel to reboot into */
 965                 if ((flags & KEXEC_ON_CRASH) == 0)
 966                         result = kimage_normal_alloc(&image, entry,
 967                                                         nr_segments, segments);
 968                 /* Loading another kernel to switch to if this one crashes */
 969                 else if (flags & KEXEC_ON_CRASH) {
 970                         /* Free any current crash dump kernel before
 971                          * we corrupt it.
 972                          */
 973                         kimage_free(xchg(&kexec_crash_image, NULL));
 974                         result = kimage_crash_alloc(&image, entry,
 975                                                      nr_segments, segments);
 976                 }
 977                 if (result)
 978                         goto out;
 979
 980                 result = machine_kexec_prepare(image);
 981                 if (result)
 982                         goto out;
 983
 984                 for (i = 0; i < nr_segments; i++) {
 985                         result = kimage_load_segment(image, &image->segment[i]);
 986                         if (result)
 987                                 goto out;
 988                 }
 989                 result = kimage_terminate(image);
 990                 if (result)
 991                         goto out;
 992         }
 993         /* Install the new kernel, and  Uninstall the old */
 994         image = xchg(dest_image, image);
 995
 996 out:
 997         locked = xchg(&kexec_lock, 0); /* Release the mutex */
 998         BUG_ON(!locked);
 999         kimage_free(image);
1000
1001         return result;
1002 }
1003
#ifdef CONFIG_COMPAT
/*
 * Compat entry point for kexec_load: converts every compat segment
 * descriptor into a native struct kexec_segment placed in space
 * obtained from compat_alloc_user_space(), then calls sys_kexec_load().
 *
 * Returns -EINVAL for KEXEC_ARCH_DEFAULT or too many segments,
 * -EFAULT if a descriptor cannot be copied, otherwise whatever
 * sys_kexec_load() returns.
 */
asmlinkage long compat_sys_kexec_load(unsigned long entry,
                                unsigned long nr_segments,
                                struct compat_kexec_segment __user *segments,
                                unsigned long flags)
{
        struct kexec_segment __user *native_segs;
        unsigned long idx;

        /*
         * Callers that do not name the native architecture
         * explicitly are not allowed to do anything.
         */
        if ((flags & KEXEC_ARCH_MASK) == KEXEC_ARCH_DEFAULT)
                return -EINVAL;

        if (nr_segments > KEXEC_SEGMENT_MAX)
                return -EINVAL;

        native_segs = compat_alloc_user_space(nr_segments *
                                              sizeof(struct kexec_segment));
        for (idx = 0; idx < nr_segments; idx++) {
                struct compat_kexec_segment cseg;
                struct kexec_segment nseg;

                if (copy_from_user(&cseg, &segments[idx], sizeof(cseg)))
                        return -EFAULT;

                nseg.buf   = compat_ptr(cseg.buf);
                nseg.bufsz = cseg.bufsz;
                nseg.mem   = cseg.mem;
                nseg.memsz = cseg.memsz;

                if (copy_to_user(&native_segs[idx], &nseg, sizeof(nseg)))
                        return -EFAULT;
        }

        return sys_kexec_load(entry, nr_segments, native_segs, flags);
}
#endif
1042
1043 void crash_kexec(struct pt_regs *regs)
1044 {
1045         int locked;
1046
1047
1048         /* Take the kexec_lock here to prevent sys_kexec_load
1049          * running on one cpu from replacing the crash kernel
1050          * we are using after a panic on a different cpu.
1051          *
1052          * If the crash kernel was not located in a fixed area
1053          * of memory the xchg(&kexec_crash_image) would be
1054          * sufficient.  But since I reuse the memory...
1055          */
1056         locked = xchg(&kexec_lock, 1);
1057         if (!locked) {
1058                 if (kexec_crash_image) {
1059                         struct pt_regs fixed_regs;
1060                         crash_setup_regs(&fixed_regs, regs);
1061                         machine_crash_shutdown(&fixed_regs);
1062                         machine_kexec(kexec_crash_image);
1063                 }
1064                 locked = xchg(&kexec_lock, 0);
1065                 BUG_ON(!locked);
1066         }
1067 }
1068
1069 static int __init crash_notes_memory_init(void)
1070 {
1071         /* Allocate memory for saving cpu registers. */
1072         crash_notes = alloc_percpu(note_buf_t);
1073         if (!crash_notes) {
1074                 printk("Kexec: Memory allocation for saving cpu register"
1075                 " states failed\n");
1076                 return -ENOMEM;
1077         }
1078         return 0;
1079 }
1080 module_init(crash_notes_memory_init)