1 /*
2  *  linux/mm/memory_hotplug.c
3  *
4  *  Copyright (C)
5  */
6
7 #include <linux/stddef.h>
8 #include <linux/mm.h>
9 #include <linux/swap.h>
10 #include <linux/interrupt.h>
11 #include <linux/pagemap.h>
12 #include <linux/bootmem.h>
13 #include <linux/compiler.h>
14 #include <linux/export.h>
15 #include <linux/pagevec.h>
16 #include <linux/writeback.h>
17 #include <linux/slab.h>
18 #include <linux/sysctl.h>
19 #include <linux/cpu.h>
20 #include <linux/memory.h>
21 #include <linux/memory_hotplug.h>
22 #include <linux/highmem.h>
23 #include <linux/vmalloc.h>
24 #include <linux/ioport.h>
25 #include <linux/delay.h>
26 #include <linux/migrate.h>
27 #include <linux/page-isolation.h>
28 #include <linux/pfn.h>
29 #include <linux/suspend.h>
30 #include <linux/mm_inline.h>
31 #include <linux/firmware-map.h>
32
33 #include <asm/tlbflush.h>
34
35 #include "internal.h"
36
37 /*
38  * online_page_callback contains pointer to current page onlining function.
39  * Initially it is generic_online_page(). If required, it can be changed
40  * by calling set_online_page_callback() to register a callback and
41  * restore_online_page_callback() to restore the generic callback.
42  */
43
44 static void generic_online_page(struct page *page);
45
46 static online_page_callback_t online_page_callback = generic_online_page;
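
/*
 * Illustrative sketch, not part of this file: a memory-ballooning style
 * driver could use the callback hooks above to divert freshly onlined pages
 * onto a private list instead of handing them to the buddy allocator.  The
 * names my_balloon_online_page, my_balloon_list, my_balloon_init and
 * my_balloon_exit are hypothetical.
 *
 *	static LIST_HEAD(my_balloon_list);
 *
 *	static void my_balloon_online_page(struct page *page)
 *	{
 *		__online_page_set_limits(page);
 *		list_add(&page->lru, &my_balloon_list);	// keep the page
 *	}
 *
 *	static int __init my_balloon_init(void)
 *	{
 *		return set_online_page_callback(&my_balloon_online_page);
 *	}
 *
 *	static void __exit my_balloon_exit(void)
 *	{
 *		restore_online_page_callback(&my_balloon_online_page);
 *	}
 */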
47
48 DEFINE_MUTEX(mem_hotplug_mutex);
49
50 void lock_memory_hotplug(void)
51 {
52         mutex_lock(&mem_hotplug_mutex);
53
54         /* for exclusive hibernation if CONFIG_HIBERNATION=y */
55         lock_system_sleep();
56 }
57
58 void unlock_memory_hotplug(void)
59 {
60         unlock_system_sleep();
61         mutex_unlock(&mem_hotplug_mutex);
62 }
63
64
65 /* add this memory to iomem resource */
66 static struct resource *register_memory_resource(u64 start, u64 size)
67 {
68         struct resource *res;
69         res = kzalloc(sizeof(struct resource), GFP_KERNEL);
70         BUG_ON(!res);
71
72         res->name = "System RAM";
73         res->start = start;
74         res->end = start + size - 1;
75         res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
76         if (request_resource(&iomem_resource, res) < 0) {
77                 printk("System RAM resource %pR cannot be added\n", res);
78                 kfree(res);
79                 res = NULL;
80         }
81         return res;
82 }
83
84 static void release_memory_resource(struct resource *res)
85 {
86         if (!res)
87                 return;
88         release_resource(res);
89         kfree(res);
90         return;
91 }
92
93 #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
94 #ifndef CONFIG_SPARSEMEM_VMEMMAP
95 static void get_page_bootmem(unsigned long info,  struct page *page,
96                              unsigned long type)
97 {
98         page->lru.next = (struct list_head *) type;
99         SetPagePrivate(page);
100         set_page_private(page, info);
101         atomic_inc(&page->_count);
102 }
103
104 /* The reference to __meminit __free_pages_bootmem() is valid,
105  * so use __ref to tell modpost not to generate a warning. */
106 void __ref put_page_bootmem(struct page *page)
107 {
108         unsigned long type;
109         struct zone *zone;
110
111         type = (unsigned long) page->lru.next;
112         BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE ||
113                type > MEMORY_HOTPLUG_MAX_BOOTMEM_TYPE);
114
115         if (atomic_dec_return(&page->_count) == 1) {
116                 ClearPagePrivate(page);
117                 set_page_private(page, 0);
118                 INIT_LIST_HEAD(&page->lru);
119                 __free_pages_bootmem(page, 0);
120
121                 zone = page_zone(page);
122                 zone_span_writelock(zone);
123                 zone->present_pages++;
124                 zone_span_writeunlock(zone);
125                 totalram_pages++;
126         }
127
128 }
129
130 static void register_page_bootmem_info_section(unsigned long start_pfn)
131 {
132         unsigned long *usemap, mapsize, section_nr, i;
133         struct mem_section *ms;
134         struct page *page, *memmap;
135
136         section_nr = pfn_to_section_nr(start_pfn);
137         ms = __nr_to_section(section_nr);
138
139         /* Get section's memmap address */
140         memmap = sparse_decode_mem_map(ms->section_mem_map, section_nr);
141
142         /*
143          * Get page for the memmap's phys address
144          * XXX: need more consideration for sparse_vmemmap...
145          */
146         page = virt_to_page(memmap);
147         mapsize = sizeof(struct page) * PAGES_PER_SECTION;
148         mapsize = PAGE_ALIGN(mapsize) >> PAGE_SHIFT;
149
150         /* remember memmap's page */
151         for (i = 0; i < mapsize; i++, page++)
152                 get_page_bootmem(section_nr, page, SECTION_INFO);
153
154         usemap = __nr_to_section(section_nr)->pageblock_flags;
155         page = virt_to_page(usemap);
156
157         mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT;
158
159         for (i = 0; i < mapsize; i++, page++)
160                 get_page_bootmem(section_nr, page, MIX_SECTION_INFO);
161
162 }
163
164 void register_page_bootmem_info_node(struct pglist_data *pgdat)
165 {
166         unsigned long i, pfn, end_pfn, nr_pages;
167         int node = pgdat->node_id;
168         struct page *page;
169         struct zone *zone;
170
171         nr_pages = PAGE_ALIGN(sizeof(struct pglist_data)) >> PAGE_SHIFT;
172         page = virt_to_page(pgdat);
173
174         for (i = 0; i < nr_pages; i++, page++)
175                 get_page_bootmem(node, page, NODE_INFO);
176
177         zone = &pgdat->node_zones[0];
178         for (; zone < pgdat->node_zones + MAX_NR_ZONES - 1; zone++) {
179                 if (zone->wait_table) {
180                         nr_pages = zone->wait_table_hash_nr_entries
181                                 * sizeof(wait_queue_head_t);
182                         nr_pages = PAGE_ALIGN(nr_pages) >> PAGE_SHIFT;
183                         page = virt_to_page(zone->wait_table);
184
185                         for (i = 0; i < nr_pages; i++, page++)
186                                 get_page_bootmem(node, page, NODE_INFO);
187                 }
188         }
189
190         pfn = pgdat->node_start_pfn;
191         end_pfn = pfn + pgdat->node_spanned_pages;
192
193         /* register_section info */
194         for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
195                 /*
196                  * Some platforms can assign the same pfn to multiple nodes - on
197                  * node0 as well as nodeN.  To avoid registering a pfn against
198                  * multiple nodes we check that this pfn does not already
199                  * reside in some other node.
200                  */
201                 if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
202                         register_page_bootmem_info_section(pfn);
203         }
204 }
205 #endif /* !CONFIG_SPARSEMEM_VMEMMAP */
206
207 static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
208                            unsigned long end_pfn)
209 {
210         unsigned long old_zone_end_pfn;
211
212         zone_span_writelock(zone);
213
214         old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
215         if (start_pfn < zone->zone_start_pfn)
216                 zone->zone_start_pfn = start_pfn;
217
218         zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
219                                 zone->zone_start_pfn;
220
221         zone_span_writeunlock(zone);
222 }
223
224 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
225                             unsigned long end_pfn)
226 {
227         unsigned long old_pgdat_end_pfn =
228                 pgdat->node_start_pfn + pgdat->node_spanned_pages;
229
230         if (start_pfn < pgdat->node_start_pfn)
231                 pgdat->node_start_pfn = start_pfn;
232
233         pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
234                                         pgdat->node_start_pfn;
235 }
236
237 static int __meminit __add_zone(struct zone *zone, unsigned long phys_start_pfn)
238 {
239         struct pglist_data *pgdat = zone->zone_pgdat;
240         int nr_pages = PAGES_PER_SECTION;
241         int nid = pgdat->node_id;
242         int zone_type;
243         unsigned long flags;
244
245         zone_type = zone - pgdat->node_zones;
246         if (!zone->wait_table) {
247                 int ret;
248
249                 ret = init_currently_empty_zone(zone, phys_start_pfn,
250                                                 nr_pages, MEMMAP_HOTPLUG);
251                 if (ret)
252                         return ret;
253         }
254         pgdat_resize_lock(zone->zone_pgdat, &flags);
255         grow_zone_span(zone, phys_start_pfn, phys_start_pfn + nr_pages);
256         grow_pgdat_span(zone->zone_pgdat, phys_start_pfn,
257                         phys_start_pfn + nr_pages);
258         pgdat_resize_unlock(zone->zone_pgdat, &flags);
259         memmap_init_zone(nr_pages, nid, zone_type,
260                          phys_start_pfn, MEMMAP_HOTPLUG);
261         return 0;
262 }
263
264 static int __meminit __add_section(int nid, struct zone *zone,
265                                         unsigned long phys_start_pfn)
266 {
267         int nr_pages = PAGES_PER_SECTION;
268         int ret;
269
270         if (pfn_valid(phys_start_pfn))
271                 return -EEXIST;
272
273         ret = sparse_add_one_section(zone, phys_start_pfn, nr_pages);
274
275         if (ret < 0)
276                 return ret;
277
278         ret = __add_zone(zone, phys_start_pfn);
279
280         if (ret < 0)
281                 return ret;
282
283         return register_new_memory(nid, __pfn_to_section(phys_start_pfn));
284 }
285
286 #ifdef CONFIG_SPARSEMEM_VMEMMAP
287 static int __remove_section(struct zone *zone, struct mem_section *ms)
288 {
289         /*
290          * XXX: Freeing memmap with vmemmap is not implemented yet.
291          *      This should be removed later.
292          */
293         return -EBUSY;
294 }
295 #else
296 static int __remove_section(struct zone *zone, struct mem_section *ms)
297 {
298         unsigned long flags;
299         struct pglist_data *pgdat = zone->zone_pgdat;
300         int ret = -EINVAL;
301
302         if (!valid_section(ms))
303                 return ret;
304
305         ret = unregister_memory_section(ms);
306         if (ret)
307                 return ret;
308
309         pgdat_resize_lock(pgdat, &flags);
310         sparse_remove_one_section(zone, ms);
311         pgdat_resize_unlock(pgdat, &flags);
312         return 0;
313 }
314 #endif
315
316 /*
317  * Reasonably generic function for adding memory.  It is
318  * expected that archs that support memory hotplug will
319  * call this function after deciding the zone to which to
320  * add the new pages.
321  */
322 int __ref __add_pages(int nid, struct zone *zone, unsigned long phys_start_pfn,
323                         unsigned long nr_pages)
324 {
325         unsigned long i;
326         int err = 0;
327         int start_sec, end_sec;
328         /* during mem_map initialization, align the hot-added range to a section */
329         start_sec = pfn_to_section_nr(phys_start_pfn);
330         end_sec = pfn_to_section_nr(phys_start_pfn + nr_pages - 1);
331
332         for (i = start_sec; i <= end_sec; i++) {
333                 err = __add_section(nid, zone, i << PFN_SECTION_SHIFT);
334
335                 /*
336                  * EEXIST is finally dealt with by the ioresource collision
337                  * check; see add_memory() => register_memory_resource().
338                  * A warning will be printed if there is a collision.
339                  */
340                 if (err && (err != -EEXIST))
341                         break;
342                 err = 0;
343         }
344
345         return err;
346 }
347 EXPORT_SYMBOL_GPL(__add_pages);
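
/*
 * Illustrative sketch, loosely modelled on the x86_64 arch_add_memory():
 * the architecture sets up the kernel mapping for the new range (arch
 * specific, elided here), picks a zone and then hands the range to
 * __add_pages().  Treat it as an example, not as any particular arch's
 * actual implementation.
 *
 *	int arch_add_memory(int nid, u64 start, u64 size)
 *	{
 *		struct pglist_data *pgdat = NODE_DATA(nid);
 *		struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
 *		unsigned long start_pfn = start >> PAGE_SHIFT;
 *		unsigned long nr_pages = size >> PAGE_SHIFT;
 *
 *		// arch specific: map [start, start + size) into the direct mapping
 *
 *		return __add_pages(nid, zone, start_pfn, nr_pages);
 *	}
 */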
348
349 /**
350  * __remove_pages() - remove sections of pages from a zone
351  * @zone: zone from which pages need to be removed
352  * @phys_start_pfn: starting pageframe (must be aligned to start of a section)
353  * @nr_pages: number of pages to remove (must be multiple of section size)
354  *
355  * Generic helper function to remove section mappings and sysfs entries
356  * for the section of the memory we are removing. Caller needs to make
357  * sure that pages are marked reserved and zones are adjusted properly by
358  * calling offline_pages().
359  */
360 int __remove_pages(struct zone *zone, unsigned long phys_start_pfn,
361                  unsigned long nr_pages)
362 {
363         unsigned long i, ret = 0;
364         int sections_to_remove;
365
366         /*
367          * We can only remove entire sections
368          */
369         BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK);
370         BUG_ON(nr_pages % PAGES_PER_SECTION);
371
372         release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE);
373
374         sections_to_remove = nr_pages / PAGES_PER_SECTION;
375         for (i = 0; i < sections_to_remove; i++) {
376                 unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION;
377                 ret = __remove_section(zone, __pfn_to_section(pfn));
378                 if (ret)
379                         break;
380         }
381         return ret;
382 }
383 EXPORT_SYMBOL_GPL(__remove_pages);
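
/*
 * Illustrative call order only, following the kerneldoc above: the caller
 * offlines the range first, then tears down the section mappings.  Error
 * handling is elided and "zone" is assumed to be the zone the pages were
 * offlined from.
 *
 *	ret = offline_pages(start_pfn, nr_pages);
 *	if (!ret)
 *		ret = __remove_pages(zone, start_pfn, nr_pages);
 */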
384
385 int set_online_page_callback(online_page_callback_t callback)
386 {
387         int rc = -EINVAL;
388
389         lock_memory_hotplug();
390
391         if (online_page_callback == generic_online_page) {
392                 online_page_callback = callback;
393                 rc = 0;
394         }
395
396         unlock_memory_hotplug();
397
398         return rc;
399 }
400 EXPORT_SYMBOL_GPL(set_online_page_callback);
401
402 int restore_online_page_callback(online_page_callback_t callback)
403 {
404         int rc = -EINVAL;
405
406         lock_memory_hotplug();
407
408         if (online_page_callback == callback) {
409                 online_page_callback = generic_online_page;
410                 rc = 0;
411         }
412
413         unlock_memory_hotplug();
414
415         return rc;
416 }
417 EXPORT_SYMBOL_GPL(restore_online_page_callback);
418
419 void __online_page_set_limits(struct page *page)
420 {
421         unsigned long pfn = page_to_pfn(page);
422
423         if (pfn >= num_physpages)
424                 num_physpages = pfn + 1;
425 }
426 EXPORT_SYMBOL_GPL(__online_page_set_limits);
427
428 void __online_page_increment_counters(struct page *page)
429 {
430         totalram_pages++;
431
432 #ifdef CONFIG_HIGHMEM
433         if (PageHighMem(page))
434                 totalhigh_pages++;
435 #endif
436 }
437 EXPORT_SYMBOL_GPL(__online_page_increment_counters);
438
439 void __online_page_free(struct page *page)
440 {
441         ClearPageReserved(page);
442         init_page_count(page);
443         __free_page(page);
444 }
445 EXPORT_SYMBOL_GPL(__online_page_free);
446
447 static void generic_online_page(struct page *page)
448 {
449         __online_page_set_limits(page);
450         __online_page_increment_counters(page);
451         __online_page_free(page);
452 }
453
454 static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
455                         void *arg)
456 {
457         unsigned long i;
458         unsigned long onlined_pages = *(unsigned long *)arg;
459         struct page *page;
460         if (PageReserved(pfn_to_page(start_pfn)))
461                 for (i = 0; i < nr_pages; i++) {
462                         page = pfn_to_page(start_pfn + i);
463                         (*online_page_callback)(page);
464                         onlined_pages++;
465                 }
466         *(unsigned long *)arg = onlined_pages;
467         return 0;
468 }
469
470
471 int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
472 {
473         unsigned long onlined_pages = 0;
474         struct zone *zone;
475         int need_zonelists_rebuild = 0;
476         int nid;
477         int ret;
478         struct memory_notify arg;
479
480         lock_memory_hotplug();
481         arg.start_pfn = pfn;
482         arg.nr_pages = nr_pages;
483         arg.status_change_nid = -1;
484
485         nid = page_to_nid(pfn_to_page(pfn));
486         if (node_present_pages(nid) == 0)
487                 arg.status_change_nid = nid;
488
489         ret = memory_notify(MEM_GOING_ONLINE, &arg);
490         ret = notifier_to_errno(ret);
491         if (ret) {
492                 memory_notify(MEM_CANCEL_ONLINE, &arg);
493                 unlock_memory_hotplug();
494                 return ret;
495         }
496         /*
497          * This doesn't need a lock to do pfn_to_page().
498          * The section can't be removed here because of the
499          * memory_block->state_mutex.
500          */
501         zone = page_zone(pfn_to_page(pfn));
502         /*
503          * If this zone is not populated, it is not in the zonelist and
504          * the page allocator ignores it.  So the zonelist must be
505          * rebuilt after onlining.
506          */
507         mutex_lock(&zonelists_mutex);
508         if (!populated_zone(zone))
509                 need_zonelists_rebuild = 1;
510
511         ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
512                 online_pages_range);
513         if (ret) {
514                 mutex_unlock(&zonelists_mutex);
515                 printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
516                        (unsigned long long) pfn << PAGE_SHIFT,
517                        (((unsigned long long) pfn + nr_pages)
518                             << PAGE_SHIFT) - 1);
519                 memory_notify(MEM_CANCEL_ONLINE, &arg);
520                 unlock_memory_hotplug();
521                 return ret;
522         }
523
524         zone->present_pages += onlined_pages;
525         zone->zone_pgdat->node_present_pages += onlined_pages;
526         if (onlined_pages) {
527                 node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
528                 if (need_zonelists_rebuild)
529                         build_all_zonelists(NULL, zone);
530                 else
531                         zone_pcp_update(zone);
532         }
533
534         mutex_unlock(&zonelists_mutex);
535
536         init_per_zone_wmark_min();
537
538         if (onlined_pages)
539                 kswapd_run(zone_to_nid(zone));
540
541         vm_total_pages = nr_free_pagecache_pages();
542
543         writeback_set_ratelimit();
544
545         if (onlined_pages)
546                 memory_notify(MEM_ONLINE, &arg);
547         unlock_memory_hotplug();
548
549         return 0;
550 }
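
/*
 * Note: online_pages() is normally reached from the memory sysfs interface,
 * e.g. writing "online" to /sys/devices/system/memory/memoryN/state, which
 * calls it for the memory block's pfn range (see drivers/base/memory.c).
 * Mentioned here for orientation only; the exact call chain lives outside
 * this file.
 */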
551 #endif /* CONFIG_MEMORY_HOTPLUG_SPARSE */
552
553 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
554 static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
555 {
556         struct pglist_data *pgdat;
557         unsigned long zones_size[MAX_NR_ZONES] = {0};
558         unsigned long zholes_size[MAX_NR_ZONES] = {0};
559         unsigned long start_pfn = start >> PAGE_SHIFT;
560
561         pgdat = arch_alloc_nodedata(nid);
562         if (!pgdat)
563                 return NULL;
564
565         arch_refresh_nodedata(nid, pgdat);
566
567         /* we can use NODE_DATA(nid) from here */
568
569         /* init node's zones as empty zones, we don't have any present pages.*/
570         free_area_init_node(nid, zones_size, start_pfn, zholes_size);
571
572         /*
573          * The node we allocated has no zone fallback lists. To avoid
574          * accessing an uninitialized zonelist, build it here.
575          */
576         mutex_lock(&zonelists_mutex);
577         build_all_zonelists(pgdat, NULL);
578         mutex_unlock(&zonelists_mutex);
579
580         return pgdat;
581 }
582
583 static void rollback_node_hotadd(int nid, pg_data_t *pgdat)
584 {
585         arch_refresh_nodedata(nid, NULL);
586         arch_free_nodedata(pgdat);
587         return;
588 }
589
590
591 /*
592  * called by cpu_up() to online a node without onlined memory.
593  */
594 int mem_online_node(int nid)
595 {
596         pg_data_t       *pgdat;
597         int     ret;
598
599         lock_memory_hotplug();
600         pgdat = hotadd_new_pgdat(nid, 0);
601         if (!pgdat) {
602                 ret = -ENOMEM;
603                 goto out;
604         }
605         node_set_online(nid);
606         ret = register_one_node(nid);
607         BUG_ON(ret);
608
609 out:
610         unlock_memory_hotplug();
611         return ret;
612 }
613
614 /* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
615 int __ref add_memory(int nid, u64 start, u64 size)
616 {
617         pg_data_t *pgdat = NULL;
618         int new_pgdat = 0;
619         struct resource *res;
620         int ret;
621
622         lock_memory_hotplug();
623
624         res = register_memory_resource(start, size);
625         ret = -EEXIST;
626         if (!res)
627                 goto out;
628
629         if (!node_online(nid)) {
630                 pgdat = hotadd_new_pgdat(nid, start);
631                 ret = -ENOMEM;
632                 if (!pgdat)
633                         goto error;
634                 new_pgdat = 1;
635         }
636
637         /* call arch's memory hotadd */
638         ret = arch_add_memory(nid, start, size);
639
640         if (ret < 0)
641                 goto error;
642
643         /* We online the node here; we can't roll back from this point. */
644         node_set_online(nid);
645
646         if (new_pgdat) {
647                 ret = register_one_node(nid);
648                 /*
649                  * If the sysfs file for the new node can't be created, CPUs
650                  * on the node can't be hot-added. There is no way to roll
651                  * back now, so catch the failure with BUG_ON(), reluctantly.
652                  */
653                 BUG_ON(ret);
654         }
655
656         /* create new memmap entry */
657         firmware_map_add_hotplug(start, start + size, "System RAM");
658
659         goto out;
660
661 error:
662         /* rollback pgdat allocation and others */
663         if (new_pgdat)
664                 rollback_node_hotadd(nid, pgdat);
665         if (res)
666                 release_memory_resource(res);
667
668 out:
669         unlock_memory_hotplug();
670         return ret;
671 }
672 EXPORT_SYMBOL_GPL(add_memory);
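
/*
 * Illustrative sketch only: a platform driver that discovers a new memory
 * device (for example via ACPI) would typically resolve the node for the
 * physical range and call add_memory().  "start" and "size" stand for
 * whatever the firmware reported; memory_add_physaddr_to_nid() falls back
 * to node 0 on !NUMA configurations.
 *
 *	int nid = memory_add_physaddr_to_nid(start);
 *	int ret = add_memory(nid, start, size);
 *	if (ret)
 *		pr_err("add_memory() failed: %d\n", ret);
 */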
673
674 #ifdef CONFIG_MEMORY_HOTREMOVE
675 /*
676  * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
677  * set and the size of the free page is given by page_order(). Using this,
678  * the function determines if the pageblock contains only free pages.
679  * Due to buddy constraints, a free page at least the size of a pageblock will
680  * be located at the start of the pageblock.
681  */
682 static inline int pageblock_free(struct page *page)
683 {
684         return PageBuddy(page) && page_order(page) >= pageblock_order;
685 }
686
687 /* Return the start of the next active pageblock after a given page */
688 static struct page *next_active_pageblock(struct page *page)
689 {
690         /* Ensure the starting page is pageblock-aligned */
691         BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
692
693         /* If the entire pageblock is free, move to the end of free page */
694         if (pageblock_free(page)) {
695                 int order;
696                 /* Be careful: we don't hold locks, so page_order() can change. */
697                 order = page_order(page);
698                 if ((order < MAX_ORDER) && (order >= pageblock_order))
699                         return page + (1 << order);
700         }
701
702         return page + pageblock_nr_pages;
703 }
704
705 /* Checks if this range of memory is likely to be hot-removable. */
706 int is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
707 {
708         struct page *page = pfn_to_page(start_pfn);
709         struct page *end_page = page + nr_pages;
710
711         /* Check the starting page of each pageblock within the range */
712         for (; page < end_page; page = next_active_pageblock(page)) {
713                 if (!is_pageblock_removable_nolock(page))
714                         return 0;
715                 cond_resched();
716         }
717
718         /* All pageblocks in the memory block are likely to be hot-removable */
719         return 1;
720 }
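
/*
 * Note: this heuristic backs the per-memory-block "removable" attribute in
 * sysfs (/sys/devices/system/memory/memoryN/removable).  "Likely" is the
 * operative word: a block that looks removable here can still fail to
 * offline later.
 */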
721
722 /*
723  * Confirm that all pages in the range [start_pfn, end_pfn) belong to the same zone.
724  */
725 static int test_pages_in_a_zone(unsigned long start_pfn, unsigned long end_pfn)
726 {
727         unsigned long pfn;
728         struct zone *zone = NULL;
729         struct page *page;
730         int i;
731         for (pfn = start_pfn;
732              pfn < end_pfn;
733              pfn += MAX_ORDER_NR_PAGES) {
734                 i = 0;
735                 /* This is just a CONFIG_HOLES_IN_ZONE check.*/
736                 while ((i < MAX_ORDER_NR_PAGES) && !pfn_valid_within(pfn + i))
737                         i++;
738                 if (i == MAX_ORDER_NR_PAGES)
739                         continue;
740                 page = pfn_to_page(pfn + i);
741                 if (zone && page_zone(page) != zone)
742                         return 0;
743                 zone = page_zone(page);
744         }
745         return 1;
746 }
747
748 /*
749  * Scanning pfns is much easier than scanning the LRU list.
750  * Scan pfns from start to end and return the pfn of the first LRU page found.
751  */
752 static unsigned long scan_lru_pages(unsigned long start, unsigned long end)
753 {
754         unsigned long pfn;
755         struct page *page;
756         for (pfn = start; pfn < end; pfn++) {
757                 if (pfn_valid(pfn)) {
758                         page = pfn_to_page(pfn);
759                         if (PageLRU(page))
760                                 return pfn;
761                 }
762         }
763         return 0;
764 }
765
766 #define NR_OFFLINE_AT_ONCE_PAGES        (256)
767 static int
768 do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
769 {
770         unsigned long pfn;
771         struct page *page;
772         int move_pages = NR_OFFLINE_AT_ONCE_PAGES;
773         int not_managed = 0;
774         int ret = 0;
775         LIST_HEAD(source);
776
777         for (pfn = start_pfn; pfn < end_pfn && move_pages > 0; pfn++) {
778                 if (!pfn_valid(pfn))
779                         continue;
780                 page = pfn_to_page(pfn);
781                 if (!get_page_unless_zero(page))
782                         continue;
783                 /*
784                  * We can skip free pages. And we can only deal with pages on
785                  * LRU.
786                  */
787                 ret = isolate_lru_page(page);
788                 if (!ret) { /* Success */
789                         put_page(page);
790                         list_add_tail(&page->lru, &source);
791                         move_pages--;
792                         inc_zone_page_state(page, NR_ISOLATED_ANON +
793                                             page_is_file_cache(page));
794
795                 } else {
796 #ifdef CONFIG_DEBUG_VM
797                         printk(KERN_ALERT "removing pfn %lx from LRU failed\n",
798                                pfn);
799                         dump_page(page);
800 #endif
801                         put_page(page);
802                         /* Because we don't hold the big zone->lock, we should
803                            check this again here. */
804                         if (page_count(page)) {
805                                 not_managed++;
806                                 ret = -EBUSY;
807                                 break;
808                         }
809                 }
810         }
811         if (!list_empty(&source)) {
812                 if (not_managed) {
813                         putback_lru_pages(&source);
814                         goto out;
815                 }
816
817                 /*
818                  * alloc_migrate_target() should be improved!
819                  * migrate_pages() returns the number of failed pages.
820                  */
821                 ret = migrate_pages(&source, alloc_migrate_target, 0,
822                                                         true, MIGRATE_SYNC);
823                 if (ret)
824                         putback_lru_pages(&source);
825         }
826 out:
827         return ret;
828 }
829
830 /*
831  * remove from free_area[] and mark all as Reserved.
832  */
833 static int
834 offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
835                         void *data)
836 {
837         __offline_isolated_pages(start, start + nr_pages);
838         return 0;
839 }
840
841 static void
842 offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
843 {
844         walk_system_ram_range(start_pfn, end_pfn - start_pfn, NULL,
845                                 offline_isolated_pages_cb);
846 }
847
848 /*
849  * Check that all pages in the range, recorded as a memory resource, are isolated.
850  */
851 static int
852 check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
853                         void *data)
854 {
855         int ret;
856         long offlined = *(long *)data;
857         ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
858         offlined = nr_pages;
859         if (!ret)
860                 *(long *)data += offlined;
861         return ret;
862 }
863
864 static long
865 check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
866 {
867         long offlined = 0;
868         int ret;
869
870         ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn, &offlined,
871                         check_pages_isolated_cb);
872         if (ret < 0)
873                 offlined = (long)ret;
874         return offlined;
875 }
876
877 static int __ref __offline_pages(unsigned long start_pfn,
878                   unsigned long end_pfn, unsigned long timeout)
879 {
880         unsigned long pfn, nr_pages, expire;
881         long offlined_pages;
882         int ret, drain, retry_max, node;
883         struct zone *zone;
884         struct memory_notify arg;
885
886         BUG_ON(start_pfn >= end_pfn);
887         /* At least pageblock alignment is necessary */
888         if (!IS_ALIGNED(start_pfn, pageblock_nr_pages))
889                 return -EINVAL;
890         if (!IS_ALIGNED(end_pfn, pageblock_nr_pages))
891                 return -EINVAL;
892         /* This makes hotplug much easier (and more readable);
893            we assume this for now. */
894         if (!test_pages_in_a_zone(start_pfn, end_pfn))
895                 return -EINVAL;
896
897         lock_memory_hotplug();
898
899         zone = page_zone(pfn_to_page(start_pfn));
900         node = zone_to_nid(zone);
901         nr_pages = end_pfn - start_pfn;
902
903         /* set above range as isolated */
904         ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
905         if (ret)
906                 goto out;
907
908         arg.start_pfn = start_pfn;
909         arg.nr_pages = nr_pages;
910         arg.status_change_nid = -1;
911         if (nr_pages >= node_present_pages(node))
912                 arg.status_change_nid = node;
913
914         ret = memory_notify(MEM_GOING_OFFLINE, &arg);
915         ret = notifier_to_errno(ret);
916         if (ret)
917                 goto failed_removal;
918
919         pfn = start_pfn;
920         expire = jiffies + timeout;
921         drain = 0;
922         retry_max = 5;
923 repeat:
924         /* start memory hot removal */
925         ret = -EAGAIN;
926         if (time_after(jiffies, expire))
927                 goto failed_removal;
928         ret = -EINTR;
929         if (signal_pending(current))
930                 goto failed_removal;
931         ret = 0;
932         if (drain) {
933                 lru_add_drain_all();
934                 cond_resched();
935                 drain_all_pages();
936         }
937
938         pfn = scan_lru_pages(start_pfn, end_pfn);
939         if (pfn) { /* We have page on LRU */
940                 ret = do_migrate_range(pfn, end_pfn);
941                 if (!ret) {
942                         drain = 1;
943                         goto repeat;
944                 } else {
945                         if (ret < 0)
946                                 if (--retry_max == 0)
947                                         goto failed_removal;
948                         yield();
949                         drain = 1;
950                         goto repeat;
951                 }
952         }
953         /* drain each zone's LRU pagevecs; this is asynchronous... */
954         lru_add_drain_all();
955         yield();
956         /* drain pcp pages; this is synchronous. */
957         drain_all_pages();
958         /* check again */
959         offlined_pages = check_pages_isolated(start_pfn, end_pfn);
960         if (offlined_pages < 0) {
961                 ret = -EBUSY;
962                 goto failed_removal;
963         }
964         printk(KERN_INFO "Offlined Pages %ld\n", offlined_pages);
965         /* OK, all of our target range is isolated.
966            We cannot roll back from this point. */
967         offline_isolated_pages(start_pfn, end_pfn);
968         /* reset pageblock flags and make the migratetype MOVABLE */
969         undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
970         /* removal success */
971         zone->present_pages -= offlined_pages;
972         zone->zone_pgdat->node_present_pages -= offlined_pages;
973         totalram_pages -= offlined_pages;
974
975         init_per_zone_wmark_min();
976
977         if (!populated_zone(zone)) {
978                 zone_pcp_reset(zone);
979                 mutex_lock(&zonelists_mutex);
980                 build_all_zonelists(NULL, NULL);
981                 mutex_unlock(&zonelists_mutex);
982         } else
983                 zone_pcp_update(zone);
984
985         if (!node_present_pages(node)) {
986                 node_clear_state(node, N_HIGH_MEMORY);
987                 kswapd_stop(node);
988         }
989
990         vm_total_pages = nr_free_pagecache_pages();
991         writeback_set_ratelimit();
992
993         memory_notify(MEM_OFFLINE, &arg);
994         unlock_memory_hotplug();
995         return 0;
996
997 failed_removal:
998         printk(KERN_INFO "memory offlining [mem %#010llx-%#010llx] failed\n",
999                (unsigned long long) start_pfn << PAGE_SHIFT,
1000                ((unsigned long long) end_pfn << PAGE_SHIFT) - 1);
1001         memory_notify(MEM_CANCEL_OFFLINE, &arg);
1002         /* pushback to free area */
1003         undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
1004
1005 out:
1006         unlock_memory_hotplug();
1007         return ret;
1008 }
1009
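/*
 * Note: offline_pages() is typically reached via the memory-block offline
 * path in drivers/base/memory.c (e.g. writing "offline" to a memory block's
 * sysfs "state" file).  The 120 * HZ timeout below gives the migration loop
 * roughly two minutes before it gives up.
 */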
1010 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1011 {
1012         return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ);
1013 }
1014
1015 int remove_memory(u64 start, u64 size)
1016 {
1017         struct memory_block *mem = NULL;
1018         struct mem_section *section;
1019         unsigned long start_pfn, end_pfn;
1020         unsigned long pfn, section_nr;
1021         int ret;
1022
1023         start_pfn = PFN_DOWN(start);
1024         end_pfn = start_pfn + PFN_DOWN(size);
1025
1026         for (pfn = start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
1027                 section_nr = pfn_to_section_nr(pfn);
1028                 if (!present_section_nr(section_nr))
1029                         continue;
1030
1031                 section = __nr_to_section(section_nr);
1032                 /* same memblock? */
1033                 if (mem)
1034                         if ((section_nr >= mem->start_section_nr) &&
1035                             (section_nr <= mem->end_section_nr))
1036                                 continue;
1037
1038                 mem = find_memory_block_hinted(section, mem);
1039                 if (!mem)
1040                         continue;
1041
1042                 ret = offline_memory_block(mem);
1043                 if (ret) {
1044                         kobject_put(&mem->dev.kobj);
1045                         return ret;
1046                 }
1047         }
1048
1049         if (mem)
1050                 kobject_put(&mem->dev.kobj);
1051
1052         return 0;
1053 }
1054 #else
1055 int offline_pages(unsigned long start_pfn, unsigned long nr_pages)
1056 {
1057         return -EINVAL;
1058 }
1059 int remove_memory(u64 start, u64 size)
1060 {
1061         return -EINVAL;
1062 }
1063 #endif /* CONFIG_MEMORY_HOTREMOVE */
1064 EXPORT_SYMBOL_GPL(remove_memory);