]> rtime.felk.cvut.cz Git - can-eth-gw-linux.git/blobdiff - mm/memory_hotplug.c
hotplug: update nodemasks management
[can-eth-gw-linux.git] / mm / memory_hotplug.c
index e4eeacae2b91199142776390bdaef73241416b47..eca4aac1a83bbc0a25a2b3de2e3312bcce0f7323 100644 (file)
@@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
        zone_span_writelock(zone);
 
        old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages;
-       if (start_pfn < zone->zone_start_pfn)
+       if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn)
                zone->zone_start_pfn = start_pfn;
 
        zone->spanned_pages = max(old_zone_end_pfn, end_pfn) -
@@ -214,13 +214,134 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn,
        zone_span_writeunlock(zone);
 }
 
+static void resize_zone(struct zone *zone, unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       zone_span_writelock(zone);
+
+       if (end_pfn - start_pfn) {
+               zone->zone_start_pfn = start_pfn;
+               zone->spanned_pages = end_pfn - start_pfn;
+       } else {
+               /*
+        * be consistent with free_area_init_core():
+        * if spanned_pages == 0, then keep start_pfn = 0
+                */
+               zone->zone_start_pfn = 0;
+               zone->spanned_pages = 0;
+       }
+
+       zone_span_writeunlock(zone);
+}
+
+static void fix_zone_id(struct zone *zone, unsigned long start_pfn,
+               unsigned long end_pfn)
+{
+       enum zone_type zid = zone_idx(zone);
+       int nid = zone->zone_pgdat->node_id;
+       unsigned long pfn;
+
+       for (pfn = start_pfn; pfn < end_pfn; pfn++)
+               set_page_links(pfn_to_page(pfn), zid, nid, pfn);
+}
+
+static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       int ret;
+       unsigned long flags;
+       unsigned long z1_start_pfn;
+
+       if (!z1->wait_table) {
+               ret = init_currently_empty_zone(z1, start_pfn,
+                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
+               if (ret)
+                       return ret;
+       }
+
+       pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+       /* can't move pfns which are higher than @z2 */
+       if (end_pfn > z2->zone_start_pfn + z2->spanned_pages)
+               goto out_fail;
+       /* the moved-out part must be at the leftmost of @z2 */
+       if (start_pfn > z2->zone_start_pfn)
+               goto out_fail;
+       /* the range must overlap @z2 */
+       if (end_pfn <= z2->zone_start_pfn)
+               goto out_fail;
+
+       /* use start_pfn for z1's start_pfn if z1 is empty */
+       if (z1->spanned_pages)
+               z1_start_pfn = z1->zone_start_pfn;
+       else
+               z1_start_pfn = start_pfn;
+
+       resize_zone(z1, z1_start_pfn, end_pfn);
+       resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages);
+
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+       fix_zone_id(z1, start_pfn, end_pfn);
+
+       return 0;
+out_fail:
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+       return -1;
+}
+
+static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2,
+               unsigned long start_pfn, unsigned long end_pfn)
+{
+       int ret;
+       unsigned long flags;
+       unsigned long z2_end_pfn;
+
+       if (!z2->wait_table) {
+               ret = init_currently_empty_zone(z2, start_pfn,
+                       end_pfn - start_pfn, MEMMAP_HOTPLUG);
+               if (ret)
+                       return ret;
+       }
+
+       pgdat_resize_lock(z1->zone_pgdat, &flags);
+
+       /* can't move pfns which are lower than @z1 */
+       if (z1->zone_start_pfn > start_pfn)
+               goto out_fail;
+       /* the moved-out part must be at the rightmost of @z1 */
+       if (z1->zone_start_pfn + z1->spanned_pages >  end_pfn)
+               goto out_fail;
+       /* the range must overlap @z1 */
+       if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages)
+               goto out_fail;
+
+       /* use end_pfn for z2's end_pfn if z2 is empty */
+       if (z2->spanned_pages)
+               z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages;
+       else
+               z2_end_pfn = end_pfn;
+
+       resize_zone(z1, z1->zone_start_pfn, start_pfn);
+       resize_zone(z2, start_pfn, z2_end_pfn);
+
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+
+       fix_zone_id(z2, start_pfn, end_pfn);
+
+       return 0;
+out_fail:
+       pgdat_resize_unlock(z1->zone_pgdat, &flags);
+       return -1;
+}
+
 static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn,
                            unsigned long end_pfn)
 {
        unsigned long old_pgdat_end_pfn =
                pgdat->node_start_pfn + pgdat->node_spanned_pages;
 
-       if (start_pfn < pgdat->node_start_pfn)
+       if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn)
                pgdat->node_start_pfn = start_pfn;
 
        pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) -
@@ -460,8 +581,88 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages,
        return 0;
 }
 
+/* ensure every online node has NORMAL memory */
+static bool can_online_high_movable(struct zone *zone)
+{
+       return node_state(zone_to_nid(zone), N_NORMAL_MEMORY);
+}
+
+/* check which state of node_states will be changed when online memory */
+static void node_states_check_changes_online(unsigned long nr_pages,
+       struct zone *zone, struct memory_notify *arg)
+{
+       int nid = zone_to_nid(zone);
+       enum zone_type zone_last = ZONE_NORMAL;
+
+       /*
+        * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+        * contains nodes which have zones of 0...ZONE_NORMAL,
+        * set zone_last to ZONE_NORMAL.
+        *
+        * If we don't have HIGHMEM nor movable node,
+        * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+        * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+        */
+       if (N_MEMORY == N_NORMAL_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       /*
+        * if the memory to be online is in a zone of 0...zone_last, and
+        * the zones of 0...zone_last don't have memory before online, we will
+        * need to set the node to node_states[N_NORMAL_MEMORY] after
+        * the memory is online.
+        */
+       if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY))
+               arg->status_change_nid_normal = nid;
+       else
+               arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+       /*
+        * If we have movable node, node_states[N_HIGH_MEMORY]
+        * contains nodes which have zones of 0...ZONE_HIGHMEM,
+        * set zone_last to ZONE_HIGHMEM.
+        *
+        * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+        * contains nodes which have zones of 0...ZONE_MOVABLE,
+        * set zone_last to ZONE_MOVABLE.
+        */
+       zone_last = ZONE_HIGHMEM;
+       if (N_MEMORY == N_HIGH_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       if (zone_idx(zone) <= zone_last && !node_state(nid, N_HIGH_MEMORY))
+               arg->status_change_nid_high = nid;
+       else
+               arg->status_change_nid_high = -1;
+#else
+       arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+       /*
+        * if the node doesn't have memory before online, we will need to
+        * set the node to node_states[N_MEMORY] after the memory
+        * is online.
+        */
+       if (!node_state(nid, N_MEMORY))
+               arg->status_change_nid = nid;
+       else
+               arg->status_change_nid = -1;
+}
+
+static void node_states_set_node(int node, struct memory_notify *arg)
+{
+       if (arg->status_change_nid_normal >= 0)
+               node_set_state(node, N_NORMAL_MEMORY);
+
+       if (arg->status_change_nid_high >= 0)
+               node_set_state(node, N_HIGH_MEMORY);
+
+       node_set_state(node, N_MEMORY);
+}
+
 
-int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
+int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type)
 {
        unsigned long onlined_pages = 0;
        struct zone *zone;
@@ -471,13 +672,40 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
        struct memory_notify arg;
 
        lock_memory_hotplug();
+       /*
+        * This doesn't need a lock to do pfn_to_page().
+        * The section can't be removed here because of the
+        * memory_block->state_mutex.
+        */
+       zone = page_zone(pfn_to_page(pfn));
+
+       if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
+           !can_online_high_movable(zone)) {
+               unlock_memory_hotplug();
+               return -1;
+       }
+
+       if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
+               if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
+                       unlock_memory_hotplug();
+                       return -1;
+               }
+       }
+       if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
+               if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
+                       unlock_memory_hotplug();
+                       return -1;
+               }
+       }
+
+       /* Previous code may have changed the zone of the pfn range */
+       zone = page_zone(pfn_to_page(pfn));
+
        arg.start_pfn = pfn;
        arg.nr_pages = nr_pages;
-       arg.status_change_nid = -1;
+       node_states_check_changes_online(nr_pages, zone, &arg);
 
        nid = page_to_nid(pfn_to_page(pfn));
-       if (node_present_pages(nid) == 0)
-               arg.status_change_nid = nid;
 
        ret = memory_notify(MEM_GOING_ONLINE, &arg);
        ret = notifier_to_errno(ret);
@@ -486,24 +714,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
                unlock_memory_hotplug();
                return ret;
        }
-       /*
-        * This doesn't need a lock to do pfn_to_page().
-        * The section can't be removed here because of the
-        * memory_block->state_mutex.
-        */
-       zone = page_zone(pfn_to_page(pfn));
        /*
         * If this zone is not populated, then it is not in zonelist.
         * This means the page allocator ignores this zone.
         * So, zonelist must be updated after online.
         */
        mutex_lock(&zonelists_mutex);
-       if (!populated_zone(zone))
+       if (!populated_zone(zone)) {
                need_zonelists_rebuild = 1;
+               build_all_zonelists(NULL, zone);
+       }
 
        ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages,
                online_pages_range);
        if (ret) {
+               if (need_zonelists_rebuild)
+                       zone_pcp_reset(zone);
                mutex_unlock(&zonelists_mutex);
                printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n",
                       (unsigned long long) pfn << PAGE_SHIFT,
@@ -517,9 +743,9 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages)
        zone->present_pages += onlined_pages;
        zone->zone_pgdat->node_present_pages += onlined_pages;
        if (onlined_pages) {
-               node_set_state(zone_to_nid(zone), N_HIGH_MEMORY);
+               node_states_set_node(zone_to_nid(zone), &arg);
                if (need_zonelists_rebuild)
-                       build_all_zonelists(NULL, zone);
+                       build_all_zonelists(NULL, NULL);
                else
                        zone_pcp_update(zone);
        }
@@ -847,7 +1073,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
 {
        int ret;
        long offlined = *(long *)data;
-       ret = test_pages_isolated(start_pfn, start_pfn + nr_pages);
+       ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true);
        offlined = nr_pages;
        if (!ret)
                *(long *)data += offlined;
@@ -867,6 +1093,121 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn)
        return offlined;
 }
 
+/* ensure the node has NORMAL memory if it is still online */
+static bool can_offline_normal(struct zone *zone, unsigned long nr_pages)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       unsigned long present_pages = 0;
+       enum zone_type zt;
+
+       for (zt = 0; zt <= ZONE_NORMAL; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+
+       if (present_pages > nr_pages)
+               return true;
+
+       present_pages = 0;
+       for (; zt <= ZONE_MOVABLE; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+
+       /*
+        * we can't offline the last normal memory until all
+        * higher memory is offlined.
+        */
+       return present_pages == 0;
+}
+
+/* check which state of node_states will be changed when offline memory */
+static void node_states_check_changes_offline(unsigned long nr_pages,
+               struct zone *zone, struct memory_notify *arg)
+{
+       struct pglist_data *pgdat = zone->zone_pgdat;
+       unsigned long present_pages = 0;
+       enum zone_type zt, zone_last = ZONE_NORMAL;
+
+       /*
+        * If we have HIGHMEM or movable node, node_states[N_NORMAL_MEMORY]
+        * contains nodes which have zones of 0...ZONE_NORMAL,
+        * set zone_last to ZONE_NORMAL.
+        *
+        * If we don't have HIGHMEM nor movable node,
+        * node_states[N_NORMAL_MEMORY] contains nodes which have zones of
+        * 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE.
+        */
+       if (N_MEMORY == N_NORMAL_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       /*
+        * check whether node_states[N_NORMAL_MEMORY] will be changed.
+        * If the memory to be offline is in a zone of 0...zone_last,
+        * and it is the last present memory, 0...zone_last will
+        * become empty after offline, thus we can determine we will
+        * need to clear the node from node_states[N_NORMAL_MEMORY].
+        */
+       for (zt = 0; zt <= zone_last; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+       if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+               arg->status_change_nid_normal = zone_to_nid(zone);
+       else
+               arg->status_change_nid_normal = -1;
+
+#ifdef CONFIG_HIGHMEM
+       /*
+        * If we have movable node, node_states[N_HIGH_MEMORY]
+        * contains nodes which have zones of 0...ZONE_HIGHMEM,
+        * set zone_last to ZONE_HIGHMEM.
+        *
+        * If we don't have movable node, node_states[N_NORMAL_MEMORY]
+        * contains nodes which have zones of 0...ZONE_MOVABLE,
+        * set zone_last to ZONE_MOVABLE.
+        */
+       zone_last = ZONE_HIGHMEM;
+       if (N_MEMORY == N_HIGH_MEMORY)
+               zone_last = ZONE_MOVABLE;
+
+       for (; zt <= zone_last; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+       if (zone_idx(zone) <= zone_last && nr_pages >= present_pages)
+               arg->status_change_nid_high = zone_to_nid(zone);
+       else
+               arg->status_change_nid_high = -1;
+#else
+       arg->status_change_nid_high = arg->status_change_nid_normal;
+#endif
+
+       /*
+        * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE
+        */
+       zone_last = ZONE_MOVABLE;
+
+       /*
+        * check whether node_states[N_HIGH_MEMORY] will be changed
+        * If we try to offline the last present @nr_pages from the node,
+        * we can determine we will need to clear the node from
+        * node_states[N_HIGH_MEMORY].
+        */
+       for (; zt <= zone_last; zt++)
+               present_pages += pgdat->node_zones[zt].present_pages;
+       if (nr_pages >= present_pages)
+               arg->status_change_nid = zone_to_nid(zone);
+       else
+               arg->status_change_nid = -1;
+}
+
+static void node_states_clear_node(int node, struct memory_notify *arg)
+{
+       if (arg->status_change_nid_normal >= 0)
+               node_clear_state(node, N_NORMAL_MEMORY);
+
+       if ((N_MEMORY != N_NORMAL_MEMORY) &&
+           (arg->status_change_nid_high >= 0))
+               node_clear_state(node, N_HIGH_MEMORY);
+
+       if ((N_MEMORY != N_HIGH_MEMORY) &&
+           (arg->status_change_nid >= 0))
+               node_clear_state(node, N_MEMORY);
+}
+
 static int __ref __offline_pages(unsigned long start_pfn,
                  unsigned long end_pfn, unsigned long timeout)
 {
@@ -893,16 +1234,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
        node = zone_to_nid(zone);
        nr_pages = end_pfn - start_pfn;
 
+       ret = -EINVAL;
+       if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages))
+               goto out;
+
        /* set above range as isolated */
-       ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
+       ret = start_isolate_page_range(start_pfn, end_pfn,
+                                      MIGRATE_MOVABLE, true);
        if (ret)
                goto out;
 
        arg.start_pfn = start_pfn;
        arg.nr_pages = nr_pages;
-       arg.status_change_nid = -1;
-       if (nr_pages >= node_present_pages(node))
-               arg.status_change_nid = node;
+       node_states_check_changes_offline(nr_pages, zone, &arg);
 
        ret = memory_notify(MEM_GOING_OFFLINE, &arg);
        ret = notifier_to_errno(ret);
@@ -975,10 +1319,9 @@ repeat:
        } else
                zone_pcp_update(zone);
 
-       if (!node_present_pages(node)) {
-               node_clear_state(node, N_HIGH_MEMORY);
+       node_states_clear_node(node, &arg);
+       if (arg.status_change_nid >= 0)
                kswapd_stop(node);
-       }
 
        vm_total_pages = nr_free_pagecache_pages();
        writeback_set_ratelimit();