mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * Memory thresholds
  10  * Copyright (C) 2009 Nokia Corporation
  11  * Author: Kirill A. Shutemov
  12  *
  13  * Kernel Memory Controller
  14  * Copyright (C) 2012 Parallels Inc. and Google Inc.
  15  * Authors: Glauber Costa and Suleiman Souhlal
  16  *
  17  * Native page reclaim
  18  * Charge lifetime sanitation
  19  * Lockless page tracking & accounting
  20  * Unified hierarchy configuration model
  21  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
  22  *
  23  * This program is free software; you can redistribute it and/or modify
  24  * it under the terms of the GNU General Public License as published by
  25  * the Free Software Foundation; either version 2 of the License, or
  26  * (at your option) any later version.
  27  *
  28  * This program is distributed in the hope that it will be useful,
  29  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  30  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  31  * GNU General Public License for more details.
  32  */
  33
  34 #include <linux/page_counter.h>
  35 #include <linux/memcontrol.h>
  36 #include <linux/cgroup.h>
  37 #include <linux/mm.h>
  38 #include <linux/hugetlb.h>
  39 #include <linux/pagemap.h>
  40 #include <linux/smp.h>
  41 #include <linux/page-flags.h>
  42 #include <linux/backing-dev.h>
  43 #include <linux/bit_spinlock.h>
  44 #include <linux/rcupdate.h>
  45 #include <linux/limits.h>
  46 #include <linux/export.h>
  47 #include <linux/mutex.h>
  48 #include <linux/rbtree.h>
  49 #include <linux/slab.h>
  50 #include <linux/swap.h>
  51 #include <linux/swapops.h>
  52 #include <linux/spinlock.h>
  53 #include <linux/eventfd.h>
  54 #include <linux/poll.h>
  55 #include <linux/sort.h>
  56 #include <linux/fs.h>
  57 #include <linux/seq_file.h>
  58 #include <linux/vmpressure.h>
  59 #include <linux/mm_inline.h>
  60 #include <linux/swap_cgroup.h>
  61 #include <linux/cpu.h>
  62 #include <linux/oom.h>
  63 #include <linux/lockdep.h>
  64 #include <linux/file.h>
  65 #include <linux/tracehook.h>
  66 #include "internal.h"
  67 #include <net/sock.h>
  68 #include <net/ip.h>
  69 #include "slab.h"
  70 #include <linux/locallock.h>
  71
  72 #include <asm/uaccess.h>
  73
  74 #include <trace/events/vmscan.h>
  75
  /* The memory controller's cgroup subsystem object, exported via EXPORT_SYMBOL(). */
  struct cgroup_subsys memory_cgrp_subsys __read_mostly;
  EXPORT_SYMBOL(memory_cgrp_subsys);

  /*
   * The root memory cgroup.  Code in this file falls back to it whenever no
   * more specific memcg is available (NULL memcg, NULL mm, NULL hierarchy
   * root, ...).
   */
  struct mem_cgroup *root_mem_cgroup __read_mostly;

  /* NOTE(review): presumably the reclaim retry budget on charge failure; its users are not in this part of the file. */
  #define MEM_CGROUP_RECLAIM_RETRIES      5

  /* Socket memory accounting disabled? */
  static bool cgroup_memory_nosocket;

  /* Kernel memory accounting disabled? */
  static bool cgroup_memory_nokmem;

  /*
   * Whether the swap controller is active.  Without CONFIG_MEMCG_SWAP this is
   * the compile-time constant 0.
   */
  #ifdef CONFIG_MEMCG_SWAP
  int do_swap_account __read_mostly;
  #else
  #define do_swap_account         0
  #endif

  /* NOTE(review): local IRQ lock, presumably from <linux/locallock.h>; its users are not in this part of the file. */
  static DEFINE_LOCAL_IRQ_LOCK(event_lock);
  97
  98 /* Whether legacy memory+swap accounting is active */
  99 static bool do_memsw_account(void)
 100 {
 101         return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
 102 }
 103
 /*
  * Labels for the per-memcg statistics counters.
  * NOTE(review): presumably ordered to match enum mem_cgroup_stat_index -
  * confirm against the header before reordering.
  */
 static const char * const mem_cgroup_stat_names[] = {
         "cache",
         "rss",
         "rss_huge",
         "mapped_file",
         "dirty",
         "writeback",
         "swap",
 };

 /*
  * Labels for the per-memcg event counters.
  * NOTE(review): presumably ordered to match enum mem_cgroup_events_index -
  * confirm against the header before reordering.
  */
 static const char * const mem_cgroup_events_names[] = {
         "pgpgin",
         "pgpgout",
         "pgfault",
         "pgmajfault",
 };

 /*
  * Labels for the LRU lists.
  * NOTE(review): presumably ordered to match enum lru_list - confirm against
  * the header before reordering.
  */
 static const char * const mem_cgroup_lru_names[] = {
         "inactive_anon",
         "active_anon",
         "inactive_file",
         "active_file",
         "unevictable",
 };
 128
 /*
  * Distance, counted in page events, between two consecutive firings of each
  * ratelimited target; see mem_cgroup_event_ratelimit().
  */
 #define THRESHOLDS_EVENTS_TARGET 128
 #define SOFTLIMIT_EVENTS_TARGET 1024
 #define NUMAINFO_EVENTS_TARGET  1024
 132
 /*
  * Cgroups above their limits are maintained in a RB-Tree, independent of
  * their hierarchy representation
  */

 /* One soft-limit tree: an rbtree of per-node memcg entries plus its lock. */
 struct mem_cgroup_tree_per_node {
         struct rb_root rb_root;         /* entries keyed by usage_in_excess */
         spinlock_t lock;                /* taken around every tree update in this file */
 };

 /* Table of soft-limit trees, indexed by NUMA node id. */
 struct mem_cgroup_tree {
         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
 };

 /* The single, global set of soft-limit trees. */
 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
 148
 /* for OOM: one eventfd, linkable into a list */
 struct mem_cgroup_eventfd_list {
         struct list_head list;          /* list linkage; the list head is not in this part of the file */
         struct eventfd_ctx *eventfd;    /* the eventfd context this entry refers to */
 };
 154
 /*
  * cgroup_event represents an event which userspace wants to receive.
  */
 struct mem_cgroup_event {
         /*
          * memcg which the event belongs to.
          */
         struct mem_cgroup *memcg;
         /*
          * eventfd to signal userspace about the event.
          */
         struct eventfd_ctx *eventfd;
         /*
          * Each of these is stored in a list by the cgroup.
          */
         struct list_head list;
         /*
          * register_event() callback will be used to add a new userspace
          * waiter for changes related to this event.  Use eventfd_signal()
          * on eventfd to send a notification to userspace.
          */
         int (*register_event)(struct mem_cgroup *memcg,
                               struct eventfd_ctx *eventfd, const char *args);
         /*
          * unregister_event() callback will be called when userspace closes
          * the eventfd or on cgroup removal.  This callback must be set
          * if you want to provide notification functionality.
          */
         void (*unregister_event)(struct mem_cgroup *memcg,
                                  struct eventfd_ctx *eventfd);
         /*
          * All fields below are needed to unregister the event when
          * userspace closes the eventfd.
          */
         poll_table pt;
         wait_queue_head_t *wqh;
         wait_queue_t wait;
         struct work_struct remove;
 };
 194
 195 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
 196 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
 197
 /* State used for moving charges at task migration. */
 /*
  * Types of charges to be moved (bit flags).
  */
 #define MOVE_ANON       0x1U
 #define MOVE_FILE       0x2U
 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)

 /*
  * "mc" and its members are protected by cgroup_mutex.
  * There is exactly one instance; its spinlock and wait queue are
  * initialized statically below.
  */
 static struct move_charge_struct {
         spinlock_t        lock; /* for from, to */
         struct mm_struct  *mm;
         struct mem_cgroup *from;
         struct mem_cgroup *to;
         unsigned long flags;
         unsigned long precharge;
         unsigned long moved_charge;
         unsigned long moved_swap;
         struct task_struct *moving_task;        /* a task moving charges */
         wait_queue_head_t waitq;                /* a waitq for other context */
 } mc = {
         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
 };
 222
 /*
  * Maximum loops in mem_cgroup_hierarchical_reclaim(), used for soft
  * limit reclaim to prevent infinite loops, if they ever occur.
  * NOTE(review): the users of both limits are not in this part of the file.
  */
 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
 229
 /* Kinds of charge; NR_CHARGE_TYPE is the number of kinds, not a kind itself. */
 enum charge_type {
         MEM_CGROUP_CHARGE_TYPE_CACHE    = 0,
         MEM_CGROUP_CHARGE_TYPE_ANON     = 1,
         MEM_CGROUP_CHARGE_TYPE_SWAPOUT  = 2,    /* for accounting swapcache */
         MEM_CGROUP_CHARGE_TYPE_DROP     = 3,    /* a page was unused swap cache */
         NR_CHARGE_TYPE                  = 4,
 };
 237
 /*
  * Resource type stored in the upper half of a cft->private value
  * (see MEMFILE_PRIVATE() / MEMFILE_TYPE()).
  */
 enum res_type {
         _MEM            = 0,
         _MEMSWAP        = 1,
         _OOM_TYPE       = 2,
         _KMEM           = 3,
         _TCP            = 4,
 };
 246
 /*
  * Pack a resource type (upper 16 bits) and an attribute (lower 16 bits)
  * into one value, and the accessors that unpack them again.
  */
 #define MEMFILE_PRIVATE(x, val) (((x) << 16) | (val))
 #define MEMFILE_TYPE(val)       (((val) >> 16) & 0xffff)
 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 /* Used for OOM notifier */
 #define OOM_CONTROL             (0)
 252
 253 /* Some nice accessors for the vmpressure. */
 254 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
 255 {
 256         if (!memcg)
 257                 memcg = root_mem_cgroup;
 258         return &memcg->vmpressure;
 259 }
 260
 261 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
 262 {
 263         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
 264 }
 265
 266 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg)
 267 {
 268         return (memcg == root_mem_cgroup);
 269 }
 270
 271 #ifndef CONFIG_SLOB
 /*
  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
  * The main reason for not using the cgroup id for this:
  *  it works better in sparse environments, where we have a lot of memcgs,
  *  but only a few of them kmem-limited. For instance, if we have 200
  *  memcgs and only the 200th is kmem-limited, we would otherwise need a
  *  200-entry array for that.
  *
  * The current size of the caches array is stored in memcg_nr_cache_ids. It
  * will double each time we have to increase it.
  */
 static DEFINE_IDA(memcg_cache_ida);
 int memcg_nr_cache_ids;

 /* Protects memcg_nr_cache_ids */
 static DECLARE_RWSEM(memcg_cache_ids_sem);
 288
 /*
  * Take memcg_cache_ids_sem for reading.  Must be paired with
  * memcg_put_cache_ids().
  */
 void memcg_get_cache_ids(void)
 {
         down_read(&memcg_cache_ids_sem);
 }
 293
 /* Release the read side of memcg_cache_ids_sem taken by memcg_get_cache_ids(). */
 void memcg_put_cache_ids(void)
 {
         up_read(&memcg_cache_ids_sem);
 }
 298
 /*
  * MIN_SIZE is different from 1, because we would like to avoid going through
  * the alloc/free process all the time. On a small machine, 4 kmem-limited
  * cgroups is a reasonable guess. In the future, it could be a parameter or
  * tunable, but that is not strictly necessary.
  *
  * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
  * this constant directly from cgroup, but it is understandable that this is
  * better kept as an internal representation in cgroup.c. In any case, the
  * cgrp_id space is not getting any smaller, and we don't necessarily have to
  * increase ours as well if it increases.
  */
 #define MEMCG_CACHES_MIN_SIZE 4
 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
 313
 /*
  * A lot of the calls to the cache allocation functions are expected to be
  * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
  * conditional on this static branch, we have to let modules that call
  * kmem_cache_alloc() and the like see this symbol as well.
  */
 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
 EXPORT_SYMBOL(memcg_kmem_enabled_key);
 322
 323 #endif /* !CONFIG_SLOB */
 324
 325 /**
 326  * mem_cgroup_css_from_page - css of the memcg associated with a page
 327  * @page: page of interest
 328  *
 329  * If memcg is bound to the default hierarchy, css of the memcg associated
 330  * with @page is returned.  The returned css remains associated with @page
 331  * until it is released.
 332  *
 333  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
 334  * is returned.
 335  */
 336 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
 337 {
 338         struct mem_cgroup *memcg;
 339
 340         memcg = page->mem_cgroup;
 341
 342         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
 343                 memcg = root_mem_cgroup;
 344
 345         return &memcg->css;
 346 }
 347
 348 /**
 349  * page_cgroup_ino - return inode number of the memcg a page is charged to
 350  * @page: the page
 351  *
 352  * Look up the closest online ancestor of the memory cgroup @page is charged to
 353  * and return its inode number or 0 if @page is not charged to any cgroup. It
 354  * is safe to call this function without holding a reference to @page.
 355  *
 356  * Note, this function is inherently racy, because there is nothing to prevent
 357  * the cgroup inode from getting torn down and potentially reallocated a moment
 358  * after page_cgroup_ino() returns, so it only should be used by callers that
 359  * do not care (such as procfs interfaces).
 360  */
 361 ino_t page_cgroup_ino(struct page *page)
 362 {
 363         struct mem_cgroup *memcg;
 364         unsigned long ino = 0;
 365
 366         rcu_read_lock();
 367         memcg = READ_ONCE(page->mem_cgroup);
 368         while (memcg && !(memcg->css.flags & CSS_ONLINE))
 369                 memcg = parent_mem_cgroup(memcg);
 370         if (memcg)
 371                 ino = cgroup_ino(memcg->css.cgroup);
 372         rcu_read_unlock();
 373         return ino;
 374 }
 375
 376 static struct mem_cgroup_per_node *
 377 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
 378 {
 379         int nid = page_to_nid(page);
 380
 381         return memcg->nodeinfo[nid];
 382 }
 383
 /* Return the soft-limit tree of NUMA node @nid. */
 static struct mem_cgroup_tree_per_node *
 soft_limit_tree_node(int nid)
 {
         return soft_limit_tree.rb_tree_per_node[nid];
 }
 389
 /* Return the soft-limit tree of the NUMA node that @page resides on. */
 static struct mem_cgroup_tree_per_node *
 soft_limit_tree_from_page(struct page *page)
 {
         return soft_limit_tree_node(page_to_nid(page));
 }
 397
 398 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
 399                                          struct mem_cgroup_tree_per_node *mctz,
 400                                          unsigned long new_usage_in_excess)
 401 {
 402         struct rb_node **p = &mctz->rb_root.rb_node;
 403         struct rb_node *parent = NULL;
 404         struct mem_cgroup_per_node *mz_node;
 405
 406         if (mz->on_tree)
 407                 return;
 408
 409         mz->usage_in_excess = new_usage_in_excess;
 410         if (!mz->usage_in_excess)
 411                 return;
 412         while (*p) {
 413                 parent = *p;
 414                 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
 415                                         tree_node);
 416                 if (mz->usage_in_excess < mz_node->usage_in_excess)
 417                         p = &(*p)->rb_left;
 418                 /*
 419                  * We can't avoid mem cgroups that are over their soft
 420                  * limit by the same amount
 421                  */
 422                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
 423                         p = &(*p)->rb_right;
 424         }
 425         rb_link_node(&mz->tree_node, parent, p);
 426         rb_insert_color(&mz->tree_node, &mctz->rb_root);
 427         mz->on_tree = true;
 428 }
 429
 430 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
 431                                          struct mem_cgroup_tree_per_node *mctz)
 432 {
 433         if (!mz->on_tree)
 434                 return;
 435         rb_erase(&mz->tree_node, &mctz->rb_root);
 436         mz->on_tree = false;
 437 }
 438
 /*
  * Locked variant of __mem_cgroup_remove_exceeded(): takes mctz->lock with
  * interrupts saved/disabled around the removal.
  */
 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
                                        struct mem_cgroup_tree_per_node *mctz)
 {
         unsigned long flags;

         spin_lock_irqsave(&mctz->lock, flags);
         __mem_cgroup_remove_exceeded(mz, mctz);
         spin_unlock_irqrestore(&mctz->lock, flags);
 }
 448
 449 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
 450 {
 451         unsigned long nr_pages = page_counter_read(&memcg->memory);
 452         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
 453         unsigned long excess = 0;
 454
 455         if (nr_pages > soft_limit)
 456                 excess = nr_pages - soft_limit;
 457
 458         return excess;
 459 }
 460
 461 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
 462 {
 463         unsigned long excess;
 464         struct mem_cgroup_per_node *mz;
 465         struct mem_cgroup_tree_per_node *mctz;
 466
 467         mctz = soft_limit_tree_from_page(page);
 468         /*
 469          * Necessary to update all ancestors when hierarchy is used.
 470          * because their event counter is not touched.
 471          */
 472         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
 473                 mz = mem_cgroup_page_nodeinfo(memcg, page);
 474                 excess = soft_limit_excess(memcg);
 475                 /*
 476                  * We have to update the tree if mz is on RB-tree or
 477                  * mem is over its softlimit.
 478                  */
 479                 if (excess || mz->on_tree) {
 480                         unsigned long flags;
 481
 482                         spin_lock_irqsave(&mctz->lock, flags);
 483                         /* if on-tree, remove it */
 484                         if (mz->on_tree)
 485                                 __mem_cgroup_remove_exceeded(mz, mctz);
 486                         /*
 487                          * Insert again. mz->usage_in_excess will be updated.
 488                          * If excess is 0, no tree ops.
 489                          */
 490                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
 491                         spin_unlock_irqrestore(&mctz->lock, flags);
 492                 }
 493         }
 494 }
 495
 496 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
 497 {
 498         struct mem_cgroup_tree_per_node *mctz;
 499         struct mem_cgroup_per_node *mz;
 500         int nid;
 501
 502         for_each_node(nid) {
 503                 mz = mem_cgroup_nodeinfo(memcg, nid);
 504                 mctz = soft_limit_tree_node(nid);
 505                 mem_cgroup_remove_exceeded(mz, mctz);
 506         }
 507 }
 508
 509 static struct mem_cgroup_per_node *
 510 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 511 {
 512         struct rb_node *rightmost = NULL;
 513         struct mem_cgroup_per_node *mz;
 514
 515 retry:
 516         mz = NULL;
 517         rightmost = rb_last(&mctz->rb_root);
 518         if (!rightmost)
 519                 goto done;              /* Nothing to reclaim from */
 520
 521         mz = rb_entry(rightmost, struct mem_cgroup_per_node, tree_node);
 522         /*
 523          * Remove the node now but someone else can add it back,
 524          * we will to add it back at the end of reclaim to its correct
 525          * position in the tree.
 526          */
 527         __mem_cgroup_remove_exceeded(mz, mctz);
 528         if (!soft_limit_excess(mz->memcg) ||
 529             !css_tryget_online(&mz->memcg->css))
 530                 goto retry;
 531 done:
 532         return mz;
 533 }
 534
 /*
  * Locked variant of __mem_cgroup_largest_soft_limit_node(): runs it under
  * mctz->lock with interrupts disabled and returns its result.
  */
 static struct mem_cgroup_per_node *
 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
 {
         struct mem_cgroup_per_node *mz;

         spin_lock_irq(&mctz->lock);
         mz = __mem_cgroup_largest_soft_limit_node(mctz);
         spin_unlock_irq(&mctz->lock);
         return mz;
 }
 545
 546 /*
 547  * Return page count for single (non recursive) @memcg.
 548  *
 549  * Implementation Note: reading percpu statistics for memcg.
 550  *
 551  * Both of vmstat[] and percpu_counter has threshold and do periodic
 552  * synchronization to implement "quick" read. There are trade-off between
 553  * reading cost and precision of value. Then, we may have a chance to implement
 554  * a periodic synchronization of counter in memcg's counter.
 555  *
 556  * But this _read() function is used for user interface now. The user accounts
 557  * memory usage by memory cgroup and he _always_ requires exact value because
 558  * he accounts memory. Even if we provide quick-and-fuzzy read, we always
 559  * have to visit all online cpus and make sum. So, for now, unnecessary
 560  * synchronization is not implemented. (just implemented for cpu hotplug)
 561  *
 562  * If there are kernel internal actions which can make use of some not-exact
 563  * value, and reading all cpu value can be performance bottleneck in some
 564  * common workload, threshold and synchronization as vmstat[] should be
 565  * implemented.
 566  */
 567 static unsigned long
 568 mem_cgroup_read_stat(struct mem_cgroup *memcg, enum mem_cgroup_stat_index idx)
 569 {
 570         long val = 0;
 571         int cpu;
 572
 573         /* Per-cpu values can be negative, use a signed accumulator */
 574         for_each_possible_cpu(cpu)
 575                 val += per_cpu(memcg->stat->count[idx], cpu);
 576         /*
 577          * Summing races with updates, so val may be negative.  Avoid exposing
 578          * transient negative values.
 579          */
 580         if (val < 0)
 581                 val = 0;
 582         return val;
 583 }
 584
 585 static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
 586                                             enum mem_cgroup_events_index idx)
 587 {
 588         unsigned long val = 0;
 589         int cpu;
 590
 591         for_each_possible_cpu(cpu)
 592                 val += per_cpu(memcg->stat->events[idx], cpu);
 593         return val;
 594 }
 595
 596 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
 597                                          struct page *page,
 598                                          bool compound, int nr_pages)
 599 {
 600         /*
 601          * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
 602          * counted as CACHE even if it's on ANON LRU.
 603          */
 604         if (PageAnon(page))
 605                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
 606                                 nr_pages);
 607         else
 608                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
 609                                 nr_pages);
 610
 611         if (compound) {
 612                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
 613                 __this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
 614                                 nr_pages);
 615         }
 616
 617         /* pagein of a big page is an event. So, ignore page size */
 618         if (nr_pages > 0)
 619                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
 620         else {
 621                 __this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
 622                 nr_pages = -nr_pages; /* for event */
 623         }
 624
 625         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
 626 }
 627
 628 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
 629                                            int nid, unsigned int lru_mask)
 630 {
 631         unsigned long nr = 0;
 632         struct mem_cgroup_per_node *mz;
 633         enum lru_list lru;
 634
 635         VM_BUG_ON((unsigned)nid >= nr_node_ids);
 636
 637         for_each_lru(lru) {
 638                 if (!(BIT(lru) & lru_mask))
 639                         continue;
 640                 mz = mem_cgroup_nodeinfo(memcg, nid);
 641                 nr += mz->lru_size[lru];
 642         }
 643         return nr;
 644 }
 645
 646 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
 647                         unsigned int lru_mask)
 648 {
 649         unsigned long nr = 0;
 650         int nid;
 651
 652         for_each_node_state(nid, N_MEMORY)
 653                 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
 654         return nr;
 655 }
 656
 657 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
 658                                        enum mem_cgroup_events_target target)
 659 {
 660         unsigned long val, next;
 661
 662         val = __this_cpu_read(memcg->stat->nr_page_events);
 663         next = __this_cpu_read(memcg->stat->targets[target]);
 664         /* from time_after() in jiffies.h */
 665         if ((long)next - (long)val < 0) {
 666                 switch (target) {
 667                 case MEM_CGROUP_TARGET_THRESH:
 668                         next = val + THRESHOLDS_EVENTS_TARGET;
 669                         break;
 670                 case MEM_CGROUP_TARGET_SOFTLIMIT:
 671                         next = val + SOFTLIMIT_EVENTS_TARGET;
 672                         break;
 673                 case MEM_CGROUP_TARGET_NUMAINFO:
 674                         next = val + NUMAINFO_EVENTS_TARGET;
 675                         break;
 676                 default:
 677                         break;
 678                 }
 679                 __this_cpu_write(memcg->stat->targets[target], next);
 680                 return true;
 681         }
 682         return false;
 683 }
 684
 685 /*
 686  * Check events in order.
 687  *
 688  */
 689 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
 690 {
 691         /* threshold event is triggered in finer grain than soft limit */
 692         if (unlikely(mem_cgroup_event_ratelimit(memcg,
 693                                                 MEM_CGROUP_TARGET_THRESH))) {
 694                 bool do_softlimit;
 695                 bool do_numainfo __maybe_unused;
 696
 697                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
 698                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
 699 #if MAX_NUMNODES > 1
 700                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
 701                                                 MEM_CGROUP_TARGET_NUMAINFO);
 702 #endif
 703                 mem_cgroup_threshold(memcg);
 704                 if (unlikely(do_softlimit))
 705                         mem_cgroup_update_tree(memcg, page);
 706 #if MAX_NUMNODES > 1
 707                 if (unlikely(do_numainfo))
 708                         atomic_inc(&memcg->numainfo_events);
 709 #endif
 710         }
 711 }
 712
 713 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 714 {
 715         /*
 716          * mm_update_next_owner() may clear mm->owner to NULL
 717          * if it races with swapoff, page migration, etc.
 718          * So this can be called with p == NULL.
 719          */
 720         if (unlikely(!p))
 721                 return NULL;
 722
 723         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
 724 }
 725 EXPORT_SYMBOL(mem_cgroup_from_task);
 726
 727 static struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 728 {
 729         struct mem_cgroup *memcg = NULL;
 730
 731         rcu_read_lock();
 732         do {
 733                 /*
 734                  * Page cache insertions can happen withou an
 735                  * actual mm context, e.g. during disk probing
 736                  * on boot, loopback IO, acct() writes etc.
 737                  */
 738                 if (unlikely(!mm))
 739                         memcg = root_mem_cgroup;
 740                 else {
 741                         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
 742                         if (unlikely(!memcg))
 743                                 memcg = root_mem_cgroup;
 744                 }
 745         } while (!css_tryget_online(&memcg->css));
 746         rcu_read_unlock();
 747         return memcg;
 748 }
 749
 /**
  * mem_cgroup_iter - iterate over memory cgroup hierarchy
  * @root: hierarchy root, NULL selects root_mem_cgroup
  * @prev: previously returned memcg, NULL on first invocation
  * @reclaim: cookie for shared reclaim walks, NULL for full walks
  *
  * Returns references to children of the hierarchy below @root, or
  * @root itself, or %NULL after a full round-trip.  %NULL is also
  * returned when the memory controller is disabled.
  *
  * Caller must pass the return value in @prev on subsequent
  * invocations for reference counting, or use mem_cgroup_iter_break()
  * to cancel a hierarchy walk before the round-trip is complete.
  *
  * Reclaimers can specify a node (@reclaim->pgdat) and a priority level
  * in @reclaim to divide up the memcgs in the hierarchy among all
  * concurrent reclaimers operating on the same node and priority.
  */
 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
                                    struct mem_cgroup *prev,
                                    struct mem_cgroup_reclaim_cookie *reclaim)
 {
         /* iter is only assigned and used when @reclaim is non-NULL */
         struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
         struct cgroup_subsys_state *css = NULL;
         struct mem_cgroup *memcg = NULL;
         struct mem_cgroup *pos = NULL;

         if (mem_cgroup_disabled())
                 return NULL;

         if (!root)
                 root = root_mem_cgroup;

         /* Full walks continue right after the previously returned memcg. */
         if (prev && !reclaim)
                 pos = prev;

         /*
          * A non-root @root without use_hierarchy is the only group to
          * visit: hand it out on the first call, end the walk on the next.
          */
         if (!root->use_hierarchy && root != root_mem_cgroup) {
                 if (prev)
                         goto out;
                 return root;
         }

         rcu_read_lock();

         if (reclaim) {
                 struct mem_cgroup_per_node *mz;

                 /* Shared walks keep their position per node and priority. */
                 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
                 iter = &mz->iter[reclaim->priority];

                 /*
                  * The shared iterator's generation no longer matches the
                  * one recorded when this walk started: end this walk.
                  */
                 if (prev && reclaim->generation != iter->generation)
                         goto out_unlock;

                 while (1) {
                         pos = READ_ONCE(iter->position);
                         if (!pos || css_tryget(&pos->css))
                                 break;
                         /*
                          * css reference reached zero, so iter->position will
                          * be cleared by ->css_released. However, we should not
                          * rely on this happening soon, because ->css_released
                          * is called from a work queue, and by busy-waiting we
                          * might block it. So we clear iter->position right
                          * away.
                          */
                         (void)cmpxchg(&iter->position, pos, NULL);
                 }
         }

         if (pos)
                 css = &pos->css;

         for (;;) {
                 css = css_next_descendant_pre(css, &root->css);
                 if (!css) {
                         /*
                          * Reclaimers share the hierarchy walk, and a
                          * new one might jump in right at the end of
                          * the hierarchy - make sure they see at least
                          * one group and restart from the beginning.
                          */
                         if (!prev)
                                 continue;
                         break;
                 }

                 /*
                  * Verify the css and acquire a reference.  The root
                  * is provided by the caller, so we know it's alive
                  * and kicking, and don't take an extra reference.
                  */
                 memcg = mem_cgroup_from_css(css);

                 if (css == &root->css)
                         break;

                 if (css_tryget(css))
                         break;

                 /* Could not pin this css; skip it and try the next one. */
                 memcg = NULL;
         }

         if (reclaim) {
                 /*
                  * The position could have already been updated by a competing
                  * thread, so check that the value hasn't changed since we read
                  * it to avoid reclaiming from the same cgroup twice.
                  */
                 (void)cmpxchg(&iter->position, pos, memcg);

                 /* Drop the reference taken on the old position above. */
                 if (pos)
                         css_put(&pos->css);

                 /*
                  * A NULL result means a round-trip is complete: bump the
                  * generation.  Otherwise, on the first call of a walk,
                  * remember the generation this walk belongs to.
                  */
                 if (!memcg)
                         iter->generation++;
                 else if (!prev)
                         reclaim->generation = iter->generation;
         }

 out_unlock:
         rcu_read_unlock();
 out:
         /* Release the reference on @prev; the root is never reference-counted here. */
         if (prev && prev != root)
                 css_put(&prev->css);

         return memcg;
 }
 876
 877 /**
 878  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 879  * @root: hierarchy root
 880  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 881  */
 882 void mem_cgroup_iter_break(struct mem_cgroup *root,
 883                            struct mem_cgroup *prev)
 884 {
 885         if (!root)
 886                 root = root_mem_cgroup;
 887         if (prev && prev != root)
 888                 css_put(&prev->css);
 889 }
 890
 891 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
 892 {
 893         struct mem_cgroup *memcg = dead_memcg;
 894         struct mem_cgroup_reclaim_iter *iter;
 895         struct mem_cgroup_per_node *mz;
 896         int nid;
 897         int i;
 898
 899         while ((memcg = parent_mem_cgroup(memcg))) {
 900                 for_each_node(nid) {
 901                         mz = mem_cgroup_nodeinfo(memcg, nid);
 902                         for (i = 0; i <= DEF_PRIORITY; i++) {
 903                                 iter = &mz->iter[i];
 904                                 cmpxchg(&iter->position,
 905                                         dead_memcg, NULL);
 906                         }
 907                 }
 908         }
 909 }
 910
/*
 * Iteration constructs for visiting all cgroups (under a tree).
 *
 * Each step hands the previously returned group back to mem_cgroup_iter()
 * as @prev and stops once mem_cgroup_iter() returns NULL.  If a loop is
 * exited prematurely (break), mem_cgroup_iter_break() must be called on
 * the current element so that its css reference is dropped.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

/*
 * Same walk with a NULL root.
 * NOTE(review): presumably this covers the whole hierarchy starting at
 * root_mem_cgroup, mirroring the NULL handling in mem_cgroup_iter_break()
 * - confirm against the head of mem_cgroup_iter().
 */
#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))
 925
 926 /**
 927  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
 928  * @memcg: hierarchy root
 929  * @fn: function to call for each task
 930  * @arg: argument passed to @fn
 931  *
 932  * This function iterates over tasks attached to @memcg or to any of its
 933  * descendants and calls @fn for each task. If @fn returns a non-zero
 934  * value, the function breaks the iteration loop and returns the value.
 935  * Otherwise, it will iterate over all tasks and return 0.
 936  *
 937  * This function must not be called for the root memory cgroup.
 938  */
 939 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
 940                           int (*fn)(struct task_struct *, void *), void *arg)
 941 {
 942         struct mem_cgroup *iter;
 943         int ret = 0;
 944
 945         BUG_ON(memcg == root_mem_cgroup);
 946
 947         for_each_mem_cgroup_tree(iter, memcg) {
 948                 struct css_task_iter it;
 949                 struct task_struct *task;
 950
 951                 css_task_iter_start(&iter->css, &it);
 952                 while (!ret && (task = css_task_iter_next(&it)))
 953                         ret = fn(task, arg);
 954                 css_task_iter_end(&it);
 955                 if (ret) {
 956                         mem_cgroup_iter_break(memcg, iter);
 957                         break;
 958                 }
 959         }
 960         return ret;
 961 }
 962
 963 /**
 964  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
 965  * @page: the page
 966  * @zone: zone of the page
 967  *
 968  * This function is only safe when following the LRU page isolation
 969  * and putback protocol: the LRU lock must be held, and the page must
 970  * either be PageLRU() or the caller must have isolated/allocated it.
 971  */
 972 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
 973 {
 974         struct mem_cgroup_per_node *mz;
 975         struct mem_cgroup *memcg;
 976         struct lruvec *lruvec;
 977
 978         if (mem_cgroup_disabled()) {
 979                 lruvec = &pgdat->lruvec;
 980                 goto out;
 981         }
 982
 983         memcg = page->mem_cgroup;
 984         /*
 985          * Swapcache readahead pages are added to the LRU - and
 986          * possibly migrated - before they are charged.
 987          */
 988         if (!memcg)
 989                 memcg = root_mem_cgroup;
 990
 991         mz = mem_cgroup_page_nodeinfo(memcg, page);
 992         lruvec = &mz->lruvec;
 993 out:
 994         /*
 995          * Since a node can be onlined after the mem_cgroup was created,
 996          * we have to be prepared to initialize lruvec->zone here;
 997          * and if offlined then reonlined, we need to reinitialize it.
 998          */
 999         if (unlikely(lruvec->pgdat != pgdat))
1000                 lruvec->pgdat = pgdat;
1001         return lruvec;
1002 }
1003
1004 /**
1005  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1006  * @lruvec: mem_cgroup per zone lru vector
1007  * @lru: index of lru list the page is sitting on
1008  * @nr_pages: positive when adding or negative when removing
1009  *
1010  * This function must be called under lru_lock, just before a page is added
1011  * to or just after a page is removed from an lru list (that ordering being
1012  * so as to allow it to check that lru_size 0 is consistent with list_empty).
1013  */
1014 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1015                                 int nr_pages)
1016 {
1017         struct mem_cgroup_per_node *mz;
1018         unsigned long *lru_size;
1019         long size;
1020         bool empty;
1021
1022         if (mem_cgroup_disabled())
1023                 return;
1024
1025         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1026         lru_size = mz->lru_size + lru;
1027         empty = list_empty(lruvec->lists + lru);
1028
1029         if (nr_pages < 0)
1030                 *lru_size += nr_pages;
1031
1032         size = *lru_size;
1033         if (WARN_ONCE(size < 0 || empty != !size,
1034                 "%s(%p, %d, %d): lru_size %ld but %sempty\n",
1035                 __func__, lruvec, lru, nr_pages, size, empty ? "" : "not ")) {
1036                 VM_BUG_ON(1);
1037                 *lru_size = 0;
1038         }
1039
1040         if (nr_pages > 0)
1041                 *lru_size += nr_pages;
1042 }
1043
1044 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1045 {
1046         struct mem_cgroup *task_memcg;
1047         struct task_struct *p;
1048         bool ret;
1049
1050         p = find_lock_task_mm(task);
1051         if (p) {
1052                 task_memcg = get_mem_cgroup_from_mm(p->mm);
1053                 task_unlock(p);
1054         } else {
1055                 /*
1056                  * All threads may have already detached their mm's, but the oom
1057                  * killer still needs to detect if they have already been oom
1058                  * killed to prevent needlessly killing additional tasks.
1059                  */
1060                 rcu_read_lock();
1061                 task_memcg = mem_cgroup_from_task(task);
1062                 css_get(&task_memcg->css);
1063                 rcu_read_unlock();
1064         }
1065         ret = mem_cgroup_is_descendant(task_memcg, memcg);
1066         css_put(&task_memcg->css);
1067         return ret;
1068 }
1069
1070 /**
1071  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1072  * @memcg: the memory cgroup
1073  *
1074  * Returns the maximum amount of memory @mem can be charged with, in
1075  * pages.
1076  */
1077 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1078 {
1079         unsigned long margin = 0;
1080         unsigned long count;
1081         unsigned long limit;
1082
1083         count = page_counter_read(&memcg->memory);
1084         limit = READ_ONCE(memcg->memory.limit);
1085         if (count < limit)
1086                 margin = limit - count;
1087
1088         if (do_memsw_account()) {
1089                 count = page_counter_read(&memcg->memsw);
1090                 limit = READ_ONCE(memcg->memsw.limit);
1091                 if (count <= limit)
1092                         margin = min(margin, limit - count);
1093                 else
1094                         margin = 0;
1095         }
1096
1097         return margin;
1098 }
1099
1100 /*
1101  * A routine for checking "mem" is under move_account() or not.
1102  *
1103  * Checking a cgroup is mc.from or mc.to or under hierarchy of
1104  * moving cgroups. This is for waiting at high-memory pressure
1105  * caused by "move".
1106  */
1107 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1108 {
1109         struct mem_cgroup *from;
1110         struct mem_cgroup *to;
1111         bool ret = false;
1112         /*
1113          * Unlike task_move routines, we access mc.to, mc.from not under
1114          * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
1115          */
1116         spin_lock(&mc.lock);
1117         from = mc.from;
1118         to = mc.to;
1119         if (!from)
1120                 goto unlock;
1121
1122         ret = mem_cgroup_is_descendant(from, memcg) ||
1123                 mem_cgroup_is_descendant(to, memcg);
1124 unlock:
1125         spin_unlock(&mc.lock);
1126         return ret;
1127 }
1128
1129 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1130 {
1131         if (mc.moving_task && current != mc.moving_task) {
1132                 if (mem_cgroup_under_move(memcg)) {
1133                         DEFINE_WAIT(wait);
1134                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1135                         /* moving charge context might have finished. */
1136                         if (mc.moving_task)
1137                                 schedule();
1138                         finish_wait(&mc.waitq, &wait);
1139                         return true;
1140                 }
1141         }
1142         return false;
1143 }
1144
1145 #define K(x) ((x) << (PAGE_SHIFT-10))
1146 /**
1147  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1148  * @memcg: The memory cgroup that went over limit
1149  * @p: Task that is going to be killed
1150  *
1151  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1152  * enabled
1153  */
1154 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1155 {
1156         struct mem_cgroup *iter;
1157         unsigned int i;
1158
1159         rcu_read_lock();
1160
1161         if (p) {
1162                 pr_info("Task in ");
1163                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1164                 pr_cont(" killed as a result of limit of ");
1165         } else {
1166                 pr_info("Memory limit reached of cgroup ");
1167         }
1168
1169         pr_cont_cgroup_path(memcg->css.cgroup);
1170         pr_cont("\n");
1171
1172         rcu_read_unlock();
1173
1174         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1175                 K((u64)page_counter_read(&memcg->memory)),
1176                 K((u64)memcg->memory.limit), memcg->memory.failcnt);
1177         pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1178                 K((u64)page_counter_read(&memcg->memsw)),
1179                 K((u64)memcg->memsw.limit), memcg->memsw.failcnt);
1180         pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1181                 K((u64)page_counter_read(&memcg->kmem)),
1182                 K((u64)memcg->kmem.limit), memcg->kmem.failcnt);
1183
1184         for_each_mem_cgroup_tree(iter, memcg) {
1185                 pr_info("Memory cgroup stats for ");
1186                 pr_cont_cgroup_path(iter->css.cgroup);
1187                 pr_cont(":");
1188
1189                 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
1190                         if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account)
1191                                 continue;
1192                         pr_cont(" %s:%luKB", mem_cgroup_stat_names[i],
1193                                 K(mem_cgroup_read_stat(iter, i)));
1194                 }
1195
1196                 for (i = 0; i < NR_LRU_LISTS; i++)
1197                         pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1198                                 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1199
1200                 pr_cont("\n");
1201         }
1202 }
1203
1204 /*
1205  * This function returns the number of memcg under hierarchy tree. Returns
1206  * 1(self count) if no children.
1207  */
1208 static int mem_cgroup_count_children(struct mem_cgroup *memcg)
1209 {
1210         int num = 0;
1211         struct mem_cgroup *iter;
1212
1213         for_each_mem_cgroup_tree(iter, memcg)
1214                 num++;
1215         return num;
1216 }
1217
1218 /*
1219  * Return the memory (and swap, if configured) limit for a memcg.
1220  */
1221 unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg)
1222 {
1223         unsigned long limit;
1224
1225         limit = memcg->memory.limit;
1226         if (mem_cgroup_swappiness(memcg)) {
1227                 unsigned long memsw_limit;
1228                 unsigned long swap_limit;
1229
1230                 memsw_limit = memcg->memsw.limit;
1231                 swap_limit = memcg->swap.limit;
1232                 swap_limit = min(swap_limit, (unsigned long)total_swap_pages);
1233                 limit = min(limit + swap_limit, memsw_limit);
1234         }
1235         return limit;
1236 }
1237
1238 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1239                                      int order)
1240 {
1241         struct oom_control oc = {
1242                 .zonelist = NULL,
1243                 .nodemask = NULL,
1244                 .memcg = memcg,
1245                 .gfp_mask = gfp_mask,
1246                 .order = order,
1247         };
1248         bool ret;
1249
1250         mutex_lock(&oom_lock);
1251         ret = out_of_memory(&oc);
1252         mutex_unlock(&oom_lock);
1253         return ret;
1254 }
1255
1256 #if MAX_NUMNODES > 1
1257
1258 /**
1259  * test_mem_cgroup_node_reclaimable
1260  * @memcg: the target memcg
1261  * @nid: the node ID to be checked.
1262  * @noswap : specify true here if the user wants flle only information.
1263  *
1264  * This function returns whether the specified memcg contains any
1265  * reclaimable pages on a node. Returns true if there are any reclaimable
1266  * pages in the node.
1267  */
1268 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1269                 int nid, bool noswap)
1270 {
1271         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1272                 return true;
1273         if (noswap || !total_swap_pages)
1274                 return false;
1275         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1276                 return true;
1277         return false;
1278
1279 }
1280
1281 /*
1282  * Always updating the nodemask is not very good - even if we have an empty
1283  * list or the wrong list here, we can start from some node and traverse all
1284  * nodes based on the zonelist. So update the list loosely once per 10 secs.
1285  *
1286  */
1287 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1288 {
1289         int nid;
1290         /*
1291          * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1292          * pagein/pageout changes since the last update.
1293          */
1294         if (!atomic_read(&memcg->numainfo_events))
1295                 return;
1296         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1297                 return;
1298
1299         /* make a nodemask where this memcg uses memory from */
1300         memcg->scan_nodes = node_states[N_MEMORY];
1301
1302         for_each_node_mask(nid, node_states[N_MEMORY]) {
1303
1304                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1305                         node_clear(nid, memcg->scan_nodes);
1306         }
1307
1308         atomic_set(&memcg->numainfo_events, 0);
1309         atomic_set(&memcg->numainfo_updating, 0);
1310 }
1311
1312 /*
1313  * Selecting a node where we start reclaim from. Because what we need is just
1314  * reducing usage counter, start from anywhere is O,K. Considering
1315  * memory reclaim from current node, there are pros. and cons.
1316  *
1317  * Freeing memory from current node means freeing memory from a node which
1318  * we'll use or we've used. So, it may make LRU bad. And if several threads
1319  * hit limits, it will see a contention on a node. But freeing from remote
1320  * node means more costs for memory reclaim because of memory latency.
1321  *
1322  * Now, we use round-robin. Better algorithm is welcomed.
1323  */
1324 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1325 {
1326         int node;
1327
1328         mem_cgroup_may_update_nodemask(memcg);
1329         node = memcg->last_scanned_node;
1330
1331         node = next_node_in(node, memcg->scan_nodes);
1332         /*
1333          * mem_cgroup_may_update_nodemask might have seen no reclaimmable pages
1334          * last time it really checked all the LRUs due to rate limiting.
1335          * Fallback to the current node in that case for simplicity.
1336          */
1337         if (unlikely(node == MAX_NUMNODES))
1338                 node = numa_node_id();
1339
1340         memcg->last_scanned_node = node;
1341         return node;
1342 }
1343 #else
/* Variant built when MAX_NUMNODES is not greater than 1: always node 0. */
int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
{
	return 0;
}
1348 #endif
1349
1350 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1351                                    pg_data_t *pgdat,
1352                                    gfp_t gfp_mask,
1353                                    unsigned long *total_scanned)
1354 {
1355         struct mem_cgroup *victim = NULL;
1356         int total = 0;
1357         int loop = 0;
1358         unsigned long excess;
1359         unsigned long nr_scanned;
1360         struct mem_cgroup_reclaim_cookie reclaim = {
1361                 .pgdat = pgdat,
1362                 .priority = 0,
1363         };
1364
1365         excess = soft_limit_excess(root_memcg);
1366
1367         while (1) {
1368                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1369                 if (!victim) {
1370                         loop++;
1371                         if (loop >= 2) {
1372                                 /*
1373                                  * If we have not been able to reclaim
1374                                  * anything, it might because there are
1375                                  * no reclaimable pages under this hierarchy
1376                                  */
1377                                 if (!total)
1378                                         break;
1379                                 /*
1380                                  * We want to do more targeted reclaim.
1381                                  * excess >> 2 is not to excessive so as to
1382                                  * reclaim too much, nor too less that we keep
1383                                  * coming back to reclaim from this cgroup
1384                                  */
1385                                 if (total >= (excess >> 2) ||
1386                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1387                                         break;
1388                         }
1389                         continue;
1390                 }
1391                 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1392                                         pgdat, &nr_scanned);
1393                 *total_scanned += nr_scanned;
1394                 if (!soft_limit_excess(root_memcg))
1395                         break;
1396         }
1397         mem_cgroup_iter_break(root_memcg, victim);
1398         return total;
1399 }
1400
#ifdef CONFIG_LOCKDEP
/*
 * lockdep map passed to mutex_acquire()/mutex_release() by
 * mem_cgroup_oom_trylock() and mem_cgroup_oom_unlock().
 */
static struct lockdep_map memcg_oom_lock_dep_map = {
	.name = "memcg_oom_lock",
};
#endif

/* Held around all updates of the per-memcg ->oom_lock and ->under_oom. */
static DEFINE_SPINLOCK(memcg_oom_lock);
1408
1409 /*
1410  * Check OOM-Killer is already running under our hierarchy.
1411  * If someone is running, return false.
1412  */
1413 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1414 {
1415         struct mem_cgroup *iter, *failed = NULL;
1416
1417         spin_lock(&memcg_oom_lock);
1418
1419         for_each_mem_cgroup_tree(iter, memcg) {
1420                 if (iter->oom_lock) {
1421                         /*
1422                          * this subtree of our hierarchy is already locked
1423                          * so we cannot give a lock.
1424                          */
1425                         failed = iter;
1426                         mem_cgroup_iter_break(memcg, iter);
1427                         break;
1428                 } else
1429                         iter->oom_lock = true;
1430         }
1431
1432         if (failed) {
1433                 /*
1434                  * OK, we failed to lock the whole subtree so we have
1435                  * to clean up what we set up to the failing subtree
1436                  */
1437                 for_each_mem_cgroup_tree(iter, memcg) {
1438                         if (iter == failed) {
1439                                 mem_cgroup_iter_break(memcg, iter);
1440                                 break;
1441                         }
1442                         iter->oom_lock = false;
1443                 }
1444         } else
1445                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1446
1447         spin_unlock(&memcg_oom_lock);
1448
1449         return !failed;
1450 }
1451
1452 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1453 {
1454         struct mem_cgroup *iter;
1455
1456         spin_lock(&memcg_oom_lock);
1457         mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1458         for_each_mem_cgroup_tree(iter, memcg)
1459                 iter->oom_lock = false;
1460         spin_unlock(&memcg_oom_lock);
1461 }
1462
1463 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1464 {
1465         struct mem_cgroup *iter;
1466
1467         spin_lock(&memcg_oom_lock);
1468         for_each_mem_cgroup_tree(iter, memcg)
1469                 iter->under_oom++;
1470         spin_unlock(&memcg_oom_lock);
1471 }
1472
1473 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1474 {
1475         struct mem_cgroup *iter;
1476
1477         /*
1478          * When a new child is created while the hierarchy is under oom,
1479          * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1480          */
1481         spin_lock(&memcg_oom_lock);
1482         for_each_mem_cgroup_tree(iter, memcg)
1483                 if (iter->under_oom > 0)
1484                         iter->under_oom--;
1485         spin_unlock(&memcg_oom_lock);
1486 }
1487
/* Wait queue that tasks in mem_cgroup_oom_synchronize() sleep on. */
static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);

/*
 * Wait queue entry of an OOM waiter.  memcg_oom_wake_function() maps
 * @wait back to this structure to learn which memcg the sleeper is
 * waiting for.
 */
struct oom_wait_info {
	struct mem_cgroup *memcg;	/* memcg whose OOM is being waited on */
	wait_queue_t	wait;		/* entry queued on memcg_oom_waitq */
};
1494
1495 static int memcg_oom_wake_function(wait_queue_t *wait,
1496         unsigned mode, int sync, void *arg)
1497 {
1498         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1499         struct mem_cgroup *oom_wait_memcg;
1500         struct oom_wait_info *oom_wait_info;
1501
1502         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1503         oom_wait_memcg = oom_wait_info->memcg;
1504
1505         if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1506             !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1507                 return 0;
1508         return autoremove_wake_function(wait, mode, sync, arg);
1509 }
1510
1511 static void memcg_oom_recover(struct mem_cgroup *memcg)
1512 {
1513         /*
1514          * For the following lockless ->under_oom test, the only required
1515          * guarantee is that it must see the state asserted by an OOM when
1516          * this function is called as a result of userland actions
1517          * triggered by the notification of the OOM.  This is trivially
1518          * achieved by invoking mem_cgroup_mark_under_oom() before
1519          * triggering notification.
1520          */
1521         if (memcg && memcg->under_oom)
1522                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1523 }
1524
1525 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1526 {
1527         if (!current->memcg_may_oom)
1528                 return;
1529         /*
1530          * We are in the middle of the charge context here, so we
1531          * don't want to block when potentially sitting on a callstack
1532          * that holds all kinds of filesystem and mm locks.
1533          *
1534          * Also, the caller may handle a failed allocation gracefully
1535          * (like optional page cache readahead) and so an OOM killer
1536          * invocation might not even be necessary.
1537          *
1538          * That's why we don't do anything here except remember the
1539          * OOM context and then deal with it at the end of the page
1540          * fault when the stack is unwound, the locks are released,
1541          * and when we know whether the fault was overall successful.
1542          */
1543         css_get(&memcg->css);
1544         current->memcg_in_oom = memcg;
1545         current->memcg_oom_gfp_mask = mask;
1546         current->memcg_oom_order = order;
1547 }
1548
1549 /**
1550  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1551  * @handle: actually kill/wait or just clean up the OOM state
1552  *
1553  * This has to be called at the end of a page fault if the memcg OOM
1554  * handler was enabled.
1555  *
1556  * Memcg supports userspace OOM handling where failed allocations must
1557  * sleep on a waitqueue until the userspace task resolves the
1558  * situation.  Sleeping directly in the charge context with all kinds
1559  * of locks held is not a good idea, instead we remember an OOM state
1560  * in the task and mem_cgroup_oom_synchronize() has to be called at
1561  * the end of the page fault to complete the OOM handling.
1562  *
1563  * Returns %true if an ongoing memcg OOM situation was detected and
1564  * completed, %false otherwise.
1565  */
1566 bool mem_cgroup_oom_synchronize(bool handle)
1567 {
1568         struct mem_cgroup *memcg = current->memcg_in_oom;
1569         struct oom_wait_info owait;
1570         bool locked;
1571
1572         /* OOM is global, do not handle */
1573         if (!memcg)
1574                 return false;
1575
1576         if (!handle)
1577                 goto cleanup;
1578
1579         owait.memcg = memcg;
1580         owait.wait.flags = 0;
1581         owait.wait.func = memcg_oom_wake_function;
1582         owait.wait.private = current;
1583         INIT_LIST_HEAD(&owait.wait.task_list);
1584
1585         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1586         mem_cgroup_mark_under_oom(memcg);
1587
1588         locked = mem_cgroup_oom_trylock(memcg);
1589
1590         if (locked)
1591                 mem_cgroup_oom_notify(memcg);
1592
1593         if (locked && !memcg->oom_kill_disable) {
1594                 mem_cgroup_unmark_under_oom(memcg);
1595                 finish_wait(&memcg_oom_waitq, &owait.wait);
1596                 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1597                                          current->memcg_oom_order);
1598         } else {
1599                 schedule();
1600                 mem_cgroup_unmark_under_oom(memcg);
1601                 finish_wait(&memcg_oom_waitq, &owait.wait);
1602         }
1603
1604         if (locked) {
1605                 mem_cgroup_oom_unlock(memcg);
1606                 /*
1607                  * There is no guarantee that an OOM-lock contender
1608                  * sees the wakeups triggered by the OOM kill
1609                  * uncharges.  Wake any sleepers explicitely.
1610                  */
1611                 memcg_oom_recover(memcg);
1612         }
1613 cleanup:
1614         current->memcg_in_oom = NULL;
1615         css_put(&memcg->css);
1616         return true;
1617 }
1618
1619 /**
1620  * lock_page_memcg - lock a page->mem_cgroup binding
1621  * @page: the page
1622  *
1623  * This function protects unlocked LRU pages from being moved to
1624  * another cgroup and stabilizes their page->mem_cgroup binding.
1625  */
1626 void lock_page_memcg(struct page *page)
1627 {
1628         struct mem_cgroup *memcg;
1629         unsigned long flags;
1630
1631         /*
1632          * The RCU lock is held throughout the transaction.  The fast
1633          * path can get away without acquiring the memcg->move_lock
1634          * because page moving starts with an RCU grace period.
1635          */
1636         rcu_read_lock();
1637
1638         if (mem_cgroup_disabled())
1639                 return;
1640 again:
1641         memcg = page->mem_cgroup;
1642         if (unlikely(!memcg))
1643                 return;
1644
1645         if (atomic_read(&memcg->moving_account) <= 0)
1646                 return;
1647
1648         spin_lock_irqsave(&memcg->move_lock, flags);
1649         if (memcg != page->mem_cgroup) {
1650                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1651                 goto again;
1652         }
1653
1654         /*
1655          * When charge migration first begins, we can have locked and
1656          * unlocked page stat updates happening concurrently.  Track
1657          * the task who has the lock for unlock_page_memcg().
1658          */
1659         memcg->move_lock_task = current;
1660         memcg->move_lock_flags = flags;
1661
1662         return;
1663 }
1664 EXPORT_SYMBOL(lock_page_memcg);
1665
1666 /**
1667  * unlock_page_memcg - unlock a page->mem_cgroup binding
1668  * @page: the page
1669  */
1670 void unlock_page_memcg(struct page *page)
1671 {
1672         struct mem_cgroup *memcg = page->mem_cgroup;
1673
1674         if (memcg && memcg->move_lock_task == current) {
1675                 unsigned long flags = memcg->move_lock_flags;
1676
1677                 memcg->move_lock_task = NULL;
1678                 memcg->move_lock_flags = 0;
1679
1680                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1681         }
1682
1683         rcu_read_unlock();
1684 }
1685 EXPORT_SYMBOL(unlock_page_memcg);
1686
/*
 * Size of the first charge attempt.  "32" comes from vmscan.c's magic value.
 * consume_stock() refuses requests larger than this.
 * TODO: bigger numbers may be necessary on big iron.
 */
#define CHARGE_BATCH	32U

/* Per-cpu cache of pages that are already charged to one memcg. */
struct memcg_stock_pcp {
	struct mem_cgroup *cached; /* this is never the root cgroup */
	unsigned int nr_pages;	/* pages in stock, charged to @cached */
	struct work_struct work; /* NOTE(review): presumably the drain work - confirm */
	unsigned long flags;	/* NOTE(review): presumably bit flags, see below - confirm */
#define FLUSHING_CACHED_CHARGE	0
};
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
/* Local lock taken with irqsave by consume_stock() around stock accesses. */
static DEFINE_LOCAL_IRQ_LOCK(memcg_stock_ll);
/* NOTE(review): users of this mutex are outside this part of the file. */
static DEFINE_MUTEX(percpu_charge_mutex);
1702
1703 /**
1704  * consume_stock: Try to consume stocked charge on this cpu.
1705  * @memcg: memcg to consume from.
1706  * @nr_pages: how many pages to charge.
1707  *
1708  * The charges will only happen if @memcg matches the current cpu's memcg
1709  * stock, and at least @nr_pages are available in that stock.  Failure to
1710  * service an allocation will refill the stock.
1711  *
1712  * returns true if successful, false otherwise.
1713  */
1714 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1715 {
1716         struct memcg_stock_pcp *stock;
1717         unsigned long flags;
1718         bool ret = false;
1719
1720         if (nr_pages > CHARGE_BATCH)
1721                 return ret;
1722
1723         local_lock_irqsave(memcg_stock_ll, flags);
1724
1725         stock = this_cpu_ptr(&memcg_stock);
1726         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1727                 stock->nr_pages -= nr_pages;
1728                 ret = true;
1729         }
1730
1731         local_unlock_irqrestore(memcg_stock_ll, flags);
1732
1733         return ret;
1734 }
1735
1736 /*
1737  * Returns stocks cached in percpu and reset cached information.
1738  */
1739 static void drain_stock(struct memcg_stock_pcp *stock)
1740 {
1741         struct mem_cgroup *old = stock->cached;
1742
1743         if (stock->nr_pages) {
1744                 page_counter_uncharge(&old->memory, stock->nr_pages);
1745                 if (do_memsw_account())
1746                         page_counter_uncharge(&old->memsw, stock->nr_pages);
1747                 css_put_many(&old->css, stock->nr_pages);
1748                 stock->nr_pages = 0;
1749         }
1750         stock->cached = NULL;
1751 }
1752
1753 static void drain_local_stock(struct work_struct *dummy)
1754 {
1755         struct memcg_stock_pcp *stock;
1756         unsigned long flags;
1757
1758         local_lock_irqsave(memcg_stock_ll, flags);
1759
1760         stock = this_cpu_ptr(&memcg_stock);
1761         drain_stock(stock);
1762         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
1763
1764         local_unlock_irqrestore(memcg_stock_ll, flags);
1765 }
1766
1767 /*
1768  * Cache charges(val) to local per_cpu area.
1769  * This will be consumed by consume_stock() function, later.
1770  */
1771 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1772 {
1773         struct memcg_stock_pcp *stock;
1774         unsigned long flags;
1775
1776         local_lock_irqsave(memcg_stock_ll, flags);
1777
1778         stock = this_cpu_ptr(&memcg_stock);
1779         if (stock->cached != memcg) { /* reset if necessary */
1780                 drain_stock(stock);
1781                 stock->cached = memcg;
1782         }
1783         stock->nr_pages += nr_pages;
1784
1785         local_unlock_irqrestore(memcg_stock_ll, flags);
1786 }
1787
1788 /*
1789  * Drains all per-CPU charge caches for given root_memcg resp. subtree
1790  * of the hierarchy under it.
1791  */
1792 static void drain_all_stock(struct mem_cgroup *root_memcg)
1793 {
1794         int cpu, curcpu;
1795
1796         /* If someone's already draining, avoid adding running more workers. */
1797         if (!mutex_trylock(&percpu_charge_mutex))
1798                 return;
1799         /* Notify other cpus that system-wide "drain" is running */
1800         get_online_cpus();
1801         curcpu = get_cpu_light();
1802         for_each_online_cpu(cpu) {
1803                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
1804                 struct mem_cgroup *memcg;
1805
1806                 memcg = stock->cached;
1807                 if (!memcg || !stock->nr_pages)
1808                         continue;
1809                 if (!mem_cgroup_is_descendant(memcg, root_memcg))
1810                         continue;
1811                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
1812                         if (cpu == curcpu)
1813                                 drain_local_stock(&stock->work);
1814                         else
1815                                 schedule_work_on(cpu, &stock->work);
1816                 }
1817         }
1818         put_cpu_light();
1819         put_online_cpus();
1820         mutex_unlock(&percpu_charge_mutex);
1821 }
1822
1823 static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
1824                                         unsigned long action,
1825                                         void *hcpu)
1826 {
1827         int cpu = (unsigned long)hcpu;
1828         struct memcg_stock_pcp *stock;
1829
1830         if (action == CPU_ONLINE)
1831                 return NOTIFY_OK;
1832
1833         if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
1834                 return NOTIFY_OK;
1835
1836         stock = &per_cpu(memcg_stock, cpu);
1837         drain_stock(stock);
1838         return NOTIFY_OK;
1839 }
1840
1841 static void reclaim_high(struct mem_cgroup *memcg,
1842                          unsigned int nr_pages,
1843                          gfp_t gfp_mask)
1844 {
1845         do {
1846                 if (page_counter_read(&memcg->memory) <= memcg->high)
1847                         continue;
1848                 mem_cgroup_events(memcg, MEMCG_HIGH, 1);
1849                 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
1850         } while ((memcg = parent_mem_cgroup(memcg)));
1851 }
1852
1853 static void high_work_func(struct work_struct *work)
1854 {
1855         struct mem_cgroup *memcg;
1856
1857         memcg = container_of(work, struct mem_cgroup, high_work);
1858         reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL);
1859 }
1860
1861 /*
1862  * Scheduled by try_charge() to be executed from the userland return path
1863  * and reclaims memory over the high limit.
1864  */
1865 void mem_cgroup_handle_over_high(void)
1866 {
1867         unsigned int nr_pages = current->memcg_nr_pages_over_high;
1868         struct mem_cgroup *memcg;
1869
1870         if (likely(!nr_pages))
1871                 return;
1872
1873         memcg = get_mem_cgroup_from_mm(current->mm);
1874         reclaim_high(memcg, nr_pages, GFP_KERNEL);
1875         css_put(&memcg->css);
1876         current->memcg_nr_pages_over_high = 0;
1877 }
1878
1879 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
1880                       unsigned int nr_pages)
1881 {
1882         unsigned int batch = max(CHARGE_BATCH, nr_pages);
1883         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
1884         struct mem_cgroup *mem_over_limit;
1885         struct page_counter *counter;
1886         unsigned long nr_reclaimed;
1887         bool may_swap = true;
1888         bool drained = false;
1889
1890         if (mem_cgroup_is_root(memcg))
1891                 return 0;
1892 retry:
1893         if (consume_stock(memcg, nr_pages))
1894                 return 0;
1895
1896         if (!do_memsw_account() ||
1897             page_counter_try_charge(&memcg->memsw, batch, &counter)) {
1898                 if (page_counter_try_charge(&memcg->memory, batch, &counter))
1899                         goto done_restock;
1900                 if (do_memsw_account())
1901                         page_counter_uncharge(&memcg->memsw, batch);
1902                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
1903         } else {
1904                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
1905                 may_swap = false;
1906         }
1907
1908         if (batch > nr_pages) {
1909                 batch = nr_pages;
1910                 goto retry;
1911         }
1912
1913         /*
1914          * Unlike in global OOM situations, memcg is not in a physical
1915          * memory shortage.  Allow dying and OOM-killed tasks to
1916          * bypass the last charges so that they can exit quickly and
1917          * free their memory.
1918          */
1919         if (unlikely(test_thread_flag(TIF_MEMDIE) ||
1920                      fatal_signal_pending(current) ||
1921                      current->flags & PF_EXITING))
1922                 goto force;
1923
1924         /*
1925          * Prevent unbounded recursion when reclaim operations need to
1926          * allocate memory. This might exceed the limits temporarily,
1927          * but we prefer facilitating memory reclaim and getting back
1928          * under the limit over triggering OOM kills in these cases.
1929          */
1930         if (unlikely(current->flags & PF_MEMALLOC))
1931                 goto force;
1932
1933         if (unlikely(task_in_memcg_oom(current)))
1934                 goto nomem;
1935
1936         if (!gfpflags_allow_blocking(gfp_mask))
1937                 goto nomem;
1938
1939         mem_cgroup_events(mem_over_limit, MEMCG_MAX, 1);
1940
1941         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
1942                                                     gfp_mask, may_swap);
1943
1944         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
1945                 goto retry;
1946
1947         if (!drained) {
1948                 drain_all_stock(mem_over_limit);
1949                 drained = true;
1950                 goto retry;
1951         }
1952
1953         if (gfp_mask & __GFP_NORETRY)
1954                 goto nomem;
1955         /*
1956          * Even though the limit is exceeded at this point, reclaim
1957          * may have been able to free some pages.  Retry the charge
1958          * before killing the task.
1959          *
1960          * Only for regular pages, though: huge pages are rather
1961          * unlikely to succeed so close to the limit, and we fall back
1962          * to regular pages anyway in case of failure.
1963          */
1964         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
1965                 goto retry;
1966         /*
1967          * At task move, charge accounts can be doubly counted. So, it's
1968          * better to wait until the end of task_move if something is going on.
1969          */
1970         if (mem_cgroup_wait_acct_move(mem_over_limit))
1971                 goto retry;
1972
1973         if (nr_retries--)
1974                 goto retry;
1975
1976         if (gfp_mask & __GFP_NOFAIL)
1977                 goto force;
1978
1979         if (fatal_signal_pending(current))
1980                 goto force;
1981
1982         mem_cgroup_events(mem_over_limit, MEMCG_OOM, 1);
1983
1984         mem_cgroup_oom(mem_over_limit, gfp_mask,
1985                        get_order(nr_pages * PAGE_SIZE));
1986 nomem:
1987         if (!(gfp_mask & __GFP_NOFAIL))
1988                 return -ENOMEM;
1989 force:
1990         /*
1991          * The allocation either can't fail or will lead to more memory
1992          * being freed very soon.  Allow memory usage go over the limit
1993          * temporarily by force charging it.
1994          */
1995         page_counter_charge(&memcg->memory, nr_pages);
1996         if (do_memsw_account())
1997                 page_counter_charge(&memcg->memsw, nr_pages);
1998         css_get_many(&memcg->css, nr_pages);
1999
2000         return 0;
2001
2002 done_restock:
2003         css_get_many(&memcg->css, batch);
2004         if (batch > nr_pages)
2005                 refill_stock(memcg, batch - nr_pages);
2006
2007         /*
2008          * If the hierarchy is above the normal consumption range, schedule
2009          * reclaim on returning to userland.  We can perform reclaim here
2010          * if __GFP_RECLAIM but let's always punt for simplicity and so that
2011          * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2012          * not recorded as it most likely matches current's and won't
2013          * change in the meantime.  As high limit is checked again before
2014          * reclaim, the cost of mismatch is negligible.
2015          */
2016         do {
2017                 if (page_counter_read(&memcg->memory) > memcg->high) {
2018                         /* Don't bother a random interrupted task */
2019                         if (in_interrupt()) {
2020                                 schedule_work(&memcg->high_work);
2021                                 break;
2022                         }
2023                         current->memcg_nr_pages_over_high += batch;
2024                         set_notify_resume(current);
2025                         break;
2026                 }
2027         } while ((memcg = parent_mem_cgroup(memcg)));
2028
2029         return 0;
2030 }
2031
2032 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2033 {
2034         if (mem_cgroup_is_root(memcg))
2035                 return;
2036
2037         page_counter_uncharge(&memcg->memory, nr_pages);
2038         if (do_memsw_account())
2039                 page_counter_uncharge(&memcg->memsw, nr_pages);
2040
2041         css_put_many(&memcg->css, nr_pages);
2042 }
2043
2044 static void lock_page_lru(struct page *page, int *isolated)
2045 {
2046         struct zone *zone = page_zone(page);
2047
2048         spin_lock_irq(zone_lru_lock(zone));
2049         if (PageLRU(page)) {
2050                 struct lruvec *lruvec;
2051
2052                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2053                 ClearPageLRU(page);
2054                 del_page_from_lru_list(page, lruvec, page_lru(page));
2055                 *isolated = 1;
2056         } else
2057                 *isolated = 0;
2058 }
2059
2060 static void unlock_page_lru(struct page *page, int isolated)
2061 {
2062         struct zone *zone = page_zone(page);
2063
2064         if (isolated) {
2065                 struct lruvec *lruvec;
2066
2067                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2068                 VM_BUG_ON_PAGE(PageLRU(page), page);
2069                 SetPageLRU(page);
2070                 add_page_to_lru_list(page, lruvec, page_lru(page));
2071         }
2072         spin_unlock_irq(zone_lru_lock(zone));
2073 }
2074
2075 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2076                           bool lrucare)
2077 {
2078         int isolated;
2079
2080         VM_BUG_ON_PAGE(page->mem_cgroup, page);
2081
2082         /*
2083          * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
2084          * may already be on some other mem_cgroup's LRU.  Take care of it.
2085          */
2086         if (lrucare)
2087                 lock_page_lru(page, &isolated);
2088
2089         /*
2090          * Nobody should be changing or seriously looking at
2091          * page->mem_cgroup at this point:
2092          *
2093          * - the page is uncharged
2094          *
2095          * - the page is off-LRU
2096          *
2097          * - an anonymous fault has exclusive page access, except for
2098          *   a locked page table
2099          *
2100          * - a page cache insertion, a swapin fault, or a migration
2101          *   have the page locked
2102          */
2103         page->mem_cgroup = memcg;
2104
2105         if (lrucare)
2106                 unlock_page_lru(page, isolated);
2107 }
2108
2109 #ifndef CONFIG_SLOB
2110 static int memcg_alloc_cache_id(void)
2111 {
2112         int id, size;
2113         int err;
2114
2115         id = ida_simple_get(&memcg_cache_ida,
2116                             0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2117         if (id < 0)
2118                 return id;
2119
2120         if (id < memcg_nr_cache_ids)
2121                 return id;
2122
2123         /*
2124          * There's no space for the new id in memcg_caches arrays,
2125          * so we have to grow them.
2126          */
2127         down_write(&memcg_cache_ids_sem);
2128
2129         size = 2 * (id + 1);
2130         if (size < MEMCG_CACHES_MIN_SIZE)
2131                 size = MEMCG_CACHES_MIN_SIZE;
2132         else if (size > MEMCG_CACHES_MAX_SIZE)
2133                 size = MEMCG_CACHES_MAX_SIZE;
2134
2135         err = memcg_update_all_caches(size);
2136         if (!err)
2137                 err = memcg_update_all_list_lrus(size);
2138         if (!err)
2139                 memcg_nr_cache_ids = size;
2140
2141         up_write(&memcg_cache_ids_sem);
2142
2143         if (err) {
2144                 ida_simple_remove(&memcg_cache_ida, id);
2145                 return err;
2146         }
2147         return id;
2148 }
2149
/* Return a kmem cache @id to memcg_cache_ida. */
static void memcg_free_cache_id(int id)
{
	ida_simple_remove(&memcg_cache_ida, id);
}
2154
/*
 * Deferred request to create the per-memcg copy of a kmem cache.
 * Allocated in __memcg_schedule_kmem_cache_create() and freed by
 * memcg_kmem_cache_create_func().
 */
struct memcg_kmem_cache_create_work {
	struct mem_cgroup *memcg;	/* css reference held while queued */
	struct kmem_cache *cachep;	/* cache to create a memcg copy of */
	struct work_struct work;
};
2160
2161 static void memcg_kmem_cache_create_func(struct work_struct *w)
2162 {
2163         struct memcg_kmem_cache_create_work *cw =
2164                 container_of(w, struct memcg_kmem_cache_create_work, work);
2165         struct mem_cgroup *memcg = cw->memcg;
2166         struct kmem_cache *cachep = cw->cachep;
2167
2168         memcg_create_kmem_cache(memcg, cachep);
2169
2170         css_put(&memcg->css);
2171         kfree(cw);
2172 }
2173
2174 /*
2175  * Enqueue the creation of a per-memcg kmem_cache.
2176  */
2177 static void __memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2178                                                struct kmem_cache *cachep)
2179 {
2180         struct memcg_kmem_cache_create_work *cw;
2181
2182         cw = kmalloc(sizeof(*cw), GFP_NOWAIT);
2183         if (!cw)
2184                 return;
2185
2186         css_get(&memcg->css);
2187
2188         cw->memcg = memcg;
2189         cw->cachep = cachep;
2190         INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2191
2192         schedule_work(&cw->work);
2193 }
2194
/*
 * Queue creation of the per-memcg copy of @cachep for @memcg, with
 * current->memcg_kmem_skip_account set for the duration of the call.
 */
static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
					     struct kmem_cache *cachep)
{
	/*
	 * We need to stop accounting when we kmalloc, because if the
	 * corresponding kmalloc cache is not yet created, the first allocation
	 * in __memcg_schedule_kmem_cache_create will recurse.
	 *
	 * However, it is better to enclose the whole function. Depending on
	 * the debugging options enabled, INIT_WORK(), for instance, can
	 * trigger an allocation. This, too, will make us recurse. Because at
	 * this point we can't allow ourselves back into memcg_kmem_get_cache,
	 * the safest choice is to do it like this, wrapping the whole function.
	 */
	current->memcg_kmem_skip_account = 1;
	__memcg_schedule_kmem_cache_create(memcg, cachep);
	current->memcg_kmem_skip_account = 0;
}
2213
2214 static inline bool memcg_kmem_bypass(void)
2215 {
2216         if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2217                 return true;
2218         return false;
2219 }
2220
2221 /**
2222  * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2223  * @cachep: the original global kmem cache
2224  *
2225  * Return the kmem_cache we're supposed to use for a slab allocation.
2226  * We try to use the current memcg's version of the cache.
2227  *
2228  * If the cache does not exist yet, if we are the first user of it, we
2229  * create it asynchronously in a workqueue and let the current allocation
2230  * go through with the original cache.
2231  *
2232  * This function takes a reference to the cache it returns to assure it
2233  * won't get destroyed while we are working with it. Once the caller is
2234  * done with it, memcg_kmem_put_cache() must be called to release the
2235  * reference.
2236  */
2237 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2238 {
2239         struct mem_cgroup *memcg;
2240         struct kmem_cache *memcg_cachep;
2241         int kmemcg_id;
2242
2243         VM_BUG_ON(!is_root_cache(cachep));
2244
2245         if (memcg_kmem_bypass())
2246                 return cachep;
2247
2248         if (current->memcg_kmem_skip_account)
2249                 return cachep;
2250
2251         memcg = get_mem_cgroup_from_mm(current->mm);
2252         kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2253         if (kmemcg_id < 0)
2254                 goto out;
2255
2256         memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2257         if (likely(memcg_cachep))
2258                 return memcg_cachep;
2259
2260         /*
2261          * If we are in a safe context (can wait, and not in interrupt
2262          * context), we could be be predictable and return right away.
2263          * This would guarantee that the allocation being performed
2264          * already belongs in the new cache.
2265          *
2266          * However, there are some clashes that can arrive from locking.
2267          * For instance, because we acquire the slab_mutex while doing
2268          * memcg_create_kmem_cache, this means no further allocation
2269          * could happen with the slab_mutex held. So it's better to
2270          * defer everything.
2271          */
2272         memcg_schedule_kmem_cache_create(memcg, cachep);
2273 out:
2274         css_put(&memcg->css);
2275         return cachep;
2276 }
2277
2278 /**
2279  * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2280  * @cachep: the cache returned by memcg_kmem_get_cache
2281  */
2282 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2283 {
2284         if (!is_root_cache(cachep))
2285                 css_put(&cachep->memcg_params.memcg->css);
2286 }
2287
2288 /**
2289  * memcg_kmem_charge: charge a kmem page
2290  * @page: page to charge
2291  * @gfp: reclaim mode
2292  * @order: allocation order
2293  * @memcg: memory cgroup to charge
2294  *
2295  * Returns 0 on success, an error code on failure.
2296  */
2297 int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2298                             struct mem_cgroup *memcg)
2299 {
2300         unsigned int nr_pages = 1 << order;
2301         struct page_counter *counter;
2302         int ret;
2303
2304         ret = try_charge(memcg, gfp, nr_pages);
2305         if (ret)
2306                 return ret;
2307
2308         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2309             !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2310                 cancel_charge(memcg, nr_pages);
2311                 return -ENOMEM;
2312         }
2313
2314         page->mem_cgroup = memcg;
2315
2316         return 0;
2317 }
2318
2319 /**
2320  * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2321  * @page: page to charge
2322  * @gfp: reclaim mode
2323  * @order: allocation order
2324  *
2325  * Returns 0 on success, an error code on failure.
2326  */
2327 int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2328 {
2329         struct mem_cgroup *memcg;
2330         int ret = 0;
2331
2332         if (memcg_kmem_bypass())
2333                 return 0;
2334
2335         memcg = get_mem_cgroup_from_mm(current->mm);
2336         if (!mem_cgroup_is_root(memcg)) {
2337                 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2338                 if (!ret)
2339                         __SetPageKmemcg(page);
2340         }
2341         css_put(&memcg->css);
2342         return ret;
2343 }
2344 /**
2345  * memcg_kmem_uncharge: uncharge a kmem page
2346  * @page: page to uncharge
2347  * @order: allocation order
2348  */
2349 void memcg_kmem_uncharge(struct page *page, int order)
2350 {
2351         struct mem_cgroup *memcg = page->mem_cgroup;
2352         unsigned int nr_pages = 1 << order;
2353
2354         if (!memcg)
2355                 return;
2356
2357         VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2358
2359         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2360                 page_counter_uncharge(&memcg->kmem, nr_pages);
2361
2362         page_counter_uncharge(&memcg->memory, nr_pages);
2363         if (do_memsw_account())
2364                 page_counter_uncharge(&memcg->memsw, nr_pages);
2365
2366         page->mem_cgroup = NULL;
2367
2368         /* slab pages do not have PageKmemcg flag set */
2369         if (PageKmemcg(page))
2370                 __ClearPageKmemcg(page);
2371
2372         css_put_many(&memcg->css, nr_pages);
2373 }
2374 #endif /* !CONFIG_SLOB */
2375
2376 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2377
2378 /*
2379  * Because tail pages are not marked as "used", set it. We're under
2380  * zone_lru_lock and migration entries setup in all page mappings.
2381  */
2382 void mem_cgroup_split_huge_fixup(struct page *head)
2383 {
2384         int i;
2385
2386         if (mem_cgroup_disabled())
2387                 return;
2388
2389         for (i = 1; i < HPAGE_PMD_NR; i++)
2390                 head[i].mem_cgroup = head->mem_cgroup;
2391
2392         __this_cpu_sub(head->mem_cgroup->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
2393                        HPAGE_PMD_NR);
2394 }
2395 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2396
2397 #ifdef CONFIG_MEMCG_SWAP
2398 static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
2399                                          bool charge)
2400 {
2401         int val = (charge) ? 1 : -1;
2402         this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
2403 }
2404
2405 /**
2406  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2407  * @entry: swap entry to be moved
2408  * @from:  mem_cgroup which the entry is moved from
2409  * @to:  mem_cgroup which the entry is moved to
2410  *
2411  * It succeeds only when the swap_cgroup's record for this entry is the same
2412  * as the mem_cgroup's id of @from.
2413  *
2414  * Returns 0 on success, -EINVAL on failure.
2415  *
2416  * The caller must have charged to @to, IOW, called page_counter_charge() about
2417  * both res and memsw, and called css_get().
2418  */
2419 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2420                                 struct mem_cgroup *from, struct mem_cgroup *to)
2421 {
2422         unsigned short old_id, new_id;
2423
2424         old_id = mem_cgroup_id(from);
2425         new_id = mem_cgroup_id(to);
2426
2427         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2428                 mem_cgroup_swap_statistics(from, false);
2429                 mem_cgroup_swap_statistics(to, true);
2430                 return 0;
2431         }
2432         return -EINVAL;
2433 }
2434 #else
/* Without CONFIG_MEMCG_SWAP no swap charge can be moved: always -EINVAL. */
static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
				struct mem_cgroup *from, struct mem_cgroup *to)
{
	return -EINVAL;
}
2440 #endif
2441
2442 static DEFINE_MUTEX(memcg_limit_mutex);
2443
2444 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg,
2445                                    unsigned long limit)
2446 {
2447         unsigned long curusage;
2448         unsigned long oldusage;
2449         bool enlarge = false;
2450         int retry_count;
2451         int ret;
2452
2453         /*
2454          * For keeping hierarchical_reclaim simple, how long we should retry
2455          * is depends on callers. We set our retry-count to be function
2456          * of # of children which we should visit in this loop.
2457          */
2458         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2459                       mem_cgroup_count_children(memcg);
2460
2461         oldusage = page_counter_read(&memcg->memory);
2462
2463         do {
2464                 if (signal_pending(current)) {
2465                         ret = -EINTR;
2466                         break;
2467                 }
2468
2469                 mutex_lock(&memcg_limit_mutex);
2470                 if (limit > memcg->memsw.limit) {
2471                         mutex_unlock(&memcg_limit_mutex);
2472                         ret = -EINVAL;
2473                         break;
2474                 }
2475                 if (limit > memcg->memory.limit)
2476                         enlarge = true;
2477                 ret = page_counter_limit(&memcg->memory, limit);
2478                 mutex_unlock(&memcg_limit_mutex);
2479
2480                 if (!ret)
2481                         break;
2482
2483                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true);
2484
2485                 curusage = page_counter_read(&memcg->memory);
2486                 /* Usage is reduced ? */
2487                 if (curusage >= oldusage)
2488                         retry_count--;
2489                 else
2490                         oldusage = curusage;
2491         } while (retry_count);
2492
2493         if (!ret && enlarge)
2494                 memcg_oom_recover(memcg);
2495
2496         return ret;
2497 }
2498
2499 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
2500                                          unsigned long limit)
2501 {
2502         unsigned long curusage;
2503         unsigned long oldusage;
2504         bool enlarge = false;
2505         int retry_count;
2506         int ret;
2507
2508         /* see mem_cgroup_resize_res_limit */
2509         retry_count = MEM_CGROUP_RECLAIM_RETRIES *
2510                       mem_cgroup_count_children(memcg);
2511
2512         oldusage = page_counter_read(&memcg->memsw);
2513
2514         do {
2515                 if (signal_pending(current)) {
2516                         ret = -EINTR;
2517                         break;
2518                 }
2519
2520                 mutex_lock(&memcg_limit_mutex);
2521                 if (limit < memcg->memory.limit) {
2522                         mutex_unlock(&memcg_limit_mutex);
2523                         ret = -EINVAL;
2524                         break;
2525                 }
2526                 if (limit > memcg->memsw.limit)
2527                         enlarge = true;
2528                 ret = page_counter_limit(&memcg->memsw, limit);
2529                 mutex_unlock(&memcg_limit_mutex);
2530
2531                 if (!ret)
2532                         break;
2533
2534                 try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false);
2535
2536                 curusage = page_counter_read(&memcg->memsw);
2537                 /* Usage is reduced ? */
2538                 if (curusage >= oldusage)
2539                         retry_count--;
2540                 else
2541                         oldusage = curusage;
2542         } while (retry_count);
2543
2544         if (!ret && enlarge)
2545                 memcg_oom_recover(memcg);
2546
2547         return ret;
2548 }
2549
/*
 * Soft limit reclaim for the node described by @pgdat.
 *
 * @pgdat:         node whose soft limit tree is consulted
 * @order:         allocation order; any order above 0 is not handled and
 *                 makes the function return 0 immediately
 * @gfp_mask:      passed through to mem_cgroup_soft_reclaim()
 * @total_scanned: incremented by the scan count reported by every
 *                 mem_cgroup_soft_reclaim() call made here
 *
 * Returns the sum of the values returned by mem_cgroup_soft_reclaim().
 */
unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
                                            gfp_t gfp_mask,
                                            unsigned long *total_scanned)
{
        unsigned long nr_reclaimed = 0;
        struct mem_cgroup_per_node *mz, *next_mz = NULL;
        unsigned long reclaimed;
        int loop = 0;
        struct mem_cgroup_tree_per_node *mctz;
        unsigned long excess;
        unsigned long nr_scanned;

        if (order > 0)
                return 0;

        mctz = soft_limit_tree_node(pgdat->node_id);

        /*
         * Do not even bother to check the largest node if the root
         * is empty. Do it lockless to prevent lock bouncing. Races
         * are acceptable as soft limit is best effort anyway.
         */
        if (RB_EMPTY_ROOT(&mctz->rb_root))
                return 0;

        /*
         * This loop can run for a while, especially if mem_cgroups
         * continuously keep exceeding their soft limit and putting the
         * system under pressure.
         */
        do {
                /* Prefer the candidate picked at the end of the last pass. */
                if (next_mz)
                        mz = next_mz;
                else
                        mz = mem_cgroup_largest_soft_limit_node(mctz);
                if (!mz)
                        break;

                nr_scanned = 0;
                reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
                                                    gfp_mask, &nr_scanned);
                nr_reclaimed += reclaimed;
                *total_scanned += nr_scanned;
                spin_lock_irq(&mctz->lock);
                /*
                 * Take mz off the tree; it is re-inserted below with its
                 * freshly computed excess.
                 */
                __mem_cgroup_remove_exceeded(mz, mctz);

                /*
                 * If we failed to reclaim anything from this memory cgroup
                 * it is time to move on to the next cgroup
                 */
                next_mz = NULL;
                if (!reclaimed)
                        next_mz = __mem_cgroup_largest_soft_limit_node(mctz);

                excess = soft_limit_excess(mz->memcg);
                /*
                 * One school of thought says that we should not add
                 * back the node to the tree if reclaim returns 0.
                 * But our reclaim could return 0, simply because due
                 * to priority we are exposing a smaller subset of
                 * memory to reclaim from. Consider this as a longer
                 * term TODO.
                 */
                /* If excess == 0, no tree ops */
                __mem_cgroup_insert_exceeded(mz, mctz, excess);
                spin_unlock_irq(&mctz->lock);
                /*
                 * Drop the css reference on mz's memcg — presumably the one
                 * taken by the largest_soft_limit_node lookup; confirm there.
                 */
                css_put(&mz->memcg->css);
                loop++;
                /*
                 * Could not reclaim anything and there are no more
                 * mem cgroups to try or we seem to be looping without
                 * reclaiming anything.
                 */
                if (!nr_reclaimed &&
                        (next_mz == NULL ||
                        loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
                        break;
        } while (!nr_reclaimed);
        /* A candidate that was picked but never processed is released here. */
        if (next_mz)
                css_put(&next_mz->memcg->css);
        return nr_reclaimed;
}
2632
2633 /*
2634  * Test whether @memcg has children, dead or alive.  Note that this
2635  * function doesn't care whether @memcg has use_hierarchy enabled and
2636  * returns %true if there are child csses according to the cgroup
2637  * hierarchy.  Testing use_hierarchy is the caller's responsiblity.
2638  */
2639 static inline bool memcg_has_children(struct mem_cgroup *memcg)
2640 {
2641         bool ret;
2642
2643         rcu_read_lock();
2644         ret = css_next_child(NULL, &memcg->css);
2645         rcu_read_unlock();
2646         return ret;
2647 }
2648
2649 /*
2650  * Reclaims as many pages from the given memcg as possible.
2651  *
2652  * Caller is responsible for holding css reference for memcg.
2653  */
2654 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2655 {
2656         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2657
2658         /* we call try-to-free pages for make this cgroup empty */
2659         lru_add_drain_all();
2660         /* try to free all pages in this cgroup */
2661         while (nr_retries && page_counter_read(&memcg->memory)) {
2662                 int progress;
2663
2664                 if (signal_pending(current))
2665                         return -EINTR;
2666
2667                 progress = try_to_free_mem_cgroup_pages(memcg, 1,
2668                                                         GFP_KERNEL, true);
2669                 if (!progress) {
2670                         nr_retries--;
2671                         /* maybe some writeback is necessary */
2672                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2673                 }
2674
2675         }
2676
2677         return 0;
2678 }
2679
2680 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2681                                             char *buf, size_t nbytes,
2682                                             loff_t off)
2683 {
2684         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2685
2686         if (mem_cgroup_is_root(memcg))
2687                 return -EINVAL;
2688         return mem_cgroup_force_empty(memcg) ?: nbytes;
2689 }
2690
2691 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2692                                      struct cftype *cft)
2693 {
2694         return mem_cgroup_from_css(css)->use_hierarchy;
2695 }
2696
2697 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2698                                       struct cftype *cft, u64 val)
2699 {
2700         int retval = 0;
2701         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2702         struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2703
2704         if (memcg->use_hierarchy == val)
2705                 return 0;
2706
2707         /*
2708          * If parent's use_hierarchy is set, we can't make any modifications
2709          * in the child subtrees. If it is unset, then the change can
2710          * occur, provided the current cgroup has no children.
2711          *
2712          * For the root cgroup, parent_mem is NULL, we allow value to be
2713          * set if there are no children.
2714          */
2715         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2716                                 (val == 1 || val == 0)) {
2717                 if (!memcg_has_children(memcg))
2718                         memcg->use_hierarchy = val;
2719                 else
2720                         retval = -EBUSY;
2721         } else
2722                 retval = -EINVAL;
2723
2724         return retval;
2725 }
2726
2727 static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat)
2728 {
2729         struct mem_cgroup *iter;
2730         int i;
2731
2732         memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT);
2733
2734         for_each_mem_cgroup_tree(iter, memcg) {
2735                 for (i = 0; i < MEMCG_NR_STAT; i++)
2736                         stat[i] += mem_cgroup_read_stat(iter, i);
2737         }
2738 }
2739
2740 static void tree_events(struct mem_cgroup *memcg, unsigned long *events)
2741 {
2742         struct mem_cgroup *iter;
2743         int i;
2744
2745         memset(events, 0, sizeof(*events) * MEMCG_NR_EVENTS);
2746
2747         for_each_mem_cgroup_tree(iter, memcg) {
2748                 for (i = 0; i < MEMCG_NR_EVENTS; i++)
2749                         events[i] += mem_cgroup_read_events(iter, i);
2750         }
2751 }
2752
2753 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2754 {
2755         unsigned long val = 0;
2756
2757         if (mem_cgroup_is_root(memcg)) {
2758                 struct mem_cgroup *iter;
2759
2760                 for_each_mem_cgroup_tree(iter, memcg) {
2761                         val += mem_cgroup_read_stat(iter,
2762                                         MEM_CGROUP_STAT_CACHE);
2763                         val += mem_cgroup_read_stat(iter,
2764                                         MEM_CGROUP_STAT_RSS);
2765                         if (swap)
2766                                 val += mem_cgroup_read_stat(iter,
2767                                                 MEM_CGROUP_STAT_SWAP);
2768                 }
2769         } else {
2770                 if (!swap)
2771                         val = page_counter_read(&memcg->memory);
2772                 else
2773                         val = page_counter_read(&memcg->memsw);
2774         }
2775         return val;
2776 }
2777
/*
 * Attribute selectors for the resource files; compared against
 * MEMFILE_ATTR(cft->private) by the read, write and reset handlers.
 */
enum {
        RES_USAGE,      /* current usage of the selected counter */
        RES_LIMIT,      /* counter->limit */
        RES_MAX_USAGE,  /* counter->watermark */
        RES_FAILCNT,    /* counter->failcnt */
        RES_SOFT_LIMIT, /* memcg->soft_limit */
};
2785
2786 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
2787                                struct cftype *cft)
2788 {
2789         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2790         struct page_counter *counter;
2791
2792         switch (MEMFILE_TYPE(cft->private)) {
2793         case _MEM:
2794                 counter = &memcg->memory;
2795                 break;
2796         case _MEMSWAP:
2797                 counter = &memcg->memsw;
2798                 break;
2799         case _KMEM:
2800                 counter = &memcg->kmem;
2801                 break;
2802         case _TCP:
2803                 counter = &memcg->tcpmem;
2804                 break;
2805         default:
2806                 BUG();
2807         }
2808
2809         switch (MEMFILE_ATTR(cft->private)) {
2810         case RES_USAGE:
2811                 if (counter == &memcg->memory)
2812                         return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
2813                 if (counter == &memcg->memsw)
2814                         return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
2815                 return (u64)page_counter_read(counter) * PAGE_SIZE;
2816         case RES_LIMIT:
2817                 return (u64)counter->limit * PAGE_SIZE;
2818         case RES_MAX_USAGE:
2819                 return (u64)counter->watermark * PAGE_SIZE;
2820         case RES_FAILCNT:
2821                 return counter->failcnt;
2822         case RES_SOFT_LIMIT:
2823                 return (u64)memcg->soft_limit * PAGE_SIZE;
2824         default:
2825                 BUG();
2826         }
2827 }
2828
2829 #ifndef CONFIG_SLOB
/*
 * Bring kmem accounting online for @memcg.
 *
 * Does nothing and returns 0 when cgroup_memory_nokmem is set. Otherwise a
 * cache id is allocated, memcg_kmem_enabled_key is incremented and the
 * cgroup is marked KMEM_ONLINE.
 *
 * Returns 0 on success, or the negative value returned by
 * memcg_alloc_cache_id() when no id could be allocated.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
        int memcg_id;

        if (cgroup_memory_nokmem)
                return 0;

        /* The cgroup must not already own an id or carry a kmem state. */
        BUG_ON(memcg->kmemcg_id >= 0);
        BUG_ON(memcg->kmem_state);

        memcg_id = memcg_alloc_cache_id();
        if (memcg_id < 0)
                return memcg_id;

        static_branch_inc(&memcg_kmem_enabled_key);
        /*
         * A memory cgroup is considered kmem-online as soon as it gets
         * kmemcg_id. Setting the id after enabling static branching will
         * guarantee no one starts accounting before all call sites are
         * patched.
         */
        memcg->kmemcg_id = memcg_id;
        memcg->kmem_state = KMEM_ONLINE;

        return 0;
}
2856
/*
 * Take @memcg kmem-offline.
 *
 * No-op unless the cgroup is currently KMEM_ONLINE. Otherwise the state is
 * moved to KMEM_ALLOCATED, the cgroup's kmem caches are deactivated, its
 * kmemcg_id is replaced by the parent's (root_mem_cgroup if there is no
 * parent), its list_lru entries are drained to the parent's id and the old
 * cache id is freed.
 */
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
        struct cgroup_subsys_state *css;
        struct mem_cgroup *parent, *child;
        int kmemcg_id;

        if (memcg->kmem_state != KMEM_ONLINE)
                return;
        /*
         * Clear the online state before clearing memcg_caches array
         * entries. The slab_mutex in memcg_deactivate_kmem_caches()
         * guarantees that no cache will be created for this cgroup
         * after we are done (see memcg_create_kmem_cache()).
         */
        memcg->kmem_state = KMEM_ALLOCATED;

        memcg_deactivate_kmem_caches(memcg);

        /* Remember the old id: it is needed for draining and freeing below. */
        kmemcg_id = memcg->kmemcg_id;
        BUG_ON(kmemcg_id < 0);

        parent = parent_mem_cgroup(memcg);
        if (!parent)
                parent = root_mem_cgroup;

        /*
         * Change kmemcg_id of this cgroup and all its descendants to the
         * parent's id, and then move all entries from this cgroup's list_lrus
         * to ones of the parent. After we have finished, all list_lrus
         * corresponding to this cgroup are guaranteed to remain empty. The
         * ordering is imposed by list_lru_node->lock taken by
         * memcg_drain_all_list_lrus().
         */
        rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
        css_for_each_descendant_pre(css, &memcg->css) {
                child = mem_cgroup_from_css(css);
                BUG_ON(child->kmemcg_id != kmemcg_id);
                child->kmemcg_id = parent->kmemcg_id;
                /*
                 * Without use_hierarchy only the first visited css is
                 * updated — presumably @memcg itself, given the pre-order
                 * walk; confirm against css_for_each_descendant_pre().
                 */
                if (!memcg->use_hierarchy)
                        break;
        }
        rcu_read_unlock();

        memcg_drain_all_list_lrus(kmemcg_id, parent->kmemcg_id);

        memcg_free_cache_id(kmemcg_id);
}
2904
2905 static void memcg_free_kmem(struct mem_cgroup *memcg)
2906 {
2907         /* css_alloc() failed, offlining didn't happen */
2908         if (unlikely(memcg->kmem_state == KMEM_ONLINE))
2909                 memcg_offline_kmem(memcg);
2910
2911         if (memcg->kmem_state == KMEM_ALLOCATED) {
2912                 memcg_destroy_kmem_caches(memcg);
2913                 static_branch_dec(&memcg_kmem_enabled_key);
2914                 WARN_ON(page_counter_read(&memcg->kmem));
2915         }
2916 }
2917 #else
/*
 * CONFIG_SLOB variants of the kmem online/offline/free hooks: onlining
 * reports success without doing anything, and the other two are empty.
 */
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
        return 0;
}
static void memcg_offline_kmem(struct mem_cgroup *memcg)
{
}
static void memcg_free_kmem(struct mem_cgroup *memcg)
{
}
2928 #endif /* !CONFIG_SLOB */
2929
2930 static int memcg_update_kmem_limit(struct mem_cgroup *memcg,
2931                                    unsigned long limit)
2932 {
2933         int ret;
2934
2935         mutex_lock(&memcg_limit_mutex);
2936         ret = page_counter_limit(&memcg->kmem, limit);
2937         mutex_unlock(&memcg_limit_mutex);
2938         return ret;
2939 }
2940
2941 static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit)
2942 {
2943         int ret;
2944
2945         mutex_lock(&memcg_limit_mutex);
2946
2947         ret = page_counter_limit(&memcg->tcpmem, limit);
2948         if (ret)
2949                 goto out;
2950
2951         if (!memcg->tcpmem_active) {
2952                 /*
2953                  * The active flag needs to be written after the static_key
2954                  * update. This is what guarantees that the socket activation
2955                  * function is the last one to run. See mem_cgroup_sk_alloc()
2956                  * for details, and note that we don't mark any socket as
2957                  * belonging to this memcg until that flag is up.
2958                  *
2959                  * We need to do this, because static_keys will span multiple
2960                  * sites, but we can't control their order. If we mark a socket
2961                  * as accounted, but the accounting functions are not patched in
2962                  * yet, we'll lose accounting.
2963                  *
2964                  * We never race with the readers in mem_cgroup_sk_alloc(),
2965                  * because when this value change, the code to process it is not
2966                  * patched in yet.
2967                  */
2968                 static_branch_inc(&memcg_sockets_enabled_key);
2969                 memcg->tcpmem_active = true;
2970         }
2971 out:
2972         mutex_unlock(&memcg_limit_mutex);
2973         return ret;
2974 }
2975
2976 /*
2977  * The user of this function is...
2978  * RES_LIMIT.
2979  */
2980 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
2981                                 char *buf, size_t nbytes, loff_t off)
2982 {
2983         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2984         unsigned long nr_pages;
2985         int ret;
2986
2987         buf = strstrip(buf);
2988         ret = page_counter_memparse(buf, "-1", &nr_pages);
2989         if (ret)
2990                 return ret;
2991
2992         switch (MEMFILE_ATTR(of_cft(of)->private)) {
2993         case RES_LIMIT:
2994                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
2995                         ret = -EINVAL;
2996                         break;
2997                 }
2998                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
2999                 case _MEM:
3000                         ret = mem_cgroup_resize_limit(memcg, nr_pages);
3001                         break;
3002                 case _MEMSWAP:
3003                         ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages);
3004                         break;
3005                 case _KMEM:
3006                         ret = memcg_update_kmem_limit(memcg, nr_pages);
3007                         break;
3008                 case _TCP:
3009                         ret = memcg_update_tcp_limit(memcg, nr_pages);
3010                         break;
3011                 }
3012                 break;
3013         case RES_SOFT_LIMIT:
3014                 memcg->soft_limit = nr_pages;
3015                 ret = 0;
3016                 break;
3017         }
3018         return ret ?: nbytes;
3019 }
3020
3021 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3022                                 size_t nbytes, loff_t off)
3023 {
3024         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3025         struct page_counter *counter;
3026
3027         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3028         case _MEM:
3029                 counter = &memcg->memory;
3030                 break;
3031         case _MEMSWAP:
3032                 counter = &memcg->memsw;
3033                 break;
3034         case _KMEM:
3035                 counter = &memcg->kmem;
3036                 break;
3037         case _TCP:
3038                 counter = &memcg->tcpmem;
3039                 break;
3040         default:
3041                 BUG();
3042         }
3043
3044         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3045         case RES_MAX_USAGE:
3046                 page_counter_reset_watermark(counter);
3047                 break;
3048         case RES_FAILCNT:
3049                 counter->failcnt = 0;
3050                 break;
3051         default:
3052                 BUG();
3053         }
3054
3055         return nbytes;
3056 }
3057
3058 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3059                                         struct cftype *cft)
3060 {
3061         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3062 }
3063
3064 #ifdef CONFIG_MMU
3065 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3066                                         struct cftype *cft, u64 val)
3067 {
3068         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3069
3070         if (val & ~MOVE_MASK)
3071                 return -EINVAL;
3072
3073         /*
3074          * No kind of locking is needed in here, because ->can_attach() will
3075          * check this value once in the beginning of the process, and then carry
3076          * on with stale data. This means that changes to this value will only
3077          * affect task migrations starting after the change.
3078          */
3079         memcg->move_charge_at_immigrate = val;
3080         return 0;
3081 }
3082 #else
3083 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3084                                         struct cftype *cft, u64 val)
3085 {
3086         return -ENOSYS;
3087 }
3088 #endif
3089
3090 #ifdef CONFIG_NUMA
3091 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3092 {
3093         struct numa_stat {
3094                 const char *name;
3095                 unsigned int lru_mask;
3096         };
3097
3098         static const struct numa_stat stats[] = {
3099                 { "total", LRU_ALL },
3100                 { "file", LRU_ALL_FILE },
3101                 { "anon", LRU_ALL_ANON },
3102                 { "unevictable", BIT(LRU_UNEVICTABLE) },
3103         };
3104         const struct numa_stat *stat;
3105         int nid;
3106         unsigned long nr;
3107         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3108
3109         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3110                 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3111                 seq_printf(m, "%s=%lu", stat->name, nr);
3112                 for_each_node_state(nid, N_MEMORY) {
3113                         nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3114                                                           stat->lru_mask);
3115                         seq_printf(m, " N%d=%lu", nid, nr);
3116                 }
3117                 seq_putc(m, '\n');
3118         }
3119
3120         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3121                 struct mem_cgroup *iter;
3122
3123                 nr = 0;
3124                 for_each_mem_cgroup_tree(iter, memcg)
3125                         nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3126                 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3127                 for_each_node_state(nid, N_MEMORY) {
3128                         nr = 0;
3129                         for_each_mem_cgroup_tree(iter, memcg)
3130                                 nr += mem_cgroup_node_nr_lru_pages(
3131                                         iter, nid, stat->lru_mask);
3132                         seq_printf(m, " N%d=%lu", nid, nr);
3133                 }
3134                 seq_putc(m, '\n');
3135         }
3136
3137         return 0;
3138 }
3139 #endif /* CONFIG_NUMA */
3140
/*
 * seq_file show handler printing the statistics of a memory cgroup.
 *
 * Output, in order: the cgroup's own statistics, event counters and
 * per-LRU sizes; the effective (smallest) memory and memsw limits along
 * the path to the root; the same statistics, events and LRU sizes summed
 * over the subtree with a "total_" prefix; and, with CONFIG_DEBUG_VM,
 * the recent_rotated/recent_scanned reclaim statistics summed over all
 * online nodes. Always returns 0.
 */
static int memcg_stat_show(struct seq_file *m, void *v)
{
        struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
        unsigned long memory, memsw;
        struct mem_cgroup *mi;
        unsigned int i;

        /* The name tables must stay in sync with the enums they describe. */
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_stat_names) !=
                     MEM_CGROUP_STAT_NSTATS);
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_events_names) !=
                     MEM_CGROUP_EVENTS_NSTATS);
        BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);

        /* Local statistics, scaled by PAGE_SIZE. */
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                /* The swap statistic is only shown with memsw accounting. */
                if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
                        continue;
                seq_printf(m, "%s %lu\n", mem_cgroup_stat_names[i],
                           mem_cgroup_read_stat(memcg, i) * PAGE_SIZE);
        }

        /* Local event counters, printed unscaled. */
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++)
                seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i],
                           mem_cgroup_read_events(memcg, i));

        /* Local per-LRU sizes, scaled by PAGE_SIZE. */
        for (i = 0; i < NR_LRU_LISTS; i++)
                seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
                           mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);

        /* Hierarchical information */
        /* The effective limit is the smallest one on the way to the root. */
        memory = memsw = PAGE_COUNTER_MAX;
        for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
                memory = min(memory, mi->memory.limit);
                memsw = min(memsw, mi->memsw.limit);
        }
        seq_printf(m, "hierarchical_memory_limit %llu\n",
                   (u64)memory * PAGE_SIZE);
        if (do_memsw_account())
                seq_printf(m, "hierarchical_memsw_limit %llu\n",
                           (u64)memsw * PAGE_SIZE);

        /* Statistics summed over the subtree. */
        for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) {
                unsigned long long val = 0;

                if (i == MEM_CGROUP_STAT_SWAP && !do_memsw_account())
                        continue;
                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE;
                seq_printf(m, "total_%s %llu\n", mem_cgroup_stat_names[i], val);
        }

        /* Event counters summed over the subtree. */
        for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) {
                unsigned long long val = 0;

                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_read_events(mi, i);
                seq_printf(m, "total_%s %llu\n",
                           mem_cgroup_events_names[i], val);
        }

        /* Per-LRU sizes summed over the subtree. */
        for (i = 0; i < NR_LRU_LISTS; i++) {
                unsigned long long val = 0;

                for_each_mem_cgroup_tree(mi, memcg)
                        val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE;
                seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val);
        }

#ifdef CONFIG_DEBUG_VM
        {
                pg_data_t *pgdat;
                struct mem_cgroup_per_node *mz;
                struct zone_reclaim_stat *rstat;
                unsigned long recent_rotated[2] = {0, 0};
                unsigned long recent_scanned[2] = {0, 0};

                /* Index 0 is printed as "anon" and index 1 as "file" below. */
                for_each_online_pgdat(pgdat) {
                        mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
                        rstat = &mz->lruvec.reclaim_stat;

                        recent_rotated[0] += rstat->recent_rotated[0];
                        recent_rotated[1] += rstat->recent_rotated[1];
                        recent_scanned[0] += rstat->recent_scanned[0];
                        recent_scanned[1] += rstat->recent_scanned[1];
                }
                seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
                seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
                seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
                seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
        }
#endif

        return 0;
}
3234
3235 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3236                                       struct cftype *cft)
3237 {
3238         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3239
3240         return mem_cgroup_swappiness(memcg);
3241 }
3242
3243 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3244                                        struct cftype *cft, u64 val)
3245 {
3246         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3247
3248         if (val > 100)
3249                 return -EINVAL;
3250
3251         if (css->parent)
3252                 memcg->swappiness = val;
3253         else
3254                 vm_swappiness = val;
3255
3256         return 0;
3257 }
3258
/*
 * Signal the eventfds of all thresholds of @memcg that the current usage
 * has crossed since the last call, and update current_threshold.
 *
 * @swap selects the memsw thresholds and memory+swap usage instead of the
 * memory-only ones. The threshold array is read under rcu_read_lock();
 * nothing is done when no array is installed.
 */
static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
{
        struct mem_cgroup_threshold_ary *t;
        unsigned long usage;
        int i;

        rcu_read_lock();
        if (!swap)
                t = rcu_dereference(memcg->thresholds.primary);
        else
                t = rcu_dereference(memcg->memsw_thresholds.primary);

        if (!t)
                goto unlock;

        usage = mem_cgroup_usage(memcg, swap);

        /*
         * current_threshold points to threshold just below or equal to usage.
         * If it's not true, a threshold was crossed after last
         * call of __mem_cgroup_threshold().
         */
        i = t->current_threshold;

        /*
         * Iterate backward over array of thresholds starting from
         * current_threshold and check if a threshold is crossed.
         * If none of thresholds below usage is crossed, we read
         * only one element of the array here.
         */
        for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
                eventfd_signal(t->entries[i].eventfd, 1);

        /* i = current_threshold + 1 */
        i++;

        /*
         * Iterate forward over array of thresholds starting from
         * current_threshold+1 and check if a threshold is crossed.
         * If none of thresholds above usage is crossed, we read
         * only one element of the array here.
         */
        for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
                eventfd_signal(t->entries[i].eventfd, 1);

        /* Update current_threshold */
        /* i stopped on the first entry above usage, so step back by one. */
        t->current_threshold = i - 1;
unlock:
        rcu_read_unlock();
}
3309
3310 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3311 {
3312         while (memcg) {
3313                 __mem_cgroup_threshold(memcg, false);
3314                 if (do_memsw_account())
3315                         __mem_cgroup_threshold(memcg, true);
3316
3317                 memcg = parent_mem_cgroup(memcg);
3318         }
3319 }
3320
3321 static int compare_thresholds(const void *a, const void *b)
3322 {
3323         const struct mem_cgroup_threshold *_a = a;
3324         const struct mem_cgroup_threshold *_b = b;
3325
3326         if (_a->threshold > _b->threshold)
3327                 return 1;
3328
3329         if (_a->threshold < _b->threshold)
3330                 return -1;
3331
3332         return 0;
3333 }
3334
3335 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3336 {
3337         struct mem_cgroup_eventfd_list *ev;
3338
3339         spin_lock(&memcg_oom_lock);
3340
3341         list_for_each_entry(ev, &memcg->oom_notify, list)
3342                 eventfd_signal(ev->eventfd, 1);
3343
3344         spin_unlock(&memcg_oom_lock);
3345         return 0;
3346 }
3347
3348 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3349 {
3350         struct mem_cgroup *iter;
3351
3352         for_each_mem_cgroup_tree(iter, memcg)
3353                 mem_cgroup_oom_notify_cb(iter);
3354 }
3355
/*
 * Register @eventfd to be signalled when the usage of @memcg crosses the
 * threshold described by @args.
 *
 * @memcg:   cgroup the threshold is attached to
 * @eventfd: eventfd context stored with the new threshold entry
 * @args:    threshold value, parsed with page_counter_memparse()
 * @type:    _MEM for memory usage, _MEMSWAP for memory+swap usage; any
 *           other value hits BUG()
 *
 * A new, sorted threshold array containing the old entries plus the new
 * one is built and published as the primary array; the previous primary
 * is kept as the spare. Runs under memcg->thresholds_lock.
 *
 * Returns 0 on success, the parse error for a bad @args, or -ENOMEM.
 */
static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
        struct eventfd_ctx *eventfd, const char *args, enum res_type type)
{
        struct mem_cgroup_thresholds *thresholds;
        struct mem_cgroup_threshold_ary *new;
        unsigned long threshold;
        unsigned long usage;
        int i, size, ret;

        /* NOTE(review): "-1" is presumably the string meaning "no limit" — confirm. */
        ret = page_counter_memparse(args, "-1", &threshold);
        if (ret)
                return ret;

        mutex_lock(&memcg->thresholds_lock);

        if (type == _MEM) {
                thresholds = &memcg->thresholds;
                usage = mem_cgroup_usage(memcg, false);
        } else if (type == _MEMSWAP) {
                thresholds = &memcg->memsw_thresholds;
                usage = mem_cgroup_usage(memcg, true);
        } else
                BUG();

        /* Check if a threshold crossed before adding a new one */
        if (thresholds->primary)
                __mem_cgroup_threshold(memcg, type == _MEMSWAP);

        /* The new array holds every existing entry plus the one being added. */
        size = thresholds->primary ? thresholds->primary->size + 1 : 1;

        /* Allocate memory for new array of thresholds */
        new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
                        GFP_KERNEL);
        if (!new) {
                ret = -ENOMEM;
                goto unlock;
        }
        new->size = size;

        /* Copy thresholds (if any) to new array */
        if (thresholds->primary) {
                memcpy(new->entries, thresholds->primary->entries, (size - 1) *
                                sizeof(struct mem_cgroup_threshold));
        }

        /* Add new threshold */
        new->entries[size - 1].eventfd = eventfd;
        new->entries[size - 1].threshold = threshold;

        /* Sort thresholds. Registering of new threshold isn't time-critical */
        sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
                        compare_thresholds, NULL);

        /* Find current threshold */
        /* -1 means that no entry is at or below the current usage. */
        new->current_threshold = -1;
        for (i = 0; i < size; i++) {
                if (new->entries[i].threshold <= usage) {
                        /*
                         * new->current_threshold will not be used until
                         * rcu_assign_pointer(), so it's safe to increment
                         * it here.
                         */
                        ++new->current_threshold;
                } else
                        break;
        }

        /* Free old spare buffer and save old primary buffer as spare */
        kfree(thresholds->spare);
        thresholds->spare = thresholds->primary;

        rcu_assign_pointer(thresholds->primary, new);

        /* To be sure that nobody uses thresholds */
        synchronize_rcu();

unlock:
        mutex_unlock(&memcg->thresholds_lock);

        return ret;
}
3437
/*
 * Register a threshold event on the memory-only (_MEM) threshold set.
 * Thin wrapper; returns whatever __mem_cgroup_usage_register_event() returns.
 */
static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
        struct eventfd_ctx *eventfd, const char *args)
{
        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
}
3443
/*
 * Register a threshold event on the memory+swap (_MEMSWAP) threshold set.
 * Thin wrapper; returns whatever __mem_cgroup_usage_register_event() returns.
 */
static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
        struct eventfd_ctx *eventfd, const char *args)
{
        return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
}
3449
3450 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3451         struct eventfd_ctx *eventfd, enum res_type type)
3452 {
3453         struct mem_cgroup_thresholds *thresholds;
3454         struct mem_cgroup_threshold_ary *new;
3455         unsigned long usage;
3456         int i, j, size;
3457
3458         mutex_lock(&memcg->thresholds_lock);
3459
3460         if (type == _MEM) {
3461                 thresholds = &memcg->thresholds;
3462                 usage = mem_cgroup_usage(memcg, false);
3463         } else if (type == _MEMSWAP) {
3464                 thresholds = &memcg->memsw_thresholds;
3465                 usage = mem_cgroup_usage(memcg, true);
3466         } else
3467                 BUG();
3468
3469         if (!thresholds->primary)
3470                 goto unlock;
3471
3472         /* Check if a threshold crossed before removing */
3473         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3474
3475         /* Calculate new number of threshold */
3476         size = 0;
3477         for (i = 0; i < thresholds->primary->size; i++) {
3478                 if (thresholds->primary->entries[i].eventfd != eventfd)
3479                         size++;
3480         }
3481
3482         new = thresholds->spare;
3483
3484         /* Set thresholds array to NULL if we don't have thresholds */
3485         if (!size) {
3486                 kfree(new);
3487                 new = NULL;
3488                 goto swap_buffers;
3489         }
3490
3491         new->size = size;
3492
3493         /* Copy thresholds and find current threshold */
3494         new->current_threshold = -1;
3495         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3496                 if (thresholds->primary->entries[i].eventfd == eventfd)
3497                         continue;
3498
3499                 new->entries[j] = thresholds->primary->entries[i];
3500                 if (new->entries[j].threshold <= usage) {
3501                         /*
3502                          * new->current_threshold will not be used
3503                          * until rcu_assign_pointer(), so it's safe to increment
3504                          * it here.
3505                          */
3506                         ++new->current_threshold;
3507                 }
3508                 j++;
3509         }
3510
3511 swap_buffers:
3512         /* Swap primary and spare array */
3513         thresholds->spare = thresholds->primary;
3514
3515         rcu_assign_pointer(thresholds->primary, new);
3516
3517         /* To be sure that nobody uses thresholds */
3518         synchronize_rcu();
3519
3520         /* If all events are unregistered, free the spare array */
3521         if (!new) {
3522                 kfree(thresholds->spare);
3523                 thresholds->spare = NULL;
3524         }
3525 unlock:
3526         mutex_unlock(&memcg->thresholds_lock);
3527 }
3528
3529 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3530         struct eventfd_ctx *eventfd)
3531 {
3532         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3533 }
3534
3535 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3536         struct eventfd_ctx *eventfd)
3537 {
3538         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3539 }
3540
3541 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3542         struct eventfd_ctx *eventfd, const char *args)
3543 {
3544         struct mem_cgroup_eventfd_list *event;
3545
3546         event = kmalloc(sizeof(*event), GFP_KERNEL);
3547         if (!event)
3548                 return -ENOMEM;
3549
3550         spin_lock(&memcg_oom_lock);
3551
3552         event->eventfd = eventfd;
3553         list_add(&event->list, &memcg->oom_notify);
3554
3555         /* already in OOM ? */
3556         if (memcg->under_oom)
3557                 eventfd_signal(eventfd, 1);
3558         spin_unlock(&memcg_oom_lock);
3559
3560         return 0;
3561 }
3562
3563 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3564         struct eventfd_ctx *eventfd)
3565 {
3566         struct mem_cgroup_eventfd_list *ev, *tmp;
3567
3568         spin_lock(&memcg_oom_lock);
3569
3570         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3571                 if (ev->eventfd == eventfd) {
3572                         list_del(&ev->list);
3573                         kfree(ev);
3574                 }
3575         }
3576
3577         spin_unlock(&memcg_oom_lock);
3578 }
3579
3580 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3581 {
3582         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3583
3584         seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3585         seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3586         return 0;
3587 }
3588
3589 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3590         struct cftype *cft, u64 val)
3591 {
3592         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3593
3594         /* cannot set to root cgroup and only 0 and 1 are allowed */
3595         if (!css->parent || !((val == 0) || (val == 1)))
3596                 return -EINVAL;
3597
3598         memcg->oom_kill_disable = val;
3599         if (!val)
3600                 memcg_oom_recover(memcg);
3601
3602         return 0;
3603 }
3604
3605 #ifdef CONFIG_CGROUP_WRITEBACK
3606
/* Return the address of @memcg's cgwb_list list head. */
struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg)
{
        return &memcg->cgwb_list;
}
3611
/*
 * Initialise @memcg's writeback domain with allocation flags @gfp.
 * Returns the result of wb_domain_init().
 */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
        return wb_domain_init(&memcg->cgwb_domain, gfp);
}
3616
/* Tear down @memcg's writeback domain via wb_domain_exit(). */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
        wb_domain_exit(&memcg->cgwb_domain);
}
3621
/* Forward a size change of @memcg to its writeback domain. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
        wb_domain_size_changed(&memcg->cgwb_domain);
}
3626
3627 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3628 {
3629         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3630
3631         if (!memcg->css.parent)
3632                 return NULL;
3633
3634         return &memcg->cgwb_domain;
3635 }
3636
3637 /**
3638  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3639  * @wb: bdi_writeback in question
3640  * @pfilepages: out parameter for number of file pages
3641  * @pheadroom: out parameter for number of allocatable pages according to memcg
3642  * @pdirty: out parameter for number of dirty pages
3643  * @pwriteback: out parameter for number of pages under writeback
3644  *
3645  * Determine the numbers of file, headroom, dirty, and writeback pages in
3646  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3647  * is a bit more involved.
3648  *
3649  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3650  * headroom is calculated as the lowest headroom of itself and the
3651  * ancestors.  Note that this doesn't consider the actual amount of
3652  * available memory in the system.  The caller should further cap
3653  * *@pheadroom accordingly.
3654  */
3655 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3656                          unsigned long *pheadroom, unsigned long *pdirty,
3657                          unsigned long *pwriteback)
3658 {
3659         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3660         struct mem_cgroup *parent;
3661
3662         *pdirty = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_DIRTY);
3663
3664         /* this should eventually include NR_UNSTABLE_NFS */
3665         *pwriteback = mem_cgroup_read_stat(memcg, MEM_CGROUP_STAT_WRITEBACK);
3666         *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3667                                                      (1 << LRU_ACTIVE_FILE));
3668         *pheadroom = PAGE_COUNTER_MAX;
3669
3670         while ((parent = parent_mem_cgroup(memcg))) {
3671                 unsigned long ceiling = min(memcg->memory.limit, memcg->high);
3672                 unsigned long used = page_counter_read(&memcg->memory);
3673
3674                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3675                 memcg = parent;
3676         }
3677 }
3678
3679 #else   /* CONFIG_CGROUP_WRITEBACK */
3680
/* !CONFIG_CGROUP_WRITEBACK stub: nothing to initialise, always returns 0. */
static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
{
        return 0;
}
3685
/* !CONFIG_CGROUP_WRITEBACK stub: nothing to tear down. */
static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
{
}
3689
/* !CONFIG_CGROUP_WRITEBACK stub: size changes need no propagation. */
static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
{
}
3693
3694 #endif  /* CONFIG_CGROUP_WRITEBACK */
3695
3696 /*
3697  * DO NOT USE IN NEW FILES.
3698  *
3699  * "cgroup.event_control" implementation.
3700  *
3701  * This is way over-engineered.  It tries to support fully configurable
3702  * events for each user.  Such level of flexibility is completely
3703  * unnecessary especially in the light of the planned unified hierarchy.
3704  *
3705  * Please deprecate this and replace with something simpler if at all
3706  * possible.
3707  */
3708
3709 /*
3710  * Unregister event and free resources.
3711  *
3712  * Gets called from workqueue.
3713  */
3714 static void memcg_event_remove(struct work_struct *work)
3715 {
3716         struct mem_cgroup_event *event =
3717                 container_of(work, struct mem_cgroup_event, remove);
3718         struct mem_cgroup *memcg = event->memcg;
3719
3720         remove_wait_queue(event->wqh, &event->wait);
3721
3722         event->unregister_event(memcg, event->eventfd);
3723
3724         /* Notify userspace the event is going away. */
3725         eventfd_signal(event->eventfd, 1);
3726
3727         eventfd_ctx_put(event->eventfd);
3728         kfree(event);
3729         css_put(&memcg->css);
3730 }
3731
3732 /*
3733  * Gets called on POLLHUP on eventfd when user closes it.
3734  *
3735  * Called with wqh->lock held and interrupts disabled.
3736  */
3737 static int memcg_event_wake(wait_queue_t *wait, unsigned mode,
3738                             int sync, void *key)
3739 {
3740         struct mem_cgroup_event *event =
3741                 container_of(wait, struct mem_cgroup_event, wait);
3742         struct mem_cgroup *memcg = event->memcg;
3743         unsigned long flags = (unsigned long)key;
3744
3745         if (flags & POLLHUP) {
3746                 /*
3747                  * If the event has been detached at cgroup removal, we
3748                  * can simply return knowing the other side will cleanup
3749                  * for us.
3750                  *
3751                  * We can't race against event freeing since the other
3752                  * side will require wqh->lock via remove_wait_queue(),
3753                  * which we hold.
3754                  */
3755                 spin_lock(&memcg->event_list_lock);
3756                 if (!list_empty(&event->list)) {
3757                         list_del_init(&event->list);
3758                         /*
3759                          * We are in atomic context, but cgroup_event_remove()
3760                          * may sleep, so we have to call it in workqueue.
3761                          */
3762                         schedule_work(&event->remove);
3763                 }
3764                 spin_unlock(&memcg->event_list_lock);
3765         }
3766
3767         return 0;
3768 }
3769
3770 static void memcg_event_ptable_queue_proc(struct file *file,
3771                 wait_queue_head_t *wqh, poll_table *pt)
3772 {
3773         struct mem_cgroup_event *event =
3774                 container_of(pt, struct mem_cgroup_event, pt);
3775
3776         event->wqh = wqh;
3777         add_wait_queue(wqh, &event->wait);
3778 }
3779
3780 /*
3781  * DO NOT USE IN NEW FILES.
3782  *
3783  * Parse input and register new cgroup event handler.
3784  *
3785  * Input must be in format '<event_fd> <control_fd> <args>'.
3786  * Interpretation of args is defined by control file implementation.
3787  */
3788 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
3789                                          char *buf, size_t nbytes, loff_t off)
3790 {
3791         struct cgroup_subsys_state *css = of_css(of);
3792         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3793         struct mem_cgroup_event *event;
3794         struct cgroup_subsys_state *cfile_css;
3795         unsigned int efd, cfd;
3796         struct fd efile;
3797         struct fd cfile;
3798         const char *name;
3799         char *endp;
3800         int ret;
3801
3802         buf = strstrip(buf);
3803
3804         efd = simple_strtoul(buf, &endp, 10);
3805         if (*endp != ' ')
3806                 return -EINVAL;
3807         buf = endp + 1;
3808
3809         cfd = simple_strtoul(buf, &endp, 10);
3810         if ((*endp != ' ') && (*endp != '\0'))
3811                 return -EINVAL;
3812         buf = endp + 1;
3813
3814         event = kzalloc(sizeof(*event), GFP_KERNEL);
3815         if (!event)
3816                 return -ENOMEM;
3817
3818         event->memcg = memcg;
3819         INIT_LIST_HEAD(&event->list);
3820         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
3821         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
3822         INIT_WORK(&event->remove, memcg_event_remove);
3823
3824         efile = fdget(efd);
3825         if (!efile.file) {
3826                 ret = -EBADF;
3827                 goto out_kfree;
3828         }
3829
3830         event->eventfd = eventfd_ctx_fileget(efile.file);
3831         if (IS_ERR(event->eventfd)) {
3832                 ret = PTR_ERR(event->eventfd);
3833                 goto out_put_efile;
3834         }
3835
3836         cfile = fdget(cfd);
3837         if (!cfile.file) {
3838                 ret = -EBADF;
3839                 goto out_put_eventfd;
3840         }
3841
3842         /* the process need read permission on control file */
3843         /* AV: shouldn't we check that it's been opened for read instead? */
3844         ret = inode_permission(file_inode(cfile.file), MAY_READ);
3845         if (ret < 0)
3846                 goto out_put_cfile;
3847
3848         /*
3849          * Determine the event callbacks and set them in @event.  This used
3850          * to be done via struct cftype but cgroup core no longer knows
3851          * about these events.  The following is crude but the whole thing
3852          * is for compatibility anyway.
3853          *
3854          * DO NOT ADD NEW FILES.
3855          */
3856         name = cfile.file->f_path.dentry->d_name.name;
3857
3858         if (!strcmp(name, "memory.usage_in_bytes")) {
3859                 event->register_event = mem_cgroup_usage_register_event;
3860                 event->unregister_event = mem_cgroup_usage_unregister_event;
3861         } else if (!strcmp(name, "memory.oom_control")) {
3862                 event->register_event = mem_cgroup_oom_register_event;
3863                 event->unregister_event = mem_cgroup_oom_unregister_event;
3864         } else if (!strcmp(name, "memory.pressure_level")) {
3865                 event->register_event = vmpressure_register_event;
3866                 event->unregister_event = vmpressure_unregister_event;
3867         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
3868                 event->register_event = memsw_cgroup_usage_register_event;
3869                 event->unregister_event = memsw_cgroup_usage_unregister_event;
3870         } else {
3871                 ret = -EINVAL;
3872                 goto out_put_cfile;
3873         }
3874
3875         /*
3876          * Verify @cfile should belong to @css.  Also, remaining events are
3877          * automatically removed on cgroup destruction but the removal is
3878          * asynchronous, so take an extra ref on @css.
3879          */
3880         cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
3881                                                &memory_cgrp_subsys);
3882         ret = -EINVAL;
3883         if (IS_ERR(cfile_css))
3884                 goto out_put_cfile;
3885         if (cfile_css != css) {
3886                 css_put(cfile_css);
3887                 goto out_put_cfile;
3888         }
3889
3890         ret = event->register_event(memcg, event->eventfd, buf);
3891         if (ret)
3892                 goto out_put_css;
3893
3894         efile.file->f_op->poll(efile.file, &event->pt);
3895
3896         spin_lock(&memcg->event_list_lock);
3897         list_add(&event->list, &memcg->event_list);
3898         spin_unlock(&memcg->event_list_lock);
3899
3900         fdput(cfile);
3901         fdput(efile);
3902
3903         return nbytes;
3904
3905 out_put_css:
3906         css_put(css);
3907 out_put_cfile:
3908         fdput(cfile);
3909 out_put_eventfd:
3910         eventfd_ctx_put(event->eventfd);
3911 out_put_efile:
3912         fdput(efile);
3913 out_kfree:
3914         kfree(event);
3915
3916         return ret;
3917 }
3918
/*
 * Table of struct cftype control-file descriptors, ended by an empty
 * entry.  Entries that carry a .private value encode a resource type
 * (_MEM, _KMEM, _TCP, _OOM_TYPE) together with an attribute through
 * MEMFILE_PRIVATE().
 */
static struct cftype mem_cgroup_legacy_files[] = {
        {
                .name = "usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "soft_limit_in_bytes",
                .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "failcnt",
                .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "stat",
                .seq_show = memcg_stat_show,
        },
        {
                .name = "force_empty",
                .write = mem_cgroup_force_empty_write,
        },
        {
                .name = "use_hierarchy",
                .write_u64 = mem_cgroup_hierarchy_write,
                .read_u64 = mem_cgroup_hierarchy_read,
        },
        {
                /* write-only; parsed by memcg_write_event_control() */
                .name = "cgroup.event_control",         /* XXX: for compat */
                .write = memcg_write_event_control,
                .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
        },
        {
                .name = "swappiness",
                .read_u64 = mem_cgroup_swappiness_read,
                .write_u64 = mem_cgroup_swappiness_write,
        },
        {
                .name = "move_charge_at_immigrate",
                .read_u64 = mem_cgroup_move_charge_read,
                .write_u64 = mem_cgroup_move_charge_write,
        },
        {
                .name = "oom_control",
                .seq_show = mem_cgroup_oom_control_read,
                .write_u64 = mem_cgroup_oom_control_write,
                .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
        },
        {
                /*
                 * No handlers of its own: memcg_write_event_control()
                 * recognises this file by the name "memory.pressure_level".
                 */
                .name = "pressure_level",
        },
#ifdef CONFIG_NUMA
        {
                .name = "numa_stat",
                .seq_show = memcg_numa_stat_show,
        },
#endif
        /* kmem.*: same attributes as above for the _KMEM resource type */
        {
                .name = "kmem.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.failcnt",
                .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
#ifdef CONFIG_SLABINFO
        {
                .name = "kmem.slabinfo",
                .seq_start = slab_start,
                .seq_next = slab_next,
                .seq_stop = slab_stop,
                .seq_show = memcg_slab_show,
        },
#endif
        /* kmem.tcp.*: same attributes for the _TCP resource type */
        {
                .name = "kmem.tcp.limit_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
                .write = mem_cgroup_write,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.usage_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.failcnt",
                .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        {
                .name = "kmem.tcp.max_usage_in_bytes",
                .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
                .write = mem_cgroup_reset,
                .read_u64 = mem_cgroup_read_u64,
        },
        { },    /* terminate */
};
4049
4050 /*
4051  * Private memory cgroup IDR
4052  *
4053  * Swap-out records and page cache shadow entries need to store memcg
4054  * references in constrained space, so we maintain an ID space that is
4055  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4056  * memory-controlled cgroups to 64k.
4057  *
 * However, there usually are many references to the offline CSS after
4059  * the cgroup has been destroyed, such as page cache or reclaimable
4060  * slab objects, that don't need to hang on to the ID. We want to keep
4061  * those dead CSS from occupying IDs, or we might quickly exhaust the
4062  * relatively small ID space and prevent the creation of new cgroups
4063  * even when there are much fewer than 64k cgroups - possibly none.
4064  *
4065  * Maintain a private 16-bit ID space for memcg, and allow the ID to
4066  * be freed and recycled when it's no longer needed, which is usually
4067  * when the CSS is offlined.
4068  *
4069  * The only exception to that are records of swapped out tmpfs/shmem
4070  * pages that need to be attributed to live ancestors on swapin. But
4071  * those references are manageable from userspace.
4072  */
4073
/* Maps memcg IDs to struct mem_cgroup; queried by mem_cgroup_from_id(). */
static DEFINE_IDR(mem_cgroup_idr);
4075
4076 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4077 {
4078         VM_BUG_ON(atomic_read(&memcg->id.ref) <= 0);
4079         atomic_add(n, &memcg->id.ref);
4080 }
4081
4082 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4083 {
4084         VM_BUG_ON(atomic_read(&memcg->id.ref) < n);
4085         if (atomic_sub_and_test(n, &memcg->id.ref)) {
4086                 idr_remove(&mem_cgroup_idr, memcg->id.id);
4087                 memcg->id.id = 0;
4088
4089                 /* Memcg ID pins CSS */
4090                 css_put(&memcg->css);
4091         }
4092 }
4093
/* Take a single reference on @memcg's ID. */
static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
{
        mem_cgroup_id_get_many(memcg, 1);
}
4098
/* Drop a single reference on @memcg's ID. */
static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
{
        mem_cgroup_id_put_many(memcg, 1);
}
4103
/**
 * mem_cgroup_from_id - look up a memcg from a memcg id
 * @id: the memcg id to look up
 *
 * Caller must hold rcu_read_lock(); a one-time warning is emitted otherwise.
 *
 * Return: whatever idr_find() yields for @id in mem_cgroup_idr.  Note that
 * mem_cgroup_alloc() reserves an ID with a NULL pointer and only installs
 * the memcg once it is set up, so callers have to cope with a NULL result.
 */
struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
{
        WARN_ON_ONCE(!rcu_read_lock_held());
        return idr_find(&mem_cgroup_idr, id);
}
4115
4116 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4117 {
4118         struct mem_cgroup_per_node *pn;
4119         int tmp = node;
4120         /*
4121          * This routine is called against possible nodes.
4122          * But it's BUG to call kmalloc() against offline node.
4123          *
4124          * TODO: this routine can waste much memory for nodes which will
4125          *       never be onlined. It's better to use memory hotplug callback
4126          *       function.
4127          */
4128         if (!node_state(node, N_NORMAL_MEMORY))
4129                 tmp = -1;
4130         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4131         if (!pn)
4132                 return 1;
4133
4134         lruvec_init(&pn->lruvec);
4135         pn->usage_in_excess = 0;
4136         pn->on_tree = false;
4137         pn->memcg = memcg;
4138
4139         memcg->nodeinfo[node] = pn;
4140         return 0;
4141 }
4142
/*
 * Free the per-node info of @memcg for @node.  The nodeinfo[] slot is not
 * cleared.
 */
static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
{
        kfree(memcg->nodeinfo[node]);
}
4147
4148 static void mem_cgroup_free(struct mem_cgroup *memcg)
4149 {
4150         int node;
4151
4152         memcg_wb_domain_exit(memcg);
4153         for_each_node(node)
4154                 free_mem_cgroup_per_node_info(memcg, node);
4155         free_percpu(memcg->stat);
4156         kfree(memcg);
4157 }
4158
4159 static struct mem_cgroup *mem_cgroup_alloc(void)
4160 {
4161         struct mem_cgroup *memcg;
4162         size_t size;
4163         int node;
4164
4165         size = sizeof(struct mem_cgroup);
4166         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4167
4168         memcg = kzalloc(size, GFP_KERNEL);
4169         if (!memcg)
4170                 return NULL;
4171
4172         memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4173                                  1, MEM_CGROUP_ID_MAX,
4174                                  GFP_KERNEL);
4175         if (memcg->id.id < 0)
4176                 goto fail;
4177
4178         memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
4179         if (!memcg->stat)
4180                 goto fail;
4181
4182         for_each_node(node)
4183                 if (alloc_mem_cgroup_per_node_info(memcg, node))
4184                         goto fail;
4185
4186         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4187                 goto fail;
4188
4189         INIT_WORK(&memcg->high_work, high_work_func);
4190         memcg->last_scanned_node = MAX_NUMNODES;
4191         INIT_LIST_HEAD(&memcg->oom_notify);
4192         mutex_init(&memcg->thresholds_lock);
4193         spin_lock_init(&memcg->move_lock);
4194         vmpressure_init(&memcg->vmpressure);
4195         INIT_LIST_HEAD(&memcg->event_list);
4196         spin_lock_init(&memcg->event_list_lock);
4197         memcg->socket_pressure = jiffies;
4198 #ifndef CONFIG_SLOB
4199         memcg->kmemcg_id = -1;
4200 #endif
4201 #ifdef CONFIG_CGROUP_WRITEBACK
4202         INIT_LIST_HEAD(&memcg->cgwb_list);
4203 #endif
4204         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4205         return memcg;
4206 fail:
4207         if (memcg->id.id > 0)
4208                 idr_remove(&mem_cgroup_idr, memcg->id.id);
4209         mem_cgroup_free(memcg);
4210         return NULL;
4211 }
4212
4213 static struct cgroup_subsys_state * __ref
4214 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4215 {
4216         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4217         struct mem_cgroup *memcg;
4218         long error = -ENOMEM;
4219
4220         memcg = mem_cgroup_alloc();
4221         if (!memcg)
4222                 return ERR_PTR(error);
4223
4224         memcg->high = PAGE_COUNTER_MAX;
4225         memcg->soft_limit = PAGE_COUNTER_MAX;
4226         if (parent) {
4227                 memcg->swappiness = mem_cgroup_swappiness(parent);
4228                 memcg->oom_kill_disable = parent->oom_kill_disable;
4229         }
4230         if (parent && parent->use_hierarchy) {
4231                 memcg->use_hierarchy = true;
4232                 page_counter_init(&memcg->memory, &parent->memory);
4233                 page_counter_init(&memcg->swap, &parent->swap);
4234                 page_counter_init(&memcg->memsw, &parent->memsw);
4235                 page_counter_init(&memcg->kmem, &parent->kmem);
4236                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4237         } else {
4238                 page_counter_init(&memcg->memory, NULL);
4239                 page_counter_init(&memcg->swap, NULL);
4240                 page_counter_init(&memcg->memsw, NULL);
4241                 page_counter_init(&memcg->kmem, NULL);
4242                 page_counter_init(&memcg->tcpmem, NULL);
4243                 /*
4244                  * Deeper hierachy with use_hierarchy == false doesn't make
4245                  * much sense so let cgroup subsystem know about this
4246                  * unfortunate state in our controller.
4247                  */
4248                 if (parent != root_mem_cgroup)
4249                         memory_cgrp_subsys.broken_hierarchy = true;
4250         }
4251
4252         /* The following stuff does not apply to the root */
4253         if (!parent) {
4254                 root_mem_cgroup = memcg;
4255                 return &memcg->css;
4256         }
4257
4258         error = memcg_online_kmem(memcg);
4259         if (error)
4260                 goto fail;
4261
4262         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4263                 static_branch_inc(&memcg_sockets_enabled_key);
4264
4265         return &memcg->css;
4266 fail:
4267         mem_cgroup_free(memcg);
4268         return ERR_PTR(-ENOMEM);
4269 }
4270
4271 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4272 {
4273         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4274
4275         /* Online state pins memcg ID, memcg ID pins CSS */
4276         atomic_set(&memcg->id.ref, 1);
4277         css_get(css);
4278         return 0;
4279 }
4280
4281 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4282 {
4283         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4284         struct mem_cgroup_event *event, *tmp;
4285
4286         /*
4287          * Unregister events and notify userspace.
4288          * Notify userspace about cgroup removing only after rmdir of cgroup
4289          * directory to avoid race between userspace and kernelspace.
4290          */
4291         spin_lock(&memcg->event_list_lock);
4292         list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4293                 list_del_init(&event->list);
4294                 schedule_work(&event->remove);
4295         }
4296         spin_unlock(&memcg->event_list_lock);
4297
4298         memcg_offline_kmem(memcg);
4299         wb_memcg_offline(memcg);
4300
4301         mem_cgroup_id_put(memcg);
4302 }
4303
/* css released callback: invalidate the reclaim iterators of this memcg. */
static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
{
        invalidate_reclaim_iterators(mem_cgroup_from_css(css));
}
4310
4311 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4312 {
4313         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4314
4315         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4316                 static_branch_dec(&memcg_sockets_enabled_key);
4317
4318         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4319                 static_branch_dec(&memcg_sockets_enabled_key);
4320
4321         vmpressure_cleanup(&memcg->vmpressure);
4322         cancel_work_sync(&memcg->high_work);
4323         mem_cgroup_remove_from_trees(memcg);
4324         memcg_free_kmem(memcg);
4325         mem_cgroup_free(memcg);
4326 }
4327
4328 /**
4329  * mem_cgroup_css_reset - reset the states of a mem_cgroup
4330  * @css: the target css
4331  *
4332  * Reset the states of the mem_cgroup associated with @css.  This is
4333  * invoked when the userland requests disabling on the default hierarchy
4334  * but the memcg is pinned through dependency.  The memcg should stop
4335  * applying policies and should revert to the vanilla state as it may be
4336  * made visible again.
4337  *
4338  * The current implementation only resets the essential configurations.
4339  * This needs to be expanded to cover all the visible parts.
4340  */
4341 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4342 {
4343         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4344
4345         page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX);
4346         page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX);
4347         page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX);
4348         page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX);
4349         page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX);
4350         memcg->low = 0;
4351         memcg->high = PAGE_COUNTER_MAX;
4352         memcg->soft_limit = PAGE_COUNTER_MAX;
4353         memcg_wb_domain_size_changed(memcg);
4354 }
4355
4356 #ifdef CONFIG_MMU
4357 /* Handlers for move charge at task migration. */
4358 static int mem_cgroup_do_precharge(unsigned long count)
4359 {
4360         int ret;
4361
4362         /* Try a single bulk charge without reclaim first, kswapd may wake */
4363         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4364         if (!ret) {
4365                 mc.precharge += count;
4366                 return ret;
4367         }
4368
4369         /* Try charges one by one with reclaim */
4370         while (count--) {
4371                 ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1);
4372                 if (ret)
4373                         return ret;
4374                 mc.precharge++;
4375                 cond_resched();
4376         }
4377         return 0;
4378 }
4379
4380 union mc_target {
4381         struct page     *page;
4382         swp_entry_t     ent;
4383 };
4384
/* Classification of a pte/pmd with respect to charge moving. */
enum mc_target_type {
        MC_TARGET_NONE = 0,     /* not a target for move charge */
        MC_TARGET_PAGE,         /* a page is the target */
        MC_TARGET_SWAP,         /* a swap entry is the target */
};
4390
4391 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4392                                                 unsigned long addr, pte_t ptent)
4393 {
4394         struct page *page = vm_normal_page(vma, addr, ptent);
4395
4396         if (!page || !page_mapped(page))
4397                 return NULL;
4398         if (PageAnon(page)) {
4399                 if (!(mc.flags & MOVE_ANON))
4400                         return NULL;
4401         } else {
4402                 if (!(mc.flags & MOVE_FILE))
4403                         return NULL;
4404         }
4405         if (!get_page_unless_zero(page))
4406                 return NULL;
4407
4408         return page;
4409 }
4410
4411 #ifdef CONFIG_SWAP
4412 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4413                         pte_t ptent, swp_entry_t *entry)
4414 {
4415         struct page *page = NULL;
4416         swp_entry_t ent = pte_to_swp_entry(ptent);
4417
4418         if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4419                 return NULL;
4420         /*
4421          * Because lookup_swap_cache() updates some statistics counter,
4422          * we call find_get_page() with swapper_space directly.
4423          */
4424         page = find_get_page(swap_address_space(ent), swp_offset(ent));
4425         if (do_memsw_account())
4426                 entry->val = ent.val;
4427
4428         return page;
4429 }
4430 #else
4431 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4432                         pte_t ptent, swp_entry_t *entry)
4433 {
4434         return NULL;
4435 }
4436 #endif
4437
4438 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4439                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4440 {
4441         struct page *page = NULL;
4442         struct address_space *mapping;
4443         pgoff_t pgoff;
4444
4445         if (!vma->vm_file) /* anonymous vma */
4446                 return NULL;
4447         if (!(mc.flags & MOVE_FILE))
4448                 return NULL;
4449
4450         mapping = vma->vm_file->f_mapping;
4451         pgoff = linear_page_index(vma, addr);
4452
4453         /* page is moved even if it's not RSS of this task(page-faulted). */
4454 #ifdef CONFIG_SWAP
4455         /* shmem/tmpfs may report page out on swap: account for that too. */
4456         if (shmem_mapping(mapping)) {
4457                 page = find_get_entry(mapping, pgoff);
4458                 if (radix_tree_exceptional_entry(page)) {
4459                         swp_entry_t swp = radix_to_swp_entry(page);
4460                         if (do_memsw_account())
4461                                 *entry = swp;
4462                         page = find_get_page(swap_address_space(swp),
4463                                              swp_offset(swp));
4464                 }
4465         } else
4466                 page = find_get_page(mapping, pgoff);
4467 #else
4468         page = find_get_page(mapping, pgoff);
4469 #endif
4470         return page;
4471 }
4472
4473 /**
4474  * mem_cgroup_move_account - move account of the page
4475  * @page: the page
4476  * @compound: charge the page as compound or small page
4477  * @from: mem_cgroup which the page is moved from.
4478  * @to: mem_cgroup which the page is moved to. @from != @to.
4479  *
4480  * The caller must make sure the page is not on LRU (isolate_page() is useful.)
4481  *
4482  * This function doesn't do "charge" to new cgroup and doesn't do "uncharge"
4483  * from old cgroup.
4484  */
4485 static int mem_cgroup_move_account(struct page *page,
4486                                    bool compound,
4487                                    struct mem_cgroup *from,
4488                                    struct mem_cgroup *to)
4489 {
4490         unsigned long flags;
4491         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4492         int ret;
4493         bool anon;
4494
4495         VM_BUG_ON(from == to);
4496         VM_BUG_ON_PAGE(PageLRU(page), page);
4497         VM_BUG_ON(compound && !PageTransHuge(page));
4498
4499         /*
4500          * Prevent mem_cgroup_migrate() from looking at
4501          * page->mem_cgroup of its source page while we change it.
4502          */
4503         ret = -EBUSY;
4504         if (!trylock_page(page))
4505                 goto out;
4506
4507         ret = -EINVAL;
4508         if (page->mem_cgroup != from)
4509                 goto out_unlock;
4510
4511         anon = PageAnon(page);
4512
4513         spin_lock_irqsave(&from->move_lock, flags);
4514
4515         if (!anon && page_mapped(page)) {
4516                 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4517                                nr_pages);
4518                 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED],
4519                                nr_pages);
4520         }
4521
4522         /*
4523          * move_lock grabbed above and caller set from->moving_account, so
4524          * mem_cgroup_update_page_stat() will serialize updates to PageDirty.
4525          * So mapping should be stable for dirty pages.
4526          */
4527         if (!anon && PageDirty(page)) {
4528                 struct address_space *mapping = page_mapping(page);
4529
4530                 if (mapping_cap_account_dirty(mapping)) {
4531                         __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_DIRTY],
4532                                        nr_pages);
4533                         __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_DIRTY],
4534                                        nr_pages);
4535                 }
4536         }
4537
4538         if (PageWriteback(page)) {
4539                 __this_cpu_sub(from->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4540                                nr_pages);
4541                 __this_cpu_add(to->stat->count[MEM_CGROUP_STAT_WRITEBACK],
4542                                nr_pages);
4543         }
4544
4545         /*
4546          * It is safe to change page->mem_cgroup here because the page
4547          * is referenced, charged, and isolated - we can't race with
4548          * uncharging, charging, migration, or LRU putback.
4549          */
4550
4551         /* caller should have done css_get */
4552         page->mem_cgroup = to;
4553         spin_unlock_irqrestore(&from->move_lock, flags);
4554
4555         ret = 0;
4556
4557         local_lock_irq(event_lock);
4558         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4559         memcg_check_events(to, page);
4560         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4561         memcg_check_events(from, page);
4562         local_unlock_irq(event_lock);
4563 out_unlock:
4564         unlock_page(page);
4565 out:
4566         return ret;
4567 }
4568
4569 /**
4570  * get_mctgt_type - get target type of moving charge
4571  * @vma: the vma the pte to be checked belongs
4572  * @addr: the address corresponding to the pte to be checked
4573  * @ptent: the pte to be checked
4574  * @target: the pointer the target page or swap ent will be stored(can be NULL)
4575  *
4576  * Returns
4577  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4578  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4579  *     move charge. if @target is not NULL, the page is stored in target->page
4580  *     with extra refcnt got(Callers should handle it).
4581  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4582  *     target for charge migration. if @target is not NULL, the entry is stored
4583  *     in target->ent.
4584  *
4585  * Called with pte lock held.
4586  */
4587
4588 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4589                 unsigned long addr, pte_t ptent, union mc_target *target)
4590 {
4591         struct page *page = NULL;
4592         enum mc_target_type ret = MC_TARGET_NONE;
4593         swp_entry_t ent = { .val = 0 };
4594
4595         if (pte_present(ptent))
4596                 page = mc_handle_present_pte(vma, addr, ptent);
4597         else if (is_swap_pte(ptent))
4598                 page = mc_handle_swap_pte(vma, ptent, &ent);
4599         else if (pte_none(ptent))
4600                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4601
4602         if (!page && !ent.val)
4603                 return ret;
4604         if (page) {
4605                 /*
4606                  * Do only loose check w/o serialization.
4607                  * mem_cgroup_move_account() checks the page is valid or
4608                  * not under LRU exclusion.
4609                  */
4610                 if (page->mem_cgroup == mc.from) {
4611                         ret = MC_TARGET_PAGE;
4612                         if (target)
4613                                 target->page = page;
4614                 }
4615                 if (!ret || !target)
4616                         put_page(page);
4617         }
4618         /* There is a swap entry and a page doesn't exist or isn't charged */
4619         if (ent.val && !ret &&
4620             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4621                 ret = MC_TARGET_SWAP;
4622                 if (target)
4623                         target->ent = ent;
4624         }
4625         return ret;
4626 }
4627
4628 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4629 /*
4630  * We don't consider swapping or file mapped pages because THP does not
4631  * support them for now.
4632  * Caller should make sure that pmd_trans_huge(pmd) is true.
4633  */
4634 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4635                 unsigned long addr, pmd_t pmd, union mc_target *target)
4636 {
4637         struct page *page = NULL;
4638         enum mc_target_type ret = MC_TARGET_NONE;
4639
4640         page = pmd_page(pmd);
4641         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4642         if (!(mc.flags & MOVE_ANON))
4643                 return ret;
4644         if (page->mem_cgroup == mc.from) {
4645                 ret = MC_TARGET_PAGE;
4646                 if (target) {
4647                         get_page(page);
4648                         target->page = page;
4649                 }
4650         }
4651         return ret;
4652 }
4653 #else
4654 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4655                 unsigned long addr, pmd_t pmd, union mc_target *target)
4656 {
4657         return MC_TARGET_NONE;
4658 }
4659 #endif
4660
4661 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4662                                         unsigned long addr, unsigned long end,
4663                                         struct mm_walk *walk)
4664 {
4665         struct vm_area_struct *vma = walk->vma;
4666         pte_t *pte;
4667         spinlock_t *ptl;
4668
4669         ptl = pmd_trans_huge_lock(pmd, vma);
4670         if (ptl) {
4671                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4672                         mc.precharge += HPAGE_PMD_NR;
4673                 spin_unlock(ptl);
4674                 return 0;
4675         }
4676
4677         if (pmd_trans_unstable(pmd))
4678                 return 0;
4679         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4680         for (; addr != end; pte++, addr += PAGE_SIZE)
4681                 if (get_mctgt_type(vma, addr, *pte, NULL))
4682                         mc.precharge++; /* increment precharge temporarily */
4683         pte_unmap_unlock(pte - 1, ptl);
4684         cond_resched();
4685
4686         return 0;
4687 }
4688
4689 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4690 {
4691         unsigned long precharge;
4692
4693         struct mm_walk mem_cgroup_count_precharge_walk = {
4694                 .pmd_entry = mem_cgroup_count_precharge_pte_range,
4695                 .mm = mm,
4696         };
4697         down_read(&mm->mmap_sem);
4698         walk_page_range(0, mm->highest_vm_end,
4699                         &mem_cgroup_count_precharge_walk);
4700         up_read(&mm->mmap_sem);
4701
4702         precharge = mc.precharge;
4703         mc.precharge = 0;
4704
4705         return precharge;
4706 }
4707
4708 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
4709 {
4710         unsigned long precharge = mem_cgroup_count_precharge(mm);
4711
4712         VM_BUG_ON(mc.moving_task);
4713         mc.moving_task = current;
4714         return mem_cgroup_do_precharge(precharge);
4715 }
4716
4717 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
4718 static void __mem_cgroup_clear_mc(void)
4719 {
4720         struct mem_cgroup *from = mc.from;
4721         struct mem_cgroup *to = mc.to;
4722
4723         /* we must uncharge all the leftover precharges from mc.to */
4724         if (mc.precharge) {
4725                 cancel_charge(mc.to, mc.precharge);
4726                 mc.precharge = 0;
4727         }
4728         /*
4729          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
4730          * we must uncharge here.
4731          */
4732         if (mc.moved_charge) {
4733                 cancel_charge(mc.from, mc.moved_charge);
4734                 mc.moved_charge = 0;
4735         }
4736         /* we must fixup refcnts and charges */
4737         if (mc.moved_swap) {
4738                 /* uncharge swap account from the old cgroup */
4739                 if (!mem_cgroup_is_root(mc.from))
4740                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
4741
4742                 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
4743
4744                 /*
4745                  * we charged both to->memory and to->memsw, so we
4746                  * should uncharge to->memory.
4747                  */
4748                 if (!mem_cgroup_is_root(mc.to))
4749                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
4750
4751                 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
4752                 css_put_many(&mc.to->css, mc.moved_swap);
4753
4754                 mc.moved_swap = 0;
4755         }
4756         memcg_oom_recover(from);
4757         memcg_oom_recover(to);
4758         wake_up_all(&mc.waitq);
4759 }
4760
4761 static void mem_cgroup_clear_mc(void)
4762 {
4763         struct mm_struct *mm = mc.mm;
4764
4765         /*
4766          * we must clear moving_task before waking up waiters at the end of
4767          * task migration.
4768          */
4769         mc.moving_task = NULL;
4770         __mem_cgroup_clear_mc();
4771         spin_lock(&mc.lock);
4772         mc.from = NULL;
4773         mc.to = NULL;
4774         mc.mm = NULL;
4775         spin_unlock(&mc.lock);
4776
4777         mmput(mm);
4778 }
4779
4780 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
4781 {
4782         struct cgroup_subsys_state *css;
4783         struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
4784         struct mem_cgroup *from;
4785         struct task_struct *leader, *p;
4786         struct mm_struct *mm;
4787         unsigned long move_flags;
4788         int ret = 0;
4789
4790         /* charge immigration isn't supported on the default hierarchy */
4791         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
4792                 return 0;
4793
4794         /*
4795          * Multi-process migrations only happen on the default hierarchy
4796          * where charge immigration is not used.  Perform charge
4797          * immigration if @tset contains a leader and whine if there are
4798          * multiple.
4799          */
4800         p = NULL;
4801         cgroup_taskset_for_each_leader(leader, css, tset) {
4802                 WARN_ON_ONCE(p);
4803                 p = leader;
4804                 memcg = mem_cgroup_from_css(css);
4805         }
4806         if (!p)
4807                 return 0;
4808
4809         /*
4810          * We are now commited to this value whatever it is. Changes in this
4811          * tunable will only affect upcoming migrations, not the current one.
4812          * So we need to save it, and keep it going.
4813          */
4814         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
4815         if (!move_flags)
4816                 return 0;
4817
4818         from = mem_cgroup_from_task(p);
4819
4820         VM_BUG_ON(from == memcg);
4821
4822         mm = get_task_mm(p);
4823         if (!mm)
4824                 return 0;
4825         /* We move charges only when we move a owner of the mm */
4826         if (mm->owner == p) {
4827                 VM_BUG_ON(mc.from);
4828                 VM_BUG_ON(mc.to);
4829                 VM_BUG_ON(mc.precharge);
4830                 VM_BUG_ON(mc.moved_charge);
4831                 VM_BUG_ON(mc.moved_swap);
4832
4833                 spin_lock(&mc.lock);
4834                 mc.mm = mm;
4835                 mc.from = from;
4836                 mc.to = memcg;
4837                 mc.flags = move_flags;
4838                 spin_unlock(&mc.lock);
4839                 /* We set mc.moving_task later */
4840
4841                 ret = mem_cgroup_precharge_mc(mm);
4842                 if (ret)
4843                         mem_cgroup_clear_mc();
4844         } else {
4845                 mmput(mm);
4846         }
4847         return ret;
4848 }
4849
4850 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
4851 {
4852         if (mc.to)
4853                 mem_cgroup_clear_mc();
4854 }
4855
4856 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
4857                                 unsigned long addr, unsigned long end,
4858                                 struct mm_walk *walk)
4859 {
4860         int ret = 0;
4861         struct vm_area_struct *vma = walk->vma;
4862         pte_t *pte;
4863         spinlock_t *ptl;
4864         enum mc_target_type target_type;
4865         union mc_target target;
4866         struct page *page;
4867
4868         ptl = pmd_trans_huge_lock(pmd, vma);
4869         if (ptl) {
4870                 if (mc.precharge < HPAGE_PMD_NR) {
4871                         spin_unlock(ptl);
4872                         return 0;
4873                 }
4874                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
4875                 if (target_type == MC_TARGET_PAGE) {
4876                         page = target.page;
4877                         if (!isolate_lru_page(page)) {
4878                                 if (!mem_cgroup_move_account(page, true,
4879                                                              mc.from, mc.to)) {
4880                                         mc.precharge -= HPAGE_PMD_NR;
4881                                         mc.moved_charge += HPAGE_PMD_NR;
4882                                 }
4883                                 putback_lru_page(page);
4884                         }
4885                         put_page(page);
4886                 }
4887                 spin_unlock(ptl);
4888                 return 0;
4889         }
4890
4891         if (pmd_trans_unstable(pmd))
4892                 return 0;
4893 retry:
4894         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4895         for (; addr != end; addr += PAGE_SIZE) {
4896                 pte_t ptent = *(pte++);
4897                 swp_entry_t ent;
4898
4899                 if (!mc.precharge)
4900                         break;
4901
4902                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
4903                 case MC_TARGET_PAGE:
4904                         page = target.page;
4905                         /*
4906                          * We can have a part of the split pmd here. Moving it
4907                          * can be done but it would be too convoluted so simply
4908                          * ignore such a partial THP and keep it in original
4909                          * memcg. There should be somebody mapping the head.
4910                          */
4911                         if (PageTransCompound(page))
4912                                 goto put;
4913                         if (isolate_lru_page(page))
4914                                 goto put;
4915                         if (!mem_cgroup_move_account(page, false,
4916                                                 mc.from, mc.to)) {
4917                                 mc.precharge--;
4918                                 /* we uncharge from mc.from later. */
4919                                 mc.moved_charge++;
4920                         }
4921                         putback_lru_page(page);
4922 put:                    /* get_mctgt_type() gets the page */
4923                         put_page(page);
4924                         break;
4925                 case MC_TARGET_SWAP:
4926                         ent = target.ent;
4927                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
4928                                 mc.precharge--;
4929                                 /* we fixup refcnts and charges later. */
4930                                 mc.moved_swap++;
4931                         }
4932                         break;
4933                 default:
4934                         break;
4935                 }
4936         }
4937         pte_unmap_unlock(pte - 1, ptl);
4938         cond_resched();
4939
4940         if (addr != end) {
4941                 /*
4942                  * We have consumed all precharges we got in can_attach().
4943                  * We try charge one by one, but don't do any additional
4944                  * charges to mc.to if we have failed in charge once in attach()
4945                  * phase.
4946                  */
4947                 ret = mem_cgroup_do_precharge(1);
4948                 if (!ret)
4949                         goto retry;
4950         }
4951
4952         return ret;
4953 }
4954
4955 static void mem_cgroup_move_charge(void)
4956 {
4957         struct mm_walk mem_cgroup_move_charge_walk = {
4958                 .pmd_entry = mem_cgroup_move_charge_pte_range,
4959                 .mm = mc.mm,
4960         };
4961
4962         lru_add_drain_all();
4963         /*
4964          * Signal lock_page_memcg() to take the memcg's move_lock
4965          * while we're moving its pages to another memcg. Then wait
4966          * for already started RCU-only updates to finish.
4967          */
4968         atomic_inc(&mc.from->moving_account);
4969         synchronize_rcu();
4970 retry:
4971         if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
4972                 /*
4973                  * Someone who are holding the mmap_sem might be waiting in
4974                  * waitq. So we cancel all extra charges, wake up all waiters,
4975                  * and retry. Because we cancel precharges, we might not be able
4976                  * to move enough charges, but moving charge is a best-effort
4977                  * feature anyway, so it wouldn't be a big problem.
4978                  */
4979                 __mem_cgroup_clear_mc();
4980                 cond_resched();
4981                 goto retry;
4982         }
4983         /*
4984          * When we have consumed all precharges and failed in doing
4985          * additional charge, the page walk just aborts.
4986          */
4987         walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
4988
4989         up_read(&mc.mm->mmap_sem);
4990         atomic_dec(&mc.from->moving_account);
4991 }
4992
4993 static void mem_cgroup_move_task(void)
4994 {
4995         if (mc.to) {
4996                 mem_cgroup_move_charge();
4997                 mem_cgroup_clear_mc();
4998         }
4999 }
5000 #else   /* !CONFIG_MMU */
/* Charge moving is compiled out without CONFIG_MMU: attach always succeeds. */
static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
{
        return 0;
}

/* Nothing to undo without CONFIG_MMU. */
static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
{
}

/* Nothing to move without CONFIG_MMU. */
static void mem_cgroup_move_task(void)
{
}
5011 #endif
5012
5013 /*
5014  * Cgroup retains root cgroups across [un]mount cycles making it necessary
5015  * to verify whether we're attached to the default hierarchy on each mount
5016  * attempt.
5017  */
5018 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5019 {
5020         /*
5021          * use_hierarchy is forced on the default hierarchy.  cgroup core
5022          * guarantees that @root doesn't have any children, so turning it
5023          * on for the root memcg is enough.
5024          */
5025         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5026                 root_mem_cgroup->use_hierarchy = true;
5027         else
5028                 root_mem_cgroup->use_hierarchy = false;
5029 }
5030
5031 static u64 memory_current_read(struct cgroup_subsys_state *css,
5032                                struct cftype *cft)
5033 {
5034         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5035
5036         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5037 }
5038
5039 static int memory_low_show(struct seq_file *m, void *v)
5040 {
5041         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5042         unsigned long low = READ_ONCE(memcg->low);
5043
5044         if (low == PAGE_COUNTER_MAX)
5045                 seq_puts(m, "max\n");
5046         else
5047                 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5048
5049         return 0;
5050 }
5051
5052 static ssize_t memory_low_write(struct kernfs_open_file *of,
5053                                 char *buf, size_t nbytes, loff_t off)
5054 {
5055         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5056         unsigned long low;
5057         int err;
5058
5059         buf = strstrip(buf);
5060         err = page_counter_memparse(buf, "max", &low);
5061         if (err)
5062                 return err;
5063
5064         memcg->low = low;
5065
5066         return nbytes;
5067 }
5068
5069 static int memory_high_show(struct seq_file *m, void *v)
5070 {
5071         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5072         unsigned long high = READ_ONCE(memcg->high);
5073
5074         if (high == PAGE_COUNTER_MAX)
5075                 seq_puts(m, "max\n");
5076         else
5077                 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5078
5079         return 0;
5080 }
5081
5082 static ssize_t memory_high_write(struct kernfs_open_file *of,
5083                                  char *buf, size_t nbytes, loff_t off)
5084 {
5085         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5086         unsigned long nr_pages;
5087         unsigned long high;
5088         int err;
5089
5090         buf = strstrip(buf);
5091         err = page_counter_memparse(buf, "max", &high);
5092         if (err)
5093                 return err;
5094
5095         memcg->high = high;
5096
5097         nr_pages = page_counter_read(&memcg->memory);
5098         if (nr_pages > high)
5099                 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5100                                              GFP_KERNEL, true);
5101
5102         memcg_wb_domain_size_changed(memcg);
5103         return nbytes;
5104 }
5105
5106 static int memory_max_show(struct seq_file *m, void *v)
5107 {
5108         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5109         unsigned long max = READ_ONCE(memcg->memory.limit);
5110
5111         if (max == PAGE_COUNTER_MAX)
5112                 seq_puts(m, "max\n");
5113         else
5114                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5115
5116         return 0;
5117 }
5118
5119 static ssize_t memory_max_write(struct kernfs_open_file *of,
5120                                 char *buf, size_t nbytes, loff_t off)
5121 {
5122         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5123         unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5124         bool drained = false;
5125         unsigned long max;
5126         int err;
5127
5128         buf = strstrip(buf);
5129         err = page_counter_memparse(buf, "max", &max);
5130         if (err)
5131                 return err;
5132
5133         xchg(&memcg->memory.limit, max);
5134
5135         for (;;) {
5136                 unsigned long nr_pages = page_counter_read(&memcg->memory);
5137
5138                 if (nr_pages <= max)
5139                         break;
5140
5141                 if (signal_pending(current)) {
5142                         err = -EINTR;
5143                         break;
5144                 }
5145
5146                 if (!drained) {
5147                         drain_all_stock(memcg);
5148                         drained = true;
5149                         continue;
5150                 }
5151
5152                 if (nr_reclaims) {
5153                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5154                                                           GFP_KERNEL, true))
5155                                 nr_reclaims--;
5156                         continue;
5157                 }
5158
5159                 mem_cgroup_events(memcg, MEMCG_OOM, 1);
5160                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5161                         break;
5162         }
5163
5164         memcg_wb_domain_size_changed(memcg);
5165         return nbytes;
5166 }
5167
5168 static int memory_events_show(struct seq_file *m, void *v)
5169 {
5170         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5171
5172         seq_printf(m, "low %lu\n", mem_cgroup_read_events(memcg, MEMCG_LOW));
5173         seq_printf(m, "high %lu\n", mem_cgroup_read_events(memcg, MEMCG_HIGH));
5174         seq_printf(m, "max %lu\n", mem_cgroup_read_events(memcg, MEMCG_MAX));
5175         seq_printf(m, "oom %lu\n", mem_cgroup_read_events(memcg, MEMCG_OOM));
5176
5177         return 0;
5178 }
5179
5180 static int memory_stat_show(struct seq_file *m, void *v)
5181 {
5182         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5183         unsigned long stat[MEMCG_NR_STAT];
5184         unsigned long events[MEMCG_NR_EVENTS];
5185         int i;
5186
5187         /*
5188          * Provide statistics on the state of the memory subsystem as
5189          * well as cumulative event counters that show past behavior.
5190          *
5191          * This list is ordered following a combination of these gradients:
5192          * 1) generic big picture -> specifics and details
5193          * 2) reflecting userspace activity -> reflecting kernel heuristics
5194          *
5195          * Current memory state:
5196          */
5197
5198         tree_stat(memcg, stat);
5199         tree_events(memcg, events);
5200
5201         seq_printf(m, "anon %llu\n",
5202                    (u64)stat[MEM_CGROUP_STAT_RSS] * PAGE_SIZE);
5203         seq_printf(m, "file %llu\n",
5204                    (u64)stat[MEM_CGROUP_STAT_CACHE] * PAGE_SIZE);
5205         seq_printf(m, "kernel_stack %llu\n",
5206                    (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024);
5207         seq_printf(m, "slab %llu\n",
5208                    (u64)(stat[MEMCG_SLAB_RECLAIMABLE] +
5209                          stat[MEMCG_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5210         seq_printf(m, "sock %llu\n",
5211                    (u64)stat[MEMCG_SOCK] * PAGE_SIZE);
5212
5213         seq_printf(m, "file_mapped %llu\n",
5214                    (u64)stat[MEM_CGROUP_STAT_FILE_MAPPED] * PAGE_SIZE);
5215         seq_printf(m, "file_dirty %llu\n",
5216                    (u64)stat[MEM_CGROUP_STAT_DIRTY] * PAGE_SIZE);
5217         seq_printf(m, "file_writeback %llu\n",
5218                    (u64)stat[MEM_CGROUP_STAT_WRITEBACK] * PAGE_SIZE);
5219
5220         for (i = 0; i < NR_LRU_LISTS; i++) {
5221                 struct mem_cgroup *mi;
5222                 unsigned long val = 0;
5223
5224                 for_each_mem_cgroup_tree(mi, memcg)
5225                         val += mem_cgroup_nr_lru_pages(mi, BIT(i));
5226                 seq_printf(m, "%s %llu\n",
5227                            mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE);
5228         }
5229
5230         seq_printf(m, "slab_reclaimable %llu\n",
5231                    (u64)stat[MEMCG_SLAB_RECLAIMABLE] * PAGE_SIZE);
5232         seq_printf(m, "slab_unreclaimable %llu\n",
5233                    (u64)stat[MEMCG_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5234
5235         /* Accumulated memory events */
5236
5237         seq_printf(m, "pgfault %lu\n",
5238                    events[MEM_CGROUP_EVENTS_PGFAULT]);
5239         seq_printf(m, "pgmajfault %lu\n",
5240                    events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
5241
5242         return 0;
5243 }
5244
/*
 * Control files of the memory controller that are installed as
 * memory_cgrp_subsys.dfl_cftypes.  Every entry carries
 * CFTYPE_NOT_ON_ROOT (presumably: the file is not created in the root
 * cgroup - confirm against the cftype flag documentation).
 */
static struct cftype memory_files[] = {
	{
		/* read-only: current usage via memory_current_read() */
		.name = "current",
		.flags = CFTYPE_NOT_ON_ROOT,
		.read_u64 = memory_current_read,
	},
	{
		/* read/write: memcg->low */
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_low_show,
		.write = memory_low_write,
	},
	{
		/* read/write: memcg->high */
		.name = "high",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_high_show,
		.write = memory_high_write,
	},
	{
		/* read/write: limit of the memcg->memory page counter */
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_max_show,
		.write = memory_max_write,
	},
	{
		/*
		 * read-only: low/high/max/oom event counters; tied to
		 * mem_cgroup->events_file through .file_offset
		 */
		.name = "events",
		.flags = CFTYPE_NOT_ON_ROOT,
		.file_offset = offsetof(struct mem_cgroup, events_file),
		.seq_show = memory_events_show,
	},
	{
		/* read-only: detailed statistics, see memory_stat_show() */
		.name = "stat",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = memory_stat_show,
	},
	{ }	/* terminate */
};
5282
/*
 * cgroup subsystem descriptor of the memory controller: wires the
 * css lifecycle and task-attach callbacks defined in this file into the
 * cgroup core and names the two control-file tables - memory_files for
 * .dfl_cftypes and mem_cgroup_legacy_files for .legacy_cftypes.
 */
struct cgroup_subsys memory_cgrp_subsys = {
	.css_alloc = mem_cgroup_css_alloc,
	.css_online = mem_cgroup_css_online,
	.css_offline = mem_cgroup_css_offline,
	.css_released = mem_cgroup_css_released,
	.css_free = mem_cgroup_css_free,
	.css_reset = mem_cgroup_css_reset,
	.can_attach = mem_cgroup_can_attach,
	.cancel_attach = mem_cgroup_cancel_attach,
	.post_attach = mem_cgroup_move_task,
	.bind = mem_cgroup_bind,
	.dfl_cftypes = memory_files,
	.legacy_cftypes = mem_cgroup_legacy_files,
	.early_init = 0,
};
5298
5299 /**
5300  * mem_cgroup_low - check if memory consumption is below the normal range
5301  * @root: the highest ancestor to consider
5302  * @memcg: the memory cgroup to check
5303  *
5304  * Returns %true if memory consumption of @memcg, and that of all
5305  * configurable ancestors up to @root, is below the normal range.
5306  */
5307 bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg)
5308 {
5309         if (mem_cgroup_disabled())
5310                 return false;
5311
5312         /*
5313          * The toplevel group doesn't have a configurable range, so
5314          * it's never low when looked at directly, and it is not
5315          * considered an ancestor when assessing the hierarchy.
5316          */
5317
5318         if (memcg == root_mem_cgroup)
5319                 return false;
5320
5321         if (page_counter_read(&memcg->memory) >= memcg->low)
5322                 return false;
5323
5324         while (memcg != root) {
5325                 memcg = parent_mem_cgroup(memcg);
5326
5327                 if (memcg == root_mem_cgroup)
5328                         break;
5329
5330                 if (page_counter_read(&memcg->memory) >= memcg->low)
5331                         return false;
5332         }
5333         return true;
5334 }
5335
5336 /**
5337  * mem_cgroup_try_charge - try charging a page
5338  * @page: page to charge
5339  * @mm: mm context of the victim
5340  * @gfp_mask: reclaim mode
5341  * @memcgp: charged memcg return
5342  * @compound: charge the page as compound or small page
5343  *
5344  * Try to charge @page to the memcg that @mm belongs to, reclaiming
5345  * pages according to @gfp_mask if necessary.
5346  *
5347  * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5348  * Otherwise, an error code is returned.
5349  *
5350  * After page->mapping has been set up, the caller must finalize the
5351  * charge with mem_cgroup_commit_charge().  Or abort the transaction
5352  * with mem_cgroup_cancel_charge() in case page instantiation fails.
5353  */
5354 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5355                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
5356                           bool compound)
5357 {
5358         struct mem_cgroup *memcg = NULL;
5359         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5360         int ret = 0;
5361
5362         if (mem_cgroup_disabled())
5363                 goto out;
5364
5365         if (PageSwapCache(page)) {
5366                 /*
5367                  * Every swap fault against a single page tries to charge the
5368                  * page, bail as early as possible.  shmem_unuse() encounters
5369                  * already charged pages, too.  The USED bit is protected by
5370                  * the page lock, which serializes swap cache removal, which
5371                  * in turn serializes uncharging.
5372                  */
5373                 VM_BUG_ON_PAGE(!PageLocked(page), page);
5374                 if (page->mem_cgroup)
5375                         goto out;
5376
5377                 if (do_swap_account) {
5378                         swp_entry_t ent = { .val = page_private(page), };
5379                         unsigned short id = lookup_swap_cgroup_id(ent);
5380
5381                         rcu_read_lock();
5382                         memcg = mem_cgroup_from_id(id);
5383                         if (memcg && !css_tryget_online(&memcg->css))
5384                                 memcg = NULL;
5385                         rcu_read_unlock();
5386                 }
5387         }
5388
5389         if (!memcg)
5390                 memcg = get_mem_cgroup_from_mm(mm);
5391
5392         ret = try_charge(memcg, gfp_mask, nr_pages);
5393
5394         css_put(&memcg->css);
5395 out:
5396         *memcgp = memcg;
5397         return ret;
5398 }
5399
5400 /**
5401  * mem_cgroup_commit_charge - commit a page charge
5402  * @page: page to charge
5403  * @memcg: memcg to charge the page to
5404  * @lrucare: page might be on LRU already
5405  * @compound: charge the page as compound or small page
5406  *
5407  * Finalize a charge transaction started by mem_cgroup_try_charge(),
5408  * after page->mapping has been set up.  This must happen atomically
5409  * as part of the page instantiation, i.e. under the page table lock
5410  * for anonymous pages, under the page lock for page and swap cache.
5411  *
5412  * In addition, the page must not be on the LRU during the commit, to
5413  * prevent racing with task migration.  If it might be, use @lrucare.
5414  *
5415  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5416  */
5417 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5418                               bool lrucare, bool compound)
5419 {
5420         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5421
5422         VM_BUG_ON_PAGE(!page->mapping, page);
5423         VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5424
5425         if (mem_cgroup_disabled())
5426                 return;
5427         /*
5428          * Swap faults will attempt to charge the same page multiple
5429          * times.  But reuse_swap_page() might have removed the page
5430          * from swapcache already, so we can't check PageSwapCache().
5431          */
5432         if (!memcg)
5433                 return;
5434
5435         commit_charge(page, memcg, lrucare);
5436
5437         local_lock_irq(event_lock);
5438         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5439         memcg_check_events(memcg, page);
5440         local_unlock_irq(event_lock);
5441
5442         if (do_memsw_account() && PageSwapCache(page)) {
5443                 swp_entry_t entry = { .val = page_private(page) };
5444                 /*
5445                  * The swap entry might not get freed for a long time,
5446                  * let's not wait for it.  The page already received a
5447                  * memory+swap charge, drop the swap entry duplicate.
5448                  */
5449                 mem_cgroup_uncharge_swap(entry);
5450         }
5451 }
5452
5453 /**
5454  * mem_cgroup_cancel_charge - cancel a page charge
5455  * @page: page to charge
5456  * @memcg: memcg to charge the page to
5457  * @compound: charge the page as compound or small page
5458  *
5459  * Cancel a charge transaction started by mem_cgroup_try_charge().
5460  */
5461 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5462                 bool compound)
5463 {
5464         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5465
5466         if (mem_cgroup_disabled())
5467                 return;
5468         /*
5469          * Swap faults will attempt to charge the same page multiple
5470          * times.  But reuse_swap_page() might have removed the page
5471          * from swapcache already, so we can't check PageSwapCache().
5472          */
5473         if (!memcg)
5474                 return;
5475
5476         cancel_charge(memcg, nr_pages);
5477 }
5478
5479 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
5480                            unsigned long nr_anon, unsigned long nr_file,
5481                            unsigned long nr_huge, unsigned long nr_kmem,
5482                            struct page *dummy_page)
5483 {
5484         unsigned long nr_pages = nr_anon + nr_file + nr_kmem;
5485         unsigned long flags;
5486
5487         if (!mem_cgroup_is_root(memcg)) {
5488                 page_counter_uncharge(&memcg->memory, nr_pages);
5489                 if (do_memsw_account())
5490                         page_counter_uncharge(&memcg->memsw, nr_pages);
5491                 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && nr_kmem)
5492                         page_counter_uncharge(&memcg->kmem, nr_kmem);
5493                 memcg_oom_recover(memcg);
5494         }
5495
5496         local_lock_irqsave(event_lock, flags);
5497         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS], nr_anon);
5498         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_CACHE], nr_file);
5499         __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], nr_huge);
5500         __this_cpu_add(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT], pgpgout);
5501         __this_cpu_add(memcg->stat->nr_page_events, nr_pages);
5502         memcg_check_events(memcg, dummy_page);
5503         local_unlock_irqrestore(event_lock, flags);
5504
5505         if (!mem_cgroup_is_root(memcg))
5506                 css_put_many(&memcg->css, nr_pages);
5507 }
5508
5509 static void uncharge_list(struct list_head *page_list)
5510 {
5511         struct mem_cgroup *memcg = NULL;
5512         unsigned long nr_anon = 0;
5513         unsigned long nr_file = 0;
5514         unsigned long nr_huge = 0;
5515         unsigned long nr_kmem = 0;
5516         unsigned long pgpgout = 0;
5517         struct list_head *next;
5518         struct page *page;
5519
5520         /*
5521          * Note that the list can be a single page->lru; hence the
5522          * do-while loop instead of a simple list_for_each_entry().
5523          */
5524         next = page_list->next;
5525         do {
5526                 page = list_entry(next, struct page, lru);
5527                 next = page->lru.next;
5528
5529                 VM_BUG_ON_PAGE(PageLRU(page), page);
5530                 VM_BUG_ON_PAGE(page_count(page), page);
5531
5532                 if (!page->mem_cgroup)
5533                         continue;
5534
5535                 /*
5536                  * Nobody should be changing or seriously looking at
5537                  * page->mem_cgroup at this point, we have fully
5538                  * exclusive access to the page.
5539                  */
5540
5541                 if (memcg != page->mem_cgroup) {
5542                         if (memcg) {
5543                                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5544                                                nr_huge, nr_kmem, page);
5545                                 pgpgout = nr_anon = nr_file =
5546                                         nr_huge = nr_kmem = 0;
5547                         }
5548                         memcg = page->mem_cgroup;
5549                 }
5550
5551                 if (!PageKmemcg(page)) {
5552                         unsigned int nr_pages = 1;
5553
5554                         if (PageTransHuge(page)) {
5555                                 nr_pages <<= compound_order(page);
5556                                 nr_huge += nr_pages;
5557                         }
5558                         if (PageAnon(page))
5559                                 nr_anon += nr_pages;
5560                         else
5561                                 nr_file += nr_pages;
5562                         pgpgout++;
5563                 } else {
5564                         nr_kmem += 1 << compound_order(page);
5565                         __ClearPageKmemcg(page);
5566                 }
5567
5568                 page->mem_cgroup = NULL;
5569         } while (next != page_list);
5570
5571         if (memcg)
5572                 uncharge_batch(memcg, pgpgout, nr_anon, nr_file,
5573                                nr_huge, nr_kmem, page);
5574 }
5575
5576 /**
5577  * mem_cgroup_uncharge - uncharge a page
5578  * @page: page to uncharge
5579  *
5580  * Uncharge a page previously charged with mem_cgroup_try_charge() and
5581  * mem_cgroup_commit_charge().
5582  */
5583 void mem_cgroup_uncharge(struct page *page)
5584 {
5585         if (mem_cgroup_disabled())
5586                 return;
5587
5588         /* Don't touch page->lru of any random page, pre-check: */
5589         if (!page->mem_cgroup)
5590                 return;
5591
5592         INIT_LIST_HEAD(&page->lru);
5593         uncharge_list(&page->lru);
5594 }
5595
/**
 * mem_cgroup_uncharge_list - uncharge a list of pages
 * @page_list: list of pages to uncharge
 *
 * Uncharge a list of pages previously charged with
 * mem_cgroup_try_charge() and mem_cgroup_commit_charge().  Nothing is
 * done when the controller is disabled or @page_list is empty.
 */
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
	if (mem_cgroup_disabled())
		return;

	if (list_empty(page_list))
		return;

	uncharge_list(page_list);
}
5611
5612 /**
5613  * mem_cgroup_migrate - charge a page's replacement
5614  * @oldpage: currently circulating page
5615  * @newpage: replacement page
5616  *
5617  * Charge @newpage as a replacement page for @oldpage. @oldpage will
5618  * be uncharged upon free.
5619  *
5620  * Both pages must be locked, @newpage->mapping must be set up.
5621  */
5622 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
5623 {
5624         struct mem_cgroup *memcg;
5625         unsigned int nr_pages;
5626         bool compound;
5627         unsigned long flags;
5628
5629         VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
5630         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
5631         VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
5632         VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
5633                        newpage);
5634
5635         if (mem_cgroup_disabled())
5636                 return;
5637
5638         /* Page cache replacement: new page already charged? */
5639         if (newpage->mem_cgroup)
5640                 return;
5641
5642         /* Swapcache readahead pages can get replaced before being charged */
5643         memcg = oldpage->mem_cgroup;
5644         if (!memcg)
5645                 return;
5646
5647         /* Force-charge the new page. The old one will be freed soon */
5648         compound = PageTransHuge(newpage);
5649         nr_pages = compound ? hpage_nr_pages(newpage) : 1;
5650
5651         page_counter_charge(&memcg->memory, nr_pages);
5652         if (do_memsw_account())
5653                 page_counter_charge(&memcg->memsw, nr_pages);
5654         css_get_many(&memcg->css, nr_pages);
5655
5656         commit_charge(newpage, memcg, false);
5657
5658         local_lock_irqsave(event_lock, flags);
5659         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
5660         memcg_check_events(memcg, newpage);
5661         local_unlock_irqrestore(event_lock, flags);
5662 }
5663
/*
 * Static key for memcg socket accounting, defined in the "false" state
 * and exported for use outside this file.
 * NOTE(review): presumably what mem_cgroup_sockets_enabled tests - confirm
 * against include/linux/memcontrol.h.
 */
DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
EXPORT_SYMBOL(memcg_sockets_enabled_key);
5666
5667 void mem_cgroup_sk_alloc(struct sock *sk)
5668 {
5669         struct mem_cgroup *memcg;
5670
5671         if (!mem_cgroup_sockets_enabled)
5672                 return;
5673
5674         /*
5675          * Socket cloning can throw us here with sk_memcg already
5676          * filled. It won't however, necessarily happen from
5677          * process context. So the test for root memcg given
5678          * the current task's memcg won't help us in this case.
5679          *
5680          * Respecting the original socket's memcg is a better
5681          * decision in this case.
5682          */
5683         if (sk->sk_memcg) {
5684                 BUG_ON(mem_cgroup_is_root(sk->sk_memcg));
5685                 css_get(&sk->sk_memcg->css);
5686                 return;
5687         }
5688
5689         rcu_read_lock();
5690         memcg = mem_cgroup_from_task(current);
5691         if (memcg == root_mem_cgroup)
5692                 goto out;
5693         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && !memcg->tcpmem_active)
5694                 goto out;
5695         if (css_tryget_online(&memcg->css))
5696                 sk->sk_memcg = memcg;
5697 out:
5698         rcu_read_unlock();
5699 }
5700
5701 void mem_cgroup_sk_free(struct sock *sk)
5702 {
5703         if (sk->sk_memcg)
5704                 css_put(&sk->sk_memcg->css);
5705 }
5706
5707 /**
5708  * mem_cgroup_charge_skmem - charge socket memory
5709  * @memcg: memcg to charge
5710  * @nr_pages: number of pages to charge
5711  *
5712  * Charges @nr_pages to @memcg. Returns %true if the charge fit within
5713  * @memcg's configured limit, %false if the charge had to be forced.
5714  */
5715 bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5716 {
5717         gfp_t gfp_mask = GFP_KERNEL;
5718
5719         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5720                 struct page_counter *fail;
5721
5722                 if (page_counter_try_charge(&memcg->tcpmem, nr_pages, &fail)) {
5723                         memcg->tcpmem_pressure = 0;
5724                         return true;
5725                 }
5726                 page_counter_charge(&memcg->tcpmem, nr_pages);
5727                 memcg->tcpmem_pressure = 1;
5728                 return false;
5729         }
5730
5731         /* Don't block in the packet receive path */
5732         if (in_softirq())
5733                 gfp_mask = GFP_NOWAIT;
5734
5735         this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages);
5736
5737         if (try_charge(memcg, gfp_mask, nr_pages) == 0)
5738                 return true;
5739
5740         try_charge(memcg, gfp_mask|__GFP_NOFAIL, nr_pages);
5741         return false;
5742 }
5743
5744 /**
5745  * mem_cgroup_uncharge_skmem - uncharge socket memory
5746  * @memcg - memcg to uncharge
5747  * @nr_pages - number of pages to uncharge
5748  */
5749 void mem_cgroup_uncharge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages)
5750 {
5751         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) {
5752                 page_counter_uncharge(&memcg->tcpmem, nr_pages);
5753                 return;
5754         }
5755
5756         this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages);
5757
5758         page_counter_uncharge(&memcg->memory, nr_pages);
5759         css_put_many(&memcg->css, nr_pages);
5760 }
5761
5762 static int __init cgroup_memory(char *s)
5763 {
5764         char *token;
5765
5766         while ((token = strsep(&s, ",")) != NULL) {
5767                 if (!*token)
5768                         continue;
5769                 if (!strcmp(token, "nosocket"))
5770                         cgroup_memory_nosocket = true;
5771                 if (!strcmp(token, "nokmem"))
5772                         cgroup_memory_nokmem = true;
5773         }
5774         return 0;
5775 }
5776 __setup("cgroup.memory=", cgroup_memory);
5777
5778 /*
5779  * subsys_initcall() for memory controller.
5780  *
5781  * Some parts like hotcpu_notifier() have to be initialized from this context
5782  * because of lock dependencies (cgroup_lock -> cpu hotplug) but basically
5783  * everything that doesn't depend on a specific mem_cgroup structure should
5784  * be initialized from here.
5785  */
5786 static int __init mem_cgroup_init(void)
5787 {
5788         int cpu, node;
5789
5790         hotcpu_notifier(memcg_cpu_hotplug_callback, 0);
5791
5792         for_each_possible_cpu(cpu)
5793                 INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work,
5794                           drain_local_stock);
5795
5796         for_each_node(node) {
5797                 struct mem_cgroup_tree_per_node *rtpn;
5798
5799                 rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL,
5800                                     node_online(node) ? node : NUMA_NO_NODE);
5801
5802                 rtpn->rb_root = RB_ROOT;
5803                 spin_lock_init(&rtpn->lock);
5804                 soft_limit_tree.rb_tree_per_node[node] = rtpn;
5805         }
5806
5807         return 0;
5808 }
5809 subsys_initcall(mem_cgroup_init);
5810
5811 #ifdef CONFIG_MEMCG_SWAP
5812 static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg)
5813 {
5814         while (!atomic_inc_not_zero(&memcg->id.ref)) {
5815                 /*
5816                  * The root cgroup cannot be destroyed, so it's refcount must
5817                  * always be >= 1.
5818                  */
5819                 if (WARN_ON_ONCE(memcg == root_mem_cgroup)) {
5820                         VM_BUG_ON(1);
5821                         break;
5822                 }
5823                 memcg = parent_mem_cgroup(memcg);
5824                 if (!memcg)
5825                         memcg = root_mem_cgroup;
5826         }
5827         return memcg;
5828 }
5829
/**
 * mem_cgroup_swapout - transfer a memsw charge to swap
 * @page: page whose memsw charge to transfer
 * @entry: swap entry to move the charge to
 *
 * Transfer the memsw charge of @page to @entry.
 *
 * Does nothing unless do_memsw_account() is true and @page has a memcg.
 * On return page->mem_cgroup is NULL, the memory counter of the page's
 * memcg has been uncharged by one page (unless it is the root memcg),
 * and @entry is recorded against the memcg returned by
 * mem_cgroup_id_get_online().
 */
void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
{
	struct mem_cgroup *memcg, *swap_memcg;
	unsigned short oldid;
	unsigned long flags;

	VM_BUG_ON_PAGE(PageLRU(page), page);
	VM_BUG_ON_PAGE(page_count(page), page);

	if (!do_memsw_account())
		return;

	memcg = page->mem_cgroup;

	/* Readahead page, never charged */
	if (!memcg)
		return;

	/*
	 * In case the memcg owning these pages has been offlined and doesn't
	 * have an ID allocated to it anymore, charge the closest online
	 * ancestor for the swap instead and transfer the memory+swap charge.
	 */
	swap_memcg = mem_cgroup_id_get_online(memcg);
	oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg));
	/* The swap entry must not have had an owner recorded before. */
	VM_BUG_ON_PAGE(oldid, page);
	mem_cgroup_swap_statistics(swap_memcg, true);

	page->mem_cgroup = NULL;

	if (!mem_cgroup_is_root(memcg))
		page_counter_uncharge(&memcg->memory, 1);

	/*
	 * If an ancestor took over the swap entry, move the memsw charge
	 * from the original memcg to that ancestor.
	 */
	if (memcg != swap_memcg) {
		if (!mem_cgroup_is_root(swap_memcg))
			page_counter_charge(&swap_memcg->memsw, 1);
		page_counter_uncharge(&memcg->memsw, 1);
	}

	/*
	 * Interrupts should be disabled here because the caller holds the
	 * mapping->tree_lock lock which is taken with interrupts-off. It is
	 * important here to have the interrupts disabled because it is the
	 * only synchronisation we have for updating the per-CPU variables.
	 */
	local_lock_irqsave(event_lock, flags);
#ifndef CONFIG_PREEMPT_RT_BASE
	VM_BUG_ON(!irqs_disabled());
#endif
	/* Account the page as leaving the memcg (-1 pages). */
	mem_cgroup_charge_statistics(memcg, page, false, -1);
	memcg_check_events(memcg, page);

	if (!mem_cgroup_is_root(memcg))
		css_put(&memcg->css);
	local_unlock_irqrestore(event_lock, flags);
}
5893
5894 /*
5895  * mem_cgroup_try_charge_swap - try charging a swap entry
5896  * @page: page being added to swap
5897  * @entry: swap entry to charge
5898  *
5899  * Try to charge @entry to the memcg that @page belongs to.
5900  *
5901  * Returns 0 on success, -ENOMEM on failure.
5902  */
5903 int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
5904 {
5905         struct mem_cgroup *memcg;
5906         struct page_counter *counter;
5907         unsigned short oldid;
5908
5909         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
5910                 return 0;
5911
5912         memcg = page->mem_cgroup;
5913
5914         /* Readahead page, never charged */
5915         if (!memcg)
5916                 return 0;
5917
5918         memcg = mem_cgroup_id_get_online(memcg);
5919
5920         if (!mem_cgroup_is_root(memcg) &&
5921             !page_counter_try_charge(&memcg->swap, 1, &counter)) {
5922                 mem_cgroup_id_put(memcg);
5923                 return -ENOMEM;
5924         }
5925
5926         oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg));
5927         VM_BUG_ON_PAGE(oldid, page);
5928         mem_cgroup_swap_statistics(memcg, true);
5929
5930         return 0;
5931 }
5932
5933 /**
5934  * mem_cgroup_uncharge_swap - uncharge a swap entry
5935  * @entry: swap entry to uncharge
5936  *
5937  * Drop the swap charge associated with @entry.
5938  */
5939 void mem_cgroup_uncharge_swap(swp_entry_t entry)
5940 {
5941         struct mem_cgroup *memcg;
5942         unsigned short id;
5943
5944         if (!do_swap_account)
5945                 return;
5946
5947         id = swap_cgroup_record(entry, 0);
5948         rcu_read_lock();
5949         memcg = mem_cgroup_from_id(id);
5950         if (memcg) {
5951                 if (!mem_cgroup_is_root(memcg)) {
5952                         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5953                                 page_counter_uncharge(&memcg->swap, 1);
5954                         else
5955                                 page_counter_uncharge(&memcg->memsw, 1);
5956                 }
5957                 mem_cgroup_swap_statistics(memcg, false);
5958                 mem_cgroup_id_put(memcg);
5959         }
5960         rcu_read_unlock();
5961 }
5962
5963 long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
5964 {
5965         long nr_swap_pages = get_nr_swap_pages();
5966
5967         if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5968                 return nr_swap_pages;
5969         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5970                 nr_swap_pages = min_t(long, nr_swap_pages,
5971                                       READ_ONCE(memcg->swap.limit) -
5972                                       page_counter_read(&memcg->swap));
5973         return nr_swap_pages;
5974 }
5975
5976 bool mem_cgroup_swap_full(struct page *page)
5977 {
5978         struct mem_cgroup *memcg;
5979
5980         VM_BUG_ON_PAGE(!PageLocked(page), page);
5981
5982         if (vm_swap_full())
5983                 return true;
5984         if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
5985                 return false;
5986
5987         memcg = page->mem_cgroup;
5988         if (!memcg)
5989                 return false;
5990
5991         for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
5992                 if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit)
5993                         return true;
5994
5995         return false;
5996 }
5997
5998 /* for remember boot option*/
5999 #ifdef CONFIG_MEMCG_SWAP_ENABLED
6000 static int really_do_swap_account __initdata = 1;
6001 #else
6002 static int really_do_swap_account __initdata;
6003 #endif
6004
6005 static int __init enable_swap_account(char *s)
6006 {
6007         if (!strcmp(s, "1"))
6008                 really_do_swap_account = 1;
6009         else if (!strcmp(s, "0"))
6010                 really_do_swap_account = 0;
6011         return 1;
6012 }
6013 __setup("swapaccount=", enable_swap_account);
6014
6015 static u64 swap_current_read(struct cgroup_subsys_state *css,
6016                              struct cftype *cft)
6017 {
6018         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6019
6020         return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
6021 }
6022
6023 static int swap_max_show(struct seq_file *m, void *v)
6024 {
6025         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
6026         unsigned long max = READ_ONCE(memcg->swap.limit);
6027
6028         if (max == PAGE_COUNTER_MAX)
6029                 seq_puts(m, "max\n");
6030         else
6031                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
6032
6033         return 0;
6034 }
6035
6036 static ssize_t swap_max_write(struct kernfs_open_file *of,
6037                               char *buf, size_t nbytes, loff_t off)
6038 {
6039         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
6040         unsigned long max;
6041         int err;
6042
6043         buf = strstrip(buf);
6044         err = page_counter_memparse(buf, "max", &max);
6045         if (err)
6046                 return err;
6047
6048         mutex_lock(&memcg_limit_mutex);
6049         err = page_counter_limit(&memcg->swap, max);
6050         mutex_unlock(&memcg_limit_mutex);
6051         if (err)
6052                 return err;
6053
6054         return nbytes;
6055 }
6056
6057 static struct cftype swap_files[] = {
6058         {
6059                 .name = "swap.current",
6060                 .flags = CFTYPE_NOT_ON_ROOT,
6061                 .read_u64 = swap_current_read,
6062         },
6063         {
6064                 .name = "swap.max",
6065                 .flags = CFTYPE_NOT_ON_ROOT,
6066                 .seq_show = swap_max_show,
6067                 .write = swap_max_write,
6068         },
6069         { }     /* terminate */
6070 };
6071
6072 static struct cftype memsw_cgroup_files[] = {
6073         {
6074                 .name = "memsw.usage_in_bytes",
6075                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
6076                 .read_u64 = mem_cgroup_read_u64,
6077         },
6078         {
6079                 .name = "memsw.max_usage_in_bytes",
6080                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE),
6081                 .write = mem_cgroup_reset,
6082                 .read_u64 = mem_cgroup_read_u64,
6083         },
6084         {
6085                 .name = "memsw.limit_in_bytes",
6086                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT),
6087                 .write = mem_cgroup_write,
6088                 .read_u64 = mem_cgroup_read_u64,
6089         },
6090         {
6091                 .name = "memsw.failcnt",
6092                 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT),
6093                 .write = mem_cgroup_reset,
6094                 .read_u64 = mem_cgroup_read_u64,
6095         },
6096         { },    /* terminate */
6097 };
6098
6099 static int __init mem_cgroup_swap_init(void)
6100 {
6101         if (!mem_cgroup_disabled() && really_do_swap_account) {
6102                 do_swap_account = 1;
6103                 WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
6104                                                swap_files));
6105                 WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
6106                                                   memsw_cgroup_files));
6107         }
6108         return 0;
6109 }
6110 subsys_initcall(mem_cgroup_swap_init);
6111
6112 #endif /* CONFIG_MEMCG_SWAP */