Diffstat (limited to 'target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch')
 -rw-r--r--  target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch | 996 ----------
 1 file changed, 0 insertions(+), 996 deletions(-)
diff --git a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch b/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch
deleted file mode 100644
index fbc15d8c91..0000000000
--- a/target/linux/generic/pending-5.15/020-03-mm-multigenerational-lru-groundwork.patch
+++ /dev/null
@@ -1,996 +0,0 @@
-From 05f366c941ae2bb8ba21c79fafcb747a5a6b967b Mon Sep 17 00:00:00 2001
-From: Yu Zhao <yuzhao@google.com>
-Date: Mon, 25 Jan 2021 21:12:33 -0700
-Subject: [PATCH 04/10] mm: multigenerational lru: groundwork
-
-For each lruvec, evictable pages are divided into multiple
-generations. The youngest generation number is stored in
-lrugen->max_seq for both anon and file types as they are aged on an
-equal footing. The oldest generation numbers are stored in
-lrugen->min_seq[] separately for anon and file types as clean file
-pages can be evicted regardless of swap constraints. These three
-variables are monotonically increasing. Generation numbers are
-truncated into order_base_2(MAX_NR_GENS+1) bits in order to fit into
-page->flags. The sliding window technique is used to prevent truncated
-generation numbers from overlapping. Each truncated generation number
-is an index to
-lrugen->lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES].
-
-The framework comprises two conceptually independent components: the
-aging, which produces young generations, and the eviction, which
-consumes old generations. Both can be invoked independently from user
-space for the purpose of working set estimation and proactive reclaim.
-
-The protection of hot pages and the selection of cold pages are based
-on page access types and patterns. There are two access types: one via
-page tables and the other via file descriptors. The protection of the
-former type is by design stronger because:
- 1) The uncertainty in determining the access patterns of the former
- type is higher due to the coalesced nature of the accessed bit.
- 2) The cost of evicting the former type is higher due to the TLB
- flushes required and the likelihood of involving I/O.
- 3) The penalty of under-protecting the former type is higher because
- applications usually do not prepare themselves for major faults like
- they do for blocked I/O. For example, client applications commonly
- dedicate blocked I/O to separate threads to avoid UI janks that
- negatively affect user experience.
-
-There are also two access patterns: one with temporal locality and the
-other without. The latter pattern, e.g., random and sequential, needs
-to be explicitly excluded to avoid weakening the protection of the
-former pattern. Generally the former type follows the former pattern
-unless MADV_SEQUENTIAL is specified and the latter type follows the
-latter pattern unless outlying refaults have been observed.
-
-Upon faulting, a page is added to the youngest generation, which
-provides the strongest protection as the eviction will not consider
-this page before the aging has scanned it at least twice. The first
-scan clears the accessed bit set during the initial fault. And the
-second scan makes sure this page has not been used since the first
-scan. A page from any other generations is brought back to the
-youngest generation whenever the aging finds the accessed bit set on
-any of the PTEs mapping this page.
-
-Unmapped pages are initially added to the oldest generation and then
-conditionally protected by tiers. This is done later [PATCH 07/10].
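
[ A standalone C sketch of the sliding-window numbering described above;
  illustrative only, not part of the patch: the bit offset is arbitrary and
  MAX_NR_GENS stands in for CONFIG_NR_LRU_GENS. ]

#include <assert.h>
#include <stdio.h>

#define MAX_NR_GENS     4UL                     /* CONFIG_NR_LRU_GENS */
#define LRU_GEN_WIDTH   3                       /* order_base_2(MAX_NR_GENS + 1) */
#define LRU_GEN_PGOFF   (64 - LRU_GEN_WIDTH)    /* illustrative offset */
#define LRU_GEN_MASK    (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)

/* truncate a monotonically increasing seq into an index into lrugen->lists[] */
static int lru_gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

int main(void)
{
        unsigned long flags = 0;
        unsigned long max_seq = 103;            /* the youngest generation number */
        int gen = lru_gen_from_seq(max_seq);

        /* a page on an lru list stores gen+1 in page->flags; 0 means off-list */
        flags |= (gen + 1UL) << LRU_GEN_PGOFF;
        assert((int)(((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1) == gen);

        /* at most MAX_NR_GENS generations are live at once, so the truncated
           numbers of live generations never collide within the window */
        printf("seq %lu -> gen %d\n", max_seq, gen);
        return 0;
}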
-
-Signed-off-by: Yu Zhao <yuzhao@google.com>
-Tested-by: Konstantin Kharlamov <Hi-Angel@yandex.ru>
-Change-Id: I71de7cd15b8dfa6f9fdd838023474693c4fee0a7
----
- fs/fuse/dev.c                     |   3 +-
- include/linux/cgroup.h            |  15 +-
- include/linux/mm.h                |  36 ++++
- include/linux/mm_inline.h         | 182 ++++++++++++++++++++
- include/linux/mmzone.h            |  70 ++++++++
- include/linux/page-flags-layout.h |  19 ++-
- include/linux/page-flags.h        |   4 +-
- include/linux/sched.h             |   3 +
- kernel/bounds.c                   |   3 +
- kernel/cgroup/cgroup-internal.h   |   1 -
- mm/huge_memory.c                  |   3 +-
- mm/memcontrol.c                   |   1 +
- mm/memory.c                       |   7 +
- mm/mm_init.c                      |   6 +-
- mm/page_alloc.c                   |   1 +
- mm/swap.c                         |   9 +-
- mm/swapfile.c                     |   2 +
- mm/vmscan.c                       | 268 ++++++++++++++++++++++++++++++
- 18 files changed, 618 insertions(+), 15 deletions(-)
-
---- a/fs/fuse/dev.c
-+++ b/fs/fuse/dev.c
-@@ -785,7 +785,8 @@ static int fuse_check_page(struct page *
-                1 << PG_active |
-                1 << PG_workingset |
-                1 << PG_reclaim |
--               1 << PG_waiters))) {
-+               1 << PG_waiters |
-+               LRU_GEN_MASK | LRU_REFS_MASK))) {
-                dump_page(page, "fuse: trying to steal weird page");
-                return 1;
-        }
---- a/include/linux/cgroup.h
-+++ b/include/linux/cgroup.h
-@@ -432,6 +432,18 @@ static inline void cgroup_put(struct cgr
-        css_put(&cgrp->self);
- }
-
-+extern struct mutex cgroup_mutex;
-+
-+static inline void cgroup_lock(void)
-+{
-+        mutex_lock(&cgroup_mutex);
-+}
-+
-+static inline void cgroup_unlock(void)
-+{
-+        mutex_unlock(&cgroup_mutex);
-+}
-+
- /**
-  * task_css_set_check - obtain a task's css_set with extra access conditions
-  * @task: the task to obtain css_set for
-@@ -446,7 +458,6 @@ static inline void cgroup_put(struct cgr
-  * as locks used during the cgroup_subsys::attach() methods.
-  */
- #ifdef CONFIG_PROVE_RCU
--extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- #define task_css_set_check(task, __c)           \
-        rcu_dereference_check((task)->cgroups,   \
-@@ -707,6 +718,8 @@ struct cgroup;
- static inline u64 cgroup_id(const struct cgroup *cgrp) { return 1; }
- static inline void css_get(struct cgroup_subsys_state *css) {}
- static inline void css_put(struct cgroup_subsys_state *css) {}
-+static inline void cgroup_lock(void) {}
-+static inline void cgroup_unlock(void) {}
- static inline int cgroup_attach_task_all(struct task_struct *from,
-                                          struct task_struct *t) { return 0; }
- static inline int cgroupstats_build(struct cgroupstats *stats,
---- a/include/linux/mm.h
-+++ b/include/linux/mm.h
-@@ -1093,6 +1093,8 @@ vm_fault_t finish_mkwrite_fault(struct v
- #define ZONES_PGOFF            (NODES_PGOFF - ZONES_WIDTH)
- #define LAST_CPUPID_PGOFF      (ZONES_PGOFF - LAST_CPUPID_WIDTH)
- #define KASAN_TAG_PGOFF        (LAST_CPUPID_PGOFF - KASAN_TAG_WIDTH)
-+#define LRU_GEN_PGOFF          (KASAN_TAG_PGOFF - LRU_GEN_WIDTH)
-+#define LRU_REFS_PGOFF         (LRU_GEN_PGOFF - LRU_REFS_WIDTH)
-
- /*
-  * Define the bit shifts to access each section.  For non-existent
-@@ -1807,6 +1809,40 @@ static inline void unmap_mapping_range(s
-                loff_t const holebegin, loff_t const holelen, int even_cows) { }
- #endif
-
-+#ifdef CONFIG_LRU_GEN
-+static inline void task_enter_nonseq_fault(void)
-+{
-+        WARN_ON(current->in_nonseq_fault);
-+
-+        current->in_nonseq_fault = 1;
-+}
-+
-+static inline void task_exit_nonseq_fault(void)
-+{
-+        WARN_ON(!current->in_nonseq_fault);
-+
-+        current->in_nonseq_fault = 0;
-+}
-+
-+static inline bool task_in_nonseq_fault(void)
-+{
-+        return current->in_nonseq_fault;
-+}
-+#else
-+static inline void task_enter_nonseq_fault(void)
-+{
-+}
-+
-+static inline void task_exit_nonseq_fault(void)
-+{
-+}
-+
-+static inline bool task_in_nonseq_fault(void)
-+{
-+        return false;
-+}
-+#endif /* CONFIG_LRU_GEN */
-+
- static inline void unmap_shared_mapping_range(struct address_space *mapping,
-                loff_t const holebegin, loff_t const holelen)
- {
---- a/include/linux/mm_inline.h
-+++ b/include/linux/mm_inline.h
-@@ -79,11 +79,187 @@ static __always_inline enum lru_list pag
-        return lru;
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+static inline bool lru_gen_enabled(void)
-+{
-+#ifdef CONFIG_LRU_GEN_ENABLED
-+        DECLARE_STATIC_KEY_TRUE(lru_gen_static_key);
-+
-+        return static_branch_likely(&lru_gen_static_key);
-+#else
-+        DECLARE_STATIC_KEY_FALSE(lru_gen_static_key);
-+
-+        return static_branch_unlikely(&lru_gen_static_key);
-+#endif
-+}
-+
-+/* Return an index within the sliding window that tracks MAX_NR_GENS generations. */
-+static inline int lru_gen_from_seq(unsigned long seq)
-+{
-+        return seq % MAX_NR_GENS;
-+}
-+
-+/* The youngest and the second youngest generations are counted as active. */
-+static inline bool lru_gen_is_active(struct lruvec *lruvec, int gen)
-+{
-+        unsigned long max_seq = lruvec->evictable.max_seq;
-+
-+        VM_BUG_ON(gen >= MAX_NR_GENS);
-+
-+        return gen == lru_gen_from_seq(max_seq) || gen == lru_gen_from_seq(max_seq - 1);
-+}
-+
-+/* Update the sizes of the multigenerational lru lists. */
-+static inline void lru_gen_update_size(struct page *page, struct lruvec *lruvec,
-+                                       int old_gen, int new_gen)
-+{
-+        int type = page_is_file_lru(page);
-+        int zone = page_zonenum(page);
-+        int delta = thp_nr_pages(page);
-+        enum lru_list lru = type * LRU_FILE;
-+        struct lrugen *lrugen = &lruvec->evictable;
-+
-+        lockdep_assert_held(&lruvec->lru_lock);
-+        VM_BUG_ON(old_gen != -1 && old_gen >= MAX_NR_GENS);
-+        VM_BUG_ON(new_gen != -1 && new_gen >= MAX_NR_GENS);
-+        VM_BUG_ON(old_gen == -1 && new_gen == -1);
-+
-+        if (old_gen >= 0)
-+                WRITE_ONCE(lrugen->sizes[old_gen][type][zone],
-+                           lrugen->sizes[old_gen][type][zone] - delta);
-+        if (new_gen >= 0)
-+                WRITE_ONCE(lrugen->sizes[new_gen][type][zone],
-+                           lrugen->sizes[new_gen][type][zone] + delta);
-+
-+        if (old_gen < 0) {
-+                if (lru_gen_is_active(lruvec, new_gen))
-+                        lru += LRU_ACTIVE;
-+                update_lru_size(lruvec, lru, zone, delta);
-+                return;
-+        }
-+
-+        if (new_gen < 0) {
-+                if (lru_gen_is_active(lruvec, old_gen))
-+                        lru += LRU_ACTIVE;
-+                update_lru_size(lruvec, lru, zone, -delta);
-+                return;
-+        }
-+
-+        if (!lru_gen_is_active(lruvec, old_gen) && lru_gen_is_active(lruvec, new_gen)) {
-+                update_lru_size(lruvec, lru, zone, -delta);
-+                update_lru_size(lruvec, lru + LRU_ACTIVE, zone, delta);
-+        }
-+
-+        VM_BUG_ON(lru_gen_is_active(lruvec, old_gen) && !lru_gen_is_active(lruvec, new_gen));
-+}
-+
-+/* Add a page to one of the multigenerational lru lists. Return true on success. */
-+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+        int gen;
-+        unsigned long old_flags, new_flags;
-+        int type = page_is_file_lru(page);
-+        int zone = page_zonenum(page);
-+        struct lrugen *lrugen = &lruvec->evictable;
-+
-+        if (PageUnevictable(page) || !lrugen->enabled[type])
-+                return false;
-+        /*
-+         * If a page shouldn't be considered for eviction, i.e., a page mapped
-+         * upon fault during which the accessed bit is set, add it to the
-+         * youngest generation.
-+         *
-+         * If a page can't be evicted immediately, i.e., an anon page not in
-+         * swap cache or a dirty page pending writeback, add it to the second
-+         * oldest generation.
-+         *
-+         * If a page could be evicted immediately, e.g., a clean page, add it to
-+         * the oldest generation.
-+         */
-+        if (PageActive(page))
-+                gen = lru_gen_from_seq(lrugen->max_seq);
-+        else if ((!type && !PageSwapCache(page)) ||
-+                 (PageReclaim(page) && (PageDirty(page) || PageWriteback(page))))
-+                gen = lru_gen_from_seq(lrugen->min_seq[type] + 1);
-+        else
-+                gen = lru_gen_from_seq(lrugen->min_seq[type]);
-+
-+        do {
-+                new_flags = old_flags = READ_ONCE(page->flags);
-+                VM_BUG_ON_PAGE(new_flags & LRU_GEN_MASK, page);
-+
-+                new_flags &= ~(LRU_GEN_MASK | BIT(PG_active));
-+                new_flags |= (gen + 1UL) << LRU_GEN_PGOFF;
-+        } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+        lru_gen_update_size(page, lruvec, -1, gen);
-+        /* for rotate_reclaimable_page() */
-+        if (reclaiming)
-+                list_add_tail(&page->lru, &lrugen->lists[gen][type][zone]);
-+        else
-+                list_add(&page->lru, &lrugen->lists[gen][type][zone]);
-+
-+        return true;
-+}
-+
-+/* Delete a page from one of the multigenerational lru lists. Return true on success. */
-+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+        int gen;
-+        unsigned long old_flags, new_flags;
-+
-+        do {
-+                new_flags = old_flags = READ_ONCE(page->flags);
-+                if (!(new_flags & LRU_GEN_MASK))
-+                        return false;
-+
-+                VM_BUG_ON_PAGE(PageActive(page), page);
-+                VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+
-+                gen = ((new_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+
-+                new_flags &= ~LRU_GEN_MASK;
-+                /* for shrink_page_list() */
-+                if (reclaiming)
-+                        new_flags &= ~(BIT(PG_referenced) | BIT(PG_reclaim));
-+                else if (lru_gen_is_active(lruvec, gen))
-+                        new_flags |= BIT(PG_active);
-+        } while (cmpxchg(&page->flags, old_flags, new_flags) != old_flags);
-+
-+        lru_gen_update_size(page, lruvec, gen, -1);
-+        list_del(&page->lru);
-+
-+        return true;
-+}
-+
-+#else
-+
-+static inline bool lru_gen_enabled(void)
-+{
-+        return false;
-+}
-+
-+static inline bool lru_gen_add_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+        return false;
-+}
-+
-+static inline bool lru_gen_del_page(struct page *page, struct lruvec *lruvec, bool reclaiming)
-+{
-+        return false;
-+}
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- static __always_inline void add_page_to_lru_list(struct page *page,
-                                                  struct lruvec *lruvec)
- {
-        enum lru_list lru = page_lru(page);
-
-+        if (lru_gen_add_page(page, lruvec, false))
-+                return;
-+
-        update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-        list_add(&page->lru, &lruvec->lists[lru]);
- }
-@@ -93,6 +269,9 @@ static __always_inline void add_page_to_
- {
-        enum lru_list lru = page_lru(page);
-
-+        if (lru_gen_add_page(page, lruvec, true))
-+                return;
-+
-        update_lru_size(lruvec, lru, page_zonenum(page), thp_nr_pages(page));
-        list_add_tail(&page->lru, &lruvec->lists[lru]);
- }
-@@ -100,6 +279,9 @@ static __always_inline void add_page_to_
- static __always_inline void del_page_from_lru_list(struct page *page,
-                                                    struct lruvec *lruvec)
- {
-+        if (lru_gen_del_page(page, lruvec, false))
-+                return;
-+
-        list_del(&page->lru);
-        update_lru_size(lruvec, page_lru(page), page_zonenum(page),
-                        -thp_nr_pages(page));
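
[ The add/del helpers above update page->flags with a compare-and-swap loop
  so the generation counter and PG_active change atomically against racing
  flag updates. A userspace model of that pattern, using C11 atomics in place
  of the kernel's cmpxchg() and made-up bit positions: ]

#include <stdatomic.h>
#include <stdio.h>

#define LRU_GEN_WIDTH   3
#define LRU_GEN_PGOFF   (64 - LRU_GEN_WIDTH)
#define LRU_GEN_MASK    (((1UL << LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
#define PG_ACTIVE       (1UL << 5)              /* illustrative bit */

static _Atomic unsigned long page_flags;        /* stand-in for page->flags */

static void set_gen(int gen)
{
        unsigned long old = atomic_load(&page_flags), new;

        do {
                /* clear the old counter and PG_ACTIVE, then store gen+1 */
                new = (old & ~(LRU_GEN_MASK | PG_ACTIVE)) |
                      ((gen + 1UL) << LRU_GEN_PGOFF);
                /* on failure, 'old' is refreshed and the loop retries */
        } while (!atomic_compare_exchange_weak(&page_flags, &old, new));
}

int main(void)
{
        atomic_store(&page_flags, PG_ACTIVE);
        set_gen(3);
        printf("flags=%#lx gen=%lu\n", atomic_load(&page_flags),
               ((atomic_load(&page_flags) & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1);
        return 0;
}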
--- a/include/linux/mmzone.h
-+++ b/include/linux/mmzone.h
-@@ -294,6 +294,72 @@ enum lruvec_flags {
-         */
- };
-
-+struct lruvec;
-+
-+#define LRU_GEN_MASK   ((BIT(LRU_GEN_WIDTH) - 1) << LRU_GEN_PGOFF)
-+#define LRU_REFS_MASK  ((BIT(LRU_REFS_WIDTH) - 1) << LRU_REFS_PGOFF)
-+
-+#ifdef CONFIG_LRU_GEN
-+
-+/*
-+ * For each lruvec, evictable pages are divided into multiple generations. The
-+ * youngest and the oldest generation numbers, AKA max_seq and min_seq, are
-+ * monotonically increasing. The sliding window technique is used to track at
-+ * least MIN_NR_GENS and at most MAX_NR_GENS generations. An offset within the
-+ * window, AKA gen, indexes an array of per-type and per-zone lists for the
-+ * corresponding generation. The counter in page->flags stores gen+1 while a
-+ * page is on one of the multigenerational lru lists. Otherwise, it stores 0.
-+ *
-+ * After a page is faulted in, the aging must check the accessed bit at least
-+ * twice before the eviction would consider it. The first check clears the
-+ * accessed bit set during the initial fault. The second check makes sure this
-+ * page hasn't been used since then.
-+ */
-+#define MIN_NR_GENS    2
-+#define MAX_NR_GENS    ((unsigned int)CONFIG_NR_LRU_GENS)
-+
-+struct lrugen {
-+        /* the aging increments the max generation number */
-+        unsigned long max_seq;
-+        /* the eviction increments the min generation numbers */
-+        unsigned long min_seq[ANON_AND_FILE];
-+        /* the birth time of each generation in jiffies */
-+        unsigned long timestamps[MAX_NR_GENS];
-+        /* the multigenerational lru lists */
-+        struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+        /* the sizes of the multigenerational lru lists in pages */
-+        unsigned long sizes[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
-+        /* whether the multigenerational lru is enabled */
-+        bool enabled[ANON_AND_FILE];
-+};
-+
-+#define MAX_BATCH_SIZE 8192
-+
-+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec);
-+void lru_gen_change_state(bool enable, bool main, bool swap);
-+
-+#ifdef CONFIG_MEMCG
-+void lru_gen_init_memcg(struct mem_cgroup *memcg);
-+#endif
-+
-+#else /* !CONFIG_LRU_GEN */
-+
-+static inline void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
-+{
-+}
-+
-+static inline void lru_gen_change_state(bool enable, bool main, bool swap)
-+{
-+}
-+
-+#ifdef CONFIG_MEMCG
-+static inline void lru_gen_init_memcg(struct mem_cgroup *memcg)
-+{
-+}
-+#endif
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- struct lruvec {
-        struct list_head                lists[NR_LRU_LISTS];
-        /* per lruvec lru_lock for memcg */
-@@ -311,6 +377,10 @@ struct lruvec {
-        unsigned long                   refaults[ANON_AND_FILE];
-        /* Various lruvec state flags (enum lruvec_flags) */
-        unsigned long                   flags;
-+#ifdef CONFIG_LRU_GEN
-+        /* unevictable pages are on LRU_UNEVICTABLE */
-+        struct lrugen                   evictable;
-+#endif
- #ifdef CONFIG_MEMCG
-        struct pglist_data *pgdat;
- #endif
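
[ A toy model of the per-lruvec state declared above: a
  MAX_NR_GENS x ANON_AND_FILE x MAX_NR_ZONES array of list heads, with
  min_seq/max_seq selecting the live slots. All constants here are
  illustrative; MAX_NR_ZONES in particular is arch-dependent in the kernel: ]

#include <stdio.h>

#define MAX_NR_GENS     4
#define ANON_AND_FILE   2
#define MAX_NR_ZONES    4

struct list_head { struct list_head *prev, *next; };

struct lrugen_model {
        unsigned long max_seq;
        unsigned long min_seq[ANON_AND_FILE];
        struct list_head lists[MAX_NR_GENS][ANON_AND_FILE][MAX_NR_ZONES];
};

int main(void)
{
        struct lrugen_model lrugen = { .max_seq = 3, .min_seq = { 1, 0 } };
        int gen, type, zone;

        /* mirrors for_each_gen_type_zone() in the mm/vmscan.c hunk below */
        for (gen = 0; gen < MAX_NR_GENS; gen++)
                for (type = 0; type < ANON_AND_FILE; type++)
                        for (zone = 0; zone < MAX_NR_ZONES; zone++) {
                                struct list_head *head = &lrugen.lists[gen][type][zone];
                                head->prev = head->next = head; /* INIT_LIST_HEAD() */
                        }

        /* anon (type 0) currently spans generations min_seq[0]..max_seq */
        printf("live anon generations: %lu\n",
               lrugen.max_seq - lrugen.min_seq[0] + 1);
        return 0;
}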
--- a/include/linux/page-flags-layout.h
-+++ b/include/linux/page-flags-layout.h
-@@ -26,6 +26,14 @@
-
- #define ZONES_WIDTH            ZONES_SHIFT
-
-+#ifdef CONFIG_LRU_GEN
-+/* LRU_GEN_WIDTH is generated from order_base_2(CONFIG_NR_LRU_GENS + 1). */
-+#define LRU_REFS_WIDTH         (CONFIG_TIERS_PER_GEN - 2)
-+#else
-+#define LRU_GEN_WIDTH          0
-+#define LRU_REFS_WIDTH         0
-+#endif /* CONFIG_LRU_GEN */
-+
- #ifdef CONFIG_SPARSEMEM
- #include <asm/sparsemem.h>
- #define SECTIONS_SHIFT (MAX_PHYSMEM_BITS - SECTION_SIZE_BITS)
-@@ -55,7 +63,8 @@
- #define SECTIONS_WIDTH         0
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_SHIFT \
-+        <= BITS_PER_LONG - NR_PAGEFLAGS
- #define NODES_WIDTH            NODES_SHIFT
- #elif defined(CONFIG_SPARSEMEM_VMEMMAP)
- #error "Vmemmap: No space for nodes field in page flags"
-@@ -89,8 +98,8 @@
- #define LAST_CPUPID_SHIFT 0
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT \
--        <= BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
-+        KASAN_TAG_WIDTH + LAST_CPUPID_SHIFT <= BITS_PER_LONG - NR_PAGEFLAGS
- #define LAST_CPUPID_WIDTH LAST_CPUPID_SHIFT
- #else
- #define LAST_CPUPID_WIDTH 0
-@@ -100,8 +109,8 @@
- #define LAST_CPUPID_NOT_IN_PAGE_FLAGS
- #endif
-
--#if ZONES_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH \
--        > BITS_PER_LONG - NR_PAGEFLAGS
-+#if ZONES_WIDTH + LRU_GEN_WIDTH + LRU_REFS_WIDTH + SECTIONS_WIDTH + NODES_WIDTH + \
-+        KASAN_TAG_WIDTH + LAST_CPUPID_WIDTH > BITS_PER_LONG - NR_PAGEFLAGS
- #error "Not enough bits in page flags"
- #endif
-
---- a/include/linux/page-flags.h
-+++ b/include/linux/page-flags.h
-@@ -845,7 +845,7 @@ static inline void ClearPageSlabPfmemall
-         1UL << PG_private      | 1UL << PG_private_2   |       \
-         1UL << PG_writeback    | 1UL << PG_reserved    |       \
-         1UL << PG_slab         | 1UL << PG_active      |       \
---        1UL << PG_unevictable  | __PG_MLOCKED)
-+        1UL << PG_unevictable  | __PG_MLOCKED | LRU_GEN_MASK)
-
- /*
-  * Flags checked when a page is prepped for return by the page allocator.
-@@ -856,7 +856,7 @@ static inline void ClearPageSlabPfmemall
-  * alloc-free cycle to prevent from reusing the page.
-  */
- #define PAGE_FLAGS_CHECK_AT_PREP       \
--        (PAGEFLAGS_MASK & ~__PG_HWPOISON)
-+        ((PAGEFLAGS_MASK & ~__PG_HWPOISON) | LRU_GEN_MASK | LRU_REFS_MASK)
-
- #define PAGE_FLAGS_PRIVATE                             \
-        (1UL << PG_private | 1UL << PG_private_2)
---- a/include/linux/sched.h
-+++ b/include/linux/sched.h
-@@ -911,6 +911,9 @@ struct task_struct {
- #ifdef CONFIG_MEMCG
-        unsigned                        in_user_fault:1;
- #endif
-+#ifdef CONFIG_LRU_GEN
-+        unsigned                        in_nonseq_fault:1;
-+#endif
- #ifdef CONFIG_COMPAT_BRK
-        unsigned                        brk_randomized:1;
- #endif
---- a/kernel/bounds.c
-+++ b/kernel/bounds.c
-@@ -22,6 +22,9 @@ int main(void)
-        DEFINE(NR_CPUS_BITS, ilog2(CONFIG_NR_CPUS));
- #endif
-        DEFINE(SPINLOCK_SIZE, sizeof(spinlock_t));
-+#ifdef CONFIG_LRU_GEN
-+        DEFINE(LRU_GEN_WIDTH, order_base_2(CONFIG_NR_LRU_GENS + 1));
-+#endif
-        /* End of constants */
-
-        return 0;
---- a/kernel/cgroup/cgroup-internal.h
-+++ b/kernel/cgroup/cgroup-internal.h
-@@ -165,7 +165,6 @@ struct cgroup_mgctx {
- #define DEFINE_CGROUP_MGCTX(name)                                      \
-        struct cgroup_mgctx name = CGROUP_MGCTX_INIT(name)
-
--extern struct mutex cgroup_mutex;
- extern spinlock_t css_set_lock;
- extern struct cgroup_subsys *cgroup_subsys[];
- extern struct list_head cgroup_roots;
---- a/mm/huge_memory.c
-+++ b/mm/huge_memory.c
-@@ -2364,7 +2364,8 @@ static void __split_huge_page_tail(struc
- #ifdef CONFIG_64BIT
-                         (1L << PG_arch_2) |
- #endif
--                        (1L << PG_dirty)));
-+                        (1L << PG_dirty) |
-+                        LRU_GEN_MASK | LRU_REFS_MASK));
-
-        /* ->mapping in first tail page is compound_mapcount */
-        VM_BUG_ON_PAGE(tail > 2 && page_tail->mapping != TAIL_MAPPING,
---- a/mm/memcontrol.c
-+++ b/mm/memcontrol.c
-@@ -5226,6 +5226,7 @@ static struct mem_cgroup *mem_cgroup_all
-        memcg->deferred_split_queue.split_queue_len = 0;
- #endif
-        idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
-+        lru_gen_init_memcg(memcg);
-        return memcg;
- fail:
-        mem_cgroup_id_remove(memcg);
---- a/mm/memory.c
-+++ b/mm/memory.c
-@@ -4788,6 +4788,7 @@ vm_fault_t handle_mm_fault(struct vm_are
-                           unsigned int flags, struct pt_regs *regs)
- {
-        vm_fault_t ret;
-+        bool nonseq_fault = !(vma->vm_flags & VM_SEQ_READ);
-
-        __set_current_state(TASK_RUNNING);
-
-@@ -4809,11 +4810,17 @@ vm_fault_t handle_mm_fault(struct vm_are
-        if (flags & FAULT_FLAG_USER)
-                mem_cgroup_enter_user_fault();
-
-+        if (nonseq_fault)
-+                task_enter_nonseq_fault();
-+
-        if (unlikely(is_vm_hugetlb_page(vma)))
-                ret = hugetlb_fault(vma->vm_mm, vma, address, flags);
-        else
-                ret = __handle_mm_fault(vma, address, flags);
-
-+        if (nonseq_fault)
-+                task_exit_nonseq_fault();
-+
-        if (flags & FAULT_FLAG_USER) {
-                mem_cgroup_exit_user_fault();
-                /*
---- a/mm/mm_init.c
-+++ b/mm/mm_init.c
-@@ -65,14 +65,16 @@ void __init mminit_verify_pageflags_layo
-
-        shift = 8 * sizeof(unsigned long);
-        width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
--                - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
-+                - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH - LRU_GEN_WIDTH - LRU_REFS_WIDTH;
-        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
--                "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
-+                "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Gen %d Tier %d Flags %d\n",
-                SECTIONS_WIDTH,
-                NODES_WIDTH,
-                ZONES_WIDTH,
-                LAST_CPUPID_WIDTH,
-                KASAN_TAG_WIDTH,
-+                LRU_GEN_WIDTH,
-+                LRU_REFS_WIDTH,
-                NR_PAGEFLAGS);
-        mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
-                "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
---- a/mm/page_alloc.c
-+++ b/mm/page_alloc.c
-@@ -7456,6 +7456,7 @@ static void __meminit pgdat_init_interna
-
-        pgdat_page_ext_init(pgdat);
-
-        lruvec_init(&pgdat->__lruvec);
-+        lru_gen_init_state(NULL, &pgdat->__lruvec);
- }
-
- static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid,
---- a/mm/swap.c
-+++ b/mm/swap.c
-@@ -446,6 +446,11 @@ void lru_cache_add(struct page *page)
-        VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
-        VM_BUG_ON_PAGE(PageLRU(page), page);
-
-+        /* see the comment in lru_gen_add_page() */
-+        if (lru_gen_enabled() && !PageUnevictable(page) &&
-+            task_in_nonseq_fault() && !(current->flags & PF_MEMALLOC))
-+                SetPageActive(page);
-+
-        get_page(page);
-        local_lock(&lru_pvecs.lock);
-        pvec = this_cpu_ptr(&lru_pvecs.lru_add);
-@@ -547,7 +552,7 @@ static void lru_deactivate_file_fn(struc
-
- static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
- {
--        if (PageActive(page) && !PageUnevictable(page)) {
-+        if (!PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
-                int nr_pages = thp_nr_pages(page);
-
-                del_page_from_lru_list(page, lruvec);
-@@ -661,7 +666,7 @@ void deactivate_file_page(struct page *p
-  */
- void deactivate_page(struct page *page)
- {
--        if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
-+        if (PageLRU(page) && !PageUnevictable(page) && (PageActive(page) || lru_gen_enabled())) {
-                struct pagevec *pvec;
-
-                local_lock(&lru_pvecs.lock);
---- a/mm/swapfile.c
-+++ b/mm/swapfile.c
-@@ -2688,6 +2688,7 @@ SYSCALL_DEFINE1(swapoff, const char __us
-        err = 0;
-        atomic_inc(&proc_poll_event);
-        wake_up_interruptible(&proc_poll_wait);
-+        lru_gen_change_state(false, false, true);
-
- out_dput:
-        filp_close(victim, NULL);
-@@ -3349,6 +3350,7 @@ SYSCALL_DEFINE2(swapon, const char __use
-        mutex_unlock(&swapon_mutex);
-        atomic_inc(&proc_poll_event);
-        wake_up_interruptible(&proc_poll_wait);
-+        lru_gen_change_state(true, false, true);
-
-        error = 0;
-        goto out;
---- a/mm/vmscan.c
-+++ b/mm/vmscan.c
-@@ -50,6 +50,7 @@
- #include <linux/printk.h>
- #include <linux/dax.h>
- #include <linux/psi.h>
-+#include <linux/memory.h>
-
- #include <asm/tlbflush.h>
- #include <asm/div64.h>
-@@ -2880,6 +2881,273 @@ static bool can_age_anon_pages(struct pg
-        return can_demote(pgdat->node_id, sc);
- }
-
-+#ifdef CONFIG_LRU_GEN
-+
-+/******************************************************************************
-+ * shorthand helpers
-+ ******************************************************************************/
-+
-+#define for_each_gen_type_zone(gen, type, zone)                                \
-+        for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++)                          \
-+                for ((type) = 0; (type) < ANON_AND_FILE; (type)++)             \
-+                        for ((zone) = 0; (zone) < MAX_NR_ZONES; (zone)++)
-+
-+static int page_lru_gen(struct page *page)
-+{
-+        unsigned long flags = READ_ONCE(page->flags);
-+
-+        return ((flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1;
-+}
-+
-+static struct lruvec *get_lruvec(int nid, struct mem_cgroup *memcg)
-+{
-+        struct pglist_data *pgdat = NODE_DATA(nid);
-+
-+#ifdef CONFIG_MEMCG
-+        if (memcg) {
-+                struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
-+
-+                if (lruvec->pgdat != pgdat)
-+                        lruvec->pgdat = pgdat;
-+
-+                return lruvec;
-+        }
-+#endif
-+        return pgdat ? &pgdat->__lruvec : NULL;
-+}
-+
-+static int get_nr_gens(struct lruvec *lruvec, int type)
-+{
-+        return lruvec->evictable.max_seq - lruvec->evictable.min_seq[type] + 1;
-+}
-+
-+static bool __maybe_unused seq_is_valid(struct lruvec *lruvec)
-+{
-+        return get_nr_gens(lruvec, 1) >= MIN_NR_GENS &&
-+               get_nr_gens(lruvec, 1) <= get_nr_gens(lruvec, 0) &&
-+               get_nr_gens(lruvec, 0) <= MAX_NR_GENS;
-+}
-+
-+/******************************************************************************
-+ * state change
-+ ******************************************************************************/
-+
-+#ifdef CONFIG_LRU_GEN_ENABLED
-+DEFINE_STATIC_KEY_TRUE(lru_gen_static_key);
-+#else
-+DEFINE_STATIC_KEY_FALSE(lru_gen_static_key);
-+#endif
-+
-+static int lru_gen_nr_swapfiles;
-+
-+static bool __maybe_unused state_is_valid(struct lruvec *lruvec)
-+{
-+        int gen, type, zone;
-+        enum lru_list lru;
-+        struct lrugen *lrugen = &lruvec->evictable;
-+
-+        for_each_evictable_lru(lru) {
-+                type = is_file_lru(lru);
-+
-+                if (lrugen->enabled[type] && !list_empty(&lruvec->lists[lru]))
-+                        return false;
-+        }
-+
-+        for_each_gen_type_zone(gen, type, zone) {
-+                if (!lrugen->enabled[type] && !list_empty(&lrugen->lists[gen][type][zone]))
-+                        return false;
-+
-+                /* unlikely but not a bug when reset_batch_size() is pending */
-+                VM_WARN_ON(!lrugen->enabled[type] && lrugen->sizes[gen][type][zone]);
-+        }
-+
-+        return true;
-+}
-+
-+static bool fill_lists(struct lruvec *lruvec)
-+{
-+        enum lru_list lru;
-+        int remaining = MAX_BATCH_SIZE;
-+
-+        for_each_evictable_lru(lru) {
-+                int type = is_file_lru(lru);
-+                bool active = is_active_lru(lru);
-+                struct list_head *head = &lruvec->lists[lru];
-+
-+                if (!lruvec->evictable.enabled[type])
-+                        continue;
-+
-+                while (!list_empty(head)) {
-+                        bool success;
-+                        struct page *page = lru_to_page(head);
-+
-+                        VM_BUG_ON_PAGE(PageTail(page), page);
-+                        VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+                        VM_BUG_ON_PAGE(PageActive(page) != active, page);
-+                        VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+                        VM_BUG_ON_PAGE(page_lru_gen(page) < MAX_NR_GENS, page);
-+
-+                        prefetchw_prev_lru_page(page, head, flags);
-+
-+                        del_page_from_lru_list(page, lruvec);
-+                        success = lru_gen_add_page(page, lruvec, false);
-+                        VM_BUG_ON(!success);
-+
-+                        if (!--remaining)
-+                                return false;
-+                }
-+        }
-+
-+        return true;
-+}
-+
-+static bool drain_lists(struct lruvec *lruvec)
-+{
-+        int gen, type, zone;
-+        int remaining = MAX_BATCH_SIZE;
-+
-+        for_each_gen_type_zone(gen, type, zone) {
-+                struct list_head *head = &lruvec->evictable.lists[gen][type][zone];
-+
-+                if (lruvec->evictable.enabled[type])
-+                        continue;
-+
-+                while (!list_empty(head)) {
-+                        bool success;
-+                        struct page *page = lru_to_page(head);
-+
-+                        VM_BUG_ON_PAGE(PageTail(page), page);
-+                        VM_BUG_ON_PAGE(PageUnevictable(page), page);
-+                        VM_BUG_ON_PAGE(PageActive(page), page);
-+                        VM_BUG_ON_PAGE(page_is_file_lru(page) != type, page);
-+                        VM_BUG_ON_PAGE(page_zonenum(page) != zone, page);
-+
-+                        prefetchw_prev_lru_page(page, head, flags);
-+
-+                        success = lru_gen_del_page(page, lruvec, false);
-+                        VM_BUG_ON(!success);
-+                        add_page_to_lru_list(page, lruvec);
-+
-+                        if (!--remaining)
-+                                return false;
-+                }
-+        }
-+
-+        return true;
-+}
-+
-+/*
-+ * For file page tracking, we enable/disable it according to the main switch.
-+ * For anon page tracking, we only enabled it when the main switch is on and
-+ * there is at least one swapfile; we disable it when there are no swapfiles
-+ * regardless of the value of the main switch. Otherwise, we will eventually
-+ * reach the max size of the sliding window and have to call inc_min_seq().
-+ */
-+void lru_gen_change_state(bool enable, bool main, bool swap)
-+{
-+        static DEFINE_MUTEX(state_mutex);
-+
-+        struct mem_cgroup *memcg;
-+
-+        mem_hotplug_begin();
-+        cgroup_lock();
-+        mutex_lock(&state_mutex);
-+
-+        if (swap) {
-+                if (enable)
-+                        swap = !lru_gen_nr_swapfiles++;
-+                else
-+                        swap = !--lru_gen_nr_swapfiles;
-+        }
-+
-+        if (main && enable != lru_gen_enabled()) {
-+                if (enable)
-+                        static_branch_enable(&lru_gen_static_key);
-+                else
-+                        static_branch_disable(&lru_gen_static_key);
-+        } else if (!swap || !lru_gen_enabled())
-+                goto unlock;
-+
-+        memcg = mem_cgroup_iter(NULL, NULL, NULL);
-+        do {
-+                int nid;
-+
-+                for_each_node(nid) {
-+                        struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+                        if (!lruvec)
-+                                continue;
-+
-+                        spin_lock_irq(&lruvec->lru_lock);
-+
-+                        VM_BUG_ON(!seq_is_valid(lruvec));
-+                        VM_BUG_ON(!state_is_valid(lruvec));
-+
-+                        lruvec->evictable.enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-+                        lruvec->evictable.enabled[1] = lru_gen_enabled();
-+
-+                        while (!(enable ? fill_lists(lruvec) : drain_lists(lruvec))) {
-+                                spin_unlock_irq(&lruvec->lru_lock);
-+                                cond_resched();
-+                                spin_lock_irq(&lruvec->lru_lock);
-+                        }
-+
-+                        spin_unlock_irq(&lruvec->lru_lock);
-+                }
-+
-+                cond_resched();
-+        } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)));
-+unlock:
-+        mutex_unlock(&state_mutex);
-+        cgroup_unlock();
-+        mem_hotplug_done();
-+}
-+
-+/******************************************************************************
-+ * initialization
-+ ******************************************************************************/
-+
-+void lru_gen_init_state(struct mem_cgroup *memcg, struct lruvec *lruvec)
-+{
-+        int i;
-+        int gen, type, zone;
-+        struct lrugen *lrugen = &lruvec->evictable;
-+
-+        lrugen->max_seq = MIN_NR_GENS + 1;
-+        lrugen->enabled[0] = lru_gen_enabled() && lru_gen_nr_swapfiles;
-+        lrugen->enabled[1] = lru_gen_enabled();
-+
-+        for (i = 0; i <= MIN_NR_GENS + 1; i++)
-+                lrugen->timestamps[i] = jiffies;
-+
-+        for_each_gen_type_zone(gen, type, zone)
-+                INIT_LIST_HEAD(&lrugen->lists[gen][type][zone]);
-+}
-+
-+#ifdef CONFIG_MEMCG
-+void lru_gen_init_memcg(struct mem_cgroup *memcg)
-+{
-+        int nid;
-+
-+        for_each_node(nid) {
-+                struct lruvec *lruvec = get_lruvec(nid, memcg);
-+
-+                lru_gen_init_state(memcg, lruvec);
-+        }
-+}
-+#endif
-+
-+static int __init init_lru_gen(void)
-+{
-+        BUILD_BUG_ON(MIN_NR_GENS + 1 >= MAX_NR_GENS);
-+        BUILD_BUG_ON(BIT(LRU_GEN_WIDTH) <= MAX_NR_GENS);
-+
-+        return 0;
-+};
-+late_initcall(init_lru_gen);
-+
-+#endif /* CONFIG_LRU_GEN */
-+
- static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
- {
-        unsigned long nr[NR_LRU_LISTS];
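
[ A condensed model of the enable logic in lru_gen_change_state() above:
  file tracking follows the main switch, while anon tracking additionally
  requires at least one swapfile, matching the swapon()/swapoff() hooks in
  the mm/swapfile.c hunk. Illustrative code, not kernel code: ]

#include <stdbool.h>
#include <stdio.h>

static bool main_switch;        /* models lru_gen_static_key */
static int nr_swapfiles;        /* models lru_gen_nr_swapfiles */

static void change_state(bool enable, bool main, bool swap)
{
        if (swap)
                nr_swapfiles += enable ? 1 : -1;
        if (main)
                main_switch = enable;

        /* mirrors the per-lruvec enabled[ANON_AND_FILE] assignments */
        printf("anon=%d file=%d\n",
               main_switch && nr_swapfiles > 0, (int)main_switch);
}

int main(void)
{
        change_state(true, true, false);        /* main switch on */
        change_state(true, false, true);        /* swapon: anon tracking on */
        change_state(false, false, true);       /* swapoff: anon tracking off */
        return 0;
}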