diff --git a/include/linux/mm.h b/include/linux/mm.h index 6063ee7..322ee8c 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1080,6 +1080,8 @@ struct zap_details { struct address_space *check_mapping; /* Check page->mapping if set */ pgoff_t first_index; /* Lowest page->index to unmap */ pgoff_t last_index; /* Highest page->index to unmap */ + bool ignore_dirty; /* Ignore dirty pages */ + bool check_swap_entries; /* Check also swap entries */ }; struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, diff --git a/include/linux/oom.h b/include/linux/oom.h index 03e6257..45993b8 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -91,7 +91,7 @@ extern enum oom_scan_t oom_scan_process_thread(struct oom_control *oc, extern bool out_of_memory(struct oom_control *oc); -extern void exit_oom_victim(void); +extern void exit_oom_victim(struct task_struct *tsk); extern int register_oom_notifier(struct notifier_block *nb); extern int unregister_oom_notifier(struct notifier_block *nb); diff --git a/include/linux/sched.h b/include/linux/sched.h index 1c0193b..34a7687 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -423,6 +423,7 @@ extern signed long schedule_timeout(signed long timeout); extern signed long schedule_timeout_interruptible(signed long timeout); extern signed long schedule_timeout_killable(signed long timeout); extern signed long schedule_timeout_uninterruptible(signed long timeout); +extern signed long schedule_timeout_idle(signed long timeout); asmlinkage void schedule(void); extern void schedule_preempt_disabled(void); @@ -1815,6 +1816,9 @@ struct task_struct { unsigned long task_state_change; #endif int pagefault_disabled; +#ifdef CONFIG_MMU + struct task_struct *oom_reaper_list; +#endif /* CPU-specific state of this task */ struct thread_struct thread; /* diff --git a/kernel/exit.c b/kernel/exit.c index ffba5df..ce70d01 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -436,7 +436,7 @@ static void exit_mm(struct task_struct *tsk) mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) - exit_oom_victim(); + exit_oom_victim(tsk); } static struct task_struct *find_alive_thread(struct task_struct *p) diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d11..2c40347 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1566,6 +1566,17 @@ signed long __sched schedule_timeout_uninterruptible(signed long timeout) } EXPORT_SYMBOL(schedule_timeout_uninterruptible); +/* + * Like schedule_timeout_uninterruptible(), except this task will not contribute + * to load average. + */ +signed long __sched schedule_timeout_idle(signed long timeout) +{ + __set_current_state(TASK_IDLE); + return schedule_timeout(timeout); +} +EXPORT_SYMBOL(schedule_timeout_idle); + #ifdef CONFIG_HOTPLUG_CPU static void migrate_timer_list(struct tvec_base *new_base, struct hlist_head *head) { diff --git a/mm/internal.h b/mm/internal.h index 38e24b8..53f50ec 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -41,6 +41,11 @@ static inline void set_page_count(struct page *page, int v) atomic_set(&page->_count, v); } +void unmap_page_range(struct mmu_gather *tlb, + struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct zap_details *details); + extern int __do_page_cache_readahead(struct address_space *mapping, struct file *filp, pgoff_t offset, unsigned long nr_to_read, unsigned long lookahead_size); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6ba4dd9..70c9ae6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5000,7 +5000,6 @@ static void mem_cgroup_move_charge(void) .mm = mc.mm, }; - lru_add_drain_all(); /* * Signal mem_cgroup_begin_page_stat() to take the memcg's * move_lock while we're moving its pages to another memcg. diff --git a/mm/memory.c b/mm/memory.c index 278c33c..7897aba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1157,6 +1157,12 @@ again: rss[MM_ANONPAGES]--; else { if (pte_dirty(ptent)) { + /* + * oom_reaper cannot tear down dirty + * pages + */ + if (unlikely(details && details->ignore_dirty)) + continue; force_flush = 1; set_page_dirty(page); } @@ -1175,8 +1181,8 @@ again: } continue; } - /* If details->check_mapping, we leave swap entries. */ - if (unlikely(details)) + /* only check swap_entries if explicitly asked for in details */ + if (unlikely(details && !details->check_swap_entries)) continue; entry = pte_to_swp_entry(ptent); @@ -1285,7 +1291,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb, return addr; } -static void unmap_page_range(struct mmu_gather *tlb, +void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, unsigned long addr, unsigned long end, struct zap_details *details) @@ -1293,9 +1299,6 @@ static void unmap_page_range(struct mmu_gather *tlb, pgd_t *pgd; unsigned long next; - if (details && !details->check_mapping) - details = NULL; - BUG_ON(addr >= end); tlb_start_vma(tlb, vma); pgd = pgd_offset(vma->vm_mm, addr); @@ -2448,7 +2451,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details; + struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index c126809..d3925cf 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -35,6 +35,11 @@ #include #include #include +#include +#include + +#include +#include "internal.h" #define CREATE_TRACE_POINTS #include @@ -408,6 +413,171 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_victims_wait); bool oom_killer_disabled __read_mostly; +#define K(x) ((x) << (PAGE_SHIFT-10)) + +#ifdef CONFIG_MMU +/* + * OOM Reaper kernel thread which tries to reap the memory used by the OOM + * victim (if that is possible) to help the OOM killer to move on. + */ +static struct task_struct *oom_reaper_th; +static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); +static struct task_struct *oom_reaper_list; +static DEFINE_SPINLOCK(oom_reaper_lock); + + +static bool __oom_reap_task(struct task_struct *tsk) +{ + struct mmu_gather tlb; + struct vm_area_struct *vma; + struct mm_struct *mm; + struct task_struct *p; + struct zap_details details = {.check_swap_entries = true, + .ignore_dirty = true}; + bool ret = true; + + /* + * Make sure we find the associated mm_struct even when the particular + * thread has already terminated and cleared its mm. + * We might have race with exit path so consider our work done if there + * is no mm. + */ + p = find_lock_task_mm(tsk); + if (!p) + return true; + + mm = p->mm; + if (!atomic_inc_not_zero(&mm->mm_users)) { + task_unlock(p); + return true; + } + + task_unlock(p); + + if (!down_read_trylock(&mm->mmap_sem)) { + ret = false; + goto out; + } + + tlb_gather_mmu(&tlb, mm, 0, -1); + for (vma = mm->mmap ; vma; vma = vma->vm_next) { + if (is_vm_hugetlb_page(vma)) + continue; + + /* + * mlocked VMAs require explicit munlocking before unmap. + * Let's keep it simple here and skip such VMAs. + */ + if (vma->vm_flags & VM_LOCKED) + continue; + + /* + * Only anonymous pages have a good chance to be dropped + * without additional steps which we cannot afford as we + * are OOM already. + * + * We do not even care about fs backed pages because all + * which are reclaimable have already been reclaimed and + * we do not want to block exit_mmap by keeping mm ref + * count elevated without a good reason. + */ + if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) + unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, + &details); + } + tlb_finish_mmu(&tlb, 0, -1); + pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB\n", + task_pid_nr(tsk), tsk->comm, + K(get_mm_counter(mm, MM_ANONPAGES)), + K(get_mm_counter(mm, MM_FILEPAGES))); + up_read(&mm->mmap_sem); + + /* + * Clear TIF_MEMDIE because the task shouldn't be sitting on a + * reasonably reclaimable memory anymore. OOM killer can continue + * by selecting other victim if unmapping hasn't led to any + * improvements. This also means that selecting this task doesn't + * make any sense. + */ + tsk->signal->oom_score_adj = OOM_SCORE_ADJ_MIN; + exit_oom_victim(tsk); +out: + mmput(mm); + return ret; +} + +#define MAX_OOM_REAP_RETRIES 10 +static void oom_reap_task(struct task_struct *tsk) +{ + int attempts = 0; + + /* Retry the down_read_trylock(mmap_sem) a few times */ + while (attempts++ < MAX_OOM_REAP_RETRIES && !__oom_reap_task(tsk)) + schedule_timeout_idle(HZ/10); + + if (attempts > MAX_OOM_REAP_RETRIES) { + pr_info("oom_reaper: unable to reap pid:%d (%s)\n", + task_pid_nr(tsk), tsk->comm); + debug_show_all_locks(); + } + + /* Drop a reference taken by wake_oom_reaper */ + put_task_struct(tsk); +} + +static int oom_reaper(void *unused) +{ + set_freezable(); + + while (true) { + struct task_struct *tsk = NULL; + + wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL); + spin_lock(&oom_reaper_lock); + if (oom_reaper_list != NULL) { + tsk = oom_reaper_list; + oom_reaper_list = tsk->oom_reaper_list; + } + spin_unlock(&oom_reaper_lock); + + if (tsk) + oom_reap_task(tsk); + } + + return 0; +} + +static void wake_oom_reaper(struct task_struct *tsk) +{ + if (!oom_reaper_th || tsk->oom_reaper_list) + return; + + get_task_struct(tsk); + + spin_lock(&oom_reaper_lock); + tsk->oom_reaper_list = oom_reaper_list; + oom_reaper_list = tsk; + spin_unlock(&oom_reaper_lock); + wake_up(&oom_reaper_wait); +} + +static int __init oom_init(void) +{ + oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper"); + if (IS_ERR(oom_reaper_th)) { + pr_err("Unable to start OOM reaper %ld. Continuing regardless\n", + PTR_ERR(oom_reaper_th)); + oom_reaper_th = NULL; + } + return 0; +} +subsys_initcall(oom_init) +#else +static void wake_oom_reaper(struct task_struct *tsk) +{ +} +#endif + /** * mark_oom_victim - mark the given task as OOM victim * @tsk: task to mark @@ -434,9 +604,10 @@ void mark_oom_victim(struct task_struct *tsk) /** * exit_oom_victim - note the exit of an OOM victim */ -void exit_oom_victim(void) +void exit_oom_victim(struct task_struct *tsk) { - clear_thread_flag(TIF_MEMDIE); + if (!test_and_clear_tsk_thread_flag(tsk, TIF_MEMDIE)) + return; if (!atomic_dec_return(&oom_victims)) wake_up_all(&oom_victims_wait); @@ -501,7 +672,6 @@ static bool process_shares_mm(struct task_struct *p, struct mm_struct *mm) return false; } -#define K(x) ((x) << (PAGE_SHIFT-10)) /* * Must be called while holding a reference to p, which will be released upon * returning. @@ -517,6 +687,7 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); + bool can_oom_reap = true; /* * If the task is already exiting, don't alarm the sysadmin or kill @@ -606,17 +777,23 @@ void oom_kill_process(struct oom_control *oc, struct task_struct *p, continue; if (same_thread_group(p, victim)) continue; - if (unlikely(p->flags & PF_KTHREAD)) - continue; - if (is_global_init(p)) - continue; - if (p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) + if (unlikely(p->flags & PF_KTHREAD) || is_global_init(p) || + p->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) { + /* + * We cannot use oom_reaper for the mm shared by this + * process because it wouldn't get killed and so the + * memory might be still used. + */ + can_oom_reap = false; continue; - + } do_send_sig_info(SIGKILL, SEND_SIG_FORCED, p, true); } rcu_read_unlock(); + if (can_oom_reap) + wake_oom_reaper(victim); + mmdrop(mm); put_task_struct(victim); }