From d78d9d2e2cdc1c010dfeb3dea39fd6fb35b826e2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:49 +0100 Subject: [PATCH 01/24] block: Unhash block device inodes on gendisk destruction Currently, block device inodes stay around after the corresponding gendisk has died until memory reclaim finds them and frees them. Since we will make the block device inode pin the bdi, we want to free the block device inode as soon as the device goes away so that the bdi does not stay around unnecessarily. Furthermore, we need to avoid issues when a new device with the same major,minor pair gets created, since reusing the bdi structure would be rather difficult in this case. Unhashing the block device inode on gendisk destruction nicely deals with these problems. Once the last block device inode reference is dropped (which may be directly in del_gendisk()), the inode gets evicted. Furthermore, if the major,minor pair gets reallocated, we are guaranteed to get a new block device inode even if the old block device inode has not yet been evicted, and thus we avoid issues with possible reuse of the bdi. Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit f44f1ab5a2dcd4e16eab850fd08e40ff2d0c28d4) Signed-off-by: Thiago Jung Bauermann --- block/genhd.c | 2 ++ fs/block_dev.c | 15 +++++++++++++++ include/linux/fs.h | 1 + 3 files changed, 18 insertions(+) diff --git a/block/genhd.c b/block/genhd.c index a5bed6bc869d..18a0783afee4 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -646,6 +646,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { + bdev_unhash_inode(MKDEV(disk->major, + disk->first_minor + part->partno)); invalidate_partition(disk, part->partno); delete_partition(disk, part->partno); } diff --git a/fs/block_dev.c b/fs/block_dev.c index 009fbcd8d228..702480b40d64 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -624,6 +624,21 @@ static int bdev_set(struct inode *inode, void *data) static LIST_HEAD(all_bdevs); +/* + * If there is a bdev inode for this device, unhash it so that it gets evicted + * as soon as last inode reference is dropped. + */ +void bdev_unhash_inode(dev_t dev) +{ + struct inode *inode; + + inode = ilookup5(blockdev_superblock, hash(dev), bdev_test, &dev); + if (inode) { + remove_inode_hash(inode); + iput(inode); + } +} + struct block_device *bdget(dev_t dev) { struct block_device *bdev; diff --git a/include/linux/fs.h b/include/linux/fs.h index c11dcf7d6e72..f0ecce474110 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2316,6 +2316,7 @@ extern struct kmem_cache *names_cachep; #ifdef CONFIG_BLOCK extern int register_blkdev(unsigned int, const char *); extern void unregister_blkdev(unsigned int, const char *); +extern void bdev_unhash_inode(dev_t dev); extern struct block_device *bdget(dev_t); extern struct block_device *bdgrab(struct block_device *bdev); extern void bd_set_size(struct block_device *, loff_t size); -- 2.7.4 From 7b5af30b4ea32afbb92405418ee1edb972c506ad Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:50 +0100 Subject: [PATCH 02/24] block: Use pointer to backing_dev_info from request_queue We will want to have struct backing_dev_info allocated separately from struct request_queue. As the first step, add a pointer to backing_dev_info to request_queue and convert all users touching it. No functional changes in this patch.
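[ Editorial note: the conversion in this patch is mechanical -- every user that previously took the address of the structure embedded in the queue, &q->backing_dev_info, now reads the new pointer field q->backing_dev_info, which blk_alloc_queue_node() points at the still-embedded _backing_dev_info. A minimal sketch of the resulting shape, using simplified structures and a hypothetical helper name rather than the kernel's real definitions:

	struct backing_dev_info { unsigned long ra_pages; };

	struct request_queue {
		struct backing_dev_info *backing_dev_info;	/* what all callers use now */
		struct backing_dev_info _backing_dev_info;	/* still embedded, for now */
	};

	/* sketch of what blk_alloc_queue_node() now does before touching the bdi */
	static void sketch_init_queue_bdi(struct request_queue *q)
	{
		q->backing_dev_info = &q->_backing_dev_info;
	}

Patch 03 in this series then swaps the embedded copy for a dynamically allocated, refcounted backing_dev_info without having to touch these callers again. ]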
Reviewed-by: Christoph Hellwig Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit dc3b17cc8bf21307c7e076e7c778d5db756f7871) Conflicts: block/blk-cgroup.c block/blk-core.c block/blk-settings.c block/blk-sysfs.c block/blk-wbt.c drivers/block/aoe/aoeblk.c drivers/block/drbd/drbd_nl.c drivers/md/dm.c drivers/md/md.c Signed-off-by: Thiago Jung Bauermann --- block/blk-cgroup.c | 6 +- block/blk-core.c | 27 +- block/blk-integrity.c | 4 +- block/blk-sysfs.c | 6 +- block/blk-wbt.c | 751 +++++++++++++++++++++++++++++++++++++++++ block/genhd.c | 2 +- drivers/block/aoe/aoeblk.c | 4 +- drivers/block/drbd/drbd_main.c | 6 +- drivers/block/drbd/drbd_nl.c | 10 +- drivers/block/drbd/drbd_proc.c | 2 +- drivers/block/drbd/drbd_req.c | 2 +- drivers/block/pktcdvd.c | 4 +- drivers/block/rbd.c | 2 +- drivers/md/bcache/request.c | 10 +- drivers/md/bcache/super.c | 8 +- drivers/md/dm-cache-target.c | 2 +- drivers/md/dm-era-target.c | 2 +- drivers/md/dm-table.c | 2 +- drivers/md/dm-thin.c | 2 +- drivers/md/dm.c | 6 +- drivers/md/linear.c | 2 +- drivers/md/md.c | 6 +- drivers/md/multipath.c | 2 +- drivers/md/raid0.c | 6 +- drivers/md/raid1.c | 4 +- drivers/md/raid10.c | 10 +- drivers/md/raid5.c | 12 +- fs/gfs2/ops_fstype.c | 2 +- fs/nilfs2/super.c | 2 +- fs/super.c | 2 +- include/linux/blkdev.h | 3 +- mm/page-writeback.c | 4 +- 32 files changed, 834 insertions(+), 79 deletions(-) create mode 100644 block/blk-wbt.c diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 9d359e05fad7..107d0b7971bd 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -184,7 +184,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(&q->backing_dev_info, + wb_congested = wb_congested_get_create(q->backing_dev_info, blkcg->css.id, GFP_NOWAIT); if (!wb_congested) { ret = -ENOMEM; @@ -468,8 +468,8 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info.dev) - return dev_name(blkg->q->backing_dev_info.dev); + if (blkg->q->backing_dev_info->dev) + return dev_name(blkg->q->backing_dev_info->dev); return NULL; } EXPORT_SYMBOL_GPL(blkg_dev_name); diff --git a/block/blk-core.c b/block/blk-core.c index 5e568590aede..0fb635dbe913 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -73,7 +73,7 @@ static void blk_clear_congested(struct request_list *rl, int sync) * flip its congestion state for events on other blkcgs. 
*/ if (rl == &rl->q->root_rl) - clear_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + clear_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -84,7 +84,7 @@ static void blk_set_congested(struct request_list *rl, int sync) #else /* see blk_clear_congested() */ if (rl == &rl->q->root_rl) - set_wb_congested(rl->q->backing_dev_info.wb.congested, sync); + set_wb_congested(rl->q->backing_dev_info->wb.congested, sync); #endif } @@ -115,7 +115,7 @@ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { struct request_queue *q = bdev_get_queue(bdev); - return &q->backing_dev_info; + return q->backing_dev_info; } EXPORT_SYMBOL(blk_get_backing_dev_info); @@ -583,7 +583,7 @@ void blk_cleanup_queue(struct request_queue *q) blk_flush_integrity(); /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info.laptop_mode_wb_timer); + del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); blk_sync_queue(q); if (q->mq_ops) @@ -595,7 +595,7 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(&q->backing_dev_info); + bdi_unregister(q->backing_dev_info); /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); @@ -707,17 +707,18 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->bio_split) goto fail_id; - q->backing_dev_info.ra_pages = + q->backing_dev_info = &q->_backing_dev_info; + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; - q->backing_dev_info.capabilities = BDI_CAP_CGROUP_WRITEBACK; - q->backing_dev_info.name = "block"; + q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; + q->backing_dev_info->name = "block"; q->node = node_id; - err = bdi_init(&q->backing_dev_info); + err = bdi_init(q->backing_dev_info); if (err) goto fail_split; - setup_timer(&q->backing_dev_info.laptop_mode_wb_timer, + setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); INIT_LIST_HEAD(&q->queue_head); @@ -767,7 +768,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: - bdi_destroy(&q->backing_dev_info); + bdi_destroy(q->backing_dev_info); fail_split: bioset_free(q->bio_split); fail_id: @@ -1191,7 +1192,7 @@ fail_elvpriv: * disturb iosched and blkcg but weird is bettern than dead. 
*/ printk_ratelimited(KERN_WARNING "%s: dev %s: request aux data allocation failed, iosched may be disturbed\n", - __func__, dev_name(q->backing_dev_info.dev)); + __func__, dev_name(q->backing_dev_info->dev)); rq->cmd_flags &= ~REQ_ELVPRIV; rq->elv.icq = NULL; @@ -2738,7 +2739,7 @@ void blk_finish_request(struct request *req, int error) BUG_ON(blk_queued_rq(req)); if (unlikely(laptop_mode) && req->cmd_type == REQ_TYPE_FS) - laptop_io_completion(&req->q->backing_dev_info); + laptop_io_completion(req->q->backing_dev_info); blk_delete_timer(req); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index d69c5c79f98e..9f0ff5ba4f84 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -443,10 +443,10 @@ void blk_integrity_revalidate(struct gendisk *disk) return; if (bi->profile) - disk->queue->backing_dev_info.capabilities |= + disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - disk->queue->backing_dev_info.capabilities &= + disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; } diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index e140cc487ce1..8e1e284fa159 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -75,7 +75,7 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info.ra_pages << + unsigned long ra_kb = q->backing_dev_info->ra_pages << (PAGE_CACHE_SHIFT - 10); return queue_var_show(ra_kb, (page)); @@ -90,7 +90,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) if (ret < 0) return ret; - q->backing_dev_info.ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); + q->backing_dev_info->ra_pages = ra_kb >> (PAGE_CACHE_SHIFT - 10); return ret; } @@ -578,7 +578,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); - bdi_exit(&q->backing_dev_info); + bdi_exit(q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { diff --git a/block/blk-wbt.c b/block/blk-wbt.c new file mode 100644 index 000000000000..1aedb1f7ee0c --- /dev/null +++ b/block/blk-wbt.c @@ -0,0 +1,751 @@ +/* + * buffered writeback throttling. loosely based on CoDel. We can't drop + * packets for IO scheduling, so the logic is something like this: + * + * - Monitor latencies in a defined window of time. + * - If the minimum latency in the above window exceeds some target, increment + * scaling step and scale down queue depth by a factor of 2x. The monitoring + * window is then shrunk to 100 / sqrt(scaling step + 1). + * - For any window where we don't have solid data on what the latencies + * look like, retain status quo. + * - If latencies look good, decrement scaling step. + * - If we're only doing writes, allow the scaling step to go negative. This + * will temporarily boost write performance, snapping back to a stable + * scaling step of 0 if reads show up or the heavy writers finish. Unlike + * positive scaling steps where we shrink the monitoring window, a negative + * scaling step retains the default step==0 window size. 
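+ * + * [Editorial worked example, not part of the original comment: at scaling + * step 3 the monitoring window shrinks to 100 / sqrt(3 + 1) = 50msec. + * rwb_arm_timer() below computes this without floating point as + * (win_nsec << 4) / int_sqrt((scale_step + 1) << 8), i.e. + * (100msec * 16) / int_sqrt(1024) = 1600msec / 32 = 50msec.]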
+ * + * Copyright (C) 2016 Jens Axboe + * + */ +#include +#include +#include +#include +#include + +#include "blk-wbt.h" + +#define CREATE_TRACE_POINTS +#include + +enum { + /* + * Default setting, we'll scale up (to 75% of QD max) or down (min 1) + * from here depending on device stats + */ + RWB_DEF_DEPTH = 16, + + /* + * 100msec window + */ + RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL, + + /* + * Disregard stats, if we don't meet this minimum + */ + RWB_MIN_WRITE_SAMPLES = 3, + + /* + * If we have this number of consecutive windows with not enough + * information to scale up or down, scale up. + */ + RWB_UNKNOWN_BUMP = 5, +}; + +static inline bool rwb_enabled(struct rq_wb *rwb) +{ + return rwb && rwb->wb_normal != 0; +} + +/* + * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded, + * false if 'v' + 1 would be bigger than 'below'. + */ +static bool atomic_inc_below(atomic_t *v, int below) +{ + int cur = atomic_read(v); + + for (;;) { + int old; + + if (cur >= below) + return false; + old = atomic_cmpxchg(v, cur, cur + 1); + if (old == cur) + break; + cur = old; + } + + return true; +} + +static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) +{ + if (rwb_enabled(rwb)) { + const unsigned long cur = jiffies; + + if (cur != *var) + *var = cur; + } +} + +/* + * If a task was rate throttled in balance_dirty_pages() within the last + * second or so, use that to indicate a higher cleaning rate. + */ +static bool wb_recent_wait(struct rq_wb *rwb) +{ + struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb; + + return time_before(jiffies, wb->dirty_sleep + HZ); +} + +static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd) +{ + return &rwb->rq_wait[is_kswapd]; +} + +static void rwb_wake_all(struct rq_wb *rwb) +{ + int i; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + struct rq_wait *rqw = &rwb->rq_wait[i]; + + if (waitqueue_active(&rqw->wait)) + wake_up_all(&rqw->wait); + } +} + +void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct) +{ + struct rq_wait *rqw; + int inflight, limit; + + if (!(wb_acct & WBT_TRACKED)) + return; + + rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD); + inflight = atomic_dec_return(&rqw->inflight); + + /* + * wbt got disabled with IO in flight. Wake up any potential + * waiters, we don't have to do more than that. + */ + if (unlikely(!rwb_enabled(rwb))) { + rwb_wake_all(rwb); + return; + } + + /* + * If the device does write back caching, drop further down + * before we wake people up. + */ + if (rwb->wc && !wb_recent_wait(rwb)) + limit = 0; + else + limit = rwb->wb_normal; + + /* + * Don't wake anyone up if we are above the normal limit. + */ + if (inflight && inflight >= limit) + return; + + if (waitqueue_active(&rqw->wait)) { + int diff = limit - inflight; + + if (!inflight || diff >= rwb->wb_background / 2) + wake_up_all(&rqw->wait); + } +} + +/* + * Called on completion of a request. Note that it's also called when + * a request is merged, when the request gets freed. 
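+ * + * [Editorial note: two cases are handled below -- requests that were never + * throttled (!WBT_TRACKED) only update the bookkeeping timestamps and the + * sync-issue cookie, while tracked requests go through __wbt_done() to drop + * the inflight count and possibly wake up waiters.]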
+ */ +void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb) + return; + + if (!wbt_is_tracked(stat)) { + if (rwb->sync_cookie == stat) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } + + if (wbt_is_read(stat)) + wb_timestamp(rwb, &rwb->last_comp); + wbt_clear_state(stat); + } else { + WARN_ON_ONCE(stat == rwb->sync_cookie); + __wbt_done(rwb, wbt_stat_to_mask(stat)); + wbt_clear_state(stat); + } +} + +/* + * Return true, if we can't increase the depth further by scaling + */ +static bool calc_wb_limits(struct rq_wb *rwb) +{ + unsigned int depth; + bool ret = false; + + if (!rwb->min_lat_nsec) { + rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0; + return false; + } + + /* + * For QD=1 devices, this is a special case. It's important for those + * to have one request ready when one completes, so force a depth of + * 2 for those devices. On the backend, it'll be a depth of 1 anyway, + * since the device can't have more than that in flight. If we're + * scaling down, then keep a setting of 1/1/1. + */ + if (rwb->queue_depth == 1) { + if (rwb->scale_step > 0) + rwb->wb_max = rwb->wb_normal = 1; + else { + rwb->wb_max = rwb->wb_normal = 2; + ret = true; + } + rwb->wb_background = 1; + } else { + /* + * scale_step == 0 is our default state. If we have suffered + * latency spikes, step will be > 0, and we shrink the + * allowed write depths. If step is < 0, we're only doing + * writes, and we allow a temporarily higher depth to + * increase performance. + */ + depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth); + if (rwb->scale_step > 0) + depth = 1 + ((depth - 1) >> min(31, rwb->scale_step)); + else if (rwb->scale_step < 0) { + unsigned int maxd = 3 * rwb->queue_depth / 4; + + depth = 1 + ((depth - 1) << -rwb->scale_step); + if (depth > maxd) { + depth = maxd; + ret = true; + } + } + + /* + * Set our max/normal/bg queue depths based on how far + * we have scaled down (->scale_step). + */ + rwb->wb_max = depth; + rwb->wb_normal = (rwb->wb_max + 1) / 2; + rwb->wb_background = (rwb->wb_max + 3) / 4; + } + + return ret; +} + +static inline bool stat_sample_valid(struct blk_rq_stat *stat) +{ + /* + * We need at least one read sample, and a minimum of + * RWB_MIN_WRITE_SAMPLES. We require some write samples to know + * that it's writes impacting us, and not just some sole read on + * a device that is in a lower power state. + */ + return stat[BLK_STAT_READ].nr_samples >= 1 && + stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES; +} + +static u64 rwb_sync_issue_lat(struct rq_wb *rwb) +{ + u64 now, issue = ACCESS_ONCE(rwb->sync_issue); + + if (!issue || !rwb->sync_cookie) + return 0; + + now = ktime_to_ns(ktime_get()); + return now - issue; +} + +enum { + LAT_OK = 1, + LAT_UNKNOWN, + LAT_UNKNOWN_WRITES, + LAT_EXCEEDED, +}; + +static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + u64 thislat; + + /* + * If our stored sync issue exceeds the window size, or it + * exceeds our min target AND we haven't logged any entries, + * flag the latency as exceeded. wbt works off completion latencies, + * but for a flooded device, a single sync IO can take a long time + * to complete after being issued. If this time exceeds our + * monitoring window AND we didn't see any other completions in that + * window, then count that sync IO as a violation of the latency. 
+ */ + thislat = rwb_sync_issue_lat(rwb); + if (thislat > rwb->cur_win_nsec || + (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) { + trace_wbt_lat(bdi, thislat); + return LAT_EXCEEDED; + } + + /* + * No read/write mix, if stat isn't valid + */ + if (!stat_sample_valid(stat)) { + /* + * If we had writes in this stat window and the window is + * current, we're only doing writes. If a task recently + * waited or still has writes in flights, consider us doing + * just writes as well. + */ + if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) || + wb_recent_wait(rwb) || wbt_inflight(rwb)) + return LAT_UNKNOWN_WRITES; + return LAT_UNKNOWN; + } + + /* + * If the 'min' latency exceeds our target, step down. + */ + if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) { + trace_wbt_lat(bdi, stat[BLK_STAT_READ].min); + trace_wbt_stat(bdi, stat); + return LAT_EXCEEDED; + } + + if (rwb->scale_step) + trace_wbt_stat(bdi, stat); + + return LAT_OK; +} + +static int latency_exceeded(struct rq_wb *rwb) +{ + struct blk_rq_stat stat[2]; + + blk_queue_stat_get(rwb->queue, stat); + return __latency_exceeded(rwb, stat); +} + +static void rwb_trace_step(struct rq_wb *rwb, const char *msg) +{ + struct backing_dev_info *bdi = rwb->queue->backing_dev_info; + + trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec, + rwb->wb_background, rwb->wb_normal, rwb->wb_max); +} + +static void scale_up(struct rq_wb *rwb) +{ + /* + * Hit max in previous round, stop here + */ + if (rwb->scaled_max) + return; + + rwb->scale_step--; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + + rwb->scaled_max = calc_wb_limits(rwb); + + rwb_wake_all(rwb); + + rwb_trace_step(rwb, "step up"); +} + +/* + * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we + * had a latency violation. + */ +static void scale_down(struct rq_wb *rwb, bool hard_throttle) +{ + /* + * Stop scaling down when we've hit the limit. This also prevents + * ->scale_step from going to crazy values, if the device can't + * keep up. + */ + if (rwb->wb_max == 1) + return; + + if (rwb->scale_step < 0 && hard_throttle) + rwb->scale_step = 0; + else + rwb->scale_step++; + + rwb->scaled_max = false; + rwb->unknown_cnt = 0; + blk_stat_clear(rwb->queue); + calc_wb_limits(rwb); + rwb_trace_step(rwb, "step down"); +} + +static void rwb_arm_timer(struct rq_wb *rwb) +{ + unsigned long expires; + + if (rwb->scale_step > 0) { + /* + * We should speed this up, using some variant of a fast + * integer inverse square root calculation. Since we only do + * this for every window expiration, it's not a huge deal, + * though. + */ + rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4, + int_sqrt((rwb->scale_step + 1) << 8)); + } else { + /* + * For step < 0, we don't want to increase/decrease the + * window size. + */ + rwb->cur_win_nsec = rwb->win_nsec; + } + + expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec); + mod_timer(&rwb->window_timer, expires); +} + +static void wb_timer_fn(unsigned long data) +{ + struct rq_wb *rwb = (struct rq_wb *) data; + unsigned int inflight = wbt_inflight(rwb); + int status; + + status = latency_exceeded(rwb); + + trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step, + inflight); + + /* + * If we exceeded the latency target, step down. If we did not, + * step one level up. If we don't know enough to say either exceeded + * or ok, then don't do anything. 
+ */ + switch (status) { + case LAT_EXCEEDED: + scale_down(rwb, true); + break; + case LAT_OK: + scale_up(rwb); + break; + case LAT_UNKNOWN_WRITES: + /* + * We started at the center step, but don't have a valid + * read/write sample, but we do have writes going on. + * Allow step to go negative, to increase write perf. + */ + scale_up(rwb); + break; + case LAT_UNKNOWN: + if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP) + break; + /* + * We get here when we previously scaled down the depth, and we + * currently don't have a valid read/write sample. For that + * case, slowly return to center state (step == 0). + */ + if (rwb->scale_step > 0) + scale_up(rwb); + else if (rwb->scale_step < 0) + scale_down(rwb, false); + break; + default: + break; + } + + /* + * Re-arm timer, if we have IO in flight + */ + if (rwb->scale_step || inflight) + rwb_arm_timer(rwb); +} + +void wbt_update_limits(struct rq_wb *rwb) +{ + rwb->scale_step = 0; + rwb->scaled_max = false; + calc_wb_limits(rwb); + + rwb_wake_all(rwb); +} + +static bool close_io(struct rq_wb *rwb) +{ + const unsigned long now = jiffies; + + return time_before(now, rwb->last_issue + HZ / 10) || + time_before(now, rwb->last_comp + HZ / 10); +} + +#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) + +static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +{ + unsigned int limit; + + /* + * At this point we know it's a buffered write. If this is + * kswapd trying to free memory, or REQ_SYNC is set, then + * it's WB_SYNC_ALL writeback, and we'll use the max limit for + * that. If the write is marked as a background write, then use + * the idle limit, or go to normal if we haven't had competing + * IO for a bit. + */ + if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + limit = rwb->wb_max; + else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + /* + * If less than 100ms since we completed unrelated IO, + * limit us to half the depth for background writeback. + */ + limit = rwb->wb_background; + } else + limit = rwb->wb_normal; + + return limit; +} + +static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw, + wait_queue_t *wait, unsigned long rw) +{ + /* + * inc it here even if disabled, since we'll dec it at completion. + * this only happens if the task was sleeping in __wbt_wait(), + * and someone turned it off at the same time. + */ + if (!rwb_enabled(rwb)) { + atomic_inc(&rqw->inflight); + return true; + } + + /* + * If the waitqueue is already active and we are not the next + * in line to be woken up, wait for our turn. + */ + if (waitqueue_active(&rqw->wait) && + rqw->wait.task_list.next != &wait->task_list) + return false; + + return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw)); +} + +/* + * Block if we will exceed our limit, or if we are currently waiting for + * the timer to kick off queuing again.
+ */ +static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock) + __releases(lock) + __acquires(lock) +{ + struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd()); + DEFINE_WAIT(wait); + + if (may_queue(rwb, rqw, &wait, rw)) + return; + + do { + prepare_to_wait_exclusive(&rqw->wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (may_queue(rwb, rqw, &wait, rw)) + break; + + if (lock) { + spin_unlock_irq(lock); + io_schedule(); + spin_lock_irq(lock); + } else + io_schedule(); + } while (1); + + finish_wait(&rqw->wait, &wait); +} + +static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +{ + const int op = bio_op(bio); + + /* + * If not a WRITE, do nothing + */ + if (op != REQ_OP_WRITE) + return false; + + /* + * Don't throttle WRITE_ODIRECT + */ + if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) + return false; + + return true; +} + +/* + * Returns true if the IO request should be accounted, false if not. + * May sleep, if we have exceeded the writeback limits. Caller can pass + * in an irq held spinlock, if it holds one when calling this function. + * If we do sleep, we'll release and re-grab it. + */ +enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock) +{ + unsigned int ret = 0; + + if (!rwb_enabled(rwb)) + return 0; + + if (bio_op(bio) == REQ_OP_READ) + ret = WBT_READ; + + if (!wbt_should_throttle(rwb, bio)) { + if (ret & WBT_READ) + wb_timestamp(rwb, &rwb->last_issue); + return ret; + } + + __wbt_wait(rwb, bio->bi_opf, lock); + + if (!timer_pending(&rwb->window_timer)) + rwb_arm_timer(rwb); + + if (current_is_kswapd()) + ret |= WBT_KSWAPD; + + return ret | WBT_TRACKED; +} + +void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + + /* + * Track sync issue, in case it takes a long time to complete. Allows + * us to react quicker, if a sync IO takes a long time to complete. + * Note that this is just a hint. 'stat' can go away when the + * request completes, so it's important we never dereference it. We + * only use the address to compare with, which is why we store the + * sync_issue time locally. + */ + if (wbt_is_read(stat) && !rwb->sync_issue) { + rwb->sync_cookie = stat; + rwb->sync_issue = blk_stat_time(stat); + } +} + +void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat) +{ + if (!rwb_enabled(rwb)) + return; + if (stat == rwb->sync_cookie) { + rwb->sync_issue = 0; + rwb->sync_cookie = NULL; + } +} + +void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth) +{ + if (rwb) { + rwb->queue_depth = depth; + wbt_update_limits(rwb); + } +} + +void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on) +{ + if (rwb) + rwb->wc = write_cache_on; +} + + /* + * Disable wbt, if enabled by default. Only called from CFQ, if we have + * cgroups enabled + */ +void wbt_disable_default(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) { + del_timer_sync(&rwb->window_timer); + rwb->win_nsec = rwb->min_lat_nsec = 0; + wbt_update_limits(rwb); + } +} +EXPORT_SYMBOL_GPL(wbt_disable_default); + +u64 wbt_default_latency_nsec(struct request_queue *q) +{ + /* + * We default to 2msec for non-rotational storage, and 75msec + * for rotational storage. 
+ */ + if (blk_queue_nonrot(q)) + return 2000000ULL; + else + return 75000000ULL; +} + +int wbt_init(struct request_queue *q) +{ + struct rq_wb *rwb; + int i; + + /* + * For now, we depend on the stats window being larger than + * our monitoring window. Ensure that this isn't inadvertently + * violated. + */ + BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC); + BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS); + + rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); + if (!rwb) + return -ENOMEM; + + for (i = 0; i < WBT_NUM_RWQ; i++) { + atomic_set(&rwb->rq_wait[i].inflight, 0); + init_waitqueue_head(&rwb->rq_wait[i].wait); + } + + setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb); + rwb->wc = 1; + rwb->queue_depth = RWB_DEF_DEPTH; + rwb->last_comp = rwb->last_issue = jiffies; + rwb->queue = q; + rwb->win_nsec = RWB_WINDOW_NSEC; + rwb->enable_state = WBT_STATE_ON_DEFAULT; + wbt_update_limits(rwb); + + /* + * Assign rwb, and turn on stats tracking for this queue + */ + q->rq_wb = rwb; + blk_stat_enable(q); + + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + + wbt_set_queue_depth(rwb, blk_queue_depth(q)); + wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); + + return 0; +} + +void wbt_exit(struct request_queue *q) +{ + struct rq_wb *rwb = q->rq_wb; + + if (rwb) { + del_timer_sync(&rwb->window_timer); + q->rq_wb = NULL; + kfree(rwb); + } +} diff --git a/block/genhd.c b/block/genhd.c index 18a0783afee4..4cdfe556f6e8 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -611,7 +611,7 @@ void add_disk(struct gendisk *disk) disk_alloc_events(disk); /* Register BDI before referencing it from bdev */ - bdi = &disk->queue->backing_dev_info; + bdi = disk->queue->backing_dev_info; bdi_register_owner(bdi, disk_to_dev(disk)); blk_register_region(disk_devt(disk), disk->minors, NULL, diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c index dd73e1ff1759..aadab0381e0d 100644 --- a/drivers/block/aoe/aoeblk.c +++ b/drivers/block/aoe/aoeblk.c @@ -396,8 +396,8 @@ aoeblk_gdalloc(void *vp) WARN_ON(d->gd); WARN_ON(d->flags & DEVFL_UP); blk_queue_max_hw_sectors(q, BLK_DEF_MAX_SECTORS); - q->backing_dev_info.name = "aoe"; - q->backing_dev_info.ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; + q->backing_dev_info->name = "aoe"; + q->backing_dev_info->ra_pages = READ_AHEAD / PAGE_CACHE_SIZE; d->bufpool = mp; d->blkq = gd->queue = q; q->queuedata = d; diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 1d58854c4a9f..1f9c77609dd1 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -2393,7 +2393,7 @@ static int drbd_congested(void *congested_data, int bdi_bits) if (get_ldev(device)) { q = bdev_get_queue(device->ldev->backing_bdev); - r = bdi_congested(&q->backing_dev_info, bdi_bits); + r = bdi_congested(q->backing_dev_info, bdi_bits); put_ldev(device); if (r) reason = 'b'; @@ -2765,8 +2765,8 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig /* we have no partitions. we contain only ourselves. 
*/ device->this_bdev->bd_contains = device->this_bdev; - q->backing_dev_info.congested_fn = drbd_congested; - q->backing_dev_info.congested_data = device; + q->backing_dev_info->congested_fn = drbd_congested; + q->backing_dev_info->congested_data = device; blk_queue_make_request(q, drbd_make_request); blk_queue_flush(q, REQ_FLUSH | REQ_FUA); diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e80cbefbc2b5..c637a0d1c06f 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -1170,11 +1170,13 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi blk_queue_stack_limits(q, b); - if (q->backing_dev_info.ra_pages != b->backing_dev_info.ra_pages) { + if (q->backing_dev_info->ra_pages != + b->backing_dev_info->ra_pages) { drbd_info(device, "Adjusting my ra_pages to backing device's (%lu -> %lu)\n", - q->backing_dev_info.ra_pages, - b->backing_dev_info.ra_pages); - q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; + q->backing_dev_info->ra_pages, + b->backing_dev_info->ra_pages); + q->backing_dev_info->ra_pages = + b->backing_dev_info->ra_pages; } } } diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3b10fa6cb039..7a6b9f3e1a9f 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -288,7 +288,7 @@ static int drbd_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "%2d: cs:Unconfigured\n", i); } else { /* reset device->congestion_reason */ - bdi_rw_congested(&device->rq_queue->backing_dev_info); + bdi_rw_congested(device->rq_queue->backing_dev_info); nc = rcu_dereference(first_peer_device(device)->connection->net_conf); wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' '; diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3ae2c0086563..17ae4e1ab358 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -937,7 +937,7 @@ static bool remote_due_to_read_balancing(struct drbd_device *device, sector_t se switch (rbm) { case RB_CONGESTED_REMOTE: - bdi = &device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; + bdi = device->ldev->backing_bdev->bd_disk->queue->backing_dev_info; return bdi_read_congested(bdi); case RB_LEAST_PENDING: return atomic_read(&device->local_cnt) > diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index d06c62eccdf0..f018318d4466 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -1276,7 +1276,7 @@ try_next_bio: && pd->bio_queue_size <= pd->write_congestion_off); spin_unlock(&pd->lock); if (wakeup) { - clear_bdi_congested(&pd->disk->queue->backing_dev_info, + clear_bdi_congested(pd->disk->queue->backing_dev_info, BLK_RW_ASYNC); } @@ -2405,7 +2405,7 @@ static void pkt_make_request_write(struct request_queue *q, struct bio *bio) spin_lock(&pd->lock); if (pd->write_congestion_on > 0 && pd->bio_queue_size >= pd->write_congestion_on) { - set_bdi_congested(&q->backing_dev_info, BLK_RW_ASYNC); + set_bdi_congested(q->backing_dev_info, BLK_RW_ASYNC); do { spin_unlock(&pd->lock); congestion_wait(BLK_RW_ASYNC, HZ); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index fbdddd6f94b8..55a8671f1979 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -3780,7 +3780,7 @@ static int rbd_init_disk(struct rbd_device *rbd_dev) q->limits.discard_zeroes_data = 1; if (!ceph_test_opt(rbd_dev->rbd_client->client, NOCRC)) - q->backing_dev_info.capabilities |= BDI_CAP_STABLE_WRITES; + q->backing_dev_info->capabilities |= 
BDI_CAP_STABLE_WRITES; disk->queue = q; diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 25fa8445bb24..f43ff2601242 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -1016,7 +1016,7 @@ static int cached_dev_congested(void *data, int bits) struct request_queue *q = bdev_get_queue(dc->bdev); int ret = 0; - if (bdi_congested(&q->backing_dev_info, bits)) + if (bdi_congested(q->backing_dev_info, bits)) return 1; if (cached_dev_get(dc)) { @@ -1025,7 +1025,7 @@ static int cached_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } cached_dev_put(dc); @@ -1039,7 +1039,7 @@ void bch_cached_dev_request_init(struct cached_dev *dc) struct gendisk *g = dc->disk.disk; g->queue->make_request_fn = cached_dev_make_request; - g->queue->backing_dev_info.congested_fn = cached_dev_congested; + g->queue->backing_dev_info->congested_fn = cached_dev_congested; dc->disk.cache_miss = cached_dev_cache_miss; dc->disk.ioctl = cached_dev_ioctl; } @@ -1132,7 +1132,7 @@ static int flash_dev_congested(void *data, int bits) for_each_cache(ca, d->c, i) { q = bdev_get_queue(ca->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; @@ -1143,7 +1143,7 @@ void bch_flash_dev_request_init(struct bcache_device *d) struct gendisk *g = d->disk; g->queue->make_request_fn = flash_dev_make_request; - g->queue->backing_dev_info.congested_fn = flash_dev_congested; + g->queue->backing_dev_info->congested_fn = flash_dev_congested; d->cache_miss = flash_dev_cache_miss; d->ioctl = flash_dev_ioctl; } diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index e28554d5e57b..0a4547c09cb3 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -802,7 +802,7 @@ static int bcache_device_init(struct bcache_device *d, unsigned block_size, blk_queue_make_request(q, NULL); d->disk->queue = q; q->queuedata = d; - q->backing_dev_info.congested_data = d; + q->backing_dev_info->congested_data = d; q->limits.max_hw_sectors = UINT_MAX; q->limits.max_sectors = UINT_MAX; q->limits.max_segment_size = UINT_MAX; @@ -1127,9 +1127,9 @@ static int cached_dev_init(struct cached_dev *dc, unsigned block_size) set_capacity(dc->disk.disk, dc->bdev->bd_part->nr_sects - dc->sb.data_offset); - dc->disk.disk->queue->backing_dev_info.ra_pages = - max(dc->disk.disk->queue->backing_dev_info.ra_pages, - q->backing_dev_info.ra_pages); + dc->disk.disk->queue->backing_dev_info->ra_pages = + max(dc->disk.disk->queue->backing_dev_info->ra_pages, + q->backing_dev_info->ra_pages); bch_cached_dev_request_init(dc); bch_cached_dev_writeback_init(dc); diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index 515f83e7d9ab..ff8845a30de0 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2291,7 +2291,7 @@ static void do_waker(struct work_struct *ws) static int is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int cache_is_congested(struct dm_target_callbacks *cb, int bdi_bits) diff --git a/drivers/md/dm-era-target.c b/drivers/md/dm-era-target.c index 665bf3285618..8975f19617ce 100644 --- a/drivers/md/dm-era-target.c +++ b/drivers/md/dm-era-target.c @@ -1379,7 +1379,7 @@ 
static void stop_worker(struct era *era) static int dev_is_congested(struct dm_dev *dev, int bdi_bits) { struct request_queue *q = bdev_get_queue(dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static int era_is_congested(struct dm_target_callbacks *cb, int bdi_bits) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 2c422257174f..5533fb8dbe37 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1659,7 +1659,7 @@ int dm_table_any_congested(struct dm_table *t, int bdi_bits) char b[BDEVNAME_SIZE]; if (likely(q)) - r |= bdi_congested(&q->backing_dev_info, bdi_bits); + r |= bdi_congested(q->backing_dev_info, bdi_bits); else DMWARN_LIMIT("%s: any_congested: nonexistent device %s", dm_device_name(t->md), diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c index a1cc797fe88f..5f1a943d9e81 100644 --- a/drivers/md/dm-thin.c +++ b/drivers/md/dm-thin.c @@ -2634,7 +2634,7 @@ static int pool_is_congested(struct dm_target_callbacks *cb, int bdi_bits) return 1; q = bdev_get_queue(pt->data_dev->bdev); - return bdi_congested(&q->backing_dev_info, bdi_bits); + return bdi_congested(q->backing_dev_info, bdi_bits); } static void requeue_bios(struct pool *pool) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 3384a3eef917..ff91489b6e41 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -2152,7 +2152,7 @@ static int dm_any_congested(void *congested_data, int bdi_bits) * the query about congestion status of request_queue */ if (dm_request_based(md)) - r = md->queue->backing_dev_info.wb.state & + r = md->queue->backing_dev_info->wb.state & bdi_bits; else r = dm_table_any_congested(map, bdi_bits); @@ -2234,7 +2234,7 @@ static void dm_init_md_queue(struct mapped_device *md) * - must do so here (in alloc_dev callchain) before queue is used */ md->queue->queuedata = md; - md->queue->backing_dev_info.congested_data = md; + md->queue->backing_dev_info->congested_data = md; } static void dm_init_old_md_queue(struct mapped_device *md) @@ -2245,7 +2245,7 @@ static void dm_init_old_md_queue(struct mapped_device *md) /* * Initialize aspects of queue that aren't relevant for blk-mq */ - md->queue->backing_dev_info.congested_fn = dm_any_congested; + md->queue->backing_dev_info->congested_fn = dm_any_congested; blk_queue_bounce_limit(md->queue, BLK_BOUNCE_ANY); } diff --git a/drivers/md/linear.c b/drivers/md/linear.c index b7fe7e9fc777..77dbfee4ba39 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -61,7 +61,7 @@ static int linear_congested(struct mddev *mddev, int bits) for (i = 0; i < mddev->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; diff --git a/drivers/md/md.c b/drivers/md/md.c index eff554a12fb4..5c8afb6628c7 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -5284,8 +5284,8 @@ int md_run(struct mddev *mddev) return err; } if (mddev->queue) { - mddev->queue->backing_dev_info.congested_data = mddev; - mddev->queue->backing_dev_info.congested_fn = md_congested; + mddev->queue->backing_dev_info->congested_data = mddev; + mddev->queue->backing_dev_info->congested_fn = md_congested; } if (pers->sync_request) { if (mddev->kobj.sd && @@ -5642,7 +5642,7 @@ static int do_md_stop(struct mddev *mddev, int mode, __md_stop_writes(mddev); __md_stop(mddev); - mddev->queue->backing_dev_info.congested_fn = NULL; + 
mddev->queue->backing_dev_info->congested_fn = NULL; /* tell userspace to handle 'inactive' */ sysfs_notify_dirent_safe(mddev->sysfs_state); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index dd483bb2e111..fb03ed86d57a 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -166,7 +166,7 @@ static int multipath_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); /* Just like multipath_map, we just check the * first available device */ diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index f8e5db0cb5aa..7a67e7dcf546 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -35,7 +35,7 @@ static int raid0_congested(struct mddev *mddev, int bits) for (i = 0; i < raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(devlist[i]->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } return ret; } @@ -415,8 +415,8 @@ static int raid0_run(struct mddev *mddev) */ int stripe = mddev->raid_disks * (mddev->chunk_sectors << 9) / PAGE_SIZE; - if (mddev->queue->backing_dev_info.ra_pages < 2* stripe) - mddev->queue->backing_dev_info.ra_pages = 2* stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2* stripe) + mddev->queue->backing_dev_info->ra_pages = 2* stripe; } dump_zones(mddev); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 515554c7365b..1d40f78c008c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -730,9 +730,9 @@ static int raid1_congested(struct mddev *mddev, int bits) * non-congested targets, it can be removed */ if ((bits & (1 << WB_async_congested)) || 1) - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); else - ret &= bdi_congested(&q->backing_dev_info, bits); + ret &= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index ebb0dd612ebd..316ff22965d2 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -838,7 +838,7 @@ static int raid10_congested(struct mddev *mddev, int bits) if (rdev && !test_bit(Faulty, &rdev->flags)) { struct request_queue *q = bdev_get_queue(rdev->bdev); - ret |= bdi_congested(&q->backing_dev_info, bits); + ret |= bdi_congested(q->backing_dev_info, bits); } } rcu_read_unlock(); @@ -3679,8 +3679,8 @@ static int run(struct mddev *mddev) * maybe... 
*/ stripe /= conf->geo.near_copies; - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } if (md_integrity_register(mddev)) @@ -4474,8 +4474,8 @@ static void end_reshape(struct r10conf *conf) int stripe = conf->geo.raid_disks * ((conf->mddev->chunk_sectors << 9) / PAGE_SIZE); stripe /= conf->geo.near_copies; - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } conf->fullsync = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 7af976934441..cd13f4a12506 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6114,10 +6114,10 @@ raid5_store_skip_copy(struct mddev *mddev, const char *page, size_t len) mddev_suspend(mddev); conf->skip_copy = new; if (new) - mddev->queue->backing_dev_info.capabilities |= + mddev->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; else - mddev->queue->backing_dev_info.capabilities &= + mddev->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; mddev_resume(mddev); } @@ -6961,8 +6961,8 @@ static int run(struct mddev *mddev) int data_disks = conf->previous_raid_disks - conf->max_degraded; int stripe = data_disks * ((mddev->chunk_sectors << 9) / PAGE_SIZE); - if (mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + mddev->queue->backing_dev_info->ra_pages = 2 * stripe; chunk_size = mddev->chunk_sectors << 9; blk_queue_io_min(mddev->queue, chunk_size); @@ -7545,8 +7545,8 @@ static void end_reshape(struct r5conf *conf) int data_disks = conf->raid_disks - conf->max_degraded; int stripe = data_disks * ((conf->chunk_sectors << 9) / PAGE_SIZE); - if (conf->mddev->queue->backing_dev_info.ra_pages < 2 * stripe) - conf->mddev->queue->backing_dev_info.ra_pages = 2 * stripe; + if (conf->mddev->queue->backing_dev_info->ra_pages < 2 * stripe) + conf->mddev->queue->backing_dev_info->ra_pages = 2 * stripe; } } } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index baab99b69d8a..de3e91817228 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1222,7 +1222,7 @@ static int set_gfs2_super(struct super_block *s, void *data) * We set the bdi here to the queue backing, file systems can * overwrite this in ->fill_super() */ - s->s_bdi = &bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 354013ea22ec..67c6c650b21e 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -1079,7 +1079,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_gran = 1; sb->s_max_links = NILFS_LINK_MAX; - sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info; + sb->s_bdi = bdev_get_queue(sb->s_bdev)->backing_dev_info; err = load_nilfs(nilfs, sb); if (err) diff --git a/fs/super.c b/fs/super.c index d28b7a9f5e74..7bf0df59bd30 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1011,7 +1011,7 @@ static int set_bdev_super(struct super_block *s, void *data) * We set the bdi here to the queue backing, file systems can * overwrite this in ->fill_super() */ - s->s_bdi = 
&bdev_get_queue(s->s_bdev)->backing_dev_info; + s->s_bdi = bdev_get_queue(s->s_bdev)->backing_dev_info; return 0; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f28a95a06c08..877338609f30 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -329,7 +329,8 @@ struct request_queue { */ struct delayed_work delay_work; - struct backing_dev_info backing_dev_info; + struct backing_dev_info *backing_dev_info; + struct backing_dev_info _backing_dev_info; /* * The queue owner gets to use this for whatever they like. diff --git a/mm/page-writeback.c b/mm/page-writeback.c index fd51ebfc423f..36040c2b4026 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1977,11 +1977,11 @@ void laptop_mode_timer_fn(unsigned long data) * We want to write everything out, not just down to the dirty * threshold */ - if (!bdi_has_dirty_io(&q->backing_dev_info)) + if (!bdi_has_dirty_io(q->backing_dev_info)) return; rcu_read_lock(); - list_for_each_entry_rcu(wb, &q->backing_dev_info.wb_list, bdi_node) + list_for_each_entry_rcu(wb, &q->backing_dev_info->wb_list, bdi_node) if (wb_has_dirty_io(wb)) wb_start_writeback(wb, nr_pages, true, WB_REASON_LAPTOP_TIMER); -- 2.7.4 From e61392a1c01f2e4062383a02b0d8958bdb2b5797 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:51 +0100 Subject: [PATCH 03/24] block: Dynamically allocate and refcount backing_dev_info Instead of storing backing_dev_info inside struct request_queue, allocate it dynamically, reference count it, and free it when the last reference is dropped. Currently only request_queue holds the reference but in the following patch we add other users referencing backing_dev_info. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit d03f6cdc1fc422accb734c7c07a661a0018d8631) Conflicts: block/blk-sysfs.c include/linux/backing-dev-defs.h Signed-off-by: Thiago Jung Bauermann --- block/blk-core.c | 12 +++++------- block/blk-sysfs.c | 2 +- include/linux/backing-dev-defs.h | 2 ++ include/linux/backing-dev.h | 10 +++++++++- include/linux/blkdev.h | 1 - mm/backing-dev.c | 34 +++++++++++++++++++++++++++++++++- 6 files changed, 50 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0fb635dbe913..079de010d769 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -692,7 +692,6 @@ static void blk_rq_timed_out_timer(unsigned long data) struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; - int err; q = kmem_cache_alloc_node(blk_requestq_cachep, gfp_mask | __GFP_ZERO, node_id); @@ -707,17 +706,16 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) if (!q->bio_split) goto fail_id; - q->backing_dev_info = &q->_backing_dev_info; + q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id); + if (!q->backing_dev_info) + goto fail_split; + q->backing_dev_info->ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; q->backing_dev_info->name = "block"; q->node = node_id; - err = bdi_init(q->backing_dev_info); - if (err) - goto fail_split; - setup_timer(&q->backing_dev_info->laptop_mode_wb_timer, laptop_mode_timer_fn, (unsigned long) q); setup_timer(&q->timeout, blk_rq_timed_out_timer, (unsigned long) q); @@ -768,7 +766,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) fail_ref: percpu_ref_exit(&q->q_usage_counter); fail_bdi: - bdi_destroy(q->backing_dev_info); + bdi_put(q->backing_dev_info); fail_split: 
bioset_free(q->bio_split); fail_id: diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 8e1e284fa159..a113dc1e3eb7 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -578,7 +578,7 @@ static void blk_release_queue(struct kobject *kobj) struct request_queue *q = container_of(kobj, struct request_queue, kobj); - bdi_exit(q->backing_dev_info); + bdi_put(q->backing_dev_info); blkcg_exit_queue(q); if (q->elevator) { diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 140c29635069..8f9299aeede2 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -10,6 +10,7 @@ #include #include #include +#include struct page; struct device; @@ -142,6 +143,7 @@ struct backing_dev_info { char *name; + struct kref refcnt; /* Reference counter for the structure */ unsigned int min_ratio; unsigned int max_ratio, max_prop_frac; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 89d3de3e096b..125bc67319b4 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -18,7 +18,14 @@ #include int __must_check bdi_init(struct backing_dev_info *bdi); -void bdi_exit(struct backing_dev_info *bdi); + +static inline struct backing_dev_info *bdi_get(struct backing_dev_info *bdi) +{ + kref_get(&bdi->refcnt); + return bdi; +} + +void bdi_put(struct backing_dev_info *bdi); __printf(3, 4) int bdi_register(struct backing_dev_info *bdi, struct device *parent, @@ -29,6 +36,7 @@ void bdi_unregister(struct backing_dev_info *bdi); int __must_check bdi_setup_and_register(struct backing_dev_info *, char *); void bdi_destroy(struct backing_dev_info *bdi); +struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id); void wb_start_writeback(struct bdi_writeback *wb, long nr_pages, bool range_cyclic, enum wb_reason reason); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 877338609f30..c91ac1a01ccb 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -330,7 +330,6 @@ struct request_queue { struct delayed_work delay_work; struct backing_dev_info *backing_dev_info; - struct backing_dev_info _backing_dev_info; /* * The queue owner gets to use this for whatever they like. diff --git a/mm/backing-dev.c b/mm/backing-dev.c index a1aef87a2e70..b76398c00fec 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -237,6 +237,7 @@ static __init int bdi_class_init(void) bdi_class->dev_groups = bdi_dev_groups; bdi_debug_init(); + return 0; } postcore_initcall(bdi_class_init); @@ -775,6 +776,7 @@ int bdi_init(struct backing_dev_info *bdi) bdi->dev = NULL; + kref_init(&bdi->refcnt); bdi->min_ratio = 0; bdi->max_ratio = 100; bdi->max_prop_frac = FPROP_FRAC_BASE; @@ -790,6 +792,22 @@ int bdi_init(struct backing_dev_info *bdi) } EXPORT_SYMBOL(bdi_init); +struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) +{ + struct backing_dev_info *bdi; + + bdi = kmalloc_node(sizeof(struct backing_dev_info), + gfp_mask | __GFP_ZERO, node_id); + if (!bdi) + return NULL; + + if (bdi_init(bdi)) { + kfree(bdi); + return NULL; + } + return bdi; +} + int bdi_register(struct backing_dev_info *bdi, struct device *parent, const char *fmt, ...) 
{ @@ -870,12 +888,26 @@ void bdi_unregister(struct backing_dev_info *bdi) } } -void bdi_exit(struct backing_dev_info *bdi) +static void bdi_exit(struct backing_dev_info *bdi) { WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); } +static void release_bdi(struct kref *ref) +{ + struct backing_dev_info *bdi = + container_of(ref, struct backing_dev_info, refcnt); + + bdi_exit(bdi); + kfree(bdi); +} + +void bdi_put(struct backing_dev_info *bdi) +{ + kref_put(&bdi->refcnt, release_bdi); +} + void bdi_destroy(struct backing_dev_info *bdi) { bdi_unregister(bdi); -- 2.7.4 From 91ca48091ea8c25c2da9616e8da9b9528103f210 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:52 +0100 Subject: [PATCH 04/24] block: Make blk_get_backing_dev_info() safe without open bdev Currently blk_get_backing_dev_info() is not safe to be called when the block device is not open, as bdev->bd_disk is NULL in that case. However inode_to_bdi() uses this function and may be called from the flusher worker or other writeback-related functions without the bdev being open, which leads to crashes such as: [113031.075540] Unable to handle kernel paging request for data at address 0x00000000 [113031.075614] Faulting instruction address: 0xc0000000003692e0 0:mon> t [c0000000fb65f900] c00000000036cb6c writeback_sb_inodes+0x30c/0x590 [c0000000fb65fa10] c00000000036ced4 __writeback_inodes_wb+0xe4/0x150 [c0000000fb65fa70] c00000000036d33c wb_writeback+0x30c/0x450 [c0000000fb65fb40] c00000000036e198 wb_workfn+0x268/0x580 [c0000000fb65fc50] c0000000000f3470 process_one_work+0x1e0/0x590 [c0000000fb65fce0] c0000000000f38c8 worker_thread+0xa8/0x660 [c0000000fb65fd80] c0000000000fc4b0 kthread+0x110/0x130 [c0000000fb65fe30] c0000000000098f0 ret_from_kernel_thread+0x5c/0x6c Signed-off-by: Jens Axboe (cherry picked from commit b1d2dc5659b41741f5a29b2ade76ffb4e5bb13d8) Conflicts: fs/block_dev.c Signed-off-by: Thiago Jung Bauermann --- block/blk-core.c | 8 +++----- fs/block_dev.c | 8 ++++++++ include/linux/fs.h | 1 + 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 079de010d769..729bf490a460 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -108,14 +108,12 @@ void blk_queue_congestion_threshold(struct request_queue *q) * @bdev: device * * Locates the passed device's request queue and returns the address of its - * backing_dev_info. This function can only be called if @bdev is opened - * and the return value is never NULL. + * backing_dev_info. The return value is never NULL however we may return + * &noop_backing_dev_info if the bdev is not currently open.
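+ * + * [Editorial note on the lifecycle added by this patch's fs/block_dev.c + * hunks: bdget() starts bd_bdi at &noop_backing_dev_info, __blkdev_get() + * takes a reference on the queue's bdi at first open (and drops it again + * on failure), and bdev_evict_inode() releases the reference when the + * block device inode finally dies.]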
*/ struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - return q->backing_dev_info; + return bdev->bd_bdi; } EXPORT_SYMBOL(blk_get_backing_dev_info); diff --git a/fs/block_dev.c b/fs/block_dev.c index 702480b40d64..3783ea8d3681 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -558,6 +558,8 @@ static void bdev_evict_inode(struct inode *inode) } list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + if (bdev->bd_bdi != &noop_backing_dev_info) + bdi_put(bdev->bd_bdi); } static const struct super_operations bdev_sops = { @@ -656,6 +658,7 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; + bdev->bd_bdi = &noop_backing_dev_info; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; @@ -1219,6 +1222,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); + if (!partno) { ret = -ENXIO; bdev->bd_part = disk_get_part(disk, partno); @@ -1319,6 +1325,8 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; + bdi_put(bdev->bd_bdi); + bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; diff --git a/include/linux/fs.h b/include/linux/fs.h index f0ecce474110..df3ea97f9168 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -470,6 +470,7 @@ struct block_device { int bd_invalidated; struct gendisk * bd_disk; struct request_queue * bd_queue; + struct backing_dev_info *bd_bdi; struct list_head bd_list; /* * Private data. You must have bd_claim'ed the block_device -- 2.7.4 From c47a9b5dc2d0cc142dbcab17c6ae3c6eb6786978 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Feb 2017 15:56:53 +0100 Subject: [PATCH 05/24] block: Get rid of blk_get_backing_dev_info() blk_get_backing_dev_info() is now a simple dereference. Remove that function and simplify some code around that. Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit efa7c9f97e3ef624e9a398bf69c15f58eea9f0e8) Conflicts: block/compat_ioctl.c block/ioctl.c fs/btrfs/volumes.c include/linux/blkdev.h Signed-off-by: Thiago Jung Bauermann --- block/blk-core.c | 14 -------------- block/compat_ioctl.c | 7 ++----- block/ioctl.c | 7 ++----- fs/btrfs/disk-io.c | 2 +- fs/btrfs/volumes.c | 2 +- fs/xfs/xfs_buf.c | 3 +-- fs/xfs/xfs_buf.h | 1 - include/linux/backing-dev.h | 2 +- include/linux/blkdev.h | 1 - 9 files changed, 8 insertions(+), 31 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 729bf490a460..4fdc9bbeb6f0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -103,20 +103,6 @@ void blk_queue_congestion_threshold(struct request_queue *q) q->nr_congestion_off = nr; } -/** - * blk_get_backing_dev_info - get the address of a queue's backing_dev_info - * @bdev: device - * - * Locates the passed device's request queue and returns the address of its - * backing_dev_info. The return value is never NULL however we may return - * &noop_backing_dev_info if the bdev is not currently open. 
- */ -struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev) -{ - return bdev->bd_bdi; -} -EXPORT_SYMBOL(blk_get_backing_dev_info); - void blk_rq_init(struct request_queue *q, struct request *rq) { memset(rq, 0, sizeof(*rq)); diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f678c733df40..37e01e265ec2 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -661,7 +661,6 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) struct block_device *bdev = inode->i_bdev; struct gendisk *disk = bdev->bd_disk; fmode_t mode = file->f_mode; - struct backing_dev_info *bdi; loff_t size; unsigned int max_sectors; @@ -708,9 +707,8 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); return compat_put_long(arg, - (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); + (bdev->bd_bdi->ra_pages * PAGE_CACHE_SIZE) / 512); case BLKROGET: /* compatible */ return compat_put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -728,8 +726,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKGETSIZE: size = i_size_read(bdev->bd_inode); diff --git a/block/ioctl.c b/block/ioctl.c index 0918aed2d847..7a87656b25df 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -496,7 +496,6 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long arg) { - struct backing_dev_info *bdi; void __user *argp = (void __user *)arg; loff_t size; unsigned int max_sectors; @@ -519,8 +518,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!arg) return -EINVAL; - bdi = blk_get_backing_dev_info(bdev); - return put_long(arg, (bdi->ra_pages * PAGE_CACHE_SIZE) / 512); + return put_long(arg, (bdev->bd_bdi->ra_pages*PAGE_CACHE_SIZE) / 512); case BLKROGET: return put_int(arg, bdev_read_only(bdev) != 0); case BLKBSZGET: /* get block device soft block size (cf. 
BLKSSZGET) */ @@ -547,8 +545,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdi = blk_get_backing_dev_info(bdev); - bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; + bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_CACHE_SIZE; return 0; case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 85b207d19aa5..fe1f8cf9f86d 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1727,7 +1727,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) list_for_each_entry_rcu(device, &info->fs_devices->devices, dev_list) { if (!device->bdev) continue; - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c62a6f9757a..9de682264557 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -351,7 +351,7 @@ static noinline void run_scheduled_bios(struct btrfs_device *device) */ blk_start_plug(&plug); - bdi = blk_get_backing_dev_info(device->bdev); + bdi = device->bdev->bd_bdi; fs_info = device->dev_root->fs_info; limit = btrfs_async_submit_limit(fs_info); limit = limit * 2 / 3; diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index eb1b8c8acfcb..0cbdb1ef561a 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -681,7 +681,7 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { - if (bdi_read_congested(target->bt_bdi)) + if (bdi_read_congested(target->bt_bdev->bd_bdi)) return; xfs_buf_read_map(target, map, nmaps, @@ -1691,7 +1691,6 @@ xfs_alloc_buftarg( btp->bt_mount = mp; btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; - btp->bt_bdi = blk_get_backing_dev_info(bdev); if (xfs_setsize_buftarg_early(btp, bdev)) goto error; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c75721acd867..ed6f49738a82 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -105,7 +105,6 @@ typedef unsigned int xfs_buf_flags_t; typedef struct xfs_buftarg { dev_t bt_dev; struct block_device *bt_bdev; - struct backing_dev_info *bt_bdi; struct xfs_mount *bt_mount; unsigned int bt_meta_sectorsize; size_t bt_meta_sectormask; diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 125bc67319b4..587dc540767e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -191,7 +191,7 @@ static inline struct backing_dev_info *inode_to_bdi(struct inode *inode) sb = inode->i_sb; #ifdef CONFIG_BLOCK if (sb_is_blkdev_sb(sb)) - return blk_get_backing_dev_info(I_BDEV(inode)); + return I_BDEV(inode)->bd_bdi; #endif return sb->s_bdi; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c91ac1a01ccb..0abef0464bcc 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1004,7 +1004,6 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern void blk_queue_flush(struct request_queue *q, unsigned int flush); extern void blk_queue_flush_queueable(struct request_queue *q, bool queueable); -extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -- 2.7.4 From 2f7bd2a6b7289e78963016b0c3b4b5dc785edf99 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 8 Feb 2017 15:19:07 -0500 Subject: [PATCH 
06/24] block: fix double-free in the failure path of cgwb_bdi_init() When !CONFIG_CGROUP_WRITEBACK, the bdi has a single bdi_writeback_congested at bdi->wb_congested. cgwb_bdi_init() allocates it with kzalloc() and doesn't do further initialization. This usually works fine as the reference count gets bumped to 1 by wb_init() and the put from wb_exit() releases it. However, when wb_init() fails, it puts the wb base ref, automatically freeing the wb, and the explicit kfree() in the cgwb_bdi_init() error path ends up trying to free the same pointer a second time, causing a double-free. Fix it by explicitly initializing the refcnt to 1 and putting the base ref from cgwb_bdi_destroy(). Signed-off-by: Tejun Heo Reported-by: Dmitry Vyukov Fixes: a13f35e87140 ("writeback: don't embed root bdi_writeback_congested in bdi_writeback") Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Jens Axboe (cherry picked from commit 5f478e4ea5c5560b4e40eb136991a09f9389f331) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b76398c00fec..320fd4d27039 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -758,15 +758,20 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; + atomic_set(&bdi->wb_congested->refcnt, 1); + err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { - kfree(bdi->wb_congested); + wb_congested_put(bdi->wb_congested); return err; } return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +{ + wb_congested_put(bdi->wb_congested); +} #endif /* CONFIG_CGROUP_WRITEBACK */ -- 2.7.4 From 56ef9baa06ed0c29162a278d2a8632006b388d8c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Feb 2017 18:09:46 +0100 Subject: [PATCH 07/24] block: Move bdev_unhash_inode() after invalidate_partition() Move bdev_unhash_inode() after invalidate_partition(), as invalidate_partition() looks up the bdev and cannot find the right bdev inode after bdev_unhash_inode() has been called. Thus invalidate_partition() would not invalidate the page cache of the previously used bdev. Also use part_devt() when calling bdev_unhash_inode() instead of manually creating the device number. Tested-by: Lekshmi Pillai Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit 4b8c861a7c79806fb9ee564c87f517dc26fc2d1e) Signed-off-by: Thiago Jung Bauermann --- block/genhd.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 4cdfe556f6e8..18f81634d7fb 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -646,9 +646,8 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); while ((part = disk_part_iter_next(&piter))) { - bdev_unhash_inode(MKDEV(disk->major, - disk->first_minor + part->partno)); invalidate_partition(disk, part->partno); + bdev_unhash_inode(part_devt(part)); delete_partition(disk, part->partno); } disk_part_iter_exit(&piter); -- 2.7.4 From 65b0f7debcd2e80140e885fccb541291dbb0fd91 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Feb 2017 18:09:47 +0100 Subject: [PATCH 08/24] block: Unhash also block device inode for the whole device Iteration over partitions in del_gendisk() omits part0. Add a bdev_unhash_inode() call for the whole device. Otherwise, if the device number gets reused, the bdev inode will still be associated with the old (stale) bdi.
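For reference, the combined effect of this patch and the previous one on the del_gendisk() teardown order can be sketched as follows (assembled from the two hunks in this series; not a verbatim copy of the final function):

	disk_part_iter_init(&piter, disk,
			    DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
	while ((part = disk_part_iter_next(&piter))) {
		/* flush the page cache while the bdev inode can still be looked up */
		invalidate_partition(disk, part->partno);
		/* then make sure bdget() can no longer return this inode */
		bdev_unhash_inode(part_devt(part));
		delete_partition(disk, part->partno);
	}
	disk_part_iter_exit(&piter);

	/* the iterator omits part0, so handle the whole device explicitly */
	invalidate_partition(disk, 0);
	bdev_unhash_inode(disk_devt(disk));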
Tested-by: Lekshmi Pillai Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit d06e05c026ab58121dc6ab37c11f855ff642531e) Signed-off-by: Thiago Jung Bauermann --- block/genhd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/genhd.c b/block/genhd.c index 18f81634d7fb..510a57a5254b 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -653,6 +653,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_exit(&piter); invalidate_partition(disk, 0); + bdev_unhash_inode(disk_devt(disk)); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; -- 2.7.4 From 9f11f351651bea33db0e3f0d763577b4f1256e65 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 21 Feb 2017 18:09:48 +0100 Subject: [PATCH 09/24] block: Revalidate i_bdev reference in bd_acquire() When a device gets removed, its block device inode is unhashed so that it is not used anymore (bdget() will no longer find it). Later, when a new device gets created with the same device number, we create a new block device inode. However, there may be file system device inodes whose i_bdev still points to the original block device inode, and thus we get two active block device inodes for the same device. They will share the same gendisk, so the only visible differences will be that the page caches will not be coherent and the BDIs will differ (the old block device inode still points to the unregistered BDI). Fix the problem by checking in bd_acquire() whether i_bdev still points to an active block device inode and re-looking up the block device if not. That way any open of a block device happening after the old device has been removed will get the correct block device inode. Tested-by: Lekshmi Pillai Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit cccd9fb9ec960e343b2f354c4613e49f5a1d8371) Conflicts: fs/block_dev.c Signed-off-by: Thiago Jung Bauermann --- fs/block_dev.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3783ea8d3681..ca76e189088c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -713,13 +713,22 @@ static struct block_device *bd_acquire(struct inode *inode) spin_lock(&bdev_lock); bdev = inode->i_bdev; - if (bdev) { + if (bdev && !inode_unhashed(bdev->bd_inode)) { ihold(bdev->bd_inode); spin_unlock(&bdev_lock); return bdev; } spin_unlock(&bdev_lock); + /* + * i_bdev references block device inode that was already shut down + * (corresponding device got removed). Remove the reference and look + * up block device inode again just in case new device got + * reestablished under the same device number. + */ + if (bdev) + bd_forget(inode); + bdev = bdget(inode->i_rdev); if (bdev) { spin_lock(&bdev_lock); -- 2.7.4 From 8c3ad597eea817badb4414ef0a7ee7d88e22b5d1 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 2 Mar 2017 16:50:13 +0100 Subject: [PATCH 10/24] block: Initialize bd_bdi on inode initialization So far we initialized bd_bdi only in bdget(). That is fine for normal bdev inodes; however, for the special case of the root inode of blockdev_superblock, that function is never called and thus bd_bdi is left uninitialized. As a result bdev_evict_inode() may oops doing bdi_put(root->bd_bdi) on that inode, as can be seen when doing: mount -t bdev none /mnt Fix the problem by initializing bd_bdi when first allocating the inode and then reinitializing bd_bdi in bdev_evict_inode(). Thanks to the syzkaller team for finding the problem.
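The bd_bdi lifecycle that results from this and the earlier patches in the series can be condensed into the following sketch (assembled from the hunks shown; not a verbatim quote of any one function):

	/* slab constructor: covers every bdev inode, including the root
	 * inode of blockdev_superblock that never passes through bdget() */
	bdev->bd_bdi = &noop_backing_dev_info;

	/* __blkdev_get(): pin the queue's bdi on first open */
	if (bdev->bd_bdi == &noop_backing_dev_info)
		bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);

	/* bdev_evict_inode(): drop the reference and restore the default */
	if (bdev->bd_bdi != &noop_backing_dev_info) {
		bdi_put(bdev->bd_bdi);
		bdev->bd_bdi = &noop_backing_dev_info;
	}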
Reported-by: Dmitry Vyukov Fixes: b1d2dc5659b4 ("block: Make blk_get_backing_dev_info() safe without open bdev") Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit a5a79d00017c9eee68a9bcb40d5dfd6f45f17461) Conflicts: fs/block_dev.c Signed-off-by: Thiago Jung Bauermann --- fs/block_dev.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index ca76e189088c..a07aad0f2caa 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -533,6 +533,7 @@ static void init_once(void *foo) #ifdef CONFIG_SYSFS INIT_LIST_HEAD(&bdev->bd_holder_disks); #endif + bdev->bd_bdi = &noop_backing_dev_info; inode_init_once(&ei->vfs_inode); /* Initialize mutex for freeze. */ mutex_init(&bdev->bd_fsfreeze_mutex); @@ -558,8 +559,10 @@ static void bdev_evict_inode(struct inode *inode) } list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); - if (bdev->bd_bdi != &noop_backing_dev_info) + if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); + bdev->bd_bdi = &noop_backing_dev_info; + } } static const struct super_operations bdev_sops = { @@ -658,7 +661,6 @@ struct block_device *bdget(dev_t dev) bdev->bd_contains = NULL; bdev->bd_super = NULL; bdev->bd_inode = inode; - bdev->bd_bdi = &noop_backing_dev_info; bdev->bd_block_size = (1 << inode->i_blkbits); bdev->bd_part_count = 0; bdev->bd_invalidated = 0; -- 2.7.4 From c1fec6e35edec58c66fe9c9a82a16caa76dd2f92 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Feb 2017 08:05:56 +0100 Subject: [PATCH 11/24] block: Move bdi_unregister() to del_gendisk() Commit 6cd18e711dd8 "block: destroy bdi before blockdev is unregistered." moved bdi unregistration (at that time through bdi_destroy()) from blk_release_queue() to blk_cleanup_queue() because it needs to happen before the blk_unregister_region() call in del_gendisk() for MD. SCSI, though, will free up the device number from sd_remove(), called through a maze of callbacks from device_del() in __scsi_remove_device(), before blk_cleanup_queue(), and thus similar races as described in 6cd18e711dd8 can happen for SCSI as well, as reported by Omar [1]. Moving bdi_unregister() to del_gendisk() works for MD and fixes the problem for SCSI since del_gendisk() gets called from sd_remove() before freeing the device number. This also makes device_add_disk() (calling bdi_register_owner()) more symmetric with del_gendisk(). 
[1] http://marc.info/?l=linux-block&m=148554717109098&w=2 Tested-by: Lekshmi Pillai Acked-by: Tejun Heo Signed-off-by: Jan Kara Tested-by: Omar Sandoval Signed-off-by: Jens Axboe (cherry picked from commit 165a5e22fafb127ecb5914e12e8c32a1f0d3f820) Conflicts: block/blk-core.c Signed-off-by: Thiago Jung Bauermann --- block/blk-core.c | 2 -- block/genhd.c | 5 +++++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 4fdc9bbeb6f0..2edab6234f47 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -579,8 +579,6 @@ void blk_cleanup_queue(struct request_queue *q) q->queue_lock = &q->__queue_lock; spin_unlock_irq(lock); - bdi_unregister(q->backing_dev_info); - /* @q is and will stay empty, shutdown and put */ blk_put_queue(q); } diff --git a/block/genhd.c b/block/genhd.c index 510a57a5254b..6a6c7f544100 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -658,6 +658,11 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); + /* + * Unregister bdi before releasing device numbers (as they can get + * reused and we'd get clashes in sysfs). + */ + bdi_unregister(disk->queue->backing_dev_info); blk_unregister_queue(disk); blk_unregister_region(disk_devt(disk), disk->minors); -- 2.7.4 From 13b0dc4b8a6a1c3e8bec99b705a805ccae208bc3 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2017 17:48:31 +0100 Subject: [PATCH 12/24] block: Allow bdi re-registration SCSI can call device_add_disk() several times for one request queue when a device is unbound and bound, creating a new gendisk each time. This leads to the bdi being repeatedly registered and unregistered. This was not a big problem until commit 165a5e22fafb "block: Move bdi_unregister() to del_gendisk()", since the bdi was only registered repeatedly (bdi_register() handles repeated calls fine, though we ended up leaking a reference to the gendisk by overwriting bdi->owner) while it was unregistered only in blk_cleanup_queue(), which didn't get called repeatedly. After 165a5e22fafb we were doing correct bdi_register() - bdi_unregister() cycles; however, bdi_unregister() was not prepared for them. So make sure bdi_unregister() cleans up the bdi in such a way that it is prepared for a possible subsequent bdi_register() call. An easy way to provoke this behavior is to enable CONFIG_DEBUG_TEST_DRIVER_REMOVE and use the scsi_debug driver to create a SCSI disk, which immediately hangs without this fix. Fixes: 165a5e22fafb127ecb5914e12e8c32a1f0d3f820 Signed-off-by: Jan Kara Tested-by: Omar Sandoval Signed-off-by: Jens Axboe (cherry picked from commit b6f8fec4448aa52a8c36a392aa1ca2ea99acd460) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 320fd4d27039..8b3f4426906e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -709,6 +709,11 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) */ atomic_dec(&bdi->usage_cnt); wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); + /* + * Grab back our reference so that we hold it when @bdi gets + * re-registered. + */ + atomic_inc(&bdi->usage_cnt); } /** @@ -856,6 +861,8 @@ int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner) MINOR(owner->devt)); if (rc) return rc; + /* Leaking owner reference... 
*/ + WARN_ON(bdi->owner); bdi->owner = owner; get_device(owner); return 0; -- 2.7.4 From 014d058f4863d9a86953bc6c4a9f6f7ae18a7379 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2017 17:48:32 +0100 Subject: [PATCH 13/24] bdi: Fix use-after-free in wb_congested_put() bdi_writeback_congested structures get created for each blkcg and bdi regardless of whether the bdi is registered or not. When they are created for an unregistered bdi and the request queue (and thus the bdi) is then destroyed while a blkg still holds a reference to the bdi_writeback_congested structure, this structure will be referencing a freed bdi and the last wb_congested_put() will try to remove the structure from the already-freed bdi. With commit 165a5e22fafb "block: Move bdi_unregister() to del_gendisk()", SCSI started to destroy bdis without calling bdi_unregister() first (previously it was calling bdi_unregister() even for unregistered bdis), so the code detaching bdi_writeback_congested structures in cgwb_bdi_destroy() was not triggered and we started hitting this use-after-free bug. It is enough to boot a KVM instance with a virtio-scsi device to trigger this behavior. Fix the problem by detaching bdi_writeback_congested structures in bdi_exit() instead of bdi_unregister(). This is also more logical, as they can get attached to a bdi regardless of whether it ever got registered. Fixes: 165a5e22fafb127ecb5914e12e8c32a1f0d3f820 Signed-off-by: Jan Kara Tested-by: Omar Sandoval Signed-off-by: Jens Axboe (cherry picked from commit df23de55615fa7a190a85f49a950ccecdd9102f3) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8b3f4426906e..f3e06b707612 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -682,30 +682,18 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; - struct rb_node *rbn; void **slot; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); - radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); - - while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { - struct bdi_writeback_congested *congested = - rb_entry(rbn, struct bdi_writeback_congested, rb_node); - - rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ - } - spin_unlock_irq(&cgwb_lock); /* - * All cgwb's and their congested states must be shutdown and - * released before returning. Drain the usage counter to wait for - * all cgwb's and cgwb_congested's ever created on @bdi. + * All cgwb's must be shutdown and released before returning. Drain + * the usage counter to wait for all cgwb's ever created on @bdi. 
*/ atomic_dec(&bdi->usage_cnt); wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); @@ -753,6 +741,21 @@ void wb_blkcg_offline(struct blkcg *blkcg) spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_exit(struct backing_dev_info *bdi) +{ + struct rb_node *rbn; + + spin_lock_irq(&cgwb_lock); + while ((rbn = rb_first(&bdi->cgwb_congested_tree))) { + struct bdi_writeback_congested *congested = + rb_entry(rbn, struct bdi_writeback_congested, rb_node); + + rb_erase(rbn, &bdi->cgwb_congested_tree); + congested->bdi = NULL; /* mark @congested unlinked */ + } + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -773,7 +776,9 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } + +static void cgwb_bdi_exit(struct backing_dev_info *bdi) { wb_congested_put(bdi->wb_congested); } @@ -904,6 +909,7 @@ static void bdi_exit(struct backing_dev_info *bdi) { WARN_ON_ONCE(bdi->dev); wb_exit(&bdi->wb); + cgwb_bdi_exit(bdi); } static void release_bdi(struct kref *ref) -- 2.7.4 From faefd34ae19c2fa712149b4d996472e2782cce56 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 8 Mar 2017 17:48:33 +0100 Subject: [PATCH 14/24] block: Make del_gendisk() safer for disks without queues Commit 165a5e22fafb "block: Move bdi_unregister() to del_gendisk()" added a disk->queue dereference to del_gendisk(). Although del_gendisk() is not supposed to be called without a valid disk->queue, and blk_unregister_queue() warns in that case, this change will make it oops instead. Return to the old, more robust behavior of just warning when del_gendisk() gets called for a gendisk with disk->queue being NULL. Reported-by: Dan Carpenter Signed-off-by: Jan Kara Tested-by: Omar Sandoval Signed-off-by: Jens Axboe (cherry picked from commit 90f16fddcc2802726142b8386c65ccb89f044613) Signed-off-by: Thiago Jung Bauermann --- block/genhd.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 6a6c7f544100..946507820798 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -658,12 +658,16 @@ void del_gendisk(struct gendisk *disk) disk->flags &= ~GENHD_FL_UP; sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - /* - * Unregister bdi before releasing device numbers (as they can get - * reused and we'd get clashes in sysfs). - */ - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); + if (disk->queue) { + /* + * Unregister bdi before releasing device numbers (as they can + * get reused and we'd get clashes in sysfs). + */ + bdi_unregister(disk->queue->backing_dev_info); + blk_unregister_queue(disk); + } else { + WARN_ON(1); + } blk_unregister_region(disk_devt(disk), disk->minors); part_stat_set_all(&disk->part0, 0); -- 2.7.4 From a43cae2724ffebf5cc774e7dfa7b049aa4e3bd1b Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:53 +0100 Subject: [PATCH 15/24] block: Fix bdi assignment to bdev inode when racing with disk delete When disk->fops->open() in __blkdev_get() returns -ERESTARTSYS, we restart the process of opening the block device. However, we forget to switch bdev->bd_bdi back to noop_backing_dev_info, and as a result the bdev inode will be pointing to a stale bdi. Fix the problem by setting bdev->bd_bdi later, once __blkdev_get() is already guaranteed to succeed. 
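Schematically, the fix moves the bdi pinning from a point where the open can still restart to a point where it can no longer fail (condensed from the hunks below; not a verbatim quote of __blkdev_get()):

	/* before: the reference was taken right after bd_queue and
	 * bd_contains were set up, where disk->fops->open() returning
	 * -ERESTARTSYS could still restart the whole open and leave a
	 * stale bd_bdi behind */

	/* after: the reference is taken only once the open is committed */
	if (bdev->bd_bdi == &noop_backing_dev_info)
		bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info);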
Acked-by: Tejun Heo Reviewed-by: Hannes Reinecke Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit 03e262798884b0a5f948b17433afd80606cb3497) Conflicts: fs/block_dev.c Signed-off-by: Thiago Jung Bauermann --- fs/block_dev.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index a07aad0f2caa..3ac854b60792 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1233,8 +1233,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_queue = disk->queue; bdev->bd_contains = bdev; bdev->bd_inode->i_flags = disk->fops->direct_access ? S_DAX : 0; - if (bdev->bd_bdi == &noop_backing_dev_info) - bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); if (!partno) { ret = -ENXIO; @@ -1305,6 +1303,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) (bdev->bd_part->nr_sects % (PAGE_SIZE / 512))) bdev->bd_inode->i_flags &= ~S_DAX; } + + if (bdev->bd_bdi == &noop_backing_dev_info) + bdev->bd_bdi = bdi_get(disk->queue->backing_dev_info); } else { if (bdev->bd_contains == bdev) { ret = 0; @@ -1336,8 +1337,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) bdev->bd_disk = NULL; bdev->bd_part = NULL; bdev->bd_queue = NULL; - bdi_put(bdev->bd_bdi); - bdev->bd_bdi = &noop_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, mode, 1); bdev->bd_contains = NULL; -- 2.7.4 From 3a382bad335181f68e3a704015b990c4a94e9661 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:54 +0100 Subject: [PATCH 16/24] bdi: Mark congested->bdi as internal The congested->bdi pointer is used only to be able to remove the congested structure from bdi->cgwb_congested_tree on structure release. Moreover, the pointer can become NULL when we unregister the bdi. Rename the field to __bdi and add a comment to make it more explicit that this is internal to the memcg writeback code and that the field should not be used elsewhere, as such use would likely be race prone. We do not bother with converting congested->bdi to a proper refcounted reference. It would be slightly ugly to special-case bdi->wb.congested to avoid what is effectively a cyclic reference of the bdi to itself, and the reference gets cleared in bdi_unregister() anyway, making it impossible to reference a freed bdi. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit b7d680d7bf584bce6023343304b819009a7c3336) Signed-off-by: Thiago Jung Bauermann --- include/linux/backing-dev-defs.h | 4 +++- mm/backing-dev.c | 10 +++++----- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 8f9299aeede2..5f218afa5200 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -54,7 +54,9 @@ struct bdi_writeback_congested { atomic_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK - struct backing_dev_info *bdi; /* the associated bdi */ + struct backing_dev_info *__bdi; /* the associated bdi, set to NULL + * on bdi unregistration. For memcg-wb + * internal use only! 
*/ int blkcg_id; /* ID of the associated blkcg */ struct rb_node rb_node; /* on bdi->cgwb_congestion_tree */ #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f3e06b707612..7157ccd00069 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -437,7 +437,7 @@ retry: return NULL; atomic_set(&new_congested->refcnt, 0); - new_congested->bdi = bdi; + new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; @@ -465,10 +465,10 @@ void wb_congested_put(struct bdi_writeback_congested *congested) } /* bdi might already have been destroyed leaving @congested unlinked */ - if (congested->bdi) { + if (congested->__bdi) { rb_erase(&congested->rb_node, - &congested->bdi->cgwb_congested_tree); - congested->bdi = NULL; + &congested->__bdi->cgwb_congested_tree); + congested->__bdi = NULL; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -751,7 +751,7 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) rb_entry(rbn, struct bdi_writeback_congested, rb_node); rb_erase(rbn, &bdi->cgwb_congested_tree); - congested->bdi = NULL; /* mark @congested unlinked */ + congested->__bdi = NULL; /* mark @congested unlinked */ } spin_unlock_irq(&cgwb_lock); } -- 2.7.4 From 3a8beee8dd2fb7bc0dcf4aa488251996896705e8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:55 +0100 Subject: [PATCH 17/24] bdi: Make wb->bdi a proper reference Make wb->bdi a proper refcounted reference to the bdi for all bdi_writeback structures except for the one embedded inside struct backing_dev_info. That will allow us to simplify bdi unregistration. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit 810df54a64fb7841d6511f67818f3e1589c249a2) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 7157ccd00069..eecae12c3d35 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -294,6 +294,8 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, memset(wb, 0, sizeof(*wb)); + if (wb != &bdi->wb) + bdi_get(bdi); wb->bdi = bdi; wb->last_old_flush = jiffies; INIT_LIST_HEAD(&wb->b_dirty); @@ -313,8 +315,10 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi, INIT_DELAYED_WORK(&wb->dwork, wb_workfn); wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp); - if (!wb->congested) - return -ENOMEM; + if (!wb->congested) { + err = -ENOMEM; + goto out_put_bdi; + } err = fprop_local_init_percpu(&wb->completions, gfp); if (err) @@ -334,6 +338,9 @@ out_destroy_stat: fprop_local_destroy_percpu(&wb->completions); out_put_cong: wb_congested_put(wb->congested); +out_put_bdi: + if (wb != &bdi->wb) + bdi_put(bdi); return err; } @@ -371,6 +378,8 @@ static void wb_exit(struct bdi_writeback *wb) fprop_local_destroy_percpu(&wb->completions); wb_congested_put(wb->congested); + if (wb != &wb->bdi->wb) + bdi_put(wb->bdi); } #ifdef CONFIG_CGROUP_WRITEBACK -- 2.7.4 From c6a7763216d6b5748f04189622d2c569f206bdf5 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:56 +0100 Subject: [PATCH 18/24] bdi: Unify bdi->wb_list handling for root bdi_writeback Currently the root bdi_writeback structure is added to bdi->wb_list in bdi_init() and never removed. That is different from all other bdi_writeback structures, which get added to the list when created and removed from it before wb_shutdown(). 
So move the list addition of the root bdi_writeback to bdi_register() and the list removal of all bdi_writeback structures to wb_shutdown(). That way a bdi_writeback structure is on bdi->wb_list if and only if it can handle writeback, and it will be easier for us to handle the shutdown of all bdi_writeback structures in bdi_unregister(). Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit e8cb72b322cf4a729633b7e2080fbeab477f6ea2) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index eecae12c3d35..b809899e9229 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -344,6 +344,8 @@ out_put_bdi: return err; } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb); + /* * Remove bdi from the global list and shutdown any threads we have running */ @@ -357,6 +359,7 @@ static void wb_shutdown(struct bdi_writeback *wb) } spin_unlock_bh(&wb->work_lock); + cgwb_remove_from_bdi_list(wb); /* * Drain work list and shutdown the delayed_work. !WB_registered * tells wb_workfn() that @wb is dying and its work_list needs to @@ -490,10 +493,6 @@ static void cgwb_release_workfn(struct work_struct *work) release_work); struct backing_dev_info *bdi = wb->bdi; - spin_lock_irq(&cgwb_lock); - list_del_rcu(&wb->bdi_node); - spin_unlock_irq(&cgwb_lock); - wb_shutdown(wb); css_put(wb->memcg_css); @@ -525,6 +524,13 @@ static void cgwb_kill(struct bdi_writeback *wb) percpu_ref_kill(&wb->refcnt); } +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + spin_lock_irq(&cgwb_lock); + list_del_rcu(&wb->bdi_node); + spin_unlock_irq(&cgwb_lock); +} + static int cgwb_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) { @@ -765,6 +771,13 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) spin_unlock_irq(&cgwb_lock); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + spin_lock_irq(&cgwb_lock); + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); + spin_unlock_irq(&cgwb_lock); +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int cgwb_bdi_init(struct backing_dev_info *bdi) @@ -792,6 +805,16 @@ static void cgwb_bdi_exit(struct backing_dev_info *bdi) wb_congested_put(bdi->wb_congested); } +static void cgwb_bdi_register(struct backing_dev_info *bdi) +{ + list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); +} + +static void cgwb_remove_from_bdi_list(struct bdi_writeback *wb) +{ + list_del_rcu(&wb->bdi_node); +} + #endif /* CONFIG_CGROUP_WRITEBACK */ int bdi_init(struct backing_dev_info *bdi) @@ -810,8 +833,6 @@ int bdi_init(struct backing_dev_info *bdi) ret = cgwb_bdi_init(bdi); - list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list); - return ret; } EXPORT_SYMBOL(bdi_init); @@ -847,6 +868,7 @@ int bdi_register(struct backing_dev_info *bdi, struct device *parent, if (IS_ERR(dev)) return PTR_ERR(dev); + cgwb_bdi_register(bdi); bdi->dev = dev; bdi_debug_register(bdi, dev_name(dev)); -- 2.7.4 From a111d0864433c8afb574aa5ebcb74672d6a52edb Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:57 +0100 Subject: [PATCH 19/24] bdi: Shutdown writeback on all cgwbs in cgwb_bdi_destroy() Currently we wait for all cgwbs to get freed in cgwb_bdi_destroy(), which also means that writeback has been shut down on them. 
Since this wait is going away, directly shut down writeback on cgwbs from cgwb_bdi_destroy() to avoid live writeback structures after bdi_unregister() has finished. To make that safe with concurrent shutdown from cgwb_release_workfn(), we also have to make sure wb_shutdown() returns only after the bdi_writeback structure is really shut down. Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit 5318ce7d46866e1dbc20ab9349b93753edba0b3e) Signed-off-by: Thiago Jung Bauermann --- include/linux/backing-dev-defs.h | 1 + mm/backing-dev.c | 22 ++++++++++++++++++++++ 2 files changed, 23 insertions(+) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 5f218afa5200..68a3c84a5dc3 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -21,6 +21,7 @@ struct dentry; */ enum wb_state { WB_registered, /* bdi_register() was done */ + WB_shutting_down, /* wb_shutdown() in progress */ WB_writeback_running, /* Writeback is in progress */ WB_has_dirty_io, /* Dirty inodes on ->b_{dirty|io|more_io} */ }; diff --git a/mm/backing-dev.c b/mm/backing-dev.c index b809899e9229..f6c9ca40ce9c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -355,8 +355,15 @@ static void wb_shutdown(struct bdi_writeback *wb) spin_lock_bh(&wb->work_lock); if (!test_and_clear_bit(WB_registered, &wb->state)) { spin_unlock_bh(&wb->work_lock); + /* + * Wait for wb shutdown to finish if someone else is just + * running wb_shutdown(). Otherwise we could proceed to wb / + * bdi destruction before wb_shutdown() is finished. + */ + wait_on_bit(&wb->state, WB_shutting_down, TASK_UNINTERRUPTIBLE); return; } + set_bit(WB_shutting_down, &wb->state); spin_unlock_bh(&wb->work_lock); cgwb_remove_from_bdi_list(wb); @@ -368,6 +375,12 @@ static void wb_shutdown(struct bdi_writeback *wb) mod_delayed_work(bdi_wq, &wb->dwork, 0); flush_delayed_work(&wb->dwork); WARN_ON(!list_empty(&wb->work_list)); + /* + * Make sure bit gets cleared after shutdown is finished. Matches with + * the barrier provided by test_and_clear_bit() above. + */ + smp_wmb(); + clear_bit(WB_shutting_down, &wb->state); } static void wb_exit(struct bdi_writeback *wb) @@ -698,12 +711,21 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; + struct bdi_writeback *wb; WARN_ON(test_bit(WB_registered, &bdi->wb.state)); spin_lock_irq(&cgwb_lock); radix_tree_for_each_slot(slot, &bdi->cgwb_tree, &iter, 0) cgwb_kill(*slot); + + while (!list_empty(&bdi->wb_list)) { + wb = list_first_entry(&bdi->wb_list, struct bdi_writeback, + bdi_node); + spin_unlock_irq(&cgwb_lock); + wb_shutdown(wb); + spin_lock_irq(&cgwb_lock); + } spin_unlock_irq(&cgwb_lock); /* -- 2.7.4 From 2bad36ad46155c997679fe6fff0dc31d4293f204 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:58 +0100 Subject: [PATCH 20/24] bdi: Do not wait for cgwbs release in bdi_unregister() Currently we wait for all cgwbs to get released in cgwb_bdi_destroy() (called from bdi_unregister()). That is, however, unnecessary now that cgwb->bdi is a proper refcounted reference (thus the bdi cannot get released before all cgwbs are released) and cgwb_bdi_destroy() shuts down writeback directly. 
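In effect, cgwb lifetime management moves from an explicit drain to plain reference counting; a schematic comparison assembled from the hunks in this and the preceding patches (not verbatim kernel code):

	/* before: bdi_unregister() drained a usage counter, waiting for
	 * every cgwb ever created on the bdi to be released */
	atomic_dec(&bdi->usage_cnt);
	wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt));

	/* after: each cgwb pins the bdi via bdi_get() in wb_init() and
	 * drops it in wb_exit(), so the bdi cannot be freed while any
	 * cgwb still exists and no wait is needed */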
Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit 4514451e79ae5baabb85d22ba3523602e59d5218) Signed-off-by: Thiago Jung Bauermann --- include/linux/backing-dev-defs.h | 1 - mm/backing-dev.c | 22 +--------------------- 2 files changed, 1 insertion(+), 22 deletions(-) diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 68a3c84a5dc3..4d6cc4d2cc26 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -161,7 +161,6 @@ struct backing_dev_info { #ifdef CONFIG_CGROUP_WRITEBACK struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */ struct rb_root cgwb_congested_tree; /* their congested states */ - atomic_t usage_cnt; /* counts both cgwbs and cgwb_contested's */ #else struct bdi_writeback_congested *wb_congested; #endif diff --git a/mm/backing-dev.c b/mm/backing-dev.c index f6c9ca40ce9c..5cc2c6e1bd26 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -405,11 +405,9 @@ static void wb_exit(struct bdi_writeback *wb) /* * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree, * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU - * protected. cgwb_release_wait is used to wait for the completion of cgwb - * releases from bdi destruction path. + * protected. */ static DEFINE_SPINLOCK(cgwb_lock); -static DECLARE_WAIT_QUEUE_HEAD(cgwb_release_wait); /** * wb_congested_get_create - get or create a wb_congested @@ -504,7 +502,6 @@ static void cgwb_release_workfn(struct work_struct *work) { struct bdi_writeback *wb = container_of(work, struct bdi_writeback, release_work); - struct backing_dev_info *bdi = wb->bdi; wb_shutdown(wb); @@ -515,9 +512,6 @@ static void cgwb_release_workfn(struct work_struct *work) percpu_ref_exit(&wb->refcnt); wb_exit(wb); kfree_rcu(wb, rcu); - - if (atomic_dec_and_test(&bdi->usage_cnt)) - wake_up_all(&cgwb_release_wait); } static void cgwb_release(struct percpu_ref *refcnt) @@ -607,7 +601,6 @@ static int cgwb_create(struct backing_dev_info *bdi, /* we might have raced another instance of this function */ ret = radix_tree_insert(&bdi->cgwb_tree, memcg_css->id, wb); if (!ret) { - atomic_inc(&bdi->usage_cnt); list_add_tail_rcu(&wb->bdi_node, &bdi->wb_list); list_add(&wb->memcg_node, memcg_cgwb_list); list_add(&wb->blkcg_node, blkcg_cgwb_list); @@ -697,7 +690,6 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC); bdi->cgwb_congested_tree = RB_ROOT; - atomic_set(&bdi->usage_cnt, 1); ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (!ret) { @@ -727,18 +719,6 @@ static void cgwb_bdi_destroy(struct backing_dev_info *bdi) spin_lock_irq(&cgwb_lock); } spin_unlock_irq(&cgwb_lock); - - /* - * All cgwb's must be shutdown and released before returning. Drain - * the usage counter to wait for all cgwb's ever created on @bdi. - */ - atomic_dec(&bdi->usage_cnt); - wait_event(cgwb_release_wait, !atomic_read(&bdi->usage_cnt)); - /* - * Grab back our reference so that we hold it when @bdi gets - * re-registered. - */ - atomic_inc(&bdi->usage_cnt); } /** -- 2.7.4 From 9076cd1ebaae956e5122811dcadfe684591d74ff Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:36:59 +0100 Subject: [PATCH 21/24] bdi: Rename cgwb_bdi_destroy() to cgwb_bdi_unregister() Rename cgwb_bdi_destroy() to cgwb_bdi_unregister() as it gets called from bdi_unregister() which is not necessarily called from bdi_destroy() and thus the name is somewhat misleading. 
Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit b1c51afc00f164457e126dcfc946fdd563948d26) Signed-off-by: Thiago Jung Bauermann --- mm/backing-dev.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5cc2c6e1bd26..007b5adff547 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -699,7 +699,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return ret; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { struct radix_tree_iter iter; void **slot; @@ -800,7 +800,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) return 0; } -static void cgwb_bdi_destroy(struct backing_dev_info *bdi) { } +static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { } static void cgwb_bdi_exit(struct backing_dev_info *bdi) { @@ -924,7 +924,7 @@ void bdi_unregister(struct backing_dev_info *bdi) /* make sure nobody finds us on the bdi_list anymore */ bdi_remove_from_list(bdi); wb_shutdown(&bdi->wb); - cgwb_bdi_destroy(bdi); + cgwb_bdi_unregister(bdi); if (bdi->dev) { bdi_debug_unregister(bdi); -- 2.7.4 From 92165e17474de2612cae40e2552a578953214e96 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:00 +0100 Subject: [PATCH 22/24] block: Fix oops in locked_inode_to_wb_and_lock_list() When a block device is closed, we call inode_detach_wb() in __blkdev_put(), which sets inode->i_wb to NULL. That is contrary to the expectation that inode->i_wb, once set, stays valid for the whole inode lifetime, and leads to an oops in wb_get() in locked_inode_to_wb_and_lock_list() because inode_to_wb() returned NULL. The reason why we called inode_detach_wb() is not valid anymore, though: the BDI is guaranteed to stay around until we call bdi_put() from bdev_evict_inode(), so we can postpone calling inode_detach_wb() to that moment. Also add a warning to catch if someone uses inode_detach_wb() in a dangerous way. Reported-by: Thiago Jung Bauermann Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit f759741d9d913eb57784a94b9bca78b376fc26a9) Signed-off-by: Thiago Jung Bauermann --- fs/block_dev.c | 8 ++------ include/linux/writeback.h | 1 + 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 3ac854b60792..13d8fbb1b9eb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -559,6 +559,8 @@ static void bdev_evict_inode(struct inode *inode) } list_del_init(&bdev->bd_list); spin_unlock(&bdev_lock); + /* Detach inode from wb early as bdi_put() may free bdi->wb */ + inode_detach_wb(inode); if (bdev->bd_bdi != &noop_backing_dev_info) { bdi_put(bdev->bd_bdi); bdev->bd_bdi = &noop_backing_dev_info; @@ -1577,12 +1579,6 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part) kill_bdev(bdev); bdev_write_inode(bdev); - /* - * Detaching bdev inode from its wb in __destroy_inode() - * is too late: the queue which embeds its bdi (along with - * root wb) can be gone as soon as we put_disk() below. 
- */ - inode_detach_wb(bdev->bd_inode); } if (bdev->bd_contains == bdev) { if (disk->fops->release) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index d0b5ca5d4e08..6c1cbbedc79c 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -224,6 +224,7 @@ static inline void inode_attach_wb(struct inode *inode, struct page *page) static inline void inode_detach_wb(struct inode *inode) { if (inode->i_wb) { + WARN_ON_ONCE(!(inode->i_state & I_CLEAR)); wb_put(inode->i_wb); inode->i_wb = NULL; } -- 2.7.4 From c1ed0b0e7eac76634d51baeafd222befdefbd034 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:01 +0100 Subject: [PATCH 23/24] kobject: Export kobject_get_unless_zero() Make the function available for outside use and fortify it against NULL kobject. CC: Greg Kroah-Hartman Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit c70c176ff8c3ff0ac6ef9a831cd591ea9a66bd1a) Signed-off-by: Thiago Jung Bauermann --- include/linux/kobject.h | 2 ++ lib/kobject.c | 5 ++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/kobject.h b/include/linux/kobject.h index e6284591599e..ca85cb80e99a 100644 --- a/include/linux/kobject.h +++ b/include/linux/kobject.h @@ -108,6 +108,8 @@ extern int __must_check kobject_rename(struct kobject *, const char *new_name); extern int __must_check kobject_move(struct kobject *, struct kobject *); extern struct kobject *kobject_get(struct kobject *kobj); +extern struct kobject * __must_check kobject_get_unless_zero( + struct kobject *kobj); extern void kobject_put(struct kobject *kobj); extern const void *kobject_namespace(struct kobject *kobj); diff --git a/lib/kobject.c b/lib/kobject.c index 7cbccd2b4c72..a5e8e10b5fa3 100644 --- a/lib/kobject.c +++ b/lib/kobject.c @@ -601,12 +601,15 @@ struct kobject *kobject_get(struct kobject *kobj) } EXPORT_SYMBOL(kobject_get); -static struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) +struct kobject * __must_check kobject_get_unless_zero(struct kobject *kobj) { + if (!kobj) + return NULL; if (!kref_get_unless_zero(&kobj->kref)) kobj = NULL; return kobj; } +EXPORT_SYMBOL(kobject_get_unless_zero); /* * kobject_cleanup - free kobject resources. 
-- 2.7.4 From 3b43b08caf49103a6dc9438c3661f3c82109c072 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Mar 2017 01:37:02 +0100 Subject: [PATCH 24/24] block: Fix oops in scsi_disk_get() When a device open races with device shutdown, we can get the following oops in scsi_disk_get(): [11863.044351] general protection fault: 0000 [#1] SMP [11863.045561] Modules linked in: scsi_debug xfs libcrc32c netconsole btrfs raid6_pq zlib_deflate lzo_compress xor [last unloaded: loop] [11863.047853] CPU: 3 PID: 13042 Comm: hald-probe-stor Tainted: G W 4.10.0-rc2-xen+ #35 [11863.048030] Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 [11863.048030] task: ffff88007f438200 task.stack: ffffc90000fd0000 [11863.048030] RIP: 0010:scsi_disk_get+0x43/0x70 [11863.048030] RSP: 0018:ffffc90000fd3a08 EFLAGS: 00010202 [11863.048030] RAX: 6b6b6b6b6b6b6b6b RBX: ffff88007f56d000 RCX: 0000000000000000 [11863.048030] RDX: 0000000000000001 RSI: 0000000000000004 RDI: ffffffff81a8d880 [11863.048030] RBP: ffffc90000fd3a18 R08: 0000000000000000 R09: 0000000000000001 [11863.059217] R10: 0000000000000000 R11: 0000000000000000 R12: 00000000fffffffa [11863.059217] R13: ffff880078872800 R14: ffff880070915540 R15: 000000000000001d [11863.059217] FS: 00007f2611f71800(0000) GS:ffff88007f0c0000(0000) knlGS:0000000000000000 [11863.059217] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [11863.059217] CR2: 000000000060e048 CR3: 00000000778d4000 CR4: 00000000000006e0 [11863.059217] Call Trace: [11863.059217] ? disk_get_part+0x22/0x1f0 [11863.059217] sd_open+0x39/0x130 [11863.059217] __blkdev_get+0x69/0x430 [11863.059217] ? bd_acquire+0x7f/0xc0 [11863.059217] ? bd_acquire+0x96/0xc0 [11863.059217] ? blkdev_get+0x350/0x350 [11863.059217] blkdev_get+0x126/0x350 [11863.059217] ? _raw_spin_unlock+0x2b/0x40 [11863.059217] ? bd_acquire+0x7f/0xc0 [11863.059217] ? blkdev_get+0x350/0x350 [11863.059217] blkdev_open+0x65/0x80 ... As you can see, the RAX value is already poisoned, showing that the gendisk we got has already been freed. The problem is that get_gendisk() looks up the device number in ext_devt_idr and then does get_disk(), which does kobject_get() on the disk's kobject. However, the disk gets removed from ext_devt_idr only in disk_release() (through blk_free_devt()), at which point it already has a zero refcount and is already on its way to being freed. Indeed, we got a warning from kobject_get() about a zero refcount shortly before the oops. We fix the problem by using kobject_get_unless_zero() in get_disk() so that get_disk() cannot get a reference on a disk that is already being freed. Tested-by: Lekshmi Pillai Reviewed-by: Bart Van Assche Acked-by: Tejun Heo Signed-off-by: Jan Kara Signed-off-by: Jens Axboe (cherry picked from commit d01b2dcb441b14c3b860d3d16ecb9ed6ed4ba7fb) Signed-off-by: Thiago Jung Bauermann --- block/genhd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/genhd.c b/block/genhd.c index 946507820798..40567818e928 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1324,7 +1324,7 @@ struct kobject *get_disk(struct gendisk *disk) owner = disk->fops->owner; if (owner && !try_module_get(owner)) return NULL; - kobj = kobject_get(&disk_to_dev(disk)->kobj); + kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); if (kobj == NULL) { module_put(owner); return NULL; } -- 2.7.4
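To close, the pattern that the last two patches establish, taking a disk reference only if its refcount has not already hit zero, looks like this in the caller (a condensed sketch of the get_disk() path shown in the final hunk):

	struct module *owner = disk->fops->owner;
	struct kobject *kobj;

	if (owner && !try_module_get(owner))
		return NULL;
	/* refuse to resurrect a kobject whose refcount already hit zero:
	 * such a disk is on its way to disk_release() and must not be used */
	kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
	if (kobj == NULL) {
		module_put(owner);
		return NULL;
	}
	return kobj;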