From 89bbb84261da755d9a4572e9bd946df1b8386f0b Mon Sep 17 00:00:00 2001 From: Ante Karamatic Date: Thu, 30 Oct 2008 08:47:08 +0100 Subject: [PATCH] UBUNTU: Updated drbd to 8.0.13 version Bug: 288226 Ignore: yes Signed-off-by: Ante Karamatic --- ubuntu/block/drbd/drbd_actlog.c | 174 +++++---- ubuntu/block/drbd/drbd_bitmap.c | 60 ++-- ubuntu/block/drbd/drbd_buildtag.c | 5 +- ubuntu/block/drbd/drbd_compat_wrappers.h | 27 +- ubuntu/block/drbd/drbd_int.h | 255 +++++++++---- ubuntu/block/drbd/drbd_main.c | 613 +++++++++++++++++++----------- ubuntu/block/drbd/drbd_nl.c | 338 +++++++++++------ ubuntu/block/drbd/drbd_proc.c | 19 +- ubuntu/block/drbd/drbd_receiver.c | 485 ++++++++++++++---------- ubuntu/block/drbd/drbd_req.c | 57 +++- ubuntu/block/drbd/drbd_req.h | 3 +- ubuntu/block/drbd/drbd_strings.c | 6 +- ubuntu/block/drbd/drbd_worker.c | 93 +++-- ubuntu/block/drbd/linux/connector.h | 5 +- ubuntu/block/drbd/linux/drbd.h | 52 ++- ubuntu/block/drbd/linux/drbd_config.h | 4 +- ubuntu/block/drbd/linux/drbd_limits.h | 7 +- ubuntu/block/drbd/linux/drbd_nl.h | 6 + ubuntu/block/drbd/linux/drbd_tag_magic.h | 4 +- ubuntu/block/drbd/lru_cache.c | 73 +++-- ubuntu/block/drbd/lru_cache.h | 1 + 21 files changed, 1439 insertions(+), 848 deletions(-) diff --git a/ubuntu/block/drbd/drbd_actlog.c b/ubuntu/block/drbd/drbd_actlog.c index 5b69569..1f7ed42 100644 --- a/ubuntu/block/drbd/drbd_actlog.c +++ b/ubuntu/block/drbd/drbd_actlog.c @@ -188,7 +188,7 @@ struct __attribute__((packed)) al_transaction { struct update_odbm_work { struct drbd_work w; unsigned int enr; -} ; +}; struct update_al_work { struct drbd_work w; @@ -214,7 +214,7 @@ struct lc_element* _al_get(struct Drbd_Conf *mdev, unsigned int enr) if(test_bit(BME_NO_WRITES,&bm_ext->flags)) { spin_unlock_irq(&mdev->al_lock); //INFO("Delaying app write until sync read is done\n"); - return 0; + return NULL; } } al_ext = lc_get(mdev->act_log,enr); @@ -322,6 +322,11 @@ w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int 
unused) unsigned int extent_nr; u32 xor_sum=0; + if (!inc_local(mdev)) { + ERR("inc_local() failed in w_al_write_transaction\n"); + complete(&((struct update_al_work*)w)->event); + return 1; + } /* do we have to do a bitmap write, first? * TODO reduce maximum latency: * submit both bios, then wait for both, @@ -383,6 +388,7 @@ w_al_write_transaction(struct Drbd_Conf *mdev, struct drbd_work *w, int unused) up(&mdev->md_io_mutex); complete(&((struct update_al_work*)w)->event); + dec_local(mdev); return 1; } @@ -540,7 +546,7 @@ struct drbd_atodb_wait { int error; }; -STATIC BIO_ENDIO_FN(atodb_endio) +STATIC BIO_ENDIO_TYPE atodb_endio BIO_ENDIO_ARGS(struct bio *bio, int error) { struct drbd_atodb_wait *wc = bio->bi_private; struct Drbd_Conf *mdev=wc->mdev; @@ -558,14 +564,15 @@ STATIC BIO_ENDIO_FN(atodb_endio) /* corresponding drbd_io_error is in drbd_al_to_on_disk_bm */ drbd_chk_io_error(mdev,error,TRUE); - if(error && wc->error == 0) wc->error=error; + if (error && wc->error == 0) + wc->error=error; if (atomic_dec_and_test(&wc->count)) { complete(&wc->io_done); } page = bio->bi_io_vec[0].bv_page; - if(page) put_page(page); + put_page(page); bio_put(bio); mdev->bm_writ_cnt++; dec_local(mdev); @@ -578,57 +585,63 @@ STATIC BIO_ENDIO_FN(atodb_endio) * is already covered by previously prepared bios */ STATIC int atodb_prepare_unless_covered(struct Drbd_Conf *mdev, struct bio **bios, - struct page **page, - unsigned int *page_offset, unsigned int enr, - struct drbd_atodb_wait *wc) + struct drbd_atodb_wait *wc) __must_hold(local) { - int i=0,allocated_page=0; struct bio *bio; - struct page *np; - sector_t on_disk_sector = enr + mdev->bc->md.md_offset + mdev->bc->md.bm_offset; + struct page *page; + sector_t on_disk_sector = enr + mdev->bc->md.md_offset + + mdev->bc->md.bm_offset; + unsigned int page_offset = PAGE_SIZE; int offset; - - // check if that enr is already covered by an already created bio. 
- while( (bio=bios[i]) ) { - if(bio->bi_sector == on_disk_sector) return 0; + int i = 0; + int err = -ENOMEM; + + /* Check if that enr is already covered by an already created bio. + * Caution, bios[] is not NULL terminated, + * but only initialized to all NULL. + * For completely scattered activity log, + * the last invocation iterates over all bios, + * and finds the last NULL entry. + */ + while ( (bio = bios[i]) ) { + if (bio->bi_sector == on_disk_sector) + return 0; i++; } + /* bios[i] == NULL, the next not yet used slot */ bio = bio_alloc(GFP_KERNEL, 1); - if(bio==NULL) return -ENOMEM; - - bio->bi_bdev = mdev->bc->md_bdev; - bio->bi_sector = on_disk_sector; + if (bio == NULL) + return -ENOMEM; - bios[i] = bio; - - if(*page_offset == PAGE_SIZE) { - np = alloc_page(__GFP_HIGHMEM); - /* no memory leak, bio gets cleaned up by caller */ - if(np == NULL) return -ENOMEM; - *page = np; - *page_offset = 0; - allocated_page=1; + if (i > 0) { + const struct bio_vec *prev_bv = bios[i-1]->bi_io_vec; + page_offset = prev_bv->bv_offset + prev_bv->bv_len; + page = prev_bv->bv_page; } - - offset = S2W(enr); - drbd_bm_get_lel( mdev, offset, - min_t(size_t,S2W(1), drbd_bm_words(mdev) - offset), - kmap(*page) + *page_offset ); - kunmap(*page); - - if(bio_add_page(bio, *page, MD_HARDSECT, *page_offset)!=MD_HARDSECT) { - /* no memory leak, page gets cleaned up by caller */ - return -EINVAL; + if (page_offset == PAGE_SIZE) { + page = alloc_page(__GFP_HIGHMEM); + if (page == NULL) + goto out_bio_put; + page_offset = 0; + } else { + get_page(page); } - if(!allocated_page) get_page(*page); - - *page_offset += MD_HARDSECT; + offset = S2W(enr); + drbd_bm_get_lel( mdev, offset, + min_t(size_t, S2W(1), drbd_bm_words(mdev) - offset), + kmap(page) + page_offset ); + kunmap(page); bio->bi_private = wc; bio->bi_end_io = atodb_endio; + bio->bi_bdev = mdev->bc->md_bdev; + bio->bi_sector = on_disk_sector; + + if (bio_add_page(bio, page, MD_HARDSECT, page_offset) != MD_HARDSECT) + goto 
out_put_page; atomic_inc(&wc->count); /* we already know that we may do this... @@ -637,7 +650,17 @@ STATIC int atodb_prepare_unless_covered(struct Drbd_Conf *mdev, * the number of pending IO requests DRBD at its backing device. */ atomic_inc(&mdev->local_cnt); + + bios[i] = bio; + return 0; + +out_put_page: + err = -EINVAL; + put_page(page); +out_bio_put: + bio_put(bio); + return err; } /** @@ -651,11 +674,9 @@ void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) int i, nr_elements; unsigned int enr; struct bio **bios; - struct page *page; - unsigned int page_offset=PAGE_SIZE; struct drbd_atodb_wait wc; - ERR_IF (!inc_local_if_state(mdev,Attaching)) + ERR_IF (!inc_local_if_state(mdev, Attaching)) return; /* sorry, I don't have any act_log etc... */ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); @@ -670,12 +691,12 @@ void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) wc.mdev = mdev; wc.error = 0; - for(i=0;iact_log,i)->lc_number; - if(enr == LC_FREE) continue; - /* next statement also does atomic_inc wc.count */ - if(atodb_prepare_unless_covered(mdev,bios,&page, - &page_offset, + for (i = 0; i < nr_elements; i++) { + enr = lc_entry(mdev->act_log, i)->lc_number; + if (enr == LC_FREE) + continue; + /* next statement also does atomic_inc wc.count and local_cnt */ + if (atodb_prepare_unless_covered(mdev, bios, enr/AL_EXT_PER_BM_SECT, &wc)) goto free_bios_submit_one_by_one; @@ -718,16 +739,9 @@ void drbd_al_to_on_disk_bm(struct Drbd_Conf *mdev) return; free_bios_submit_one_by_one: - // free everything by calling the endio callback directly. - for(i=0;ibi_size=0; -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) - atodb_endio(bios[i], MD_HARDSECT, 0); -#else - atodb_endio(bios[i], 0); -#endif - } + /* free everything by calling the endio callback directly. 
*/ + for (i = 0; i < nr_elements && bios[i]; i++) + bio_endio(bios[i], 0); kfree(bios); submit_one_by_one: @@ -781,7 +795,9 @@ static inline int _try_lc_del(struct Drbd_Conf *mdev,struct lc_element *al_ext) if(likely(rv)) lc_del(mdev->act_log,al_ext); spin_unlock_irq(&mdev->al_lock); - if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n"); + MTRACE(TraceTypeALExts,TraceLvlMetrics, + if(unlikely(!rv)) INFO("Waiting for extent in drbd_al_shrink()\n"); + ); return rv; } @@ -811,7 +827,7 @@ STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused) { struct update_odbm_work *udw = (struct update_odbm_work*)w; - if( !inc_local_if_state(mdev,Attaching) ) { + if (!inc_local(mdev)) { if (DRBD_ratelimit(5*HZ,5)) WARN("Can not update on disk bitmap, local IO disabled.\n"); return 1; @@ -825,9 +841,7 @@ STATIC int w_update_odbm(drbd_dev *mdev, struct drbd_work *w, int unused) if(drbd_bm_total_weight(mdev) <= mdev->rs_failed && ( mdev->state.conn == SyncSource || mdev->state.conn == SyncTarget || mdev->state.conn == PausedSyncS || mdev->state.conn == PausedSyncT ) ) { - drbd_bm_lock(mdev); drbd_resync_finished(mdev); - drbd_bm_unlock(mdev); } drbd_bcast_sync_progress(mdev); @@ -987,7 +1001,7 @@ void __drbd_set_in_sync(drbd_dev* mdev, sector_t sector, int size, const char* f mdev->rs_mark_left =drbd_bm_total_weight(mdev); } } - if( inc_local_if_state(mdev,Attaching) ) { + if (inc_local(mdev)) { drbd_try_clear_on_disk_bm(mdev,sector,count,TRUE); dec_local(mdev); } @@ -1055,8 +1069,7 @@ struct bm_extent* _bme_get(struct Drbd_Conf *mdev, unsigned int enr) unsigned long rs_flags; spin_lock_irq(&mdev->al_lock); - if (mdev->resync_locked > mdev->resync->nr_elements-3) { - //WARN("bme_get() does not lock all elements\n"); + if (mdev->resync_locked > mdev->resync->nr_elements/2) { spin_unlock_irq(&mdev->al_lock); return NULL; } @@ -1205,7 +1218,8 @@ int drbd_try_rs_begin_io(drbd_dev* mdev, sector_t sector) D_ASSERT(test_bit(BME_NO_WRITES,&bm_ext->flags)); 
clear_bit(BME_NO_WRITES,&bm_ext->flags); mdev->resync_wenr = LC_FREE; - lc_put(mdev->resync,&bm_ext->lce); + if (lc_put(mdev->resync, &bm_ext->lce) == 0) + mdev->resync_locked--; wake_up(&mdev->al_wait); } else { ALERT("LOGIC BUG\n"); @@ -1231,8 +1245,12 @@ int drbd_try_rs_begin_io(drbd_dev* mdev, sector_t sector) } goto check_al; } else { - if (mdev->resync_locked > mdev->resync->nr_elements-3) + if (mdev->resync_locked > mdev->resync->nr_elements-3) { + MTRACE(TraceTypeResync, TraceLvlAll, + INFO("resync_locked = %u!\n", mdev->resync_locked); + ); goto try_again; + } bm_ext = (struct bm_extent*)lc_get(mdev->resync,enr); if (!bm_ext) { const unsigned long rs_flags = mdev->resync->flags; @@ -1324,26 +1342,14 @@ void drbd_rs_complete_io(drbd_dev* mdev, sector_t sector) */ void drbd_rs_cancel_all(drbd_dev* mdev) { - struct bm_extent* bm_ext; - int i; - MTRACE(TraceTypeResync, TraceLvlMetrics, INFO("drbd_rs_cancel_all\n"); ); spin_lock_irq(&mdev->al_lock); - if(inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. - for(i=0;iresync->nr_elements;i++) { - bm_ext = (struct bm_extent*) lc_entry(mdev->resync,i); - if(bm_ext->lce.lc_number == LC_FREE) continue; - bm_ext->lce.refcnt = 0; // Rude but ok. - bm_ext->rs_left = 0; - clear_bit(BME_LOCKED,&bm_ext->flags); - clear_bit(BME_NO_WRITES,&bm_ext->flags); - lc_del(mdev->resync,&bm_ext->lce); - } - mdev->resync->used=0; + if (inc_local_if_state(mdev,Failed)) { // Makes sure ->resync is there. 
+ lc_reset(mdev->resync); dec_local(mdev); } mdev->resync_locked = 0; @@ -1457,7 +1463,7 @@ void drbd_rs_failed_io(drbd_dev* mdev, sector_t sector, int size) if (count) { mdev->rs_failed += count; - if( inc_local_if_state(mdev,Attaching) ) { + if (inc_local(mdev)) { drbd_try_clear_on_disk_bm(mdev,sector,count,FALSE); dec_local(mdev); } diff --git a/ubuntu/block/drbd/drbd_bitmap.c b/ubuntu/block/drbd/drbd_bitmap.c index 725102c..33017b2 100644 --- a/ubuntu/block/drbd/drbd_bitmap.c +++ b/ubuntu/block/drbd/drbd_bitmap.c @@ -241,7 +241,7 @@ STATIC void bm_end_info(drbd_dev *mdev, const char* where) int drbd_bm_init(drbd_dev *mdev) { struct drbd_bitmap *b = mdev->bitmap; - D_BUG_ON(b); + D_BUG_ON(b != NULL); b = kzalloc(sizeof(struct drbd_bitmap),GFP_KERNEL); if (!b) return -ENOMEM; @@ -370,7 +370,7 @@ void _drbd_bm_recount_bits(drbd_dev *mdev, char* file, int line) int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) { struct drbd_bitmap *b = mdev->bitmap; - unsigned long bits, bytes, words, *nbm, *obm = 0; + unsigned long bits, bytes, words, *nbm, *obm = NULL; int err = 0, growing; ERR_IF(!b) return -ENOMEM; @@ -407,20 +407,23 @@ int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) */ words = ALIGN(bits,64) >> LN2_BPL; - D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12)); + if (inc_local(mdev)) { + D_ASSERT((u64)bits <= (((u64)mdev->bc->md.md_size_sect-MD_BM_OFFSET) << 12)); + dec_local(mdev); + } - if ( words == b->bm_words ) { + growing = bits > b->bm_bits; + if (words == b->bm_words) { /* optimize: capacity has changed, - * but only within one long word worth of bits. - * just update the bm_dev_capacity and bm_bits members. - */ + * but only within one ulong64 word worth of bits. 
+ * no allocation needed, just update the + * bm_dev_capacity and bm_bits members, + * and set the new bits, if any */ spin_lock_irq(&b->bm_lock); - b->bm_bits = bits; - b->bm_dev_capacity = capacity; - b->bm_set -= bm_clear_surplus(b); - bm_end_info(mdev, __FUNCTION__ ); - spin_unlock_irq(&b->bm_lock); - goto out; + bm_set_surplus(b); + if (growing) + b->bm_set += bits - b->bm_bits; + goto done; } else { /* one extra long to catch off by one errors */ bytes = (words+1)*sizeof(long); @@ -436,22 +439,29 @@ int drbd_bm_resize(drbd_dev *mdev, sector_t capacity) // brgs. move several MB within spinlock... // FIXME this should go into userspace! if (obm) { + /* adjust for possibly partially used + * last word of old bitmap. */ bm_set_surplus(b); D_ASSERT(b->bm[b->bm_words] == DRBD_MAGIC); memcpy(nbm,obm,min_t(size_t,b->bm_words,words)*sizeof(long)); } - growing = words > b->bm_words; - if (growing) { // set all newly allocated bits - // start at -1, just to be sure. - memset( nbm + (b->bm_words?:1)-1 , 0xff, - (words - ((b->bm_words?:1)-1)) * sizeof(long) ); + if (growing) { + /* set all newly allocated bits */ + memset( nbm + b->bm_words, 0xff, + (words - b->bm_words) * sizeof(long) ); + /* yes, I know, this is not the same number as was set by this memset. 
+ * bm_set_surplus above before the memcpy, + * and bm_clear_surplus below after the new assignments + * make sure that this is indeed the amount of newly set bits */ b->bm_set += bits - b->bm_bits; } nbm[words] = DRBD_MAGIC; b->bm = nbm; - b->bm_bits = bits; b->bm_words = words; + done: + b->bm_bits = bits; b->bm_dev_capacity = capacity; + /* finally clear possibly only partially used last words */ bm_clear_surplus(b); if (!growing) b->bm_set = bm_count_bits(b); @@ -598,7 +608,7 @@ void drbd_bm_set_all(drbd_dev *mdev) spin_unlock_irq(&b->bm_lock); } -BIO_ENDIO_FN(bm_async_io_complete) +static BIO_ENDIO_TYPE bm_async_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error) { struct drbd_bitmap *b = bio->bi_private; int uptodate = bio_flagged(bio,BIO_UPTODATE); @@ -628,7 +638,7 @@ BIO_ENDIO_FN(bm_async_io_complete) BIO_ENDIO_FN_RETURN; } -STATIC void bm_page_io_async(drbd_dev *mdev, struct drbd_bitmap *b, int page_nr, int rw) +STATIC void bm_page_io_async(drbd_dev *mdev, struct drbd_bitmap *b, int page_nr, int rw) __must_hold(local) { /* we are process context. we always get a bio */ /* THINK: do we need GFP_NOIO here? */ @@ -692,7 +702,7 @@ void bm_cpu_to_lel(struct drbd_bitmap *b) /* * bm_rw: read/write the whole bitmap from/to its on disk location. */ -STATIC int bm_rw(struct Drbd_Conf *mdev, int rw) +STATIC int bm_rw(struct Drbd_Conf *mdev, int rw) __must_hold(local) { struct drbd_bitmap *b = mdev->bitmap; /* sector_t sector; */ @@ -770,7 +780,7 @@ STATIC int bm_rw(struct Drbd_Conf *mdev, int rw) * * currently only called from "drbd_nl_disk_conf" */ -int drbd_bm_read(struct Drbd_Conf *mdev) +int drbd_bm_read(struct Drbd_Conf *mdev) __must_hold(local) { struct drbd_bitmap *b = mdev->bitmap; int err=0; @@ -790,7 +800,7 @@ int drbd_bm_read(struct Drbd_Conf *mdev) * * called at various occasions. 
*/ -int drbd_bm_write(struct Drbd_Conf *mdev) +int drbd_bm_write(struct Drbd_Conf *mdev) __must_hold(local) { return bm_rw(mdev, WRITE); } @@ -802,7 +812,7 @@ int drbd_bm_write(struct Drbd_Conf *mdev) * @enr: The _sector_ offset from the start of the bitmap. * */ -int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) +int drbd_bm_write_sect(struct Drbd_Conf *mdev,unsigned long enr) __must_hold(local) { sector_t on_disk_sector = enr + mdev->bc->md.md_offset + mdev->bc->md.bm_offset; int bm_words, num_words, offset, err = 0; diff --git a/ubuntu/block/drbd/drbd_buildtag.c b/ubuntu/block/drbd/drbd_buildtag.c index 1271efe..aa6cfd0 100644 --- a/ubuntu/block/drbd/drbd_buildtag.c +++ b/ubuntu/block/drbd/drbd_buildtag.c @@ -1,6 +1,7 @@ /* automatically generated. DO NOT EDIT. */ +#include "linux/drbd_config.h" const char * drbd_buildtag(void) { - return "GIT-hash: b3fe2bdfd3b9f7c2f923186883eb9e2a0d3a5b1b" - " build by phil@mescal, 2008-02-12 11:56:43"; + return "GIT-hash: ee3ad77563d2e87171a3da17cc002ddfd1677dbe" + " build by ivoks@hardy-dev, 2008-10-25 10:47:32"; } diff --git a/ubuntu/block/drbd/drbd_compat_wrappers.h b/ubuntu/block/drbd/drbd_compat_wrappers.h index 8246b80..455ea6d 100644 --- a/ubuntu/block/drbd/drbd_compat_wrappers.h +++ b/ubuntu/block/drbd/drbd_compat_wrappers.h @@ -66,20 +66,22 @@ static inline int drbd_bio_has_active_page(struct bio *bio) /* Before Linux-2.6.24 bie_endio() had the size of the bio as second argument. 
See 6712ecf8f648118c3363c142196418f89a510b90 */ #define bio_endio(B,E) bio_endio(B, (B)->bi_size, E) -#define BIO_ENDIO_FN(name) int name(struct bio *bio, unsigned int bytes_done, int error) +#define BIO_ENDIO_TYPE int +#define BIO_ENDIO_ARGS(b,e) (b, unsigned int bytes_done, e) #define BIO_ENDIO_FN_START if (bio->bi_size) return 1 #define BIO_ENDIO_FN_RETURN return 0 #else -#define BIO_ENDIO_FN(name) void name(struct bio *bio, int error) +#define BIO_ENDIO_TYPE void +#define BIO_ENDIO_ARGS(b,e) (b,e) #define BIO_ENDIO_FN_START while(0) {} #define BIO_ENDIO_FN_RETURN return #endif // bi_end_io handlers -extern BIO_ENDIO_FN(drbd_md_io_complete); -extern BIO_ENDIO_FN(drbd_endio_read_sec); -extern BIO_ENDIO_FN(drbd_endio_write_sec); -extern BIO_ENDIO_FN(drbd_endio_pri); +extern BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error); +extern BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error); +extern BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error); +extern BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error); #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) /* Before 2.6.23 (with 20c2df83d25c6a95affe6157a4c9cac4cf5ffaac) kmem_cache_create had a @@ -113,6 +115,7 @@ static inline void sg_set_buf(struct scatterlist *sg, const void *buf, */ static inline void drbd_generic_make_request(drbd_dev *mdev, int fault_type, struct bio *bio) { + __release(local); if (!bio->bi_bdev) { printk(KERN_ERR DEVICE_NAME "%d: drbd_generic_make_request: bio->bi_bdev == NULL\n", mdev_to_minor(mdev)); @@ -230,7 +233,7 @@ static __inline__ int atomic_sub_return(int i, atomic_t *v) #endif -#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) +#if !defined(CRYPTO_ALG_ASYNC) /* With Linux-2.6.19 the crypto API changed! */ /* This is not a generic backport of the new api, it just implements the corner case of "hmac(xxx)". 
*/ @@ -323,3 +326,13 @@ static inline void *kzalloc(size_t size, int flags) return rv; } #endif + +#ifndef __CHECKER__ +# undef __cond_lock +# define __cond_lock(x,c) (c) +#endif + +#ifndef KERNEL_HAS_GFP_T +#define KERNEL_HAS_GFP_T +typedef unsigned gfp_t; +#endif diff --git a/ubuntu/block/drbd/drbd_int.h b/ubuntu/block/drbd/drbd_int.h index 8de443b..0b38b01 100644 --- a/ubuntu/block/drbd/drbd_int.h +++ b/ubuntu/block/drbd/drbd_int.h @@ -36,13 +36,45 @@ #include #include #include +#include #include #include "lru_cache.h" +#ifdef __CHECKER__ +# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr"))) +# define __protected_read_by(x) __attribute__((require_context(x,1,999,"read"))) +# define __protected_write_by(x) __attribute__((require_context(x,1,999,"write"))) +# define __must_hold(x) __attribute__((context(x,1,1), require_context(x,1,999,"call"))) +#else +# define __protected_by(x) +# define __protected_read_by(x) +# define __protected_write_by(x) +# define __must_hold(x) +#endif + +#define __no_warn(lock, stmt) do { __acquire(lock); stmt; __release(lock); } while (0) + +/* Compatibility for older kernels */ +#ifndef __acquires +# ifdef __CHECKER__ +# define __acquires(x) __attribute__((context(x,0,1))) +# define __releases(x) __attribute__((context(x,1,0))) +# define __acquire(x) __context__(x,1) +# define __release(x) __context__(x,-1) +# define __cond_lock(x,c) ((c) ? 
({ __acquire(x); 1; }) : 0) +# else +# define __acquires(x) +# define __releases(x) +# define __acquire(x) (void)0 +# define __release(x) (void)0 +# define __cond_lock(x,c) (c) +# endif +#endif + // module parameter, defined in drbd_main.c -extern int minor_count; +extern unsigned int minor_count; extern int allow_oos; -extern int major_nr; +extern unsigned int major_nr; extern int use_nbd_major; #ifdef DRBD_ENABLE_FAULTS @@ -629,6 +661,7 @@ struct drbd_request { struct bio *master_bio; /* master bio pointer */ unsigned long rq_state; /* see comments above _req_mod() */ int seq_num; + unsigned long start_time; }; struct drbd_barrier { @@ -708,6 +741,11 @@ enum { so don't even try */ MD_NO_BARRIER, /* meta data device does not support barriers, so don't even try */ + SUSPEND_IO, /* suspend application io */ + BITMAP_IO, /* suspend application io; + once no more io in flight, start bitmap io */ + BITMAP_IO_QUEUED, /* Started bitmap IO */ + RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */ }; struct drbd_bitmap; // opaque for Drbd_Conf @@ -783,6 +821,12 @@ struct drbd_md_io { int error; }; +struct bm_io_work { + struct drbd_work w; + int (*io_fn)(drbd_dev *mdev); + void (*done)(drbd_dev *mdev, int rv); +}; + struct Drbd_Conf { #ifdef PARANOIA long magic; @@ -793,7 +837,7 @@ struct Drbd_Conf { /* configured by drbdsetup */ struct net_conf *net_conf; // protected by inc_net() and dec_net() struct syncer_conf sync_conf; - struct drbd_backing_dev *bc; // protected by inc_local() dec_local() + struct drbd_backing_dev *bc __protected_by(local); sector_t p_size; /* partner's disk size */ struct request_queue *rq_queue; @@ -881,9 +925,12 @@ struct Drbd_Conf { spinlock_t peer_seq_lock; int minor; unsigned long comm_bm_set; // communicated number of set bits. 
+ struct bm_io_work bm_io_work; + u64 ed_uuid; /* UUID of the exposed data */ + struct mutex state_mutex; }; -static inline drbd_dev *minor_to_mdev(int minor) +static inline drbd_dev *minor_to_mdev(unsigned int minor) { drbd_dev *mdev; @@ -922,7 +969,6 @@ static inline void drbd_put_data_sock(drbd_dev *mdev) up(&mdev->data.mutex); } - /* * function declarations *************************/ @@ -932,6 +978,9 @@ static inline void drbd_put_data_sock(drbd_dev *mdev) enum chg_state_flags { ChgStateHard = 1, ChgStateVerbose = 2, + ChgWaitComplete = 4, + ChgSerialize = 8, + ChgOrdered = ChgWaitComplete + ChgSerialize, }; extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, @@ -939,11 +988,11 @@ extern int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, extern void drbd_force_state(drbd_dev*, drbd_state_t, drbd_state_t); extern int _drbd_request_state(drbd_dev*, drbd_state_t, drbd_state_t, enum chg_state_flags); -extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags ); +extern int _drbd_set_state(drbd_dev*, drbd_state_t, enum chg_state_flags, + struct completion *done); extern void print_st_err(drbd_dev*, drbd_state_t, drbd_state_t, int ); extern int drbd_thread_start(struct Drbd_thread *thi); extern void _drbd_thread_stop(struct Drbd_thread *thi, int restart, int wait); -extern void drbd_thread_signal(struct Drbd_thread *thi); extern void drbd_free_resources(drbd_dev *mdev); extern void tl_release(drbd_dev *mdev,unsigned int barrier_nr, unsigned int set_size); @@ -953,11 +1002,9 @@ extern void drbd_free_sock(drbd_dev *mdev); extern int drbd_send(drbd_dev *mdev, struct socket *sock, void* buf, size_t size, unsigned msg_flags); extern int drbd_send_protocol(drbd_dev *mdev); -extern int _drbd_send_uuids(drbd_dev *mdev); extern int drbd_send_uuids(drbd_dev *mdev); extern int drbd_send_sync_uuid(drbd_dev *mdev, u64 val); extern int drbd_send_sizes(drbd_dev *mdev); -extern int _drbd_send_state(drbd_dev *mdev); extern int 
drbd_send_state(drbd_dev *mdev); extern int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, Drbd_Packet_Cmd cmd, Drbd_Header *h, @@ -996,15 +1043,21 @@ extern void drbd_mdev_cleanup(drbd_dev *mdev); extern void drbd_md_sync(drbd_dev *mdev); extern int drbd_md_read(drbd_dev *mdev, struct drbd_backing_dev * bdev); // maybe define them below as inline? -extern void drbd_uuid_set(drbd_dev *mdev,int idx, u64 val); -extern void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val); -extern void drbd_uuid_new_current(drbd_dev *mdev); -extern void _drbd_uuid_new_current(drbd_dev *mdev); -extern void drbd_uuid_set_bm(drbd_dev *mdev, u64 val); -extern void drbd_md_set_flag(drbd_dev *mdev, int flags); -extern void drbd_md_clear_flag(drbd_dev *mdev, int flags); +extern void drbd_uuid_set(drbd_dev *mdev,int idx, u64 val) __must_hold(local); +extern void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) __must_hold(local); +extern void drbd_uuid_new_current(drbd_dev *mdev) __must_hold(local); +extern void _drbd_uuid_new_current(drbd_dev *mdev) __must_hold(local); +extern void drbd_uuid_set_bm(drbd_dev *mdev, u64 val) __must_hold(local); +extern void drbd_md_set_flag(drbd_dev *mdev, int flags) __must_hold(local); +extern void drbd_md_clear_flag(drbd_dev *mdev, int flags) __must_hold(local); extern int drbd_md_test_flag(struct drbd_backing_dev *, int); extern void drbd_md_mark_dirty(drbd_dev *mdev); +extern void drbd_queue_bitmap_io(drbd_dev *mdev, + int (*io_fn)(drbd_dev *), + void (*done)(drbd_dev *, int)); +extern int drbd_bmio_set_n_write(drbd_dev *mdev); +extern int drbd_bitmap_io(drbd_dev *mdev, int (*io_fn)(drbd_dev *)); + /* Meta data layout We reserve a 128MB Block (4k aligned) @@ -1097,6 +1150,10 @@ struct bm_extent { * Do not use PAGE_SIZE here! Use a architecture agnostic constant! 
*/ #define BM_PACKET_WORDS ((4096-sizeof(Drbd_Header))/sizeof(long)) +#if (PAGE_SIZE < 4096) +/* drbd_send_bitmap / receive_bitmap would break horribly */ +#error "PAGE_SIZE too small" +#endif /* the extent in "PER_EXTENT" below is an activity log extent * we need that many (long words/bytes) to store the bitmap @@ -1123,8 +1180,8 @@ struct bm_extent { #define DRBD_MAX_SECTORS_FLEX DRBD_MAX_SECTORS_32 #else #define DRBD_MAX_SECTORS DRBD_MAX_SECTORS_BM -/* 16 TB in units of sectors */ -#define DRBD_MAX_SECTORS_FLEX (1ULL<<(32+BM_BLOCK_SIZE_B-9)) +/* 8 TB in units of sectors */ +#define DRBD_MAX_SECTORS_FLEX (1ULL<<(31+BM_BLOCK_SIZE_B-9)) #endif /* Sector shift value for the "hash" functions of tl_hash and ee_hash tables. @@ -1147,9 +1204,9 @@ extern int drbd_bm_clear_bits( drbd_dev *mdev, unsigned long s, unsigned long e); extern int drbd_bm_test_bit (drbd_dev *mdev, unsigned long bitnr); extern int drbd_bm_e_weight (drbd_dev *mdev, unsigned long enr); -extern int drbd_bm_write_sect(drbd_dev *mdev, unsigned long enr); -extern int drbd_bm_read (drbd_dev *mdev); -extern int drbd_bm_write (drbd_dev *mdev); +extern int drbd_bm_write_sect(drbd_dev *mdev, unsigned long enr) __must_hold(local); +extern int drbd_bm_read (drbd_dev *mdev) __must_hold(local); +extern int drbd_bm_write (drbd_dev *mdev) __must_hold(local); extern unsigned long drbd_bm_ALe_set_all (drbd_dev *mdev, unsigned long al_enr); extern size_t drbd_bm_words (drbd_dev *mdev); extern sector_t drbd_bm_capacity (drbd_dev *mdev); @@ -1177,7 +1234,7 @@ extern int drbd_bm_count_bits(drbd_dev *mdev, const unsigned long s, const unsig * because of kmem_cache_t weirdness */ #include "drbd_compat_wrappers.h" -extern int minor_count; +extern unsigned int minor_count; extern struct kmem_cache *drbd_request_cache; extern struct kmem_cache *drbd_ee_cache; extern mempool_t *drbd_request_mempool; @@ -1306,9 +1363,9 @@ extern int is_valid_ar_handle(drbd_request_t *, sector_t); extern char* ppsize(char* buf, unsigned long 
long size); extern sector_t drbd_new_dev_size(struct Drbd_Conf*, struct drbd_backing_dev*); enum determin_dev_size_enum { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 }; -extern enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev*); +extern enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev*) __must_hold(local); extern void resync_after_online_grow(drbd_dev *mdev); -extern void drbd_setup_queue_param(drbd_dev *mdev, unsigned int); +extern void drbd_setup_queue_param(drbd_dev *mdev, unsigned int) __must_hold(local); extern int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force); extern int drbd_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg); @@ -1368,40 +1425,46 @@ extern struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, u64 id, sector_t sector, unsigned int data_size, - unsigned int gfp_mask); + gfp_t gfp_mask) __must_hold(local); extern void drbd_free_ee(drbd_dev *mdev, struct Tl_epoch_entry* e); extern void drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); extern void _drbd_wait_ee_list_empty(drbd_dev *mdev, struct list_head *head); extern void drbd_set_recv_tcq(drbd_dev *mdev, int tcq_enabled); extern void _drbd_clear_done_ee(drbd_dev *mdev); -static inline void drbd_tcp_cork(struct socket *sock) +/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to + * mess with get_fs/set_fs, we know we are KERNEL_DS always. 
*/ +static inline int drbd_setsockopt(struct socket *sock, int level, int optname, + char *optval, int optlen) { -#if 1 - mm_segment_t oldfs = get_fs(); - int val = 1; + int err; + if (level == SOL_SOCKET) + err = sock_setsockopt(sock, level, optname, optval, optlen); + else + err = sock->ops->setsockopt(sock, level, optname, optval, + optlen); + return err; +} - set_fs(KERNEL_DS); - tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); - set_fs(oldfs); -#else - tcp_sk(sock->sk)->nonagle |= TCP_NAGLE_CORK; -#endif +static inline void drbd_tcp_cork(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val) ); } -static inline void drbd_tcp_flush(struct socket *sock) +static inline void drbd_tcp_uncork(struct socket *sock) { -#if 1 - mm_segment_t oldfs = get_fs(); - int val = 0; + int __user val = 0; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK, + (char __user *)&val, sizeof(val) ); +} - set_fs(KERNEL_DS); - tcp_setsockopt(sock->sk, SOL_TCP, TCP_CORK, (char*)&val, sizeof(val) ); - set_fs(oldfs); -#else - tcp_sk(sock->sk)->nonagle &= ~TCP_NAGLE_CORK; - tcp_push_pending_frames(sock->sk, tcp_sk(sock->sk)); -#endif +static inline void drbd_tcp_nodelay(struct socket *sock) +{ + int __user val = 1; + (void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY, + (char __user *)&val, sizeof(val) ); } // drbd_proc.c @@ -1461,13 +1524,13 @@ void drbd_bcast_sync_progress(drbd_dev *mdev); ({drbd_state_t val; val.i=0; val.T1 = (S1); \ val.T2 = (S2); val.T3 = (S3); val;}) -#define _NS(D,T,S) D,({drbd_state_t ns; ns.i = D->state.i; ns.T = (S); ns;}) +#define _NS(D,T,S) D,({drbd_state_t __ns; __ns.i = D->state.i; __ns.T = (S); __ns;}) #define _NS2(D,T1,S1,T2,S2) \ - D,({drbd_state_t ns; ns.i = D->state.i; ns.T1 = (S1); \ - ns.T2 = (S2); ns;}) + D,({drbd_state_t __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns;}) #define _NS3(D,T1,S1,T2,S2,T3,S3) \ - D,({drbd_state_t ns; ns.i = 
D->state.i; ns.T1 = (S1); \ - ns.T2 = (S2); ns.T3 = (S3); ns;}) + D,({drbd_state_t __ns; __ns.i = D->state.i; __ns.T1 = (S1); \ + __ns.T2 = (S2); __ns.T3 = (S3); __ns;}) static inline void drbd_state_lock(drbd_dev *mdev) { @@ -1484,7 +1547,7 @@ static inline void drbd_state_unlock(drbd_dev *mdev) static inline int drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val) { - return _drbd_request_state(mdev, mask, val, ChgStateVerbose); + return _drbd_request_state(mdev, mask, val, ChgStateVerbose + ChgOrdered); } /** @@ -1504,7 +1567,7 @@ static inline void __drbd_chk_io_error(drbd_dev* mdev, int forcedetach) case Detach: case CallIOEHelper: if (mdev->state.disk > Failed) { - _drbd_set_state(_NS(mdev,disk,Failed),ChgStateHard); + _drbd_set_state(_NS(mdev, disk,Failed), ChgStateHard, NULL); ERR("Local IO failed. Detaching...\n"); } break; @@ -1560,18 +1623,24 @@ static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev) } } -/* returns the capacity we announce to out peer */ +/* returns the capacity we announce to out peer. + * we clip ourselves at the various MAX_SECTORS, because if we don't, + * current implementation will oops sooner or later */ static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev) { switch (bdev->dc.meta_dev_idx) { case DRBD_MD_INDEX_INTERNAL: case DRBD_MD_INDEX_FLEX_INT: return drbd_get_capacity(bdev->backing_bdev) - ? drbd_md_first_sector(bdev) + ? min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_md_first_sector(bdev)) : 0; case DRBD_MD_INDEX_FLEX_EXT: + return min_t(sector_t, DRBD_MAX_SECTORS_FLEX, + drbd_get_capacity(bdev->backing_bdev)); default: - return drbd_get_capacity(bdev->backing_bdev); + return min_t(sector_t, DRBD_MAX_SECTORS, + drbd_get_capacity(bdev->backing_bdev)); } } @@ -1776,24 +1845,24 @@ static inline int inc_net(drbd_dev* mdev) return have_net_conf; } -/* strictly speaking, - * these would have to hold the req_lock while looking at - * the disk state. 
But since we cannot submit within a spinlock, - * this is mood... +/** + * inc_local: Returns TRUE when local IO is possible. If it returns + * TRUE you should call dec_local() after IO is completed. */ +#define inc_local_if_state(M,MINS) __cond_lock(local, _inc_local_if_state(M,MINS)) +#define inc_local(M) __cond_lock(local, _inc_local_if_state(M,Inconsistent)) static inline void dec_local(drbd_dev* mdev) { + __release(local); if(atomic_dec_and_test(&mdev->local_cnt)) { wake_up(&mdev->misc_wait); } D_ASSERT(atomic_read(&mdev->local_cnt)>=0); } -/** - * inc_local: Returns TRUE when local IO is possible. If it returns - * TRUE you should call dec_local() after IO is completed. - */ -static inline int inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) + +#ifndef __CHECKER__ +static inline int _inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) { int io_allowed; @@ -1804,10 +1873,9 @@ static inline int inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) } return io_allowed; } -static inline int inc_local(drbd_dev* mdev) -{ - return inc_local_if_state(mdev, Inconsistent); -} +#else +extern int _inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins); +#endif /* you must have an "inc_local" reference */ static inline void drbd_get_syncer_progress(drbd_dev* mdev, @@ -1827,10 +1895,14 @@ static inline void drbd_get_syncer_progress(drbd_dev* mdev, /* >> 10 to prevent overflow, * +1 to prevent division by zero */ if (*bits_left > mdev->rs_total) { - /* doh. logic bug somewhere. - * for now, just try to prevent in-kernel buffer overflow. + /* doh. maybe a logic bug somewhere. + * may also be just a race condition + * between this and a disconnect during sync. + * for now, just prevent in-kernel buffer overflow. */ - ERR("logic bug? 
rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + smp_rmb(); + WARN("cs:%s rs_left=%lu > rs_total=%lu (rs_failed %lu)\n", + conns_to_name(mdev->state.conn), *bits_left, mdev->rs_total, mdev->rs_failed); *per_mil_done = 0; } else { @@ -1857,13 +1929,24 @@ static inline int drbd_get_max_buffers(drbd_dev* mdev) } static inline int __inc_ap_bio_cond(drbd_dev* mdev) { + const unsigned int cs = mdev->state.conn; + const unsigned int ds = mdev->state.disk; int mxb = drbd_get_max_buffers(mdev); + if (mdev->state.susp) return 0; - if (mdev->state.conn == WFBitMapS) return 0; - if (mdev->state.conn == WFBitMapT) return 0; + if (test_bit(SUSPEND_IO, &mdev->flags)) return 0; + + /* to avoid deadlock or bitmap corruption, we need to lock out + * application io during attaching and bitmap exchange */ + if (Attaching <= ds && ds <= Negotiating) + return 0; + if (cs == WFBitMapS || cs == WFBitMapT || cs == WFReportParams) + return 0; + /* since some older kernels don't have atomic_add_unless, * and we are within the spinlock anyways, we have this workaround. */ if (atomic_read(&mdev->ap_bio_cnt) > mxb) return 0; + if (test_bit(BITMAP_IO, &mdev->flags)) return 0; atomic_inc(&mdev->ap_bio_cnt); return 1; } @@ -1903,6 +1986,19 @@ static inline void dec_ap_bio(drbd_dev* mdev) D_ASSERT(ap_bio>=0); if (ap_bio < mxb) wake_up(&mdev->misc_wait); + if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) { + if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags)) + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } +} + +static inline void drbd_set_ed_uuid(drbd_dev *mdev, u64 val) +{ + mdev->ed_uuid = val; + + MTRACE(TraceTypeUuid,TraceLvlMetrics, + INFO(" exposed data uuid now %016llX\n",val); + ); } static inline int seq_cmp(u32 a, u32 b) @@ -1941,7 +2037,8 @@ static inline int drbd_queue_order_type(drbd_dev* mdev) # define QUEUE_ORDERED_NONE 0 # define QUEUE_ORDERED_TAG 1 # define QUEUE_ORDERED_FLUSH 2 -# warning "TCQ code disabled at compile time." 
+/* # warning "TCQ code disabled at compile time." + * no need to warn about, this is all dead code anyways. */ rv = QUEUE_ORDERED_NONE; // Kernels before 2.6.12 had not had TCQ support. #endif return rv; @@ -1969,13 +2066,9 @@ static inline void drbd_blk_run_queue(struct request_queue *q) static inline void drbd_kick_lo(drbd_dev *mdev) { - if (!mdev->bc->backing_bdev) { - if (DRBD_ratelimit(5*HZ,5)) { - ERR("backing_bdev==NULL in drbd_kick_lo! The following call trace is for debuggin purposes only. Don't worry.\n"); - dump_stack(); - } - } else { + if (inc_local(mdev)) { drbd_blk_run_queue(bdev_get_queue(mdev->bc->backing_bdev)); + dec_local(mdev); } } #endif diff --git a/ubuntu/block/drbd/drbd_main.c b/ubuntu/block/drbd/drbd_main.c index 9abaef8..c4bb5e1 100644 --- a/ubuntu/block/drbd/drbd_main.c +++ b/ubuntu/block/drbd/drbd_main.c @@ -65,6 +65,7 @@ struct after_state_chg_work { drbd_state_t os; drbd_state_t ns; enum chg_state_flags flags; + struct completion *done; }; int drbdd_init(struct Drbd_thread*); @@ -91,13 +92,13 @@ MODULE_ALIAS_BLOCKDEV_MAJOR(LANANA_DRBD_MAJOR); MODULE_PARM_DESC(allow_oos, "DONT USE!"); /* thanks to these macros, if compiled into the kernel (not-module), * this becomes the boot parameter drbd.minor_count */ -module_param(minor_count, int,0444); +module_param(minor_count, uint,0444); module_param(allow_oos, bool,0); #ifdef DRBD_ENABLE_FAULTS int enable_faults = 0; int fault_rate; -int fault_count; +static int fault_count; int fault_devs; module_param(enable_faults,int,0664); // bitmap of enabled faults module_param(fault_rate,int,0664); // fault rate % value - applies to all enabled faults @@ -106,8 +107,8 @@ module_param(fault_devs,int,0644); // bitmap of devices to insert faults on #endif // module parameter, defined -int major_nr = LANANA_DRBD_MAJOR; -int minor_count = 32; +unsigned int major_nr = LANANA_DRBD_MAJOR; +unsigned int minor_count = 32; int allow_oos = 0; @@ -129,9 +130,6 @@ char usermode_helper[80] = "/sbin/drbdadm"; 
module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0644); -// global panic flag -volatile int drbd_did_panic = 0; - /* in 2.6.x, our device mapping and config info contains our virtual gendisks * as member "struct gendisk *vdisk;" */ @@ -161,6 +159,25 @@ STATIC struct block_device_operations drbd_ops = { #define ARRY_SIZE(A) (sizeof(A)/sizeof(A[0])) +#ifdef __CHECKER__ +/* When checking with sparse, and this is an inline function, sparse will + give tons of false positives. When this is a real functions sparse works. + */ +int _inc_local_if_state(drbd_dev* mdev, drbd_disks_t mins) +{ + int io_allowed; + + atomic_inc(&mdev->local_cnt); + io_allowed = (mdev->state.disk >= mins ); + if( !io_allowed ) { + if(atomic_dec_and_test(&mdev->local_cnt)) + wake_up(&mdev->misc_wait); + } + return io_allowed; +} + +#endif + /************************* The transfer log start */ STATIC int tl_init(drbd_dev *mdev) { @@ -170,7 +187,7 @@ STATIC int tl_init(drbd_dev *mdev) if(!b) return 0; INIT_LIST_HEAD(&b->requests); INIT_LIST_HEAD(&b->w.list); - b->next = 0; + b->next = NULL; b->br_number = 4711; b->n_req = 0; b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ @@ -205,7 +222,7 @@ void _tl_add_barrier(drbd_dev *mdev, struct drbd_barrier *new) INIT_LIST_HEAD(&new->requests); INIT_LIST_HEAD(&new->w.list); new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */ - new->next=0; + new->next = NULL; new->n_req=0; newest_before = mdev->newest_barrier; @@ -355,7 +372,7 @@ int drbd_io_error(drbd_dev* mdev, int forcedetach) spin_lock_irqsave(&mdev->req_lock,flags); if( (send = (mdev->state.disk == Failed)) ) { - _drbd_set_state(_NS(mdev,disk,Diskless),ChgStateHard); + _drbd_set_state(_NS(mdev, disk, Diskless), ChgStateHard, NULL); } spin_unlock_irqrestore(&mdev->req_lock,flags); @@ -408,7 +425,7 @@ int drbd_change_state(drbd_dev* mdev, enum chg_state_flags f, spin_lock_irqsave(&mdev->req_lock,flags); os = 
mdev->state; ns.i = (os.i & ~mask.i) | val.i; - rv = _drbd_set_state(mdev, ns, f); + rv = _drbd_set_state(mdev, ns, f, NULL); ns = mdev->state; spin_unlock_irqrestore(&mdev->req_lock,flags); @@ -424,7 +441,7 @@ STATIC int is_valid_state(drbd_dev* mdev, drbd_state_t ns); STATIC int is_valid_state_transition(drbd_dev*, drbd_state_t, drbd_state_t); STATIC int drbd_send_state_req(drbd_dev *, drbd_state_t, drbd_state_t); -set_st_err_t _req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val) +STATIC set_st_err_t _req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val) { drbd_state_t os,ns; unsigned long flags; @@ -453,19 +470,19 @@ set_st_err_t _req_st_cond(drbd_dev* mdev,drbd_state_t mask, drbd_state_t val) return rv; } -/** - * _drbd_request_state: - * This function is the most gracefull way to change state. For some state - * transition this function even does a cluster wide transaction. - * It has a cousin named drbd_request_state(), which is always verbose. - */ -int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, - enum chg_state_flags f) +STATIC int drbd_req_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, + enum chg_state_flags f) { + struct completion done; unsigned long flags; drbd_state_t os,ns; int rv; + init_completion(&done); + + if (f & ChgSerialize) + mutex_lock(&mdev->state_mutex); + spin_lock_irqsave(&mdev->req_lock,flags); os = mdev->state; ns.i = (os.i & ~mask.i) | val.i; @@ -477,7 +494,7 @@ int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, if( rv < SS_Success ) { if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); - return rv; + goto abort; } drbd_state_lock(mdev); @@ -485,7 +502,7 @@ int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, drbd_state_unlock(mdev); rv = SS_CW_FailedByPeer; if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); - return rv; + goto abort; } wait_event(mdev->state_wait,(rv=_req_st_cond(mdev,mask,val))); @@ -494,18 
+511,46 @@ int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, // nearly dead code. drbd_state_unlock(mdev); if( f & ChgStateVerbose ) print_st_err(mdev,os,ns,rv); - return rv; + goto abort; } + spin_lock_irqsave(&mdev->req_lock,flags); os = mdev->state; ns.i = (os.i & ~mask.i) | val.i; + rv = _drbd_set_state(mdev, ns, f, &done); drbd_state_unlock(mdev); + } else { + rv = _drbd_set_state(mdev, ns, f, &done); } - rv = _drbd_set_state(mdev, ns, f); - ns = mdev->state; spin_unlock_irqrestore(&mdev->req_lock,flags); + if (f & ChgWaitComplete && rv == SS_Success) { + D_ASSERT(current != mdev->worker.task); + wait_for_completion(&done); + } + + abort: + if (f & ChgSerialize) + mutex_unlock(&mdev->state_mutex); + + return rv; +} + +/** + * _drbd_request_state: + * This function is the most gracefull way to change state. For some state + * transition this function even does a cluster wide transaction. + * It has a cousin named drbd_request_state(), which is always verbose. 
+ */ +int _drbd_request_state(drbd_dev* mdev, drbd_state_t mask, drbd_state_t val, + enum chg_state_flags f) +{ + int rv; + + wait_event(mdev->state_wait, + (rv = drbd_req_state(mdev, mask, val, f)) != SS_InTransientState); + return rv; } @@ -528,6 +573,7 @@ STATIC void print_st(drbd_dev* mdev, char *name, drbd_state_t ns) void print_st_err(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, int err) { + if (err == SS_InTransientState) return; ERR("State change failed: %s\n",set_st_err_name(err)); print_st(mdev," state",os); print_st(mdev,"wanted",ns); @@ -619,10 +665,17 @@ STATIC int is_valid_state_transition(drbd_dev* mdev,drbd_state_t ns,drbd_state_t if ( ns.disk == Outdated && os.disk < Outdated && os.disk != Attaching) rv=SS_LowerThanOutdated; + if (ns.conn == Disconnecting && os.conn == Unconnected) + rv = SS_InTransientState; + + if (ns.conn == os.conn && ns.conn == WFReportParams) + rv = SS_InTransientState; + return rv; } -int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) +int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns, enum chg_state_flags flags, + struct completion *done) { drbd_state_t os; int rv=SS_Success, warn_sync_abort=0; @@ -652,13 +705,14 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) ns.conn = os.conn; } - /* Dissalow network errors (+TearDown) to overwrite each other. - Dissalow network errors to overwrite the Disconnecting state. 
*/ - if( ( (os.conn >= Timeout && os.conn <= TearDown) - || os.conn == Disconnecting ) && - ns.conn >= Timeout && ns.conn <= TearDown ) { + /* After a network error (+TearDown) only Unconnected or Disconnecting can follow */ + if (os.conn >= Timeout && os.conn <= TearDown && + ns.conn != Unconnected && ns.conn != Disconnecting) + ns.conn = os.conn; + + /* After Disconnecting only StandAlone may follow */ + if (os.conn == Disconnecting && ns.conn != StandAlone) ns.conn = os.conn; - } if( ns.conn < Connected ) { ns.peer_isp = 0; @@ -676,8 +730,8 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) ns.conn = Connected; } - if( ns.conn >= Connected && - ( ns.disk == Consistent || ns.disk == Outdated ) ) { + if (ns.conn != os.conn && ns.conn >= Connected && + (ns.disk == Consistent || ns.disk == Outdated)) { switch(ns.conn) { case WFBitMapT: case PausedSyncT: @@ -699,8 +753,8 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) } } - if( ns.conn >= Connected && - ( ns.pdsk == Consistent || ns.pdsk == Outdated ) ) { + if (ns.conn != os.conn && ns.conn >= Connected && + (ns.pdsk == Consistent || ns.pdsk == Outdated)) { switch(ns.conn) { case Connected: case WFBitMapT: @@ -723,9 +777,17 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) } /* Connection breaks down before we finished "Negotiating" */ - if (ns.conn < Connected && ns.disk == Negotiating ) { - ns.disk = mdev->new_state_tmp.disk; - ns.pdsk = mdev->new_state_tmp.pdsk; + if (ns.conn < Connected && ns.disk == Negotiating && + inc_local_if_state(mdev, Negotiating)) { + if (mdev->ed_uuid == mdev->bc->md.uuid[Current]) { + ns.disk = mdev->new_state_tmp.disk; + ns.pdsk = mdev->new_state_tmp.pdsk; + } else { + ALERT("Connection lost while negotiating, no data!\n"); + ns.disk = Diskless; + ns.pdsk = DUnknown; + } + dec_local(mdev); } if( fp == Stonith ) { @@ -844,6 +906,8 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t 
ns,enum chg_state_flags flags) mdev->bc->md.flags = mdf; drbd_md_mark_dirty(mdev); } + if (os.disk < Consistent && ns.disk >= Consistent) + drbd_set_ed_uuid(mdev, mdev->bc->md.uuid[Current]); dec_local(mdev); } @@ -854,7 +918,7 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) // Receiver should clean up itself if (os.conn != Disconnecting && ns.conn == Disconnecting) - drbd_thread_signal(&mdev->receiver); + drbd_thread_stop_nowait(&mdev->receiver); // Now the receiver finished cleaning up itself, it should die if (os.conn != StandAlone && ns.conn == StandAlone) @@ -871,6 +935,7 @@ int _drbd_set_state(drbd_dev* mdev, drbd_state_t ns,enum chg_state_flags flags) ascw->ns = ns; ascw->flags = flags; ascw->w.cb = w_after_state_ch; + ascw->done = done; drbd_queue_work(&mdev->data.work, &ascw->w); } else { WARN("Could not kmalloc an ascw\n"); @@ -885,11 +950,33 @@ STATIC int w_after_state_ch(drbd_dev *mdev, struct drbd_work *w, int unused) ascw = (struct after_state_chg_work*) w; after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags); + if (ascw->flags & ChgWaitComplete) { + D_ASSERT(ascw->done != NULL); + complete(ascw->done); + } kfree(ascw); return 1; } +STATIC void abw_start_sync(drbd_dev* mdev, int rv) +{ + if (rv) { + ERR("Writing the bitmap failed not starting resync.\n"); + _drbd_request_state(mdev, NS(conn, Connected), ChgStateVerbose); + return; + } + + switch (mdev->state.conn) { + case StartingSyncT: + _drbd_request_state(mdev, NS(conn, WFSyncUUID), ChgStateVerbose); + break; + case StartingSyncS: + drbd_start_resync(mdev, SyncSource); + break; + } +} + STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, enum chg_state_flags flags) { @@ -911,6 +998,10 @@ STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, /* Inform userspace about the change... 
*/ drbd_bcast_state(mdev, ns); + if (!(os.role == Primary && os.disk < UpToDate && os.pdsk < UpToDate) && + (ns.role == Primary && ns.disk < UpToDate && ns.pdsk < UpToDate)) + drbd_khelper(mdev, "pri-on-incon-degr"); + /* Here we have the actions that are performed after a state change. This function might sleep */ @@ -921,36 +1012,29 @@ STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, (os.conn < Connected && ns.conn >= Connected) ) { tl_clear(mdev); spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev,susp,0),ChgStateVerbose); + _drbd_set_state(_NS(mdev, susp, 0), ChgStateVerbose, NULL); spin_unlock_irq(&mdev->req_lock); } } // Do not change the order of the if above and below... if (os.conn != WFBitMapS && ns.conn == WFBitMapS) { - /* compare with drbd_make_request_common, - * wait_event and inc_ap_bio. - * Note: we may lose connection whilst waiting here. - * no worries though, should work out ok... */ - wait_event(mdev->misc_wait, - mdev->state.conn != WFBitMapS || - !atomic_read(&mdev->ap_bio_cnt)); - drbd_bm_lock(mdev); // { - drbd_send_bitmap(mdev); - drbd_bm_unlock(mdev); // } + drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL); } /* Lost contact to peer's copy of the data */ - if ( (os.pdsk>=Inconsistent && os.pdsk!=DUnknown && os.pdsk!=Outdated) && - (ns.pdskp_uuid ) { - kfree(mdev->p_uuid); - mdev->p_uuid = NULL; - } + if ( (os.pdsk >= Inconsistent && + os.pdsk != DUnknown && + os.pdsk != Outdated) + && (ns.pdsk < Inconsistent || + ns.pdsk == DUnknown || + ns.pdsk == Outdated) ) { + /* FIXME race with drbd_sync_handshake accessing this! */ + kfree(mdev->p_uuid); + mdev->p_uuid = NULL; if (inc_local(mdev)) { - if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 ) { - /* Only do it if we have not yet done it... 
*/ + if (ns.role == Primary && mdev->bc->md.uuid[Bitmap] == 0 && + ns.disk >= UpToDate) drbd_uuid_new_current(mdev); - } if (ns.peer == Primary) { /* Note: The condition ns.peer == Primary implies that we are connected. Otherwise it would @@ -958,6 +1042,7 @@ STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, /* A FullSync is required after a primary detached from its disk! */ _drbd_uuid_new_current(mdev); + drbd_send_uuids(mdev); } dec_local(mdev); } @@ -1013,44 +1098,15 @@ STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, /* We are in the progress to start a full sync... */ if ( ( os.conn != StartingSyncT && ns.conn == StartingSyncT ) || ( os.conn != StartingSyncS && ns.conn == StartingSyncS ) ) { - - drbd_bm_lock(mdev); // racy... - - drbd_md_set_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); - - drbd_bm_set_all(mdev); - drbd_bm_write(mdev); - - drbd_md_clear_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); - - drbd_bm_unlock(mdev); - - if (ns.conn == StartingSyncT) { - spin_lock_irq(&mdev->req_lock); - _drbd_set_state(_NS(mdev,conn,WFSyncUUID),ChgStateVerbose); - spin_unlock_irq(&mdev->req_lock); - } else /* StartingSyncS */ { - drbd_start_resync(mdev,SyncSource); - } + INFO("Queueing bitmap io: about to start a forced full sync\n"); + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, &abw_start_sync); } /* We are invalidating our self... */ if ( os.conn < Connected && ns.conn < Connected && os.disk > Inconsistent && ns.disk == Inconsistent ) { - drbd_bm_lock(mdev); // racy... 
- - drbd_md_set_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); - - drbd_bm_set_all(mdev); - drbd_bm_write(mdev); - - drbd_md_clear_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); - - drbd_bm_unlock(mdev); + INFO("Queueing bitmap io: invalidate forced full sync\n"); + drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL); } if ( os.disk > Diskless && ns.disk == Diskless ) { @@ -1059,9 +1115,20 @@ STATIC void after_state_ch(drbd_dev* mdev, drbd_state_t os, drbd_state_t ns, not increase... It will reach zero */ wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt)); - drbd_free_bc(mdev->bc); mdev->bc = NULL; - lc_free(mdev->resync); mdev->resync = NULL; - lc_free(mdev->act_log); mdev->act_log = NULL; + lc_free(mdev->resync); + mdev->resync = NULL; + lc_free(mdev->act_log); + mdev->act_log = NULL; + __no_warn(local, drbd_free_bc(mdev->bc);); + wmb(); /* see begin of drbd_nl_disk_conf() */ + __no_warn(local, mdev->bc = NULL;); + } + + /* Disks got bigger while they were detached */ + if (ns.disk > Negotiating && ns.pdsk > Negotiating && + test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) { + if (ns.conn == Connected) + resync_after_online_grow(mdev); } // A resync finished or aborted, wake paused devices... @@ -1091,6 +1158,10 @@ STATIC int drbd_thread_setup(void* arg) struct Drbd_thread *thi = (struct Drbd_thread *) arg; drbd_dev *mdev = thi->mdev; int retval; + const char *me = + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE"; daemonize("drbd_thread"); D_ASSERT(get_t_state(thi) == Running); @@ -1099,29 +1170,41 @@ STATIC int drbd_thread_setup(void* arg) thi->task = current; smp_mb(); spin_unlock(&thi->t_lock); - complete(&thi->startstop); // notify: thi->task is set. + complete(&thi->startstop); /* notify: thi->task is set. 
*/ - while(1) { - retval = thi->function(thi); - if(get_t_state(thi) != Restarting) break; +restart: + retval = thi->function(thi); + + spin_lock(&thi->t_lock); + + /* if the receiver has been "Exiting", the last thing it did + * was set the conn state to "StandAlone", + * if now a re-connect request comes in, conn state goes Unconnected, + * and receiver thread will be "started". + * drbd_thread_start needs to set "Restarting" in that case. + * t_state check and assignement needs to be within the same spinlock, + * so either thread_start sees Exiting, and can remap to Restarting, + * or thread_start see None, and can proceed as normal. + */ + + if (thi->t_state == Restarting) { + INFO("Restarting %s thread\n", me); thi->t_state = Running; + spin_unlock(&thi->t_lock); + goto restart; } - spin_lock(&thi->t_lock); thi->task = NULL; thi->t_state = None; smp_mb(); spin_unlock(&thi->t_lock); - // THINK maybe two different completions? - complete(&thi->startstop); // notify: thi->task unset. + /* THINK maybe two different completions? */ + complete(&thi->startstop); /* notify: thi->task unset. */ - INFO("Terminating %s thread\n", - thi == &mdev->receiver ? "receiver" : - thi == &mdev->asender ? "asender" : - thi == &mdev->worker ? "worker" : "NONSENSE"); + INFO("Terminating %s thread\n", me); - // Release mod reference taken when thread was started + /* Release mod reference taken when thread was started */ module_put(THIS_MODULE); return retval; } @@ -1140,17 +1223,19 @@ int drbd_thread_start(struct Drbd_thread *thi) { int pid; drbd_dev *mdev = thi->mdev; + const char *me = + thi == &mdev->receiver ? "receiver" : + thi == &mdev->asender ? "asender" : + thi == &mdev->worker ? "worker" : "NONSENSE"; spin_lock(&thi->t_lock); - if (thi->t_state == None) { + switch (thi->t_state) { + case None: INFO("Starting %s thread (from %s [%d])\n", - thi == &mdev->receiver ? "receiver" : - thi == &mdev->asender ? "asender" : - thi == &mdev->worker ? 
"worker" : "NONSENSE", - current->comm, current->pid); + me, current->comm, current->pid); - // Get ref on module for thread - this is released when thread exits + /* Get ref on module for thread - this is released when thread exits */ if (!try_module_get(THIS_MODULE)) { ERR("Failed to get module reference in drbd_thread_start\n"); spin_unlock(&thi->t_lock); @@ -1175,8 +1260,15 @@ int drbd_thread_start(struct Drbd_thread *thi) D_ASSERT(thi->task); D_ASSERT(get_t_state(thi) == Running); - } else { + break; + case Exiting: + thi->t_state = Restarting; + INFO("Restarting %s thread (from %s [%d])\n", + me, current->comm, current->pid); + case Running: + case Restarting: spin_unlock(&thi->t_lock); + break; } return TRUE; @@ -1225,22 +1317,6 @@ void _drbd_thread_stop(struct Drbd_thread *thi, int restart,int wait) } } -void drbd_thread_signal(struct Drbd_thread *thi) -{ - spin_lock(&thi->t_lock); - - if (thi->t_state == None) { - spin_unlock(&thi->t_lock); - return; - } - - if (thi->task != current) { - force_sig(DRBD_SIGKILL,thi->task); - } - - spin_unlock(&thi->t_lock); -} - /* the appropriate socket mutex must be held already */ int _drbd_send_cmd(drbd_dev *mdev, struct socket *sock, Drbd_Packet_Cmd cmd, Drbd_Header *h, @@ -1343,13 +1419,11 @@ int drbd_send_protocol(drbd_dev *mdev) (Drbd_Header*)&p,sizeof(p)); } -/* Hold sock mutex before calling this */ -int _drbd_send_uuids(drbd_dev *mdev) +int drbd_send_uuids(drbd_dev *mdev) { Drbd_GenCnt_Packet p; - int i, ok=0; + int i; u64 uuid_flags = 0; - struct socket *sock = mdev->data.socket; if(!inc_local_if_state(mdev,Negotiating)) return 1; // ok. @@ -1364,25 +1438,13 @@ int _drbd_send_uuids(drbd_dev *mdev) p.uuid[UUID_SIZE] = cpu_to_be64(mdev->comm_bm_set); uuid_flags |= mdev->net_conf->want_lose ? 1 : 0; uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0; + uuid_flags |= mdev->new_state_tmp.disk == Inconsistent ? 
4 : 0; p.uuid[UUID_FLAGS] = cpu_to_be64(uuid_flags); dec_local(mdev); - if (likely(sock != NULL)) - ok = _drbd_send_cmd(mdev, sock, ReportUUIDs, - (Drbd_Header*)&p, sizeof(p), 0); - - return ok; -} - -int drbd_send_uuids(drbd_dev *mdev) -{ - int ok; - down(&mdev->data.mutex); - ok = _drbd_send_uuids(mdev); - up(&mdev->data.mutex); - - return ok; + return drbd_send_cmd(mdev,USE_DATA_SOCKET,ReportUUIDs, + (Drbd_Header*)&p,sizeof(p)); } int drbd_send_sync_uuid(drbd_dev *mdev, u64 val) @@ -1426,22 +1488,6 @@ int drbd_send_sizes(drbd_dev *mdev) return ok; } -/* Hold socket mutex before calling this */ -int _drbd_send_state(drbd_dev *mdev) -{ - struct socket *sock = mdev->data.socket; - Drbd_State_Packet p; - int ok = 0; - - p.state = cpu_to_be32(mdev->state.i); - - if (likely(sock != NULL)) - ok = _drbd_send_cmd(mdev, sock, ReportState, - (Drbd_Header*)&p, sizeof(p), 0); - - return ok; -} - /** * drbd_send_state: * Informs the peer about our state. Only call it when @@ -1451,14 +1497,24 @@ int _drbd_send_state(drbd_dev *mdev) */ int drbd_send_state(drbd_dev *mdev) { - int ok; + struct socket *sock; + Drbd_State_Packet p; + int ok = 0; /* Grab state lock so we wont send state if we're in the middle * of a cluster wide state change on another thread */ drbd_state_lock(mdev); down(&mdev->data.mutex); - ok = _drbd_send_state(mdev); + + p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */ + sock = mdev->data.socket; + + if (likely(sock != NULL)) { + ok = _drbd_send_cmd(mdev, sock, ReportState, + (Drbd_Header*)&p, sizeof(p), 0); + } + up(&mdev->data.mutex); drbd_state_unlock(mdev); @@ -1498,23 +1554,32 @@ int _drbd_send_bitmap(drbd_dev *mdev) ERR_IF(!mdev->bitmap) return FALSE; + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? 
*/ + p = (Drbd_Header *) __get_free_page(GFP_NOIO); + if (!p) { + ERR("failed to allocate one page buffer in %s\n", __func__ ); + return FALSE; + } bm_words = drbd_bm_words(mdev); - p = vmalloc(PAGE_SIZE); // sleeps. cannot fail. - buffer = (unsigned long*)p->payload; - - if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { - drbd_bm_set_all(mdev); - drbd_bm_write(mdev); - if (unlikely(mdev->state.disk <= Failed )) { - /* write_bm did fail! Leave full sync flag set in Meta Data - * but otherwise process as per normal - need to tell other - * side that a full resync is required! */ - ERR("Failed to write bitmap to disk!\n"); - } - else { - drbd_md_clear_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); + buffer = (unsigned long *)p->payload; + + if (inc_local(mdev)) { + if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { + INFO("Writing the whole bitmap, MDF_FullSync was set.\n"); + drbd_bm_set_all(mdev); + if (drbd_bm_write(mdev)) { + /* write_bm did fail! Leave full sync flag set in Meta Data + * but otherwise process as per normal - need to tell other + * side that a full resync is required! */ + ERR("Failed to write bitmap to disk!\n"); + } + else { + drbd_md_clear_flag(mdev,MDF_FullSync); + drbd_md_sync(mdev); + } } + dec_local(mdev); } /* @@ -1532,19 +1597,19 @@ int _drbd_send_bitmap(drbd_dev *mdev) bm_i += num_words; } while (ok && want); - vfree(p); + free_page((unsigned long) p); return ok; } int drbd_send_bitmap(drbd_dev *mdev) { - int ok; + int err; if (!drbd_get_data_sock(mdev)) - return 0; - ok=_drbd_send_bitmap(mdev); + return -1; + err = !_drbd_send_bitmap(mdev); drbd_put_data_sock(mdev); - return ok; + return err; } int drbd_send_b_ack(drbd_dev *mdev, u32 barrier_nr,u32 set_size) @@ -1670,7 +1735,7 @@ STATIC int we_should_drop_the_connection(drbd_dev *mdev, struct socket *sock) XFS seems to have problems, still, it submits pages with page_count == 0! As a workaround, we disable sendpage on pages with page_count == 0 or PageSlab. 
*/ -int _drbd_no_send_page(drbd_dev *mdev, struct page *page, +STATIC int _drbd_no_send_page(drbd_dev *mdev, struct page *page, int offset, size_t size) { int ret; @@ -1893,7 +1958,7 @@ int drbd_send(drbd_dev *mdev, struct socket *sock, iov.iov_base = buf; iov.iov_len = size; - msg.msg_name = 0; + msg.msg_name = NULL; msg.msg_namelen = 0; #if !HAVE_KERNEL_SENDMSG msg.msg_iov = &iov; @@ -2047,7 +2112,7 @@ STATIC void drbd_unplug_fn(struct request_queue *q) if(mdev->state.disk >= Inconsistent) drbd_kick_lo(mdev); } -void drbd_set_defaults(drbd_dev *mdev) +STATIC void drbd_set_defaults(drbd_dev *mdev) { mdev->sync_conf.after = DRBD_AFTER_DEF; mdev->sync_conf.rate = DRBD_RATE_DEF; @@ -2060,7 +2125,7 @@ void drbd_set_defaults(drbd_dev *mdev) 0 } }; } -void drbd_init_set_defaults(drbd_dev *mdev) +STATIC void drbd_init_set_defaults(drbd_dev *mdev) { // the memset(,0,) did most of this // note: only assignments, no allocation in here @@ -2090,6 +2155,7 @@ void drbd_init_set_defaults(drbd_dev *mdev) init_MUTEX(&mdev->meta.mutex); sema_init(&mdev->data.work.s,0); sema_init(&mdev->meta.work.s,0); + mutex_init(&mdev->state_mutex); spin_lock_init(&mdev->data.work.q_lock); spin_lock_init(&mdev->meta.work.q_lock); @@ -2218,7 +2284,7 @@ void drbd_mdev_cleanup(drbd_dev *mdev) } -void drbd_destroy_mempools(void) +STATIC void drbd_destroy_mempools(void) { struct page *page; @@ -2244,7 +2310,7 @@ void drbd_destroy_mempools(void) return; } -int drbd_create_mempools(void) +STATIC int drbd_create_mempools(void) { struct page *page; const int number = (DRBD_MAX_SEGMENT_SIZE/PAGE_SIZE) * minor_count; @@ -2311,7 +2377,7 @@ STATIC struct notifier_block drbd_notifier = { }; -STATIC void __exit drbd_cleanup(void) +STATIC void drbd_cleanup(void) { int i, rr; @@ -2321,8 +2387,8 @@ STATIC void __exit drbd_cleanup(void) if (minor_table) { if (drbd_proc) - remove_proc_entry("drbd",&proc_root); - i=minor_count; + remove_proc_entry("drbd", NULL); + i = minor_count; while (i--) { drbd_dev *mdev = 
minor_to_mdev(i); struct gendisk **disk = &mdev->vdisk; @@ -2336,7 +2402,7 @@ STATIC void __exit drbd_cleanup(void) put_disk(*disk); *disk = NULL; } - if (*q) blk_put_queue(*q); + if (*q) blk_cleanup_queue(*q); *q = NULL; D_ASSERT(mdev->open_cnt == 0); @@ -2558,7 +2624,7 @@ int __init drbd_init(void) /* * register with procfs */ - drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , &proc_root); + drbd_proc = create_proc_entry("drbd", S_IFREG | S_IRUGO , NULL); if (!drbd_proc) { printk(KERN_ERR DEVICE_NAME": unable to register proc file\n"); @@ -2606,11 +2672,11 @@ void drbd_free_sock(drbd_dev *mdev) { if (mdev->data.socket) { sock_release(mdev->data.socket); - mdev->data.socket = 0; + mdev->data.socket = NULL; } if (mdev->meta.socket) { sock_release(mdev->meta.socket); - mdev->meta.socket = 0; + mdev->meta.socket = NULL; } } @@ -2622,8 +2688,9 @@ void drbd_free_resources(drbd_dev *mdev) mdev->cram_hmac_tfm = NULL; } drbd_free_sock(mdev); - drbd_free_bc(mdev->bc); - mdev->bc=0; + __no_warn(local, + drbd_free_bc(mdev->bc); + mdev->bc = NULL;); } /*********************************/ @@ -2801,7 +2868,7 @@ void drbd_md_mark_dirty(drbd_dev *mdev) } -STATIC void drbd_uuid_move_history(drbd_dev *mdev) +STATIC void drbd_uuid_move_history(drbd_dev *mdev) __must_hold(local) { int i; @@ -2814,7 +2881,7 @@ STATIC void drbd_uuid_move_history(drbd_dev *mdev) } } -void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) +void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) __must_hold(local) { if(idx == Current) { if (mdev->state.role == Primary) { @@ -2822,6 +2889,7 @@ void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) } else { val &= ~((u64)1); } + drbd_set_ed_uuid(mdev, val); } mdev->bc->md.uuid[idx] = val; @@ -2834,7 +2902,7 @@ void _drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) } -void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) +void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) __must_hold(local) { if(mdev->bc->md.uuid[idx]) { drbd_uuid_move_history(mdev); @@ 
-2853,7 +2921,7 @@ void drbd_uuid_set(drbd_dev *mdev, int idx, u64 val) * sync upon next connect. Aditionally the full sync is also requested * by the FullSync bit. */ -void _drbd_uuid_new_current(drbd_dev *mdev) +void _drbd_uuid_new_current(drbd_dev *mdev) __must_hold(local) { u64 uuid; @@ -2874,8 +2942,10 @@ void _drbd_uuid_new_current(drbd_dev *mdev) * Creates a new current UUID, and rotates the old current UUID into * the bitmap slot. Causes an incremental resync upon next connect. */ -void drbd_uuid_new_current(drbd_dev *mdev) +void drbd_uuid_new_current(drbd_dev *mdev) __must_hold(local) { + u64 val; + INFO("Creating new current UUID\n"); D_ASSERT(mdev->bc->md.uuid[Bitmap] == 0); mdev->bc->md.uuid[Bitmap] = mdev->bc->md.uuid[Current]; @@ -2883,21 +2953,11 @@ void drbd_uuid_new_current(drbd_dev *mdev) drbd_print_uuid(mdev,Bitmap); ); - get_random_bytes(&mdev->bc->md.uuid[Current], sizeof(u64)); - if (mdev->state.role == Primary) { - mdev->bc->md.uuid[Current] |= 1; - } else { - mdev->bc->md.uuid[Current] &= ~((u64)1); - } - - MTRACE(TraceTypeUuid,TraceLvlSummary, - drbd_print_uuid(mdev,Current); - ); - - drbd_md_mark_dirty(mdev); + get_random_bytes(&val, sizeof(u64)); + _drbd_uuid_set(mdev, Current, val); } -void drbd_uuid_set_bm(drbd_dev *mdev, u64 val) +void drbd_uuid_set_bm(drbd_dev *mdev, u64 val) __must_hold(local) { if( mdev->bc->md.uuid[Bitmap]==0 && val==0 ) return; @@ -2923,8 +2983,109 @@ void drbd_uuid_set_bm(drbd_dev *mdev, u64 val) drbd_md_mark_dirty(mdev); } +/** + * drbd_bmio_set_n_write: + * Is an io_fn for drbd_queue_bitmap_io() or drbd_bitmap_io() that sets + * all bits in the bitmap and writes the whole bitmap to stable storage. 
+ */ +int drbd_bmio_set_n_write(drbd_dev *mdev) +{ + int rv = -EIO; + + if (inc_local_if_state(mdev, Attaching)) { + drbd_md_set_flag(mdev, MDF_FullSync); + drbd_md_sync(mdev); + drbd_bm_set_all(mdev); + + rv = drbd_bm_write(mdev); + + if (!rv) { + drbd_md_clear_flag(mdev, MDF_FullSync); + drbd_md_sync(mdev); + } + + dec_local(mdev); + } -void drbd_md_set_flag(drbd_dev *mdev, int flag) + return rv; +} + +STATIC int w_bitmap_io(drbd_dev *mdev, struct drbd_work *w, int unused) +{ + struct bm_io_work *work = (struct bm_io_work *)w; + int rv; + + D_ASSERT(atomic_read(&mdev->ap_bio_cnt)==0); + + drbd_bm_lock(mdev); + rv = work->io_fn(mdev); + drbd_bm_unlock(mdev); + + clear_bit(BITMAP_IO, &mdev->flags); + wake_up(&mdev->misc_wait); + + if (work->done) work->done(mdev, rv); + + return 1; +} + +/** + * drbd_queue_bitmap_io: + * Queues an IO operation on the whole bitmap. + * While IO on the bitmap happens we freeze appliation IO thus we ensure + * that drbd_set_out_of_sync() can not be called. This function might be + * called from the worker thread and other contexts. + */ +void drbd_queue_bitmap_io(drbd_dev *mdev, + int (*io_fn)(drbd_dev *), + void (*done)(drbd_dev *, int)) +{ + unsigned long flags; + + D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); + + mdev->bm_io_work.w.cb = w_bitmap_io; + mdev->bm_io_work.io_fn = io_fn; + mdev->bm_io_work.done = done; + + spin_lock_irqsave(&mdev->req_lock, flags); + clear_bit(BITMAP_IO_QUEUED, &mdev->flags); + set_bit(BITMAP_IO, &mdev->flags); + if (atomic_read(&mdev->ap_bio_cnt) == 0) { + set_bit(BITMAP_IO_QUEUED, &mdev->flags); + drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w); + } + spin_unlock_irqrestore(&mdev->req_lock, flags); +} + +/** + * drbd_bitmap_io: + * Does an IO operation on the bitmap, freezing application IO while that + * IO operations runs. This functions might not be called from the context + * of the worker thread. 
+ */ +int drbd_bitmap_io(drbd_dev *mdev, int (*io_fn)(drbd_dev *)) +{ + int rv; + + D_ASSERT(current != mdev->worker.task); + D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags)); + + set_bit(BITMAP_IO_QUEUED, &mdev->flags); + set_bit(BITMAP_IO, &mdev->flags); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + + drbd_bm_lock(mdev); + rv = io_fn(mdev); + drbd_bm_unlock(mdev); + + clear_bit(BITMAP_IO, &mdev->flags); + wake_up(&mdev->misc_wait); + + return rv; +} + +void drbd_md_set_flag(drbd_dev *mdev, int flag) __must_hold(local) { MUST_HOLD(mdev->req_lock); if ( (mdev->bc->md.flags & flag) != flag) { @@ -2932,7 +3093,7 @@ void drbd_md_set_flag(drbd_dev *mdev, int flag) mdev->bc->md.flags |= flag; } } -void drbd_md_clear_flag(drbd_dev *mdev, int flag) +void drbd_md_clear_flag(drbd_dev *mdev, int flag) __must_hold(local) { MUST_HOLD(mdev->req_lock); if ( (mdev->bc->md.flags & flag) != 0 ) { @@ -3041,8 +3202,8 @@ STATIC char *_drbd_uuid_str(unsigned int idx) { } /* Pretty print a UUID value */ -void -drbd_print_uuid(drbd_dev *mdev, unsigned int idx) { +void drbd_print_uuid(drbd_dev *mdev, unsigned int idx) __must_hold(local) +{ INFO(" uuid[%s] now %016llX\n",_drbd_uuid_str(idx),mdev->bc->md.uuid[idx]); } @@ -3225,7 +3386,7 @@ do { \ } \ } while (0) -char *_dump_block_id(u64 block_id, char *buff) { +STATIC char *_dump_block_id(u64 block_id, char *buff) { if (is_syncer_block_id(block_id)) strcpy(buff,"SyncerId"); else diff --git a/ubuntu/block/drbd/drbd_nl.c b/ubuntu/block/drbd/drbd_nl.c index be695bc..71190b2 100644 --- a/ubuntu/block/drbd/drbd_nl.c +++ b/ubuntu/block/drbd/drbd_nl.c @@ -40,13 +40,11 @@ #include "linux/drbd_limits.h" /* see get_sb_bdev and bd_claim */ -char *drbd_d_holder = "Hands off! this is DRBD's data storage device."; -char *drbd_m_holder = "Hands off! this is DRBD's meta data device."; - +static char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; // Generate the tag_list to struct functions #define NL_PACKET(name, number, fields) \ -int name ## _from_tags (drbd_dev *mdev, unsigned short* tags, struct name * arg) \ +STATIC int name ## _from_tags (drbd_dev *mdev, unsigned short* tags, struct name * arg) \ { \ int tag; \ int dlen; \ @@ -86,7 +84,7 @@ int name ## _from_tags (drbd_dev *mdev, unsigned short* tags, struct name * arg) // Generate the struct to tag_list functions #define NL_PACKET(name, number, fields) \ -unsigned short* \ +STATIC unsigned short* \ name ## _to_tags (drbd_dev *mdev, struct name * arg, unsigned short* tags) \ { \ fields \ @@ -119,7 +117,7 @@ extern void drbd_init_set_defaults(drbd_dev *mdev); void drbd_bcast_ev_helper(drbd_dev *mdev, char* helper_name); void drbd_nl_send_reply(struct cn_msg *, int); -char *nl_packet_name(int packet_type) { +STATIC char *nl_packet_name(int packet_type) { // Generate packet type strings #define NL_PACKET(name, number, fields) \ [ P_ ## name ] = # name, @@ -136,7 +134,7 @@ char *nl_packet_name(int packet_type) { nl_tag_name[packet_type] : "*Unknown*"; } -void nl_trace_packet(void *data) { +STATIC void nl_trace_packet(void *data) { struct cn_msg *req = data; struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; @@ -148,7 +146,7 @@ void nl_trace_packet(void *data) { req->seq, req->ack, req->len); } -void nl_trace_reply(void *data) { +STATIC void nl_trace_reply(void *data) { struct cn_msg *req = data; struct drbd_nl_cfg_reply *nlp = (struct drbd_nl_cfg_reply*)req->data; @@ -165,6 +163,7 @@ int drbd_khelper(drbd_dev *mdev, char* cmd) { char mb[12]; char *argv[] = {usermode_helper, cmd, mb, NULL }; + int ret; static char *envp[] = { "HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", @@ -174,79 +173,109 @@ int drbd_khelper(drbd_dev *mdev, char* cmd) INFO("helper command: %s %s\n",usermode_helper,cmd); - drbd_bcast_ev_helper(mdev,cmd); - return call_usermodehelper(usermode_helper,argv,envp,1); + 
drbd_bcast_ev_helper(mdev, cmd); + ret = call_usermodehelper(usermode_helper, argv, envp, 1); + if (ret) + WARN("helper command: %s %s %s exit code %d\n", + usermode_helper, cmd, mb, ret); + else + INFO("helper command: %s %s %s exit code %d\n", + usermode_helper, cmd, mb, ret); + + return ret; } drbd_disks_t drbd_try_outdate_peer(drbd_dev *mdev) { + char *ex_to_string; int r; drbd_disks_t nps; enum fencing_policy fp; D_ASSERT(mdev->state.pdsk == DUnknown); - if (inc_local_if_state(mdev,UpToDate)) { + if (inc_local_if_state(mdev, Consistent)) { fp = mdev->bc->dc.fencing; dec_local(mdev); } else { - WARN("Not outdating peer, since I am diskless."); + WARN("Not outdating peer, I'm not even Consistent myself.\n"); return mdev->state.pdsk; } - if( fp == Stonith ) drbd_request_state(mdev,NS(susp,1)); + if (fp == Stonith) + _drbd_request_state(mdev, NS(susp,1), ChgWaitComplete); r=drbd_khelper(mdev,"outdate-peer"); - switch( (r>>8) & 0xff ) { - case 3: /* peer is inconsistent */ + switch ((r>>8) & 0xff) { + case 3: + ex_to_string = "peer is inconsistent or worse"; nps = Inconsistent; break; - case 4: /* peer is outdated */ + case 4: + ex_to_string = "peer is outdated"; nps = Outdated; break; case 5: /* peer was down, we will(have) create(d) a new UUID anyways... */ /* If we would be more strict, we would return DUnknown here. */ + ex_to_string = "peer is unreachable, assumed to be dead"; nps = Outdated; break; - case 6: /* Peer is primary, voluntarily outdate myself */ + case 6: /* Peer is primary, voluntarily outdate myself. + * This is useful when an unconnected Secondary is asked to + * become Primary, but findes the other peer being active. 
*/ + ex_to_string = "peer is active"; WARN("Peer is primary, outdating myself.\n"); nps = DUnknown; - drbd_request_state(mdev,NS(disk,Outdated)); + _drbd_request_state(mdev, NS(disk, Outdated), ChgWaitComplete); break; case 7: - if( fp != Stonith ) { + if (fp != Stonith) ERR("outdate-peer() = 7 && fencing != Stonith !!!\n"); - } + ex_to_string = "peer was stonithed"; nps = Outdated; break; default: /* The script is broken ... */ nps = DUnknown; - ERR("outdate-peer helper broken, returned %d \n",(r>>8)&0xff); + ERR("outdate-peer helper broken, returned %d\n",(r>>8)&0xff); return nps; } - INFO("outdate-peer helper returned %d \n",(r>>8)&0xff); + INFO("outdate-peer helper returned %d (%s)\n", + (r>>8) & 0xff, ex_to_string); return nps; } int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) { - int r=0,forced = 0, try=0; + const int max_tries = 4; + int r = 0; + int try = 0; + int forced = 0; drbd_state_t mask, val; drbd_disks_t nps; - if ( new_role == Primary ) { - request_ping(mdev); // Detect a dead peer ASAP - } + if (new_role == Primary) + request_ping(mdev); /* Detect a dead peer ASAP */ + + mutex_lock(&mdev->state_mutex); mask.i = 0; mask.role = role_mask; val.i = 0; val.role = new_role; - while (try++ < 3) { - r = _drbd_request_state(mdev,mask,val,0); + while (try++ < max_tries) { + r = _drbd_request_state(mdev, mask, val, ChgWaitComplete); + + /* in case we first succeeded to outdate, + * but now suddenly could establish a connection */ + if (r == SS_CW_FailedByPeer && mask.pdsk != 0) { + val.pdsk = 0; + mask.pdsk = 0; + continue; + } + if( r == SS_NoUpToDateDisk && force && ( mdev->state.disk == Inconsistent || mdev->state.disk == Outdated ) ) { @@ -287,14 +316,17 @@ int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) continue; } if( r == SS_TwoPrimaries ) { - // Maybe the peer is detected as dead very soon... + /* Maybe the peer is detected as dead very soon... + retry at most once more in this case. 
*/ set_current_state(TASK_INTERRUPTIBLE); schedule_timeout((mdev->net_conf->ping_timeo+1)*HZ/10); - if(try == 1) try++; // only a single retry in this case. + if (try < max_tries) + try = max_tries -1; continue; } if ( r < SS_Success ) { - r = drbd_request_state(mdev,mask,val); // Be verbose. + r = _drbd_request_state(mdev, mask, val, + ChgStateVerbose + ChgWaitComplete); if( r < SS_Success ) goto fail; } break; @@ -305,11 +337,7 @@ int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) drbd_sync_me(mdev); /* Wait until nothing is on the fly :) */ - if ( wait_event_interruptible( mdev->misc_wait, - atomic_read(&mdev->ap_pending_cnt) == 0 ) ) { - r = GotSignal; - goto fail; - } + wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0); /* FIXME RACE here: if our direct user is not using bd_claim (i.e. * not a filesystem) since cstate might still be >= Connected, new @@ -357,9 +385,8 @@ int drbd_set_role(drbd_dev *mdev, drbd_role_t new_role, int force) drbd_md_sync(mdev); - return r; - fail: + mutex_unlock(&mdev->state_mutex); return r; } @@ -438,7 +465,8 @@ char* ppsize(char* buf, unsigned long long size) static char units[] = { 'K','M','G','T','P','E' }; int base = 0; while (size >= 10000 ) { - size = size >> 10; + /* shift + round */ + size = (size >> 10) + !!(size & (1<<9)); base++; } sprintf(buf,"%lu %cB",(long)size,units[base]); @@ -446,6 +474,38 @@ char* ppsize(char* buf, unsigned long long size) return buf; } +/* there is still a theoretical deadlock when called from receiver + * on an Inconsistent Primary: + * remote READ does inc_ap_bio, receiver would need to receive answer + * packet from remote to dec_ap_bio again. + * receiver receive_sizes(), comes here, + * waits for ap_bio_cnt == 0. -> deadlock. + * but this cannot happen, actually, because: + * Primary Inconsistent, and peer's disk is unreachable + * (not connected, * or bad/no disk on peer): + * see drbd_fail_request_early, ap_bio_cnt is zero. 
+ * Primary Inconsistent, and SyncTarget: + * peer may not initiate a resize. + */ +static void suspend_io(struct Drbd_Conf *mdev) +{ + int in_flight; + set_bit(SUSPEND_IO, &mdev->flags); + in_flight = atomic_read(&mdev->ap_bio_cnt); + if (in_flight) { + DBG("Suspending IO, waiting for %d requests to finish\n", in_flight); + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + } + DBG("IO Suspended, no more requests in flight\n"); +} + +static void resume_io(struct Drbd_Conf *mdev) +{ + clear_bit(SUSPEND_IO, &mdev->flags); + DBG("Resumed IO\n"); + wake_up(&mdev->misc_wait); +} + /** * drbd_determin_dev_size: * Evaluates all constraints and sets our correct device size. @@ -453,7 +513,7 @@ char* ppsize(char* buf, unsigned long long size) * indicate success. * You should call drbd_md_sync() after calling this function. */ -enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev* mdev) +enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev* mdev) __must_hold(local) { sector_t prev_first_sect, prev_size; // previous meta location sector_t la_size; @@ -463,6 +523,18 @@ enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev* mdev) int md_moved, la_size_changed; enum determin_dev_size_enum rv=unchanged; + /* race: + * application request passes inc_ap_bio, + * but then cannot get an AL-reference. + * this function later may wait on ap_bio_cnt == 0. -> deadlock. + * + * to avoid that: + * Suspend IO right here. + * still lock the act_log to not trigger ASSERTs there. + */ + suspend_io(mdev); + + /* no wait necessary anymore, actually we could assert that */ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); prev_first_sect = drbd_md_first_sector(mdev->bc); @@ -512,19 +584,18 @@ enum determin_dev_size_enum drbd_determin_dev_size(drbd_dev* mdev) } if ( la_size_changed || md_moved ) { - if( inc_local_if_state(mdev,Attaching) ) { - drbd_al_shrink(mdev); // All extents inactive. 
- rv = drbd_bm_write(mdev); // write bitmap - // Write mdev->la_size to on disk. - drbd_md_mark_dirty(mdev); - dec_local(mdev); - } + drbd_al_shrink(mdev); /* All extents inactive. */ + INFO("Writing the whole bitmap, size changed\n"); + rv = drbd_bitmap_io(mdev, &drbd_bm_write); + drbd_md_mark_dirty(mdev); } if (size > la_size) rv = grew; if (size < la_size) rv = shrunk; out: lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + resume_io(mdev); return rv; } @@ -624,7 +695,7 @@ STATIC int drbd_check_al_size(drbd_dev *mdev) return 0; } -void drbd_setup_queue_param(drbd_dev *mdev, unsigned int max_seg_s) +void drbd_setup_queue_param(drbd_dev *mdev, unsigned int max_seg_s) __must_hold(local) { struct request_queue * const q = mdev->rq_queue; struct request_queue * const b = mdev->bc->backing_bdev->bd_disk->queue; @@ -689,6 +760,9 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, struct drbd_nl_cfg_reply *reply) { enum ret_codes retcode; + enum determin_dev_size_enum dd; + sector_t max_possible_sectors; + sector_t min_md_device_sectors; struct drbd_backing_dev* nbc=NULL; // new_backing_conf struct inode *inode, *inode2; struct lru_cache* resync_lru = NULL; @@ -701,11 +775,14 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - /* + /* * We may have gotten here very quickly from a detach. Wait for a bit * then fail. 
*/ - while(mdev->bc != NULL) { + while (1) { + __no_warn(local, nbc = mdev->bc; ); + if (nbc == NULL) + break; if(ntries++ >= 5) { WARN("drbd_nl_disk_conf: mdev->bc not NULL.\n"); retcode=HaveDiskConfig; @@ -779,7 +856,7 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, nbc->backing_bdev = inode->i_bdev; if (BD_CLAIM(nbc->backing_bdev, mdev)) { printk(KERN_ERR "drbd: bd_claim(%p,%p); failed [%p;%p;%u]\n", - nbc->backing_bdev, mdev, + nbc->backing_bdev, mdev, nbc->backing_bdev->bd_holder, nbc->backing_bdev->bd_contains->bd_holder, nbc->backing_bdev->bd_holders); @@ -787,7 +864,7 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, goto fail; } - resync_lru = lc_alloc("resync",31, sizeof(struct bm_extent),mdev); + resync_lru = lc_alloc("resync", 61, sizeof(struct bm_extent), mdev); if(!resync_lru) { retcode=KMallocFailed; goto fail; @@ -809,39 +886,33 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, goto release_bdev2_fail; } - if ((drbd_get_capacity(nbc->backing_bdev)) < nbc->dc.disk_size) { + if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) { retcode = LDDeviceTooSmall; goto release_bdev2_fail; } -// warning LGE checks below no longer valid -// --- rewrite -#if 0 - if (drbd_get_capacity(nbc->backing_bdev) >= (sector_t)DRBD_MAX_SECTORS) { - retcode = LDDeviceTooLarge; - goto release_bdev2_fail; + if (nbc->dc.meta_dev_idx < 0) { + max_possible_sectors = DRBD_MAX_SECTORS_FLEX; + /* at least one MB, otherwise it does not make sense */ + min_md_device_sectors = (2<<10); + } else { + max_possible_sectors = DRBD_MAX_SECTORS; + min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1); } - if ( nbc->dc.meta_dev_idx == -1 ) i = 1; - else i = nbc->dc.meta_dev_idx+1; + if (drbd_get_capacity(nbc->md_bdev) > max_possible_sectors) + WARN("truncating very big lower level device " + "to currently maximum possible %llu sectors\n", + (unsigned long long) max_possible_sectors); - /* for 
internal, we need to check agains <= (then we have a drbd with - * zero size, but meta data...) to be on the safe side, I require 32MB - * minimal data storage area for drbd with internal meta data (thats - * 160 total). if someone wants to use that small devices, she can use - * drbd 0.6 anyways... - * - * FIXME this is arbitrary and needs to be reconsidered as soon as we - * move to flexible size meta data. - */ - if( drbd_get_capacity(nbc->md_bdev) < 2*MD_RESERVED_SIZE*i - + (nbc->dc.meta_dev_idx == -1) ? (1<<16) : 0 ) + if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) { retcode = MDDeviceTooSmall; + WARN("refusing attach: md-device too small, " + "at least %llu sectors needed for this meta-disk type\n", + (unsigned long long) min_md_device_sectors); goto release_bdev2_fail; } -#endif -// -- up to here // Make sure the new disk is big enough if (drbd_get_capacity(nbc->backing_bdev) < @@ -850,10 +921,14 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, goto release_bdev2_fail; } - nbc->known_size = drbd_get_capacity(nbc->backing_bdev); + nbc->known_size = drbd_get_max_capacity(nbc); - if((retcode = drbd_request_state(mdev,NS(disk,Attaching))) < SS_Success ) { + retcode = _drbd_request_state(mdev, NS(disk, Attaching), ChgStateVerbose); + if (retcode < SS_Success ) goto release_bdev2_fail; + + if (!inc_local_if_state(mdev, Attaching)) { + goto force_diskless; } drbd_thread_start(&mdev->worker); @@ -861,27 +936,49 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, retcode = drbd_md_read(mdev,nbc); if ( retcode != NoError ) { - goto force_diskless; + goto force_diskless_dec; + } + + if (mdev->state.conn < Connected && + mdev->state.role == Primary && + (mdev->ed_uuid & ~((u64)1)) != (nbc->md.uuid[Current] & ~((u64)1))) { + ERR("Can only attach to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + retcode = DataOfWrongCurrent; + goto force_diskless_dec; } // Since we are diskless, 
fix the AL first... if (drbd_check_al_size(mdev)) { retcode = KMallocFailed; - goto force_diskless; + goto force_diskless_dec; } // Prevent shrinking of consistent devices ! if(drbd_md_test_flag(nbc,MDF_Consistent) && drbd_new_dev_size(mdev,nbc) < nbc->md.la_size_sect) { + WARN("refusing to truncate a consistent device\n"); retcode = LDDeviceTooSmall; - goto force_diskless; + goto force_diskless_dec; } if(!drbd_al_read_log(mdev,nbc)) { retcode = MDIOError; - goto force_diskless; + goto force_diskless_dec; } + /* Reset the "barriers don't work" bits here, then force meta data to + * be written, to ensure we determine if barriers are supported. */ + if (nbc->dc.no_disk_flush) + set_bit(LL_DEV_NO_FLUSH, &mdev->flags); + else + clear_bit(LL_DEV_NO_FLUSH, &mdev->flags); + + if (nbc->dc.no_md_flush) + set_bit(MD_NO_BARRIER, &mdev->flags); + else + clear_bit(MD_NO_BARRIER, &mdev->flags); + /* Point of no return reached. * Devices and memory are no longer released by error cleanup below. * now mdev takes over responsibility, and the state engine should @@ -930,24 +1027,23 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, set_bit(USE_DEGR_WFC_T,&mdev->flags); } - drbd_bm_lock(mdev); // racy... 
- if (drbd_determin_dev_size(mdev) == dev_size_error) { + dd = drbd_determin_dev_size(mdev); + if (dd == dev_size_error) { retcode = VMallocFailed; - goto unlock_bm; - } + goto force_diskless_dec; + } else if (dd == grew) + set_bit(RESYNC_AFTER_NEG, &mdev->flags); if (drbd_md_test_flag(mdev->bc,MDF_FullSync)) { INFO("Assuming that all blocks are out of sync (aka FullSync)\n"); - drbd_bm_set_all(mdev); - if (unlikely(drbd_bm_write(mdev) < 0)) { + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write)) { retcode = MDIOError; - goto unlock_bm; + goto force_diskless_dec; } - drbd_md_clear_flag(mdev,MDF_FullSync); } else { - if (unlikely(drbd_bm_read(mdev) < 0)) { + if (drbd_bitmap_io(mdev, &drbd_bm_read) < 0) { retcode = MDIOError; - goto unlock_bm; + goto force_diskless_dec; } } @@ -999,34 +1095,26 @@ STATIC int drbd_nl_disk_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, ns.disk = Negotiating; } - rv = _drbd_set_state(mdev, ns, ChgStateVerbose); + rv = _drbd_set_state(mdev, ns, ChgStateVerbose, NULL); ns = mdev->state; spin_unlock_irq(&mdev->req_lock); if (rv < SS_Success) { - goto unlock_bm; + goto force_diskless_dec; } - drbd_bm_unlock(mdev); - - if(inc_local_if_state(mdev,Attaching)) { - if(mdev->state.role == Primary) mdev->bc->md.uuid[Current] |= (u64)1; - else mdev->bc->md.uuid[Current] &= ~(u64)1; - dec_local(mdev); - } + if(mdev->state.role == Primary) mdev->bc->md.uuid[Current] |= (u64)1; + else mdev->bc->md.uuid[Current] &= ~(u64)1; - /* Reset the "barriers don't work" bits here, then force meta data to - * be written, to ensure we determine if barriers are supported. 
*/ - clear_bit(LL_DEV_NO_FLUSH,&mdev->flags); - clear_bit(MD_NO_BARRIER,&mdev->flags); drbd_md_mark_dirty(mdev); drbd_md_sync(mdev); + dec_local(mdev); reply->ret_code = retcode; return 0; - unlock_bm: - drbd_bm_unlock(mdev); + force_diskless_dec: + dec_local(mdev); force_diskless: drbd_force_state(mdev,NS(disk,Diskless)); drbd_md_sync(mdev); @@ -1052,6 +1140,9 @@ STATIC int drbd_nl_detach(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, drbd_sync_me(mdev); reply->ret_code = drbd_request_state(mdev,NS(disk,Diskless)); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ/20); /* 50ms; Time for worker to finally terminate */ + return 0; } @@ -1082,7 +1173,7 @@ STATIC int drbd_nl_net_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, if( !(nlp->flags & DRBD_NL_SET_DEFAULTS) && inc_net(mdev)) { memcpy(new_conf,mdev->net_conf,sizeof(struct net_conf)); - dec_local(mdev); + dec_net(mdev); } else { memset(new_conf,0,sizeof(struct net_conf)); new_conf->timeout = DRBD_TIMEOUT_DEF; @@ -1228,8 +1319,7 @@ FIXME LGE } mdev->cram_hmac_tfm = tfm; - retcode = drbd_request_state(mdev,NS(conn,Unconnected)); - + retcode = _drbd_request_state(mdev, NS(conn, Unconnected), ChgStateVerbose); if (retcode >= SS_Success) drbd_thread_start(&mdev->worker); @@ -1251,7 +1341,7 @@ STATIC int drbd_nl_disconnect(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, { int retcode; - retcode = _drbd_request_state(mdev,NS(conn,Disconnecting),0); // silently. + retcode = _drbd_request_state(mdev, NS(conn, Disconnecting), ChgOrdered); if ( retcode == SS_NothingToDo ) goto done; else if ( retcode == SS_AlreadyStandAlone ) goto done; @@ -1261,8 +1351,9 @@ STATIC int drbd_nl_disconnect(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, pdsk,Outdated)); } else if (retcode == SS_CW_FailedByPeer) { // The peer probabely wants to see us outdated. 
- retcode = _drbd_request_state(mdev,NS2(conn,Disconnecting, - disk,Outdated),0); + retcode = _drbd_request_state(mdev, NS2(conn, Disconnecting, + disk, Outdated), + ChgOrdered); if (retcode == SS_IsDiskLess || retcode == SS_LowerThanOutdated) { // We are diskless and our peer wants to outdate us. // So, simply go away, and let the peer try to @@ -1274,8 +1365,10 @@ STATIC int drbd_nl_disconnect(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, if( retcode < SS_Success ) goto fail; - if( wait_event_interruptible( mdev->misc_wait, - mdev->state.conn==StandAlone) ) { + if (wait_event_interruptible(mdev->state_wait, + mdev->state.conn != Disconnecting) ) { + /* Do not test for mdev->state.conn == StandAlone, since + someone else might connect us in the mean time! */ retcode = GotSignal; goto fail; } @@ -1301,7 +1394,7 @@ void resync_after_online_grow(drbd_dev *mdev) if (iass) drbd_start_resync(mdev,SyncSource); else - drbd_request_state(mdev,NS(conn,WFSyncUUID)); + _drbd_request_state(mdev, NS(conn, WFSyncUUID), ChgStateVerbose + ChgSerialize); } STATIC int drbd_nl_resize(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, @@ -1340,10 +1433,8 @@ STATIC int drbd_nl_resize(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, } mdev->bc->dc.disk_size = (sector_t)rs.resize_size; - drbd_bm_lock(mdev); dd = drbd_determin_dev_size(mdev); drbd_md_sync(mdev); - drbd_bm_unlock(mdev); dec_local(mdev); if (dd == dev_size_error) { retcode = VMallocFailed; @@ -1411,7 +1502,12 @@ STATIC int drbd_nl_syncer_conf(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, mdev->sync_conf = sc; if(inc_local(mdev)) { + wait_event(mdev->al_wait, lc_try_lock(mdev->act_log)); + drbd_al_shrink(mdev); err = drbd_check_al_size(mdev); + lc_unlock(mdev->act_log); + wake_up(&mdev->al_wait); + dec_local(mdev); drbd_md_sync(mdev); @@ -1544,11 +1640,11 @@ STATIC int drbd_nl_get_uuids(drbd_dev *mdev, struct drbd_nl_cfg_req *nlp, *tl++ = UUID_SIZE*sizeof(u64); memcpy(tl,mdev->bc->md.uuid,UUID_SIZE*sizeof(u64)); tl=(unsigned 
short*)((char*)tl + UUID_SIZE*sizeof(u64)); - dec_local(mdev); *tl++ = T_uuids_flags; *tl++ = sizeof(int); memcpy(tl,&mdev->bc->md.flags,sizeof(int)); tl=(unsigned short*)((char*)tl + sizeof(int)); + dec_local(mdev); } *tl++ = TT_END; /* Close the tag list */ @@ -1638,7 +1734,7 @@ static struct cn_handler_struct cnd_table[] = { }; -void drbd_connector_callback(void *data) +STATIC void drbd_connector_callback(void *data) { struct cn_msg *req = data; struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req*)req->data; @@ -1704,7 +1800,7 @@ void drbd_connector_callback(void *data) module_put(THIS_MODULE); } -atomic_t drbd_nl_seq = ATOMIC_INIT(2); // two. +static atomic_t drbd_nl_seq = ATOMIC_INIT(2); // two. void drbd_bcast_state(drbd_dev *mdev, drbd_state_t state) { @@ -1824,7 +1920,7 @@ int __init cn_init(void); void __exit cn_fini(void); #endif -int __init drbd_nl_init() +int __init drbd_nl_init(void) { static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; int err; @@ -1843,7 +1939,7 @@ int __init drbd_nl_init() return 0; } -void drbd_nl_cleanup() +void drbd_nl_cleanup(void) { static struct cb_id cn_id_drbd = { CN_IDX_DRBD, CN_VAL_DRBD }; diff --git a/ubuntu/block/drbd/drbd_proc.c b/ubuntu/block/drbd/drbd_proc.c index 5696dc7..62e00d5 100644 --- a/ubuntu/block/drbd/drbd_proc.c +++ b/ubuntu/block/drbd/drbd_proc.c @@ -60,7 +60,8 @@ struct file_operations drbd_proc_fops = { STATIC void drbd_syncer_progress(struct Drbd_Conf* mdev, struct seq_file *seq) { unsigned long db, dt, dbdt, rt, rs_left; - int i, x, y, res; + unsigned int res; + int i, x, y; drbd_get_syncer_progress(mdev, &rs_left, &res); @@ -217,18 +218,12 @@ STATIC int drbd_seq_show(struct seq_file *seq, void *v) mdev->state.conn == SyncTarget ) { drbd_syncer_progress(mdev,seq); } - if(mdev->resync) { - lc_printf_stats(seq,mdev->resync); - } - if(mdev->act_log) { - lc_printf_stats(seq,mdev->act_log); - } -#if 0 - if(mdev->resync) { - lc_dump(mdev->resync,seq,"rs_left", - resync_dump_detail); + + if 
(inc_local_if_state(mdev, Failed)) { + lc_printf_stats(seq, mdev->resync); + lc_printf_stats(seq, mdev->act_log); + dec_local(mdev); } -#endif } diff --git a/ubuntu/block/drbd/drbd_receiver.c b/ubuntu/block/drbd/drbd_receiver.c index c81b18b..1e130e7 100644 --- a/ubuntu/block/drbd/drbd_receiver.c +++ b/ubuntu/block/drbd/drbd_receiver.c @@ -46,13 +46,18 @@ #include #include #ifdef HAVE_LINUX_SCATTERLIST_H +/* 2.6.11 (suse 9.3, fc4) does not include requisites + * from linux/scatterlist.h :( */ +#include +#include +#include #include #endif #include "linux/drbd.h" #include "drbd_int.h" #include "drbd_req.h" -#if defined(__arch_um__) && !defined(HAVE_UML_TO_VIRT) +#if 0 static inline void *to_virt(unsigned long phys) { return((void *) uml_physmem + phys); @@ -123,7 +128,7 @@ void check_list(drbd_dev *mdev,struct list_head *list,char *t) /** * drbd_bp_alloc: Returns a page. Fails only if a signal comes in. */ -STATIC struct page * drbd_pp_alloc(drbd_dev *mdev, unsigned int gfp_mask) +STATIC struct page * drbd_pp_alloc(drbd_dev *mdev, gfp_t gfp_mask) { unsigned long flags=0; struct page *page; @@ -239,7 +244,7 @@ struct Tl_epoch_entry* drbd_alloc_ee(drbd_dev *mdev, u64 id, sector_t sector, unsigned int data_size, - unsigned int gfp_mask) + gfp_t gfp_mask) __must_hold(local) { struct request_queue *q; struct Tl_epoch_entry* e; @@ -483,80 +488,94 @@ void drbd_wait_ee_list_empty(drbd_dev *mdev,struct list_head *head) spin_unlock_irq(&mdev->req_lock); } -STATIC struct socket* drbd_accept(drbd_dev *mdev,struct socket* sock) +#ifdef DEFINE_SOCK_CREATE_KERN +/* if there is no sock_create_kern, + * tthere is also sock_create_lite missing */ +int sock_create_lite(int family, int type, int protocol, struct socket **res) { - struct socket *newsock; int err = 0; + struct socket *sock = NULL; - err = sock->ops->listen(sock, 5); - if (err) - goto out; + sock = sock_alloc(); + if (!sock) + err = -ENOMEM; + else + sock->type = type; - if (sock_create_kern(AF_INET, SOCK_STREAM, 
IPPROTO_TCP, &newsock)) - goto out; + *res = sock; + return err; +} +#endif + +/* see also kernel_accept; which is only present since 2.6.18. + * also we want to log which part of it failed, exactly */ +STATIC int drbd_accept(drbd_dev *mdev, const char **what, + struct socket *sock, struct socket **newsock) +{ + struct sock *sk = sock->sk; + int err = 0; - newsock->type = sock->type; - newsock->ops = sock->ops; + *what = "listen"; + err = sock->ops->listen(sock, 5); + if (err < 0) + goto out; - err = newsock->ops->accept(sock, newsock, 0); + *what = "sock_create_lite"; + err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol, + newsock); if (err < 0) - goto out_release; + goto out; - return newsock; + *what = "accept"; + err = sock->ops->accept(sock, *newsock, 0); + if (err < 0) { + sock_release(*newsock); + *newsock = NULL; + goto out; + } + (*newsock)->ops = sock->ops; - out_release: - sock_release(newsock); - out: - if(err != -EAGAIN && err != -EINTR) - ERR("accept failed! %d\n", err); - return 0; +out: + return err; } STATIC int drbd_recv_short(drbd_dev *mdev, struct socket *sock, void *buf, size_t size, int flags) { mm_segment_t oldfs; - struct iovec iov; - struct msghdr msg; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = (flags ? flags : MSG_WAITALL | MSG_NOSIGNAL) + }; int rv; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iovlen = 1; - msg.msg_iov = &iov; - iov.iov_len = size; - iov.iov_base = buf; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_flags = flags ? 
flags : MSG_WAITALL | MSG_NOSIGNAL; - oldfs = get_fs(); set_fs(KERNEL_DS); - rv = sock_recvmsg(sock, &msg, size, msg.msg_flags); - set_fs(oldfs); return rv; } -int drbd_recv(drbd_dev *mdev,void *buf, size_t size) +STATIC int drbd_recv(drbd_dev *mdev,void *buf, size_t size) { mm_segment_t oldfs; - struct iovec iov; - struct msghdr msg; + struct kvec iov = { + .iov_base = buf, + .iov_len = size, + }; + struct msghdr msg = { + .msg_iovlen = 1, + .msg_iov = (struct iovec *)&iov, + .msg_flags = MSG_WAITALL | MSG_NOSIGNAL + }; int rv; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iovlen = 1; - msg.msg_iov = &iov; - iov.iov_len = size; - iov.iov_base = buf; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_flags = MSG_WAITALL | MSG_NOSIGNAL; - oldfs = get_fs(); set_fs(KERNEL_DS); @@ -596,17 +615,19 @@ int drbd_recv(drbd_dev *mdev,void *buf, size_t size) STATIC struct socket *drbd_try_connect(drbd_dev *mdev) { - int err; + const char *what; struct socket *sock; struct sockaddr_in src_in; + int err; + int disconnect_on_error = 1; if (!inc_net(mdev)) return NULL; - err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); - if (err) { - dec_net(mdev); - ERR("sock_creat(..)=%d\n", err); - return NULL; + what = "sock_create_kern"; + err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock); + if (err < 0) { + sock = NULL; + goto out; } sock->sk->sk_rcvtimeo = @@ -622,26 +643,46 @@ STATIC struct socket *drbd_try_connect(drbd_dev *mdev) memcpy (&src_in, &(mdev->net_conf->my_addr), sizeof(struct sockaddr_in)); src_in.sin_port = 0; + what = "bind"; err = sock->ops->bind(sock, - (struct sockaddr * ) &src_in, - sizeof (struct sockaddr_in)); - if (err) { - ERR("Unable to bind source sock (%d)\n", err); - sock_release(sock); - sock = NULL; - dec_net(mdev); - return sock; - } + (struct sockaddr *) &src_in, + sizeof(struct sockaddr_in)); + if (err < 0) + goto out; + /* connect may fail, peer not yet available. 
+ * stay WFConnection, don't go Disconnecting! */ + disconnect_on_error = 0; + what = "connect"; err = sock->ops->connect(sock, (struct sockaddr *)mdev->net_conf->peer_addr, mdev->net_conf->peer_addr_len, 0); - if (err) { - sock_release(sock); - sock = NULL; +out: + if (err < 0) { + if (sock) { + sock_release(sock); + sock = NULL; + } + switch (-err) { + /* timeout, busy, signal pending */ + case ETIMEDOUT: case EAGAIN: + case EINTR: case ERESTARTSYS: + /* peer not (yet) available, network problem */ + case ECONNREFUSED: case ENETUNREACH: + case EHOSTDOWN: case EHOSTUNREACH: +#if 0 + DBG("%s failure ignored, err = %d\n", + what, err); +#endif + disconnect_on_error = 0; + break; + default: + ERR("%s failed, err = %d\n", what, err); + } + if (disconnect_on_error) + drbd_force_state(mdev, NS(conn, Disconnecting)); } - dec_net(mdev); return sock; } @@ -649,37 +690,49 @@ STATIC struct socket *drbd_try_connect(drbd_dev *mdev) STATIC struct socket *drbd_wait_for_connect(drbd_dev *mdev) { int err; - struct socket *sock,*sock2; + struct socket *s_estab = NULL, *s_listen; + const char *what; if (!inc_net(mdev)) return NULL; - err = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock2); + what = "sock_create_kern"; + err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &s_listen); if (err) { - dec_net(mdev); - ERR("sock_creat(..)=%d\n", err); - return NULL; + s_listen = NULL; + goto out; } - sock2->sk->sk_reuse = 1; /* SO_REUSEADDR */ - sock2->sk->sk_rcvtimeo = - sock2->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; + s_listen->sk->sk_reuse = 1; /* SO_REUSEADDR */ + s_listen->sk->sk_rcvtimeo = + s_listen->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ; - err = sock2->ops->bind(sock2, + what = "bind"; + err = s_listen->ops->bind(s_listen, (struct sockaddr *) mdev->net_conf->my_addr, mdev->net_conf->my_addr_len); - dec_net(mdev); + if (err < 0) + goto out; - if (err) { - ERR("Unable to bind sock2 (%d)\n", err); - sock_release(sock2); - 
drbd_force_state(mdev,NS(conn,Disconnecting)); - return NULL; - } + err = drbd_accept(mdev, &what, s_listen, &s_estab); - sock = drbd_accept(mdev,sock2); - sock_release(sock2); +out: + if (s_listen) + sock_release(s_listen); + if (err < 0) { + if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) { + ERR("%s failed, err = %d\n", what, err); + drbd_force_state(mdev, NS(conn, Disconnecting)); + } +#if 0 + else { + DBG("%s failure ignored, err = %d, not Disconnecting\n", + what, err); + } +#endif + } + dec_net(mdev); - return sock; + return s_estab; } STATIC int drbd_do_handshake(drbd_dev *mdev); @@ -734,7 +787,7 @@ STATIC int drbd_socket_okay(drbd_dev *mdev, struct socket **sock) * no point in trying again, please go standalone. * -2 We do not have a network config... */ -int drbd_connect(drbd_dev *mdev) +STATIC int drbd_connect(drbd_dev *mdev) { struct socket *s, *sock, *msock; int try, h, ok; @@ -826,23 +879,38 @@ int drbd_connect(drbd_dev *mdev) sock->sk->sk_allocation = GFP_NOIO; msock->sk->sk_allocation = GFP_NOIO; - sock->sk->sk_priority=TC_PRIO_BULK; - // FIXME fold to limits. should be done in drbd_ioctl - sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; - sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK; + msock->sk->sk_priority = TC_PRIO_INTERACTIVE; + + if (mdev->net_conf->sndbuf_size) { + /* FIXME fold to limits. should be done in drbd_ioctl */ + /* this is setsockopt SO_SNDBUFFORCE and SO_RCVBUFFORCE, + * done directly. */ + sock->sk->sk_sndbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_rcvbuf = mdev->net_conf->sndbuf_size; + sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; + } + +#if 0 /* don't pin the msock bufsize, autotuning should work better */ + msock->sk->sk_sndbuf = 2*32767; + msock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK; +#endif + /* NOT YET ... 
* sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT; * first set it to the HandShake timeout, wich is hardcoded for now: */ sock->sk->sk_sndtimeo = sock->sk->sk_rcvtimeo = 2*HZ; - sock->sk->sk_userlocks |= SOCK_SNDBUF_LOCK | SOCK_RCVBUF_LOCK; - msock->sk->sk_priority=TC_PRIO_INTERACTIVE; - msock->sk->sk_sndbuf = 2*32767; msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10; msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ; + /* we don't want delays. + * we use TCP_CORK where apropriate, though */ + drbd_tcp_nodelay(sock); + drbd_tcp_nodelay(msock); + mdev->data.socket = sock; mdev->meta.socket = msock; mdev->last_received = jiffies; @@ -994,7 +1062,7 @@ STATIC int receive_Barrier_no_tcq(drbd_dev *mdev, Drbd_Header* h) /* used from receive_RSDataReply (recv_resync_read) * and from receive_Data */ STATIC struct Tl_epoch_entry * -read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) +read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) __must_hold(loca) { struct Tl_epoch_entry *e; struct bio_vec *bvec; @@ -1003,7 +1071,7 @@ read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) int ds,i,rr; e = drbd_alloc_ee(mdev,id,sector,data_size,GFP_KERNEL); - if(!e) return 0; + if(!e) return NULL; bio = e->private_bio; ds = data_size; bio_for_each_segment(bvec, bio, i) { @@ -1014,7 +1082,7 @@ read_in_block(drbd_dev *mdev, u64 id, sector_t sector, int data_size) drbd_free_ee(mdev,e); WARN("short read receiving data: read %d expected %d\n", rr, min_t(int,ds,PAGE_SIZE)); - return 0; + return NULL; } ds -= rr; } @@ -1116,12 +1184,15 @@ STATIC int e_end_resync_block(drbd_dev *mdev, struct drbd_work *w, int unused) return ok; } -STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector,int data_size) +STATIC int recv_resync_read(drbd_dev *mdev,sector_t sector,int data_size) __releases(local) { struct Tl_epoch_entry *e; e = read_in_block(mdev,ID_SYNCER,sector,data_size); - if(!e) 
return FALSE; + if(!e) { + dec_local(mdev); + return FALSE; + } dec_rs_pending(mdev); @@ -1217,16 +1288,12 @@ STATIC int receive_RSDataReply(drbd_dev *mdev,Drbd_Header* h) D_ASSERT(p->block_id == ID_SYNCER); if(inc_local(mdev)) { - /* data is submitted to disk within recv_resync_read. - * corresponding dec_local done below on error, - * or in drbd_endio_write_sec. */ /* FIXME paranoia: * verify that the corresponding bit is set. * in case we are Primary SyncTarget, * verify there are no pending write request to that area. */ ok = recv_resync_read(mdev,sector,data_size); - if (!ok) dec_local(mdev); } else { if (DRBD_ratelimit(5*HZ,5)) ERR("Can not write resync data to local disk.\n"); @@ -1342,11 +1409,11 @@ static int drbd_wait_peer_seq(drbd_dev *mdev, const u32 packet_seq) prepare_to_wait(&mdev->seq_wait,&wait,TASK_INTERRUPTIBLE); if (seq_le(packet_seq,mdev->peer_seq+1)) break; - spin_unlock(&mdev->peer_seq_lock); if (signal_pending(current)) { ret = -ERESTARTSYS; break; } + spin_unlock(&mdev->peer_seq_lock); schedule(); spin_lock(&mdev->peer_seq_lock); } @@ -1421,7 +1488,6 @@ STATIC int receive_Data(drbd_dev *mdev,Drbd_Header* h) } else { /* don't get the req_lock yet, * we may sleep in drbd_wait_peer_seq */ - const sector_t sector = e->sector; const int size = e->size; const int discard = test_bit(DISCARD_CONCURRENT,&mdev->flags); DEFINE_WAIT(wait); @@ -1747,7 +1813,7 @@ STATIC int receive_DataRequest(drbd_dev *mdev,Drbd_Header *h) return TRUE; } -STATIC int drbd_asb_recover_0p(drbd_dev *mdev) +STATIC int drbd_asb_recover_0p(drbd_dev *mdev) __must_hold(local) { int self, peer, rv=-100; unsigned long ch_self, ch_peer; @@ -1803,7 +1869,7 @@ STATIC int drbd_asb_recover_0p(drbd_dev *mdev) return rv; } -STATIC int drbd_asb_recover_1p(drbd_dev *mdev) +STATIC int drbd_asb_recover_1p(drbd_dev *mdev) __must_hold(local) { int self, peer, hg, rv=-100; @@ -1846,7 +1912,7 @@ STATIC int drbd_asb_recover_1p(drbd_dev *mdev) return rv; } -STATIC int drbd_asb_recover_2p(drbd_dev 
*mdev) +STATIC int drbd_asb_recover_2p(drbd_dev *mdev) __must_hold(local) { int self, peer, hg, rv=-100; @@ -1886,6 +1952,10 @@ STATIC int drbd_asb_recover_2p(drbd_dev *mdev) STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid) { + if (!uuid) { + INFO("%s uuid info vanished while I was looking!\n", text); + return; + } INFO("%s %016llX:%016llX:%016llX:%016llX\n", text, uuid[Current], @@ -1904,7 +1974,7 @@ STATIC void drbd_uuid_dump(drbd_dev *mdev,char* text,u64* uuid) -100 after split brain, disconnect -1000 unrelated data */ -STATIC int drbd_uuid_compare(drbd_dev *mdev, int *rule_nr) +STATIC int drbd_uuid_compare(drbd_dev *mdev, int *rule_nr) __must_hold(local) { u64 self, peer; int i,j; @@ -1988,7 +2058,7 @@ STATIC int drbd_uuid_compare(drbd_dev *mdev, int *rule_nr) conn_mask (-1) on failure. */ STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, - drbd_disks_t peer_disk) + drbd_disks_t peer_disk) __must_hold(local) { int hg,rule_nr; drbd_conns_t rv = conn_mask; @@ -2007,8 +2077,7 @@ STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, ); if (hg == -1000) { - ALERT("Unrelated data, dropping connection!\n"); - drbd_force_state(mdev,NS(conn,Disconnecting)); + ALERT("Unrelated data, aborting!\n"); return conn_mask; } @@ -2064,17 +2133,15 @@ STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, } if (hg == -100) { - ALERT("Split-Brain detected, dropping connection!\n"); + ALERT("Split-Brain detected, aborting!\n"); drbd_uuid_dump(mdev,"self",mdev->bc->md.uuid); drbd_uuid_dump(mdev,"peer",mdev->p_uuid); - drbd_force_state(mdev,NS(conn,Disconnecting)); drbd_khelper(mdev,"split-brain"); return conn_mask; } if (hg > 0 && mydisk <= Inconsistent ) { ERR("I shall become SyncSource, but I am inconsistent!\n"); - drbd_force_state(mdev,NS(conn,Disconnecting)); return conn_mask; } @@ -2086,7 +2153,6 @@ STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, // fall 
through case Disconnect: ERR("I shall become SyncTarget, but I am primary!\n"); - drbd_force_state(mdev,NS(conn,Disconnecting)); return conn_mask; case Violently: WARN("Becoming SyncTarget, violating the stable-data" @@ -2095,17 +2161,9 @@ STATIC drbd_conns_t drbd_sync_handshake(drbd_dev *mdev, drbd_role_t peer_role, } if (abs(hg) >= 2) { - drbd_md_set_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); - - drbd_bm_set_all(mdev); - - if (unlikely(drbd_bm_write(mdev) < 0)) { + INFO("Writing the whole bitmap, full sync required after drbd_sync_handshake.\n"); + if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write)) return conn_mask; - } - - drbd_md_clear_flag(mdev,MDF_FullSync); - drbd_md_sync(mdev); } if (hg > 0) { // become sync source. @@ -2262,6 +2320,7 @@ static void warn_if_differ_considerably(drbd_dev *mdev, const char *s, sector_t STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) { Drbd_Sizes_Packet *p = (Drbd_Sizes_Packet*)h; + enum determin_dev_size_enum dd = unchanged; unsigned int max_seg_s; sector_t p_size, p_usize, my_usize; int ldsc = 0; /* local disk size changed */ @@ -2287,7 +2346,7 @@ STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) #define min_not_zero(l, r) (l == 0) ? r : ((r == 0) ? 
l : min(l, r)) if(inc_local(mdev)) { warn_if_differ_considerably(mdev, "lower level device sizes", - p_size, drbd_get_capacity(mdev->bc->backing_bdev)); + p_size, drbd_get_max_capacity(mdev->bc)); warn_if_differ_considerably(mdev, "user requested size", p_usize, mdev->bc->dc.disk_size); @@ -2312,30 +2371,20 @@ STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) drbd_get_capacity(mdev->this_bdev) && mdev->state.disk >= Outdated && mdev->state.conn < Connected ) { - dec_local(mdev); ERR("The peer's disk size is too small!\n"); drbd_force_state(mdev,NS(conn,Disconnecting)); mdev->bc->dc.disk_size = my_usize; + dec_local(mdev); return FALSE; } dec_local(mdev); } #undef min_not_zero - if(inc_local(mdev)) { - enum determin_dev_size_enum dd; - drbd_bm_lock(mdev); // { + if (inc_local(mdev)) { dd = drbd_determin_dev_size(mdev); - drbd_bm_unlock(mdev); // } dec_local(mdev); if (dd == dev_size_error) return FALSE; - if (dd == grew && mdev->state.conn == Connected && - mdev->state.pdsk >= Inconsistent && - mdev->state.disk >= Inconsistent) { - /* With disk >= Inconsistent we take care to not get - here during an attach while we are connected. */ - resync_after_online_grow(mdev); - } drbd_md_sync(mdev); } else { // I am diskless, need to accept the peer's size. @@ -2346,7 +2395,10 @@ STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) nconn=drbd_sync_handshake(mdev,mdev->state.peer,mdev->state.pdsk); dec_local(mdev); - if(nconn == conn_mask) return FALSE; + if (nconn == conn_mask) { + drbd_force_state(mdev, NS(conn, Disconnecting)); + return FALSE; + } if(drbd_request_state(mdev,NS(conn,nconn)) < SS_Success) { drbd_force_state(mdev,NS(conn,Disconnecting)); @@ -2376,6 +2428,13 @@ STATIC int receive_sizes(drbd_dev *mdev, Drbd_Header *h) // needs to know my new size... 
drbd_send_sizes(mdev); } + if (dd == grew && mdev->state.conn == Connected) { + if (mdev->state.pdsk >= Inconsistent && + mdev->state.disk >= Inconsistent) + resync_after_online_grow(mdev); + else + set_bit(RESYNC_AFTER_NEG, &mdev->flags); + } } return TRUE; @@ -2400,6 +2459,19 @@ STATIC int receive_uuids(drbd_dev *mdev, Drbd_Header *h) if ( mdev->p_uuid ) kfree(mdev->p_uuid); mdev->p_uuid = p_uuid; + if (mdev->state.conn < Connected && + mdev->state.disk < Inconsistent && + mdev->state.role == Primary && + (mdev->ed_uuid & ~((u64)1)) != (p_uuid[Current] & ~((u64)1))) { + ERR("Can only connect to data with current UUID=%016llX\n", + (unsigned long long)mdev->ed_uuid); + drbd_force_state(mdev,NS(conn,Disconnecting)); + return FALSE; + } + + if (mdev->state.conn >= Connected && mdev->state.disk < Inconsistent) + drbd_set_ed_uuid(mdev, p_uuid[Current]); + return TRUE; } @@ -2446,15 +2518,17 @@ STATIC int receive_req_state(drbd_dev *mdev, Drbd_Header *h) mask.i = be32_to_cpu(p->mask); val.i = be32_to_cpu(p->val); - if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_lock(mdev); + if (test_bit(DISCARD_CONCURRENT, &mdev->flags) && + test_bit(CLUSTER_ST_CHANGE, &mdev->flags)) { + drbd_send_sr_reply(mdev, SS_ConcurrentStChg); + return TRUE; + } mask = convert_state(mask); val = convert_state(val); rv = drbd_change_state(mdev,ChgStateVerbose,mask,val); - if (test_bit(DISCARD_CONCURRENT,&mdev->flags)) drbd_state_unlock(mdev); - drbd_send_sr_reply(mdev,rv); drbd_md_sync(mdev); @@ -2466,19 +2540,21 @@ STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h) Drbd_State_Packet *p = (Drbd_State_Packet*)h; drbd_conns_t nconn,oconn; drbd_state_t ns,peer_state; + drbd_disks_t real_peer_disk; int rv; - /** - * Ensure no other thread sends state whilst we are running - **/ - down(&mdev->data.mutex); - - ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) goto fail; + ERR_IF(h->length != (sizeof(*p)-sizeof(*h))) return FALSE; if (drbd_recv(mdev, h->payload, h->length) != h->length) - 
goto fail; + return FALSE; peer_state.i = be32_to_cpu(p->state); + real_peer_disk = peer_state.disk; + if (peer_state.disk == Negotiating) { + real_peer_disk = mdev->p_uuid[UUID_FLAGS] & 4 ? Inconsistent : Consistent; + INFO("real peer disk state = %s\n", disks_to_name(real_peer_disk)); + } + spin_lock_irq(&mdev->req_lock); retry: oconn = nconn = mdev->state.conn; @@ -2497,10 +2573,22 @@ STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h) cr |= test_bit(CONSIDER_RESYNC,&mdev->flags); /* peer forced */ cr |= (oconn == Connected && peer_state.conn > Connected); - if (cr) nconn=drbd_sync_handshake(mdev, peer_state.role, peer_state.disk); + if (cr) nconn=drbd_sync_handshake(mdev, peer_state.role, real_peer_disk); dec_local(mdev); - if(nconn == conn_mask) goto fail; + if (nconn == conn_mask) { + if (mdev->state.disk == Negotiating) { + drbd_force_state(mdev, NS(disk, Diskless)); + nconn = Connected; + } else if (peer_state.disk == Negotiating) { + ERR("Disk attach process on the peer node was aborted.\n"); + peer_state.disk = Diskless; + } else { + D_ASSERT(oconn == WFReportParams); + drbd_force_state(mdev, NS(conn, Disconnecting)); + return FALSE; + } + } } spin_lock_irq(&mdev->req_lock); @@ -2509,30 +2597,28 @@ STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h) ns.i = mdev->state.i; ns.conn = nconn; ns.peer = peer_state.role; - ns.pdsk = peer_state.disk; + ns.pdsk = real_peer_disk; ns.peer_isp = ( peer_state.aftr_isp | peer_state.user_isp ); - if((nconn == Connected || nconn == WFBitMapS) && - ns.disk == Negotiating ) ns.disk = UpToDate; - if((nconn == Connected || nconn == WFBitMapT) && - ns.pdsk == Negotiating ) ns.pdsk = UpToDate; - rv = _drbd_set_state(mdev,ns,ChgStateVerbose|ChgStateHard); + if ((nconn == Connected || nconn == WFBitMapS) && ns.disk == Negotiating) + ns.disk = mdev->new_state_tmp.disk; + rv = _drbd_set_state(mdev, ns, ChgStateVerbose|ChgStateHard, NULL); ns = mdev->state; spin_unlock_irq(&mdev->req_lock); if(rv < SS_Success) { 
drbd_force_state(mdev,NS(conn,Disconnecting)); - goto fail; + return FALSE; } if (oconn > WFReportParams ) { if( nconn > Connected && peer_state.conn <= Connected) { // we want resync, peer has not yet decided to sync... - _drbd_send_uuids(mdev); - _drbd_send_state(mdev); + drbd_send_uuids(mdev); + drbd_send_state(mdev); } else if (nconn == Connected && peer_state.disk == Negotiating) { // peer is waiting for us to respond... - _drbd_send_state(mdev); + drbd_send_state(mdev); } } @@ -2541,11 +2627,7 @@ STATIC int receive_state(drbd_dev *mdev, Drbd_Header *h) /* FIXME assertion for (gencounts do not diverge) */ drbd_md_sync(mdev); // update connected indicator, la_size, ... - up(&mdev->data.mutex); return TRUE; - fail: - up(&mdev->data.mutex); - return FALSE; } STATIC int receive_sync_uuid(drbd_dev *mdev, Drbd_Header *h) @@ -2563,10 +2645,15 @@ STATIC int receive_sync_uuid(drbd_dev *mdev, Drbd_Header *h) /* Here the _drbd_uuid_ functions are right, current should _not_ be rotated into the history */ - _drbd_uuid_set(mdev,Current,be64_to_cpu(p->uuid)); - _drbd_uuid_set(mdev,Bitmap,0UL); + if (inc_local_if_state(mdev, Negotiating)) { + _drbd_uuid_set(mdev,Current,be64_to_cpu(p->uuid)); + _drbd_uuid_set(mdev,Bitmap,0UL); + + drbd_start_resync(mdev,SyncTarget); - drbd_start_resync(mdev,SyncTarget); + dec_local(mdev); + } else + ERR("Ignoring SyncUUID packet!\n"); return TRUE; } @@ -2585,11 +2672,19 @@ STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) unsigned long *buffer; int ok=FALSE; + wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt)); + drbd_bm_lock(mdev); // { bm_words = drbd_bm_words(mdev); - bm_i = 0; - buffer = vmalloc(BM_PACKET_WORDS*sizeof(long)); + bm_i = 0; + /* maybe we should use some per thread scratch page, + * and allocate that during initial device creation? 
*/ + buffer = (unsigned long *) __get_free_page(GFP_NOIO); + if (!buffer) { + ERR("failed to allocate one page buffer in %s\n", __func__ ); + return FALSE; + } while (1) { num_words = min_t(size_t, BM_PACKET_WORDS, bm_words-bm_i ); @@ -2610,9 +2705,10 @@ STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) if (mdev->state.conn == WFBitMapS) { drbd_start_resync(mdev,SyncSource); } else if (mdev->state.conn == WFBitMapT) { - ok = drbd_send_bitmap(mdev); + ok = !drbd_send_bitmap(mdev); if (!ok) goto out; - ok = drbd_request_state(mdev,NS(conn,WFSyncUUID)); + /* Omit ChgOrdered with this state transition to avoid deadlocks. */ + ok = _drbd_request_state(mdev, NS(conn, WFSyncUUID), ChgStateVerbose); D_ASSERT( ok == SS_Success ); } else { ERR("unexpected cstate (%s) in receive_bitmap\n", @@ -2622,7 +2718,7 @@ STATIC int receive_bitmap(drbd_dev *mdev, Drbd_Header *h) ok=TRUE; out: drbd_bm_unlock(mdev); // } - vfree(buffer); + free_page((unsigned long) buffer); return ok; } @@ -2862,7 +2958,7 @@ STATIC void drbd_disconnect(drbd_dev *mdev) // Do not restart in case we are Disconnecting ns = os; ns.conn = Unconnected; - rv=_drbd_set_state(mdev,ns,ChgStateVerbose); + rv=_drbd_set_state(mdev, ns, ChgStateVerbose, NULL); } spin_unlock_irq(&mdev->req_lock); @@ -2912,7 +3008,7 @@ STATIC void drbd_disconnect(drbd_dev *mdev) * * for now, they are expected to be zero, but ignored. */ -int drbd_send_handshake(drbd_dev *mdev) +STATIC int drbd_send_handshake(drbd_dev *mdev) { // ASSERT current == mdev->receiver ... 
Drbd_HandShake_Packet *p = &mdev->data.sbuf.HandShake; @@ -3149,7 +3245,7 @@ STATIC int drbd_do_auth(drbd_dev *mdev) } #endif -int drbdd_init(struct Drbd_thread *thi) +STATIC int drbdd_init(struct Drbd_thread *thi) { drbd_dev *mdev = thi->mdev; int minor = mdev_to_minor(mdev); @@ -3205,8 +3301,8 @@ STATIC int got_RqSReply(drbd_dev *mdev, Drbd_Header* h) set_bit(CL_ST_CHG_SUCCESS,&mdev->flags); } else { set_bit(CL_ST_CHG_FAIL,&mdev->flags); - ERR("Requested state change failed by peer: %s\n", - set_st_err_name(retcode)); + ERR("Requested state change failed by peer: %s (%d)\n", + set_st_err_name(retcode), retcode); } wake_up(&mdev->state_wait); @@ -3291,7 +3387,6 @@ STATIC int got_NegAck(drbd_dev *mdev, Drbd_Header* h) update_peer_seq(mdev,be32_to_cpu(p->seq_num)); if(is_syncer_block_id(p->block_id)) { - sector_t sector = be64_to_cpu(p->sector); int size = be32_to_cpu(p->blksize); dec_rs_pending(mdev); @@ -3328,16 +3423,11 @@ STATIC int got_NegDReply(drbd_dev *mdev, Drbd_Header* h) return FALSE; } - /* FIXME explicitly warn if protocol != C */ - - ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n", - (unsigned long long)sector,be32_to_cpu(p->blksize)); - _req_mod(req, neg_acked, 0); spin_unlock_irq(&mdev->req_lock); -// warning LGE "ugly and wrong" - drbd_khelper(mdev,"pri-on-incon-degr"); + ERR("Got NegDReply; Sector %llus, len %u; Fail original request.\n", + (unsigned long long)sector, be32_to_cpu(p->blksize)); return TRUE; } @@ -3405,7 +3495,7 @@ static struct asender_cmd* get_asender_cmd(int cmd) return &asender_tbl[cmd]; } -int drbd_asender(struct Drbd_thread *thi) +STATIC int drbd_asender(struct Drbd_thread *thi) { drbd_dev *mdev = thi->mdev; Drbd_Header *h = &mdev->meta.rbuf.head; @@ -3429,21 +3519,30 @@ int drbd_asender(struct Drbd_thread *thi) mdev->net_conf->ping_timeo*HZ/10; } - while(1) { + drbd_tcp_cork(mdev->meta.socket); + while (1) { + clear_bit(SIGNAL_ASENDER, &mdev->flags); + flush_signals(current); if 
(!drbd_process_done_ee(mdev)) { ERR("process_done_ee() = NOT_OK\n"); goto reconnect; } + /* to avoid race with newly queued ACKs */ set_bit(SIGNAL_ASENDER, &mdev->flags); spin_lock_irq(&mdev->req_lock); empty = list_empty(&mdev->done_ee); spin_unlock_irq(&mdev->req_lock); + /* new ack may have been queued right here, + * but then there is also a signal pending, + * and we start over... */ if (empty && !test_bit(WRITE_ACK_PENDING, &mdev->flags)) break; - clear_bit(SIGNAL_ASENDER, &mdev->flags); - flush_signals(current); } - drbd_tcp_flush(mdev->meta.socket); + drbd_tcp_uncork(mdev->meta.socket); + + /* short circuit, recv_msg would return EINTR anyways. */ + if (signal_pending(current)) + continue; rv = drbd_recv_short(mdev, mdev->meta.socket, buf, expect-received, 0); @@ -3451,8 +3550,6 @@ int drbd_asender(struct Drbd_thread *thi) flush_signals(current); - drbd_tcp_cork(mdev->meta.socket); - /* Note: * -EINTR (on meta) we got a signal * -EAGAIN (on meta) rcvtimeo expired diff --git a/ubuntu/block/drbd/drbd_req.c b/ubuntu/block/drbd/drbd_req.c index 21d005b..645be94 100644 --- a/ubuntu/block/drbd/drbd_req.c +++ b/ubuntu/block/drbd/drbd_req.c @@ -35,7 +35,7 @@ /* outside of the ifdef * because of the _print_rq_state(,FIXME) in barrier_acked */ -void _print_rq_state(drbd_request_t *req, const char *txt) +STATIC void _print_rq_state(drbd_request_t *req, const char *txt) { const unsigned long s = req->rq_state; drbd_dev *mdev = req->mdev; @@ -61,7 +61,7 @@ void _print_rq_state(drbd_request_t *req, const char *txt) //#define VERBOSE_REQUEST_CODE #if defined(VERBOSE_REQUEST_CODE) || defined(ENABLE_DYNAMIC_TRACE) -void _print_req_mod(drbd_request_t *req,drbd_req_event_t what) +STATIC void _print_req_mod(drbd_request_t *req,drbd_req_event_t what) { drbd_dev *mdev = req->mdev; const int rw = (req->master_bio == NULL || @@ -105,6 +105,42 @@ void _print_req_mod(drbd_request_t *req,drbd_req_event_t what) #define print_req_mod(T,W) #endif +/* We only support diskstats for 2.6.16 
and up. + * see also commit commit a362357b6cd62643d4dda3b152639303d78473da + * Author: Jens Axboe + * Date: Tue Nov 1 09:26:16 2005 +0100 + * [BLOCK] Unify the seperate read/write io stat fields into arrays */ +#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,16) +#define _drbd_start_io_acct(...) do {} while (0) +#define _drbd_end_io_acct(...) do {} while (0) +#else + +/* Update disk stats at start of I/O request */ +static inline void _drbd_start_io_acct(drbd_dev *mdev, drbd_request_t *req, struct bio *bio) +{ + const int rw = bio_data_dir(bio); + + MUST_HOLD(&mdev->req_lock) + __disk_stat_inc(mdev->vdisk, ios[rw]); + __disk_stat_add(mdev->vdisk, sectors[rw], bio_sectors(bio)); + disk_round_stats(mdev->vdisk); + mdev->vdisk->in_flight++; +} + +/* Update disk stats when completing request upwards */ +static inline void _drbd_end_io_acct(drbd_dev *mdev, drbd_request_t *req) +{ + int rw = bio_data_dir(req->master_bio); + unsigned long duration = jiffies - req->start_time; + + MUST_HOLD(&mdev->req_lock) + __disk_stat_add(mdev->vdisk, ticks[rw], duration); + disk_round_stats(mdev->vdisk); + mdev->vdisk->in_flight--; +} + +#endif + static void _req_is_done(drbd_dev *mdev, drbd_request_t *req, const int rw) { const unsigned long s = req->rq_state; @@ -335,6 +371,9 @@ void _req_may_be_done(drbd_request_t *req, int error) * then again, if it is a READ, it is not in the TL at all. * is it still leagal to complete a READ during freeze? */ + /* Update disk stats */ + _drbd_end_io_acct(mdev, req); + _complete_master_bio(mdev,req, ok ? 0 : ( error ? 
error : -EIO ) ); } else { @@ -956,6 +995,9 @@ drbd_make_request_common(drbd_dev *mdev, struct bio *bio) } + /* Update disk stats */ + _drbd_start_io_acct(mdev, req, bio); + /* _maybe_start_new_epoch(mdev); * If we need to generate a write barrier packet, we have to add the * new epoch (barrier) object, and queue the barrier packet for sending, @@ -968,8 +1010,7 @@ drbd_make_request_common(drbd_dev *mdev, struct bio *bio) * barrier packet, this request is queued within the same spinlock. */ if (remote && mdev->unused_spare_barrier && test_and_clear_bit(CREATE_BARRIER, &mdev->flags)) { - struct drbd_barrier *b = mdev->unused_spare_barrier; - _tl_add_barrier(mdev, b); + _tl_add_barrier(mdev, mdev->unused_spare_barrier); mdev->unused_spare_barrier = NULL; } else { D_ASSERT(!(remote && rw == WRITE && @@ -1012,12 +1053,13 @@ drbd_make_request_common(drbd_dev *mdev, struct bio *bio) local = 0; } if (remote) dec_ap_pending(mdev); + _drbd_end_io_acct(mdev, req); /* THINK: do we want to fail it (-EIO), or pretend success? */ bio_endio(req->master_bio, 0); req->master_bio = NULL; dec_ap_bio(mdev); drbd_req_free(req); - local = remote = 0; + remote = 0; } /* NOTE remote first: to get the concurrent write detection right, @@ -1034,8 +1076,6 @@ drbd_make_request_common(drbd_dev *mdev, struct bio *bio) if (b) kfree(b); /* if someone else has beaten us to it... */ if (local) { - /* FIXME what ref count do we have to ensure the backing_bdev - * was not detached below us? */ req->private_bio->bi_bdev = mdev->bc->backing_bdev; dump_internal_bio("Pri", mdev, req->private_bio, 0); @@ -1091,8 +1131,7 @@ static int drbd_fail_request_early(drbd_dev* mdev, int is_write) * to serialize state changes, this is racy, since we may lose * the connection *after* we test for the cstate. 
*/ - if ( mdev->state.disk < UpToDate && - mdev->state.conn < Connected) { + if (mdev->state.disk < UpToDate && mdev->state.pdsk < UpToDate) { if (DRBD_ratelimit(5*HZ,5)) { ERR("Sorry, I have no access to good data anymore.\n"); } diff --git a/ubuntu/block/drbd/drbd_req.h b/ubuntu/block/drbd/drbd_req.h index 33fc30c..d106463 100644 --- a/ubuntu/block/drbd/drbd_req.h +++ b/ubuntu/block/drbd/drbd_req.h @@ -284,12 +284,13 @@ static inline drbd_request_t* drbd_req_new(drbd_dev *mdev, struct bio *bio_src) req->epoch = 0; req->sector = bio->bi_sector; req->size = bio->bi_size; + req->start_time = jiffies; INIT_HLIST_NODE(&req->colision); INIT_LIST_HEAD(&req->tl_requests); bio->bi_private = req; bio->bi_end_io = drbd_endio_pri; - bio->bi_next = 0; + bio->bi_next = NULL; } return req; } diff --git a/ubuntu/block/drbd/drbd_strings.c b/ubuntu/block/drbd/drbd_strings.c index 25f3cdf..dc6ef1d 100644 --- a/ubuntu/block/drbd/drbd_strings.c +++ b/ubuntu/block/drbd/drbd_strings.c @@ -82,7 +82,9 @@ static const char *drbd_state_sw_errors[] = { "Device is diskless, the requesed operation requires a disk", [-SS_DeviceInUse] = "Device is held open by someone", [-SS_NoNetConfig] = "Have no net/connection configuration", - [-SS_LowerThanOutdated] = "Disk state is lower than outdated" + [-SS_LowerThanOutdated] = "Disk state is lower than outdated", + [-SS_InTransientState] = "In transient state, retry after next state change", + [-SS_ConcurrentStChg] = "Concurrent state changes detected and aborted" }; const char* conns_to_name(drbd_conns_t s) { @@ -102,7 +104,7 @@ const char* disks_to_name(drbd_disks_t s) { } const char* set_st_err_name(set_st_err_t err) { - return err < SS_LowerThanOutdated ? "TOO_SMALL" : + return err <= SS_AfterLastError ? "TOO_SMALL" : err > SS_TwoPrimaries ? 
"TOO_LARGE" : drbd_state_sw_errors[-err]; } diff --git a/ubuntu/block/drbd/drbd_worker.c b/ubuntu/block/drbd/drbd_worker.c index 793e378..e564926 100644 --- a/ubuntu/block/drbd/drbd_worker.c +++ b/ubuntu/block/drbd/drbd_worker.c @@ -62,7 +62,7 @@ /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ -BIO_ENDIO_FN(drbd_md_io_complete) +BIO_ENDIO_TYPE drbd_md_io_complete BIO_ENDIO_ARGS(struct bio *bio, int error) { struct drbd_md_io *md_io; @@ -83,7 +83,7 @@ BIO_ENDIO_FN(drbd_md_io_complete) /* reads on behalf of the partner, * "submitted" by the receiver */ -BIO_ENDIO_FN(drbd_endio_read_sec) +BIO_ENDIO_TYPE drbd_endio_read_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __releases(local) { unsigned long flags=0; struct Tl_epoch_entry *e=NULL; @@ -129,7 +129,7 @@ BIO_ENDIO_FN(drbd_endio_read_sec) /* writes on behalf of the partner, or resync writes, * "submitted" by the receiver. */ -BIO_ENDIO_FN(drbd_endio_write_sec) +BIO_ENDIO_TYPE drbd_endio_write_sec BIO_ENDIO_ARGS(struct bio *bio, int error) __releases(local) { unsigned long flags=0; struct Tl_epoch_entry *e=NULL; @@ -203,7 +203,7 @@ BIO_ENDIO_FN(drbd_endio_write_sec) /* read, readA or write requests on Primary comming from drbd_make_request */ -BIO_ENDIO_FN(drbd_endio_pri) +BIO_ENDIO_TYPE drbd_endio_pri BIO_ENDIO_ARGS(struct bio *bio, int error) { unsigned long flags; drbd_request_t *req=bio->bi_private; @@ -273,7 +273,6 @@ int w_read_retry_remote(drbd_dev* mdev, struct drbd_work* w,int cancel) mdev->state.pdsk <= Inconsistent ) { _req_mod(req, send_canceled, 0); /* FIXME freeze? ... */ spin_unlock_irq(&mdev->req_lock); - drbd_khelper(mdev,"pri-on-incon-degr"); /* FIXME REALLY? */ ALERT("WE ARE LOST. 
Local IO failure, no peer.\n"); return 1; } @@ -457,13 +456,11 @@ int w_make_resync_request(drbd_dev* mdev, struct drbd_work* w,int cancel) return 1; } -int w_resync_finished(drbd_dev *mdev, struct drbd_work *w, int cancel) +STATIC int w_resync_finished(drbd_dev *mdev, struct drbd_work *w, int cancel) { kfree(w); - drbd_bm_lock(mdev); drbd_resync_finished(mdev); - drbd_bm_unlock(mdev); return 1; } @@ -471,7 +468,7 @@ int w_resync_finished(drbd_dev *mdev, struct drbd_work *w, int cancel) int drbd_resync_finished(drbd_dev* mdev) { unsigned long db,dt,dbdt; - int dstate, pdstate; + drbd_state_t os, ns; struct drbd_work *w; // Remove all elements from the resync LRU. Since future actions @@ -500,6 +497,21 @@ int drbd_resync_finished(drbd_dev* mdev) db = mdev->rs_total; dbdt = Bit2KB(db/dt); mdev->rs_paused /= HZ; + + if (!inc_local(mdev)) + goto out; + + spin_lock_irq(&mdev->req_lock); + os = mdev->state; + + /* This protects us against multiple calls (that can happen in the presence + of application IO), and against connectivity loss just before we arrive here. 
*/ + if (os.conn <= Connected) + goto out_unlock; + + ns = os; + ns.conn = Connected; + INFO("Resync done (total %lu sec; paused %lu sec; %lu K/sec)\n", dt + mdev->rs_paused, mdev->rs_paused, dbdt); @@ -508,19 +520,18 @@ int drbd_resync_finished(drbd_dev* mdev) if (mdev->rs_failed) { INFO(" %lu failed blocks\n",mdev->rs_failed); - if (mdev->state.conn == SyncTarget || - mdev->state.conn == PausedSyncT) { - dstate = Inconsistent; - pdstate = UpToDate; + if (os.conn == SyncTarget || os.conn == PausedSyncT) { + ns.disk = Inconsistent; + ns.pdsk = UpToDate; } else { - dstate = UpToDate; - pdstate = Inconsistent; + ns.disk = UpToDate; + ns.pdsk = Inconsistent; } } else { - dstate = pdstate = UpToDate; + ns.disk = UpToDate; + ns.pdsk = UpToDate; - if (mdev->state.conn == SyncTarget || - mdev->state.conn == PausedSyncT) { + if (os.conn == SyncTarget || os.conn == PausedSyncT) { if( mdev->p_uuid ) { int i; for ( i=Bitmap ; i<=History_end ; i++ ) { @@ -545,23 +556,22 @@ int drbd_resync_finished(drbd_dev* mdev) } } + _drbd_set_state(mdev, ns, ChgStateVerbose, NULL); + out_unlock: + spin_unlock_irq(&mdev->req_lock); + dec_local(mdev); + out: mdev->rs_total = 0; mdev->rs_failed = 0; mdev->rs_paused = 0; if (test_and_clear_bit(WRITE_BM_AFTER_RESYNC,&mdev->flags)) { WARN("Writing the whole bitmap, due to failed kmalloc\n"); - drbd_bm_write(mdev); + drbd_queue_bitmap_io(mdev, &drbd_bm_write, NULL); } drbd_bm_recount_bits(mdev); - drbd_request_state(mdev,NS3(conn,Connected, - disk,dstate, - pdsk,pdstate)); - - drbd_md_sync(mdev); - return 1; } @@ -758,28 +768,32 @@ int w_send_read_req(drbd_dev *mdev, struct drbd_work *w, int cancel) return ok; } -STATIC void drbd_global_lock(void) +STATIC void drbd_global_lock(void) __acquires(drbd_global_lock) { drbd_dev *mdev; int i; + __acquire(drbd_global_lock); local_irq_disable(); for (i=0; i < minor_count; i++) { if(!(mdev = minor_to_mdev(i))) continue; spin_lock(&mdev->req_lock); + __release(&mdev->req_lock); /* annihilate the spin_lock's 
annotation here */ } } -STATIC void drbd_global_unlock(void) +STATIC void drbd_global_unlock(void) __releases(drbd_global_lock) { drbd_dev *mdev; int i; for (i=0; i < minor_count; i++) { if(!(mdev = minor_to_mdev(i))) continue; + __acquire(&mdev->req_lock); spin_unlock(&mdev->req_lock); } local_irq_enable(); + __release(drbd_global_lock); } STATIC int _drbd_may_sync_now(drbd_dev *mdev) @@ -812,7 +826,7 @@ STATIC int _drbd_pause_after(drbd_dev *mdev) if (!(odev = minor_to_mdev(i)) || (odev->state.conn == StandAlone && odev->state.disk == Diskless) ) continue; if (! _drbd_may_sync_now(odev)) { - rv |= ( _drbd_set_state(_NS(odev,aftr_isp,1),ChgStateHard) + rv |= ( _drbd_set_state(_NS(odev, aftr_isp, 1), ChgStateHard, NULL) != SS_NothingToDo ) ; } } @@ -835,7 +849,7 @@ STATIC int _drbd_resume_next(drbd_dev *mdev) if( !(odev = minor_to_mdev(i)) ) continue; if ( odev->state.aftr_isp ) { if (_drbd_may_sync_now(odev)) { - rv |= ( _drbd_set_state(_NS(odev,aftr_isp,0),ChgStateHard) + rv |= ( _drbd_set_state(_NS(odev,aftr_isp,0),ChgStateHard,NULL) != SS_NothingToDo ) ; } } @@ -895,6 +909,13 @@ void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side) /* In case a previous resync run was aborted by an IO error... 
*/ drbd_rs_cancel_all(mdev); + drbd_state_lock(mdev); + + if (!inc_local_if_state(mdev, Negotiating)) { + drbd_state_unlock(mdev); + return; + } + if(side == SyncTarget) { drbd_bm_reset_find(mdev); } else /* side == SyncSource */ { @@ -920,9 +941,12 @@ void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side) ns.pdsk = Inconsistent; } - r = _drbd_set_state(mdev,ns,ChgStateVerbose); + r = _drbd_set_state(mdev,ns,ChgStateVerbose,NULL); ns = mdev->state; + if (ns.conn < Connected) + r = SS_UnknownError; + if ( r == SS_Success ) { mdev->rs_total = mdev->rs_mark_left = drbd_bm_total_weight(mdev); @@ -933,6 +957,8 @@ void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side) _drbd_pause_after(mdev); } drbd_global_unlock(); + drbd_state_unlock(mdev); + dec_local(mdev); if ( r == SS_Success ) { INFO("Began resync as %s (will sync %lu KB [%lu bits set]).\n", @@ -957,7 +983,7 @@ void drbd_start_resync(drbd_dev *mdev, drbd_conns_t side) int drbd_worker(struct Drbd_thread *thi) { drbd_dev *mdev = thi->mdev; - struct drbd_work *w = 0; + struct drbd_work *w = NULL; LIST_HEAD(work_list); int intr=0,i; @@ -967,7 +993,8 @@ int drbd_worker(struct Drbd_thread *thi) if(down_trylock(&mdev->data.work.s)) { down(&mdev->data.mutex); - if(mdev->data.socket)drbd_tcp_flush(mdev->data.socket); + if (mdev->data.socket) + drbd_tcp_uncork(mdev->data.socket); up(&mdev->data.mutex); intr = down_interruptible(&mdev->data.work.s); @@ -990,7 +1017,7 @@ int drbd_worker(struct Drbd_thread *thi) the entry from the list. The cleanup code takes care of this... */ - w = 0; + w = NULL; spin_lock_irq(&mdev->data.work.q_lock); ERR_IF(list_empty(&mdev->data.work.q)) { /* something terribly wrong in our logic. 
diff --git a/ubuntu/block/drbd/linux/connector.h b/ubuntu/block/drbd/linux/connector.h index 404719b..b6943c5 100644 --- a/ubuntu/block/drbd/linux/connector.h +++ b/ubuntu/block/drbd/linux/connector.h @@ -96,8 +96,9 @@ struct cn_ctl_msg { #ifdef __KERNEL__ #include "drbd_config.h" -#if !defined(KERNEL_HAS_GFP_T) -typedef unsigned int gfp_t; +#ifndef KERNEL_HAS_GFP_T +#define KERNEL_HAS_GFP_T +typedef unsigned gfp_t; #endif #include <asm/atomic.h> diff --git a/ubuntu/block/drbd/linux/drbd.h b/ubuntu/block/drbd/linux/drbd.h index e54c1e9..9afda37 100644 --- a/ubuntu/block/drbd/linux/drbd.h +++ b/ubuntu/block/drbd/linux/drbd.h @@ -129,6 +129,16 @@ enum ret_codes { HaveNoDiskConfig, ProtocolCRequired, VMallocFailed, + IntegrityAlgNotAvail, /* DRBD 8.2 only */ + IntegrityAlgNotDigest, /* DRBD 8.2 only */ + CPUMaskParseFailed, /* DRBD 8.2 only */ + CSUMSAlgNotAvail, /* DRBD 8.2 only */ + CSUMSAlgNotDigest, /* DRBD 8.2 only */ + VERIFYAlgNotAvail, /* DRBD 8.2 only */ + VERIFYAlgNotDigest, /* DRBD 8.2 only */ + CSUMSResyncRunning, /* DRBD 8.2 only */ + VERIFYIsRunning, /* DRBD 8.2 only */ + DataOfWrongCurrent, /* insert new ones above this line */ AfterLastRetCode @@ -232,24 +242,30 @@ typedef union { } drbd_state_t; typedef enum { - SS_CW_NoNeed=4, - SS_CW_Success=3, - SS_NothingToDo=2, - SS_Success=1, - SS_UnknownError=0, // Used to sleep longer in _drbd_request_state - SS_TwoPrimaries=-1, - SS_NoUpToDateDisk=-2, - SS_BothInconsistent=-4, - SS_SyncingDiskless=-5, - SS_ConnectedOutdates=-6, - SS_PrimaryNOP=-7, - SS_ResyncRunning=-8, - SS_AlreadyStandAlone=-9, - SS_CW_FailedByPeer=-10, - SS_IsDiskLess=-11, - SS_DeviceInUse=-12, - SS_NoNetConfig=-13, - SS_LowerThanOutdated=-14 + SS_CW_NoNeed = 4, + SS_CW_Success = 3, + SS_NothingToDo = 2, + SS_Success = 1, + SS_UnknownError = 0, // Used to sleep longer in _drbd_request_state + SS_TwoPrimaries = -1, + SS_NoUpToDateDisk = -2, + SS_BothInconsistent = -4, + SS_SyncingDiskless = -5, + SS_ConnectedOutdates = -6, + SS_PrimaryNOP = -7, + 
SS_ResyncRunning = -8, + SS_AlreadyStandAlone = -9, + SS_CW_FailedByPeer = -10, + SS_IsDiskLess = -11, + SS_DeviceInUse = -12, + SS_NoNetConfig = -13, + SS_NoVerifyAlg = -14, /* drbd-8.2 only */ + SS_NeedConnection = -15, /* drbd-8.2 only */ + SS_LowerThanOutdated = -16, + SS_NotSupported = -17, /* drbd-8.2 only */ + SS_InTransientState = -18, /* Retry after the next state change */ + SS_ConcurrentStChg = -19, /* Concurrent cluster side state change! */ + SS_AfterLastError = -20, /* Keep this at bottom */ } set_st_err_t; /* from drbd_strings.c */ diff --git a/ubuntu/block/drbd/linux/drbd_config.h b/ubuntu/block/drbd/linux/drbd_config.h index b0b345d..59a7846 100644 --- a/ubuntu/block/drbd/linux/drbd_config.h +++ b/ubuntu/block/drbd/linux/drbd_config.h @@ -22,14 +22,16 @@ extern const char * drbd_buildtag(void); -#define REL_VERSION "8.0.11" +#define REL_VERSION "8.0.13" #define API_VERSION 86 #define PRO_VERSION 86 // undef if you need the workaround in drbd_receiver #define HAVE_UML_TO_VIRT 1 +#ifndef __CHECKER__ /* for a sparse run, we need all STATICs */ #define DBG_ALL_SYMBOLS // no static functs, improves quality of OOPS traces +#endif //#define DBG_SPINLOCKS // enables MUST_HOLD macro (assertions for spinlocks) //#define DBG_ASSERTS // drbd_assert_breakpoint() function diff --git a/ubuntu/block/drbd/linux/drbd_limits.h b/ubuntu/block/drbd/linux/drbd_limits.h index fff9a36..8deedd4 100644 --- a/ubuntu/block/drbd/linux/drbd_limits.h +++ b/ubuntu/block/drbd/linux/drbd_limits.h @@ -65,7 +65,7 @@ #define DRBD_MAX_EPOCH_SIZE_DEF 2048 /* I don't think that a tcp send buffer of more than 10M is usefull */ -#define DRBD_SNDBUF_SIZE_MIN 1 +#define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX 10000000 #define DRBD_SNDBUF_SIZE_DEF (2*65535) @@ -107,10 +107,11 @@ /* drbdsetup XY resize -d Z * you are free to reduce the device size to nothing, if you want to. 
- * but more than 3998G are currently not possible */ + * the upper limit with 64bit kernel, enough ram and flexible meta data + * is 8 TB, currently. */ /* DRBD_MAX_SECTORS */ #define DRBD_DISK_SIZE_SECT_MIN 0 -#define DRBD_DISK_SIZE_SECT_MAX ((128LLU*1024*2 - 72)*512LLU*8*8) +#define DRBD_DISK_SIZE_SECT_MAX (8 * (2LLU << 30)) #define DRBD_DISK_SIZE_SECT_DEF 0 // = disabled = no user size... #define DRBD_ON_IO_ERROR_DEF PassOn diff --git a/ubuntu/block/drbd/linux/drbd_nl.h b/ubuntu/block/drbd/linux/drbd_nl.h index b0e770c..6afe92a 100644 --- a/ubuntu/block/drbd/linux/drbd_nl.h +++ b/ubuntu/block/drbd/linux/drbd_nl.h @@ -25,6 +25,8 @@ NL_PACKET(disk_conf, 3, NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) NL_INTEGER( 7, T_MAY_IGNORE, fencing) NL_BIT( 37, T_MAY_IGNORE, use_bmbv) + NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) + NL_BIT( 54, T_MAY_IGNORE, no_md_flush) ) NL_PACKET(detach, 4,) @@ -48,9 +50,13 @@ NL_PACKET(net_conf, 5, NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) + /* 59 addr_family was available in GIT, never released */ + /* drbd-8.2: NL_BIT( 60, T_MANDATORY, mind_af) */ NL_BIT( 27, T_MAY_IGNORE, want_lose) NL_BIT( 28, T_MAY_IGNORE, two_primaries) NL_BIT( 41, T_MAY_IGNORE, always_asbp) + /* drbd-8.2: NL_BIT( 61, T_MAY_IGNORE, no_cork) */ + NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) ) NL_PACKET(disconnect, 6, ) diff --git a/ubuntu/block/drbd/linux/drbd_tag_magic.h b/ubuntu/block/drbd/linux/drbd_tag_magic.h index 8295ad1..e2e4676 100644 --- a/ubuntu/block/drbd/linux/drbd_tag_magic.h +++ b/ubuntu/block/drbd/linux/drbd_tag_magic.h @@ -25,7 +25,7 @@ enum packet_types { #include "drbd_nl.h" // declate tag-list-sizes -const int tag_list_sizes[] = { +static const int tag_list_sizes[] = { #define NL_PACKET(name,number,fields) 2 fields , #define NL_INTEGER(pn,pr,member) +4+4 #define NL_INT64(pn,pr,member) +4+8 @@ -67,7 +67,7 @@ struct tag { // declare tag names #define NL_PACKET(name, 
number, fields) fields -const struct tag tag_descriptions[] = { +static const struct tag tag_descriptions[] = { #define NL_INTEGER(pn,pr,member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, #define NL_INT64(pn,pr,member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, #define NL_BIT(pn,pr,member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, diff --git a/ubuntu/block/drbd/lru_cache.c b/ubuntu/block/drbd/lru_cache.c index 0f69307..efd2b5f 100644 --- a/ubuntu/block/drbd/lru_cache.c +++ b/ubuntu/block/drbd/lru_cache.c @@ -38,6 +38,37 @@ #define PARANOIA_LEAVE() do { clear_bit(__LC_PARANOIA,&lc->flags); smp_mb__after_clear_bit(); } while (0) #define RETURN(x...) do { PARANOIA_LEAVE(); return x ; } while (0) +static inline size_t size_of_lc(unsigned int e_count, size_t e_size) +{ + return sizeof(struct lru_cache) + + e_count * (e_size + sizeof(struct hlist_head)); +} + +static inline void lc_init(struct lru_cache *lc, + const size_t bytes, const char *name, + const unsigned int e_count, const size_t e_size, + void *private_p) +{ + struct lc_element *e; + unsigned int i; + + memset(lc, 0, bytes); + INIT_LIST_HEAD(&lc->in_use); + INIT_LIST_HEAD(&lc->lru); + INIT_LIST_HEAD(&lc->free); + lc->element_size = e_size; + lc->nr_elements = e_count; + lc->new_number = -1; + lc->lc_private = private_p; + lc->name = name; + for (i = 0; i < e_count; i++) { + e = lc_entry(lc, i); + e->lc_number = LC_FREE; + list_add(&e->list, &lc->free); + // memset(,0,) did the rest of init for us + } +} + /** * lc_alloc: allocates memory for @e_count objects of @e_size bytes plus the * struct lru_cache, and the hash table slots. 
@@ -46,34 +77,15 @@ struct lru_cache* lc_alloc(const char *name, unsigned int e_count, size_t e_size, void *private_p) { - unsigned long bytes; struct lru_cache *lc; - struct lc_element *e; - int i; + size_t bytes; BUG_ON(!e_count); e_size = max(sizeof(struct lc_element),e_size); - bytes = e_size+sizeof(struct hlist_head); - bytes *= e_count; - bytes += sizeof(struct lru_cache); - lc = vmalloc(bytes); - if (lc) { - memset(lc, 0, bytes); - INIT_LIST_HEAD(&lc->in_use); - INIT_LIST_HEAD(&lc->lru); - INIT_LIST_HEAD(&lc->free); - lc->element_size = e_size; - lc->nr_elements = e_count; - lc->new_number = -1; - lc->lc_private = private_p; - lc->name = name; - for(i=0;i<e_count;i++) { - e = lc_entry(lc,i); - e->lc_number = LC_FREE; - list_add(&e->list,&lc->free); - // memset(,0,) did the rest of init for us - } - } + bytes = size_of_lc(e_count, e_size); + lc = vmalloc(bytes); + if (lc) + lc_init(lc, bytes, name, e_count, e_size, private_p); return lc; } @@ -86,6 +98,17 @@ void lc_free(struct lru_cache* lc) vfree(lc); } +/** + * lc_reset: does a full reset for @lc and the hash table slots. 
+ * It is roughly the equivalent of re-allocating a fresh lru_cache object, + * basically a short cut to lc_free(lc); lc = lc_alloc(...); + */ +void lc_reset(struct lru_cache *lc) +{ + lc_init(lc, size_of_lc(lc->nr_elements, lc->element_size), lc->name, + lc->nr_elements, lc->element_size, lc->lc_private); +} + size_t lc_printf_stats(struct seq_file *seq, struct lru_cache* lc) { /* NOTE: @@ -130,7 +153,7 @@ STATIC struct lc_element * lc_evict(struct lru_cache* lc) struct list_head *n; struct lc_element *e; - if (list_empty(&lc->lru)) return 0; + if (list_empty(&lc->lru)) return NULL; n=lc->lru.prev; e=list_entry(n, struct lc_element,list); diff --git a/ubuntu/block/drbd/lru_cache.h b/ubuntu/block/drbd/lru_cache.h index 14f6eaa..1398b75 100644 --- a/ubuntu/block/drbd/lru_cache.h +++ b/ubuntu/block/drbd/lru_cache.h @@ -100,6 +100,7 @@ enum { extern struct lru_cache* lc_alloc(const char *name, unsigned int e_count, size_t e_size, void *private_p); +extern void lc_reset(struct lru_cache *lc); extern void lc_free(struct lru_cache* lc); extern void lc_set (struct lru_cache* lc, unsigned int enr, int index); extern void lc_del (struct lru_cache* lc, struct lc_element *element); -- 1.5.4.3