fs/btrfs/raid56.c

   1 /*
   2  * Copyright (C) 2012 Fusion-io  All rights reserved.
   3  * Copyright (C) 2012 Intel Corp. All rights reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public
   7  * License v2 as published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this program; if not, write to the
  16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  17  * Boston, MA 021110-1307, USA.
  18  */
  19 #include <linux/sched.h>
  20 #include <linux/wait.h>
  21 #include <linux/bio.h>
  22 #include <linux/slab.h>
  23 #include <linux/buffer_head.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/random.h>
  26 #include <linux/iocontext.h>
  27 #include <linux/capability.h>
  28 #include <linux/ratelimit.h>
  29 #include <linux/kthread.h>
  30 #include <linux/raid/pq.h>
  31 #include <linux/hash.h>
  32 #include <linux/list_sort.h>
  33 #include <linux/raid/xor.h>
  34 #include <asm/div64.h>
  35 #include "compat.h"
  36 #include "ctree.h"
  37 #include "extent_map.h"
  38 #include "disk-io.h"
  39 #include "transaction.h"
  40 #include "print-tree.h"
  41 #include "volumes.h"
  42 #include "raid56.h"
  43 #include "async-thread.h"
  44 #include "check-integrity.h"
  45 #include "rcu-string.h"
  46
  47 /* set when additional merges to this rbio are not allowed */
  48 #define RBIO_RMW_LOCKED_BIT     1
  49
  50 /*
  51  * set when this rbio is sitting in the hash, but it is just a cache
  52  * of past RMW
  53  */
  54 #define RBIO_CACHE_BIT          2
  55
  56 /*
  57  * set when it is safe to trust the stripe_pages for caching
  58  */
  59 #define RBIO_CACHE_READY_BIT    3
  60
  61
  62 #define RBIO_CACHE_SIZE 1024
  63
  64 struct btrfs_raid_bio {
  65         struct btrfs_fs_info *fs_info;
  66         struct btrfs_bio *bbio;
  67
  68         /*
  69          * logical block numbers for the start of each stripe
  70          * The last one or two are p/q.  These are sorted,
  71          * so raid_map[0] is the start of our full stripe
  72          */
  73         u64 *raid_map;
  74
  75         /* while we're doing rmw on a stripe
  76          * we put it into a hash table so we can
  77          * lock the stripe and merge more rbios
  78          * into it.
  79          */
  80         struct list_head hash_list;
  81
  82         /*
  83          * LRU list for the stripe cache
  84          */
  85         struct list_head stripe_cache;
  86
  87         /*
  88          * for scheduling work in the helper threads
  89          */
  90         struct btrfs_work work;
  91
  92         /*
  93          * bio list and bio_list_lock are used
  94          * to add more bios into the stripe
  95          * in hopes of avoiding the full rmw
  96          */
  97         struct bio_list bio_list;
  98         spinlock_t bio_list_lock;
  99
 100         /* also protected by the bio_list_lock, the
 101          * plug list is used by the plugging code
 102          * to collect partial bios while plugged.  The
 103          * stripe locking code also uses it to hand off
 104          * the stripe lock to the next pending IO
 105          */
 106         struct list_head plug_list;
 107
 108         /*
 109          * flags that tell us if it is safe to
 110          * merge with this bio
 111          */
 112         unsigned long flags;
 113
 114         /* size of each individual stripe on disk */
 115         int stripe_len;
 116
 117         /* number of data stripes (no p/q) */
 118         int nr_data;
 119
 120         /*
 121          * set if we're doing a parity rebuild
 122          * for a read from higher up, which is handled
 123          * differently from a parity rebuild as part of
 124          * rmw
 125          */
 126         int read_rebuild;
 127
 128         /* first bad stripe */
 129         int faila;
 130
 131         /* second bad stripe (for raid6 use) */
 132         int failb;
 133
 134         /*
 135          * number of pages needed to represent the full
 136          * stripe
 137          */
 138         int nr_pages;
 139
 140         /*
 141          * size of all the bios in the bio_list.  This
 142          * helps us decide if the rbio maps to a full
 143          * stripe or not
 144          */
 145         int bio_list_bytes;
 146
 147         atomic_t refs;
 148
 149         /*
 150          * these are two arrays of pointers.  We allocate the
 151          * rbio big enough to hold them both and setup their
 152          * locations when the rbio is allocated
 153          */
 154
 155         /* pointers to pages that we allocated for
 156          * reading/writing stripes directly from the disk (including P/Q)
 157          */
 158         struct page **stripe_pages;
 159
 160         /*
 161          * pointers to the pages in the bio_list.  Stored
 162          * here for faster lookup
 163          */
 164         struct page **bio_pages;
 165 };
 166
 167 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 168 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 169 static void rmw_work(struct btrfs_work *work);
 170 static void read_rebuild_work(struct btrfs_work *work);
 171 static void async_rmw_stripe(struct btrfs_raid_bio *rbio);
 172 static void async_read_rebuild(struct btrfs_raid_bio *rbio);
 173 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 174 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 175 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 176 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 177 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 178
 179 /*
 180  * the stripe hash table is used for locking, and to collect
 181  * bios in hopes of making a full stripe
 182  */
 183 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 184 {
 185         struct btrfs_stripe_hash_table *table;
 186         struct btrfs_stripe_hash_table *x;
 187         struct btrfs_stripe_hash *cur;
 188         struct btrfs_stripe_hash *h;
 189         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 190         int i;
 191
 192         if (info->stripe_hash_table)
 193                 return 0;
 194
 195         table = kzalloc(sizeof(*table) + sizeof(*h) * num_entries, GFP_NOFS);
 196         if (!table)
 197                 return -ENOMEM;
 198
 199         spin_lock_init(&table->cache_lock);
 200         INIT_LIST_HEAD(&table->stripe_cache);
 201
 202         h = table->table;
 203
 204         for (i = 0; i < num_entries; i++) {
 205                 cur = h + i;
 206                 INIT_LIST_HEAD(&cur->hash_list);
 207                 spin_lock_init(&cur->lock);
 208                 init_waitqueue_head(&cur->wait);
 209         }
 210
 211         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 212         if (x)
 213                 kfree(x);
 214         return 0;
 215 }
 216
 217 /*
 218  * caching an rbio means to copy anything from the
 219  * bio_pages array into the stripe_pages array.  We
 220  * use the page uptodate bit in the stripe cache array
 221  * to indicate if it has valid data
 222  *
 223  * once the caching is done, we set the cache ready
 224  * bit.
 225  */
 226 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 227 {
 228         int i;
 229         char *s;
 230         char *d;
 231         int ret;
 232
 233         ret = alloc_rbio_pages(rbio);
 234         if (ret)
 235                 return;
 236
 237         for (i = 0; i < rbio->nr_pages; i++) {
 238                 if (!rbio->bio_pages[i])
 239                         continue;
 240
 241                 s = kmap(rbio->bio_pages[i]);
 242                 d = kmap(rbio->stripe_pages[i]);
 243
 244                 memcpy(d, s, PAGE_CACHE_SIZE);
 245
 246                 kunmap(rbio->bio_pages[i]);
 247                 kunmap(rbio->stripe_pages[i]);
 248                 SetPageUptodate(rbio->stripe_pages[i]);
 249         }
 250         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 251 }
 252
 253 /*
 254  * we hash on the first logical address of the stripe
 255  */
 256 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 257 {
 258         u64 num = rbio->raid_map[0];
 259
 260         /*
 261          * we shift down quite a bit.  We're using byte
 262          * addressing, and most of the lower bits are zeros.
 263          * This tends to upset hash_64, and it consistently
 264          * returns just one or two different values.
 265          *
 266          * shifting off the lower bits fixes things.
 267          */
 268         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 269 }
 270
 271 /*
 272  * stealing an rbio means taking all the uptodate pages from the stripe
 273  * array in the source rbio and putting them into the destination rbio
 274  */
 275 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 276 {
 277         int i;
 278         struct page *s;
 279         struct page *d;
 280
 281         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 282                 return;
 283
 284         for (i = 0; i < dest->nr_pages; i++) {
 285                 s = src->stripe_pages[i];
 286                 if (!s || !PageUptodate(s)) {
 287                         continue;
 288                 }
 289
 290                 d = dest->stripe_pages[i];
 291                 if (d)
 292                         __free_page(d);
 293
 294                 dest->stripe_pages[i] = s;
 295                 src->stripe_pages[i] = NULL;
 296         }
 297 }
 298
 299 /*
 300  * merging means we take the bio_list from the victim and
 301  * splice it into the destination.  The victim should
 302  * be discarded afterwards.
 303  *
 304  * must be called with dest->rbio_list_lock held
 305  */
 306 static void merge_rbio(struct btrfs_raid_bio *dest,
 307                        struct btrfs_raid_bio *victim)
 308 {
 309         bio_list_merge(&dest->bio_list, &victim->bio_list);
 310         dest->bio_list_bytes += victim->bio_list_bytes;
 311         bio_list_init(&victim->bio_list);
 312 }
 313
 314 /*
 315  * used to prune items that are in the cache.  The caller
 316  * must hold the hash table lock.
 317  */
 318 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 319 {
 320         int bucket = rbio_bucket(rbio);
 321         struct btrfs_stripe_hash_table *table;
 322         struct btrfs_stripe_hash *h;
 323         int freeit = 0;
 324
 325         /*
 326          * check the bit again under the hash table lock.
 327          */
 328         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 329                 return;
 330
 331         table = rbio->fs_info->stripe_hash_table;
 332         h = table->table + bucket;
 333
 334         /* hold the lock for the bucket because we may be
 335          * removing it from the hash table
 336          */
 337         spin_lock(&h->lock);
 338
 339         /*
 340          * hold the lock for the bio list because we need
 341          * to make sure the bio list is empty
 342          */
 343         spin_lock(&rbio->bio_list_lock);
 344
 345         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 346                 list_del_init(&rbio->stripe_cache);
 347                 table->cache_size -= 1;
 348                 freeit = 1;
 349
 350                 /* if the bio list isn't empty, this rbio is
 351                  * still involved in an IO.  We take it out
 352                  * of the cache list, and drop the ref that
 353                  * was held for the list.
 354                  *
 355                  * If the bio_list was empty, we also remove
 356                  * the rbio from the hash_table, and drop
 357                  * the corresponding ref
 358                  */
 359                 if (bio_list_empty(&rbio->bio_list)) {
 360                         if (!list_empty(&rbio->hash_list)) {
 361                                 list_del_init(&rbio->hash_list);
 362                                 atomic_dec(&rbio->refs);
 363                                 BUG_ON(!list_empty(&rbio->plug_list));
 364                         }
 365                 }
 366         }
 367
 368         spin_unlock(&rbio->bio_list_lock);
 369         spin_unlock(&h->lock);
 370
 371         if (freeit)
 372                 __free_raid_bio(rbio);
 373 }
 374
 375 /*
 376  * prune a given rbio from the cache
 377  */
 378 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 379 {
 380         struct btrfs_stripe_hash_table *table;
 381         unsigned long flags;
 382
 383         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 384                 return;
 385
 386         table = rbio->fs_info->stripe_hash_table;
 387
 388         spin_lock_irqsave(&table->cache_lock, flags);
 389         __remove_rbio_from_cache(rbio);
 390         spin_unlock_irqrestore(&table->cache_lock, flags);
 391 }
 392
 393 /*
 394  * remove everything in the cache
 395  */
 396 void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 397 {
 398         struct btrfs_stripe_hash_table *table;
 399         unsigned long flags;
 400         struct btrfs_raid_bio *rbio;
 401
 402         table = info->stripe_hash_table;
 403
 404         spin_lock_irqsave(&table->cache_lock, flags);
 405         while (!list_empty(&table->stripe_cache)) {
 406                 rbio = list_entry(table->stripe_cache.next,
 407                                   struct btrfs_raid_bio,
 408                                   stripe_cache);
 409                 __remove_rbio_from_cache(rbio);
 410         }
 411         spin_unlock_irqrestore(&table->cache_lock, flags);
 412 }
 413
 414 /*
 415  * remove all cached entries and free the hash table
 416  * used by unmount
 417  */
 418 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 419 {
 420         if (!info->stripe_hash_table)
 421                 return;
 422         btrfs_clear_rbio_cache(info);
 423         kfree(info->stripe_hash_table);
 424         info->stripe_hash_table = NULL;
 425 }
 426
 427 /*
 428  * insert an rbio into the stripe cache.  It
 429  * must have already been prepared by calling
 430  * cache_rbio_pages
 431  *
 432  * If this rbio was already cached, it gets
 433  * moved to the front of the lru.
 434  *
 435  * If the size of the rbio cache is too big, we
 436  * prune an item.
 437  */
 438 static void cache_rbio(struct btrfs_raid_bio *rbio)
 439 {
 440         struct btrfs_stripe_hash_table *table;
 441         unsigned long flags;
 442
 443         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 444                 return;
 445
 446         table = rbio->fs_info->stripe_hash_table;
 447
 448         spin_lock_irqsave(&table->cache_lock, flags);
 449         spin_lock(&rbio->bio_list_lock);
 450
 451         /* bump our ref if we were not in the list before */
 452         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 453                 atomic_inc(&rbio->refs);
 454
 455         if (!list_empty(&rbio->stripe_cache)){
 456                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 457         } else {
 458                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 459                 table->cache_size += 1;
 460         }
 461
 462         spin_unlock(&rbio->bio_list_lock);
 463
 464         if (table->cache_size > RBIO_CACHE_SIZE) {
 465                 struct btrfs_raid_bio *found;
 466
 467                 found = list_entry(table->stripe_cache.prev,
 468                                   struct btrfs_raid_bio,
 469                                   stripe_cache);
 470
 471                 if (found != rbio)
 472                         __remove_rbio_from_cache(found);
 473         }
 474
 475         spin_unlock_irqrestore(&table->cache_lock, flags);
 476         return;
 477 }
 478
 479 /*
 480  * helper function to run the xor_blocks api.  It is only
 481  * able to do MAX_XOR_BLOCKS at a time, so we need to
 482  * loop through.
 483  */
 484 static void run_xor(void **pages, int src_cnt, ssize_t len)
 485 {
 486         int src_off = 0;
 487         int xor_src_cnt = 0;
 488         void *dest = pages[src_cnt];
 489
 490         while(src_cnt > 0) {
 491                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 492                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 493
 494                 src_cnt -= xor_src_cnt;
 495                 src_off += xor_src_cnt;
 496         }
 497 }
 498
 499 /*
 500  * returns true if the bio list inside this rbio
 501  * covers an entire stripe (no rmw required).
 502  * Must be called with the bio list lock held, or
 503  * at a time when you know it is impossible to add
 504  * new bios into the list
 505  */
 506 static int __rbio_is_full(struct btrfs_raid_bio *rbio)
 507 {
 508         unsigned long size = rbio->bio_list_bytes;
 509         int ret = 1;
 510
 511         if (size != rbio->nr_data * rbio->stripe_len)
 512                 ret = 0;
 513
 514         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 515         return ret;
 516 }
 517
 518 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 519 {
 520         unsigned long flags;
 521         int ret;
 522
 523         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 524         ret = __rbio_is_full(rbio);
 525         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 526         return ret;
 527 }
 528
 529 /*
 530  * returns 1 if it is safe to merge two rbios together.
 531  * The merging is safe if the two rbios correspond to
 532  * the same stripe and if they are both going in the same
 533  * direction (read vs write), and if neither one is
 534  * locked for final IO
 535  *
 536  * The caller is responsible for locking such that
 537  * rmw_locked is safe to test
 538  */
 539 static int rbio_can_merge(struct btrfs_raid_bio *last,
 540                           struct btrfs_raid_bio *cur)
 541 {
 542         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 543             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 544                 return 0;
 545
 546         /*
 547          * we can't merge with cached rbios, since the
 548          * idea is that when we merge the destination
 549          * rbio is going to run our IO for us.  We can
 550          * steal from cached rbio's though, other functions
 551          * handle that.
 552          */
 553         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 554             test_bit(RBIO_CACHE_BIT, &cur->flags))
 555                 return 0;
 556
 557         if (last->raid_map[0] !=
 558             cur->raid_map[0])
 559                 return 0;
 560
 561         /* reads can't merge with writes */
 562         if (last->read_rebuild !=
 563             cur->read_rebuild) {
 564                 return 0;
 565         }
 566
 567         return 1;
 568 }
 569
 570 /*
 571  * helper to index into the pstripe
 572  */
 573 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
 574 {
 575         index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 576         return rbio->stripe_pages[index];
 577 }
 578
 579 /*
 580  * helper to index into the qstripe, returns null
 581  * if there is no qstripe
 582  */
 583 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
 584 {
 585         if (rbio->nr_data + 1 == rbio->bbio->num_stripes)
 586                 return NULL;
 587
 588         index += ((rbio->nr_data + 1) * rbio->stripe_len) >>
 589                 PAGE_CACHE_SHIFT;
 590         return rbio->stripe_pages[index];
 591 }
 592
 593 /*
 594  * The first stripe in the table for a logical address
 595  * has the lock.  rbios are added in one of three ways:
 596  *
 597  * 1) Nobody has the stripe locked yet.  The rbio is given
 598  * the lock and 0 is returned.  The caller must start the IO
 599  * themselves.
 600  *
 601  * 2) Someone has the stripe locked, but we're able to merge
 602  * with the lock owner.  The rbio is freed and the IO will
 603  * start automatically along with the existing rbio.  1 is returned.
 604  *
 605  * 3) Someone has the stripe locked, but we're not able to merge.
 606  * The rbio is added to the lock owner's plug list, or merged into
 607  * an rbio already on the plug list.  When the lock owner unlocks,
 608  * the next rbio on the list is run and the IO is started automatically.
 609  * 1 is returned
 610  *
 611  * If we return 0, the caller still owns the rbio and must continue with
 612  * IO submission.  If we return 1, the caller must assume the rbio has
 613  * already been freed.
 614  */
 615 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 616 {
 617         int bucket = rbio_bucket(rbio);
 618         struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
 619         struct btrfs_raid_bio *cur;
 620         struct btrfs_raid_bio *pending;
 621         unsigned long flags;
 622         DEFINE_WAIT(wait);
 623         struct btrfs_raid_bio *freeit = NULL;
 624         struct btrfs_raid_bio *cache_drop = NULL;
 625         int ret = 0;
 626         int walk = 0;
 627
 628         spin_lock_irqsave(&h->lock, flags);
 629         list_for_each_entry(cur, &h->hash_list, hash_list) {
 630                 walk++;
 631                 if (cur->raid_map[0] == rbio->raid_map[0]) {
 632                         spin_lock(&cur->bio_list_lock);
 633
 634                         /* can we steal this cached rbio's pages? */
 635                         if (bio_list_empty(&cur->bio_list) &&
 636                             list_empty(&cur->plug_list) &&
 637                             test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 638                             !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 639                                 list_del_init(&cur->hash_list);
 640                                 atomic_dec(&cur->refs);
 641
 642                                 steal_rbio(cur, rbio);
 643                                 cache_drop = cur;
 644                                 spin_unlock(&cur->bio_list_lock);
 645
 646                                 goto lockit;
 647                         }
 648
 649                         /* can we merge into the lock owner? */
 650                         if (rbio_can_merge(cur, rbio)) {
 651                                 merge_rbio(cur, rbio);
 652                                 spin_unlock(&cur->bio_list_lock);
 653                                 freeit = rbio;
 654                                 ret = 1;
 655                                 goto out;
 656                         }
 657
 658
 659                         /*
 660                          * we couldn't merge with the running
 661                          * rbio, see if we can merge with the
 662                          * pending ones.  We don't have to
 663                          * check for rmw_locked because there
 664                          * is no way they are inside finish_rmw
 665                          * right now
 666                          */
 667                         list_for_each_entry(pending, &cur->plug_list,
 668                                             plug_list) {
 669                                 if (rbio_can_merge(pending, rbio)) {
 670                                         merge_rbio(pending, rbio);
 671                                         spin_unlock(&cur->bio_list_lock);
 672                                         freeit = rbio;
 673                                         ret = 1;
 674                                         goto out;
 675                                 }
 676                         }
 677
 678                         /* no merging, put us on the tail of the plug list,
 679                          * our rbio will be started with the currently
 680                          * running rbio unlocks
 681                          */
 682                         list_add_tail(&rbio->plug_list, &cur->plug_list);
 683                         spin_unlock(&cur->bio_list_lock);
 684                         ret = 1;
 685                         goto out;
 686                 }
 687         }
 688 lockit:
 689         atomic_inc(&rbio->refs);
 690         list_add(&rbio->hash_list, &h->hash_list);
 691 out:
 692         spin_unlock_irqrestore(&h->lock, flags);
 693         if (cache_drop)
 694                 remove_rbio_from_cache(cache_drop);
 695         if (freeit)
 696                 __free_raid_bio(freeit);
 697         return ret;
 698 }
 699
 700 /*
 701  * called as rmw or parity rebuild is completed.  If the plug list has more
 702  * rbios waiting for this stripe, the next one on the list will be started
 703  */
 704 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 705 {
 706         int bucket;
 707         struct btrfs_stripe_hash *h;
 708         unsigned long flags;
 709         int keep_cache = 0;
 710
 711         bucket = rbio_bucket(rbio);
 712         h = rbio->fs_info->stripe_hash_table->table + bucket;
 713
 714         if (list_empty(&rbio->plug_list))
 715                 cache_rbio(rbio);
 716
 717         spin_lock_irqsave(&h->lock, flags);
 718         spin_lock(&rbio->bio_list_lock);
 719
 720         if (!list_empty(&rbio->hash_list)) {
 721                 /*
 722                  * if we're still cached and there is no other IO
 723                  * to perform, just leave this rbio here for others
 724                  * to steal from later
 725                  */
 726                 if (list_empty(&rbio->plug_list) &&
 727                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 728                         keep_cache = 1;
 729                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 730                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 731                         goto done;
 732                 }
 733
 734                 list_del_init(&rbio->hash_list);
 735                 atomic_dec(&rbio->refs);
 736
 737                 /*
 738                  * we use the plug list to hold all the rbios
 739                  * waiting for the chance to lock this stripe.
 740                  * hand the lock over to one of them.
 741                  */
 742                 if (!list_empty(&rbio->plug_list)) {
 743                         struct btrfs_raid_bio *next;
 744                         struct list_head *head = rbio->plug_list.next;
 745
 746                         next = list_entry(head, struct btrfs_raid_bio,
 747                                           plug_list);
 748
 749                         list_del_init(&rbio->plug_list);
 750
 751                         list_add(&next->hash_list, &h->hash_list);
 752                         atomic_inc(&next->refs);
 753                         spin_unlock(&rbio->bio_list_lock);
 754                         spin_unlock_irqrestore(&h->lock, flags);
 755
 756                         if (next->read_rebuild)
 757                                 async_read_rebuild(next);
 758                         else {
 759                                 steal_rbio(rbio, next);
 760                                 async_rmw_stripe(next);
 761                         }
 762
 763                         goto done_nolock;
 764                 } else  if (waitqueue_active(&h->wait)) {
 765                         spin_unlock(&rbio->bio_list_lock);
 766                         spin_unlock_irqrestore(&h->lock, flags);
 767                         wake_up(&h->wait);
 768                         goto done_nolock;
 769                 }
 770         }
 771 done:
 772         spin_unlock(&rbio->bio_list_lock);
 773         spin_unlock_irqrestore(&h->lock, flags);
 774
 775 done_nolock:
 776         if (!keep_cache)
 777                 remove_rbio_from_cache(rbio);
 778 }
 779
 780 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 781 {
 782         int i;
 783
 784         WARN_ON(atomic_read(&rbio->refs) < 0);
 785         if (!atomic_dec_and_test(&rbio->refs))
 786                 return;
 787
 788         WARN_ON(!list_empty(&rbio->stripe_cache));
 789         WARN_ON(!list_empty(&rbio->hash_list));
 790         WARN_ON(!bio_list_empty(&rbio->bio_list));
 791
 792         for (i = 0; i < rbio->nr_pages; i++) {
 793                 if (rbio->stripe_pages[i]) {
 794                         __free_page(rbio->stripe_pages[i]);
 795                         rbio->stripe_pages[i] = NULL;
 796                 }
 797         }
 798         kfree(rbio->raid_map);
 799         kfree(rbio->bbio);
 800         kfree(rbio);
 801 }
 802
 803 static void free_raid_bio(struct btrfs_raid_bio *rbio)
 804 {
 805         unlock_stripe(rbio);
 806         __free_raid_bio(rbio);
 807 }
 808
 809 /*
 810  * this frees the rbio and runs through all the bios in the
 811  * bio_list and calls end_io on them
 812  */
 813 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, int err, int uptodate)
 814 {
 815         struct bio *cur = bio_list_get(&rbio->bio_list);
 816         struct bio *next;
 817         free_raid_bio(rbio);
 818
 819         while (cur) {
 820                 next = cur->bi_next;
 821                 cur->bi_next = NULL;
 822                 if (uptodate)
 823                         set_bit(BIO_UPTODATE, &cur->bi_flags);
 824                 bio_endio(cur, err);
 825                 cur = next;
 826         }
 827 }
 828
 829 /*
 830  * end io function used by finish_rmw.  When we finally
 831  * get here, we've written a full stripe
 832  */
 833 static void raid_write_end_io(struct bio *bio, int err)
 834 {
 835         struct btrfs_raid_bio *rbio = bio->bi_private;
 836
 837         if (err)
 838                 fail_bio_stripe(rbio, bio);
 839
 840         bio_put(bio);
 841
 842         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
 843                 return;
 844
 845         err = 0;
 846
 847         /* OK, we have read all the stripes we need to. */
 848         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
 849                 err = -EIO;
 850
 851         rbio_orig_end_io(rbio, err, 0);
 852         return;
 853 }
 854
 855 /*
 856  * the read/modify/write code wants to use the original bio for
 857  * any pages it included, and then use the rbio for everything
 858  * else.  This function decides if a given index (stripe number)
 859  * and page number in that stripe fall inside the original bio
 860  * or the rbio.
 861  *
 862  * if you set bio_list_only, you'll get a NULL back for any ranges
 863  * that are outside the bio_list
 864  *
 865  * This doesn't take any refs on anything, you get a bare page pointer
 866  * and the caller must bump refs as required.
 867  *
 868  * You must call index_rbio_pages once before you can trust
 869  * the answers from this function.
 870  */
 871 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
 872                                  int index, int pagenr, int bio_list_only)
 873 {
 874         int chunk_page;
 875         struct page *p = NULL;
 876
 877         chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
 878
 879         spin_lock_irq(&rbio->bio_list_lock);
 880         p = rbio->bio_pages[chunk_page];
 881         spin_unlock_irq(&rbio->bio_list_lock);
 882
 883         if (p || bio_list_only)
 884                 return p;
 885
 886         return rbio->stripe_pages[chunk_page];
 887 }
 888
 889 /*
 890  * number of pages we need for the entire stripe across all the
 891  * drives
 892  */
 893 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
 894 {
 895         unsigned long nr = stripe_len * nr_stripes;
 896         return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 897 }
 898
 899 /*
 900  * allocation and initial setup for the btrfs_raid_bio.  Not
 901  * this does not allocate any pages for rbio->pages.
 902  */
 903 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root,
 904                           struct btrfs_bio *bbio, u64 *raid_map,
 905                           u64 stripe_len)
 906 {
 907         struct btrfs_raid_bio *rbio;
 908         int nr_data = 0;
 909         int num_pages = rbio_nr_pages(stripe_len, bbio->num_stripes);
 910         void *p;
 911
 912         rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2,
 913                         GFP_NOFS);
 914         if (!rbio) {
 915                 kfree(raid_map);
 916                 kfree(bbio);
 917                 return ERR_PTR(-ENOMEM);
 918         }
 919
 920         bio_list_init(&rbio->bio_list);
 921         INIT_LIST_HEAD(&rbio->plug_list);
 922         spin_lock_init(&rbio->bio_list_lock);
 923         INIT_LIST_HEAD(&rbio->stripe_cache);
 924         INIT_LIST_HEAD(&rbio->hash_list);
 925         rbio->bbio = bbio;
 926         rbio->raid_map = raid_map;
 927         rbio->fs_info = root->fs_info;
 928         rbio->stripe_len = stripe_len;
 929         rbio->nr_pages = num_pages;
 930         rbio->faila = -1;
 931         rbio->failb = -1;
 932         atomic_set(&rbio->refs, 1);
 933
 934         /*
 935          * the stripe_pages and bio_pages array point to the extra
 936          * memory we allocated past the end of the rbio
 937          */
 938         p = rbio + 1;
 939         rbio->stripe_pages = p;
 940         rbio->bio_pages = p + sizeof(struct page *) * num_pages;
 941
 942         if (raid_map[bbio->num_stripes - 1] == RAID6_Q_STRIPE)
 943                 nr_data = bbio->num_stripes - 2;
 944         else
 945                 nr_data = bbio->num_stripes - 1;
 946
 947         rbio->nr_data = nr_data;
 948         return rbio;
 949 }
 950
 951 /* allocate pages for all the stripes in the bio, including parity */
 952 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 953 {
 954         int i;
 955         struct page *page;
 956
 957         for (i = 0; i < rbio->nr_pages; i++) {
 958                 if (rbio->stripe_pages[i])
 959                         continue;
 960                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 961                 if (!page)
 962                         return -ENOMEM;
 963                 rbio->stripe_pages[i] = page;
 964                 ClearPageUptodate(page);
 965         }
 966         return 0;
 967 }
 968
 969 /* allocate pages for just the p/q stripes */
 970 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 971 {
 972         int i;
 973         struct page *page;
 974
 975         i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT;
 976
 977         for (; i < rbio->nr_pages; i++) {
 978                 if (rbio->stripe_pages[i])
 979                         continue;
 980                 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
 981                 if (!page)
 982                         return -ENOMEM;
 983                 rbio->stripe_pages[i] = page;
 984         }
 985         return 0;
 986 }
 987
 988 /*
 989  * add a single page from a specific stripe into our list of bios for IO
 990  * this will try to merge into existing bios if possible, and returns
 991  * zero if all went well.
 992  */
 993 int rbio_add_io_page(struct btrfs_raid_bio *rbio,
 994                      struct bio_list *bio_list,
 995                      struct page *page,
 996                      int stripe_nr,
 997                      unsigned long page_index,
 998                      unsigned long bio_max_len)
 999 {
1000         struct bio *last = bio_list->tail;
1001         u64 last_end = 0;
1002         int ret;
1003         struct bio *bio;
1004         struct btrfs_bio_stripe *stripe;
1005         u64 disk_start;
1006
1007         stripe = &rbio->bbio->stripes[stripe_nr];
1008         disk_start = stripe->physical + (page_index << PAGE_CACHE_SHIFT);
1009
1010         /* if the device is missing, just fail this stripe */
1011         if (!stripe->dev->bdev)
1012                 return fail_rbio_index(rbio, stripe_nr);
1013
1014         /* see if we can add this page onto our existing bio */
1015         if (last) {
1016                 last_end = (u64)last->bi_sector << 9;
1017                 last_end += last->bi_size;
1018
1019                 /*
1020                  * we can't merge these if they are from different
1021                  * devices or if they are not contiguous
1022                  */
1023                 if (last_end == disk_start && stripe->dev->bdev &&
1024                     test_bit(BIO_UPTODATE, &last->bi_flags) &&
1025                     last->bi_bdev == stripe->dev->bdev) {
1026                         ret = bio_add_page(last, page, PAGE_CACHE_SIZE, 0);
1027                         if (ret == PAGE_CACHE_SIZE)
1028                                 return 0;
1029                 }
1030         }
1031
1032         /* put a new bio on the list */
1033         bio = bio_alloc(GFP_NOFS, bio_max_len >> PAGE_SHIFT?:1);
1034         if (!bio)
1035                 return -ENOMEM;
1036
1037         bio->bi_size = 0;
1038         bio->bi_bdev = stripe->dev->bdev;
1039         bio->bi_sector = disk_start >> 9;
1040         set_bit(BIO_UPTODATE, &bio->bi_flags);
1041
1042         bio_add_page(bio, page, PAGE_CACHE_SIZE, 0);
1043         bio_list_add(bio_list, bio);
1044         return 0;
1045 }
1046
1047 /*
1048  * while we're doing the read/modify/write cycle, we could
1049  * have errors in reading pages off the disk.  This checks
1050  * for errors and if we're not able to read the page it'll
1051  * trigger parity reconstruction.  The rmw will be finished
1052  * after we've reconstructed the failed stripes
1053  */
1054 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1055 {
1056         if (rbio->faila >= 0 || rbio->failb >= 0) {
1057                 BUG_ON(rbio->faila == rbio->bbio->num_stripes - 1);
1058                 __raid56_parity_recover(rbio);
1059         } else {
1060                 finish_rmw(rbio);
1061         }
1062 }
1063
1064 /*
1065  * these are just the pages from the rbio array, not from anything
1066  * the FS sent down to us
1067  */
1068 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page)
1069 {
1070         int index;
1071         index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT);
1072         index += page;
1073         return rbio->stripe_pages[index];
1074 }
1075
1076 /*
1077  * helper function to walk our bio list and populate the bio_pages array with
1078  * the result.  This seems expensive, but it is faster than constantly
1079  * searching through the bio list as we setup the IO in finish_rmw or stripe
1080  * reconstruction.
1081  *
1082  * This must be called before you trust the answers from page_in_rbio
1083  */
1084 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1085 {
1086         struct bio *bio;
1087         u64 start;
1088         unsigned long stripe_offset;
1089         unsigned long page_index;
1090         struct page *p;
1091         int i;
1092
1093         spin_lock_irq(&rbio->bio_list_lock);
1094         bio_list_for_each(bio, &rbio->bio_list) {
1095                 start = (u64)bio->bi_sector << 9;
1096                 stripe_offset = start - rbio->raid_map[0];
1097                 page_index = stripe_offset >> PAGE_CACHE_SHIFT;
1098
1099                 for (i = 0; i < bio->bi_vcnt; i++) {
1100                         p = bio->bi_io_vec[i].bv_page;
1101                         rbio->bio_pages[page_index + i] = p;
1102                 }
1103         }
1104         spin_unlock_irq(&rbio->bio_list_lock);
1105 }
1106
1107 /*
1108  * this is called from one of two situations.  We either
1109  * have a full stripe from the higher layers, or we've read all
1110  * the missing bits off disk.
1111  *
1112  * This will calculate the parity and then send down any
1113  * changed blocks.
1114  */
1115 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1116 {
1117         struct btrfs_bio *bbio = rbio->bbio;
1118         void *pointers[bbio->num_stripes];
1119         int stripe_len = rbio->stripe_len;
1120         int nr_data = rbio->nr_data;
1121         int stripe;
1122         int pagenr;
1123         int p_stripe = -1;
1124         int q_stripe = -1;
1125         struct bio_list bio_list;
1126         struct bio *bio;
1127         int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT;
1128         int ret;
1129
1130         bio_list_init(&bio_list);
1131
1132         if (bbio->num_stripes - rbio->nr_data == 1) {
1133                 p_stripe = bbio->num_stripes - 1;
1134         } else if (bbio->num_stripes - rbio->nr_data == 2) {
1135                 p_stripe = bbio->num_stripes - 2;
1136                 q_stripe = bbio->num_stripes - 1;
1137         } else {
1138                 BUG();
1139         }
1140
1141         /* at this point we either have a full stripe,
1142          * or we've read the full stripe from the drive.
1143          * recalculate the parity and write the new results.
1144          *
1145          * We're not allowed to add any new bios to the
1146          * bio list here, anyone else that wants to
1147          * change this stripe needs to do their own rmw.
1148          */
1149         spin_lock_irq(&rbio->bio_list_lock);
1150         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1151         spin_unlock_irq(&rbio->bio_list_lock);
1152
1153         atomic_set(&rbio->bbio->error, 0);
1154
1155         /*
1156          * now that we've set rmw_locked, run through the
1157          * bio list one last time and map the page pointers
1158          *
1159          * We don't cache full rbios because we're assuming
1160          * the higher layers are unlikely to use this area of
1161          * the disk again soon.  If they do use it again,
1162          * hopefully they will send another full bio.
1163          */
1164         index_rbio_pages(rbio);
1165         if (!rbio_is_full(rbio))
1166                 cache_rbio_pages(rbio);
1167         else
1168                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1169
1170         for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1171                 struct page *p;
1172                 /* first collect one page from each data stripe */
1173                 for (stripe = 0; stripe < nr_data; stripe++) {
1174                         p = page_in_rbio(rbio, stripe, pagenr, 0);
1175                         pointers[stripe] = kmap(p);
1176                 }
1177
1178                 /* then add the parity stripe */
1179                 p = rbio_pstripe_page(rbio, pagenr);
1180                 SetPageUptodate(p);
1181                 pointers[stripe++] = kmap(p);
1182
1183                 if (q_stripe != -1) {
1184
1185                         /*
1186                          * raid6, add the qstripe and call the
1187                          * library function to fill in our p/q
1188                          */
1189                         p = rbio_qstripe_page(rbio, pagenr);
1190                         SetPageUptodate(p);
1191                         pointers[stripe++] = kmap(p);
1192
1193                         raid6_call.gen_syndrome(bbio->num_stripes, PAGE_SIZE,
1194                                                 pointers);
1195                 } else {
1196                         /* raid5 */
1197                         memcpy(pointers[nr_data], pointers[0], PAGE_SIZE);
1198                         run_xor(pointers + 1, nr_data - 1, PAGE_CACHE_SIZE);
1199                 }
1200
1201
1202                 for (stripe = 0; stripe < bbio->num_stripes; stripe++)
1203                         kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1204         }
1205
1206         /*
1207          * time to start writing.  Make bios for everything from the
1208          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1209          * everything else.
1210          */
1211         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1212                 for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) {
1213                         struct page *page;
1214                         if (stripe < rbio->nr_data) {
1215                                 page = page_in_rbio(rbio, stripe, pagenr, 1);
1216                                 if (!page)
1217                                         continue;
1218                         } else {
1219                                page = rbio_stripe_page(rbio, stripe, pagenr);
1220                         }
1221
1222                         ret = rbio_add_io_page(rbio, &bio_list,
1223                                        page, stripe, pagenr, rbio->stripe_len);
1224                         if (ret)
1225                                 goto cleanup;
1226                 }
1227         }
1228
1229         atomic_set(&bbio->stripes_pending, bio_list_size(&bio_list));
1230         BUG_ON(atomic_read(&bbio->stripes_pending) == 0);
1231
1232         while (1) {
1233                 bio = bio_list_pop(&bio_list);
1234                 if (!bio)
1235                         break;
1236
1237                 bio->bi_private = rbio;
1238                 bio->bi_end_io = raid_write_end_io;
1239                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1240                 submit_bio(WRITE, bio);
1241         }
1242         return;
1243
1244 cleanup:
1245         rbio_orig_end_io(rbio, -EIO, 0);
1246 }
1247
1248 /*
1249  * helper to find the stripe number for a given bio.  Used to figure out which
1250  * stripe has failed.  This expects the bio to correspond to a physical disk,
1251  * so it looks up based on physical sector numbers.
1252  */
1253 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1254                            struct bio *bio)
1255 {
1256         u64 physical = bio->bi_sector;
1257         u64 stripe_start;
1258         int i;
1259         struct btrfs_bio_stripe *stripe;
1260
1261         physical <<= 9;
1262
1263         for (i = 0; i < rbio->bbio->num_stripes; i++) {
1264                 stripe = &rbio->bbio->stripes[i];
1265                 stripe_start = stripe->physical;
1266                 if (physical >= stripe_start &&
1267                     physical < stripe_start + rbio->stripe_len) {
1268                         return i;
1269                 }
1270         }
1271         return -1;
1272 }
1273
1274 /*
1275  * helper to find the stripe number for a given
1276  * bio (before mapping).  Used to figure out which stripe has
1277  * failed.  This looks up based on logical block numbers.
1278  */
1279 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1280                                    struct bio *bio)
1281 {
1282         u64 logical = bio->bi_sector;
1283         u64 stripe_start;
1284         int i;
1285
1286         logical <<= 9;
1287
1288         for (i = 0; i < rbio->nr_data; i++) {
1289                 stripe_start = rbio->raid_map[i];
1290                 if (logical >= stripe_start &&
1291                     logical < stripe_start + rbio->stripe_len) {
1292                         return i;
1293                 }
1294         }
1295         return -1;
1296 }
1297
1298 /*
1299  * returns -EIO if we had too many failures
1300  */
1301 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1302 {
1303         unsigned long flags;
1304         int ret = 0;
1305
1306         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1307
1308         /* we already know this stripe is bad, move on */
1309         if (rbio->faila == failed || rbio->failb == failed)
1310                 goto out;
1311
1312         if (rbio->faila == -1) {
1313                 /* first failure on this rbio */
1314                 rbio->faila = failed;
1315                 atomic_inc(&rbio->bbio->error);
1316         } else if (rbio->failb == -1) {
1317                 /* second failure on this rbio */
1318                 rbio->failb = failed;
1319                 atomic_inc(&rbio->bbio->error);
1320         } else {
1321                 ret = -EIO;
1322         }
1323 out:
1324         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1325
1326         return ret;
1327 }
1328
1329 /*
1330  * helper to fail a stripe based on a physical disk
1331  * bio.
1332  */
1333 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1334                            struct bio *bio)
1335 {
1336         int failed = find_bio_stripe(rbio, bio);
1337
1338         if (failed < 0)
1339                 return -EIO;
1340
1341         return fail_rbio_index(rbio, failed);
1342 }
1343
1344 /*
1345  * this sets each page in the bio uptodate.  It should only be used on private
1346  * rbio pages, nothing that comes in from the higher layers
1347  */
1348 static void set_bio_pages_uptodate(struct bio *bio)
1349 {
1350         int i;
1351         struct page *p;
1352
1353         for (i = 0; i < bio->bi_vcnt; i++) {
1354                 p = bio->bi_io_vec[i].bv_page;
1355                 SetPageUptodate(p);
1356         }
1357 }
1358
1359 /*
1360  * end io for the read phase of the rmw cycle.  All the bios here are physical
1361  * stripe bios we've read from the disk so we can recalculate the parity of the
1362  * stripe.
1363  *
1364  * This will usually kick off finish_rmw once all the bios are read in, but it
1365  * may trigger parity reconstruction if we had any errors along the way
1366  */
1367 static void raid_rmw_end_io(struct bio *bio, int err)
1368 {
1369         struct btrfs_raid_bio *rbio = bio->bi_private;
1370
1371         if (err)
1372                 fail_bio_stripe(rbio, bio);
1373         else
1374                 set_bio_pages_uptodate(bio);
1375
1376         bio_put(bio);
1377
1378         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1379                 return;
1380
1381         err = 0;
1382         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1383                 goto cleanup;
1384
1385         /*
1386          * this will normally call finish_rmw to start our write
1387          * but if there are any failed stripes we'll reconstruct
1388          * from parity first
1389          */
1390         validate_rbio_for_rmw(rbio);
1391         return;
1392
1393 cleanup:
1394
1395         rbio_orig_end_io(rbio, -EIO, 0);
1396 }
1397
1398 static void async_rmw_stripe(struct btrfs_raid_bio *rbio)
1399 {
1400         rbio->work.flags = 0;
1401         rbio->work.func = rmw_work;
1402
1403         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1404                            &rbio->work);
1405 }
1406
1407 static void async_read_rebuild(struct btrfs_raid_bio *rbio)
1408 {
1409         rbio->work.flags = 0;
1410         rbio->work.func = read_rebuild_work;
1411
1412         btrfs_queue_worker(&rbio->fs_info->rmw_workers,
1413                            &rbio->work);
1414 }
1415
1416 /*
1417  * the stripe must be locked by the caller.  It will
1418  * unlock after all the writes are done
1419  */
1420 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1421 {
1422         int bios_to_read = 0;
1423         struct btrfs_bio *bbio = rbio->bbio;
1424         struct bio_list bio_list;
1425         int ret;
1426         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1427         int pagenr;
1428         int stripe;
1429         struct bio *bio;
1430
1431         bio_list_init(&bio_list);
1432
1433         ret = alloc_rbio_pages(rbio);
1434         if (ret)
1435                 goto cleanup;
1436
1437         index_rbio_pages(rbio);
1438
1439         atomic_set(&rbio->bbio->error, 0);
1440         /*
1441          * build a list of bios to read all the missing parts of this
1442          * stripe
1443          */
1444         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1445                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1446                         struct page *page;
1447                         /*
1448                          * we want to find all the pages missing from
1449                          * the rbio and read them from the disk.  If
1450                          * page_in_rbio finds a page in the bio list
1451                          * we don't need to read it off the stripe.
1452                          */
1453                         page = page_in_rbio(rbio, stripe, pagenr, 1);
1454                         if (page)
1455                                 continue;
1456
1457                         page = rbio_stripe_page(rbio, stripe, pagenr);
1458                         /*
1459                          * the bio cache may have handed us an uptodate
1460                          * page.  If so, be happy and use it
1461                          */
1462                         if (PageUptodate(page))
1463                                 continue;
1464
1465                         ret = rbio_add_io_page(rbio, &bio_list, page,
1466                                        stripe, pagenr, rbio->stripe_len);
1467                         if (ret)
1468                                 goto cleanup;
1469                 }
1470         }
1471
1472         bios_to_read = bio_list_size(&bio_list);
1473         if (!bios_to_read) {
1474                 /*
1475                  * this can happen if others have merged with
1476                  * us, it means there is nothing left to read.
1477                  * But if there are missing devices it may not be
1478                  * safe to do the full stripe write yet.
1479                  */
1480                 goto finish;
1481         }
1482
1483         /*
1484          * the bbio may be freed once we submit the last bio.  Make sure
1485          * not to touch it after that
1486          */
1487         atomic_set(&bbio->stripes_pending, bios_to_read);
1488         while (1) {
1489                 bio = bio_list_pop(&bio_list);
1490                 if (!bio)
1491                         break;
1492
1493                 bio->bi_private = rbio;
1494                 bio->bi_end_io = raid_rmw_end_io;
1495
1496                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1497                                     BTRFS_WQ_ENDIO_RAID56);
1498
1499                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1500                 submit_bio(READ, bio);
1501         }
1502         /* the actual write will happen once the reads are done */
1503         return 0;
1504
1505 cleanup:
1506         rbio_orig_end_io(rbio, -EIO, 0);
1507         return -EIO;
1508
1509 finish:
1510         validate_rbio_for_rmw(rbio);
1511         return 0;
1512 }
1513
1514 /*
1515  * if the upper layers pass in a full stripe, we thank them by only allocating
1516  * enough pages to hold the parity, and sending it all down quickly.
1517  */
1518 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1519 {
1520         int ret;
1521
1522         ret = alloc_rbio_parity_pages(rbio);
1523         if (ret)
1524                 return ret;
1525
1526         ret = lock_stripe_add(rbio);
1527         if (ret == 0)
1528                 finish_rmw(rbio);
1529         return 0;
1530 }
1531
1532 /*
1533  * partial stripe writes get handed over to async helpers.
1534  * We're really hoping to merge a few more writes into this
1535  * rbio before calculating new parity
1536  */
1537 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1538 {
1539         int ret;
1540
1541         ret = lock_stripe_add(rbio);
1542         if (ret == 0)
1543                 async_rmw_stripe(rbio);
1544         return 0;
1545 }
1546
1547 /*
1548  * sometimes while we were reading from the drive to
1549  * recalculate parity, enough new bios come into create
1550  * a full stripe.  So we do a check here to see if we can
1551  * go directly to finish_rmw
1552  */
1553 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1554 {
1555         /* head off into rmw land if we don't have a full stripe */
1556         if (!rbio_is_full(rbio))
1557                 return partial_stripe_write(rbio);
1558         return full_stripe_write(rbio);
1559 }
1560
1561 /*
1562  * We use plugging call backs to collect full stripes.
1563  * Any time we get a partial stripe write while plugged
1564  * we collect it into a list.  When the unplug comes down,
1565  * we sort the list by logical block number and merge
1566  * everything we can into the same rbios
1567  */
1568 struct btrfs_plug_cb {
1569         struct blk_plug_cb cb;
1570         struct btrfs_fs_info *info;
1571         struct list_head rbio_list;
1572         struct btrfs_work work;
1573 };
1574
1575 /*
1576  * rbios on the plug list are sorted for easier merging.
1577  */
1578 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1579 {
1580         struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1581                                                  plug_list);
1582         struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1583                                                  plug_list);
1584         u64 a_sector = ra->bio_list.head->bi_sector;
1585         u64 b_sector = rb->bio_list.head->bi_sector;
1586
1587         if (a_sector < b_sector)
1588                 return -1;
1589         if (a_sector > b_sector)
1590                 return 1;
1591         return 0;
1592 }
1593
1594 static void run_plug(struct btrfs_plug_cb *plug)
1595 {
1596         struct btrfs_raid_bio *cur;
1597         struct btrfs_raid_bio *last = NULL;
1598
1599         /*
1600          * sort our plug list then try to merge
1601          * everything we can in hopes of creating full
1602          * stripes.
1603          */
1604         list_sort(NULL, &plug->rbio_list, plug_cmp);
1605         while (!list_empty(&plug->rbio_list)) {
1606                 cur = list_entry(plug->rbio_list.next,
1607                                  struct btrfs_raid_bio, plug_list);
1608                 list_del_init(&cur->plug_list);
1609
1610                 if (rbio_is_full(cur)) {
1611                         /* we have a full stripe, send it down */
1612                         full_stripe_write(cur);
1613                         continue;
1614                 }
1615                 if (last) {
1616                         if (rbio_can_merge(last, cur)) {
1617                                 merge_rbio(last, cur);
1618                                 __free_raid_bio(cur);
1619                                 continue;
1620
1621                         }
1622                         __raid56_parity_write(last);
1623                 }
1624                 last = cur;
1625         }
1626         if (last) {
1627                 __raid56_parity_write(last);
1628         }
1629         kfree(plug);
1630 }
1631
1632 /*
1633  * if the unplug comes from schedule, we have to push the
1634  * work off to a helper thread
1635  */
1636 static void unplug_work(struct btrfs_work *work)
1637 {
1638         struct btrfs_plug_cb *plug;
1639         plug = container_of(work, struct btrfs_plug_cb, work);
1640         run_plug(plug);
1641 }
1642
1643 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1644 {
1645         struct btrfs_plug_cb *plug;
1646         plug = container_of(cb, struct btrfs_plug_cb, cb);
1647
1648         if (from_schedule) {
1649                 plug->work.flags = 0;
1650                 plug->work.func = unplug_work;
1651                 btrfs_queue_worker(&plug->info->rmw_workers,
1652                                    &plug->work);
1653                 return;
1654         }
1655         run_plug(plug);
1656 }
1657
1658 /*
1659  * our main entry point for writes from the rest of the FS.
1660  */
1661 int raid56_parity_write(struct btrfs_root *root, struct bio *bio,
1662                         struct btrfs_bio *bbio, u64 *raid_map,
1663                         u64 stripe_len)
1664 {
1665         struct btrfs_raid_bio *rbio;
1666         struct btrfs_plug_cb *plug = NULL;
1667         struct blk_plug_cb *cb;
1668
1669         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
1670         if (IS_ERR(rbio)) {
1671                 kfree(raid_map);
1672                 kfree(bbio);
1673                 return PTR_ERR(rbio);
1674         }
1675         bio_list_add(&rbio->bio_list, bio);
1676         rbio->bio_list_bytes = bio->bi_size;
1677
1678         /*
1679          * don't plug on full rbios, just get them out the door
1680          * as quickly as we can
1681          */
1682         if (rbio_is_full(rbio))
1683                 return full_stripe_write(rbio);
1684
1685         cb = blk_check_plugged(btrfs_raid_unplug, root->fs_info,
1686                                sizeof(*plug));
1687         if (cb) {
1688                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1689                 if (!plug->info) {
1690                         plug->info = root->fs_info;
1691                         INIT_LIST_HEAD(&plug->rbio_list);
1692                 }
1693                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1694         } else {
1695                 return __raid56_parity_write(rbio);
1696         }
1697         return 0;
1698 }
1699
1700 /*
1701  * all parity reconstruction happens here.  We've read in everything
1702  * we can find from the drives and this does the heavy lifting of
1703  * sorting the good from the bad.
1704  */
1705 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1706 {
1707         int pagenr, stripe;
1708         void **pointers;
1709         int faila = -1, failb = -1;
1710         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1711         struct page *page;
1712         int err;
1713         int i;
1714
1715         pointers = kzalloc(rbio->bbio->num_stripes * sizeof(void *),
1716                            GFP_NOFS);
1717         if (!pointers) {
1718                 err = -ENOMEM;
1719                 goto cleanup_io;
1720         }
1721
1722         faila = rbio->faila;
1723         failb = rbio->failb;
1724
1725         if (rbio->read_rebuild) {
1726                 spin_lock_irq(&rbio->bio_list_lock);
1727                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1728                 spin_unlock_irq(&rbio->bio_list_lock);
1729         }
1730
1731         index_rbio_pages(rbio);
1732
1733         for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1734                 /* setup our array of pointers with pages
1735                  * from each stripe
1736                  */
1737                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1738                         /*
1739                          * if we're rebuilding a read, we have to use
1740                          * pages from the bio list
1741                          */
1742                         if (rbio->read_rebuild &&
1743                             (stripe == faila || stripe == failb)) {
1744                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1745                         } else {
1746                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1747                         }
1748                         pointers[stripe] = kmap(page);
1749                 }
1750
1751                 /* all raid6 handling here */
1752                 if (rbio->raid_map[rbio->bbio->num_stripes - 1] ==
1753                     RAID6_Q_STRIPE) {
1754
1755                         /*
1756                          * single failure, rebuild from parity raid5
1757                          * style
1758                          */
1759                         if (failb < 0) {
1760                                 if (faila == rbio->nr_data) {
1761                                         /*
1762                                          * Just the P stripe has failed, without
1763                                          * a bad data or Q stripe.
1764                                          * TODO, we should redo the xor here.
1765                                          */
1766                                         err = -EIO;
1767                                         goto cleanup;
1768                                 }
1769                                 /*
1770                                  * a single failure in raid6 is rebuilt
1771                                  * in the pstripe code below
1772                                  */
1773                                 goto pstripe;
1774                         }
1775
1776                         /* make sure our ps and qs are in order */
1777                         if (faila > failb) {
1778                                 int tmp = failb;
1779                                 failb = faila;
1780                                 faila = tmp;
1781                         }
1782
1783                         /* if the q stripe is failed, do a pstripe reconstruction
1784                          * from the xors.
1785                          * If both the q stripe and the P stripe are failed, we're
1786                          * here due to a crc mismatch and we can't give them the
1787                          * data they want
1788                          */
1789                         if (rbio->raid_map[failb] == RAID6_Q_STRIPE) {
1790                                 if (rbio->raid_map[faila] == RAID5_P_STRIPE) {
1791                                         err = -EIO;
1792                                         goto cleanup;
1793                                 }
1794                                 /*
1795                                  * otherwise we have one bad data stripe and
1796                                  * a good P stripe.  raid5!
1797                                  */
1798                                 goto pstripe;
1799                         }
1800
1801                         if (rbio->raid_map[failb] == RAID5_P_STRIPE) {
1802                                 raid6_datap_recov(rbio->bbio->num_stripes,
1803                                                   PAGE_SIZE, faila, pointers);
1804                         } else {
1805                                 raid6_2data_recov(rbio->bbio->num_stripes,
1806                                                   PAGE_SIZE, faila, failb,
1807                                                   pointers);
1808                         }
1809                 } else {
1810                         void *p;
1811
1812                         /* rebuild from P stripe here (raid5 or raid6) */
1813                         BUG_ON(failb != -1);
1814 pstripe:
1815                         /* Copy parity block into failed block to start with */
1816                         memcpy(pointers[faila],
1817                                pointers[rbio->nr_data],
1818                                PAGE_CACHE_SIZE);
1819
1820                         /* rearrange the pointer array */
1821                         p = pointers[faila];
1822                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1823                                 pointers[stripe] = pointers[stripe + 1];
1824                         pointers[rbio->nr_data - 1] = p;
1825
1826                         /* xor in the rest */
1827                         run_xor(pointers, rbio->nr_data - 1, PAGE_CACHE_SIZE);
1828                 }
1829                 /* if we're doing this rebuild as part of an rmw, go through
1830                  * and set all of our private rbio pages in the
1831                  * failed stripes as uptodate.  This way finish_rmw will
1832                  * know they can be trusted.  If this was a read reconstruction,
1833                  * other endio functions will fiddle the uptodate bits
1834                  */
1835                 if (!rbio->read_rebuild) {
1836                         for (i = 0;  i < nr_pages; i++) {
1837                                 if (faila != -1) {
1838                                         page = rbio_stripe_page(rbio, faila, i);
1839                                         SetPageUptodate(page);
1840                                 }
1841                                 if (failb != -1) {
1842                                         page = rbio_stripe_page(rbio, failb, i);
1843                                         SetPageUptodate(page);
1844                                 }
1845                         }
1846                 }
1847                 for (stripe = 0; stripe < rbio->bbio->num_stripes; stripe++) {
1848                         /*
1849                          * if we're rebuilding a read, we have to use
1850                          * pages from the bio list
1851                          */
1852                         if (rbio->read_rebuild &&
1853                             (stripe == faila || stripe == failb)) {
1854                                 page = page_in_rbio(rbio, stripe, pagenr, 0);
1855                         } else {
1856                                 page = rbio_stripe_page(rbio, stripe, pagenr);
1857                         }
1858                         kunmap(page);
1859                 }
1860         }
1861
1862         err = 0;
1863 cleanup:
1864         kfree(pointers);
1865
1866 cleanup_io:
1867
1868         if (rbio->read_rebuild) {
1869                 if (err == 0)
1870                         cache_rbio_pages(rbio);
1871                 else
1872                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1873
1874                 rbio_orig_end_io(rbio, err, err == 0);
1875         } else if (err == 0) {
1876                 rbio->faila = -1;
1877                 rbio->failb = -1;
1878                 finish_rmw(rbio);
1879         } else {
1880                 rbio_orig_end_io(rbio, err, 0);
1881         }
1882 }
1883
1884 /*
1885  * This is called only for stripes we've read from disk to
1886  * reconstruct the parity.
1887  */
1888 static void raid_recover_end_io(struct bio *bio, int err)
1889 {
1890         struct btrfs_raid_bio *rbio = bio->bi_private;
1891
1892         /*
1893          * we only read stripe pages off the disk, set them
1894          * up to date if there were no errors
1895          */
1896         if (err)
1897                 fail_bio_stripe(rbio, bio);
1898         else
1899                 set_bio_pages_uptodate(bio);
1900         bio_put(bio);
1901
1902         if (!atomic_dec_and_test(&rbio->bbio->stripes_pending))
1903                 return;
1904
1905         if (atomic_read(&rbio->bbio->error) > rbio->bbio->max_errors)
1906                 rbio_orig_end_io(rbio, -EIO, 0);
1907         else
1908                 __raid_recover_end_io(rbio);
1909 }
1910
1911 /*
1912  * reads everything we need off the disk to reconstruct
1913  * the parity. endio handlers trigger final reconstruction
1914  * when the IO is done.
1915  *
1916  * This is used both for reads from the higher layers and for
1917  * parity construction required to finish a rmw cycle.
1918  */
1919 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
1920 {
1921         int bios_to_read = 0;
1922         struct btrfs_bio *bbio = rbio->bbio;
1923         struct bio_list bio_list;
1924         int ret;
1925         int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
1926         int pagenr;
1927         int stripe;
1928         struct bio *bio;
1929
1930         bio_list_init(&bio_list);
1931
1932         ret = alloc_rbio_pages(rbio);
1933         if (ret)
1934                 goto cleanup;
1935
1936         atomic_set(&rbio->bbio->error, 0);
1937
1938         /*
1939          * read everything that hasn't failed.  Thanks to the
1940          * stripe cache, it is possible that some or all of these
1941          * pages are going to be uptodate.
1942          */
1943         for (stripe = 0; stripe < bbio->num_stripes; stripe++) {
1944                 if (rbio->faila == stripe ||
1945                     rbio->failb == stripe)
1946                         continue;
1947
1948                 for (pagenr = 0; pagenr < nr_pages; pagenr++) {
1949                         struct page *p;
1950
1951                         /*
1952                          * the rmw code may have already read this
1953                          * page in
1954                          */
1955                         p = rbio_stripe_page(rbio, stripe, pagenr);
1956                         if (PageUptodate(p))
1957                                 continue;
1958
1959                         ret = rbio_add_io_page(rbio, &bio_list,
1960                                        rbio_stripe_page(rbio, stripe, pagenr),
1961                                        stripe, pagenr, rbio->stripe_len);
1962                         if (ret < 0)
1963                                 goto cleanup;
1964                 }
1965         }
1966
1967         bios_to_read = bio_list_size(&bio_list);
1968         if (!bios_to_read) {
1969                 /*
1970                  * we might have no bios to read just because the pages
1971                  * were up to date, or we might have no bios to read because
1972                  * the devices were gone.
1973                  */
1974                 if (atomic_read(&rbio->bbio->error) <= rbio->bbio->max_errors) {
1975                         __raid_recover_end_io(rbio);
1976                         goto out;
1977                 } else {
1978                         goto cleanup;
1979                 }
1980         }
1981
1982         /*
1983          * the bbio may be freed once we submit the last bio.  Make sure
1984          * not to touch it after that
1985          */
1986         atomic_set(&bbio->stripes_pending, bios_to_read);
1987         while (1) {
1988                 bio = bio_list_pop(&bio_list);
1989                 if (!bio)
1990                         break;
1991
1992                 bio->bi_private = rbio;
1993                 bio->bi_end_io = raid_recover_end_io;
1994
1995                 btrfs_bio_wq_end_io(rbio->fs_info, bio,
1996                                     BTRFS_WQ_ENDIO_RAID56);
1997
1998                 BUG_ON(!test_bit(BIO_UPTODATE, &bio->bi_flags));
1999                 submit_bio(READ, bio);
2000         }
2001 out:
2002         return 0;
2003
2004 cleanup:
2005         if (rbio->read_rebuild)
2006                 rbio_orig_end_io(rbio, -EIO, 0);
2007         return -EIO;
2008 }
2009
2010 /*
2011  * the main entry point for reads from the higher layers.  This
2012  * is really only called when the normal read path had a failure,
2013  * so we assume the bio they send down corresponds to a failed part
2014  * of the drive.
2015  */
2016 int raid56_parity_recover(struct btrfs_root *root, struct bio *bio,
2017                           struct btrfs_bio *bbio, u64 *raid_map,
2018                           u64 stripe_len, int mirror_num)
2019 {
2020         struct btrfs_raid_bio *rbio;
2021         int ret;
2022
2023         rbio = alloc_rbio(root, bbio, raid_map, stripe_len);
2024         if (IS_ERR(rbio)) {
2025                 return PTR_ERR(rbio);
2026         }
2027
2028         rbio->read_rebuild = 1;
2029         bio_list_add(&rbio->bio_list, bio);
2030         rbio->bio_list_bytes = bio->bi_size;
2031
2032         rbio->faila = find_logical_bio_stripe(rbio, bio);
2033         if (rbio->faila == -1) {
2034                 BUG();
2035                 kfree(rbio);
2036                 return -EIO;
2037         }
2038
2039         /*
2040          * reconstruct from the q stripe if they are
2041          * asking for mirror 3
2042          */
2043         if (mirror_num == 3)
2044                 rbio->failb = bbio->num_stripes - 2;
2045
2046         ret = lock_stripe_add(rbio);
2047
2048         /*
2049          * __raid56_parity_recover will end the bio with
2050          * any errors it hits.  We don't want to return
2051          * its error value up the stack because our caller
2052          * will end up calling bio_endio with any nonzero
2053          * return
2054          */
2055         if (ret == 0)
2056                 __raid56_parity_recover(rbio);
2057         /*
2058          * our rbio has been added to the list of
2059          * rbios that will be handled after the
2060          * currently lock owner is done
2061          */
2062         return 0;
2063
2064 }
2065
2066 static void rmw_work(struct btrfs_work *work)
2067 {
2068         struct btrfs_raid_bio *rbio;
2069
2070         rbio = container_of(work, struct btrfs_raid_bio, work);
2071         raid56_rmw_stripe(rbio);
2072 }
2073
2074 static void read_rebuild_work(struct btrfs_work *work)
2075 {
2076         struct btrfs_raid_bio *rbio;
2077
2078         rbio = container_of(work, struct btrfs_raid_bio, work);
2079         __raid56_parity_recover(rbio);
2080 }