rtime.felk.cvut.cz Git - linux-imx.git/blobdiff - fs/btrfs/extent-tree.c
btrfs: limit fallocate extent reservation to 256MB
[linux-imx.git] / fs / btrfs / extent-tree.c
index e035731b36083929788c88628e83ba6e07a7b85e..d2b3a5e9a6211bb875b3db0e48ec9552a7764832 100644 (file)
@@ -31,6 +31,7 @@
 #include "print-tree.h"
 #include "transaction.h"
 #include "volumes.h"
+#include "raid56.h"
 #include "locking.h"
 #include "free-space-cache.h"
 #include "math.h"
@@ -102,6 +103,8 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
                            int dump_block_groups);
 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
                                       u64 num_bytes, int reserve);
+static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
+                              u64 num_bytes);
 
 static noinline int
 block_group_cache_done(struct btrfs_block_group_cache *cache)
@@ -1850,6 +1853,8 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
                *actual_bytes = discarded_bytes;
 
 
+       if (ret == -EOPNOTSUPP)
+               ret = 0;
        return ret;
 }
 
@@ -2438,6 +2443,16 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
        return ret;
 }
 
+static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
+                     int count)
+{
+       int val = atomic_read(&delayed_refs->ref_seq); /* current ref_seq */
+
+       if (val < seq || val >= seq + count) /* outside [seq, seq + count) */
+               return 1;
+       return 0; /* ref_seq is still inside the window */
+}
+
 /*
  * this starts processing the delayed reference count updates and
  * extent insertions we have queued up so far.  count can be
@@ -2472,6 +2487,44 @@ int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
 
        delayed_refs = &trans->transaction->delayed_refs;
        INIT_LIST_HEAD(&cluster);
+       if (count == 0) {
+               count = delayed_refs->num_entries * 2;
+               run_most = 1;
+       }
+
+       if (!run_all && !run_most) {
+               int old;
+               int seq = atomic_read(&delayed_refs->ref_seq);
+
+progress:
+               old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+               if (old) {
+                       DEFINE_WAIT(__wait);
+                       if (delayed_refs->num_entries < 16348)
+                               return 0;
+
+                       prepare_to_wait(&delayed_refs->wait, &__wait,
+                                       TASK_UNINTERRUPTIBLE);
+
+                       old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
+                       if (old) {
+                               schedule();
+                               finish_wait(&delayed_refs->wait, &__wait);
+
+                               if (!refs_newer(delayed_refs, seq, 256))
+                                       goto progress;
+                               else
+                                       return 0;
+                       } else {
+                               finish_wait(&delayed_refs->wait, &__wait);
+                               goto again;
+                       }
+               }
+
+       } else {
+               atomic_inc(&delayed_refs->procs_running_refs);
+       }
+
 again:
        loops = 0;
        spin_lock(&delayed_refs->lock);
@@ -2480,10 +2533,6 @@ again:
        delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
 #endif
 
-       if (count == 0) {
-               count = delayed_refs->num_entries * 2;
-               run_most = 1;
-       }
        while (1) {
                if (!(run_all || run_most) &&
                    delayed_refs->num_heads_ready < 64)
@@ -2506,9 +2555,12 @@ again:
                        btrfs_release_ref_cluster(&cluster);
                        spin_unlock(&delayed_refs->lock);
                        btrfs_abort_transaction(trans, root, ret);
+                       atomic_dec(&delayed_refs->procs_running_refs);
                        return ret;
                }
 
+               atomic_add(ret, &delayed_refs->ref_seq);
+
                count -= min_t(unsigned long, ret, count);
 
                if (count == 0)
@@ -2577,6 +2629,11 @@ again:
                goto again;
        }
 out:
+       atomic_dec(&delayed_refs->procs_running_refs);
+       smp_mb();
+       if (waitqueue_active(&delayed_refs->wait))
+               wake_up(&delayed_refs->wait);
+
        spin_unlock(&delayed_refs->lock);
        assert_qgroups_uptodate(trans);
        return 0;
@@ -3282,6 +3339,7 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        u64 num_devices = root->fs_info->fs_devices->rw_devices +
                root->fs_info->fs_devices->missing_devices;
        u64 target;
+       u64 tmp;
 
        /*
         * see if restripe for this chunk_type is in progress, if so
@@ -3298,30 +3356,32 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
        }
        spin_unlock(&root->fs_info->balance_lock);
 
+       /* First, mask out the RAID levels which aren't possible */
        if (num_devices == 1)
-               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
+               flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
+                          BTRFS_BLOCK_GROUP_RAID5);
+       if (num_devices < 3)
+               flags &= ~BTRFS_BLOCK_GROUP_RAID6;
        if (num_devices < 4)
                flags &= ~BTRFS_BLOCK_GROUP_RAID10;
 
-       if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
-           (flags & (BTRFS_BLOCK_GROUP_RAID1 |
-                     BTRFS_BLOCK_GROUP_RAID10))) {
-               flags &= ~BTRFS_BLOCK_GROUP_DUP;
-       }
-
-       if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
-           (flags & BTRFS_BLOCK_GROUP_RAID10)) {
-               flags &= ~BTRFS_BLOCK_GROUP_RAID1;
-       }
+       tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
+                      BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
+                      BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
+       flags &= ~tmp;
 
-       if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
-           ((flags & BTRFS_BLOCK_GROUP_RAID1) |
-            (flags & BTRFS_BLOCK_GROUP_RAID10) |
-            (flags & BTRFS_BLOCK_GROUP_DUP))) {
-               flags &= ~BTRFS_BLOCK_GROUP_RAID0;
-       }
+       if (tmp & BTRFS_BLOCK_GROUP_RAID6)
+               tmp = BTRFS_BLOCK_GROUP_RAID6;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
+               tmp = BTRFS_BLOCK_GROUP_RAID5;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
+               tmp = BTRFS_BLOCK_GROUP_RAID10;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
+               tmp = BTRFS_BLOCK_GROUP_RAID1;
+       else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
+               tmp = BTRFS_BLOCK_GROUP_RAID0;
 
-       return extended_to_chunk(flags);
+       return extended_to_chunk(flags | tmp);
 }
 
 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
@@ -3345,6 +3405,7 @@ static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
 {
        u64 flags;
+       u64 ret;
 
        if (data)
                flags = BTRFS_BLOCK_GROUP_DATA;
@@ -3353,7 +3414,8 @@ u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
        else
                flags = BTRFS_BLOCK_GROUP_METADATA;
 
-       return get_alloc_profile(root, flags);
+       ret = get_alloc_profile(root, flags);
+       return ret;
 }
 
 /*
@@ -3528,8 +3590,10 @@ static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
 {
        u64 num_dev;
 
-       if (type & BTRFS_BLOCK_GROUP_RAID10 ||
-           type & BTRFS_BLOCK_GROUP_RAID0)
+       if (type & (BTRFS_BLOCK_GROUP_RAID10 |
+                   BTRFS_BLOCK_GROUP_RAID0 |
+                   BTRFS_BLOCK_GROUP_RAID5 |
+                   BTRFS_BLOCK_GROUP_RAID6))
                num_dev = root->fs_info->fs_devices->rw_devices;
        else if (type & BTRFS_BLOCK_GROUP_RAID1)
                num_dev = 2;
@@ -3677,6 +3741,7 @@ static int can_overcommit(struct btrfs_root *root,
        u64 rsv_size = 0;
        u64 avail;
        u64 used;
+       u64 to_add;
 
        used = space_info->bytes_used + space_info->bytes_reserved +
                space_info->bytes_pinned + space_info->bytes_readonly;
@@ -3703,24 +3768,34 @@ static int can_overcommit(struct btrfs_root *root,
 
        /*
         * If we have dup, raid1 or raid10 then only half of the free
-        * space is actually useable.
+        * space is actually useable.  For raid56, the space info used
+        * doesn't include the parity drive, so we don't have to
+        * change the math
         */
        if (profile & (BTRFS_BLOCK_GROUP_DUP |
                       BTRFS_BLOCK_GROUP_RAID1 |
                       BTRFS_BLOCK_GROUP_RAID10))
                avail >>= 1;
 
+       to_add = space_info->total_bytes;
+
        /*
         * If we aren't flushing all things, let us overcommit up to
         * 1/2th of the space. If we can flush, don't let us overcommit
         * too much, let it overcommit up to 1/8 of the space.
         */
        if (flush == BTRFS_RESERVE_FLUSH_ALL)
-               avail >>= 3;
+               to_add >>= 3;
        else
-               avail >>= 1;
+               to_add >>= 1;
+
+       /*
+        * Limit the overcommit to the amount of free space we could possibly
+        * allocate for chunks.
+        */
+       to_add = min(avail, to_add);
 
-       if (used + bytes < space_info->total_bytes + avail)
+       if (used + bytes < space_info->total_bytes + to_add)
                return 1;
        return 0;
 }
@@ -3958,7 +4033,7 @@ static int flush_space(struct btrfs_root *root,
  * @root - the root we're allocating for
  * @block_rsv - the block_rsv we're allocating for
  * @orig_bytes - the number of bytes we want
- * @flush - wether or not we can flush to make our reservation
+ * @flush - whether or not we can flush to make our reservation
  *
  * This will reserve orgi_bytes number of bytes from the space info associated
  * with the block_rsv.  If there is not enough space it will make an attempt to
@@ -4090,6 +4165,15 @@ again:
                goto again;
 
 out:
+       if (ret == -ENOSPC &&
+           unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
+               struct btrfs_block_rsv *global_rsv =
+                       &root->fs_info->global_block_rsv;
+
+               if (block_rsv != global_rsv &&
+                   !block_rsv_use_bytes(global_rsv, orig_bytes))
+                       ret = 0;
+       }
        if (flushing) {
                spin_lock(&space_info->lock);
                space_info->flush = 0;
@@ -4728,7 +4812,8 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
        spin_lock(&BTRFS_I(inode)->lock);
        dropped = drop_outstanding_extent(inode);
 
-       to_free = calc_csum_metadata_size(inode, num_bytes, 0);
+       if (num_bytes)
+               to_free = calc_csum_metadata_size(inode, num_bytes, 0);
        spin_unlock(&BTRFS_I(inode)->lock);
        if (dropped > 0)
                to_free += btrfs_calc_trans_metadata_size(root, dropped);
@@ -5518,10 +5603,14 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
        return ret;
 }
 
-static u64 stripe_align(struct btrfs_root *root, u64 val)
+static u64 stripe_align(struct btrfs_root *root,
+                       struct btrfs_block_group_cache *cache,
+                       u64 val, u64 num_bytes) /* cache, num_bytes: unused in this body */
 {
-       u64 mask = ((u64)root->stripesize - 1);
-       u64 ret = (val + mask) & ~mask;
+       u64 mask;
+       u64 ret;
+       mask = ((u64)root->stripesize - 1); /* low bits below the stripe size */
+       ret = (val + mask) & ~mask; /* rounds val up, assuming stripesize is a power of two */
        return ret;
 }
 
@@ -5541,7 +5630,6 @@ wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
                                u64 num_bytes)
 {
        struct btrfs_caching_control *caching_ctl;
-       DEFINE_WAIT(wait);
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
@@ -5558,7 +5646,6 @@ static noinline int
 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
 {
        struct btrfs_caching_control *caching_ctl;
-       DEFINE_WAIT(wait);
 
        caching_ctl = get_caching_control(cache);
        if (!caching_ctl)
@@ -5580,8 +5667,12 @@ int __get_raid_index(u64 flags)
                return BTRFS_RAID_DUP;
        else if (flags & BTRFS_BLOCK_GROUP_RAID0)
                return BTRFS_RAID_RAID0;
-       else
-               return BTRFS_RAID_SINGLE;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID5)
+               return BTRFS_RAID_RAID5;
+       else if (flags & BTRFS_BLOCK_GROUP_RAID6)
+               return BTRFS_RAID_RAID6;
+
+       return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
 }
 
 static int get_block_group_index(struct btrfs_block_group_cache *cache)
@@ -5724,6 +5815,8 @@ search:
                if (!block_group_bits(block_group, data)) {
                    u64 extra = BTRFS_BLOCK_GROUP_DUP |
                                BTRFS_BLOCK_GROUP_RAID1 |
+                               BTRFS_BLOCK_GROUP_RAID5 |
+                               BTRFS_BLOCK_GROUP_RAID6 |
                                BTRFS_BLOCK_GROUP_RAID10;
 
                        /*
@@ -5752,6 +5845,7 @@ have_block_group:
                 * lets look there
                 */
                if (last_ptr) {
+                       unsigned long aligned_cluster;
                        /*
                         * the refill lock keeps out other
                         * people trying to start a new cluster
@@ -5818,11 +5912,15 @@ refill_cluster:
                                goto unclustered_alloc;
                        }
 
+                       aligned_cluster = max_t(unsigned long,
+                                               empty_cluster + empty_size,
+                                             block_group->full_stripe_len);
+
                        /* allocate a cluster in this block group */
                        ret = btrfs_find_space_cluster(trans, root,
                                               block_group, last_ptr,
                                               search_start, num_bytes,
-                                              empty_cluster + empty_size);
+                                              aligned_cluster);
                        if (ret == 0) {
                                /*
                                 * now pull our allocation out of this
@@ -5893,7 +5991,8 @@ unclustered_alloc:
                        goto loop;
                }
 checks:
-               search_start = stripe_align(root, offset);
+               search_start = stripe_align(root, used_block_group,
+                                           offset, num_bytes);
 
                /* move on to the next group */
                if (search_start + num_bytes >
@@ -6044,7 +6143,7 @@ again:
        if (ret == -ENOSPC) {
                if (!final_tried) {
                        num_bytes = num_bytes >> 1;
-                       num_bytes = num_bytes & ~(root->sectorsize - 1);
+                       num_bytes = round_down(num_bytes, root->sectorsize);
                        num_bytes = max(num_bytes, min_alloc_size);
                        if (num_bytes == min_alloc_size)
                                final_tried = true;
@@ -6389,12 +6488,14 @@ use_block_rsv(struct btrfs_trans_handle *trans,
        if (!ret)
                return block_rsv;
        if (ret && !block_rsv->failfast) {
-               static DEFINE_RATELIMIT_STATE(_rs,
-                               DEFAULT_RATELIMIT_INTERVAL,
-                               /*DEFAULT_RATELIMIT_BURST*/ 2);
-               if (__ratelimit(&_rs))
-                       WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
-                            ret);
+               if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
+                       static DEFINE_RATELIMIT_STATE(_rs,
+                                       DEFAULT_RATELIMIT_INTERVAL * 10,
+                                       /*DEFAULT_RATELIMIT_BURST*/ 1);
+                       if (__ratelimit(&_rs))
+                               WARN(1, KERN_DEBUG
+                                       "btrfs: block rsv returned %d\n", ret);
+               }
                ret = reserve_metadata_bytes(root, block_rsv, blocksize,
                                             BTRFS_RESERVE_NO_FLUSH);
                if (!ret) {
@@ -7263,6 +7364,7 @@ static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
                root->fs_info->fs_devices->missing_devices;
 
        stripped = BTRFS_BLOCK_GROUP_RAID0 |
+               BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
                BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
 
        if (num_devices == 1) {
@@ -7711,11 +7813,13 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
                space_info = list_entry(info->space_info.next,
                                        struct btrfs_space_info,
                                        list);
-               if (space_info->bytes_pinned > 0 ||
-                   space_info->bytes_reserved > 0 ||
-                   space_info->bytes_may_use > 0) {
-                       WARN_ON(1);
-                       dump_space_info(space_info, 0, 0);
+               if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
+                       if (space_info->bytes_pinned > 0 ||
+                           space_info->bytes_reserved > 0 ||
+                           space_info->bytes_may_use > 0) {
+                               WARN_ON(1);
+                               dump_space_info(space_info, 0, 0);
+                       }
                }
                list_del(&space_info->list);
                kfree(space_info);
@@ -7814,7 +7918,9 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                btrfs_release_path(path);
                cache->flags = btrfs_block_group_flags(&cache->item);
                cache->sectorsize = root->sectorsize;
-
+               cache->full_stripe_len = btrfs_full_stripe_len(root,
+                                              &root->fs_info->mapping_tree,
+                                              found_key.objectid);
                btrfs_init_free_space_ctl(cache);
 
                /*
@@ -7868,6 +7974,8 @@ int btrfs_read_block_groups(struct btrfs_root *root)
                if (!(get_alloc_profile(root, space_info->flags) &
                      (BTRFS_BLOCK_GROUP_RAID10 |
                       BTRFS_BLOCK_GROUP_RAID1 |
+                      BTRFS_BLOCK_GROUP_RAID5 |
+                      BTRFS_BLOCK_GROUP_RAID6 |
                       BTRFS_BLOCK_GROUP_DUP)))
                        continue;
                /*
@@ -7943,6 +8051,9 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
        cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
        cache->sectorsize = root->sectorsize;
        cache->fs_info = root->fs_info;
+       cache->full_stripe_len = btrfs_full_stripe_len(root,
+                                              &root->fs_info->mapping_tree,
+                                              chunk_offset);
 
        atomic_set(&cache->count, 1);
        spin_lock_init(&cache->lock);