[linux-imx.git] fs/btrfs/extent-tree.c
Btrfs: only exclude supers in the range of our block group
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "raid56.h"
35 #include "locking.h"
36 #include "free-space-cache.h"
37 #include "math.h"
38
39 #undef SCRAMBLE_DELAYED_REFS
40
41 /*
42  * control flags for do_chunk_alloc's force field
43  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
44  * if we really need one.
45  *
46  * CHUNK_ALLOC_LIMITED means to only try and allocate one
47  * if we have very few chunks already allocated.  This is
48  * used as part of the clustering code to help make sure
49  * we have a good pool of storage to cluster in, without
50  * filling the FS with empty chunks
51  *
52  * CHUNK_ALLOC_FORCE means it must try to allocate one
53  *
54  */
55 enum {
56         CHUNK_ALLOC_NO_FORCE = 0,
57         CHUNK_ALLOC_LIMITED = 1,
58         CHUNK_ALLOC_FORCE = 2,
59 };
60
61 /*
62  * Control how reservations are dealt with.
63  *
64  * RESERVE_FREE - freeing a reservation.
65  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
66  *   ENOSPC accounting
67  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
68  *   bytes_may_use as the ENOSPC accounting is done elsewhere
69  */
70 enum {
71         RESERVE_FREE = 0,
72         RESERVE_ALLOC = 1,
73         RESERVE_ALLOC_NO_ACCOUNT = 2,
74 };
75
76 static int update_block_group(struct btrfs_root *root,
77                               u64 bytenr, u64 num_bytes, int alloc);
78 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79                                 struct btrfs_root *root,
80                                 u64 bytenr, u64 num_bytes, u64 parent,
81                                 u64 root_objectid, u64 owner_objectid,
82                                 u64 owner_offset, int refs_to_drop,
83                                 struct btrfs_delayed_extent_op *extra_op);
84 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
85                                     struct extent_buffer *leaf,
86                                     struct btrfs_extent_item *ei);
87 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
88                                       struct btrfs_root *root,
89                                       u64 parent, u64 root_objectid,
90                                       u64 flags, u64 owner, u64 offset,
91                                       struct btrfs_key *ins, int ref_mod);
92 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
93                                      struct btrfs_root *root,
94                                      u64 parent, u64 root_objectid,
95                                      u64 flags, struct btrfs_disk_key *key,
96                                      int level, struct btrfs_key *ins);
97 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
98                           struct btrfs_root *extent_root, u64 flags,
99                           int force);
100 static int find_next_key(struct btrfs_path *path, int level,
101                          struct btrfs_key *key);
102 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103                             int dump_block_groups);
104 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105                                        u64 num_bytes, int reserve);
106 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
107                                u64 num_bytes);
108
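/*
 * Lockless test of whether free space caching for this block group has
 * completed; the barrier pairs with the update side that sets ->cached
 * to BTRFS_CACHE_FINISHED after the free space has been recorded.
 */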
109 static noinline int
110 block_group_cache_done(struct btrfs_block_group_cache *cache)
111 {
112         smp_mb();
113         return cache->cached == BTRFS_CACHE_FINISHED;
114 }
115
116 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
117 {
118         return (cache->flags & bits) == bits;
119 }
120
121 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
122 {
123         atomic_inc(&cache->count);
124 }
125
126 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
127 {
128         if (atomic_dec_and_test(&cache->count)) {
129                 WARN_ON(cache->pinned > 0);
130                 WARN_ON(cache->reserved > 0);
131                 kfree(cache->free_space_ctl);
132                 kfree(cache);
133         }
134 }
135
136 /*
137  * this adds the block group to the fs_info rb tree for the block group
138  * cache
139  */
140 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
141                                 struct btrfs_block_group_cache *block_group)
142 {
143         struct rb_node **p;
144         struct rb_node *parent = NULL;
145         struct btrfs_block_group_cache *cache;
146
147         spin_lock(&info->block_group_cache_lock);
148         p = &info->block_group_cache_tree.rb_node;
149
150         while (*p) {
151                 parent = *p;
152                 cache = rb_entry(parent, struct btrfs_block_group_cache,
153                                  cache_node);
154                 if (block_group->key.objectid < cache->key.objectid) {
155                         p = &(*p)->rb_left;
156                 } else if (block_group->key.objectid > cache->key.objectid) {
157                         p = &(*p)->rb_right;
158                 } else {
159                         spin_unlock(&info->block_group_cache_lock);
160                         return -EEXIST;
161                 }
162         }
163
164         rb_link_node(&block_group->cache_node, parent, p);
165         rb_insert_color(&block_group->cache_node,
166                         &info->block_group_cache_tree);
167
168         if (info->first_logical_byte > block_group->key.objectid)
169                 info->first_logical_byte = block_group->key.objectid;
170
171         spin_unlock(&info->block_group_cache_lock);
172
173         return 0;
174 }
175
176 /*
177  * This will return the block group at or after bytenr if contains is 0, else
178  * it will return the block group that contains the bytenr
179  */
180 static struct btrfs_block_group_cache *
181 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
182                               int contains)
183 {
184         struct btrfs_block_group_cache *cache, *ret = NULL;
185         struct rb_node *n;
186         u64 end, start;
187
188         spin_lock(&info->block_group_cache_lock);
189         n = info->block_group_cache_tree.rb_node;
190
191         while (n) {
192                 cache = rb_entry(n, struct btrfs_block_group_cache,
193                                  cache_node);
194                 end = cache->key.objectid + cache->key.offset - 1;
195                 start = cache->key.objectid;
196
197                 if (bytenr < start) {
198                         if (!contains && (!ret || start < ret->key.objectid))
199                                 ret = cache;
200                         n = n->rb_left;
201                 } else if (bytenr > start) {
202                         if (contains && bytenr <= end) {
203                                 ret = cache;
204                                 break;
205                         }
206                         n = n->rb_right;
207                 } else {
208                         ret = cache;
209                         break;
210                 }
211         }
212         if (ret) {
213                 btrfs_get_block_group(ret);
214                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
215                         info->first_logical_byte = ret->key.objectid;
216         }
217         spin_unlock(&info->block_group_cache_lock);
218
219         return ret;
220 }
221
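/*
 * "Excluded" extents are byte ranges (for example superblock mirror copies)
 * that must never be handed out as free space.  They are tracked by setting
 * EXTENT_UPTODATE in both freed_extents trees until caching of the block
 * group finishes.
 */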
222 static int add_excluded_extent(struct btrfs_root *root,
223                                u64 start, u64 num_bytes)
224 {
225         u64 end = start + num_bytes - 1;
226         set_extent_bits(&root->fs_info->freed_extents[0],
227                         start, end, EXTENT_UPTODATE, GFP_NOFS);
228         set_extent_bits(&root->fs_info->freed_extents[1],
229                         start, end, EXTENT_UPTODATE, GFP_NOFS);
230         return 0;
231 }
232
233 static void free_excluded_extents(struct btrfs_root *root,
234                                   struct btrfs_block_group_cache *cache)
235 {
236         u64 start, end;
237
238         start = cache->key.objectid;
239         end = start + cache->key.offset - 1;
240
241         clear_extent_bits(&root->fs_info->freed_extents[0],
242                           start, end, EXTENT_UPTODATE, GFP_NOFS);
243         clear_extent_bits(&root->fs_info->freed_extents[1],
244                           start, end, EXTENT_UPTODATE, GFP_NOFS);
245 }
246
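/*
 * Mark every superblock copy that falls inside this block group as excluded,
 * clipping each stripe to the block group's boundaries, so the caching code
 * never reports those bytes as free space.
 */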
247 static int exclude_super_stripes(struct btrfs_root *root,
248                                  struct btrfs_block_group_cache *cache)
249 {
250         u64 bytenr;
251         u64 *logical;
252         int stripe_len;
253         int i, nr, ret;
254
255         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
256                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
257                 cache->bytes_super += stripe_len;
258                 ret = add_excluded_extent(root, cache->key.objectid,
259                                           stripe_len);
260                 if (ret)
261                         return ret;
262         }
263
264         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
265                 bytenr = btrfs_sb_offset(i);
266                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
267                                        cache->key.objectid, bytenr,
268                                        0, &logical, &nr, &stripe_len);
269                 if (ret)
270                         return ret;
271
272                 while (nr--) {
273                         u64 start, len;
274
275                         if (logical[nr] > cache->key.objectid +
276                             cache->key.offset)
277                                 continue;
278
279                         if (logical[nr] + stripe_len <= cache->key.objectid)
280                                 continue;
281
282                         start = logical[nr];
283                         if (start < cache->key.objectid) {
284                                 start = cache->key.objectid;
285                                 len = (logical[nr] + stripe_len) - start;
286                         } else {
287                                 len = min_t(u64, stripe_len,
288                                             cache->key.objectid +
289                                             cache->key.offset - start);
290                         }
291
292                         cache->bytes_super += len;
293                         ret = add_excluded_extent(root, start, len);
294                         if (ret) {
295                                 kfree(logical);
296                                 return ret;
297                         }
298                 }
299
300                 kfree(logical);
301         }
302         return 0;
303 }
304
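/*
 * Take a reference on the caching control of a block group whose slow
 * (extent tree) caching is in progress.  Returns NULL if caching has not
 * started, has already finished, or is being satisfied by the fast
 * free-space-cache load and so has no caching_ctl to hand out.
 */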
305 static struct btrfs_caching_control *
306 get_caching_control(struct btrfs_block_group_cache *cache)
307 {
308         struct btrfs_caching_control *ctl;
309
310         spin_lock(&cache->lock);
311         if (cache->cached != BTRFS_CACHE_STARTED) {
312                 spin_unlock(&cache->lock);
313                 return NULL;
314         }
315
316         /* We're loading it the fast way, so we don't have a caching_ctl. */
317         if (!cache->caching_ctl) {
318                 spin_unlock(&cache->lock);
319                 return NULL;
320         }
321
322         ctl = cache->caching_ctl;
323         atomic_inc(&ctl->count);
324         spin_unlock(&cache->lock);
325         return ctl;
326 }
327
328 static void put_caching_control(struct btrfs_caching_control *ctl)
329 {
330         if (atomic_dec_and_test(&ctl->count))
331                 kfree(ctl);
332 }
333
334 /*
335  * This is only called by cache_block_group.  Since we could have freed extents,
336  * we need to check the pinned_extents tree for any extents that can't be used
337  * yet, since their free space will be released as soon as the transaction commits.
338  */
339 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
340                               struct btrfs_fs_info *info, u64 start, u64 end)
341 {
342         u64 extent_start, extent_end, size, total_added = 0;
343         int ret;
344
345         while (start < end) {
346                 ret = find_first_extent_bit(info->pinned_extents, start,
347                                             &extent_start, &extent_end,
348                                             EXTENT_DIRTY | EXTENT_UPTODATE,
349                                             NULL);
350                 if (ret)
351                         break;
352
353                 if (extent_start <= start) {
354                         start = extent_end + 1;
355                 } else if (extent_start > start && extent_start < end) {
356                         size = extent_start - start;
357                         total_added += size;
358                         ret = btrfs_add_free_space(block_group, start,
359                                                    size);
360                         BUG_ON(ret); /* -ENOMEM or logic error */
361                         start = extent_end + 1;
362                 } else {
363                         break;
364                 }
365         }
366
367         if (start < end) {
368                 size = end - start;
369                 total_added += size;
370                 ret = btrfs_add_free_space(block_group, start, size);
371                 BUG_ON(ret); /* -ENOMEM or logic error */
372         }
373
374         return total_added;
375 }
376
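/*
 * Worker that does the slow caching: walk the extent tree (via the commit
 * root) over this block group's range and record the gaps between allocated
 * extents as free space.  Waiters are woken each time roughly 2MB of free
 * space has been found and again when caching completes.
 */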
377 static noinline void caching_thread(struct btrfs_work *work)
378 {
379         struct btrfs_block_group_cache *block_group;
380         struct btrfs_fs_info *fs_info;
381         struct btrfs_caching_control *caching_ctl;
382         struct btrfs_root *extent_root;
383         struct btrfs_path *path;
384         struct extent_buffer *leaf;
385         struct btrfs_key key;
386         u64 total_found = 0;
387         u64 last = 0;
388         u32 nritems;
389         int ret = 0;
390
391         caching_ctl = container_of(work, struct btrfs_caching_control, work);
392         block_group = caching_ctl->block_group;
393         fs_info = block_group->fs_info;
394         extent_root = fs_info->extent_root;
395
396         path = btrfs_alloc_path();
397         if (!path)
398                 goto out;
399
400         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
401
402         /*
403          * We don't want to deadlock with somebody trying to allocate a new
404          * extent for the extent root while also trying to search the extent
405          * root to add free space.  So we skip locking and search the commit
406          * root, since its read-only
407          * root, since it's read-only.
408         path->skip_locking = 1;
409         path->search_commit_root = 1;
410         path->reada = 1;
411
412         key.objectid = last;
413         key.offset = 0;
414         key.type = BTRFS_EXTENT_ITEM_KEY;
415 again:
416         mutex_lock(&caching_ctl->mutex);
417         /* need to make sure the commit_root doesn't disappear */
418         down_read(&fs_info->extent_commit_sem);
419
420         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
421         if (ret < 0)
422                 goto err;
423
424         leaf = path->nodes[0];
425         nritems = btrfs_header_nritems(leaf);
426
427         while (1) {
428                 if (btrfs_fs_closing(fs_info) > 1) {
429                         last = (u64)-1;
430                         break;
431                 }
432
433                 if (path->slots[0] < nritems) {
434                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
435                 } else {
436                         ret = find_next_key(path, 0, &key);
437                         if (ret)
438                                 break;
439
440                         if (need_resched()) {
441                                 caching_ctl->progress = last;
442                                 btrfs_release_path(path);
443                                 up_read(&fs_info->extent_commit_sem);
444                                 mutex_unlock(&caching_ctl->mutex);
445                                 cond_resched();
446                                 goto again;
447                         }
448
449                         ret = btrfs_next_leaf(extent_root, path);
450                         if (ret < 0)
451                                 goto err;
452                         if (ret)
453                                 break;
454                         leaf = path->nodes[0];
455                         nritems = btrfs_header_nritems(leaf);
456                         continue;
457                 }
458
459                 if (key.objectid < block_group->key.objectid) {
460                         path->slots[0]++;
461                         continue;
462                 }
463
464                 if (key.objectid >= block_group->key.objectid +
465                     block_group->key.offset)
466                         break;
467
468                 if (key.type == BTRFS_EXTENT_ITEM_KEY ||
469                     key.type == BTRFS_METADATA_ITEM_KEY) {
470                         total_found += add_new_free_space(block_group,
471                                                           fs_info, last,
472                                                           key.objectid);
473                         if (key.type == BTRFS_METADATA_ITEM_KEY)
474                                 last = key.objectid +
475                                         fs_info->tree_root->leafsize;
476                         else
477                                 last = key.objectid + key.offset;
478
479                         if (total_found > (1024 * 1024 * 2)) {
480                                 total_found = 0;
481                                 wake_up(&caching_ctl->wait);
482                         }
483                 }
484                 path->slots[0]++;
485         }
486         ret = 0;
487
488         total_found += add_new_free_space(block_group, fs_info, last,
489                                           block_group->key.objectid +
490                                           block_group->key.offset);
491         caching_ctl->progress = (u64)-1;
492
493         spin_lock(&block_group->lock);
494         block_group->caching_ctl = NULL;
495         block_group->cached = BTRFS_CACHE_FINISHED;
496         spin_unlock(&block_group->lock);
497
498 err:
499         btrfs_free_path(path);
500         up_read(&fs_info->extent_commit_sem);
501
502         free_excluded_extents(extent_root, block_group);
503
504         mutex_unlock(&caching_ctl->mutex);
505 out:
506         wake_up(&caching_ctl->wait);
507
508         put_caching_control(caching_ctl);
509         btrfs_put_block_group(block_group);
510 }
511
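/*
 * Start caching a block group's free space.  With the space_cache mount
 * option we first try to load the free space cache from disk; if that
 * succeeds the group is marked finished immediately.  Otherwise (unless the
 * caller asked for load_cache_only) the caching_ctl is queued so
 * caching_thread() can build the free space from the extent tree.
 */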
512 static int cache_block_group(struct btrfs_block_group_cache *cache,
513                              int load_cache_only)
514 {
515         DEFINE_WAIT(wait);
516         struct btrfs_fs_info *fs_info = cache->fs_info;
517         struct btrfs_caching_control *caching_ctl;
518         int ret = 0;
519
520         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
521         if (!caching_ctl)
522                 return -ENOMEM;
523
524         INIT_LIST_HEAD(&caching_ctl->list);
525         mutex_init(&caching_ctl->mutex);
526         init_waitqueue_head(&caching_ctl->wait);
527         caching_ctl->block_group = cache;
528         caching_ctl->progress = cache->key.objectid;
529         atomic_set(&caching_ctl->count, 1);
530         caching_ctl->work.func = caching_thread;
531
532         spin_lock(&cache->lock);
533         /*
534          * This should be a rare occasion, but this could happen I think in the
535          * case where one thread starts to load the space cache info, and then
536          * some other thread starts a transaction commit which tries to do an
537          * allocation while the other thread is still loading the space cache
538          * info.  The previous loop should have kept us from choosing this block
539          * group, but if we've moved to the state where we will wait on caching
540          * block groups we need to first check if we're doing a fast load here,
541          * so we can wait for it to finish, otherwise we could end up allocating
542          * from a block group whose cache gets evicted for one reason or
543          * another.
544          */
545         while (cache->cached == BTRFS_CACHE_FAST) {
546                 struct btrfs_caching_control *ctl;
547
548                 ctl = cache->caching_ctl;
549                 atomic_inc(&ctl->count);
550                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
551                 spin_unlock(&cache->lock);
552
553                 schedule();
554
555                 finish_wait(&ctl->wait, &wait);
556                 put_caching_control(ctl);
557                 spin_lock(&cache->lock);
558         }
559
560         if (cache->cached != BTRFS_CACHE_NO) {
561                 spin_unlock(&cache->lock);
562                 kfree(caching_ctl);
563                 return 0;
564         }
565         WARN_ON(cache->caching_ctl);
566         cache->caching_ctl = caching_ctl;
567         cache->cached = BTRFS_CACHE_FAST;
568         spin_unlock(&cache->lock);
569
570         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
571                 ret = load_free_space_cache(fs_info, cache);
572
573                 spin_lock(&cache->lock);
574                 if (ret == 1) {
575                         cache->caching_ctl = NULL;
576                         cache->cached = BTRFS_CACHE_FINISHED;
577                         cache->last_byte_to_unpin = (u64)-1;
578                 } else {
579                         if (load_cache_only) {
580                                 cache->caching_ctl = NULL;
581                                 cache->cached = BTRFS_CACHE_NO;
582                         } else {
583                                 cache->cached = BTRFS_CACHE_STARTED;
584                         }
585                 }
586                 spin_unlock(&cache->lock);
587                 wake_up(&caching_ctl->wait);
588                 if (ret == 1) {
589                         put_caching_control(caching_ctl);
590                         free_excluded_extents(fs_info->extent_root, cache);
591                         return 0;
592                 }
593         } else {
594                 /*
595                  * We are not going to do the fast caching, set cached to the
596                  * appropriate value and wake up any waiters.
597                  */
598                 spin_lock(&cache->lock);
599                 if (load_cache_only) {
600                         cache->caching_ctl = NULL;
601                         cache->cached = BTRFS_CACHE_NO;
602                 } else {
603                         cache->cached = BTRFS_CACHE_STARTED;
604                 }
605                 spin_unlock(&cache->lock);
606                 wake_up(&caching_ctl->wait);
607         }
608
609         if (load_cache_only) {
610                 put_caching_control(caching_ctl);
611                 return 0;
612         }
613
614         down_write(&fs_info->extent_commit_sem);
615         atomic_inc(&caching_ctl->count);
616         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
617         up_write(&fs_info->extent_commit_sem);
618
619         btrfs_get_block_group(cache);
620
621         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
622
623         return ret;
624 }
625
626 /*
627  * return the block group that starts at or after bytenr
628  */
629 static struct btrfs_block_group_cache *
630 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
631 {
632         struct btrfs_block_group_cache *cache;
633
634         cache = block_group_cache_tree_search(info, bytenr, 0);
635
636         return cache;
637 }
638
639 /*
640  * return the block group that contains the given bytenr
641  */
642 struct btrfs_block_group_cache *btrfs_lookup_block_group(
643                                                  struct btrfs_fs_info *info,
644                                                  u64 bytenr)
645 {
646         struct btrfs_block_group_cache *cache;
647
648         cache = block_group_cache_tree_search(info, bytenr, 1);
649
650         return cache;
651 }
652
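/* Find the space_info that tracks the requested block group type (data, metadata or system). */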
653 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
654                                                   u64 flags)
655 {
656         struct list_head *head = &info->space_info;
657         struct btrfs_space_info *found;
658
659         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
660
661         rcu_read_lock();
662         list_for_each_entry_rcu(found, head, list) {
663                 if (found->flags & flags) {
664                         rcu_read_unlock();
665                         return found;
666                 }
667         }
668         rcu_read_unlock();
669         return NULL;
670 }
671
672 /*
673  * after adding space to the filesystem, we need to clear the full flags
674  * on all the space infos.
675  */
676 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
677 {
678         struct list_head *head = &info->space_info;
679         struct btrfs_space_info *found;
680
681         rcu_read_lock();
682         list_for_each_entry_rcu(found, head, list)
683                 found->full = 0;
684         rcu_read_unlock();
685 }
686
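/*
 * Scan metadata block groups starting at max(search_hint, search_start) and
 * return the start of the first one that is less than ~90% used (factor 9);
 * wrap around once, then retry including read-only groups with factor 10.
 */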
687 u64 btrfs_find_block_group(struct btrfs_root *root,
688                            u64 search_start, u64 search_hint, int owner)
689 {
690         struct btrfs_block_group_cache *cache;
691         u64 used;
692         u64 last = max(search_hint, search_start);
693         u64 group_start = 0;
694         int full_search = 0;
695         int factor = 9;
696         int wrapped = 0;
697 again:
698         while (1) {
699                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
700                 if (!cache)
701                         break;
702
703                 spin_lock(&cache->lock);
704                 last = cache->key.objectid + cache->key.offset;
705                 used = btrfs_block_group_used(&cache->item);
706
707                 if ((full_search || !cache->ro) &&
708                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
709                         if (used + cache->pinned + cache->reserved <
710                             div_factor(cache->key.offset, factor)) {
711                                 group_start = cache->key.objectid;
712                                 spin_unlock(&cache->lock);
713                                 btrfs_put_block_group(cache);
714                                 goto found;
715                         }
716                 }
717                 spin_unlock(&cache->lock);
718                 btrfs_put_block_group(cache);
719                 cond_resched();
720         }
721         if (!wrapped) {
722                 last = search_start;
723                 wrapped = 1;
724                 goto again;
725         }
726         if (!full_search && factor < 10) {
727                 last = search_start;
728                 full_search = 1;
729                 factor = 10;
730                 goto again;
731         }
732 found:
733         return group_start;
734 }
735
736 /* simple helper to search for an existing extent at a given offset */
737 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
738 {
739         int ret;
740         struct btrfs_key key;
741         struct btrfs_path *path;
742
743         path = btrfs_alloc_path();
744         if (!path)
745                 return -ENOMEM;
746
747         key.objectid = start;
748         key.offset = len;
749         key.type = BTRFS_EXTENT_ITEM_KEY;
750         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
751                                 0, 0);
752         if (ret > 0) {
753                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
754                 if (key.objectid == start &&
755                     key.type == BTRFS_METADATA_ITEM_KEY)
756                         ret = 0;
757         }
758         btrfs_free_path(path);
759         return ret;
760 }
761
762 /*
763  * helper function to lookup reference count and flags of a tree block.
764  *
765  * The head node for a delayed ref is used to store the sum of all the
766  * reference count modifications queued up in the rbtree.  The head
767  * node may also store the extent flags to set.  This way you can check
768  * to see what the reference count and extent flags would be once all of
769  * the delayed refs have been processed.
770  */
771 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
772                              struct btrfs_root *root, u64 bytenr,
773                              u64 offset, int metadata, u64 *refs, u64 *flags)
774 {
775         struct btrfs_delayed_ref_head *head;
776         struct btrfs_delayed_ref_root *delayed_refs;
777         struct btrfs_path *path;
778         struct btrfs_extent_item *ei;
779         struct extent_buffer *leaf;
780         struct btrfs_key key;
781         u32 item_size;
782         u64 num_refs;
783         u64 extent_flags;
784         int ret;
785
786         /*
787          * If we don't have skinny metadata, don't bother doing anything
788          * different
789          */
790         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) {
791                 offset = root->leafsize;
792                 metadata = 0;
793         }
794
795         path = btrfs_alloc_path();
796         if (!path)
797                 return -ENOMEM;
798
799         if (metadata) {
800                 key.objectid = bytenr;
801                 key.type = BTRFS_METADATA_ITEM_KEY;
802                 key.offset = offset;
803         } else {
804                 key.objectid = bytenr;
805                 key.type = BTRFS_EXTENT_ITEM_KEY;
806                 key.offset = offset;
807         }
808
809         if (!trans) {
810                 path->skip_locking = 1;
811                 path->search_commit_root = 1;
812         }
813 again:
814         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
815                                 &key, path, 0, 0);
816         if (ret < 0)
817                 goto out_free;
818
819         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
820                 key.type = BTRFS_EXTENT_ITEM_KEY;
821                 key.offset = root->leafsize;
822                 btrfs_release_path(path);
823                 goto again;
824         }
825
826         if (ret == 0) {
827                 leaf = path->nodes[0];
828                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
829                 if (item_size >= sizeof(*ei)) {
830                         ei = btrfs_item_ptr(leaf, path->slots[0],
831                                             struct btrfs_extent_item);
832                         num_refs = btrfs_extent_refs(leaf, ei);
833                         extent_flags = btrfs_extent_flags(leaf, ei);
834                 } else {
835 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
836                         struct btrfs_extent_item_v0 *ei0;
837                         BUG_ON(item_size != sizeof(*ei0));
838                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
839                                              struct btrfs_extent_item_v0);
840                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
841                         /* FIXME: this isn't correct for data */
842                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
843 #else
844                         BUG();
845 #endif
846                 }
847                 BUG_ON(num_refs == 0);
848         } else {
849                 num_refs = 0;
850                 extent_flags = 0;
851                 ret = 0;
852         }
853
854         if (!trans)
855                 goto out;
856
857         delayed_refs = &trans->transaction->delayed_refs;
858         spin_lock(&delayed_refs->lock);
859         head = btrfs_find_delayed_ref_head(trans, bytenr);
860         if (head) {
861                 if (!mutex_trylock(&head->mutex)) {
862                         atomic_inc(&head->node.refs);
863                         spin_unlock(&delayed_refs->lock);
864
865                         btrfs_release_path(path);
866
867                         /*
868                          * Mutex was contended, block until it's released and try
869                          * again
870                          */
871                         mutex_lock(&head->mutex);
872                         mutex_unlock(&head->mutex);
873                         btrfs_put_delayed_ref(&head->node);
874                         goto again;
875                 }
876                 if (head->extent_op && head->extent_op->update_flags)
877                         extent_flags |= head->extent_op->flags_to_set;
878                 else
879                         BUG_ON(num_refs == 0);
880
881                 num_refs += head->node.ref_mod;
882                 mutex_unlock(&head->mutex);
883         }
884         spin_unlock(&delayed_refs->lock);
885 out:
886         WARN_ON(num_refs == 0);
887         if (refs)
888                 *refs = num_refs;
889         if (flags)
890                 *flags = extent_flags;
891 out_free:
892         btrfs_free_path(path);
893         return ret;
894 }
895
896 /*
897  * Back reference rules.  Back refs have three main goals:
898  *
899  * 1) differentiate between all holders of references to an extent so that
900  *    when a reference is dropped we can make sure it was a valid reference
901  *    before freeing the extent.
902  *
903  * 2) Provide enough information to quickly find the holders of an extent
904  *    if we notice a given block is corrupted or bad.
905  *
906  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
907  *    maintenance.  This is actually the same as #2, but with a slightly
908  *    different use case.
909  *
910  * There are two kinds of back refs. The implicit back ref is optimized
911  * for pointers in non-shared tree blocks. For a given pointer in a block,
912  * back refs of this kind provide information about the block's owner tree
913  * and the pointer's key. This information allows us to find the block by
914  * b-tree searching. The full back ref is for pointers in tree blocks not
915  * referenced by their owner trees. The location of the tree block is recorded
916  * in the back refs. Actually the full back ref is generic, and can be
917  * used in all cases where the implicit back ref is used. The major shortcoming
918  * of the full back ref is its overhead. Every time a tree block gets
919  * COWed, we have to update the back ref entries for all pointers in it.
920  *
921  * For a newly allocated tree block, we use implicit back refs for
922  * pointers in it. This means most tree related operations only involve
923  * implicit back refs. For a tree block created in an old transaction, the
924  * only way to drop a reference to it is to COW it. So we can detect the
925  * event that a tree block loses its owner tree's reference and do the
926  * back ref conversion.
927  *
928  * When a tree block is COW'd through a tree, there are four cases:
929  *
930  * The reference count of the block is one and the tree is the block's
931  * owner tree. Nothing to do in this case.
932  *
933  * The reference count of the block is one and the tree is not the
934  * block's owner tree. In this case, full back refs are used for pointers
935  * in the block. Remove these full back refs, add implicit back refs for
936  * every pointer in the new block.
937  *
938  * The reference count of the block is greater than one and the tree is
939  * the block's owner tree. In this case, implicit back refs are used for
940  * pointers in the block. Add full back refs for every pointer in the
941  * block, increase lower level extents' reference counts. The original
942  * implicit back refs are carried over to the new block.
943  *
944  * The reference count of the block is greater than one and the tree is
945  * not the block's owner tree. Add implicit back refs for every pointer in
946  * the new block, increase lower level extents' reference counts.
947  *
948  * Back Reference Key composing:
949  *
950  * The key objectid corresponds to the first byte in the extent,
951  * The key type is used to differentiate between types of back refs.
952  * There are different meanings of the key offset for different types
953  * of back refs.
954  *
955  * File extents can be referenced by:
956  *
957  * - multiple snapshots, subvolumes, or different generations in one subvol
958  * - different files inside a single subvolume
959  * - different offsets inside a file (bookend extents in file.c)
960  *
961  * The extent ref structure for the implicit back refs has fields for:
962  *
963  * - Objectid of the subvolume root
964  * - objectid of the file holding the reference
965  * - original offset in the file
966  * - how many bookend extents
967  *
968  * The key offset for the implicit back refs is hash of the first
969  * three fields.
970  *
971  * The extent ref structure for the full back refs has a field for:
972  *
973  * - number of pointers in the tree leaf
974  *
975  * The key offset for the full back refs is the first byte of
976  * the tree leaf.
977  *
978  * When a file extent is allocated, the implicit back refs are used and
979  * the fields are filled in:
980  *
981  *     (root_key.objectid, inode objectid, offset in file, 1)
982  *
983  * When a file extent is removed by file truncation, we find the
984  * corresponding implicit back refs and check the following fields:
985  *
986  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
987  *
988  * Btree extents can be referenced by:
989  *
990  * - Different subvolumes
991  *
992  * Both the implicit back refs and the full back refs for tree blocks
993  * only consist of a key. The key offset for the implicit back refs is
994  * the objectid of the block's owner tree. The key offset for the full back
995  * refs is the first byte of the parent block.
996  *
997  * When implicit back refs are used, information about the lowest key and
998  * level of the tree block is required. This information is stored in the
999  * tree block info structure.
1000  */
1001
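/*
 * Illustrative example (not part of the original source): a data extent
 * referenced once by inode 257 at file offset 0 in the default subvolume
 * (root 5) gets an implicit back ref keyed as
 *
 *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY, hash(5, 257, 0))
 *
 * where the hash is the one computed by hash_extent_data_ref() below.
 */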
1002 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1003 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
1004                                   struct btrfs_root *root,
1005                                   struct btrfs_path *path,
1006                                   u64 owner, u32 extra_size)
1007 {
1008         struct btrfs_extent_item *item;
1009         struct btrfs_extent_item_v0 *ei0;
1010         struct btrfs_extent_ref_v0 *ref0;
1011         struct btrfs_tree_block_info *bi;
1012         struct extent_buffer *leaf;
1013         struct btrfs_key key;
1014         struct btrfs_key found_key;
1015         u32 new_size = sizeof(*item);
1016         u64 refs;
1017         int ret;
1018
1019         leaf = path->nodes[0];
1020         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
1021
1022         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1023         ei0 = btrfs_item_ptr(leaf, path->slots[0],
1024                              struct btrfs_extent_item_v0);
1025         refs = btrfs_extent_refs_v0(leaf, ei0);
1026
1027         if (owner == (u64)-1) {
1028                 while (1) {
1029                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
1030                                 ret = btrfs_next_leaf(root, path);
1031                                 if (ret < 0)
1032                                         return ret;
1033                                 BUG_ON(ret > 0); /* Corruption */
1034                                 leaf = path->nodes[0];
1035                         }
1036                         btrfs_item_key_to_cpu(leaf, &found_key,
1037                                               path->slots[0]);
1038                         BUG_ON(key.objectid != found_key.objectid);
1039                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
1040                                 path->slots[0]++;
1041                                 continue;
1042                         }
1043                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1044                                               struct btrfs_extent_ref_v0);
1045                         owner = btrfs_ref_objectid_v0(leaf, ref0);
1046                         break;
1047                 }
1048         }
1049         btrfs_release_path(path);
1050
1051         if (owner < BTRFS_FIRST_FREE_OBJECTID)
1052                 new_size += sizeof(*bi);
1053
1054         new_size -= sizeof(*ei0);
1055         ret = btrfs_search_slot(trans, root, &key, path,
1056                                 new_size + extra_size, 1);
1057         if (ret < 0)
1058                 return ret;
1059         BUG_ON(ret); /* Corruption */
1060
1061         btrfs_extend_item(root, path, new_size);
1062
1063         leaf = path->nodes[0];
1064         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1065         btrfs_set_extent_refs(leaf, item, refs);
1066         /* FIXME: get real generation */
1067         btrfs_set_extent_generation(leaf, item, 0);
1068         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1069                 btrfs_set_extent_flags(leaf, item,
1070                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1071                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1072                 bi = (struct btrfs_tree_block_info *)(item + 1);
1073                 /* FIXME: get first key of the block */
1074                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1075                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1076         } else {
1077                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1078         }
1079         btrfs_mark_buffer_dirty(leaf);
1080         return 0;
1081 }
1082 #endif
1083
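/*
 * Hash (root objectid, inode objectid, file offset) into the 64-bit value
 * used as the key offset of an EXTENT_DATA_REF item; built from two crc32c
 * checksums so unrelated refs land at different offsets.
 */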
1084 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1085 {
1086         u32 high_crc = ~(u32)0;
1087         u32 low_crc = ~(u32)0;
1088         __le64 lenum;
1089
1090         lenum = cpu_to_le64(root_objectid);
1091         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1092         lenum = cpu_to_le64(owner);
1093         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1094         lenum = cpu_to_le64(offset);
1095         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1096
1097         return ((u64)high_crc << 31) ^ (u64)low_crc;
1098 }
1099
1100 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1101                                      struct btrfs_extent_data_ref *ref)
1102 {
1103         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1104                                     btrfs_extent_data_ref_objectid(leaf, ref),
1105                                     btrfs_extent_data_ref_offset(leaf, ref));
1106 }
1107
1108 static int match_extent_data_ref(struct extent_buffer *leaf,
1109                                  struct btrfs_extent_data_ref *ref,
1110                                  u64 root_objectid, u64 owner, u64 offset)
1111 {
1112         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1113             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1114             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1115                 return 0;
1116         return 1;
1117 }
1118
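/*
 * Find the existing data ref item for this extent: a SHARED_DATA_REF keyed
 * by @parent, or an EXTENT_DATA_REF keyed by the hash above.  Because hashes
 * can collide, we walk forward through items with the same objectid and type
 * until match_extent_data_ref() succeeds.  Returns 0 if found, -ENOENT if not.
 */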
1119 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1120                                            struct btrfs_root *root,
1121                                            struct btrfs_path *path,
1122                                            u64 bytenr, u64 parent,
1123                                            u64 root_objectid,
1124                                            u64 owner, u64 offset)
1125 {
1126         struct btrfs_key key;
1127         struct btrfs_extent_data_ref *ref;
1128         struct extent_buffer *leaf;
1129         u32 nritems;
1130         int ret;
1131         int recow;
1132         int err = -ENOENT;
1133
1134         key.objectid = bytenr;
1135         if (parent) {
1136                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1137                 key.offset = parent;
1138         } else {
1139                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1140                 key.offset = hash_extent_data_ref(root_objectid,
1141                                                   owner, offset);
1142         }
1143 again:
1144         recow = 0;
1145         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1146         if (ret < 0) {
1147                 err = ret;
1148                 goto fail;
1149         }
1150
1151         if (parent) {
1152                 if (!ret)
1153                         return 0;
1154 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1155                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1156                 btrfs_release_path(path);
1157                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1158                 if (ret < 0) {
1159                         err = ret;
1160                         goto fail;
1161                 }
1162                 if (!ret)
1163                         return 0;
1164 #endif
1165                 goto fail;
1166         }
1167
1168         leaf = path->nodes[0];
1169         nritems = btrfs_header_nritems(leaf);
1170         while (1) {
1171                 if (path->slots[0] >= nritems) {
1172                         ret = btrfs_next_leaf(root, path);
1173                         if (ret < 0)
1174                                 err = ret;
1175                         if (ret)
1176                                 goto fail;
1177
1178                         leaf = path->nodes[0];
1179                         nritems = btrfs_header_nritems(leaf);
1180                         recow = 1;
1181                 }
1182
1183                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1184                 if (key.objectid != bytenr ||
1185                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1186                         goto fail;
1187
1188                 ref = btrfs_item_ptr(leaf, path->slots[0],
1189                                      struct btrfs_extent_data_ref);
1190
1191                 if (match_extent_data_ref(leaf, ref, root_objectid,
1192                                           owner, offset)) {
1193                         if (recow) {
1194                                 btrfs_release_path(path);
1195                                 goto again;
1196                         }
1197                         err = 0;
1198                         break;
1199                 }
1200                 path->slots[0]++;
1201         }
1202 fail:
1203         return err;
1204 }
1205
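/*
 * Insert a new data ref item, or bump the count of an existing one by
 * @refs_to_add.  A hash collision (an -EEXIST item that does not match) is
 * resolved by incrementing key.offset and trying the insert again.
 */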
1206 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1207                                            struct btrfs_root *root,
1208                                            struct btrfs_path *path,
1209                                            u64 bytenr, u64 parent,
1210                                            u64 root_objectid, u64 owner,
1211                                            u64 offset, int refs_to_add)
1212 {
1213         struct btrfs_key key;
1214         struct extent_buffer *leaf;
1215         u32 size;
1216         u32 num_refs;
1217         int ret;
1218
1219         key.objectid = bytenr;
1220         if (parent) {
1221                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1222                 key.offset = parent;
1223                 size = sizeof(struct btrfs_shared_data_ref);
1224         } else {
1225                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1226                 key.offset = hash_extent_data_ref(root_objectid,
1227                                                   owner, offset);
1228                 size = sizeof(struct btrfs_extent_data_ref);
1229         }
1230
1231         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1232         if (ret && ret != -EEXIST)
1233                 goto fail;
1234
1235         leaf = path->nodes[0];
1236         if (parent) {
1237                 struct btrfs_shared_data_ref *ref;
1238                 ref = btrfs_item_ptr(leaf, path->slots[0],
1239                                      struct btrfs_shared_data_ref);
1240                 if (ret == 0) {
1241                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1242                 } else {
1243                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1244                         num_refs += refs_to_add;
1245                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1246                 }
1247         } else {
1248                 struct btrfs_extent_data_ref *ref;
1249                 while (ret == -EEXIST) {
1250                         ref = btrfs_item_ptr(leaf, path->slots[0],
1251                                              struct btrfs_extent_data_ref);
1252                         if (match_extent_data_ref(leaf, ref, root_objectid,
1253                                                   owner, offset))
1254                                 break;
1255                         btrfs_release_path(path);
1256                         key.offset++;
1257                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1258                                                       size);
1259                         if (ret && ret != -EEXIST)
1260                                 goto fail;
1261
1262                         leaf = path->nodes[0];
1263                 }
1264                 ref = btrfs_item_ptr(leaf, path->slots[0],
1265                                      struct btrfs_extent_data_ref);
1266                 if (ret == 0) {
1267                         btrfs_set_extent_data_ref_root(leaf, ref,
1268                                                        root_objectid);
1269                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1270                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1271                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1272                 } else {
1273                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1274                         num_refs += refs_to_add;
1275                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1276                 }
1277         }
1278         btrfs_mark_buffer_dirty(leaf);
1279         ret = 0;
1280 fail:
1281         btrfs_release_path(path);
1282         return ret;
1283 }
1284
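/*
 * Drop @refs_to_drop references from the data ref item the path points at,
 * deleting the item entirely once its count reaches zero.
 */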
1285 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1286                                            struct btrfs_root *root,
1287                                            struct btrfs_path *path,
1288                                            int refs_to_drop)
1289 {
1290         struct btrfs_key key;
1291         struct btrfs_extent_data_ref *ref1 = NULL;
1292         struct btrfs_shared_data_ref *ref2 = NULL;
1293         struct extent_buffer *leaf;
1294         u32 num_refs = 0;
1295         int ret = 0;
1296
1297         leaf = path->nodes[0];
1298         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1299
1300         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1301                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1302                                       struct btrfs_extent_data_ref);
1303                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1304         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1305                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1306                                       struct btrfs_shared_data_ref);
1307                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1308 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1309         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1310                 struct btrfs_extent_ref_v0 *ref0;
1311                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1312                                       struct btrfs_extent_ref_v0);
1313                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1314 #endif
1315         } else {
1316                 BUG();
1317         }
1318
1319         BUG_ON(num_refs < refs_to_drop);
1320         num_refs -= refs_to_drop;
1321
1322         if (num_refs == 0) {
1323                 ret = btrfs_del_item(trans, root, path);
1324         } else {
1325                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1326                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1327                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1328                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1329 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1330                 else {
1331                         struct btrfs_extent_ref_v0 *ref0;
1332                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1333                                         struct btrfs_extent_ref_v0);
1334                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1335                 }
1336 #endif
1337                 btrfs_mark_buffer_dirty(leaf);
1338         }
1339         return ret;
1340 }
1341
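/*
 * Return the reference count stored in a data ref, whether it lives in an
 * inline ref, a standalone EXTENT_DATA_REF/SHARED_DATA_REF item, or a v0
 * compat item.
 */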
1342 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1343                                           struct btrfs_path *path,
1344                                           struct btrfs_extent_inline_ref *iref)
1345 {
1346         struct btrfs_key key;
1347         struct extent_buffer *leaf;
1348         struct btrfs_extent_data_ref *ref1;
1349         struct btrfs_shared_data_ref *ref2;
1350         u32 num_refs = 0;
1351
1352         leaf = path->nodes[0];
1353         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1354         if (iref) {
1355                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1356                     BTRFS_EXTENT_DATA_REF_KEY) {
1357                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1358                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1359                 } else {
1360                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1361                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1362                 }
1363         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1364                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1365                                       struct btrfs_extent_data_ref);
1366                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1367         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1368                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1369                                       struct btrfs_shared_data_ref);
1370                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1371 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1372         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1373                 struct btrfs_extent_ref_v0 *ref0;
1374                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1375                                       struct btrfs_extent_ref_v0);
1376                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1377 #endif
1378         } else {
1379                 WARN_ON(1);
1380         }
1381         return num_refs;
1382 }
1383
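/*
 * Look up the keyed backref item for a tree block: a SHARED_BLOCK_REF if a
 * parent is given, otherwise a TREE_BLOCK_REF.  Returns 0 if the item is
 * found and -ENOENT if it is not.
 */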
1384 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1385                                           struct btrfs_root *root,
1386                                           struct btrfs_path *path,
1387                                           u64 bytenr, u64 parent,
1388                                           u64 root_objectid)
1389 {
1390         struct btrfs_key key;
1391         int ret;
1392
1393         key.objectid = bytenr;
1394         if (parent) {
1395                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1396                 key.offset = parent;
1397         } else {
1398                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1399                 key.offset = root_objectid;
1400         }
1401
1402         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1403         if (ret > 0)
1404                 ret = -ENOENT;
1405 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1406         if (ret == -ENOENT && parent) {
1407                 btrfs_release_path(path);
1408                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1409                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1410                 if (ret > 0)
1411                         ret = -ENOENT;
1412         }
1413 #endif
1414         return ret;
1415 }
1416
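/*
 * Insert the keyed backref item for a tree block.  The item has no payload;
 * the key type and offset alone record the reference.
 */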
1417 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1418                                           struct btrfs_root *root,
1419                                           struct btrfs_path *path,
1420                                           u64 bytenr, u64 parent,
1421                                           u64 root_objectid)
1422 {
1423         struct btrfs_key key;
1424         int ret;
1425
1426         key.objectid = bytenr;
1427         if (parent) {
1428                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1429                 key.offset = parent;
1430         } else {
1431                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1432                 key.offset = root_objectid;
1433         }
1434
1435         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1436         btrfs_release_path(path);
1437         return ret;
1438 }
1439
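/*
 * Pick the backref key type for a reference: tree blocks use shared/tree
 * block refs, file data uses shared/extent data refs, depending on whether
 * a parent block is given.
 */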
1440 static inline int extent_ref_type(u64 parent, u64 owner)
1441 {
1442         int type;
1443         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1444                 if (parent > 0)
1445                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1446                 else
1447                         type = BTRFS_TREE_BLOCK_REF_KEY;
1448         } else {
1449                 if (parent > 0)
1450                         type = BTRFS_SHARED_DATA_REF_KEY;
1451                 else
1452                         type = BTRFS_EXTENT_DATA_REF_KEY;
1453         }
1454         return type;
1455 }
1456
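/*
 * Find the key immediately after the current path position, walking up the
 * levels if the current leaf/node is exhausted.  Returns 0 and fills @key,
 * or 1 if there is no next key.
 */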
1457 static int find_next_key(struct btrfs_path *path, int level,
1458                          struct btrfs_key *key)
1460 {
1461         for (; level < BTRFS_MAX_LEVEL; level++) {
1462                 if (!path->nodes[level])
1463                         break;
1464                 if (path->slots[level] + 1 >=
1465                     btrfs_header_nritems(path->nodes[level]))
1466                         continue;
1467                 if (level == 0)
1468                         btrfs_item_key_to_cpu(path->nodes[level], key,
1469                                               path->slots[level] + 1);
1470                 else
1471                         btrfs_node_key_to_cpu(path->nodes[level], key,
1472                                               path->slots[level] + 1);
1473                 return 0;
1474         }
1475         return 1;
1476 }
1477
1478 /*
1479  * look for inline back ref. if back ref is found, *ref_ret is set
1480  * to the address of inline back ref, and 0 is returned.
1481  *
1482  * if back ref isn't found, *ref_ret is set to the address where it
1483  * should be inserted, and -ENOENT is returned.
1484  *
1485  * if insert is true and there are too many inline back refs, the path
1486  * points to the extent item, and -EAGAIN is returned.
1487  *
1488  * NOTE: inline back refs are ordered in the same way that back ref
1489  *       items in the tree are ordered.
1490  */
1491 static noinline_for_stack
1492 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1493                                  struct btrfs_root *root,
1494                                  struct btrfs_path *path,
1495                                  struct btrfs_extent_inline_ref **ref_ret,
1496                                  u64 bytenr, u64 num_bytes,
1497                                  u64 parent, u64 root_objectid,
1498                                  u64 owner, u64 offset, int insert)
1499 {
1500         struct btrfs_key key;
1501         struct extent_buffer *leaf;
1502         struct btrfs_extent_item *ei;
1503         struct btrfs_extent_inline_ref *iref;
1504         u64 flags;
1505         u64 item_size;
1506         unsigned long ptr;
1507         unsigned long end;
1508         int extra_size;
1509         int type;
1510         int want;
1511         int ret;
1512         int err = 0;
1513         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
1514                                                  SKINNY_METADATA);
1515
1516         key.objectid = bytenr;
1517         key.type = BTRFS_EXTENT_ITEM_KEY;
1518         key.offset = num_bytes;
1519
1520         want = extent_ref_type(parent, owner);
1521         if (insert) {
1522                 extra_size = btrfs_extent_inline_ref_size(want);
1523                 path->keep_locks = 1;
1524         } else
1525                 extra_size = -1;
1526
1527         /*
1528          * For metadata, the owner passed in is the level of the block we are
1529          * interested in, so it can be used directly as the key offset.
1530          */
1531         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
1532                 key.type = BTRFS_METADATA_ITEM_KEY;
1533                 key.offset = owner;
1534         }
1535
1536 again:
1537         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1538         if (ret < 0) {
1539                 err = ret;
1540                 goto out;
1541         }
1542
1543         /*
1544          * We may be a newly converted file system which still has the old fat
1545          * extent entries for metadata, so try and see if we have one of those.
1546          */
1547         if (ret > 0 && skinny_metadata) {
1548                 skinny_metadata = false;
1549                 if (path->slots[0]) {
1550                         path->slots[0]--;
1551                         btrfs_item_key_to_cpu(path->nodes[0], &key,
1552                                               path->slots[0]);
1553                         if (key.objectid == bytenr &&
1554                             key.type == BTRFS_EXTENT_ITEM_KEY &&
1555                             key.offset == num_bytes)
1556                                 ret = 0;
1557                 }
1558                 if (ret) {
1559                         key.type = BTRFS_EXTENT_ITEM_KEY;
1560                         key.offset = num_bytes;
1561                         btrfs_release_path(path);
1562                         goto again;
1563                 }
1564         }
1565
1566         if (ret && !insert) {
1567                 err = -ENOENT;
1568                 goto out;
1569         } else if (ret) {
1570                 err = -EIO;
1571                 WARN_ON(1);
1572                 goto out;
1573         }
1574
1575         leaf = path->nodes[0];
1576         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1577 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1578         if (item_size < sizeof(*ei)) {
1579                 if (!insert) {
1580                         err = -ENOENT;
1581                         goto out;
1582                 }
1583                 ret = convert_extent_item_v0(trans, root, path, owner,
1584                                              extra_size);
1585                 if (ret < 0) {
1586                         err = ret;
1587                         goto out;
1588                 }
1589                 leaf = path->nodes[0];
1590                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1591         }
1592 #endif
1593         BUG_ON(item_size < sizeof(*ei));
1594
1595         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1596         flags = btrfs_extent_flags(leaf, ei);
1597
1598         ptr = (unsigned long)(ei + 1);
1599         end = (unsigned long)ei + item_size;
1600
1601         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
1602                 ptr += sizeof(struct btrfs_tree_block_info);
1603                 BUG_ON(ptr > end);
1604         }
1605
1606         err = -ENOENT;
1607         while (1) {
1608                 if (ptr >= end) {
1609                         WARN_ON(ptr > end);
1610                         break;
1611                 }
1612                 iref = (struct btrfs_extent_inline_ref *)ptr;
1613                 type = btrfs_extent_inline_ref_type(leaf, iref);
1614                 if (want < type)
1615                         break;
1616                 if (want > type) {
1617                         ptr += btrfs_extent_inline_ref_size(type);
1618                         continue;
1619                 }
1620
1621                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1622                         struct btrfs_extent_data_ref *dref;
1623                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1624                         if (match_extent_data_ref(leaf, dref, root_objectid,
1625                                                   owner, offset)) {
1626                                 err = 0;
1627                                 break;
1628                         }
1629                         if (hash_extent_data_ref_item(leaf, dref) <
1630                             hash_extent_data_ref(root_objectid, owner, offset))
1631                                 break;
1632                 } else {
1633                         u64 ref_offset;
1634                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1635                         if (parent > 0) {
1636                                 if (parent == ref_offset) {
1637                                         err = 0;
1638                                         break;
1639                                 }
1640                                 if (ref_offset < parent)
1641                                         break;
1642                         } else {
1643                                 if (root_objectid == ref_offset) {
1644                                         err = 0;
1645                                         break;
1646                                 }
1647                                 if (ref_offset < root_objectid)
1648                                         break;
1649                         }
1650                 }
1651                 ptr += btrfs_extent_inline_ref_size(type);
1652         }
1653         if (err == -ENOENT && insert) {
1654                 if (item_size + extra_size >=
1655                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1656                         err = -EAGAIN;
1657                         goto out;
1658                 }
1659                 /*
1660                  * To add a new inline back ref, we have to make sure
1661                  * there is no corresponding back ref item.
1662                  * For simplicity, we just do not add a new inline back
1663                  * ref if there is any kind of item for this block.
1664                  */
1665                 if (find_next_key(path, 0, &key) == 0 &&
1666                     key.objectid == bytenr &&
1667                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1668                         err = -EAGAIN;
1669                         goto out;
1670                 }
1671         }
1672         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1673 out:
1674         if (insert) {
1675                 path->keep_locks = 0;
1676                 btrfs_unlock_up_safe(path, 1);
1677         }
1678         return err;
1679 }
1680
1681 /*
1682  * helper to add new inline back ref
1683  */
1684 static noinline_for_stack
1685 void setup_inline_extent_backref(struct btrfs_root *root,
1686                                  struct btrfs_path *path,
1687                                  struct btrfs_extent_inline_ref *iref,
1688                                  u64 parent, u64 root_objectid,
1689                                  u64 owner, u64 offset, int refs_to_add,
1690                                  struct btrfs_delayed_extent_op *extent_op)
1691 {
1692         struct extent_buffer *leaf;
1693         struct btrfs_extent_item *ei;
1694         unsigned long ptr;
1695         unsigned long end;
1696         unsigned long item_offset;
1697         u64 refs;
1698         int size;
1699         int type;
1700
1701         leaf = path->nodes[0];
1702         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1703         item_offset = (unsigned long)iref - (unsigned long)ei;
1704
1705         type = extent_ref_type(parent, owner);
1706         size = btrfs_extent_inline_ref_size(type);
1707
1708         btrfs_extend_item(root, path, size);
1709
1710         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1711         refs = btrfs_extent_refs(leaf, ei);
1712         refs += refs_to_add;
1713         btrfs_set_extent_refs(leaf, ei, refs);
1714         if (extent_op)
1715                 __run_delayed_extent_op(extent_op, leaf, ei);
1716
1717         ptr = (unsigned long)ei + item_offset;
1718         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1719         if (ptr < end - size)
1720                 memmove_extent_buffer(leaf, ptr + size, ptr,
1721                                       end - size - ptr);
1722
1723         iref = (struct btrfs_extent_inline_ref *)ptr;
1724         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1725         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1726                 struct btrfs_extent_data_ref *dref;
1727                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1728                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1729                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1730                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1731                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1732         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1733                 struct btrfs_shared_data_ref *sref;
1734                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1735                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1736                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1737         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1738                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1739         } else {
1740                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1741         }
1742         btrfs_mark_buffer_dirty(leaf);
1743 }
1744
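/*
 * Look up a backref for an extent, trying the inline form first and falling
 * back to the keyed backref items on -ENOENT.
 */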
1745 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1746                                  struct btrfs_root *root,
1747                                  struct btrfs_path *path,
1748                                  struct btrfs_extent_inline_ref **ref_ret,
1749                                  u64 bytenr, u64 num_bytes, u64 parent,
1750                                  u64 root_objectid, u64 owner, u64 offset)
1751 {
1752         int ret;
1753
1754         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1755                                            bytenr, num_bytes, parent,
1756                                            root_objectid, owner, offset, 0);
1757         if (ret != -ENOENT)
1758                 return ret;
1759
1760         btrfs_release_path(path);
1761         *ref_ret = NULL;
1762
1763         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1764                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1765                                             root_objectid);
1766         } else {
1767                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1768                                              root_objectid, owner, offset);
1769         }
1770         return ret;
1771 }
1772
1773 /*
1774  * helper to update/remove inline back ref
1775  */
1776 static noinline_for_stack
1777 void update_inline_extent_backref(struct btrfs_root *root,
1778                                   struct btrfs_path *path,
1779                                   struct btrfs_extent_inline_ref *iref,
1780                                   int refs_to_mod,
1781                                   struct btrfs_delayed_extent_op *extent_op)
1782 {
1783         struct extent_buffer *leaf;
1784         struct btrfs_extent_item *ei;
1785         struct btrfs_extent_data_ref *dref = NULL;
1786         struct btrfs_shared_data_ref *sref = NULL;
1787         unsigned long ptr;
1788         unsigned long end;
1789         u32 item_size;
1790         int size;
1791         int type;
1792         u64 refs;
1793
1794         leaf = path->nodes[0];
1795         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1796         refs = btrfs_extent_refs(leaf, ei);
1797         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1798         refs += refs_to_mod;
1799         btrfs_set_extent_refs(leaf, ei, refs);
1800         if (extent_op)
1801                 __run_delayed_extent_op(extent_op, leaf, ei);
1802
1803         type = btrfs_extent_inline_ref_type(leaf, iref);
1804
1805         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1806                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1807                 refs = btrfs_extent_data_ref_count(leaf, dref);
1808         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1809                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1810                 refs = btrfs_shared_data_ref_count(leaf, sref);
1811         } else {
1812                 refs = 1;
1813                 BUG_ON(refs_to_mod != -1);
1814         }
1815
1816         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1817         refs += refs_to_mod;
1818
1819         if (refs > 0) {
1820                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1821                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1822                 else
1823                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1824         } else {
1825                 size =  btrfs_extent_inline_ref_size(type);
1826                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1827                 ptr = (unsigned long)iref;
1828                 end = (unsigned long)ei + item_size;
1829                 if (ptr + size < end)
1830                         memmove_extent_buffer(leaf, ptr, ptr + size,
1831                                               end - ptr - size);
1832                 item_size -= size;
1833                 btrfs_truncate_item(root, path, item_size, 1);
1834         }
1835         btrfs_mark_buffer_dirty(leaf);
1836 }
1837
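/*
 * Add a reference to an extent using the inline backref format: update the
 * matching inline ref if one exists, otherwise insert a new one.  The
 * lookup returns -EAGAIN when the ref has to go into a separate keyed item.
 */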
1838 static noinline_for_stack
1839 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1840                                  struct btrfs_root *root,
1841                                  struct btrfs_path *path,
1842                                  u64 bytenr, u64 num_bytes, u64 parent,
1843                                  u64 root_objectid, u64 owner,
1844                                  u64 offset, int refs_to_add,
1845                                  struct btrfs_delayed_extent_op *extent_op)
1846 {
1847         struct btrfs_extent_inline_ref *iref;
1848         int ret;
1849
1850         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1851                                            bytenr, num_bytes, parent,
1852                                            root_objectid, owner, offset, 1);
1853         if (ret == 0) {
1854                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1855                 update_inline_extent_backref(root, path, iref,
1856                                              refs_to_add, extent_op);
1857         } else if (ret == -ENOENT) {
1858                 setup_inline_extent_backref(root, path, iref, parent,
1859                                             root_objectid, owner, offset,
1860                                             refs_to_add, extent_op);
1861                 ret = 0;
1862         }
1863         return ret;
1864 }
1865
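/*
 * Insert a keyed backref item, dispatching on whether the owner is a tree
 * block or file data.
 */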
1866 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1867                                  struct btrfs_root *root,
1868                                  struct btrfs_path *path,
1869                                  u64 bytenr, u64 parent, u64 root_objectid,
1870                                  u64 owner, u64 offset, int refs_to_add)
1871 {
1872         int ret;
1873         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1874                 BUG_ON(refs_to_add != 1);
1875                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1876                                             parent, root_objectid);
1877         } else {
1878                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1879                                              parent, root_objectid,
1880                                              owner, offset, refs_to_add);
1881         }
1882         return ret;
1883 }
1884
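/*
 * Drop refs_to_drop references from a backref, either by updating the
 * inline ref in place or by shrinking/removing the keyed backref item.
 */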
1885 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1886                                  struct btrfs_root *root,
1887                                  struct btrfs_path *path,
1888                                  struct btrfs_extent_inline_ref *iref,
1889                                  int refs_to_drop, int is_data)
1890 {
1891         int ret = 0;
1892
1893         BUG_ON(!is_data && refs_to_drop != 1);
1894         if (iref) {
1895                 update_inline_extent_backref(root, path, iref,
1896                                              -refs_to_drop, NULL);
1897         } else if (is_data) {
1898                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1899         } else {
1900                 ret = btrfs_del_item(trans, root, path);
1901         }
1902         return ret;
1903 }
1904
1905 static int btrfs_issue_discard(struct block_device *bdev,
1906                                 u64 start, u64 len)
1907 {
1908         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1909 }
1910
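/*
 * Issue discards for every stripe backing the range [bytenr, bytenr +
 * num_bytes).  Devices that cannot discard are skipped, EOPNOTSUPP is
 * ignored, and the number of bytes actually discarded is reported through
 * actual_bytes.
 */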
1911 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1912                                 u64 num_bytes, u64 *actual_bytes)
1913 {
1914         int ret;
1915         u64 discarded_bytes = 0;
1916         struct btrfs_bio *bbio = NULL;
1917
1919         /* Tell the block device(s) that the sectors can be discarded */
1920         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1921                               bytenr, &num_bytes, &bbio, 0);
1922         /* Error condition is -ENOMEM */
1923         if (!ret) {
1924                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1925                 int i;
1926
1928                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1929                         if (!stripe->dev->can_discard)
1930                                 continue;
1931
1932                         ret = btrfs_issue_discard(stripe->dev->bdev,
1933                                                   stripe->physical,
1934                                                   stripe->length);
1935                         if (!ret)
1936                                 discarded_bytes += stripe->length;
1937                         else if (ret != -EOPNOTSUPP)
1938                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1939
1940                         /*
1941                          * If we get back EOPNOTSUPP for some reason, just
1942                          * ignore the error so we don't screw up people
1943                          * calling discard_extent.
1944                          */
1945                         ret = 0;
1946                 }
1947                 kfree(bbio);
1948         }
1949
1950         if (actual_bytes)
1951                 *actual_bytes = discarded_bytes;
1952
1954         if (ret == -EOPNOTSUPP)
1955                 ret = 0;
1956         return ret;
1957 }
1958
1959 /* Can return -ENOMEM */
1960 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1961                          struct btrfs_root *root,
1962                          u64 bytenr, u64 num_bytes, u64 parent,
1963                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1964 {
1965         int ret;
1966         struct btrfs_fs_info *fs_info = root->fs_info;
1967
1968         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1969                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1970
1971         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1972                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1973                                         num_bytes,
1974                                         parent, root_objectid, (int)owner,
1975                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1976         } else {
1977                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1978                                         num_bytes,
1979                                         parent, root_objectid, owner, offset,
1980                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1981         }
1982         return ret;
1983 }
1984
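/*
 * Add refs_to_add references to an existing extent.  The inline backref is
 * tried first; if the extent item has no room left (-EAGAIN), the extent
 * item's refcount is still bumped here and a separate keyed backref is
 * inserted afterwards.
 */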
1985 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1986                                   struct btrfs_root *root,
1987                                   u64 bytenr, u64 num_bytes,
1988                                   u64 parent, u64 root_objectid,
1989                                   u64 owner, u64 offset, int refs_to_add,
1990                                   struct btrfs_delayed_extent_op *extent_op)
1991 {
1992         struct btrfs_path *path;
1993         struct extent_buffer *leaf;
1994         struct btrfs_extent_item *item;
1995         u64 refs;
1996         int ret;
1997         int err = 0;
1998
1999         path = btrfs_alloc_path();
2000         if (!path)
2001                 return -ENOMEM;
2002
2003         path->reada = 1;
2004         path->leave_spinning = 1;
2005         /* this will set up the path even if it fails to insert the back ref */
2006         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
2007                                            path, bytenr, num_bytes, parent,
2008                                            root_objectid, owner, offset,
2009                                            refs_to_add, extent_op);
2010         if (ret == 0)
2011                 goto out;
2012
2013         if (ret != -EAGAIN) {
2014                 err = ret;
2015                 goto out;
2016         }
2017
2018         leaf = path->nodes[0];
2019         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2020         refs = btrfs_extent_refs(leaf, item);
2021         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
2022         if (extent_op)
2023                 __run_delayed_extent_op(extent_op, leaf, item);
2024
2025         btrfs_mark_buffer_dirty(leaf);
2026         btrfs_release_path(path);
2027
2028         path->reada = 1;
2029         path->leave_spinning = 1;
2030
2031         /* now insert the actual backref */
2032         ret = insert_extent_backref(trans, root->fs_info->extent_root,
2033                                     path, bytenr, parent, root_objectid,
2034                                     owner, offset, refs_to_add);
2035         if (ret)
2036                 btrfs_abort_transaction(trans, root, ret);
2037 out:
2038         btrfs_free_path(path);
2039         return err;
2040 }
2041
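/*
 * Process one delayed data ref: allocate the reserved file extent, add a
 * reference, or drop one, depending on the node's action.
 */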
2042 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
2043                                 struct btrfs_root *root,
2044                                 struct btrfs_delayed_ref_node *node,
2045                                 struct btrfs_delayed_extent_op *extent_op,
2046                                 int insert_reserved)
2047 {
2048         int ret = 0;
2049         struct btrfs_delayed_data_ref *ref;
2050         struct btrfs_key ins;
2051         u64 parent = 0;
2052         u64 ref_root = 0;
2053         u64 flags = 0;
2054
2055         ins.objectid = node->bytenr;
2056         ins.offset = node->num_bytes;
2057         ins.type = BTRFS_EXTENT_ITEM_KEY;
2058
2059         ref = btrfs_delayed_node_to_data_ref(node);
2060         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
2061                 parent = ref->parent;
2062         else
2063                 ref_root = ref->root;
2064
2065         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2066                 if (extent_op)
2067                         flags |= extent_op->flags_to_set;
2068                 ret = alloc_reserved_file_extent(trans, root,
2069                                                  parent, ref_root, flags,
2070                                                  ref->objectid, ref->offset,
2071                                                  &ins, node->ref_mod);
2072         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2073                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2074                                              node->num_bytes, parent,
2075                                              ref_root, ref->objectid,
2076                                              ref->offset, node->ref_mod,
2077                                              extent_op);
2078         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2079                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2080                                           node->num_bytes, parent,
2081                                           ref_root, ref->objectid,
2082                                           ref->offset, node->ref_mod,
2083                                           extent_op);
2084         } else {
2085                 BUG();
2086         }
2087         return ret;
2088 }
2089
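/*
 * Apply a delayed extent op to an extent item in place, updating the flags
 * and/or the first key recorded in the tree block info.
 */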
2090 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
2091                                     struct extent_buffer *leaf,
2092                                     struct btrfs_extent_item *ei)
2093 {
2094         u64 flags = btrfs_extent_flags(leaf, ei);
2095         if (extent_op->update_flags) {
2096                 flags |= extent_op->flags_to_set;
2097                 btrfs_set_extent_flags(leaf, ei, flags);
2098         }
2099
2100         if (extent_op->update_key) {
2101                 struct btrfs_tree_block_info *bi;
2102                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2103                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2104                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2105         }
2106 }
2107
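/*
 * Find the extent item for a delayed ref node and apply its pending extent
 * op, trying the skinny METADATA_ITEM key first and falling back to the
 * regular EXTENT_ITEM key.
 */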
2108 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2109                                  struct btrfs_root *root,
2110                                  struct btrfs_delayed_ref_node *node,
2111                                  struct btrfs_delayed_extent_op *extent_op)
2112 {
2113         struct btrfs_key key;
2114         struct btrfs_path *path;
2115         struct btrfs_extent_item *ei;
2116         struct extent_buffer *leaf;
2117         u32 item_size;
2118         int ret;
2119         int err = 0;
2120         int metadata = (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2121                         node->type == BTRFS_SHARED_BLOCK_REF_KEY);
2122
2123         if (trans->aborted)
2124                 return 0;
2125
2126         if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA))
2127                 metadata = 0;
2128
2129         path = btrfs_alloc_path();
2130         if (!path)
2131                 return -ENOMEM;
2132
2133         key.objectid = node->bytenr;
2134
2135         if (metadata) {
2136                 struct btrfs_delayed_tree_ref *tree_ref;
2137
2138                 tree_ref = btrfs_delayed_node_to_tree_ref(node);
2139                 key.type = BTRFS_METADATA_ITEM_KEY;
2140                 key.offset = tree_ref->level;
2141         } else {
2142                 key.type = BTRFS_EXTENT_ITEM_KEY;
2143                 key.offset = node->num_bytes;
2144         }
2145
2146 again:
2147         path->reada = 1;
2148         path->leave_spinning = 1;
2149         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2150                                 path, 0, 1);
2151         if (ret < 0) {
2152                 err = ret;
2153                 goto out;
2154         }
2155         if (ret > 0) {
2156                 if (metadata) {
2157                         btrfs_release_path(path);
2158                         metadata = 0;
2159
2160                         key.offset = node->num_bytes;
2161                         key.type = BTRFS_EXTENT_ITEM_KEY;
2162                         goto again;
2163                 }
2164                 err = -EIO;
2165                 goto out;
2166         }
2167
2168         leaf = path->nodes[0];
2169         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2170 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2171         if (item_size < sizeof(*ei)) {
2172                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2173                                              path, (u64)-1, 0);
2174                 if (ret < 0) {
2175                         err = ret;
2176                         goto out;
2177                 }
2178                 leaf = path->nodes[0];
2179                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2180         }
2181 #endif
2182         BUG_ON(item_size < sizeof(*ei));
2183         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2184         __run_delayed_extent_op(extent_op, leaf, ei);
2185
2186         btrfs_mark_buffer_dirty(leaf);
2187 out:
2188         btrfs_free_path(path);
2189         return err;
2190 }
2191
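/*
 * Process one delayed tree block ref: allocate the reserved block, add a
 * reference, or drop one, depending on the node's action.
 */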
2192 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2193                                 struct btrfs_root *root,
2194                                 struct btrfs_delayed_ref_node *node,
2195                                 struct btrfs_delayed_extent_op *extent_op,
2196                                 int insert_reserved)
2197 {
2198         int ret = 0;
2199         struct btrfs_delayed_tree_ref *ref;
2200         struct btrfs_key ins;
2201         u64 parent = 0;
2202         u64 ref_root = 0;
2203         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
2204                                                  SKINNY_METADATA);
2205
2206         ref = btrfs_delayed_node_to_tree_ref(node);
2207         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2208                 parent = ref->parent;
2209         else
2210                 ref_root = ref->root;
2211
2212         ins.objectid = node->bytenr;
2213         if (skinny_metadata) {
2214                 ins.offset = ref->level;
2215                 ins.type = BTRFS_METADATA_ITEM_KEY;
2216         } else {
2217                 ins.offset = node->num_bytes;
2218                 ins.type = BTRFS_EXTENT_ITEM_KEY;
2219         }
2220
2221         BUG_ON(node->ref_mod != 1);
2222         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2223                 BUG_ON(!extent_op || !extent_op->update_flags);
2224                 ret = alloc_reserved_tree_block(trans, root,
2225                                                 parent, ref_root,
2226                                                 extent_op->flags_to_set,
2227                                                 &extent_op->key,
2228                                                 ref->level, &ins);
2229         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2230                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2231                                              node->num_bytes, parent, ref_root,
2232                                              ref->level, 0, 1, extent_op);
2233         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2234                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2235                                           node->num_bytes, parent, ref_root,
2236                                           ref->level, 0, 1, extent_op);
2237         } else {
2238                 BUG();
2239         }
2240         return ret;
2241 }
2242
2243 /* helper function to actually process a single delayed ref entry */
2244 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2245                                struct btrfs_root *root,
2246                                struct btrfs_delayed_ref_node *node,
2247                                struct btrfs_delayed_extent_op *extent_op,
2248                                int insert_reserved)
2249 {
2250         int ret = 0;
2251
2252         if (trans->aborted)
2253                 return 0;
2254
2255         if (btrfs_delayed_ref_is_head(node)) {
2256                 struct btrfs_delayed_ref_head *head;
2257                 /*
2258                  * we've hit the end of the chain and we were supposed
2259                  * to insert this extent into the tree.  But, it got
2260                  * deleted before we ever needed to insert it, so all
2261                  * we have to do is clean up the accounting
2262                  */
2263                 BUG_ON(extent_op);
2264                 head = btrfs_delayed_node_to_head(node);
2265                 if (insert_reserved) {
2266                         btrfs_pin_extent(root, node->bytenr,
2267                                          node->num_bytes, 1);
2268                         if (head->is_data) {
2269                                 ret = btrfs_del_csums(trans, root,
2270                                                       node->bytenr,
2271                                                       node->num_bytes);
2272                         }
2273                 }
2274                 return ret;
2275         }
2276
2277         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2278             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2279                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2280                                            insert_reserved);
2281         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2282                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2283                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2284                                            insert_reserved);
2285         else
2286                 BUG();
2287         return ret;
2288 }
2289
2290 static noinline struct btrfs_delayed_ref_node *
2291 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2292 {
2293         struct rb_node *node;
2294         struct btrfs_delayed_ref_node *ref;
2295         int action = BTRFS_ADD_DELAYED_REF;
2296 again:
2297         /*
2298          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2299          * this prevents the ref count from going down to zero while
2300          * there are still pending delayed refs.
2301          */
2302         node = rb_prev(&head->node.rb_node);
2303         while (1) {
2304                 if (!node)
2305                         break;
2306                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2307                                 rb_node);
2308                 if (ref->bytenr != head->node.bytenr)
2309                         break;
2310                 if (ref->action == action)
2311                         return ref;
2312                 node = rb_prev(node);
2313         }
2314         if (action == BTRFS_ADD_DELAYED_REF) {
2315                 action = BTRFS_DROP_DELAYED_REF;
2316                 goto again;
2317         }
2318         return NULL;
2319 }
2320
2321 /*
2322  * Returns 0 on success or if called with an already aborted transaction.
2323  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2324  */
2325 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2326                                        struct btrfs_root *root,
2327                                        struct list_head *cluster)
2328 {
2329         struct btrfs_delayed_ref_root *delayed_refs;
2330         struct btrfs_delayed_ref_node *ref;
2331         struct btrfs_delayed_ref_head *locked_ref = NULL;
2332         struct btrfs_delayed_extent_op *extent_op;
2333         struct btrfs_fs_info *fs_info = root->fs_info;
2334         int ret;
2335         int count = 0;
2336         int must_insert_reserved = 0;
2337
2338         delayed_refs = &trans->transaction->delayed_refs;
2339         while (1) {
2340                 if (!locked_ref) {
2341                         /* pick a new head ref from the cluster list */
2342                         if (list_empty(cluster))
2343                                 break;
2344
2345                         locked_ref = list_entry(cluster->next,
2346                                      struct btrfs_delayed_ref_head, cluster);
2347
2348                         /* grab the lock that says we are going to process
2349                          * all the refs for this head */
2350                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2351
2352                         /*
2353                          * we may have dropped the spin lock to get the head
2354                          * mutex lock, and that might have given someone else
2355                          * time to free the head.  If that's true, it has been
2356                          * removed from our list and we can move on.
2357                          */
2358                         if (ret == -EAGAIN) {
2359                                 locked_ref = NULL;
2360                                 count++;
2361                                 continue;
2362                         }
2363                 }
2364
2365                 /*
2366                  * We need to try and merge add/drops of the same ref since we
2367                  * can run into issues with relocate dropping the implicit ref
2368                  * and then it being added back again before the drop can
2369                  * finish.  If we merged anything we need to re-loop so we can
2370                  * get a good ref.
2371                  */
2372                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2373                                          locked_ref);
2374
2375                 /*
2376                  * locked_ref is the head node, so we have to go one
2377                  * node back for any delayed ref updates
2378                  */
2379                 ref = select_delayed_ref(locked_ref);
2380
2381                 if (ref && ref->seq &&
2382                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2383                         /*
2384                          * there are still refs with lower seq numbers in the
2385                          * process of being added. Don't run this ref yet.
2386                          */
2387                         list_del_init(&locked_ref->cluster);
2388                         btrfs_delayed_ref_unlock(locked_ref);
2389                         locked_ref = NULL;
2390                         delayed_refs->num_heads_ready++;
2391                         spin_unlock(&delayed_refs->lock);
2392                         cond_resched();
2393                         spin_lock(&delayed_refs->lock);
2394                         continue;
2395                 }
2396
2397                 /*
2398                  * record the must insert reserved flag before we
2399                  * drop the spin lock.
2400                  */
2401                 must_insert_reserved = locked_ref->must_insert_reserved;
2402                 locked_ref->must_insert_reserved = 0;
2403
2404                 extent_op = locked_ref->extent_op;
2405                 locked_ref->extent_op = NULL;
2406
2407                 if (!ref) {
2408                         /* All delayed refs have been processed, go ahead
2409                          * and send the head node to run_one_delayed_ref,
2410                          * so that any accounting fixes can happen
2411                          */
2412                         ref = &locked_ref->node;
2413
2414                         if (extent_op && must_insert_reserved) {
2415                                 btrfs_free_delayed_extent_op(extent_op);
2416                                 extent_op = NULL;
2417                         }
2418
2419                         if (extent_op) {
2420                                 spin_unlock(&delayed_refs->lock);
2421
2422                                 ret = run_delayed_extent_op(trans, root,
2423                                                             ref, extent_op);
2424                                 btrfs_free_delayed_extent_op(extent_op);
2425
2426                                 if (ret) {
2427                                         btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
2428                                         spin_lock(&delayed_refs->lock);
2429                                         btrfs_delayed_ref_unlock(locked_ref);
2430                                         return ret;
2431                                 }
2432
2433                                 goto next;
2434                         }
2435                 }
2436
2437                 ref->in_tree = 0;
2438                 rb_erase(&ref->rb_node, &delayed_refs->root);
2439                 delayed_refs->num_entries--;
2440                 if (!btrfs_delayed_ref_is_head(ref)) {
2441                         /*
2442                          * when we play the delayed ref, also correct the
2443                          * ref_mod on head
2444                          */
2445                         switch (ref->action) {
2446                         case BTRFS_ADD_DELAYED_REF:
2447                         case BTRFS_ADD_DELAYED_EXTENT:
2448                                 locked_ref->node.ref_mod -= ref->ref_mod;
2449                                 break;
2450                         case BTRFS_DROP_DELAYED_REF:
2451                                 locked_ref->node.ref_mod += ref->ref_mod;
2452                                 break;
2453                         default:
2454                                 WARN_ON(1);
2455                         }
2456                 }
2457                 spin_unlock(&delayed_refs->lock);
2458
2459                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2460                                           must_insert_reserved);
2461
2462                 btrfs_free_delayed_extent_op(extent_op);
2463                 if (ret) {
2464                         btrfs_delayed_ref_unlock(locked_ref);
2465                         btrfs_put_delayed_ref(ref);
2466                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d", ret);
2467                         spin_lock(&delayed_refs->lock);
2468                         return ret;
2469                 }
2470
2471                 /*
2472                  * If this node is a head, that means all the refs in this head
2473                  * have been dealt with, and we will pick the next head to deal
2474                  * with, so we must unlock the head and drop it from the cluster
2475                  * list before we release it.
2476                  */
2477                 if (btrfs_delayed_ref_is_head(ref)) {
2478                         list_del_init(&locked_ref->cluster);
2479                         btrfs_delayed_ref_unlock(locked_ref);
2480                         locked_ref = NULL;
2481                 }
2482                 btrfs_put_delayed_ref(ref);
2483                 count++;
2484 next:
2485                 cond_resched();
2486                 spin_lock(&delayed_refs->lock);
2487         }
2488         return count;
2489 }
2490
2491 #ifdef SCRAMBLE_DELAYED_REFS
2492 /*
2493  * Normally delayed refs get processed in ascending bytenr order. This
2494  * correlates in most cases to the order added. To expose dependencies on this
2495  * order, we start to process the tree in the middle instead of the beginning
2496  */
2497 static u64 find_middle(struct rb_root *root)
2498 {
2499         struct rb_node *n = root->rb_node;
2500         struct btrfs_delayed_ref_node *entry;
2501         int alt = 1;
2502         u64 middle;
2503         u64 first = 0, last = 0;
2504
2505         n = rb_first(root);
2506         if (n) {
2507                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2508                 first = entry->bytenr;
2509         }
2510         n = rb_last(root);
2511         if (n) {
2512                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2513                 last = entry->bytenr;
2514         }
2515         n = root->rb_node;
2516
2517         while (n) {
2518                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2519                 WARN_ON(!entry->in_tree);
2520
2521                 middle = entry->bytenr;
2522
2523                 if (alt)
2524                         n = n->rb_left;
2525                 else
2526                         n = n->rb_right;
2527
2528                 alt = 1 - alt;
2529         }
2530         return middle;
2531 }
2532 #endif
2533
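/*
 * Run the qgroup accounting updates queued on this transaction and release
 * the tree mod log sequence element they were holding.
 */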
2534 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2535                                          struct btrfs_fs_info *fs_info)
2536 {
2537         struct qgroup_update *qgroup_update;
2538         int ret = 0;
2539
2540         if (list_empty(&trans->qgroup_ref_list) !=
2541             !trans->delayed_ref_elem.seq) {
2542                 /* list without seq or seq without list */
2543                 btrfs_err(fs_info,
2544                         "qgroup accounting update error, list is%s empty, seq is %llu",
2545                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2546                         trans->delayed_ref_elem.seq);
2547                 BUG();
2548         }
2549
2550         if (!trans->delayed_ref_elem.seq)
2551                 return 0;
2552
2553         while (!list_empty(&trans->qgroup_ref_list)) {
2554                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2555                                                  struct qgroup_update, list);
2556                 list_del(&qgroup_update->list);
2557                 if (!ret)
2558                         ret = btrfs_qgroup_account_ref(
2559                                         trans, fs_info, qgroup_update->node,
2560                                         qgroup_update->extent_op);
2561                 kfree(qgroup_update);
2562         }
2563
2564         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2565
2566         return ret;
2567 }
2568
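/*
 * Return 1 once the global ref_seq counter has moved outside
 * [seq, seq + count), i.e. at least 'count' more refs have been processed
 * since 'seq' was sampled; return 0 otherwise.
 */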
2569 static int refs_newer(struct btrfs_delayed_ref_root *delayed_refs, int seq,
2570                       int count)
2571 {
2572         int val = atomic_read(&delayed_refs->ref_seq);
2573
2574         if (val < seq || val >= seq + count)
2575                 return 1;
2576         return 0;
2577 }
2578
2579 /*
2580  * this starts processing the delayed reference count updates and
2581  * extent insertions we have queued up so far.  count can be
2582  * 0, which means to process everything in the tree at the start
2583  * of the run (but not newly added entries), or it can be some target
2584  * number you'd like to process.
2585  *
2586  * Returns 0 on success or if called with an aborted transaction
2587  * Returns <0 on error and aborts the transaction
2588  */
2589 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2590                            struct btrfs_root *root, unsigned long count)
2591 {
2592         struct rb_node *node;
2593         struct btrfs_delayed_ref_root *delayed_refs;
2594         struct btrfs_delayed_ref_node *ref;
2595         struct list_head cluster;
2596         int ret;
2597         u64 delayed_start;
2598         int run_all = count == (unsigned long)-1;
2599         int run_most = 0;
2600         int loops;
2601
2602         /* We'll clean this up in btrfs_cleanup_transaction */
2603         if (trans->aborted)
2604                 return 0;
2605
2606         if (root == root->fs_info->extent_root)
2607                 root = root->fs_info->tree_root;
2608
2609         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2610
2611         delayed_refs = &trans->transaction->delayed_refs;
2612         INIT_LIST_HEAD(&cluster);
2613         if (count == 0) {
2614                 count = delayed_refs->num_entries * 2;
2615                 run_most = 1;
2616         }
2617
2618         if (!run_all && !run_most) {
2619                 int old;
2620                 int seq = atomic_read(&delayed_refs->ref_seq);
2621
2622 progress:
2623                 old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2624                 if (old) {
2625                         DEFINE_WAIT(__wait);
2626                         if (delayed_refs->num_entries < 16348)
2627                                 return 0;
2628
2629                         prepare_to_wait(&delayed_refs->wait, &__wait,
2630                                         TASK_UNINTERRUPTIBLE);
2631
2632                         old = atomic_cmpxchg(&delayed_refs->procs_running_refs, 0, 1);
2633                         if (old) {
2634                                 schedule();
2635                                 finish_wait(&delayed_refs->wait, &__wait);
2636
2637                                 if (!refs_newer(delayed_refs, seq, 256))
2638                                         goto progress;
2639                                 else
2640                                         return 0;
2641                         } else {
2642                                 finish_wait(&delayed_refs->wait, &__wait);
2643                                 goto again;
2644                         }
2645                 }
2646
2647         } else {
2648                 atomic_inc(&delayed_refs->procs_running_refs);
2649         }
2650
2651 again:
2652         loops = 0;
2653         spin_lock(&delayed_refs->lock);
2654
2655 #ifdef SCRAMBLE_DELAYED_REFS
2656         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2657 #endif
2658
2659         while (1) {
2660                 if (!(run_all || run_most) &&
2661                     delayed_refs->num_heads_ready < 64)
2662                         break;
2663
2664                 /*
2665                  * go find something we can process in the rbtree.  We start at
2666                  * the beginning of the tree, and then build a cluster
2667                  * of refs to process starting at the first one we are able to
2668                  * lock
2669                  */
2670                 delayed_start = delayed_refs->run_delayed_start;
2671                 ret = btrfs_find_ref_cluster(trans, &cluster,
2672                                              delayed_refs->run_delayed_start);
2673                 if (ret)
2674                         break;
2675
2676                 ret = run_clustered_refs(trans, root, &cluster);
2677                 if (ret < 0) {
2678                         btrfs_release_ref_cluster(&cluster);
2679                         spin_unlock(&delayed_refs->lock);
2680                         btrfs_abort_transaction(trans, root, ret);
2681                         atomic_dec(&delayed_refs->procs_running_refs);
2682                         return ret;
2683                 }
2684
2685                 atomic_add(ret, &delayed_refs->ref_seq);
2686
2687                 count -= min_t(unsigned long, ret, count);
2688
2689                 if (count == 0)
2690                         break;
2691
2692                 if (delayed_start >= delayed_refs->run_delayed_start) {
2693                         if (loops == 0) {
2694                                 /*
2695                                  * btrfs_find_ref_cluster looped. Let's do one
2696                                  * more cycle. If we don't run any delayed ref
2697                                  * during that cycle (because we can't, since
2698                                  * all of them are blocked), bail out.
2699                                  */
2700                                 loops = 1;
2701                         } else {
2702                                 /*
2703                                  * no runnable refs left, stop trying
2704                                  */
2705                                 BUG_ON(run_all);
2706                                 break;
2707                         }
2708                 }
2709                 if (ret) {
2710                         /* refs were run, let's reset staleness detection */
2711                         loops = 0;
2712                 }
2713         }
2714
2715         if (run_all) {
2716                 if (!list_empty(&trans->new_bgs)) {
2717                         spin_unlock(&delayed_refs->lock);
2718                         btrfs_create_pending_block_groups(trans, root);
2719                         spin_lock(&delayed_refs->lock);
2720                 }
2721
2722                 node = rb_first(&delayed_refs->root);
2723                 if (!node)
2724                         goto out;
2725                 count = (unsigned long)-1;
2726
2727                 while (node) {
2728                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2729                                        rb_node);
2730                         if (btrfs_delayed_ref_is_head(ref)) {
2731                                 struct btrfs_delayed_ref_head *head;
2732
2733                                 head = btrfs_delayed_node_to_head(ref);
2734                                 atomic_inc(&ref->refs);
2735
2736                                 spin_unlock(&delayed_refs->lock);
2737                                 /*
2738                                  * Mutex was contended, block until it's
2739                                  * released and try again
2740                                  */
2741                                 mutex_lock(&head->mutex);
2742                                 mutex_unlock(&head->mutex);
2743
2744                                 btrfs_put_delayed_ref(ref);
2745                                 cond_resched();
2746                                 goto again;
2747                         }
2748                         node = rb_next(node);
2749                 }
2750                 spin_unlock(&delayed_refs->lock);
2751                 schedule_timeout(1);
2752                 goto again;
2753         }
2754 out:
2755         atomic_dec(&delayed_refs->procs_running_refs);
2756         smp_mb();
2757         if (waitqueue_active(&delayed_refs->wait))
2758                 wake_up(&delayed_refs->wait);
2759
2760         spin_unlock(&delayed_refs->lock);
2761         assert_qgroups_uptodate(trans);
2762         return 0;
2763 }
2764
2765 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2766                                 struct btrfs_root *root,
2767                                 u64 bytenr, u64 num_bytes, u64 flags,
2768                                 int is_data)
2769 {
2770         struct btrfs_delayed_extent_op *extent_op;
2771         int ret;
2772
2773         extent_op = btrfs_alloc_delayed_extent_op();
2774         if (!extent_op)
2775                 return -ENOMEM;
2776
2777         extent_op->flags_to_set = flags;
2778         extent_op->update_flags = 1;
2779         extent_op->update_key = 0;
2780         extent_op->is_data = is_data ? 1 : 0;
2781
2782         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2783                                           num_bytes, extent_op);
2784         if (ret)
2785                 btrfs_free_delayed_extent_op(extent_op);
2786         return ret;
2787 }
2788
2789 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2790                                       struct btrfs_root *root,
2791                                       struct btrfs_path *path,
2792                                       u64 objectid, u64 offset, u64 bytenr)
2793 {
2794         struct btrfs_delayed_ref_head *head;
2795         struct btrfs_delayed_ref_node *ref;
2796         struct btrfs_delayed_data_ref *data_ref;
2797         struct btrfs_delayed_ref_root *delayed_refs;
2798         struct rb_node *node;
2799         int ret = 0;
2800
2801         ret = -ENOENT;
2802         delayed_refs = &trans->transaction->delayed_refs;
2803         spin_lock(&delayed_refs->lock);
2804         head = btrfs_find_delayed_ref_head(trans, bytenr);
2805         if (!head)
2806                 goto out;
2807
2808         if (!mutex_trylock(&head->mutex)) {
2809                 atomic_inc(&head->node.refs);
2810                 spin_unlock(&delayed_refs->lock);
2811
2812                 btrfs_release_path(path);
2813
2814                 /*
2815                  * Mutex was contended, block until it's released and let
2816                  * caller try again
2817                  */
2818                 mutex_lock(&head->mutex);
2819                 mutex_unlock(&head->mutex);
2820                 btrfs_put_delayed_ref(&head->node);
2821                 return -EAGAIN;
2822         }
2823
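             /*
              * The ref nodes for a bytenr sort just before their head in the
              * rbtree, so the rb_prev() walk below should land on the most
              * recently added ref for this bytenr, if any.
              */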
2824         node = rb_prev(&head->node.rb_node);
2825         if (!node)
2826                 goto out_unlock;
2827
2828         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2829
2830         if (ref->bytenr != bytenr)
2831                 goto out_unlock;
2832
2833         ret = 1;
2834         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2835                 goto out_unlock;
2836
2837         data_ref = btrfs_delayed_node_to_data_ref(ref);
2838
2839         node = rb_prev(node);
2840         if (node) {
2841                 int seq = ref->seq;
2842
2843                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2844                 if (ref->bytenr == bytenr && ref->seq == seq)
2845                         goto out_unlock;
2846         }
2847
2848         if (data_ref->root != root->root_key.objectid ||
2849             data_ref->objectid != objectid || data_ref->offset != offset)
2850                 goto out_unlock;
2851
2852         ret = 0;
2853 out_unlock:
2854         mutex_unlock(&head->mutex);
2855 out:
2856         spin_unlock(&delayed_refs->lock);
2857         return ret;
2858 }
2859
2860 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2861                                         struct btrfs_root *root,
2862                                         struct btrfs_path *path,
2863                                         u64 objectid, u64 offset, u64 bytenr)
2864 {
2865         struct btrfs_root *extent_root = root->fs_info->extent_root;
2866         struct extent_buffer *leaf;
2867         struct btrfs_extent_data_ref *ref;
2868         struct btrfs_extent_inline_ref *iref;
2869         struct btrfs_extent_item *ei;
2870         struct btrfs_key key;
2871         u32 item_size;
2872         int ret;
2873
2874         key.objectid = bytenr;
2875         key.offset = (u64)-1;
2876         key.type = BTRFS_EXTENT_ITEM_KEY;
2877
2878         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2879         if (ret < 0)
2880                 goto out;
2881         BUG_ON(ret == 0); /* Corruption */
2882
2883         ret = -ENOENT;
2884         if (path->slots[0] == 0)
2885                 goto out;
2886
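             /*
              * The search key used offset (u64)-1, so the returned slot is
              * just past any item for this bytenr; step back one slot to the
              * candidate extent item.
              */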
2887         path->slots[0]--;
2888         leaf = path->nodes[0];
2889         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2890
2891         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2892                 goto out;
2893
2894         ret = 1;
2895         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2896 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2897         if (item_size < sizeof(*ei)) {
2898                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2899                 goto out;
2900         }
2901 #endif
2902         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2903
2904         if (item_size != sizeof(*ei) +
2905             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2906                 goto out;
2907
2908         if (btrfs_extent_generation(leaf, ei) <=
2909             btrfs_root_last_snapshot(&root->root_item))
2910                 goto out;
2911
2912         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2913         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2914             BTRFS_EXTENT_DATA_REF_KEY)
2915                 goto out;
2916
2917         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2918         if (btrfs_extent_refs(leaf, ei) !=
2919             btrfs_extent_data_ref_count(leaf, ref) ||
2920             btrfs_extent_data_ref_root(leaf, ref) !=
2921             root->root_key.objectid ||
2922             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2923             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2924                 goto out;
2925
2926         ret = 0;
2927 out:
2928         return ret;
2929 }
2930
2931 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2932                           struct btrfs_root *root,
2933                           u64 objectid, u64 offset, u64 bytenr)
2934 {
2935         struct btrfs_path *path;
2936         int ret;
2937         int ret2;
2938
2939         path = btrfs_alloc_path();
2940         if (!path)
2941                 return -ENOENT;
2942
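             /*
              * check_delayed_ref() returns -EAGAIN when the delayed ref head
              * mutex was contended and the path had to be dropped; redo both
              * checks in that case.
              */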
2943         do {
2944                 ret = check_committed_ref(trans, root, path, objectid,
2945                                           offset, bytenr);
2946                 if (ret && ret != -ENOENT)
2947                         goto out;
2948
2949                 ret2 = check_delayed_ref(trans, root, path, objectid,
2950                                          offset, bytenr);
2951         } while (ret2 == -EAGAIN);
2952
2953         if (ret2 && ret2 != -ENOENT) {
2954                 ret = ret2;
2955                 goto out;
2956         }
2957
2958         if (ret != -ENOENT || ret2 != -ENOENT)
2959                 ret = 0;
2960 out:
2961         btrfs_free_path(path);
2962         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2963                 WARN_ON(ret > 0);
2964         return ret;
2965 }
2966
2967 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2968                            struct btrfs_root *root,
2969                            struct extent_buffer *buf,
2970                            int full_backref, int inc, int for_cow)
2971 {
2972         u64 bytenr;
2973         u64 num_bytes;
2974         u64 parent;
2975         u64 ref_root;
2976         u32 nritems;
2977         struct btrfs_key key;
2978         struct btrfs_file_extent_item *fi;
2979         int i;
2980         int level;
2981         int ret = 0;
2982         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2983                             u64, u64, u64, u64, u64, u64, int);
2984
2985         ref_root = btrfs_header_owner(buf);
2986         nritems = btrfs_header_nritems(buf);
2987         level = btrfs_header_level(buf);
2988
2989         if (!root->ref_cows && level == 0)
2990                 return 0;
2991
2992         if (inc)
2993                 process_func = btrfs_inc_extent_ref;
2994         else
2995                 process_func = btrfs_free_extent;
2996
2997         if (full_backref)
2998                 parent = buf->start;
2999         else
3000                 parent = 0;
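             /*
              * With full_backref the new refs are recorded against this
              * buffer (shared backrefs keyed by parent bytenr); otherwise
              * they are keyed by the owning root alone.
              */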
3001
3002         for (i = 0; i < nritems; i++) {
3003                 if (level == 0) {
3004                         btrfs_item_key_to_cpu(buf, &key, i);
3005                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
3006                                 continue;
3007                         fi = btrfs_item_ptr(buf, i,
3008                                             struct btrfs_file_extent_item);
3009                         if (btrfs_file_extent_type(buf, fi) ==
3010                             BTRFS_FILE_EXTENT_INLINE)
3011                                 continue;
3012                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
3013                         if (bytenr == 0)
3014                                 continue;
3015
3016                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
3017                         key.offset -= btrfs_file_extent_offset(buf, fi);
3018                         ret = process_func(trans, root, bytenr, num_bytes,
3019                                            parent, ref_root, key.objectid,
3020                                            key.offset, for_cow);
3021                         if (ret)
3022                                 goto fail;
3023                 } else {
3024                         bytenr = btrfs_node_blockptr(buf, i);
3025                         num_bytes = btrfs_level_size(root, level - 1);
3026                         ret = process_func(trans, root, bytenr, num_bytes,
3027                                            parent, ref_root, level - 1, 0,
3028                                            for_cow);
3029                         if (ret)
3030                                 goto fail;
3031                 }
3032         }
3033         return 0;
3034 fail:
3035         return ret;
3036 }
3037
3038 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3039                   struct extent_buffer *buf, int full_backref, int for_cow)
3040 {
3041         return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
3042 }
3043
3044 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
3045                   struct extent_buffer *buf, int full_backref, int for_cow)
3046 {
3047         return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
3048 }
3049
3050 static int write_one_cache_group(struct btrfs_trans_handle *trans,
3051                                  struct btrfs_root *root,
3052                                  struct btrfs_path *path,
3053                                  struct btrfs_block_group_cache *cache)
3054 {
3055         int ret;
3056         struct btrfs_root *extent_root = root->fs_info->extent_root;
3057         unsigned long bi;
3058         struct extent_buffer *leaf;
3059
3060         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
3061         if (ret < 0)
3062                 goto fail;
3063         BUG_ON(ret); /* Corruption */
3064
3065         leaf = path->nodes[0];
3066         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
3067         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
3068         btrfs_mark_buffer_dirty(leaf);
3069         btrfs_release_path(path);
3070 fail:
3071         if (ret) {
3072                 btrfs_abort_transaction(trans, root, ret);
3073                 return ret;
3074         }
3075         return 0;
3076
3077 }
3078
3079 static struct btrfs_block_group_cache *
3080 next_block_group(struct btrfs_root *root,
3081                  struct btrfs_block_group_cache *cache)
3082 {
3083         struct rb_node *node;
3084         spin_lock(&root->fs_info->block_group_cache_lock);
3085         node = rb_next(&cache->cache_node);
3086         btrfs_put_block_group(cache);
3087         if (node) {
3088                 cache = rb_entry(node, struct btrfs_block_group_cache,
3089                                  cache_node);
3090                 btrfs_get_block_group(cache);
3091         } else
3092                 cache = NULL;
3093         spin_unlock(&root->fs_info->block_group_cache_lock);
3094         return cache;
3095 }
3096
3097 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
3098                             struct btrfs_trans_handle *trans,
3099                             struct btrfs_path *path)
3100 {
3101         struct btrfs_root *root = block_group->fs_info->tree_root;
3102         struct inode *inode = NULL;
3103         u64 alloc_hint = 0;
3104         int dcs = BTRFS_DC_ERROR;
3105         int num_pages = 0;
3106         int retries = 0;
3107         int ret = 0;
3108
3109         /*
3110          * If this block group is smaller than 100 megs, don't bother caching the
3111          * block group.
3112          */
3113         if (block_group->key.offset < (100 * 1024 * 1024)) {
3114                 spin_lock(&block_group->lock);
3115                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
3116                 spin_unlock(&block_group->lock);
3117                 return 0;
3118         }
3119
3120 again:
3121         inode = lookup_free_space_inode(root, block_group, path);
3122         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
3123                 ret = PTR_ERR(inode);
3124                 btrfs_release_path(path);
3125                 goto out;
3126         }
3127
3128         if (IS_ERR(inode)) {
3129                 BUG_ON(retries);
3130                 retries++;
3131
3132                 if (block_group->ro)
3133                         goto out_free;
3134
3135                 ret = create_free_space_inode(root, trans, block_group, path);
3136                 if (ret)
3137                         goto out_free;
3138                 goto again;
3139         }
3140
3141         /* We've already set up this transaction, go ahead and exit */
3142         if (block_group->cache_generation == trans->transid &&
3143             i_size_read(inode)) {
3144                 dcs = BTRFS_DC_SETUP;
3145                 goto out_put;
3146         }
3147
3148         /*
3149          * We want to set the generation to 0 so that if anything goes wrong
3150          * from here on out we know not to trust this cache when we load it
3151          * up next time.
3152          */
3153         BTRFS_I(inode)->generation = 0;
3154         ret = btrfs_update_inode(trans, root, inode);
3155         WARN_ON(ret);
3156
3157         if (i_size_read(inode) > 0) {
3158                 ret = btrfs_truncate_free_space_cache(root, trans, path,
3159                                                       inode);
3160                 if (ret)
3161                         goto out_put;
3162         }
3163
3164         spin_lock(&block_group->lock);
3165         if (block_group->cached != BTRFS_CACHE_FINISHED ||
3166             !btrfs_test_opt(root, SPACE_CACHE)) {
3167                 /*
3168                  * don't bother trying to write stuff out _if_
3169                  * a) we're not cached,
3170                  * b) we're mounted with the nospace_cache option.
3171                  */
3172                 dcs = BTRFS_DC_WRITTEN;
3173                 spin_unlock(&block_group->lock);
3174                 goto out_put;
3175         }
3176         spin_unlock(&block_group->lock);
3177
3178         /*
3179          * Try to preallocate enough space based on how big the block group is.
3180          * Keep in mind this has to include any pinned space which could end up
3181          * taking up quite a bit since it's not folded into the other space
3182          * cache.
3183          */
3184         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3185         if (!num_pages)
3186                 num_pages = 1;
3187
3188         num_pages *= 16;
3189         num_pages *= PAGE_CACHE_SIZE;
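             /*
              * Worked example (assuming 4 KiB pages for PAGE_CACHE_SIZE): a
              * 1 GiB block group gives num_pages = 4, then 4 * 16 = 64 pages,
              * i.e. 256 KiB preallocated for the free space cache file.
              */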
3190
3191         ret = btrfs_check_data_free_space(inode, num_pages);
3192         if (ret)
3193                 goto out_put;
3194
3195         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3196                                               num_pages, num_pages,
3197                                               &alloc_hint);
3198         if (!ret)
3199                 dcs = BTRFS_DC_SETUP;
3200         btrfs_free_reserved_data_space(inode, num_pages);
3201
3202 out_put:
3203         iput(inode);
3204 out_free:
3205         btrfs_release_path(path);
3206 out:
3207         spin_lock(&block_group->lock);
3208         if (!ret && dcs == BTRFS_DC_SETUP)
3209                 block_group->cache_generation = trans->transid;
3210         block_group->disk_cache_state = dcs;
3211         spin_unlock(&block_group->lock);
3212
3213         return ret;
3214 }
3215
3216 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3217                                    struct btrfs_root *root)
3218 {
3219         struct btrfs_block_group_cache *cache;
3220         int err = 0;
3221         struct btrfs_path *path;
3222         u64 last = 0;
3223
3224         path = btrfs_alloc_path();
3225         if (!path)
3226                 return -ENOMEM;
3227
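             /*
              * Three passes follow: set up free space cache inodes for block
              * groups still in BTRFS_DC_CLEAR, then write out the dirty
              * block group items, and finally write the cache files for
              * groups left in BTRFS_DC_NEED_WRITE.
              */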
3228 again:
3229         while (1) {
3230                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3231                 while (cache) {
3232                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3233                                 break;
3234                         cache = next_block_group(root, cache);
3235                 }
3236                 if (!cache) {
3237                         if (last == 0)
3238                                 break;
3239                         last = 0;
3240                         continue;
3241                 }
3242                 err = cache_save_setup(cache, trans, path);
3243                 last = cache->key.objectid + cache->key.offset;
3244                 btrfs_put_block_group(cache);
3245         }
3246
3247         while (1) {
3248                 if (last == 0) {
3249                         err = btrfs_run_delayed_refs(trans, root,
3250                                                      (unsigned long)-1);
3251                         if (err) /* File system offline */
3252                                 goto out;
3253                 }
3254
3255                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3256                 while (cache) {
3257                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3258                                 btrfs_put_block_group(cache);
3259                                 goto again;
3260                         }
3261
3262                         if (cache->dirty)
3263                                 break;
3264                         cache = next_block_group(root, cache);
3265                 }
3266                 if (!cache) {
3267                         if (last == 0)
3268                                 break;
3269                         last = 0;
3270                         continue;
3271                 }
3272
3273                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3274                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3275                 cache->dirty = 0;
3276                 last = cache->key.objectid + cache->key.offset;
3277
3278                 err = write_one_cache_group(trans, root, path, cache);
3279                 if (err) /* File system offline */
3280                         goto out;
3281
3282                 btrfs_put_block_group(cache);
3283         }
3284
3285         while (1) {
3286                 /*
3287                  * I don't think this is needed since we're just marking our
3288                  * preallocated extent as written, but just in case, it
3289                  * can't hurt.
3290                  */
3291                 if (last == 0) {
3292                         err = btrfs_run_delayed_refs(trans, root,
3293                                                      (unsigned long)-1);
3294                         if (err) /* File system offline */
3295                                 goto out;
3296                 }
3297
3298                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3299                 while (cache) {
3300                         /*
3301                          * Really this shouldn't happen, but it could if we
3302                          * couldn't write the entire preallocated extent and
3303                          * splitting the extent resulted in a new block.
3304                          */
3305                         if (cache->dirty) {
3306                                 btrfs_put_block_group(cache);
3307                                 goto again;
3308                         }
3309                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3310                                 break;
3311                         cache = next_block_group(root, cache);
3312                 }
3313                 if (!cache) {
3314                         if (last == 0)
3315                                 break;
3316                         last = 0;
3317                         continue;
3318                 }
3319
3320                 err = btrfs_write_out_cache(root, trans, cache, path);
3321
3322                 /*
3323                  * If we didn't have an error then the cache state is still
3324                  * NEED_WRITE, so we can set it to WRITTEN.
3325                  */
3326                 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3327                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
3328                 last = cache->key.objectid + cache->key.offset;
3329                 btrfs_put_block_group(cache);
3330         }
3331 out:
3332
3333         btrfs_free_path(path);
3334         return err;
3335 }
3336
3337 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3338 {
3339         struct btrfs_block_group_cache *block_group;
3340         int readonly = 0;
3341
3342         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3343         if (!block_group || block_group->ro)
3344                 readonly = 1;
3345         if (block_group)
3346                 btrfs_put_block_group(block_group);
3347         return readonly;
3348 }
3349
3350 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3351                              u64 total_bytes, u64 bytes_used,
3352                              struct btrfs_space_info **space_info)
3353 {
3354         struct btrfs_space_info *found;
3355         int i;
3356         int factor;
3357
3358         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3359                      BTRFS_BLOCK_GROUP_RAID10))
3360                 factor = 2;
3361         else
3362                 factor = 1;
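             /*
              * factor models on-disk duplication: DUP/RAID1/RAID10 store two
              * copies, so disk_total and disk_used below are twice the
              * logical byte counts.  (RAID5/6 parity is not modeled here.)
              */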
3363
3364         found = __find_space_info(info, flags);
3365         if (found) {
3366                 spin_lock(&found->lock);
3367                 found->total_bytes += total_bytes;
3368                 found->disk_total += total_bytes * factor;
3369                 found->bytes_used += bytes_used;
3370                 found->disk_used += bytes_used * factor;
3371                 found->full = 0;
3372                 spin_unlock(&found->lock);
3373                 *space_info = found;
3374                 return 0;
3375         }
3376         found = kzalloc(sizeof(*found), GFP_NOFS);
3377         if (!found)
3378                 return -ENOMEM;
3379
3380         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3381                 INIT_LIST_HEAD(&found->block_groups[i]);
3382         init_rwsem(&found->groups_sem);
3383         spin_lock_init(&found->lock);
3384         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3385         found->total_bytes = total_bytes;
3386         found->disk_total = total_bytes * factor;
3387         found->bytes_used = bytes_used;
3388         found->disk_used = bytes_used * factor;
3389         found->bytes_pinned = 0;
3390         found->bytes_reserved = 0;
3391         found->bytes_readonly = 0;
3392         found->bytes_may_use = 0;
3393         found->full = 0;
3394         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3395         found->chunk_alloc = 0;
3396         found->flush = 0;
3397         init_waitqueue_head(&found->wait);
3398         *space_info = found;
3399         list_add_rcu(&found->list, &info->space_info);
3400         if (flags & BTRFS_BLOCK_GROUP_DATA)
3401                 info->data_sinfo = found;
3402         return 0;
3403 }
3404
3405 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3406 {
3407         u64 extra_flags = chunk_to_extended(flags) &
3408                                 BTRFS_EXTENDED_PROFILE_MASK;
3409
3410         write_seqlock(&fs_info->profiles_lock);
3411         if (flags & BTRFS_BLOCK_GROUP_DATA)
3412                 fs_info->avail_data_alloc_bits |= extra_flags;
3413         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3414                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3415         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3416                 fs_info->avail_system_alloc_bits |= extra_flags;
3417         write_sequnlock(&fs_info->profiles_lock);
3418 }
3419
3420 /*
3421  * returns target flags in extended format or 0 if restripe for this
3422  * chunk_type is not in progress
3423  *
3424  * should be called with either volume_mutex or balance_lock held
3425  */
3426 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3427 {
3428         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3429         u64 target = 0;
3430
3431         if (!bctl)
3432                 return 0;
3433
3434         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3435             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3436                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3437         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3438                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3439                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3440         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3441                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3442                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3443         }
3444
3445         return target;
3446 }
3447
3448 /*
3449  * @flags: available profiles in extended format (see ctree.h)
3450  *
3451  * Returns reduced profile in chunk format.  If profile changing is in
3452  * progress (either running or paused) picks the target profile (if it's
3453  * already available), otherwise falls back to plain reducing.
3454  */
3455 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3456 {
3457         /*
3458          * we add in the count of missing devices because we want
3459          * to make sure that any RAID levels on a degraded FS
3460          * continue to be honored.
3461          */
3462         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3463                 root->fs_info->fs_devices->missing_devices;
3464         u64 target;
3465         u64 tmp;
3466
3467         /*
3468          * see if restripe for this chunk_type is in progress, if so
3469          * try to reduce to the target profile
3470          */
3471         spin_lock(&root->fs_info->balance_lock);
3472         target = get_restripe_target(root->fs_info, flags);
3473         if (target) {
3474                 /* pick target profile only if it's already available */
3475                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3476                         spin_unlock(&root->fs_info->balance_lock);
3477                         return extended_to_chunk(target);
3478                 }
3479         }
3480         spin_unlock(&root->fs_info->balance_lock);
3481
3482         /* First, mask out the RAID levels which aren't possible */
3483         if (num_devices == 1)
3484                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0 |
3485                            BTRFS_BLOCK_GROUP_RAID5);
3486         if (num_devices < 3)
3487                 flags &= ~BTRFS_BLOCK_GROUP_RAID6;
3488         if (num_devices < 4)
3489                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3490
3491         tmp = flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID0 |
3492                        BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID5 |
3493                        BTRFS_BLOCK_GROUP_RAID6 | BTRFS_BLOCK_GROUP_RAID10);
3494         flags &= ~tmp;
3495
3496         if (tmp & BTRFS_BLOCK_GROUP_RAID6)
3497                 tmp = BTRFS_BLOCK_GROUP_RAID6;
3498         else if (tmp & BTRFS_BLOCK_GROUP_RAID5)
3499                 tmp = BTRFS_BLOCK_GROUP_RAID5;
3500         else if (tmp & BTRFS_BLOCK_GROUP_RAID10)
3501                 tmp = BTRFS_BLOCK_GROUP_RAID10;
3502         else if (tmp & BTRFS_BLOCK_GROUP_RAID1)
3503                 tmp = BTRFS_BLOCK_GROUP_RAID1;
3504         else if (tmp & BTRFS_BLOCK_GROUP_RAID0)
3505                 tmp = BTRFS_BLOCK_GROUP_RAID0;
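             /*
              * Note the preference order when several profiles remain:
              * RAID6 > RAID5 > RAID10 > RAID1 > RAID0.  DUP only survives
              * when none of the RAID profiles above is set in tmp.
              */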
3506
3507         return extended_to_chunk(flags | tmp);
3508 }
3509
3510 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3511 {
3512         unsigned seq;
3513
3514         do {
3515                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3516
3517                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3518                         flags |= root->fs_info->avail_data_alloc_bits;
3519                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3520                         flags |= root->fs_info->avail_system_alloc_bits;
3521                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3522                         flags |= root->fs_info->avail_metadata_alloc_bits;
3523         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3524
3525         return btrfs_reduce_alloc_profile(root, flags);
3526 }
3527
3528 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3529 {
3530         u64 flags;
3531         u64 ret;
3532
3533         if (data)
3534                 flags = BTRFS_BLOCK_GROUP_DATA;
3535         else if (root == root->fs_info->chunk_root)
3536                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3537         else
3538                 flags = BTRFS_BLOCK_GROUP_METADATA;
3539
3540         ret = get_alloc_profile(root, flags);
3541         return ret;
3542 }
3543
3544 /*
3545  * This will check the space that the inode allocates from to make sure we
3546  * have enough space for the requested number of bytes.
3547  */
3548 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3549 {
3550         struct btrfs_space_info *data_sinfo;
3551         struct btrfs_root *root = BTRFS_I(inode)->root;
3552         struct btrfs_fs_info *fs_info = root->fs_info;
3553         u64 used;
3554         int ret = 0, committed = 0, alloc_chunk = 1;
3555
3556         /* make sure bytes are sectorsize aligned */
3557         bytes = ALIGN(bytes, root->sectorsize);
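             /* e.g. with a 4 KiB sectorsize a 5000-byte request becomes 8192 */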
3558
3559         if (root == root->fs_info->tree_root ||
3560             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3561                 alloc_chunk = 0;
3562                 committed = 1;
3563         }
3564
3565         data_sinfo = fs_info->data_sinfo;
3566         if (!data_sinfo)
3567                 goto alloc;
3568
3569 again:
3570         /* make sure we have enough space to handle the data first */
3571         spin_lock(&data_sinfo->lock);
3572         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3573                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3574                 data_sinfo->bytes_may_use;
3575
3576         if (used + bytes > data_sinfo->total_bytes) {
3577                 struct btrfs_trans_handle *trans;
3578
3579                 /*
3580                  * if we don't have enough free bytes in this space then we need
3581                  * to alloc a new chunk.
3582                  */
3583                 if (!data_sinfo->full && alloc_chunk) {
3584                         u64 alloc_target;
3585
3586                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3587                         spin_unlock(&data_sinfo->lock);
3588 alloc:
3589                         alloc_target = btrfs_get_alloc_profile(root, 1);
3590                         trans = btrfs_join_transaction(root);
3591                         if (IS_ERR(trans))
3592                                 return PTR_ERR(trans);
3593
3594                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3595                                              alloc_target,
3596                                              CHUNK_ALLOC_NO_FORCE);
3597                         btrfs_end_transaction(trans, root);
3598                         if (ret < 0) {
3599                                 if (ret != -ENOSPC)
3600                                         return ret;
3601                                 else
3602                                         goto commit_trans;
3603                         }
3604
3605                         if (!data_sinfo)
3606                                 data_sinfo = fs_info->data_sinfo;
3607
3608                         goto again;
3609                 }
3610
3611                 /*
3612                  * If we have less pinned bytes than we want to allocate then
3613                  * don't bother committing the transaction, it won't help us.
3614                  */
3615                 if (data_sinfo->bytes_pinned < bytes)
3616                         committed = 1;
3617                 spin_unlock(&data_sinfo->lock);
3618
3619                 /* commit the current transaction and try again */
3620 commit_trans:
3621                 if (!committed &&
3622                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3623                         committed = 1;
3624                         trans = btrfs_join_transaction(root);
3625                         if (IS_ERR(trans))
3626                                 return PTR_ERR(trans);
3627                         ret = btrfs_commit_transaction(trans, root);
3628                         if (ret)
3629                                 return ret;
3630                         goto again;
3631                 }
3632
3633                 return -ENOSPC;
3634         }
3635         data_sinfo->bytes_may_use += bytes;
3636         trace_btrfs_space_reservation(root->fs_info, "space_info",
3637                                       data_sinfo->flags, bytes, 1);
3638         spin_unlock(&data_sinfo->lock);
3639
3640         return 0;
3641 }
3642
3643 /*
3644  * Called if we need to clear a data reservation for this inode.
3645  */
3646 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3647 {
3648         struct btrfs_root *root = BTRFS_I(inode)->root;
3649         struct btrfs_space_info *data_sinfo;
3650
3651         /* make sure bytes are sectorsize aligned */
3652         bytes = ALIGN(bytes, root->sectorsize);
3653
3654         data_sinfo = root->fs_info->data_sinfo;
3655         spin_lock(&data_sinfo->lock);
3656         data_sinfo->bytes_may_use -= bytes;
3657         trace_btrfs_space_reservation(root->fs_info, "space_info",
3658                                       data_sinfo->flags, bytes, 0);
3659         spin_unlock(&data_sinfo->lock);
3660 }
3661
3662 static void force_metadata_allocation(struct btrfs_fs_info *info)
3663 {
3664         struct list_head *head = &info->space_info;
3665         struct btrfs_space_info *found;
3666
3667         rcu_read_lock();
3668         list_for_each_entry_rcu(found, head, list) {
3669                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3670                         found->force_alloc = CHUNK_ALLOC_FORCE;
3671         }
3672         rcu_read_unlock();
3673 }
3674
3675 static int should_alloc_chunk(struct btrfs_root *root,
3676                               struct btrfs_space_info *sinfo, int force)
3677 {
3678         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3679         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3680         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3681         u64 thresh;
3682
3683         if (force == CHUNK_ALLOC_FORCE)
3684                 return 1;
3685
3686         /*
3687          * We need to take into account the global rsv because for all intents
3688          * and purposes it's used space.  Don't worry about locking the
3689          * global_rsv, it doesn't change except when the transaction commits.
3690          */
3691         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3692                 num_allocated += global_rsv->size;
3693
3694         /*
3695          * in limited mode, we want to have some free space up to
3696          * about 1% of the FS size.
3697          */
3698         if (force == CHUNK_ALLOC_LIMITED) {
3699                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3700                 thresh = max_t(u64, 64 * 1024 * 1024,
3701                                div_factor_fine(thresh, 1));
3702
3703                 if (num_bytes - num_allocated < thresh)
3704                         return 1;
3705         }
3706
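             /*
              * Fallback heuristic: only allocate once roughly 80% of the
              * existing space (div_factor(num_bytes, 8)) is in use, with a
              * 2 MiB slack.
              */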
3707         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3708                 return 0;
3709         return 1;
3710 }
3711
3712 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3713 {
3714         u64 num_dev;
3715
3716         if (type & (BTRFS_BLOCK_GROUP_RAID10 |
3717                     BTRFS_BLOCK_GROUP_RAID0 |
3718                     BTRFS_BLOCK_GROUP_RAID5 |
3719                     BTRFS_BLOCK_GROUP_RAID6))
3720                 num_dev = root->fs_info->fs_devices->rw_devices;
3721         else if (type & BTRFS_BLOCK_GROUP_RAID1)
3722                 num_dev = 2;
3723         else
3724                 num_dev = 1;    /* DUP or single */
3725
3726         /* metadata for updating devices and chunk tree */
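             /*
              * i.e. room for num_dev device item updates plus one more item,
              * presumably the chunk item itself, hence num_dev + 1 below.
              */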
3727         return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3728 }
3729
3730 static void check_system_chunk(struct btrfs_trans_handle *trans,
3731                                struct btrfs_root *root, u64 type)
3732 {
3733         struct btrfs_space_info *info;
3734         u64 left;
3735         u64 thresh;
3736
3737         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3738         spin_lock(&info->lock);
3739         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3740                 info->bytes_reserved - info->bytes_readonly;
3741         spin_unlock(&info->lock);
3742
3743         thresh = get_system_chunk_thresh(root, type);
3744         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3745                 btrfs_info(root->fs_info, "left=%llu, need=%llu, flags=%llu",
3746                         left, thresh, type);
3747                 dump_space_info(info, 0, 0);
3748         }
3749
3750         if (left < thresh) {
3751                 u64 flags;
3752
3753                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3754                 btrfs_alloc_chunk(trans, root, flags);
3755         }
3756 }
3757
3758 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3759                           struct btrfs_root *extent_root, u64 flags, int force)
3760 {
3761         struct btrfs_space_info *space_info;
3762         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3763         int wait_for_alloc = 0;
3764         int ret = 0;
3765
3766         /* Don't re-enter if we're already allocating a chunk */
3767         if (trans->allocating_chunk)
3768                 return -ENOSPC;
3769
3770         space_info = __find_space_info(extent_root->fs_info, flags);
3771         if (!space_info) {
3772                 ret = update_space_info(extent_root->fs_info, flags,
3773                                         0, 0, &space_info);
3774                 BUG_ON(ret); /* -ENOMEM */
3775         }
3776         BUG_ON(!space_info); /* Logic error */
3777
3778 again:
3779         spin_lock(&space_info->lock);
3780         if (force < space_info->force_alloc)
3781                 force = space_info->force_alloc;
3782         if (space_info->full) {
3783                 spin_unlock(&space_info->lock);
3784                 return 0;
3785         }
3786
3787         if (!should_alloc_chunk(extent_root, space_info, force)) {
3788                 spin_unlock(&space_info->lock);
3789                 return 0;
3790         } else if (space_info->chunk_alloc) {
3791                 wait_for_alloc = 1;
3792         } else {
3793                 space_info->chunk_alloc = 1;
3794         }
3795
3796         spin_unlock(&space_info->lock);
3797
3798         mutex_lock(&fs_info->chunk_mutex);
3799
3800         /*
3801          * The chunk_mutex is held throughout the entirety of a chunk
3802          * allocation, so once we've acquired the chunk_mutex we know that the
3803          * other guy is done and we need to recheck and see if we should
3804          * allocate.
3805          */
3806         if (wait_for_alloc) {
3807                 mutex_unlock(&fs_info->chunk_mutex);
3808                 wait_for_alloc = 0;
3809                 goto again;
3810         }
3811
3812         trans->allocating_chunk = true;
3813
3814         /*
3815          * If we have mixed data/metadata chunks we want to make sure we keep
3816          * allocating mixed chunks instead of individual chunks.
3817          */
3818         if (btrfs_mixed_space_info(space_info))
3819                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3820
3821         /*
3822          * if we're doing a data chunk, go ahead and make sure that
3823          * we keep a reasonable number of metadata chunks allocated in the
3824          * FS as well.
3825          */
3826         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3827                 fs_info->data_chunk_allocations++;
3828                 if (!(fs_info->data_chunk_allocations %
3829                       fs_info->metadata_ratio))
3830                         force_metadata_allocation(fs_info);
3831         }
3832
3833         /*
3834          * Check if we have enough space in SYSTEM chunk because we may need
3835          * to update devices.
3836          */
3837         check_system_chunk(trans, extent_root, flags);
3838
3839         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3840         trans->allocating_chunk = false;
3841
3842         spin_lock(&space_info->lock);
3843         if (ret < 0 && ret != -ENOSPC)
3844                 goto out;
3845         if (ret)
3846                 space_info->full = 1;
3847         else
3848                 ret = 1;
3849
3850         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3851 out:
3852         space_info->chunk_alloc = 0;
3853         spin_unlock(&space_info->lock);
3854         mutex_unlock(&fs_info->chunk_mutex);
3855         return ret;
3856 }
3857
3858 static int can_overcommit(struct btrfs_root *root,
3859                           struct btrfs_space_info *space_info, u64 bytes,
3860                           enum btrfs_reserve_flush_enum flush)
3861 {
3862         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3863         u64 profile = btrfs_get_alloc_profile(root, 0);
3864         u64 rsv_size = 0;
3865         u64 avail;
3866         u64 used;
3867         u64 to_add;
3868
3869         used = space_info->bytes_used + space_info->bytes_reserved +
3870                 space_info->bytes_pinned + space_info->bytes_readonly;
3871
3872         spin_lock(&global_rsv->lock);
3873         rsv_size = global_rsv->size;
3874         spin_unlock(&global_rsv->lock);
3875
3876         /*
3877          * We only want to allow over committing if we have lots of actual space
3878          * free, but if we don't have enough space to handle the global reserve
3879          * space then we could end up having a real enospc problem when trying
3880          * to allocate a chunk or some other such important allocation.
3881          */
3882         rsv_size <<= 1;
3883         if (used + rsv_size >= space_info->total_bytes)
3884                 return 0;
3885
3886         used += space_info->bytes_may_use;
3887
3888         spin_lock(&root->fs_info->free_chunk_lock);
3889         avail = root->fs_info->free_chunk_space;
3890         spin_unlock(&root->fs_info->free_chunk_lock);
3891
3892         /*
3893          * If we have dup, raid1 or raid10 then only half of the free
3894          * space is actually usable.  For raid56, the space info used
3895          * doesn't include the parity drive, so we don't have to
3896          * change the math.
3897          */
3898         if (profile & (BTRFS_BLOCK_GROUP_DUP |
3899                        BTRFS_BLOCK_GROUP_RAID1 |
3900                        BTRFS_BLOCK_GROUP_RAID10))
3901                 avail >>= 1;
3902
3903         to_add = space_info->total_bytes;
3904
3905         /*
3906          * If we aren't flushing all things, let us overcommit up to
3907          * 1/2 of the space. If we can flush, don't let us overcommit
3908          * too much, let it overcommit up to 1/8 of the space.
3909          */
3910         if (flush == BTRFS_RESERVE_FLUSH_ALL)
3911                 to_add >>= 3;
3912         else
3913                 to_add >>= 1;
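             /*
              * E.g. with total_bytes of 8 GiB this allows roughly 1 GiB of
              * overcommit under FLUSH_ALL (>> 3) and up to 4 GiB otherwise
              * (>> 1), in both cases further capped by 'avail' below.
              */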
3914
3915         /*
3916          * Limit the overcommit to the amount of free space we could possibly
3917          * allocate for chunks.
3918          */
3919         to_add = min(avail, to_add);
3920
3921         if (used + bytes < space_info->total_bytes + to_add)
3922                 return 1;
3923         return 0;
3924 }
3925
3926 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3927                                   unsigned long nr_pages)
3928 {
3929         struct super_block *sb = root->fs_info->sb;
3930         int started;
3931
3932         /* If we cannot start writeback, just sync all the delalloc files. */
3933         started = try_to_writeback_inodes_sb_nr(sb, nr_pages,
3934                                                       WB_REASON_FS_FREE_SPACE);
3935         if (!started) {
3936                 /*
3937                  * We needn't worry about the filesystem going from r/w to r/o
3938                  * even though we don't acquire the ->s_umount mutex, because
3939                  * the filesystem should guarantee that the delalloc inode list
3940                  * is empty once the filesystem is read-only (all dirty pages
3941                  * have been written to disk).
3942                  */
3943                 btrfs_start_delalloc_inodes(root, 0);
3944                 if (!current->journal_info)
3945                         btrfs_wait_ordered_extents(root, 0);
3946         }
3947 }
3948
3949 /*
3950  * shrink metadata reservation for delalloc
3951  */
3952 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3953                             bool wait_ordered)
3954 {
3955         struct btrfs_block_rsv *block_rsv;
3956         struct btrfs_space_info *space_info;
3957         struct btrfs_trans_handle *trans;
3958         u64 delalloc_bytes;
3959         u64 max_reclaim;
3960         long time_left;
3961         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3962         int loops = 0;
3963         enum btrfs_reserve_flush_enum flush;
3964
3965         trans = (struct btrfs_trans_handle *)current->journal_info;
3966         block_rsv = &root->fs_info->delalloc_block_rsv;
3967         space_info = block_rsv->space_info;
3968
3969         smp_mb();
3970         delalloc_bytes = percpu_counter_sum_positive(
3971                                                 &root->fs_info->delalloc_bytes);
3972         if (delalloc_bytes == 0) {
3973                 if (trans)
3974                         return;
3975                 btrfs_wait_ordered_extents(root, 0);
3976                 return;
3977         }
3978
3979         while (delalloc_bytes && loops < 3) {
3980                 max_reclaim = min(delalloc_bytes, to_reclaim);
3981                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3982                 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3983                 /*
3984                  * We need to wait for the async pages to actually start before
3985                  * we do anything.
3986                  */
3987                 wait_event(root->fs_info->async_submit_wait,
3988                            !atomic_read(&root->fs_info->async_delalloc_pages));
3989
3990                 if (!trans)
3991                         flush = BTRFS_RESERVE_FLUSH_ALL;
3992                 else
3993                         flush = BTRFS_RESERVE_NO_FLUSH;
3994                 spin_lock(&space_info->lock);
3995                 if (can_overcommit(root, space_info, orig, flush)) {
3996                         spin_unlock(&space_info->lock);
3997                         break;
3998                 }
3999                 spin_unlock(&space_info->lock);
4000
4001                 loops++;
4002                 if (wait_ordered && !trans) {
4003                         btrfs_wait_ordered_extents(root, 0);
4004                 } else {
4005                         time_left = schedule_timeout_killable(1);
4006                         if (time_left)
4007                                 break;
4008                 }
4009                 smp_mb();
4010                 delalloc_bytes = percpu_counter_sum_positive(
4011                                                 &root->fs_info->delalloc_bytes);
4012         }
4013 }
4014
4015 /**
4016  * may_commit_transaction - possibly commit the transaction if it's ok to
4017  * @root - the root we're allocating for
4018  * @bytes - the number of bytes we want to reserve
4019  * @force - force the commit
4020  *
4021  * This will check to make sure that committing the transaction will actually
4022  * get us somewhere and then commit the transaction if it does.  Otherwise it
4023  * will return -ENOSPC.
4024  */
4025 static int may_commit_transaction(struct btrfs_root *root,
4026                                   struct btrfs_space_info *space_info,
4027                                   u64 bytes, int force)
4028 {
4029         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
4030         struct btrfs_trans_handle *trans;
4031
4032         trans = (struct btrfs_trans_handle *)current->journal_info;
4033         if (trans)
4034                 return -EAGAIN;
4035
4036         if (force)
4037                 goto commit;
4038
4039         /* See if there is enough pinned space to make this reservation */
4040         spin_lock(&space_info->lock);
4041         if (space_info->bytes_pinned >= bytes) {
4042                 spin_unlock(&space_info->lock);
4043                 goto commit;
4044         }
4045         spin_unlock(&space_info->lock);
4046
4047         /*
4048          * See if there is some space in the delayed insertion reservation for
4049          * this reservation.
4050          */
4051         if (space_info != delayed_rsv->space_info)
4052                 return -ENOSPC;
4053
4054         spin_lock(&space_info->lock);
4055         spin_lock(&delayed_rsv->lock);
4056         if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
4057                 spin_unlock(&delayed_rsv->lock);
4058                 spin_unlock(&space_info->lock);
4059                 return -ENOSPC;
4060         }
4061         spin_unlock(&delayed_rsv->lock);
4062         spin_unlock(&space_info->lock);
4063
4064 commit:
4065         trans = btrfs_join_transaction(root);
4066         if (IS_ERR(trans))
4067                 return -ENOSPC;
4068
4069         return btrfs_commit_transaction(trans, root);
4070 }
4071
4072 enum flush_state {
4073         FLUSH_DELAYED_ITEMS_NR  =       1,
4074         FLUSH_DELAYED_ITEMS     =       2,
4075         FLUSH_DELALLOC          =       3,
4076         FLUSH_DELALLOC_WAIT     =       4,
4077         ALLOC_CHUNK             =       5,
4078         COMMIT_TRANS            =       6,
4079 };
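     /*
      * These states are intended to be tried in ascending order by
      * reserve_metadata_bytes() below: cheap delayed-item flushing first,
      * then delalloc writeback and chunk allocation, with a full transaction
      * commit as the last resort.
      */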
4080
4081 static int flush_space(struct btrfs_root *root,
4082                        struct btrfs_space_info *space_info, u64 num_bytes,
4083                        u64 orig_bytes, int state)
4084 {
4085         struct btrfs_trans_handle *trans;
4086         int nr;
4087         int ret = 0;
4088
4089         switch (state) {
4090         case FLUSH_DELAYED_ITEMS_NR:
4091         case FLUSH_DELAYED_ITEMS:
4092                 if (state == FLUSH_DELAYED_ITEMS_NR) {
4093                         u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
4094
4095                         nr = (int)div64_u64(num_bytes, bytes);
4096                         if (!nr)
4097                                 nr = 1;
4098                         nr *= 2;
4099                 } else {
4100                         nr = -1;
4101                 }
4102                 trans = btrfs_join_transaction(root);
4103                 if (IS_ERR(trans)) {
4104                         ret = PTR_ERR(trans);
4105                         break;
4106                 }
4107                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
4108                 btrfs_end_transaction(trans, root);
4109                 break;
4110         case FLUSH_DELALLOC:
4111         case FLUSH_DELALLOC_WAIT:
4112                 shrink_delalloc(root, num_bytes, orig_bytes,
4113                                 state == FLUSH_DELALLOC_WAIT);
4114                 break;
4115         case ALLOC_CHUNK:
4116                 trans = btrfs_join_transaction(root);
4117                 if (IS_ERR(trans)) {
4118                         ret = PTR_ERR(trans);
4119                         break;
4120                 }
4121                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
4122                                      btrfs_get_alloc_profile(root, 0),
4123                                      CHUNK_ALLOC_NO_FORCE);
4124                 btrfs_end_transaction(trans, root);
4125                 if (ret == -ENOSPC)
4126                         ret = 0;
4127                 break;
4128         case COMMIT_TRANS:
4129                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
4130                 break;
4131         default:
4132                 ret = -ENOSPC;
4133                 break;
4134         }
4135
4136         return ret;
4137 }
4138 /**
4139  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
4140  * @root - the root we're allocating for
4141  * @block_rsv - the block_rsv we're allocating for
4142  * @orig_bytes - the number of bytes we want
4143  * @flush - whether or not we can flush to make our reservation
4144  *
4145  * This will reserve orig_bytes number of bytes from the space info associated
4146  * with the block_rsv.  If there is not enough space it will make an attempt to
4147  * flush out space to make room.  It will do this by flushing delalloc if
4148  * possible or committing the transaction.  If flush is 0 then no attempts to
4149  * regain reservations will be made and this will fail if there is not enough
4150  * space already.
4151  */
4152 static int reserve_metadata_bytes(struct btrfs_root *root,
4153                                   struct btrfs_block_rsv *block_rsv,
4154                                   u64 orig_bytes,
4155                                   enum btrfs_reserve_flush_enum flush)
4156 {
4157         struct btrfs_space_info *space_info = block_rsv->space_info;
4158         u64 used;
4159         u64 num_bytes = orig_bytes;
4160         int flush_state = FLUSH_DELAYED_ITEMS_NR;
4161         int ret = 0;
4162         bool flushing = false;
4163
4164 again:
4165         ret = 0;
4166         spin_lock(&space_info->lock);
4167         /*
4168          * We only want to wait if somebody other than us is flushing and we
4169          * are actually allowed to flush all things.
4170          */
4171         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
4172                space_info->flush) {
4173                 spin_unlock(&space_info->lock);
4174                 /*
4175                  * If we have a trans handle we can't wait because the flusher
4176                  * may have to commit the transaction, which would mean we would
4177                  * deadlock since we are waiting for the flusher to finish, but
4178                  * hold the current transaction open.
4179                  */
4180                 if (current->journal_info)
4181                         return -EAGAIN;
4182                 ret = wait_event_killable(space_info->wait, !space_info->flush);
4183                 /* Must have been killed, return */
4184                 if (ret)
4185                         return -EINTR;
4186
4187                 spin_lock(&space_info->lock);
4188         }
4189
4190         ret = -ENOSPC;
4191         used = space_info->bytes_used + space_info->bytes_reserved +
4192                 space_info->bytes_pinned + space_info->bytes_readonly +
4193                 space_info->bytes_may_use;
4194
4195         /*
4196          * The idea here is that if we've not already over-reserved the block
4197          * group then we can go ahead and save our reservation first and then
4198          * start flushing if we need to.  Otherwise, if we've already
4199          * overcommitted, let's start flushing stuff first and then come back
4200          * and try to make our reservation.
4201          */
4202         if (used <= space_info->total_bytes) {
4203                 if (used + orig_bytes <= space_info->total_bytes) {
4204                         space_info->bytes_may_use += orig_bytes;
4205                         trace_btrfs_space_reservation(root->fs_info,
4206                                 "space_info", space_info->flags, orig_bytes, 1);
4207                         ret = 0;
4208                 } else {
4209                         /*
4210                          * OK, set num_bytes to orig_bytes since we aren't
4211                          * overcommitted; this way we only try to reclaim what
4212                          * we need.
4213                          */
4214                         num_bytes = orig_bytes;
4215                 }
4216         } else {
4217                 /*
4218                  * OK, we're overcommitted; set num_bytes to the overcommitted
4219                  * amount plus the amount of bytes that we need for this
4220                  * reservation.
4221                  */
4222                 num_bytes = used - space_info->total_bytes +
4223                         (orig_bytes * 2);
4224         }
4225
4226         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4227                 space_info->bytes_may_use += orig_bytes;
4228                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4229                                               space_info->flags, orig_bytes,
4230                                               1);
4231                 ret = 0;
4232         }
4233
4234         /*
4235          * Couldn't make our reservation, so save our place so that, while we're
4236          * trying to reclaim space, we can actually use it instead of somebody
4237          * else stealing it from us.
4238          *
4239          * We make the other tasks wait for the flush only when we can flush
4240          * all things.
4241          */
4242         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4243                 flushing = true;
4244                 space_info->flush = 1;
4245         }
4246
4247         spin_unlock(&space_info->lock);
4248
4249         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4250                 goto out;
4251
4252         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4253                           flush_state);
4254         flush_state++;
4255
4256         /*
4257          * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
4258          * would happen. So skip delalloc flush.
4259          */
4260         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4261             (flush_state == FLUSH_DELALLOC ||
4262              flush_state == FLUSH_DELALLOC_WAIT))
4263                 flush_state = ALLOC_CHUNK;
4264
4265         if (!ret)
4266                 goto again;
4267         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4268                  flush_state < COMMIT_TRANS)
4269                 goto again;
4270         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4271                  flush_state <= COMMIT_TRANS)
4272                 goto again;
4273
4274 out:
4275         if (ret == -ENOSPC &&
4276             unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) {
4277                 struct btrfs_block_rsv *global_rsv =
4278                         &root->fs_info->global_block_rsv;
4279
4280                 if (block_rsv != global_rsv &&
4281                     !block_rsv_use_bytes(global_rsv, orig_bytes))
4282                         ret = 0;
4283         }
4284         if (flushing) {
4285                 spin_lock(&space_info->lock);
4286                 space_info->flush = 0;
4287                 wake_up_all(&space_info->wait);
4288                 spin_unlock(&space_info->lock);
4289         }
4290         return ret;
4291 }
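
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows how a hypothetical in-kernel caller might use the flush modes handled
 * by reserve_metadata_bytes() above: try a cheap, non-blocking reservation
 * first and only fall back to the full flushing machinery when that fails.
 * The helper name try_reserve_cheap_then_flush() is made up for the example;
 * a real caller would also add the reserved bytes to its block_rsv, as
 * btrfs_block_rsv_add() below does.
 */
static int try_reserve_cheap_then_flush(struct btrfs_root *root,
                                        struct btrfs_block_rsv *rsv,
                                        u64 bytes)
{
        int ret;

        /* No flushing: succeeds only if the space is already available. */
        ret = reserve_metadata_bytes(root, rsv, bytes, BTRFS_RESERVE_NO_FLUSH);
        if (ret != -ENOSPC)
                return ret;

        /*
         * Allow the full escalation: delayed items, delalloc, chunk
         * allocation and finally a transaction commit.
         */
        return reserve_metadata_bytes(root, rsv, bytes,
                                      BTRFS_RESERVE_FLUSH_ALL);
}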
4292
4293 static struct btrfs_block_rsv *get_block_rsv(
4294                                         const struct btrfs_trans_handle *trans,
4295                                         const struct btrfs_root *root)
4296 {
4297         struct btrfs_block_rsv *block_rsv = NULL;
4298
4299         if (root->ref_cows)
4300                 block_rsv = trans->block_rsv;
4301
4302         if (root == root->fs_info->csum_root && trans->adding_csums)
4303                 block_rsv = trans->block_rsv;
4304
4305         if (!block_rsv)
4306                 block_rsv = root->block_rsv;
4307
4308         if (!block_rsv)
4309                 block_rsv = &root->fs_info->empty_block_rsv;
4310
4311         return block_rsv;
4312 }
4313
4314 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4315                                u64 num_bytes)
4316 {
4317         int ret = -ENOSPC;
4318         spin_lock(&block_rsv->lock);
4319         if (block_rsv->reserved >= num_bytes) {
4320                 block_rsv->reserved -= num_bytes;
4321                 if (block_rsv->reserved < block_rsv->size)
4322                         block_rsv->full = 0;
4323                 ret = 0;
4324         }
4325         spin_unlock(&block_rsv->lock);
4326         return ret;
4327 }
4328
4329 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4330                                 u64 num_bytes, int update_size)
4331 {
4332         spin_lock(&block_rsv->lock);
4333         block_rsv->reserved += num_bytes;
4334         if (update_size)
4335                 block_rsv->size += num_bytes;
4336         else if (block_rsv->reserved >= block_rsv->size)
4337                 block_rsv->full = 1;
4338         spin_unlock(&block_rsv->lock);
4339 }
4340
4341 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4342                                     struct btrfs_block_rsv *block_rsv,
4343                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4344 {
4345         struct btrfs_space_info *space_info = block_rsv->space_info;
4346
4347         spin_lock(&block_rsv->lock);
4348         if (num_bytes == (u64)-1)
4349                 num_bytes = block_rsv->size;
4350         block_rsv->size -= num_bytes;
4351         if (block_rsv->reserved >= block_rsv->size) {
4352                 num_bytes = block_rsv->reserved - block_rsv->size;
4353                 block_rsv->reserved = block_rsv->size;
4354                 block_rsv->full = 1;
4355         } else {
4356                 num_bytes = 0;
4357         }
4358         spin_unlock(&block_rsv->lock);
4359
4360         if (num_bytes > 0) {
4361                 if (dest) {
4362                         spin_lock(&dest->lock);
4363                         if (!dest->full) {
4364                                 u64 bytes_to_add;
4365
4366                                 bytes_to_add = dest->size - dest->reserved;
4367                                 bytes_to_add = min(num_bytes, bytes_to_add);
4368                                 dest->reserved += bytes_to_add;
4369                                 if (dest->reserved >= dest->size)
4370                                         dest->full = 1;
4371                                 num_bytes -= bytes_to_add;
4372                         }
4373                         spin_unlock(&dest->lock);
4374                 }
4375                 if (num_bytes) {
4376                         spin_lock(&space_info->lock);
4377                         space_info->bytes_may_use -= num_bytes;
4378                         trace_btrfs_space_reservation(fs_info, "space_info",
4379                                         space_info->flags, num_bytes, 0);
4380                         space_info->reservation_progress++;
4381                         spin_unlock(&space_info->lock);
4382                 }
4383         }
4384 }
4385
4386 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4387                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4388 {
4389         int ret;
4390
4391         ret = block_rsv_use_bytes(src, num_bytes);
4392         if (ret)
4393                 return ret;
4394
4395         block_rsv_add_bytes(dst, num_bytes, 1);
4396         return 0;
4397 }
4398
4399 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4400 {
4401         memset(rsv, 0, sizeof(*rsv));
4402         spin_lock_init(&rsv->lock);
4403         rsv->type = type;
4404 }
4405
4406 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4407                                               unsigned short type)
4408 {
4409         struct btrfs_block_rsv *block_rsv;
4410         struct btrfs_fs_info *fs_info = root->fs_info;
4411
4412         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4413         if (!block_rsv)
4414                 return NULL;
4415
4416         btrfs_init_block_rsv(block_rsv, type);
4417         block_rsv->space_info = __find_space_info(fs_info,
4418                                                   BTRFS_BLOCK_GROUP_METADATA);
4419         return block_rsv;
4420 }
4421
4422 void btrfs_free_block_rsv(struct btrfs_root *root,
4423                           struct btrfs_block_rsv *rsv)
4424 {
4425         if (!rsv)
4426                 return;
4427         btrfs_block_rsv_release(root, rsv, (u64)-1);
4428         kfree(rsv);
4429 }
4430
4431 int btrfs_block_rsv_add(struct btrfs_root *root,
4432                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4433                         enum btrfs_reserve_flush_enum flush)
4434 {
4435         int ret;
4436
4437         if (num_bytes == 0)
4438                 return 0;
4439
4440         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4441         if (!ret) {
4442                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
4443                 return 0;
4444         }
4445
4446         return ret;
4447 }
4448
4449 int btrfs_block_rsv_check(struct btrfs_root *root,
4450                           struct btrfs_block_rsv *block_rsv, int min_factor)
4451 {
4452         u64 num_bytes = 0;
4453         int ret = -ENOSPC;
4454
4455         if (!block_rsv)
4456                 return 0;
4457
4458         spin_lock(&block_rsv->lock);
4459         num_bytes = div_factor(block_rsv->size, min_factor);
4460         if (block_rsv->reserved >= num_bytes)
4461                 ret = 0;
4462         spin_unlock(&block_rsv->lock);
4463
4464         return ret;
4465 }
4466
4467 int btrfs_block_rsv_refill(struct btrfs_root *root,
4468                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4469                            enum btrfs_reserve_flush_enum flush)
4470 {
4471         u64 num_bytes = 0;
4472         int ret = -ENOSPC;
4473
4474         if (!block_rsv)
4475                 return 0;
4476
4477         spin_lock(&block_rsv->lock);
4478         num_bytes = min_reserved;
4479         if (block_rsv->reserved >= num_bytes)
4480                 ret = 0;
4481         else
4482                 num_bytes -= block_rsv->reserved;
4483         spin_unlock(&block_rsv->lock);
4484
4485         if (!ret)
4486                 return 0;
4487
4488         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4489         if (!ret) {
4490                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
4491                 return 0;
4492         }
4493
4494         return ret;
4495 }
4496
4497 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4498                             struct btrfs_block_rsv *dst_rsv,
4499                             u64 num_bytes)
4500 {
4501         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4502 }
4503
4504 void btrfs_block_rsv_release(struct btrfs_root *root,
4505                              struct btrfs_block_rsv *block_rsv,
4506                              u64 num_bytes)
4507 {
4508         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4509         if (global_rsv->full || global_rsv == block_rsv ||
4510             block_rsv->space_info != global_rsv->space_info)
4511                 global_rsv = NULL;
4512         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4513                                 num_bytes);
4514 }
4515
4516 /*
4517  * helper to calculate size of global block reservation.
4518  * the desired value is sum of space used by extent tree,
4519  * checksum tree and root tree
4520  */
4521 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4522 {
4523         struct btrfs_space_info *sinfo;
4524         u64 num_bytes;
4525         u64 meta_used;
4526         u64 data_used;
4527         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4528
4529         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4530         spin_lock(&sinfo->lock);
4531         data_used = sinfo->bytes_used;
4532         spin_unlock(&sinfo->lock);
4533
4534         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4535         spin_lock(&sinfo->lock);
4536         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4537                 data_used = 0;
4538         meta_used = sinfo->bytes_used;
4539         spin_unlock(&sinfo->lock);
4540
4541         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4542                     csum_size * 2;
4543         num_bytes += div64_u64(data_used + meta_used, 50);
4544
4545         if (num_bytes * 3 > meta_used)
4546                 num_bytes = div64_u64(meta_used, 3);
4547
4548         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4549 }
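
/*
 * Editor's note: a standalone, userspace-only sketch of the sizing formula in
 * calc_global_metadata_size() above, using made-up example numbers (4KiB
 * blocks, 4-byte crc32c checksums, 16KiB leaves, 100GiB of data and 1GiB of
 * metadata in use).  It only makes the arithmetic easy to follow; it is not
 * kernel code, and update_global_block_rsv() additionally caps the result at
 * 512MiB.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t data_used = 100ULL << 30;      /* 100 GiB of data in use */
        uint64_t meta_used = 1ULL << 30;        /* 1 GiB of metadata in use */
        uint64_t csum_size = 4;                 /* crc32c */
        uint64_t blocksize_bits = 12;           /* 4 KiB blocks */
        uint64_t leafsize = 16 * 1024;          /* 16 KiB leaves */
        uint64_t align = leafsize << 10;        /* the leafsize << 10 alignment */
        uint64_t num_bytes;

        /* Space for two copies of every data checksum ... */
        num_bytes = (data_used >> blocksize_bits) * csum_size * 2;
        /* ... plus 2% of the used data + metadata. */
        num_bytes += (data_used + meta_used) / 50;

        /* Never target more than a third of the used metadata space. */
        if (num_bytes * 3 > meta_used)
                num_bytes = meta_used / 3;

        /* Round up to the same alignment as the kernel code. */
        num_bytes = (num_bytes + align - 1) / align * align;

        /* Prints 369098752 bytes (~352 MiB) for these inputs. */
        printf("global rsv target: %llu bytes (~%llu MiB)\n",
               (unsigned long long)num_bytes,
               (unsigned long long)(num_bytes >> 20));
        return 0;
}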
4550
4551 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4552 {
4553         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4554         struct btrfs_space_info *sinfo = block_rsv->space_info;
4555         u64 num_bytes;
4556
4557         num_bytes = calc_global_metadata_size(fs_info);
4558
4559         spin_lock(&sinfo->lock);
4560         spin_lock(&block_rsv->lock);
4561
4562         block_rsv->size = min_t(u64, num_bytes, 512 * 1024 * 1024);
4563
4564         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4565                     sinfo->bytes_reserved + sinfo->bytes_readonly +
4566                     sinfo->bytes_may_use;
4567
4568         if (sinfo->total_bytes > num_bytes) {
4569                 num_bytes = sinfo->total_bytes - num_bytes;
4570                 block_rsv->reserved += num_bytes;
4571                 sinfo->bytes_may_use += num_bytes;
4572                 trace_btrfs_space_reservation(fs_info, "space_info",
4573                                       sinfo->flags, num_bytes, 1);
4574         }
4575
4576         if (block_rsv->reserved >= block_rsv->size) {
4577                 num_bytes = block_rsv->reserved - block_rsv->size;
4578                 sinfo->bytes_may_use -= num_bytes;
4579                 trace_btrfs_space_reservation(fs_info, "space_info",
4580                                       sinfo->flags, num_bytes, 0);
4581                 sinfo->reservation_progress++;
4582                 block_rsv->reserved = block_rsv->size;
4583                 block_rsv->full = 1;
4584         }
4585
4586         spin_unlock(&block_rsv->lock);
4587         spin_unlock(&sinfo->lock);
4588 }
4589
4590 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4591 {
4592         struct btrfs_space_info *space_info;
4593
4594         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4595         fs_info->chunk_block_rsv.space_info = space_info;
4596
4597         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4598         fs_info->global_block_rsv.space_info = space_info;
4599         fs_info->delalloc_block_rsv.space_info = space_info;
4600         fs_info->trans_block_rsv.space_info = space_info;
4601         fs_info->empty_block_rsv.space_info = space_info;
4602         fs_info->delayed_block_rsv.space_info = space_info;
4603
4604         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4605         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4606         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4607         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4608         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4609
4610         update_global_block_rsv(fs_info);
4611 }
4612
4613 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4614 {
4615         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4616                                 (u64)-1);
4617         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4618         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4619         WARN_ON(fs_info->trans_block_rsv.size > 0);
4620         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4621         WARN_ON(fs_info->chunk_block_rsv.size > 0);
4622         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4623         WARN_ON(fs_info->delayed_block_rsv.size > 0);
4624         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4625 }
4626
4627 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4628                                   struct btrfs_root *root)
4629 {
4630         if (!trans->block_rsv)
4631                 return;
4632
4633         if (!trans->bytes_reserved)
4634                 return;
4635
4636         trace_btrfs_space_reservation(root->fs_info, "transaction",
4637                                       trans->transid, trans->bytes_reserved, 0);
4638         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4639         trans->bytes_reserved = 0;
4640 }
4641
4642 /* Can only return 0 or -ENOSPC */
4643 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4644                                   struct inode *inode)
4645 {
4646         struct btrfs_root *root = BTRFS_I(inode)->root;
4647         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4648         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4649
4650         /*
4651          * We need to hold space in order to delete our orphan item once we've
4652          * added it, so this takes the reservation so we can release it later
4653          * when we are truly done with the orphan item.
4654          */
4655         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4656         trace_btrfs_space_reservation(root->fs_info, "orphan",
4657                                       btrfs_ino(inode), num_bytes, 1);
4658         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4659 }
4660
4661 void btrfs_orphan_release_metadata(struct inode *inode)
4662 {
4663         struct btrfs_root *root = BTRFS_I(inode)->root;
4664         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4665         trace_btrfs_space_reservation(root->fs_info, "orphan",
4666                                       btrfs_ino(inode), num_bytes, 0);
4667         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4668 }
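
/*
 * Editor's note: an illustrative sketch, not part of the original file.  The
 * two helpers above are meant to be paired: space is taken when the orphan
 * item is added and given back once the orphan item is finally removed.  The
 * surrounding helper is made up for the example.
 */
static int example_orphan_lifetime(struct btrfs_trans_handle *trans,
                                   struct inode *inode)
{
        int ret;

        ret = btrfs_orphan_reserve_metadata(trans, inode);
        if (ret)        /* can only be 0 or -ENOSPC */
                return ret;

        /* ... insert the orphan item and do the truncate/unlink work ... */

        /* Once the orphan item has been deleted again: */
        btrfs_orphan_release_metadata(inode);
        return 0;
}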
4669
4670 /*
4671  * btrfs_subvolume_reserve_metadata() - reserve space for subvolume operation
4672  * root: the root of the parent directory
4673  * rsv: block reservation
4674  * items: the number of items that we need to reserve
4675  * qgroup_reserved: used to return the reserved size in qgroup
4676  *
4677  * This function is used to reserve the space for snapshot/subvolume
4678  * creation and deletion. Those operations are different from the
4679  * common file/directory operations: they change two fs/file trees
4680  * and the root tree, and the number of items that the qgroup reserves is
4681  * different from the free space reservation. So we can not use
4682  * the space reservation mechanism in start_transaction().
4683  */
4684 int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
4685                                      struct btrfs_block_rsv *rsv,
4686                                      int items,
4687                                      u64 *qgroup_reserved)
4688 {
4689         u64 num_bytes;
4690         int ret;
4691
4692         if (root->fs_info->quota_enabled) {
4693                 /* One for parent inode, two for dir entries */
4694                 num_bytes = 3 * root->leafsize;
4695                 ret = btrfs_qgroup_reserve(root, num_bytes);
4696                 if (ret)
4697                         return ret;
4698         } else {
4699                 num_bytes = 0;
4700         }
4701
4702         *qgroup_reserved = num_bytes;
4703
4704         num_bytes = btrfs_calc_trans_metadata_size(root, items);
4705         rsv->space_info = __find_space_info(root->fs_info,
4706                                             BTRFS_BLOCK_GROUP_METADATA);
4707         ret = btrfs_block_rsv_add(root, rsv, num_bytes,
4708                                   BTRFS_RESERVE_FLUSH_ALL);
4709         if (ret) {
4710                 if (*qgroup_reserved)
4711                         btrfs_qgroup_free(root, *qgroup_reserved);
4712         }
4713
4714         return ret;
4715 }
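
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the intended pairing of btrfs_subvolume_reserve_metadata() with
 * btrfs_subvolume_release_metadata() from a hypothetical snapshot-creation
 * path.  The helper name, the item count and the BTRFS_BLOCK_RSV_TEMP rsv
 * type are assumptions made for the example.
 */
static int example_reserve_for_snapshot(struct btrfs_root *parent_root)
{
        struct btrfs_block_rsv *rsv;
        u64 qgroup_reserved = 0;
        int ret;

        rsv = btrfs_alloc_block_rsv(parent_root, BTRFS_BLOCK_RSV_TEMP);
        if (!rsv)
                return -ENOMEM;

        /* Roughly: new root item, dir item/index and parent inode update. */
        ret = btrfs_subvolume_reserve_metadata(parent_root, rsv, 8,
                                               &qgroup_reserved);
        if (ret)
                goto out;

        /* ... create the snapshot using the space held in @rsv ... */

        btrfs_subvolume_release_metadata(parent_root, rsv, qgroup_reserved);
out:
        btrfs_free_block_rsv(parent_root, rsv);
        return ret;
}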
4716
4717 void btrfs_subvolume_release_metadata(struct btrfs_root *root,
4718                                       struct btrfs_block_rsv *rsv,
4719                                       u64 qgroup_reserved)
4720 {
4721         btrfs_block_rsv_release(root, rsv, (u64)-1);
4722         if (qgroup_reserved)
4723                 btrfs_qgroup_free(root, qgroup_reserved);
4724 }
4725
4726 /**
4727  * drop_outstanding_extent - drop an outstanding extent
4728  * @inode: the inode we're dropping the extent for
4729  *
4730  * This is called when we are freeing up an outstanding extent, either
4731  * after an error or after an extent is written.  This will return the number of
4732  * reserved extents that need to be freed.  This must be called with
4733  * BTRFS_I(inode)->lock held.
4734  */
4735 static unsigned drop_outstanding_extent(struct inode *inode)
4736 {
4737         unsigned drop_inode_space = 0;
4738         unsigned dropped_extents = 0;
4739
4740         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4741         BTRFS_I(inode)->outstanding_extents--;
4742
4743         if (BTRFS_I(inode)->outstanding_extents == 0 &&
4744             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4745                                &BTRFS_I(inode)->runtime_flags))
4746                 drop_inode_space = 1;
4747
4748         /*
4749          * If we have at least as many outstanding extents as we have
4750          * reserved, then we need to leave the reserved extents count alone.
4751          */
4752         if (BTRFS_I(inode)->outstanding_extents >=
4753             BTRFS_I(inode)->reserved_extents)
4754                 return drop_inode_space;
4755
4756         dropped_extents = BTRFS_I(inode)->reserved_extents -
4757                 BTRFS_I(inode)->outstanding_extents;
4758         BTRFS_I(inode)->reserved_extents -= dropped_extents;
4759         return dropped_extents + drop_inode_space;
4760 }
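
/*
 * Editor's note: a userspace-only sketch of the arithmetic in
 * drop_outstanding_extent() above, using plain integers instead of the inode
 * fields and ignoring the extra inode-update reservation bit.  For example,
 * with 5 extents reserved and 4 outstanding before one completes, 2
 * reservations can be handed back.
 */
#include <stdio.h>

static unsigned int drop_example(unsigned int *outstanding,
                                 unsigned int *reserved)
{
        unsigned int dropped = 0;

        (*outstanding)--;                       /* one extent completed */
        if (*outstanding < *reserved) {
                dropped = *reserved - *outstanding;
                *reserved -= dropped;           /* free the surplus */
        }
        return dropped;
}

int main(void)
{
        unsigned int outstanding = 4, reserved = 5;

        /* Prints "freed 2 reservations". */
        printf("freed %u reservations\n",
               drop_example(&outstanding, &reserved));
        return 0;
}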
4761
4762 /**
4763  * calc_csum_metadata_size - return the amount of metadata space that must be
4764  *      reserved/freed for the given bytes.
4765  * @inode: the inode we're manipulating
4766  * @num_bytes: the number of bytes in question
4767  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4768  *
4769  * This adjusts the number of csum_bytes in the inode and then returns the
4770  * correct amount of metadata that must either be reserved or freed.  We
4771  * calculate how many checksums we can fit into one leaf and then divide the
4772  * number of bytes that will need to be checksummed by this value to figure out
4773  * how many checksums will be required.  If we are adding bytes then the number
4774  * may go up and we will return the number of additional bytes that must be
4775  * reserved.  If it is going down we will return the number of bytes that must
4776  * be freed.
4777  *
4778  * This must be called with BTRFS_I(inode)->lock held.
4779  */
4780 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4781                                    int reserve)
4782 {
4783         struct btrfs_root *root = BTRFS_I(inode)->root;
4784         u64 csum_size;
4785         int num_csums_per_leaf;
4786         int num_csums;
4787         int old_csums;
4788
4789         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4790             BTRFS_I(inode)->csum_bytes == 0)
4791                 return 0;
4792
4793         old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4794         if (reserve)
4795                 BTRFS_I(inode)->csum_bytes += num_bytes;
4796         else
4797                 BTRFS_I(inode)->csum_bytes -= num_bytes;
4798         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4799         num_csums_per_leaf = (int)div64_u64(csum_size,
4800                                             sizeof(struct btrfs_csum_item) +
4801                                             sizeof(struct btrfs_disk_key));
4802         num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4803         num_csums = num_csums + num_csums_per_leaf - 1;
4804         num_csums = num_csums / num_csums_per_leaf;
4805
4806         old_csums = old_csums + num_csums_per_leaf - 1;
4807         old_csums = old_csums / num_csums_per_leaf;
4808
4809         /* No change, no need to reserve more */
4810         if (old_csums == num_csums)
4811                 return 0;
4812
4813         if (reserve)
4814                 return btrfs_calc_trans_metadata_size(root,
4815                                                       num_csums - old_csums);
4816
4817         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4818 }
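
/*
 * Editor's note: a userspace-only sketch of the "how many leaves of
 * checksums" estimate made by calc_csum_metadata_size() above.  The concrete
 * numbers (4KiB sectors, 200 worst-case checksum entries per leaf, 1MiB then
 * 2MiB of bytes to checksum) are made up for illustration only; the kernel
 * reserves btrfs_calc_trans_metadata_size(root, delta) for the change in
 * leaves.
 */
#include <stdio.h>
#include <stdint.h>

static uint64_t csum_leaves_needed(uint64_t csum_bytes, uint64_t sectorsize,
                                   uint64_t csums_per_leaf)
{
        /* One checksum per sector, then round up to whole leaves. */
        uint64_t num_csums = csum_bytes / sectorsize;

        return (num_csums + csums_per_leaf - 1) / csums_per_leaf;
}

int main(void)
{
        uint64_t before = csum_leaves_needed(1ULL << 20, 4096, 200);
        uint64_t after  = csum_leaves_needed(2ULL << 20, 4096, 200);

        /* Prints before=2 after=3 delta=1: one extra leaf must be reserved. */
        printf("leaves before=%llu after=%llu delta=%llu\n",
               (unsigned long long)before, (unsigned long long)after,
               (unsigned long long)(after - before));
        return 0;
}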
4819
4820 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4821 {
4822         struct btrfs_root *root = BTRFS_I(inode)->root;
4823         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4824         u64 to_reserve = 0;
4825         u64 csum_bytes;
4826         unsigned nr_extents = 0;
4827         int extra_reserve = 0;
4828         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4829         int ret = 0;
4830         bool delalloc_lock = true;
4831         u64 to_free = 0;
4832         unsigned dropped;
4833
4834         /* If we are a free space inode we must not flush, since we will be in
4835          * the middle of a transaction commit.  We also don't need the delalloc
4836          * mutex since we won't race with anybody.  We need this mostly to make
4837          * lockdep shut its filthy mouth.
4838          */
4839         if (btrfs_is_free_space_inode(inode)) {
4840                 flush = BTRFS_RESERVE_NO_FLUSH;
4841                 delalloc_lock = false;
4842         }
4843
4844         if (flush != BTRFS_RESERVE_NO_FLUSH &&
4845             btrfs_transaction_in_commit(root->fs_info))
4846                 schedule_timeout(1);
4847
4848         if (delalloc_lock)
4849                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4850
4851         num_bytes = ALIGN(num_bytes, root->sectorsize);
4852
4853         spin_lock(&BTRFS_I(inode)->lock);
4854         BTRFS_I(inode)->outstanding_extents++;
4855
4856         if (BTRFS_I(inode)->outstanding_extents >
4857             BTRFS_I(inode)->reserved_extents)
4858                 nr_extents = BTRFS_I(inode)->outstanding_extents -
4859                         BTRFS_I(inode)->reserved_extents;
4860
4861         /*
4862          * Add an item to reserve for updating the inode when we complete the
4863          * delalloc io.
4864          */
4865         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4866                       &BTRFS_I(inode)->runtime_flags)) {
4867                 nr_extents++;
4868                 extra_reserve = 1;
4869         }
4870
4871         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4872         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4873         csum_bytes = BTRFS_I(inode)->csum_bytes;
4874         spin_unlock(&BTRFS_I(inode)->lock);
4875
4876         if (root->fs_info->quota_enabled) {
4877                 ret = btrfs_qgroup_reserve(root, num_bytes +
4878                                            nr_extents * root->leafsize);
4879                 if (ret)
4880                         goto out_fail;
4881         }
4882
4883         ret = reserve_metadata_bytes(root, block_rsv, to_reserve, flush);
4884         if (unlikely(ret)) {
4885                 if (root->fs_info->quota_enabled)
4886                         btrfs_qgroup_free(root, num_bytes +
4887                                                 nr_extents * root->leafsize);
4888                 goto out_fail;
4889         }
4890
4891         spin_lock(&BTRFS_I(inode)->lock);
4892         if (extra_reserve) {
4893                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4894                         &BTRFS_I(inode)->runtime_flags);
4895                 nr_extents--;
4896         }
4897         BTRFS_I(inode)->reserved_extents += nr_extents;
4898         spin_unlock(&BTRFS_I(inode)->lock);
4899
4900         if (delalloc_lock)
4901                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4902
4903         if (to_reserve)
4904                 trace_btrfs_space_reservation(root->fs_info,"delalloc",
4905                                               btrfs_ino(inode), to_reserve, 1);
4906         block_rsv_add_bytes(block_rsv, to_reserve, 1);
4907
4908         return 0;
4909
4910 out_fail:
4911         spin_lock(&BTRFS_I(inode)->lock);
4912         dropped = drop_outstanding_extent(inode);
4913         /*
4914          * If the inode's csum_bytes is the same as the original
4915          * csum_bytes then we know we haven't raced with any freers,
4916          * so we can just reduce our inode's csum bytes and carry on.
4917          */
4918         if (BTRFS_I(inode)->csum_bytes == csum_bytes) {
4919                 calc_csum_metadata_size(inode, num_bytes, 0);
4920         } else {
4921                 u64 orig_csum_bytes = BTRFS_I(inode)->csum_bytes;
4922                 u64 bytes;
4923
4924                 /*
4925                  * This is tricky, but first we need to figure out how much we
4926                  * freed from any freers that occurred during this
4927                  * reservation, so we reset ->csum_bytes to the csum_bytes
4928                  * before we dropped our lock, and then call the free for the
4929                  * number of bytes that were freed while we were trying our
4930                  * reservation.
4931                  */
4932                 bytes = csum_bytes - BTRFS_I(inode)->csum_bytes;
4933                 BTRFS_I(inode)->csum_bytes = csum_bytes;
4934                 to_free = calc_csum_metadata_size(inode, bytes, 0);
4935
4936
4937                 /*
4938                  * Now we need to see how much we would have freed had we not
4939                  * been making this reservation and our ->csum_bytes were not
4940                  * artificially inflated.
4941                  */
4942                 BTRFS_I(inode)->csum_bytes = csum_bytes - num_bytes;
4943                 bytes = csum_bytes - orig_csum_bytes;
4944                 bytes = calc_csum_metadata_size(inode, bytes, 0);
4945
4946                 /*
4947                  * Now reset ->csum_bytes to what it should be.  If bytes is
4948                  * more than to_free then we would have freed more space had we
4949                  * not had an artificially high ->csum_bytes, so we need to free
4950                  * the remainder.  If bytes is the same or less, then we don't
4951                  * need to do anything; the other freers did the correct
4952                  * thing.
4953                  */
4954                 BTRFS_I(inode)->csum_bytes = orig_csum_bytes - num_bytes;
4955                 if (bytes > to_free)
4956                         to_free = bytes - to_free;
4957                 else
4958                         to_free = 0;
4959         }
4960         spin_unlock(&BTRFS_I(inode)->lock);
4961         if (dropped)
4962                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4963
4964         if (to_free) {
4965                 btrfs_block_rsv_release(root, block_rsv, to_free);
4966                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4967                                               btrfs_ino(inode), to_free, 0);
4968         }
4969         if (delalloc_lock)
4970                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4971         return ret;
4972 }
4973
4974 /**
4975  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4976  * @inode: the inode to release the reservation for
4977  * @num_bytes: the number of bytes we're releasing
4978  *
4979  * This will release the metadata reservation for an inode.  This can be called
4980  * once we complete IO for a given set of bytes to release their metadata
4981  * reservations.
4982  */
4983 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4984 {
4985         struct btrfs_root *root = BTRFS_I(inode)->root;
4986         u64 to_free = 0;
4987         unsigned dropped;
4988
4989         num_bytes = ALIGN(num_bytes, root->sectorsize);
4990         spin_lock(&BTRFS_I(inode)->lock);
4991         dropped = drop_outstanding_extent(inode);
4992
4993         if (num_bytes)
4994                 to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4995         spin_unlock(&BTRFS_I(inode)->lock);
4996         if (dropped > 0)
4997                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4998
4999         trace_btrfs_space_reservation(root->fs_info, "delalloc",
5000                                       btrfs_ino(inode), to_free, 0);
5001         if (root->fs_info->quota_enabled) {
5002                 btrfs_qgroup_free(root, num_bytes +
5003                                         dropped * root->leafsize);
5004         }
5005
5006         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
5007                                 to_free);
5008 }
5009
5010 /**
5011  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
5012  * @inode: inode we're writing to
5013  * @num_bytes: the number of bytes we want to allocate
5014  *
5015  * This will do the following things
5016  *
5017  * o reserve space in the data space info for num_bytes
5018  * o reserve space in the metadata space info based on number of outstanding
5019  *   extents and how much csums will be needed
5020  * o add to the inode's ->delalloc_bytes
5021  * o add it to the fs_info's delalloc inodes list.
5022  *
5023  * This will return 0 for success and -ENOSPC if there is no space left.
5024  */
5025 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
5026 {
5027         int ret;
5028
5029         ret = btrfs_check_data_free_space(inode, num_bytes);
5030         if (ret)
5031                 return ret;
5032
5033         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
5034         if (ret) {
5035                 btrfs_free_reserved_data_space(inode, num_bytes);
5036                 return ret;
5037         }
5038
5039         return 0;
5040 }
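
/*
 * Editor's note: an illustrative sketch, not part of the original file.  A
 * hypothetical buffered-write path would pair the reserve/release helpers
 * above roughly like this; the function name and structure are made up for
 * the example.
 */
static int example_prepare_write(struct inode *inode, u64 len)
{
        struct btrfs_root *root = BTRFS_I(inode)->root;
        u64 reserved = ALIGN(len, root->sectorsize);
        int ret;

        /* Data space plus worst-case metadata (extents, csums, inode item). */
        ret = btrfs_delalloc_reserve_space(inode, reserved);
        if (ret)
                return ret;             /* typically -ENOSPC */

        /*
         * ... dirty the pages for this range here; if that fails, undo both
         * reservations with btrfs_delalloc_release_space(inode, reserved) ...
         */

        /*
         * On success the data reservation is consumed as the pages are
         * written, and the metadata part is released later, once the delalloc
         * IO for this range completes (btrfs_delalloc_release_metadata()).
         */
        return 0;
}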
5041
5042 /**
5043  * btrfs_delalloc_release_space - release data and metadata space for delalloc
5044  * @inode: inode we're releasing space for
5045  * @num_bytes: the number of bytes we want to free up
5046  *
5047  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
5048  * called in the case that we don't need the metadata AND data reservations
5049  * anymore, for example if there is an error or we insert an inline extent.
5050  *
5051  * This function will release the metadata space that was not used and will
5052  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
5053  * list if there are no delalloc bytes left.
5054  */
5055 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
5056 {
5057         btrfs_delalloc_release_metadata(inode, num_bytes);
5058         btrfs_free_reserved_data_space(inode, num_bytes);
5059 }
5060
5061 static int update_block_group(struct btrfs_root *root,
5062                               u64 bytenr, u64 num_bytes, int alloc)
5063 {
5064         struct btrfs_block_group_cache *cache = NULL;
5065         struct btrfs_fs_info *info = root->fs_info;
5066         u64 total = num_bytes;
5067         u64 old_val;
5068         u64 byte_in_group;
5069         int factor;
5070
5071         /* block accounting for super block */
5072         spin_lock(&info->delalloc_lock);
5073         old_val = btrfs_super_bytes_used(info->super_copy);
5074         if (alloc)
5075                 old_val += num_bytes;
5076         else
5077                 old_val -= num_bytes;
5078         btrfs_set_super_bytes_used(info->super_copy, old_val);
5079         spin_unlock(&info->delalloc_lock);
5080
5081         while (total) {
5082                 cache = btrfs_lookup_block_group(info, bytenr);
5083                 if (!cache)
5084                         return -ENOENT;
5085                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
5086                                     BTRFS_BLOCK_GROUP_RAID1 |
5087                                     BTRFS_BLOCK_GROUP_RAID10))
5088                         factor = 2;
5089                 else
5090                         factor = 1;
5091                 /*
5092                  * If this block group has free space cache written out, we
5093                  * need to make sure to load it if we are removing space.  This
5094                  * is because we need the unpinning stage to actually add the
5095                  * space back to the block group, otherwise we will leak space.
5096                  */
5097                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
5098                         cache_block_group(cache, 1);
5099
5100                 byte_in_group = bytenr - cache->key.objectid;
5101                 WARN_ON(byte_in_group > cache->key.offset);
5102
5103                 spin_lock(&cache->space_info->lock);
5104                 spin_lock(&cache->lock);
5105
5106                 if (btrfs_test_opt(root, SPACE_CACHE) &&
5107                     cache->disk_cache_state < BTRFS_DC_CLEAR)
5108                         cache->disk_cache_state = BTRFS_DC_CLEAR;
5109
5110                 cache->dirty = 1;
5111                 old_val = btrfs_block_group_used(&cache->item);
5112                 num_bytes = min(total, cache->key.offset - byte_in_group);
5113                 if (alloc) {
5114                         old_val += num_bytes;
5115                         btrfs_set_block_group_used(&cache->item, old_val);
5116                         cache->reserved -= num_bytes;
5117                         cache->space_info->bytes_reserved -= num_bytes;
5118                         cache->space_info->bytes_used += num_bytes;
5119                         cache->space_info->disk_used += num_bytes * factor;
5120                         spin_unlock(&cache->lock);
5121                         spin_unlock(&cache->space_info->lock);
5122                 } else {
5123                         old_val -= num_bytes;
5124                         btrfs_set_block_group_used(&cache->item, old_val);
5125                         cache->pinned += num_bytes;
5126                         cache->space_info->bytes_pinned += num_bytes;
5127                         cache->space_info->bytes_used -= num_bytes;
5128                         cache->space_info->disk_used -= num_bytes * factor;
5129                         spin_unlock(&cache->lock);
5130                         spin_unlock(&cache->space_info->lock);
5131
5132                         set_extent_dirty(info->pinned_extents,
5133                                          bytenr, bytenr + num_bytes - 1,
5134                                          GFP_NOFS | __GFP_NOFAIL);
5135                 }
5136                 btrfs_put_block_group(cache);
5137                 total -= num_bytes;
5138                 bytenr += num_bytes;
5139         }
5140         return 0;
5141 }
5142
5143 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
5144 {
5145         struct btrfs_block_group_cache *cache;
5146         u64 bytenr;
5147
5148         spin_lock(&root->fs_info->block_group_cache_lock);
5149         bytenr = root->fs_info->first_logical_byte;
5150         spin_unlock(&root->fs_info->block_group_cache_lock);
5151
5152         if (bytenr < (u64)-1)
5153                 return bytenr;
5154
5155         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
5156         if (!cache)
5157                 return 0;
5158
5159         bytenr = cache->key.objectid;
5160         btrfs_put_block_group(cache);
5161
5162         return bytenr;
5163 }
5164
5165 static int pin_down_extent(struct btrfs_root *root,
5166                            struct btrfs_block_group_cache *cache,
5167                            u64 bytenr, u64 num_bytes, int reserved)
5168 {
5169         spin_lock(&cache->space_info->lock);
5170         spin_lock(&cache->lock);
5171         cache->pinned += num_bytes;
5172         cache->space_info->bytes_pinned += num_bytes;
5173         if (reserved) {
5174                 cache->reserved -= num_bytes;
5175                 cache->space_info->bytes_reserved -= num_bytes;
5176         }
5177         spin_unlock(&cache->lock);
5178         spin_unlock(&cache->space_info->lock);
5179
5180         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
5181                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
5182         return 0;
5183 }
5184
5185 /*
5186  * this function must be called within transaction
5187  */
5188 int btrfs_pin_extent(struct btrfs_root *root,
5189                      u64 bytenr, u64 num_bytes, int reserved)
5190 {
5191         struct btrfs_block_group_cache *cache;
5192
5193         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5194         BUG_ON(!cache); /* Logic error */
5195
5196         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
5197
5198         btrfs_put_block_group(cache);
5199         return 0;
5200 }
5201
5202 /*
5203  * this function must be called within transaction
5204  */
5205 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
5206                                     u64 bytenr, u64 num_bytes)
5207 {
5208         struct btrfs_block_group_cache *cache;
5209
5210         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
5211         BUG_ON(!cache); /* Logic error */
5212
5213         /*
5214          * pull in the free space cache (if any) so that our pin
5215          * removes the free space from the cache.  We have load_only set
5216          * to one because the slow code to read in the free extents does check
5217          * the pinned extents.
5218          */
5219         cache_block_group(cache, 1);
5220
5221         pin_down_extent(root, cache, bytenr, num_bytes, 0);
5222
5223         /* remove us from the free space cache (if we're there at all) */
5224         btrfs_remove_free_space(cache, bytenr, num_bytes);
5225         btrfs_put_block_group(cache);
5226         return 0;
5227 }
5228
5229 /**
5230  * btrfs_update_reserved_bytes - update the block_group and space info counters
5231  * @cache:      The cache we are manipulating
5232  * @num_bytes:  The number of bytes in question
5233  * @reserve:    One of the reservation enums
5234  *
5235  * This is called by the allocator when it reserves space, or by somebody who is
5236  * freeing space that was never actually used on disk.  For example if you
5237  * reserve some space for a new leaf in transaction A and before transaction A
5238  * commits you free that leaf, you call this with reserve set to 0 in order to
5239  * clear the reservation.
5240  *
5241  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
5242  * ENOSPC accounting.  For data we handle the reservation through clearing the
5243  * delalloc bits in the io_tree.  We have to do this since we could end up
5244  * allocating less disk space for the amount of data we have reserved in the
5245  * case of compression.
5246  *
5247  * If this is a reservation and the block group has become read only we cannot
5248  * make the reservation and return -EAGAIN, otherwise this function always
5249  * succeeds.
5250  */
5251 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
5252                                        u64 num_bytes, int reserve)
5253 {
5254         struct btrfs_space_info *space_info = cache->space_info;
5255         int ret = 0;
5256
5257         spin_lock(&space_info->lock);
5258         spin_lock(&cache->lock);
5259         if (reserve != RESERVE_FREE) {
5260                 if (cache->ro) {
5261                         ret = -EAGAIN;
5262                 } else {
5263                         cache->reserved += num_bytes;
5264                         space_info->bytes_reserved += num_bytes;
5265                         if (reserve == RESERVE_ALLOC) {
5266                                 trace_btrfs_space_reservation(cache->fs_info,
5267                                                 "space_info", space_info->flags,
5268                                                 num_bytes, 0);
5269                                 space_info->bytes_may_use -= num_bytes;
5270                         }
5271                 }
5272         } else {
5273                 if (cache->ro)
5274                         space_info->bytes_readonly += num_bytes;
5275                 cache->reserved -= num_bytes;
5276                 space_info->bytes_reserved -= num_bytes;
5277                 space_info->reservation_progress++;
5278         }
5279         spin_unlock(&cache->lock);
5280         spin_unlock(&space_info->lock);
5281         return ret;
5282 }
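
/*
 * Editor's note: an illustrative sketch, not part of the original file.  It
 * shows the pairing described in the comment above: space reserved at
 * allocation time with RESERVE_ALLOC must be handed back with RESERVE_FREE if
 * the extent ends up never being used on disk.  The helper name and error
 * handling are made up for the example.
 */
static int example_reserve_then_abandon(struct btrfs_block_group_cache *cache,
                                        u64 num_bytes)
{
        int ret;

        /* Reserve for a new metadata extent; updates ENOSPC accounting. */
        ret = btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_ALLOC);
        if (ret)                /* -EAGAIN if the block group went read only */
                return ret;

        /* ... we decide not to use the extent after all ... */

        /* Clear the reservation again. */
        btrfs_update_reserved_bytes(cache, num_bytes, RESERVE_FREE);
        return 0;
}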
5283
5284 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5285                                 struct btrfs_root *root)
5286 {
5287         struct btrfs_fs_info *fs_info = root->fs_info;
5288         struct btrfs_caching_control *next;
5289         struct btrfs_caching_control *caching_ctl;
5290         struct btrfs_block_group_cache *cache;
5291
5292         down_write(&fs_info->extent_commit_sem);
5293
5294         list_for_each_entry_safe(caching_ctl, next,
5295                                  &fs_info->caching_block_groups, list) {
5296                 cache = caching_ctl->block_group;
5297                 if (block_group_cache_done(cache)) {
5298                         cache->last_byte_to_unpin = (u64)-1;
5299                         list_del_init(&caching_ctl->list);
5300                         put_caching_control(caching_ctl);
5301                 } else {
5302                         cache->last_byte_to_unpin = caching_ctl->progress;
5303                 }
5304         }
5305
5306         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5307                 fs_info->pinned_extents = &fs_info->freed_extents[1];
5308         else
5309                 fs_info->pinned_extents = &fs_info->freed_extents[0];
5310
5311         up_write(&fs_info->extent_commit_sem);
5312
5313         update_global_block_rsv(fs_info);
5314 }
5315
5316 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5317 {
5318         struct btrfs_fs_info *fs_info = root->fs_info;
5319         struct btrfs_block_group_cache *cache = NULL;
5320         struct btrfs_space_info *space_info;
5321         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5322         u64 len;
5323         bool readonly;
5324
5325         while (start <= end) {
5326                 readonly = false;
5327                 if (!cache ||
5328                     start >= cache->key.objectid + cache->key.offset) {
5329                         if (cache)
5330                                 btrfs_put_block_group(cache);
5331                         cache = btrfs_lookup_block_group(fs_info, start);
5332                         BUG_ON(!cache); /* Logic error */
5333                 }
5334
5335                 len = cache->key.objectid + cache->key.offset - start;
5336                 len = min(len, end + 1 - start);
5337
5338                 if (start < cache->last_byte_to_unpin) {
5339                         len = min(len, cache->last_byte_to_unpin - start);
5340                         btrfs_add_free_space(cache, start, len);
5341                 }
5342
5343                 start += len;
5344                 space_info = cache->space_info;
5345
5346                 spin_lock(&space_info->lock);
5347                 spin_lock(&cache->lock);
5348                 cache->pinned -= len;
5349                 space_info->bytes_pinned -= len;
5350                 if (cache->ro) {
5351                         space_info->bytes_readonly += len;
5352                         readonly = true;
5353                 }
5354                 spin_unlock(&cache->lock);
5355                 if (!readonly && global_rsv->space_info == space_info) {
5356                         spin_lock(&global_rsv->lock);
5357                         if (!global_rsv->full) {
5358                                 len = min(len, global_rsv->size -
5359                                           global_rsv->reserved);
5360                                 global_rsv->reserved += len;
5361                                 space_info->bytes_may_use += len;
5362                                 if (global_rsv->reserved >= global_rsv->size)
5363                                         global_rsv->full = 1;
5364                         }
5365                         spin_unlock(&global_rsv->lock);
5366                 }
5367                 spin_unlock(&space_info->lock);
5368         }
5369
5370         if (cache)
5371                 btrfs_put_block_group(cache);
5372         return 0;
5373 }
5374
5375 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5376                                struct btrfs_root *root)
5377 {
5378         struct btrfs_fs_info *fs_info = root->fs_info;
5379         struct extent_io_tree *unpin;
5380         u64 start;
5381         u64 end;
5382         int ret;
5383
5384         if (trans->aborted)
5385                 return 0;
5386
5387         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5388                 unpin = &fs_info->freed_extents[1];
5389         else
5390                 unpin = &fs_info->freed_extents[0];
5391
5392         while (1) {
5393                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5394                                             EXTENT_DIRTY, NULL);
5395                 if (ret)
5396                         break;
5397
5398                 if (btrfs_test_opt(root, DISCARD))
5399                         ret = btrfs_discard_extent(root, start,
5400                                                    end + 1 - start, NULL);
5401
5402                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5403                 unpin_extent_range(root, start, end);
5404                 cond_resched();
5405         }
5406
5407         return 0;
5408 }
5409
5410 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5411                                 struct btrfs_root *root,
5412                                 u64 bytenr, u64 num_bytes, u64 parent,
5413                                 u64 root_objectid, u64 owner_objectid,
5414                                 u64 owner_offset, int refs_to_drop,
5415                                 struct btrfs_delayed_extent_op *extent_op)
5416 {
5417         struct btrfs_key key;
5418         struct btrfs_path *path;
5419         struct btrfs_fs_info *info = root->fs_info;
5420         struct btrfs_root *extent_root = info->extent_root;
5421         struct extent_buffer *leaf;
5422         struct btrfs_extent_item *ei;
5423         struct btrfs_extent_inline_ref *iref;
5424         int ret;
5425         int is_data;
5426         int extent_slot = 0;
5427         int found_extent = 0;
5428         int num_to_del = 1;
5429         u32 item_size;
5430         u64 refs;
5431         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
5432                                                  SKINNY_METADATA);
5433
5434         path = btrfs_alloc_path();
5435         if (!path)
5436                 return -ENOMEM;
5437
5438         path->reada = 1;
5439         path->leave_spinning = 1;
5440
5441         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5442         BUG_ON(!is_data && refs_to_drop != 1);
5443
5444         if (is_data)
5445                 skinny_metadata = 0;
5446
5447         ret = lookup_extent_backref(trans, extent_root, path, &iref,
5448                                     bytenr, num_bytes, parent,
5449                                     root_objectid, owner_objectid,
5450                                     owner_offset);
5451         if (ret == 0) {
5452                 extent_slot = path->slots[0];
5453                 while (extent_slot >= 0) {
5454                         btrfs_item_key_to_cpu(path->nodes[0], &key,
5455                                               extent_slot);
5456                         if (key.objectid != bytenr)
5457                                 break;
5458                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5459                             key.offset == num_bytes) {
5460                                 found_extent = 1;
5461                                 break;
5462                         }
5463                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
5464                             key.offset == owner_objectid) {
5465                                 found_extent = 1;
5466                                 break;
5467                         }
5468                         if (path->slots[0] - extent_slot > 5)
5469                                 break;
5470                         extent_slot--;
5471                 }
5472 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5473                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5474                 if (found_extent && item_size < sizeof(*ei))
5475                         found_extent = 0;
5476 #endif
5477                 if (!found_extent) {
5478                         BUG_ON(iref);
5479                         ret = remove_extent_backref(trans, extent_root, path,
5480                                                     NULL, refs_to_drop,
5481                                                     is_data);
5482                         if (ret) {
5483                                 btrfs_abort_transaction(trans, extent_root, ret);
5484                                 goto out;
5485                         }
5486                         btrfs_release_path(path);
5487                         path->leave_spinning = 1;
5488
5489                         key.objectid = bytenr;
5490                         key.type = BTRFS_EXTENT_ITEM_KEY;
5491                         key.offset = num_bytes;
5492
5493                         if (!is_data && skinny_metadata) {
5494                                 key.type = BTRFS_METADATA_ITEM_KEY;
5495                                 key.offset = owner_objectid;
5496                         }
5497
5498                         ret = btrfs_search_slot(trans, extent_root,
5499                                                 &key, path, -1, 1);
5500                         if (ret > 0 && skinny_metadata && path->slots[0]) {
5501                                 /*
5502                                  * Couldn't find our skinny metadata item,
5503                                  * see if we have ye olde extent item.
5504                                  */
5505                                 path->slots[0]--;
5506                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
5507                                                       path->slots[0]);
5508                                 if (key.objectid == bytenr &&
5509                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
5510                                     key.offset == num_bytes)
5511                                         ret = 0;
5512                         }
5513
5514                         if (ret > 0 && skinny_metadata) {
5515                                 skinny_metadata = false;
5516                                 key.type = BTRFS_EXTENT_ITEM_KEY;
5517                                 key.offset = num_bytes;
5518                                 btrfs_release_path(path);
5519                                 ret = btrfs_search_slot(trans, extent_root,
5520                                                         &key, path, -1, 1);
5521                         }
5522
5523                         if (ret) {
5524                                 btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5525                                         ret, (unsigned long long)bytenr);
5526                                 if (ret > 0)
5527                                         btrfs_print_leaf(extent_root,
5528                                                          path->nodes[0]);
5529                         }
5530                         if (ret < 0) {
5531                                 btrfs_abort_transaction(trans, extent_root, ret);
5532                                 goto out;
5533                         }
5534                         extent_slot = path->slots[0];
5535                 }
5536         } else if (ret == -ENOENT) {
5537                 btrfs_print_leaf(extent_root, path->nodes[0]);
5538                 WARN_ON(1);
5539                 btrfs_err(info,
5540                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
5541                         (unsigned long long)bytenr,
5542                         (unsigned long long)parent,
5543                         (unsigned long long)root_objectid,
5544                         (unsigned long long)owner_objectid,
5545                         (unsigned long long)owner_offset);
5546         } else {
5547                 btrfs_abort_transaction(trans, extent_root, ret);
5548                 goto out;
5549         }
5550
5551         leaf = path->nodes[0];
5552         item_size = btrfs_item_size_nr(leaf, extent_slot);
5553 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5554         if (item_size < sizeof(*ei)) {
5555                 BUG_ON(found_extent || extent_slot != path->slots[0]);
5556                 ret = convert_extent_item_v0(trans, extent_root, path,
5557                                              owner_objectid, 0);
5558                 if (ret < 0) {
5559                         btrfs_abort_transaction(trans, extent_root, ret);
5560                         goto out;
5561                 }
5562
5563                 btrfs_release_path(path);
5564                 path->leave_spinning = 1;
5565
5566                 key.objectid = bytenr;
5567                 key.type = BTRFS_EXTENT_ITEM_KEY;
5568                 key.offset = num_bytes;
5569
5570                 ret = btrfs_search_slot(trans, extent_root, &key, path,
5571                                         -1, 1);
5572                 if (ret) {
5573                         btrfs_err(info, "umm, got %d back from search, was looking for %llu",
5574                                 ret, (unsigned long long)bytenr);
5575                         btrfs_print_leaf(extent_root, path->nodes[0]);
5576                 }
5577                 if (ret < 0) {
5578                         btrfs_abort_transaction(trans, extent_root, ret);
5579                         goto out;
5580                 }
5581
5582                 extent_slot = path->slots[0];
5583                 leaf = path->nodes[0];
5584                 item_size = btrfs_item_size_nr(leaf, extent_slot);
5585         }
5586 #endif
5587         BUG_ON(item_size < sizeof(*ei));
5588         ei = btrfs_item_ptr(leaf, extent_slot,
5589                             struct btrfs_extent_item);
5590         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
5591             key.type == BTRFS_EXTENT_ITEM_KEY) {
5592                 struct btrfs_tree_block_info *bi;
5593                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5594                 bi = (struct btrfs_tree_block_info *)(ei + 1);
5595                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5596         }
5597
5598         refs = btrfs_extent_refs(leaf, ei);
5599         BUG_ON(refs < refs_to_drop);
5600         refs -= refs_to_drop;
5601
5602         if (refs > 0) {
5603                 if (extent_op)
5604                         __run_delayed_extent_op(extent_op, leaf, ei);
5605                 /*
5606                  * In the case of an inline back ref, the reference count
5607                  * will be updated by remove_extent_backref
5608                  */
5609                 if (iref) {
5610                         BUG_ON(!found_extent);
5611                 } else {
5612                         btrfs_set_extent_refs(leaf, ei, refs);
5613                         btrfs_mark_buffer_dirty(leaf);
5614                 }
5615                 if (found_extent) {
5616                         ret = remove_extent_backref(trans, extent_root, path,
5617                                                     iref, refs_to_drop,
5618                                                     is_data);
5619                         if (ret) {
5620                                 btrfs_abort_transaction(trans, extent_root, ret);
5621                                 goto out;
5622                         }
5623                 }
5624         } else {
5625                 if (found_extent) {
5626                         BUG_ON(is_data && refs_to_drop !=
5627                                extent_data_ref_count(root, path, iref));
5628                         if (iref) {
5629                                 BUG_ON(path->slots[0] != extent_slot);
5630                         } else {
5631                                 BUG_ON(path->slots[0] != extent_slot + 1);
5632                                 path->slots[0] = extent_slot;
5633                                 num_to_del = 2;
5634                         }
5635                 }
5636
5637                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5638                                       num_to_del);
5639                 if (ret) {
5640                         btrfs_abort_transaction(trans, extent_root, ret);
5641                         goto out;
5642                 }
5643                 btrfs_release_path(path);
5644
5645                 if (is_data) {
5646                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5647                         if (ret) {
5648                                 btrfs_abort_transaction(trans, extent_root, ret);
5649                                 goto out;
5650                         }
5651                 }
5652
5653                 ret = update_block_group(root, bytenr, num_bytes, 0);
5654                 if (ret) {
5655                         btrfs_abort_transaction(trans, extent_root, ret);
5656                         goto out;
5657                 }
5658         }
5659 out:
5660         btrfs_free_path(path);
5661         return ret;
5662 }
5663
5664 /*
5665  * when we free a block, it is possible (and likely) that we free the last
5666  * delayed ref for that extent as well.  This searches the delayed ref tree for
5667  * a given extent, and if there are no other delayed refs to be processed, it
5668  * removes it from the tree.
5669  */
5670 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5671                                       struct btrfs_root *root, u64 bytenr)
5672 {
5673         struct btrfs_delayed_ref_head *head;
5674         struct btrfs_delayed_ref_root *delayed_refs;
5675         struct btrfs_delayed_ref_node *ref;
5676         struct rb_node *node;
5677         int ret = 0;
5678
5679         delayed_refs = &trans->transaction->delayed_refs;
5680         spin_lock(&delayed_refs->lock);
5681         head = btrfs_find_delayed_ref_head(trans, bytenr);
5682         if (!head)
5683                 goto out;
5684
5685         node = rb_prev(&head->node.rb_node);
5686         if (!node)
5687                 goto out;
5688
5689         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5690
5691         /* there are still entries for this ref, we can't drop it */
5692         if (ref->bytenr == bytenr)
5693                 goto out;
5694
5695         if (head->extent_op) {
5696                 if (!head->must_insert_reserved)
5697                         goto out;
5698                 btrfs_free_delayed_extent_op(head->extent_op);
5699                 head->extent_op = NULL;
5700         }
5701
5702         /*
5703          * waiting for the lock here would deadlock.  If someone else has it
5704          * locked, they are already in the process of dropping it anyway
5705          */
5706         if (!mutex_trylock(&head->mutex))
5707                 goto out;
5708
5709         /*
5710          * at this point we have a head with no other entries.  Go
5711          * ahead and process it.
5712          */
5713         head->node.in_tree = 0;
5714         rb_erase(&head->node.rb_node, &delayed_refs->root);
5715
5716         delayed_refs->num_entries--;
5717
5718         /*
5719          * we don't take a ref on the node because we're removing it from the
5720          * tree, so we just steal the ref the tree was holding.
5721          */
5722         delayed_refs->num_heads--;
5723         if (list_empty(&head->cluster))
5724                 delayed_refs->num_heads_ready--;
5725
5726         list_del_init(&head->cluster);
5727         spin_unlock(&delayed_refs->lock);
5728
5729         BUG_ON(head->extent_op);
5730         if (head->must_insert_reserved)
5731                 ret = 1;
5732
5733         mutex_unlock(&head->mutex);
5734         btrfs_put_delayed_ref(&head->node);
5735         return ret;
5736 out:
5737         spin_unlock(&delayed_refs->lock);
5738         return 0;
5739 }
5740
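/*
 * Free a tree block we no longer reference.  For non-log trees a
 * delayed DROP ref is queued; if this was also the last reference and
 * the block was created in the current transaction, its space can be
 * handed straight back to the free space cache (or pinned, if the
 * block already hit disk).  Everything else is left to the delayed
 * ref code.
 */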
5741 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5742                            struct btrfs_root *root,
5743                            struct extent_buffer *buf,
5744                            u64 parent, int last_ref)
5745 {
5746         struct btrfs_block_group_cache *cache = NULL;
5747         int ret;
5748
5749         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5750                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5751                                         buf->start, buf->len,
5752                                         parent, root->root_key.objectid,
5753                                         btrfs_header_level(buf),
5754                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
5755                 BUG_ON(ret); /* -ENOMEM */
5756         }
5757
5758         if (!last_ref)
5759                 return;
5760
5761         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5762
5763         if (btrfs_header_generation(buf) == trans->transid) {
5764                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
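                        /*
                         * check_ref_cleanup() returns 1 only if it stole the
                         * last delayed ref head for this block and that head
                         * still had must_insert_reserved set, i.e. the extent
                         * item was never written into the extent tree.  In
                         * that case nobody else will free the block, so fall
                         * through and handle it here; otherwise the delayed
                         * ref code owns it and we are done.
                         */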
5765                         ret = check_ref_cleanup(trans, root, buf->start);
5766                         if (!ret)
5767                                 goto out;
5768                 }
5769
5770                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5771                         pin_down_extent(root, cache, buf->start, buf->len, 1);
5772                         goto out;
5773                 }
5774
5775                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5776
5777                 btrfs_add_free_space(cache, buf->start, buf->len);
5778                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5779         }
5780 out:
5781         /*
5782          * We are deleting the buffer, so clear the corrupt flag since it
5783          * doesn't matter anymore.
5784          */
5785         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5786         btrfs_put_block_group(cache);
5787 }
5788
5789 /* Can return -ENOMEM */
5790 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5791                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5792                       u64 owner, u64 offset, int for_cow)
5793 {
5794         int ret;
5795         struct btrfs_fs_info *fs_info = root->fs_info;
5796
5797         /*
5798          * tree log blocks never actually go into the extent allocation
5799          * tree, just update pinning info and exit early.
5800          */
5801         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5802                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5803                 /* unlocks the pinned mutex */
5804                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5805                 ret = 0;
5806         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5807                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5808                                         num_bytes,
5809                                         parent, root_objectid, (int)owner,
5810                                         BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5811         } else {
5812                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5813                                                 num_bytes,
5814                                                 parent, root_objectid, owner,
5815                                                 offset, BTRFS_DROP_DELAYED_REF,
5816                                                 NULL, for_cow);
5817         }
5818         return ret;
5819 }
5820
5821 static u64 stripe_align(struct btrfs_root *root,
5822                         struct btrfs_block_group_cache *cache,
5823                         u64 val, u64 num_bytes)
5824 {
5825         u64 ret = ALIGN(val, root->stripesize);
5826         return ret;
5827 }
5828
5829 /*
5830  * when we wait for progress in the block group caching, it's because
5831  * our allocation attempt failed at least once.  So, we must sleep
5832  * and let some progress happen before we try again.
5833  *
5834  * This function will sleep at least once waiting for new free space to
5835  * show up, and then it will check the block group free space numbers
5836  * for our min num_bytes.  Another option is to have it go ahead
5837  * and look in the rbtree for a free extent of a given size, but this
5838  * is a good start.
5839  */
5840 static noinline int
5841 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5842                                 u64 num_bytes)
5843 {
5844         struct btrfs_caching_control *caching_ctl;
5845
5846         caching_ctl = get_caching_control(cache);
5847         if (!caching_ctl)
5848                 return 0;
5849
5850         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5851                    (cache->free_space_ctl->free_space >= num_bytes));
5852
5853         put_caching_control(caching_ctl);
5854         return 0;
5855 }
5856
5857 static noinline int
5858 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5859 {
5860         struct btrfs_caching_control *caching_ctl;
5861
5862         caching_ctl = get_caching_control(cache);
5863         if (!caching_ctl)
5864                 return 0;
5865
5866         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5867
5868         put_caching_control(caching_ctl);
5869         return 0;
5870 }
5871
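/*
 * Map the raid bits of a block group's flags to an index into
 * space_info->block_groups[]; find_free_extent() walks one of those
 * lists at a time.  A group flagged BTRFS_BLOCK_GROUP_RAID1, for
 * example, maps to BTRFS_RAID_RAID1, while a group with none of the
 * raid bits set falls back to BTRFS_RAID_SINGLE.
 */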
5872 int __get_raid_index(u64 flags)
5873 {
5874         if (flags & BTRFS_BLOCK_GROUP_RAID10)
5875                 return BTRFS_RAID_RAID10;
5876         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5877                 return BTRFS_RAID_RAID1;
5878         else if (flags & BTRFS_BLOCK_GROUP_DUP)
5879                 return BTRFS_RAID_DUP;
5880         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5881                 return BTRFS_RAID_RAID0;
5882         else if (flags & BTRFS_BLOCK_GROUP_RAID5)
5883                 return BTRFS_RAID_RAID5;
5884         else if (flags & BTRFS_BLOCK_GROUP_RAID6)
5885                 return BTRFS_RAID_RAID6;
5886
5887         return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
5888 }
5889
5890 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5891 {
5892         return __get_raid_index(cache->flags);
5893 }
5894
5895 enum btrfs_loop_type {
5896         LOOP_CACHING_NOWAIT = 0,
5897         LOOP_CACHING_WAIT = 1,
5898         LOOP_ALLOC_CHUNK = 2,
5899         LOOP_NO_EMPTY_SIZE = 3,
5900 };
5901
5902 /*
5903  * walks the btree of allocated extents and finds a hole of a given size.
5904  * The key ins is changed to record the hole:
5905  * ins->objectid == block start
5906  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5907  * ins->offset == number of blocks
5908  * Any available blocks before search_start are skipped.
5909  */
5910 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5911                                      struct btrfs_root *orig_root,
5912                                      u64 num_bytes, u64 empty_size,
5913                                      u64 hint_byte, struct btrfs_key *ins,
5914                                      u64 data)
5915 {
5916         int ret = 0;
5917         struct btrfs_root *root = orig_root->fs_info->extent_root;
5918         struct btrfs_free_cluster *last_ptr = NULL;
5919         struct btrfs_block_group_cache *block_group = NULL;
5920         struct btrfs_block_group_cache *used_block_group;
5921         u64 search_start = 0;
5922         int empty_cluster = 2 * 1024 * 1024;
5923         struct btrfs_space_info *space_info;
5924         int loop = 0;
5925         int index = __get_raid_index(data);
5926         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5927                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5928         bool found_uncached_bg = false;
5929         bool failed_cluster_refill = false;
5930         bool failed_alloc = false;
5931         bool use_cluster = true;
5932         bool have_caching_bg = false;
5933
5934         WARN_ON(num_bytes < root->sectorsize);
5935         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5936         ins->objectid = 0;
5937         ins->offset = 0;
5938
5939         trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5940
5941         space_info = __find_space_info(root->fs_info, data);
5942         if (!space_info) {
5943                 btrfs_err(root->fs_info, "No space info for %llu", data);
5944                 return -ENOSPC;
5945         }
5946
5947         /*
5948          * If the space info is for both data and metadata it means we have a
5949          * small filesystem and we can't use the clustering stuff.
5950          */
5951         if (btrfs_mixed_space_info(space_info))
5952                 use_cluster = false;
5953
5954         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5955                 last_ptr = &root->fs_info->meta_alloc_cluster;
5956                 if (!btrfs_test_opt(root, SSD))
5957                         empty_cluster = 64 * 1024;
5958         }
5959
5960         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5961             btrfs_test_opt(root, SSD)) {
5962                 last_ptr = &root->fs_info->data_alloc_cluster;
5963         }
5964
5965         if (last_ptr) {
5966                 spin_lock(&last_ptr->lock);
5967                 if (last_ptr->block_group)
5968                         hint_byte = last_ptr->window_start;
5969                 spin_unlock(&last_ptr->lock);
5970         }
5971
5972         search_start = max(search_start, first_logical_byte(root, 0));
5973         search_start = max(search_start, hint_byte);
5974
5975         if (!last_ptr)
5976                 empty_cluster = 0;
5977
5978         if (search_start == hint_byte) {
5979                 block_group = btrfs_lookup_block_group(root->fs_info,
5980                                                        search_start);
5981                 used_block_group = block_group;
5982                 /*
5983                  * we don't want to use the block group if it doesn't match our
5984                  * allocation bits, or if it's not cached.
5985                  *
5986                  * However, if we are re-searching with an ideal block group
5987                  * picked out then we don't care that the block group is cached.
5988                  */
5989                 if (block_group && block_group_bits(block_group, data) &&
5990                     block_group->cached != BTRFS_CACHE_NO) {
5991                         down_read(&space_info->groups_sem);
5992                         if (list_empty(&block_group->list) ||
5993                             block_group->ro) {
5994                                 /*
5995                                  * someone is removing this block group,
5996                                  * so we can't jump to the have_block_group
5997                                  * target because our list pointers are not
5998                                  * valid
5999                                  */
6000                                 btrfs_put_block_group(block_group);
6001                                 up_read(&space_info->groups_sem);
6002                         } else {
6003                                 index = get_block_group_index(block_group);
6004                                 goto have_block_group;
6005                         }
6006                 } else if (block_group) {
6007                         btrfs_put_block_group(block_group);
6008                 }
6009         }
6010 search:
6011         have_caching_bg = false;
6012         down_read(&space_info->groups_sem);
6013         list_for_each_entry(block_group, &space_info->block_groups[index],
6014                             list) {
6015                 u64 offset;
6016                 int cached;
6017
6018                 used_block_group = block_group;
6019                 btrfs_get_block_group(block_group);
6020                 search_start = block_group->key.objectid;
6021
6022                 /*
6023                  * this can happen if we end up cycling through all the
6024                  * raid types, but we want to make sure we only allocate
6025                  * for the proper type.
6026                  */
6027                 if (!block_group_bits(block_group, data)) {
6028                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
6029                                     BTRFS_BLOCK_GROUP_RAID1 |
6030                                     BTRFS_BLOCK_GROUP_RAID5 |
6031                                     BTRFS_BLOCK_GROUP_RAID6 |
6032                                     BTRFS_BLOCK_GROUP_RAID10;
6033
6034                         /*
6035                          * if they asked for extra copies and this block group
6036                          * doesn't provide them, bail.  This does allow us to
6037                          * fill raid0 from raid1.
6038                          */
6039                         if ((data & extra) && !(block_group->flags & extra))
6040                                 goto loop;
6041                 }
6042
6043 have_block_group:
6044                 cached = block_group_cache_done(block_group);
6045                 if (unlikely(!cached)) {
6046                         found_uncached_bg = true;
6047                         ret = cache_block_group(block_group, 0);
6048                         BUG_ON(ret < 0);
6049                         ret = 0;
6050                 }
6051
6052                 if (unlikely(block_group->ro))
6053                         goto loop;
6054
6055                 /*
6056                  * Ok, we want to try and use the cluster allocator, so
6057                  * let's look there
6058                  */
6059                 if (last_ptr) {
6060                         unsigned long aligned_cluster;
6061                         /*
6062                          * the refill lock keeps out other
6063                          * people trying to start a new cluster
6064                          */
6065                         spin_lock(&last_ptr->refill_lock);
6066                         used_block_group = last_ptr->block_group;
6067                         if (used_block_group != block_group &&
6068                             (!used_block_group ||
6069                              used_block_group->ro ||
6070                              !block_group_bits(used_block_group, data))) {
6071                                 used_block_group = block_group;
6072                                 goto refill_cluster;
6073                         }
6074
6075                         if (used_block_group != block_group)
6076                                 btrfs_get_block_group(used_block_group);
6077
6078                         offset = btrfs_alloc_from_cluster(used_block_group,
6079                           last_ptr, num_bytes, used_block_group->key.objectid);
6080                         if (offset) {
6081                                 /* we have a block, we're done */
6082                                 spin_unlock(&last_ptr->refill_lock);
6083                                 trace_btrfs_reserve_extent_cluster(root,
6084                                         block_group, search_start, num_bytes);
6085                                 goto checks;
6086                         }
6087
6088                         WARN_ON(last_ptr->block_group != used_block_group);
6089                         if (used_block_group != block_group) {
6090                                 btrfs_put_block_group(used_block_group);
6091                                 used_block_group = block_group;
6092                         }
6093 refill_cluster:
6094                         BUG_ON(used_block_group != block_group);
6095                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
6096                          * set up a new cluster, so let's just skip it
6097                          * and let the allocator find whatever block
6098                          * it can find.  If we reach this point, we
6099                          * will have tried the cluster allocator
6100                          * plenty of times and not have found
6101                          * anything, so we are likely way too
6102                          * fragmented for the clustering stuff to find
6103                          * anything.
6104                          *
6105                          * However, if the cluster is taken from the
6106                          * current block group, release the cluster
6107                          * first, so that we stand a better chance of
6108                          * succeeding in the unclustered
6109                          * allocation.  */
6110                         if (loop >= LOOP_NO_EMPTY_SIZE &&
6111                             last_ptr->block_group != block_group) {
6112                                 spin_unlock(&last_ptr->refill_lock);
6113                                 goto unclustered_alloc;
6114                         }
6115
6116                         /*
6117                          * this cluster didn't work out; free it and
6118                          * start over
6119                          */
6120                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
6121
6122                         if (loop >= LOOP_NO_EMPTY_SIZE) {
6123                                 spin_unlock(&last_ptr->refill_lock);
6124                                 goto unclustered_alloc;
6125                         }
6126
6127                         aligned_cluster = max_t(unsigned long,
6128                                                 empty_cluster + empty_size,
6129                                               block_group->full_stripe_len);
6130
6131                         /* allocate a cluster in this block group */
6132                         ret = btrfs_find_space_cluster(trans, root,
6133                                                block_group, last_ptr,
6134                                                search_start, num_bytes,
6135                                                aligned_cluster);
6136                         if (ret == 0) {
6137                                 /*
6138                                  * now pull our allocation out of this
6139                                  * cluster
6140                                  */
6141                                 offset = btrfs_alloc_from_cluster(block_group,
6142                                                   last_ptr, num_bytes,
6143                                                   search_start);
6144                                 if (offset) {
6145                                         /* we found one, proceed */
6146                                         spin_unlock(&last_ptr->refill_lock);
6147                                         trace_btrfs_reserve_extent_cluster(root,
6148                                                 block_group, search_start,
6149                                                 num_bytes);
6150                                         goto checks;
6151                                 }
6152                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
6153                                    && !failed_cluster_refill) {
6154                                 spin_unlock(&last_ptr->refill_lock);
6155
6156                                 failed_cluster_refill = true;
6157                                 wait_block_group_cache_progress(block_group,
6158                                        num_bytes + empty_cluster + empty_size);
6159                                 goto have_block_group;
6160                         }
6161
6162                         /*
6163                          * at this point we either didn't find a cluster
6164                          * or we weren't able to allocate a block from our
6165                          * cluster.  Free the cluster we've been trying
6166                          * to use, and go to the next block group
6167                          */
6168                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
6169                         spin_unlock(&last_ptr->refill_lock);
6170                         goto loop;
6171                 }
6172
6173 unclustered_alloc:
6174                 spin_lock(&block_group->free_space_ctl->tree_lock);
6175                 if (cached &&
6176                     block_group->free_space_ctl->free_space <
6177                     num_bytes + empty_cluster + empty_size) {
6178                         spin_unlock(&block_group->free_space_ctl->tree_lock);
6179                         goto loop;
6180                 }
6181                 spin_unlock(&block_group->free_space_ctl->tree_lock);
6182
6183                 offset = btrfs_find_space_for_alloc(block_group, search_start,
6184                                                     num_bytes, empty_size);
6185                 /*
6186                  * If we didn't find a chunk, and we haven't failed on this
6187                  * block group before, and this block group is in the middle of
6188                  * caching and we are ok with waiting, then go ahead and wait
6189                  * for progress to be made, and set failed_alloc to true.
6190                  *
6191                  * If failed_alloc is true then we've already waited on this
6192                  * block group once and should move on to the next block group.
6193                  */
6194                 if (!offset && !failed_alloc && !cached &&
6195                     loop > LOOP_CACHING_NOWAIT) {
6196                         wait_block_group_cache_progress(block_group,
6197                                                 num_bytes + empty_size);
6198                         failed_alloc = true;
6199                         goto have_block_group;
6200                 } else if (!offset) {
6201                         if (!cached)
6202                                 have_caching_bg = true;
6203                         goto loop;
6204                 }
6205 checks:
6206                 search_start = stripe_align(root, used_block_group,
6207                                             offset, num_bytes);
6208
6209                 /* move on to the next group */
6210                 if (search_start + num_bytes >
6211                     used_block_group->key.objectid + used_block_group->key.offset) {
6212                         btrfs_add_free_space(used_block_group, offset, num_bytes);
6213                         goto loop;
6214                 }
6215
6216                 if (offset < search_start)
6217                         btrfs_add_free_space(used_block_group, offset,
6218                                              search_start - offset);
6219                 BUG_ON(offset > search_start);
6220
6221                 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
6222                                                   alloc_type);
6223                 if (ret == -EAGAIN) {
6224                         btrfs_add_free_space(used_block_group, offset, num_bytes);
6225                         goto loop;
6226                 }
6227
6228                 /* we are all good, let's return */
6229                 ins->objectid = search_start;
6230                 ins->offset = num_bytes;
6231
6232                 trace_btrfs_reserve_extent(orig_root, block_group,
6233                                            search_start, num_bytes);
6234                 if (used_block_group != block_group)
6235                         btrfs_put_block_group(used_block_group);
6236                 btrfs_put_block_group(block_group);
6237                 break;
6238 loop:
6239                 failed_cluster_refill = false;
6240                 failed_alloc = false;
6241                 BUG_ON(index != get_block_group_index(block_group));
6242                 if (used_block_group != block_group)
6243                         btrfs_put_block_group(used_block_group);
6244                 btrfs_put_block_group(block_group);
6245         }
6246         up_read(&space_info->groups_sem);
6247
6248         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
6249                 goto search;
6250
6251         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
6252                 goto search;
6253
6254         /*
6255          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
6256          *                      caching kthreads as we move along
6257          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
6258          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
6259          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
6260          *                      again
6261          */
6262         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
6263                 index = 0;
6264                 loop++;
6265                 if (loop == LOOP_ALLOC_CHUNK) {
6266                         ret = do_chunk_alloc(trans, root, data,
6267                                              CHUNK_ALLOC_FORCE);
6268                         /*
6269                          * Do not bail out on ENOSPC, since later
6270                          * loop passes may still find space.
6271                          */
6272                         if (ret < 0 && ret != -ENOSPC) {
6273                                 btrfs_abort_transaction(trans,
6274                                                         root, ret);
6275                                 goto out;
6276                         }
6277                 }
6278
6279                 if (loop == LOOP_NO_EMPTY_SIZE) {
6280                         empty_size = 0;
6281                         empty_cluster = 0;
6282                 }
6283
6284                 goto search;
6285         } else if (!ins->objectid) {
6286                 ret = -ENOSPC;
6287         } else if (ins->objectid) {
6288                 ret = 0;
6289         }
6290 out:
6291
6292         return ret;
6293 }
6294
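/*
 * Dump a space_info (and optionally each of its block groups) to the
 * kernel log; used by the ENOSPC debugging paths, e.g. in
 * btrfs_reserve_extent() below when an allocation cannot be satisfied.
 */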
6295 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
6296                             int dump_block_groups)
6297 {
6298         struct btrfs_block_group_cache *cache;
6299         int index = 0;
6300
6301         spin_lock(&info->lock);
6302         printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
6303                (unsigned long long)info->flags,
6304                (unsigned long long)(info->total_bytes - info->bytes_used -
6305                                     info->bytes_pinned - info->bytes_reserved -
6306                                     info->bytes_readonly),
6307                (info->full) ? "" : "not ");
6308         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
6309                "reserved=%llu, may_use=%llu, readonly=%llu\n",
6310                (unsigned long long)info->total_bytes,
6311                (unsigned long long)info->bytes_used,
6312                (unsigned long long)info->bytes_pinned,
6313                (unsigned long long)info->bytes_reserved,
6314                (unsigned long long)info->bytes_may_use,
6315                (unsigned long long)info->bytes_readonly);
6316         spin_unlock(&info->lock);
6317
6318         if (!dump_block_groups)
6319                 return;
6320
6321         down_read(&info->groups_sem);
6322 again:
6323         list_for_each_entry(cache, &info->block_groups[index], list) {
6324                 spin_lock(&cache->lock);
6325                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6326                        (unsigned long long)cache->key.objectid,
6327                        (unsigned long long)cache->key.offset,
6328                        (unsigned long long)btrfs_block_group_used(&cache->item),
6329                        (unsigned long long)cache->pinned,
6330                        (unsigned long long)cache->reserved,
6331                        cache->ro ? "[readonly]" : "");
6332                 btrfs_dump_free_space(cache, bytes);
6333                 spin_unlock(&cache->lock);
6334         }
6335         if (++index < BTRFS_NR_RAID_TYPES)
6336                 goto again;
6337         up_read(&info->groups_sem);
6338 }
6339
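/*
 * Reserve num_bytes of space for an allocation.  On ENOSPC the request
 * is retried with progressively smaller sizes: num_bytes is halved and
 * rounded down to the sectorsize each time, but never drops below
 * min_alloc_size.  With a 4K sectorsize, num_bytes of 1M and a
 * min_alloc_size of 256K, for example, the attempts are 1M, 512K and
 * finally 256K before -ENOSPC is returned to the caller.
 */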
6340 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6341                          struct btrfs_root *root,
6342                          u64 num_bytes, u64 min_alloc_size,
6343                          u64 empty_size, u64 hint_byte,
6344                          struct btrfs_key *ins, u64 data)
6345 {
6346         bool final_tried = false;
6347         int ret;
6348
6349         data = btrfs_get_alloc_profile(root, data);
6350 again:
6351         WARN_ON(num_bytes < root->sectorsize);
6352         ret = find_free_extent(trans, root, num_bytes, empty_size,
6353                                hint_byte, ins, data);
6354
6355         if (ret == -ENOSPC) {
6356                 if (!final_tried) {
6357                         num_bytes = num_bytes >> 1;
6358                         num_bytes = round_down(num_bytes, root->sectorsize);
6359                         num_bytes = max(num_bytes, min_alloc_size);
6360                         if (num_bytes == min_alloc_size)
6361                                 final_tried = true;
6362                         goto again;
6363                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6364                         struct btrfs_space_info *sinfo;
6365
6366                         sinfo = __find_space_info(root->fs_info, data);
6367                         btrfs_err(root->fs_info, "allocation failed flags %llu, wanted %llu",
6368                                 (unsigned long long)data,
6369                                 (unsigned long long)num_bytes);
6370                         if (sinfo)
6371                                 dump_space_info(sinfo, num_bytes, 1);
6372                 }
6373         }
6374
6375         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6376
6377         return ret;
6378 }
6379
6380 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6381                                         u64 start, u64 len, int pin)
6382 {
6383         struct btrfs_block_group_cache *cache;
6384         int ret = 0;
6385
6386         cache = btrfs_lookup_block_group(root->fs_info, start);
6387         if (!cache) {
6388                 btrfs_err(root->fs_info, "Unable to find block group for %llu",
6389                         (unsigned long long)start);
6390                 return -ENOSPC;
6391         }
6392
6393         if (btrfs_test_opt(root, DISCARD))
6394                 ret = btrfs_discard_extent(root, start, len, NULL);
6395
6396         if (pin)
6397                 pin_down_extent(root, cache, start, len, 1);
6398         else {
6399                 btrfs_add_free_space(cache, start, len);
6400                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6401         }
6402         btrfs_put_block_group(cache);
6403
6404         trace_btrfs_reserved_extent_free(root, start, len);
6405
6406         return ret;
6407 }
6408
6409 int btrfs_free_reserved_extent(struct btrfs_root *root,
6410                                         u64 start, u64 len)
6411 {
6412         return __btrfs_free_reserved_extent(root, start, len, 0);
6413 }
6414
6415 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6416                                        u64 start, u64 len)
6417 {
6418         return __btrfs_free_reserved_extent(root, start, len, 1);
6419 }
6420
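/*
 * Insert the extent item for a newly allocated data extent together
 * with one inline backref.  When parent is non-zero a shared data ref
 * keyed by the parent tree block is written; otherwise a full extent
 * data ref naming the root, owner inode and file offset is used.
 */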
6421 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6422                                       struct btrfs_root *root,
6423                                       u64 parent, u64 root_objectid,
6424                                       u64 flags, u64 owner, u64 offset,
6425                                       struct btrfs_key *ins, int ref_mod)
6426 {
6427         int ret;
6428         struct btrfs_fs_info *fs_info = root->fs_info;
6429         struct btrfs_extent_item *extent_item;
6430         struct btrfs_extent_inline_ref *iref;
6431         struct btrfs_path *path;
6432         struct extent_buffer *leaf;
6433         int type;
6434         u32 size;
6435
6436         if (parent > 0)
6437                 type = BTRFS_SHARED_DATA_REF_KEY;
6438         else
6439                 type = BTRFS_EXTENT_DATA_REF_KEY;
6440
6441         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6442
6443         path = btrfs_alloc_path();
6444         if (!path)
6445                 return -ENOMEM;
6446
6447         path->leave_spinning = 1;
6448         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6449                                       ins, size);
6450         if (ret) {
6451                 btrfs_free_path(path);
6452                 return ret;
6453         }
6454
6455         leaf = path->nodes[0];
6456         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6457                                      struct btrfs_extent_item);
6458         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6459         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6460         btrfs_set_extent_flags(leaf, extent_item,
6461                                flags | BTRFS_EXTENT_FLAG_DATA);
6462
6463         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6464         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6465         if (parent > 0) {
6466                 struct btrfs_shared_data_ref *ref;
6467                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6468                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6469                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6470         } else {
6471                 struct btrfs_extent_data_ref *ref;
6472                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6473                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6474                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6475                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6476                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6477         }
6478
6479         btrfs_mark_buffer_dirty(path->nodes[0]);
6480         btrfs_free_path(path);
6481
6482         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6483         if (ret) { /* -ENOENT, logic error */
6484                 btrfs_err(fs_info, "update block group failed for %llu %llu",
6485                         (unsigned long long)ins->objectid,
6486                         (unsigned long long)ins->offset);
6487                 BUG();
6488         }
6489         return ret;
6490 }
6491
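/*
 * Same as above, but for a tree block.  Without SKINNY_METADATA the
 * item carries a btrfs_tree_block_info (key + level) ahead of the
 * inline ref; with it, that struct is dropped and the level is carried
 * in the item key instead, making metadata extent items smaller.
 */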
6492 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6493                                      struct btrfs_root *root,
6494                                      u64 parent, u64 root_objectid,
6495                                      u64 flags, struct btrfs_disk_key *key,
6496                                      int level, struct btrfs_key *ins)
6497 {
6498         int ret;
6499         struct btrfs_fs_info *fs_info = root->fs_info;
6500         struct btrfs_extent_item *extent_item;
6501         struct btrfs_tree_block_info *block_info;
6502         struct btrfs_extent_inline_ref *iref;
6503         struct btrfs_path *path;
6504         struct extent_buffer *leaf;
6505         u32 size = sizeof(*extent_item) + sizeof(*iref);
6506         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6507                                                  SKINNY_METADATA);
6508
6509         if (!skinny_metadata)
6510                 size += sizeof(*block_info);
6511
6512         path = btrfs_alloc_path();
6513         if (!path)
6514                 return -ENOMEM;
6515
6516         path->leave_spinning = 1;
6517         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6518                                       ins, size);
6519         if (ret) {
6520                 btrfs_free_path(path);
6521                 return ret;
6522         }
6523
6524         leaf = path->nodes[0];
6525         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6526                                      struct btrfs_extent_item);
6527         btrfs_set_extent_refs(leaf, extent_item, 1);
6528         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6529         btrfs_set_extent_flags(leaf, extent_item,
6530                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6531
6532         if (skinny_metadata) {
6533                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6534         } else {
6535                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6536                 btrfs_set_tree_block_key(leaf, block_info, key);
6537                 btrfs_set_tree_block_level(leaf, block_info, level);
6538                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6539         }
6540
6541         if (parent > 0) {
6542                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6543                 btrfs_set_extent_inline_ref_type(leaf, iref,
6544                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6545                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6546         } else {
6547                 btrfs_set_extent_inline_ref_type(leaf, iref,
6548                                                  BTRFS_TREE_BLOCK_REF_KEY);
6549                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6550         }
6551
6552         btrfs_mark_buffer_dirty(leaf);
6553         btrfs_free_path(path);
6554
6555         ret = update_block_group(root, ins->objectid, root->leafsize, 1);
6556         if (ret) { /* -ENOENT, logic error */
6557                 btrfs_err(fs_info, "update block group failed for %llu %llu",
6558                         (unsigned long long)ins->objectid,
6559                         (unsigned long long)ins->offset);
6560                 BUG();
6561         }
6562         return ret;
6563 }
6564
6565 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6566                                      struct btrfs_root *root,
6567                                      u64 root_objectid, u64 owner,
6568                                      u64 offset, struct btrfs_key *ins)
6569 {
6570         int ret;
6571
6572         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6573
6574         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6575                                          ins->offset, 0,
6576                                          root_objectid, owner, offset,
6577                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6578         return ret;
6579 }
6580
6581 /*
6582  * this is used by the tree logging recovery code.  It records that
6583  * an extent has been allocated and makes sure to clear the free
6584  * space cache bits as well
6585  */
6586 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6587                                    struct btrfs_root *root,
6588                                    u64 root_objectid, u64 owner, u64 offset,
6589                                    struct btrfs_key *ins)
6590 {
6591         int ret;
6592         struct btrfs_block_group_cache *block_group;
6593         struct btrfs_caching_control *caching_ctl;
6594         u64 start = ins->objectid;
6595         u64 num_bytes = ins->offset;
6596
6597         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6598         cache_block_group(block_group, 0);
6599         caching_ctl = get_caching_control(block_group);
6600
6601         if (!caching_ctl) {
6602                 BUG_ON(!block_group_cache_done(block_group));
6603                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6604                 BUG_ON(ret); /* -ENOMEM */
6605         } else {
6606                 mutex_lock(&caching_ctl->mutex);
6607
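                /*
                 * The caching thread is still scanning this block group.
                 * Space below ->progress is already in the free space cache
                 * and has to be removed from it; space at or beyond
                 * ->progress has not been scanned yet, so mark it excluded
                 * and the scanner will skip over it.  An extent straddling
                 * ->progress is split between the two treatments.
                 */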
6608                 if (start >= caching_ctl->progress) {
6609                         ret = add_excluded_extent(root, start, num_bytes);
6610                         BUG_ON(ret); /* -ENOMEM */
6611                 } else if (start + num_bytes <= caching_ctl->progress) {
6612                         ret = btrfs_remove_free_space(block_group,
6613                                                       start, num_bytes);
6614                         BUG_ON(ret); /* -ENOMEM */
6615                 } else {
6616                         num_bytes = caching_ctl->progress - start;
6617                         ret = btrfs_remove_free_space(block_group,
6618                                                       start, num_bytes);
6619                         BUG_ON(ret); /* -ENOMEM */
6620
6621                         start = caching_ctl->progress;
6622                         num_bytes = ins->objectid + ins->offset -
6623                                     caching_ctl->progress;
6624                         ret = add_excluded_extent(root, start, num_bytes);
6625                         BUG_ON(ret); /* -ENOMEM */
6626                 }
6627
6628                 mutex_unlock(&caching_ctl->mutex);
6629                 put_caching_control(caching_ctl);
6630         }
6631
6632         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6633                                           RESERVE_ALLOC_NO_ACCOUNT);
6634         BUG_ON(ret); /* logic error */
6635         btrfs_put_block_group(block_group);
6636         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6637                                          0, owner, offset, ins, 1);
6638         return ret;
6639 }
6640
6641 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6642                                             struct btrfs_root *root,
6643                                             u64 bytenr, u32 blocksize,
6644                                             int level)
6645 {
6646         struct extent_buffer *buf;
6647
6648         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6649         if (!buf)
6650                 return ERR_PTR(-ENOMEM);
6651         btrfs_set_header_generation(buf, trans->transid);
6652         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6653         btrfs_tree_lock(buf);
6654         clean_tree_block(trans, root, buf);
6655         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6656
6657         btrfs_set_lock_blocking(buf);
6658         btrfs_set_buffer_uptodate(buf);
6659
6660         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6661                 /*
6662                  * we allow two log transactions at a time, use different
6663                  * EXTENT bits to differentiate dirty pages.
6664                  */
6665                 if (root->log_transid % 2 == 0)
6666                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6667                                         buf->start + buf->len - 1, GFP_NOFS);
6668                 else
6669                         set_extent_new(&root->dirty_log_pages, buf->start,
6670                                         buf->start + buf->len - 1, GFP_NOFS);
6671         } else {
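                /*
                 * Regular (non-log) tree blocks are tracked in the
                 * transaction's dirty_pages set instead.
                 */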
6672                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6673                          buf->start + buf->len - 1, GFP_NOFS);
6674         }
6675         trans->blocks_used++;
6676         /* this returns a buffer locked for blocking */
6677         return buf;
6678 }
6679
6680 static struct btrfs_block_rsv *
6681 use_block_rsv(struct btrfs_trans_handle *trans,
6682               struct btrfs_root *root, u32 blocksize)
6683 {
6684         struct btrfs_block_rsv *block_rsv;
6685         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6686         int ret;
6687
6688         block_rsv = get_block_rsv(trans, root);
6689
6690         if (block_rsv->size == 0) {
6691                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6692                                              BTRFS_RESERVE_NO_FLUSH);
6693                 /*
6694                  * If we couldn't reserve metadata bytes try and use some from
6695                  * the global reserve.
6696                  */
6697                 if (ret && block_rsv != global_rsv) {
6698                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6699                         if (!ret)
6700                                 return global_rsv;
6701                         return ERR_PTR(ret);
6702                 } else if (ret) {
6703                         return ERR_PTR(ret);
6704                 }
6705                 return block_rsv;
6706         }
6707
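        /*
         * Normal path: consume the reservation from this root's block rsv.
         * If that fails and the rsv is not failfast, try a fresh metadata
         * reservation and finally fall back to the global reserve before
         * giving up with -ENOSPC.
         */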
6708         ret = block_rsv_use_bytes(block_rsv, blocksize);
6709         if (!ret)
6710                 return block_rsv;
6711         if (ret && !block_rsv->failfast) {
6712                 if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6713                         static DEFINE_RATELIMIT_STATE(_rs,
6714                                         DEFAULT_RATELIMIT_INTERVAL * 10,
6715                                         /*DEFAULT_RATELIMIT_BURST*/ 1);
6716                         if (__ratelimit(&_rs))
6717                                 WARN(1, KERN_DEBUG
6718                                         "btrfs: block rsv returned %d\n", ret);
6719                 }
6720                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6721                                              BTRFS_RESERVE_NO_FLUSH);
6722                 if (!ret) {
6723                         return block_rsv;
6724                 } else if (ret && block_rsv != global_rsv) {
6725                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6726                         if (!ret)
6727                                 return global_rsv;
6728                 }
6729         }
6730
6731         return ERR_PTR(-ENOSPC);
6732 }
6733
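/*
 * Undo use_block_rsv() when the tree block allocation fails, returning the
 * blocksize worth of space to the reservation.
 */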
6734 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6735                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
6736 {
6737         block_rsv_add_bytes(block_rsv, blocksize, 0);
6738         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6739 }
6740
6741 /*
6742  * finds a free extent and does all the dirty work required for allocating
6743  * a new tree block: reserving metadata space, reserving the extent and,
6744  * for non-log trees, queueing the delayed ref that inserts the extent item.
6745  *
6746  * returns the locked tree buffer, or an ERR_PTR on failure.
6747  */
6748 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6749                                         struct btrfs_root *root, u32 blocksize,
6750                                         u64 parent, u64 root_objectid,
6751                                         struct btrfs_disk_key *key, int level,
6752                                         u64 hint, u64 empty_size)
6753 {
6754         struct btrfs_key ins;
6755         struct btrfs_block_rsv *block_rsv;
6756         struct extent_buffer *buf;
6757         u64 flags = 0;
6758         int ret;
6759         bool skinny_metadata = btrfs_fs_incompat(root->fs_info,
6760                                                  SKINNY_METADATA);
6761
6762         block_rsv = use_block_rsv(trans, root, blocksize);
6763         if (IS_ERR(block_rsv))
6764                 return ERR_CAST(block_rsv);
6765
6766         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6767                                    empty_size, hint, &ins, 0);
6768         if (ret) {
6769                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6770                 return ERR_PTR(ret);
6771         }
6772
6773         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6774                                     blocksize, level);
6775         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6776
6777         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6778                 if (parent == 0)
6779                         parent = ins.objectid;
6780                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6781         } else
6782                 BUG_ON(parent > 0);
6783
6784         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6785                 struct btrfs_delayed_extent_op *extent_op;
6786                 extent_op = btrfs_alloc_delayed_extent_op();
6787                 BUG_ON(!extent_op); /* -ENOMEM */
6788                 if (key)
6789                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6790                 else
6791                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6792                 extent_op->flags_to_set = flags;
6793                 if (skinny_metadata)
6794                         extent_op->update_key = 0;
6795                 else
6796                         extent_op->update_key = 1;
6797                 extent_op->update_flags = 1;
6798                 extent_op->is_data = 0;
6799
6800                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6801                                         ins.objectid,
6802                                         ins.offset, parent, root_objectid,
6803                                         level, BTRFS_ADD_DELAYED_EXTENT,
6804                                         extent_op, 0);
6805                 BUG_ON(ret); /* -ENOMEM */
6806         }
6807         return buf;
6808 }
6809
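/*
 * State shared between walk_down_tree()/walk_up_tree() and their helpers
 * while dropping a snapshot or subtree: per-level ref counts and flags, the
 * current stage (DROP_REFERENCE or UPDATE_BACKREF) and readahead bookkeeping.
 */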
6810 struct walk_control {
6811         u64 refs[BTRFS_MAX_LEVEL];
6812         u64 flags[BTRFS_MAX_LEVEL];
6813         struct btrfs_key update_progress;
6814         int stage;
6815         int level;
6816         int shared_level;
6817         int update_ref;
6818         int keep_locks;
6819         int reada_slot;
6820         int reada_count;
6821         int for_reloc;
6822 };
6823
6824 #define DROP_REFERENCE  1
6825 #define UPDATE_BACKREF  2
6826
6827 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6828                                      struct btrfs_root *root,
6829                                      struct walk_control *wc,
6830                                      struct btrfs_path *path)
6831 {
6832         u64 bytenr;
6833         u64 generation;
6834         u64 refs;
6835         u64 flags;
6836         u32 nritems;
6837         u32 blocksize;
6838         struct btrfs_key key;
6839         struct extent_buffer *eb;
6840         int ret;
6841         int slot;
6842         int nread = 0;
6843
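        /*
         * Scale the readahead window: shrink it if the caller has not yet
         * consumed the blocks we read ahead last time, otherwise grow it up
         * to a full node's worth of pointers.
         */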
6844         if (path->slots[wc->level] < wc->reada_slot) {
6845                 wc->reada_count = wc->reada_count * 2 / 3;
6846                 wc->reada_count = max(wc->reada_count, 2);
6847         } else {
6848                 wc->reada_count = wc->reada_count * 3 / 2;
6849                 wc->reada_count = min_t(int, wc->reada_count,
6850                                         BTRFS_NODEPTRS_PER_BLOCK(root));
6851         }
6852
6853         eb = path->nodes[wc->level];
6854         nritems = btrfs_header_nritems(eb);
6855         blocksize = btrfs_level_size(root, wc->level - 1);
6856
6857         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6858                 if (nread >= wc->reada_count)
6859                         break;
6860
6861                 cond_resched();
6862                 bytenr = btrfs_node_blockptr(eb, slot);
6863                 generation = btrfs_node_ptr_generation(eb, slot);
6864
6865                 if (slot == path->slots[wc->level])
6866                         goto reada;
6867
6868                 if (wc->stage == UPDATE_BACKREF &&
6869                     generation <= root->root_key.offset)
6870                         continue;
6871
6872                 /* We don't lock the tree block, it's OK to be racy here */
6873                 ret = btrfs_lookup_extent_info(trans, root, bytenr,
6874                                                wc->level - 1, 1, &refs,
6875                                                &flags);
6876                 /* We don't care about errors in readahead. */
6877                 if (ret < 0)
6878                         continue;
6879                 BUG_ON(refs == 0);
6880
6881                 if (wc->stage == DROP_REFERENCE) {
6882                         if (refs == 1)
6883                                 goto reada;
6884
6885                         if (wc->level == 1 &&
6886                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6887                                 continue;
6888                         if (!wc->update_ref ||
6889                             generation <= root->root_key.offset)
6890                                 continue;
6891                         btrfs_node_key_to_cpu(eb, &key, slot);
6892                         ret = btrfs_comp_cpu_keys(&key,
6893                                                   &wc->update_progress);
6894                         if (ret < 0)
6895                                 continue;
6896                 } else {
6897                         if (wc->level == 1 &&
6898                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6899                                 continue;
6900                 }
6901 reada:
6902                 ret = readahead_tree_block(root, bytenr, blocksize,
6903                                            generation);
6904                 if (ret)
6905                         break;
6906                 nread++;
6907         }
6908         wc->reada_slot = slot;
6909 }
6910
6911 /*
6912  * helper to process tree block while walking down the tree.
6913  *
6914  * when wc->stage == UPDATE_BACKREF, this function updates
6915  * back refs for pointers in the block.
6916  *
6917  * NOTE: return value 1 means we should stop walking down.
6918  */
6919 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6920                                    struct btrfs_root *root,
6921                                    struct btrfs_path *path,
6922                                    struct walk_control *wc, int lookup_info)
6923 {
6924         int level = wc->level;
6925         struct extent_buffer *eb = path->nodes[level];
6926         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6927         int ret;
6928
6929         if (wc->stage == UPDATE_BACKREF &&
6930             btrfs_header_owner(eb) != root->root_key.objectid)
6931                 return 1;
6932
6933         /*
6934          * when the reference count of a tree block is 1, it won't increase
6935          * again. once the full backref flag is set, we never clear it.
6936          */
6937         if (lookup_info &&
6938             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6939              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6940                 BUG_ON(!path->locks[level]);
6941                 ret = btrfs_lookup_extent_info(trans, root,
6942                                                eb->start, level, 1,
6943                                                &wc->refs[level],
6944                                                &wc->flags[level]);
6945                 BUG_ON(ret == -ENOMEM);
6946                 if (ret)
6947                         return ret;
6948                 BUG_ON(wc->refs[level] == 0);
6949         }
6950
6951         if (wc->stage == DROP_REFERENCE) {
6952                 if (wc->refs[level] > 1)
6953                         return 1;
6954
6955                 if (path->locks[level] && !wc->keep_locks) {
6956                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6957                         path->locks[level] = 0;
6958                 }
6959                 return 0;
6960         }
6961
6962         /* wc->stage == UPDATE_BACKREF */
6963         if (!(wc->flags[level] & flag)) {
6964                 BUG_ON(!path->locks[level]);
6965                 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6966                 BUG_ON(ret); /* -ENOMEM */
6967                 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6968                 BUG_ON(ret); /* -ENOMEM */
6969                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6970                                                   eb->len, flag, 0);
6971                 BUG_ON(ret); /* -ENOMEM */
6972                 wc->flags[level] |= flag;
6973         }
6974
6975         /*
6976          * the block is shared by multiple trees, so it's not good to
6977          * keep the tree lock
6978          */
6979         if (path->locks[level] && level > 0) {
6980                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6981                 path->locks[level] = 0;
6982         }
6983         return 0;
6984 }
6985
6986 /*
6987  * helper to process tree block pointer.
6988  *
6989  * when wc->stage == DROP_REFERENCE, this function checks the
6990  * reference count of the block pointed to. if the block
6991  * is shared and we need to update back refs for the subtree
6992  * rooted at the block, this function changes wc->stage to
6993  * UPDATE_BACKREF. if the block is shared and there is no
6994  * need to update back refs, this function drops the reference
6995  * to the block.
6996  *
6997  * NOTE: return value 1 means we should stop walking down.
6998  */
6999 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
7000                                  struct btrfs_root *root,
7001                                  struct btrfs_path *path,
7002                                  struct walk_control *wc, int *lookup_info)
7003 {
7004         u64 bytenr;
7005         u64 generation;
7006         u64 parent;
7007         u32 blocksize;
7008         struct btrfs_key key;
7009         struct extent_buffer *next;
7010         int level = wc->level;
7011         int reada = 0;
7012         int ret = 0;
7013
7014         generation = btrfs_node_ptr_generation(path->nodes[level],
7015                                                path->slots[level]);
7016         /*
7017          * if the lower level block was created before the snapshot
7018          * was created, we know there is no need to update back refs
7019          * for the subtree
7020          */
7021         if (wc->stage == UPDATE_BACKREF &&
7022             generation <= root->root_key.offset) {
7023                 *lookup_info = 1;
7024                 return 1;
7025         }
7026
7027         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
7028         blocksize = btrfs_level_size(root, level - 1);
7029
7030         next = btrfs_find_tree_block(root, bytenr, blocksize);
7031         if (!next) {
7032                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
7033                 if (!next)
7034                         return -ENOMEM;
7035                 reada = 1;
7036         }
7037         btrfs_tree_lock(next);
7038         btrfs_set_lock_blocking(next);
7039
7040         ret = btrfs_lookup_extent_info(trans, root, bytenr, level - 1, 1,
7041                                        &wc->refs[level - 1],
7042                                        &wc->flags[level - 1]);
7043         if (ret < 0) {
7044                 btrfs_tree_unlock(next);
7045                 return ret;
7046         }
7047
7048         if (unlikely(wc->refs[level - 1] == 0)) {
7049                 btrfs_err(root->fs_info, "Missing references.");
7050                 BUG();
7051         }
7052         *lookup_info = 0;
7053
7054         if (wc->stage == DROP_REFERENCE) {
7055                 if (wc->refs[level - 1] > 1) {
7056                         if (level == 1 &&
7057                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7058                                 goto skip;
7059
7060                         if (!wc->update_ref ||
7061                             generation <= root->root_key.offset)
7062                                 goto skip;
7063
7064                         btrfs_node_key_to_cpu(path->nodes[level], &key,
7065                                               path->slots[level]);
7066                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
7067                         if (ret < 0)
7068                                 goto skip;
7069
7070                         wc->stage = UPDATE_BACKREF;
7071                         wc->shared_level = level - 1;
7072                 }
7073         } else {
7074                 if (level == 1 &&
7075                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
7076                         goto skip;
7077         }
7078
7079         if (!btrfs_buffer_uptodate(next, generation, 0)) {
7080                 btrfs_tree_unlock(next);
7081                 free_extent_buffer(next);
7082                 next = NULL;
7083                 *lookup_info = 1;
7084         }
7085
7086         if (!next) {
7087                 if (reada && level == 1)
7088                         reada_walk_down(trans, root, wc, path);
7089                 next = read_tree_block(root, bytenr, blocksize, generation);
7090                 if (!next)
7091                         return -EIO;
7092                 btrfs_tree_lock(next);
7093                 btrfs_set_lock_blocking(next);
7094         }
7095
7096         level--;
7097         BUG_ON(level != btrfs_header_level(next));
7098         path->nodes[level] = next;
7099         path->slots[level] = 0;
7100         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7101         wc->level = level;
7102         if (wc->level == 1)
7103                 wc->reada_slot = 0;
7104         return 0;
7105 skip:
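        /*
         * We decided not to walk into this child block.  In the
         * DROP_REFERENCE stage this tree's reference on it still has to be
         * dropped, which btrfs_free_extent() below takes care of.
         */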
7106         wc->refs[level - 1] = 0;
7107         wc->flags[level - 1] = 0;
7108         if (wc->stage == DROP_REFERENCE) {
7109                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
7110                         parent = path->nodes[level]->start;
7111                 } else {
7112                         BUG_ON(root->root_key.objectid !=
7113                                btrfs_header_owner(path->nodes[level]));
7114                         parent = 0;
7115                 }
7116
7117                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
7118                                 root->root_key.objectid, level - 1, 0, 0);
7119                 BUG_ON(ret); /* -ENOMEM */
7120         }
7121         btrfs_tree_unlock(next);
7122         free_extent_buffer(next);
7123         *lookup_info = 1;
7124         return 1;
7125 }
7126
7127 /*
7128  * helper to process tree block while walking up the tree.
7129  *
7130  * when wc->stage == DROP_REFERENCE, this function drops
7131  * reference count on the block.
7132  *
7133  * when wc->stage == UPDATE_BACKREF, this function changes
7134  * wc->stage back to DROP_REFERENCE if we changed wc->stage
7135  * to UPDATE_BACKREF previously while processing the block.
7136  *
7137  * NOTE: return value 1 means we should stop walking up.
7138  */
7139 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
7140                                  struct btrfs_root *root,
7141                                  struct btrfs_path *path,
7142                                  struct walk_control *wc)
7143 {
7144         int ret;
7145         int level = wc->level;
7146         struct extent_buffer *eb = path->nodes[level];
7147         u64 parent = 0;
7148
7149         if (wc->stage == UPDATE_BACKREF) {
7150                 BUG_ON(wc->shared_level < level);
7151                 if (level < wc->shared_level)
7152                         goto out;
7153
7154                 ret = find_next_key(path, level + 1, &wc->update_progress);
7155                 if (ret > 0)
7156                         wc->update_ref = 0;
7157
7158                 wc->stage = DROP_REFERENCE;
7159                 wc->shared_level = -1;
7160                 path->slots[level] = 0;
7161
7162                 /*
7163                  * check reference count again if the block isn't locked.
7164                  * we should start walking down the tree again if reference
7165                  * count is one.
7166                  */
7167                 if (!path->locks[level]) {
7168                         BUG_ON(level == 0);
7169                         btrfs_tree_lock(eb);
7170                         btrfs_set_lock_blocking(eb);
7171                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7172
7173                         ret = btrfs_lookup_extent_info(trans, root,
7174                                                        eb->start, level, 1,
7175                                                        &wc->refs[level],
7176                                                        &wc->flags[level]);
7177                         if (ret < 0) {
7178                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
7179                                 path->locks[level] = 0;
7180                                 return ret;
7181                         }
7182                         BUG_ON(wc->refs[level] == 0);
7183                         if (wc->refs[level] == 1) {
7184                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
7185                                 path->locks[level] = 0;
7186                                 return 1;
7187                         }
7188                 }
7189         }
7190
7191         /* wc->stage == DROP_REFERENCE */
7192         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
7193
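        /*
         * refs == 1 means this tree now owns the block exclusively: for a
         * leaf, drop the references it holds on file extents before the
         * block itself is freed below.
         */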
7194         if (wc->refs[level] == 1) {
7195                 if (level == 0) {
7196                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7197                                 ret = btrfs_dec_ref(trans, root, eb, 1,
7198                                                     wc->for_reloc);
7199                         else
7200                                 ret = btrfs_dec_ref(trans, root, eb, 0,
7201                                                     wc->for_reloc);
7202                         BUG_ON(ret); /* -ENOMEM */
7203                 }
7204                 /* make block locked assertion in clean_tree_block happy */
7205                 if (!path->locks[level] &&
7206                     btrfs_header_generation(eb) == trans->transid) {
7207                         btrfs_tree_lock(eb);
7208                         btrfs_set_lock_blocking(eb);
7209                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7210                 }
7211                 clean_tree_block(trans, root, eb);
7212         }
7213
7214         if (eb == root->node) {
7215                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7216                         parent = eb->start;
7217                 else
7218                         BUG_ON(root->root_key.objectid !=
7219                                btrfs_header_owner(eb));
7220         } else {
7221                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
7222                         parent = path->nodes[level + 1]->start;
7223                 else
7224                         BUG_ON(root->root_key.objectid !=
7225                                btrfs_header_owner(path->nodes[level + 1]));
7226         }
7227
7228         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
7229 out:
7230         wc->refs[level] = 0;
7231         wc->flags[level] = 0;
7232         return 0;
7233 }
7234
7235 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
7236                                    struct btrfs_root *root,
7237                                    struct btrfs_path *path,
7238                                    struct walk_control *wc)
7239 {
7240         int level = wc->level;
7241         int lookup_info = 1;
7242         int ret;
7243
7244         while (level >= 0) {
7245                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
7246                 if (ret > 0)
7247                         break;
7248
7249                 if (level == 0)
7250                         break;
7251
7252                 if (path->slots[level] >=
7253                     btrfs_header_nritems(path->nodes[level]))
7254                         break;
7255
7256                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
7257                 if (ret > 0) {
7258                         path->slots[level]++;
7259                         continue;
7260                 } else if (ret < 0)
7261                         return ret;
7262                 level = wc->level;
7263         }
7264         return 0;
7265 }
7266
7267 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
7268                                  struct btrfs_root *root,
7269                                  struct btrfs_path *path,
7270                                  struct walk_control *wc, int max_level)
7271 {
7272         int level = wc->level;
7273         int ret;
7274
7275         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
7276         while (level < max_level && path->nodes[level]) {
7277                 wc->level = level;
7278                 if (path->slots[level] + 1 <
7279                     btrfs_header_nritems(path->nodes[level])) {
7280                         path->slots[level]++;
7281                         return 0;
7282                 } else {
7283                         ret = walk_up_proc(trans, root, path, wc);
7284                         if (ret > 0)
7285                                 return 0;
7286
7287                         if (path->locks[level]) {
7288                                 btrfs_tree_unlock_rw(path->nodes[level],
7289                                                      path->locks[level]);
7290                                 path->locks[level] = 0;
7291                         }
7292                         free_extent_buffer(path->nodes[level]);
7293                         path->nodes[level] = NULL;
7294                         level++;
7295                 }
7296         }
7297         return 1;
7298 }
7299
7300 /*
7301  * drop a subvolume tree.
7302  *
7303  * this function traverses the tree freeing any blocks that are only
7304  * referenced by the tree.
7305  *
7306  * when a shared tree block is found, this function decreases its
7307  * reference count by one. if update_ref is true, this function
7308  * also makes sure backrefs for the shared block and all lower level
7309  * blocks are properly updated.
7310  *
7311  * If called with for_reloc == 0, may exit early with -EAGAIN
7312  */
7313 int btrfs_drop_snapshot(struct btrfs_root *root,
7314                          struct btrfs_block_rsv *block_rsv, int update_ref,
7315                          int for_reloc)
7316 {
7317         struct btrfs_path *path;
7318         struct btrfs_trans_handle *trans;
7319         struct btrfs_root *tree_root = root->fs_info->tree_root;
7320         struct btrfs_root_item *root_item = &root->root_item;
7321         struct walk_control *wc;
7322         struct btrfs_key key;
7323         int err = 0;
7324         int ret;
7325         int level;
7326
7327         path = btrfs_alloc_path();
7328         if (!path) {
7329                 err = -ENOMEM;
7330                 goto out;
7331         }
7332
7333         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7334         if (!wc) {
7335                 btrfs_free_path(path);
7336                 err = -ENOMEM;
7337                 goto out;
7338         }
7339
7340         trans = btrfs_start_transaction(tree_root, 0);
7341         if (IS_ERR(trans)) {
7342                 err = PTR_ERR(trans);
7343                 goto out_free;
7344         }
7345
7346         if (block_rsv)
7347                 trans->block_rsv = block_rsv;
7348
7349         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7350                 level = btrfs_header_level(root->node);
7351                 path->nodes[level] = btrfs_lock_root_node(root);
7352                 btrfs_set_lock_blocking(path->nodes[level]);
7353                 path->slots[level] = 0;
7354                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7355                 memset(&wc->update_progress, 0,
7356                        sizeof(wc->update_progress));
7357         } else {
7358                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7359                 memcpy(&wc->update_progress, &key,
7360                        sizeof(wc->update_progress));
7361
7362                 level = root_item->drop_level;
7363                 BUG_ON(level == 0);
7364                 path->lowest_level = level;
7365                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7366                 path->lowest_level = 0;
7367                 if (ret < 0) {
7368                         err = ret;
7369                         goto out_end_trans;
7370                 }
7371                 WARN_ON(ret > 0);
7372
7373                 /*
7374                  * unlock our path, this is safe because only this
7375                  * function is allowed to delete this snapshot
7376                  */
7377                 btrfs_unlock_up_safe(path, 0);
7378
7379                 level = btrfs_header_level(root->node);
7380                 while (1) {
7381                         btrfs_tree_lock(path->nodes[level]);
7382                         btrfs_set_lock_blocking(path->nodes[level]);
7383
7384                         ret = btrfs_lookup_extent_info(trans, root,
7385                                                 path->nodes[level]->start,
7386                                                 level, 1, &wc->refs[level],
7387                                                 &wc->flags[level]);
7388                         if (ret < 0) {
7389                                 err = ret;
7390                                 goto out_end_trans;
7391                         }
7392                         BUG_ON(wc->refs[level] == 0);
7393
7394                         if (level == root_item->drop_level)
7395                                 break;
7396
7397                         btrfs_tree_unlock(path->nodes[level]);
7398                         WARN_ON(wc->refs[level] != 1);
7399                         level--;
7400                 }
7401         }
7402
7403         wc->level = level;
7404         wc->shared_level = -1;
7405         wc->stage = DROP_REFERENCE;
7406         wc->update_ref = update_ref;
7407         wc->keep_locks = 0;
7408         wc->for_reloc = for_reloc;
7409         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7410
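        /*
         * Main drop loop: walk down freeing blocks owned only by this tree,
         * then walk back up.  drop_progress/drop_level are stored in the
         * root item so an interrupted drop can resume where it left off.
         */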
7411         while (1) {
7412                 if (!for_reloc && btrfs_fs_closing(root->fs_info)) {
7413                         pr_debug("btrfs: drop snapshot early exit\n");
7414                         err = -EAGAIN;
7415                         goto out_end_trans;
7416                 }
7417
7418                 ret = walk_down_tree(trans, root, path, wc);
7419                 if (ret < 0) {
7420                         err = ret;
7421                         break;
7422                 }
7423
7424                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7425                 if (ret < 0) {
7426                         err = ret;
7427                         break;
7428                 }
7429
7430                 if (ret > 0) {
7431                         BUG_ON(wc->stage != DROP_REFERENCE);
7432                         break;
7433                 }
7434
7435                 if (wc->stage == DROP_REFERENCE) {
7436                         level = wc->level;
7437                         btrfs_node_key(path->nodes[level],
7438                                        &root_item->drop_progress,
7439                                        path->slots[level]);
7440                         root_item->drop_level = level;
7441                 }
7442
7443                 BUG_ON(wc->level == 0);
7444                 if (btrfs_should_end_transaction(trans, tree_root)) {
7445                         ret = btrfs_update_root(trans, tree_root,
7446                                                 &root->root_key,
7447                                                 root_item);
7448                         if (ret) {
7449                                 btrfs_abort_transaction(trans, tree_root, ret);
7450                                 err = ret;
7451                                 goto out_end_trans;
7452                         }
7453
7454                         btrfs_end_transaction_throttle(trans, tree_root);
7455                         trans = btrfs_start_transaction(tree_root, 0);
7456                         if (IS_ERR(trans)) {
7457                                 err = PTR_ERR(trans);
7458                                 goto out_free;
7459                         }
7460                         if (block_rsv)
7461                                 trans->block_rsv = block_rsv;
7462                 }
7463         }
7464         btrfs_release_path(path);
7465         if (err)
7466                 goto out_end_trans;
7467
7468         ret = btrfs_del_root(trans, tree_root, &root->root_key);
7469         if (ret) {
7470                 btrfs_abort_transaction(trans, tree_root, ret);
7471                 goto out_end_trans;
7472         }
7473
7474         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7475                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7476                                            NULL, NULL);
7477                 if (ret < 0) {
7478                         btrfs_abort_transaction(trans, tree_root, ret);
7479                         err = ret;
7480                         goto out_end_trans;
7481                 } else if (ret > 0) {
7482                         /* if we fail to delete the orphan item this time
7483                          * around, it'll get picked up the next time.
7484                          *
7485                          * The most common failure here is just -ENOENT.
7486                          */
7487                         btrfs_del_orphan_item(trans, tree_root,
7488                                               root->root_key.objectid);
7489                 }
7490         }
7491
7492         if (root->in_radix) {
7493                 btrfs_free_fs_root(tree_root->fs_info, root);
7494         } else {
7495                 free_extent_buffer(root->node);
7496                 free_extent_buffer(root->commit_root);
7497                 kfree(root);
7498         }
7499 out_end_trans:
7500         btrfs_end_transaction_throttle(trans, tree_root);
7501 out_free:
7502         kfree(wc);
7503         btrfs_free_path(path);
7504 out:
7505         if (err)
7506                 btrfs_std_error(root->fs_info, err);
7507         return err;
7508 }
7509
7510 /*
7511  * drop the subtree rooted at tree block 'node'.
7512  *
7513  * NOTE: this function will unlock and release tree block 'node'.
7514  * only used by relocation code.
7515  */
7516 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7517                         struct btrfs_root *root,
7518                         struct extent_buffer *node,
7519                         struct extent_buffer *parent)
7520 {
7521         struct btrfs_path *path;
7522         struct walk_control *wc;
7523         int level;
7524         int parent_level;
7525         int ret = 0;
7526         int wret;
7527
7528         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7529
7530         path = btrfs_alloc_path();
7531         if (!path)
7532                 return -ENOMEM;
7533
7534         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7535         if (!wc) {
7536                 btrfs_free_path(path);
7537                 return -ENOMEM;
7538         }
7539
7540         btrfs_assert_tree_locked(parent);
7541         parent_level = btrfs_header_level(parent);
7542         extent_buffer_get(parent);
7543         path->nodes[parent_level] = parent;
7544         path->slots[parent_level] = btrfs_header_nritems(parent);
7545
7546         btrfs_assert_tree_locked(node);
7547         level = btrfs_header_level(node);
7548         path->nodes[level] = node;
7549         path->slots[level] = 0;
7550         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7551
7552         wc->refs[parent_level] = 1;
7553         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7554         wc->level = level;
7555         wc->shared_level = -1;
7556         wc->stage = DROP_REFERENCE;
7557         wc->update_ref = 0;
7558         wc->keep_locks = 1;
7559         wc->for_reloc = 1;
7560         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7561
7562         while (1) {
7563                 wret = walk_down_tree(trans, root, path, wc);
7564                 if (wret < 0) {
7565                         ret = wret;
7566                         break;
7567                 }
7568
7569                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7570                 if (wret < 0)
7571                         ret = wret;
7572                 if (wret != 0)
7573                         break;
7574         }
7575
7576         kfree(wc);
7577         btrfs_free_path(path);
7578         return ret;
7579 }
7580
7581 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7582 {
7583         u64 num_devices;
7584         u64 stripped;
7585
7586         /*
7587          * if restripe for this chunk_type is on, pick the target profile and
7588          * return; otherwise do the usual balance
7589          */
7590         stripped = get_restripe_target(root->fs_info, flags);
7591         if (stripped)
7592                 return extended_to_chunk(stripped);
7593
7594         /*
7595          * we add in the count of missing devices because we want
7596          * to make sure that any RAID levels on a degraded FS
7597          * continue to be honored.
7598          */
7599         num_devices = root->fs_info->fs_devices->rw_devices +
7600                 root->fs_info->fs_devices->missing_devices;
7601
7602         stripped = BTRFS_BLOCK_GROUP_RAID0 |
7603                 BTRFS_BLOCK_GROUP_RAID5 | BTRFS_BLOCK_GROUP_RAID6 |
7604                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7605
7606         if (num_devices == 1) {
7607                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7608                 stripped = flags & ~stripped;
7609
7610                 /* turn raid0 into single device chunks */
7611                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7612                         return stripped;
7613
7614                 /* turn mirroring into duplication */
7615                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7616                              BTRFS_BLOCK_GROUP_RAID10))
7617                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7618         } else {
7619                 /* they already had raid on here, just return */
7620                 if (flags & stripped)
7621                         return flags;
7622
7623                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7624                 stripped = flags & ~stripped;
7625
7626                 /* switch duplicated blocks with raid1 */
7627                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7628                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7629
7630                 /* this is drive concat, leave it alone */
7631         }
7632
7633         return flags;
7634 }
7635
7636 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7637 {
7638         struct btrfs_space_info *sinfo = cache->space_info;
7639         u64 num_bytes;
7640         u64 min_allocable_bytes;
7641         int ret = -ENOSPC;
7642
7643
7644         /*
7645          * We need some metadata space and system metadata space for
7646          * allocating chunks in some corner cases, so keep some spare
7647          * space unless we are forced to mark the group readonly.
7648          */
7649         if ((sinfo->flags &
7650              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7651             !force)
7652                 min_allocable_bytes = 1 * 1024 * 1024;
7653         else
7654                 min_allocable_bytes = 0;
7655
7656         spin_lock(&sinfo->lock);
7657         spin_lock(&cache->lock);
7658
7659         if (cache->ro) {
7660                 ret = 0;
7661                 goto out;
7662         }
7663
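        /*
         * num_bytes is the space in this block group that is not used,
         * reserved or pinned; it all becomes read-only, so only flip the
         * group if the rest of the space_info can still absorb it.
         */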
7664         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7665                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7666
7667         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7668             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7669             min_allocable_bytes <= sinfo->total_bytes) {
7670                 sinfo->bytes_readonly += num_bytes;
7671                 cache->ro = 1;
7672                 ret = 0;
7673         }
7674 out:
7675         spin_unlock(&cache->lock);
7676         spin_unlock(&sinfo->lock);
7677         return ret;
7678 }
7679
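/*
 * Mark a block group read-only.  If the space_info is currently too full to
 * move the group's unused space to read-only, force-allocate a new chunk and
 * retry once.
 */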
7680 int btrfs_set_block_group_ro(struct btrfs_root *root,
7681                              struct btrfs_block_group_cache *cache)
7682
7683 {
7684         struct btrfs_trans_handle *trans;
7685         u64 alloc_flags;
7686         int ret;
7687
7688         BUG_ON(cache->ro);
7689
7690         trans = btrfs_join_transaction(root);
7691         if (IS_ERR(trans))
7692                 return PTR_ERR(trans);
7693
7694         alloc_flags = update_block_group_flags(root, cache->flags);
7695         if (alloc_flags != cache->flags) {
7696                 ret = do_chunk_alloc(trans, root, alloc_flags,
7697                                      CHUNK_ALLOC_FORCE);
7698                 if (ret < 0)
7699                         goto out;
7700         }
7701
7702         ret = set_block_group_ro(cache, 0);
7703         if (!ret)
7704                 goto out;
7705         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7706         ret = do_chunk_alloc(trans, root, alloc_flags,
7707                              CHUNK_ALLOC_FORCE);
7708         if (ret < 0)
7709                 goto out;
7710         ret = set_block_group_ro(cache, 0);
7711 out:
7712         btrfs_end_transaction(trans, root);
7713         return ret;
7714 }
7715
7716 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7717                             struct btrfs_root *root, u64 type)
7718 {
7719         u64 alloc_flags = get_alloc_profile(root, type);
7720         return do_chunk_alloc(trans, root, alloc_flags,
7721                               CHUNK_ALLOC_FORCE);
7722 }
7723
7724 /*
7725  * helper to account the unused space of all the readonly block groups in the
7726  * list. takes mirrors into account.
7727  */
7728 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7729 {
7730         struct btrfs_block_group_cache *block_group;
7731         u64 free_bytes = 0;
7732         int factor;
7733
7734         list_for_each_entry(block_group, groups_list, list) {
7735                 spin_lock(&block_group->lock);
7736
7737                 if (!block_group->ro) {
7738                         spin_unlock(&block_group->lock);
7739                         continue;
7740                 }
7741
7742                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7743                                           BTRFS_BLOCK_GROUP_RAID10 |
7744                                           BTRFS_BLOCK_GROUP_DUP))
7745                         factor = 2;
7746                 else
7747                         factor = 1;
7748
7749                 free_bytes += (block_group->key.offset -
7750                                btrfs_block_group_used(&block_group->item)) *
7751                                factor;
7752
7753                 spin_unlock(&block_group->lock);
7754         }
7755
7756         return free_bytes;
7757 }
7758
7759 /*
7760  * helper to account the unused space of all the readonly block groups in the
7761  * space_info. takes mirrors into account.
7762  */
7763 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7764 {
7765         int i;
7766         u64 free_bytes = 0;
7767
7768         spin_lock(&sinfo->lock);
7769
7770         for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7771                 if (!list_empty(&sinfo->block_groups[i]))
7772                         free_bytes += __btrfs_get_ro_block_group_free_space(
7773                                                 &sinfo->block_groups[i]);
7774
7775         spin_unlock(&sinfo->lock);
7776
7777         return free_bytes;
7778 }
7779
7780 void btrfs_set_block_group_rw(struct btrfs_root *root,
7781                               struct btrfs_block_group_cache *cache)
7782 {
7783         struct btrfs_space_info *sinfo = cache->space_info;
7784         u64 num_bytes;
7785
7786         BUG_ON(!cache->ro);
7787
7788         spin_lock(&sinfo->lock);
7789         spin_lock(&cache->lock);
7790         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7791                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7792         sinfo->bytes_readonly -= num_bytes;
7793         cache->ro = 0;
7794         spin_unlock(&cache->lock);
7795         spin_unlock(&sinfo->lock);
7796 }
7797
7798 /*
7799  * checks to see if it's even possible to relocate this block group.
7800  *
7801  * @return - -1 if it's not a good idea to relocate this block group, 0 if it's
7802  * ok to go ahead and try.
7803  */
7804 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7805 {
7806         struct btrfs_block_group_cache *block_group;
7807         struct btrfs_space_info *space_info;
7808         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7809         struct btrfs_device *device;
7810         u64 min_free;
7811         u64 dev_min = 1;
7812         u64 dev_nr = 0;
7813         u64 target;
7814         int index;
7815         int full = 0;
7816         int ret = 0;
7817
7818         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7819
7820         /* odd, couldn't find the block group, leave it alone */
7821         if (!block_group)
7822                 return -1;
7823
7824         min_free = btrfs_block_group_used(&block_group->item);
7825
7826         /* no bytes used, we're good */
7827         if (!min_free)
7828                 goto out;
7829
7830         space_info = block_group->space_info;
7831         spin_lock(&space_info->lock);
7832
7833         full = space_info->full;
7834
7835         /*
7836          * if this is the last block group we have in this space, we can't
7837          * relocate it unless we're able to allocate a new chunk below.
7838          *
7839          * Otherwise, we need to make sure we have room in the space to handle
7840          * all of the extents from this block group.  If we can, we're good
7841          */
7842         if ((space_info->total_bytes != block_group->key.offset) &&
7843             (space_info->bytes_used + space_info->bytes_reserved +
7844              space_info->bytes_pinned + space_info->bytes_readonly +
7845              min_free < space_info->total_bytes)) {
7846                 spin_unlock(&space_info->lock);
7847                 goto out;
7848         }
7849         spin_unlock(&space_info->lock);
7850
7851         /*
7852          * ok we don't have enough space, but maybe we have free space on our
7853          * devices to allocate new chunks for relocation, so loop through our
7854          * alloc devices and guess if we have enough space.  if this block
7855          * group is going to be restriped, run checks against the target
7856          * profile instead of the current one.
7857          */
7858         ret = -1;
7859
7860         /*
7861          * index:
7862          *      0: raid10
7863          *      1: raid1
7864          *      2: dup
7865          *      3: raid0
7866          *      4: single
7867          */
7868         target = get_restripe_target(root->fs_info, block_group->flags);
7869         if (target) {
7870                 index = __get_raid_index(extended_to_chunk(target));
7871         } else {
7872                 /*
7873                  * this is just a balance, so if we were marked as full
7874                  * we know there is no space for a new chunk
7875                  */
7876                 if (full)
7877                         goto out;
7878
7879                 index = get_block_group_index(block_group);
7880         }
7881
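        /*
         * Scale the space we need per device for the profile: raid10 needs
         * at least 4 devices with half the data on each stripe, dup doubles
         * the data on a single device, and raid0 spreads it over all
         * writable devices.
         */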
7882         if (index == BTRFS_RAID_RAID10) {
7883                 dev_min = 4;
7884                 /* Divide by 2 */
7885                 min_free >>= 1;
7886         } else if (index == BTRFS_RAID_RAID1) {
7887                 dev_min = 2;
7888         } else if (index == BTRFS_RAID_DUP) {
7889                 /* Multiply by 2 */
7890                 min_free <<= 1;
7891         } else if (index == BTRFS_RAID_RAID0) {
7892                 dev_min = fs_devices->rw_devices;
7893                 do_div(min_free, dev_min);
7894         }
7895
7896         mutex_lock(&root->fs_info->chunk_mutex);
7897         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7898                 u64 dev_offset;
7899
7900                 /*
7901                  * check to make sure we can actually find a chunk with enough
7902                  * space to fit our block group in.
7903                  */
7904                 if (device->total_bytes > device->bytes_used + min_free &&
7905                     !device->is_tgtdev_for_dev_replace) {
7906                         ret = find_free_dev_extent(device, min_free,
7907                                                    &dev_offset, NULL);
7908                         if (!ret)
7909                                 dev_nr++;
7910
7911                         if (dev_nr >= dev_min)
7912                                 break;
7913
7914                         ret = -1;
7915                 }
7916         }
7917         mutex_unlock(&root->fs_info->chunk_mutex);
7918 out:
7919         btrfs_put_block_group(block_group);
7920         return ret;
7921 }
7922
7923 static int find_first_block_group(struct btrfs_root *root,
7924                 struct btrfs_path *path, struct btrfs_key *key)
7925 {
7926         int ret = 0;
7927         struct btrfs_key found_key;
7928         struct extent_buffer *leaf;
7929         int slot;
7930
7931         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7932         if (ret < 0)
7933                 goto out;
7934
7935         while (1) {
7936                 slot = path->slots[0];
7937                 leaf = path->nodes[0];
7938                 if (slot >= btrfs_header_nritems(leaf)) {
7939                         ret = btrfs_next_leaf(root, path);
7940                         if (ret == 0)
7941                                 continue;
7942                         if (ret < 0)
7943                                 goto out;
7944                         break;
7945                 }
7946                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7947
7948                 if (found_key.objectid >= key->objectid &&
7949                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7950                         ret = 0;
7951                         goto out;
7952                 }
7953                 path->slots[0]++;
7954         }
7955 out:
7956         return ret;
7957 }
7958
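/*
 * Drop the inode reference each block group may hold on its free space cache
 * inode (block_group->iref) so those inodes can be evicted.
 */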
7959 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7960 {
7961         struct btrfs_block_group_cache *block_group;
7962         u64 last = 0;
7963
7964         while (1) {
7965                 struct inode *inode;
7966
7967                 block_group = btrfs_lookup_first_block_group(info, last);
7968                 while (block_group) {
7969                         spin_lock(&block_group->lock);
7970                         if (block_group->iref)
7971                                 break;
7972                         spin_unlock(&block_group->lock);
7973                         block_group = next_block_group(info->tree_root,
7974                                                        block_group);
7975                 }
7976                 if (!block_group) {
7977                         if (last == 0)
7978                                 break;
7979                         last = 0;
7980                         continue;
7981                 }
7982
7983                 inode = block_group->inode;
7984                 block_group->iref = 0;
7985                 block_group->inode = NULL;
7986                 spin_unlock(&block_group->lock);
7987                 iput(inode);
7988                 last = block_group->key.objectid + block_group->key.offset;
7989                 btrfs_put_block_group(block_group);
7990         }
7991 }
7992
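/*
 * Tear down all in-memory block group and space_info structures.  Block
 * groups whose caching threads are still running are waited for, and groups
 * that were never cached get their excluded extents freed.
 */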
7993 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7994 {
7995         struct btrfs_block_group_cache *block_group;
7996         struct btrfs_space_info *space_info;
7997         struct btrfs_caching_control *caching_ctl;
7998         struct rb_node *n;
7999
8000         down_write(&info->extent_commit_sem);
8001         while (!list_empty(&info->caching_block_groups)) {
8002                 caching_ctl = list_entry(info->caching_block_groups.next,
8003                                          struct btrfs_caching_control, list);
8004                 list_del(&caching_ctl->list);
8005                 put_caching_control(caching_ctl);
8006         }
8007         up_write(&info->extent_commit_sem);
8008
8009         spin_lock(&info->block_group_cache_lock);
8010         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
8011                 block_group = rb_entry(n, struct btrfs_block_group_cache,
8012                                        cache_node);
8013                 rb_erase(&block_group->cache_node,
8014                          &info->block_group_cache_tree);
8015                 spin_unlock(&info->block_group_cache_lock);
8016
8017                 down_write(&block_group->space_info->groups_sem);
8018                 list_del(&block_group->list);
8019                 up_write(&block_group->space_info->groups_sem);
8020
8021                 if (block_group->cached == BTRFS_CACHE_STARTED)
8022                         wait_block_group_cache_done(block_group);
8023
8024                 /*
8025                  * We haven't cached this block group, which means we could
8026                  * possibly have excluded extents on this block group.
8027                  */
8028                 if (block_group->cached == BTRFS_CACHE_NO)
8029                         free_excluded_extents(info->extent_root, block_group);
8030
8031                 btrfs_remove_free_space_cache(block_group);
8032                 btrfs_put_block_group(block_group);
8033
8034                 spin_lock(&info->block_group_cache_lock);
8035         }
8036         spin_unlock(&info->block_group_cache_lock);
8037
8038         /* now that all the block groups are freed, go through and
8039          * free all the space_info structs.  This is only called during
8040          * the final stages of unmount, and so we know nobody is
8041          * using them.  We call synchronize_rcu() once before we start,
8042          * just to be on the safe side.
8043          */
8044         synchronize_rcu();
8045
8046         release_global_block_rsv(info);
8047
8048         while (!list_empty(&info->space_info)) {
8049                 space_info = list_entry(info->space_info.next,
8050                                         struct btrfs_space_info,
8051                                         list);
8052                 if (btrfs_test_opt(info->tree_root, ENOSPC_DEBUG)) {
8053                         if (space_info->bytes_pinned > 0 ||
8054                             space_info->bytes_reserved > 0 ||
8055                             space_info->bytes_may_use > 0) {
8056                                 WARN_ON(1);
8057                                 dump_space_info(space_info, 0, 0);
8058                         }
8059                 }
8060                 list_del(&space_info->list);
8061                 kfree(space_info);
8062         }
8063         return 0;
8064 }
8065
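     /*
      * Add a block group to its space_info's list for the group's raid
      * profile, as selected by get_block_group_index().
      */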
8066 static void __link_block_group(struct btrfs_space_info *space_info,
8067                                struct btrfs_block_group_cache *cache)
8068 {
8069         int index = get_block_group_index(cache);
8070
8071         down_write(&space_info->groups_sem);
8072         list_add_tail(&cache->list, &space_info->block_groups[index]);
8073         up_write(&space_info->groups_sem);
8074 }
8075
8076 int btrfs_read_block_groups(struct btrfs_root *root)
8077 {
8078         struct btrfs_path *path;
8079         int ret;
8080         struct btrfs_block_group_cache *cache;
8081         struct btrfs_fs_info *info = root->fs_info;
8082         struct btrfs_space_info *space_info;
8083         struct btrfs_key key;
8084         struct btrfs_key found_key;
8085         struct extent_buffer *leaf;
8086         int need_clear = 0;
8087         u64 cache_gen;
8088
8089         root = info->extent_root;
8090         key.objectid = 0;
8091         key.offset = 0;
8092         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
8093         path = btrfs_alloc_path();
8094         if (!path)
8095                 return -ENOMEM;
8096         path->reada = 1;
8097
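             /*
              * If the space cache is enabled but the cache generation stored
              * in the super block does not match the filesystem generation,
              * the on-disk free space cache is stale, so every block group
              * gets marked to clear and rebuild it; the clear_cache mount
              * option forces the same.
              */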
8098         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
8099         if (btrfs_test_opt(root, SPACE_CACHE) &&
8100             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
8101                 need_clear = 1;
8102         if (btrfs_test_opt(root, CLEAR_CACHE))
8103                 need_clear = 1;
8104
8105         while (1) {
8106                 ret = find_first_block_group(root, path, &key);
8107                 if (ret > 0)
8108                         break;
8109                 if (ret != 0)
8110                         goto error;
8111                 leaf = path->nodes[0];
8112                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
8113                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
8114                 if (!cache) {
8115                         ret = -ENOMEM;
8116                         goto error;
8117                 }
8118                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8119                                                 GFP_NOFS);
8120                 if (!cache->free_space_ctl) {
8121                         kfree(cache);
8122                         ret = -ENOMEM;
8123                         goto error;
8124                 }
8125
8126                 atomic_set(&cache->count, 1);
8127                 spin_lock_init(&cache->lock);
8128                 cache->fs_info = info;
8129                 INIT_LIST_HEAD(&cache->list);
8130                 INIT_LIST_HEAD(&cache->cluster_list);
8131
8132                 if (need_clear) {
8133                         /*
8134                          * When we mount with an old space cache, we need
8135                          * to set BTRFS_DC_CLEAR and the dirty flag.
8136                          *
8137                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
8138                          *    truncate the old free space cache inode and
8139                          *    set up a new one.
8140                          * b) Setting the dirty flag makes sure that we
8141                          *    flush the new space cache info onto disk.
8142                          */
8143                         cache->disk_cache_state = BTRFS_DC_CLEAR;
8144                         if (btrfs_test_opt(root, SPACE_CACHE))
8145                                 cache->dirty = 1;
8146                 }
8147
8148                 read_extent_buffer(leaf, &cache->item,
8149                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
8150                                    sizeof(cache->item));
8151                 memcpy(&cache->key, &found_key, sizeof(found_key));
8152
8153                 key.objectid = found_key.objectid + found_key.offset;
8154                 btrfs_release_path(path);
8155                 cache->flags = btrfs_block_group_flags(&cache->item);
8156                 cache->sectorsize = root->sectorsize;
8157                 cache->full_stripe_len = btrfs_full_stripe_len(root,
8158                                                &root->fs_info->mapping_tree,
8159                                                found_key.objectid);
8160                 btrfs_init_free_space_ctl(cache);
8161
8162                 /*
8163                  * We need to exclude the super stripes now so that the space
8164                  * info has super bytes accounted for, otherwise we'll think
8165                  * we have more space than we actually do.
8166                  */
8167                 ret = exclude_super_stripes(root, cache);
8168                 if (ret) {
8169                         /*
8170                          * We may have excluded something, so call this just in
8171                          * case.
8172                          */
8173                         free_excluded_extents(root, cache);
8174                         kfree(cache->free_space_ctl);
8175                         kfree(cache);
8176                         goto error;
8177                 }
8178
8179                 /*
8180                  * Check for two cases: either we are full, and therefore
8181                  * don't need to bother with the caching work since we won't
8182                  * find any space; or we are empty, and we can just add all
8183                  * the space in and be done with it.  This saves us a lot of
8184                  * time, particularly in the full case.
8185                  */
8186                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
8187                         cache->last_byte_to_unpin = (u64)-1;
8188                         cache->cached = BTRFS_CACHE_FINISHED;
8189                         free_excluded_extents(root, cache);
8190                 } else if (btrfs_block_group_used(&cache->item) == 0) {
8191                         cache->last_byte_to_unpin = (u64)-1;
8192                         cache->cached = BTRFS_CACHE_FINISHED;
8193                         add_new_free_space(cache, root->fs_info,
8194                                            found_key.objectid,
8195                                            found_key.objectid +
8196                                            found_key.offset);
8197                         free_excluded_extents(root, cache);
8198                 }
8199
8200                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
8201                 if (ret) {
8202                         btrfs_remove_free_space_cache(cache);
8203                         btrfs_put_block_group(cache);
8204                         goto error;
8205                 }
8206
8207                 ret = update_space_info(info, cache->flags, found_key.offset,
8208                                         btrfs_block_group_used(&cache->item),
8209                                         &space_info);
8210                 if (ret) {
8211                         btrfs_remove_free_space_cache(cache);
8212                         spin_lock(&info->block_group_cache_lock);
8213                         rb_erase(&cache->cache_node,
8214                                  &info->block_group_cache_tree);
8215                         spin_unlock(&info->block_group_cache_lock);
8216                         btrfs_put_block_group(cache);
8217                         goto error;
8218                 }
8219
8220                 cache->space_info = space_info;
8221                 spin_lock(&cache->space_info->lock);
8222                 cache->space_info->bytes_readonly += cache->bytes_super;
8223                 spin_unlock(&cache->space_info->lock);
8224
8225                 __link_block_group(space_info, cache);
8226
8227                 set_avail_alloc_bits(root->fs_info, cache->flags);
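                     /*
                      * Chunks that are not fully writeable (for instance when
                      * a backing device is missing) are marked read-only so
                      * nothing is allocated from them.
                      */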
8228                 if (btrfs_chunk_readonly(root, cache->key.objectid))
8229                         set_block_group_ro(cache, 1);
8230         }
8231
8232         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
8233                 if (!(get_alloc_profile(root, space_info->flags) &
8234                       (BTRFS_BLOCK_GROUP_RAID10 |
8235                        BTRFS_BLOCK_GROUP_RAID1 |
8236                        BTRFS_BLOCK_GROUP_RAID5 |
8237                        BTRFS_BLOCK_GROUP_RAID6 |
8238                        BTRFS_BLOCK_GROUP_DUP)))
8239                         continue;
8240                 /*
8241                  * Avoid allocating from un-mirrored block groups if there
8242                  * are mirrored block groups.
8243                  */
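                     /*
                      * Indices 3 and 4 below are assumed to correspond to the
                      * RAID0 and SINGLE lists respectively, following the
                      * order get_block_group_index() uses for block_groups[].
                      */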
8244                 list_for_each_entry(cache, &space_info->block_groups[3], list)
8245                         set_block_group_ro(cache, 1);
8246                 list_for_each_entry(cache, &space_info->block_groups[4], list)
8247                         set_block_group_ro(cache, 1);
8248         }
8249
8250         init_global_block_rsv(info);
8251         ret = 0;
8252 error:
8253         btrfs_free_path(path);
8254         return ret;
8255 }
8256
8257 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
8258                                        struct btrfs_root *root)
8259 {
8260         struct btrfs_block_group_cache *block_group, *tmp;
8261         struct btrfs_root *extent_root = root->fs_info->extent_root;
8262         struct btrfs_block_group_item item;
8263         struct btrfs_key key;
8264         int ret = 0;
8265
8266         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
8267                                  new_bg_list) {
8268                 list_del_init(&block_group->new_bg_list);
8269
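                     /*
                      * After the first failed insertion keep draining the
                      * list but skip further inserts; the transaction was
                      * already aborted below.
                      */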
8270                 if (ret)
8271                         continue;
8272
8273                 spin_lock(&block_group->lock);
8274                 memcpy(&item, &block_group->item, sizeof(item));
8275                 memcpy(&key, &block_group->key, sizeof(key));
8276                 spin_unlock(&block_group->lock);
8277
8278                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
8279                                         sizeof(item));
8280                 if (ret)
8281                         btrfs_abort_transaction(trans, extent_root, ret);
8282         }
8283 }
8284
8285 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
8286                            struct btrfs_root *root, u64 bytes_used,
8287                            u64 type, u64 chunk_objectid, u64 chunk_offset,
8288                            u64 size)
8289 {
8290         int ret;
8291         struct btrfs_root *extent_root;
8292         struct btrfs_block_group_cache *cache;
8293
8294         extent_root = root->fs_info->extent_root;
8295
8296         root->fs_info->last_trans_log_full_commit = trans->transid;
8297
8298         cache = kzalloc(sizeof(*cache), GFP_NOFS);
8299         if (!cache)
8300                 return -ENOMEM;
8301         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
8302                                         GFP_NOFS);
8303         if (!cache->free_space_ctl) {
8304                 kfree(cache);
8305                 return -ENOMEM;
8306         }
8307
8308         cache->key.objectid = chunk_offset;
8309         cache->key.offset = size;
8310         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
8311         cache->sectorsize = root->sectorsize;
8312         cache->fs_info = root->fs_info;
8313         cache->full_stripe_len = btrfs_full_stripe_len(root,
8314                                                &root->fs_info->mapping_tree,
8315                                                chunk_offset);
8316
8317         atomic_set(&cache->count, 1);
8318         spin_lock_init(&cache->lock);
8319         INIT_LIST_HEAD(&cache->list);
8320         INIT_LIST_HEAD(&cache->cluster_list);
8321         INIT_LIST_HEAD(&cache->new_bg_list);
8322
8323         btrfs_init_free_space_ctl(cache);
8324
8325         btrfs_set_block_group_used(&cache->item, bytes_used);
8326         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
8327         cache->flags = type;
8328         btrfs_set_block_group_flags(&cache->item, type);
8329
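             /*
              * A newly created block group has nothing allocated in it yet,
              * so there is nothing to cache: mark caching finished up front
              * and add the whole range as free space once the super stripes
              * have been excluded.
              */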
8330         cache->last_byte_to_unpin = (u64)-1;
8331         cache->cached = BTRFS_CACHE_FINISHED;
8332         ret = exclude_super_stripes(root, cache);
8333         if (ret) {
8334                 /*
8335                  * We may have excluded something, so call this just in
8336                  * case.
8337                  */
8338                 free_excluded_extents(root, cache);
8339                 kfree(cache->free_space_ctl);
8340                 kfree(cache);
8341                 return ret;
8342         }
8343
8344         add_new_free_space(cache, root->fs_info, chunk_offset,
8345                            chunk_offset + size);
8346
8347         free_excluded_extents(root, cache);
8348
8349         ret = btrfs_add_block_group_cache(root->fs_info, cache);
8350         if (ret) {
8351                 btrfs_remove_free_space_cache(cache);
8352                 btrfs_put_block_group(cache);
8353                 return ret;
8354         }
8355
8356         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
8357                                 &cache->space_info);
8358         if (ret) {
8359                 btrfs_remove_free_space_cache(cache);
8360                 spin_lock(&root->fs_info->block_group_cache_lock);
8361                 rb_erase(&cache->cache_node,
8362                          &root->fs_info->block_group_cache_tree);
8363                 spin_unlock(&root->fs_info->block_group_cache_lock);
8364                 btrfs_put_block_group(cache);
8365                 return ret;
8366         }
8367         update_global_block_rsv(root->fs_info);
8368
8369         spin_lock(&cache->space_info->lock);
8370         cache->space_info->bytes_readonly += cache->bytes_super;
8371         spin_unlock(&cache->space_info->lock);
8372
8373         __link_block_group(cache->space_info, cache);
8374
8375         list_add_tail(&cache->new_bg_list, &trans->new_bgs);
8376
8377         set_avail_alloc_bits(extent_root->fs_info, type);
8378
8379         return 0;
8380 }
8381
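     /*
      * Clear the extended profile bits for @flags from the per-type
      * avail_*_alloc_bits masks; used when the last block group of a profile
      * goes away.
      */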
8382 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
8383 {
8384         u64 extra_flags = chunk_to_extended(flags) &
8385                                 BTRFS_EXTENDED_PROFILE_MASK;
8386
8387         write_seqlock(&fs_info->profiles_lock);
8388         if (flags & BTRFS_BLOCK_GROUP_DATA)
8389                 fs_info->avail_data_alloc_bits &= ~extra_flags;
8390         if (flags & BTRFS_BLOCK_GROUP_METADATA)
8391                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8392         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8393                 fs_info->avail_system_alloc_bits &= ~extra_flags;
8394         write_sequnlock(&fs_info->profiles_lock);
8395 }
8396
8397 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8398                              struct btrfs_root *root, u64 group_start)
8399 {
8400         struct btrfs_path *path;
8401         struct btrfs_block_group_cache *block_group;
8402         struct btrfs_free_cluster *cluster;
8403         struct btrfs_root *tree_root = root->fs_info->tree_root;
8404         struct btrfs_key key;
8405         struct inode *inode;
8406         int ret;
8407         int index;
8408         int factor;
8409
8410         root = root->fs_info->extent_root;
8411
8412         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8413         BUG_ON(!block_group);
8414         BUG_ON(!block_group->ro);
8415
8416         /*
8417          * Free the reserved super bytes from this block group before
8418          * removing it.
8419          */
8420         free_excluded_extents(root, block_group);
8421
8422         memcpy(&key, &block_group->key, sizeof(key));
8423         index = get_block_group_index(block_group);
8424         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8425                                   BTRFS_BLOCK_GROUP_RAID1 |
8426                                   BTRFS_BLOCK_GROUP_RAID10))
8427                 factor = 2;
8428         else
8429                 factor = 1;
8430
8431         /* make sure this block group isn't part of an allocation cluster */
8432         cluster = &root->fs_info->data_alloc_cluster;
8433         spin_lock(&cluster->refill_lock);
8434         btrfs_return_cluster_to_free_space(block_group, cluster);
8435         spin_unlock(&cluster->refill_lock);
8436
8437         /*
8438          * make sure this block group isn't part of a metadata
8439          * allocation cluster
8440          */
8441         cluster = &root->fs_info->meta_alloc_cluster;
8442         spin_lock(&cluster->refill_lock);
8443         btrfs_return_cluster_to_free_space(block_group, cluster);
8444         spin_unlock(&cluster->refill_lock);
8445
8446         path = btrfs_alloc_path();
8447         if (!path) {
8448                 ret = -ENOMEM;
8449                 goto out;
8450         }
8451
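             /*
              * Drop the free space cache inode for this block group: put it
              * on the orphan list so its items get cleaned up, then release
              * both the reference cached on the block group and our lookup
              * reference.
              */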
8452         inode = lookup_free_space_inode(tree_root, block_group, path);
8453         if (!IS_ERR(inode)) {
8454                 ret = btrfs_orphan_add(trans, inode);
8455                 if (ret) {
8456                         btrfs_add_delayed_iput(inode);
8457                         goto out;
8458                 }
8459                 clear_nlink(inode);
8460                 /* One for the block group's ref */
8461                 spin_lock(&block_group->lock);
8462                 if (block_group->iref) {
8463                         block_group->iref = 0;
8464                         block_group->inode = NULL;
8465                         spin_unlock(&block_group->lock);
8466                         iput(inode);
8467                 } else {
8468                         spin_unlock(&block_group->lock);
8469                 }
8470                 /* One for our lookup ref */
8471                 btrfs_add_delayed_iput(inode);
8472         }
8473
8474         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8475         key.offset = block_group->key.objectid;
8476         key.type = 0;
8477
8478         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8479         if (ret < 0)
8480                 goto out;
8481         if (ret > 0)
8482                 btrfs_release_path(path);
8483         if (ret == 0) {
8484                 ret = btrfs_del_item(trans, tree_root, path);
8485                 if (ret)
8486                         goto out;
8487                 btrfs_release_path(path);
8488         }
8489
8490         spin_lock(&root->fs_info->block_group_cache_lock);
8491         rb_erase(&block_group->cache_node,
8492                  &root->fs_info->block_group_cache_tree);
8493
8494         if (root->fs_info->first_logical_byte == block_group->key.objectid)
8495                 root->fs_info->first_logical_byte = (u64)-1;
8496         spin_unlock(&root->fs_info->block_group_cache_lock);
8497
8498         down_write(&block_group->space_info->groups_sem);
8499         /*
8500          * We must use list_del_init so people can check to see if they
8501          * are still on the list after taking the semaphore.
8502          */
8503         list_del_init(&block_group->list);
8504         if (list_empty(&block_group->space_info->block_groups[index]))
8505                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8506         up_write(&block_group->space_info->groups_sem);
8507
8508         if (block_group->cached == BTRFS_CACHE_STARTED)
8509                 wait_block_group_cache_done(block_group);
8510
8511         btrfs_remove_free_space_cache(block_group);
8512
8513         spin_lock(&block_group->space_info->lock);
8514         block_group->space_info->total_bytes -= block_group->key.offset;
8515         block_group->space_info->bytes_readonly -= block_group->key.offset;
8516         block_group->space_info->disk_total -= block_group->key.offset * factor;
8517         spin_unlock(&block_group->space_info->lock);
8518
8519         memcpy(&key, &block_group->key, sizeof(key));
8520
8521         btrfs_clear_space_info_full(root->fs_info);
8522
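             /* One put for our lookup ref, one for the cache tree's ref. */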
8523         btrfs_put_block_group(block_group);
8524         btrfs_put_block_group(block_group);
8525
8526         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8527         if (ret > 0)
8528                 ret = -EIO;
8529         if (ret < 0)
8530                 goto out;
8531
8532         ret = btrfs_del_item(trans, root, path);
8533 out:
8534         btrfs_free_path(path);
8535         return ret;
8536 }
8537
8538 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8539 {
8540         struct btrfs_space_info *space_info;
8541         struct btrfs_super_block *disk_super;
8542         u64 features;
8543         u64 flags;
8544         int mixed = 0;
8545         int ret;
8546
8547         disk_super = fs_info->super_copy;
8548         if (!btrfs_super_root(disk_super))
8549                 return 1;
8550
8551         features = btrfs_super_incompat_flags(disk_super);
8552         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8553                 mixed = 1;
8554
8555         flags = BTRFS_BLOCK_GROUP_SYSTEM;
8556         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8557         if (ret)
8558                 goto out;
8559
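             /*
              * With the mixed block groups incompat feature, data and
              * metadata share the same block groups, so a single combined
              * space_info covers both; otherwise set up separate metadata
              * and data space_infos.
              */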
8560         if (mixed) {
8561                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8562                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8563         } else {
8564                 flags = BTRFS_BLOCK_GROUP_METADATA;
8565                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8566                 if (ret)
8567                         goto out;
8568
8569                 flags = BTRFS_BLOCK_GROUP_DATA;
8570                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8571         }
8572 out:
8573         return ret;
8574 }
8575
8576 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8577 {
8578         return unpin_extent_range(root, start, end);
8579 }
8580
8581 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8582                                u64 num_bytes, u64 *actual_bytes)
8583 {
8584         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8585 }
8586
8587 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8588 {
8589         struct btrfs_fs_info *fs_info = root->fs_info;
8590         struct btrfs_block_group_cache *cache = NULL;
8591         u64 group_trimmed;
8592         u64 start;
8593         u64 end;
8594         u64 trimmed = 0;
8595         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8596         int ret = 0;
8597
8598         /*
8599          * Try to trim all FS space; our first block group may start from a non-zero offset.
8600          */
8601         if (range->len == total_bytes)
8602                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
8603         else
8604                 cache = btrfs_lookup_block_group(fs_info, range->start);
8605
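             /*
              * Walk the block groups that overlap the requested range, make
              * sure each one has its free space cached, and trim its free
              * extents, accumulating the total number of bytes discarded.
              */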
8606         while (cache) {
8607                 if (cache->key.objectid >= (range->start + range->len)) {
8608                         btrfs_put_block_group(cache);
8609                         break;
8610                 }
8611
8612                 start = max(range->start, cache->key.objectid);
8613                 end = min(range->start + range->len,
8614                                 cache->key.objectid + cache->key.offset);
8615
8616                 if (end - start >= range->minlen) {
8617                         if (!block_group_cache_done(cache)) {
8618                                 ret = cache_block_group(cache, 0);
8619                                 if (!ret)
8620                                         wait_block_group_cache_done(cache);
8621                         }
8622                         ret = btrfs_trim_block_group(cache,
8623                                                      &group_trimmed,
8624                                                      start,
8625                                                      end,
8626                                                      range->minlen);
8627
8628                         trimmed += group_trimmed;
8629                         if (ret) {
8630                                 btrfs_put_block_group(cache);
8631                                 break;
8632                         }
8633                 }
8634
8635                 cache = next_block_group(fs_info->tree_root, cache);
8636         }
8637
8638         range->len = trimmed;
8639         return ret;
8640 }