fs/btrfs/extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "math.h"
37
38 #undef SCRAMBLE_DELAYED_REFS
39
40 /*
41  * control flags for do_chunk_alloc's force field
42  * CHUNK_ALLOC_NO_FORCE means to only allocate a chunk
43  * if we really need one.
44  *
45  * CHUNK_ALLOC_LIMITED means to only try to allocate one
46  * if we have very few chunks already allocated.  This is
47  * used as part of the clustering code to help make sure
48  * we have a good pool of storage to cluster in, without
49  * filling the FS with empty chunks
50  *
51  * CHUNK_ALLOC_FORCE means it must try to allocate one
52  *
53  */
54 enum {
55         CHUNK_ALLOC_NO_FORCE = 0,
56         CHUNK_ALLOC_LIMITED = 1,
57         CHUNK_ALLOC_FORCE = 2,
58 };
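
As a rough, stand-alone illustration of how these three force levels are meant to behave (this is not the kernel's actual chunk-allocation policy; the helper name, the 2 GiB "still tiny" cutoff and the 80% fullness threshold below are hypothetical):

        /* Hypothetical decision helper, not the kernel's should_alloc_chunk(). */
        enum {
                SKETCH_CHUNK_ALLOC_NO_FORCE = 0,
                SKETCH_CHUNK_ALLOC_LIMITED = 1,
                SKETCH_CHUNK_ALLOC_FORCE = 2,
        };

        static int should_alloc_chunk_sketch(unsigned long long total_allocated,
                                             unsigned long long bytes_used, int force)
        {
                if (force == SKETCH_CHUNK_ALLOC_FORCE)
                        return 1;                               /* always allocate */
                if (force == SKETCH_CHUNK_ALLOC_LIMITED)        /* only while the pool is tiny */
                        return total_allocated < (2ULL << 30);
                /* NO_FORCE: only once the existing chunks are mostly full */
                return bytes_used * 100 >= total_allocated * 80;
        }
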
59
60 /*
61  * Control how reservations are dealt with.
62  *
63  * RESERVE_FREE - freeing a reservation.
64  * RESERVE_ALLOC - allocating space and we need to update bytes_may_use for
65  *   ENOSPC accounting
66  * RESERVE_ALLOC_NO_ACCOUNT - allocating space and we should not update
67  *   bytes_may_use as the ENOSPC accounting is done elsewhere
68  */
69 enum {
70         RESERVE_FREE = 0,
71         RESERVE_ALLOC = 1,
72         RESERVE_ALLOC_NO_ACCOUNT = 2,
73 };
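
A compressed sketch of what the three modes imply for the accounting described above; the struct and helper here are simplified stand-ins, not the real btrfs_space_info or btrfs_update_reserved_bytes():

        struct space_info_sketch {
                unsigned long long bytes_may_use;       /* speculative ENOSPC reservations */
                unsigned long long bytes_reserved;      /* bytes backing in-flight allocations */
        };

        /* Hypothetical helper mirroring the bookkeeping described above. */
        static void update_reserved_sketch(struct space_info_sketch *si,
                                           unsigned long long bytes, int reserve)
        {
                if (reserve == 0 /* RESERVE_FREE */) {
                        si->bytes_reserved -= bytes;    /* drop a previously made reservation */
                } else {
                        si->bytes_reserved += bytes;    /* the space is now really reserved */
                        if (reserve == 1 /* RESERVE_ALLOC */)
                                si->bytes_may_use -= bytes;     /* ENOSPC accounting updated here */
                        /* RESERVE_ALLOC_NO_ACCOUNT: bytes_may_use is adjusted elsewhere */
                }
        }
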
74
75 static int update_block_group(struct btrfs_trans_handle *trans,
76                               struct btrfs_root *root,
77                               u64 bytenr, u64 num_bytes, int alloc);
78 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
79                                 struct btrfs_root *root,
80                                 u64 bytenr, u64 num_bytes, u64 parent,
81                                 u64 root_objectid, u64 owner_objectid,
82                                 u64 owner_offset, int refs_to_drop,
83                                 struct btrfs_delayed_extent_op *extra_op);
84 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
85                                     struct extent_buffer *leaf,
86                                     struct btrfs_extent_item *ei);
87 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
88                                       struct btrfs_root *root,
89                                       u64 parent, u64 root_objectid,
90                                       u64 flags, u64 owner, u64 offset,
91                                       struct btrfs_key *ins, int ref_mod);
92 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
93                                      struct btrfs_root *root,
94                                      u64 parent, u64 root_objectid,
95                                      u64 flags, struct btrfs_disk_key *key,
96                                      int level, struct btrfs_key *ins);
97 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
98                           struct btrfs_root *extent_root, u64 flags,
99                           int force);
100 static int find_next_key(struct btrfs_path *path, int level,
101                          struct btrfs_key *key);
102 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
103                             int dump_block_groups);
104 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
105                                        u64 num_bytes, int reserve);
106
107 static noinline int
108 block_group_cache_done(struct btrfs_block_group_cache *cache)
109 {
110         smp_mb();
111         return cache->cached == BTRFS_CACHE_FINISHED;
112 }
113
114 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
115 {
116         return (cache->flags & bits) == bits;
117 }
118
119 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
120 {
121         atomic_inc(&cache->count);
122 }
123
124 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
125 {
126         if (atomic_dec_and_test(&cache->count)) {
127                 WARN_ON(cache->pinned > 0);
128                 WARN_ON(cache->reserved > 0);
129                 kfree(cache->free_space_ctl);
130                 kfree(cache);
131         }
132 }
133
134 /*
135  * this adds the block group to the fs_info rb tree for the block group
136  * cache
137  */
138 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
139                                 struct btrfs_block_group_cache *block_group)
140 {
141         struct rb_node **p;
142         struct rb_node *parent = NULL;
143         struct btrfs_block_group_cache *cache;
144
145         spin_lock(&info->block_group_cache_lock);
146         p = &info->block_group_cache_tree.rb_node;
147
148         while (*p) {
149                 parent = *p;
150                 cache = rb_entry(parent, struct btrfs_block_group_cache,
151                                  cache_node);
152                 if (block_group->key.objectid < cache->key.objectid) {
153                         p = &(*p)->rb_left;
154                 } else if (block_group->key.objectid > cache->key.objectid) {
155                         p = &(*p)->rb_right;
156                 } else {
157                         spin_unlock(&info->block_group_cache_lock);
158                         return -EEXIST;
159                 }
160         }
161
162         rb_link_node(&block_group->cache_node, parent, p);
163         rb_insert_color(&block_group->cache_node,
164                         &info->block_group_cache_tree);
165         spin_unlock(&info->block_group_cache_lock);
166
167         return 0;
168 }
169
170 /*
171  * This will return the block group at or after bytenr if contains is 0, else
172  * it will return the block group that contains the bytenr
173  */
174 static struct btrfs_block_group_cache *
175 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
176                               int contains)
177 {
178         struct btrfs_block_group_cache *cache, *ret = NULL;
179         struct rb_node *n;
180         u64 end, start;
181
182         spin_lock(&info->block_group_cache_lock);
183         n = info->block_group_cache_tree.rb_node;
184
185         while (n) {
186                 cache = rb_entry(n, struct btrfs_block_group_cache,
187                                  cache_node);
188                 end = cache->key.objectid + cache->key.offset - 1;
189                 start = cache->key.objectid;
190
191                 if (bytenr < start) {
192                         if (!contains && (!ret || start < ret->key.objectid))
193                                 ret = cache;
194                         n = n->rb_left;
195                 } else if (bytenr > start) {
196                         if (contains && bytenr <= end) {
197                                 ret = cache;
198                                 break;
199                         }
200                         n = n->rb_right;
201                 } else {
202                         ret = cache;
203                         break;
204                 }
205         }
206         if (ret)
207                 btrfs_get_block_group(ret);
208         spin_unlock(&info->block_group_cache_lock);
209
210         return ret;
211 }
212
213 static int add_excluded_extent(struct btrfs_root *root,
214                                u64 start, u64 num_bytes)
215 {
216         u64 end = start + num_bytes - 1;
217         set_extent_bits(&root->fs_info->freed_extents[0],
218                         start, end, EXTENT_UPTODATE, GFP_NOFS);
219         set_extent_bits(&root->fs_info->freed_extents[1],
220                         start, end, EXTENT_UPTODATE, GFP_NOFS);
221         return 0;
222 }
223
224 static void free_excluded_extents(struct btrfs_root *root,
225                                   struct btrfs_block_group_cache *cache)
226 {
227         u64 start, end;
228
229         start = cache->key.objectid;
230         end = start + cache->key.offset - 1;
231
232         clear_extent_bits(&root->fs_info->freed_extents[0],
233                           start, end, EXTENT_UPTODATE, GFP_NOFS);
234         clear_extent_bits(&root->fs_info->freed_extents[1],
235                           start, end, EXTENT_UPTODATE, GFP_NOFS);
236 }
237
238 static int exclude_super_stripes(struct btrfs_root *root,
239                                  struct btrfs_block_group_cache *cache)
240 {
241         u64 bytenr;
242         u64 *logical;
243         int stripe_len;
244         int i, nr, ret;
245
246         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
247                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
248                 cache->bytes_super += stripe_len;
249                 ret = add_excluded_extent(root, cache->key.objectid,
250                                           stripe_len);
251                 BUG_ON(ret); /* -ENOMEM */
252         }
253
254         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
255                 bytenr = btrfs_sb_offset(i);
256                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
257                                        cache->key.objectid, bytenr,
258                                        0, &logical, &nr, &stripe_len);
259                 BUG_ON(ret); /* -ENOMEM */
260
261                 while (nr--) {
262                         cache->bytes_super += stripe_len;
263                         ret = add_excluded_extent(root, logical[nr],
264                                                   stripe_len);
265                         BUG_ON(ret); /* -ENOMEM */
266                 }
267
268                 kfree(logical);
269         }
270         return 0;
271 }
272
273 static struct btrfs_caching_control *
274 get_caching_control(struct btrfs_block_group_cache *cache)
275 {
276         struct btrfs_caching_control *ctl;
277
278         spin_lock(&cache->lock);
279         if (cache->cached != BTRFS_CACHE_STARTED) {
280                 spin_unlock(&cache->lock);
281                 return NULL;
282         }
283
284         /* We're loading it the fast way, so we don't have a caching_ctl. */
285         if (!cache->caching_ctl) {
286                 spin_unlock(&cache->lock);
287                 return NULL;
288         }
289
290         ctl = cache->caching_ctl;
291         atomic_inc(&ctl->count);
292         spin_unlock(&cache->lock);
293         return ctl;
294 }
295
296 static void put_caching_control(struct btrfs_caching_control *ctl)
297 {
298         if (atomic_dec_and_test(&ctl->count))
299                 kfree(ctl);
300 }
301
302 /*
303  * this is only called by cache_block_group.  Since we could have freed extents,
304  * we need to check the pinned_extents for any extents that can't be used yet,
305  * because their free space will be released as soon as the transaction commits.
306  */
307 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
308                               struct btrfs_fs_info *info, u64 start, u64 end)
309 {
310         u64 extent_start, extent_end, size, total_added = 0;
311         int ret;
312
313         while (start < end) {
314                 ret = find_first_extent_bit(info->pinned_extents, start,
315                                             &extent_start, &extent_end,
316                                             EXTENT_DIRTY | EXTENT_UPTODATE,
317                                             NULL);
318                 if (ret)
319                         break;
320
321                 if (extent_start <= start) {
322                         start = extent_end + 1;
323                 } else if (extent_start > start && extent_start < end) {
324                         size = extent_start - start;
325                         total_added += size;
326                         ret = btrfs_add_free_space(block_group, start,
327                                                    size);
328                         BUG_ON(ret); /* -ENOMEM or logic error */
329                         start = extent_end + 1;
330                 } else {
331                         break;
332                 }
333         }
334
335         if (start < end) {
336                 size = end - start;
337                 total_added += size;
338                 ret = btrfs_add_free_space(block_group, start, size);
339                 BUG_ON(ret); /* -ENOMEM or logic error */
340         }
341
342         return total_added;
343 }
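
Concretely, assuming a block group spanning [0, 100) with a single pinned extent covering [40, 60), the loop above adds [0, 40) and the final check adds [60, 100), so 80 bytes of free space are recorded. The same gap-walking pattern, reduced to a stand-alone sketch over a sorted array of pinned ranges (hypothetical types, not the extent_io bit API used above; ranges here use an exclusive end rather than the inclusive extent_end):

        struct pinned_range { unsigned long long start, end; };

        /* Sum the free gaps in [start, end) that are not covered by pinned ranges. */
        static unsigned long long add_free_gaps_sketch(const struct pinned_range *p,
                                                       int nr, unsigned long long start,
                                                       unsigned long long end)
        {
                unsigned long long total = 0;
                int i;

                for (i = 0; i < nr && start < end; i++) {
                        if (p[i].start <= start) {
                                start = p[i].end;               /* skip the pinned extent */
                        } else if (p[i].start < end) {
                                total += p[i].start - start;    /* free gap before it */
                                start = p[i].end;
                        } else {
                                break;
                        }
                }
                if (start < end)
                        total += end - start;                   /* tail after the last pinned extent */
                return total;
        }
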
344
345 static noinline void caching_thread(struct btrfs_work *work)
346 {
347         struct btrfs_block_group_cache *block_group;
348         struct btrfs_fs_info *fs_info;
349         struct btrfs_caching_control *caching_ctl;
350         struct btrfs_root *extent_root;
351         struct btrfs_path *path;
352         struct extent_buffer *leaf;
353         struct btrfs_key key;
354         u64 total_found = 0;
355         u64 last = 0;
356         u32 nritems;
357         int ret = 0;
358
359         caching_ctl = container_of(work, struct btrfs_caching_control, work);
360         block_group = caching_ctl->block_group;
361         fs_info = block_group->fs_info;
362         extent_root = fs_info->extent_root;
363
364         path = btrfs_alloc_path();
365         if (!path)
366                 goto out;
367
368         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
369
370         /*
371          * We don't want to deadlock with somebody trying to allocate a new
372          * extent for the extent root while also trying to search the extent
373          * root to add free space.  So we skip locking and search the commit
374          * root, since it's read-only.
375          */
376         path->skip_locking = 1;
377         path->search_commit_root = 1;
378         path->reada = 1;
379
380         key.objectid = last;
381         key.offset = 0;
382         key.type = BTRFS_EXTENT_ITEM_KEY;
383 again:
384         mutex_lock(&caching_ctl->mutex);
385         /* need to make sure the commit_root doesn't disappear */
386         down_read(&fs_info->extent_commit_sem);
387
388         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
389         if (ret < 0)
390                 goto err;
391
392         leaf = path->nodes[0];
393         nritems = btrfs_header_nritems(leaf);
394
395         while (1) {
396                 if (btrfs_fs_closing(fs_info) > 1) {
397                         last = (u64)-1;
398                         break;
399                 }
400
401                 if (path->slots[0] < nritems) {
402                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
403                 } else {
404                         ret = find_next_key(path, 0, &key);
405                         if (ret)
406                                 break;
407
408                         if (need_resched() ||
409                             btrfs_next_leaf(extent_root, path)) {
410                                 caching_ctl->progress = last;
411                                 btrfs_release_path(path);
412                                 up_read(&fs_info->extent_commit_sem);
413                                 mutex_unlock(&caching_ctl->mutex);
414                                 cond_resched();
415                                 goto again;
416                         }
417                         leaf = path->nodes[0];
418                         nritems = btrfs_header_nritems(leaf);
419                         continue;
420                 }
421
422                 if (key.objectid < block_group->key.objectid) {
423                         path->slots[0]++;
424                         continue;
425                 }
426
427                 if (key.objectid >= block_group->key.objectid +
428                     block_group->key.offset)
429                         break;
430
431                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
432                         total_found += add_new_free_space(block_group,
433                                                           fs_info, last,
434                                                           key.objectid);
435                         last = key.objectid + key.offset;
436
437                         if (total_found > (1024 * 1024 * 2)) {
438                                 total_found = 0;
439                                 wake_up(&caching_ctl->wait);
440                         }
441                 }
442                 path->slots[0]++;
443         }
444         ret = 0;
445
446         total_found += add_new_free_space(block_group, fs_info, last,
447                                           block_group->key.objectid +
448                                           block_group->key.offset);
449         caching_ctl->progress = (u64)-1;
450
451         spin_lock(&block_group->lock);
452         block_group->caching_ctl = NULL;
453         block_group->cached = BTRFS_CACHE_FINISHED;
454         spin_unlock(&block_group->lock);
455
456 err:
457         btrfs_free_path(path);
458         up_read(&fs_info->extent_commit_sem);
459
460         free_excluded_extents(extent_root, block_group);
461
462         mutex_unlock(&caching_ctl->mutex);
463 out:
464         wake_up(&caching_ctl->wait);
465
466         put_caching_control(caching_ctl);
467         btrfs_put_block_group(block_group);
468 }
469
470 static int cache_block_group(struct btrfs_block_group_cache *cache,
471                              struct btrfs_trans_handle *trans,
472                              struct btrfs_root *root,
473                              int load_cache_only)
474 {
475         DEFINE_WAIT(wait);
476         struct btrfs_fs_info *fs_info = cache->fs_info;
477         struct btrfs_caching_control *caching_ctl;
478         int ret = 0;
479
480         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
481         if (!caching_ctl)
482                 return -ENOMEM;
483
484         INIT_LIST_HEAD(&caching_ctl->list);
485         mutex_init(&caching_ctl->mutex);
486         init_waitqueue_head(&caching_ctl->wait);
487         caching_ctl->block_group = cache;
488         caching_ctl->progress = cache->key.objectid;
489         atomic_set(&caching_ctl->count, 1);
490         caching_ctl->work.func = caching_thread;
491
492         spin_lock(&cache->lock);
493         /*
494          * This should be a rare occasion, but this could happen I think in the
495          * case where one thread starts to load the space cache info, and then
496          * some other thread starts a transaction commit which tries to do an
497          * allocation while the other thread is still loading the space cache
498          * info.  The previous loop should have kept us from choosing this block
499          * group, but if we've moved to the state where we will wait on caching
500          * block groups we need to first check if we're doing a fast load here,
501          * so we can wait for it to finish, otherwise we could end up allocating
502          * from a block group whose cache gets evicted for one reason or
503          * another.
504          */
505         while (cache->cached == BTRFS_CACHE_FAST) {
506                 struct btrfs_caching_control *ctl;
507
508                 ctl = cache->caching_ctl;
509                 atomic_inc(&ctl->count);
510                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
511                 spin_unlock(&cache->lock);
512
513                 schedule();
514
515                 finish_wait(&ctl->wait, &wait);
516                 put_caching_control(ctl);
517                 spin_lock(&cache->lock);
518         }
519
520         if (cache->cached != BTRFS_CACHE_NO) {
521                 spin_unlock(&cache->lock);
522                 kfree(caching_ctl);
523                 return 0;
524         }
525         WARN_ON(cache->caching_ctl);
526         cache->caching_ctl = caching_ctl;
527         cache->cached = BTRFS_CACHE_FAST;
528         spin_unlock(&cache->lock);
529
530         /*
531          * We can't do the read from on-disk cache during a commit since we need
532          * to have the normal tree locking.  Also if we are currently trying to
533          * allocate blocks for the tree root we can't do the fast caching since
534          * we likely hold important locks.
535          */
536         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
537                 ret = load_free_space_cache(fs_info, cache);
538
539                 spin_lock(&cache->lock);
540                 if (ret == 1) {
541                         cache->caching_ctl = NULL;
542                         cache->cached = BTRFS_CACHE_FINISHED;
543                         cache->last_byte_to_unpin = (u64)-1;
544                 } else {
545                         if (load_cache_only) {
546                                 cache->caching_ctl = NULL;
547                                 cache->cached = BTRFS_CACHE_NO;
548                         } else {
549                                 cache->cached = BTRFS_CACHE_STARTED;
550                         }
551                 }
552                 spin_unlock(&cache->lock);
553                 wake_up(&caching_ctl->wait);
554                 if (ret == 1) {
555                         put_caching_control(caching_ctl);
556                         free_excluded_extents(fs_info->extent_root, cache);
557                         return 0;
558                 }
559         } else {
560                 /*
561                  * We are not going to do the fast caching, set cached to the
562                  * appropriate value and wake up any waiters.
563                  */
564                 spin_lock(&cache->lock);
565                 if (load_cache_only) {
566                         cache->caching_ctl = NULL;
567                         cache->cached = BTRFS_CACHE_NO;
568                 } else {
569                         cache->cached = BTRFS_CACHE_STARTED;
570                 }
571                 spin_unlock(&cache->lock);
572                 wake_up(&caching_ctl->wait);
573         }
574
575         if (load_cache_only) {
576                 put_caching_control(caching_ctl);
577                 return 0;
578         }
579
580         down_write(&fs_info->extent_commit_sem);
581         atomic_inc(&caching_ctl->count);
582         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
583         up_write(&fs_info->extent_commit_sem);
584
585         btrfs_get_block_group(cache);
586
587         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
588
589         return ret;
590 }
591
592 /*
593  * return the block group that starts at or after bytenr
594  */
595 static struct btrfs_block_group_cache *
596 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
597 {
598         struct btrfs_block_group_cache *cache;
599
600         cache = block_group_cache_tree_search(info, bytenr, 0);
601
602         return cache;
603 }
604
605 /*
606  * return the block group that contains the given bytenr
607  */
608 struct btrfs_block_group_cache *btrfs_lookup_block_group(
609                                                  struct btrfs_fs_info *info,
610                                                  u64 bytenr)
611 {
612         struct btrfs_block_group_cache *cache;
613
614         cache = block_group_cache_tree_search(info, bytenr, 1);
615
616         return cache;
617 }
618
619 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
620                                                   u64 flags)
621 {
622         struct list_head *head = &info->space_info;
623         struct btrfs_space_info *found;
624
625         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
626
627         rcu_read_lock();
628         list_for_each_entry_rcu(found, head, list) {
629                 if (found->flags & flags) {
630                         rcu_read_unlock();
631                         return found;
632                 }
633         }
634         rcu_read_unlock();
635         return NULL;
636 }
637
638 /*
639  * after adding space to the filesystem, we need to clear the full flags
640  * on all the space infos.
641  */
642 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
643 {
644         struct list_head *head = &info->space_info;
645         struct btrfs_space_info *found;
646
647         rcu_read_lock();
648         list_for_each_entry_rcu(found, head, list)
649                 found->full = 0;
650         rcu_read_unlock();
651 }
652
653 u64 btrfs_find_block_group(struct btrfs_root *root,
654                            u64 search_start, u64 search_hint, int owner)
655 {
656         struct btrfs_block_group_cache *cache;
657         u64 used;
658         u64 last = max(search_hint, search_start);
659         u64 group_start = 0;
660         int full_search = 0;
661         int factor = 9;
662         int wrapped = 0;
663 again:
664         while (1) {
665                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
666                 if (!cache)
667                         break;
668
669                 spin_lock(&cache->lock);
670                 last = cache->key.objectid + cache->key.offset;
671                 used = btrfs_block_group_used(&cache->item);
672
673                 if ((full_search || !cache->ro) &&
674                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
675                         if (used + cache->pinned + cache->reserved <
676                             div_factor(cache->key.offset, factor)) {
677                                 group_start = cache->key.objectid;
678                                 spin_unlock(&cache->lock);
679                                 btrfs_put_block_group(cache);
680                                 goto found;
681                         }
682                 }
683                 spin_unlock(&cache->lock);
684                 btrfs_put_block_group(cache);
685                 cond_resched();
686         }
687         if (!wrapped) {
688                 last = search_start;
689                 wrapped = 1;
690                 goto again;
691         }
692         if (!full_search && factor < 10) {
693                 last = search_start;
694                 full_search = 1;
695                 factor = 10;
696                 goto again;
697         }
698 found:
699         return group_start;
700 }
701
702 /* simple helper to search for an existing extent at a given offset */
703 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
704 {
705         int ret;
706         struct btrfs_key key;
707         struct btrfs_path *path;
708
709         path = btrfs_alloc_path();
710         if (!path)
711                 return -ENOMEM;
712
713         key.objectid = start;
714         key.offset = len;
715         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
716         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
717                                 0, 0);
718         btrfs_free_path(path);
719         return ret;
720 }
721
722 /*
723  * helper function to lookup reference count and flags of extent.
724  *
725  * the head node for delayed ref is used to store the sum of all the
726  * reference count modifications queued up in the rbtree.  The head
727  * node may also store the extent flags to set.  This way you can check
728  * what the reference count and extent flags will be once all of the
729  * queued delayed refs have been processed.
730  */
731 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
732                              struct btrfs_root *root, u64 bytenr,
733                              u64 num_bytes, u64 *refs, u64 *flags)
734 {
735         struct btrfs_delayed_ref_head *head;
736         struct btrfs_delayed_ref_root *delayed_refs;
737         struct btrfs_path *path;
738         struct btrfs_extent_item *ei;
739         struct extent_buffer *leaf;
740         struct btrfs_key key;
741         u32 item_size;
742         u64 num_refs;
743         u64 extent_flags;
744         int ret;
745
746         path = btrfs_alloc_path();
747         if (!path)
748                 return -ENOMEM;
749
750         key.objectid = bytenr;
751         key.type = BTRFS_EXTENT_ITEM_KEY;
752         key.offset = num_bytes;
753         if (!trans) {
754                 path->skip_locking = 1;
755                 path->search_commit_root = 1;
756         }
757 again:
758         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
759                                 &key, path, 0, 0);
760         if (ret < 0)
761                 goto out_free;
762
763         if (ret == 0) {
764                 leaf = path->nodes[0];
765                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
766                 if (item_size >= sizeof(*ei)) {
767                         ei = btrfs_item_ptr(leaf, path->slots[0],
768                                             struct btrfs_extent_item);
769                         num_refs = btrfs_extent_refs(leaf, ei);
770                         extent_flags = btrfs_extent_flags(leaf, ei);
771                 } else {
772 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
773                         struct btrfs_extent_item_v0 *ei0;
774                         BUG_ON(item_size != sizeof(*ei0));
775                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
776                                              struct btrfs_extent_item_v0);
777                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
778                         /* FIXME: this isn't correct for data */
779                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
780 #else
781                         BUG();
782 #endif
783                 }
784                 BUG_ON(num_refs == 0);
785         } else {
786                 num_refs = 0;
787                 extent_flags = 0;
788                 ret = 0;
789         }
790
791         if (!trans)
792                 goto out;
793
794         delayed_refs = &trans->transaction->delayed_refs;
795         spin_lock(&delayed_refs->lock);
796         head = btrfs_find_delayed_ref_head(trans, bytenr);
797         if (head) {
798                 if (!mutex_trylock(&head->mutex)) {
799                         atomic_inc(&head->node.refs);
800                         spin_unlock(&delayed_refs->lock);
801
802                         btrfs_release_path(path);
803
804                         /*
805                          * Mutex was contended, block until it's released and try
806                          * again
807                          */
808                         mutex_lock(&head->mutex);
809                         mutex_unlock(&head->mutex);
810                         btrfs_put_delayed_ref(&head->node);
811                         goto again;
812                 }
813                 if (head->extent_op && head->extent_op->update_flags)
814                         extent_flags |= head->extent_op->flags_to_set;
815                 else
816                         BUG_ON(num_refs == 0);
817
818                 num_refs += head->node.ref_mod;
819                 mutex_unlock(&head->mutex);
820         }
821         spin_unlock(&delayed_refs->lock);
822 out:
823         WARN_ON(num_refs == 0);
824         if (refs)
825                 *refs = num_refs;
826         if (flags)
827                 *flags = extent_flags;
828 out_free:
829         btrfs_free_path(path);
830         return ret;
831 }
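
For example, if the extent item on disk records 3 references and the delayed ref head has a queued net ref_mod of -1 (one drop that has not run yet), the function reports num_refs = 2, i.e. the count the extent will have once the queued delayed refs are processed; any pending extent_op flag update is OR'd into the returned flags in the same way.
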
832
833 /*
834  * Back reference rules.  Back refs have three main goals:
835  *
836  * 1) differentiate between all holders of references to an extent so that
837  *    when a reference is dropped we can make sure it was a valid reference
838  *    before freeing the extent.
839  *
840  * 2) Provide enough information to quickly find the holders of an extent
841  *    if we notice a given block is corrupted or bad.
842  *
843  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
844  *    maintenance.  This is actually the same as #2, but with a slightly
845  *    different use case.
846  *
847  * There are two kinds of back refs. The implicit back refs is optimized
848  * for pointers in non-shared tree blocks. For a given pointer in a block,
849  * back refs of this kind provide information about the block's owner tree
850  * and the pointer's key. This information allows us to find the block by
851  * b-tree searching. The full back refs is for pointers in tree blocks not
852  * referenced by their owner trees. The location of tree block is recorded
853  * in the back refs. Actually the full back refs is generic, and can be
854  * used in all cases where the implicit back refs is used. The major shortcoming
855  * of the full back refs is its overhead. Every time a tree block gets
856  * COWed, we have to update back refs entry for all pointers in it.
857  *
858  * For a newly allocated tree block, we use implicit back refs for
859  * pointers in it. This means most tree related operations only involve
860  * implicit back refs. For a tree block created in an old transaction, the
861  * only way to drop a reference to it is to COW it. So we can detect the
862  * event that a tree block loses its owner tree's reference and do the
863  * back refs conversion.
864  *
865  * When a tree block is COW'd through a tree, there are four cases:
866  *
867  * The reference count of the block is one and the tree is the block's
868  * owner tree. Nothing to do in this case.
869  *
870  * The reference count of the block is one and the tree is not the
871  * block's owner tree. In this case, full back refs is used for pointers
872  * in the block. Remove these full back refs, add implicit back refs for
873  * every pointer in the new block.
874  *
875  * The reference count of the block is greater than one and the tree is
876  * the block's owner tree. In this case, implicit back refs is used for
877  * pointers in the block. Add full back refs for every pointer in the
878  * block, increase lower level extents' reference counts. The original
879  * implicit back refs are carried over to the new block.
880  *
881  * The reference count of the block is greater than one and the tree is
882  * not the block's owner tree. Add implicit back refs for every pointer in
883  * the new block, increase lower level extents' reference count.
884  *
885  * Back Reference Key composing:
886  *
887  * The key objectid corresponds to the first byte in the extent,
888  * The key type is used to differentiate between types of back refs.
889  * There are different meanings of the key offset for different types
890  * of back refs.
891  *
892  * File extents can be referenced by:
893  *
894  * - multiple snapshots, subvolumes, or different generations in one subvol
895  * - different files inside a single subvolume
896  * - different offsets inside a file (bookend extents in file.c)
897  *
898  * The extent ref structure for the implicit back refs has fields for:
899  *
900  * - Objectid of the subvolume root
901  * - objectid of the file holding the reference
902  * - original offset in the file
903  * - how many bookend extents
904  *
905  * The key offset for the implicit back refs is the hash of the first
906  * three fields.
907  *
908  * The extent ref structure for the full back refs has a field for:
909  *
910  * - number of pointers in the tree leaf
911  *
912  * The key offset for the full back refs is the first byte of
913  * the tree leaf.
914  *
915  * When a file extent is allocated, the implicit back refs is used
916  * and the fields are filled in:
917  *
918  *     (root_key.objectid, inode objectid, offset in file, 1)
919  *
920  * When a file extent is removed by file truncation, we find the
921  * corresponding implicit back refs and check the following fields:
922  *
923  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
924  *
925  * Btree extents can be referenced by:
926  *
927  * - Different subvolumes
928  *
929  * Both the implicit back refs and the full back refs for tree blocks
930  * only consist of a key. The key offset for the implicit back refs is
931  * objectid of block's owner tree. The key offset for the full back refs
932  * is the first byte of parent block.
933  *
934  * When implicit back refs is used, information about the lowest key and
935  * level of the tree block is required. This information is stored in the
936  * tree block info structure.
937  */
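
To make the key composition above concrete, here is an illustrative summary (simplified; the real items are struct btrfs_key entries in the extent tree, and the symbolic type names match the ones used by the lookup helpers later in this file):

        /*
         * Tree block referenced by its owner tree (implicit back ref):
         *     (extent bytenr, BTRFS_TREE_BLOCK_REF_KEY,   owner root objectid)
         * Tree block referenced via a shared parent (full back ref):
         *     (extent bytenr, BTRFS_SHARED_BLOCK_REF_KEY, parent block bytenr)
         * Data extent referenced by a file (implicit back ref):
         *     (extent bytenr, BTRFS_EXTENT_DATA_REF_KEY,
         *      hash_extent_data_ref(root objectid, inode objectid, file offset))
         * Data extent referenced via a shared leaf (full back ref):
         *     (extent bytenr, BTRFS_SHARED_DATA_REF_KEY,  bytenr of the referencing leaf)
         */
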
938
939 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
940 static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
941                                   struct btrfs_root *root,
942                                   struct btrfs_path *path,
943                                   u64 owner, u32 extra_size)
944 {
945         struct btrfs_extent_item *item;
946         struct btrfs_extent_item_v0 *ei0;
947         struct btrfs_extent_ref_v0 *ref0;
948         struct btrfs_tree_block_info *bi;
949         struct extent_buffer *leaf;
950         struct btrfs_key key;
951         struct btrfs_key found_key;
952         u32 new_size = sizeof(*item);
953         u64 refs;
954         int ret;
955
956         leaf = path->nodes[0];
957         BUG_ON(btrfs_item_size_nr(leaf, path->slots[0]) != sizeof(*ei0));
958
959         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
960         ei0 = btrfs_item_ptr(leaf, path->slots[0],
961                              struct btrfs_extent_item_v0);
962         refs = btrfs_extent_refs_v0(leaf, ei0);
963
964         if (owner == (u64)-1) {
965                 while (1) {
966                         if (path->slots[0] >= btrfs_header_nritems(leaf)) {
967                                 ret = btrfs_next_leaf(root, path);
968                                 if (ret < 0)
969                                         return ret;
970                                 BUG_ON(ret > 0); /* Corruption */
971                                 leaf = path->nodes[0];
972                         }
973                         btrfs_item_key_to_cpu(leaf, &found_key,
974                                               path->slots[0]);
975                         BUG_ON(key.objectid != found_key.objectid);
976                         if (found_key.type != BTRFS_EXTENT_REF_V0_KEY) {
977                                 path->slots[0]++;
978                                 continue;
979                         }
980                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
981                                               struct btrfs_extent_ref_v0);
982                         owner = btrfs_ref_objectid_v0(leaf, ref0);
983                         break;
984                 }
985         }
986         btrfs_release_path(path);
987
988         if (owner < BTRFS_FIRST_FREE_OBJECTID)
989                 new_size += sizeof(*bi);
990
991         new_size -= sizeof(*ei0);
992         ret = btrfs_search_slot(trans, root, &key, path,
993                                 new_size + extra_size, 1);
994         if (ret < 0)
995                 return ret;
996         BUG_ON(ret); /* Corruption */
997
998         btrfs_extend_item(trans, root, path, new_size);
999
1000         leaf = path->nodes[0];
1001         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1002         btrfs_set_extent_refs(leaf, item, refs);
1003         /* FIXME: get real generation */
1004         btrfs_set_extent_generation(leaf, item, 0);
1005         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1006                 btrfs_set_extent_flags(leaf, item,
1007                                        BTRFS_EXTENT_FLAG_TREE_BLOCK |
1008                                        BTRFS_BLOCK_FLAG_FULL_BACKREF);
1009                 bi = (struct btrfs_tree_block_info *)(item + 1);
1010                 /* FIXME: get first key of the block */
1011                 memset_extent_buffer(leaf, 0, (unsigned long)bi, sizeof(*bi));
1012                 btrfs_set_tree_block_level(leaf, bi, (int)owner);
1013         } else {
1014                 btrfs_set_extent_flags(leaf, item, BTRFS_EXTENT_FLAG_DATA);
1015         }
1016         btrfs_mark_buffer_dirty(leaf);
1017         return 0;
1018 }
1019 #endif
1020
1021 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1022 {
1023         u32 high_crc = ~(u32)0;
1024         u32 low_crc = ~(u32)0;
1025         __le64 lenum;
1026
1027         lenum = cpu_to_le64(root_objectid);
1028         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1029         lenum = cpu_to_le64(owner);
1030         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1031         lenum = cpu_to_le64(offset);
1032         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1033
1034         return ((u64)high_crc << 31) ^ (u64)low_crc;
1035 }
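
For reference, the same hash can be reproduced in user space. The sketch below assumes the crc32c() used here reduces to a plain CRC-32C (Castagnoli polynomial, caller-supplied seed, no final inversion) and that the host is little-endian, since the kernel hashes the __le64 encoding of each field:

        #include <stdint.h>
        #include <stddef.h>

        /* Bitwise CRC-32C (Castagnoli), no final inversion; seed supplied by caller. */
        static uint32_t crc32c_sw(uint32_t crc, const void *buf, size_t len)
        {
                const unsigned char *p = buf;
                int i;

                while (len--) {
                        crc ^= *p++;
                        for (i = 0; i < 8; i++)
                                crc = (crc >> 1) ^ (0x82F63B78U & (0U - (crc & 1)));
                }
                return crc;
        }

        /* Mirrors hash_extent_data_ref(): hash (root, owner, offset) into a key offset. */
        static uint64_t hash_extent_data_ref_sketch(uint64_t root_objectid,
                                                    uint64_t owner, uint64_t offset)
        {
                uint32_t high_crc = ~(uint32_t)0;
                uint32_t low_crc = ~(uint32_t)0;
                uint64_t le;

                le = root_objectid;                     /* little-endian host assumed */
                high_crc = crc32c_sw(high_crc, &le, sizeof(le));
                le = owner;
                low_crc = crc32c_sw(low_crc, &le, sizeof(le));
                le = offset;
                low_crc = crc32c_sw(low_crc, &le, sizeof(le));

                /* Note the shift by 31, not 32, matching the function above. */
                return ((uint64_t)high_crc << 31) ^ (uint64_t)low_crc;
        }
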
1036
1037 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1038                                      struct btrfs_extent_data_ref *ref)
1039 {
1040         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1041                                     btrfs_extent_data_ref_objectid(leaf, ref),
1042                                     btrfs_extent_data_ref_offset(leaf, ref));
1043 }
1044
1045 static int match_extent_data_ref(struct extent_buffer *leaf,
1046                                  struct btrfs_extent_data_ref *ref,
1047                                  u64 root_objectid, u64 owner, u64 offset)
1048 {
1049         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1050             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1051             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1052                 return 0;
1053         return 1;
1054 }
1055
1056 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1057                                            struct btrfs_root *root,
1058                                            struct btrfs_path *path,
1059                                            u64 bytenr, u64 parent,
1060                                            u64 root_objectid,
1061                                            u64 owner, u64 offset)
1062 {
1063         struct btrfs_key key;
1064         struct btrfs_extent_data_ref *ref;
1065         struct extent_buffer *leaf;
1066         u32 nritems;
1067         int ret;
1068         int recow;
1069         int err = -ENOENT;
1070
1071         key.objectid = bytenr;
1072         if (parent) {
1073                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1074                 key.offset = parent;
1075         } else {
1076                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1077                 key.offset = hash_extent_data_ref(root_objectid,
1078                                                   owner, offset);
1079         }
1080 again:
1081         recow = 0;
1082         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1083         if (ret < 0) {
1084                 err = ret;
1085                 goto fail;
1086         }
1087
1088         if (parent) {
1089                 if (!ret)
1090                         return 0;
1091 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1092                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1093                 btrfs_release_path(path);
1094                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1095                 if (ret < 0) {
1096                         err = ret;
1097                         goto fail;
1098                 }
1099                 if (!ret)
1100                         return 0;
1101 #endif
1102                 goto fail;
1103         }
1104
1105         leaf = path->nodes[0];
1106         nritems = btrfs_header_nritems(leaf);
1107         while (1) {
1108                 if (path->slots[0] >= nritems) {
1109                         ret = btrfs_next_leaf(root, path);
1110                         if (ret < 0)
1111                                 err = ret;
1112                         if (ret)
1113                                 goto fail;
1114
1115                         leaf = path->nodes[0];
1116                         nritems = btrfs_header_nritems(leaf);
1117                         recow = 1;
1118                 }
1119
1120                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1121                 if (key.objectid != bytenr ||
1122                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1123                         goto fail;
1124
1125                 ref = btrfs_item_ptr(leaf, path->slots[0],
1126                                      struct btrfs_extent_data_ref);
1127
1128                 if (match_extent_data_ref(leaf, ref, root_objectid,
1129                                           owner, offset)) {
1130                         if (recow) {
1131                                 btrfs_release_path(path);
1132                                 goto again;
1133                         }
1134                         err = 0;
1135                         break;
1136                 }
1137                 path->slots[0]++;
1138         }
1139 fail:
1140         return err;
1141 }
1142
1143 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1144                                            struct btrfs_root *root,
1145                                            struct btrfs_path *path,
1146                                            u64 bytenr, u64 parent,
1147                                            u64 root_objectid, u64 owner,
1148                                            u64 offset, int refs_to_add)
1149 {
1150         struct btrfs_key key;
1151         struct extent_buffer *leaf;
1152         u32 size;
1153         u32 num_refs;
1154         int ret;
1155
1156         key.objectid = bytenr;
1157         if (parent) {
1158                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1159                 key.offset = parent;
1160                 size = sizeof(struct btrfs_shared_data_ref);
1161         } else {
1162                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1163                 key.offset = hash_extent_data_ref(root_objectid,
1164                                                   owner, offset);
1165                 size = sizeof(struct btrfs_extent_data_ref);
1166         }
1167
1168         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1169         if (ret && ret != -EEXIST)
1170                 goto fail;
1171
1172         leaf = path->nodes[0];
1173         if (parent) {
1174                 struct btrfs_shared_data_ref *ref;
1175                 ref = btrfs_item_ptr(leaf, path->slots[0],
1176                                      struct btrfs_shared_data_ref);
1177                 if (ret == 0) {
1178                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1179                 } else {
1180                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1181                         num_refs += refs_to_add;
1182                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1183                 }
1184         } else {
1185                 struct btrfs_extent_data_ref *ref;
1186                 while (ret == -EEXIST) {
1187                         ref = btrfs_item_ptr(leaf, path->slots[0],
1188                                              struct btrfs_extent_data_ref);
1189                         if (match_extent_data_ref(leaf, ref, root_objectid,
1190                                                   owner, offset))
1191                                 break;
1192                         btrfs_release_path(path);
1193                         key.offset++;
1194                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1195                                                       size);
1196                         if (ret && ret != -EEXIST)
1197                                 goto fail;
1198
1199                         leaf = path->nodes[0];
1200                 }
1201                 ref = btrfs_item_ptr(leaf, path->slots[0],
1202                                      struct btrfs_extent_data_ref);
1203                 if (ret == 0) {
1204                         btrfs_set_extent_data_ref_root(leaf, ref,
1205                                                        root_objectid);
1206                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1207                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1208                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1209                 } else {
1210                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1211                         num_refs += refs_to_add;
1212                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1213                 }
1214         }
1215         btrfs_mark_buffer_dirty(leaf);
1216         ret = 0;
1217 fail:
1218         btrfs_release_path(path);
1219         return ret;
1220 }
1221
1222 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1223                                            struct btrfs_root *root,
1224                                            struct btrfs_path *path,
1225                                            int refs_to_drop)
1226 {
1227         struct btrfs_key key;
1228         struct btrfs_extent_data_ref *ref1 = NULL;
1229         struct btrfs_shared_data_ref *ref2 = NULL;
1230         struct extent_buffer *leaf;
1231         u32 num_refs = 0;
1232         int ret = 0;
1233
1234         leaf = path->nodes[0];
1235         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1236
1237         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1238                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1239                                       struct btrfs_extent_data_ref);
1240                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1241         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1242                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1243                                       struct btrfs_shared_data_ref);
1244                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1245 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1246         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1247                 struct btrfs_extent_ref_v0 *ref0;
1248                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1249                                       struct btrfs_extent_ref_v0);
1250                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1251 #endif
1252         } else {
1253                 BUG();
1254         }
1255
1256         BUG_ON(num_refs < refs_to_drop);
1257         num_refs -= refs_to_drop;
1258
1259         if (num_refs == 0) {
1260                 ret = btrfs_del_item(trans, root, path);
1261         } else {
1262                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1263                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1264                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1265                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1266 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1267                 else {
1268                         struct btrfs_extent_ref_v0 *ref0;
1269                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1270                                         struct btrfs_extent_ref_v0);
1271                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1272                 }
1273 #endif
1274                 btrfs_mark_buffer_dirty(leaf);
1275         }
1276         return ret;
1277 }
1278
1279 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1280                                           struct btrfs_path *path,
1281                                           struct btrfs_extent_inline_ref *iref)
1282 {
1283         struct btrfs_key key;
1284         struct extent_buffer *leaf;
1285         struct btrfs_extent_data_ref *ref1;
1286         struct btrfs_shared_data_ref *ref2;
1287         u32 num_refs = 0;
1288
1289         leaf = path->nodes[0];
1290         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1291         if (iref) {
1292                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1293                     BTRFS_EXTENT_DATA_REF_KEY) {
1294                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1295                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1296                 } else {
1297                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1298                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1299                 }
1300         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1301                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1302                                       struct btrfs_extent_data_ref);
1303                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1304         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1305                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1306                                       struct btrfs_shared_data_ref);
1307                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1308 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1309         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1310                 struct btrfs_extent_ref_v0 *ref0;
1311                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1312                                       struct btrfs_extent_ref_v0);
1313                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1314 #endif
1315         } else {
1316                 WARN_ON(1);
1317         }
1318         return num_refs;
1319 }
1320
1321 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1322                                           struct btrfs_root *root,
1323                                           struct btrfs_path *path,
1324                                           u64 bytenr, u64 parent,
1325                                           u64 root_objectid)
1326 {
1327         struct btrfs_key key;
1328         int ret;
1329
1330         key.objectid = bytenr;
1331         if (parent) {
1332                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1333                 key.offset = parent;
1334         } else {
1335                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1336                 key.offset = root_objectid;
1337         }
1338
1339         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1340         if (ret > 0)
1341                 ret = -ENOENT;
1342 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1343         if (ret == -ENOENT && parent) {
1344                 btrfs_release_path(path);
1345                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1346                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1347                 if (ret > 0)
1348                         ret = -ENOENT;
1349         }
1350 #endif
1351         return ret;
1352 }
1353
1354 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1355                                           struct btrfs_root *root,
1356                                           struct btrfs_path *path,
1357                                           u64 bytenr, u64 parent,
1358                                           u64 root_objectid)
1359 {
1360         struct btrfs_key key;
1361         int ret;
1362
1363         key.objectid = bytenr;
1364         if (parent) {
1365                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1366                 key.offset = parent;
1367         } else {
1368                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1369                 key.offset = root_objectid;
1370         }
1371
1372         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1373         btrfs_release_path(path);
1374         return ret;
1375 }
1376
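/*
 * pick the back ref key type for an extent: tree block refs for metadata
 * (owner below BTRFS_FIRST_FREE_OBJECTID), data refs for file extents,
 * with the shared variants used when a parent block is given.
 */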
1377 static inline int extent_ref_type(u64 parent, u64 owner)
1378 {
1379         int type;
1380         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1381                 if (parent > 0)
1382                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1383                 else
1384                         type = BTRFS_TREE_BLOCK_REF_KEY;
1385         } else {
1386                 if (parent > 0)
1387                         type = BTRFS_SHARED_DATA_REF_KEY;
1388                 else
1389                         type = BTRFS_EXTENT_DATA_REF_KEY;
1390         }
1391         return type;
1392 }
1393
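/*
 * walk up the path from the given level and return the key that follows
 * the current slot at the lowest level that still has a next slot.
 * Returns 1 if there is no next key.
 */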
1394 static int find_next_key(struct btrfs_path *path, int level,
1395                          struct btrfs_key *key)
1396
1397 {
1398         for (; level < BTRFS_MAX_LEVEL; level++) {
1399                 if (!path->nodes[level])
1400                         break;
1401                 if (path->slots[level] + 1 >=
1402                     btrfs_header_nritems(path->nodes[level]))
1403                         continue;
1404                 if (level == 0)
1405                         btrfs_item_key_to_cpu(path->nodes[level], key,
1406                                               path->slots[level] + 1);
1407                 else
1408                         btrfs_node_key_to_cpu(path->nodes[level], key,
1409                                               path->slots[level] + 1);
1410                 return 0;
1411         }
1412         return 1;
1413 }
1414
1415 /*
1416  * look for inline back ref. if back ref is found, *ref_ret is set
1417  * to the address of inline back ref, and 0 is returned.
1418  *
1419  * if back ref isn't found, *ref_ret is set to the address where it
1420  * should be inserted, and -ENOENT is returned.
1421  *
1422  * if insert is true and there are too many inline back refs, the path
1423  * points to the extent item, and -EAGAIN is returned.
1424  *
1425  * NOTE: inline back refs are ordered in the same way that back ref
1426  *       items in the tree are ordered.
1427  */
1428 static noinline_for_stack
1429 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1430                                  struct btrfs_root *root,
1431                                  struct btrfs_path *path,
1432                                  struct btrfs_extent_inline_ref **ref_ret,
1433                                  u64 bytenr, u64 num_bytes,
1434                                  u64 parent, u64 root_objectid,
1435                                  u64 owner, u64 offset, int insert)
1436 {
1437         struct btrfs_key key;
1438         struct extent_buffer *leaf;
1439         struct btrfs_extent_item *ei;
1440         struct btrfs_extent_inline_ref *iref;
1441         u64 flags;
1442         u64 item_size;
1443         unsigned long ptr;
1444         unsigned long end;
1445         int extra_size;
1446         int type;
1447         int want;
1448         int ret;
1449         int err = 0;
1450
1451         key.objectid = bytenr;
1452         key.type = BTRFS_EXTENT_ITEM_KEY;
1453         key.offset = num_bytes;
1454
1455         want = extent_ref_type(parent, owner);
1456         if (insert) {
1457                 extra_size = btrfs_extent_inline_ref_size(want);
1458                 path->keep_locks = 1;
1459         } else
1460                 extra_size = -1;
1461         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1462         if (ret < 0) {
1463                 err = ret;
1464                 goto out;
1465         }
1466         if (ret && !insert) {
1467                 err = -ENOENT;
1468                 goto out;
1469         }
1470         BUG_ON(ret); /* Corruption */
1471
1472         leaf = path->nodes[0];
1473         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1474 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1475         if (item_size < sizeof(*ei)) {
1476                 if (!insert) {
1477                         err = -ENOENT;
1478                         goto out;
1479                 }
1480                 ret = convert_extent_item_v0(trans, root, path, owner,
1481                                              extra_size);
1482                 if (ret < 0) {
1483                         err = ret;
1484                         goto out;
1485                 }
1486                 leaf = path->nodes[0];
1487                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1488         }
1489 #endif
1490         BUG_ON(item_size < sizeof(*ei));
1491
1492         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1493         flags = btrfs_extent_flags(leaf, ei);
1494
1495         ptr = (unsigned long)(ei + 1);
1496         end = (unsigned long)ei + item_size;
1497
1498         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1499                 ptr += sizeof(struct btrfs_tree_block_info);
1500                 BUG_ON(ptr > end);
1501         } else {
1502                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1503         }
1504
1505         err = -ENOENT;
1506         while (1) {
1507                 if (ptr >= end) {
1508                         WARN_ON(ptr > end);
1509                         break;
1510                 }
1511                 iref = (struct btrfs_extent_inline_ref *)ptr;
1512                 type = btrfs_extent_inline_ref_type(leaf, iref);
1513                 if (want < type)
1514                         break;
1515                 if (want > type) {
1516                         ptr += btrfs_extent_inline_ref_size(type);
1517                         continue;
1518                 }
1519
1520                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1521                         struct btrfs_extent_data_ref *dref;
1522                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1523                         if (match_extent_data_ref(leaf, dref, root_objectid,
1524                                                   owner, offset)) {
1525                                 err = 0;
1526                                 break;
1527                         }
1528                         if (hash_extent_data_ref_item(leaf, dref) <
1529                             hash_extent_data_ref(root_objectid, owner, offset))
1530                                 break;
1531                 } else {
1532                         u64 ref_offset;
1533                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1534                         if (parent > 0) {
1535                                 if (parent == ref_offset) {
1536                                         err = 0;
1537                                         break;
1538                                 }
1539                                 if (ref_offset < parent)
1540                                         break;
1541                         } else {
1542                                 if (root_objectid == ref_offset) {
1543                                         err = 0;
1544                                         break;
1545                                 }
1546                                 if (ref_offset < root_objectid)
1547                                         break;
1548                         }
1549                 }
1550                 ptr += btrfs_extent_inline_ref_size(type);
1551         }
1552         if (err == -ENOENT && insert) {
1553                 if (item_size + extra_size >=
1554                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1555                         err = -EAGAIN;
1556                         goto out;
1557                 }
1558                 /*
1559                  * To add a new inline back ref, we have to make sure
1560                  * there is no corresponding back ref item.
1561                  * For simplicity, we just do not add a new inline back
1562                  * ref if there is any kind of item for this block.
1563                  */
1564                 if (find_next_key(path, 0, &key) == 0 &&
1565                     key.objectid == bytenr &&
1566                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1567                         err = -EAGAIN;
1568                         goto out;
1569                 }
1570         }
1571         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1572 out:
1573         if (insert) {
1574                 path->keep_locks = 0;
1575                 btrfs_unlock_up_safe(path, 1);
1576         }
1577         return err;
1578 }
1579
1580 /*
1581  * helper to add new inline back ref
1582  */
1583 static noinline_for_stack
1584 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1585                                  struct btrfs_root *root,
1586                                  struct btrfs_path *path,
1587                                  struct btrfs_extent_inline_ref *iref,
1588                                  u64 parent, u64 root_objectid,
1589                                  u64 owner, u64 offset, int refs_to_add,
1590                                  struct btrfs_delayed_extent_op *extent_op)
1591 {
1592         struct extent_buffer *leaf;
1593         struct btrfs_extent_item *ei;
1594         unsigned long ptr;
1595         unsigned long end;
1596         unsigned long item_offset;
1597         u64 refs;
1598         int size;
1599         int type;
1600
1601         leaf = path->nodes[0];
1602         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1603         item_offset = (unsigned long)iref - (unsigned long)ei;
1604
1605         type = extent_ref_type(parent, owner);
1606         size = btrfs_extent_inline_ref_size(type);
1607
1608         btrfs_extend_item(trans, root, path, size);
1609
1610         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1611         refs = btrfs_extent_refs(leaf, ei);
1612         refs += refs_to_add;
1613         btrfs_set_extent_refs(leaf, ei, refs);
1614         if (extent_op)
1615                 __run_delayed_extent_op(extent_op, leaf, ei);
1616
1617         ptr = (unsigned long)ei + item_offset;
1618         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1619         if (ptr < end - size)
1620                 memmove_extent_buffer(leaf, ptr + size, ptr,
1621                                       end - size - ptr);
1622
1623         iref = (struct btrfs_extent_inline_ref *)ptr;
1624         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1625         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1626                 struct btrfs_extent_data_ref *dref;
1627                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1628                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1629                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1630                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1631                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1632         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1633                 struct btrfs_shared_data_ref *sref;
1634                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1635                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1636                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1637         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1638                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1639         } else {
1640                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1641         }
1642         btrfs_mark_buffer_dirty(leaf);
1643 }
1644
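/*
 * look for an inline back ref first; if none is found, fall back to
 * searching for a keyed back ref item (tree block or data ref).
 */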
1645 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1646                                  struct btrfs_root *root,
1647                                  struct btrfs_path *path,
1648                                  struct btrfs_extent_inline_ref **ref_ret,
1649                                  u64 bytenr, u64 num_bytes, u64 parent,
1650                                  u64 root_objectid, u64 owner, u64 offset)
1651 {
1652         int ret;
1653
1654         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1655                                            bytenr, num_bytes, parent,
1656                                            root_objectid, owner, offset, 0);
1657         if (ret != -ENOENT)
1658                 return ret;
1659
1660         btrfs_release_path(path);
1661         *ref_ret = NULL;
1662
1663         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1664                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1665                                             root_objectid);
1666         } else {
1667                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1668                                              root_objectid, owner, offset);
1669         }
1670         return ret;
1671 }
1672
1673 /*
1674  * helper to update/remove inline back ref
1675  */
1676 static noinline_for_stack
1677 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1678                                   struct btrfs_root *root,
1679                                   struct btrfs_path *path,
1680                                   struct btrfs_extent_inline_ref *iref,
1681                                   int refs_to_mod,
1682                                   struct btrfs_delayed_extent_op *extent_op)
1683 {
1684         struct extent_buffer *leaf;
1685         struct btrfs_extent_item *ei;
1686         struct btrfs_extent_data_ref *dref = NULL;
1687         struct btrfs_shared_data_ref *sref = NULL;
1688         unsigned long ptr;
1689         unsigned long end;
1690         u32 item_size;
1691         int size;
1692         int type;
1693         u64 refs;
1694
1695         leaf = path->nodes[0];
1696         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1697         refs = btrfs_extent_refs(leaf, ei);
1698         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1699         refs += refs_to_mod;
1700         btrfs_set_extent_refs(leaf, ei, refs);
1701         if (extent_op)
1702                 __run_delayed_extent_op(extent_op, leaf, ei);
1703
1704         type = btrfs_extent_inline_ref_type(leaf, iref);
1705
1706         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1707                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1708                 refs = btrfs_extent_data_ref_count(leaf, dref);
1709         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1710                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1711                 refs = btrfs_shared_data_ref_count(leaf, sref);
1712         } else {
1713                 refs = 1;
1714                 BUG_ON(refs_to_mod != -1);
1715         }
1716
1717         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1718         refs += refs_to_mod;
1719
1720         if (refs > 0) {
1721                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1722                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1723                 else
1724                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1725         } else {
1726                 size = btrfs_extent_inline_ref_size(type);
1727                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1728                 ptr = (unsigned long)iref;
1729                 end = (unsigned long)ei + item_size;
1730                 if (ptr + size < end)
1731                         memmove_extent_buffer(leaf, ptr, ptr + size,
1732                                               end - ptr - size);
1733                 item_size -= size;
1734                 btrfs_truncate_item(trans, root, path, item_size, 1);
1735         }
1736         btrfs_mark_buffer_dirty(leaf);
1737 }
1738
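/*
 * add refs_to_add references as an inline back ref.  If a matching inline
 * ref already exists its count is updated, otherwise a new inline ref is
 * set up.  -EAGAIN from the lookup means the ref must be added as a
 * separate keyed item instead.
 */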
1739 static noinline_for_stack
1740 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1741                                  struct btrfs_root *root,
1742                                  struct btrfs_path *path,
1743                                  u64 bytenr, u64 num_bytes, u64 parent,
1744                                  u64 root_objectid, u64 owner,
1745                                  u64 offset, int refs_to_add,
1746                                  struct btrfs_delayed_extent_op *extent_op)
1747 {
1748         struct btrfs_extent_inline_ref *iref;
1749         int ret;
1750
1751         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1752                                            bytenr, num_bytes, parent,
1753                                            root_objectid, owner, offset, 1);
1754         if (ret == 0) {
1755                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1756                 update_inline_extent_backref(trans, root, path, iref,
1757                                              refs_to_add, extent_op);
1758         } else if (ret == -ENOENT) {
1759                 setup_inline_extent_backref(trans, root, path, iref, parent,
1760                                             root_objectid, owner, offset,
1761                                             refs_to_add, extent_op);
1762                 ret = 0;
1763         }
1764         return ret;
1765 }
1766
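/*
 * insert a keyed back ref item: a tree block ref for metadata extents or
 * a data ref for file extents.
 */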
1767 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1768                                  struct btrfs_root *root,
1769                                  struct btrfs_path *path,
1770                                  u64 bytenr, u64 parent, u64 root_objectid,
1771                                  u64 owner, u64 offset, int refs_to_add)
1772 {
1773         int ret;
1774         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1775                 BUG_ON(refs_to_add != 1);
1776                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1777                                             parent, root_objectid);
1778         } else {
1779                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1780                                              parent, root_objectid,
1781                                              owner, offset, refs_to_add);
1782         }
1783         return ret;
1784 }
1785
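/*
 * drop refs_to_drop references from a back ref.  Inline refs are updated
 * in place, keyed data refs go through remove_extent_data_ref, and keyed
 * tree block refs are simply deleted.
 */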
1786 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1787                                  struct btrfs_root *root,
1788                                  struct btrfs_path *path,
1789                                  struct btrfs_extent_inline_ref *iref,
1790                                  int refs_to_drop, int is_data)
1791 {
1792         int ret = 0;
1793
1794         BUG_ON(!is_data && refs_to_drop != 1);
1795         if (iref) {
1796                 update_inline_extent_backref(trans, root, path, iref,
1797                                              -refs_to_drop, NULL);
1798         } else if (is_data) {
1799                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1800         } else {
1801                 ret = btrfs_del_item(trans, root, path);
1802         }
1803         return ret;
1804 }
1805
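/* issue a discard for a byte range, converting it to 512 byte sectors */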
1806 static int btrfs_issue_discard(struct block_device *bdev,
1807                                 u64 start, u64 len)
1808 {
1809         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1810 }
1811
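/*
 * discard an extent on each stripe that backs it.  Devices that cannot
 * discard are skipped and EOPNOTSUPP is ignored; *actual_bytes reports
 * how much was actually discarded.
 */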
1812 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1813                                 u64 num_bytes, u64 *actual_bytes)
1814 {
1815         int ret;
1816         u64 discarded_bytes = 0;
1817         struct btrfs_bio *bbio = NULL;
1818
1819
1820         /* Tell the block device(s) that the sectors can be discarded */
1821         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1822                               bytenr, &num_bytes, &bbio, 0);
1823         /* Error condition is -ENOMEM */
1824         if (!ret) {
1825                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1826                 int i;
1827
1828
1829                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1830                         if (!stripe->dev->can_discard)
1831                                 continue;
1832
1833                         ret = btrfs_issue_discard(stripe->dev->bdev,
1834                                                   stripe->physical,
1835                                                   stripe->length);
1836                         if (!ret)
1837                                 discarded_bytes += stripe->length;
1838                         else if (ret != -EOPNOTSUPP)
1839                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1840
1841                         /*
1842                          * Just in case we get back EOPNOTSUPP for some reason,
1843                          * ignore the return value so we don't screw up
1844                          * people calling discard_extent.
1845                          */
1846                         ret = 0;
1847                 }
1848                 kfree(bbio);
1849         }
1850
1851         if (actual_bytes)
1852                 *actual_bytes = discarded_bytes;
1853
1854
1855         return ret;
1856 }
1857
1858 /* Can return -ENOMEM */
1859 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1860                          struct btrfs_root *root,
1861                          u64 bytenr, u64 num_bytes, u64 parent,
1862                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1863 {
1864         int ret;
1865         struct btrfs_fs_info *fs_info = root->fs_info;
1866
1867         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1868                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1869
1870         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1871                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1872                                         num_bytes,
1873                                         parent, root_objectid, (int)owner,
1874                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1875         } else {
1876                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1877                                         num_bytes,
1878                                         parent, root_objectid, owner, offset,
1879                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1880         }
1881         return ret;
1882 }
1883
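/*
 * add refs_to_add references to an existing extent.  First try to update
 * or insert an inline back ref; if the extent item has no room (-EAGAIN),
 * bump the ref count on the extent item and insert a keyed back ref.
 */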
1884 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1885                                   struct btrfs_root *root,
1886                                   u64 bytenr, u64 num_bytes,
1887                                   u64 parent, u64 root_objectid,
1888                                   u64 owner, u64 offset, int refs_to_add,
1889                                   struct btrfs_delayed_extent_op *extent_op)
1890 {
1891         struct btrfs_path *path;
1892         struct extent_buffer *leaf;
1893         struct btrfs_extent_item *item;
1894         u64 refs;
1895         int ret;
1896         int err = 0;
1897
1898         path = btrfs_alloc_path();
1899         if (!path)
1900                 return -ENOMEM;
1901
1902         path->reada = 1;
1903         path->leave_spinning = 1;
1904         /* this will set up the path even if it fails to insert the back ref */
1905         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1906                                            path, bytenr, num_bytes, parent,
1907                                            root_objectid, owner, offset,
1908                                            refs_to_add, extent_op);
1909         if (ret == 0)
1910                 goto out;
1911
1912         if (ret != -EAGAIN) {
1913                 err = ret;
1914                 goto out;
1915         }
1916
1917         leaf = path->nodes[0];
1918         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1919         refs = btrfs_extent_refs(leaf, item);
1920         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1921         if (extent_op)
1922                 __run_delayed_extent_op(extent_op, leaf, item);
1923
1924         btrfs_mark_buffer_dirty(leaf);
1925         btrfs_release_path(path);
1926
1927         path->reada = 1;
1928         path->leave_spinning = 1;
1929
1930         /* now insert the actual backref */
1931         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1932                                     path, bytenr, parent, root_objectid,
1933                                     owner, offset, refs_to_add);
1934         if (ret)
1935                 btrfs_abort_transaction(trans, root, ret);
1936 out:
1937         btrfs_free_path(path);
1938         return err;
1939 }
1940
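/*
 * process one delayed data ref: insert the reserved file extent, add a
 * back ref, or free the extent, depending on the ref's action.
 */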
1941 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1942                                 struct btrfs_root *root,
1943                                 struct btrfs_delayed_ref_node *node,
1944                                 struct btrfs_delayed_extent_op *extent_op,
1945                                 int insert_reserved)
1946 {
1947         int ret = 0;
1948         struct btrfs_delayed_data_ref *ref;
1949         struct btrfs_key ins;
1950         u64 parent = 0;
1951         u64 ref_root = 0;
1952         u64 flags = 0;
1953
1954         ins.objectid = node->bytenr;
1955         ins.offset = node->num_bytes;
1956         ins.type = BTRFS_EXTENT_ITEM_KEY;
1957
1958         ref = btrfs_delayed_node_to_data_ref(node);
1959         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1960                 parent = ref->parent;
1961         else
1962                 ref_root = ref->root;
1963
1964         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1965                 if (extent_op) {
1966                         BUG_ON(extent_op->update_key);
1967                         flags |= extent_op->flags_to_set;
1968                 }
1969                 ret = alloc_reserved_file_extent(trans, root,
1970                                                  parent, ref_root, flags,
1971                                                  ref->objectid, ref->offset,
1972                                                  &ins, node->ref_mod);
1973         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1974                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1975                                              node->num_bytes, parent,
1976                                              ref_root, ref->objectid,
1977                                              ref->offset, node->ref_mod,
1978                                              extent_op);
1979         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1980                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1981                                           node->num_bytes, parent,
1982                                           ref_root, ref->objectid,
1983                                           ref->offset, node->ref_mod,
1984                                           extent_op);
1985         } else {
1986                 BUG();
1987         }
1988         return ret;
1989 }
1990
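/* apply a delayed extent op's flag and key updates to an extent item */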
1991 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1992                                     struct extent_buffer *leaf,
1993                                     struct btrfs_extent_item *ei)
1994 {
1995         u64 flags = btrfs_extent_flags(leaf, ei);
1996         if (extent_op->update_flags) {
1997                 flags |= extent_op->flags_to_set;
1998                 btrfs_set_extent_flags(leaf, ei, flags);
1999         }
2000
2001         if (extent_op->update_key) {
2002                 struct btrfs_tree_block_info *bi;
2003                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2004                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2005                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2006         }
2007 }
2008
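/*
 * find the extent item a delayed extent op refers to and apply the flag
 * and key updates it carries.
 */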
2009 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2010                                  struct btrfs_root *root,
2011                                  struct btrfs_delayed_ref_node *node,
2012                                  struct btrfs_delayed_extent_op *extent_op)
2013 {
2014         struct btrfs_key key;
2015         struct btrfs_path *path;
2016         struct btrfs_extent_item *ei;
2017         struct extent_buffer *leaf;
2018         u32 item_size;
2019         int ret;
2020         int err = 0;
2021
2022         if (trans->aborted)
2023                 return 0;
2024
2025         path = btrfs_alloc_path();
2026         if (!path)
2027                 return -ENOMEM;
2028
2029         key.objectid = node->bytenr;
2030         key.type = BTRFS_EXTENT_ITEM_KEY;
2031         key.offset = node->num_bytes;
2032
2033         path->reada = 1;
2034         path->leave_spinning = 1;
2035         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2036                                 path, 0, 1);
2037         if (ret < 0) {
2038                 err = ret;
2039                 goto out;
2040         }
2041         if (ret > 0) {
2042                 err = -EIO;
2043                 goto out;
2044         }
2045
2046         leaf = path->nodes[0];
2047         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2048 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2049         if (item_size < sizeof(*ei)) {
2050                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2051                                              path, (u64)-1, 0);
2052                 if (ret < 0) {
2053                         err = ret;
2054                         goto out;
2055                 }
2056                 leaf = path->nodes[0];
2057                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2058         }
2059 #endif
2060         BUG_ON(item_size < sizeof(*ei));
2061         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2062         __run_delayed_extent_op(extent_op, leaf, ei);
2063
2064         btrfs_mark_buffer_dirty(leaf);
2065 out:
2066         btrfs_free_path(path);
2067         return err;
2068 }
2069
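/*
 * process one delayed tree block ref: insert the reserved tree block,
 * add a back ref, or free the block, depending on the ref's action.
 */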
2070 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2071                                 struct btrfs_root *root,
2072                                 struct btrfs_delayed_ref_node *node,
2073                                 struct btrfs_delayed_extent_op *extent_op,
2074                                 int insert_reserved)
2075 {
2076         int ret = 0;
2077         struct btrfs_delayed_tree_ref *ref;
2078         struct btrfs_key ins;
2079         u64 parent = 0;
2080         u64 ref_root = 0;
2081
2082         ins.objectid = node->bytenr;
2083         ins.offset = node->num_bytes;
2084         ins.type = BTRFS_EXTENT_ITEM_KEY;
2085
2086         ref = btrfs_delayed_node_to_tree_ref(node);
2087         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2088                 parent = ref->parent;
2089         else
2090                 ref_root = ref->root;
2091
2092         BUG_ON(node->ref_mod != 1);
2093         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2094                 BUG_ON(!extent_op || !extent_op->update_flags ||
2095                        !extent_op->update_key);
2096                 ret = alloc_reserved_tree_block(trans, root,
2097                                                 parent, ref_root,
2098                                                 extent_op->flags_to_set,
2099                                                 &extent_op->key,
2100                                                 ref->level, &ins);
2101         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2102                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2103                                              node->num_bytes, parent, ref_root,
2104                                              ref->level, 0, 1, extent_op);
2105         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2106                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2107                                           node->num_bytes, parent, ref_root,
2108                                           ref->level, 0, 1, extent_op);
2109         } else {
2110                 BUG();
2111         }
2112         return ret;
2113 }
2114
2115 /* helper function to actually process a single delayed ref entry */
2116 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2117                                struct btrfs_root *root,
2118                                struct btrfs_delayed_ref_node *node,
2119                                struct btrfs_delayed_extent_op *extent_op,
2120                                int insert_reserved)
2121 {
2122         int ret = 0;
2123
2124         if (trans->aborted)
2125                 return 0;
2126
2127         if (btrfs_delayed_ref_is_head(node)) {
2128                 struct btrfs_delayed_ref_head *head;
2129                 /*
2130                  * we've hit the end of the chain and we were supposed
2131                  * to insert this extent into the tree.  But it got
2132                  * deleted before we ever needed to insert it, so all
2133                  * we have to do is clean up the accounting.
2134                  */
2135                 BUG_ON(extent_op);
2136                 head = btrfs_delayed_node_to_head(node);
2137                 if (insert_reserved) {
2138                         btrfs_pin_extent(root, node->bytenr,
2139                                          node->num_bytes, 1);
2140                         if (head->is_data) {
2141                                 ret = btrfs_del_csums(trans, root,
2142                                                       node->bytenr,
2143                                                       node->num_bytes);
2144                         }
2145                 }
2146                 return ret;
2147         }
2148
2149         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2150             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2151                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2152                                            insert_reserved);
2153         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2154                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2155                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2156                                            insert_reserved);
2157         else
2158                 BUG();
2159         return ret;
2160 }
2161
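/* pick the next delayed ref to run for a head, preferring adds over drops */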
2162 static noinline struct btrfs_delayed_ref_node *
2163 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2164 {
2165         struct rb_node *node;
2166         struct btrfs_delayed_ref_node *ref;
2167         int action = BTRFS_ADD_DELAYED_REF;
2168 again:
2169         /*
2170          * select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
2171          * this prevents the ref count from going down to zero while
2172          * there are still pending delayed refs.
2173          */
2174         node = rb_prev(&head->node.rb_node);
2175         while (1) {
2176                 if (!node)
2177                         break;
2178                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2179                                 rb_node);
2180                 if (ref->bytenr != head->node.bytenr)
2181                         break;
2182                 if (ref->action == action)
2183                         return ref;
2184                 node = rb_prev(node);
2185         }
2186         if (action == BTRFS_ADD_DELAYED_REF) {
2187                 action = BTRFS_DROP_DELAYED_REF;
2188                 goto again;
2189         }
2190         return NULL;
2191 }
2192
2193 /*
2194  * Returns 0 on success or if called with an already aborted transaction.
2195  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2196  */
2197 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2198                                        struct btrfs_root *root,
2199                                        struct list_head *cluster)
2200 {
2201         struct btrfs_delayed_ref_root *delayed_refs;
2202         struct btrfs_delayed_ref_node *ref;
2203         struct btrfs_delayed_ref_head *locked_ref = NULL;
2204         struct btrfs_delayed_extent_op *extent_op;
2205         struct btrfs_fs_info *fs_info = root->fs_info;
2206         int ret;
2207         int count = 0;
2208         int must_insert_reserved = 0;
2209
2210         delayed_refs = &trans->transaction->delayed_refs;
2211         while (1) {
2212                 if (!locked_ref) {
2213                         /* pick a new head ref from the cluster list */
2214                         if (list_empty(cluster))
2215                                 break;
2216
2217                         locked_ref = list_entry(cluster->next,
2218                                      struct btrfs_delayed_ref_head, cluster);
2219
2220                         /* grab the lock that says we are going to process
2221                          * all the refs for this head */
2222                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2223
2224                         /*
2225                          * we may have dropped the spin lock to get the head
2226                          * mutex lock, and that might have given someone else
2227                          * time to free the head.  If that's true, it has been
2228                          * removed from our list and we can move on.
2229                          */
2230                         if (ret == -EAGAIN) {
2231                                 locked_ref = NULL;
2232                                 count++;
2233                                 continue;
2234                         }
2235                 }
2236
2237                 /*
2238                  * We need to try and merge add/drops of the same ref since we
2239                  * can run into issues with relocate dropping the implicit ref
2240                  * and then it being added back again before the drop can
2241                  * finish.  If we merged anything we need to re-loop so we can
2242                  * get a good ref.
2243                  */
2244                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2245                                          locked_ref);
2246
2247                 /*
2248                  * locked_ref is the head node, so we have to go one
2249                  * node back for any delayed ref updates
2250                  */
2251                 ref = select_delayed_ref(locked_ref);
2252
2253                 if (ref && ref->seq &&
2254                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2255                         /*
2256                          * there are still refs with lower seq numbers in the
2257                          * process of being added. Don't run this ref yet.
2258                          */
2259                         list_del_init(&locked_ref->cluster);
2260                         btrfs_delayed_ref_unlock(locked_ref);
2261                         locked_ref = NULL;
2262                         delayed_refs->num_heads_ready++;
2263                         spin_unlock(&delayed_refs->lock);
2264                         cond_resched();
2265                         spin_lock(&delayed_refs->lock);
2266                         continue;
2267                 }
2268
2269                 /*
2270                  * record the must insert reserved flag before we
2271                  * drop the spin lock.
2272                  */
2273                 must_insert_reserved = locked_ref->must_insert_reserved;
2274                 locked_ref->must_insert_reserved = 0;
2275
2276                 extent_op = locked_ref->extent_op;
2277                 locked_ref->extent_op = NULL;
2278
2279                 if (!ref) {
2280                         /* All delayed refs have been processed. Go ahead
2281                          * and send the head node to run_one_delayed_ref,
2282                          * so that any accounting fixes can happen.
2283                          */
2284                         ref = &locked_ref->node;
2285
2286                         if (extent_op && must_insert_reserved) {
2287                                 btrfs_free_delayed_extent_op(extent_op);
2288                                 extent_op = NULL;
2289                         }
2290
2291                         if (extent_op) {
2292                                 spin_unlock(&delayed_refs->lock);
2293
2294                                 ret = run_delayed_extent_op(trans, root,
2295                                                             ref, extent_op);
2296                                 btrfs_free_delayed_extent_op(extent_op);
2297
2298                                 if (ret) {
2299                                         printk(KERN_DEBUG
2300                                                "btrfs: run_delayed_extent_op "
2301                                                "returned %d\n", ret);
2302                                         spin_lock(&delayed_refs->lock);
2303                                         btrfs_delayed_ref_unlock(locked_ref);
2304                                         return ret;
2305                                 }
2306
2307                                 goto next;
2308                         }
2309                 }
2310
2311                 ref->in_tree = 0;
2312                 rb_erase(&ref->rb_node, &delayed_refs->root);
2313                 delayed_refs->num_entries--;
2314                 if (!btrfs_delayed_ref_is_head(ref)) {
2315                         /*
2316                          * when we play the delayed ref, also correct the
2317                          * ref_mod on head
2318                          */
2319                         switch (ref->action) {
2320                         case BTRFS_ADD_DELAYED_REF:
2321                         case BTRFS_ADD_DELAYED_EXTENT:
2322                                 locked_ref->node.ref_mod -= ref->ref_mod;
2323                                 break;
2324                         case BTRFS_DROP_DELAYED_REF:
2325                                 locked_ref->node.ref_mod += ref->ref_mod;
2326                                 break;
2327                         default:
2328                                 WARN_ON(1);
2329                         }
2330                 }
2331                 spin_unlock(&delayed_refs->lock);
2332
2333                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2334                                           must_insert_reserved);
2335
2336                 btrfs_free_delayed_extent_op(extent_op);
2337                 if (ret) {
2338                         btrfs_delayed_ref_unlock(locked_ref);
2339                         btrfs_put_delayed_ref(ref);
2340                         printk(KERN_DEBUG
2341                                "btrfs: run_one_delayed_ref returned %d\n", ret);
2342                         spin_lock(&delayed_refs->lock);
2343                         return ret;
2344                 }
2345
2346                 /*
2347                  * If this node is a head, that means all the refs in this head
2348                  * have been dealt with, and we will pick the next head to deal
2349                  * with, so we must unlock the head and drop it from the cluster
2350                  * list before we release it.
2351                  */
2352                 if (btrfs_delayed_ref_is_head(ref)) {
2353                         list_del_init(&locked_ref->cluster);
2354                         btrfs_delayed_ref_unlock(locked_ref);
2355                         locked_ref = NULL;
2356                 }
2357                 btrfs_put_delayed_ref(ref);
2358                 count++;
2359 next:
2360                 cond_resched();
2361                 spin_lock(&delayed_refs->lock);
2362         }
2363         return count;
2364 }
2365
2366 #ifdef SCRAMBLE_DELAYED_REFS
2367 /*
2368  * Normally delayed refs get processed in ascending bytenr order. This
2369  * correlates in most cases to the order added. To expose dependencies on this
2370  * order, we start to process the tree in the middle instead of the beginning
2371  */
2372 static u64 find_middle(struct rb_root *root)
2373 {
2374         struct rb_node *n = root->rb_node;
2375         struct btrfs_delayed_ref_node *entry;
2376         int alt = 1;
2377         u64 middle;
2378         u64 first = 0, last = 0;
2379
2380         n = rb_first(root);
2381         if (n) {
2382                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2383                 first = entry->bytenr;
2384         }
2385         n = rb_last(root);
2386         if (n) {
2387                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2388                 last = entry->bytenr;
2389         }
2390         n = root->rb_node;
2391
2392         while (n) {
2393                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2394                 WARN_ON(!entry->in_tree);
2395
2396                 middle = entry->bytenr;
2397
2398                 if (alt)
2399                         n = n->rb_left;
2400                 else
2401                         n = n->rb_right;
2402
2403                 alt = 1 - alt;
2404         }
2405         return middle;
2406 }
2407 #endif
2408
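/*
 * run the qgroup ref updates queued on this transaction and drop its
 * tree mod seq element.
 */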
2409 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2410                                          struct btrfs_fs_info *fs_info)
2411 {
2412         struct qgroup_update *qgroup_update;
2413         int ret = 0;
2414
2415         if (list_empty(&trans->qgroup_ref_list) !=
2416             !trans->delayed_ref_elem.seq) {
2417                 /* list without seq or seq without list */
2418                 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2419                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2420                         trans->delayed_ref_elem.seq);
2421                 BUG();
2422         }
2423
2424         if (!trans->delayed_ref_elem.seq)
2425                 return 0;
2426
2427         while (!list_empty(&trans->qgroup_ref_list)) {
2428                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2429                                                  struct qgroup_update, list);
2430                 list_del(&qgroup_update->list);
2431                 if (!ret)
2432                         ret = btrfs_qgroup_account_ref(
2433                                         trans, fs_info, qgroup_update->node,
2434                                         qgroup_update->extent_op);
2435                 kfree(qgroup_update);
2436         }
2437
2438         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2439
2440         return ret;
2441 }
2442
2443 /*
2444  * this starts processing the delayed reference count updates and
2445  * extent insertions we have queued up so far.  count can be
2446  * 0, which means to process everything in the tree at the start
2447  * of the run (but not newly added entries), or it can be some target
2448  * number you'd like to process.
2449  *
2450  * Returns 0 on success or if called with an aborted transaction.
2451  * Returns <0 on error and aborts the transaction.
2452  */
2453 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2454                            struct btrfs_root *root, unsigned long count)
2455 {
2456         struct rb_node *node;
2457         struct btrfs_delayed_ref_root *delayed_refs;
2458         struct btrfs_delayed_ref_node *ref;
2459         struct list_head cluster;
2460         int ret;
2461         u64 delayed_start;
2462         int run_all = count == (unsigned long)-1;
2463         int run_most = 0;
2464         int loops;
2465
2466         /* We'll clean this up in btrfs_cleanup_transaction */
2467         if (trans->aborted)
2468                 return 0;
2469
2470         if (root == root->fs_info->extent_root)
2471                 root = root->fs_info->tree_root;
2472
2473         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2474
2475         delayed_refs = &trans->transaction->delayed_refs;
2476         INIT_LIST_HEAD(&cluster);
2477 again:
2478         loops = 0;
2479         spin_lock(&delayed_refs->lock);
2480
2481 #ifdef SCRAMBLE_DELAYED_REFS
2482         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2483 #endif
2484
2485         if (count == 0) {
2486                 count = delayed_refs->num_entries * 2;
2487                 run_most = 1;
2488         }
2489         while (1) {
2490                 if (!(run_all || run_most) &&
2491                     delayed_refs->num_heads_ready < 64)
2492                         break;
2493
2494                 /*
2495                  * go find something we can process in the rbtree.  We start at
2496                  * the beginning of the tree, and then build a cluster
2497                  * of refs to process starting at the first one we are able to
2498                  * lock
2499                  */
2500                 delayed_start = delayed_refs->run_delayed_start;
2501                 ret = btrfs_find_ref_cluster(trans, &cluster,
2502                                              delayed_refs->run_delayed_start);
2503                 if (ret)
2504                         break;
2505
2506                 ret = run_clustered_refs(trans, root, &cluster);
2507                 if (ret < 0) {
2508                         btrfs_release_ref_cluster(&cluster);
2509                         spin_unlock(&delayed_refs->lock);
2510                         btrfs_abort_transaction(trans, root, ret);
2511                         return ret;
2512                 }
2513
2514                 count -= min_t(unsigned long, ret, count);
2515
2516                 if (count == 0)
2517                         break;
2518
2519                 if (delayed_start >= delayed_refs->run_delayed_start) {
2520                         if (loops == 0) {
2521                                 /*
2522                                  * btrfs_find_ref_cluster looped. let's do one
2523                                  * more cycle. if we don't run any delayed refs
2524                                  * during that cycle (because all of them are
2525                                  * blocked), bail out.
2526                                  */
2527                                 loops = 1;
2528                         } else {
2529                                 /*
2530                                  * no runnable refs left, stop trying
2531                                  */
2532                                 BUG_ON(run_all);
2533                                 break;
2534                         }
2535                 }
2536                 if (ret) {
2537                         /* refs were run, let's reset staleness detection */
2538                         loops = 0;
2539                 }
2540         }
2541
2542         if (run_all) {
2543                 if (!list_empty(&trans->new_bgs)) {
2544                         spin_unlock(&delayed_refs->lock);
2545                         btrfs_create_pending_block_groups(trans, root);
2546                         spin_lock(&delayed_refs->lock);
2547                 }
2548
2549                 node = rb_first(&delayed_refs->root);
2550                 if (!node)
2551                         goto out;
2552                 count = (unsigned long)-1;
2553
2554                 while (node) {
2555                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2556                                        rb_node);
2557                         if (btrfs_delayed_ref_is_head(ref)) {
2558                                 struct btrfs_delayed_ref_head *head;
2559
2560                                 head = btrfs_delayed_node_to_head(ref);
2561                                 atomic_inc(&ref->refs);
2562
2563                                 spin_unlock(&delayed_refs->lock);
2564                                 /*
2565                                  * Mutex was contended, block until it's
2566                                  * released and try again
2567                                  */
2568                                 mutex_lock(&head->mutex);
2569                                 mutex_unlock(&head->mutex);
2570
2571                                 btrfs_put_delayed_ref(ref);
2572                                 cond_resched();
2573                                 goto again;
2574                         }
2575                         node = rb_next(node);
2576                 }
2577                 spin_unlock(&delayed_refs->lock);
2578                 schedule_timeout(1);
2579                 goto again;
2580         }
2581 out:
2582         spin_unlock(&delayed_refs->lock);
2583         assert_qgroups_uptodate(trans);
2584         return 0;
2585 }
2586
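/* queue a delayed extent op that sets the given flags on an extent item */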
2587 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2588                                 struct btrfs_root *root,
2589                                 u64 bytenr, u64 num_bytes, u64 flags,
2590                                 int is_data)
2591 {
2592         struct btrfs_delayed_extent_op *extent_op;
2593         int ret;
2594
2595         extent_op = btrfs_alloc_delayed_extent_op();
2596         if (!extent_op)
2597                 return -ENOMEM;
2598
2599         extent_op->flags_to_set = flags;
2600         extent_op->update_flags = 1;
2601         extent_op->update_key = 0;
2602         extent_op->is_data = is_data ? 1 : 0;
2603
2604         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2605                                           num_bytes, extent_op);
2606         if (ret)
2607                 btrfs_free_delayed_extent_op(extent_op);
2608         return ret;
2609 }
2610
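/*
 * check whether the delayed refs for this extent reference it from
 * anywhere other than the given root/objectid/offset.  Returns 0 if the
 * only delayed ref is our own, 1 if a cross reference may exist, -ENOENT
 * if there are no delayed refs, or -EAGAIN if the head mutex was
 * contended.
 */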
2611 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2612                                       struct btrfs_root *root,
2613                                       struct btrfs_path *path,
2614                                       u64 objectid, u64 offset, u64 bytenr)
2615 {
2616         struct btrfs_delayed_ref_head *head;
2617         struct btrfs_delayed_ref_node *ref;
2618         struct btrfs_delayed_data_ref *data_ref;
2619         struct btrfs_delayed_ref_root *delayed_refs;
2620         struct rb_node *node;
2621         int ret = 0;
2622
2623         ret = -ENOENT;
2624         delayed_refs = &trans->transaction->delayed_refs;
2625         spin_lock(&delayed_refs->lock);
2626         head = btrfs_find_delayed_ref_head(trans, bytenr);
2627         if (!head)
2628                 goto out;
2629
2630         if (!mutex_trylock(&head->mutex)) {
2631                 atomic_inc(&head->node.refs);
2632                 spin_unlock(&delayed_refs->lock);
2633
2634                 btrfs_release_path(path);
2635
2636                 /*
2637                  * Mutex was contended, block until it's released and let
2638                  * caller try again
2639                  */
2640                 mutex_lock(&head->mutex);
2641                 mutex_unlock(&head->mutex);
2642                 btrfs_put_delayed_ref(&head->node);
2643                 return -EAGAIN;
2644         }
2645
2646         node = rb_prev(&head->node.rb_node);
2647         if (!node)
2648                 goto out_unlock;
2649
2650         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2651
2652         if (ref->bytenr != bytenr)
2653                 goto out_unlock;
2654
2655         ret = 1;
2656         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2657                 goto out_unlock;
2658
2659         data_ref = btrfs_delayed_node_to_data_ref(ref);
2660
2661         node = rb_prev(node);
2662         if (node) {
2663                 int seq = ref->seq;
2664
2665                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2666                 if (ref->bytenr == bytenr && ref->seq == seq)
2667                         goto out_unlock;
2668         }
2669
2670         if (data_ref->root != root->root_key.objectid ||
2671             data_ref->objectid != objectid || data_ref->offset != offset)
2672                 goto out_unlock;
2673
2674         ret = 0;
2675 out_unlock:
2676         mutex_unlock(&head->mutex);
2677 out:
2678         spin_unlock(&delayed_refs->lock);
2679         return ret;
2680 }
2681
2682 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2683                                         struct btrfs_root *root,
2684                                         struct btrfs_path *path,
2685                                         u64 objectid, u64 offset, u64 bytenr)
2686 {
2687         struct btrfs_root *extent_root = root->fs_info->extent_root;
2688         struct extent_buffer *leaf;
2689         struct btrfs_extent_data_ref *ref;
2690         struct btrfs_extent_inline_ref *iref;
2691         struct btrfs_extent_item *ei;
2692         struct btrfs_key key;
2693         u32 item_size;
2694         int ret;
2695
2696         key.objectid = bytenr;
2697         key.offset = (u64)-1;
2698         key.type = BTRFS_EXTENT_ITEM_KEY;
2699
2700         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2701         if (ret < 0)
2702                 goto out;
2703         BUG_ON(ret == 0); /* Corruption */
2704
2705         ret = -ENOENT;
2706         if (path->slots[0] == 0)
2707                 goto out;
2708
2709         path->slots[0]--;
2710         leaf = path->nodes[0];
2711         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2712
2713         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2714                 goto out;
2715
2716         ret = 1;
2717         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2718 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2719         if (item_size < sizeof(*ei)) {
2720                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2721                 goto out;
2722         }
2723 #endif
2724         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2725
2726         if (item_size != sizeof(*ei) +
2727             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2728                 goto out;
2729
2730         if (btrfs_extent_generation(leaf, ei) <=
2731             btrfs_root_last_snapshot(&root->root_item))
2732                 goto out;
2733
2734         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2735         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2736             BTRFS_EXTENT_DATA_REF_KEY)
2737                 goto out;
2738
2739         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2740         if (btrfs_extent_refs(leaf, ei) !=
2741             btrfs_extent_data_ref_count(leaf, ref) ||
2742             btrfs_extent_data_ref_root(leaf, ref) !=
2743             root->root_key.objectid ||
2744             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2745             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2746                 goto out;
2747
2748         ret = 0;
2749 out:
2750         return ret;
2751 }
2752
2753 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2754                           struct btrfs_root *root,
2755                           u64 objectid, u64 offset, u64 bytenr)
2756 {
2757         struct btrfs_path *path;
2758         int ret;
2759         int ret2;
2760
2761         path = btrfs_alloc_path();
2762         if (!path)
2763                 return -ENOMEM;
2764
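        /*
         * check_delayed_ref() drops @path and returns -EAGAIN when the
         * delayed ref head mutex is contended, so both checks are simply
         * retried together until the delayed ref check completes.
         */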
2765         do {
2766                 ret = check_committed_ref(trans, root, path, objectid,
2767                                           offset, bytenr);
2768                 if (ret && ret != -ENOENT)
2769                         goto out;
2770
2771                 ret2 = check_delayed_ref(trans, root, path, objectid,
2772                                          offset, bytenr);
2773         } while (ret2 == -EAGAIN);
2774
2775         if (ret2 && ret2 != -ENOENT) {
2776                 ret = ret2;
2777                 goto out;
2778         }
2779
2780         if (ret != -ENOENT || ret2 != -ENOENT)
2781                 ret = 0;
2782 out:
2783         btrfs_free_path(path);
2784         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2785                 WARN_ON(ret > 0);
2786         return ret;
2787 }
2788
2789 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2790                            struct btrfs_root *root,
2791                            struct extent_buffer *buf,
2792                            int full_backref, int inc, int for_cow)
2793 {
2794         u64 bytenr;
2795         u64 num_bytes;
2796         u64 parent;
2797         u64 ref_root;
2798         u32 nritems;
2799         struct btrfs_key key;
2800         struct btrfs_file_extent_item *fi;
2801         int i;
2802         int level;
2803         int ret = 0;
2804         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2805                             u64, u64, u64, u64, u64, u64, int);
2806
2807         ref_root = btrfs_header_owner(buf);
2808         nritems = btrfs_header_nritems(buf);
2809         level = btrfs_header_level(buf);
2810
2811         if (!root->ref_cows && level == 0)
2812                 return 0;
2813
2814         if (inc)
2815                 process_func = btrfs_inc_extent_ref;
2816         else
2817                 process_func = btrfs_free_extent;
2818
2819         if (full_backref)
2820                 parent = buf->start;
2821         else
2822                 parent = 0;
2823
2824         for (i = 0; i < nritems; i++) {
2825                 if (level == 0) {
2826                         btrfs_item_key_to_cpu(buf, &key, i);
2827                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2828                                 continue;
2829                         fi = btrfs_item_ptr(buf, i,
2830                                             struct btrfs_file_extent_item);
2831                         if (btrfs_file_extent_type(buf, fi) ==
2832                             BTRFS_FILE_EXTENT_INLINE)
2833                                 continue;
2834                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2835                         if (bytenr == 0)
2836                                 continue;
2837
2838                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2839                         key.offset -= btrfs_file_extent_offset(buf, fi);
2840                         ret = process_func(trans, root, bytenr, num_bytes,
2841                                            parent, ref_root, key.objectid,
2842                                            key.offset, for_cow);
2843                         if (ret)
2844                                 goto fail;
2845                 } else {
2846                         bytenr = btrfs_node_blockptr(buf, i);
2847                         num_bytes = btrfs_level_size(root, level - 1);
2848                         ret = process_func(trans, root, bytenr, num_bytes,
2849                                            parent, ref_root, level - 1, 0,
2850                                            for_cow);
2851                         if (ret)
2852                                 goto fail;
2853                 }
2854         }
2855         return 0;
2856 fail:
2857         return ret;
2858 }
2859
2860 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2861                   struct extent_buffer *buf, int full_backref, int for_cow)
2862 {
2863         return __btrfs_mod_ref(trans, root, buf, full_backref, 1, for_cow);
2864 }
2865
2866 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2867                   struct extent_buffer *buf, int full_backref, int for_cow)
2868 {
2869         return __btrfs_mod_ref(trans, root, buf, full_backref, 0, for_cow);
2870 }
2871
2872 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2873                                  struct btrfs_root *root,
2874                                  struct btrfs_path *path,
2875                                  struct btrfs_block_group_cache *cache)
2876 {
2877         int ret;
2878         struct btrfs_root *extent_root = root->fs_info->extent_root;
2879         unsigned long bi;
2880         struct extent_buffer *leaf;
2881
2882         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2883         if (ret < 0)
2884                 goto fail;
2885         BUG_ON(ret); /* Corruption */
2886
2887         leaf = path->nodes[0];
2888         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2889         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2890         btrfs_mark_buffer_dirty(leaf);
2891         btrfs_release_path(path);
2892 fail:
2893         if (ret) {
2894                 btrfs_abort_transaction(trans, root, ret);
2895                 return ret;
2896         }
2897         return 0;
2898
2899 }
2900
2901 static struct btrfs_block_group_cache *
2902 next_block_group(struct btrfs_root *root,
2903                  struct btrfs_block_group_cache *cache)
2904 {
2905         struct rb_node *node;
2906         spin_lock(&root->fs_info->block_group_cache_lock);
2907         node = rb_next(&cache->cache_node);
2908         btrfs_put_block_group(cache);
2909         if (node) {
2910                 cache = rb_entry(node, struct btrfs_block_group_cache,
2911                                  cache_node);
2912                 btrfs_get_block_group(cache);
2913         } else
2914                 cache = NULL;
2915         spin_unlock(&root->fs_info->block_group_cache_lock);
2916         return cache;
2917 }
2918
2919 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2920                             struct btrfs_trans_handle *trans,
2921                             struct btrfs_path *path)
2922 {
2923         struct btrfs_root *root = block_group->fs_info->tree_root;
2924         struct inode *inode = NULL;
2925         u64 alloc_hint = 0;
2926         int dcs = BTRFS_DC_ERROR;
2927         int num_pages = 0;
2928         int retries = 0;
2929         int ret = 0;
2930
2931         /*
2932          * If this block group is smaller than 100 megs don't bother caching the
2933          * block group.
2934          */
2935         if (block_group->key.offset < (100 * 1024 * 1024)) {
2936                 spin_lock(&block_group->lock);
2937                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2938                 spin_unlock(&block_group->lock);
2939                 return 0;
2940         }
2941
2942 again:
2943         inode = lookup_free_space_inode(root, block_group, path);
2944         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2945                 ret = PTR_ERR(inode);
2946                 btrfs_release_path(path);
2947                 goto out;
2948         }
2949
2950         if (IS_ERR(inode)) {
2951                 BUG_ON(retries);
2952                 retries++;
2953
2954                 if (block_group->ro)
2955                         goto out_free;
2956
2957                 ret = create_free_space_inode(root, trans, block_group, path);
2958                 if (ret)
2959                         goto out_free;
2960                 goto again;
2961         }
2962
2963         /* We've already set up this transaction, go ahead and exit */
2964         if (block_group->cache_generation == trans->transid &&
2965             i_size_read(inode)) {
2966                 dcs = BTRFS_DC_SETUP;
2967                 goto out_put;
2968         }
2969
2970         /*
2971          * We want to set the generation to 0, that way if anything goes wrong
2972          * from here on out we know not to trust this cache when we load up next
2973          * time.
2974          */
2975         BTRFS_I(inode)->generation = 0;
2976         ret = btrfs_update_inode(trans, root, inode);
2977         WARN_ON(ret);
2978
2979         if (i_size_read(inode) > 0) {
2980                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2981                                                       inode);
2982                 if (ret)
2983                         goto out_put;
2984         }
2985
2986         spin_lock(&block_group->lock);
2987         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2988             !btrfs_test_opt(root, SPACE_CACHE)) {
2989                 /*
2990                  * don't bother trying to write stuff out _if_
2991                  * a) we're not cached,
2992                  * b) we're mounted with the nospace_cache option.
2993                  */
2994                 dcs = BTRFS_DC_WRITTEN;
2995                 spin_unlock(&block_group->lock);
2996                 goto out_put;
2997         }
2998         spin_unlock(&block_group->lock);
2999
3000         /*
3001          * Try to preallocate enough space based on how big the block group is.
3002          * Keep in mind this has to include any pinned space which could end up
3003          * taking up quite a bit since it's not folded into the other space
3004          * cache.
3005          */
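        /*
         * As a rough example, assuming 4 KiB pages: a 1 GiB block group gives
         * div64_u64(1 GiB, 256 MiB) = 4, so 4 * 16 = 64 pages, i.e. a 256 KiB
         * preallocation for the free space cache file.
         */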
3006         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3007         if (!num_pages)
3008                 num_pages = 1;
3009
3010         num_pages *= 16;
3011         num_pages *= PAGE_CACHE_SIZE;
3012
3013         ret = btrfs_check_data_free_space(inode, num_pages);
3014         if (ret)
3015                 goto out_put;
3016
3017         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3018                                               num_pages, num_pages,
3019                                               &alloc_hint);
3020         if (!ret)
3021                 dcs = BTRFS_DC_SETUP;
3022         btrfs_free_reserved_data_space(inode, num_pages);
3023
3024 out_put:
3025         iput(inode);
3026 out_free:
3027         btrfs_release_path(path);
3028 out:
3029         spin_lock(&block_group->lock);
3030         if (!ret && dcs == BTRFS_DC_SETUP)
3031                 block_group->cache_generation = trans->transid;
3032         block_group->disk_cache_state = dcs;
3033         spin_unlock(&block_group->lock);
3034
3035         return ret;
3036 }
3037
3038 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3039                                    struct btrfs_root *root)
3040 {
3041         struct btrfs_block_group_cache *cache;
3042         int err = 0;
3043         struct btrfs_path *path;
3044         u64 last = 0;
3045
3046         path = btrfs_alloc_path();
3047         if (!path)
3048                 return -ENOMEM;
3049
3050 again:
3051         while (1) {
3052                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3053                 while (cache) {
3054                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3055                                 break;
3056                         cache = next_block_group(root, cache);
3057                 }
3058                 if (!cache) {
3059                         if (last == 0)
3060                                 break;
3061                         last = 0;
3062                         continue;
3063                 }
3064                 err = cache_save_setup(cache, trans, path);
3065                 last = cache->key.objectid + cache->key.offset;
3066                 btrfs_put_block_group(cache);
3067         }
3068
3069         while (1) {
3070                 if (last == 0) {
3071                         err = btrfs_run_delayed_refs(trans, root,
3072                                                      (unsigned long)-1);
3073                         if (err) /* File system offline */
3074                                 goto out;
3075                 }
3076
3077                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3078                 while (cache) {
3079                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3080                                 btrfs_put_block_group(cache);
3081                                 goto again;
3082                         }
3083
3084                         if (cache->dirty)
3085                                 break;
3086                         cache = next_block_group(root, cache);
3087                 }
3088                 if (!cache) {
3089                         if (last == 0)
3090                                 break;
3091                         last = 0;
3092                         continue;
3093                 }
3094
3095                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3096                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3097                 cache->dirty = 0;
3098                 last = cache->key.objectid + cache->key.offset;
3099
3100                 err = write_one_cache_group(trans, root, path, cache);
3101                 if (err) /* File system offline */
3102                         goto out;
3103
3104                 btrfs_put_block_group(cache);
3105         }
3106
3107         while (1) {
3108                 /*
3109                  * I don't think this is needed since we're just marking our
3110                  * preallocated extent as written, but just in case, it can't
3111                  * hurt.
3112                  */
3113                 if (last == 0) {
3114                         err = btrfs_run_delayed_refs(trans, root,
3115                                                      (unsigned long)-1);
3116                         if (err) /* File system offline */
3117                                 goto out;
3118                 }
3119
3120                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3121                 while (cache) {
3122                         /*
3123                          * Really this shouldn't happen, but it could if we
3124                          * couldn't write the entire preallocated extent and
3125                          * splitting the extent resulted in a new block.
3126                          */
3127                         if (cache->dirty) {
3128                                 btrfs_put_block_group(cache);
3129                                 goto again;
3130                         }
3131                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3132                                 break;
3133                         cache = next_block_group(root, cache);
3134                 }
3135                 if (!cache) {
3136                         if (last == 0)
3137                                 break;
3138                         last = 0;
3139                         continue;
3140                 }
3141
3142                 err = btrfs_write_out_cache(root, trans, cache, path);
3143
3144                 /*
3145                  * If we didn't have an error then the cache state is still
3146                  * NEED_WRITE, so we can set it to WRITTEN.
3147                  */
3148                 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3149                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
3150                 last = cache->key.objectid + cache->key.offset;
3151                 btrfs_put_block_group(cache);
3152         }
3153 out:
3154
3155         btrfs_free_path(path);
3156         return err;
3157 }
3158
3159 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3160 {
3161         struct btrfs_block_group_cache *block_group;
3162         int readonly = 0;
3163
3164         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3165         if (!block_group || block_group->ro)
3166                 readonly = 1;
3167         if (block_group)
3168                 btrfs_put_block_group(block_group);
3169         return readonly;
3170 }
3171
3172 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3173                              u64 total_bytes, u64 bytes_used,
3174                              struct btrfs_space_info **space_info)
3175 {
3176         struct btrfs_space_info *found;
3177         int i;
3178         int factor;
3179
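        /*
         * DUP, RAID1 and RAID10 store two copies of everything, so the
         * on-disk totals (disk_total/disk_used) are twice the logical ones.
         */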
3180         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3181                      BTRFS_BLOCK_GROUP_RAID10))
3182                 factor = 2;
3183         else
3184                 factor = 1;
3185
3186         found = __find_space_info(info, flags);
3187         if (found) {
3188                 spin_lock(&found->lock);
3189                 found->total_bytes += total_bytes;
3190                 found->disk_total += total_bytes * factor;
3191                 found->bytes_used += bytes_used;
3192                 found->disk_used += bytes_used * factor;
3193                 found->full = 0;
3194                 spin_unlock(&found->lock);
3195                 *space_info = found;
3196                 return 0;
3197         }
3198         found = kzalloc(sizeof(*found), GFP_NOFS);
3199         if (!found)
3200                 return -ENOMEM;
3201
3202         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3203                 INIT_LIST_HEAD(&found->block_groups[i]);
3204         init_rwsem(&found->groups_sem);
3205         spin_lock_init(&found->lock);
3206         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3207         found->total_bytes = total_bytes;
3208         found->disk_total = total_bytes * factor;
3209         found->bytes_used = bytes_used;
3210         found->disk_used = bytes_used * factor;
3211         found->bytes_pinned = 0;
3212         found->bytes_reserved = 0;
3213         found->bytes_readonly = 0;
3214         found->bytes_may_use = 0;
3215         found->full = 0;
3216         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3217         found->chunk_alloc = 0;
3218         found->flush = 0;
3219         init_waitqueue_head(&found->wait);
3220         *space_info = found;
3221         list_add_rcu(&found->list, &info->space_info);
3222         if (flags & BTRFS_BLOCK_GROUP_DATA)
3223                 info->data_sinfo = found;
3224         return 0;
3225 }
3226
3227 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3228 {
3229         u64 extra_flags = chunk_to_extended(flags) &
3230                                 BTRFS_EXTENDED_PROFILE_MASK;
3231
3232         if (flags & BTRFS_BLOCK_GROUP_DATA)
3233                 fs_info->avail_data_alloc_bits |= extra_flags;
3234         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3235                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3236         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3237                 fs_info->avail_system_alloc_bits |= extra_flags;
3238 }
3239
3240 /*
3241  * returns target flags in extended format or 0 if restripe for this
3242  * chunk_type is not in progress
3243  *
3244  * should be called with either volume_mutex or balance_lock held
3245  */
3246 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3247 {
3248         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3249         u64 target = 0;
3250
3251         if (!bctl)
3252                 return 0;
3253
3254         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3255             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3256                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3257         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3258                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3259                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3260         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3261                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3262                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3263         }
3264
3265         return target;
3266 }
3267
3268 /*
3269  * @flags: available profiles in extended format (see ctree.h)
3270  *
3271  * Returns reduced profile in chunk format.  If profile changing is in
3272  * progress (either running or paused) picks the target profile (if it's
3273  * already available), otherwise falls back to plain reducing.
3274  */
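/*
 * For example, on a 2-device filesystem with no restripe in progress and
 * available profiles RAID10|RAID1|RAID0: RAID10 is dropped because
 * num_devices < 4, then RAID0 is dropped because RAID1 is still set, so
 * RAID1 is what gets returned (converted to chunk format by
 * extended_to_chunk()).
 */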
3275 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 {
3277         /*
3278          * we add in the count of missing devices because we want
3279          * to make sure that any RAID levels on a degraded FS
3280          * continue to be honored.
3281          */
3282         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3283                 root->fs_info->fs_devices->missing_devices;
3284         u64 target;
3285
3286         /*
3287          * see if restripe for this chunk_type is in progress, if so
3288          * try to reduce to the target profile
3289          */
3290         spin_lock(&root->fs_info->balance_lock);
3291         target = get_restripe_target(root->fs_info, flags);
3292         if (target) {
3293                 /* pick target profile only if it's already available */
3294                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3295                         spin_unlock(&root->fs_info->balance_lock);
3296                         return extended_to_chunk(target);
3297                 }
3298         }
3299         spin_unlock(&root->fs_info->balance_lock);
3300
3301         if (num_devices == 1)
3302                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3303         if (num_devices < 4)
3304                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3305
3306         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3307             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3308                       BTRFS_BLOCK_GROUP_RAID10))) {
3309                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
3310         }
3311
3312         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3313             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3314                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3315         }
3316
3317         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3318             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3319              (flags & BTRFS_BLOCK_GROUP_RAID10) |
3320              (flags & BTRFS_BLOCK_GROUP_DUP))) {
3321                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3322         }
3323
3324         return extended_to_chunk(flags);
3325 }
3326
3327 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3328 {
3329         if (flags & BTRFS_BLOCK_GROUP_DATA)
3330                 flags |= root->fs_info->avail_data_alloc_bits;
3331         else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3332                 flags |= root->fs_info->avail_system_alloc_bits;
3333         else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3334                 flags |= root->fs_info->avail_metadata_alloc_bits;
3335
3336         return btrfs_reduce_alloc_profile(root, flags);
3337 }
3338
3339 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3340 {
3341         u64 flags;
3342
3343         if (data)
3344                 flags = BTRFS_BLOCK_GROUP_DATA;
3345         else if (root == root->fs_info->chunk_root)
3346                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3347         else
3348                 flags = BTRFS_BLOCK_GROUP_METADATA;
3349
3350         return get_alloc_profile(root, flags);
3351 }
3352
3353 /*
3354  * This will check the space that the inode allocates from to make sure we have
3355  * enough space for bytes.
3356  */
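/*
 * A rough usage sketch, mirroring what cache_save_setup() does above with
 * num_pages:
 *
 *	ret = btrfs_check_data_free_space(inode, bytes);
 *	if (ret)
 *		return ret;
 *	...preallocate or write the range...
 *	btrfs_free_reserved_data_space(inode, bytes);
 *
 * btrfs_free_reserved_data_space() below undoes the bytes_may_use bump made
 * here once the caller no longer needs the reservation.
 */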
3357 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3358 {
3359         struct btrfs_space_info *data_sinfo;
3360         struct btrfs_root *root = BTRFS_I(inode)->root;
3361         struct btrfs_fs_info *fs_info = root->fs_info;
3362         u64 used;
3363         int ret = 0, committed = 0, alloc_chunk = 1;
3364
3365         /* make sure bytes are sectorsize aligned */
3366         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
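        /*
         * e.g. with a 4096 byte sectorsize, a 1 byte request rounds up to
         * 4096 bytes and a 5000 byte request rounds up to 8192 bytes.
         */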
3367
3368         if (root == root->fs_info->tree_root ||
3369             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3370                 alloc_chunk = 0;
3371                 committed = 1;
3372         }
3373
3374         data_sinfo = fs_info->data_sinfo;
3375         if (!data_sinfo)
3376                 goto alloc;
3377
3378 again:
3379         /* make sure we have enough space to handle the data first */
3380         spin_lock(&data_sinfo->lock);
3381         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3382                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3383                 data_sinfo->bytes_may_use;
3384
3385         if (used + bytes > data_sinfo->total_bytes) {
3386                 struct btrfs_trans_handle *trans;
3387
3388                 /*
3389                  * if we don't have enough free bytes in this space then we need
3390                  * to alloc a new chunk.
3391                  */
3392                 if (!data_sinfo->full && alloc_chunk) {
3393                         u64 alloc_target;
3394
3395                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3396                         spin_unlock(&data_sinfo->lock);
3397 alloc:
3398                         alloc_target = btrfs_get_alloc_profile(root, 1);
3399                         trans = btrfs_join_transaction(root);
3400                         if (IS_ERR(trans))
3401                                 return PTR_ERR(trans);
3402
3403                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3404                                              alloc_target,
3405                                              CHUNK_ALLOC_NO_FORCE);
3406                         btrfs_end_transaction(trans, root);
3407                         if (ret < 0) {
3408                                 if (ret != -ENOSPC)
3409                                         return ret;
3410                                 else
3411                                         goto commit_trans;
3412                         }
3413
3414                         if (!data_sinfo)
3415                                 data_sinfo = fs_info->data_sinfo;
3416
3417                         goto again;
3418                 }
3419
3420                 /*
3421                  * If we have less pinned bytes than we want to allocate then
3422                  * don't bother committing the transaction, it won't help us.
3423                  */
3424                 if (data_sinfo->bytes_pinned < bytes)
3425                         committed = 1;
3426                 spin_unlock(&data_sinfo->lock);
3427
3428                 /* commit the current transaction and try again */
3429 commit_trans:
3430                 if (!committed &&
3431                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3432                         committed = 1;
3433                         trans = btrfs_join_transaction(root);
3434                         if (IS_ERR(trans))
3435                                 return PTR_ERR(trans);
3436                         ret = btrfs_commit_transaction(trans, root);
3437                         if (ret)
3438                                 return ret;
3439                         goto again;
3440                 }
3441
3442                 return -ENOSPC;
3443         }
3444         data_sinfo->bytes_may_use += bytes;
3445         trace_btrfs_space_reservation(root->fs_info, "space_info",
3446                                       data_sinfo->flags, bytes, 1);
3447         spin_unlock(&data_sinfo->lock);
3448
3449         return 0;
3450 }
3451
3452 /*
3453  * Called if we need to clear a data reservation for this inode.
3454  */
3455 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3456 {
3457         struct btrfs_root *root = BTRFS_I(inode)->root;
3458         struct btrfs_space_info *data_sinfo;
3459
3460         /* make sure bytes are sectorsize aligned */
3461         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3462
3463         data_sinfo = root->fs_info->data_sinfo;
3464         spin_lock(&data_sinfo->lock);
3465         data_sinfo->bytes_may_use -= bytes;
3466         trace_btrfs_space_reservation(root->fs_info, "space_info",
3467                                       data_sinfo->flags, bytes, 0);
3468         spin_unlock(&data_sinfo->lock);
3469 }
3470
3471 static void force_metadata_allocation(struct btrfs_fs_info *info)
3472 {
3473         struct list_head *head = &info->space_info;
3474         struct btrfs_space_info *found;
3475
3476         rcu_read_lock();
3477         list_for_each_entry_rcu(found, head, list) {
3478                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3479                         found->force_alloc = CHUNK_ALLOC_FORCE;
3480         }
3481         rcu_read_unlock();
3482 }
3483
3484 static int should_alloc_chunk(struct btrfs_root *root,
3485                               struct btrfs_space_info *sinfo, int force)
3486 {
3487         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3488         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3489         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3490         u64 thresh;
3491
3492         if (force == CHUNK_ALLOC_FORCE)
3493                 return 1;
3494
3495         /*
3496          * We need to take into account the global rsv because for all intents
3497          * and purposes it's used space.  Don't worry about locking the
3498          * global_rsv, it doesn't change except when the transaction commits.
3499          */
3500         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3501                 num_allocated += global_rsv->size;
3502
3503         /*
3504          * in limited mode, we want to have some free space up to
3505          * about 1% of the FS size.
3506          */
3507         if (force == CHUNK_ALLOC_LIMITED) {
3508                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3509                 thresh = max_t(u64, 64 * 1024 * 1024,
3510                                div_factor_fine(thresh, 1));
3511
3512                 if (num_bytes - num_allocated < thresh)
3513                         return 1;
3514         }
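        /*
         * e.g. on a 1 TiB filesystem, limited mode allocates once this
         * space_info has less than max(64 MiB, 1% of the FS, i.e. ~11 GiB)
         * left unallocated, while the unconditional check below only forces
         * a new chunk once roughly 80% of num_bytes is already allocated.
         */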
3515
3516         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3517                 return 0;
3518         return 1;
3519 }
3520
3521 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3522 {
3523         u64 num_dev;
3524
3525         if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3526             type & BTRFS_BLOCK_GROUP_RAID0)
3527                 num_dev = root->fs_info->fs_devices->rw_devices;
3528         else if (type & BTRFS_BLOCK_GROUP_RAID1)
3529                 num_dev = 2;
3530         else
3531                 num_dev = 1;    /* DUP or single */
3532
3533         /* metadata for updating devices and chunk tree */
3534         return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3535 }
3536
3537 static void check_system_chunk(struct btrfs_trans_handle *trans,
3538                                struct btrfs_root *root, u64 type)
3539 {
3540         struct btrfs_space_info *info;
3541         u64 left;
3542         u64 thresh;
3543
3544         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3545         spin_lock(&info->lock);
3546         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3547                 info->bytes_reserved - info->bytes_readonly;
3548         spin_unlock(&info->lock);
3549
3550         thresh = get_system_chunk_thresh(root, type);
3551         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3552                 printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3553                        left, thresh, type);
3554                 dump_space_info(info, 0, 0);
3555         }
3556
3557         if (left < thresh) {
3558                 u64 flags;
3559
3560                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3561                 btrfs_alloc_chunk(trans, root, flags);
3562         }
3563 }
3564
3565 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3566                           struct btrfs_root *extent_root, u64 flags, int force)
3567 {
3568         struct btrfs_space_info *space_info;
3569         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3570         int wait_for_alloc = 0;
3571         int ret = 0;
3572
3573         /* Don't re-enter if we're already allocating a chunk */
3574         if (trans->allocating_chunk)
3575                 return -ENOSPC;
3576
3577         space_info = __find_space_info(extent_root->fs_info, flags);
3578         if (!space_info) {
3579                 ret = update_space_info(extent_root->fs_info, flags,
3580                                         0, 0, &space_info);
3581                 BUG_ON(ret); /* -ENOMEM */
3582         }
3583         BUG_ON(!space_info); /* Logic error */
3584
3585 again:
3586         spin_lock(&space_info->lock);
3587         if (force < space_info->force_alloc)
3588                 force = space_info->force_alloc;
3589         if (space_info->full) {
3590                 spin_unlock(&space_info->lock);
3591                 return 0;
3592         }
3593
3594         if (!should_alloc_chunk(extent_root, space_info, force)) {
3595                 spin_unlock(&space_info->lock);
3596                 return 0;
3597         } else if (space_info->chunk_alloc) {
3598                 wait_for_alloc = 1;
3599         } else {
3600                 space_info->chunk_alloc = 1;
3601         }
3602
3603         spin_unlock(&space_info->lock);
3604
3605         mutex_lock(&fs_info->chunk_mutex);
3606
3607         /*
3608          * The chunk_mutex is held throughout the entirety of a chunk
3609          * allocation, so once we've acquired the chunk_mutex we know that the
3610          * other guy is done and we need to recheck and see if we should
3611          * allocate.
3612          */
3613         if (wait_for_alloc) {
3614                 mutex_unlock(&fs_info->chunk_mutex);
3615                 wait_for_alloc = 0;
3616                 goto again;
3617         }
3618
3619         trans->allocating_chunk = true;
3620
3621         /*
3622          * If we have mixed data/metadata chunks we want to make sure we keep
3623          * allocating mixed chunks instead of individual chunks.
3624          */
3625         if (btrfs_mixed_space_info(space_info))
3626                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3627
3628         /*
3629          * if we're doing a data chunk, go ahead and make sure that
3630          * we keep a reasonable number of metadata chunks allocated in the
3631          * FS as well.
3632          */
3633         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3634                 fs_info->data_chunk_allocations++;
3635                 if (!(fs_info->data_chunk_allocations %
3636                       fs_info->metadata_ratio))
3637                         force_metadata_allocation(fs_info);
3638         }
3639
3640         /*
3641          * Check if we have enough space in SYSTEM chunk because we may need
3642          * to update devices.
3643          */
3644         check_system_chunk(trans, extent_root, flags);
3645
3646         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3647         trans->allocating_chunk = false;
3648         if (ret < 0 && ret != -ENOSPC)
3649                 goto out;
3650
3651         spin_lock(&space_info->lock);
3652         if (ret)
3653                 space_info->full = 1;
3654         else
3655                 ret = 1;
3656
3657         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3658         space_info->chunk_alloc = 0;
3659         spin_unlock(&space_info->lock);
3660 out:
3661         mutex_unlock(&fs_info->chunk_mutex);
3662         return ret;
3663 }
3664
3665 static int can_overcommit(struct btrfs_root *root,
3666                           struct btrfs_space_info *space_info, u64 bytes,
3667                           enum btrfs_reserve_flush_enum flush)
3668 {
3669         u64 profile = btrfs_get_alloc_profile(root, 0);
3670         u64 avail;
3671         u64 used;
3672
3673         used = space_info->bytes_used + space_info->bytes_reserved +
3674                 space_info->bytes_pinned + space_info->bytes_readonly +
3675                 space_info->bytes_may_use;
3676
3677         spin_lock(&root->fs_info->free_chunk_lock);
3678         avail = root->fs_info->free_chunk_space;
3679         spin_unlock(&root->fs_info->free_chunk_lock);
3680
3681         /*
3682          * If we have dup, raid1 or raid10 then only half of the free
3683          * space is actually usable.
3684          */
3685         if (profile & (BTRFS_BLOCK_GROUP_DUP |
3686                        BTRFS_BLOCK_GROUP_RAID1 |
3687                        BTRFS_BLOCK_GROUP_RAID10))
3688                 avail >>= 1;
3689
3690         /*
3691          * If we aren't flushing all things, let us overcommit up to
3692          * half of the space. If we can flush everything, don't let us
3693          * overcommit too much; only allow overcommitting up to 1/8 of the space.
3694          */
3695         if (flush == BTRFS_RESERVE_FLUSH_ALL)
3696                 avail >>= 3;
3697         else
3698                 avail >>= 1;
3699
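        /*
         * e.g. with a RAID1 metadata profile and 8 GiB of unallocated chunk
         * space, avail is halved to 4 GiB and then becomes 512 MiB for
         * BTRFS_RESERVE_FLUSH_ALL (>> 3) or 2 GiB otherwise (>> 1); that is
         * how far past total_bytes the check below will allow us to go.
         */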
3700         if (used + bytes < space_info->total_bytes + avail)
3701                 return 1;
3702         return 0;
3703 }
3704
3705 static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3706                                                       unsigned long nr_pages,
3707                                                       enum wb_reason reason)
3708 {
3709         /* the flusher is dealing with the dirty inodes now. */
3710         if (writeback_in_progress(sb->s_bdi))
3711                 return 1;
3712
3713         if (down_read_trylock(&sb->s_umount)) {
3714                 writeback_inodes_sb_nr(sb, nr_pages, reason);
3715                 up_read(&sb->s_umount);
3716                 return 1;
3717         }
3718
3719         return 0;
3720 }
3721
3722 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3723                                   unsigned long nr_pages)
3724 {
3725         struct super_block *sb = root->fs_info->sb;
3726         int started;
3727
3728         /* If we cannot start writeback, just sync all the delalloc files. */
3729         started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
3730                                                       WB_REASON_FS_FREE_SPACE);
3731         if (!started) {
3732                 /*
3733                  * We needn't worry about the filesystem going from r/w to r/o even
3734                  * though we don't acquire the ->s_umount mutex, because the
3735                  * filesystem should guarantee that the delalloc inode list is empty
3736                  * once the filesystem is read-only (all dirty pages are written to
3737                  * disk).
3738                  */
3739                 btrfs_start_delalloc_inodes(root, 0);
3740                 btrfs_wait_ordered_extents(root, 0);
3741         }
3742 }
3743
3744 /*
3745  * shrink metadata reservation for delalloc
3746  */
3747 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3748                             bool wait_ordered)
3749 {
3750         struct btrfs_block_rsv *block_rsv;
3751         struct btrfs_space_info *space_info;
3752         struct btrfs_trans_handle *trans;
3753         u64 delalloc_bytes;
3754         u64 max_reclaim;
3755         long time_left;
3756         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3757         int loops = 0;
3758         enum btrfs_reserve_flush_enum flush;
3759
3760         trans = (struct btrfs_trans_handle *)current->journal_info;
3761         block_rsv = &root->fs_info->delalloc_block_rsv;
3762         space_info = block_rsv->space_info;
3763
3764         smp_mb();
3765         delalloc_bytes = root->fs_info->delalloc_bytes;
3766         if (delalloc_bytes == 0) {
3767                 if (trans)
3768                         return;
3769                 btrfs_wait_ordered_extents(root, 0);
3770                 return;
3771         }
3772
3773         while (delalloc_bytes && loops < 3) {
3774                 max_reclaim = min(delalloc_bytes, to_reclaim);
3775                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3776                 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3777                 /*
3778                  * We need to wait for the async pages to actually start before
3779                  * we do anything.
3780                  */
3781                 wait_event(root->fs_info->async_submit_wait,
3782                            !atomic_read(&root->fs_info->async_delalloc_pages));
3783
3784                 if (!trans)
3785                         flush = BTRFS_RESERVE_FLUSH_ALL;
3786                 else
3787                         flush = BTRFS_RESERVE_NO_FLUSH;
3788                 spin_lock(&space_info->lock);
3789                 if (can_overcommit(root, space_info, orig, flush)) {
3790                         spin_unlock(&space_info->lock);
3791                         break;
3792                 }
3793                 spin_unlock(&space_info->lock);
3794
3795                 loops++;
3796                 if (wait_ordered && !trans) {
3797                         btrfs_wait_ordered_extents(root, 0);
3798                 } else {
3799                         time_left = schedule_timeout_killable(1);
3800                         if (time_left)
3801                                 break;
3802                 }
3803                 smp_mb();
3804                 delalloc_bytes = root->fs_info->delalloc_bytes;
3805         }
3806 }
3807
3808 /**
3809  * may_commit_transaction - possibly commit the transaction if it's ok to
3810  * @root - the root we're allocating for
3811  * @bytes - the number of bytes we want to reserve
3812  * @force - force the commit
3813  *
3814  * This will check to make sure that committing the transaction will actually
3815  * get us somewhere and then commit the transaction if it does.  Otherwise it
3816  * will return -ENOSPC.
3817  */
3818 static int may_commit_transaction(struct btrfs_root *root,
3819                                   struct btrfs_space_info *space_info,
3820                                   u64 bytes, int force)
3821 {
3822         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3823         struct btrfs_trans_handle *trans;
3824
3825         trans = (struct btrfs_trans_handle *)current->journal_info;
3826         if (trans)
3827                 return -EAGAIN;
3828
3829         if (force)
3830                 goto commit;
3831
3832         /* See if there is enough pinned space to make this reservation */
3833         spin_lock(&space_info->lock);
3834         if (space_info->bytes_pinned >= bytes) {
3835                 spin_unlock(&space_info->lock);
3836                 goto commit;
3837         }
3838         spin_unlock(&space_info->lock);
3839
3840         /*
3841          * See if there is some space in the delayed insertion reservation for
3842          * this reservation.
3843          */
3844         if (space_info != delayed_rsv->space_info)
3845                 return -ENOSPC;
3846
3847         spin_lock(&space_info->lock);
3848         spin_lock(&delayed_rsv->lock);
3849         if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3850                 spin_unlock(&delayed_rsv->lock);
3851                 spin_unlock(&space_info->lock);
3852                 return -ENOSPC;
3853         }
3854         spin_unlock(&delayed_rsv->lock);
3855         spin_unlock(&space_info->lock);
3856
3857 commit:
3858         trans = btrfs_join_transaction(root);
3859         if (IS_ERR(trans))
3860                 return -ENOSPC;
3861
3862         return btrfs_commit_transaction(trans, root);
3863 }
3864
3865 enum flush_state {
3866         FLUSH_DELAYED_ITEMS_NR  =       1,
3867         FLUSH_DELAYED_ITEMS     =       2,
3868         FLUSH_DELALLOC          =       3,
3869         FLUSH_DELALLOC_WAIT     =       4,
3870         ALLOC_CHUNK             =       5,
3871         COMMIT_TRANS            =       6,
3872 };
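/*
 * reserve_metadata_bytes() below walks these states in order, retrying the
 * reservation after each one: delayed items, delalloc, chunk allocation and
 * finally a transaction commit.  BTRFS_RESERVE_FLUSH_LIMIT skips the two
 * delalloc states and stops before COMMIT_TRANS.
 */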
3873
3874 static int flush_space(struct btrfs_root *root,
3875                        struct btrfs_space_info *space_info, u64 num_bytes,
3876                        u64 orig_bytes, int state)
3877 {
3878         struct btrfs_trans_handle *trans;
3879         int nr;
3880         int ret = 0;
3881
3882         switch (state) {
3883         case FLUSH_DELAYED_ITEMS_NR:
3884         case FLUSH_DELAYED_ITEMS:
3885                 if (state == FLUSH_DELAYED_ITEMS_NR) {
3886                         u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3887
3888                         nr = (int)div64_u64(num_bytes, bytes);
3889                         if (!nr)
3890                                 nr = 1;
3891                         nr *= 2;
3892                 } else {
3893                         nr = -1;
3894                 }
3895                 trans = btrfs_join_transaction(root);
3896                 if (IS_ERR(trans)) {
3897                         ret = PTR_ERR(trans);
3898                         break;
3899                 }
3900                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3901                 btrfs_end_transaction(trans, root);
3902                 break;
3903         case FLUSH_DELALLOC:
3904         case FLUSH_DELALLOC_WAIT:
3905                 shrink_delalloc(root, num_bytes, orig_bytes,
3906                                 state == FLUSH_DELALLOC_WAIT);
3907                 break;
3908         case ALLOC_CHUNK:
3909                 trans = btrfs_join_transaction(root);
3910                 if (IS_ERR(trans)) {
3911                         ret = PTR_ERR(trans);
3912                         break;
3913                 }
3914                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3915                                      btrfs_get_alloc_profile(root, 0),
3916                                      CHUNK_ALLOC_NO_FORCE);
3917                 btrfs_end_transaction(trans, root);
3918                 if (ret == -ENOSPC)
3919                         ret = 0;
3920                 break;
3921         case COMMIT_TRANS:
3922                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3923                 break;
3924         default:
3925                 ret = -ENOSPC;
3926                 break;
3927         }
3928
3929         return ret;
3930 }
3931 /**
3932  * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
3933  * @root - the root we're allocating for
3934  * @block_rsv - the block_rsv we're allocating for
3935  * @orig_bytes - the number of bytes we want
3936  * @flush - whether or not we can flush to make our reservation
3937  *
3938  * This will reserve orig_bytes number of bytes from the space info associated
3939  * with the block_rsv.  If there is not enough space it will make an attempt to
3940  * flush out space to make room.  It will do this by flushing delalloc if
3941  * possible or committing the transaction.  If flush is BTRFS_RESERVE_NO_FLUSH
3942  * then no attempt to regain reservations will be made and this will fail if
3943  * there is not enough space already.
3944  */
3945 static int reserve_metadata_bytes(struct btrfs_root *root,
3946                                   struct btrfs_block_rsv *block_rsv,
3947                                   u64 orig_bytes,
3948                                   enum btrfs_reserve_flush_enum flush)
3949 {
3950         struct btrfs_space_info *space_info = block_rsv->space_info;
3951         u64 used;
3952         u64 num_bytes = orig_bytes;
3953         int flush_state = FLUSH_DELAYED_ITEMS_NR;
3954         int ret = 0;
3955         bool flushing = false;
3956
3957 again:
3958         ret = 0;
3959         spin_lock(&space_info->lock);
3960         /*
3961          * We only want to wait if somebody other than us is flushing and we
3962          * are actually allowed to flush all things.
3963          */
3964         while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
3965                space_info->flush) {
3966                 spin_unlock(&space_info->lock);
3967                 /*
3968                  * If we have a trans handle we can't wait because the flusher
3969                  * may have to commit the transaction, which would mean we would
3970                  * deadlock since we are waiting for the flusher to finish, but
3971                  * hold the current transaction open.
3972                  */
3973                 if (current->journal_info)
3974                         return -EAGAIN;
3975                 ret = wait_event_killable(space_info->wait, !space_info->flush);
3976                 /* Must have been killed, return */
3977                 if (ret)
3978                         return -EINTR;
3979
3980                 spin_lock(&space_info->lock);
3981         }
3982
3983         ret = -ENOSPC;
3984         used = space_info->bytes_used + space_info->bytes_reserved +
3985                 space_info->bytes_pinned + space_info->bytes_readonly +
3986                 space_info->bytes_may_use;
3987
3988         /*
3989          * The idea here is that if we've not already over-reserved the block
3990          * group then we can go ahead and save our reservation first and then
3991          * start flushing if we need to.  Otherwise, if we've already
3992          * overcommitted, let's start flushing stuff first and then come back
3993          * and try to make our reservation.
3994          */
3995         if (used <= space_info->total_bytes) {
3996                 if (used + orig_bytes <= space_info->total_bytes) {
3997                         space_info->bytes_may_use += orig_bytes;
3998                         trace_btrfs_space_reservation(root->fs_info,
3999                                 "space_info", space_info->flags, orig_bytes, 1);
4000                         ret = 0;
4001                 } else {
4002                         /*
4003                          * Ok, set num_bytes to orig_bytes since we aren't
4004                          * overcommitted; this way we only try to reclaim what
4005                          * we need.
4006                          */
4007                         num_bytes = orig_bytes;
4008                 }
4009         } else {
4010                 /*
4011                  * Ok, we're overcommitted, so set num_bytes to the overcommitted
4012                  * amount plus the amount of bytes that we need for this
4013                  * reservation.
4014                  */
4015                 num_bytes = used - space_info->total_bytes +
4016                         (orig_bytes * 2);
4017         }
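        /*
         * e.g. if total_bytes is 8 GiB, the counters above add up to 8.5 GiB
         * and orig_bytes is 4 MiB, num_bytes becomes 512 MiB + 8 MiB: the
         * whole overcommit plus twice the request is what the flush states
         * below will try to reclaim.
         */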
4018
4019         if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
4020                 space_info->bytes_may_use += orig_bytes;
4021                 trace_btrfs_space_reservation(root->fs_info, "space_info",
4022                                               space_info->flags, orig_bytes,
4023                                               1);
4024                 ret = 0;
4025         }
4026
4027         /*
4028          * Couldn't make our reservation, save our place so while we're trying
4029          * to reclaim space we can actually use it instead of somebody else
4030          * stealing it from us.
4031          *
4032          * We make the other tasks wait for the flush only when we can flush
4033          * all things.
4034          */
4035         if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
4036                 flushing = true;
4037                 space_info->flush = 1;
4038         }
4039
4040         spin_unlock(&space_info->lock);
4041
4042         if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
4043                 goto out;
4044
4045         ret = flush_space(root, space_info, num_bytes, orig_bytes,
4046                           flush_state);
4047         flush_state++;
4048
4049         /*
4050          * If we are FLUSH_LIMIT, we can't flush delalloc because a deadlock
4051          * could happen, so skip the delalloc flush states.
4052          */
4053         if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4054             (flush_state == FLUSH_DELALLOC ||
4055              flush_state == FLUSH_DELALLOC_WAIT))
4056                 flush_state = ALLOC_CHUNK;
4057
4058         if (!ret)
4059                 goto again;
4060         else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
4061                  flush_state < COMMIT_TRANS)
4062                 goto again;
4063         else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
4064                  flush_state <= COMMIT_TRANS)
4065                 goto again;
4066
4067 out:
4068         if (flushing) {
4069                 spin_lock(&space_info->lock);
4070                 space_info->flush = 0;
4071                 wake_up_all(&space_info->wait);
4072                 spin_unlock(&space_info->lock);
4073         }
4074         return ret;
4075 }
4076
4077 static struct btrfs_block_rsv *get_block_rsv(
4078                                         const struct btrfs_trans_handle *trans,
4079                                         const struct btrfs_root *root)
4080 {
4081         struct btrfs_block_rsv *block_rsv = NULL;
4082
4083         if (root->ref_cows)
4084                 block_rsv = trans->block_rsv;
4085
4086         if (root == root->fs_info->csum_root && trans->adding_csums)
4087                 block_rsv = trans->block_rsv;
4088
4089         if (!block_rsv)
4090                 block_rsv = root->block_rsv;
4091
4092         if (!block_rsv)
4093                 block_rsv = &root->fs_info->empty_block_rsv;
4094
4095         return block_rsv;
4096 }
4097
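/*
 * Consume num_bytes from a block reserve.  Returns 0 if the reserve had
 * enough reserved space (clearing ->full if we dip below ->size), or
 * -ENOSPC without changing anything otherwise.
 */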
4098 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4099                                u64 num_bytes)
4100 {
4101         int ret = -ENOSPC;
4102         spin_lock(&block_rsv->lock);
4103         if (block_rsv->reserved >= num_bytes) {
4104                 block_rsv->reserved -= num_bytes;
4105                 if (block_rsv->reserved < block_rsv->size)
4106                         block_rsv->full = 0;
4107                 ret = 0;
4108         }
4109         spin_unlock(&block_rsv->lock);
4110         return ret;
4111 }
4112
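/*
 * Add num_bytes of already-reserved space to a block reserve.  If
 * update_size is set, the reserve's target size grows by the same amount;
 * otherwise the reserve may become full once reserved >= size.
 */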
4113 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4114                                 u64 num_bytes, int update_size)
4115 {
4116         spin_lock(&block_rsv->lock);
4117         block_rsv->reserved += num_bytes;
4118         if (update_size)
4119                 block_rsv->size += num_bytes;
4120         else if (block_rsv->reserved >= block_rsv->size)
4121                 block_rsv->full = 1;
4122         spin_unlock(&block_rsv->lock);
4123 }
4124
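/*
 * Shrink a block reserve by num_bytes (or to zero for (u64)-1).  Any
 * reserved space above the new size is handed first to @dest (if it is not
 * already full), and whatever is left is returned to the space_info by
 * reducing bytes_may_use.
 */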
4125 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4126                                     struct btrfs_block_rsv *block_rsv,
4127                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4128 {
4129         struct btrfs_space_info *space_info = block_rsv->space_info;
4130
4131         spin_lock(&block_rsv->lock);
4132         if (num_bytes == (u64)-1)
4133                 num_bytes = block_rsv->size;
4134         block_rsv->size -= num_bytes;
4135         if (block_rsv->reserved >= block_rsv->size) {
4136                 num_bytes = block_rsv->reserved - block_rsv->size;
4137                 block_rsv->reserved = block_rsv->size;
4138                 block_rsv->full = 1;
4139         } else {
4140                 num_bytes = 0;
4141         }
4142         spin_unlock(&block_rsv->lock);
4143
4144         if (num_bytes > 0) {
4145                 if (dest) {
4146                         spin_lock(&dest->lock);
4147                         if (!dest->full) {
4148                                 u64 bytes_to_add;
4149
4150                                 bytes_to_add = dest->size - dest->reserved;
4151                                 bytes_to_add = min(num_bytes, bytes_to_add);
4152                                 dest->reserved += bytes_to_add;
4153                                 if (dest->reserved >= dest->size)
4154                                         dest->full = 1;
4155                                 num_bytes -= bytes_to_add;
4156                         }
4157                         spin_unlock(&dest->lock);
4158                 }
4159                 if (num_bytes) {
4160                         spin_lock(&space_info->lock);
4161                         space_info->bytes_may_use -= num_bytes;
4162                         trace_btrfs_space_reservation(fs_info, "space_info",
4163                                         space_info->flags, num_bytes, 0);
4164                         space_info->reservation_progress++;
4165                         spin_unlock(&space_info->lock);
4166                 }
4167         }
4168 }
4169
4170 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4171                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4172 {
4173         int ret;
4174
4175         ret = block_rsv_use_bytes(src, num_bytes);
4176         if (ret)
4177                 return ret;
4178
4179         block_rsv_add_bytes(dst, num_bytes, 1);
4180         return 0;
4181 }
4182
4183 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4184 {
4185         memset(rsv, 0, sizeof(*rsv));
4186         spin_lock_init(&rsv->lock);
4187         rsv->type = type;
4188 }
4189
4190 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4191                                               unsigned short type)
4192 {
4193         struct btrfs_block_rsv *block_rsv;
4194         struct btrfs_fs_info *fs_info = root->fs_info;
4195
4196         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4197         if (!block_rsv)
4198                 return NULL;
4199
4200         btrfs_init_block_rsv(block_rsv, type);
4201         block_rsv->space_info = __find_space_info(fs_info,
4202                                                   BTRFS_BLOCK_GROUP_METADATA);
4203         return block_rsv;
4204 }
4205
4206 void btrfs_free_block_rsv(struct btrfs_root *root,
4207                           struct btrfs_block_rsv *rsv)
4208 {
4209         if (!rsv)
4210                 return;
4211         btrfs_block_rsv_release(root, rsv, (u64)-1);
4212         kfree(rsv);
4213 }
4214
4215 int btrfs_block_rsv_add(struct btrfs_root *root,
4216                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4217                         enum btrfs_reserve_flush_enum flush)
4218 {
4219         int ret;
4220
4221         if (num_bytes == 0)
4222                 return 0;
4223
4224         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4225         if (!ret) {
4226                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
4227                 return 0;
4228         }
4229
4230         return ret;
4231 }
4232
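/*
 * Return 0 if the reserve currently holds at least the div_factor()
 * fraction of its target size given by min_factor, -ENOSPC otherwise.
 * A NULL reserve is treated as ok.
 */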
4233 int btrfs_block_rsv_check(struct btrfs_root *root,
4234                           struct btrfs_block_rsv *block_rsv, int min_factor)
4235 {
4236         u64 num_bytes = 0;
4237         int ret = -ENOSPC;
4238
4239         if (!block_rsv)
4240                 return 0;
4241
4242         spin_lock(&block_rsv->lock);
4243         num_bytes = div_factor(block_rsv->size, min_factor);
4244         if (block_rsv->reserved >= num_bytes)
4245                 ret = 0;
4246         spin_unlock(&block_rsv->lock);
4247
4248         return ret;
4249 }
4250
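/*
 * Make sure the reserve holds at least min_reserved bytes.  If it is short,
 * reserve the missing amount from the space_info (flushing as allowed by
 * @flush) and add it without growing the reserve's target size.
 */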
4251 int btrfs_block_rsv_refill(struct btrfs_root *root,
4252                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4253                            enum btrfs_reserve_flush_enum flush)
4254 {
4255         u64 num_bytes = 0;
4256         int ret = -ENOSPC;
4257
4258         if (!block_rsv)
4259                 return 0;
4260
4261         spin_lock(&block_rsv->lock);
4262         num_bytes = min_reserved;
4263         if (block_rsv->reserved >= num_bytes)
4264                 ret = 0;
4265         else
4266                 num_bytes -= block_rsv->reserved;
4267         spin_unlock(&block_rsv->lock);
4268
4269         if (!ret)
4270                 return 0;
4271
4272         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4273         if (!ret) {
4274                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
4275                 return 0;
4276         }
4277
4278         return ret;
4279 }
4280
4281 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4282                             struct btrfs_block_rsv *dst_rsv,
4283                             u64 num_bytes)
4284 {
4285         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4286 }
4287
4288 void btrfs_block_rsv_release(struct btrfs_root *root,
4289                              struct btrfs_block_rsv *block_rsv,
4290                              u64 num_bytes)
4291 {
4292         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4293         if (global_rsv->full || global_rsv == block_rsv ||
4294             block_rsv->space_info != global_rsv->space_info)
4295                 global_rsv = NULL;
4296         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4297                                 num_bytes);
4298 }
4299
4300 /*
4301  * helper to calculate size of global block reservation.
4302  * the desired value is sum of space used by extent tree,
4303  * checksum tree and root tree
4304  */
4305 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4306 {
4307         struct btrfs_space_info *sinfo;
4308         u64 num_bytes;
4309         u64 meta_used;
4310         u64 data_used;
4311         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4312
4313         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4314         spin_lock(&sinfo->lock);
4315         data_used = sinfo->bytes_used;
4316         spin_unlock(&sinfo->lock);
4317
4318         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4319         spin_lock(&sinfo->lock);
4320         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4321                 data_used = 0;
4322         meta_used = sinfo->bytes_used;
4323         spin_unlock(&sinfo->lock);
4324
4325         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4326                     csum_size * 2;
4327         num_bytes += div64_u64(data_used + meta_used, 50);
4328
4329         if (num_bytes * 3 > meta_used)
4330                 num_bytes = div64_u64(meta_used, 3);
4331
4332         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4333 }
4334
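/*
 * Resize the global block reserve to the value computed above and top it up
 * from any space in the metadata space_info that is not already used, pinned,
 * reserved, read only or may_use; any excess over the new size is given back
 * to bytes_may_use.
 */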
4335 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4336 {
4337         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4338         struct btrfs_space_info *sinfo = block_rsv->space_info;
4339         u64 num_bytes;
4340
4341         num_bytes = calc_global_metadata_size(fs_info);
4342
4343         spin_lock(&sinfo->lock);
4344         spin_lock(&block_rsv->lock);
4345
4346         block_rsv->size = num_bytes;
4347
4348         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4349                     sinfo->bytes_reserved + sinfo->bytes_readonly +
4350                     sinfo->bytes_may_use;
4351
4352         if (sinfo->total_bytes > num_bytes) {
4353                 num_bytes = sinfo->total_bytes - num_bytes;
4354                 block_rsv->reserved += num_bytes;
4355                 sinfo->bytes_may_use += num_bytes;
4356                 trace_btrfs_space_reservation(fs_info, "space_info",
4357                                       sinfo->flags, num_bytes, 1);
4358         }
4359
4360         if (block_rsv->reserved >= block_rsv->size) {
4361                 num_bytes = block_rsv->reserved - block_rsv->size;
4362                 sinfo->bytes_may_use -= num_bytes;
4363                 trace_btrfs_space_reservation(fs_info, "space_info",
4364                                       sinfo->flags, num_bytes, 0);
4365                 sinfo->reservation_progress++;
4366                 block_rsv->reserved = block_rsv->size;
4367                 block_rsv->full = 1;
4368         }
4369
4370         spin_unlock(&block_rsv->lock);
4371         spin_unlock(&sinfo->lock);
4372 }
4373
4374 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4375 {
4376         struct btrfs_space_info *space_info;
4377
4378         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4379         fs_info->chunk_block_rsv.space_info = space_info;
4380
4381         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4382         fs_info->global_block_rsv.space_info = space_info;
4383         fs_info->delalloc_block_rsv.space_info = space_info;
4384         fs_info->trans_block_rsv.space_info = space_info;
4385         fs_info->empty_block_rsv.space_info = space_info;
4386         fs_info->delayed_block_rsv.space_info = space_info;
4387
4388         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4389         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4390         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4391         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4392         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4393
4394         update_global_block_rsv(fs_info);
4395 }
4396
4397 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4398 {
4399         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4400                                 (u64)-1);
4401         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4402         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4403         WARN_ON(fs_info->trans_block_rsv.size > 0);
4404         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4405         WARN_ON(fs_info->chunk_block_rsv.size > 0);
4406         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4407         WARN_ON(fs_info->delayed_block_rsv.size > 0);
4408         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4409 }
4410
4411 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4412                                   struct btrfs_root *root)
4413 {
4414         if (!trans->block_rsv)
4415                 return;
4416
4417         if (!trans->bytes_reserved)
4418                 return;
4419
4420         trace_btrfs_space_reservation(root->fs_info, "transaction",
4421                                       trans->transid, trans->bytes_reserved, 0);
4422         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4423         trans->bytes_reserved = 0;
4424 }
4425
4426 /* Can only return 0 or -ENOSPC */
4427 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4428                                   struct inode *inode)
4429 {
4430         struct btrfs_root *root = BTRFS_I(inode)->root;
4431         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4432         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4433
4434         /*
4435          * We need to hold space in order to delete our orphan item once we've
4436          * added it, so this takes the reservation now so that we can release
4437          * it later, when we are truly done with the orphan item.
4438          */
4439         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4440         trace_btrfs_space_reservation(root->fs_info, "orphan",
4441                                       btrfs_ino(inode), num_bytes, 1);
4442         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4443 }
4444
4445 void btrfs_orphan_release_metadata(struct inode *inode)
4446 {
4447         struct btrfs_root *root = BTRFS_I(inode)->root;
4448         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4449         trace_btrfs_space_reservation(root->fs_info, "orphan",
4450                                       btrfs_ino(inode), num_bytes, 0);
4451         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4452 }
4453
4454 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4455                                 struct btrfs_pending_snapshot *pending)
4456 {
4457         struct btrfs_root *root = pending->root;
4458         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4459         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4460         /*
4461          * two for root back/forward refs, two for directory entries,
4462          * one for root of the snapshot and one for parent inode.
4463          */
4464         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4465         dst_rsv->space_info = src_rsv->space_info;
4466         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4467 }
4468
4469 /**
4470  * drop_outstanding_extent - drop an outstanding extent
4471  * @inode: the inode we're dropping the extent for
4472  *
4473  * This is called when we are freeing up an outstanding extent, either after
4474  * an error or after an extent is written.  This will return the number of
4475  * reserved extents that need to be freed.  This must be called with
4476  * BTRFS_I(inode)->lock held.
4477  */
4478 static unsigned drop_outstanding_extent(struct inode *inode)
4479 {
4480         unsigned drop_inode_space = 0;
4481         unsigned dropped_extents = 0;
4482
4483         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4484         BTRFS_I(inode)->outstanding_extents--;
4485
4486         if (BTRFS_I(inode)->outstanding_extents == 0 &&
4487             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4488                                &BTRFS_I(inode)->runtime_flags))
4489                 drop_inode_space = 1;
4490
4491         /*
4492          * If we have at least as many outstanding extents as we have reserved
4493          * then we need to leave the reserved extents count alone.
4494          */
4495         if (BTRFS_I(inode)->outstanding_extents >=
4496             BTRFS_I(inode)->reserved_extents)
4497                 return drop_inode_space;
4498
4499         dropped_extents = BTRFS_I(inode)->reserved_extents -
4500                 BTRFS_I(inode)->outstanding_extents;
4501         BTRFS_I(inode)->reserved_extents -= dropped_extents;
4502         return dropped_extents + drop_inode_space;
4503 }
4504
4505 /**
4506  * calc_csum_metadata_size - return the amount of metadata space that must be
4507  *      reserved/freed for the given bytes.
4508  * @inode: the inode we're manipulating
4509  * @num_bytes: the number of bytes in question
4510  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4511  *
4512  * This adjusts the number of csum_bytes in the inode and then returns the
4513  * correct amount of metadata that must either be reserved or freed.  We
4514  * calculate how many checksums we can fit into one leaf and then divide the
4515  * number of bytes that will need to be checksumed by this value to figure out
4516  * how many checksums will be required.  If we are adding bytes then the number
4517  * may go up and we will return the number of additional bytes that must be
4518  * reserved.  If it is going down we will return the number of bytes that must
4519  * be freed.
4520  *
4521  * This must be called with BTRFS_I(inode)->lock held.
4522  */
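/*
 * For example, if the inode's csums currently fit in two leaves and the new
 * csum_bytes would need three, the reserve case returns the cost of one more
 * item via btrfs_calc_trans_metadata_size(root, 1); if the leaf count is
 * unchanged the function returns 0.
 */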
4523 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4524                                    int reserve)
4525 {
4526         struct btrfs_root *root = BTRFS_I(inode)->root;
4527         u64 csum_size;
4528         int num_csums_per_leaf;
4529         int num_csums;
4530         int old_csums;
4531
4532         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4533             BTRFS_I(inode)->csum_bytes == 0)
4534                 return 0;
4535
4536         old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4537         if (reserve)
4538                 BTRFS_I(inode)->csum_bytes += num_bytes;
4539         else
4540                 BTRFS_I(inode)->csum_bytes -= num_bytes;
4541         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4542         num_csums_per_leaf = (int)div64_u64(csum_size,
4543                                             sizeof(struct btrfs_csum_item) +
4544                                             sizeof(struct btrfs_disk_key));
4545         num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4546         num_csums = num_csums + num_csums_per_leaf - 1;
4547         num_csums = num_csums / num_csums_per_leaf;
4548
4549         old_csums = old_csums + num_csums_per_leaf - 1;
4550         old_csums = old_csums / num_csums_per_leaf;
4551
4552         /* No change, no need to reserve more */
4553         if (old_csums == num_csums)
4554                 return 0;
4555
4556         if (reserve)
4557                 return btrfs_calc_trans_metadata_size(root,
4558                                                       num_csums - old_csums);
4559
4560         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4561 }
4562
4563 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4564 {
4565         struct btrfs_root *root = BTRFS_I(inode)->root;
4566         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4567         u64 to_reserve = 0;
4568         u64 csum_bytes;
4569         unsigned nr_extents = 0;
4570         int extra_reserve = 0;
4571         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4572         int ret = 0;
4573         bool delalloc_lock = true;
4574
4575         /* If we are a free space inode we must not flush, since we will be
4576          * in the middle of a transaction commit.  We also don't need the
4577          * delalloc mutex since we won't race with anybody.  We need this
4578          * mostly to make lockdep shut its filthy mouth.
4579          */
4580         if (btrfs_is_free_space_inode(inode)) {
4581                 flush = BTRFS_RESERVE_NO_FLUSH;
4582                 delalloc_lock = false;
4583         }
4584
4585         if (flush != BTRFS_RESERVE_NO_FLUSH &&
4586             btrfs_transaction_in_commit(root->fs_info))
4587                 schedule_timeout(1);
4588
4589         if (delalloc_lock)
4590                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4591
4592         num_bytes = ALIGN(num_bytes, root->sectorsize);
4593
4594         spin_lock(&BTRFS_I(inode)->lock);
4595         BTRFS_I(inode)->outstanding_extents++;
4596
4597         if (BTRFS_I(inode)->outstanding_extents >
4598             BTRFS_I(inode)->reserved_extents)
4599                 nr_extents = BTRFS_I(inode)->outstanding_extents -
4600                         BTRFS_I(inode)->reserved_extents;
4601
4602         /*
4603          * Add an item to reserve for updating the inode when we complete the
4604          * delalloc io.
4605          */
4606         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4607                       &BTRFS_I(inode)->runtime_flags)) {
4608                 nr_extents++;
4609                 extra_reserve = 1;
4610         }
4611
4612         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4613         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4614         csum_bytes = BTRFS_I(inode)->csum_bytes;
4615         spin_unlock(&BTRFS_I(inode)->lock);
4616
4617         if (root->fs_info->quota_enabled)
4618                 ret = btrfs_qgroup_reserve(root, num_bytes +
4619                                            nr_extents * root->leafsize);
4620
4621         /*
4622          * ret != 0 here means the qgroup reservation failed, so we go straight
4623          * to the shared error handling below.
4624          */
4625         if (ret == 0)
4626                 ret = reserve_metadata_bytes(root, block_rsv,
4627                                              to_reserve, flush);
4628
4629         if (ret) {
4630                 u64 to_free = 0;
4631                 unsigned dropped;
4632
4633                 spin_lock(&BTRFS_I(inode)->lock);
4634                 dropped = drop_outstanding_extent(inode);
4635                 /*
4636                  * If the inode's csum_bytes is the same as the original
4637                  * csum_bytes then we know we haven't raced with any free()ers
4638                  * so we can just reduce our inode's csum bytes and carry on.
4639                  * Otherwise we have to do the normal free thing to account for
4640                  * the case that the free side didn't free up its reserve
4641                  * because of this outstanding reservation.
4642                  */
4643                 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4644                         calc_csum_metadata_size(inode, num_bytes, 0);
4645                 else
4646                         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4647                 spin_unlock(&BTRFS_I(inode)->lock);
4648                 if (dropped)
4649                         to_free += btrfs_calc_trans_metadata_size(root, dropped);
4650
4651                 if (to_free) {
4652                         btrfs_block_rsv_release(root, block_rsv, to_free);
4653                         trace_btrfs_space_reservation(root->fs_info,
4654                                                       "delalloc",
4655                                                       btrfs_ino(inode),
4656                                                       to_free, 0);
4657                 }
4658                 if (root->fs_info->quota_enabled) {
4659                         btrfs_qgroup_free(root, num_bytes +
4660                                                 nr_extents * root->leafsize);
4661                 }
4662                 if (delalloc_lock)
4663                         mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4664                 return ret;
4665         }
4666
4667         spin_lock(&BTRFS_I(inode)->lock);
4668         if (extra_reserve) {
4669                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4670                         &BTRFS_I(inode)->runtime_flags);
4671                 nr_extents--;
4672         }
4673         BTRFS_I(inode)->reserved_extents += nr_extents;
4674         spin_unlock(&BTRFS_I(inode)->lock);
4675
4676         if (delalloc_lock)
4677                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4678
4679         if (to_reserve)
4680                 trace_btrfs_space_reservation(root->fs_info, "delalloc",
4681                                               btrfs_ino(inode), to_reserve, 1);
4682         block_rsv_add_bytes(block_rsv, to_reserve, 1);
4683
4684         return 0;
4685 }
4686
4687 /**
4688  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4689  * @inode: the inode to release the reservation for
4690  * @num_bytes: the number of bytes we're releasing
4691  *
4692  * This will release the metadata reservation for an inode.  This can be called
4693  * once we complete IO for a given set of bytes to release their metadata
4694  * reservations.
4695  */
4696 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4697 {
4698         struct btrfs_root *root = BTRFS_I(inode)->root;
4699         u64 to_free = 0;
4700         unsigned dropped;
4701
4702         num_bytes = ALIGN(num_bytes, root->sectorsize);
4703         spin_lock(&BTRFS_I(inode)->lock);
4704         dropped = drop_outstanding_extent(inode);
4705
4706         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4707         spin_unlock(&BTRFS_I(inode)->lock);
4708         if (dropped > 0)
4709                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4710
4711         trace_btrfs_space_reservation(root->fs_info, "delalloc",
4712                                       btrfs_ino(inode), to_free, 0);
4713         if (root->fs_info->quota_enabled) {
4714                 btrfs_qgroup_free(root, num_bytes +
4715                                         dropped * root->leafsize);
4716         }
4717
4718         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4719                                 to_free);
4720 }
4721
4722 /**
4723  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4724  * @inode: inode we're writing to
4725  * @num_bytes: the number of bytes we want to allocate
4726  *
4727  * This will do the following things
4728  *
4729  * o reserve space in the data space info for num_bytes
4730  * o reserve space in the metadata space info based on the number of
4731  *   outstanding extents and how many csums will be needed
4732  * o add to the inode's ->delalloc_bytes
4733  * o add it to the fs_info's delalloc inodes list.
4734  *
4735  * This will return 0 for success and -ENOSPC if there is no space left.
4736  */
4737 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4738 {
4739         int ret;
4740
4741         ret = btrfs_check_data_free_space(inode, num_bytes);
4742         if (ret)
4743                 return ret;
4744
4745         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4746         if (ret) {
4747                 btrfs_free_reserved_data_space(inode, num_bytes);
4748                 return ret;
4749         }
4750
4751         return 0;
4752 }
4753
4754 /**
4755  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4756  * @inode: inode we're releasing space for
4757  * @num_bytes: the number of bytes we want to free up
4758  *
4759  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4760  * called in the case that we don't need the metadata AND data reservations
4761  * anymore, e.g. if there is an error or we insert an inline extent.
4762  *
4763  * This function will release the metadata space that was not used and will
4764  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4765  * list if there are no delalloc bytes left.
4766  */
4767 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4768 {
4769         btrfs_delalloc_release_metadata(inode, num_bytes);
4770         btrfs_free_reserved_data_space(inode, num_bytes);
4771 }
4772
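/*
 * Adjust the used-bytes accounting when an extent is allocated (alloc != 0)
 * or freed: update the superblock's bytes_used, then walk the block groups
 * covering [bytenr, bytenr + num_bytes) and move the bytes between
 * reserved/used (on alloc) or used/pinned (on free), marking freed ranges
 * dirty in pinned_extents so they are unpinned at commit time.
 */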
4773 static int update_block_group(struct btrfs_trans_handle *trans,
4774                               struct btrfs_root *root,
4775                               u64 bytenr, u64 num_bytes, int alloc)
4776 {
4777         struct btrfs_block_group_cache *cache = NULL;
4778         struct btrfs_fs_info *info = root->fs_info;
4779         u64 total = num_bytes;
4780         u64 old_val;
4781         u64 byte_in_group;
4782         int factor;
4783
4784         /* block accounting for super block */
4785         spin_lock(&info->delalloc_lock);
4786         old_val = btrfs_super_bytes_used(info->super_copy);
4787         if (alloc)
4788                 old_val += num_bytes;
4789         else
4790                 old_val -= num_bytes;
4791         btrfs_set_super_bytes_used(info->super_copy, old_val);
4792         spin_unlock(&info->delalloc_lock);
4793
4794         while (total) {
4795                 cache = btrfs_lookup_block_group(info, bytenr);
4796                 if (!cache)
4797                         return -ENOENT;
4798                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4799                                     BTRFS_BLOCK_GROUP_RAID1 |
4800                                     BTRFS_BLOCK_GROUP_RAID10))
4801                         factor = 2;
4802                 else
4803                         factor = 1;
4804                 /*
4805                  * If this block group has free space cache written out, we
4806                  * need to make sure to load it if we are removing space.  This
4807                  * is because we need the unpinning stage to actually add the
4808                  * space back to the block group, otherwise we will leak space.
4809                  */
4810                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4811                         cache_block_group(cache, trans, NULL, 1);
4812
4813                 byte_in_group = bytenr - cache->key.objectid;
4814                 WARN_ON(byte_in_group > cache->key.offset);
4815
4816                 spin_lock(&cache->space_info->lock);
4817                 spin_lock(&cache->lock);
4818
4819                 if (btrfs_test_opt(root, SPACE_CACHE) &&
4820                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4821                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4822
4823                 cache->dirty = 1;
4824                 old_val = btrfs_block_group_used(&cache->item);
4825                 num_bytes = min(total, cache->key.offset - byte_in_group);
4826                 if (alloc) {
4827                         old_val += num_bytes;
4828                         btrfs_set_block_group_used(&cache->item, old_val);
4829                         cache->reserved -= num_bytes;
4830                         cache->space_info->bytes_reserved -= num_bytes;
4831                         cache->space_info->bytes_used += num_bytes;
4832                         cache->space_info->disk_used += num_bytes * factor;
4833                         spin_unlock(&cache->lock);
4834                         spin_unlock(&cache->space_info->lock);
4835                 } else {
4836                         old_val -= num_bytes;
4837                         btrfs_set_block_group_used(&cache->item, old_val);
4838                         cache->pinned += num_bytes;
4839                         cache->space_info->bytes_pinned += num_bytes;
4840                         cache->space_info->bytes_used -= num_bytes;
4841                         cache->space_info->disk_used -= num_bytes * factor;
4842                         spin_unlock(&cache->lock);
4843                         spin_unlock(&cache->space_info->lock);
4844
4845                         set_extent_dirty(info->pinned_extents,
4846                                          bytenr, bytenr + num_bytes - 1,
4847                                          GFP_NOFS | __GFP_NOFAIL);
4848                 }
4849                 btrfs_put_block_group(cache);
4850                 total -= num_bytes;
4851                 bytenr += num_bytes;
4852         }
4853         return 0;
4854 }
4855
4856 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4857 {
4858         struct btrfs_block_group_cache *cache;
4859         u64 bytenr;
4860
4861         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4862         if (!cache)
4863                 return 0;
4864
4865         bytenr = cache->key.objectid;
4866         btrfs_put_block_group(cache);
4867
4868         return bytenr;
4869 }
4870
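/*
 * Account [bytenr, bytenr + num_bytes) as pinned in the block group and its
 * space_info (dropping it from the reserved counters if @reserved), and mark
 * the range dirty in pinned_extents so it is returned to the free space
 * cache when the transaction commits.
 */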
4871 static int pin_down_extent(struct btrfs_root *root,
4872                            struct btrfs_block_group_cache *cache,
4873                            u64 bytenr, u64 num_bytes, int reserved)
4874 {
4875         spin_lock(&cache->space_info->lock);
4876         spin_lock(&cache->lock);
4877         cache->pinned += num_bytes;
4878         cache->space_info->bytes_pinned += num_bytes;
4879         if (reserved) {
4880                 cache->reserved -= num_bytes;
4881                 cache->space_info->bytes_reserved -= num_bytes;
4882         }
4883         spin_unlock(&cache->lock);
4884         spin_unlock(&cache->space_info->lock);
4885
4886         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4887                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4888         return 0;
4889 }
4890
4891 /*
4892  * this function must be called within transaction
4893  */
4894 int btrfs_pin_extent(struct btrfs_root *root,
4895                      u64 bytenr, u64 num_bytes, int reserved)
4896 {
4897         struct btrfs_block_group_cache *cache;
4898
4899         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4900         BUG_ON(!cache); /* Logic error */
4901
4902         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4903
4904         btrfs_put_block_group(cache);
4905         return 0;
4906 }
4907
4908 /*
4909  * this function must be called within transaction
4910  */
4911 int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
4912                                     struct btrfs_root *root,
4913                                     u64 bytenr, u64 num_bytes)
4914 {
4915         struct btrfs_block_group_cache *cache;
4916
4917         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4918         BUG_ON(!cache); /* Logic error */
4919
4920         /*
4921          * pull in the free space cache (if any) so that our pin
4922          * removes the free space from the cache.  We have load_only set
4923          * to one because the slow code to read in the free extents does check
4924          * the pinned extents.
4925          */
4926         cache_block_group(cache, trans, root, 1);
4927
4928         pin_down_extent(root, cache, bytenr, num_bytes, 0);
4929
4930         /* remove us from the free space cache (if we're there at all) */
4931         btrfs_remove_free_space(cache, bytenr, num_bytes);
4932         btrfs_put_block_group(cache);
4933         return 0;
4934 }
4935
4936 /**
4937  * btrfs_update_reserved_bytes - update the block_group and space info counters
4938  * @cache:      The cache we are manipulating
4939  * @num_bytes:  The number of bytes in question
4940  * @reserve:    One of the reservation enums
4941  *
4942  * This is called by the allocator when it reserves space, or by somebody who is
4943  * freeing space that was never actually used on disk.  For example if you
4944  * reserve some space for a new leaf in transaction A and before transaction A
4945  * commits you free that leaf, you call this with reserve set to 0 in order to
4946  * clear the reservation.
4947  *
4948  * Metadata reservations should be made with RESERVE_ALLOC so we do the proper
4949  * ENOSPC accounting.  For data we handle the reservation through clearing the
4950  * delalloc bits in the io_tree.  We have to do this since we could end up
4951  * allocating less disk space for the amount of data we have reserved in the
4952  * case of compression.
4953  *
4954  * If this is a reservation and the block group has become read only we cannot
4955  * make the reservation and return -EAGAIN, otherwise this function always
4956  * succeeds.
4957  */
4958 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4959                                        u64 num_bytes, int reserve)
4960 {
4961         struct btrfs_space_info *space_info = cache->space_info;
4962         int ret = 0;
4963
4964         spin_lock(&space_info->lock);
4965         spin_lock(&cache->lock);
4966         if (reserve != RESERVE_FREE) {
4967                 if (cache->ro) {
4968                         ret = -EAGAIN;
4969                 } else {
4970                         cache->reserved += num_bytes;
4971                         space_info->bytes_reserved += num_bytes;
4972                         if (reserve == RESERVE_ALLOC) {
4973                                 trace_btrfs_space_reservation(cache->fs_info,
4974                                                 "space_info", space_info->flags,
4975                                                 num_bytes, 0);
4976                                 space_info->bytes_may_use -= num_bytes;
4977                         }
4978                 }
4979         } else {
4980                 if (cache->ro)
4981                         space_info->bytes_readonly += num_bytes;
4982                 cache->reserved -= num_bytes;
4983                 space_info->bytes_reserved -= num_bytes;
4984                 space_info->reservation_progress++;
4985         }
4986         spin_unlock(&cache->lock);
4987         spin_unlock(&space_info->lock);
4988         return ret;
4989 }
4990
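/*
 * Called around transaction commit: record, for each block group that is
 * still caching, how far the caching thread has progressed (completed groups
 * get (u64)-1), and flip fs_info->pinned_extents to the other freed_extents
 * tree so that extents pinned from now on are tracked separately from the
 * ones about to be unpinned.
 */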
4991 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
4992                                 struct btrfs_root *root)
4993 {
4994         struct btrfs_fs_info *fs_info = root->fs_info;
4995         struct btrfs_caching_control *next;
4996         struct btrfs_caching_control *caching_ctl;
4997         struct btrfs_block_group_cache *cache;
4998
4999         down_write(&fs_info->extent_commit_sem);
5000
5001         list_for_each_entry_safe(caching_ctl, next,
5002                                  &fs_info->caching_block_groups, list) {
5003                 cache = caching_ctl->block_group;
5004                 if (block_group_cache_done(cache)) {
5005                         cache->last_byte_to_unpin = (u64)-1;
5006                         list_del_init(&caching_ctl->list);
5007                         put_caching_control(caching_ctl);
5008                 } else {
5009                         cache->last_byte_to_unpin = caching_ctl->progress;
5010                 }
5011         }
5012
5013         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5014                 fs_info->pinned_extents = &fs_info->freed_extents[1];
5015         else
5016                 fs_info->pinned_extents = &fs_info->freed_extents[0];
5017
5018         up_write(&fs_info->extent_commit_sem);
5019
5020         update_global_block_rsv(fs_info);
5021 }
5022
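/*
 * Walk [start, end] and return pinned bytes to their block groups: space
 * below each group's last_byte_to_unpin goes back to the free space cache,
 * the pinned counters are dropped, and for writable metadata groups part of
 * the released space is used to refill the global block reserve if it is
 * not already full.
 */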
5023 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5024 {
5025         struct btrfs_fs_info *fs_info = root->fs_info;
5026         struct btrfs_block_group_cache *cache = NULL;
5027         struct btrfs_space_info *space_info;
5028         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5029         u64 len;
5030         bool readonly;
5031
5032         while (start <= end) {
5033                 readonly = false;
5034                 if (!cache ||
5035                     start >= cache->key.objectid + cache->key.offset) {
5036                         if (cache)
5037                                 btrfs_put_block_group(cache);
5038                         cache = btrfs_lookup_block_group(fs_info, start);
5039                         BUG_ON(!cache); /* Logic error */
5040                 }
5041
5042                 len = cache->key.objectid + cache->key.offset - start;
5043                 len = min(len, end + 1 - start);
5044
5045                 if (start < cache->last_byte_to_unpin) {
5046                         len = min(len, cache->last_byte_to_unpin - start);
5047                         btrfs_add_free_space(cache, start, len);
5048                 }
5049
5050                 start += len;
5051                 space_info = cache->space_info;
5052
5053                 spin_lock(&space_info->lock);
5054                 spin_lock(&cache->lock);
5055                 cache->pinned -= len;
5056                 space_info->bytes_pinned -= len;
5057                 if (cache->ro) {
5058                         space_info->bytes_readonly += len;
5059                         readonly = true;
5060                 }
5061                 spin_unlock(&cache->lock);
5062                 if (!readonly && global_rsv->space_info == space_info) {
5063                         spin_lock(&global_rsv->lock);
5064                         if (!global_rsv->full) {
5065                                 len = min(len, global_rsv->size -
5066                                           global_rsv->reserved);
5067                                 global_rsv->reserved += len;
5068                                 space_info->bytes_may_use += len;
5069                                 if (global_rsv->reserved >= global_rsv->size)
5070                                         global_rsv->full = 1;
5071                         }
5072                         spin_unlock(&global_rsv->lock);
5073                 }
5074                 spin_unlock(&space_info->lock);
5075         }
5076
5077         if (cache)
5078                 btrfs_put_block_group(cache);
5079         return 0;
5080 }
5081
5082 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5083                                struct btrfs_root *root)
5084 {
5085         struct btrfs_fs_info *fs_info = root->fs_info;
5086         struct extent_io_tree *unpin;
5087         u64 start;
5088         u64 end;
5089         int ret;
5090
5091         if (trans->aborted)
5092                 return 0;
5093
5094         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5095                 unpin = &fs_info->freed_extents[1];
5096         else
5097                 unpin = &fs_info->freed_extents[0];
5098
5099         while (1) {
5100                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5101                                             EXTENT_DIRTY, NULL);
5102                 if (ret)
5103                         break;
5104
5105                 if (btrfs_test_opt(root, DISCARD))
5106                         ret = btrfs_discard_extent(root, start,
5107                                                    end + 1 - start, NULL);
5108
5109                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5110                 unpin_extent_range(root, start, end);
5111                 cond_resched();
5112         }
5113
5114         return 0;
5115 }
5116
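/*
 * Drop refs_to_drop references from an extent item.  The matching backref is
 * removed (inline or keyed), and if this was the last reference the extent
 * item itself is deleted, its csums are dropped for data extents, and the
 * space is returned via update_block_group().
 */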
5117 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5118                                 struct btrfs_root *root,
5119                                 u64 bytenr, u64 num_bytes, u64 parent,
5120                                 u64 root_objectid, u64 owner_objectid,
5121                                 u64 owner_offset, int refs_to_drop,
5122                                 struct btrfs_delayed_extent_op *extent_op)
5123 {
5124         struct btrfs_key key;
5125         struct btrfs_path *path;
5126         struct btrfs_fs_info *info = root->fs_info;
5127         struct btrfs_root *extent_root = info->extent_root;
5128         struct extent_buffer *leaf;
5129         struct btrfs_extent_item *ei;
5130         struct btrfs_extent_inline_ref *iref;
5131         int ret;
5132         int is_data;
5133         int extent_slot = 0;
5134         int found_extent = 0;
5135         int num_to_del = 1;
5136         u32 item_size;
5137         u64 refs;
5138
5139         path = btrfs_alloc_path();
5140         if (!path)
5141                 return -ENOMEM;
5142
5143         path->reada = 1;
5144         path->leave_spinning = 1;
5145
5146         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5147         BUG_ON(!is_data && refs_to_drop != 1);
5148
5149         ret = lookup_extent_backref(trans, extent_root, path, &iref,
5150                                     bytenr, num_bytes, parent,
5151                                     root_objectid, owner_objectid,
5152                                     owner_offset);
5153         if (ret == 0) {
5154                 extent_slot = path->slots[0];
5155                 while (extent_slot >= 0) {
5156                         btrfs_item_key_to_cpu(path->nodes[0], &key,
5157                                               extent_slot);
5158                         if (key.objectid != bytenr)
5159                                 break;
5160                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5161                             key.offset == num_bytes) {
5162                                 found_extent = 1;
5163                                 break;
5164                         }
5165                         if (path->slots[0] - extent_slot > 5)
5166                                 break;
5167                         extent_slot--;
5168                 }
5169 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5170                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5171                 if (found_extent && item_size < sizeof(*ei))
5172                         found_extent = 0;
5173 #endif
5174                 if (!found_extent) {
5175                         BUG_ON(iref);
5176                         ret = remove_extent_backref(trans, extent_root, path,
5177                                                     NULL, refs_to_drop,
5178                                                     is_data);
5179                         if (ret) {
5180                                 btrfs_abort_transaction(trans, extent_root, ret);
5181                                 goto out;
5182                         }
5183                         btrfs_release_path(path);
5184                         path->leave_spinning = 1;
5185
5186                         key.objectid = bytenr;
5187                         key.type = BTRFS_EXTENT_ITEM_KEY;
5188                         key.offset = num_bytes;
5189
5190                         ret = btrfs_search_slot(trans, extent_root,
5191                                                 &key, path, -1, 1);
5192                         if (ret) {
5193                                 printk(KERN_ERR "umm, got %d back from search"
5194                                        ", was looking for %llu\n", ret,
5195                                        (unsigned long long)bytenr);
5196                                 if (ret > 0)
5197                                         btrfs_print_leaf(extent_root,
5198                                                          path->nodes[0]);
5199                         }
5200                         if (ret < 0) {
5201                                 btrfs_abort_transaction(trans, extent_root, ret);
5202                                 goto out;
5203                         }
5204                         extent_slot = path->slots[0];
5205                 }
5206         } else if (ret == -ENOENT) {
5207                 btrfs_print_leaf(extent_root, path->nodes[0]);
5208                 WARN_ON(1);
5209                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5210                        "parent %llu root %llu  owner %llu offset %llu\n",
5211                        (unsigned long long)bytenr,
5212                        (unsigned long long)parent,
5213                        (unsigned long long)root_objectid,
5214                        (unsigned long long)owner_objectid,
5215                        (unsigned long long)owner_offset);
5216         } else {
5217                 btrfs_abort_transaction(trans, extent_root, ret);
5218                 goto out;
5219         }
5220
5221         leaf = path->nodes[0];
5222         item_size = btrfs_item_size_nr(leaf, extent_slot);
5223 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5224         if (item_size < sizeof(*ei)) {
5225                 BUG_ON(found_extent || extent_slot != path->slots[0]);
5226                 ret = convert_extent_item_v0(trans, extent_root, path,
5227                                              owner_objectid, 0);
5228                 if (ret < 0) {
5229                         btrfs_abort_transaction(trans, extent_root, ret);
5230                         goto out;
5231                 }
5232
5233                 btrfs_release_path(path);
5234                 path->leave_spinning = 1;
5235
5236                 key.objectid = bytenr;
5237                 key.type = BTRFS_EXTENT_ITEM_KEY;
5238                 key.offset = num_bytes;
5239
5240                 ret = btrfs_search_slot(trans, extent_root, &key, path,
5241                                         -1, 1);
5242                 if (ret) {
5243                         printk(KERN_ERR "umm, got %d back from search"
5244                                ", was looking for %llu\n", ret,
5245                                (unsigned long long)bytenr);
5246                         btrfs_print_leaf(extent_root, path->nodes[0]);
5247                 }
5248                 if (ret < 0) {
5249                         btrfs_abort_transaction(trans, extent_root, ret);
5250                         goto out;
5251                 }
5252
5253                 extent_slot = path->slots[0];
5254                 leaf = path->nodes[0];
5255                 item_size = btrfs_item_size_nr(leaf, extent_slot);
5256         }
5257 #endif
5258         BUG_ON(item_size < sizeof(*ei));
5259         ei = btrfs_item_ptr(leaf, extent_slot,
5260                             struct btrfs_extent_item);
5261         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5262                 struct btrfs_tree_block_info *bi;
5263                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5264                 bi = (struct btrfs_tree_block_info *)(ei + 1);
5265                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5266         }
5267
5268         refs = btrfs_extent_refs(leaf, ei);
5269         BUG_ON(refs < refs_to_drop);
5270         refs -= refs_to_drop;
5271
5272         if (refs > 0) {
5273                 if (extent_op)
5274                         __run_delayed_extent_op(extent_op, leaf, ei);
5275                 /*
5276                  * In the case of an inline back ref, the reference count will
5277                  * be updated by remove_extent_backref
5278                  */
5279                 if (iref) {
5280                         BUG_ON(!found_extent);
5281                 } else {
5282                         btrfs_set_extent_refs(leaf, ei, refs);
5283                         btrfs_mark_buffer_dirty(leaf);
5284                 }
5285                 if (found_extent) {
5286                         ret = remove_extent_backref(trans, extent_root, path,
5287                                                     iref, refs_to_drop,
5288                                                     is_data);
5289                         if (ret) {
5290                                 btrfs_abort_transaction(trans, extent_root, ret);
5291                                 goto out;
5292                         }
5293                 }
5294         } else {
5295                 if (found_extent) {
5296                         BUG_ON(is_data && refs_to_drop !=
5297                                extent_data_ref_count(root, path, iref));
5298                         if (iref) {
5299                                 BUG_ON(path->slots[0] != extent_slot);
5300                         } else {
5301                                 BUG_ON(path->slots[0] != extent_slot + 1);
5302                                 path->slots[0] = extent_slot;
5303                                 num_to_del = 2;
5304                         }
5305                 }
5306
5307                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5308                                       num_to_del);
5309                 if (ret) {
5310                         btrfs_abort_transaction(trans, extent_root, ret);
5311                         goto out;
5312                 }
5313                 btrfs_release_path(path);
5314
5315                 if (is_data) {
5316                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5317                         if (ret) {
5318                                 btrfs_abort_transaction(trans, extent_root, ret);
5319                                 goto out;
5320                         }
5321                 }
5322
5323                 ret = update_block_group(trans, root, bytenr, num_bytes, 0);
5324                 if (ret) {
5325                         btrfs_abort_transaction(trans, extent_root, ret);
5326                         goto out;
5327                 }
5328         }
5329 out:
5330         btrfs_free_path(path);
5331         return ret;
5332 }
5333
5334 /*
5335  * when we free a block, it is possible (and likely) that we free the last
5336  * delayed ref for that extent as well.  This searches the delayed ref tree for
5337  * a given extent, and if there are no other delayed refs to be processed, it
5338  * removes it from the tree.
5339  */
5340 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5341                                       struct btrfs_root *root, u64 bytenr)
5342 {
5343         struct btrfs_delayed_ref_head *head;
5344         struct btrfs_delayed_ref_root *delayed_refs;
5345         struct btrfs_delayed_ref_node *ref;
5346         struct rb_node *node;
5347         int ret = 0;
5348
5349         delayed_refs = &trans->transaction->delayed_refs;
5350         spin_lock(&delayed_refs->lock);
5351         head = btrfs_find_delayed_ref_head(trans, bytenr);
5352         if (!head)
5353                 goto out;
5354
5355         node = rb_prev(&head->node.rb_node);
5356         if (!node)
5357                 goto out;
5358
5359         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5360
5361         /* there are still entries for this ref, we can't drop it */
5362         if (ref->bytenr == bytenr)
5363                 goto out;
5364
5365         if (head->extent_op) {
5366                 if (!head->must_insert_reserved)
5367                         goto out;
5368                 btrfs_free_delayed_extent_op(head->extent_op);
5369                 head->extent_op = NULL;
5370         }
5371
5372         /*
5373          * waiting for the lock here would deadlock.  If someone else has it
5374          * locked, they are already in the process of dropping it anyway.
5375          */
5376         if (!mutex_trylock(&head->mutex))
5377                 goto out;
5378
5379         /*
5380          * at this point we have a head with no other entries.  Go
5381          * ahead and process it.
5382          */
5383         head->node.in_tree = 0;
5384         rb_erase(&head->node.rb_node, &delayed_refs->root);
5385
5386         delayed_refs->num_entries--;
5387
5388         /*
5389          * we don't take a ref on the node because we're removing it from the
5390          * tree, so we just steal the ref the tree was holding.
5391          */
5392         delayed_refs->num_heads--;
5393         if (list_empty(&head->cluster))
5394                 delayed_refs->num_heads_ready--;
5395
5396         list_del_init(&head->cluster);
5397         spin_unlock(&delayed_refs->lock);
5398
5399         BUG_ON(head->extent_op);
5400         if (head->must_insert_reserved)
5401                 ret = 1;
5402
5403         mutex_unlock(&head->mutex);
5404         btrfs_put_delayed_ref(&head->node);
5405         return ret;
5406 out:
5407         spin_unlock(&delayed_refs->lock);
5408         return 0;
5409 }
5410
5411 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5412                            struct btrfs_root *root,
5413                            struct extent_buffer *buf,
5414                            u64 parent, int last_ref)
5415 {
5416         struct btrfs_block_group_cache *cache = NULL;
5417         int ret;
5418
5419         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5420                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5421                                         buf->start, buf->len,
5422                                         parent, root->root_key.objectid,
5423                                         btrfs_header_level(buf),
5424                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
5425                 BUG_ON(ret); /* -ENOMEM */
5426         }
5427
5428         if (!last_ref)
5429                 return;
5430
5431         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5432
5433         if (btrfs_header_generation(buf) == trans->transid) {
5434                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5435                         ret = check_ref_cleanup(trans, root, buf->start);
5436                         if (!ret)
5437                                 goto out;
5438                 }
5439
5440                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5441                         pin_down_extent(root, cache, buf->start, buf->len, 1);
5442                         goto out;
5443                 }
5444
5445                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5446
5447                 btrfs_add_free_space(cache, buf->start, buf->len);
5448                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5449         }
5450 out:
5451         /*
5452          * Deleting the buffer, clear the corrupt flag since it doesn't matter
5453          * anymore.
5454          */
5455         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5456         btrfs_put_block_group(cache);
5457 }
5458
5459 /* Can return -ENOMEM */
5460 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5461                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5462                       u64 owner, u64 offset, int for_cow)
5463 {
5464         int ret;
5465         struct btrfs_fs_info *fs_info = root->fs_info;
5466
5467         /*
5468          * tree log blocks never actually go into the extent allocation
5469          * tree, just update pinning info and exit early.
5470          */
5471         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5472                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5473                 /* unlocks the pinned mutex */
5474                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5475                 ret = 0;
5476         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5477                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5478                                         num_bytes,
5479                                         parent, root_objectid, (int)owner,
5480                                         BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5481         } else {
5482                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5483                                                 num_bytes,
5484                                                 parent, root_objectid, owner,
5485                                                 offset, BTRFS_DROP_DELAYED_REF,
5486                                                 NULL, for_cow);
5487         }
5488         return ret;
5489 }
5490
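/*
 * round val up to the next stripe boundary.  This relies on stripesize
 * being a power of two; e.g. with a 64K stripesize a val of 100K is
 * rounded up to 128K.
 */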
5491 static u64 stripe_align(struct btrfs_root *root, u64 val)
5492 {
5493         u64 mask = ((u64)root->stripesize - 1);
5494         u64 ret = (val + mask) & ~mask;
5495         return ret;
5496 }
5497
5498 /*
5499  * when we wait for progress in the block group caching, it's because
5500  * our allocation attempt failed at least once.  So, we must sleep
5501  * and let some progress happen before we try again.
5502  *
5503  * This function will sleep at least once waiting for new free space to
5504  * show up, and then it will check the block group free space numbers
5505  * for our min num_bytes.  Another option is to have it go ahead
5506  * and look in the rbtree for a free extent of a given size, but this
5507  * is a good start.
5508  */
5509 static noinline int
5510 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5511                                 u64 num_bytes)
5512 {
5513         struct btrfs_caching_control *caching_ctl;
5514         DEFINE_WAIT(wait);
5515
5516         caching_ctl = get_caching_control(cache);
5517         if (!caching_ctl)
5518                 return 0;
5519
5520         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5521                    (cache->free_space_ctl->free_space >= num_bytes));
5522
5523         put_caching_control(caching_ctl);
5524         return 0;
5525 }
5526
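/*
 * block until the background caching of this block group has completely
 * finished.
 */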
5527 static noinline int
5528 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5529 {
5530         struct btrfs_caching_control *caching_ctl;
5531         DEFINE_WAIT(wait);
5532
5533         caching_ctl = get_caching_control(cache);
5534         if (!caching_ctl)
5535                 return 0;
5536
5537         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5538
5539         put_caching_control(caching_ctl);
5540         return 0;
5541 }
5542
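/*
 * map the raid profile bits in a block group's flags to the index used
 * for space_info->block_groups[].
 */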
5543 int __get_raid_index(u64 flags)
5544 {
5545         int index;
5546
5547         if (flags & BTRFS_BLOCK_GROUP_RAID10)
5548                 index = 0;
5549         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5550                 index = 1;
5551         else if (flags & BTRFS_BLOCK_GROUP_DUP)
5552                 index = 2;
5553         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5554                 index = 3;
5555         else
5556                 index = 4;
5557
5558         return index;
5559 }
5560
5561 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5562 {
5563         return __get_raid_index(cache->flags);
5564 }
5565
5566 enum btrfs_loop_type {
5567         LOOP_CACHING_NOWAIT = 0,
5568         LOOP_CACHING_WAIT = 1,
5569         LOOP_ALLOC_CHUNK = 2,
5570         LOOP_NO_EMPTY_SIZE = 3,
5571 };
5572
5573 /*
5574  * walks the btree of allocated extents and finds a hole of a given size.
5575  * The key ins is changed to record the hole:
5576  * ins->objectid == block start
5577  * ins->flags = BTRFS_EXTENT_ITEM_KEY
5578  * ins->offset == number of blocks
5579  * Any available blocks before search_start are skipped.
5580  */
5581 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5582                                      struct btrfs_root *orig_root,
5583                                      u64 num_bytes, u64 empty_size,
5584                                      u64 hint_byte, struct btrfs_key *ins,
5585                                      u64 data)
5586 {
5587         int ret = 0;
5588         struct btrfs_root *root = orig_root->fs_info->extent_root;
5589         struct btrfs_free_cluster *last_ptr = NULL;
5590         struct btrfs_block_group_cache *block_group = NULL;
5591         struct btrfs_block_group_cache *used_block_group;
5592         u64 search_start = 0;
5593         int empty_cluster = 2 * 1024 * 1024;
5594         struct btrfs_space_info *space_info;
5595         int loop = 0;
5596         int index = __get_raid_index(data);
5597         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5598                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
5599         bool found_uncached_bg = false;
5600         bool failed_cluster_refill = false;
5601         bool failed_alloc = false;
5602         bool use_cluster = true;
5603         bool have_caching_bg = false;
5604
5605         WARN_ON(num_bytes < root->sectorsize);
5606         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5607         ins->objectid = 0;
5608         ins->offset = 0;
5609
5610         trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5611
5612         space_info = __find_space_info(root->fs_info, data);
5613         if (!space_info) {
5614                 printk(KERN_ERR "No space info for %llu\n", data);
5615                 return -ENOSPC;
5616         }
5617
5618         /*
5619          * If the space info is for both data and metadata it means we have a
5620          * small filesystem and we can't use the clustering stuff.
5621          */
5622         if (btrfs_mixed_space_info(space_info))
5623                 use_cluster = false;
5624
5625         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5626                 last_ptr = &root->fs_info->meta_alloc_cluster;
5627                 if (!btrfs_test_opt(root, SSD))
5628                         empty_cluster = 64 * 1024;
5629         }
5630
5631         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5632             btrfs_test_opt(root, SSD)) {
5633                 last_ptr = &root->fs_info->data_alloc_cluster;
5634         }
5635
5636         if (last_ptr) {
5637                 spin_lock(&last_ptr->lock);
5638                 if (last_ptr->block_group)
5639                         hint_byte = last_ptr->window_start;
5640                 spin_unlock(&last_ptr->lock);
5641         }
5642
5643         search_start = max(search_start, first_logical_byte(root, 0));
5644         search_start = max(search_start, hint_byte);
5645
5646         if (!last_ptr)
5647                 empty_cluster = 0;
5648
5649         if (search_start == hint_byte) {
5650                 block_group = btrfs_lookup_block_group(root->fs_info,
5651                                                        search_start);
5652                 used_block_group = block_group;
5653                 /*
5654                  * we don't want to use the block group if it doesn't match our
5655                  * allocation bits, or if it's not cached.
5656                  *
5657                  * However if we are re-searching with an ideal block group
5658                  * picked out then we don't care that the block group is cached.
5659                  */
5660                 if (block_group && block_group_bits(block_group, data) &&
5661                     block_group->cached != BTRFS_CACHE_NO) {
5662                         down_read(&space_info->groups_sem);
5663                         if (list_empty(&block_group->list) ||
5664                             block_group->ro) {
5665                                 /*
5666                                  * someone is removing this block group,
5667                                  * we can't jump into the have_block_group
5668                                  * target because our list pointers are not
5669                                  * valid
5670                                  */
5671                                 btrfs_put_block_group(block_group);
5672                                 up_read(&space_info->groups_sem);
5673                         } else {
5674                                 index = get_block_group_index(block_group);
5675                                 goto have_block_group;
5676                         }
5677                 } else if (block_group) {
5678                         btrfs_put_block_group(block_group);
5679                 }
5680         }
5681 search:
5682         have_caching_bg = false;
5683         down_read(&space_info->groups_sem);
5684         list_for_each_entry(block_group, &space_info->block_groups[index],
5685                             list) {
5686                 u64 offset;
5687                 int cached;
5688
5689                 used_block_group = block_group;
5690                 btrfs_get_block_group(block_group);
5691                 search_start = block_group->key.objectid;
5692
5693                 /*
5694                  * this can happen if we end up cycling through all the
5695                  * raid types, but we want to make sure we only allocate
5696                  * for the proper type.
5697                  */
5698                 if (!block_group_bits(block_group, data)) {
5699                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
5700                                 BTRFS_BLOCK_GROUP_RAID1 |
5701                                 BTRFS_BLOCK_GROUP_RAID10;
5702
5703                         /*
5704                          * if they asked for extra copies and this block group
5705                          * doesn't provide them, bail.  This does allow us to
5706                          * fill raid0 from raid1.
5707                          */
5708                         if ((data & extra) && !(block_group->flags & extra))
5709                                 goto loop;
5710                 }
5711
5712 have_block_group:
5713                 cached = block_group_cache_done(block_group);
5714                 if (unlikely(!cached)) {
5715                         found_uncached_bg = true;
5716                         ret = cache_block_group(block_group, trans,
5717                                                 orig_root, 0);
5718                         BUG_ON(ret < 0);
5719                         ret = 0;
5720                 }
5721
5722                 if (unlikely(block_group->ro))
5723                         goto loop;
5724
5725                 /*
5726                  * Ok we want to try and use the cluster allocator, so
5727                  * let's look there
5728                  */
5729                 if (last_ptr) {
5730                         /*
5731                          * the refill lock keeps out other
5732                          * people trying to start a new cluster
5733                          */
5734                         spin_lock(&last_ptr->refill_lock);
5735                         used_block_group = last_ptr->block_group;
5736                         if (used_block_group != block_group &&
5737                             (!used_block_group ||
5738                              used_block_group->ro ||
5739                              !block_group_bits(used_block_group, data))) {
5740                                 used_block_group = block_group;
5741                                 goto refill_cluster;
5742                         }
5743
5744                         if (used_block_group != block_group)
5745                                 btrfs_get_block_group(used_block_group);
5746
5747                         offset = btrfs_alloc_from_cluster(used_block_group,
5748                           last_ptr, num_bytes, used_block_group->key.objectid);
5749                         if (offset) {
5750                                 /* we have a block, we're done */
5751                                 spin_unlock(&last_ptr->refill_lock);
5752                                 trace_btrfs_reserve_extent_cluster(root,
5753                                         block_group, search_start, num_bytes);
5754                                 goto checks;
5755                         }
5756
5757                         WARN_ON(last_ptr->block_group != used_block_group);
5758                         if (used_block_group != block_group) {
5759                                 btrfs_put_block_group(used_block_group);
5760                                 used_block_group = block_group;
5761                         }
5762 refill_cluster:
5763                         BUG_ON(used_block_group != block_group);
5764                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5765                          * set up new clusters, so let's just skip it
5766                          * and let the allocator find whatever block
5767                          * it can find.  If we reach this point, we
5768                          * will have tried the cluster allocator
5769                          * plenty of times and not have found
5770                          * anything, so we are likely way too
5771                          * fragmented for the clustering stuff to find
5772                          * anything.
5773                          *
5774                          * However, if the cluster is taken from the
5775                          * current block group, release the cluster
5776                          * first, so that we stand a better chance of
5777                          * succeeding in the unclustered
5778                          * allocation.  */
5779                         if (loop >= LOOP_NO_EMPTY_SIZE &&
5780                             last_ptr->block_group != block_group) {
5781                                 spin_unlock(&last_ptr->refill_lock);
5782                                 goto unclustered_alloc;
5783                         }
5784
5785                         /*
5786                          * this cluster didn't work out, free it and
5787                          * start over
5788                          */
5789                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5790
5791                         if (loop >= LOOP_NO_EMPTY_SIZE) {
5792                                 spin_unlock(&last_ptr->refill_lock);
5793                                 goto unclustered_alloc;
5794                         }
5795
5796                         /* allocate a cluster in this block group */
5797                         ret = btrfs_find_space_cluster(trans, root,
5798                                                block_group, last_ptr,
5799                                                search_start, num_bytes,
5800                                                empty_cluster + empty_size);
5801                         if (ret == 0) {
5802                                 /*
5803                                  * now pull our allocation out of this
5804                                  * cluster
5805                                  */
5806                                 offset = btrfs_alloc_from_cluster(block_group,
5807                                                   last_ptr, num_bytes,
5808                                                   search_start);
5809                                 if (offset) {
5810                                         /* we found one, proceed */
5811                                         spin_unlock(&last_ptr->refill_lock);
5812                                         trace_btrfs_reserve_extent_cluster(root,
5813                                                 block_group, search_start,
5814                                                 num_bytes);
5815                                         goto checks;
5816                                 }
5817                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5818                                    && !failed_cluster_refill) {
5819                                 spin_unlock(&last_ptr->refill_lock);
5820
5821                                 failed_cluster_refill = true;
5822                                 wait_block_group_cache_progress(block_group,
5823                                        num_bytes + empty_cluster + empty_size);
5824                                 goto have_block_group;
5825                         }
5826
5827                         /*
5828                          * at this point we either didn't find a cluster
5829                          * or we weren't able to allocate a block from our
5830                          * cluster.  Free the cluster we've been trying
5831                          * to use, and go to the next block group
5832                          */
5833                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5834                         spin_unlock(&last_ptr->refill_lock);
5835                         goto loop;
5836                 }
5837
5838 unclustered_alloc:
5839                 spin_lock(&block_group->free_space_ctl->tree_lock);
5840                 if (cached &&
5841                     block_group->free_space_ctl->free_space <
5842                     num_bytes + empty_cluster + empty_size) {
5843                         spin_unlock(&block_group->free_space_ctl->tree_lock);
5844                         goto loop;
5845                 }
5846                 spin_unlock(&block_group->free_space_ctl->tree_lock);
5847
5848                 offset = btrfs_find_space_for_alloc(block_group, search_start,
5849                                                     num_bytes, empty_size);
5850                 /*
5851                  * If we didn't find a chunk, and we haven't failed on this
5852                  * block group before, and this block group is in the middle of
5853                  * caching and we are ok with waiting, then go ahead and wait
5854                  * for progress to be made, and set failed_alloc to true.
5855                  *
5856                  * If failed_alloc is true then we've already waited on this
5857                  * block group once and should move on to the next block group.
5858                  */
5859                 if (!offset && !failed_alloc && !cached &&
5860                     loop > LOOP_CACHING_NOWAIT) {
5861                         wait_block_group_cache_progress(block_group,
5862                                                 num_bytes + empty_size);
5863                         failed_alloc = true;
5864                         goto have_block_group;
5865                 } else if (!offset) {
5866                         if (!cached)
5867                                 have_caching_bg = true;
5868                         goto loop;
5869                 }
5870 checks:
5871                 search_start = stripe_align(root, offset);
5872
5873                 /* move on to the next group */
5874                 if (search_start + num_bytes >
5875                     used_block_group->key.objectid + used_block_group->key.offset) {
5876                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5877                         goto loop;
5878                 }
5879
5880                 if (offset < search_start)
5881                         btrfs_add_free_space(used_block_group, offset,
5882                                              search_start - offset);
5883                 BUG_ON(offset > search_start);
5884
5885                 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5886                                                   alloc_type);
5887                 if (ret == -EAGAIN) {
5888                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5889                         goto loop;
5890                 }
5891
5892                 /* we are all good, let's return */
5893                 ins->objectid = search_start;
5894                 ins->offset = num_bytes;
5895
5896                 trace_btrfs_reserve_extent(orig_root, block_group,
5897                                            search_start, num_bytes);
5898                 if (used_block_group != block_group)
5899                         btrfs_put_block_group(used_block_group);
5900                 btrfs_put_block_group(block_group);
5901                 break;
5902 loop:
5903                 failed_cluster_refill = false;
5904                 failed_alloc = false;
5905                 BUG_ON(index != get_block_group_index(block_group));
5906                 if (used_block_group != block_group)
5907                         btrfs_put_block_group(used_block_group);
5908                 btrfs_put_block_group(block_group);
5909         }
5910         up_read(&space_info->groups_sem);
5911
5912         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5913                 goto search;
5914
5915         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5916                 goto search;
5917
5918         /*
5919          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5920          *                      caching kthreads as we move along
5921          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5922          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5923          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5924          *                      again
5925          */
5926         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5927                 index = 0;
5928                 loop++;
5929                 if (loop == LOOP_ALLOC_CHUNK) {
5930                         ret = do_chunk_alloc(trans, root, data,
5931                                              CHUNK_ALLOC_FORCE);
5932                         /*
5933                          * Do not bail out on ENOSPC since we
5934                          * can do more things.
5935                          */
5936                         if (ret < 0 && ret != -ENOSPC) {
5937                                 btrfs_abort_transaction(trans,
5938                                                         root, ret);
5939                                 goto out;
5940                         }
5941                 }
5942
5943                 if (loop == LOOP_NO_EMPTY_SIZE) {
5944                         empty_size = 0;
5945                         empty_cluster = 0;
5946                 }
5947
5948                 goto search;
5949         } else if (!ins->objectid) {
5950                 ret = -ENOSPC;
5951         } else if (ins->objectid) {
5952                 ret = 0;
5953         }
5954 out:
5955
5956         return ret;
5957 }
5958
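/*
 * print the usage counters of a space_info and, optionally, of each of its
 * block groups.  Used for ENOSPC debugging.
 */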
5959 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5960                             int dump_block_groups)
5961 {
5962         struct btrfs_block_group_cache *cache;
5963         int index = 0;
5964
5965         spin_lock(&info->lock);
5966         printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5967                (unsigned long long)info->flags,
5968                (unsigned long long)(info->total_bytes - info->bytes_used -
5969                                     info->bytes_pinned - info->bytes_reserved -
5970                                     info->bytes_readonly),
5971                (info->full) ? "" : "not ");
5972         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
5973                "reserved=%llu, may_use=%llu, readonly=%llu\n",
5974                (unsigned long long)info->total_bytes,
5975                (unsigned long long)info->bytes_used,
5976                (unsigned long long)info->bytes_pinned,
5977                (unsigned long long)info->bytes_reserved,
5978                (unsigned long long)info->bytes_may_use,
5979                (unsigned long long)info->bytes_readonly);
5980         spin_unlock(&info->lock);
5981
5982         if (!dump_block_groups)
5983                 return;
5984
5985         down_read(&info->groups_sem);
5986 again:
5987         list_for_each_entry(cache, &info->block_groups[index], list) {
5988                 spin_lock(&cache->lock);
5989                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
5990                        (unsigned long long)cache->key.objectid,
5991                        (unsigned long long)cache->key.offset,
5992                        (unsigned long long)btrfs_block_group_used(&cache->item),
5993                        (unsigned long long)cache->pinned,
5994                        (unsigned long long)cache->reserved,
5995                        cache->ro ? "[readonly]" : "");
5996                 btrfs_dump_free_space(cache, bytes);
5997                 spin_unlock(&cache->lock);
5998         }
5999         if (++index < BTRFS_NR_RAID_TYPES)
6000                 goto again;
6001         up_read(&info->groups_sem);
6002 }
6003
6004 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6005                          struct btrfs_root *root,
6006                          u64 num_bytes, u64 min_alloc_size,
6007                          u64 empty_size, u64 hint_byte,
6008                          struct btrfs_key *ins, u64 data)
6009 {
6010         bool final_tried = false;
6011         int ret;
6012
6013         data = btrfs_get_alloc_profile(root, data);
6014 again:
6015         WARN_ON(num_bytes < root->sectorsize);
6016         ret = find_free_extent(trans, root, num_bytes, empty_size,
6017                                hint_byte, ins, data);
6018
6019         if (ret == -ENOSPC) {
6020                 if (!final_tried) {
6021                         num_bytes = num_bytes >> 1;
6022                         num_bytes = num_bytes & ~(root->sectorsize - 1);
6023                         num_bytes = max(num_bytes, min_alloc_size);
6024                         if (num_bytes == min_alloc_size)
6025                                 final_tried = true;
6026                         goto again;
6027                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6028                         struct btrfs_space_info *sinfo;
6029
6030                         sinfo = __find_space_info(root->fs_info, data);
6031                         printk(KERN_ERR "btrfs allocation failed flags %llu, "
6032                                "wanted %llu\n", (unsigned long long)data,
6033                                (unsigned long long)num_bytes);
6034                         if (sinfo)
6035                                 dump_space_info(sinfo, num_bytes, 1);
6036                 }
6037         }
6038
6039         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6040
6041         return ret;
6042 }
6043
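/*
 * undo a reservation made by btrfs_reserve_extent(): discard the range if
 * the discard mount option is set, then either pin the extent down or give
 * it back to the free space cache.
 */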
6044 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6045                                         u64 start, u64 len, int pin)
6046 {
6047         struct btrfs_block_group_cache *cache;
6048         int ret = 0;
6049
6050         cache = btrfs_lookup_block_group(root->fs_info, start);
6051         if (!cache) {
6052                 printk(KERN_ERR "Unable to find block group for %llu\n",
6053                        (unsigned long long)start);
6054                 return -ENOSPC;
6055         }
6056
6057         if (btrfs_test_opt(root, DISCARD))
6058                 ret = btrfs_discard_extent(root, start, len, NULL);
6059
6060         if (pin)
6061                 pin_down_extent(root, cache, start, len, 1);
6062         else {
6063                 btrfs_add_free_space(cache, start, len);
6064                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6065         }
6066         btrfs_put_block_group(cache);
6067
6068         trace_btrfs_reserved_extent_free(root, start, len);
6069
6070         return ret;
6071 }
6072
6073 int btrfs_free_reserved_extent(struct btrfs_root *root,
6074                                         u64 start, u64 len)
6075 {
6076         return __btrfs_free_reserved_extent(root, start, len, 0);
6077 }
6078
6079 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6080                                        u64 start, u64 len)
6081 {
6082         return __btrfs_free_reserved_extent(root, start, len, 1);
6083 }
6084
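/*
 * insert the extent item and the inline data backref for a newly allocated
 * data extent into the extent tree and account the bytes in its block group.
 */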
6085 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6086                                       struct btrfs_root *root,
6087                                       u64 parent, u64 root_objectid,
6088                                       u64 flags, u64 owner, u64 offset,
6089                                       struct btrfs_key *ins, int ref_mod)
6090 {
6091         int ret;
6092         struct btrfs_fs_info *fs_info = root->fs_info;
6093         struct btrfs_extent_item *extent_item;
6094         struct btrfs_extent_inline_ref *iref;
6095         struct btrfs_path *path;
6096         struct extent_buffer *leaf;
6097         int type;
6098         u32 size;
6099
6100         if (parent > 0)
6101                 type = BTRFS_SHARED_DATA_REF_KEY;
6102         else
6103                 type = BTRFS_EXTENT_DATA_REF_KEY;
6104
6105         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6106
6107         path = btrfs_alloc_path();
6108         if (!path)
6109                 return -ENOMEM;
6110
6111         path->leave_spinning = 1;
6112         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6113                                       ins, size);
6114         if (ret) {
6115                 btrfs_free_path(path);
6116                 return ret;
6117         }
6118
6119         leaf = path->nodes[0];
6120         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6121                                      struct btrfs_extent_item);
6122         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6123         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6124         btrfs_set_extent_flags(leaf, extent_item,
6125                                flags | BTRFS_EXTENT_FLAG_DATA);
6126
6127         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6128         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6129         if (parent > 0) {
6130                 struct btrfs_shared_data_ref *ref;
6131                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6132                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6133                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6134         } else {
6135                 struct btrfs_extent_data_ref *ref;
6136                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6137                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6138                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6139                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6140                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6141         }
6142
6143         btrfs_mark_buffer_dirty(path->nodes[0]);
6144         btrfs_free_path(path);
6145
6146         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6147         if (ret) { /* -ENOENT, logic error */
6148                 printk(KERN_ERR "btrfs update block group failed for %llu "
6149                        "%llu\n", (unsigned long long)ins->objectid,
6150                        (unsigned long long)ins->offset);
6151                 BUG();
6152         }
6153         return ret;
6154 }
6155
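/*
 * insert the extent item, tree block info and inline backref for a newly
 * allocated tree block into the extent tree and account the bytes in its
 * block group.
 */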
6156 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6157                                      struct btrfs_root *root,
6158                                      u64 parent, u64 root_objectid,
6159                                      u64 flags, struct btrfs_disk_key *key,
6160                                      int level, struct btrfs_key *ins)
6161 {
6162         int ret;
6163         struct btrfs_fs_info *fs_info = root->fs_info;
6164         struct btrfs_extent_item *extent_item;
6165         struct btrfs_tree_block_info *block_info;
6166         struct btrfs_extent_inline_ref *iref;
6167         struct btrfs_path *path;
6168         struct extent_buffer *leaf;
6169         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6170
6171         path = btrfs_alloc_path();
6172         if (!path)
6173                 return -ENOMEM;
6174
6175         path->leave_spinning = 1;
6176         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6177                                       ins, size);
6178         if (ret) {
6179                 btrfs_free_path(path);
6180                 return ret;
6181         }
6182
6183         leaf = path->nodes[0];
6184         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6185                                      struct btrfs_extent_item);
6186         btrfs_set_extent_refs(leaf, extent_item, 1);
6187         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6188         btrfs_set_extent_flags(leaf, extent_item,
6189                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6190         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6191
6192         btrfs_set_tree_block_key(leaf, block_info, key);
6193         btrfs_set_tree_block_level(leaf, block_info, level);
6194
6195         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6196         if (parent > 0) {
6197                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6198                 btrfs_set_extent_inline_ref_type(leaf, iref,
6199                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6200                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6201         } else {
6202                 btrfs_set_extent_inline_ref_type(leaf, iref,
6203                                                  BTRFS_TREE_BLOCK_REF_KEY);
6204                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6205         }
6206
6207         btrfs_mark_buffer_dirty(leaf);
6208         btrfs_free_path(path);
6209
6210         ret = update_block_group(trans, root, ins->objectid, ins->offset, 1);
6211         if (ret) { /* -ENOENT, logic error */
6212                 printk(KERN_ERR "btrfs update block group failed for %llu "
6213                        "%llu\n", (unsigned long long)ins->objectid,
6214                        (unsigned long long)ins->offset);
6215                 BUG();
6216         }
6217         return ret;
6218 }
6219
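/*
 * record a newly allocated file extent by queueing a delayed data ref for
 * it; the extent item is inserted later when the delayed ref is run.
 */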
6220 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6221                                      struct btrfs_root *root,
6222                                      u64 root_objectid, u64 owner,
6223                                      u64 offset, struct btrfs_key *ins)
6224 {
6225         int ret;
6226
6227         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6228
6229         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6230                                          ins->offset, 0,
6231                                          root_objectid, owner, offset,
6232                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6233         return ret;
6234 }
6235
6236 /*
6237  * this is used by the tree logging recovery code.  It records that
6238  * an extent has been allocated and makes sure to clear the free
6239  * space cache bits as well
6240  */
6241 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6242                                    struct btrfs_root *root,
6243                                    u64 root_objectid, u64 owner, u64 offset,
6244                                    struct btrfs_key *ins)
6245 {
6246         int ret;
6247         struct btrfs_block_group_cache *block_group;
6248         struct btrfs_caching_control *caching_ctl;
6249         u64 start = ins->objectid;
6250         u64 num_bytes = ins->offset;
6251
6252         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6253         cache_block_group(block_group, trans, NULL, 0);
6254         caching_ctl = get_caching_control(block_group);
6255
6256         if (!caching_ctl) {
6257                 BUG_ON(!block_group_cache_done(block_group));
6258                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6259                 BUG_ON(ret); /* -ENOMEM */
6260         } else {
6261                 mutex_lock(&caching_ctl->mutex);
6262
6263                 if (start >= caching_ctl->progress) {
6264                         ret = add_excluded_extent(root, start, num_bytes);
6265                         BUG_ON(ret); /* -ENOMEM */
6266                 } else if (start + num_bytes <= caching_ctl->progress) {
6267                         ret = btrfs_remove_free_space(block_group,
6268                                                       start, num_bytes);
6269                         BUG_ON(ret); /* -ENOMEM */
6270                 } else {
6271                         num_bytes = caching_ctl->progress - start;
6272                         ret = btrfs_remove_free_space(block_group,
6273                                                       start, num_bytes);
6274                         BUG_ON(ret); /* -ENOMEM */
6275
6276                         start = caching_ctl->progress;
6277                         num_bytes = ins->objectid + ins->offset -
6278                                     caching_ctl->progress;
6279                         ret = add_excluded_extent(root, start, num_bytes);
6280                         BUG_ON(ret); /* -ENOMEM */
6281                 }
6282
6283                 mutex_unlock(&caching_ctl->mutex);
6284                 put_caching_control(caching_ctl);
6285         }
6286
6287         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6288                                           RESERVE_ALLOC_NO_ACCOUNT);
6289         BUG_ON(ret); /* logic error */
6290         btrfs_put_block_group(block_group);
6291         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6292                                          0, owner, offset, ins, 1);
6293         return ret;
6294 }
6295
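/*
 * set up the extent buffer for a freshly allocated tree block: lock it,
 * clear any stale contents and mark it dirty in the right io tree (log
 * tree blocks are tracked in dirty_log_pages, everything else in the
 * transaction's dirty_pages).
 */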
6296 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6297                                             struct btrfs_root *root,
6298                                             u64 bytenr, u32 blocksize,
6299                                             int level)
6300 {
6301         struct extent_buffer *buf;
6302
6303         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6304         if (!buf)
6305                 return ERR_PTR(-ENOMEM);
6306         btrfs_set_header_generation(buf, trans->transid);
6307         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6308         btrfs_tree_lock(buf);
6309         clean_tree_block(trans, root, buf);
6310         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6311
6312         btrfs_set_lock_blocking(buf);
6313         btrfs_set_buffer_uptodate(buf);
6314
6315         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6316                 /*
6317                  * we allow two log transactions at a time, use different
6318                  * EXTENT bit to differentiate dirty pages.
6319                  */
6320                 if (root->log_transid % 2 == 0)
6321                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6322                                         buf->start + buf->len - 1, GFP_NOFS);
6323                 else
6324                         set_extent_new(&root->dirty_log_pages, buf->start,
6325                                         buf->start + buf->len - 1, GFP_NOFS);
6326         } else {
6327                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6328                          buf->start + buf->len - 1, GFP_NOFS);
6329         }
6330         trans->blocks_used++;
6331         /* this returns a buffer locked for blocking */
6332         return buf;
6333 }
6334
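/*
 * pick the block reservation a new tree block is charged to.  We try the
 * rsv returned by get_block_rsv() first and fall back to the global
 * reserve before giving up with -ENOSPC.
 */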
6335 static struct btrfs_block_rsv *
6336 use_block_rsv(struct btrfs_trans_handle *trans,
6337               struct btrfs_root *root, u32 blocksize)
6338 {
6339         struct btrfs_block_rsv *block_rsv;
6340         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6341         int ret;
6342
6343         block_rsv = get_block_rsv(trans, root);
6344
6345         if (block_rsv->size == 0) {
6346                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6347                                              BTRFS_RESERVE_NO_FLUSH);
6348                 /*
6349                  * If we couldn't reserve metadata bytes try and use some from
6350                  * the global reserve.
6351                  */
6352                 if (ret && block_rsv != global_rsv) {
6353                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6354                         if (!ret)
6355                                 return global_rsv;
6356                         return ERR_PTR(ret);
6357                 } else if (ret) {
6358                         return ERR_PTR(ret);
6359                 }
6360                 return block_rsv;
6361         }
6362
6363         ret = block_rsv_use_bytes(block_rsv, blocksize);
6364         if (!ret)
6365                 return block_rsv;
6366         if (ret && !block_rsv->failfast) {
6367                 static DEFINE_RATELIMIT_STATE(_rs,
6368                                 DEFAULT_RATELIMIT_INTERVAL,
6369                                 /*DEFAULT_RATELIMIT_BURST*/ 2);
6370                 if (__ratelimit(&_rs))
6371                         WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6372                              ret);
6373                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6374                                              BTRFS_RESERVE_NO_FLUSH);
6375                 if (!ret) {
6376                         return block_rsv;
6377                 } else if (ret && block_rsv != global_rsv) {
6378                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6379                         if (!ret)
6380                                 return global_rsv;
6381                 }
6382         }
6383
6384         return ERR_PTR(-ENOSPC);
6385 }
6386
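/*
 * give back a reservation taken via use_block_rsv() when the tree block
 * was not actually allocated.
 */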
6387 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6388                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
6389 {
6390         block_rsv_add_bytes(block_rsv, blocksize, 0);
6391         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6392 }
6393
6394 /*
6395  * finds a free extent and does all the dirty work required for allocation.
6396  * It returns the key for the extent through ins, and a tree buffer for
6397  * the first block of the extent through buf.
6398  *
6399  * returns the tree buffer or NULL.
6400  */
6401 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6402                                         struct btrfs_root *root, u32 blocksize,
6403                                         u64 parent, u64 root_objectid,
6404                                         struct btrfs_disk_key *key, int level,
6405                                         u64 hint, u64 empty_size)
6406 {
6407         struct btrfs_key ins;
6408         struct btrfs_block_rsv *block_rsv;
6409         struct extent_buffer *buf;
6410         u64 flags = 0;
6411         int ret;
6412
6413
6414         block_rsv = use_block_rsv(trans, root, blocksize);
6415         if (IS_ERR(block_rsv))
6416                 return ERR_CAST(block_rsv);
6417
6418         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6419                                    empty_size, hint, &ins, 0);
6420         if (ret) {
6421                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6422                 return ERR_PTR(ret);
6423         }
6424
6425         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6426                                     blocksize, level);
6427         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6428
6429         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6430                 if (parent == 0)
6431                         parent = ins.objectid;
6432                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6433         } else
6434                 BUG_ON(parent > 0);
6435
6436         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6437                 struct btrfs_delayed_extent_op *extent_op;
6438                 extent_op = btrfs_alloc_delayed_extent_op();
6439                 BUG_ON(!extent_op); /* -ENOMEM */
6440                 if (key)
6441                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6442                 else
6443                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6444                 extent_op->flags_to_set = flags;
6445                 extent_op->update_key = 1;
6446                 extent_op->update_flags = 1;
6447                 extent_op->is_data = 0;
6448
6449                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6450                                         ins.objectid,
6451                                         ins.offset, parent, root_objectid,
6452                                         level, BTRFS_ADD_DELAYED_EXTENT,
6453                                         extent_op, 0);
6454                 BUG_ON(ret); /* -ENOMEM */
6455         }
6456         return buf;
6457 }
6458
6459 struct walk_control {
6460         u64 refs[BTRFS_MAX_LEVEL];
6461         u64 flags[BTRFS_MAX_LEVEL];
6462         struct btrfs_key update_progress;
6463         int stage;
6464         int level;
6465         int shared_level;
6466         int update_ref;
6467         int keep_locks;
6468         int reada_slot;
6469         int reada_count;
6470         int for_reloc;
6471 };
6472
6473 #define DROP_REFERENCE  1
6474 #define UPDATE_BACKREF  2
6475
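/*
 * read ahead the child blocks of path->nodes[wc->level] that the walk is
 * about to visit, skipping blocks we already know we won't descend into.
 */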
6476 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6477                                      struct btrfs_root *root,
6478                                      struct walk_control *wc,
6479                                      struct btrfs_path *path)
6480 {
6481         u64 bytenr;
6482         u64 generation;
6483         u64 refs;
6484         u64 flags;
6485         u32 nritems;
6486         u32 blocksize;
6487         struct btrfs_key key;
6488         struct extent_buffer *eb;
6489         int ret;
6490         int slot;
6491         int nread = 0;
6492
6493         if (path->slots[wc->level] < wc->reada_slot) {
6494                 wc->reada_count = wc->reada_count * 2 / 3;
6495                 wc->reada_count = max(wc->reada_count, 2);
6496         } else {
6497                 wc->reada_count = wc->reada_count * 3 / 2;
6498                 wc->reada_count = min_t(int, wc->reada_count,
6499                                         BTRFS_NODEPTRS_PER_BLOCK(root));
6500         }
6501
6502         eb = path->nodes[wc->level];
6503         nritems = btrfs_header_nritems(eb);
6504         blocksize = btrfs_level_size(root, wc->level - 1);
6505
6506         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6507                 if (nread >= wc->reada_count)
6508                         break;
6509
6510                 cond_resched();
6511                 bytenr = btrfs_node_blockptr(eb, slot);
6512                 generation = btrfs_node_ptr_generation(eb, slot);
6513
6514                 if (slot == path->slots[wc->level])
6515                         goto reada;
6516
6517                 if (wc->stage == UPDATE_BACKREF &&
6518                     generation <= root->root_key.offset)
6519                         continue;
6520
6521                 /* We don't lock the tree block, it's OK to be racy here */
6522                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6523                                                &refs, &flags);
6524                 /* We don't care about errors in readahead. */
6525                 if (ret < 0)
6526                         continue;
6527                 BUG_ON(refs == 0);
6528
6529                 if (wc->stage == DROP_REFERENCE) {
6530                         if (refs == 1)
6531                                 goto reada;
6532
6533                         if (wc->level == 1 &&
6534                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6535                                 continue;
6536                         if (!wc->update_ref ||
6537                             generation <= root->root_key.offset)
6538                                 continue;
6539                         btrfs_node_key_to_cpu(eb, &key, slot);
6540                         ret = btrfs_comp_cpu_keys(&key,
6541                                                   &wc->update_progress);
6542                         if (ret < 0)
6543                                 continue;
6544                 } else {
6545                         if (wc->level == 1 &&
6546                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6547                                 continue;
6548                 }
6549 reada:
6550                 ret = readahead_tree_block(root, bytenr, blocksize,
6551                                            generation);
6552                 if (ret)
6553                         break;
6554                 nread++;
6555         }
6556         wc->reada_slot = slot;
6557 }
6558
6559 /*
6560  * helper to process a tree block while walking down the tree.
6561  *
6562  * when wc->stage == UPDATE_BACKREF, this function updates
6563  * back refs for pointers in the block.
6564  *
6565  * NOTE: return value 1 means we should stop walking down.
6566  */
6567 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6568                                    struct btrfs_root *root,
6569                                    struct btrfs_path *path,
6570                                    struct walk_control *wc, int lookup_info)
6571 {
6572         int level = wc->level;
6573         struct extent_buffer *eb = path->nodes[level];
6574         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6575         int ret;
6576
6577         if (wc->stage == UPDATE_BACKREF &&
6578             btrfs_header_owner(eb) != root->root_key.objectid)
6579                 return 1;
6580
6581         /*
6582          * when the reference count of a tree block is 1, it won't increase
6583          * again.  Once the full backref flag is set, we never clear it.
6584          */
6585         if (lookup_info &&
6586             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6587              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6588                 BUG_ON(!path->locks[level]);
6589                 ret = btrfs_lookup_extent_info(trans, root,
6590                                                eb->start, eb->len,
6591                                                &wc->refs[level],
6592                                                &wc->flags[level]);
6593                 BUG_ON(ret == -ENOMEM);
6594                 if (ret)
6595                         return ret;
6596                 BUG_ON(wc->refs[level] == 0);
6597         }
6598
6599         if (wc->stage == DROP_REFERENCE) {
6600                 if (wc->refs[level] > 1)
6601                         return 1;
6602
6603                 if (path->locks[level] && !wc->keep_locks) {
6604                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6605                         path->locks[level] = 0;
6606                 }
6607                 return 0;
6608         }
6609
6610         /* wc->stage == UPDATE_BACKREF */
6611         if (!(wc->flags[level] & flag)) {
6612                 BUG_ON(!path->locks[level]);
6613                 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6614                 BUG_ON(ret); /* -ENOMEM */
6615                 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6616                 BUG_ON(ret); /* -ENOMEM */
6617                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6618                                                   eb->len, flag, 0);
6619                 BUG_ON(ret); /* -ENOMEM */
6620                 wc->flags[level] |= flag;
6621         }
6622
6623         /*
6624          * the block is shared by multiple trees, so it's not good to
6625          * keep the tree lock
6626          */
6627         if (path->locks[level] && level > 0) {
6628                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6629                 path->locks[level] = 0;
6630         }
6631         return 0;
6632 }
6633
6634 /*
6635  * helper to process a tree block pointer.
6636  *
6637  * when wc->stage == DROP_REFERENCE, this function checks
6638  * the reference count of the block pointed to. if the block
6639  * is shared and we need to update back refs for the subtree
6640  * rooted at the block, this function changes wc->stage to
6641  * UPDATE_BACKREF. if the block is shared and there is no
6642  * need to update back refs, this function drops the reference
6643  * to the block.
6644  *
6645  * NOTE: return value 1 means we should stop walking down.
6646  */
6647 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6648                                  struct btrfs_root *root,
6649                                  struct btrfs_path *path,
6650                                  struct walk_control *wc, int *lookup_info)
6651 {
6652         u64 bytenr;
6653         u64 generation;
6654         u64 parent;
6655         u32 blocksize;
6656         struct btrfs_key key;
6657         struct extent_buffer *next;
6658         int level = wc->level;
6659         int reada = 0;
6660         int ret = 0;
6661
6662         generation = btrfs_node_ptr_generation(path->nodes[level],
6663                                                path->slots[level]);
6664         /*
6665          * if the lower level block was created before the snapshot
6666          * was created, we know there is no need to update back refs
6667          * for the subtree
6668          */
6669         if (wc->stage == UPDATE_BACKREF &&
6670             generation <= root->root_key.offset) {
6671                 *lookup_info = 1;
6672                 return 1;
6673         }
6674
6675         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6676         blocksize = btrfs_level_size(root, level - 1);
6677
6678         next = btrfs_find_tree_block(root, bytenr, blocksize);
6679         if (!next) {
6680                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6681                 if (!next)
6682                         return -ENOMEM;
6683                 reada = 1;
6684         }
6685         btrfs_tree_lock(next);
6686         btrfs_set_lock_blocking(next);
6687
6688         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6689                                        &wc->refs[level - 1],
6690                                        &wc->flags[level - 1]);
6691         if (ret < 0) {
6692                 btrfs_tree_unlock(next);
6693                 return ret;
6694         }
6695
6696         BUG_ON(wc->refs[level - 1] == 0);
6697         *lookup_info = 0;
6698
6699         if (wc->stage == DROP_REFERENCE) {
6700                 if (wc->refs[level - 1] > 1) {
6701                         if (level == 1 &&
6702                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6703                                 goto skip;
6704
6705                         if (!wc->update_ref ||
6706                             generation <= root->root_key.offset)
6707                                 goto skip;
6708
6709                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6710                                               path->slots[level]);
6711                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6712                         if (ret < 0)
6713                                 goto skip;
6714
6715                         wc->stage = UPDATE_BACKREF;
6716                         wc->shared_level = level - 1;
6717                 }
6718         } else {
6719                 if (level == 1 &&
6720                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6721                         goto skip;
6722         }
6723
6724         if (!btrfs_buffer_uptodate(next, generation, 0)) {
6725                 btrfs_tree_unlock(next);
6726                 free_extent_buffer(next);
6727                 next = NULL;
6728                 *lookup_info = 1;
6729         }
6730
6731         if (!next) {
6732                 if (reada && level == 1)
6733                         reada_walk_down(trans, root, wc, path);
6734                 next = read_tree_block(root, bytenr, blocksize, generation);
6735                 if (!next)
6736                         return -EIO;
6737                 btrfs_tree_lock(next);
6738                 btrfs_set_lock_blocking(next);
6739         }
6740
6741         level--;
6742         BUG_ON(level != btrfs_header_level(next));
6743         path->nodes[level] = next;
6744         path->slots[level] = 0;
6745         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6746         wc->level = level;
6747         if (wc->level == 1)
6748                 wc->reada_slot = 0;
6749         return 0;
6750 skip:
6751         wc->refs[level - 1] = 0;
6752         wc->flags[level - 1] = 0;
6753         if (wc->stage == DROP_REFERENCE) {
6754                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6755                         parent = path->nodes[level]->start;
6756                 } else {
6757                         BUG_ON(root->root_key.objectid !=
6758                                btrfs_header_owner(path->nodes[level]));
6759                         parent = 0;
6760                 }
6761
6762                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6763                                 root->root_key.objectid, level - 1, 0, 0);
6764                 BUG_ON(ret); /* -ENOMEM */
6765         }
6766         btrfs_tree_unlock(next);
6767         free_extent_buffer(next);
6768         *lookup_info = 1;
6769         return 1;
6770 }
6771
6772 /*
6773  * helper to process a tree block while walking up the tree.
6774  *
6775  * when wc->stage == DROP_REFERENCE, this function drops
6776  * reference count on the block.
6777  *
6778  * when wc->stage == UPDATE_BACKREF, this function changes
6779  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6780  * to UPDATE_BACKREF previously while processing the block.
6781  *
6782  * NOTE: return value 1 means we should stop walking up.
6783  */
6784 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6785                                  struct btrfs_root *root,
6786                                  struct btrfs_path *path,
6787                                  struct walk_control *wc)
6788 {
6789         int ret;
6790         int level = wc->level;
6791         struct extent_buffer *eb = path->nodes[level];
6792         u64 parent = 0;
6793
6794         if (wc->stage == UPDATE_BACKREF) {
6795                 BUG_ON(wc->shared_level < level);
6796                 if (level < wc->shared_level)
6797                         goto out;
6798
6799                 ret = find_next_key(path, level + 1, &wc->update_progress);
6800                 if (ret > 0)
6801                         wc->update_ref = 0;
6802
6803                 wc->stage = DROP_REFERENCE;
6804                 wc->shared_level = -1;
6805                 path->slots[level] = 0;
6806
6807                 /*
6808                  * check the reference count again if the block isn't locked.
6809                  * we should start walking down the tree again if the
6810                  * reference count is one.
6811                  */
6812                 if (!path->locks[level]) {
6813                         BUG_ON(level == 0);
6814                         btrfs_tree_lock(eb);
6815                         btrfs_set_lock_blocking(eb);
6816                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6817
6818                         ret = btrfs_lookup_extent_info(trans, root,
6819                                                        eb->start, eb->len,
6820                                                        &wc->refs[level],
6821                                                        &wc->flags[level]);
6822                         if (ret < 0) {
6823                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6824                                 path->locks[level] = 0;
6825                                 return ret;
6826                         }
6827                         BUG_ON(wc->refs[level] == 0);
6828                         if (wc->refs[level] == 1) {
6829                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6830                                 path->locks[level] = 0;
6831                                 return 1;
6832                         }
6833                 }
6834         }
6835
6836         /* wc->stage == DROP_REFERENCE */
6837         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6838
6839         if (wc->refs[level] == 1) {
6840                 if (level == 0) {
6841                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6842                                 ret = btrfs_dec_ref(trans, root, eb, 1,
6843                                                     wc->for_reloc);
6844                         else
6845                                 ret = btrfs_dec_ref(trans, root, eb, 0,
6846                                                     wc->for_reloc);
6847                         BUG_ON(ret); /* -ENOMEM */
6848                 }
6849                 /* make block locked assertion in clean_tree_block happy */
6850                 if (!path->locks[level] &&
6851                     btrfs_header_generation(eb) == trans->transid) {
6852                         btrfs_tree_lock(eb);
6853                         btrfs_set_lock_blocking(eb);
6854                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6855                 }
6856                 clean_tree_block(trans, root, eb);
6857         }
6858
6859         if (eb == root->node) {
6860                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6861                         parent = eb->start;
6862                 else
6863                         BUG_ON(root->root_key.objectid !=
6864                                btrfs_header_owner(eb));
6865         } else {
6866                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6867                         parent = path->nodes[level + 1]->start;
6868                 else
6869                         BUG_ON(root->root_key.objectid !=
6870                                btrfs_header_owner(path->nodes[level + 1]));
6871         }
6872
6873         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6874 out:
6875         wc->refs[level] = 0;
6876         wc->flags[level] = 0;
6877         return 0;
6878 }
6879
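/*
 * walk down the tree as far as possible: walk_down_proc() handles the
 * per-level reference checks and do_walk_down() descends into the next
 * lower block.  a positive return from walk_down_proc() stops the
 * descent, a positive return from do_walk_down() skips the current
 * block pointer and moves on to the next slot.
 */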
6880 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6881                                    struct btrfs_root *root,
6882                                    struct btrfs_path *path,
6883                                    struct walk_control *wc)
6884 {
6885         int level = wc->level;
6886         int lookup_info = 1;
6887         int ret;
6888
6889         while (level >= 0) {
6890                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6891                 if (ret > 0)
6892                         break;
6893
6894                 if (level == 0)
6895                         break;
6896
6897                 if (path->slots[level] >=
6898                     btrfs_header_nritems(path->nodes[level]))
6899                         break;
6900
6901                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6902                 if (ret > 0) {
6903                         path->slots[level]++;
6904                         continue;
6905                 } else if (ret < 0)
6906                         return ret;
6907                 level = wc->level;
6908         }
6909         return 0;
6910 }
6911
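/*
 * walk back up the tree: advance to the next slot when one is left at
 * the current level, otherwise call walk_up_proc() on the block and
 * move one level up.  returns 1 when there is nothing left to walk up
 * to below max_level.
 */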
6912 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6913                                  struct btrfs_root *root,
6914                                  struct btrfs_path *path,
6915                                  struct walk_control *wc, int max_level)
6916 {
6917         int level = wc->level;
6918         int ret;
6919
6920         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6921         while (level < max_level && path->nodes[level]) {
6922                 wc->level = level;
6923                 if (path->slots[level] + 1 <
6924                     btrfs_header_nritems(path->nodes[level])) {
6925                         path->slots[level]++;
6926                         return 0;
6927                 } else {
6928                         ret = walk_up_proc(trans, root, path, wc);
6929                         if (ret > 0)
6930                                 return 0;
6931
6932                         if (path->locks[level]) {
6933                                 btrfs_tree_unlock_rw(path->nodes[level],
6934                                                      path->locks[level]);
6935                                 path->locks[level] = 0;
6936                         }
6937                         free_extent_buffer(path->nodes[level]);
6938                         path->nodes[level] = NULL;
6939                         level++;
6940                 }
6941         }
6942         return 1;
6943 }
6944
6945 /*
6946  * drop a subvolume tree.
6947  *
6948  * this function traverses the tree, freeing any blocks that are only
6949  * referenced by the tree.
6950  *
6951  * when a shared tree block is found, this function decreases its
6952  * reference count by one. if update_ref is true, this function
6953  * also makes sure backrefs for the shared block and all lower level
6954  * blocks are properly updated.
6955  */
6956 int btrfs_drop_snapshot(struct btrfs_root *root,
6957                          struct btrfs_block_rsv *block_rsv, int update_ref,
6958                          int for_reloc)
6959 {
6960         struct btrfs_path *path;
6961         struct btrfs_trans_handle *trans;
6962         struct btrfs_root *tree_root = root->fs_info->tree_root;
6963         struct btrfs_root_item *root_item = &root->root_item;
6964         struct walk_control *wc;
6965         struct btrfs_key key;
6966         int err = 0;
6967         int ret;
6968         int level;
6969
6970         path = btrfs_alloc_path();
6971         if (!path) {
6972                 err = -ENOMEM;
6973                 goto out;
6974         }
6975
6976         wc = kzalloc(sizeof(*wc), GFP_NOFS);
6977         if (!wc) {
6978                 btrfs_free_path(path);
6979                 err = -ENOMEM;
6980                 goto out;
6981         }
6982
6983         trans = btrfs_start_transaction(tree_root, 0);
6984         if (IS_ERR(trans)) {
6985                 err = PTR_ERR(trans);
6986                 goto out_free;
6987         }
6988
6989         if (block_rsv)
6990                 trans->block_rsv = block_rsv;
6991
6992         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
6993                 level = btrfs_header_level(root->node);
6994                 path->nodes[level] = btrfs_lock_root_node(root);
6995                 btrfs_set_lock_blocking(path->nodes[level]);
6996                 path->slots[level] = 0;
6997                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6998                 memset(&wc->update_progress, 0,
6999                        sizeof(wc->update_progress));
7000         } else {
7001                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7002                 memcpy(&wc->update_progress, &key,
7003                        sizeof(wc->update_progress));
7004
7005                 level = root_item->drop_level;
7006                 BUG_ON(level == 0);
7007                 path->lowest_level = level;
7008                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7009                 path->lowest_level = 0;
7010                 if (ret < 0) {
7011                         err = ret;
7012                         goto out_end_trans;
7013                 }
7014                 WARN_ON(ret > 0);
7015
7016                 /*
7017                  * unlock our path, this is safe because only this
7018                  * function is allowed to delete this snapshot
7019                  */
7020                 btrfs_unlock_up_safe(path, 0);
7021
7022                 level = btrfs_header_level(root->node);
7023                 while (1) {
7024                         btrfs_tree_lock(path->nodes[level]);
7025                         btrfs_set_lock_blocking(path->nodes[level]);
7026
7027                         ret = btrfs_lookup_extent_info(trans, root,
7028                                                 path->nodes[level]->start,
7029                                                 path->nodes[level]->len,
7030                                                 &wc->refs[level],
7031                                                 &wc->flags[level]);
7032                         if (ret < 0) {
7033                                 err = ret;
7034                                 goto out_end_trans;
7035                         }
7036                         BUG_ON(wc->refs[level] == 0);
7037
7038                         if (level == root_item->drop_level)
7039                                 break;
7040
7041                         btrfs_tree_unlock(path->nodes[level]);
7042                         WARN_ON(wc->refs[level] != 1);
7043                         level--;
7044                 }
7045         }
7046
7047         wc->level = level;
7048         wc->shared_level = -1;
7049         wc->stage = DROP_REFERENCE;
7050         wc->update_ref = update_ref;
7051         wc->keep_locks = 0;
7052         wc->for_reloc = for_reloc;
7053         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7054
7055         while (1) {
7056                 ret = walk_down_tree(trans, root, path, wc);
7057                 if (ret < 0) {
7058                         err = ret;
7059                         break;
7060                 }
7061
7062                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7063                 if (ret < 0) {
7064                         err = ret;
7065                         break;
7066                 }
7067
7068                 if (ret > 0) {
7069                         BUG_ON(wc->stage != DROP_REFERENCE);
7070                         break;
7071                 }
7072
7073                 if (wc->stage == DROP_REFERENCE) {
7074                         level = wc->level;
7075                         btrfs_node_key(path->nodes[level],
7076                                        &root_item->drop_progress,
7077                                        path->slots[level]);
7078                         root_item->drop_level = level;
7079                 }
7080
7081                 BUG_ON(wc->level == 0);
7082                 if (btrfs_should_end_transaction(trans, tree_root)) {
7083                         ret = btrfs_update_root(trans, tree_root,
7084                                                 &root->root_key,
7085                                                 root_item);
7086                         if (ret) {
7087                                 btrfs_abort_transaction(trans, tree_root, ret);
7088                                 err = ret;
7089                                 goto out_end_trans;
7090                         }
7091
7092                         btrfs_end_transaction_throttle(trans, tree_root);
7093                         trans = btrfs_start_transaction(tree_root, 0);
7094                         if (IS_ERR(trans)) {
7095                                 err = PTR_ERR(trans);
7096                                 goto out_free;
7097                         }
7098                         if (block_rsv)
7099                                 trans->block_rsv = block_rsv;
7100                 }
7101         }
7102         btrfs_release_path(path);
7103         if (err)
7104                 goto out_end_trans;
7105
7106         ret = btrfs_del_root(trans, tree_root, &root->root_key);
7107         if (ret) {
7108                 btrfs_abort_transaction(trans, tree_root, ret);
7109                 goto out_end_trans;
7110         }
7111
7112         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7113                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7114                                            NULL, NULL);
7115                 if (ret < 0) {
7116                         btrfs_abort_transaction(trans, tree_root, ret);
7117                         err = ret;
7118                         goto out_end_trans;
7119                 } else if (ret > 0) {
7120                         /* if we fail to delete the orphan item this time
7121                          * around, it'll get picked up the next time.
7122                          *
7123                          * The most common failure here is just -ENOENT.
7124                          */
7125                         btrfs_del_orphan_item(trans, tree_root,
7126                                               root->root_key.objectid);
7127                 }
7128         }
7129
7130         if (root->in_radix) {
7131                 btrfs_free_fs_root(tree_root->fs_info, root);
7132         } else {
7133                 free_extent_buffer(root->node);
7134                 free_extent_buffer(root->commit_root);
7135                 kfree(root);
7136         }
7137 out_end_trans:
7138         btrfs_end_transaction_throttle(trans, tree_root);
7139 out_free:
7140         kfree(wc);
7141         btrfs_free_path(path);
7142 out:
7143         if (err)
7144                 btrfs_std_error(root->fs_info, err);
7145         return err;
7146 }
7147
7148 /*
7149  * drop subtree rooted at tree block 'node'.
7150  *
7151  * NOTE: this function will unlock and release tree block 'node'.
7152  * only used by the relocation code.
7153  */
7154 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7155                         struct btrfs_root *root,
7156                         struct extent_buffer *node,
7157                         struct extent_buffer *parent)
7158 {
7159         struct btrfs_path *path;
7160         struct walk_control *wc;
7161         int level;
7162         int parent_level;
7163         int ret = 0;
7164         int wret;
7165
7166         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7167
7168         path = btrfs_alloc_path();
7169         if (!path)
7170                 return -ENOMEM;
7171
7172         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7173         if (!wc) {
7174                 btrfs_free_path(path);
7175                 return -ENOMEM;
7176         }
7177
7178         btrfs_assert_tree_locked(parent);
7179         parent_level = btrfs_header_level(parent);
7180         extent_buffer_get(parent);
7181         path->nodes[parent_level] = parent;
7182         path->slots[parent_level] = btrfs_header_nritems(parent);
7183
7184         btrfs_assert_tree_locked(node);
7185         level = btrfs_header_level(node);
7186         path->nodes[level] = node;
7187         path->slots[level] = 0;
7188         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7189
7190         wc->refs[parent_level] = 1;
7191         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7192         wc->level = level;
7193         wc->shared_level = -1;
7194         wc->stage = DROP_REFERENCE;
7195         wc->update_ref = 0;
7196         wc->keep_locks = 1;
7197         wc->for_reloc = 1;
7198         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7199
7200         while (1) {
7201                 wret = walk_down_tree(trans, root, path, wc);
7202                 if (wret < 0) {
7203                         ret = wret;
7204                         break;
7205                 }
7206
7207                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7208                 if (wret < 0)
7209                         ret = wret;
7210                 if (wret != 0)
7211                         break;
7212         }
7213
7214         kfree(wc);
7215         btrfs_free_path(path);
7216         return ret;
7217 }
7218
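/*
 * helper to map the block group flags to a profile we can actually use:
 * honor the restripe target if one is set, otherwise fall back to
 * single/DUP when only one device is left and promote DUP to RAID1
 * when more devices are available.
 */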
7219 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7220 {
7221         u64 num_devices;
7222         u64 stripped;
7223
7224         /*
7225          * if restripe for this chunk_type is on, pick the target profile
7226          * and return it, otherwise do the usual balance
7227          */
7228         stripped = get_restripe_target(root->fs_info, flags);
7229         if (stripped)
7230                 return extended_to_chunk(stripped);
7231
7232         /*
7233          * we add in the count of missing devices because we want
7234          * to make sure that any RAID levels on a degraded FS
7235          * continue to be honored.
7236          */
7237         num_devices = root->fs_info->fs_devices->rw_devices +
7238                 root->fs_info->fs_devices->missing_devices;
7239
7240         stripped = BTRFS_BLOCK_GROUP_RAID0 |
7241                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7242
7243         if (num_devices == 1) {
7244                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7245                 stripped = flags & ~stripped;
7246
7247                 /* turn raid0 into single device chunks */
7248                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7249                         return stripped;
7250
7251                 /* turn mirroring into duplication */
7252                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7253                              BTRFS_BLOCK_GROUP_RAID10))
7254                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7255         } else {
7256                 /* they already had raid on here, just return */
7257                 if (flags & stripped)
7258                         return flags;
7259
7260                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7261                 stripped = flags & ~stripped;
7262
7263                 /* switch duplicated blocks with raid1 */
7264                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7265                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7266
7267                 /* this is drive concat, leave it alone */
7268         }
7269
7270         return flags;
7271 }
7272
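/*
 * mark a single block group read-only if the space_info can absorb its
 * unused bytes, keeping a small reserve for metadata and system chunk
 * allocations unless force is set.
 */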
7273 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7274 {
7275         struct btrfs_space_info *sinfo = cache->space_info;
7276         u64 num_bytes;
7277         u64 min_allocable_bytes;
7278         int ret = -ENOSPC;
7279
7280
7281         /*
7282          * We need some metadata space and system metadata space for
7283          * allocating chunks in some corner cases, unless we are forced
7284          * to set the block group readonly.
7285          */
7286         if ((sinfo->flags &
7287              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7288             !force)
7289                 min_allocable_bytes = 1 * 1024 * 1024;
7290         else
7291                 min_allocable_bytes = 0;
7292
7293         spin_lock(&sinfo->lock);
7294         spin_lock(&cache->lock);
7295
7296         if (cache->ro) {
7297                 ret = 0;
7298                 goto out;
7299         }
7300
7301         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7302                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7303
7304         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7305             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7306             min_allocable_bytes <= sinfo->total_bytes) {
7307                 sinfo->bytes_readonly += num_bytes;
7308                 cache->ro = 1;
7309                 ret = 0;
7310         }
7311 out:
7312         spin_unlock(&cache->lock);
7313         spin_unlock(&sinfo->lock);
7314         return ret;
7315 }
7316
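/*
 * make a block group read-only.  if that cannot be done without running
 * the space_info out of room, force allocation of a new chunk and try
 * again.
 */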
7317 int btrfs_set_block_group_ro(struct btrfs_root *root,
7318                              struct btrfs_block_group_cache *cache)
7319
7320 {
7321         struct btrfs_trans_handle *trans;
7322         u64 alloc_flags;
7323         int ret;
7324
7325         BUG_ON(cache->ro);
7326
7327         trans = btrfs_join_transaction(root);
7328         if (IS_ERR(trans))
7329                 return PTR_ERR(trans);
7330
7331         alloc_flags = update_block_group_flags(root, cache->flags);
7332         if (alloc_flags != cache->flags) {
7333                 ret = do_chunk_alloc(trans, root, alloc_flags,
7334                                      CHUNK_ALLOC_FORCE);
7335                 if (ret < 0)
7336                         goto out;
7337         }
7338
7339         ret = set_block_group_ro(cache, 0);
7340         if (!ret)
7341                 goto out;
7342         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7343         ret = do_chunk_alloc(trans, root, alloc_flags,
7344                              CHUNK_ALLOC_FORCE);
7345         if (ret < 0)
7346                 goto out;
7347         ret = set_block_group_ro(cache, 0);
7348 out:
7349         btrfs_end_transaction(trans, root);
7350         return ret;
7351 }
7352
7353 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7354                             struct btrfs_root *root, u64 type)
7355 {
7356         u64 alloc_flags = get_alloc_profile(root, type);
7357         return do_chunk_alloc(trans, root, alloc_flags,
7358                               CHUNK_ALLOC_FORCE);
7359 }
7360
7361 /*
7362  * helper to account the unused space of all the readonly block groups in
7363  * the list. takes mirrors into account.
7364  */
7365 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7366 {
7367         struct btrfs_block_group_cache *block_group;
7368         u64 free_bytes = 0;
7369         int factor;
7370
7371         list_for_each_entry(block_group, groups_list, list) {
7372                 spin_lock(&block_group->lock);
7373
7374                 if (!block_group->ro) {
7375                         spin_unlock(&block_group->lock);
7376                         continue;
7377                 }
7378
7379                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7380                                           BTRFS_BLOCK_GROUP_RAID10 |
7381                                           BTRFS_BLOCK_GROUP_DUP))
7382                         factor = 2;
7383                 else
7384                         factor = 1;
7385
7386                 free_bytes += (block_group->key.offset -
7387                                btrfs_block_group_used(&block_group->item)) *
7388                                factor;
7389
7390                 spin_unlock(&block_group->lock);
7391         }
7392
7393         return free_bytes;
7394 }
7395
7396 /*
7397  * helper to account the unused space of all the readonly block groups in
7398  * the space_info. takes mirrors into account.
7399  */
7400 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7401 {
7402         int i;
7403         u64 free_bytes = 0;
7404
7405         spin_lock(&sinfo->lock);
7406
7407         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7408                 if (!list_empty(&sinfo->block_groups[i]))
7409                         free_bytes += __btrfs_get_ro_block_group_free_space(
7410                                                 &sinfo->block_groups[i]);
7411
7412         spin_unlock(&sinfo->lock);
7413
7414         return free_bytes;
7415 }
7416
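/*
 * undo btrfs_set_block_group_ro: return the unused bytes of the block
 * group to the space_info and clear the ro flag.
 */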
7417 void btrfs_set_block_group_rw(struct btrfs_root *root,
7418                               struct btrfs_block_group_cache *cache)
7419 {
7420         struct btrfs_space_info *sinfo = cache->space_info;
7421         u64 num_bytes;
7422
7423         BUG_ON(!cache->ro);
7424
7425         spin_lock(&sinfo->lock);
7426         spin_lock(&cache->lock);
7427         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7428                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7429         sinfo->bytes_readonly -= num_bytes;
7430         cache->ro = 0;
7431         spin_unlock(&cache->lock);
7432         spin_unlock(&sinfo->lock);
7433 }
7434
7435 /*
7436  * checks to see if it's even possible to relocate this block group.
7437  *
7438  * @return - -1 if it's not a good idea to relocate this block group, 0 if
7439  * it's ok to go ahead and try.
7440  */
7441 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7442 {
7443         struct btrfs_block_group_cache *block_group;
7444         struct btrfs_space_info *space_info;
7445         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7446         struct btrfs_device *device;
7447         u64 min_free;
7448         u64 dev_min = 1;
7449         u64 dev_nr = 0;
7450         u64 target;
7451         int index;
7452         int full = 0;
7453         int ret = 0;
7454
7455         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7456
7457         /* odd, couldn't find the block group, leave it alone */
7458         if (!block_group)
7459                 return -1;
7460
7461         min_free = btrfs_block_group_used(&block_group->item);
7462
7463         /* no bytes used, we're good */
7464         if (!min_free)
7465                 goto out;
7466
7467         space_info = block_group->space_info;
7468         spin_lock(&space_info->lock);
7469
7470         full = space_info->full;
7471
7472         /*
7473          * if this is the last block group we have in this space, we can't
7474          * relocate it unless we're able to allocate a new chunk below.
7475          *
7476          * Otherwise, we need to make sure we have room in the space to handle
7477          * all of the extents from this block group.  If we can, we're good
7478          */
7479         if ((space_info->total_bytes != block_group->key.offset) &&
7480             (space_info->bytes_used + space_info->bytes_reserved +
7481              space_info->bytes_pinned + space_info->bytes_readonly +
7482              min_free < space_info->total_bytes)) {
7483                 spin_unlock(&space_info->lock);
7484                 goto out;
7485         }
7486         spin_unlock(&space_info->lock);
7487
7488         /*
7489          * ok we don't have enough space, but maybe we have free space on our
7490          * devices to allocate new chunks for relocation, so loop through our
7491          * alloc devices and guess if we have enough space.  if this block
7492          * group is going to be restriped, run checks against the target
7493          * profile instead of the current one.
7494          */
7495         ret = -1;
7496
7497         /*
7498          * index:
7499          *      0: raid10
7500          *      1: raid1
7501          *      2: dup
7502          *      3: raid0
7503          *      4: single
7504          */
7505         target = get_restripe_target(root->fs_info, block_group->flags);
7506         if (target) {
7507                 index = __get_raid_index(extended_to_chunk(target));
7508         } else {
7509                 /*
7510                  * this is just a balance, so if we were marked as full
7511                  * we know there is no space for a new chunk
7512                  */
7513                 if (full)
7514                         goto out;
7515
7516                 index = get_block_group_index(block_group);
7517         }
7518
7519         if (index == 0) {
7520                 dev_min = 4;
7521                 /* Divide by 2 */
7522                 min_free >>= 1;
7523         } else if (index == 1) {
7524                 dev_min = 2;
7525         } else if (index == 2) {
7526                 /* Multiply by 2 */
7527                 min_free <<= 1;
7528         } else if (index == 3) {
7529                 dev_min = fs_devices->rw_devices;
7530                 do_div(min_free, dev_min);
7531         }
7532
7533         mutex_lock(&root->fs_info->chunk_mutex);
7534         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7535                 u64 dev_offset;
7536
7537                 /*
7538                  * check to make sure we can actually find a chunk with enough
7539                  * space to fit our block group in.
7540                  */
7541                 if (device->total_bytes > device->bytes_used + min_free &&
7542                     !device->is_tgtdev_for_dev_replace) {
7543                         ret = find_free_dev_extent(device, min_free,
7544                                                    &dev_offset, NULL);
7545                         if (!ret)
7546                                 dev_nr++;
7547
7548                         if (dev_nr >= dev_min)
7549                                 break;
7550
7551                         ret = -1;
7552                 }
7553         }
7554         mutex_unlock(&root->fs_info->chunk_mutex);
7555 out:
7556         btrfs_put_block_group(block_group);
7557         return ret;
7558 }
7559
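/*
 * find the first block group item at or after the given key; on success
 * the path points at the item for the caller to read.
 */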
7560 static int find_first_block_group(struct btrfs_root *root,
7561                 struct btrfs_path *path, struct btrfs_key *key)
7562 {
7563         int ret = 0;
7564         struct btrfs_key found_key;
7565         struct extent_buffer *leaf;
7566         int slot;
7567
7568         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7569         if (ret < 0)
7570                 goto out;
7571
7572         while (1) {
7573                 slot = path->slots[0];
7574                 leaf = path->nodes[0];
7575                 if (slot >= btrfs_header_nritems(leaf)) {
7576                         ret = btrfs_next_leaf(root, path);
7577                         if (ret == 0)
7578                                 continue;
7579                         if (ret < 0)
7580                                 goto out;
7581                         break;
7582                 }
7583                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7584
7585                 if (found_key.objectid >= key->objectid &&
7586                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7587                         ret = 0;
7588                         goto out;
7589                 }
7590                 path->slots[0]++;
7591         }
7592 out:
7593         return ret;
7594 }
7595
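/*
 * drop the inode reference held by every block group for its free space
 * cache inode so the inodes can be released.
 */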
7596 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7597 {
7598         struct btrfs_block_group_cache *block_group;
7599         u64 last = 0;
7600
7601         while (1) {
7602                 struct inode *inode;
7603
7604                 block_group = btrfs_lookup_first_block_group(info, last);
7605                 while (block_group) {
7606                         spin_lock(&block_group->lock);
7607                         if (block_group->iref)
7608                                 break;
7609                         spin_unlock(&block_group->lock);
7610                         block_group = next_block_group(info->tree_root,
7611                                                        block_group);
7612                 }
7613                 if (!block_group) {
7614                         if (last == 0)
7615                                 break;
7616                         last = 0;
7617                         continue;
7618                 }
7619
7620                 inode = block_group->inode;
7621                 block_group->iref = 0;
7622                 block_group->inode = NULL;
7623                 spin_unlock(&block_group->lock);
7624                 iput(inode);
7625                 last = block_group->key.objectid + block_group->key.offset;
7626                 btrfs_put_block_group(block_group);
7627         }
7628 }
7629
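/*
 * tear down every cached block group and space_info structure; only
 * used on unmount, once nothing else can touch them.
 */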
7630 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7631 {
7632         struct btrfs_block_group_cache *block_group;
7633         struct btrfs_space_info *space_info;
7634         struct btrfs_caching_control *caching_ctl;
7635         struct rb_node *n;
7636
7637         down_write(&info->extent_commit_sem);
7638         while (!list_empty(&info->caching_block_groups)) {
7639                 caching_ctl = list_entry(info->caching_block_groups.next,
7640                                          struct btrfs_caching_control, list);
7641                 list_del(&caching_ctl->list);
7642                 put_caching_control(caching_ctl);
7643         }
7644         up_write(&info->extent_commit_sem);
7645
7646         spin_lock(&info->block_group_cache_lock);
7647         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7648                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7649                                        cache_node);
7650                 rb_erase(&block_group->cache_node,
7651                          &info->block_group_cache_tree);
7652                 spin_unlock(&info->block_group_cache_lock);
7653
7654                 down_write(&block_group->space_info->groups_sem);
7655                 list_del(&block_group->list);
7656                 up_write(&block_group->space_info->groups_sem);
7657
7658                 if (block_group->cached == BTRFS_CACHE_STARTED)
7659                         wait_block_group_cache_done(block_group);
7660
7661                 /*
7662                  * We haven't cached this block group, which means we could
7663                  * possibly have excluded extents on this block group.
7664                  */
7665                 if (block_group->cached == BTRFS_CACHE_NO)
7666                         free_excluded_extents(info->extent_root, block_group);
7667
7668                 btrfs_remove_free_space_cache(block_group);
7669                 btrfs_put_block_group(block_group);
7670
7671                 spin_lock(&info->block_group_cache_lock);
7672         }
7673         spin_unlock(&info->block_group_cache_lock);
7674
7675         /* now that all the block groups are freed, go through and
7676          * free all the space_info structs.  This is only called during
7677          * the final stages of unmount, and so we know nobody is
7678          * using them.  We call synchronize_rcu() once before we start,
7679          * just to be on the safe side.
7680          */
7681         synchronize_rcu();
7682
7683         release_global_block_rsv(info);
7684
7685         while (!list_empty(&info->space_info)) {
7686                 space_info = list_entry(info->space_info.next,
7687                                         struct btrfs_space_info,
7688                                         list);
7689                 if (space_info->bytes_pinned > 0 ||
7690                     space_info->bytes_reserved > 0 ||
7691                     space_info->bytes_may_use > 0) {
7692                         WARN_ON(1);
7693                         dump_space_info(space_info, 0, 0);
7694                 }
7695                 list_del(&space_info->list);
7696                 kfree(space_info);
7697         }
7698         return 0;
7699 }
7700
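/* add the block group to its space_info's list for its raid profile */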
7701 static void __link_block_group(struct btrfs_space_info *space_info,
7702                                struct btrfs_block_group_cache *cache)
7703 {
7704         int index = get_block_group_index(cache);
7705
7706         down_write(&space_info->groups_sem);
7707         list_add_tail(&cache->list, &space_info->block_groups[index]);
7708         up_write(&space_info->groups_sem);
7709 }
7710
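/*
 * read all block group items from the extent tree, build the in-memory
 * caches and space_info accounting for them, and mark read-only any
 * group on a readonly chunk as well as un-mirrored groups when mirrored
 * profiles are in use.
 */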
7711 int btrfs_read_block_groups(struct btrfs_root *root)
7712 {
7713         struct btrfs_path *path;
7714         int ret;
7715         struct btrfs_block_group_cache *cache;
7716         struct btrfs_fs_info *info = root->fs_info;
7717         struct btrfs_space_info *space_info;
7718         struct btrfs_key key;
7719         struct btrfs_key found_key;
7720         struct extent_buffer *leaf;
7721         int need_clear = 0;
7722         u64 cache_gen;
7723
7724         root = info->extent_root;
7725         key.objectid = 0;
7726         key.offset = 0;
7727         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7728         path = btrfs_alloc_path();
7729         if (!path)
7730                 return -ENOMEM;
7731         path->reada = 1;
7732
7733         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7734         if (btrfs_test_opt(root, SPACE_CACHE) &&
7735             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7736                 need_clear = 1;
7737         if (btrfs_test_opt(root, CLEAR_CACHE))
7738                 need_clear = 1;
7739
7740         while (1) {
7741                 ret = find_first_block_group(root, path, &key);
7742                 if (ret > 0)
7743                         break;
7744                 if (ret != 0)
7745                         goto error;
7746                 leaf = path->nodes[0];
7747                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7748                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7749                 if (!cache) {
7750                         ret = -ENOMEM;
7751                         goto error;
7752                 }
7753                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7754                                                 GFP_NOFS);
7755                 if (!cache->free_space_ctl) {
7756                         kfree(cache);
7757                         ret = -ENOMEM;
7758                         goto error;
7759                 }
7760
7761                 atomic_set(&cache->count, 1);
7762                 spin_lock_init(&cache->lock);
7763                 cache->fs_info = info;
7764                 INIT_LIST_HEAD(&cache->list);
7765                 INIT_LIST_HEAD(&cache->cluster_list);
7766
7767                 if (need_clear) {
7768                         /*
7769                          * When we mount with old space cache, we need to
7770                          * set BTRFS_DC_CLEAR and set dirty flag.
7771                          *
7772                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7773                          *    truncate the old free space cache inode and
7774                          *    setup a new one.
7775                          * b) Setting 'dirty flag' makes sure that we flush
7776                          *    the new space cache info onto disk.
7777                          */
7778                         cache->disk_cache_state = BTRFS_DC_CLEAR;
7779                         if (btrfs_test_opt(root, SPACE_CACHE))
7780                                 cache->dirty = 1;
7781                 }
7782
7783                 read_extent_buffer(leaf, &cache->item,
7784                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7785                                    sizeof(cache->item));
7786                 memcpy(&cache->key, &found_key, sizeof(found_key));
7787
7788                 key.objectid = found_key.objectid + found_key.offset;
7789                 btrfs_release_path(path);
7790                 cache->flags = btrfs_block_group_flags(&cache->item);
7791                 cache->sectorsize = root->sectorsize;
7792
7793                 btrfs_init_free_space_ctl(cache);
7794
7795                 /*
7796                  * We need to exclude the super stripes now so that the space
7797                  * info has super bytes accounted for, otherwise we'll think
7798                  * we have more space than we actually do.
7799                  */
7800                 exclude_super_stripes(root, cache);
7801
7802                 /*
7803                  * check for two cases, either we are full, and therefore
7804                  * don't need to bother with the caching work since we won't
7805                  * find any space, or we are empty, and we can just add all
7806          * the space in and be done with it.  This saves us _a lot_ of
7807                  * time, particularly in the full case.
7808                  */
7809                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7810                         cache->last_byte_to_unpin = (u64)-1;
7811                         cache->cached = BTRFS_CACHE_FINISHED;
7812                         free_excluded_extents(root, cache);
7813                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7814                         cache->last_byte_to_unpin = (u64)-1;
7815                         cache->cached = BTRFS_CACHE_FINISHED;
7816                         add_new_free_space(cache, root->fs_info,
7817                                            found_key.objectid,
7818                                            found_key.objectid +
7819                                            found_key.offset);
7820                         free_excluded_extents(root, cache);
7821                 }
7822
7823                 ret = update_space_info(info, cache->flags, found_key.offset,
7824                                         btrfs_block_group_used(&cache->item),
7825                                         &space_info);
7826                 BUG_ON(ret); /* -ENOMEM */
7827                 cache->space_info = space_info;
7828                 spin_lock(&cache->space_info->lock);
7829                 cache->space_info->bytes_readonly += cache->bytes_super;
7830                 spin_unlock(&cache->space_info->lock);
7831
7832                 __link_block_group(space_info, cache);
7833
7834                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7835                 BUG_ON(ret); /* Logic error */
7836
7837                 set_avail_alloc_bits(root->fs_info, cache->flags);
7838                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7839                         set_block_group_ro(cache, 1);
7840         }
7841
7842         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7843                 if (!(get_alloc_profile(root, space_info->flags) &
7844                       (BTRFS_BLOCK_GROUP_RAID10 |
7845                        BTRFS_BLOCK_GROUP_RAID1 |
7846                        BTRFS_BLOCK_GROUP_DUP)))
7847                         continue;
7848                 /*
7849                  * avoid allocating from un-mirrored block groups if there
7850                  * are mirrored block groups.
7851                  */
7852                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7853                         set_block_group_ro(cache, 1);
7854                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7855                         set_block_group_ro(cache, 1);
7856         }
7857
7858         init_global_block_rsv(info);
7859         ret = 0;
7860 error:
7861         btrfs_free_path(path);
7862         return ret;
7863 }
7864
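/*
 * insert the block group items that btrfs_make_block_group() queued on
 * trans->new_bgs, aborting the transaction if an insert fails.
 */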
7865 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7866                                        struct btrfs_root *root)
7867 {
7868         struct btrfs_block_group_cache *block_group, *tmp;
7869         struct btrfs_root *extent_root = root->fs_info->extent_root;
7870         struct btrfs_block_group_item item;
7871         struct btrfs_key key;
7872         int ret = 0;
7873
7874         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7875                                  new_bg_list) {
7876                 list_del_init(&block_group->new_bg_list);
7877
7878                 if (ret)
7879                         continue;
7880
7881                 spin_lock(&block_group->lock);
7882                 memcpy(&item, &block_group->item, sizeof(item));
7883                 memcpy(&key, &block_group->key, sizeof(key));
7884                 spin_unlock(&block_group->lock);
7885
7886                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7887                                         sizeof(item));
7888                 if (ret)
7889                         btrfs_abort_transaction(trans, extent_root, ret);
7890         }
7891 }
7892
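/*
 * create the in-memory cache for a freshly allocated chunk and queue it
 * on trans->new_bgs; the block group item itself is inserted later by
 * btrfs_create_pending_block_groups().
 */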
7893 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7894                            struct btrfs_root *root, u64 bytes_used,
7895                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7896                            u64 size)
7897 {
7898         int ret;
7899         struct btrfs_root *extent_root;
7900         struct btrfs_block_group_cache *cache;
7901
7902         extent_root = root->fs_info->extent_root;
7903
7904         root->fs_info->last_trans_log_full_commit = trans->transid;
7905
7906         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7907         if (!cache)
7908                 return -ENOMEM;
7909         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7910                                         GFP_NOFS);
7911         if (!cache->free_space_ctl) {
7912                 kfree(cache);
7913                 return -ENOMEM;
7914         }
7915
7916         cache->key.objectid = chunk_offset;
7917         cache->key.offset = size;
7918         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7919         cache->sectorsize = root->sectorsize;
7920         cache->fs_info = root->fs_info;
7921
7922         atomic_set(&cache->count, 1);
7923         spin_lock_init(&cache->lock);
7924         INIT_LIST_HEAD(&cache->list);
7925         INIT_LIST_HEAD(&cache->cluster_list);
7926         INIT_LIST_HEAD(&cache->new_bg_list);
7927
7928         btrfs_init_free_space_ctl(cache);
7929
7930         btrfs_set_block_group_used(&cache->item, bytes_used);
7931         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7932         cache->flags = type;
7933         btrfs_set_block_group_flags(&cache->item, type);
7934
7935         cache->last_byte_to_unpin = (u64)-1;
7936         cache->cached = BTRFS_CACHE_FINISHED;
7937         exclude_super_stripes(root, cache);
7938
7939         add_new_free_space(cache, root->fs_info, chunk_offset,
7940                            chunk_offset + size);
7941
7942         free_excluded_extents(root, cache);
7943
7944         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7945                                 &cache->space_info);
7946         BUG_ON(ret); /* -ENOMEM */
7947         update_global_block_rsv(root->fs_info);
7948
7949         spin_lock(&cache->space_info->lock);
7950         cache->space_info->bytes_readonly += cache->bytes_super;
7951         spin_unlock(&cache->space_info->lock);
7952
7953         __link_block_group(cache->space_info, cache);
7954
7955         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7956         BUG_ON(ret); /* Logic error */
7957
7958         list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7959
7960         set_avail_alloc_bits(extent_root->fs_info, type);
7961
7962         return 0;
7963 }
7964
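/*
 * clear the extended profile bits for this chunk type from the
 * avail_*_alloc_bits masks, used when the last block group of a profile
 * is removed.
 */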
7965 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7966 {
7967         u64 extra_flags = chunk_to_extended(flags) &
7968                                 BTRFS_EXTENDED_PROFILE_MASK;
7969
7970         if (flags & BTRFS_BLOCK_GROUP_DATA)
7971                 fs_info->avail_data_alloc_bits &= ~extra_flags;
7972         if (flags & BTRFS_BLOCK_GROUP_METADATA)
7973                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
7974         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
7975                 fs_info->avail_system_alloc_bits &= ~extra_flags;
7976 }
7977
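/*
 * remove a read-only block group: drop its free space cache inode,
 * unlink it from the block group cache tree and space_info lists, fix
 * up the space_info accounting and delete its item from the extent
 * tree.
 */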
7978 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
7979                              struct btrfs_root *root, u64 group_start)
7980 {
7981         struct btrfs_path *path;
7982         struct btrfs_block_group_cache *block_group;
7983         struct btrfs_free_cluster *cluster;
7984         struct btrfs_root *tree_root = root->fs_info->tree_root;
7985         struct btrfs_key key;
7986         struct inode *inode;
7987         int ret;
7988         int index;
7989         int factor;
7990
7991         root = root->fs_info->extent_root;
7992
7993         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
7994         BUG_ON(!block_group);
7995         BUG_ON(!block_group->ro);
7996
7997         /*
7998          * Free the reserved super bytes from this block group before
7999          * removing it.
8000          */
8001         free_excluded_extents(root, block_group);
8002
8003         memcpy(&key, &block_group->key, sizeof(key));
8004         index = get_block_group_index(block_group);
8005         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8006                                   BTRFS_BLOCK_GROUP_RAID1 |
8007                                   BTRFS_BLOCK_GROUP_RAID10))
8008                 factor = 2;
8009         else
8010                 factor = 1;
8011
8012         /* make sure this block group isn't part of an allocation cluster */
8013         cluster = &root->fs_info->data_alloc_cluster;
8014         spin_lock(&cluster->refill_lock);
8015         btrfs_return_cluster_to_free_space(block_group, cluster);
8016         spin_unlock(&cluster->refill_lock);
8017
8018         /*
8019          * make sure this block group isn't part of a metadata
8020          * allocation cluster
8021          */
8022         cluster = &root->fs_info->meta_alloc_cluster;
8023         spin_lock(&cluster->refill_lock);
8024         btrfs_return_cluster_to_free_space(block_group, cluster);
8025         spin_unlock(&cluster->refill_lock);
8026
8027         path = btrfs_alloc_path();
8028         if (!path) {
8029                 ret = -ENOMEM;
8030                 goto out;
8031         }
8032
8033         inode = lookup_free_space_inode(tree_root, block_group, path);
8034         if (!IS_ERR(inode)) {
8035                 ret = btrfs_orphan_add(trans, inode);
8036                 if (ret) {
8037                         btrfs_add_delayed_iput(inode);
8038                         goto out;
8039                 }
8040                 clear_nlink(inode);
8041                 /* One for the block group's ref */
8042                 spin_lock(&block_group->lock);
8043                 if (block_group->iref) {
8044                         block_group->iref = 0;
8045                         block_group->inode = NULL;
8046                         spin_unlock(&block_group->lock);
8047                         iput(inode);
8048                 } else {
8049                         spin_unlock(&block_group->lock);
8050                 }
8051                 /* One for our lookup ref */
8052                 btrfs_add_delayed_iput(inode);
8053         }
8054
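             /*
              * delete the free space cache header item for this block group
              * from the tree root, if one exists
              */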
8055         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8056         key.offset = block_group->key.objectid;
8057         key.type = 0;
8058
8059         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8060         if (ret < 0)
8061                 goto out;
8062         if (ret > 0)
8063                 btrfs_release_path(path);
8064         if (ret == 0) {
8065                 ret = btrfs_del_item(trans, tree_root, path);
8066                 if (ret)
8067                         goto out;
8068                 btrfs_release_path(path);
8069         }
8070
8071         spin_lock(&root->fs_info->block_group_cache_lock);
8072         rb_erase(&block_group->cache_node,
8073                  &root->fs_info->block_group_cache_tree);
8074         spin_unlock(&root->fs_info->block_group_cache_lock);
8075
8076         down_write(&block_group->space_info->groups_sem);
8077         /*
8078          * we must use list_del_init so callers can check whether the
8079          * block group is still on the list after taking the semaphore
8080          */
8081         list_del_init(&block_group->list);
8082         if (list_empty(&block_group->space_info->block_groups[index]))
8083                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8084         up_write(&block_group->space_info->groups_sem);
8085
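             /* wait for any in-flight caching of this block group to finish */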
8086         if (block_group->cached == BTRFS_CACHE_STARTED)
8087                 wait_block_group_cache_done(block_group);
8088
8089         btrfs_remove_free_space_cache(block_group);
8090
8091         spin_lock(&block_group->space_info->lock);
8092         block_group->space_info->total_bytes -= block_group->key.offset;
8093         block_group->space_info->bytes_readonly -= block_group->key.offset;
8094         block_group->space_info->disk_total -= block_group->key.offset * factor;
8095         spin_unlock(&block_group->space_info->lock);
8096
8097         memcpy(&key, &block_group->key, sizeof(key));
8098
8099         btrfs_clear_space_info_full(root->fs_info);
8100
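             /*
              * drop two references: one taken by btrfs_lookup_block_group()
              * above and one held by the block group cache rb-tree we just
              * erased this group from
              */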
8101         btrfs_put_block_group(block_group);
8102         btrfs_put_block_group(block_group);
8103
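             /* finally, delete the block group item itself from the extent tree */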
8104         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8105         if (ret > 0)
8106                 ret = -EIO;
8107         if (ret < 0)
8108                 goto out;
8109
8110         ret = btrfs_del_item(trans, root, path);
8111 out:
8112         btrfs_free_path(path);
8113         return ret;
8114 }
8115
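     /*
      * Create the initial empty space_info structures: SYSTEM plus either a
      * single mixed DATA|METADATA entry or separate METADATA and DATA
      * entries, depending on the MIXED_GROUPS incompat flag.
      */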
8116 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8117 {
8118         struct btrfs_space_info *space_info;
8119         struct btrfs_super_block *disk_super;
8120         u64 features;
8121         u64 flags;
8122         int mixed = 0;
8123         int ret;
8124
8125         disk_super = fs_info->super_copy;
8126         if (!btrfs_super_root(disk_super))
8127                 return 1;
8128
8129         features = btrfs_super_incompat_flags(disk_super);
8130         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8131                 mixed = 1;
8132
8133         flags = BTRFS_BLOCK_GROUP_SYSTEM;
8134         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8135         if (ret)
8136                 goto out;
8137
8138         if (mixed) {
8139                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8140                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8141         } else {
8142                 flags = BTRFS_BLOCK_GROUP_METADATA;
8143                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8144                 if (ret)
8145                         goto out;
8146
8147                 flags = BTRFS_BLOCK_GROUP_DATA;
8148                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8149         }
8150 out:
8151         return ret;
8152 }
8153
8154 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8155 {
8156         return unpin_extent_range(root, start, end);
8157 }
8158
8159 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8160                                u64 num_bytes, u64 *actual_bytes)
8161 {
8162         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8163 }
8164
8165 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8166 {
8167         struct btrfs_fs_info *fs_info = root->fs_info;
8168         struct btrfs_block_group_cache *cache = NULL;
8169         u64 group_trimmed;
8170         u64 start;
8171         u64 end;
8172         u64 trimmed = 0;
8173         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8174         int ret = 0;
8175
8176         /*
8177          * try to trim all FS space; the first block group may start at a non-zero offset.
8178          */
8179         if (range->len == total_bytes)
8180                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
8181         else
8182                 cache = btrfs_lookup_block_group(fs_info, range->start);
8183
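             /*
              * walk every block group that overlaps the requested range and
              * trim the free space inside it
              */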
8184         while (cache) {
8185                 if (cache->key.objectid >= (range->start + range->len)) {
8186                         btrfs_put_block_group(cache);
8187                         break;
8188                 }
8189
8190                 start = max(range->start, cache->key.objectid);
8191                 end = min(range->start + range->len,
8192                                 cache->key.objectid + cache->key.offset);
8193
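                     /*
                      * skip block groups whose overlap with the range is
                      * smaller than minlen; make sure the free space cache is
                      * populated before trimming
                      */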
8194                 if (end - start >= range->minlen) {
8195                         if (!block_group_cache_done(cache)) {
8196                                 ret = cache_block_group(cache, NULL, root, 0);
8197                                 if (!ret)
8198                                         wait_block_group_cache_done(cache);
8199                         }
8200                         ret = btrfs_trim_block_group(cache,
8201                                                      &group_trimmed,
8202                                                      start,
8203                                                      end,
8204                                                      range->minlen);
8205
8206                         trimmed += group_trimmed;
8207                         if (ret) {
8208                                 btrfs_put_block_group(cache);
8209                                 break;
8210                         }
8211                 }
8212
8213                 cache = next_block_group(fs_info->tree_root, cache);
8214         }
8215
8216         range->len = trimmed;
8217         return ret;
8218 }