]> rtime.felk.cvut.cz Git - linux-imx.git/blob - fs/btrfs/extent-tree.c
Btrfs: rework the overcommit logic to be based on the total size
[linux-imx.git] / fs / btrfs / extent-tree.c
1 /*
2  * Copyright (C) 2007 Oracle.  All rights reserved.
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public
6  * License v2 as published by the Free Software Foundation.
7  *
8  * This program is distributed in the hope that it will be useful,
9  * but WITHOUT ANY WARRANTY; without even the implied warranty of
10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11  * General Public License for more details.
12  *
13  * You should have received a copy of the GNU General Public
14  * License along with this program; if not, write to the
15  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
16  * Boston, MA 02111-1307, USA.
17  */
18 #include <linux/sched.h>
19 #include <linux/pagemap.h>
20 #include <linux/writeback.h>
21 #include <linux/blkdev.h>
22 #include <linux/sort.h>
23 #include <linux/rcupdate.h>
24 #include <linux/kthread.h>
25 #include <linux/slab.h>
26 #include <linux/ratelimit.h>
27 #include "compat.h"
28 #include "hash.h"
29 #include "ctree.h"
30 #include "disk-io.h"
31 #include "print-tree.h"
32 #include "transaction.h"
33 #include "volumes.h"
34 #include "locking.h"
35 #include "free-space-cache.h"
36 #include "math.h"
37
38 #undef SCRAMBLE_DELAYED_REFS
39
/*
 * Values accepted by the 'force' argument of do_chunk_alloc():
 *
 * CHUNK_ALLOC_NO_FORCE - allocate a chunk only if one is really needed.
 *
 * CHUNK_ALLOC_LIMITED  - try to allocate a chunk only while very few chunks
 *                        are allocated.  The clustering code uses this to
 *                        keep a good pool of storage to cluster in without
 *                        filling the FS with empty chunks.
 *
 * CHUNK_ALLOC_FORCE    - a chunk allocation must be attempted.
 */
enum {
	CHUNK_ALLOC_NO_FORCE,	/* 0 */
	CHUNK_ALLOC_LIMITED,	/* 1 */
	CHUNK_ALLOC_FORCE,	/* 2 */
};
59
/*
 * Modes that control how a reservation is handled:
 *
 * RESERVE_FREE             - a reservation is being freed.
 * RESERVE_ALLOC            - space is being allocated and bytes_may_use
 *                            must be updated for ENOSPC accounting.
 * RESERVE_ALLOC_NO_ACCOUNT - space is being allocated but bytes_may_use
 *                            must not be updated, because the ENOSPC
 *                            accounting is done elsewhere.
 */
enum {
	RESERVE_FREE,			/* 0 */
	RESERVE_ALLOC,			/* 1 */
	RESERVE_ALLOC_NO_ACCOUNT,	/* 2 */
};
74
75 static int update_block_group(struct btrfs_root *root,
76                               u64 bytenr, u64 num_bytes, int alloc);
77 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
78                                 struct btrfs_root *root,
79                                 u64 bytenr, u64 num_bytes, u64 parent,
80                                 u64 root_objectid, u64 owner_objectid,
81                                 u64 owner_offset, int refs_to_drop,
82                                 struct btrfs_delayed_extent_op *extra_op);
83 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
84                                     struct extent_buffer *leaf,
85                                     struct btrfs_extent_item *ei);
86 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
87                                       struct btrfs_root *root,
88                                       u64 parent, u64 root_objectid,
89                                       u64 flags, u64 owner, u64 offset,
90                                       struct btrfs_key *ins, int ref_mod);
91 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
92                                      struct btrfs_root *root,
93                                      u64 parent, u64 root_objectid,
94                                      u64 flags, struct btrfs_disk_key *key,
95                                      int level, struct btrfs_key *ins);
96 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
97                           struct btrfs_root *extent_root, u64 flags,
98                           int force);
99 static int find_next_key(struct btrfs_path *path, int level,
100                          struct btrfs_key *key);
101 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
102                             int dump_block_groups);
103 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
104                                        u64 num_bytes, int reserve);
105
106 static noinline int
107 block_group_cache_done(struct btrfs_block_group_cache *cache)
108 {
109         smp_mb();
110         return cache->cached == BTRFS_CACHE_FINISHED;
111 }
112
113 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
114 {
115         return (cache->flags & bits) == bits;
116 }
117
118 static void btrfs_get_block_group(struct btrfs_block_group_cache *cache)
119 {
120         atomic_inc(&cache->count);
121 }
122
123 void btrfs_put_block_group(struct btrfs_block_group_cache *cache)
124 {
125         if (atomic_dec_and_test(&cache->count)) {
126                 WARN_ON(cache->pinned > 0);
127                 WARN_ON(cache->reserved > 0);
128                 kfree(cache->free_space_ctl);
129                 kfree(cache);
130         }
131 }
132
133 /*
134  * this adds the block group to the fs_info rb tree for the block group
135  * cache
136  */
137 static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
138                                 struct btrfs_block_group_cache *block_group)
139 {
140         struct rb_node **p;
141         struct rb_node *parent = NULL;
142         struct btrfs_block_group_cache *cache;
143
144         spin_lock(&info->block_group_cache_lock);
145         p = &info->block_group_cache_tree.rb_node;
146
147         while (*p) {
148                 parent = *p;
149                 cache = rb_entry(parent, struct btrfs_block_group_cache,
150                                  cache_node);
151                 if (block_group->key.objectid < cache->key.objectid) {
152                         p = &(*p)->rb_left;
153                 } else if (block_group->key.objectid > cache->key.objectid) {
154                         p = &(*p)->rb_right;
155                 } else {
156                         spin_unlock(&info->block_group_cache_lock);
157                         return -EEXIST;
158                 }
159         }
160
161         rb_link_node(&block_group->cache_node, parent, p);
162         rb_insert_color(&block_group->cache_node,
163                         &info->block_group_cache_tree);
164
165         if (info->first_logical_byte > block_group->key.objectid)
166                 info->first_logical_byte = block_group->key.objectid;
167
168         spin_unlock(&info->block_group_cache_lock);
169
170         return 0;
171 }
172
173 /*
174  * This will return the block group at or after bytenr if contains is 0, else
175  * it will return the block group that contains the bytenr
176  */
177 static struct btrfs_block_group_cache *
178 block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr,
179                               int contains)
180 {
181         struct btrfs_block_group_cache *cache, *ret = NULL;
182         struct rb_node *n;
183         u64 end, start;
184
185         spin_lock(&info->block_group_cache_lock);
186         n = info->block_group_cache_tree.rb_node;
187
188         while (n) {
189                 cache = rb_entry(n, struct btrfs_block_group_cache,
190                                  cache_node);
191                 end = cache->key.objectid + cache->key.offset - 1;
192                 start = cache->key.objectid;
193
194                 if (bytenr < start) {
195                         if (!contains && (!ret || start < ret->key.objectid))
196                                 ret = cache;
197                         n = n->rb_left;
198                 } else if (bytenr > start) {
199                         if (contains && bytenr <= end) {
200                                 ret = cache;
201                                 break;
202                         }
203                         n = n->rb_right;
204                 } else {
205                         ret = cache;
206                         break;
207                 }
208         }
209         if (ret) {
210                 btrfs_get_block_group(ret);
211                 if (bytenr == 0 && info->first_logical_byte > ret->key.objectid)
212                         info->first_logical_byte = ret->key.objectid;
213         }
214         spin_unlock(&info->block_group_cache_lock);
215
216         return ret;
217 }
218
219 static int add_excluded_extent(struct btrfs_root *root,
220                                u64 start, u64 num_bytes)
221 {
222         u64 end = start + num_bytes - 1;
223         set_extent_bits(&root->fs_info->freed_extents[0],
224                         start, end, EXTENT_UPTODATE, GFP_NOFS);
225         set_extent_bits(&root->fs_info->freed_extents[1],
226                         start, end, EXTENT_UPTODATE, GFP_NOFS);
227         return 0;
228 }
229
230 static void free_excluded_extents(struct btrfs_root *root,
231                                   struct btrfs_block_group_cache *cache)
232 {
233         u64 start, end;
234
235         start = cache->key.objectid;
236         end = start + cache->key.offset - 1;
237
238         clear_extent_bits(&root->fs_info->freed_extents[0],
239                           start, end, EXTENT_UPTODATE, GFP_NOFS);
240         clear_extent_bits(&root->fs_info->freed_extents[1],
241                           start, end, EXTENT_UPTODATE, GFP_NOFS);
242 }
243
244 static int exclude_super_stripes(struct btrfs_root *root,
245                                  struct btrfs_block_group_cache *cache)
246 {
247         u64 bytenr;
248         u64 *logical;
249         int stripe_len;
250         int i, nr, ret;
251
252         if (cache->key.objectid < BTRFS_SUPER_INFO_OFFSET) {
253                 stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->key.objectid;
254                 cache->bytes_super += stripe_len;
255                 ret = add_excluded_extent(root, cache->key.objectid,
256                                           stripe_len);
257                 BUG_ON(ret); /* -ENOMEM */
258         }
259
260         for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
261                 bytenr = btrfs_sb_offset(i);
262                 ret = btrfs_rmap_block(&root->fs_info->mapping_tree,
263                                        cache->key.objectid, bytenr,
264                                        0, &logical, &nr, &stripe_len);
265                 BUG_ON(ret); /* -ENOMEM */
266
267                 while (nr--) {
268                         cache->bytes_super += stripe_len;
269                         ret = add_excluded_extent(root, logical[nr],
270                                                   stripe_len);
271                         BUG_ON(ret); /* -ENOMEM */
272                 }
273
274                 kfree(logical);
275         }
276         return 0;
277 }
278
279 static struct btrfs_caching_control *
280 get_caching_control(struct btrfs_block_group_cache *cache)
281 {
282         struct btrfs_caching_control *ctl;
283
284         spin_lock(&cache->lock);
285         if (cache->cached != BTRFS_CACHE_STARTED) {
286                 spin_unlock(&cache->lock);
287                 return NULL;
288         }
289
290         /* We're loading it the fast way, so we don't have a caching_ctl. */
291         if (!cache->caching_ctl) {
292                 spin_unlock(&cache->lock);
293                 return NULL;
294         }
295
296         ctl = cache->caching_ctl;
297         atomic_inc(&ctl->count);
298         spin_unlock(&cache->lock);
299         return ctl;
300 }
301
302 static void put_caching_control(struct btrfs_caching_control *ctl)
303 {
304         if (atomic_dec_and_test(&ctl->count))
305                 kfree(ctl);
306 }
307
308 /*
309  * this is only called by cache_block_group, since we could have freed extents
310  * we need to check the pinned_extents for any extents that can't be used yet
311  * since their free space will be released as soon as the transaction commits.
312  */
313 static u64 add_new_free_space(struct btrfs_block_group_cache *block_group,
314                               struct btrfs_fs_info *info, u64 start, u64 end)
315 {
316         u64 extent_start, extent_end, size, total_added = 0;
317         int ret;
318
319         while (start < end) {
320                 ret = find_first_extent_bit(info->pinned_extents, start,
321                                             &extent_start, &extent_end,
322                                             EXTENT_DIRTY | EXTENT_UPTODATE,
323                                             NULL);
324                 if (ret)
325                         break;
326
327                 if (extent_start <= start) {
328                         start = extent_end + 1;
329                 } else if (extent_start > start && extent_start < end) {
330                         size = extent_start - start;
331                         total_added += size;
332                         ret = btrfs_add_free_space(block_group, start,
333                                                    size);
334                         BUG_ON(ret); /* -ENOMEM or logic error */
335                         start = extent_end + 1;
336                 } else {
337                         break;
338                 }
339         }
340
341         if (start < end) {
342                 size = end - start;
343                 total_added += size;
344                 ret = btrfs_add_free_space(block_group, start, size);
345                 BUG_ON(ret); /* -ENOMEM or logic error */
346         }
347
348         return total_added;
349 }
350
351 static noinline void caching_thread(struct btrfs_work *work)
352 {
353         struct btrfs_block_group_cache *block_group;
354         struct btrfs_fs_info *fs_info;
355         struct btrfs_caching_control *caching_ctl;
356         struct btrfs_root *extent_root;
357         struct btrfs_path *path;
358         struct extent_buffer *leaf;
359         struct btrfs_key key;
360         u64 total_found = 0;
361         u64 last = 0;
362         u32 nritems;
363         int ret = 0;
364
365         caching_ctl = container_of(work, struct btrfs_caching_control, work);
366         block_group = caching_ctl->block_group;
367         fs_info = block_group->fs_info;
368         extent_root = fs_info->extent_root;
369
370         path = btrfs_alloc_path();
371         if (!path)
372                 goto out;
373
374         last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET);
375
376         /*
377          * We don't want to deadlock with somebody trying to allocate a new
378          * extent for the extent root while also trying to search the extent
379          * root to add free space.  So we skip locking and search the commit
380          * root, since its read-only
381          */
382         path->skip_locking = 1;
383         path->search_commit_root = 1;
384         path->reada = 1;
385
386         key.objectid = last;
387         key.offset = 0;
388         key.type = BTRFS_EXTENT_ITEM_KEY;
389 again:
390         mutex_lock(&caching_ctl->mutex);
391         /* need to make sure the commit_root doesn't disappear */
392         down_read(&fs_info->extent_commit_sem);
393
394         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
395         if (ret < 0)
396                 goto err;
397
398         leaf = path->nodes[0];
399         nritems = btrfs_header_nritems(leaf);
400
401         while (1) {
402                 if (btrfs_fs_closing(fs_info) > 1) {
403                         last = (u64)-1;
404                         break;
405                 }
406
407                 if (path->slots[0] < nritems) {
408                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
409                 } else {
410                         ret = find_next_key(path, 0, &key);
411                         if (ret)
412                                 break;
413
414                         if (need_resched() ||
415                             btrfs_next_leaf(extent_root, path)) {
416                                 caching_ctl->progress = last;
417                                 btrfs_release_path(path);
418                                 up_read(&fs_info->extent_commit_sem);
419                                 mutex_unlock(&caching_ctl->mutex);
420                                 cond_resched();
421                                 goto again;
422                         }
423                         leaf = path->nodes[0];
424                         nritems = btrfs_header_nritems(leaf);
425                         continue;
426                 }
427
428                 if (key.objectid < block_group->key.objectid) {
429                         path->slots[0]++;
430                         continue;
431                 }
432
433                 if (key.objectid >= block_group->key.objectid +
434                     block_group->key.offset)
435                         break;
436
437                 if (key.type == BTRFS_EXTENT_ITEM_KEY) {
438                         total_found += add_new_free_space(block_group,
439                                                           fs_info, last,
440                                                           key.objectid);
441                         last = key.objectid + key.offset;
442
443                         if (total_found > (1024 * 1024 * 2)) {
444                                 total_found = 0;
445                                 wake_up(&caching_ctl->wait);
446                         }
447                 }
448                 path->slots[0]++;
449         }
450         ret = 0;
451
452         total_found += add_new_free_space(block_group, fs_info, last,
453                                           block_group->key.objectid +
454                                           block_group->key.offset);
455         caching_ctl->progress = (u64)-1;
456
457         spin_lock(&block_group->lock);
458         block_group->caching_ctl = NULL;
459         block_group->cached = BTRFS_CACHE_FINISHED;
460         spin_unlock(&block_group->lock);
461
462 err:
463         btrfs_free_path(path);
464         up_read(&fs_info->extent_commit_sem);
465
466         free_excluded_extents(extent_root, block_group);
467
468         mutex_unlock(&caching_ctl->mutex);
469 out:
470         wake_up(&caching_ctl->wait);
471
472         put_caching_control(caching_ctl);
473         btrfs_put_block_group(block_group);
474 }
475
476 static int cache_block_group(struct btrfs_block_group_cache *cache,
477                              int load_cache_only)
478 {
479         DEFINE_WAIT(wait);
480         struct btrfs_fs_info *fs_info = cache->fs_info;
481         struct btrfs_caching_control *caching_ctl;
482         int ret = 0;
483
484         caching_ctl = kzalloc(sizeof(*caching_ctl), GFP_NOFS);
485         if (!caching_ctl)
486                 return -ENOMEM;
487
488         INIT_LIST_HEAD(&caching_ctl->list);
489         mutex_init(&caching_ctl->mutex);
490         init_waitqueue_head(&caching_ctl->wait);
491         caching_ctl->block_group = cache;
492         caching_ctl->progress = cache->key.objectid;
493         atomic_set(&caching_ctl->count, 1);
494         caching_ctl->work.func = caching_thread;
495
496         spin_lock(&cache->lock);
497         /*
498          * This should be a rare occasion, but this could happen I think in the
499          * case where one thread starts to load the space cache info, and then
500          * some other thread starts a transaction commit which tries to do an
501          * allocation while the other thread is still loading the space cache
502          * info.  The previous loop should have kept us from choosing this block
503          * group, but if we've moved to the state where we will wait on caching
504          * block groups we need to first check if we're doing a fast load here,
505          * so we can wait for it to finish, otherwise we could end up allocating
506          * from a block group who's cache gets evicted for one reason or
507          * another.
508          */
509         while (cache->cached == BTRFS_CACHE_FAST) {
510                 struct btrfs_caching_control *ctl;
511
512                 ctl = cache->caching_ctl;
513                 atomic_inc(&ctl->count);
514                 prepare_to_wait(&ctl->wait, &wait, TASK_UNINTERRUPTIBLE);
515                 spin_unlock(&cache->lock);
516
517                 schedule();
518
519                 finish_wait(&ctl->wait, &wait);
520                 put_caching_control(ctl);
521                 spin_lock(&cache->lock);
522         }
523
524         if (cache->cached != BTRFS_CACHE_NO) {
525                 spin_unlock(&cache->lock);
526                 kfree(caching_ctl);
527                 return 0;
528         }
529         WARN_ON(cache->caching_ctl);
530         cache->caching_ctl = caching_ctl;
531         cache->cached = BTRFS_CACHE_FAST;
532         spin_unlock(&cache->lock);
533
534         if (fs_info->mount_opt & BTRFS_MOUNT_SPACE_CACHE) {
535                 ret = load_free_space_cache(fs_info, cache);
536
537                 spin_lock(&cache->lock);
538                 if (ret == 1) {
539                         cache->caching_ctl = NULL;
540                         cache->cached = BTRFS_CACHE_FINISHED;
541                         cache->last_byte_to_unpin = (u64)-1;
542                 } else {
543                         if (load_cache_only) {
544                                 cache->caching_ctl = NULL;
545                                 cache->cached = BTRFS_CACHE_NO;
546                         } else {
547                                 cache->cached = BTRFS_CACHE_STARTED;
548                         }
549                 }
550                 spin_unlock(&cache->lock);
551                 wake_up(&caching_ctl->wait);
552                 if (ret == 1) {
553                         put_caching_control(caching_ctl);
554                         free_excluded_extents(fs_info->extent_root, cache);
555                         return 0;
556                 }
557         } else {
558                 /*
559                  * We are not going to do the fast caching, set cached to the
560                  * appropriate value and wakeup any waiters.
561                  */
562                 spin_lock(&cache->lock);
563                 if (load_cache_only) {
564                         cache->caching_ctl = NULL;
565                         cache->cached = BTRFS_CACHE_NO;
566                 } else {
567                         cache->cached = BTRFS_CACHE_STARTED;
568                 }
569                 spin_unlock(&cache->lock);
570                 wake_up(&caching_ctl->wait);
571         }
572
573         if (load_cache_only) {
574                 put_caching_control(caching_ctl);
575                 return 0;
576         }
577
578         down_write(&fs_info->extent_commit_sem);
579         atomic_inc(&caching_ctl->count);
580         list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
581         up_write(&fs_info->extent_commit_sem);
582
583         btrfs_get_block_group(cache);
584
585         btrfs_queue_worker(&fs_info->caching_workers, &caching_ctl->work);
586
587         return ret;
588 }
589
590 /*
591  * return the block group that starts at or after bytenr
592  */
593 static struct btrfs_block_group_cache *
594 btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr)
595 {
596         struct btrfs_block_group_cache *cache;
597
598         cache = block_group_cache_tree_search(info, bytenr, 0);
599
600         return cache;
601 }
602
603 /*
604  * return the block group that contains the given bytenr
605  */
606 struct btrfs_block_group_cache *btrfs_lookup_block_group(
607                                                  struct btrfs_fs_info *info,
608                                                  u64 bytenr)
609 {
610         struct btrfs_block_group_cache *cache;
611
612         cache = block_group_cache_tree_search(info, bytenr, 1);
613
614         return cache;
615 }
616
617 static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info,
618                                                   u64 flags)
619 {
620         struct list_head *head = &info->space_info;
621         struct btrfs_space_info *found;
622
623         flags &= BTRFS_BLOCK_GROUP_TYPE_MASK;
624
625         rcu_read_lock();
626         list_for_each_entry_rcu(found, head, list) {
627                 if (found->flags & flags) {
628                         rcu_read_unlock();
629                         return found;
630                 }
631         }
632         rcu_read_unlock();
633         return NULL;
634 }
635
636 /*
637  * after adding space to the filesystem, we need to clear the full flags
638  * on all the space infos.
639  */
640 void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
641 {
642         struct list_head *head = &info->space_info;
643         struct btrfs_space_info *found;
644
645         rcu_read_lock();
646         list_for_each_entry_rcu(found, head, list)
647                 found->full = 0;
648         rcu_read_unlock();
649 }
650
651 u64 btrfs_find_block_group(struct btrfs_root *root,
652                            u64 search_start, u64 search_hint, int owner)
653 {
654         struct btrfs_block_group_cache *cache;
655         u64 used;
656         u64 last = max(search_hint, search_start);
657         u64 group_start = 0;
658         int full_search = 0;
659         int factor = 9;
660         int wrapped = 0;
661 again:
662         while (1) {
663                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
664                 if (!cache)
665                         break;
666
667                 spin_lock(&cache->lock);
668                 last = cache->key.objectid + cache->key.offset;
669                 used = btrfs_block_group_used(&cache->item);
670
671                 if ((full_search || !cache->ro) &&
672                     block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) {
673                         if (used + cache->pinned + cache->reserved <
674                             div_factor(cache->key.offset, factor)) {
675                                 group_start = cache->key.objectid;
676                                 spin_unlock(&cache->lock);
677                                 btrfs_put_block_group(cache);
678                                 goto found;
679                         }
680                 }
681                 spin_unlock(&cache->lock);
682                 btrfs_put_block_group(cache);
683                 cond_resched();
684         }
685         if (!wrapped) {
686                 last = search_start;
687                 wrapped = 1;
688                 goto again;
689         }
690         if (!full_search && factor < 10) {
691                 last = search_start;
692                 full_search = 1;
693                 factor = 10;
694                 goto again;
695         }
696 found:
697         return group_start;
698 }
699
700 /* simple helper to search for an existing extent at a given offset */
701 int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len)
702 {
703         int ret;
704         struct btrfs_key key;
705         struct btrfs_path *path;
706
707         path = btrfs_alloc_path();
708         if (!path)
709                 return -ENOMEM;
710
711         key.objectid = start;
712         key.offset = len;
713         btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
714         ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path,
715                                 0, 0);
716         btrfs_free_path(path);
717         return ret;
718 }
719
720 /*
721  * helper function to lookup reference count and flags of extent.
722  *
723  * the head node for delayed ref is used to store the sum of all the
724  * reference count modifications queued up in the rbtree. the head
725  * node may also store the extent flags to set. This way you can check
726  * to see what the reference count and extent flags would be if all of
727  * the delayed refs are not processed.
728  */
729 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
730                              struct btrfs_root *root, u64 bytenr,
731                              u64 num_bytes, u64 *refs, u64 *flags)
732 {
733         struct btrfs_delayed_ref_head *head;
734         struct btrfs_delayed_ref_root *delayed_refs;
735         struct btrfs_path *path;
736         struct btrfs_extent_item *ei;
737         struct extent_buffer *leaf;
738         struct btrfs_key key;
739         u32 item_size;
740         u64 num_refs;
741         u64 extent_flags;
742         int ret;
743
744         path = btrfs_alloc_path();
745         if (!path)
746                 return -ENOMEM;
747
748         key.objectid = bytenr;
749         key.type = BTRFS_EXTENT_ITEM_KEY;
750         key.offset = num_bytes;
751         if (!trans) {
752                 path->skip_locking = 1;
753                 path->search_commit_root = 1;
754         }
755 again:
756         ret = btrfs_search_slot(trans, root->fs_info->extent_root,
757                                 &key, path, 0, 0);
758         if (ret < 0)
759                 goto out_free;
760
761         if (ret == 0) {
762                 leaf = path->nodes[0];
763                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
764                 if (item_size >= sizeof(*ei)) {
765                         ei = btrfs_item_ptr(leaf, path->slots[0],
766                                             struct btrfs_extent_item);
767                         num_refs = btrfs_extent_refs(leaf, ei);
768                         extent_flags = btrfs_extent_flags(leaf, ei);
769                 } else {
770 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
771                         struct btrfs_extent_item_v0 *ei0;
772                         BUG_ON(item_size != sizeof(*ei0));
773                         ei0 = btrfs_item_ptr(leaf, path->slots[0],
774                                              struct btrfs_extent_item_v0);
775                         num_refs = btrfs_extent_refs_v0(leaf, ei0);
776                         /* FIXME: this isn't correct for data */
777                         extent_flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
778 #else
779                         BUG();
780 #endif
781                 }
782                 BUG_ON(num_refs == 0);
783         } else {
784                 num_refs = 0;
785                 extent_flags = 0;
786                 ret = 0;
787         }
788
789         if (!trans)
790                 goto out;
791
792         delayed_refs = &trans->transaction->delayed_refs;
793         spin_lock(&delayed_refs->lock);
794         head = btrfs_find_delayed_ref_head(trans, bytenr);
795         if (head) {
796                 if (!mutex_trylock(&head->mutex)) {
797                         atomic_inc(&head->node.refs);
798                         spin_unlock(&delayed_refs->lock);
799
800                         btrfs_release_path(path);
801
802                         /*
803                          * Mutex was contended, block until it's released and try
804                          * again
805                          */
806                         mutex_lock(&head->mutex);
807                         mutex_unlock(&head->mutex);
808                         btrfs_put_delayed_ref(&head->node);
809                         goto again;
810                 }
811                 if (head->extent_op && head->extent_op->update_flags)
812                         extent_flags |= head->extent_op->flags_to_set;
813                 else
814                         BUG_ON(num_refs == 0);
815
816                 num_refs += head->node.ref_mod;
817                 mutex_unlock(&head->mutex);
818         }
819         spin_unlock(&delayed_refs->lock);
820 out:
821         WARN_ON(num_refs == 0);
822         if (refs)
823                 *refs = num_refs;
824         if (flags)
825                 *flags = extent_flags;
826 out_free:
827         btrfs_free_path(path);
828         return ret;
829 }
830
831 /*
832  * Back reference rules.  Back refs have three main goals:
833  *
834  * 1) differentiate between all holders of references to an extent so that
835  *    when a reference is dropped we can make sure it was a valid reference
836  *    before freeing the extent.
837  *
838  * 2) Provide enough information to quickly find the holders of an extent
839  *    if we notice a given block is corrupted or bad.
840  *
841  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
842  *    maintenance.  This is actually the same as #2, but with a slightly
843  *    different use case.
844  *
845  * There are two kinds of back refs. The implicit back refs is optimized
846  * for pointers in non-shared tree blocks. For a given pointer in a block,
847  * back refs of this kind provide information about the block's owner tree
848  * and the pointer's key. This information allows us to find the block by
849  * b-tree searching. The full back refs is for pointers in tree blocks not
850  * referenced by their owner trees. The location of tree block is recorded
851  * in the back refs. Actually the full back refs is generic, and can be
852  * used in all cases the implicit back refs is used. The major shortcoming
853  * of the full back refs is its overhead. Every time a tree block gets
854  * COWed, we have to update back refs entry for all pointers in it.
855  *
856  * For a newly allocated tree block, we use implicit back refs for
857  * pointers in it. This means most tree related operations only involve
858  * implicit back refs. For a tree block created in old transaction, the
859  * only way to drop a reference to it is COW it. So we can detect the
860  * event that tree block loses its owner tree's reference and do the
861  * back refs conversion.
862  *
863  * When a tree block is COW'd through a tree, there are four cases:
864  *
865  * The reference count of the block is one and the tree is the block's
866  * owner tree. Nothing to do in this case.
867  *
868  * The reference count of the block is one and the tree is not the
869  * block's owner tree. In this case, full back refs is used for pointers
870  * in the block. Remove these full back refs, add implicit back refs for
871  * every pointer in the new block.
872  *
873  * The reference count of the block is greater than one and the tree is
874  * the block's owner tree. In this case, implicit back refs is used for
875  * pointers in the block. Add full back refs for every pointer in the
876  * block, increase lower level extents' reference counts. The original
877  * implicit back refs are entailed to the new block.
878  *
879  * The reference count of the block is greater than one and the tree is
880  * not the block's owner tree. Add implicit back refs for every pointer in
881  * the new block, increase lower level extents' reference count.
882  *
883  * Back Reference Key composing:
884  *
885  * The key objectid corresponds to the first byte in the extent,
886  * The key type is used to differentiate between types of back refs.
887  * There are different meanings of the key offset for different types
888  * of back refs.
889  *
890  * File extents can be referenced by:
891  *
892  * - multiple snapshots, subvolumes, or different generations in one subvol
893  * - different files inside a single subvolume
894  * - different offsets inside a file (bookend extents in file.c)
895  *
896  * The extent ref structure for the implicit back refs has fields for:
897  *
898  * - Objectid of the subvolume root
899  * - objectid of the file holding the reference
900  * - original offset in the file
901  * - how many bookend extents
902  *
903  * The key offset for the implicit back refs is hash of the first
904  * three fields.
905  *
906  * The extent ref structure for the full back refs has field for:
907  *
908  * - number of pointers in the tree leaf
909  *
910  * The key offset for the full back refs is the first byte of
911  * the tree leaf
912  *
913  * When a file extent is allocated, The implicit back refs is used.
914  * the fields are filled in:
915  *
916  *     (root_key.objectid, inode objectid, offset in file, 1)
917  *
918  * When a file extent is removed by file truncation, we find the
919  * corresponding implicit back refs and check the following fields:
920  *
921  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
922  *
923  * Btree extents can be referenced by:
924  *
925  * - Different subvolumes
926  *
927  * Both the implicit back refs and the full back refs for tree blocks
928  * only consist of key. The key offset for the implicit back refs is
929  * objectid of block's owner tree. The key offset for the full back refs
930  * is the first byte of parent block.
931  *
932  * When implicit back refs is used, information about the lowest key and
933  * level of the tree block are required. This information is stored in
934  * tree block info structure.
935  */
936
#ifdef BTRFS_COMPAT_EXTENT_TREE_V0
/*
 * Rewrite the v0 extent item that path->slots[0] points at as a current
 * format btrfs_extent_item, growing the item in place.
 *
 * @owner:      owner objectid of the extent.  When it is (u64)-1 the owner is
 *              taken from the first BTRFS_EXTENT_REF_V0_KEY item found while
 *              walking forward from the extent item.
 * @extra_size: passed to btrfs_search_slot() on top of the size increase of
 *              the item itself.
 *
 * The path is released and searched again, so on success it points at the
 * converted item.  Returns 0 on success or the negative value returned by
 * btrfs_next_leaf()/btrfs_search_slot(); an inconsistent tree hits BUG_ON().
 */
static int convert_extent_item_v0(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct btrfs_path *path,
				  u64 owner, u32 extra_size)
{
	struct btrfs_extent_item_v0 *old_item;
	struct btrfs_extent_item *new_item;
	struct btrfs_extent_ref_v0 *old_ref;
	struct btrfs_tree_block_info *info;
	struct extent_buffer *eb;
	struct btrfs_key extent_key;
	struct btrfs_key cur_key;
	u64 ref_count;
	u32 grow_by;
	int is_tree_block;
	int ret;

	eb = path->nodes[0];
	BUG_ON(btrfs_item_size_nr(eb, path->slots[0]) != sizeof(*old_item));

	/* remember the key and the ref count before the path is dropped */
	btrfs_item_key_to_cpu(eb, &extent_key, path->slots[0]);
	old_item = btrfs_item_ptr(eb, path->slots[0],
				  struct btrfs_extent_item_v0);
	ref_count = btrfs_extent_refs_v0(eb, old_item);

	if (owner == (u64)-1) {
		/* owner not supplied: scan forward for a v0 ref item */
		for (;;) {
			if (path->slots[0] >= btrfs_header_nritems(eb)) {
				ret = btrfs_next_leaf(root, path);
				if (ret < 0)
					return ret;
				BUG_ON(ret > 0); /* Corruption */
				eb = path->nodes[0];
			}
			btrfs_item_key_to_cpu(eb, &cur_key, path->slots[0]);
			BUG_ON(extent_key.objectid != cur_key.objectid);
			if (cur_key.type == BTRFS_EXTENT_REF_V0_KEY) {
				old_ref = btrfs_item_ptr(eb, path->slots[0],
						struct btrfs_extent_ref_v0);
				owner = btrfs_ref_objectid_v0(eb, old_ref);
				break;
			}
			path->slots[0]++;
		}
	}
	btrfs_release_path(path);

	is_tree_block = owner < BTRFS_FIRST_FREE_OBJECTID;

	/* how much bigger the new item is than the v0 one */
	grow_by = sizeof(*new_item);
	if (is_tree_block)
		grow_by += sizeof(*info);
	grow_by -= sizeof(*old_item);

	ret = btrfs_search_slot(trans, root, &extent_key, path,
				grow_by + extra_size, 1);
	if (ret < 0)
		return ret;
	BUG_ON(ret); /* Corruption */

	btrfs_extend_item(trans, root, path, grow_by);

	eb = path->nodes[0];
	new_item = btrfs_item_ptr(eb, path->slots[0],
				  struct btrfs_extent_item);
	btrfs_set_extent_refs(eb, new_item, ref_count);
	/* FIXME: get real generation */
	btrfs_set_extent_generation(eb, new_item, 0);
	if (!is_tree_block) {
		btrfs_set_extent_flags(eb, new_item, BTRFS_EXTENT_FLAG_DATA);
	} else {
		btrfs_set_extent_flags(eb, new_item,
				       BTRFS_EXTENT_FLAG_TREE_BLOCK |
				       BTRFS_BLOCK_FLAG_FULL_BACKREF);
		/* the tree block info sits right behind the extent item */
		info = (struct btrfs_tree_block_info *)(new_item + 1);
		/* FIXME: get first key of the block */
		memset_extent_buffer(eb, 0, (unsigned long)info,
				     sizeof(*info));
		btrfs_set_tree_block_level(eb, info, (int)owner);
	}
	btrfs_mark_buffer_dirty(eb);
	return 0;
}
#endif
1018
1019 static u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
1020 {
1021         u32 high_crc = ~(u32)0;
1022         u32 low_crc = ~(u32)0;
1023         __le64 lenum;
1024
1025         lenum = cpu_to_le64(root_objectid);
1026         high_crc = crc32c(high_crc, &lenum, sizeof(lenum));
1027         lenum = cpu_to_le64(owner);
1028         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1029         lenum = cpu_to_le64(offset);
1030         low_crc = crc32c(low_crc, &lenum, sizeof(lenum));
1031
1032         return ((u64)high_crc << 31) ^ (u64)low_crc;
1033 }
1034
1035 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
1036                                      struct btrfs_extent_data_ref *ref)
1037 {
1038         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
1039                                     btrfs_extent_data_ref_objectid(leaf, ref),
1040                                     btrfs_extent_data_ref_offset(leaf, ref));
1041 }
1042
1043 static int match_extent_data_ref(struct extent_buffer *leaf,
1044                                  struct btrfs_extent_data_ref *ref,
1045                                  u64 root_objectid, u64 owner, u64 offset)
1046 {
1047         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
1048             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
1049             btrfs_extent_data_ref_offset(leaf, ref) != offset)
1050                 return 0;
1051         return 1;
1052 }
1053
1054 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
1055                                            struct btrfs_root *root,
1056                                            struct btrfs_path *path,
1057                                            u64 bytenr, u64 parent,
1058                                            u64 root_objectid,
1059                                            u64 owner, u64 offset)
1060 {
1061         struct btrfs_key key;
1062         struct btrfs_extent_data_ref *ref;
1063         struct extent_buffer *leaf;
1064         u32 nritems;
1065         int ret;
1066         int recow;
1067         int err = -ENOENT;
1068
1069         key.objectid = bytenr;
1070         if (parent) {
1071                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1072                 key.offset = parent;
1073         } else {
1074                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1075                 key.offset = hash_extent_data_ref(root_objectid,
1076                                                   owner, offset);
1077         }
1078 again:
1079         recow = 0;
1080         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1081         if (ret < 0) {
1082                 err = ret;
1083                 goto fail;
1084         }
1085
1086         if (parent) {
1087                 if (!ret)
1088                         return 0;
1089 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1090                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1091                 btrfs_release_path(path);
1092                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1093                 if (ret < 0) {
1094                         err = ret;
1095                         goto fail;
1096                 }
1097                 if (!ret)
1098                         return 0;
1099 #endif
1100                 goto fail;
1101         }
1102
1103         leaf = path->nodes[0];
1104         nritems = btrfs_header_nritems(leaf);
1105         while (1) {
1106                 if (path->slots[0] >= nritems) {
1107                         ret = btrfs_next_leaf(root, path);
1108                         if (ret < 0)
1109                                 err = ret;
1110                         if (ret)
1111                                 goto fail;
1112
1113                         leaf = path->nodes[0];
1114                         nritems = btrfs_header_nritems(leaf);
1115                         recow = 1;
1116                 }
1117
1118                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1119                 if (key.objectid != bytenr ||
1120                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
1121                         goto fail;
1122
1123                 ref = btrfs_item_ptr(leaf, path->slots[0],
1124                                      struct btrfs_extent_data_ref);
1125
1126                 if (match_extent_data_ref(leaf, ref, root_objectid,
1127                                           owner, offset)) {
1128                         if (recow) {
1129                                 btrfs_release_path(path);
1130                                 goto again;
1131                         }
1132                         err = 0;
1133                         break;
1134                 }
1135                 path->slots[0]++;
1136         }
1137 fail:
1138         return err;
1139 }
1140
1141 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
1142                                            struct btrfs_root *root,
1143                                            struct btrfs_path *path,
1144                                            u64 bytenr, u64 parent,
1145                                            u64 root_objectid, u64 owner,
1146                                            u64 offset, int refs_to_add)
1147 {
1148         struct btrfs_key key;
1149         struct extent_buffer *leaf;
1150         u32 size;
1151         u32 num_refs;
1152         int ret;
1153
1154         key.objectid = bytenr;
1155         if (parent) {
1156                 key.type = BTRFS_SHARED_DATA_REF_KEY;
1157                 key.offset = parent;
1158                 size = sizeof(struct btrfs_shared_data_ref);
1159         } else {
1160                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
1161                 key.offset = hash_extent_data_ref(root_objectid,
1162                                                   owner, offset);
1163                 size = sizeof(struct btrfs_extent_data_ref);
1164         }
1165
1166         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
1167         if (ret && ret != -EEXIST)
1168                 goto fail;
1169
1170         leaf = path->nodes[0];
1171         if (parent) {
1172                 struct btrfs_shared_data_ref *ref;
1173                 ref = btrfs_item_ptr(leaf, path->slots[0],
1174                                      struct btrfs_shared_data_ref);
1175                 if (ret == 0) {
1176                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
1177                 } else {
1178                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
1179                         num_refs += refs_to_add;
1180                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
1181                 }
1182         } else {
1183                 struct btrfs_extent_data_ref *ref;
1184                 while (ret == -EEXIST) {
1185                         ref = btrfs_item_ptr(leaf, path->slots[0],
1186                                              struct btrfs_extent_data_ref);
1187                         if (match_extent_data_ref(leaf, ref, root_objectid,
1188                                                   owner, offset))
1189                                 break;
1190                         btrfs_release_path(path);
1191                         key.offset++;
1192                         ret = btrfs_insert_empty_item(trans, root, path, &key,
1193                                                       size);
1194                         if (ret && ret != -EEXIST)
1195                                 goto fail;
1196
1197                         leaf = path->nodes[0];
1198                 }
1199                 ref = btrfs_item_ptr(leaf, path->slots[0],
1200                                      struct btrfs_extent_data_ref);
1201                 if (ret == 0) {
1202                         btrfs_set_extent_data_ref_root(leaf, ref,
1203                                                        root_objectid);
1204                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
1205                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
1206                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
1207                 } else {
1208                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
1209                         num_refs += refs_to_add;
1210                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
1211                 }
1212         }
1213         btrfs_mark_buffer_dirty(leaf);
1214         ret = 0;
1215 fail:
1216         btrfs_release_path(path);
1217         return ret;
1218 }
1219
1220 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
1221                                            struct btrfs_root *root,
1222                                            struct btrfs_path *path,
1223                                            int refs_to_drop)
1224 {
1225         struct btrfs_key key;
1226         struct btrfs_extent_data_ref *ref1 = NULL;
1227         struct btrfs_shared_data_ref *ref2 = NULL;
1228         struct extent_buffer *leaf;
1229         u32 num_refs = 0;
1230         int ret = 0;
1231
1232         leaf = path->nodes[0];
1233         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1234
1235         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1236                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1237                                       struct btrfs_extent_data_ref);
1238                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1239         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1240                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1241                                       struct btrfs_shared_data_ref);
1242                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1243 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1244         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1245                 struct btrfs_extent_ref_v0 *ref0;
1246                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1247                                       struct btrfs_extent_ref_v0);
1248                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1249 #endif
1250         } else {
1251                 BUG();
1252         }
1253
1254         BUG_ON(num_refs < refs_to_drop);
1255         num_refs -= refs_to_drop;
1256
1257         if (num_refs == 0) {
1258                 ret = btrfs_del_item(trans, root, path);
1259         } else {
1260                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
1261                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
1262                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
1263                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
1264 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1265                 else {
1266                         struct btrfs_extent_ref_v0 *ref0;
1267                         ref0 = btrfs_item_ptr(leaf, path->slots[0],
1268                                         struct btrfs_extent_ref_v0);
1269                         btrfs_set_ref_count_v0(leaf, ref0, num_refs);
1270                 }
1271 #endif
1272                 btrfs_mark_buffer_dirty(leaf);
1273         }
1274         return ret;
1275 }
1276
1277 static noinline u32 extent_data_ref_count(struct btrfs_root *root,
1278                                           struct btrfs_path *path,
1279                                           struct btrfs_extent_inline_ref *iref)
1280 {
1281         struct btrfs_key key;
1282         struct extent_buffer *leaf;
1283         struct btrfs_extent_data_ref *ref1;
1284         struct btrfs_shared_data_ref *ref2;
1285         u32 num_refs = 0;
1286
1287         leaf = path->nodes[0];
1288         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1289         if (iref) {
1290                 if (btrfs_extent_inline_ref_type(leaf, iref) ==
1291                     BTRFS_EXTENT_DATA_REF_KEY) {
1292                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
1293                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1294                 } else {
1295                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
1296                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1297                 }
1298         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
1299                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
1300                                       struct btrfs_extent_data_ref);
1301                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
1302         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
1303                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
1304                                       struct btrfs_shared_data_ref);
1305                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
1306 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1307         } else if (key.type == BTRFS_EXTENT_REF_V0_KEY) {
1308                 struct btrfs_extent_ref_v0 *ref0;
1309                 ref0 = btrfs_item_ptr(leaf, path->slots[0],
1310                                       struct btrfs_extent_ref_v0);
1311                 num_refs = btrfs_ref_count_v0(leaf, ref0);
1312 #endif
1313         } else {
1314                 WARN_ON(1);
1315         }
1316         return num_refs;
1317 }
1318
1319 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
1320                                           struct btrfs_root *root,
1321                                           struct btrfs_path *path,
1322                                           u64 bytenr, u64 parent,
1323                                           u64 root_objectid)
1324 {
1325         struct btrfs_key key;
1326         int ret;
1327
1328         key.objectid = bytenr;
1329         if (parent) {
1330                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1331                 key.offset = parent;
1332         } else {
1333                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1334                 key.offset = root_objectid;
1335         }
1336
1337         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1338         if (ret > 0)
1339                 ret = -ENOENT;
1340 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1341         if (ret == -ENOENT && parent) {
1342                 btrfs_release_path(path);
1343                 key.type = BTRFS_EXTENT_REF_V0_KEY;
1344                 ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1345                 if (ret > 0)
1346                         ret = -ENOENT;
1347         }
1348 #endif
1349         return ret;
1350 }
1351
1352 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
1353                                           struct btrfs_root *root,
1354                                           struct btrfs_path *path,
1355                                           u64 bytenr, u64 parent,
1356                                           u64 root_objectid)
1357 {
1358         struct btrfs_key key;
1359         int ret;
1360
1361         key.objectid = bytenr;
1362         if (parent) {
1363                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
1364                 key.offset = parent;
1365         } else {
1366                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
1367                 key.offset = root_objectid;
1368         }
1369
1370         ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
1371         btrfs_release_path(path);
1372         return ret;
1373 }
1374
1375 static inline int extent_ref_type(u64 parent, u64 owner)
1376 {
1377         int type;
1378         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1379                 if (parent > 0)
1380                         type = BTRFS_SHARED_BLOCK_REF_KEY;
1381                 else
1382                         type = BTRFS_TREE_BLOCK_REF_KEY;
1383         } else {
1384                 if (parent > 0)
1385                         type = BTRFS_SHARED_DATA_REF_KEY;
1386                 else
1387                         type = BTRFS_EXTENT_DATA_REF_KEY;
1388         }
1389         return type;
1390 }
1391
1392 static int find_next_key(struct btrfs_path *path, int level,
1393                          struct btrfs_key *key)
1394
1395 {
1396         for (; level < BTRFS_MAX_LEVEL; level++) {
1397                 if (!path->nodes[level])
1398                         break;
1399                 if (path->slots[level] + 1 >=
1400                     btrfs_header_nritems(path->nodes[level]))
1401                         continue;
1402                 if (level == 0)
1403                         btrfs_item_key_to_cpu(path->nodes[level], key,
1404                                               path->slots[level] + 1);
1405                 else
1406                         btrfs_node_key_to_cpu(path->nodes[level], key,
1407                                               path->slots[level] + 1);
1408                 return 0;
1409         }
1410         return 1;
1411 }
1412
1413 /*
1414  * look for inline back ref. if back ref is found, *ref_ret is set
1415  * to the address of inline back ref, and 0 is returned.
1416  *
1417  * if back ref isn't found, *ref_ret is set to the address where it
1418  * should be inserted, and -ENOENT is returned.
1419  *
1420  * if insert is true and there are too many inline back refs, the path
1421  * points to the extent item, and -EAGAIN is returned.
1422  *
1423  * NOTE: inline back refs are ordered in the same way that back ref
1424  *       items in the tree are ordered.
1425  */
1426 static noinline_for_stack
1427 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
1428                                  struct btrfs_root *root,
1429                                  struct btrfs_path *path,
1430                                  struct btrfs_extent_inline_ref **ref_ret,
1431                                  u64 bytenr, u64 num_bytes,
1432                                  u64 parent, u64 root_objectid,
1433                                  u64 owner, u64 offset, int insert)
1434 {
1435         struct btrfs_key key;
1436         struct extent_buffer *leaf;
1437         struct btrfs_extent_item *ei;
1438         struct btrfs_extent_inline_ref *iref;
1439         u64 flags;
1440         u64 item_size;
1441         unsigned long ptr;
1442         unsigned long end;
1443         int extra_size;
1444         int type;
1445         int want;
1446         int ret;
1447         int err = 0;
1448
1449         key.objectid = bytenr;
1450         key.type = BTRFS_EXTENT_ITEM_KEY;
1451         key.offset = num_bytes;
1452
1453         want = extent_ref_type(parent, owner);
1454         if (insert) {
1455                 extra_size = btrfs_extent_inline_ref_size(want);
1456                 path->keep_locks = 1;
1457         } else
1458                 extra_size = -1;
1459         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
1460         if (ret < 0) {
1461                 err = ret;
1462                 goto out;
1463         }
1464         if (ret && !insert) {
1465                 err = -ENOENT;
1466                 goto out;
1467         }
1468         BUG_ON(ret); /* Corruption */
1469
1470         leaf = path->nodes[0];
1471         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1472 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
1473         if (item_size < sizeof(*ei)) {
1474                 if (!insert) {
1475                         err = -ENOENT;
1476                         goto out;
1477                 }
1478                 ret = convert_extent_item_v0(trans, root, path, owner,
1479                                              extra_size);
1480                 if (ret < 0) {
1481                         err = ret;
1482                         goto out;
1483                 }
1484                 leaf = path->nodes[0];
1485                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1486         }
1487 #endif
1488         BUG_ON(item_size < sizeof(*ei));
1489
1490         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1491         flags = btrfs_extent_flags(leaf, ei);
1492
1493         ptr = (unsigned long)(ei + 1);
1494         end = (unsigned long)ei + item_size;
1495
1496         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
1497                 ptr += sizeof(struct btrfs_tree_block_info);
1498                 BUG_ON(ptr > end);
1499         } else {
1500                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_DATA));
1501         }
1502
1503         err = -ENOENT;
1504         while (1) {
1505                 if (ptr >= end) {
1506                         WARN_ON(ptr > end);
1507                         break;
1508                 }
1509                 iref = (struct btrfs_extent_inline_ref *)ptr;
1510                 type = btrfs_extent_inline_ref_type(leaf, iref);
1511                 if (want < type)
1512                         break;
1513                 if (want > type) {
1514                         ptr += btrfs_extent_inline_ref_size(type);
1515                         continue;
1516                 }
1517
1518                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1519                         struct btrfs_extent_data_ref *dref;
1520                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1521                         if (match_extent_data_ref(leaf, dref, root_objectid,
1522                                                   owner, offset)) {
1523                                 err = 0;
1524                                 break;
1525                         }
1526                         if (hash_extent_data_ref_item(leaf, dref) <
1527                             hash_extent_data_ref(root_objectid, owner, offset))
1528                                 break;
1529                 } else {
1530                         u64 ref_offset;
1531                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
1532                         if (parent > 0) {
1533                                 if (parent == ref_offset) {
1534                                         err = 0;
1535                                         break;
1536                                 }
1537                                 if (ref_offset < parent)
1538                                         break;
1539                         } else {
1540                                 if (root_objectid == ref_offset) {
1541                                         err = 0;
1542                                         break;
1543                                 }
1544                                 if (ref_offset < root_objectid)
1545                                         break;
1546                         }
1547                 }
1548                 ptr += btrfs_extent_inline_ref_size(type);
1549         }
1550         if (err == -ENOENT && insert) {
1551                 if (item_size + extra_size >=
1552                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
1553                         err = -EAGAIN;
1554                         goto out;
1555                 }
1556                 /*
1557                  * To add new inline back ref, we have to make sure
1558                  * there is no corresponding back ref item.
1559                  * For simplicity, we just do not add new inline back
1560                  * ref if there is any kind of item for this block
1561                  */
1562                 if (find_next_key(path, 0, &key) == 0 &&
1563                     key.objectid == bytenr &&
1564                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
1565                         err = -EAGAIN;
1566                         goto out;
1567                 }
1568         }
1569         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1570 out:
1571         if (insert) {
1572                 path->keep_locks = 0;
1573                 btrfs_unlock_up_safe(path, 1);
1574         }
1575         return err;
1576 }
1577
1578 /*
1579  * helper to add new inline back ref
1580  */
1581 static noinline_for_stack
1582 void setup_inline_extent_backref(struct btrfs_trans_handle *trans,
1583                                  struct btrfs_root *root,
1584                                  struct btrfs_path *path,
1585                                  struct btrfs_extent_inline_ref *iref,
1586                                  u64 parent, u64 root_objectid,
1587                                  u64 owner, u64 offset, int refs_to_add,
1588                                  struct btrfs_delayed_extent_op *extent_op)
1589 {
1590         struct extent_buffer *leaf;
1591         struct btrfs_extent_item *ei;
1592         unsigned long ptr;
1593         unsigned long end;
1594         unsigned long item_offset;
1595         u64 refs;
1596         int size;
1597         int type;
1598
1599         leaf = path->nodes[0];
1600         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1601         item_offset = (unsigned long)iref - (unsigned long)ei;
1602
1603         type = extent_ref_type(parent, owner);
1604         size = btrfs_extent_inline_ref_size(type);
1605
1606         btrfs_extend_item(trans, root, path, size);
1607
1608         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1609         refs = btrfs_extent_refs(leaf, ei);
1610         refs += refs_to_add;
1611         btrfs_set_extent_refs(leaf, ei, refs);
1612         if (extent_op)
1613                 __run_delayed_extent_op(extent_op, leaf, ei);
1614
1615         ptr = (unsigned long)ei + item_offset;
1616         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1617         if (ptr < end - size)
1618                 memmove_extent_buffer(leaf, ptr + size, ptr,
1619                                       end - size - ptr);
1620
1621         iref = (struct btrfs_extent_inline_ref *)ptr;
1622         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1623         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1624                 struct btrfs_extent_data_ref *dref;
1625                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1626                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1627                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1628                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1629                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1630         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1631                 struct btrfs_shared_data_ref *sref;
1632                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1633                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1634                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1635         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1636                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1637         } else {
1638                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1639         }
1640         btrfs_mark_buffer_dirty(leaf);
1641 }
1642
1643 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1644                                  struct btrfs_root *root,
1645                                  struct btrfs_path *path,
1646                                  struct btrfs_extent_inline_ref **ref_ret,
1647                                  u64 bytenr, u64 num_bytes, u64 parent,
1648                                  u64 root_objectid, u64 owner, u64 offset)
1649 {
1650         int ret;
1651
1652         ret = lookup_inline_extent_backref(trans, root, path, ref_ret,
1653                                            bytenr, num_bytes, parent,
1654                                            root_objectid, owner, offset, 0);
1655         if (ret != -ENOENT)
1656                 return ret;
1657
1658         btrfs_release_path(path);
1659         *ref_ret = NULL;
1660
1661         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1662                 ret = lookup_tree_block_ref(trans, root, path, bytenr, parent,
1663                                             root_objectid);
1664         } else {
1665                 ret = lookup_extent_data_ref(trans, root, path, bytenr, parent,
1666                                              root_objectid, owner, offset);
1667         }
1668         return ret;
1669 }
1670
1671 /*
1672  * helper to update/remove inline back ref
1673  */
1674 static noinline_for_stack
1675 void update_inline_extent_backref(struct btrfs_trans_handle *trans,
1676                                   struct btrfs_root *root,
1677                                   struct btrfs_path *path,
1678                                   struct btrfs_extent_inline_ref *iref,
1679                                   int refs_to_mod,
1680                                   struct btrfs_delayed_extent_op *extent_op)
1681 {
1682         struct extent_buffer *leaf;
1683         struct btrfs_extent_item *ei;
1684         struct btrfs_extent_data_ref *dref = NULL;
1685         struct btrfs_shared_data_ref *sref = NULL;
1686         unsigned long ptr;
1687         unsigned long end;
1688         u32 item_size;
1689         int size;
1690         int type;
1691         u64 refs;
1692
1693         leaf = path->nodes[0];
1694         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1695         refs = btrfs_extent_refs(leaf, ei);
1696         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1697         refs += refs_to_mod;
1698         btrfs_set_extent_refs(leaf, ei, refs);
1699         if (extent_op)
1700                 __run_delayed_extent_op(extent_op, leaf, ei);
1701
1702         type = btrfs_extent_inline_ref_type(leaf, iref);
1703
1704         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1705                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1706                 refs = btrfs_extent_data_ref_count(leaf, dref);
1707         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1708                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1709                 refs = btrfs_shared_data_ref_count(leaf, sref);
1710         } else {
1711                 refs = 1;
1712                 BUG_ON(refs_to_mod != -1);
1713         }
1714
1715         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1716         refs += refs_to_mod;
1717
1718         if (refs > 0) {
1719                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1720                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1721                 else
1722                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1723         } else {
1724                 size =  btrfs_extent_inline_ref_size(type);
1725                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1726                 ptr = (unsigned long)iref;
1727                 end = (unsigned long)ei + item_size;
1728                 if (ptr + size < end)
1729                         memmove_extent_buffer(leaf, ptr, ptr + size,
1730                                               end - ptr - size);
1731                 item_size -= size;
1732                 btrfs_truncate_item(trans, root, path, item_size, 1);
1733         }
1734         btrfs_mark_buffer_dirty(leaf);
1735 }
1736
1737 static noinline_for_stack
1738 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1739                                  struct btrfs_root *root,
1740                                  struct btrfs_path *path,
1741                                  u64 bytenr, u64 num_bytes, u64 parent,
1742                                  u64 root_objectid, u64 owner,
1743                                  u64 offset, int refs_to_add,
1744                                  struct btrfs_delayed_extent_op *extent_op)
1745 {
1746         struct btrfs_extent_inline_ref *iref;
1747         int ret;
1748
1749         ret = lookup_inline_extent_backref(trans, root, path, &iref,
1750                                            bytenr, num_bytes, parent,
1751                                            root_objectid, owner, offset, 1);
1752         if (ret == 0) {
1753                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1754                 update_inline_extent_backref(trans, root, path, iref,
1755                                              refs_to_add, extent_op);
1756         } else if (ret == -ENOENT) {
1757                 setup_inline_extent_backref(trans, root, path, iref, parent,
1758                                             root_objectid, owner, offset,
1759                                             refs_to_add, extent_op);
1760                 ret = 0;
1761         }
1762         return ret;
1763 }
1764
1765 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1766                                  struct btrfs_root *root,
1767                                  struct btrfs_path *path,
1768                                  u64 bytenr, u64 parent, u64 root_objectid,
1769                                  u64 owner, u64 offset, int refs_to_add)
1770 {
1771         int ret;
1772         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1773                 BUG_ON(refs_to_add != 1);
1774                 ret = insert_tree_block_ref(trans, root, path, bytenr,
1775                                             parent, root_objectid);
1776         } else {
1777                 ret = insert_extent_data_ref(trans, root, path, bytenr,
1778                                              parent, root_objectid,
1779                                              owner, offset, refs_to_add);
1780         }
1781         return ret;
1782 }
1783
1784 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1785                                  struct btrfs_root *root,
1786                                  struct btrfs_path *path,
1787                                  struct btrfs_extent_inline_ref *iref,
1788                                  int refs_to_drop, int is_data)
1789 {
1790         int ret = 0;
1791
1792         BUG_ON(!is_data && refs_to_drop != 1);
1793         if (iref) {
1794                 update_inline_extent_backref(trans, root, path, iref,
1795                                              -refs_to_drop, NULL);
1796         } else if (is_data) {
1797                 ret = remove_extent_data_ref(trans, root, path, refs_to_drop);
1798         } else {
1799                 ret = btrfs_del_item(trans, root, path);
1800         }
1801         return ret;
1802 }
1803
1804 static int btrfs_issue_discard(struct block_device *bdev,
1805                                 u64 start, u64 len)
1806 {
1807         return blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_NOFS, 0);
1808 }
1809
1810 static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
1811                                 u64 num_bytes, u64 *actual_bytes)
1812 {
1813         int ret;
1814         u64 discarded_bytes = 0;
1815         struct btrfs_bio *bbio = NULL;
1816
1817
1818         /* Tell the block device(s) that the sectors can be discarded */
1819         ret = btrfs_map_block(root->fs_info, REQ_DISCARD,
1820                               bytenr, &num_bytes, &bbio, 0);
1821         /* Error condition is -ENOMEM */
1822         if (!ret) {
1823                 struct btrfs_bio_stripe *stripe = bbio->stripes;
1824                 int i;
1825
1826
1827                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1828                         if (!stripe->dev->can_discard)
1829                                 continue;
1830
1831                         ret = btrfs_issue_discard(stripe->dev->bdev,
1832                                                   stripe->physical,
1833                                                   stripe->length);
1834                         if (!ret)
1835                                 discarded_bytes += stripe->length;
1836                         else if (ret != -EOPNOTSUPP)
1837                                 break; /* Logic errors or -ENOMEM, or -EIO but I don't know how that could happen JDM */
1838
1839                         /*
1840                          * Just in case we get back EOPNOTSUPP for some reason,
1841                          * just ignore the return value so we don't screw up
1842                          * people calling discard_extent.
1843                          */
1844                         ret = 0;
1845                 }
1846                 kfree(bbio);
1847         }
1848
1849         if (actual_bytes)
1850                 *actual_bytes = discarded_bytes;
1851
1852
1853         return ret;
1854 }
1855
1856 /* Can return -ENOMEM */
1857 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1858                          struct btrfs_root *root,
1859                          u64 bytenr, u64 num_bytes, u64 parent,
1860                          u64 root_objectid, u64 owner, u64 offset, int for_cow)
1861 {
1862         int ret;
1863         struct btrfs_fs_info *fs_info = root->fs_info;
1864
1865         BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID &&
1866                root_objectid == BTRFS_TREE_LOG_OBJECTID);
1867
1868         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1869                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
1870                                         num_bytes,
1871                                         parent, root_objectid, (int)owner,
1872                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1873         } else {
1874                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
1875                                         num_bytes,
1876                                         parent, root_objectid, owner, offset,
1877                                         BTRFS_ADD_DELAYED_REF, NULL, for_cow);
1878         }
1879         return ret;
1880 }
1881
1882 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1883                                   struct btrfs_root *root,
1884                                   u64 bytenr, u64 num_bytes,
1885                                   u64 parent, u64 root_objectid,
1886                                   u64 owner, u64 offset, int refs_to_add,
1887                                   struct btrfs_delayed_extent_op *extent_op)
1888 {
1889         struct btrfs_path *path;
1890         struct extent_buffer *leaf;
1891         struct btrfs_extent_item *item;
1892         u64 refs;
1893         int ret;
1894         int err = 0;
1895
1896         path = btrfs_alloc_path();
1897         if (!path)
1898                 return -ENOMEM;
1899
1900         path->reada = 1;
1901         path->leave_spinning = 1;
1902         /* this will setup the path even if it fails to insert the back ref */
1903         ret = insert_inline_extent_backref(trans, root->fs_info->extent_root,
1904                                            path, bytenr, num_bytes, parent,
1905                                            root_objectid, owner, offset,
1906                                            refs_to_add, extent_op);
1907         if (ret == 0)
1908                 goto out;
1909
1910         if (ret != -EAGAIN) {
1911                 err = ret;
1912                 goto out;
1913         }
1914
1915         leaf = path->nodes[0];
1916         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1917         refs = btrfs_extent_refs(leaf, item);
1918         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1919         if (extent_op)
1920                 __run_delayed_extent_op(extent_op, leaf, item);
1921
1922         btrfs_mark_buffer_dirty(leaf);
1923         btrfs_release_path(path);
1924
1925         path->reada = 1;
1926         path->leave_spinning = 1;
1927
1928         /* now insert the actual backref */
1929         ret = insert_extent_backref(trans, root->fs_info->extent_root,
1930                                     path, bytenr, parent, root_objectid,
1931                                     owner, offset, refs_to_add);
1932         if (ret)
1933                 btrfs_abort_transaction(trans, root, ret);
1934 out:
1935         btrfs_free_path(path);
1936         return err;
1937 }
1938
1939 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1940                                 struct btrfs_root *root,
1941                                 struct btrfs_delayed_ref_node *node,
1942                                 struct btrfs_delayed_extent_op *extent_op,
1943                                 int insert_reserved)
1944 {
1945         int ret = 0;
1946         struct btrfs_delayed_data_ref *ref;
1947         struct btrfs_key ins;
1948         u64 parent = 0;
1949         u64 ref_root = 0;
1950         u64 flags = 0;
1951
1952         ins.objectid = node->bytenr;
1953         ins.offset = node->num_bytes;
1954         ins.type = BTRFS_EXTENT_ITEM_KEY;
1955
1956         ref = btrfs_delayed_node_to_data_ref(node);
1957         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1958                 parent = ref->parent;
1959         else
1960                 ref_root = ref->root;
1961
1962         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1963                 if (extent_op) {
1964                         BUG_ON(extent_op->update_key);
1965                         flags |= extent_op->flags_to_set;
1966                 }
1967                 ret = alloc_reserved_file_extent(trans, root,
1968                                                  parent, ref_root, flags,
1969                                                  ref->objectid, ref->offset,
1970                                                  &ins, node->ref_mod);
1971         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1972                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
1973                                              node->num_bytes, parent,
1974                                              ref_root, ref->objectid,
1975                                              ref->offset, node->ref_mod,
1976                                              extent_op);
1977         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1978                 ret = __btrfs_free_extent(trans, root, node->bytenr,
1979                                           node->num_bytes, parent,
1980                                           ref_root, ref->objectid,
1981                                           ref->offset, node->ref_mod,
1982                                           extent_op);
1983         } else {
1984                 BUG();
1985         }
1986         return ret;
1987 }
1988
1989 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1990                                     struct extent_buffer *leaf,
1991                                     struct btrfs_extent_item *ei)
1992 {
1993         u64 flags = btrfs_extent_flags(leaf, ei);
1994         if (extent_op->update_flags) {
1995                 flags |= extent_op->flags_to_set;
1996                 btrfs_set_extent_flags(leaf, ei, flags);
1997         }
1998
1999         if (extent_op->update_key) {
2000                 struct btrfs_tree_block_info *bi;
2001                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
2002                 bi = (struct btrfs_tree_block_info *)(ei + 1);
2003                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
2004         }
2005 }
2006
2007 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
2008                                  struct btrfs_root *root,
2009                                  struct btrfs_delayed_ref_node *node,
2010                                  struct btrfs_delayed_extent_op *extent_op)
2011 {
2012         struct btrfs_key key;
2013         struct btrfs_path *path;
2014         struct btrfs_extent_item *ei;
2015         struct extent_buffer *leaf;
2016         u32 item_size;
2017         int ret;
2018         int err = 0;
2019
2020         if (trans->aborted)
2021                 return 0;
2022
2023         path = btrfs_alloc_path();
2024         if (!path)
2025                 return -ENOMEM;
2026
2027         key.objectid = node->bytenr;
2028         key.type = BTRFS_EXTENT_ITEM_KEY;
2029         key.offset = node->num_bytes;
2030
2031         path->reada = 1;
2032         path->leave_spinning = 1;
2033         ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
2034                                 path, 0, 1);
2035         if (ret < 0) {
2036                 err = ret;
2037                 goto out;
2038         }
2039         if (ret > 0) {
2040                 err = -EIO;
2041                 goto out;
2042         }
2043
2044         leaf = path->nodes[0];
2045         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2046 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2047         if (item_size < sizeof(*ei)) {
2048                 ret = convert_extent_item_v0(trans, root->fs_info->extent_root,
2049                                              path, (u64)-1, 0);
2050                 if (ret < 0) {
2051                         err = ret;
2052                         goto out;
2053                 }
2054                 leaf = path->nodes[0];
2055                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2056         }
2057 #endif
2058         BUG_ON(item_size < sizeof(*ei));
2059         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2060         __run_delayed_extent_op(extent_op, leaf, ei);
2061
2062         btrfs_mark_buffer_dirty(leaf);
2063 out:
2064         btrfs_free_path(path);
2065         return err;
2066 }
2067
2068 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
2069                                 struct btrfs_root *root,
2070                                 struct btrfs_delayed_ref_node *node,
2071                                 struct btrfs_delayed_extent_op *extent_op,
2072                                 int insert_reserved)
2073 {
2074         int ret = 0;
2075         struct btrfs_delayed_tree_ref *ref;
2076         struct btrfs_key ins;
2077         u64 parent = 0;
2078         u64 ref_root = 0;
2079
2080         ins.objectid = node->bytenr;
2081         ins.offset = node->num_bytes;
2082         ins.type = BTRFS_EXTENT_ITEM_KEY;
2083
2084         ref = btrfs_delayed_node_to_tree_ref(node);
2085         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2086                 parent = ref->parent;
2087         else
2088                 ref_root = ref->root;
2089
2090         BUG_ON(node->ref_mod != 1);
2091         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
2092                 BUG_ON(!extent_op || !extent_op->update_flags ||
2093                        !extent_op->update_key);
2094                 ret = alloc_reserved_tree_block(trans, root,
2095                                                 parent, ref_root,
2096                                                 extent_op->flags_to_set,
2097                                                 &extent_op->key,
2098                                                 ref->level, &ins);
2099         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
2100                 ret = __btrfs_inc_extent_ref(trans, root, node->bytenr,
2101                                              node->num_bytes, parent, ref_root,
2102                                              ref->level, 0, 1, extent_op);
2103         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
2104                 ret = __btrfs_free_extent(trans, root, node->bytenr,
2105                                           node->num_bytes, parent, ref_root,
2106                                           ref->level, 0, 1, extent_op);
2107         } else {
2108                 BUG();
2109         }
2110         return ret;
2111 }
2112
2113 /* helper function to actually process a single delayed ref entry */
2114 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
2115                                struct btrfs_root *root,
2116                                struct btrfs_delayed_ref_node *node,
2117                                struct btrfs_delayed_extent_op *extent_op,
2118                                int insert_reserved)
2119 {
2120         int ret = 0;
2121
2122         if (trans->aborted)
2123                 return 0;
2124
2125         if (btrfs_delayed_ref_is_head(node)) {
2126                 struct btrfs_delayed_ref_head *head;
2127                 /*
2128                  * we've hit the end of the chain and we were supposed
2129                  * to insert this extent into the tree.  But, it got
2130                  * deleted before we ever needed to insert it, so all
2131                  * we have to do is clean up the accounting
2132                  */
2133                 BUG_ON(extent_op);
2134                 head = btrfs_delayed_node_to_head(node);
2135                 if (insert_reserved) {
2136                         btrfs_pin_extent(root, node->bytenr,
2137                                          node->num_bytes, 1);
2138                         if (head->is_data) {
2139                                 ret = btrfs_del_csums(trans, root,
2140                                                       node->bytenr,
2141                                                       node->num_bytes);
2142                         }
2143                 }
2144                 return ret;
2145         }
2146
2147         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
2148             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
2149                 ret = run_delayed_tree_ref(trans, root, node, extent_op,
2150                                            insert_reserved);
2151         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
2152                  node->type == BTRFS_SHARED_DATA_REF_KEY)
2153                 ret = run_delayed_data_ref(trans, root, node, extent_op,
2154                                            insert_reserved);
2155         else
2156                 BUG();
2157         return ret;
2158 }
2159
2160 static noinline struct btrfs_delayed_ref_node *
2161 select_delayed_ref(struct btrfs_delayed_ref_head *head)
2162 {
2163         struct rb_node *node;
2164         struct btrfs_delayed_ref_node *ref;
2165         int action = BTRFS_ADD_DELAYED_REF;
2166 again:
2167         /*
2168          * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
2169          * this prevents ref count from going down to zero when
2170          * there still are pending delayed ref.
2171          */
2172         node = rb_prev(&head->node.rb_node);
2173         while (1) {
2174                 if (!node)
2175                         break;
2176                 ref = rb_entry(node, struct btrfs_delayed_ref_node,
2177                                 rb_node);
2178                 if (ref->bytenr != head->node.bytenr)
2179                         break;
2180                 if (ref->action == action)
2181                         return ref;
2182                 node = rb_prev(node);
2183         }
2184         if (action == BTRFS_ADD_DELAYED_REF) {
2185                 action = BTRFS_DROP_DELAYED_REF;
2186                 goto again;
2187         }
2188         return NULL;
2189 }
2190
2191 /*
2192  * Returns 0 on success or if called with an already aborted transaction.
2193  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2194  */
2195 static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
2196                                        struct btrfs_root *root,
2197                                        struct list_head *cluster)
2198 {
2199         struct btrfs_delayed_ref_root *delayed_refs;
2200         struct btrfs_delayed_ref_node *ref;
2201         struct btrfs_delayed_ref_head *locked_ref = NULL;
2202         struct btrfs_delayed_extent_op *extent_op;
2203         struct btrfs_fs_info *fs_info = root->fs_info;
2204         int ret;
2205         int count = 0;
2206         int must_insert_reserved = 0;
2207
2208         delayed_refs = &trans->transaction->delayed_refs;
2209         while (1) {
2210                 if (!locked_ref) {
2211                         /* pick a new head ref from the cluster list */
2212                         if (list_empty(cluster))
2213                                 break;
2214
2215                         locked_ref = list_entry(cluster->next,
2216                                      struct btrfs_delayed_ref_head, cluster);
2217
2218                         /* grab the lock that says we are going to process
2219                          * all the refs for this head */
2220                         ret = btrfs_delayed_ref_lock(trans, locked_ref);
2221
2222                         /*
2223                          * we may have dropped the spin lock to get the head
2224                          * mutex lock, and that might have given someone else
2225                          * time to free the head.  If that's true, it has been
2226                          * removed from our list and we can move on.
2227                          */
2228                         if (ret == -EAGAIN) {
2229                                 locked_ref = NULL;
2230                                 count++;
2231                                 continue;
2232                         }
2233                 }
2234
2235                 /*
2236                  * We need to try and merge add/drops of the same ref since we
2237                  * can run into issues with relocate dropping the implicit ref
2238                  * and then it being added back again before the drop can
2239                  * finish.  If we merged anything we need to re-loop so we can
2240                  * get a good ref.
2241                  */
2242                 btrfs_merge_delayed_refs(trans, fs_info, delayed_refs,
2243                                          locked_ref);
2244
2245                 /*
2246                  * locked_ref is the head node, so we have to go one
2247                  * node back for any delayed ref updates
2248                  */
2249                 ref = select_delayed_ref(locked_ref);
2250
2251                 if (ref && ref->seq &&
2252                     btrfs_check_delayed_seq(fs_info, delayed_refs, ref->seq)) {
2253                         /*
2254                          * there are still refs with lower seq numbers in the
2255                          * process of being added. Don't run this ref yet.
2256                          */
2257                         list_del_init(&locked_ref->cluster);
2258                         btrfs_delayed_ref_unlock(locked_ref);
2259                         locked_ref = NULL;
2260                         delayed_refs->num_heads_ready++;
2261                         spin_unlock(&delayed_refs->lock);
2262                         cond_resched();
2263                         spin_lock(&delayed_refs->lock);
2264                         continue;
2265                 }
2266
2267                 /*
2268                  * record the must insert reserved flag before we
2269                  * drop the spin lock.
2270                  */
2271                 must_insert_reserved = locked_ref->must_insert_reserved;
2272                 locked_ref->must_insert_reserved = 0;
2273
2274                 extent_op = locked_ref->extent_op;
2275                 locked_ref->extent_op = NULL;
2276
2277                 if (!ref) {
2278                         /* All delayed refs have been processed, Go ahead
2279                          * and send the head node to run_one_delayed_ref,
2280                          * so that any accounting fixes can happen
2281                          */
2282                         ref = &locked_ref->node;
2283
2284                         if (extent_op && must_insert_reserved) {
2285                                 btrfs_free_delayed_extent_op(extent_op);
2286                                 extent_op = NULL;
2287                         }
2288
2289                         if (extent_op) {
2290                                 spin_unlock(&delayed_refs->lock);
2291
2292                                 ret = run_delayed_extent_op(trans, root,
2293                                                             ref, extent_op);
2294                                 btrfs_free_delayed_extent_op(extent_op);
2295
2296                                 if (ret) {
2297                                         printk(KERN_DEBUG
2298                                                "btrfs: run_delayed_extent_op "
2299                                                "returned %d\n", ret);
2300                                         spin_lock(&delayed_refs->lock);
2301                                         btrfs_delayed_ref_unlock(locked_ref);
2302                                         return ret;
2303                                 }
2304
2305                                 goto next;
2306                         }
2307                 }
2308
2309                 ref->in_tree = 0;
2310                 rb_erase(&ref->rb_node, &delayed_refs->root);
2311                 delayed_refs->num_entries--;
2312                 if (!btrfs_delayed_ref_is_head(ref)) {
2313                         /*
2314                          * when we play the delayed ref, also correct the
2315                          * ref_mod on head
2316                          */
2317                         switch (ref->action) {
2318                         case BTRFS_ADD_DELAYED_REF:
2319                         case BTRFS_ADD_DELAYED_EXTENT:
2320                                 locked_ref->node.ref_mod -= ref->ref_mod;
2321                                 break;
2322                         case BTRFS_DROP_DELAYED_REF:
2323                                 locked_ref->node.ref_mod += ref->ref_mod;
2324                                 break;
2325                         default:
2326                                 WARN_ON(1);
2327                         }
2328                 }
2329                 spin_unlock(&delayed_refs->lock);
2330
2331                 ret = run_one_delayed_ref(trans, root, ref, extent_op,
2332                                           must_insert_reserved);
2333
2334                 btrfs_free_delayed_extent_op(extent_op);
2335                 if (ret) {
2336                         btrfs_delayed_ref_unlock(locked_ref);
2337                         btrfs_put_delayed_ref(ref);
2338                         printk(KERN_DEBUG
2339                                "btrfs: run_one_delayed_ref returned %d\n", ret);
2340                         spin_lock(&delayed_refs->lock);
2341                         return ret;
2342                 }
2343
2344                 /*
2345                  * If this node is a head, that means all the refs in this head
2346                  * have been dealt with, and we will pick the next head to deal
2347                  * with, so we must unlock the head and drop it from the cluster
2348                  * list before we release it.
2349                  */
2350                 if (btrfs_delayed_ref_is_head(ref)) {
2351                         list_del_init(&locked_ref->cluster);
2352                         btrfs_delayed_ref_unlock(locked_ref);
2353                         locked_ref = NULL;
2354                 }
2355                 btrfs_put_delayed_ref(ref);
2356                 count++;
2357 next:
2358                 cond_resched();
2359                 spin_lock(&delayed_refs->lock);
2360         }
2361         return count;
2362 }
2363
2364 #ifdef SCRAMBLE_DELAYED_REFS
2365 /*
2366  * Normally delayed refs get processed in ascending bytenr order. This
2367  * correlates in most cases to the order added. To expose dependencies on this
2368  * order, we start to process the tree in the middle instead of the beginning
2369  */
2370 static u64 find_middle(struct rb_root *root)
2371 {
2372         struct rb_node *n = root->rb_node;
2373         struct btrfs_delayed_ref_node *entry;
2374         int alt = 1;
2375         u64 middle;
2376         u64 first = 0, last = 0;
2377
2378         n = rb_first(root);
2379         if (n) {
2380                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2381                 first = entry->bytenr;
2382         }
2383         n = rb_last(root);
2384         if (n) {
2385                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2386                 last = entry->bytenr;
2387         }
2388         n = root->rb_node;
2389
2390         while (n) {
2391                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2392                 WARN_ON(!entry->in_tree);
2393
2394                 middle = entry->bytenr;
2395
2396                 if (alt)
2397                         n = n->rb_left;
2398                 else
2399                         n = n->rb_right;
2400
2401                 alt = 1 - alt;
2402         }
2403         return middle;
2404 }
2405 #endif
2406
2407 int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans,
2408                                          struct btrfs_fs_info *fs_info)
2409 {
2410         struct qgroup_update *qgroup_update;
2411         int ret = 0;
2412
2413         if (list_empty(&trans->qgroup_ref_list) !=
2414             !trans->delayed_ref_elem.seq) {
2415                 /* list without seq or seq without list */
2416                 printk(KERN_ERR "btrfs: qgroup accounting update error, list is%s empty, seq is %llu\n",
2417                         list_empty(&trans->qgroup_ref_list) ? "" : " not",
2418                         trans->delayed_ref_elem.seq);
2419                 BUG();
2420         }
2421
2422         if (!trans->delayed_ref_elem.seq)
2423                 return 0;
2424
2425         while (!list_empty(&trans->qgroup_ref_list)) {
2426                 qgroup_update = list_first_entry(&trans->qgroup_ref_list,
2427                                                  struct qgroup_update, list);
2428                 list_del(&qgroup_update->list);
2429                 if (!ret)
2430                         ret = btrfs_qgroup_account_ref(
2431                                         trans, fs_info, qgroup_update->node,
2432                                         qgroup_update->extent_op);
2433                 kfree(qgroup_update);
2434         }
2435
2436         btrfs_put_tree_mod_seq(fs_info, &trans->delayed_ref_elem);
2437
2438         return ret;
2439 }
2440
2441 /*
2442  * this starts processing the delayed reference count updates and
2443  * extent insertions we have queued up so far.  count can be
2444  * 0, which means to process everything in the tree at the start
2445  * of the run (but not newly added entries), or it can be some target
2446  * number you'd like to process.
2447  *
2448  * Returns 0 on success or if called with an aborted transaction
2449  * Returns <0 on error and aborts the transaction
2450  */
2451 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2452                            struct btrfs_root *root, unsigned long count)
2453 {
2454         struct rb_node *node;
2455         struct btrfs_delayed_ref_root *delayed_refs;
2456         struct btrfs_delayed_ref_node *ref;
2457         struct list_head cluster;
2458         int ret;
2459         u64 delayed_start;
2460         int run_all = count == (unsigned long)-1;
2461         int run_most = 0;
2462         int loops;
2463
2464         /* We'll clean this up in btrfs_cleanup_transaction */
2465         if (trans->aborted)
2466                 return 0;
2467
2468         if (root == root->fs_info->extent_root)
2469                 root = root->fs_info->tree_root;
2470
2471         btrfs_delayed_refs_qgroup_accounting(trans, root->fs_info);
2472
2473         delayed_refs = &trans->transaction->delayed_refs;
2474         INIT_LIST_HEAD(&cluster);
2475 again:
2476         loops = 0;
2477         spin_lock(&delayed_refs->lock);
2478
2479 #ifdef SCRAMBLE_DELAYED_REFS
2480         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2481 #endif
2482
2483         if (count == 0) {
2484                 count = delayed_refs->num_entries * 2;
2485                 run_most = 1;
2486         }
2487         while (1) {
2488                 if (!(run_all || run_most) &&
2489                     delayed_refs->num_heads_ready < 64)
2490                         break;
2491
2492                 /*
2493                  * go find something we can process in the rbtree.  We start at
2494                  * the beginning of the tree, and then build a cluster
2495                  * of refs to process starting at the first one we are able to
2496                  * lock
2497                  */
2498                 delayed_start = delayed_refs->run_delayed_start;
2499                 ret = btrfs_find_ref_cluster(trans, &cluster,
2500                                              delayed_refs->run_delayed_start);
2501                 if (ret)
2502                         break;
2503
2504                 ret = run_clustered_refs(trans, root, &cluster);
2505                 if (ret < 0) {
2506                         btrfs_release_ref_cluster(&cluster);
2507                         spin_unlock(&delayed_refs->lock);
2508                         btrfs_abort_transaction(trans, root, ret);
2509                         return ret;
2510                 }
2511
2512                 count -= min_t(unsigned long, ret, count);
2513
2514                 if (count == 0)
2515                         break;
2516
2517                 if (delayed_start >= delayed_refs->run_delayed_start) {
2518                         if (loops == 0) {
2519                                 /*
2520                                  * btrfs_find_ref_cluster looped. let's do one
2521                                  * more cycle. if we don't run any delayed ref
2522                                  * during that cycle (because we can't because
2523                                  * all of them are blocked), bail out.
2524                                  */
2525                                 loops = 1;
2526                         } else {
2527                                 /*
2528                                  * no runnable refs left, stop trying
2529                                  */
2530                                 BUG_ON(run_all);
2531                                 break;
2532                         }
2533                 }
2534                 if (ret) {
2535                         /* refs were run, let's reset staleness detection */
2536                         loops = 0;
2537                 }
2538         }
2539
2540         if (run_all) {
2541                 if (!list_empty(&trans->new_bgs)) {
2542                         spin_unlock(&delayed_refs->lock);
2543                         btrfs_create_pending_block_groups(trans, root);
2544                         spin_lock(&delayed_refs->lock);
2545                 }
2546
2547                 node = rb_first(&delayed_refs->root);
2548                 if (!node)
2549                         goto out;
2550                 count = (unsigned long)-1;
2551
2552                 while (node) {
2553                         ref = rb_entry(node, struct btrfs_delayed_ref_node,
2554                                        rb_node);
2555                         if (btrfs_delayed_ref_is_head(ref)) {
2556                                 struct btrfs_delayed_ref_head *head;
2557
2558                                 head = btrfs_delayed_node_to_head(ref);
2559                                 atomic_inc(&ref->refs);
2560
2561                                 spin_unlock(&delayed_refs->lock);
2562                                 /*
2563                                  * Mutex was contended, block until it's
2564                                  * released and try again
2565                                  */
2566                                 mutex_lock(&head->mutex);
2567                                 mutex_unlock(&head->mutex);
2568
2569                                 btrfs_put_delayed_ref(ref);
2570                                 cond_resched();
2571                                 goto again;
2572                         }
2573                         node = rb_next(node);
2574                 }
2575                 spin_unlock(&delayed_refs->lock);
2576                 schedule_timeout(1);
2577                 goto again;
2578         }
2579 out:
2580         spin_unlock(&delayed_refs->lock);
2581         assert_qgroups_uptodate(trans);
2582         return 0;
2583 }
2584
2585 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2586                                 struct btrfs_root *root,
2587                                 u64 bytenr, u64 num_bytes, u64 flags,
2588                                 int is_data)
2589 {
2590         struct btrfs_delayed_extent_op *extent_op;
2591         int ret;
2592
2593         extent_op = btrfs_alloc_delayed_extent_op();
2594         if (!extent_op)
2595                 return -ENOMEM;
2596
2597         extent_op->flags_to_set = flags;
2598         extent_op->update_flags = 1;
2599         extent_op->update_key = 0;
2600         extent_op->is_data = is_data ? 1 : 0;
2601
2602         ret = btrfs_add_delayed_extent_op(root->fs_info, trans, bytenr,
2603                                           num_bytes, extent_op);
2604         if (ret)
2605                 btrfs_free_delayed_extent_op(extent_op);
2606         return ret;
2607 }
2608
2609 static noinline int check_delayed_ref(struct btrfs_trans_handle *trans,
2610                                       struct btrfs_root *root,
2611                                       struct btrfs_path *path,
2612                                       u64 objectid, u64 offset, u64 bytenr)
2613 {
2614         struct btrfs_delayed_ref_head *head;
2615         struct btrfs_delayed_ref_node *ref;
2616         struct btrfs_delayed_data_ref *data_ref;
2617         struct btrfs_delayed_ref_root *delayed_refs;
2618         struct rb_node *node;
2619         int ret = 0;
2620
2621         ret = -ENOENT;
2622         delayed_refs = &trans->transaction->delayed_refs;
2623         spin_lock(&delayed_refs->lock);
2624         head = btrfs_find_delayed_ref_head(trans, bytenr);
2625         if (!head)
2626                 goto out;
2627
2628         if (!mutex_trylock(&head->mutex)) {
2629                 atomic_inc(&head->node.refs);
2630                 spin_unlock(&delayed_refs->lock);
2631
2632                 btrfs_release_path(path);
2633
2634                 /*
2635                  * Mutex was contended, block until it's released and let
2636                  * caller try again
2637                  */
2638                 mutex_lock(&head->mutex);
2639                 mutex_unlock(&head->mutex);
2640                 btrfs_put_delayed_ref(&head->node);
2641                 return -EAGAIN;
2642         }
2643
2644         node = rb_prev(&head->node.rb_node);
2645         if (!node)
2646                 goto out_unlock;
2647
2648         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2649
2650         if (ref->bytenr != bytenr)
2651                 goto out_unlock;
2652
2653         ret = 1;
2654         if (ref->type != BTRFS_EXTENT_DATA_REF_KEY)
2655                 goto out_unlock;
2656
2657         data_ref = btrfs_delayed_node_to_data_ref(ref);
2658
2659         node = rb_prev(node);
2660         if (node) {
2661                 int seq = ref->seq;
2662
2663                 ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
2664                 if (ref->bytenr == bytenr && ref->seq == seq)
2665                         goto out_unlock;
2666         }
2667
2668         if (data_ref->root != root->root_key.objectid ||
2669             data_ref->objectid != objectid || data_ref->offset != offset)
2670                 goto out_unlock;
2671
2672         ret = 0;
2673 out_unlock:
2674         mutex_unlock(&head->mutex);
2675 out:
2676         spin_unlock(&delayed_refs->lock);
2677         return ret;
2678 }
2679
2680 static noinline int check_committed_ref(struct btrfs_trans_handle *trans,
2681                                         struct btrfs_root *root,
2682                                         struct btrfs_path *path,
2683                                         u64 objectid, u64 offset, u64 bytenr)
2684 {
2685         struct btrfs_root *extent_root = root->fs_info->extent_root;
2686         struct extent_buffer *leaf;
2687         struct btrfs_extent_data_ref *ref;
2688         struct btrfs_extent_inline_ref *iref;
2689         struct btrfs_extent_item *ei;
2690         struct btrfs_key key;
2691         u32 item_size;
2692         int ret;
2693
2694         key.objectid = bytenr;
2695         key.offset = (u64)-1;
2696         key.type = BTRFS_EXTENT_ITEM_KEY;
2697
2698         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2699         if (ret < 0)
2700                 goto out;
2701         BUG_ON(ret == 0); /* Corruption */
2702
2703         ret = -ENOENT;
2704         if (path->slots[0] == 0)
2705                 goto out;
2706
2707         path->slots[0]--;
2708         leaf = path->nodes[0];
2709         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2710
2711         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2712                 goto out;
2713
2714         ret = 1;
2715         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2716 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
2717         if (item_size < sizeof(*ei)) {
2718                 WARN_ON(item_size != sizeof(struct btrfs_extent_item_v0));
2719                 goto out;
2720         }
2721 #endif
2722         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2723
2724         if (item_size != sizeof(*ei) +
2725             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2726                 goto out;
2727
2728         if (btrfs_extent_generation(leaf, ei) <=
2729             btrfs_root_last_snapshot(&root->root_item))
2730                 goto out;
2731
2732         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2733         if (btrfs_extent_inline_ref_type(leaf, iref) !=
2734             BTRFS_EXTENT_DATA_REF_KEY)
2735                 goto out;
2736
2737         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2738         if (btrfs_extent_refs(leaf, ei) !=
2739             btrfs_extent_data_ref_count(leaf, ref) ||
2740             btrfs_extent_data_ref_root(leaf, ref) !=
2741             root->root_key.objectid ||
2742             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2743             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2744                 goto out;
2745
2746         ret = 0;
2747 out:
2748         return ret;
2749 }
2750
2751 int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
2752                           struct btrfs_root *root,
2753                           u64 objectid, u64 offset, u64 bytenr)
2754 {
2755         struct btrfs_path *path;
2756         int ret;
2757         int ret2;
2758
2759         path = btrfs_alloc_path();
2760         if (!path)
2761                 return -ENOENT;
2762
2763         do {
2764                 ret = check_committed_ref(trans, root, path, objectid,
2765                                           offset, bytenr);
2766                 if (ret && ret != -ENOENT)
2767                         goto out;
2768
2769                 ret2 = check_delayed_ref(trans, root, path, objectid,
2770                                          offset, bytenr);
2771         } while (ret2 == -EAGAIN);
2772
2773         if (ret2 && ret2 != -ENOENT) {
2774                 ret = ret2;
2775                 goto out;
2776         }
2777
2778         if (ret != -ENOENT || ret2 != -ENOENT)
2779                 ret = 0;
2780 out:
2781         btrfs_free_path(path);
2782         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2783                 WARN_ON(ret > 0);
2784         return ret;
2785 }
2786
2787 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2788                            struct btrfs_root *root,
2789                            struct extent_buffer *buf,
2790                            int full_backref, int inc, int for_cow)
2791 {
2792         u64 bytenr;
2793         u64 num_bytes;
2794         u64 parent;
2795         u64 ref_root;
2796         u32 nritems;
2797         struct btrfs_key key;
2798         struct btrfs_file_extent_item *fi;
2799         int i;
2800         int level;
2801         int ret = 0;
2802         int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
2803                             u64, u64, u64, u64, u64, u64, int);
2804
2805         ref_root = btrfs_header_owner(buf);
2806         nritems = btrfs_header_nritems(buf);
2807         level = btrfs_header_level(buf);
2808
2809         if (!root->ref_cows && level == 0)
2810                 return 0;
2811
2812         if (inc)
2813                 process_func = btrfs_inc_extent_ref;
2814         else
2815                 process_func = btrfs_free_extent;
2816
2817         if (full_backref)
2818                 parent = buf->start;
2819         else
2820                 parent = 0;
2821
2822         for (i = 0; i < nritems; i++) {
2823                 if (level == 0) {
2824                         btrfs_item_key_to_cpu(buf, &key, i);
2825                         if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY)
2826                                 continue;
2827                         fi = btrfs_item_ptr(buf, i,
2828                                             struct btrfs_file_extent_item);
2829                         if (btrfs_file_extent_type(buf, fi) ==
2830                             BTRFS_FILE_EXTENT_INLINE)
2831                                 continue;
2832                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2833                         if (bytenr == 0)
2834                                 continue;
2835
2836                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2837                         key.offset -= btrfs_file_extent_offset(buf, fi);
2838                         ret = process_func(trans, root, bytenr, num_bytes,
2839                                            parent, ref_root, key.objectid,
2840                                            key.offset, for_cow);
2841                         if (ret)
2842                                 goto fail;
2843                 } else {
2844                         bytenr = btrfs_node_blockptr(buf, i);
2845                         num_bytes = btrfs_level_size(root, level - 1);
2846                         ret = process_func(trans, root, bytenr, num_bytes,
2847                                            parent, ref_root, level - 1, 0,
2848                                            for_cow);
2849                         if (ret)
2850                                 goto fail;
2851                 }
2852         }
2853         return 0;
2854 fail:
2855         return ret;
2856 }
2857
/*
 * Add one reference to everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc set; returns whatever it returns.
 */
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref, int for_cow)
{
	const int inc = 1;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc, for_cow);
}
2863
/*
 * Drop one reference from everything @buf points to.  Thin wrapper around
 * __btrfs_mod_ref() with inc cleared; returns whatever it returns.
 */
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		  struct extent_buffer *buf, int full_backref, int for_cow)
{
	const int inc = 0;

	return __btrfs_mod_ref(trans, root, buf, full_backref, inc, for_cow);
}
2869
2870 static int write_one_cache_group(struct btrfs_trans_handle *trans,
2871                                  struct btrfs_root *root,
2872                                  struct btrfs_path *path,
2873                                  struct btrfs_block_group_cache *cache)
2874 {
2875         int ret;
2876         struct btrfs_root *extent_root = root->fs_info->extent_root;
2877         unsigned long bi;
2878         struct extent_buffer *leaf;
2879
2880         ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1);
2881         if (ret < 0)
2882                 goto fail;
2883         BUG_ON(ret); /* Corruption */
2884
2885         leaf = path->nodes[0];
2886         bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
2887         write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item));
2888         btrfs_mark_buffer_dirty(leaf);
2889         btrfs_release_path(path);
2890 fail:
2891         if (ret) {
2892                 btrfs_abort_transaction(trans, root, ret);
2893                 return ret;
2894         }
2895         return 0;
2896
2897 }
2898
2899 static struct btrfs_block_group_cache *
2900 next_block_group(struct btrfs_root *root,
2901                  struct btrfs_block_group_cache *cache)
2902 {
2903         struct rb_node *node;
2904         spin_lock(&root->fs_info->block_group_cache_lock);
2905         node = rb_next(&cache->cache_node);
2906         btrfs_put_block_group(cache);
2907         if (node) {
2908                 cache = rb_entry(node, struct btrfs_block_group_cache,
2909                                  cache_node);
2910                 btrfs_get_block_group(cache);
2911         } else
2912                 cache = NULL;
2913         spin_unlock(&root->fs_info->block_group_cache_lock);
2914         return cache;
2915 }
2916
2917 static int cache_save_setup(struct btrfs_block_group_cache *block_group,
2918                             struct btrfs_trans_handle *trans,
2919                             struct btrfs_path *path)
2920 {
2921         struct btrfs_root *root = block_group->fs_info->tree_root;
2922         struct inode *inode = NULL;
2923         u64 alloc_hint = 0;
2924         int dcs = BTRFS_DC_ERROR;
2925         int num_pages = 0;
2926         int retries = 0;
2927         int ret = 0;
2928
2929         /*
2930          * If this block group is smaller than 100 megs don't bother caching the
2931          * block group.
2932          */
2933         if (block_group->key.offset < (100 * 1024 * 1024)) {
2934                 spin_lock(&block_group->lock);
2935                 block_group->disk_cache_state = BTRFS_DC_WRITTEN;
2936                 spin_unlock(&block_group->lock);
2937                 return 0;
2938         }
2939
2940 again:
2941         inode = lookup_free_space_inode(root, block_group, path);
2942         if (IS_ERR(inode) && PTR_ERR(inode) != -ENOENT) {
2943                 ret = PTR_ERR(inode);
2944                 btrfs_release_path(path);
2945                 goto out;
2946         }
2947
2948         if (IS_ERR(inode)) {
2949                 BUG_ON(retries);
2950                 retries++;
2951
2952                 if (block_group->ro)
2953                         goto out_free;
2954
2955                 ret = create_free_space_inode(root, trans, block_group, path);
2956                 if (ret)
2957                         goto out_free;
2958                 goto again;
2959         }
2960
2961         /* We've already setup this transaction, go ahead and exit */
2962         if (block_group->cache_generation == trans->transid &&
2963             i_size_read(inode)) {
2964                 dcs = BTRFS_DC_SETUP;
2965                 goto out_put;
2966         }
2967
2968         /*
2969          * We want to set the generation to 0, that way if anything goes wrong
2970          * from here on out we know not to trust this cache when we load up next
2971          * time.
2972          */
2973         BTRFS_I(inode)->generation = 0;
2974         ret = btrfs_update_inode(trans, root, inode);
2975         WARN_ON(ret);
2976
2977         if (i_size_read(inode) > 0) {
2978                 ret = btrfs_truncate_free_space_cache(root, trans, path,
2979                                                       inode);
2980                 if (ret)
2981                         goto out_put;
2982         }
2983
2984         spin_lock(&block_group->lock);
2985         if (block_group->cached != BTRFS_CACHE_FINISHED ||
2986             !btrfs_test_opt(root, SPACE_CACHE)) {
2987                 /*
2988                  * don't bother trying to write stuff out _if_
2989                  * a) we're not cached,
2990                  * b) we're with nospace_cache mount option.
2991                  */
2992                 dcs = BTRFS_DC_WRITTEN;
2993                 spin_unlock(&block_group->lock);
2994                 goto out_put;
2995         }
2996         spin_unlock(&block_group->lock);
2997
2998         /*
2999          * Try to preallocate enough space based on how big the block group is.
3000          * Keep in mind this has to include any pinned space which could end up
3001          * taking up quite a bit since it's not folded into the other space
3002          * cache.
3003          */
3004         num_pages = (int)div64_u64(block_group->key.offset, 256 * 1024 * 1024);
3005         if (!num_pages)
3006                 num_pages = 1;
3007
3008         num_pages *= 16;
3009         num_pages *= PAGE_CACHE_SIZE;
3010
3011         ret = btrfs_check_data_free_space(inode, num_pages);
3012         if (ret)
3013                 goto out_put;
3014
3015         ret = btrfs_prealloc_file_range_trans(inode, trans, 0, 0, num_pages,
3016                                               num_pages, num_pages,
3017                                               &alloc_hint);
3018         if (!ret)
3019                 dcs = BTRFS_DC_SETUP;
3020         btrfs_free_reserved_data_space(inode, num_pages);
3021
3022 out_put:
3023         iput(inode);
3024 out_free:
3025         btrfs_release_path(path);
3026 out:
3027         spin_lock(&block_group->lock);
3028         if (!ret && dcs == BTRFS_DC_SETUP)
3029                 block_group->cache_generation = trans->transid;
3030         block_group->disk_cache_state = dcs;
3031         spin_unlock(&block_group->lock);
3032
3033         return ret;
3034 }
3035
3036 int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans,
3037                                    struct btrfs_root *root)
3038 {
3039         struct btrfs_block_group_cache *cache;
3040         int err = 0;
3041         struct btrfs_path *path;
3042         u64 last = 0;
3043
3044         path = btrfs_alloc_path();
3045         if (!path)
3046                 return -ENOMEM;
3047
3048 again:
3049         while (1) {
3050                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3051                 while (cache) {
3052                         if (cache->disk_cache_state == BTRFS_DC_CLEAR)
3053                                 break;
3054                         cache = next_block_group(root, cache);
3055                 }
3056                 if (!cache) {
3057                         if (last == 0)
3058                                 break;
3059                         last = 0;
3060                         continue;
3061                 }
3062                 err = cache_save_setup(cache, trans, path);
3063                 last = cache->key.objectid + cache->key.offset;
3064                 btrfs_put_block_group(cache);
3065         }
3066
3067         while (1) {
3068                 if (last == 0) {
3069                         err = btrfs_run_delayed_refs(trans, root,
3070                                                      (unsigned long)-1);
3071                         if (err) /* File system offline */
3072                                 goto out;
3073                 }
3074
3075                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3076                 while (cache) {
3077                         if (cache->disk_cache_state == BTRFS_DC_CLEAR) {
3078                                 btrfs_put_block_group(cache);
3079                                 goto again;
3080                         }
3081
3082                         if (cache->dirty)
3083                                 break;
3084                         cache = next_block_group(root, cache);
3085                 }
3086                 if (!cache) {
3087                         if (last == 0)
3088                                 break;
3089                         last = 0;
3090                         continue;
3091                 }
3092
3093                 if (cache->disk_cache_state == BTRFS_DC_SETUP)
3094                         cache->disk_cache_state = BTRFS_DC_NEED_WRITE;
3095                 cache->dirty = 0;
3096                 last = cache->key.objectid + cache->key.offset;
3097
3098                 err = write_one_cache_group(trans, root, path, cache);
3099                 if (err) /* File system offline */
3100                         goto out;
3101
3102                 btrfs_put_block_group(cache);
3103         }
3104
3105         while (1) {
3106                 /*
3107                  * I don't think this is needed since we're just marking our
3108                  * preallocated extent as written, but just in case it can't
3109                  * hurt.
3110                  */
3111                 if (last == 0) {
3112                         err = btrfs_run_delayed_refs(trans, root,
3113                                                      (unsigned long)-1);
3114                         if (err) /* File system offline */
3115                                 goto out;
3116                 }
3117
3118                 cache = btrfs_lookup_first_block_group(root->fs_info, last);
3119                 while (cache) {
3120                         /*
3121                          * Really this shouldn't happen, but it could if we
3122                          * couldn't write the entire preallocated extent and
3123                          * splitting the extent resulted in a new block.
3124                          */
3125                         if (cache->dirty) {
3126                                 btrfs_put_block_group(cache);
3127                                 goto again;
3128                         }
3129                         if (cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3130                                 break;
3131                         cache = next_block_group(root, cache);
3132                 }
3133                 if (!cache) {
3134                         if (last == 0)
3135                                 break;
3136                         last = 0;
3137                         continue;
3138                 }
3139
3140                 err = btrfs_write_out_cache(root, trans, cache, path);
3141
3142                 /*
3143                  * If we didn't have an error then the cache state is still
3144                  * NEED_WRITE, so we can set it to WRITTEN.
3145                  */
3146                 if (!err && cache->disk_cache_state == BTRFS_DC_NEED_WRITE)
3147                         cache->disk_cache_state = BTRFS_DC_WRITTEN;
3148                 last = cache->key.objectid + cache->key.offset;
3149                 btrfs_put_block_group(cache);
3150         }
3151 out:
3152
3153         btrfs_free_path(path);
3154         return err;
3155 }
3156
3157 int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr)
3158 {
3159         struct btrfs_block_group_cache *block_group;
3160         int readonly = 0;
3161
3162         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
3163         if (!block_group || block_group->ro)
3164                 readonly = 1;
3165         if (block_group)
3166                 btrfs_put_block_group(block_group);
3167         return readonly;
3168 }
3169
3170 static int update_space_info(struct btrfs_fs_info *info, u64 flags,
3171                              u64 total_bytes, u64 bytes_used,
3172                              struct btrfs_space_info **space_info)
3173 {
3174         struct btrfs_space_info *found;
3175         int i;
3176         int factor;
3177
3178         if (flags & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1 |
3179                      BTRFS_BLOCK_GROUP_RAID10))
3180                 factor = 2;
3181         else
3182                 factor = 1;
3183
3184         found = __find_space_info(info, flags);
3185         if (found) {
3186                 spin_lock(&found->lock);
3187                 found->total_bytes += total_bytes;
3188                 found->disk_total += total_bytes * factor;
3189                 found->bytes_used += bytes_used;
3190                 found->disk_used += bytes_used * factor;
3191                 found->full = 0;
3192                 spin_unlock(&found->lock);
3193                 *space_info = found;
3194                 return 0;
3195         }
3196         found = kzalloc(sizeof(*found), GFP_NOFS);
3197         if (!found)
3198                 return -ENOMEM;
3199
3200         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
3201                 INIT_LIST_HEAD(&found->block_groups[i]);
3202         init_rwsem(&found->groups_sem);
3203         spin_lock_init(&found->lock);
3204         found->flags = flags & BTRFS_BLOCK_GROUP_TYPE_MASK;
3205         found->total_bytes = total_bytes;
3206         found->disk_total = total_bytes * factor;
3207         found->bytes_used = bytes_used;
3208         found->disk_used = bytes_used * factor;
3209         found->bytes_pinned = 0;
3210         found->bytes_reserved = 0;
3211         found->bytes_readonly = 0;
3212         found->bytes_may_use = 0;
3213         found->full = 0;
3214         found->force_alloc = CHUNK_ALLOC_NO_FORCE;
3215         found->chunk_alloc = 0;
3216         found->flush = 0;
3217         init_waitqueue_head(&found->wait);
3218         *space_info = found;
3219         list_add_rcu(&found->list, &info->space_info);
3220         if (flags & BTRFS_BLOCK_GROUP_DATA)
3221                 info->data_sinfo = found;
3222         return 0;
3223 }
3224
3225 static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
3226 {
3227         u64 extra_flags = chunk_to_extended(flags) &
3228                                 BTRFS_EXTENDED_PROFILE_MASK;
3229
3230         write_seqlock(&fs_info->profiles_lock);
3231         if (flags & BTRFS_BLOCK_GROUP_DATA)
3232                 fs_info->avail_data_alloc_bits |= extra_flags;
3233         if (flags & BTRFS_BLOCK_GROUP_METADATA)
3234                 fs_info->avail_metadata_alloc_bits |= extra_flags;
3235         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3236                 fs_info->avail_system_alloc_bits |= extra_flags;
3237         write_sequnlock(&fs_info->profiles_lock);
3238 }
3239
3240 /*
3241  * returns target flags in extended format or 0 if restripe for this
3242  * chunk_type is not in progress
3243  *
3244  * should be called with either volume_mutex or balance_lock held
3245  */
3246 static u64 get_restripe_target(struct btrfs_fs_info *fs_info, u64 flags)
3247 {
3248         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3249         u64 target = 0;
3250
3251         if (!bctl)
3252                 return 0;
3253
3254         if (flags & BTRFS_BLOCK_GROUP_DATA &&
3255             bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3256                 target = BTRFS_BLOCK_GROUP_DATA | bctl->data.target;
3257         } else if (flags & BTRFS_BLOCK_GROUP_SYSTEM &&
3258                    bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3259                 target = BTRFS_BLOCK_GROUP_SYSTEM | bctl->sys.target;
3260         } else if (flags & BTRFS_BLOCK_GROUP_METADATA &&
3261                    bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) {
3262                 target = BTRFS_BLOCK_GROUP_METADATA | bctl->meta.target;
3263         }
3264
3265         return target;
3266 }
3267
3268 /*
3269  * @flags: available profiles in extended format (see ctree.h)
3270  *
3271  * Returns reduced profile in chunk format.  If profile changing is in
3272  * progress (either running or paused) picks the target profile (if it's
3273  * already available), otherwise falls back to plain reducing.
3274  */
3275 u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags)
3276 {
3277         /*
3278          * we add in the count of missing devices because we want
3279          * to make sure that any RAID levels on a degraded FS
3280          * continue to be honored.
3281          */
3282         u64 num_devices = root->fs_info->fs_devices->rw_devices +
3283                 root->fs_info->fs_devices->missing_devices;
3284         u64 target;
3285
3286         /*
3287          * see if restripe for this chunk_type is in progress, if so
3288          * try to reduce to the target profile
3289          */
3290         spin_lock(&root->fs_info->balance_lock);
3291         target = get_restripe_target(root->fs_info, flags);
3292         if (target) {
3293                 /* pick target profile only if it's already available */
3294                 if ((flags & target) & BTRFS_EXTENDED_PROFILE_MASK) {
3295                         spin_unlock(&root->fs_info->balance_lock);
3296                         return extended_to_chunk(target);
3297                 }
3298         }
3299         spin_unlock(&root->fs_info->balance_lock);
3300
3301         if (num_devices == 1)
3302                 flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0);
3303         if (num_devices < 4)
3304                 flags &= ~BTRFS_BLOCK_GROUP_RAID10;
3305
3306         if ((flags & BTRFS_BLOCK_GROUP_DUP) &&
3307             (flags & (BTRFS_BLOCK_GROUP_RAID1 |
3308                       BTRFS_BLOCK_GROUP_RAID10))) {
3309                 flags &= ~BTRFS_BLOCK_GROUP_DUP;
3310         }
3311
3312         if ((flags & BTRFS_BLOCK_GROUP_RAID1) &&
3313             (flags & BTRFS_BLOCK_GROUP_RAID10)) {
3314                 flags &= ~BTRFS_BLOCK_GROUP_RAID1;
3315         }
3316
3317         if ((flags & BTRFS_BLOCK_GROUP_RAID0) &&
3318             ((flags & BTRFS_BLOCK_GROUP_RAID1) |
3319              (flags & BTRFS_BLOCK_GROUP_RAID10) |
3320              (flags & BTRFS_BLOCK_GROUP_DUP))) {
3321                 flags &= ~BTRFS_BLOCK_GROUP_RAID0;
3322         }
3323
3324         return extended_to_chunk(flags);
3325 }
3326
3327 static u64 get_alloc_profile(struct btrfs_root *root, u64 flags)
3328 {
3329         unsigned seq;
3330
3331         do {
3332                 seq = read_seqbegin(&root->fs_info->profiles_lock);
3333
3334                 if (flags & BTRFS_BLOCK_GROUP_DATA)
3335                         flags |= root->fs_info->avail_data_alloc_bits;
3336                 else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
3337                         flags |= root->fs_info->avail_system_alloc_bits;
3338                 else if (flags & BTRFS_BLOCK_GROUP_METADATA)
3339                         flags |= root->fs_info->avail_metadata_alloc_bits;
3340         } while (read_seqretry(&root->fs_info->profiles_lock, seq));
3341
3342         return btrfs_reduce_alloc_profile(root, flags);
3343 }
3344
3345 u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data)
3346 {
3347         u64 flags;
3348
3349         if (data)
3350                 flags = BTRFS_BLOCK_GROUP_DATA;
3351         else if (root == root->fs_info->chunk_root)
3352                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
3353         else
3354                 flags = BTRFS_BLOCK_GROUP_METADATA;
3355
3356         return get_alloc_profile(root, flags);
3357 }
3358
3359 /*
3360  * This will check the space that the inode allocates from to make sure we have
3361  * enough space for bytes.
3362  */
3363 int btrfs_check_data_free_space(struct inode *inode, u64 bytes)
3364 {
3365         struct btrfs_space_info *data_sinfo;
3366         struct btrfs_root *root = BTRFS_I(inode)->root;
3367         struct btrfs_fs_info *fs_info = root->fs_info;
3368         u64 used;
3369         int ret = 0, committed = 0, alloc_chunk = 1;
3370
3371         /* make sure bytes are sectorsize aligned */
3372         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3373
3374         if (root == root->fs_info->tree_root ||
3375             BTRFS_I(inode)->location.objectid == BTRFS_FREE_INO_OBJECTID) {
3376                 alloc_chunk = 0;
3377                 committed = 1;
3378         }
3379
3380         data_sinfo = fs_info->data_sinfo;
3381         if (!data_sinfo)
3382                 goto alloc;
3383
3384 again:
3385         /* make sure we have enough space to handle the data first */
3386         spin_lock(&data_sinfo->lock);
3387         used = data_sinfo->bytes_used + data_sinfo->bytes_reserved +
3388                 data_sinfo->bytes_pinned + data_sinfo->bytes_readonly +
3389                 data_sinfo->bytes_may_use;
3390
3391         if (used + bytes > data_sinfo->total_bytes) {
3392                 struct btrfs_trans_handle *trans;
3393
3394                 /*
3395                  * if we don't have enough free bytes in this space then we need
3396                  * to alloc a new chunk.
3397                  */
3398                 if (!data_sinfo->full && alloc_chunk) {
3399                         u64 alloc_target;
3400
3401                         data_sinfo->force_alloc = CHUNK_ALLOC_FORCE;
3402                         spin_unlock(&data_sinfo->lock);
3403 alloc:
3404                         alloc_target = btrfs_get_alloc_profile(root, 1);
3405                         trans = btrfs_join_transaction(root);
3406                         if (IS_ERR(trans))
3407                                 return PTR_ERR(trans);
3408
3409                         ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3410                                              alloc_target,
3411                                              CHUNK_ALLOC_NO_FORCE);
3412                         btrfs_end_transaction(trans, root);
3413                         if (ret < 0) {
3414                                 if (ret != -ENOSPC)
3415                                         return ret;
3416                                 else
3417                                         goto commit_trans;
3418                         }
3419
3420                         if (!data_sinfo)
3421                                 data_sinfo = fs_info->data_sinfo;
3422
3423                         goto again;
3424                 }
3425
3426                 /*
3427                  * If we have less pinned bytes than we want to allocate then
3428                  * don't bother committing the transaction, it won't help us.
3429                  */
3430                 if (data_sinfo->bytes_pinned < bytes)
3431                         committed = 1;
3432                 spin_unlock(&data_sinfo->lock);
3433
3434                 /* commit the current transaction and try again */
3435 commit_trans:
3436                 if (!committed &&
3437                     !atomic_read(&root->fs_info->open_ioctl_trans)) {
3438                         committed = 1;
3439                         trans = btrfs_join_transaction(root);
3440                         if (IS_ERR(trans))
3441                                 return PTR_ERR(trans);
3442                         ret = btrfs_commit_transaction(trans, root);
3443                         if (ret)
3444                                 return ret;
3445                         goto again;
3446                 }
3447
3448                 return -ENOSPC;
3449         }
3450         data_sinfo->bytes_may_use += bytes;
3451         trace_btrfs_space_reservation(root->fs_info, "space_info",
3452                                       data_sinfo->flags, bytes, 1);
3453         spin_unlock(&data_sinfo->lock);
3454
3455         return 0;
3456 }
3457
3458 /*
3459  * Called if we need to clear a data reservation for this inode.
3460  */
3461 void btrfs_free_reserved_data_space(struct inode *inode, u64 bytes)
3462 {
3463         struct btrfs_root *root = BTRFS_I(inode)->root;
3464         struct btrfs_space_info *data_sinfo;
3465
3466         /* make sure bytes are sectorsize aligned */
3467         bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1);
3468
3469         data_sinfo = root->fs_info->data_sinfo;
3470         spin_lock(&data_sinfo->lock);
3471         data_sinfo->bytes_may_use -= bytes;
3472         trace_btrfs_space_reservation(root->fs_info, "space_info",
3473                                       data_sinfo->flags, bytes, 0);
3474         spin_unlock(&data_sinfo->lock);
3475 }
3476
3477 static void force_metadata_allocation(struct btrfs_fs_info *info)
3478 {
3479         struct list_head *head = &info->space_info;
3480         struct btrfs_space_info *found;
3481
3482         rcu_read_lock();
3483         list_for_each_entry_rcu(found, head, list) {
3484                 if (found->flags & BTRFS_BLOCK_GROUP_METADATA)
3485                         found->force_alloc = CHUNK_ALLOC_FORCE;
3486         }
3487         rcu_read_unlock();
3488 }
3489
3490 static int should_alloc_chunk(struct btrfs_root *root,
3491                               struct btrfs_space_info *sinfo, int force)
3492 {
3493         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3494         u64 num_bytes = sinfo->total_bytes - sinfo->bytes_readonly;
3495         u64 num_allocated = sinfo->bytes_used + sinfo->bytes_reserved;
3496         u64 thresh;
3497
3498         if (force == CHUNK_ALLOC_FORCE)
3499                 return 1;
3500
3501         /*
3502          * We need to take into account the global rsv because for all intents
3503          * and purposes it's used space.  Don't worry about locking the
3504          * global_rsv, it doesn't change except when the transaction commits.
3505          */
3506         if (sinfo->flags & BTRFS_BLOCK_GROUP_METADATA)
3507                 num_allocated += global_rsv->size;
3508
3509         /*
3510          * in limited mode, we want to have some free space up to
3511          * about 1% of the FS size.
3512          */
3513         if (force == CHUNK_ALLOC_LIMITED) {
3514                 thresh = btrfs_super_total_bytes(root->fs_info->super_copy);
3515                 thresh = max_t(u64, 64 * 1024 * 1024,
3516                                div_factor_fine(thresh, 1));
3517
3518                 if (num_bytes - num_allocated < thresh)
3519                         return 1;
3520         }
3521
3522         if (num_allocated + 2 * 1024 * 1024 < div_factor(num_bytes, 8))
3523                 return 0;
3524         return 1;
3525 }
3526
3527 static u64 get_system_chunk_thresh(struct btrfs_root *root, u64 type)
3528 {
3529         u64 num_dev;
3530
3531         if (type & BTRFS_BLOCK_GROUP_RAID10 ||
3532             type & BTRFS_BLOCK_GROUP_RAID0)
3533                 num_dev = root->fs_info->fs_devices->rw_devices;
3534         else if (type & BTRFS_BLOCK_GROUP_RAID1)
3535                 num_dev = 2;
3536         else
3537                 num_dev = 1;    /* DUP or single */
3538
3539         /* metadata for updaing devices and chunk tree */
3540         return btrfs_calc_trans_metadata_size(root, num_dev + 1);
3541 }
3542
3543 static void check_system_chunk(struct btrfs_trans_handle *trans,
3544                                struct btrfs_root *root, u64 type)
3545 {
3546         struct btrfs_space_info *info;
3547         u64 left;
3548         u64 thresh;
3549
3550         info = __find_space_info(root->fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
3551         spin_lock(&info->lock);
3552         left = info->total_bytes - info->bytes_used - info->bytes_pinned -
3553                 info->bytes_reserved - info->bytes_readonly;
3554         spin_unlock(&info->lock);
3555
3556         thresh = get_system_chunk_thresh(root, type);
3557         if (left < thresh && btrfs_test_opt(root, ENOSPC_DEBUG)) {
3558                 printk(KERN_INFO "left=%llu, need=%llu, flags=%llu\n",
3559                        left, thresh, type);
3560                 dump_space_info(info, 0, 0);
3561         }
3562
3563         if (left < thresh) {
3564                 u64 flags;
3565
3566                 flags = btrfs_get_alloc_profile(root->fs_info->chunk_root, 0);
3567                 btrfs_alloc_chunk(trans, root, flags);
3568         }
3569 }
3570
3571 static int do_chunk_alloc(struct btrfs_trans_handle *trans,
3572                           struct btrfs_root *extent_root, u64 flags, int force)
3573 {
3574         struct btrfs_space_info *space_info;
3575         struct btrfs_fs_info *fs_info = extent_root->fs_info;
3576         int wait_for_alloc = 0;
3577         int ret = 0;
3578
3579         /* Don't re-enter if we're already allocating a chunk */
3580         if (trans->allocating_chunk)
3581                 return -ENOSPC;
3582
3583         space_info = __find_space_info(extent_root->fs_info, flags);
3584         if (!space_info) {
3585                 ret = update_space_info(extent_root->fs_info, flags,
3586                                         0, 0, &space_info);
3587                 BUG_ON(ret); /* -ENOMEM */
3588         }
3589         BUG_ON(!space_info); /* Logic error */
3590
3591 again:
3592         spin_lock(&space_info->lock);
3593         if (force < space_info->force_alloc)
3594                 force = space_info->force_alloc;
3595         if (space_info->full) {
3596                 spin_unlock(&space_info->lock);
3597                 return 0;
3598         }
3599
3600         if (!should_alloc_chunk(extent_root, space_info, force)) {
3601                 spin_unlock(&space_info->lock);
3602                 return 0;
3603         } else if (space_info->chunk_alloc) {
3604                 wait_for_alloc = 1;
3605         } else {
3606                 space_info->chunk_alloc = 1;
3607         }
3608
3609         spin_unlock(&space_info->lock);
3610
3611         mutex_lock(&fs_info->chunk_mutex);
3612
3613         /*
3614          * The chunk_mutex is held throughout the entirety of a chunk
3615          * allocation, so once we've acquired the chunk_mutex we know that the
3616          * other guy is done and we need to recheck and see if we should
3617          * allocate.
3618          */
3619         if (wait_for_alloc) {
3620                 mutex_unlock(&fs_info->chunk_mutex);
3621                 wait_for_alloc = 0;
3622                 goto again;
3623         }
3624
3625         trans->allocating_chunk = true;
3626
3627         /*
3628          * If we have mixed data/metadata chunks we want to make sure we keep
3629          * allocating mixed chunks instead of individual chunks.
3630          */
3631         if (btrfs_mixed_space_info(space_info))
3632                 flags |= (BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA);
3633
3634         /*
3635          * if we're doing a data chunk, go ahead and make sure that
3636          * we keep a reasonable number of metadata chunks allocated in the
3637          * FS as well.
3638          */
3639         if (flags & BTRFS_BLOCK_GROUP_DATA && fs_info->metadata_ratio) {
3640                 fs_info->data_chunk_allocations++;
3641                 if (!(fs_info->data_chunk_allocations %
3642                       fs_info->metadata_ratio))
3643                         force_metadata_allocation(fs_info);
3644         }
3645
3646         /*
3647          * Check if we have enough space in SYSTEM chunk because we may need
3648          * to update devices.
3649          */
3650         check_system_chunk(trans, extent_root, flags);
3651
3652         ret = btrfs_alloc_chunk(trans, extent_root, flags);
3653         trans->allocating_chunk = false;
3654         if (ret < 0 && ret != -ENOSPC)
3655                 goto out;
3656
3657         spin_lock(&space_info->lock);
3658         if (ret)
3659                 space_info->full = 1;
3660         else
3661                 ret = 1;
3662
3663         space_info->force_alloc = CHUNK_ALLOC_NO_FORCE;
3664         space_info->chunk_alloc = 0;
3665         spin_unlock(&space_info->lock);
3666 out:
3667         mutex_unlock(&fs_info->chunk_mutex);
3668         return ret;
3669 }
3670
3671 static int can_overcommit(struct btrfs_root *root,
3672                           struct btrfs_space_info *space_info, u64 bytes,
3673                           enum btrfs_reserve_flush_enum flush)
3674 {
3675         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
3676         u64 profile = btrfs_get_alloc_profile(root, 0);
3677         u64 rsv_size = 0;
3678         u64 avail;
3679         u64 used;
3680         u64 to_add;
3681
3682         used = space_info->bytes_used + space_info->bytes_reserved +
3683                 space_info->bytes_pinned + space_info->bytes_readonly;
3684
3685         spin_lock(&global_rsv->lock);
3686         rsv_size = global_rsv->size;
3687         spin_unlock(&global_rsv->lock);
3688
3689         /*
3690          * We only want to allow over committing if we have lots of actual space
3691          * free, but if we don't have enough space to handle the global reserve
3692          * space then we could end up having a real enospc problem when trying
3693          * to allocate a chunk or some other such important allocation.
3694          */
3695         rsv_size <<= 1;
3696         if (used + rsv_size >= space_info->total_bytes)
3697                 return 0;
3698
3699         used += space_info->bytes_may_use;
3700
3701         spin_lock(&root->fs_info->free_chunk_lock);
3702         avail = root->fs_info->free_chunk_space;
3703         spin_unlock(&root->fs_info->free_chunk_lock);
3704
3705         /*
3706          * If we have dup, raid1 or raid10 then only half of the free
3707          * space is actually useable.
3708          */
3709         if (profile & (BTRFS_BLOCK_GROUP_DUP |
3710                        BTRFS_BLOCK_GROUP_RAID1 |
3711                        BTRFS_BLOCK_GROUP_RAID10))
3712                 avail >>= 1;
3713
3714         to_add = space_info->total_bytes;
3715
3716         /*
3717          * If we aren't flushing all things, let us overcommit up to
3718          * 1/2th of the space. If we can flush, don't let us overcommit
3719          * too much, let it overcommit up to 1/8 of the space.
3720          */
3721         if (flush == BTRFS_RESERVE_FLUSH_ALL)
3722                 to_add >>= 3;
3723         else
3724                 to_add >>= 1;
3725
3726         /*
3727          * Limit the overcommit to the amount of free space we could possibly
3728          * allocate for chunks.
3729          */
3730         to_add = min(avail, to_add);
3731
3732         if (used + bytes < space_info->total_bytes + to_add)
3733                 return 1;
3734         return 0;
3735 }
3736
3737 static inline int writeback_inodes_sb_nr_if_idle_safe(struct super_block *sb,
3738                                                       unsigned long nr_pages,
3739                                                       enum wb_reason reason)
3740 {
3741         /* the flusher is dealing with the dirty inodes now. */
3742         if (writeback_in_progress(sb->s_bdi))
3743                 return 1;
3744
3745         if (down_read_trylock(&sb->s_umount)) {
3746                 writeback_inodes_sb_nr(sb, nr_pages, reason);
3747                 up_read(&sb->s_umount);
3748                 return 1;
3749         }
3750
3751         return 0;
3752 }
3753
3754 void btrfs_writeback_inodes_sb_nr(struct btrfs_root *root,
3755                                   unsigned long nr_pages)
3756 {
3757         struct super_block *sb = root->fs_info->sb;
3758         int started;
3759
3760         /* If we can not start writeback, just sync all the delalloc file. */
3761         started = writeback_inodes_sb_nr_if_idle_safe(sb, nr_pages,
3762                                                       WB_REASON_FS_FREE_SPACE);
3763         if (!started) {
3764                 /*
3765                  * We needn't worry the filesystem going from r/w to r/o though
3766                  * we don't acquire ->s_umount mutex, because the filesystem
3767                  * should guarantee the delalloc inodes list be empty after
3768                  * the filesystem is readonly(all dirty pages are written to
3769                  * the disk).
3770                  */
3771                 btrfs_start_delalloc_inodes(root, 0);
3772                 btrfs_wait_ordered_extents(root, 0);
3773         }
3774 }
3775
3776 /*
3777  * shrink metadata reservation for delalloc
3778  */
3779 static void shrink_delalloc(struct btrfs_root *root, u64 to_reclaim, u64 orig,
3780                             bool wait_ordered)
3781 {
3782         struct btrfs_block_rsv *block_rsv;
3783         struct btrfs_space_info *space_info;
3784         struct btrfs_trans_handle *trans;
3785         u64 delalloc_bytes;
3786         u64 max_reclaim;
3787         long time_left;
3788         unsigned long nr_pages = (2 * 1024 * 1024) >> PAGE_CACHE_SHIFT;
3789         int loops = 0;
3790         enum btrfs_reserve_flush_enum flush;
3791
3792         trans = (struct btrfs_trans_handle *)current->journal_info;
3793         block_rsv = &root->fs_info->delalloc_block_rsv;
3794         space_info = block_rsv->space_info;
3795
3796         smp_mb();
3797         delalloc_bytes = percpu_counter_sum_positive(
3798                                                 &root->fs_info->delalloc_bytes);
3799         if (delalloc_bytes == 0) {
3800                 if (trans)
3801                         return;
3802                 btrfs_wait_ordered_extents(root, 0);
3803                 return;
3804         }
3805
3806         while (delalloc_bytes && loops < 3) {
3807                 max_reclaim = min(delalloc_bytes, to_reclaim);
3808                 nr_pages = max_reclaim >> PAGE_CACHE_SHIFT;
3809                 btrfs_writeback_inodes_sb_nr(root, nr_pages);
3810                 /*
3811                  * We need to wait for the async pages to actually start before
3812                  * we do anything.
3813                  */
3814                 wait_event(root->fs_info->async_submit_wait,
3815                            !atomic_read(&root->fs_info->async_delalloc_pages));
3816
3817                 if (!trans)
3818                         flush = BTRFS_RESERVE_FLUSH_ALL;
3819                 else
3820                         flush = BTRFS_RESERVE_NO_FLUSH;
3821                 spin_lock(&space_info->lock);
3822                 if (can_overcommit(root, space_info, orig, flush)) {
3823                         spin_unlock(&space_info->lock);
3824                         break;
3825                 }
3826                 spin_unlock(&space_info->lock);
3827
3828                 loops++;
3829                 if (wait_ordered && !trans) {
3830                         btrfs_wait_ordered_extents(root, 0);
3831                 } else {
3832                         time_left = schedule_timeout_killable(1);
3833                         if (time_left)
3834                                 break;
3835                 }
3836                 smp_mb();
3837                 delalloc_bytes = percpu_counter_sum_positive(
3838                                                 &root->fs_info->delalloc_bytes);
3839         }
3840 }
3841
3842 /**
3843  * maybe_commit_transaction - possibly commit the transaction if its ok to
3844  * @root - the root we're allocating for
3845  * @bytes - the number of bytes we want to reserve
3846  * @force - force the commit
3847  *
3848  * This will check to make sure that committing the transaction will actually
3849  * get us somewhere and then commit the transaction if it does.  Otherwise it
3850  * will return -ENOSPC.
3851  */
3852 static int may_commit_transaction(struct btrfs_root *root,
3853                                   struct btrfs_space_info *space_info,
3854                                   u64 bytes, int force)
3855 {
3856         struct btrfs_block_rsv *delayed_rsv = &root->fs_info->delayed_block_rsv;
3857         struct btrfs_trans_handle *trans;
3858
3859         trans = (struct btrfs_trans_handle *)current->journal_info;
3860         if (trans)
3861                 return -EAGAIN;
3862
3863         if (force)
3864                 goto commit;
3865
3866         /* See if there is enough pinned space to make this reservation */
3867         spin_lock(&space_info->lock);
3868         if (space_info->bytes_pinned >= bytes) {
3869                 spin_unlock(&space_info->lock);
3870                 goto commit;
3871         }
3872         spin_unlock(&space_info->lock);
3873
3874         /*
3875          * See if there is some space in the delayed insertion reservation for
3876          * this reservation.
3877          */
3878         if (space_info != delayed_rsv->space_info)
3879                 return -ENOSPC;
3880
3881         spin_lock(&space_info->lock);
3882         spin_lock(&delayed_rsv->lock);
3883         if (space_info->bytes_pinned + delayed_rsv->size < bytes) {
3884                 spin_unlock(&delayed_rsv->lock);
3885                 spin_unlock(&space_info->lock);
3886                 return -ENOSPC;
3887         }
3888         spin_unlock(&delayed_rsv->lock);
3889         spin_unlock(&space_info->lock);
3890
3891 commit:
3892         trans = btrfs_join_transaction(root);
3893         if (IS_ERR(trans))
3894                 return -ENOSPC;
3895
3896         return btrfs_commit_transaction(trans, root);
3897 }
3898
/*
 * States handled by flush_space().  reserve_metadata_bytes() starts at
 * FLUSH_DELAYED_ITEMS_NR and walks through them by incrementing, so the
 * numeric order of the values is significant.
 */
enum flush_state {
        FLUSH_DELAYED_ITEMS_NR = 1,
        FLUSH_DELAYED_ITEMS,
        FLUSH_DELALLOC,
        FLUSH_DELALLOC_WAIT,
        ALLOC_CHUNK,
        COMMIT_TRANS,
};
3907
3908 static int flush_space(struct btrfs_root *root,
3909                        struct btrfs_space_info *space_info, u64 num_bytes,
3910                        u64 orig_bytes, int state)
3911 {
3912         struct btrfs_trans_handle *trans;
3913         int nr;
3914         int ret = 0;
3915
3916         switch (state) {
3917         case FLUSH_DELAYED_ITEMS_NR:
3918         case FLUSH_DELAYED_ITEMS:
3919                 if (state == FLUSH_DELAYED_ITEMS_NR) {
3920                         u64 bytes = btrfs_calc_trans_metadata_size(root, 1);
3921
3922                         nr = (int)div64_u64(num_bytes, bytes);
3923                         if (!nr)
3924                                 nr = 1;
3925                         nr *= 2;
3926                 } else {
3927                         nr = -1;
3928                 }
3929                 trans = btrfs_join_transaction(root);
3930                 if (IS_ERR(trans)) {
3931                         ret = PTR_ERR(trans);
3932                         break;
3933                 }
3934                 ret = btrfs_run_delayed_items_nr(trans, root, nr);
3935                 btrfs_end_transaction(trans, root);
3936                 break;
3937         case FLUSH_DELALLOC:
3938         case FLUSH_DELALLOC_WAIT:
3939                 shrink_delalloc(root, num_bytes, orig_bytes,
3940                                 state == FLUSH_DELALLOC_WAIT);
3941                 break;
3942         case ALLOC_CHUNK:
3943                 trans = btrfs_join_transaction(root);
3944                 if (IS_ERR(trans)) {
3945                         ret = PTR_ERR(trans);
3946                         break;
3947                 }
3948                 ret = do_chunk_alloc(trans, root->fs_info->extent_root,
3949                                      btrfs_get_alloc_profile(root, 0),
3950                                      CHUNK_ALLOC_NO_FORCE);
3951                 btrfs_end_transaction(trans, root);
3952                 if (ret == -ENOSPC)
3953                         ret = 0;
3954                 break;
3955         case COMMIT_TRANS:
3956                 ret = may_commit_transaction(root, space_info, orig_bytes, 0);
3957                 break;
3958         default:
3959                 ret = -ENOSPC;
3960                 break;
3961         }
3962
3963         return ret;
3964 }
/**
 * reserve_metadata_bytes - try to reserve bytes from the block_rsv's space
 * @root - the root we're allocating for
 * @block_rsv - the block_rsv we're allocating for
 * @orig_bytes - the number of bytes we want
 * @flush - whether or not we can flush to make our reservation
 *
 * This will reserve orig_bytes number of bytes from the space info associated
 * with the block_rsv.  If there is not enough space it will make an attempt to
 * flush out space to make room, stepping through the flush states handled by
 * flush_space() one at a time.  If flush is BTRFS_RESERVE_NO_FLUSH then no
 * attempts to regain reservations will be made and this will fail if there is
 * not enough space already.
 *
 * Returns 0 once the bytes have been added to the space info's bytes_may_use,
 * -EAGAIN if another task is flushing and we hold a transaction handle,
 * -EINTR if we were killed while waiting for another flusher, and otherwise
 * the error from the last reservation or flush attempt (e.g. -ENOSPC).
 */
static int reserve_metadata_bytes(struct btrfs_root *root,
                                  struct btrfs_block_rsv *block_rsv,
                                  u64 orig_bytes,
                                  enum btrfs_reserve_flush_enum flush)
{
        struct btrfs_space_info *space_info = block_rsv->space_info;
        u64 used;
        u64 num_bytes = orig_bytes;
        int flush_state = FLUSH_DELAYED_ITEMS_NR;
        int ret = 0;
        bool flushing = false;

again:
        ret = 0;
        spin_lock(&space_info->lock);
        /*
         * We only want to wait if somebody other than us is flushing and we
         * are actually allowed to flush all things.
         */
        while (flush == BTRFS_RESERVE_FLUSH_ALL && !flushing &&
               space_info->flush) {
                spin_unlock(&space_info->lock);
                /*
                 * If we have a trans handle we can't wait because the flusher
                 * may have to commit the transaction, which would mean we would
                 * deadlock since we are waiting for the flusher to finish, but
                 * hold the current transaction open.
                 */
                if (current->journal_info)
                        return -EAGAIN;
                ret = wait_event_killable(space_info->wait, !space_info->flush);
                /* Must have been killed, return */
                if (ret)
                        return -EINTR;

                spin_lock(&space_info->lock);
        }

        /* Assume failure until one of the checks below makes the reservation */
        ret = -ENOSPC;
        used = space_info->bytes_used + space_info->bytes_reserved +
                space_info->bytes_pinned + space_info->bytes_readonly +
                space_info->bytes_may_use;

        /*
         * The idea here is that if we've not already over-reserved the block
         * group then we can go ahead and save our reservation first and then
         * start flushing if we need to.  Otherwise if we've already
         * overcommitted lets start flushing stuff first and then come back and
         * try to make our reservation.
         */
        if (used <= space_info->total_bytes) {
                if (used + orig_bytes <= space_info->total_bytes) {
                        space_info->bytes_may_use += orig_bytes;
                        trace_btrfs_space_reservation(root->fs_info,
                                "space_info", space_info->flags, orig_bytes, 1);
                        ret = 0;
                } else {
                        /*
                         * Ok set num_bytes to orig_bytes since we aren't
                         * overcommitted, this way we only try and reclaim what
                         * we need.
                         */
                        num_bytes = orig_bytes;
                }
        } else {
                /*
                 * Ok we're over committed, set num_bytes to the overcommitted
                 * amount plus twice the amount of bytes that we need for this
                 * reservation.
                 */
                num_bytes = used - space_info->total_bytes +
                        (orig_bytes * 2);
        }

        /*
         * The reservation did not fit within total_bytes; take it anyway if
         * can_overcommit() allows it for this flush mode.
         */
        if (ret && can_overcommit(root, space_info, orig_bytes, flush)) {
                space_info->bytes_may_use += orig_bytes;
                trace_btrfs_space_reservation(root->fs_info, "space_info",
                                              space_info->flags, orig_bytes,
                                              1);
                ret = 0;
        }

        /*
         * Couldn't make our reservation, save our place so while we're trying
         * to reclaim space we can actually use it instead of somebody else
         * stealing it from us.
         *
         * We make the other tasks wait for the flush only when we can flush
         * all things.
         */
        if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
                flushing = true;
                space_info->flush = 1;
        }

        spin_unlock(&space_info->lock);

        /* Done if we got the reservation or are not allowed to flush */
        if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
                goto out;

        /* Run the current flush state, then move on to the next one */
        ret = flush_space(root, space_info, num_bytes, orig_bytes,
                          flush_state);
        flush_state++;

        /*
         * If we are FLUSH_LIMIT, we can not flush delalloc, or the deadlock
         * would happen. So skip delalloc flush.
         */
        if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
            (flush_state == FLUSH_DELALLOC ||
             flush_state == FLUSH_DELALLOC_WAIT))
                flush_state = ALLOC_CHUNK;

        /*
         * A successful flush always retries the reservation.  After a failed
         * flush we only retry while the next state is before COMMIT_TRANS
         * (FLUSH_LIMIT) or not past COMMIT_TRANS (FLUSH_ALL).  Once
         * flush_state runs past COMMIT_TRANS, flush_space() returns -ENOSPC
         * from its default case, which ends the loop.
         */
        if (!ret)
                goto again;
        else if (flush == BTRFS_RESERVE_FLUSH_LIMIT &&
                 flush_state < COMMIT_TRANS)
                goto again;
        else if (flush == BTRFS_RESERVE_FLUSH_ALL &&
                 flush_state <= COMMIT_TRANS)
                goto again;

out:
        /* If we marked the space info as flushing, clear it and wake waiters */
        if (flushing) {
                spin_lock(&space_info->lock);
                space_info->flush = 0;
                wake_up_all(&space_info->wait);
                spin_unlock(&space_info->lock);
        }
        return ret;
}
4110
4111 static struct btrfs_block_rsv *get_block_rsv(
4112                                         const struct btrfs_trans_handle *trans,
4113                                         const struct btrfs_root *root)
4114 {
4115         struct btrfs_block_rsv *block_rsv = NULL;
4116
4117         if (root->ref_cows)
4118                 block_rsv = trans->block_rsv;
4119
4120         if (root == root->fs_info->csum_root && trans->adding_csums)
4121                 block_rsv = trans->block_rsv;
4122
4123         if (!block_rsv)
4124                 block_rsv = root->block_rsv;
4125
4126         if (!block_rsv)
4127                 block_rsv = &root->fs_info->empty_block_rsv;
4128
4129         return block_rsv;
4130 }
4131
4132 static int block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv,
4133                                u64 num_bytes)
4134 {
4135         int ret = -ENOSPC;
4136         spin_lock(&block_rsv->lock);
4137         if (block_rsv->reserved >= num_bytes) {
4138                 block_rsv->reserved -= num_bytes;
4139                 if (block_rsv->reserved < block_rsv->size)
4140                         block_rsv->full = 0;
4141                 ret = 0;
4142         }
4143         spin_unlock(&block_rsv->lock);
4144         return ret;
4145 }
4146
4147 static void block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
4148                                 u64 num_bytes, int update_size)
4149 {
4150         spin_lock(&block_rsv->lock);
4151         block_rsv->reserved += num_bytes;
4152         if (update_size)
4153                 block_rsv->size += num_bytes;
4154         else if (block_rsv->reserved >= block_rsv->size)
4155                 block_rsv->full = 1;
4156         spin_unlock(&block_rsv->lock);
4157 }
4158
4159 static void block_rsv_release_bytes(struct btrfs_fs_info *fs_info,
4160                                     struct btrfs_block_rsv *block_rsv,
4161                                     struct btrfs_block_rsv *dest, u64 num_bytes)
4162 {
4163         struct btrfs_space_info *space_info = block_rsv->space_info;
4164
4165         spin_lock(&block_rsv->lock);
4166         if (num_bytes == (u64)-1)
4167                 num_bytes = block_rsv->size;
4168         block_rsv->size -= num_bytes;
4169         if (block_rsv->reserved >= block_rsv->size) {
4170                 num_bytes = block_rsv->reserved - block_rsv->size;
4171                 block_rsv->reserved = block_rsv->size;
4172                 block_rsv->full = 1;
4173         } else {
4174                 num_bytes = 0;
4175         }
4176         spin_unlock(&block_rsv->lock);
4177
4178         if (num_bytes > 0) {
4179                 if (dest) {
4180                         spin_lock(&dest->lock);
4181                         if (!dest->full) {
4182                                 u64 bytes_to_add;
4183
4184                                 bytes_to_add = dest->size - dest->reserved;
4185                                 bytes_to_add = min(num_bytes, bytes_to_add);
4186                                 dest->reserved += bytes_to_add;
4187                                 if (dest->reserved >= dest->size)
4188                                         dest->full = 1;
4189                                 num_bytes -= bytes_to_add;
4190                         }
4191                         spin_unlock(&dest->lock);
4192                 }
4193                 if (num_bytes) {
4194                         spin_lock(&space_info->lock);
4195                         space_info->bytes_may_use -= num_bytes;
4196                         trace_btrfs_space_reservation(fs_info, "space_info",
4197                                         space_info->flags, num_bytes, 0);
4198                         space_info->reservation_progress++;
4199                         spin_unlock(&space_info->lock);
4200                 }
4201         }
4202 }
4203
4204 static int block_rsv_migrate_bytes(struct btrfs_block_rsv *src,
4205                                    struct btrfs_block_rsv *dst, u64 num_bytes)
4206 {
4207         int ret;
4208
4209         ret = block_rsv_use_bytes(src, num_bytes);
4210         if (ret)
4211                 return ret;
4212
4213         block_rsv_add_bytes(dst, num_bytes, 1);
4214         return 0;
4215 }
4216
4217 void btrfs_init_block_rsv(struct btrfs_block_rsv *rsv, unsigned short type)
4218 {
4219         memset(rsv, 0, sizeof(*rsv));
4220         spin_lock_init(&rsv->lock);
4221         rsv->type = type;
4222 }
4223
4224 struct btrfs_block_rsv *btrfs_alloc_block_rsv(struct btrfs_root *root,
4225                                               unsigned short type)
4226 {
4227         struct btrfs_block_rsv *block_rsv;
4228         struct btrfs_fs_info *fs_info = root->fs_info;
4229
4230         block_rsv = kmalloc(sizeof(*block_rsv), GFP_NOFS);
4231         if (!block_rsv)
4232                 return NULL;
4233
4234         btrfs_init_block_rsv(block_rsv, type);
4235         block_rsv->space_info = __find_space_info(fs_info,
4236                                                   BTRFS_BLOCK_GROUP_METADATA);
4237         return block_rsv;
4238 }
4239
4240 void btrfs_free_block_rsv(struct btrfs_root *root,
4241                           struct btrfs_block_rsv *rsv)
4242 {
4243         if (!rsv)
4244                 return;
4245         btrfs_block_rsv_release(root, rsv, (u64)-1);
4246         kfree(rsv);
4247 }
4248
4249 int btrfs_block_rsv_add(struct btrfs_root *root,
4250                         struct btrfs_block_rsv *block_rsv, u64 num_bytes,
4251                         enum btrfs_reserve_flush_enum flush)
4252 {
4253         int ret;
4254
4255         if (num_bytes == 0)
4256                 return 0;
4257
4258         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4259         if (!ret) {
4260                 block_rsv_add_bytes(block_rsv, num_bytes, 1);
4261                 return 0;
4262         }
4263
4264         return ret;
4265 }
4266
4267 int btrfs_block_rsv_check(struct btrfs_root *root,
4268                           struct btrfs_block_rsv *block_rsv, int min_factor)
4269 {
4270         u64 num_bytes = 0;
4271         int ret = -ENOSPC;
4272
4273         if (!block_rsv)
4274                 return 0;
4275
4276         spin_lock(&block_rsv->lock);
4277         num_bytes = div_factor(block_rsv->size, min_factor);
4278         if (block_rsv->reserved >= num_bytes)
4279                 ret = 0;
4280         spin_unlock(&block_rsv->lock);
4281
4282         return ret;
4283 }
4284
4285 int btrfs_block_rsv_refill(struct btrfs_root *root,
4286                            struct btrfs_block_rsv *block_rsv, u64 min_reserved,
4287                            enum btrfs_reserve_flush_enum flush)
4288 {
4289         u64 num_bytes = 0;
4290         int ret = -ENOSPC;
4291
4292         if (!block_rsv)
4293                 return 0;
4294
4295         spin_lock(&block_rsv->lock);
4296         num_bytes = min_reserved;
4297         if (block_rsv->reserved >= num_bytes)
4298                 ret = 0;
4299         else
4300                 num_bytes -= block_rsv->reserved;
4301         spin_unlock(&block_rsv->lock);
4302
4303         if (!ret)
4304                 return 0;
4305
4306         ret = reserve_metadata_bytes(root, block_rsv, num_bytes, flush);
4307         if (!ret) {
4308                 block_rsv_add_bytes(block_rsv, num_bytes, 0);
4309                 return 0;
4310         }
4311
4312         return ret;
4313 }
4314
4315 int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
4316                             struct btrfs_block_rsv *dst_rsv,
4317                             u64 num_bytes)
4318 {
4319         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4320 }
4321
4322 void btrfs_block_rsv_release(struct btrfs_root *root,
4323                              struct btrfs_block_rsv *block_rsv,
4324                              u64 num_bytes)
4325 {
4326         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
4327         if (global_rsv->full || global_rsv == block_rsv ||
4328             block_rsv->space_info != global_rsv->space_info)
4329                 global_rsv = NULL;
4330         block_rsv_release_bytes(root->fs_info, block_rsv, global_rsv,
4331                                 num_bytes);
4332 }
4333
4334 /*
4335  * helper to calculate size of global block reservation.
4336  * the desired value is sum of space used by extent tree,
4337  * checksum tree and root tree
4338  */
4339 static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info)
4340 {
4341         struct btrfs_space_info *sinfo;
4342         u64 num_bytes;
4343         u64 meta_used;
4344         u64 data_used;
4345         int csum_size = btrfs_super_csum_size(fs_info->super_copy);
4346
4347         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_DATA);
4348         spin_lock(&sinfo->lock);
4349         data_used = sinfo->bytes_used;
4350         spin_unlock(&sinfo->lock);
4351
4352         sinfo = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4353         spin_lock(&sinfo->lock);
4354         if (sinfo->flags & BTRFS_BLOCK_GROUP_DATA)
4355                 data_used = 0;
4356         meta_used = sinfo->bytes_used;
4357         spin_unlock(&sinfo->lock);
4358
4359         num_bytes = (data_used >> fs_info->sb->s_blocksize_bits) *
4360                     csum_size * 2;
4361         num_bytes += div64_u64(data_used + meta_used, 50);
4362
4363         if (num_bytes * 3 > meta_used)
4364                 num_bytes = div64_u64(meta_used, 3);
4365
4366         return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10);
4367 }
4368
4369 static void update_global_block_rsv(struct btrfs_fs_info *fs_info)
4370 {
4371         struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
4372         struct btrfs_space_info *sinfo = block_rsv->space_info;
4373         u64 num_bytes;
4374
4375         num_bytes = calc_global_metadata_size(fs_info);
4376
4377         spin_lock(&sinfo->lock);
4378         spin_lock(&block_rsv->lock);
4379
4380         block_rsv->size = num_bytes;
4381
4382         num_bytes = sinfo->bytes_used + sinfo->bytes_pinned +
4383                     sinfo->bytes_reserved + sinfo->bytes_readonly +
4384                     sinfo->bytes_may_use;
4385
4386         if (sinfo->total_bytes > num_bytes) {
4387                 num_bytes = sinfo->total_bytes - num_bytes;
4388                 block_rsv->reserved += num_bytes;
4389                 sinfo->bytes_may_use += num_bytes;
4390                 trace_btrfs_space_reservation(fs_info, "space_info",
4391                                       sinfo->flags, num_bytes, 1);
4392         }
4393
4394         if (block_rsv->reserved >= block_rsv->size) {
4395                 num_bytes = block_rsv->reserved - block_rsv->size;
4396                 sinfo->bytes_may_use -= num_bytes;
4397                 trace_btrfs_space_reservation(fs_info, "space_info",
4398                                       sinfo->flags, num_bytes, 0);
4399                 sinfo->reservation_progress++;
4400                 block_rsv->reserved = block_rsv->size;
4401                 block_rsv->full = 1;
4402         }
4403
4404         spin_unlock(&block_rsv->lock);
4405         spin_unlock(&sinfo->lock);
4406 }
4407
4408 static void init_global_block_rsv(struct btrfs_fs_info *fs_info)
4409 {
4410         struct btrfs_space_info *space_info;
4411
4412         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
4413         fs_info->chunk_block_rsv.space_info = space_info;
4414
4415         space_info = __find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA);
4416         fs_info->global_block_rsv.space_info = space_info;
4417         fs_info->delalloc_block_rsv.space_info = space_info;
4418         fs_info->trans_block_rsv.space_info = space_info;
4419         fs_info->empty_block_rsv.space_info = space_info;
4420         fs_info->delayed_block_rsv.space_info = space_info;
4421
4422         fs_info->extent_root->block_rsv = &fs_info->global_block_rsv;
4423         fs_info->csum_root->block_rsv = &fs_info->global_block_rsv;
4424         fs_info->dev_root->block_rsv = &fs_info->global_block_rsv;
4425         fs_info->tree_root->block_rsv = &fs_info->global_block_rsv;
4426         fs_info->chunk_root->block_rsv = &fs_info->chunk_block_rsv;
4427
4428         update_global_block_rsv(fs_info);
4429 }
4430
4431 static void release_global_block_rsv(struct btrfs_fs_info *fs_info)
4432 {
4433         block_rsv_release_bytes(fs_info, &fs_info->global_block_rsv, NULL,
4434                                 (u64)-1);
4435         WARN_ON(fs_info->delalloc_block_rsv.size > 0);
4436         WARN_ON(fs_info->delalloc_block_rsv.reserved > 0);
4437         WARN_ON(fs_info->trans_block_rsv.size > 0);
4438         WARN_ON(fs_info->trans_block_rsv.reserved > 0);
4439         WARN_ON(fs_info->chunk_block_rsv.size > 0);
4440         WARN_ON(fs_info->chunk_block_rsv.reserved > 0);
4441         WARN_ON(fs_info->delayed_block_rsv.size > 0);
4442         WARN_ON(fs_info->delayed_block_rsv.reserved > 0);
4443 }
4444
4445 void btrfs_trans_release_metadata(struct btrfs_trans_handle *trans,
4446                                   struct btrfs_root *root)
4447 {
4448         if (!trans->block_rsv)
4449                 return;
4450
4451         if (!trans->bytes_reserved)
4452                 return;
4453
4454         trace_btrfs_space_reservation(root->fs_info, "transaction",
4455                                       trans->transid, trans->bytes_reserved, 0);
4456         btrfs_block_rsv_release(root, trans->block_rsv, trans->bytes_reserved);
4457         trans->bytes_reserved = 0;
4458 }
4459
4460 /* Can only return 0 or -ENOSPC */
4461 int btrfs_orphan_reserve_metadata(struct btrfs_trans_handle *trans,
4462                                   struct inode *inode)
4463 {
4464         struct btrfs_root *root = BTRFS_I(inode)->root;
4465         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4466         struct btrfs_block_rsv *dst_rsv = root->orphan_block_rsv;
4467
4468         /*
4469          * We need to hold space in order to delete our orphan item once we've
4470          * added it, so this takes the reservation so we can release it later
4471          * when we are truly done with the orphan item.
4472          */
4473         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4474         trace_btrfs_space_reservation(root->fs_info, "orphan",
4475                                       btrfs_ino(inode), num_bytes, 1);
4476         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4477 }
4478
4479 void btrfs_orphan_release_metadata(struct inode *inode)
4480 {
4481         struct btrfs_root *root = BTRFS_I(inode)->root;
4482         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 1);
4483         trace_btrfs_space_reservation(root->fs_info, "orphan",
4484                                       btrfs_ino(inode), num_bytes, 0);
4485         btrfs_block_rsv_release(root, root->orphan_block_rsv, num_bytes);
4486 }
4487
4488 int btrfs_snap_reserve_metadata(struct btrfs_trans_handle *trans,
4489                                 struct btrfs_pending_snapshot *pending)
4490 {
4491         struct btrfs_root *root = pending->root;
4492         struct btrfs_block_rsv *src_rsv = get_block_rsv(trans, root);
4493         struct btrfs_block_rsv *dst_rsv = &pending->block_rsv;
4494         /*
4495          * two for root back/forward refs, two for directory entries,
4496          * one for root of the snapshot and one for parent inode.
4497          */
4498         u64 num_bytes = btrfs_calc_trans_metadata_size(root, 6);
4499         dst_rsv->space_info = src_rsv->space_info;
4500         return block_rsv_migrate_bytes(src_rsv, dst_rsv, num_bytes);
4501 }
4502
4503 /**
4504  * drop_outstanding_extent - drop an outstanding extent
4505  * @inode: the inode we're dropping the extent for
4506  *
4507  * This is called when we are freeing up an outstanding extent, either called
4508  * after an error or after an extent is written.  This will return the number of
4509  * reserved extents that need to be freed.  This must be called with
4510  * BTRFS_I(inode)->lock held.
4511  */
4512 static unsigned drop_outstanding_extent(struct inode *inode)
4513 {
4514         unsigned drop_inode_space = 0;
4515         unsigned dropped_extents = 0;
4516
4517         BUG_ON(!BTRFS_I(inode)->outstanding_extents);
4518         BTRFS_I(inode)->outstanding_extents--;
4519
4520         if (BTRFS_I(inode)->outstanding_extents == 0 &&
4521             test_and_clear_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4522                                &BTRFS_I(inode)->runtime_flags))
4523                 drop_inode_space = 1;
4524
4525         /*
4526          * If we have more or the same amount of outsanding extents than we have
4527          * reserved then we need to leave the reserved extents count alone.
4528          */
4529         if (BTRFS_I(inode)->outstanding_extents >=
4530             BTRFS_I(inode)->reserved_extents)
4531                 return drop_inode_space;
4532
4533         dropped_extents = BTRFS_I(inode)->reserved_extents -
4534                 BTRFS_I(inode)->outstanding_extents;
4535         BTRFS_I(inode)->reserved_extents -= dropped_extents;
4536         return dropped_extents + drop_inode_space;
4537 }
4538
4539 /**
4540  * calc_csum_metadata_size - return the amount of metada space that must be
4541  *      reserved/free'd for the given bytes.
4542  * @inode: the inode we're manipulating
4543  * @num_bytes: the number of bytes in question
4544  * @reserve: 1 if we are reserving space, 0 if we are freeing space
4545  *
4546  * This adjusts the number of csum_bytes in the inode and then returns the
4547  * correct amount of metadata that must either be reserved or freed.  We
4548  * calculate how many checksums we can fit into one leaf and then divide the
4549  * number of bytes that will need to be checksumed by this value to figure out
4550  * how many checksums will be required.  If we are adding bytes then the number
4551  * may go up and we will return the number of additional bytes that must be
4552  * reserved.  If it is going down we will return the number of bytes that must
4553  * be freed.
4554  *
4555  * This must be called with BTRFS_I(inode)->lock held.
4556  */
4557 static u64 calc_csum_metadata_size(struct inode *inode, u64 num_bytes,
4558                                    int reserve)
4559 {
4560         struct btrfs_root *root = BTRFS_I(inode)->root;
4561         u64 csum_size;
4562         int num_csums_per_leaf;
4563         int num_csums;
4564         int old_csums;
4565
4566         if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM &&
4567             BTRFS_I(inode)->csum_bytes == 0)
4568                 return 0;
4569
4570         old_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4571         if (reserve)
4572                 BTRFS_I(inode)->csum_bytes += num_bytes;
4573         else
4574                 BTRFS_I(inode)->csum_bytes -= num_bytes;
4575         csum_size = BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item);
4576         num_csums_per_leaf = (int)div64_u64(csum_size,
4577                                             sizeof(struct btrfs_csum_item) +
4578                                             sizeof(struct btrfs_disk_key));
4579         num_csums = (int)div64_u64(BTRFS_I(inode)->csum_bytes, root->sectorsize);
4580         num_csums = num_csums + num_csums_per_leaf - 1;
4581         num_csums = num_csums / num_csums_per_leaf;
4582
4583         old_csums = old_csums + num_csums_per_leaf - 1;
4584         old_csums = old_csums / num_csums_per_leaf;
4585
4586         /* No change, no need to reserve more */
4587         if (old_csums == num_csums)
4588                 return 0;
4589
4590         if (reserve)
4591                 return btrfs_calc_trans_metadata_size(root,
4592                                                       num_csums - old_csums);
4593
4594         return btrfs_calc_trans_metadata_size(root, old_csums - num_csums);
4595 }
4596
4597 int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes)
4598 {
4599         struct btrfs_root *root = BTRFS_I(inode)->root;
4600         struct btrfs_block_rsv *block_rsv = &root->fs_info->delalloc_block_rsv;
4601         u64 to_reserve = 0;
4602         u64 csum_bytes;
4603         unsigned nr_extents = 0;
4604         int extra_reserve = 0;
4605         enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_ALL;
4606         int ret = 0;
4607         bool delalloc_lock = true;
4608
4609         /* If we are a free space inode we need to not flush since we will be in
4610          * the middle of a transaction commit.  We also don't need the delalloc
4611          * mutex since we won't race with anybody.  We need this mostly to make
4612          * lockdep shut its filthy mouth.
4613          */
4614         if (btrfs_is_free_space_inode(inode)) {
4615                 flush = BTRFS_RESERVE_NO_FLUSH;
4616                 delalloc_lock = false;
4617         }
4618
4619         if (flush != BTRFS_RESERVE_NO_FLUSH &&
4620             btrfs_transaction_in_commit(root->fs_info))
4621                 schedule_timeout(1);
4622
4623         if (delalloc_lock)
4624                 mutex_lock(&BTRFS_I(inode)->delalloc_mutex);
4625
4626         num_bytes = ALIGN(num_bytes, root->sectorsize);
4627
4628         spin_lock(&BTRFS_I(inode)->lock);
4629         BTRFS_I(inode)->outstanding_extents++;
4630
4631         if (BTRFS_I(inode)->outstanding_extents >
4632             BTRFS_I(inode)->reserved_extents)
4633                 nr_extents = BTRFS_I(inode)->outstanding_extents -
4634                         BTRFS_I(inode)->reserved_extents;
4635
4636         /*
4637          * Add an item to reserve for updating the inode when we complete the
4638          * delalloc io.
4639          */
4640         if (!test_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4641                       &BTRFS_I(inode)->runtime_flags)) {
4642                 nr_extents++;
4643                 extra_reserve = 1;
4644         }
4645
4646         to_reserve = btrfs_calc_trans_metadata_size(root, nr_extents);
4647         to_reserve += calc_csum_metadata_size(inode, num_bytes, 1);
4648         csum_bytes = BTRFS_I(inode)->csum_bytes;
4649         spin_unlock(&BTRFS_I(inode)->lock);
4650
4651         if (root->fs_info->quota_enabled)
4652                 ret = btrfs_qgroup_reserve(root, num_bytes +
4653                                            nr_extents * root->leafsize);
4654
4655         /*
4656          * ret != 0 here means the qgroup reservation failed, we go straight to
4657          * the shared error handling then.
4658          */
4659         if (ret == 0)
4660                 ret = reserve_metadata_bytes(root, block_rsv,
4661                                              to_reserve, flush);
4662
4663         if (ret) {
4664                 u64 to_free = 0;
4665                 unsigned dropped;
4666
4667                 spin_lock(&BTRFS_I(inode)->lock);
4668                 dropped = drop_outstanding_extent(inode);
4669                 /*
4670                  * If the inodes csum_bytes is the same as the original
4671                  * csum_bytes then we know we haven't raced with any free()ers
4672                  * so we can just reduce our inodes csum bytes and carry on.
4673                  * Otherwise we have to do the normal free thing to account for
4674                  * the case that the free side didn't free up its reserve
4675                  * because of this outstanding reservation.
4676                  */
4677                 if (BTRFS_I(inode)->csum_bytes == csum_bytes)
4678                         calc_csum_metadata_size(inode, num_bytes, 0);
4679                 else
4680                         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4681                 spin_unlock(&BTRFS_I(inode)->lock);
4682                 if (dropped)
4683                         to_free += btrfs_calc_trans_metadata_size(root, dropped);
4684
4685                 if (to_free) {
4686                         btrfs_block_rsv_release(root, block_rsv, to_free);
4687                         trace_btrfs_space_reservation(root->fs_info,
4688                                                       "delalloc",
4689                                                       btrfs_ino(inode),
4690                                                       to_free, 0);
4691                 }
4692                 if (root->fs_info->quota_enabled) {
4693                         btrfs_qgroup_free(root, num_bytes +
4694                                                 nr_extents * root->leafsize);
4695                 }
4696                 if (delalloc_lock)
4697                         mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4698                 return ret;
4699         }
4700
4701         spin_lock(&BTRFS_I(inode)->lock);
4702         if (extra_reserve) {
4703                 set_bit(BTRFS_INODE_DELALLOC_META_RESERVED,
4704                         &BTRFS_I(inode)->runtime_flags);
4705                 nr_extents--;
4706         }
4707         BTRFS_I(inode)->reserved_extents += nr_extents;
4708         spin_unlock(&BTRFS_I(inode)->lock);
4709
4710         if (delalloc_lock)
4711                 mutex_unlock(&BTRFS_I(inode)->delalloc_mutex);
4712
4713         if (to_reserve)
4714                 trace_btrfs_space_reservation(root->fs_info,"delalloc",
4715                                               btrfs_ino(inode), to_reserve, 1);
4716         block_rsv_add_bytes(block_rsv, to_reserve, 1);
4717
4718         return 0;
4719 }
4720
4721 /**
4722  * btrfs_delalloc_release_metadata - release a metadata reservation for an inode
4723  * @inode: the inode to release the reservation for
4724  * @num_bytes: the number of bytes we're releasing
4725  *
4726  * This will release the metadata reservation for an inode.  This can be called
4727  * once we complete IO for a given set of bytes to release their metadata
4728  * reservations.
4729  */
4730 void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes)
4731 {
4732         struct btrfs_root *root = BTRFS_I(inode)->root;
4733         u64 to_free = 0;
4734         unsigned dropped;
4735
4736         num_bytes = ALIGN(num_bytes, root->sectorsize);
4737         spin_lock(&BTRFS_I(inode)->lock);
4738         dropped = drop_outstanding_extent(inode);
4739
4740         to_free = calc_csum_metadata_size(inode, num_bytes, 0);
4741         spin_unlock(&BTRFS_I(inode)->lock);
4742         if (dropped > 0)
4743                 to_free += btrfs_calc_trans_metadata_size(root, dropped);
4744
4745         trace_btrfs_space_reservation(root->fs_info, "delalloc",
4746                                       btrfs_ino(inode), to_free, 0);
4747         if (root->fs_info->quota_enabled) {
4748                 btrfs_qgroup_free(root, num_bytes +
4749                                         dropped * root->leafsize);
4750         }
4751
4752         btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv,
4753                                 to_free);
4754 }
4755
4756 /**
4757  * btrfs_delalloc_reserve_space - reserve data and metadata space for delalloc
4758  * @inode: inode we're writing to
4759  * @num_bytes: the number of bytes we want to allocate
4760  *
4761  * This will do the following things
4762  *
4763  * o reserve space in the data space info for num_bytes
4764  * o reserve space in the metadata space info based on number of outstanding
4765  *   extents and how much csums will be needed
4766  * o add to the inodes ->delalloc_bytes
4767  * o add it to the fs_info's delalloc inodes list.
4768  *
4769  * This will return 0 for success and -ENOSPC if there is no space left.
4770  */
4771 int btrfs_delalloc_reserve_space(struct inode *inode, u64 num_bytes)
4772 {
4773         int ret;
4774
4775         ret = btrfs_check_data_free_space(inode, num_bytes);
4776         if (ret)
4777                 return ret;
4778
4779         ret = btrfs_delalloc_reserve_metadata(inode, num_bytes);
4780         if (ret) {
4781                 btrfs_free_reserved_data_space(inode, num_bytes);
4782                 return ret;
4783         }
4784
4785         return 0;
4786 }
4787
4788 /**
4789  * btrfs_delalloc_release_space - release data and metadata space for delalloc
4790  * @inode: inode we're releasing space for
4791  * @num_bytes: the number of bytes we want to free up
4792  *
4793  * This must be matched with a call to btrfs_delalloc_reserve_space.  This is
4794  * called in the case that we don't need the metadata AND data reservations
4795  * anymore.  So if there is an error or we insert an inline extent.
4796  *
4797  * This function will release the metadata space that was not used and will
4798  * decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
4799  * list if there are no delalloc bytes left.
4800  */
4801 void btrfs_delalloc_release_space(struct inode *inode, u64 num_bytes)
4802 {
4803         btrfs_delalloc_release_metadata(inode, num_bytes);
4804         btrfs_free_reserved_data_space(inode, num_bytes);
4805 }
4806
4807 static int update_block_group(struct btrfs_root *root,
4808                               u64 bytenr, u64 num_bytes, int alloc)
4809 {
4810         struct btrfs_block_group_cache *cache = NULL;
4811         struct btrfs_fs_info *info = root->fs_info;
4812         u64 total = num_bytes;
4813         u64 old_val;
4814         u64 byte_in_group;
4815         int factor;
4816
4817         /* block accounting for super block */
4818         spin_lock(&info->delalloc_lock);
4819         old_val = btrfs_super_bytes_used(info->super_copy);
4820         if (alloc)
4821                 old_val += num_bytes;
4822         else
4823                 old_val -= num_bytes;
4824         btrfs_set_super_bytes_used(info->super_copy, old_val);
4825         spin_unlock(&info->delalloc_lock);
4826
4827         while (total) {
4828                 cache = btrfs_lookup_block_group(info, bytenr);
4829                 if (!cache)
4830                         return -ENOENT;
4831                 if (cache->flags & (BTRFS_BLOCK_GROUP_DUP |
4832                                     BTRFS_BLOCK_GROUP_RAID1 |
4833                                     BTRFS_BLOCK_GROUP_RAID10))
4834                         factor = 2;
4835                 else
4836                         factor = 1;
4837                 /*
4838                  * If this block group has free space cache written out, we
4839                  * need to make sure to load it if we are removing space.  This
4840                  * is because we need the unpinning stage to actually add the
4841                  * space back to the block group, otherwise we will leak space.
4842                  */
4843                 if (!alloc && cache->cached == BTRFS_CACHE_NO)
4844                         cache_block_group(cache, 1);
4845
4846                 byte_in_group = bytenr - cache->key.objectid;
4847                 WARN_ON(byte_in_group > cache->key.offset);
4848
4849                 spin_lock(&cache->space_info->lock);
4850                 spin_lock(&cache->lock);
4851
4852                 if (btrfs_test_opt(root, SPACE_CACHE) &&
4853                     cache->disk_cache_state < BTRFS_DC_CLEAR)
4854                         cache->disk_cache_state = BTRFS_DC_CLEAR;
4855
4856                 cache->dirty = 1;
4857                 old_val = btrfs_block_group_used(&cache->item);
4858                 num_bytes = min(total, cache->key.offset - byte_in_group);
4859                 if (alloc) {
4860                         old_val += num_bytes;
4861                         btrfs_set_block_group_used(&cache->item, old_val);
4862                         cache->reserved -= num_bytes;
4863                         cache->space_info->bytes_reserved -= num_bytes;
4864                         cache->space_info->bytes_used += num_bytes;
4865                         cache->space_info->disk_used += num_bytes * factor;
4866                         spin_unlock(&cache->lock);
4867                         spin_unlock(&cache->space_info->lock);
4868                 } else {
4869                         old_val -= num_bytes;
4870                         btrfs_set_block_group_used(&cache->item, old_val);
4871                         cache->pinned += num_bytes;
4872                         cache->space_info->bytes_pinned += num_bytes;
4873                         cache->space_info->bytes_used -= num_bytes;
4874                         cache->space_info->disk_used -= num_bytes * factor;
4875                         spin_unlock(&cache->lock);
4876                         spin_unlock(&cache->space_info->lock);
4877
4878                         set_extent_dirty(info->pinned_extents,
4879                                          bytenr, bytenr + num_bytes - 1,
4880                                          GFP_NOFS | __GFP_NOFAIL);
4881                 }
4882                 btrfs_put_block_group(cache);
4883                 total -= num_bytes;
4884                 bytenr += num_bytes;
4885         }
4886         return 0;
4887 }
4888
4889 static u64 first_logical_byte(struct btrfs_root *root, u64 search_start)
4890 {
4891         struct btrfs_block_group_cache *cache;
4892         u64 bytenr;
4893
4894         spin_lock(&root->fs_info->block_group_cache_lock);
4895         bytenr = root->fs_info->first_logical_byte;
4896         spin_unlock(&root->fs_info->block_group_cache_lock);
4897
4898         if (bytenr < (u64)-1)
4899                 return bytenr;
4900
4901         cache = btrfs_lookup_first_block_group(root->fs_info, search_start);
4902         if (!cache)
4903                 return 0;
4904
4905         bytenr = cache->key.objectid;
4906         btrfs_put_block_group(cache);
4907
4908         return bytenr;
4909 }
4910
4911 static int pin_down_extent(struct btrfs_root *root,
4912                            struct btrfs_block_group_cache *cache,
4913                            u64 bytenr, u64 num_bytes, int reserved)
4914 {
4915         spin_lock(&cache->space_info->lock);
4916         spin_lock(&cache->lock);
4917         cache->pinned += num_bytes;
4918         cache->space_info->bytes_pinned += num_bytes;
4919         if (reserved) {
4920                 cache->reserved -= num_bytes;
4921                 cache->space_info->bytes_reserved -= num_bytes;
4922         }
4923         spin_unlock(&cache->lock);
4924         spin_unlock(&cache->space_info->lock);
4925
4926         set_extent_dirty(root->fs_info->pinned_extents, bytenr,
4927                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
4928         return 0;
4929 }
4930
4931 /*
4932  * this function must be called within transaction
4933  */
4934 int btrfs_pin_extent(struct btrfs_root *root,
4935                      u64 bytenr, u64 num_bytes, int reserved)
4936 {
4937         struct btrfs_block_group_cache *cache;
4938
4939         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4940         BUG_ON(!cache); /* Logic error */
4941
4942         pin_down_extent(root, cache, bytenr, num_bytes, reserved);
4943
4944         btrfs_put_block_group(cache);
4945         return 0;
4946 }
4947
4948 /*
4949  * this function must be called within transaction
4950  */
4951 int btrfs_pin_extent_for_log_replay(struct btrfs_root *root,
4952                                     u64 bytenr, u64 num_bytes)
4953 {
4954         struct btrfs_block_group_cache *cache;
4955
4956         cache = btrfs_lookup_block_group(root->fs_info, bytenr);
4957         BUG_ON(!cache); /* Logic error */
4958
4959         /*
4960          * pull in the free space cache (if any) so that our pin
4961          * removes the free space from the cache.  We have load_only set
4962          * to one because the slow code to read in the free extents does check
4963          * the pinned extents.
4964          */
4965         cache_block_group(cache, 1);
4966
4967         pin_down_extent(root, cache, bytenr, num_bytes, 0);
4968
4969         /* remove us from the free space cache (if we're there at all) */
4970         btrfs_remove_free_space(cache, bytenr, num_bytes);
4971         btrfs_put_block_group(cache);
4972         return 0;
4973 }
4974
4975 /**
4976  * btrfs_update_reserved_bytes - update the block_group and space info counters
4977  * @cache:      The cache we are manipulating
4978  * @num_bytes:  The number of bytes in question
4979  * @reserve:    One of the reservation enums
4980  *
4981  * This is called by the allocator when it reserves space, or by somebody who is
4982  * freeing space that was never actually used on disk.  For example if you
4983  * reserve some space for a new leaf in transaction A and before transaction A
4984  * commits you free that leaf, you call this with reserve set to 0 in order to
4985  * clear the reservation.
4986  *
4987  * Metadata reservations should be called with RESERVE_ALLOC so we do the proper
4988  * ENOSPC accounting.  For data we handle the reservation through clearing the
4989  * delalloc bits in the io_tree.  We have to do this since we could end up
4990  * allocating less disk space for the amount of data we have reserved in the
4991  * case of compression.
4992  *
4993  * If this is a reservation and the block group has become read only we cannot
4994  * make the reservation and return -EAGAIN, otherwise this function always
4995  * succeeds.
4996  */
4997 static int btrfs_update_reserved_bytes(struct btrfs_block_group_cache *cache,
4998                                        u64 num_bytes, int reserve)
4999 {
5000         struct btrfs_space_info *space_info = cache->space_info;
5001         int ret = 0;
5002
5003         spin_lock(&space_info->lock);
5004         spin_lock(&cache->lock);
5005         if (reserve != RESERVE_FREE) {
5006                 if (cache->ro) {
5007                         ret = -EAGAIN;
5008                 } else {
5009                         cache->reserved += num_bytes;
5010                         space_info->bytes_reserved += num_bytes;
5011                         if (reserve == RESERVE_ALLOC) {
5012                                 trace_btrfs_space_reservation(cache->fs_info,
5013                                                 "space_info", space_info->flags,
5014                                                 num_bytes, 0);
5015                                 space_info->bytes_may_use -= num_bytes;
5016                         }
5017                 }
5018         } else {
5019                 if (cache->ro)
5020                         space_info->bytes_readonly += num_bytes;
5021                 cache->reserved -= num_bytes;
5022                 space_info->bytes_reserved -= num_bytes;
5023                 space_info->reservation_progress++;
5024         }
5025         spin_unlock(&cache->lock);
5026         spin_unlock(&space_info->lock);
5027         return ret;
5028 }
5029
5030 void btrfs_prepare_extent_commit(struct btrfs_trans_handle *trans,
5031                                 struct btrfs_root *root)
5032 {
5033         struct btrfs_fs_info *fs_info = root->fs_info;
5034         struct btrfs_caching_control *next;
5035         struct btrfs_caching_control *caching_ctl;
5036         struct btrfs_block_group_cache *cache;
5037
5038         down_write(&fs_info->extent_commit_sem);
5039
5040         list_for_each_entry_safe(caching_ctl, next,
5041                                  &fs_info->caching_block_groups, list) {
5042                 cache = caching_ctl->block_group;
5043                 if (block_group_cache_done(cache)) {
5044                         cache->last_byte_to_unpin = (u64)-1;
5045                         list_del_init(&caching_ctl->list);
5046                         put_caching_control(caching_ctl);
5047                 } else {
5048                         cache->last_byte_to_unpin = caching_ctl->progress;
5049                 }
5050         }
5051
5052         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5053                 fs_info->pinned_extents = &fs_info->freed_extents[1];
5054         else
5055                 fs_info->pinned_extents = &fs_info->freed_extents[0];
5056
5057         up_write(&fs_info->extent_commit_sem);
5058
5059         update_global_block_rsv(fs_info);
5060 }
5061
5062 static int unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
5063 {
5064         struct btrfs_fs_info *fs_info = root->fs_info;
5065         struct btrfs_block_group_cache *cache = NULL;
5066         struct btrfs_space_info *space_info;
5067         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
5068         u64 len;
5069         bool readonly;
5070
5071         while (start <= end) {
5072                 readonly = false;
5073                 if (!cache ||
5074                     start >= cache->key.objectid + cache->key.offset) {
5075                         if (cache)
5076                                 btrfs_put_block_group(cache);
5077                         cache = btrfs_lookup_block_group(fs_info, start);
5078                         BUG_ON(!cache); /* Logic error */
5079                 }
5080
5081                 len = cache->key.objectid + cache->key.offset - start;
5082                 len = min(len, end + 1 - start);
5083
5084                 if (start < cache->last_byte_to_unpin) {
5085                         len = min(len, cache->last_byte_to_unpin - start);
5086                         btrfs_add_free_space(cache, start, len);
5087                 }
5088
5089                 start += len;
5090                 space_info = cache->space_info;
5091
5092                 spin_lock(&space_info->lock);
5093                 spin_lock(&cache->lock);
5094                 cache->pinned -= len;
5095                 space_info->bytes_pinned -= len;
5096                 if (cache->ro) {
5097                         space_info->bytes_readonly += len;
5098                         readonly = true;
5099                 }
5100                 spin_unlock(&cache->lock);
5101                 if (!readonly && global_rsv->space_info == space_info) {
5102                         spin_lock(&global_rsv->lock);
5103                         if (!global_rsv->full) {
5104                                 len = min(len, global_rsv->size -
5105                                           global_rsv->reserved);
5106                                 global_rsv->reserved += len;
5107                                 space_info->bytes_may_use += len;
5108                                 if (global_rsv->reserved >= global_rsv->size)
5109                                         global_rsv->full = 1;
5110                         }
5111                         spin_unlock(&global_rsv->lock);
5112                 }
5113                 spin_unlock(&space_info->lock);
5114         }
5115
5116         if (cache)
5117                 btrfs_put_block_group(cache);
5118         return 0;
5119 }
5120
5121 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
5122                                struct btrfs_root *root)
5123 {
5124         struct btrfs_fs_info *fs_info = root->fs_info;
5125         struct extent_io_tree *unpin;
5126         u64 start;
5127         u64 end;
5128         int ret;
5129
5130         if (trans->aborted)
5131                 return 0;
5132
5133         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
5134                 unpin = &fs_info->freed_extents[1];
5135         else
5136                 unpin = &fs_info->freed_extents[0];
5137
5138         while (1) {
5139                 ret = find_first_extent_bit(unpin, 0, &start, &end,
5140                                             EXTENT_DIRTY, NULL);
5141                 if (ret)
5142                         break;
5143
5144                 if (btrfs_test_opt(root, DISCARD))
5145                         ret = btrfs_discard_extent(root, start,
5146                                                    end + 1 - start, NULL);
5147
5148                 clear_extent_dirty(unpin, start, end, GFP_NOFS);
5149                 unpin_extent_range(root, start, end);
5150                 cond_resched();
5151         }
5152
5153         return 0;
5154 }
5155
5156 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
5157                                 struct btrfs_root *root,
5158                                 u64 bytenr, u64 num_bytes, u64 parent,
5159                                 u64 root_objectid, u64 owner_objectid,
5160                                 u64 owner_offset, int refs_to_drop,
5161                                 struct btrfs_delayed_extent_op *extent_op)
5162 {
5163         struct btrfs_key key;
5164         struct btrfs_path *path;
5165         struct btrfs_fs_info *info = root->fs_info;
5166         struct btrfs_root *extent_root = info->extent_root;
5167         struct extent_buffer *leaf;
5168         struct btrfs_extent_item *ei;
5169         struct btrfs_extent_inline_ref *iref;
5170         int ret;
5171         int is_data;
5172         int extent_slot = 0;
5173         int found_extent = 0;
5174         int num_to_del = 1;
5175         u32 item_size;
5176         u64 refs;
5177
5178         path = btrfs_alloc_path();
5179         if (!path)
5180                 return -ENOMEM;
5181
5182         path->reada = 1;
5183         path->leave_spinning = 1;
5184
5185         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
5186         BUG_ON(!is_data && refs_to_drop != 1);
5187
5188         ret = lookup_extent_backref(trans, extent_root, path, &iref,
5189                                     bytenr, num_bytes, parent,
5190                                     root_objectid, owner_objectid,
5191                                     owner_offset);
5192         if (ret == 0) {
5193                 extent_slot = path->slots[0];
5194                 while (extent_slot >= 0) {
5195                         btrfs_item_key_to_cpu(path->nodes[0], &key,
5196                                               extent_slot);
5197                         if (key.objectid != bytenr)
5198                                 break;
5199                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
5200                             key.offset == num_bytes) {
5201                                 found_extent = 1;
5202                                 break;
5203                         }
5204                         if (path->slots[0] - extent_slot > 5)
5205                                 break;
5206                         extent_slot--;
5207                 }
5208 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5209                 item_size = btrfs_item_size_nr(path->nodes[0], extent_slot);
5210                 if (found_extent && item_size < sizeof(*ei))
5211                         found_extent = 0;
5212 #endif
5213                 if (!found_extent) {
5214                         BUG_ON(iref);
5215                         ret = remove_extent_backref(trans, extent_root, path,
5216                                                     NULL, refs_to_drop,
5217                                                     is_data);
5218                         if (ret) {
5219                                 btrfs_abort_transaction(trans, extent_root, ret);
5220                                 goto out;
5221                         }
5222                         btrfs_release_path(path);
5223                         path->leave_spinning = 1;
5224
5225                         key.objectid = bytenr;
5226                         key.type = BTRFS_EXTENT_ITEM_KEY;
5227                         key.offset = num_bytes;
5228
5229                         ret = btrfs_search_slot(trans, extent_root,
5230                                                 &key, path, -1, 1);
5231                         if (ret) {
5232                                 printk(KERN_ERR "umm, got %d back from search"
5233                                        ", was looking for %llu\n", ret,
5234                                        (unsigned long long)bytenr);
5235                                 if (ret > 0)
5236                                         btrfs_print_leaf(extent_root,
5237                                                          path->nodes[0]);
5238                         }
5239                         if (ret < 0) {
5240                                 btrfs_abort_transaction(trans, extent_root, ret);
5241                                 goto out;
5242                         }
5243                         extent_slot = path->slots[0];
5244                 }
5245         } else if (ret == -ENOENT) {
5246                 btrfs_print_leaf(extent_root, path->nodes[0]);
5247                 WARN_ON(1);
5248                 printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
5249                        "parent %llu root %llu  owner %llu offset %llu\n",
5250                        (unsigned long long)bytenr,
5251                        (unsigned long long)parent,
5252                        (unsigned long long)root_objectid,
5253                        (unsigned long long)owner_objectid,
5254                        (unsigned long long)owner_offset);
5255         } else {
5256                 btrfs_abort_transaction(trans, extent_root, ret);
5257                 goto out;
5258         }
5259
5260         leaf = path->nodes[0];
5261         item_size = btrfs_item_size_nr(leaf, extent_slot);
5262 #ifdef BTRFS_COMPAT_EXTENT_TREE_V0
5263         if (item_size < sizeof(*ei)) {
5264                 BUG_ON(found_extent || extent_slot != path->slots[0]);
5265                 ret = convert_extent_item_v0(trans, extent_root, path,
5266                                              owner_objectid, 0);
5267                 if (ret < 0) {
5268                         btrfs_abort_transaction(trans, extent_root, ret);
5269                         goto out;
5270                 }
5271
5272                 btrfs_release_path(path);
5273                 path->leave_spinning = 1;
5274
5275                 key.objectid = bytenr;
5276                 key.type = BTRFS_EXTENT_ITEM_KEY;
5277                 key.offset = num_bytes;
5278
5279                 ret = btrfs_search_slot(trans, extent_root, &key, path,
5280                                         -1, 1);
5281                 if (ret) {
5282                         printk(KERN_ERR "umm, got %d back from search"
5283                                ", was looking for %llu\n", ret,
5284                                (unsigned long long)bytenr);
5285                         btrfs_print_leaf(extent_root, path->nodes[0]);
5286                 }
5287                 if (ret < 0) {
5288                         btrfs_abort_transaction(trans, extent_root, ret);
5289                         goto out;
5290                 }
5291
5292                 extent_slot = path->slots[0];
5293                 leaf = path->nodes[0];
5294                 item_size = btrfs_item_size_nr(leaf, extent_slot);
5295         }
5296 #endif
5297         BUG_ON(item_size < sizeof(*ei));
5298         ei = btrfs_item_ptr(leaf, extent_slot,
5299                             struct btrfs_extent_item);
5300         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
5301                 struct btrfs_tree_block_info *bi;
5302                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
5303                 bi = (struct btrfs_tree_block_info *)(ei + 1);
5304                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
5305         }
5306
5307         refs = btrfs_extent_refs(leaf, ei);
5308         BUG_ON(refs < refs_to_drop);
5309         refs -= refs_to_drop;
5310
5311         if (refs > 0) {
5312                 if (extent_op)
5313                         __run_delayed_extent_op(extent_op, leaf, ei);
5314                 /*
5315                  * In the case of inline back ref, reference count will
5316                  * be updated by remove_extent_backref
5317                  */
5318                 if (iref) {
5319                         BUG_ON(!found_extent);
5320                 } else {
5321                         btrfs_set_extent_refs(leaf, ei, refs);
5322                         btrfs_mark_buffer_dirty(leaf);
5323                 }
5324                 if (found_extent) {
5325                         ret = remove_extent_backref(trans, extent_root, path,
5326                                                     iref, refs_to_drop,
5327                                                     is_data);
5328                         if (ret) {
5329                                 btrfs_abort_transaction(trans, extent_root, ret);
5330                                 goto out;
5331                         }
5332                 }
5333         } else {
5334                 if (found_extent) {
5335                         BUG_ON(is_data && refs_to_drop !=
5336                                extent_data_ref_count(root, path, iref));
5337                         if (iref) {
5338                                 BUG_ON(path->slots[0] != extent_slot);
5339                         } else {
5340                                 BUG_ON(path->slots[0] != extent_slot + 1);
5341                                 path->slots[0] = extent_slot;
5342                                 num_to_del = 2;
5343                         }
5344                 }
5345
5346                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
5347                                       num_to_del);
5348                 if (ret) {
5349                         btrfs_abort_transaction(trans, extent_root, ret);
5350                         goto out;
5351                 }
5352                 btrfs_release_path(path);
5353
5354                 if (is_data) {
5355                         ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
5356                         if (ret) {
5357                                 btrfs_abort_transaction(trans, extent_root, ret);
5358                                 goto out;
5359                         }
5360                 }
5361
5362                 ret = update_block_group(root, bytenr, num_bytes, 0);
5363                 if (ret) {
5364                         btrfs_abort_transaction(trans, extent_root, ret);
5365                         goto out;
5366                 }
5367         }
5368 out:
5369         btrfs_free_path(path);
5370         return ret;
5371 }
5372
5373 /*
5374  * when we free an block, it is possible (and likely) that we free the last
5375  * delayed ref for that extent as well.  This searches the delayed ref tree for
5376  * a given extent, and if there are no other delayed refs to be processed, it
5377  * removes it from the tree.
5378  */
5379 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
5380                                       struct btrfs_root *root, u64 bytenr)
5381 {
5382         struct btrfs_delayed_ref_head *head;
5383         struct btrfs_delayed_ref_root *delayed_refs;
5384         struct btrfs_delayed_ref_node *ref;
5385         struct rb_node *node;
5386         int ret = 0;
5387
5388         delayed_refs = &trans->transaction->delayed_refs;
5389         spin_lock(&delayed_refs->lock);
5390         head = btrfs_find_delayed_ref_head(trans, bytenr);
5391         if (!head)
5392                 goto out;
5393
5394         node = rb_prev(&head->node.rb_node);
5395         if (!node)
5396                 goto out;
5397
5398         ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
5399
5400         /* there are still entries for this ref, we can't drop it */
5401         if (ref->bytenr == bytenr)
5402                 goto out;
5403
5404         if (head->extent_op) {
5405                 if (!head->must_insert_reserved)
5406                         goto out;
5407                 btrfs_free_delayed_extent_op(head->extent_op);
5408                 head->extent_op = NULL;
5409         }
5410
5411         /*
5412          * waiting for the lock here would deadlock.  If someone else has it
5413          * locked they are already in the process of dropping it anyway
5414          */
5415         if (!mutex_trylock(&head->mutex))
5416                 goto out;
5417
5418         /*
5419          * at this point we have a head with no other entries.  Go
5420          * ahead and process it.
5421          */
5422         head->node.in_tree = 0;
5423         rb_erase(&head->node.rb_node, &delayed_refs->root);
5424
5425         delayed_refs->num_entries--;
5426
5427         /*
5428          * we don't take a ref on the node because we're removing it from the
5429          * tree, so we just steal the ref the tree was holding.
5430          */
5431         delayed_refs->num_heads--;
5432         if (list_empty(&head->cluster))
5433                 delayed_refs->num_heads_ready--;
5434
5435         list_del_init(&head->cluster);
5436         spin_unlock(&delayed_refs->lock);
5437
5438         BUG_ON(head->extent_op);
5439         if (head->must_insert_reserved)
5440                 ret = 1;
5441
5442         mutex_unlock(&head->mutex);
5443         btrfs_put_delayed_ref(&head->node);
5444         return ret;
5445 out:
5446         spin_unlock(&delayed_refs->lock);
5447         return 0;
5448 }
5449
5450 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
5451                            struct btrfs_root *root,
5452                            struct extent_buffer *buf,
5453                            u64 parent, int last_ref)
5454 {
5455         struct btrfs_block_group_cache *cache = NULL;
5456         int ret;
5457
5458         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5459                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
5460                                         buf->start, buf->len,
5461                                         parent, root->root_key.objectid,
5462                                         btrfs_header_level(buf),
5463                                         BTRFS_DROP_DELAYED_REF, NULL, 0);
5464                 BUG_ON(ret); /* -ENOMEM */
5465         }
5466
5467         if (!last_ref)
5468                 return;
5469
5470         cache = btrfs_lookup_block_group(root->fs_info, buf->start);
5471
5472         if (btrfs_header_generation(buf) == trans->transid) {
5473                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
5474                         ret = check_ref_cleanup(trans, root, buf->start);
5475                         if (!ret)
5476                                 goto out;
5477                 }
5478
5479                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
5480                         pin_down_extent(root, cache, buf->start, buf->len, 1);
5481                         goto out;
5482                 }
5483
5484                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
5485
5486                 btrfs_add_free_space(cache, buf->start, buf->len);
5487                 btrfs_update_reserved_bytes(cache, buf->len, RESERVE_FREE);
5488         }
5489 out:
5490         /*
5491          * Deleting the buffer, clear the corrupt flag since it doesn't matter
5492          * anymore.
5493          */
5494         clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
5495         btrfs_put_block_group(cache);
5496 }
5497
5498 /* Can return -ENOMEM */
5499 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5500                       u64 bytenr, u64 num_bytes, u64 parent, u64 root_objectid,
5501                       u64 owner, u64 offset, int for_cow)
5502 {
5503         int ret;
5504         struct btrfs_fs_info *fs_info = root->fs_info;
5505
5506         /*
5507          * tree log blocks never actually go into the extent allocation
5508          * tree, just update pinning info and exit early.
5509          */
5510         if (root_objectid == BTRFS_TREE_LOG_OBJECTID) {
5511                 WARN_ON(owner >= BTRFS_FIRST_FREE_OBJECTID);
5512                 /* unlocks the pinned mutex */
5513                 btrfs_pin_extent(root, bytenr, num_bytes, 1);
5514                 ret = 0;
5515         } else if (owner < BTRFS_FIRST_FREE_OBJECTID) {
5516                 ret = btrfs_add_delayed_tree_ref(fs_info, trans, bytenr,
5517                                         num_bytes,
5518                                         parent, root_objectid, (int)owner,
5519                                         BTRFS_DROP_DELAYED_REF, NULL, for_cow);
5520         } else {
5521                 ret = btrfs_add_delayed_data_ref(fs_info, trans, bytenr,
5522                                                 num_bytes,
5523                                                 parent, root_objectid, owner,
5524                                                 offset, BTRFS_DROP_DELAYED_REF,
5525                                                 NULL, for_cow);
5526         }
5527         return ret;
5528 }
5529
5530 static u64 stripe_align(struct btrfs_root *root, u64 val)
5531 {
5532         u64 mask = ((u64)root->stripesize - 1);
5533         u64 ret = (val + mask) & ~mask;
5534         return ret;
5535 }
5536
5537 /*
5538  * when we wait for progress in the block group caching, its because
5539  * our allocation attempt failed at least once.  So, we must sleep
5540  * and let some progress happen before we try again.
5541  *
5542  * This function will sleep at least once waiting for new free space to
5543  * show up, and then it will check the block group free space numbers
5544  * for our min num_bytes.  Another option is to have it go ahead
5545  * and look in the rbtree for a free extent of a given size, but this
5546  * is a good start.
5547  */
5548 static noinline int
5549 wait_block_group_cache_progress(struct btrfs_block_group_cache *cache,
5550                                 u64 num_bytes)
5551 {
5552         struct btrfs_caching_control *caching_ctl;
5553
5554         caching_ctl = get_caching_control(cache);
5555         if (!caching_ctl)
5556                 return 0;
5557
5558         wait_event(caching_ctl->wait, block_group_cache_done(cache) ||
5559                    (cache->free_space_ctl->free_space >= num_bytes));
5560
5561         put_caching_control(caching_ctl);
5562         return 0;
5563 }
5564
5565 static noinline int
5566 wait_block_group_cache_done(struct btrfs_block_group_cache *cache)
5567 {
5568         struct btrfs_caching_control *caching_ctl;
5569
5570         caching_ctl = get_caching_control(cache);
5571         if (!caching_ctl)
5572                 return 0;
5573
5574         wait_event(caching_ctl->wait, block_group_cache_done(cache));
5575
5576         put_caching_control(caching_ctl);
5577         return 0;
5578 }
5579
5580 int __get_raid_index(u64 flags)
5581 {
5582         if (flags & BTRFS_BLOCK_GROUP_RAID10)
5583                 return BTRFS_RAID_RAID10;
5584         else if (flags & BTRFS_BLOCK_GROUP_RAID1)
5585                 return BTRFS_RAID_RAID1;
5586         else if (flags & BTRFS_BLOCK_GROUP_DUP)
5587                 return BTRFS_RAID_DUP;
5588         else if (flags & BTRFS_BLOCK_GROUP_RAID0)
5589                 return BTRFS_RAID_RAID0;
5590         else
5591                 return BTRFS_RAID_SINGLE;
5592 }
5593
5594 static int get_block_group_index(struct btrfs_block_group_cache *cache)
5595 {
5596         return __get_raid_index(cache->flags);
5597 }
5598
/*
 * Escalation stages of the block group search in find_free_extent().
 * The values are consecutive and start at 0; the search compares and
 * increments them numerically.
 */
enum btrfs_loop_type {
        /* search partially cached block groups, kicking caching as we go */
        LOOP_CACHING_NOWAIT = 0,
        /* search everything, and wait if a block group is still caching */
        LOOP_CACHING_WAIT,
        /* force a chunk allocation and try again */
        LOOP_ALLOC_CHUNK,
        /* drop empty_size and empty_cluster to 0 and try again */
        LOOP_NO_EMPTY_SIZE,
};
5605
5606 /*
5607  * Find and reserve a free extent of num_bytes bytes in one of the block
5608  * groups attached to the space_info selected by 'data'.
5609  * The key ins is changed to record the result:
5610  * ins->objectid == start byte of the extent (after stripe_align())
5611  * key type      == BTRFS_EXTENT_ITEM_KEY
5612  * ins->offset   == num_bytes
      *
      * trans:      passed on to the cluster and chunk allocators and to
      *             btrfs_abort_transaction() on a chunk allocation failure
      * orig_root:  root the request is made for; used only for tracepoints and
      *             to reach fs_info, the search itself uses the extent root
      * num_bytes:  size of the extent wanted
      * empty_size: extra free space asked for on top of num_bytes; forced to 0
      *             in the final LOOP_NO_EMPTY_SIZE pass
      * hint_byte:  preferred start address; replaced by the cluster's
      *             window_start when the cluster already has a block group
      * ins:        output key, objectid and offset are zeroed on entry
      * data:       block group flags of the allocation (BTRFS_BLOCK_GROUP_*)
      *
      * The block group lists are walked with space_info->groups_sem held for
      * reading.
      *
      * Returns 0 on success, -ENOSPC when there is no space_info for 'data' or
      * no block group could satisfy the request after all passes, or the
      * negative error from do_chunk_alloc() (the transaction is aborted in
      * that case).
5613  */
5614 static noinline int find_free_extent(struct btrfs_trans_handle *trans,
5615                                      struct btrfs_root *orig_root,
5616                                      u64 num_bytes, u64 empty_size,
5617                                      u64 hint_byte, struct btrfs_key *ins,
5618                                      u64 data)
5619 {
5620         int ret = 0;
5621         struct btrfs_root *root = orig_root->fs_info->extent_root;
5622         struct btrfs_free_cluster *last_ptr = NULL;
5623         struct btrfs_block_group_cache *block_group = NULL;
5624         struct btrfs_block_group_cache *used_block_group;
5625         u64 search_start = 0;
5626         int empty_cluster = 2 * 1024 * 1024;
5627         struct btrfs_space_info *space_info;
             /* current enum btrfs_loop_type stage, escalated after each full scan */
5628         int loop = 0;
             /* start on the block group list matching the requested profile */
5629         int index = __get_raid_index(data);
5630         int alloc_type = (data & BTRFS_BLOCK_GROUP_DATA) ?
5631                 RESERVE_ALLOC_NO_ACCOUNT : RESERVE_ALLOC;
             /* NOTE(review): found_uncached_bg is only written below, never read */
5632         bool found_uncached_bg = false;
5633         bool failed_cluster_refill = false;
5634         bool failed_alloc = false;
5635         bool use_cluster = true;
5636         bool have_caching_bg = false;
5637
5638         WARN_ON(num_bytes < root->sectorsize);
5639         btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
5640         ins->objectid = 0;
5641         ins->offset = 0;
5642
5643         trace_find_free_extent(orig_root, num_bytes, empty_size, data);
5644
5645         space_info = __find_space_info(root->fs_info, data);
5646         if (!space_info) {
                     /*
                      * NOTE(review): 'data' is a u64 passed to %llu without the
                      * (unsigned long long) cast the other printk()s in this
                      * file use.
                      */
5647                 printk(KERN_ERR "No space info for %llu\n", data);
5648                 return -ENOSPC;
5649         }
5650
5651         /*
5652          * If the space info is for both data and metadata it means we have a
5653          * small filesystem and we can't use the clustering stuff.
5654          */
5655         if (btrfs_mixed_space_info(space_info))
5656                 use_cluster = false;
5657
             /*
              * Metadata uses the metadata cluster whenever clustering is
              * allowed.  Without the SSD option the cluster size drops from
              * the 2M default to 64K.
              */
5658         if (data & BTRFS_BLOCK_GROUP_METADATA && use_cluster) {
5659                 last_ptr = &root->fs_info->meta_alloc_cluster;
5660                 if (!btrfs_test_opt(root, SSD))
5661                         empty_cluster = 64 * 1024;
5662         }
5663
             /* data only gets a cluster when the SSD option is set */
5664         if ((data & BTRFS_BLOCK_GROUP_DATA) && use_cluster &&
5665             btrfs_test_opt(root, SSD)) {
5666                 last_ptr = &root->fs_info->data_alloc_cluster;
5667         }
5668
             /* a cluster that already has a block group overrides the hint */
5669         if (last_ptr) {
5670                 spin_lock(&last_ptr->lock);
5671                 if (last_ptr->block_group)
5672                         hint_byte = last_ptr->window_start;
5673                 spin_unlock(&last_ptr->lock);
5674         }
5675
5676         search_start = max(search_start, first_logical_byte(root, 0));
5677         search_start = max(search_start, hint_byte);
5678
             /* no cluster in use: do not ask for extra cluster space */
5679         if (!last_ptr)
5680                 empty_cluster = 0;
5681
             /*
              * The hint is not below the first logical byte, so try the block
              * group containing it before walking the lists.
              */
5682         if (search_start == hint_byte) {
5683                 block_group = btrfs_lookup_block_group(root->fs_info,
5684                                                        search_start);
5685                 used_block_group = block_group;
5686                 /*
5687                  * we don't want to use the block group if it doesn't match our
5688                  * allocation bits, or if its not cached.
5689                  *
5690                  * However if we are re-searching with an ideal block group
5691                  * picked out then we don't care that the block group is cached.
5692                  */
5693                 if (block_group && block_group_bits(block_group, data) &&
5694                     block_group->cached != BTRFS_CACHE_NO) {
5695                         down_read(&space_info->groups_sem);
5696                         if (list_empty(&block_group->list) ||
5697                             block_group->ro) {
5698                                 /*
5699                                  * someone is removing this block group,
5700                                  * we can't jump into the have_block_group
5701                                  * target because our list pointers are not
5702                                  * valid
5703                                  */
5704                                 btrfs_put_block_group(block_group);
5705                                 up_read(&space_info->groups_sem);
5706                         } else {
                                     /*
                                      * Jump into the list walk below with
                                      * groups_sem held for reading and index
                                      * switched to this block group's list.
                                      */
5707                                 index = get_block_group_index(block_group);
5708                                 goto have_block_group;
5709                         }
5710                 } else if (block_group) {
5711                         btrfs_put_block_group(block_group);
5712                 }
5713         }
     /* one pass over every block group on the list for the current index */
5714 search:
5715         have_caching_bg = false;
5716         down_read(&space_info->groups_sem);
5717         list_for_each_entry(block_group, &space_info->block_groups[index],
5718                             list) {
5719                 u64 offset;
5720                 int cached;
5721
5722                 used_block_group = block_group;
5723                 btrfs_get_block_group(block_group);
5724                 search_start = block_group->key.objectid;
5725
5726                 /*
5727                  * this can happen if we end up cycling through all the
5728                  * raid types, but we want to make sure we only allocate
5729                  * for the proper type.
5730                  */
5731                 if (!block_group_bits(block_group, data)) {
5732                     u64 extra = BTRFS_BLOCK_GROUP_DUP |
5733                                 BTRFS_BLOCK_GROUP_RAID1 |
5734                                 BTRFS_BLOCK_GROUP_RAID10;
5735
5736                         /*
5737                          * if they asked for extra copies and this block group
5738                          * doesn't provide them, bail.  This does allow us to
5739                          * fill raid0 from raid1.
5740                          */
5741                         if ((data & extra) && !(block_group->flags & extra))
5742                                 goto loop;
5743                 }
5744
     /*
      * Also entered from the hint path above, and re-entered after waiting
      * for caching progress on the same block group.
      */
5745 have_block_group:
5746                 cached = block_group_cache_done(block_group);
5747                 if (unlikely(!cached)) {
5748                         found_uncached_bg = true;
5749                         ret = cache_block_group(block_group, 0);
5750                         BUG_ON(ret < 0);
5751                         ret = 0;
5752                 }
5753
5754                 if (unlikely(block_group->ro))
5755                         goto loop;
5756
5757                 /*
5758                  * Ok we want to try and use the cluster allocator, so
5759                  * lets look there
5760                  */
5761                 if (last_ptr) {
5762                         /*
5763                          * the refill lock keeps out other
5764                          * people trying to start a new cluster
5765                          */
5766                         spin_lock(&last_ptr->refill_lock);
5767                         used_block_group = last_ptr->block_group;
                             /*
                              * The cluster belongs to another block group that
                              * is missing, read-only or of the wrong type:
                              * rebuild the cluster in the current block group.
                              */
5768                         if (used_block_group != block_group &&
5769                             (!used_block_group ||
5770                              used_block_group->ro ||
5771                              !block_group_bits(used_block_group, data))) {
5772                                 used_block_group = block_group;
5773                                 goto refill_cluster;
5774                         }
5775
                             /* allocating from a foreign block group: pin it */
5776                         if (used_block_group != block_group)
5777                                 btrfs_get_block_group(used_block_group);
5778
5779                         offset = btrfs_alloc_from_cluster(used_block_group,
5780                           last_ptr, num_bytes, used_block_group->key.objectid);
5781                         if (offset) {
5782                                 /* we have a block, we're done */
5783                                 spin_unlock(&last_ptr->refill_lock);
                                     /*
                                      * NOTE(review): the tracepoint is given
                                      * block_group and its start, although the
                                      * space may have come from
                                      * used_block_group - confirm intent.
                                      */
5784                                 trace_btrfs_reserve_extent_cluster(root,
5785                                         block_group, search_start, num_bytes);
5786                                 goto checks;
5787                         }
5788
5789                         WARN_ON(last_ptr->block_group != used_block_group);
5790                         if (used_block_group != block_group) {
5791                                 btrfs_put_block_group(used_block_group);
5792                                 used_block_group = block_group;
5793                         }
5794 refill_cluster:
5795                         BUG_ON(used_block_group != block_group);
5796                         /* If we are on LOOP_NO_EMPTY_SIZE, we can't
5797                          * set up a new clusters, so lets just skip it
5798                          * and let the allocator find whatever block
5799                          * it can find.  If we reach this point, we
5800                          * will have tried the cluster allocator
5801                          * plenty of times and not have found
5802                          * anything, so we are likely way too
5803                          * fragmented for the clustering stuff to find
5804                          * anything.
5805                          *
5806                          * However, if the cluster is taken from the
5807                          * current block group, release the cluster
5808                          * first, so that we stand a better chance of
5809                          * succeeding in the unclustered
5810                          * allocation.  */
5811                         if (loop >= LOOP_NO_EMPTY_SIZE &&
5812                             last_ptr->block_group != block_group) {
5813                                 spin_unlock(&last_ptr->refill_lock);
5814                                 goto unclustered_alloc;
5815                         }
5816
5817                         /*
5818                          * this cluster didn't work out, free it and
5819                          * start over
5820                          */
5821                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5822
5823                         if (loop >= LOOP_NO_EMPTY_SIZE) {
5824                                 spin_unlock(&last_ptr->refill_lock);
5825                                 goto unclustered_alloc;
5826                         }
5827
5828                         /* allocate a cluster in this block group */
5829                         ret = btrfs_find_space_cluster(trans, root,
5830                                                block_group, last_ptr,
5831                                                search_start, num_bytes,
5832                                                empty_cluster + empty_size);
5833                         if (ret == 0) {
5834                                 /*
5835                                  * now pull our allocation out of this
5836                                  * cluster
5837                                  */
5838                                 offset = btrfs_alloc_from_cluster(block_group,
5839                                                   last_ptr, num_bytes,
5840                                                   search_start);
5841                                 if (offset) {
5842                                         /* we found one, proceed */
5843                                         spin_unlock(&last_ptr->refill_lock);
5844                                         trace_btrfs_reserve_extent_cluster(root,
5845                                                 block_group, search_start,
5846                                                 num_bytes);
5847                                         goto checks;
5848                                 }
5849                         } else if (!cached && loop > LOOP_CACHING_NOWAIT
5850                                    && !failed_cluster_refill) {
                                     /*
                                      * Still caching and waiting is allowed:
                                      * wait once, then retry this block group.
                                      */
5851                                 spin_unlock(&last_ptr->refill_lock);
5852
5853                                 failed_cluster_refill = true;
5854                                 wait_block_group_cache_progress(block_group,
5855                                        num_bytes + empty_cluster + empty_size);
5856                                 goto have_block_group;
5857                         }
5858
5859                         /*
5860                          * at this point we either didn't find a cluster
5861                          * or we weren't able to allocate a block from our
5862                          * cluster.  Free the cluster we've been trying
5863                          * to use, and go to the next block group
5864                          */
5865                         btrfs_return_cluster_to_free_space(NULL, last_ptr);
5866                         spin_unlock(&last_ptr->refill_lock);
5867                         goto loop;
5868                 }
5869
5870 unclustered_alloc:
                     /*
                      * A fully cached block group without enough total free
                      * space cannot satisfy the request, so skip it.
                      */
5871                 spin_lock(&block_group->free_space_ctl->tree_lock);
5872                 if (cached &&
5873                     block_group->free_space_ctl->free_space <
5874                     num_bytes + empty_cluster + empty_size) {
5875                         spin_unlock(&block_group->free_space_ctl->tree_lock);
5876                         goto loop;
5877                 }
5878                 spin_unlock(&block_group->free_space_ctl->tree_lock);
5879
5880                 offset = btrfs_find_space_for_alloc(block_group, search_start,
5881                                                     num_bytes, empty_size);
5882                 /*
5883                  * If we didn't find a chunk, and we haven't failed on this
5884                  * block group before, and this block group is in the middle of
5885                  * caching and we are ok with waiting, then go ahead and wait
5886                  * for progress to be made, and set failed_alloc to true.
5887                  *
5888                  * If failed_alloc is true then we've already waited on this
5889                  * block group once and should move on to the next block group.
5890                  */
5891                 if (!offset && !failed_alloc && !cached &&
5892                     loop > LOOP_CACHING_NOWAIT) {
5893                         wait_block_group_cache_progress(block_group,
5894                                                 num_bytes + empty_size);
5895                         failed_alloc = true;
5896                         goto have_block_group;
5897                 } else if (!offset) {
                             /* remember that a rescan of this list may help */
5898                         if (!cached)
5899                                 have_caching_bg = true;
5900                         goto loop;
5901                 }
     /*
      * offset is the start of the space handed out above.  Align it, give
      * back what cannot be used, then reserve the bytes in the block group.
      */
5902 checks:
5903                 search_start = stripe_align(root, offset);
5904
5905                 /* move on to the next group */
5906                 if (search_start + num_bytes >
5907                     used_block_group->key.objectid + used_block_group->key.offset) {
5908                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5909                         goto loop;
5910                 }
5911
                     /* return the bytes skipped by the alignment */
5912                 if (offset < search_start)
5913                         btrfs_add_free_space(used_block_group, offset,
5914                                              search_start - offset);
5915                 BUG_ON(offset > search_start);
5916
5917                 ret = btrfs_update_reserved_bytes(used_block_group, num_bytes,
5918                                                   alloc_type);
                     /* reservation refused: hand the space back and move on */
5919                 if (ret == -EAGAIN) {
5920                         btrfs_add_free_space(used_block_group, offset, num_bytes);
5921                         goto loop;
5922                 }
5923
5924                 /* we are all good, lets return */
5925                 ins->objectid = search_start;
5926                 ins->offset = num_bytes;
5927
5928                 trace_btrfs_reserve_extent(orig_root, block_group,
5929                                            search_start, num_bytes);
5930                 if (used_block_group != block_group)
5931                         btrfs_put_block_group(used_block_group);
5932                 btrfs_put_block_group(block_group);
5933                 break;
     /*
      * Done with this block group: reset the per-group retry flags and drop
      * the references held on it.
      */
5934 loop:
5935                 failed_cluster_refill = false;
5936                 failed_alloc = false;
5937                 BUG_ON(index != get_block_group_index(block_group));
5938                 if (used_block_group != block_group)
5939                         btrfs_put_block_group(used_block_group);
5940                 btrfs_put_block_group(block_group);
5941         }
5942         up_read(&space_info->groups_sem);
5943
             /* a block group on this list was still caching: scan it again */
5944         if (!ins->objectid && loop >= LOOP_CACHING_WAIT && have_caching_bg)
5945                 goto search;
5946
             /* nothing found on this list, move on to the next RAID index */
5947         if (!ins->objectid && ++index < BTRFS_NR_RAID_TYPES)
5948                 goto search;
5949
5950         /*
5951          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
5952          *                      caching kthreads as we move along
5953          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
5954          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
5955          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
5956          *                      again
5957          */
5958         if (!ins->objectid && loop < LOOP_NO_EMPTY_SIZE) {
5959                 index = 0;
5960                 loop++;
5961                 if (loop == LOOP_ALLOC_CHUNK) {
5962                         ret = do_chunk_alloc(trans, root, data,
5963                                              CHUNK_ALLOC_FORCE);
5964                         /*
5965                          * Do not bail out on ENOSPC since we
5966                          * can do more things.
5967                          */
5968                         if (ret < 0 && ret != -ENOSPC) {
5969                                 btrfs_abort_transaction(trans,
5970                                                         root, ret);
5971                                 goto out;
5972                         }
5973                 }
5974
5975                 if (loop == LOOP_NO_EMPTY_SIZE) {
5976                         empty_size = 0;
5977                         empty_cluster = 0;
5978                 }
5979
5980                 goto search;
5981         } else if (!ins->objectid) {
5982                 ret = -ENOSPC;
             /* always true here, the empty-objectid cases were handled above */
5983         } else if (ins->objectid) {
5984                 ret = 0;
5985         }
5986 out:
5987
5988         return ret;
5989 }
5990
5991 static void dump_space_info(struct btrfs_space_info *info, u64 bytes,
5992                             int dump_block_groups)
5993 {
5994         struct btrfs_block_group_cache *cache;
5995         int index = 0;
5996
5997         spin_lock(&info->lock);
5998         printk(KERN_INFO "space_info %llu has %llu free, is %sfull\n",
5999                (unsigned long long)info->flags,
6000                (unsigned long long)(info->total_bytes - info->bytes_used -
6001                                     info->bytes_pinned - info->bytes_reserved -
6002                                     info->bytes_readonly),
6003                (info->full) ? "" : "not ");
6004         printk(KERN_INFO "space_info total=%llu, used=%llu, pinned=%llu, "
6005                "reserved=%llu, may_use=%llu, readonly=%llu\n",
6006                (unsigned long long)info->total_bytes,
6007                (unsigned long long)info->bytes_used,
6008                (unsigned long long)info->bytes_pinned,
6009                (unsigned long long)info->bytes_reserved,
6010                (unsigned long long)info->bytes_may_use,
6011                (unsigned long long)info->bytes_readonly);
6012         spin_unlock(&info->lock);
6013
6014         if (!dump_block_groups)
6015                 return;
6016
6017         down_read(&info->groups_sem);
6018 again:
6019         list_for_each_entry(cache, &info->block_groups[index], list) {
6020                 spin_lock(&cache->lock);
6021                 printk(KERN_INFO "block group %llu has %llu bytes, %llu used %llu pinned %llu reserved %s\n",
6022                        (unsigned long long)cache->key.objectid,
6023                        (unsigned long long)cache->key.offset,
6024                        (unsigned long long)btrfs_block_group_used(&cache->item),
6025                        (unsigned long long)cache->pinned,
6026                        (unsigned long long)cache->reserved,
6027                        cache->ro ? "[readonly]" : "");
6028                 btrfs_dump_free_space(cache, bytes);
6029                 spin_unlock(&cache->lock);
6030         }
6031         if (++index < BTRFS_NR_RAID_TYPES)
6032                 goto again;
6033         up_read(&info->groups_sem);
6034 }
6035
6036 int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
6037                          struct btrfs_root *root,
6038                          u64 num_bytes, u64 min_alloc_size,
6039                          u64 empty_size, u64 hint_byte,
6040                          struct btrfs_key *ins, u64 data)
6041 {
6042         bool final_tried = false;
6043         int ret;
6044
6045         data = btrfs_get_alloc_profile(root, data);
6046 again:
6047         WARN_ON(num_bytes < root->sectorsize);
6048         ret = find_free_extent(trans, root, num_bytes, empty_size,
6049                                hint_byte, ins, data);
6050
6051         if (ret == -ENOSPC) {
6052                 if (!final_tried) {
6053                         num_bytes = num_bytes >> 1;
6054                         num_bytes = num_bytes & ~(root->sectorsize - 1);
6055                         num_bytes = max(num_bytes, min_alloc_size);
6056                         if (num_bytes == min_alloc_size)
6057                                 final_tried = true;
6058                         goto again;
6059                 } else if (btrfs_test_opt(root, ENOSPC_DEBUG)) {
6060                         struct btrfs_space_info *sinfo;
6061
6062                         sinfo = __find_space_info(root->fs_info, data);
6063                         printk(KERN_ERR "btrfs allocation failed flags %llu, "
6064                                "wanted %llu\n", (unsigned long long)data,
6065                                (unsigned long long)num_bytes);
6066                         if (sinfo)
6067                                 dump_space_info(sinfo, num_bytes, 1);
6068                 }
6069         }
6070
6071         trace_btrfs_reserved_extent_alloc(root, ins->objectid, ins->offset);
6072
6073         return ret;
6074 }
6075
6076 static int __btrfs_free_reserved_extent(struct btrfs_root *root,
6077                                         u64 start, u64 len, int pin)
6078 {
6079         struct btrfs_block_group_cache *cache;
6080         int ret = 0;
6081
6082         cache = btrfs_lookup_block_group(root->fs_info, start);
6083         if (!cache) {
6084                 printk(KERN_ERR "Unable to find block group for %llu\n",
6085                        (unsigned long long)start);
6086                 return -ENOSPC;
6087         }
6088
6089         if (btrfs_test_opt(root, DISCARD))
6090                 ret = btrfs_discard_extent(root, start, len, NULL);
6091
6092         if (pin)
6093                 pin_down_extent(root, cache, start, len, 1);
6094         else {
6095                 btrfs_add_free_space(cache, start, len);
6096                 btrfs_update_reserved_bytes(cache, len, RESERVE_FREE);
6097         }
6098         btrfs_put_block_group(cache);
6099
6100         trace_btrfs_reserved_extent_free(root, start, len);
6101
6102         return ret;
6103 }
6104
6105 int btrfs_free_reserved_extent(struct btrfs_root *root,
6106                                         u64 start, u64 len)
6107 {
6108         return __btrfs_free_reserved_extent(root, start, len, 0);
6109 }
6110
6111 int btrfs_free_and_pin_reserved_extent(struct btrfs_root *root,
6112                                        u64 start, u64 len)
6113 {
6114         return __btrfs_free_reserved_extent(root, start, len, 1);
6115 }
6116
6117 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6118                                       struct btrfs_root *root,
6119                                       u64 parent, u64 root_objectid,
6120                                       u64 flags, u64 owner, u64 offset,
6121                                       struct btrfs_key *ins, int ref_mod)
6122 {
6123         int ret;
6124         struct btrfs_fs_info *fs_info = root->fs_info;
6125         struct btrfs_extent_item *extent_item;
6126         struct btrfs_extent_inline_ref *iref;
6127         struct btrfs_path *path;
6128         struct extent_buffer *leaf;
6129         int type;
6130         u32 size;
6131
6132         if (parent > 0)
6133                 type = BTRFS_SHARED_DATA_REF_KEY;
6134         else
6135                 type = BTRFS_EXTENT_DATA_REF_KEY;
6136
6137         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
6138
6139         path = btrfs_alloc_path();
6140         if (!path)
6141                 return -ENOMEM;
6142
6143         path->leave_spinning = 1;
6144         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6145                                       ins, size);
6146         if (ret) {
6147                 btrfs_free_path(path);
6148                 return ret;
6149         }
6150
6151         leaf = path->nodes[0];
6152         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6153                                      struct btrfs_extent_item);
6154         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
6155         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6156         btrfs_set_extent_flags(leaf, extent_item,
6157                                flags | BTRFS_EXTENT_FLAG_DATA);
6158
6159         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
6160         btrfs_set_extent_inline_ref_type(leaf, iref, type);
6161         if (parent > 0) {
6162                 struct btrfs_shared_data_ref *ref;
6163                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
6164                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6165                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
6166         } else {
6167                 struct btrfs_extent_data_ref *ref;
6168                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
6169                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
6170                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
6171                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
6172                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
6173         }
6174
6175         btrfs_mark_buffer_dirty(path->nodes[0]);
6176         btrfs_free_path(path);
6177
6178         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6179         if (ret) { /* -ENOENT, logic error */
6180                 printk(KERN_ERR "btrfs update block group failed for %llu "
6181                        "%llu\n", (unsigned long long)ins->objectid,
6182                        (unsigned long long)ins->offset);
6183                 BUG();
6184         }
6185         return ret;
6186 }
6187
6188 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
6189                                      struct btrfs_root *root,
6190                                      u64 parent, u64 root_objectid,
6191                                      u64 flags, struct btrfs_disk_key *key,
6192                                      int level, struct btrfs_key *ins)
6193 {
6194         int ret;
6195         struct btrfs_fs_info *fs_info = root->fs_info;
6196         struct btrfs_extent_item *extent_item;
6197         struct btrfs_tree_block_info *block_info;
6198         struct btrfs_extent_inline_ref *iref;
6199         struct btrfs_path *path;
6200         struct extent_buffer *leaf;
6201         u32 size = sizeof(*extent_item) + sizeof(*block_info) + sizeof(*iref);
6202
6203         path = btrfs_alloc_path();
6204         if (!path)
6205                 return -ENOMEM;
6206
6207         path->leave_spinning = 1;
6208         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
6209                                       ins, size);
6210         if (ret) {
6211                 btrfs_free_path(path);
6212                 return ret;
6213         }
6214
6215         leaf = path->nodes[0];
6216         extent_item = btrfs_item_ptr(leaf, path->slots[0],
6217                                      struct btrfs_extent_item);
6218         btrfs_set_extent_refs(leaf, extent_item, 1);
6219         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
6220         btrfs_set_extent_flags(leaf, extent_item,
6221                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
6222         block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
6223
6224         btrfs_set_tree_block_key(leaf, block_info, key);
6225         btrfs_set_tree_block_level(leaf, block_info, level);
6226
6227         iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
6228         if (parent > 0) {
6229                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
6230                 btrfs_set_extent_inline_ref_type(leaf, iref,
6231                                                  BTRFS_SHARED_BLOCK_REF_KEY);
6232                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
6233         } else {
6234                 btrfs_set_extent_inline_ref_type(leaf, iref,
6235                                                  BTRFS_TREE_BLOCK_REF_KEY);
6236                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
6237         }
6238
6239         btrfs_mark_buffer_dirty(leaf);
6240         btrfs_free_path(path);
6241
6242         ret = update_block_group(root, ins->objectid, ins->offset, 1);
6243         if (ret) { /* -ENOENT, logic error */
6244                 printk(KERN_ERR "btrfs update block group failed for %llu "
6245                        "%llu\n", (unsigned long long)ins->objectid,
6246                        (unsigned long long)ins->offset);
6247                 BUG();
6248         }
6249         return ret;
6250 }
6251
6252 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
6253                                      struct btrfs_root *root,
6254                                      u64 root_objectid, u64 owner,
6255                                      u64 offset, struct btrfs_key *ins)
6256 {
6257         int ret;
6258
6259         BUG_ON(root_objectid == BTRFS_TREE_LOG_OBJECTID);
6260
6261         ret = btrfs_add_delayed_data_ref(root->fs_info, trans, ins->objectid,
6262                                          ins->offset, 0,
6263                                          root_objectid, owner, offset,
6264                                          BTRFS_ADD_DELAYED_EXTENT, NULL, 0);
6265         return ret;
6266 }
6267
6268 /*
6269  * this is used by the tree logging recovery code.  It records that
6270  * an extent has been allocated and makes sure to clear the free
6271  * space cache bits as well
6272  */
6273 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
6274                                    struct btrfs_root *root,
6275                                    u64 root_objectid, u64 owner, u64 offset,
6276                                    struct btrfs_key *ins)
6277 {
6278         int ret;
6279         struct btrfs_block_group_cache *block_group;
6280         struct btrfs_caching_control *caching_ctl;
6281         u64 start = ins->objectid;
6282         u64 num_bytes = ins->offset;
6283
6284         block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid);
6285         cache_block_group(block_group, 0);
6286         caching_ctl = get_caching_control(block_group);
6287
6288         if (!caching_ctl) {
6289                 BUG_ON(!block_group_cache_done(block_group));
6290                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
6291                 BUG_ON(ret); /* -ENOMEM */
6292         } else {
6293                 mutex_lock(&caching_ctl->mutex);
6294
6295                 if (start >= caching_ctl->progress) {
6296                         ret = add_excluded_extent(root, start, num_bytes);
6297                         BUG_ON(ret); /* -ENOMEM */
6298                 } else if (start + num_bytes <= caching_ctl->progress) {
6299                         ret = btrfs_remove_free_space(block_group,
6300                                                       start, num_bytes);
6301                         BUG_ON(ret); /* -ENOMEM */
6302                 } else {
6303                         num_bytes = caching_ctl->progress - start;
6304                         ret = btrfs_remove_free_space(block_group,
6305                                                       start, num_bytes);
6306                         BUG_ON(ret); /* -ENOMEM */
6307
6308                         start = caching_ctl->progress;
6309                         num_bytes = ins->objectid + ins->offset -
6310                                     caching_ctl->progress;
6311                         ret = add_excluded_extent(root, start, num_bytes);
6312                         BUG_ON(ret); /* -ENOMEM */
6313                 }
6314
6315                 mutex_unlock(&caching_ctl->mutex);
6316                 put_caching_control(caching_ctl);
6317         }
6318
6319         ret = btrfs_update_reserved_bytes(block_group, ins->offset,
6320                                           RESERVE_ALLOC_NO_ACCOUNT);
6321         BUG_ON(ret); /* logic error */
6322         btrfs_put_block_group(block_group);
6323         ret = alloc_reserved_file_extent(trans, root, 0, root_objectid,
6324                                          0, owner, offset, ins, 1);
6325         return ret;
6326 }
6327
6328 struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans,
6329                                             struct btrfs_root *root,
6330                                             u64 bytenr, u32 blocksize,
6331                                             int level)
6332 {
6333         struct extent_buffer *buf;
6334
6335         buf = btrfs_find_create_tree_block(root, bytenr, blocksize);
6336         if (!buf)
6337                 return ERR_PTR(-ENOMEM);
6338         btrfs_set_header_generation(buf, trans->transid);
6339         btrfs_set_buffer_lockdep_class(root->root_key.objectid, buf, level);
6340         btrfs_tree_lock(buf);
6341         clean_tree_block(trans, root, buf);
6342         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
6343
6344         btrfs_set_lock_blocking(buf);
6345         btrfs_set_buffer_uptodate(buf);
6346
6347         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
6348                 /*
6349                  * we allow two log transactions at a time, use different
6350                  * EXENT bit to differentiate dirty pages.
6351                  */
6352                 if (root->log_transid % 2 == 0)
6353                         set_extent_dirty(&root->dirty_log_pages, buf->start,
6354                                         buf->start + buf->len - 1, GFP_NOFS);
6355                 else
6356                         set_extent_new(&root->dirty_log_pages, buf->start,
6357                                         buf->start + buf->len - 1, GFP_NOFS);
6358         } else {
6359                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
6360                          buf->start + buf->len - 1, GFP_NOFS);
6361         }
6362         trans->blocks_used++;
6363         /* this returns a buffer locked for blocking */
6364         return buf;
6365 }
6366
6367 static struct btrfs_block_rsv *
6368 use_block_rsv(struct btrfs_trans_handle *trans,
6369               struct btrfs_root *root, u32 blocksize)
6370 {
6371         struct btrfs_block_rsv *block_rsv;
6372         struct btrfs_block_rsv *global_rsv = &root->fs_info->global_block_rsv;
6373         int ret;
6374
6375         block_rsv = get_block_rsv(trans, root);
6376
6377         if (block_rsv->size == 0) {
6378                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6379                                              BTRFS_RESERVE_NO_FLUSH);
6380                 /*
6381                  * If we couldn't reserve metadata bytes try and use some from
6382                  * the global reserve.
6383                  */
6384                 if (ret && block_rsv != global_rsv) {
6385                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6386                         if (!ret)
6387                                 return global_rsv;
6388                         return ERR_PTR(ret);
6389                 } else if (ret) {
6390                         return ERR_PTR(ret);
6391                 }
6392                 return block_rsv;
6393         }
6394
6395         ret = block_rsv_use_bytes(block_rsv, blocksize);
6396         if (!ret)
6397                 return block_rsv;
6398         if (ret && !block_rsv->failfast) {
6399                 static DEFINE_RATELIMIT_STATE(_rs,
6400                                 DEFAULT_RATELIMIT_INTERVAL,
6401                                 /*DEFAULT_RATELIMIT_BURST*/ 2);
6402                 if (__ratelimit(&_rs))
6403                         WARN(1, KERN_DEBUG "btrfs: block rsv returned %d\n",
6404                              ret);
6405                 ret = reserve_metadata_bytes(root, block_rsv, blocksize,
6406                                              BTRFS_RESERVE_NO_FLUSH);
6407                 if (!ret) {
6408                         return block_rsv;
6409                 } else if (ret && block_rsv != global_rsv) {
6410                         ret = block_rsv_use_bytes(global_rsv, blocksize);
6411                         if (!ret)
6412                                 return global_rsv;
6413                 }
6414         }
6415
6416         return ERR_PTR(-ENOSPC);
6417 }
6418
6419 static void unuse_block_rsv(struct btrfs_fs_info *fs_info,
6420                             struct btrfs_block_rsv *block_rsv, u32 blocksize)
6421 {
6422         block_rsv_add_bytes(block_rsv, blocksize, 0);
6423         block_rsv_release_bytes(fs_info, block_rsv, NULL, 0);
6424 }
6425
6426 /*
6427  * finds a free extent and does all the dirty work required for allocation
6428  * returns the key for the extent through ins, and a tree buffer for
6429  * the first block of the extent through buf.
6430  *
6431  * returns the tree buffer or NULL.
6432  */
6433 struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans,
6434                                         struct btrfs_root *root, u32 blocksize,
6435                                         u64 parent, u64 root_objectid,
6436                                         struct btrfs_disk_key *key, int level,
6437                                         u64 hint, u64 empty_size)
6438 {
6439         struct btrfs_key ins;
6440         struct btrfs_block_rsv *block_rsv;
6441         struct extent_buffer *buf;
6442         u64 flags = 0;
6443         int ret;
6444
6445
6446         block_rsv = use_block_rsv(trans, root, blocksize);
6447         if (IS_ERR(block_rsv))
6448                 return ERR_CAST(block_rsv);
6449
6450         ret = btrfs_reserve_extent(trans, root, blocksize, blocksize,
6451                                    empty_size, hint, &ins, 0);
6452         if (ret) {
6453                 unuse_block_rsv(root->fs_info, block_rsv, blocksize);
6454                 return ERR_PTR(ret);
6455         }
6456
6457         buf = btrfs_init_new_buffer(trans, root, ins.objectid,
6458                                     blocksize, level);
6459         BUG_ON(IS_ERR(buf)); /* -ENOMEM */
6460
6461         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
6462                 if (parent == 0)
6463                         parent = ins.objectid;
6464                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
6465         } else
6466                 BUG_ON(parent > 0);
6467
6468         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
6469                 struct btrfs_delayed_extent_op *extent_op;
6470                 extent_op = btrfs_alloc_delayed_extent_op();
6471                 BUG_ON(!extent_op); /* -ENOMEM */
6472                 if (key)
6473                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
6474                 else
6475                         memset(&extent_op->key, 0, sizeof(extent_op->key));
6476                 extent_op->flags_to_set = flags;
6477                 extent_op->update_key = 1;
6478                 extent_op->update_flags = 1;
6479                 extent_op->is_data = 0;
6480
6481                 ret = btrfs_add_delayed_tree_ref(root->fs_info, trans,
6482                                         ins.objectid,
6483                                         ins.offset, parent, root_objectid,
6484                                         level, BTRFS_ADD_DELAYED_EXTENT,
6485                                         extent_op, 0);
6486                 BUG_ON(ret); /* -ENOMEM */
6487         }
6488         return buf;
6489 }
6490
6491 struct walk_control {
6492         u64 refs[BTRFS_MAX_LEVEL];
6493         u64 flags[BTRFS_MAX_LEVEL];
6494         struct btrfs_key update_progress;
6495         int stage;
6496         int level;
6497         int shared_level;
6498         int update_ref;
6499         int keep_locks;
6500         int reada_slot;
6501         int reada_count;
6502         int for_reloc;
6503 };
6504
6505 #define DROP_REFERENCE  1
6506 #define UPDATE_BACKREF  2
6507
6508 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
6509                                      struct btrfs_root *root,
6510                                      struct walk_control *wc,
6511                                      struct btrfs_path *path)
6512 {
6513         u64 bytenr;
6514         u64 generation;
6515         u64 refs;
6516         u64 flags;
6517         u32 nritems;
6518         u32 blocksize;
6519         struct btrfs_key key;
6520         struct extent_buffer *eb;
6521         int ret;
6522         int slot;
6523         int nread = 0;
6524
6525         if (path->slots[wc->level] < wc->reada_slot) {
6526                 wc->reada_count = wc->reada_count * 2 / 3;
6527                 wc->reada_count = max(wc->reada_count, 2);
6528         } else {
6529                 wc->reada_count = wc->reada_count * 3 / 2;
6530                 wc->reada_count = min_t(int, wc->reada_count,
6531                                         BTRFS_NODEPTRS_PER_BLOCK(root));
6532         }
6533
6534         eb = path->nodes[wc->level];
6535         nritems = btrfs_header_nritems(eb);
6536         blocksize = btrfs_level_size(root, wc->level - 1);
6537
6538         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
6539                 if (nread >= wc->reada_count)
6540                         break;
6541
6542                 cond_resched();
6543                 bytenr = btrfs_node_blockptr(eb, slot);
6544                 generation = btrfs_node_ptr_generation(eb, slot);
6545
6546                 if (slot == path->slots[wc->level])
6547                         goto reada;
6548
6549                 if (wc->stage == UPDATE_BACKREF &&
6550                     generation <= root->root_key.offset)
6551                         continue;
6552
6553                 /* We don't lock the tree block, it's OK to be racy here */
6554                 ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6555                                                &refs, &flags);
6556                 /* We don't care about errors in readahead. */
6557                 if (ret < 0)
6558                         continue;
6559                 BUG_ON(refs == 0);
6560
6561                 if (wc->stage == DROP_REFERENCE) {
6562                         if (refs == 1)
6563                                 goto reada;
6564
6565                         if (wc->level == 1 &&
6566                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6567                                 continue;
6568                         if (!wc->update_ref ||
6569                             generation <= root->root_key.offset)
6570                                 continue;
6571                         btrfs_node_key_to_cpu(eb, &key, slot);
6572                         ret = btrfs_comp_cpu_keys(&key,
6573                                                   &wc->update_progress);
6574                         if (ret < 0)
6575                                 continue;
6576                 } else {
6577                         if (wc->level == 1 &&
6578                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6579                                 continue;
6580                 }
6581 reada:
6582                 ret = readahead_tree_block(root, bytenr, blocksize,
6583                                            generation);
6584                 if (ret)
6585                         break;
6586                 nread++;
6587         }
6588         wc->reada_slot = slot;
6589 }
6590
6591 /*
6592  * hepler to process tree block while walking down the tree.
6593  *
6594  * when wc->stage == UPDATE_BACKREF, this function updates
6595  * back refs for pointers in the block.
6596  *
6597  * NOTE: return value 1 means we should stop walking down.
6598  */
6599 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
6600                                    struct btrfs_root *root,
6601                                    struct btrfs_path *path,
6602                                    struct walk_control *wc, int lookup_info)
6603 {
6604         int level = wc->level;
6605         struct extent_buffer *eb = path->nodes[level];
6606         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
6607         int ret;
6608
6609         if (wc->stage == UPDATE_BACKREF &&
6610             btrfs_header_owner(eb) != root->root_key.objectid)
6611                 return 1;
6612
6613         /*
6614          * when reference count of tree block is 1, it won't increase
6615          * again. once full backref flag is set, we never clear it.
6616          */
6617         if (lookup_info &&
6618             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
6619              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
6620                 BUG_ON(!path->locks[level]);
6621                 ret = btrfs_lookup_extent_info(trans, root,
6622                                                eb->start, eb->len,
6623                                                &wc->refs[level],
6624                                                &wc->flags[level]);
6625                 BUG_ON(ret == -ENOMEM);
6626                 if (ret)
6627                         return ret;
6628                 BUG_ON(wc->refs[level] == 0);
6629         }
6630
6631         if (wc->stage == DROP_REFERENCE) {
6632                 if (wc->refs[level] > 1)
6633                         return 1;
6634
6635                 if (path->locks[level] && !wc->keep_locks) {
6636                         btrfs_tree_unlock_rw(eb, path->locks[level]);
6637                         path->locks[level] = 0;
6638                 }
6639                 return 0;
6640         }
6641
6642         /* wc->stage == UPDATE_BACKREF */
6643         if (!(wc->flags[level] & flag)) {
6644                 BUG_ON(!path->locks[level]);
6645                 ret = btrfs_inc_ref(trans, root, eb, 1, wc->for_reloc);
6646                 BUG_ON(ret); /* -ENOMEM */
6647                 ret = btrfs_dec_ref(trans, root, eb, 0, wc->for_reloc);
6648                 BUG_ON(ret); /* -ENOMEM */
6649                 ret = btrfs_set_disk_extent_flags(trans, root, eb->start,
6650                                                   eb->len, flag, 0);
6651                 BUG_ON(ret); /* -ENOMEM */
6652                 wc->flags[level] |= flag;
6653         }
6654
6655         /*
6656          * the block is shared by multiple trees, so it's not good to
6657          * keep the tree lock
6658          */
6659         if (path->locks[level] && level > 0) {
6660                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6661                 path->locks[level] = 0;
6662         }
6663         return 0;
6664 }
6665
6666 /*
6667  * helper to process tree block pointer.
6668  *
6669  * when wc->stage == DROP_REFERENCE, this function checks
6670  * reference count of the block pointed to. if the block
6671  * is shared and we need update back refs for the subtree
6672  * rooted at the block, this function changes wc->stage to
6673  * UPDATE_BACKREF. if the block is shared and there is no
6674  * need to update back, this function drops the reference
6675  * to the block.
6676  *
6677  * NOTE: return value 1 means we should stop walking down.
6678  */
6679 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
6680                                  struct btrfs_root *root,
6681                                  struct btrfs_path *path,
6682                                  struct walk_control *wc, int *lookup_info)
6683 {
6684         u64 bytenr;
6685         u64 generation;
6686         u64 parent;
6687         u32 blocksize;
6688         struct btrfs_key key;
6689         struct extent_buffer *next;
6690         int level = wc->level;
6691         int reada = 0;
6692         int ret = 0;
6693
6694         generation = btrfs_node_ptr_generation(path->nodes[level],
6695                                                path->slots[level]);
6696         /*
6697          * if the lower level block was created before the snapshot
6698          * was created, we know there is no need to update back refs
6699          * for the subtree
6700          */
6701         if (wc->stage == UPDATE_BACKREF &&
6702             generation <= root->root_key.offset) {
6703                 *lookup_info = 1;
6704                 return 1;
6705         }
6706
6707         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
6708         blocksize = btrfs_level_size(root, level - 1);
6709
6710         next = btrfs_find_tree_block(root, bytenr, blocksize);
6711         if (!next) {
6712                 next = btrfs_find_create_tree_block(root, bytenr, blocksize);
6713                 if (!next)
6714                         return -ENOMEM;
6715                 reada = 1;
6716         }
6717         btrfs_tree_lock(next);
6718         btrfs_set_lock_blocking(next);
6719
6720         ret = btrfs_lookup_extent_info(trans, root, bytenr, blocksize,
6721                                        &wc->refs[level - 1],
6722                                        &wc->flags[level - 1]);
6723         if (ret < 0) {
6724                 btrfs_tree_unlock(next);
6725                 return ret;
6726         }
6727
6728         BUG_ON(wc->refs[level - 1] == 0);
6729         *lookup_info = 0;
6730
6731         if (wc->stage == DROP_REFERENCE) {
6732                 if (wc->refs[level - 1] > 1) {
6733                         if (level == 1 &&
6734                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6735                                 goto skip;
6736
6737                         if (!wc->update_ref ||
6738                             generation <= root->root_key.offset)
6739                                 goto skip;
6740
6741                         btrfs_node_key_to_cpu(path->nodes[level], &key,
6742                                               path->slots[level]);
6743                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
6744                         if (ret < 0)
6745                                 goto skip;
6746
6747                         wc->stage = UPDATE_BACKREF;
6748                         wc->shared_level = level - 1;
6749                 }
6750         } else {
6751                 if (level == 1 &&
6752                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
6753                         goto skip;
6754         }
6755
6756         if (!btrfs_buffer_uptodate(next, generation, 0)) {
6757                 btrfs_tree_unlock(next);
6758                 free_extent_buffer(next);
6759                 next = NULL;
6760                 *lookup_info = 1;
6761         }
6762
6763         if (!next) {
6764                 if (reada && level == 1)
6765                         reada_walk_down(trans, root, wc, path);
6766                 next = read_tree_block(root, bytenr, blocksize, generation);
6767                 if (!next)
6768                         return -EIO;
6769                 btrfs_tree_lock(next);
6770                 btrfs_set_lock_blocking(next);
6771         }
6772
6773         level--;
6774         BUG_ON(level != btrfs_header_level(next));
6775         path->nodes[level] = next;
6776         path->slots[level] = 0;
6777         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6778         wc->level = level;
6779         if (wc->level == 1)
6780                 wc->reada_slot = 0;
6781         return 0;
6782 skip:
6783         wc->refs[level - 1] = 0;
6784         wc->flags[level - 1] = 0;
6785         if (wc->stage == DROP_REFERENCE) {
6786                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
6787                         parent = path->nodes[level]->start;
6788                 } else {
6789                         BUG_ON(root->root_key.objectid !=
6790                                btrfs_header_owner(path->nodes[level]));
6791                         parent = 0;
6792                 }
6793
6794                 ret = btrfs_free_extent(trans, root, bytenr, blocksize, parent,
6795                                 root->root_key.objectid, level - 1, 0, 0);
6796                 BUG_ON(ret); /* -ENOMEM */
6797         }
6798         btrfs_tree_unlock(next);
6799         free_extent_buffer(next);
6800         *lookup_info = 1;
6801         return 1;
6802 }
6803
6804 /*
6805  * hepler to process tree block while walking up the tree.
6806  *
6807  * when wc->stage == DROP_REFERENCE, this function drops
6808  * reference count on the block.
6809  *
6810  * when wc->stage == UPDATE_BACKREF, this function changes
6811  * wc->stage back to DROP_REFERENCE if we changed wc->stage
6812  * to UPDATE_BACKREF previously while processing the block.
6813  *
6814  * NOTE: return value 1 means we should stop walking up.
6815  */
6816 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
6817                                  struct btrfs_root *root,
6818                                  struct btrfs_path *path,
6819                                  struct walk_control *wc)
6820 {
6821         int ret;
6822         int level = wc->level;
6823         struct extent_buffer *eb = path->nodes[level];
6824         u64 parent = 0;
6825
6826         if (wc->stage == UPDATE_BACKREF) {
6827                 BUG_ON(wc->shared_level < level);
6828                 if (level < wc->shared_level)
6829                         goto out;
6830
6831                 ret = find_next_key(path, level + 1, &wc->update_progress);
6832                 if (ret > 0)
6833                         wc->update_ref = 0;
6834
6835                 wc->stage = DROP_REFERENCE;
6836                 wc->shared_level = -1;
6837                 path->slots[level] = 0;
6838
6839                 /*
6840                  * check reference count again if the block isn't locked.
6841                  * we should start walking down the tree again if reference
6842                  * count is one.
6843                  */
6844                 if (!path->locks[level]) {
6845                         BUG_ON(level == 0);
6846                         btrfs_tree_lock(eb);
6847                         btrfs_set_lock_blocking(eb);
6848                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6849
6850                         ret = btrfs_lookup_extent_info(trans, root,
6851                                                        eb->start, eb->len,
6852                                                        &wc->refs[level],
6853                                                        &wc->flags[level]);
6854                         if (ret < 0) {
6855                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6856                                 path->locks[level] = 0;
6857                                 return ret;
6858                         }
6859                         BUG_ON(wc->refs[level] == 0);
6860                         if (wc->refs[level] == 1) {
6861                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
6862                                 path->locks[level] = 0;
6863                                 return 1;
6864                         }
6865                 }
6866         }
6867
6868         /* wc->stage == DROP_REFERENCE */
6869         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
6870
6871         if (wc->refs[level] == 1) {
6872                 if (level == 0) {
6873                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6874                                 ret = btrfs_dec_ref(trans, root, eb, 1,
6875                                                     wc->for_reloc);
6876                         else
6877                                 ret = btrfs_dec_ref(trans, root, eb, 0,
6878                                                     wc->for_reloc);
6879                         BUG_ON(ret); /* -ENOMEM */
6880                 }
6881                 /* make block locked assertion in clean_tree_block happy */
6882                 if (!path->locks[level] &&
6883                     btrfs_header_generation(eb) == trans->transid) {
6884                         btrfs_tree_lock(eb);
6885                         btrfs_set_lock_blocking(eb);
6886                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
6887                 }
6888                 clean_tree_block(trans, root, eb);
6889         }
6890
6891         if (eb == root->node) {
6892                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6893                         parent = eb->start;
6894                 else
6895                         BUG_ON(root->root_key.objectid !=
6896                                btrfs_header_owner(eb));
6897         } else {
6898                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
6899                         parent = path->nodes[level + 1]->start;
6900                 else
6901                         BUG_ON(root->root_key.objectid !=
6902                                btrfs_header_owner(path->nodes[level + 1]));
6903         }
6904
6905         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
6906 out:
6907         wc->refs[level] = 0;
6908         wc->flags[level] = 0;
6909         return 0;
6910 }
6911
6912 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
6913                                    struct btrfs_root *root,
6914                                    struct btrfs_path *path,
6915                                    struct walk_control *wc)
6916 {
6917         int level = wc->level;
6918         int lookup_info = 1;
6919         int ret;
6920
6921         while (level >= 0) {
6922                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
6923                 if (ret > 0)
6924                         break;
6925
6926                 if (level == 0)
6927                         break;
6928
6929                 if (path->slots[level] >=
6930                     btrfs_header_nritems(path->nodes[level]))
6931                         break;
6932
6933                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
6934                 if (ret > 0) {
6935                         path->slots[level]++;
6936                         continue;
6937                 } else if (ret < 0)
6938                         return ret;
6939                 level = wc->level;
6940         }
6941         return 0;
6942 }
6943
6944 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
6945                                  struct btrfs_root *root,
6946                                  struct btrfs_path *path,
6947                                  struct walk_control *wc, int max_level)
6948 {
6949         int level = wc->level;
6950         int ret;
6951
6952         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
6953         while (level < max_level && path->nodes[level]) {
6954                 wc->level = level;
6955                 if (path->slots[level] + 1 <
6956                     btrfs_header_nritems(path->nodes[level])) {
6957                         path->slots[level]++;
6958                         return 0;
6959                 } else {
6960                         ret = walk_up_proc(trans, root, path, wc);
6961                         if (ret > 0)
6962                                 return 0;
6963
6964                         if (path->locks[level]) {
6965                                 btrfs_tree_unlock_rw(path->nodes[level],
6966                                                      path->locks[level]);
6967                                 path->locks[level] = 0;
6968                         }
6969                         free_extent_buffer(path->nodes[level]);
6970                         path->nodes[level] = NULL;
6971                         level++;
6972                 }
6973         }
6974         return 1;
6975 }
6976
6977 /*
6978  * drop a subvolume tree.
6979  *
6980  * this function traverses the tree freeing any blocks that only
6981  * referenced by the tree.
6982  *
6983  * when a shared tree block is found. this function decreases its
6984  * reference count by one. if update_ref is true, this function
6985  * also make sure backrefs for the shared block and all lower level
6986  * blocks are properly updated.
6987  */
6988 int btrfs_drop_snapshot(struct btrfs_root *root,
6989                          struct btrfs_block_rsv *block_rsv, int update_ref,
6990                          int for_reloc)
6991 {
6992         struct btrfs_path *path;
6993         struct btrfs_trans_handle *trans;
6994         struct btrfs_root *tree_root = root->fs_info->tree_root;
6995         struct btrfs_root_item *root_item = &root->root_item;
6996         struct walk_control *wc;
6997         struct btrfs_key key;
6998         int err = 0;
6999         int ret;
7000         int level;
7001
7002         path = btrfs_alloc_path();
7003         if (!path) {
7004                 err = -ENOMEM;
7005                 goto out;
7006         }
7007
7008         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7009         if (!wc) {
7010                 btrfs_free_path(path);
7011                 err = -ENOMEM;
7012                 goto out;
7013         }
7014
7015         trans = btrfs_start_transaction(tree_root, 0);
7016         if (IS_ERR(trans)) {
7017                 err = PTR_ERR(trans);
7018                 goto out_free;
7019         }
7020
7021         if (block_rsv)
7022                 trans->block_rsv = block_rsv;
7023
7024         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
7025                 level = btrfs_header_level(root->node);
7026                 path->nodes[level] = btrfs_lock_root_node(root);
7027                 btrfs_set_lock_blocking(path->nodes[level]);
7028                 path->slots[level] = 0;
7029                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7030                 memset(&wc->update_progress, 0,
7031                        sizeof(wc->update_progress));
7032         } else {
7033                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
7034                 memcpy(&wc->update_progress, &key,
7035                        sizeof(wc->update_progress));
7036
7037                 level = root_item->drop_level;
7038                 BUG_ON(level == 0);
7039                 path->lowest_level = level;
7040                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7041                 path->lowest_level = 0;
7042                 if (ret < 0) {
7043                         err = ret;
7044                         goto out_end_trans;
7045                 }
7046                 WARN_ON(ret > 0);
7047
7048                 /*
7049                  * unlock our path, this is safe because only this
7050                  * function is allowed to delete this snapshot
7051                  */
7052                 btrfs_unlock_up_safe(path, 0);
7053
7054                 level = btrfs_header_level(root->node);
7055                 while (1) {
7056                         btrfs_tree_lock(path->nodes[level]);
7057                         btrfs_set_lock_blocking(path->nodes[level]);
7058
7059                         ret = btrfs_lookup_extent_info(trans, root,
7060                                                 path->nodes[level]->start,
7061                                                 path->nodes[level]->len,
7062                                                 &wc->refs[level],
7063                                                 &wc->flags[level]);
7064                         if (ret < 0) {
7065                                 err = ret;
7066                                 goto out_end_trans;
7067                         }
7068                         BUG_ON(wc->refs[level] == 0);
7069
7070                         if (level == root_item->drop_level)
7071                                 break;
7072
7073                         btrfs_tree_unlock(path->nodes[level]);
7074                         WARN_ON(wc->refs[level] != 1);
7075                         level--;
7076                 }
7077         }
7078
7079         wc->level = level;
7080         wc->shared_level = -1;
7081         wc->stage = DROP_REFERENCE;
7082         wc->update_ref = update_ref;
7083         wc->keep_locks = 0;
7084         wc->for_reloc = for_reloc;
7085         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7086
7087         while (1) {
7088                 ret = walk_down_tree(trans, root, path, wc);
7089                 if (ret < 0) {
7090                         err = ret;
7091                         break;
7092                 }
7093
7094                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
7095                 if (ret < 0) {
7096                         err = ret;
7097                         break;
7098                 }
7099
7100                 if (ret > 0) {
7101                         BUG_ON(wc->stage != DROP_REFERENCE);
7102                         break;
7103                 }
7104
7105                 if (wc->stage == DROP_REFERENCE) {
7106                         level = wc->level;
7107                         btrfs_node_key(path->nodes[level],
7108                                        &root_item->drop_progress,
7109                                        path->slots[level]);
7110                         root_item->drop_level = level;
7111                 }
7112
7113                 BUG_ON(wc->level == 0);
7114                 if (btrfs_should_end_transaction(trans, tree_root)) {
7115                         ret = btrfs_update_root(trans, tree_root,
7116                                                 &root->root_key,
7117                                                 root_item);
7118                         if (ret) {
7119                                 btrfs_abort_transaction(trans, tree_root, ret);
7120                                 err = ret;
7121                                 goto out_end_trans;
7122                         }
7123
7124                         btrfs_end_transaction_throttle(trans, tree_root);
7125                         trans = btrfs_start_transaction(tree_root, 0);
7126                         if (IS_ERR(trans)) {
7127                                 err = PTR_ERR(trans);
7128                                 goto out_free;
7129                         }
7130                         if (block_rsv)
7131                                 trans->block_rsv = block_rsv;
7132                 }
7133         }
7134         btrfs_release_path(path);
7135         if (err)
7136                 goto out_end_trans;
7137
7138         ret = btrfs_del_root(trans, tree_root, &root->root_key);
7139         if (ret) {
7140                 btrfs_abort_transaction(trans, tree_root, ret);
7141                 goto out_end_trans;
7142         }
7143
7144         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
7145                 ret = btrfs_find_last_root(tree_root, root->root_key.objectid,
7146                                            NULL, NULL);
7147                 if (ret < 0) {
7148                         btrfs_abort_transaction(trans, tree_root, ret);
7149                         err = ret;
7150                         goto out_end_trans;
7151                 } else if (ret > 0) {
7152                         /* if we fail to delete the orphan item this time
7153                          * around, it'll get picked up the next time.
7154                          *
7155                          * The most common failure here is just -ENOENT.
7156                          */
7157                         btrfs_del_orphan_item(trans, tree_root,
7158                                               root->root_key.objectid);
7159                 }
7160         }
7161
7162         if (root->in_radix) {
7163                 btrfs_free_fs_root(tree_root->fs_info, root);
7164         } else {
7165                 free_extent_buffer(root->node);
7166                 free_extent_buffer(root->commit_root);
7167                 kfree(root);
7168         }
7169 out_end_trans:
7170         btrfs_end_transaction_throttle(trans, tree_root);
7171 out_free:
7172         kfree(wc);
7173         btrfs_free_path(path);
7174 out:
7175         if (err)
7176                 btrfs_std_error(root->fs_info, err);
7177         return err;
7178 }
7179
7180 /*
7181  * drop subtree rooted at tree block 'node'.
7182  *
7183  * NOTE: this function will unlock and release tree block 'node'
7184  * only used by relocation code
7185  */
7186 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
7187                         struct btrfs_root *root,
7188                         struct extent_buffer *node,
7189                         struct extent_buffer *parent)
7190 {
7191         struct btrfs_path *path;
7192         struct walk_control *wc;
7193         int level;
7194         int parent_level;
7195         int ret = 0;
7196         int wret;
7197
7198         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
7199
7200         path = btrfs_alloc_path();
7201         if (!path)
7202                 return -ENOMEM;
7203
7204         wc = kzalloc(sizeof(*wc), GFP_NOFS);
7205         if (!wc) {
7206                 btrfs_free_path(path);
7207                 return -ENOMEM;
7208         }
7209
7210         btrfs_assert_tree_locked(parent);
7211         parent_level = btrfs_header_level(parent);
7212         extent_buffer_get(parent);
7213         path->nodes[parent_level] = parent;
7214         path->slots[parent_level] = btrfs_header_nritems(parent);
7215
7216         btrfs_assert_tree_locked(node);
7217         level = btrfs_header_level(node);
7218         path->nodes[level] = node;
7219         path->slots[level] = 0;
7220         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
7221
7222         wc->refs[parent_level] = 1;
7223         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
7224         wc->level = level;
7225         wc->shared_level = -1;
7226         wc->stage = DROP_REFERENCE;
7227         wc->update_ref = 0;
7228         wc->keep_locks = 1;
7229         wc->for_reloc = 1;
7230         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(root);
7231
7232         while (1) {
7233                 wret = walk_down_tree(trans, root, path, wc);
7234                 if (wret < 0) {
7235                         ret = wret;
7236                         break;
7237                 }
7238
7239                 wret = walk_up_tree(trans, root, path, wc, parent_level);
7240                 if (wret < 0)
7241                         ret = wret;
7242                 if (wret != 0)
7243                         break;
7244         }
7245
7246         kfree(wc);
7247         btrfs_free_path(path);
7248         return ret;
7249 }
7250
7251 static u64 update_block_group_flags(struct btrfs_root *root, u64 flags)
7252 {
7253         u64 num_devices;
7254         u64 stripped;
7255
7256         /*
7257          * if restripe for this chunk_type is on pick target profile and
7258          * return, otherwise do the usual balance
7259          */
7260         stripped = get_restripe_target(root->fs_info, flags);
7261         if (stripped)
7262                 return extended_to_chunk(stripped);
7263
7264         /*
7265          * we add in the count of missing devices because we want
7266          * to make sure that any RAID levels on a degraded FS
7267          * continue to be honored.
7268          */
7269         num_devices = root->fs_info->fs_devices->rw_devices +
7270                 root->fs_info->fs_devices->missing_devices;
7271
7272         stripped = BTRFS_BLOCK_GROUP_RAID0 |
7273                 BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10;
7274
7275         if (num_devices == 1) {
7276                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7277                 stripped = flags & ~stripped;
7278
7279                 /* turn raid0 into single device chunks */
7280                 if (flags & BTRFS_BLOCK_GROUP_RAID0)
7281                         return stripped;
7282
7283                 /* turn mirroring into duplication */
7284                 if (flags & (BTRFS_BLOCK_GROUP_RAID1 |
7285                              BTRFS_BLOCK_GROUP_RAID10))
7286                         return stripped | BTRFS_BLOCK_GROUP_DUP;
7287         } else {
7288                 /* they already had raid on here, just return */
7289                 if (flags & stripped)
7290                         return flags;
7291
7292                 stripped |= BTRFS_BLOCK_GROUP_DUP;
7293                 stripped = flags & ~stripped;
7294
7295                 /* switch duplicated blocks with raid1 */
7296                 if (flags & BTRFS_BLOCK_GROUP_DUP)
7297                         return stripped | BTRFS_BLOCK_GROUP_RAID1;
7298
7299                 /* this is drive concat, leave it alone */
7300         }
7301
7302         return flags;
7303 }
7304
7305 static int set_block_group_ro(struct btrfs_block_group_cache *cache, int force)
7306 {
7307         struct btrfs_space_info *sinfo = cache->space_info;
7308         u64 num_bytes;
7309         u64 min_allocable_bytes;
7310         int ret = -ENOSPC;
7311
7312
7313         /*
7314          * We need some metadata space and system metadata space for
7315          * allocating chunks in some corner cases until we force to set
7316          * it to be readonly.
7317          */
7318         if ((sinfo->flags &
7319              (BTRFS_BLOCK_GROUP_SYSTEM | BTRFS_BLOCK_GROUP_METADATA)) &&
7320             !force)
7321                 min_allocable_bytes = 1 * 1024 * 1024;
7322         else
7323                 min_allocable_bytes = 0;
7324
7325         spin_lock(&sinfo->lock);
7326         spin_lock(&cache->lock);
7327
7328         if (cache->ro) {
7329                 ret = 0;
7330                 goto out;
7331         }
7332
7333         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7334                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7335
7336         if (sinfo->bytes_used + sinfo->bytes_reserved + sinfo->bytes_pinned +
7337             sinfo->bytes_may_use + sinfo->bytes_readonly + num_bytes +
7338             min_allocable_bytes <= sinfo->total_bytes) {
7339                 sinfo->bytes_readonly += num_bytes;
7340                 cache->ro = 1;
7341                 ret = 0;
7342         }
7343 out:
7344         spin_unlock(&cache->lock);
7345         spin_unlock(&sinfo->lock);
7346         return ret;
7347 }
7348
7349 int btrfs_set_block_group_ro(struct btrfs_root *root,
7350                              struct btrfs_block_group_cache *cache)
7351
7352 {
7353         struct btrfs_trans_handle *trans;
7354         u64 alloc_flags;
7355         int ret;
7356
7357         BUG_ON(cache->ro);
7358
7359         trans = btrfs_join_transaction(root);
7360         if (IS_ERR(trans))
7361                 return PTR_ERR(trans);
7362
7363         alloc_flags = update_block_group_flags(root, cache->flags);
7364         if (alloc_flags != cache->flags) {
7365                 ret = do_chunk_alloc(trans, root, alloc_flags,
7366                                      CHUNK_ALLOC_FORCE);
7367                 if (ret < 0)
7368                         goto out;
7369         }
7370
7371         ret = set_block_group_ro(cache, 0);
7372         if (!ret)
7373                 goto out;
7374         alloc_flags = get_alloc_profile(root, cache->space_info->flags);
7375         ret = do_chunk_alloc(trans, root, alloc_flags,
7376                              CHUNK_ALLOC_FORCE);
7377         if (ret < 0)
7378                 goto out;
7379         ret = set_block_group_ro(cache, 0);
7380 out:
7381         btrfs_end_transaction(trans, root);
7382         return ret;
7383 }
7384
7385 int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans,
7386                             struct btrfs_root *root, u64 type)
7387 {
7388         u64 alloc_flags = get_alloc_profile(root, type);
7389         return do_chunk_alloc(trans, root, alloc_flags,
7390                               CHUNK_ALLOC_FORCE);
7391 }
7392
7393 /*
7394  * helper to account the unused space of all the readonly block group in the
7395  * list. takes mirrors into account.
7396  */
7397 static u64 __btrfs_get_ro_block_group_free_space(struct list_head *groups_list)
7398 {
7399         struct btrfs_block_group_cache *block_group;
7400         u64 free_bytes = 0;
7401         int factor;
7402
7403         list_for_each_entry(block_group, groups_list, list) {
7404                 spin_lock(&block_group->lock);
7405
7406                 if (!block_group->ro) {
7407                         spin_unlock(&block_group->lock);
7408                         continue;
7409                 }
7410
7411                 if (block_group->flags & (BTRFS_BLOCK_GROUP_RAID1 |
7412                                           BTRFS_BLOCK_GROUP_RAID10 |
7413                                           BTRFS_BLOCK_GROUP_DUP))
7414                         factor = 2;
7415                 else
7416                         factor = 1;
7417
7418                 free_bytes += (block_group->key.offset -
7419                                btrfs_block_group_used(&block_group->item)) *
7420                                factor;
7421
7422                 spin_unlock(&block_group->lock);
7423         }
7424
7425         return free_bytes;
7426 }
7427
7428 /*
7429  * helper to account the unused space of all the readonly block group in the
7430  * space_info. takes mirrors into account.
7431  */
7432 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
7433 {
7434         int i;
7435         u64 free_bytes = 0;
7436
7437         spin_lock(&sinfo->lock);
7438
7439         for(i = 0; i < BTRFS_NR_RAID_TYPES; i++)
7440                 if (!list_empty(&sinfo->block_groups[i]))
7441                         free_bytes += __btrfs_get_ro_block_group_free_space(
7442                                                 &sinfo->block_groups[i]);
7443
7444         spin_unlock(&sinfo->lock);
7445
7446         return free_bytes;
7447 }
7448
7449 void btrfs_set_block_group_rw(struct btrfs_root *root,
7450                               struct btrfs_block_group_cache *cache)
7451 {
7452         struct btrfs_space_info *sinfo = cache->space_info;
7453         u64 num_bytes;
7454
7455         BUG_ON(!cache->ro);
7456
7457         spin_lock(&sinfo->lock);
7458         spin_lock(&cache->lock);
7459         num_bytes = cache->key.offset - cache->reserved - cache->pinned -
7460                     cache->bytes_super - btrfs_block_group_used(&cache->item);
7461         sinfo->bytes_readonly -= num_bytes;
7462         cache->ro = 0;
7463         spin_unlock(&cache->lock);
7464         spin_unlock(&sinfo->lock);
7465 }
7466
7467 /*
7468  * checks to see if its even possible to relocate this block group.
7469  *
7470  * @return - -1 if it's not a good idea to relocate this block group, 0 if its
7471  * ok to go ahead and try.
7472  */
7473 int btrfs_can_relocate(struct btrfs_root *root, u64 bytenr)
7474 {
7475         struct btrfs_block_group_cache *block_group;
7476         struct btrfs_space_info *space_info;
7477         struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices;
7478         struct btrfs_device *device;
7479         u64 min_free;
7480         u64 dev_min = 1;
7481         u64 dev_nr = 0;
7482         u64 target;
7483         int index;
7484         int full = 0;
7485         int ret = 0;
7486
7487         block_group = btrfs_lookup_block_group(root->fs_info, bytenr);
7488
7489         /* odd, couldn't find the block group, leave it alone */
7490         if (!block_group)
7491                 return -1;
7492
7493         min_free = btrfs_block_group_used(&block_group->item);
7494
7495         /* no bytes used, we're good */
7496         if (!min_free)
7497                 goto out;
7498
7499         space_info = block_group->space_info;
7500         spin_lock(&space_info->lock);
7501
7502         full = space_info->full;
7503
7504         /*
7505          * if this is the last block group we have in this space, we can't
7506          * relocate it unless we're able to allocate a new chunk below.
7507          *
7508          * Otherwise, we need to make sure we have room in the space to handle
7509          * all of the extents from this block group.  If we can, we're good
7510          */
7511         if ((space_info->total_bytes != block_group->key.offset) &&
7512             (space_info->bytes_used + space_info->bytes_reserved +
7513              space_info->bytes_pinned + space_info->bytes_readonly +
7514              min_free < space_info->total_bytes)) {
7515                 spin_unlock(&space_info->lock);
7516                 goto out;
7517         }
7518         spin_unlock(&space_info->lock);
7519
7520         /*
7521          * ok we don't have enough space, but maybe we have free space on our
7522          * devices to allocate new chunks for relocation, so loop through our
7523          * alloc devices and guess if we have enough space.  if this block
7524          * group is going to be restriped, run checks against the target
7525          * profile instead of the current one.
7526          */
7527         ret = -1;
7528
7529         /*
7530          * index:
7531          *      0: raid10
7532          *      1: raid1
7533          *      2: dup
7534          *      3: raid0
7535          *      4: single
7536          */
7537         target = get_restripe_target(root->fs_info, block_group->flags);
7538         if (target) {
7539                 index = __get_raid_index(extended_to_chunk(target));
7540         } else {
7541                 /*
7542                  * this is just a balance, so if we were marked as full
7543                  * we know there is no space for a new chunk
7544                  */
7545                 if (full)
7546                         goto out;
7547
7548                 index = get_block_group_index(block_group);
7549         }
7550
7551         if (index == BTRFS_RAID_RAID10) {
7552                 dev_min = 4;
7553                 /* Divide by 2 */
7554                 min_free >>= 1;
7555         } else if (index == BTRFS_RAID_RAID1) {
7556                 dev_min = 2;
7557         } else if (index == BTRFS_RAID_DUP) {
7558                 /* Multiply by 2 */
7559                 min_free <<= 1;
7560         } else if (index == BTRFS_RAID_RAID0) {
7561                 dev_min = fs_devices->rw_devices;
7562                 do_div(min_free, dev_min);
7563         }
7564
7565         mutex_lock(&root->fs_info->chunk_mutex);
7566         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
7567                 u64 dev_offset;
7568
7569                 /*
7570                  * check to make sure we can actually find a chunk with enough
7571                  * space to fit our block group in.
7572                  */
7573                 if (device->total_bytes > device->bytes_used + min_free &&
7574                     !device->is_tgtdev_for_dev_replace) {
7575                         ret = find_free_dev_extent(device, min_free,
7576                                                    &dev_offset, NULL);
7577                         if (!ret)
7578                                 dev_nr++;
7579
7580                         if (dev_nr >= dev_min)
7581                                 break;
7582
7583                         ret = -1;
7584                 }
7585         }
7586         mutex_unlock(&root->fs_info->chunk_mutex);
7587 out:
7588         btrfs_put_block_group(block_group);
7589         return ret;
7590 }
7591
7592 static int find_first_block_group(struct btrfs_root *root,
7593                 struct btrfs_path *path, struct btrfs_key *key)
7594 {
7595         int ret = 0;
7596         struct btrfs_key found_key;
7597         struct extent_buffer *leaf;
7598         int slot;
7599
7600         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
7601         if (ret < 0)
7602                 goto out;
7603
7604         while (1) {
7605                 slot = path->slots[0];
7606                 leaf = path->nodes[0];
7607                 if (slot >= btrfs_header_nritems(leaf)) {
7608                         ret = btrfs_next_leaf(root, path);
7609                         if (ret == 0)
7610                                 continue;
7611                         if (ret < 0)
7612                                 goto out;
7613                         break;
7614                 }
7615                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7616
7617                 if (found_key.objectid >= key->objectid &&
7618                     found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
7619                         ret = 0;
7620                         goto out;
7621                 }
7622                 path->slots[0]++;
7623         }
7624 out:
7625         return ret;
7626 }
7627
7628 void btrfs_put_block_group_cache(struct btrfs_fs_info *info)
7629 {
7630         struct btrfs_block_group_cache *block_group;
7631         u64 last = 0;
7632
7633         while (1) {
7634                 struct inode *inode;
7635
7636                 block_group = btrfs_lookup_first_block_group(info, last);
7637                 while (block_group) {
7638                         spin_lock(&block_group->lock);
7639                         if (block_group->iref)
7640                                 break;
7641                         spin_unlock(&block_group->lock);
7642                         block_group = next_block_group(info->tree_root,
7643                                                        block_group);
7644                 }
7645                 if (!block_group) {
7646                         if (last == 0)
7647                                 break;
7648                         last = 0;
7649                         continue;
7650                 }
7651
7652                 inode = block_group->inode;
7653                 block_group->iref = 0;
7654                 block_group->inode = NULL;
7655                 spin_unlock(&block_group->lock);
7656                 iput(inode);
7657                 last = block_group->key.objectid + block_group->key.offset;
7658                 btrfs_put_block_group(block_group);
7659         }
7660 }
7661
7662 int btrfs_free_block_groups(struct btrfs_fs_info *info)
7663 {
7664         struct btrfs_block_group_cache *block_group;
7665         struct btrfs_space_info *space_info;
7666         struct btrfs_caching_control *caching_ctl;
7667         struct rb_node *n;
7668
7669         down_write(&info->extent_commit_sem);
7670         while (!list_empty(&info->caching_block_groups)) {
7671                 caching_ctl = list_entry(info->caching_block_groups.next,
7672                                          struct btrfs_caching_control, list);
7673                 list_del(&caching_ctl->list);
7674                 put_caching_control(caching_ctl);
7675         }
7676         up_write(&info->extent_commit_sem);
7677
7678         spin_lock(&info->block_group_cache_lock);
7679         while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
7680                 block_group = rb_entry(n, struct btrfs_block_group_cache,
7681                                        cache_node);
7682                 rb_erase(&block_group->cache_node,
7683                          &info->block_group_cache_tree);
7684                 spin_unlock(&info->block_group_cache_lock);
7685
7686                 down_write(&block_group->space_info->groups_sem);
7687                 list_del(&block_group->list);
7688                 up_write(&block_group->space_info->groups_sem);
7689
7690                 if (block_group->cached == BTRFS_CACHE_STARTED)
7691                         wait_block_group_cache_done(block_group);
7692
7693                 /*
7694                  * We haven't cached this block group, which means we could
7695                  * possibly have excluded extents on this block group.
7696                  */
7697                 if (block_group->cached == BTRFS_CACHE_NO)
7698                         free_excluded_extents(info->extent_root, block_group);
7699
7700                 btrfs_remove_free_space_cache(block_group);
7701                 btrfs_put_block_group(block_group);
7702
7703                 spin_lock(&info->block_group_cache_lock);
7704         }
7705         spin_unlock(&info->block_group_cache_lock);
7706
7707         /* now that all the block groups are freed, go through and
7708          * free all the space_info structs.  This is only called during
7709          * the final stages of unmount, and so we know nobody is
7710          * using them.  We call synchronize_rcu() once before we start,
7711          * just to be on the safe side.
7712          */
7713         synchronize_rcu();
7714
7715         release_global_block_rsv(info);
7716
7717         while(!list_empty(&info->space_info)) {
7718                 space_info = list_entry(info->space_info.next,
7719                                         struct btrfs_space_info,
7720                                         list);
7721                 if (space_info->bytes_pinned > 0 ||
7722                     space_info->bytes_reserved > 0 ||
7723                     space_info->bytes_may_use > 0) {
7724                         WARN_ON(1);
7725                         dump_space_info(space_info, 0, 0);
7726                 }
7727                 list_del(&space_info->list);
7728                 kfree(space_info);
7729         }
7730         return 0;
7731 }
7732
7733 static void __link_block_group(struct btrfs_space_info *space_info,
7734                                struct btrfs_block_group_cache *cache)
7735 {
7736         int index = get_block_group_index(cache);
7737
7738         down_write(&space_info->groups_sem);
7739         list_add_tail(&cache->list, &space_info->block_groups[index]);
7740         up_write(&space_info->groups_sem);
7741 }
7742
7743 int btrfs_read_block_groups(struct btrfs_root *root)
7744 {
7745         struct btrfs_path *path;
7746         int ret;
7747         struct btrfs_block_group_cache *cache;
7748         struct btrfs_fs_info *info = root->fs_info;
7749         struct btrfs_space_info *space_info;
7750         struct btrfs_key key;
7751         struct btrfs_key found_key;
7752         struct extent_buffer *leaf;
7753         int need_clear = 0;
7754         u64 cache_gen;
7755
7756         root = info->extent_root;
7757         key.objectid = 0;
7758         key.offset = 0;
7759         btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
7760         path = btrfs_alloc_path();
7761         if (!path)
7762                 return -ENOMEM;
7763         path->reada = 1;
7764
7765         cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy);
7766         if (btrfs_test_opt(root, SPACE_CACHE) &&
7767             btrfs_super_generation(root->fs_info->super_copy) != cache_gen)
7768                 need_clear = 1;
7769         if (btrfs_test_opt(root, CLEAR_CACHE))
7770                 need_clear = 1;
7771
7772         while (1) {
7773                 ret = find_first_block_group(root, path, &key);
7774                 if (ret > 0)
7775                         break;
7776                 if (ret != 0)
7777                         goto error;
7778                 leaf = path->nodes[0];
7779                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
7780                 cache = kzalloc(sizeof(*cache), GFP_NOFS);
7781                 if (!cache) {
7782                         ret = -ENOMEM;
7783                         goto error;
7784                 }
7785                 cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7786                                                 GFP_NOFS);
7787                 if (!cache->free_space_ctl) {
7788                         kfree(cache);
7789                         ret = -ENOMEM;
7790                         goto error;
7791                 }
7792
7793                 atomic_set(&cache->count, 1);
7794                 spin_lock_init(&cache->lock);
7795                 cache->fs_info = info;
7796                 INIT_LIST_HEAD(&cache->list);
7797                 INIT_LIST_HEAD(&cache->cluster_list);
7798
7799                 if (need_clear) {
7800                         /*
7801                          * When we mount with old space cache, we need to
7802                          * set BTRFS_DC_CLEAR and set dirty flag.
7803                          *
7804                          * a) Setting 'BTRFS_DC_CLEAR' makes sure that we
7805                          *    truncate the old free space cache inode and
7806                          *    setup a new one.
7807                          * b) Setting 'dirty flag' makes sure that we flush
7808                          *    the new space cache info onto disk.
7809                          */
7810                         cache->disk_cache_state = BTRFS_DC_CLEAR;
7811                         if (btrfs_test_opt(root, SPACE_CACHE))
7812                                 cache->dirty = 1;
7813                 }
7814
7815                 read_extent_buffer(leaf, &cache->item,
7816                                    btrfs_item_ptr_offset(leaf, path->slots[0]),
7817                                    sizeof(cache->item));
7818                 memcpy(&cache->key, &found_key, sizeof(found_key));
7819
7820                 key.objectid = found_key.objectid + found_key.offset;
7821                 btrfs_release_path(path);
7822                 cache->flags = btrfs_block_group_flags(&cache->item);
7823                 cache->sectorsize = root->sectorsize;
7824
7825                 btrfs_init_free_space_ctl(cache);
7826
7827                 /*
7828                  * We need to exclude the super stripes now so that the space
7829                  * info has super bytes accounted for, otherwise we'll think
7830                  * we have more space than we actually do.
7831                  */
7832                 exclude_super_stripes(root, cache);
7833
7834                 /*
7835                  * check for two cases, either we are full, and therefore
7836                  * don't need to bother with the caching work since we won't
7837                  * find any space, or we are empty, and we can just add all
7838                  * the space in and be done with it.  This saves us _alot_ of
7839                  * time, particularly in the full case.
7840                  */
7841                 if (found_key.offset == btrfs_block_group_used(&cache->item)) {
7842                         cache->last_byte_to_unpin = (u64)-1;
7843                         cache->cached = BTRFS_CACHE_FINISHED;
7844                         free_excluded_extents(root, cache);
7845                 } else if (btrfs_block_group_used(&cache->item) == 0) {
7846                         cache->last_byte_to_unpin = (u64)-1;
7847                         cache->cached = BTRFS_CACHE_FINISHED;
7848                         add_new_free_space(cache, root->fs_info,
7849                                            found_key.objectid,
7850                                            found_key.objectid +
7851                                            found_key.offset);
7852                         free_excluded_extents(root, cache);
7853                 }
7854
7855                 ret = update_space_info(info, cache->flags, found_key.offset,
7856                                         btrfs_block_group_used(&cache->item),
7857                                         &space_info);
7858                 BUG_ON(ret); /* -ENOMEM */
7859                 cache->space_info = space_info;
7860                 spin_lock(&cache->space_info->lock);
7861                 cache->space_info->bytes_readonly += cache->bytes_super;
7862                 spin_unlock(&cache->space_info->lock);
7863
7864                 __link_block_group(space_info, cache);
7865
7866                 ret = btrfs_add_block_group_cache(root->fs_info, cache);
7867                 BUG_ON(ret); /* Logic error */
7868
7869                 set_avail_alloc_bits(root->fs_info, cache->flags);
7870                 if (btrfs_chunk_readonly(root, cache->key.objectid))
7871                         set_block_group_ro(cache, 1);
7872         }
7873
7874         list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) {
7875                 if (!(get_alloc_profile(root, space_info->flags) &
7876                       (BTRFS_BLOCK_GROUP_RAID10 |
7877                        BTRFS_BLOCK_GROUP_RAID1 |
7878                        BTRFS_BLOCK_GROUP_DUP)))
7879                         continue;
7880                 /*
7881                  * avoid allocating from un-mirrored block group if there are
7882                  * mirrored block groups.
7883                  */
7884                 list_for_each_entry(cache, &space_info->block_groups[3], list)
7885                         set_block_group_ro(cache, 1);
7886                 list_for_each_entry(cache, &space_info->block_groups[4], list)
7887                         set_block_group_ro(cache, 1);
7888         }
7889
7890         init_global_block_rsv(info);
7891         ret = 0;
7892 error:
7893         btrfs_free_path(path);
7894         return ret;
7895 }
7896
7897 void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans,
7898                                        struct btrfs_root *root)
7899 {
7900         struct btrfs_block_group_cache *block_group, *tmp;
7901         struct btrfs_root *extent_root = root->fs_info->extent_root;
7902         struct btrfs_block_group_item item;
7903         struct btrfs_key key;
7904         int ret = 0;
7905
7906         list_for_each_entry_safe(block_group, tmp, &trans->new_bgs,
7907                                  new_bg_list) {
7908                 list_del_init(&block_group->new_bg_list);
7909
7910                 if (ret)
7911                         continue;
7912
7913                 spin_lock(&block_group->lock);
7914                 memcpy(&item, &block_group->item, sizeof(item));
7915                 memcpy(&key, &block_group->key, sizeof(key));
7916                 spin_unlock(&block_group->lock);
7917
7918                 ret = btrfs_insert_item(trans, extent_root, &key, &item,
7919                                         sizeof(item));
7920                 if (ret)
7921                         btrfs_abort_transaction(trans, extent_root, ret);
7922         }
7923 }
7924
7925 int btrfs_make_block_group(struct btrfs_trans_handle *trans,
7926                            struct btrfs_root *root, u64 bytes_used,
7927                            u64 type, u64 chunk_objectid, u64 chunk_offset,
7928                            u64 size)
7929 {
7930         int ret;
7931         struct btrfs_root *extent_root;
7932         struct btrfs_block_group_cache *cache;
7933
7934         extent_root = root->fs_info->extent_root;
7935
7936         root->fs_info->last_trans_log_full_commit = trans->transid;
7937
7938         cache = kzalloc(sizeof(*cache), GFP_NOFS);
7939         if (!cache)
7940                 return -ENOMEM;
7941         cache->free_space_ctl = kzalloc(sizeof(*cache->free_space_ctl),
7942                                         GFP_NOFS);
7943         if (!cache->free_space_ctl) {
7944                 kfree(cache);
7945                 return -ENOMEM;
7946         }
7947
7948         cache->key.objectid = chunk_offset;
7949         cache->key.offset = size;
7950         cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
7951         cache->sectorsize = root->sectorsize;
7952         cache->fs_info = root->fs_info;
7953
7954         atomic_set(&cache->count, 1);
7955         spin_lock_init(&cache->lock);
7956         INIT_LIST_HEAD(&cache->list);
7957         INIT_LIST_HEAD(&cache->cluster_list);
7958         INIT_LIST_HEAD(&cache->new_bg_list);
7959
7960         btrfs_init_free_space_ctl(cache);
7961
7962         btrfs_set_block_group_used(&cache->item, bytes_used);
7963         btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid);
7964         cache->flags = type;
7965         btrfs_set_block_group_flags(&cache->item, type);
7966
7967         cache->last_byte_to_unpin = (u64)-1;
7968         cache->cached = BTRFS_CACHE_FINISHED;
7969         exclude_super_stripes(root, cache);
7970
7971         add_new_free_space(cache, root->fs_info, chunk_offset,
7972                            chunk_offset + size);
7973
7974         free_excluded_extents(root, cache);
7975
7976         ret = update_space_info(root->fs_info, cache->flags, size, bytes_used,
7977                                 &cache->space_info);
7978         BUG_ON(ret); /* -ENOMEM */
7979         update_global_block_rsv(root->fs_info);
7980
7981         spin_lock(&cache->space_info->lock);
7982         cache->space_info->bytes_readonly += cache->bytes_super;
7983         spin_unlock(&cache->space_info->lock);
7984
7985         __link_block_group(cache->space_info, cache);
7986
7987         ret = btrfs_add_block_group_cache(root->fs_info, cache);
7988         BUG_ON(ret); /* Logic error */
7989
7990         list_add_tail(&cache->new_bg_list, &trans->new_bgs);
7991
7992         set_avail_alloc_bits(extent_root->fs_info, type);
7993
7994         return 0;
7995 }
7996
7997 static void clear_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
7998 {
7999         u64 extra_flags = chunk_to_extended(flags) &
8000                                 BTRFS_EXTENDED_PROFILE_MASK;
8001
8002         write_seqlock(&fs_info->profiles_lock);
8003         if (flags & BTRFS_BLOCK_GROUP_DATA)
8004                 fs_info->avail_data_alloc_bits &= ~extra_flags;
8005         if (flags & BTRFS_BLOCK_GROUP_METADATA)
8006                 fs_info->avail_metadata_alloc_bits &= ~extra_flags;
8007         if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
8008                 fs_info->avail_system_alloc_bits &= ~extra_flags;
8009         write_sequnlock(&fs_info->profiles_lock);
8010 }
8011
8012 int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
8013                              struct btrfs_root *root, u64 group_start)
8014 {
8015         struct btrfs_path *path;
8016         struct btrfs_block_group_cache *block_group;
8017         struct btrfs_free_cluster *cluster;
8018         struct btrfs_root *tree_root = root->fs_info->tree_root;
8019         struct btrfs_key key;
8020         struct inode *inode;
8021         int ret;
8022         int index;
8023         int factor;
8024
8025         root = root->fs_info->extent_root;
8026
8027         block_group = btrfs_lookup_block_group(root->fs_info, group_start);
8028         BUG_ON(!block_group);
8029         BUG_ON(!block_group->ro);
8030
8031         /*
8032          * Free the reserved super bytes from this block group before
8033          * remove it.
8034          */
8035         free_excluded_extents(root, block_group);
8036
8037         memcpy(&key, &block_group->key, sizeof(key));
8038         index = get_block_group_index(block_group);
8039         if (block_group->flags & (BTRFS_BLOCK_GROUP_DUP |
8040                                   BTRFS_BLOCK_GROUP_RAID1 |
8041                                   BTRFS_BLOCK_GROUP_RAID10))
8042                 factor = 2;
8043         else
8044                 factor = 1;
8045
8046         /* make sure this block group isn't part of an allocation cluster */
8047         cluster = &root->fs_info->data_alloc_cluster;
8048         spin_lock(&cluster->refill_lock);
8049         btrfs_return_cluster_to_free_space(block_group, cluster);
8050         spin_unlock(&cluster->refill_lock);
8051
8052         /*
8053          * make sure this block group isn't part of a metadata
8054          * allocation cluster
8055          */
8056         cluster = &root->fs_info->meta_alloc_cluster;
8057         spin_lock(&cluster->refill_lock);
8058         btrfs_return_cluster_to_free_space(block_group, cluster);
8059         spin_unlock(&cluster->refill_lock);
8060
8061         path = btrfs_alloc_path();
8062         if (!path) {
8063                 ret = -ENOMEM;
8064                 goto out;
8065         }
8066
8067         inode = lookup_free_space_inode(tree_root, block_group, path);
8068         if (!IS_ERR(inode)) {
8069                 ret = btrfs_orphan_add(trans, inode);
8070                 if (ret) {
8071                         btrfs_add_delayed_iput(inode);
8072                         goto out;
8073                 }
8074                 clear_nlink(inode);
8075                 /* One for the block groups ref */
8076                 spin_lock(&block_group->lock);
8077                 if (block_group->iref) {
8078                         block_group->iref = 0;
8079                         block_group->inode = NULL;
8080                         spin_unlock(&block_group->lock);
8081                         iput(inode);
8082                 } else {
8083                         spin_unlock(&block_group->lock);
8084                 }
8085                 /* One for our lookup ref */
8086                 btrfs_add_delayed_iput(inode);
8087         }
8088
8089         key.objectid = BTRFS_FREE_SPACE_OBJECTID;
8090         key.offset = block_group->key.objectid;
8091         key.type = 0;
8092
8093         ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
8094         if (ret < 0)
8095                 goto out;
8096         if (ret > 0)
8097                 btrfs_release_path(path);
8098         if (ret == 0) {
8099                 ret = btrfs_del_item(trans, tree_root, path);
8100                 if (ret)
8101                         goto out;
8102                 btrfs_release_path(path);
8103         }
8104
8105         spin_lock(&root->fs_info->block_group_cache_lock);
8106         rb_erase(&block_group->cache_node,
8107                  &root->fs_info->block_group_cache_tree);
8108
8109         if (root->fs_info->first_logical_byte == block_group->key.objectid)
8110                 root->fs_info->first_logical_byte = (u64)-1;
8111         spin_unlock(&root->fs_info->block_group_cache_lock);
8112
8113         down_write(&block_group->space_info->groups_sem);
8114         /*
8115          * we must use list_del_init so people can check to see if they
8116          * are still on the list after taking the semaphore
8117          */
8118         list_del_init(&block_group->list);
8119         if (list_empty(&block_group->space_info->block_groups[index]))
8120                 clear_avail_alloc_bits(root->fs_info, block_group->flags);
8121         up_write(&block_group->space_info->groups_sem);
8122
8123         if (block_group->cached == BTRFS_CACHE_STARTED)
8124                 wait_block_group_cache_done(block_group);
8125
8126         btrfs_remove_free_space_cache(block_group);
8127
8128         spin_lock(&block_group->space_info->lock);
8129         block_group->space_info->total_bytes -= block_group->key.offset;
8130         block_group->space_info->bytes_readonly -= block_group->key.offset;
8131         block_group->space_info->disk_total -= block_group->key.offset * factor;
8132         spin_unlock(&block_group->space_info->lock);
8133
8134         memcpy(&key, &block_group->key, sizeof(key));
8135
8136         btrfs_clear_space_info_full(root->fs_info);
8137
8138         btrfs_put_block_group(block_group);
8139         btrfs_put_block_group(block_group);
8140
8141         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
8142         if (ret > 0)
8143                 ret = -EIO;
8144         if (ret < 0)
8145                 goto out;
8146
8147         ret = btrfs_del_item(trans, root, path);
8148 out:
8149         btrfs_free_path(path);
8150         return ret;
8151 }
8152
8153 int btrfs_init_space_info(struct btrfs_fs_info *fs_info)
8154 {
8155         struct btrfs_space_info *space_info;
8156         struct btrfs_super_block *disk_super;
8157         u64 features;
8158         u64 flags;
8159         int mixed = 0;
8160         int ret;
8161
8162         disk_super = fs_info->super_copy;
8163         if (!btrfs_super_root(disk_super))
8164                 return 1;
8165
8166         features = btrfs_super_incompat_flags(disk_super);
8167         if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
8168                 mixed = 1;
8169
8170         flags = BTRFS_BLOCK_GROUP_SYSTEM;
8171         ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8172         if (ret)
8173                 goto out;
8174
8175         if (mixed) {
8176                 flags = BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA;
8177                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8178         } else {
8179                 flags = BTRFS_BLOCK_GROUP_METADATA;
8180                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8181                 if (ret)
8182                         goto out;
8183
8184                 flags = BTRFS_BLOCK_GROUP_DATA;
8185                 ret = update_space_info(fs_info, flags, 0, 0, &space_info);
8186         }
8187 out:
8188         return ret;
8189 }
8190
8191 int btrfs_error_unpin_extent_range(struct btrfs_root *root, u64 start, u64 end)
8192 {
8193         return unpin_extent_range(root, start, end);
8194 }
8195
8196 int btrfs_error_discard_extent(struct btrfs_root *root, u64 bytenr,
8197                                u64 num_bytes, u64 *actual_bytes)
8198 {
8199         return btrfs_discard_extent(root, bytenr, num_bytes, actual_bytes);
8200 }
8201
8202 int btrfs_trim_fs(struct btrfs_root *root, struct fstrim_range *range)
8203 {
8204         struct btrfs_fs_info *fs_info = root->fs_info;
8205         struct btrfs_block_group_cache *cache = NULL;
8206         u64 group_trimmed;
8207         u64 start;
8208         u64 end;
8209         u64 trimmed = 0;
8210         u64 total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
8211         int ret = 0;
8212
8213         /*
8214          * try to trim all FS space, our block group may start from non-zero.
8215          */
8216         if (range->len == total_bytes)
8217                 cache = btrfs_lookup_first_block_group(fs_info, range->start);
8218         else
8219                 cache = btrfs_lookup_block_group(fs_info, range->start);
8220
8221         while (cache) {
8222                 if (cache->key.objectid >= (range->start + range->len)) {
8223                         btrfs_put_block_group(cache);
8224                         break;
8225                 }
8226
8227                 start = max(range->start, cache->key.objectid);
8228                 end = min(range->start + range->len,
8229                                 cache->key.objectid + cache->key.offset);
8230
8231                 if (end - start >= range->minlen) {
8232                         if (!block_group_cache_done(cache)) {
8233                                 ret = cache_block_group(cache, 0);
8234                                 if (!ret)
8235                                         wait_block_group_cache_done(cache);
8236                         }
8237                         ret = btrfs_trim_block_group(cache,
8238                                                      &group_trimmed,
8239                                                      start,
8240                                                      end,
8241                                                      range->minlen);
8242
8243                         trimmed += group_trimmed;
8244                         if (ret) {
8245                                 btrfs_put_block_group(cache);
8246                                 break;
8247                         }
8248                 }
8249
8250                 cache = next_block_group(fs_info->tree_root, cache);
8251         }
8252
8253         range->len = trimmed;
8254         return ret;
8255 }