3 #include <flux/x86/types.h> // for vm_offset_t, vm_size_t
4 #include "space.h" // for space_index_t
6 enum mapping_type_t { Map_mem = 0, Map_io };
8 class mapping_tree_t; // forward decls
17 friend class mapping_tree_t;
21 mapping_t(const mapping_t&); // this constructor is undefined.
25 } __attribute__((packed));
28 struct physframe_data;
31 // class mapdb_t: The mapping database
38 enum { Size_factor = 4,
39 Size_id_max = 8 /* can be up to 15 (4 bits) */ };
43 physframe_data *physframe;
44 kmem_slab_t *allocator_for_treesize[Size_id_max + 1];
49 // The mapping database.
51 // This implementation encodes mapping trees in very compact arrays of
52 // fixed sizes, prefixed by a tree header (mapping_tree_t). Array
53 // sizes can vary from 4 mappings to 4<<15 mappings. For each size,
54 // we set up a slab allocator. To grow or shrink the size of an
55 // array, we have to allocate a larger or smaller tree from the
56 // corresponding allocator and then copy the array elements.
58 // The array elements (mapping_t) each contain a tree depth. This
59 // depth and the element's relative position in the array are all the
60 // information we need to derive the tree structure. Here is an example:
64 // number  depth  comment
65 // --------------------------
67 // 1 1 child of element #0 with depth 0
68 // 2 2 child of element #1 with depth 1
69 // 3 2 child of element #1 with depth 1
70 // 4 3 child of element #3 with depth 2
71 // 5 2 child of element #1 with depth 1
72 // 6 3 child of element #5 with depth 2
73 // 7 1 child of element #0 with depth 0
75 // This array is a pre-order encoding of the following tree:
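//
// A minimal illustration (not part of this module) of how the tree
// structure falls out of nothing but the depth values and the array
// positions: the parent of an element is simply the nearest preceding
// element with a smaller depth.
//
//   int find_parent(const unsigned char depth[], int i)
//   {
//     for (int p = i - 1; p >= 0; p--)
//       if (depth[p] < depth[i])
//         return p;        // nearest preceding shallower entry
//     return -1;           // element #0 (depth 0) has no parent
//   }
//
// For the example above, find_parent(depth, 4) == 3 and
// find_parent(depth, 7) == 0, matching the comments in the table.
// mapping_t::parent() below implements essentially this backward scan.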
85 // IDEAS for enhancing this implementation:
87 // We often have to find a tree header corresponding to a mapping.
88 // Currently, we do this by iterating backwards over the array
89 // containing the mappings until we find the Sigma0 mapping, from
90 // whose address we can compute the address of the tree header. If
91 // this becomes a problem, we could add one more byte to the mappings
92 // with a hint (negative array offset) where to find the sigma0
93 // mapping. (If the hint value overflows, just continue iterating from
94 // the mapping the first hint points to, using that mapping's hint.) Another
95 // idea (from Adam) would be to just look up the tree header by using
96 // the physical address from the page-table lookup, but we would need
97 // to change the interface of the mapping database for that (pass in
98 // the physical address at all times), or we would have to include the
99 // physical address (or just the address of the tree header) in the
100 // mapdb_t-user-visible mapping_t (which could be different from the
101 // internal tree representation). (XXX: Implementing one of these
102 // ideas is probably worth doing!)
104 // Instead of copying whole trees around when they grow or shrink a
105 // lot, or copying parts of trees when inserting an element, we could
106 // give up the array representation and add a "next" pointer to the
107 // elements -- that is, keep the tree of mappings in a
108 // pre-order-encoded singly-linked list (credits to: Christian Szmajda
109 // and Adam Wiggins). 24 bits would probably be enough to encode that
110 // pointer. Disadvantages: Mapping entries would be larger, and the
111 // cache-friendly space-locality of tree entries would be lost.
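//
// (Illustrative only -- no such field exists in this module: on top of
// today's mapping_s fields one would add something like
//
//    unsigned next : 24;   // array index of the next entry in pre-order
//
// making every entry three bytes larger and giving up the contiguous,
// cache-friendly layout, exactly the trade-off noted above.)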
113 // The current handling of superpages sucks rocks both in this module
114 // and in our user, ipc_map.cc. We could support multiple page sizes
115 // by using the physframe[] array only for the largest page size.
116 // (Entries of that array point to the top-level mappings -- sigma0
117 // mappings.) Mapping-tree entries would then either be regular
118 // mappings or pointers to an array of mappings of the next-smaller
119 // size. (credits: Christian Szmajda)
122 //      -------------------------------
123 //      | | | | | | | | | | | | | | | |   array of ptr to 4M mapping_tree_t's
124 //      ---|---------------------------
126 //         v  a mapping_tree_t
130 //      |             |   mapping_t *or* ptr to array of ptr to 4K trees
132 //      |      ----------------|
133 //      |      |               v   array of ptr to 4K mapping_tree_t's
134 //      ---------------        -------------------------------
135 //                             | | | | | | | | | | | | | | | |
136 //                             ---|---------------------------
138 //                                v  a mapping_tree_t
152 #ifndef offsetof // should be defined in stddef.h, but isn't
153 #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
156 // For implementation of mapping_t functions, we need mapping_tree_t.
159 // class mapping_tree_t
163 friend class mapping_t;
164 friend class mapdb_t;
169 unsigned _size_id: 4;
170 unsigned _empty_count: 11;
171 unsigned _unused: 1; // make this 32 bits to avoid a compiler bug
173 mapping_t _mappings[0] __attribute__((packed));
175 } __attribute__((packed));
177 // public routines with inline implementations
180 mapping_tree_t::number_of_entries() const
182 return mapdb_t::Size_factor << _size_id;
187 mapping_tree_t::mappings()
189 return & _mappings[0];
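// A quick worked example (comment only): with Size_factor == 4,
// number_of_entries() yields 4 entries for _size_id == 0 and
// 4 << 8 == 1024 entries for _size_id == Size_id_max (8). The storage
// needed for a tree of a given size id is therefore
//
//   sizeof(mapping_tree_t) + (Size_factor << size_id) * sizeof(mapping_t)
//
// which is exactly the element size used for the per-size slab
// allocators in mapdb_t's constructor below.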
192 // Define (otherwise private) stuff needed by public inline
197 Depth_sigma0_mapping = 0, Depth_max = 253,
198 Depth_empty = 254, Depth_end = 255
206 mapping_depth_t depth:8;
207 unsigned do_not_touch: 24; // make this 64 bits, and make sure the
208 // compiler never touches memory after the
213 mapping_t::mapping_t()
220 return reinterpret_cast<mapping_s*>(_data);
223 PUBLIC inline NEEDS[mapping_depth_t, mapping_s, mapping_t::data]
227 return space_index_t(data()->space);
230 PUBLIC inline NEEDS[mapping_depth_t, mapping_s, mapping_t::data]
234 return (data()->address << PAGE_SHIFT);
237 PUBLIC inline NEEDS[mapping_depth_t, mapping_s, mapping_t::data]
242 return SUPERPAGE_SIZE;
251 // return data()->type;;
255 inline NEEDS[mapping_depth_t, mapping_s, mapping_t::data]
259 return (data()->depth > Depth_max);
267 while (m->data()->depth > Depth_sigma0_mapping)
269 // jump in bigger steps if this is not a free mapping
272 m -= m->data()->depth;
279 return reinterpret_cast<mapping_tree_t *>
280 (reinterpret_cast<char *>(m) - offsetof(mapping_tree_t, _mappings));
284 // more of class mapping_t
290 if (data()->depth <= Depth_sigma0_mapping)
292 // Sigma0 mappings don't have a parent.
296 // Iterate over mapping entries of this tree backwards until we find
297 // an entry with a depth smaller than ours. (We assume here that
298 // "special" depths (empty, end) are larger than Depth_max.)
299 mapping_t *m = this - 1;
301 while (m->data()->depth >= data()->depth)
308 mapping_t::next_iter()
310 mapping_tree_t *t = tree();
311 mapping_t *last_in_tree = t->mappings() + t->number_of_entries() - 1;
315 // Look for a valid element in the tree structure.
316 while (m != last_in_tree)
320 return m; // Found a valid entry!
322 // Don't iterate to end of tree if this is the last tree entry.
323 if (m->data()->depth == Depth_end)
327 // Couldn't find a non-empty element behind ourselves in the tree
333 mapping_t::next_child(mapping_t *parent)
335 // Find the next valid entry in the tree structure.
336 mapping_t *m = next_iter();
338 // If we didn't find an entry, or if the entry cannot be a child of
339 // "parent", return 0
341 || m->data()->depth <= parent->data()->depth)
350 // class mapping_tree_t
353 // This function copies the elements of mapping tree src to mapping
354 // tree dst, ignoring empty elements (that is, compressing the
355 // source tree). In-place compression is supported.
357 mapping_tree_t::copy_compact_tree(mapping_tree_t *dst, mapping_tree_t *src)
360 dst->_empty_count = 0;
362 mapping_t *d = dst->mappings();
364 for (mapping_t *s = src->mappings();
365 s < src->mappings() + src->number_of_entries();
368 if (s->unused()) // entry free
370 if (s->data()->depth == Depth_end)
379 assert (dst->_count == src->_count);
380 assert (d < dst->mappings() + dst->number_of_entries());
382 d->data()->depth = Depth_end;
383 dst->mappings()[dst->number_of_entries() - 1].data()->depth = Depth_end;
384 } // copy_compact_tree()
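// A small worked example of the compression above (comment only): a
// tree encoded as
//
//   depth: [ 0, 1, Depth_empty, 2, Depth_empty, 1, Depth_end, ... ]
//
// with _count == 4 and _empty_count == 2 is copied (possibly in place)
// to
//
//   depth: [ 0, 1, 2, 1, Depth_end, ..., Depth_end ]
//
// The valid entries keep their relative order and depths, so the tree
// structure is preserved; _empty_count becomes 0, and both the slot
// after the last copied mapping and the final array slot carry the end
// tag.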
391 #include "kmem_slab.h"
393 struct physframe_data {
394 mapping_tree_t *tree;
401 vm_offset_t page_number = kmem::info()->main_memory.high / PAGE_SIZE + 1;
403 // allocate physframe array
404 check ( physframe = reinterpret_cast<physframe_data *>
405 (kmem::alloc(page_number * sizeof(physframe_data))) );
407 memset(physframe, 0, page_number * sizeof(physframe_data));
409 // create a slab for each mapping tree size
410 for (int slab_number = 0;
411 slab_number <= Size_id_max;
415 (Size_factor << slab_number) * sizeof(mapping_t)
416 + sizeof(mapping_tree_t);
418 allocator_for_treesize[slab_number] =
419 new kmem_slab_t(((PAGE_SIZE / elem_size) < 40
420 ? 8*PAGE_SIZE : PAGE_SIZE),
424 // create a sigma0 mapping for each physical page
425 for (unsigned page_id = 0; page_id < page_number; page_id++)
428 check( (t = physframe[page_id].tree
429 = reinterpret_cast<mapping_tree_t*>
430 (allocator_for_treesize[0]->alloc())) );
432 t->_count = 1; // 1 valid mapping
433 t->_size_id = 0; // size is equal to Size_factor << 0
434 t->_empty_count = 0; // no gaps in tree representation
436 t->mappings()[0].data()->depth = Depth_sigma0_mapping;
437 t->mappings()[0].data()->address = page_id;
438 t->mappings()[0].data()->space = config::sigma0_taskno;
439 t->mappings()[0].data()->size = 0;
441 t->mappings()[1].data()->depth = Depth_end;
443 // We also always set the end tag on the last entry so that we can
444 // check whether it has been overwritten.
445 t->mappings()[t->number_of_entries() - 1].data()->depth = Depth_end;
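// A back-of-the-envelope example for the slab sizes chosen above
// (comment only; the exact numbers depend on the struct layouts): if
// mapping_t is 8 bytes (see the "make this 64 bits" comment) and the
// mapping_tree_t header is 4 bytes (the "make this 32 bits" comment),
// then
//
//   slab_number 0:  4 * 8 + 4    =   36 bytes -> >= 40 fit in a page,
//                                                so a 1-page slab
//   slab_number 8:  1024 * 8 + 4 = 8196 bytes -> fewer than 40 fit,
//                                                so an 8-page slab
//
// i.e. the "< 40 elements per page" test simply keeps the number of
// trees per slab reasonable for every tree size.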
450 // Insert a new mapping entry with the given values as a child of
451 // "parent". After locating the right place for the new entry, it will
452 // be stored there (if this place is empty) or the following entries
453 // will be moved along by one entry.
455 // We assume that there is at least one free entry at the end of the
456 // array so that at least one insert() operation can succeed between a
457 // lock()/free() pair of calls. This is guaranteed by the free()
458 // operation which allocates a larger tree if the current one becomes
461 mapdb_t::insert(mapping_t *parent,
467 assert(type == Map_mem); // we don't yet support Map_io
469 mapping_tree_t *t = parent->tree();
471 // We cannot continue if the last array entry is not free. This
472 // only happens if an earlier call to free() with this mapping tree
473 // couldn't allocate a bigger array. In this case, signal an
474 // out-of-memory condition.
475 if (! t->mappings()[t->number_of_entries() - 1].unused())
478 // If the parent mapping already has the maximum depth, we cannot
480 if (parent->data()->depth == Depth_max)
483 mapping_t *insert = 0;
486 found_free_entry = false,
487 need_to_move = false;
491 // Find a free entry in the array encoding the tree, and find the
492 // insertion point for the new entry. These two entries might not
493 // be equivalent, so we may need to move entries backwards in the
494 // array. In this implementation, we move entries as we traverse
495 // the array, instead of doing one big memmove.
496 for (mapping_t *m = parent + 1;
497 m < t->mappings() + t->number_of_entries();
502 // We found a free entry in the tree -- allocate it.
503 found_free_entry = true;
506 // Did we find an empty element != the end tag?
507 if (m->data()->depth == Depth_empty)
509 t->_empty_count -= 1;
511 // Else we have found the end tag. If there is another
512 // array entry left, apply a new end tag to the array
513 else if (m + 1 < t->mappings() + t->number_of_entries())
515 (m + 1)->data()->depth = Depth_end;
518 // if we haven't identified a place for inserting the new
519 // element, this is it.
523 else if (! need_to_move
524 && m->data()->depth <= parent->data()->depth)
526 // we found a non-descendant of ourselves -- need to make
527 // space for new child
537 // replace *m with temp (which used to be *(m - 1); but
538 // *(m - 1) has since been overwritten), and load temp with
539 // the old value of *m
547 if (found_free_entry)
551 assert(insert && found_free_entry);
553 // found a place to insert new child.
554 insert->data()->depth = mapping_depth_t(parent->data()->depth + 1);
555 insert->data()->address = va >> PAGE_SHIFT;
556 insert->data()->space = space->space();
557 insert->data()->size = (size == SUPERPAGE_SIZE);
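// A small worked example of the array manipulation above (comment
// only): inserting a new child of the element at index 1 into
//
//   depth: [ 0, 1, 2, 1, Depth_end, ..., Depth_end ]
//
// scans forward from index 2. Index 2 (depth 2) is a descendant of the
// parent and is skipped; index 3 (depth 1 <= the parent's depth) is
// not, so entries are shifted along from there until the free end-tag
// slot is reached, giving
//
//   depth: [ 0, 1, 2, 2, 1, Depth_end, ... ]
//
// with the new child at index 3 (depth 2), i.e. appended after the
// parent's existing descendants.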
563 mapdb_t::lookup(space_t *space,
567 vm_offset_t phys = space->virt_to_phys(va);
569 if (phys == 0xffffffff)
574 // get and lock the tree.
575 // XXX For now, use a simple lock with helping. Later
576 // unify locking with our request scheme.
577 physframe[phys >> PAGE_SHIFT].lock.lock();
579 t = physframe[phys >> PAGE_SHIFT].tree;
584 for (m = t->mappings();
585 m->data()->depth != Depth_end
586 && m < t->mappings() + t->number_of_entries();
589 if (m->data()->space == space->space()
590 && m->data()->address == va >> PAGE_SHIFT)
597 // not found -- unlock tree
598 physframe[phys >> PAGE_SHIFT].lock.clear();
604 mapdb_t::free(mapping_t* mapping_of_tree)
606 mapping_tree_t *t = mapping_of_tree->tree();
608 // We assume that the zeroth mapping of the tree is a sigma0
609 // mapping, that is, its virtual address == the page's physical
611 vm_offset_t phys_pno = t->mappings()[0].data()->address;
613 // We are the owner of the tree lock.
614 assert(physframe[phys_pno].lock.lock_owner() == current());
616 // Before we unlock the tree, we need to make sure that there is
617 // room for at least one new mapping. In particular, this means
618 // that the last entry of the array encoding the tree must be free.
620 // (1) When we use up less than a quarter of all entries of the
621 // array encoding the tree, copy to a smaller tree. Otherwise, (2)
622 // if the last entry is free, do nothing. Otherwise, (3) if less
623 // than 3/4 of the entries are used, compress the tree. Otherwise,
624 // (4) copy to a larger tree.
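// Worked numbers for a 16-entry tree (comment only): (1) shrink to an
// 8-entry tree if fewer than 16/4 = 4 entries are in use; otherwise,
// if the last slot is not already free, (3) compress in place if fewer
// than 16/4 + 16/2 = 12 entries are in use (or if no larger size
// exists), and (4) otherwise copy into a 32-entry tree.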
626 bool maybe_out_of_memory = false;
628 do // (this is not actually a loop, just a block we can "break" out of)
630 // (1) Do we need to allocate a smaller tree?
631 if (t->_size_id > 0 // must not be smallest size
632 && (t->_count << 2) < t->number_of_entries())
634 mapping_tree_t *new_t =
635 reinterpret_cast<mapping_tree_t *>
636 (allocator_for_treesize[t->_size_id - 1]->alloc());
640 // XXX should be asserted by allocator:
641 new_t->_size_id = t->_size_id - 1;
642 new_t->mappings()[new_t->number_of_entries() - 1].data()->depth
645 mapping_tree_t::copy_compact_tree(new_t, t);
647 // Register new tree.
648 physframe[phys_pno].tree = new_t;
650 allocator_for_treesize[t->_size_id]->free(t);
657 // (2) Is the last entry free?
658 if (t->mappings()[t->number_of_entries() - 1].unused())
659 break; // OK, last entry is free.
661 // Last entry is not free -- either compress current array
662 // (i.e., move free entries to end of array), or allocate bigger
665 // (3) Should we compress the tree?
666 // We also try to compress if we cannot allocate a bigger
667 // tree because there is no bigger tree size.
668 if (t->_count < (t->number_of_entries() >> 2)
669 + (t->number_of_entries() >> 1)
670 || t->_size_id == Size_id_max) // cannot enlarge?
672 if (t->_size_id == Size_id_max)
673 maybe_out_of_memory = true;
675 mapping_tree_t::copy_compact_tree(t, t); // in-place compression
680 // (4) OK, allocate a bigger array.
682 mapping_tree_t *new_t =
683 reinterpret_cast<mapping_tree_t*>
684 (allocator_for_treesize[t->_size_id + 1]->alloc());
688 // XXX should be asserted by allocator:
689 new_t->_size_id = t->_size_id + 1;
690 new_t->mappings()[new_t->number_of_entries() - 1].data()->depth
693 mapping_tree_t::copy_compact_tree(new_t, t);
695 // Register new tree.
696 physframe[phys_pno].tree = new_t;
698 allocator_for_treesize[t->_size_id]->free(t);
703 // out of memory -- just do tree compression and hope that helps.
704 maybe_out_of_memory = true;
706 mapping_tree_t::copy_compact_tree(t, t); // in-place compression
711 // The last entry of the tree should now be free -- except if we're
713 assert(t->mappings()[t->number_of_entries() - 1].unused()
714 || maybe_out_of_memory);
717 physframe[phys_pno].lock.clear();
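// Putting the pieces together: a user of this class (ipc_map.cc) is
// expected to bracket its tree manipulations with lookup(), which
// takes the tree lock, and free(), which makes room for at least one
// further insert and releases the lock. Roughly (illustrative sketch
// only -- variable names invented, parameter lists abbreviated):
//
//   mapping_t *parent = mapdb->lookup(from_space, from_va); // locks tree
//   if (parent)
//     {
//       mapping_t *child = mapdb->insert(parent, to_space, to_va,
//                                        PAGE_SIZE, Map_mem);
//       // child == 0 would mean insert() ran out of memory
//       mapdb->free(parent);                                 // unlocks tree
//     }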
720 // Delete mappings from a tree. This is easy to do: We just have to
721 // iterate over the array encoding the tree.
723 mapdb_t::flush(mapping_t *m, bool me_too)
725 mapping_tree_t *t = m->tree();
726 mapping_t *start_of_deletions = m;
727 unsigned m_depth = m->data()->depth;
728 unsigned deleted = 0, empty_elems_passed = 0;
732 m->data()->depth = Depth_empty;
737 start_of_deletions++;
742 m < t->mappings() + t->number_of_entries()
743 && m->data()->depth != Depth_end;
746 if (unsigned (m->data()->depth) <= m_depth)
748 // Found another data element -- stop deleting. Since we
749 // created holes in the tree representation, account for it.
750 t->_empty_count += deleted;
755 if (m->data()->depth == Depth_empty)
757 empty_elems_passed++;
761 // Delete the element.
762 m->data()->depth = Depth_empty;
767 // We deleted stuff at the end of the array -- move end tag
768 if (start_of_deletions < t->mappings() + t->number_of_entries())
770 start_of_deletions->data()->depth = Depth_end;
772 // also, reduce number of free entries
773 t->_empty_count -= empty_elems_passed;
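// A small worked example for flush() (comment only): flushing the
// element at index 1 (depth 1) with me_too == true from
//
//   depth: [ 0, 1, 2, 2, 1, Depth_end, ... ]
//
// marks indices 1 through 3 empty and stops at index 4, whose depth is
// not greater than the flushed element's, yielding
//
//   depth: [ 0, Depth_empty, Depth_empty, Depth_empty, 1, Depth_end, ... ]
//
// with three entries deleted and _empty_count increased by 3. (A later
// free() on this tree may then compress or shrink it again.)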
780 mapdb_t::grant(mapping_t *m, space_t *new_space, vm_offset_t va)
782 m->data()->space = new_space->space();
783 m->data()->address = va >> PAGE_SHIFT;
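// In other words, grant() hands an existing mapping (and, implicitly,
// its whole subtree, which stays where it is) over to a new owner
// simply by rewriting the owner fields in place. A caller would use it
// under the same lock discipline as above (illustrative sketch only):
//
//   mapping_t *m = mapdb->lookup(old_space, old_va);  // locks tree
//   if (m)
//     {
//       mapdb->grant(m, new_space, new_va);
//       mapdb->free(m);                                // unlocks tree
//     }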