6 * Definitions of applications, instances
8 * (c) 2011-2013 Björn Döbel <doebel@os.inf.tu-dresden.de>,
9 * economic rights: Technische Universität Dresden (Germany)
10 * This file is part of TUD:OS and distributed under the terms of the
11 * GNU General Public License 2.
12 * Please see the COPYING-GPL-2 file for details.
22 #include <semaphore.h>
23 #include <pthread-l4.h>
26 #include <l4/sys/types.h>
27 #include <l4/sys/utcb.h>
28 #include <l4/sys/factory>
29 #include <l4/sys/thread>
30 #include <l4/sys/task>
31 #include <l4/sys/scheduler>
32 #include <l4/sys/segment.h>
33 #include <l4/sys/debugger.h>
35 #include <l4/vcpu/vcpu>
36 #include <l4/plr/measurements.h>
37 #include <l4/util/util.h>
38 #include <l4/util/bitops.h>
40 #include <l4/re/error_helper>
41 #include <l4/re/util/cap_alloc>
42 #include <l4/re/util/kumem_alloc>
46 #include "constants.h"
// vCPU exception-handler entry point with an unmangled (extern "C") symbol
// name. NOTE(review): defined outside this file — presumably installed as
// the handler IP for newly created vCPUs; confirm against the setup code.
52 extern "C" void my_handler(void);
59 * Instance of an application
61 * Every instance of the app is run within a dedicated vCPU address space.
62 * Instances are created depending on the amount of redundancy/checking
67 // XXX: For multithreading, we might consider having a vCPU task for
68 // every thread of the app -> see papers on deterministic multithreading
// Kernel task object (address space) in which this replica executes.
69 L4::Cap<L4::Task> _vcpu_task;
76 * Map of addr -> addr mappings.
78 * This is a dirty trick keeping track of all pages in the master AS
79 * that are mapped to the replica AS. We need it, because the usual sequence
80 * for using a dataspace is:
88 * And in the last unmap(), we cannot consult the region map for
89 * this mapping anymore.
91 * XXX: Real fix would be to slightly adapt the region map for our
92 * purposes, because otherwise we are storing *a lot* of
93 * page-to-page mappings here.
// Key: page address in the replica AS -> value: backing page in master AS.
95 std::map<l4_addr_t, l4_addr_t> _mappings;
// Buffer size (incl. NUL) for the per-replica debugger name, see ctor.
97 enum { debug_name_size = 16 };
// Create a replica instance: allocate a task capability, create the kernel
// task object for the replica's address space, and register a readable
// name ("V<id> <name>") with the kernel debugger.
//
// name        debug name of the application
// instanceID  numeric ID of this replica
100 explicit App_instance(char const *name = "", l4_umword_t const instanceID = 0)
104 * Create instance vCPU
// chkcap/chksys turn allocation / system-call failures into errors.
106 _vcpu_task = chkcap(L4Re::Util::cap_alloc.alloc<L4::Task>(),
108 chksys(L4Re::Env::env()->factory()->create_task(_vcpu_task,
114 * Every replica gets a name set as the debug ID
116 char namebuf[debug_name_size];
// NOTE(review): _id is an l4_umword_t (unsigned); "%ld" expects a signed
// long — "%lu" would match the type. Harmless on ia32, but worth fixing.
117 snprintf(namebuf, debug_name_size, "V%ld %s", _id, name);
118 l4_debugger_set_object_name(_vcpu_task.cap(), namebuf);
121 L4::Cap<L4::Task> vcpu_task() const { return _vcpu_task; }
122 l4_umword_t id() const { return _id; }
125 * Map a flexpage in an aligned way.
127 * Current impl.: simply map the page as we indirectly assume that
128 * we are always called for a single page.
130 * Future: this should align the local and remote targets and use the
131 * largest possible mapping so that we can avoid a couple
132 * of page faults if possible. XXX
// local   address of the page(s) in the master address space
// remote  target address in the replica address space
// shift   log2 size of the mapping (L4_PAGESHIFT for a single page)
// flags   fpage rights bits handed to l4_fpage()
134 void map_aligned(l4_addr_t local, l4_addr_t remote, l4_umword_t shift, l4_umword_t flags)
136 //DEBUG() << "map_aligned(" << std::hex << local << ", " << remote
137 // << ", " << shift << ", " << flags << ")";
138 l4_fpage_t fp = l4_fpage(local, shift, flags);
139 //DEBUG() << "fp: " << fp.raw;
// Map from the master task (L4Re::This_task) into the replica's task.
140 l4_msgtag_t tag = vcpu_task()->map(L4Re::This_task, fp, remote);
141 _check(l4_msgtag_has_error(tag), "error mapping page");
142 //DEBUG() << "mapped " << std::hex << fp.raw << " : " << std::hex << tag.raw;
// Record every page-sized remote->local pair so unmap() can later find
// the master address backing a replica page (see _mappings above).
// NOTE(review): assumes shift >= L4_PAGESHIFT — a smaller value would
// underflow the loop bound; confirm callers never pass less.
143 for (l4_umword_t offs = 0; offs < (L4_PAGESIZE << (shift - L4_PAGESHIFT));
144 offs += L4_PAGESIZE) {
145 _mappings[remote + offs] = local + offs;
151 * Unmap a flexpage from replica
// fpraw  raw flexpage value describing the replica page to unmap
153 void unmap(l4_umword_t fpraw)
158 remote = l4_fpage_page(fp) << L4_PAGESHIFT;
// Look up the master-side address that was mapped to this replica page.
160 l4_addr_t a = _mappings[remote];
161 DEBUG() << "unmap @ " << std::hex << remote << " -> " << "0x" << a;
// NOTE(review): l4_fpage()'s second parameter is a log2 size; passing
// L4_PAGESIZE (4096) instead of L4_PAGESHIFT (12) looks wrong — confirm
// against the l4_fpage() API documentation.
162 vcpu_task()->unmap(l4_fpage(a, L4_PAGESIZE, L4_FPAGE_RO), L4_FP_ALL_SPACES);
// The entry is zeroed rather than erased, so the map never shrinks
// (see the XXX note at _mappings).
163 _mappings[remote] = 0;
164 //enter_kdebug("unmapped");
169 * Representation of an application-level thread
171 * In fact, a vCPU is used for every such thread. This class also includes
172 * the stacks needed for setting up the thread and later on running the
173 * VCPU exception handlers.
178 l4_addr_t _handler_fn; // pointer to exception handler code
179 l4_addr_t _thread_fn; // pointer to initial startup code
181 /* Handler stack layout:
183 * +-------------------------------+ _handler_stack + sizeof(_handler_stack)
184 * | Instance Mgr pointer |
185 * | App_instance pointer |
186 * | App_thread pointer |
187 * | Thread group pointer |
188 * | App_model pointer |
189 * +-------------------------------+ _initial stack ptr
190 * | handler entry ebp |
192 * +-------------------------------+ _handler_stack
// Base of the dedicated exception-handler stack (page-aligned, see ctor).
194 char *_handler_stack;
// Current stack pointers: handler stack and the thread's initial stack.
196 l4_addr_t _handler_sp;
197 l4_addr_t _thread_sp;
200 L4::Cap<L4::Thread> _vcpu_cap; // cap for vcpu object
201 L4vcpu::Vcpu *_vcpu; // vcpu state area
202 l4_utcb_t *_vcpu_utcb; // vcpu UTCB
203 pthread_t _pthread; // pthread backing this VCPU
204 l4_addr_t _remote_utcb; // address of remote UTCB
207 * Master segment registers. Restored whenever we enter the
208 * master through a VCPU fault.
210 l4_umword_t _master_ds;
211 l4_umword_t _master_fs;
212 l4_umword_t _master_gs;
214 l4_umword_t _pending_trap; // for injecting HW traps
215 l4_umword_t _events; // keeping track of handle events
217 /* Set if we detected a page fault that could not be handled.
218 * Thereby, the PF handler can then bail out if this fault is
// Bit flag stored in _events (see set_unhandled_pf()/unhandled_pf()).
220 Unhandled_Page_Fault = 1,
// Raw x86 GDT segment descriptor (standard hardware layout; packed so the
// field offsets match what the CPU expects).
223 struct gdt_entry_struct
225 l4_uint16_t limit_low; // The lower 16 bits of the limit.
226 l4_uint16_t base_low; // The lower 16 bits of the base.
227 l4_uint8_t base_middle; // The next 8 bits of the base.
228 l4_uint8_t access; // Access flags, determine what ring this segment can be used in.
229 l4_uint8_t granularity; // Flag bits plus the upper 4 bits of the limit.
230 l4_uint8_t base_high; // The last 8 bits of the base.
231 } __attribute__((packed))
233 bool _gdt_modified; // track if GDT was modified
237 * Watchdog: set on creation and defined in config file
240 int _watchdog_timeout;
243 * Watchdog: interrupt object set on vcpu startup
245 L4::Cap<L4::Irq> _watchdog_irq;
248 * Watchdog: use single-stepping for synchronization
// Counter managed via *_watchdog_ss_count() accessors below.
251 unsigned _watchdog_ss_count;
254 * Watchdog: use breakpoints for synchronization
256 bool _watchdog_breakpointing;
257 Breakpoint *_watchdog_breakpoint; // breakpoint used while synchronizing
260 * Watchdog: am I the replica that passed the watchdog interrupt
263 bool _watchdog_passed;
// Further watchdog bookkeeping, manipulated through the accessors below:
266 bool _got_other_trap; // a non-watchdog trap was observed
267 bool _watchdog_suspended; // watchdog handling temporarily suspended
268 bool _watchdog_met_leader; // set via i_have_met_the_leader()
272 * Benchmarking: counters used to determine number and cycles spent in
273 * different parts of the master if BENCHMARKING is set to 1 in
274 * server/src/constants.h
276 * t_* -> accumulate cycles spent
277 * c_* -> count the number of times certain paths were entered
279 unsigned long long t_lock, c_lock; // lock observer
280 unsigned long long t_pfh, c_pfh; // page fault handling
281 unsigned long long t_syscalls, c_syscalls; // syscall observer
282 unsigned long long t_kiptime, c_kiptime; // KIP time observer
283 unsigned long long t_traps, c_traps; // trap handling
285 unsigned long long t_handling; // total handling time
286 unsigned long long t_observer, c_observer; // time in observers
287 unsigned long long t_keepup, c_keepup; // passive replicas: time to keep up with leader
288 unsigned long long t_user; // time in user mode
289 unsigned long long last_user_resume; // timestamp of last ret to user
// Fine-grained breakdown of the replica synchronization path:
291 unsigned long long t_sync_enter_all;
292 unsigned long long t_sync_wait_for_active, c_sync_wait_for_active;
293 unsigned long long t_sync_wait, t_sync_waitforarrival;
294 unsigned long long t_sync_getdata, c_sync_getdata;
295 unsigned long long t_sync_active_validate, c_sync_active_validate;
297 //unsigned long long t_sync;
// Timestamps taken at the edges of the handling path (see the ts_*()
// helpers below):
298 unsigned long long t_sync_enter; // TS before DMR::enter()
299 unsigned long long t_sync_entered; // TS before sleeping / validating replicas
300 unsigned long long t_sync_leave;
302 unsigned long long t_resume_active, c_resume_active;
303 unsigned long long t_resume_passive, c_resume_passive;
304 unsigned long long t_resume_enter;
307 * Tracks if we are the currently active
308 * trap handling replica
313 * Get topmost address of exception handler/thread stacks
315 l4_addr_t top_of_handler_stack() { return (l4_addr_t)(_handler_stack + HANDLER_STACK_SIZE); }
318 * Initialize handler and init thread stacks.
320 * This ensures that the handler stack is paged in correctly before we
321 * do anything. Otherwise the handler might raise a page fault upon
328 * Create the vCPU kernel object
// Defined out of line.
330 void alloc_vcpu_cap();
334 * Alloc and setup vCPU UTCB
336 * The setup code stores a pointer to this App_thread object on
337 * the handler's stack so that it can be found upon an exception.
// Defined out of line.
339 void alloc_vcpu_mem();
// Copy construction is not meaningful for a vCPU thread; this copy ctor
// (presumably declared in a private section) blocks external copying.
// NOTE(review): the empty body copies no members — with C++11 this would
// be `= delete`; confirm it is never actually invoked.
342 App_thread(const App_thread&) { }
// Construct an application thread / vCPU.
//
// eip         initial instruction pointer of the replicated thread
// handler_fn  entry point of the vCPU exception handler
// use_watchdog / watchdog_timeout  watchdog configuration
346 App_thread(l4_addr_t eip,
348 l4_addr_t handler_fn,
350 bool use_watchdog = false,
351 l4_umword_t watchdog_timeout = 0)
353 _handler_fn(handler_fn),
354 _thread_fn(thread_fn),
// 0xFFFFFFFF marks "remote UTCB address not yet known".
358 _remote_utcb(0xFFFFFFFF),
364 _use_watchdog(use_watchdog),
365 _watchdog_timeout(watchdog_timeout),
367 _watchdog_ss_count(0),
368 _watchdog_passed(false),
369 _watchdog_breakpointing(false),
370 _watchdog_suspended(false),
371 _got_watchdog(false),
372 _got_other_trap(false),
373 _watchdog_met_leader(false)
// All benchmark accumulators start at zero.
375 , t_lock(0ULL), c_lock(0ULL), t_pfh(0ULL), c_pfh(0ULL),
376 t_syscalls(0ULL), c_syscalls(0ULL), t_kiptime(0ULL),
377 c_kiptime(0ULL), t_traps(0ULL), c_traps(0ULL), t_handling(0ULL),
378 t_observer(0ULL), c_observer(0ULL),
379 t_keepup(0ULL), c_keepup(0ULL),
380 t_user(0ULL), last_user_resume(0ULL),
381 t_sync_enter_all(0ULL), t_sync_wait_for_active(0ULL), c_sync_wait_for_active(0ULL),
382 t_sync_wait(0ULL), t_sync_waitforarrival(0ULL),
383 t_sync_getdata(0ULL), c_sync_getdata(0ULL),
384 t_sync_active_validate(0ULL), c_sync_active_validate(0ULL),
385 t_resume_active(0ULL), c_resume_active(0ULL),
386 t_resume_passive(0ULL), c_resume_passive(0ULL),
// Page-aligned, dedicated stack for the exception handler.
397 _handler_stack = (char*)memalign(L4_PAGESIZE, HANDLER_STACK_SIZE);
398 _handler_sp = top_of_handler_stack();
399 DEBUG() << "HANDLER STACK: " << (void*)_handler_stack;
// NOTE(review): the NULL check runs only after _handler_sp was already
// derived from the (possibly NULL) allocation above — consider checking
// the memalign() result first.
400 _check(!_handler_stack, "could not allocate handler stack");
// Client GDT starts out all-zero until setup_utcb_segdesc() fills it.
406 memset(gdt(), 0, gdt_size());
408 DEBUG() << "vCPU cap: " << std::hex << vcpu_cap();
410 DEBUG() << "STACK: " << std::hex << (void*)esp;
// Seed the vCPU user state with the thread's entry point and stack.
411 vcpu()->r()->ip = eip;
412 vcpu()->r()->sp = esp;
413 DEBUG() << "EIP " << (void*)eip << " ESP " << (void*)esp;
417 void use_watchdog(bool u) { _use_watchdog = u; }
418 bool use_watchdog() { return _use_watchdog; }
419 void watchdog_timeout(l4_umword_t p) { _watchdog_timeout = p; }
420 l4_umword_t watchdog_timeout() { return _watchdog_timeout; }
422 void watchdog_ss(bool ss) { _watchdog_ss = ss; }
423 bool watchdog_ss() { return _watchdog_ss; }
424 unsigned watchdog_ss_count() { return _watchdog_ss_count; }
425 void increment_watchdog_ss_count() { ++_watchdog_ss_count; }
426 void reset_watchdog_ss_count() { _watchdog_ss_count = 0; }
428 void its_me_who_passed_the_watchdog(bool p) { _watchdog_passed = p; }
429 bool its_me_who_passed_the_watchdog() { return _watchdog_passed; }
431 void watchdog_irq(L4::Cap<L4::Irq> irq) { _watchdog_irq = irq; }
432 L4::Cap<L4::Irq> watchdog_irq() { return _watchdog_irq; }
434 void watchdog_breakpoint(Breakpoint *b) { _watchdog_breakpoint = b; }
435 Breakpoint *watchdog_breakpoint() { return _watchdog_breakpoint; }
437 void watchdog_breakpointing(bool b) { _watchdog_breakpointing = b; }
438 bool watchdog_breakpointing() { return _watchdog_breakpointing; }
440 void got_watchdog(bool w) { _got_watchdog = w; }
441 bool got_watchdog() { return _got_watchdog; }
443 void got_other_trap(bool t) { _got_other_trap = t; }
444 bool got_other_trap() { return _got_other_trap; }
446 void watchdog_suspended(bool s) { _watchdog_suspended = s; }
447 bool watchdog_suspended() { return _watchdog_suspended; }
449 void i_have_met_the_leader(bool m) { _watchdog_met_leader = m; }
450 bool i_have_met_the_leader() { return _watchdog_met_leader; }
453 bool is_active() { return active_handler; }
454 void activate() { active_handler = true; }
455 void deactivate() { active_handler = false; }
// count_*(): accumulate cycles (t_*) and event counts (c_*) for the
// individual master code paths; results are printed by the stats dump.
457 void count_lock(unsigned long long increment)
460 t_lock += increment; c_lock++;
464 void count_pfh(unsigned long long increment)
467 t_pfh += increment; c_pfh++;
470 void count_syscalls(unsigned long long increment)
473 t_syscalls += increment; c_syscalls++;
477 void count_kiptime(unsigned long long increment)
480 t_kiptime += increment; c_kiptime++;
484 void count_traps(unsigned long long increment)
487 t_traps += increment; c_traps++;
// Total handling time has no event counter of its own.
491 void count_handling(unsigned long long increment)
494 t_handling += increment;
// ts_*(): take rdtsc timestamps at the edges of the fault-handling path
// and accumulate the intervals into the benchmark counters above.
498 void ts_from_user() {
500 t_sync_enter = l4_rdtsc();
// Time since the last resume was spent in user mode.
501 t_user += (t_sync_enter - last_user_resume);
505 void ts_sync_entered() {
507 t_sync_entered = l4_rdtsc();
508 t_sync_enter_all += (t_sync_entered - t_sync_enter);
512 void ts_sync_leave() {
514 t_sync_leave = l4_rdtsc();
// NOTE(review): both the active (validate) and passive (wait) counters
// are fed from the same interval here — presumably an is_active() branch
// selects one of the two paths in the full source; confirm.
516 t_sync_active_validate += (t_sync_leave - t_sync_enter);
517 ++c_sync_active_validate;
519 t_sync_wait_for_active += (t_sync_leave - t_sync_enter);
520 ++c_sync_wait_for_active;
525 void ts_resume_start() {
527 t_resume_enter = l4_rdtsc();
529 t_observer += (t_resume_enter - t_sync_leave);
532 t_keepup += (t_resume_enter - t_sync_leave);
538 void ts_user_resume(bool first = false) {
540 last_user_resume = l4_rdtsc();
542 // the first call is only to set the resume TS, don't count
548 t_resume_active += (last_user_resume - t_resume_enter);
551 t_resume_passive += (last_user_resume - t_resume_enter);
554 deactivate(); // simply do this any time we resume
// inc_*(): accumulate externally measured wait intervals.
558 void inc_wait(unsigned long long increment)
561 t_sync_wait += increment;
565 void inc_waitleader(unsigned long long inc)
568 t_sync_waitforarrival += inc;
573 void inc_getdata(unsigned long long increment)
577 t_sync_getdata += increment;
// Print one benchmark result line: left-aligned label, right-aligned cycle
// count, and — when withCount is set — the event count in brackets.
582 void print_helper(char const *msg, unsigned long long time,
583 unsigned long long count = 0, bool withCount = false)
586 INFO() << std::left << std::setw(32) << msg
587 << " : " << std::right << std::setw(16) << time
588 << " [ " << std::setw(10) << count << " ]";
590 INFO() << std::left << std::setw(32) << msg
591 << " : " << std::right << std::setw(16) << time;
// Dump all accumulated benchmark counters as a human-readable table
// (indentation reflects the nesting of the measured code paths).
598 print_helper(GREEN "Clocks spent in user " NOCOLOR, t_user);
599 print_helper(GREEN "Clocks spent in master " NOCOLOR, t_handling);
600 print_helper(YELLOW " synchronization " NOCOLOR, t_sync_enter_all + t_sync_active_validate + t_sync_wait_for_active);
601 print_helper( " enter sync ", t_sync_enter_all);
602 print_helper( " active: check ", t_sync_active_validate, c_sync_active_validate, true);
603 print_helper( " passive: wait ", t_sync_wait_for_active, c_sync_wait_for_active, true);
604 print_helper( " (early wait) ", t_sync_waitforarrival);
605 print_helper( " (total wait) ", t_sync_wait);
606 print_helper( " (get data) ", t_sync_getdata, c_sync_getdata, true);
607 print_helper(YELLOW " observers " NOCOLOR, t_observer, c_observer, true);
608 print_helper( " PFH ", t_pfh, c_pfh, true);
609 print_helper( " Locking ", t_lock, c_lock, true);
610 print_helper( " Syscalls ", t_syscalls, c_syscalls, true);
611 print_helper( " gettime() ", t_kiptime, c_kiptime, true);
612 print_helper( " CPU Traps ", t_traps, c_traps, true);
613 print_helper(YELLOW " keepup with leader " NOCOLOR, t_keepup, c_keepup, true);
614 print_helper(YELLOW " resume " NOCOLOR, t_resume_active + t_resume_passive);
615 print_helper( " active ", t_resume_active, c_resume_active, true);
616 print_helper( " passive ", t_resume_passive, c_resume_passive, true);
617 INFO() << " ------------------------------------------------------";
622 * Manage fast lookup for the replica's UTCB address
624 void remote_utcb(l4_addr_t a) { _remote_utcb = a; }
625 l4_addr_t remote_utcb() const { return _remote_utcb; }
628 * Start the vCPU thread
// Handler/thread stack pointers and the thread's entry point.
633 l4_addr_t handler_sp() const { return _handler_sp; }
634 void handler_sp(l4_addr_t sp) { _handler_sp = sp; }
636 l4_addr_t thread_sp() const { return _thread_sp; }
637 void thread_sp(l4_addr_t sp) { _thread_sp = sp; }
639 l4_addr_t thread_entry() const { return _thread_fn; }
// CPU placement and vCPU kernel-object accessors.
641 l4_umword_t cpu() const { return _cpu; }
642 void cpu(l4_umword_t c) { _cpu = c; }
643 L4::Cap<L4::Thread> vcpu_cap() const { return _vcpu_cap; }
644 void vcpu_cap(L4::Cap<L4::Thread> c) { _vcpu_cap = c; }
645 L4vcpu::Vcpu *vcpu() const { return _vcpu; }
646 l4_utcb_t *vcpu_utcb() const { return _vcpu_utcb; }
// Master segment register values restored when entering the master.
648 l4_umword_t ds() const { return _master_ds; }
649 l4_umword_t fs() const { return _master_fs; }
650 l4_umword_t gs() const { return _master_gs; }
651 // void gs(l4_addr_t a) { _master_gs = a; }
// Raw pointer to / byte size of the client GDT entries.
655 return (void*)&_client_gdt[0];
657 l4_umword_t gdt_size() const { return sizeof(_client_gdt); }
659 /***********************************************************************
660 * GDT Handling Explained
662 * Fiasco uses the FS register to store the current UTCB address,
663 * libpthread uses GS for providing thread-local storage. Both require
664 * a valid entry in the GDT, which user space can access through the
665 * fiasco_gdt_set() system call. Furthermore, Fiasco provides a range
666 * of user-defined segment entries at offsets 0x48, 0x50, and 0x58.
668 * By default, the GDT entry for the UTCB address is 0x40. As Romain
669 * uses pthreads, the first user-defined segment is used for Romain's
672 * Replicas use user-defined entries 2 and 3:
673 * - Entry 2 (0x50) contains the replica's UTCB address.
674 * - Entry 3 (0x58) can later be set up for thread-local storage.
676 * This means there are no free user-defined GDT entries anymore! If we
677 * wanted to fix this, we'd have to manually swap GDT entries every
678 * time we switch between replicas and master. This would require two
679 * additional system calls for modifying the GDT.
680 ***********************************************************************/
683 * Set up the initial GDT segment (e.g., UTCB address)
// base/limit describe the segment the replica's UTCB register points at.
686 void setup_utcb_segdesc(l4_addr_t base, l4_addr_t limit)
688 DEBUG() << "Base " << std::hex << base
689 << " Limit " << limit;
690 memset(_client_gdt, 0, sizeof(_client_gdt));
// Standard x86 descriptor encoding: split base and limit into the
// descriptor's bit fields.
692 _client_gdt[0].limit_low = limit & 0xFFFF;
693 _client_gdt[0].base_low = base & 0xFFFF;
694 _client_gdt[0].base_middle = (base >> 16) & 0xFF;
695 _client_gdt[0].base_high = (base >> 24) & 0xFF;
// 0xF2 = present, DPL 3, writable data segment.
696 _client_gdt[0].access = 0xF2;
// 0x40 = 32-bit segment, byte granularity, limit bits 16-19 zero.
697 _client_gdt[0].granularity = 0x40;
// Mark dirty so the entry gets committed to the kernel before resuming.
699 _gdt_modified = true;
// True if a client GDT entry was modified since the last commit.
703 bool gdt_changed() { return _gdt_modified; }
707 * Write the second entry, actually.
// src/bytes: raw descriptor data copied verbatim into entry 1.
710 void write_gdt_entry(l4_umword_t *src, l4_umword_t bytes)
712 memcpy(&_client_gdt[1], src, bytes);
713 _gdt_modified = true;
718 * Write the user GDT entries
// Defined out of line.
720 void commit_client_gdt();
723 * Schedule a "virtual" trap
725 * The whole thing is used to mark pending events for future
726 * invocations of some fault observers. These events currently
729 * - unhandled page fault
// Mark trap number `no` pending by setting its bit in _pending_trap.
731 void set_pending_trap(l4_umword_t no) { _pending_trap |= (1 << no); }
// Record an unhandled page fault: set the event flag and schedule a
// virtual #PF (trap 0xE) for a later handler invocation.
733 void set_unhandled_pf()
735 _events |= Unhandled_Page_Fault;
736 set_pending_trap(0xE);
739 void unset_unhandled_pf() { _events &= ~Unhandled_Page_Fault; }
740 bool unhandled_pf() { return _events & Unhandled_Page_Fault; }
742 l4_umword_t events_pending() { return _events; }
745 * Get the next pending trap (and remove it from pending list)
747 l4_umword_t get_pending_trap()
// NOTE(review): check what unit l4util_find_first_set_bit() expects for
// its size argument — the bounds test below compares against
// sizeof * 8 (bits) while the call passes sizeof (bytes); confirm
// against the l4util bitops documentation.
749 l4_umword_t ret = l4util_find_first_set_bit(&_pending_trap, sizeof(_pending_trap));
750 if (ret >= sizeof(_pending_trap) * 8) {
// Clear the trap bit we are about to deliver.
753 _pending_trap &= ~(1 << ret);
// Print this vCPU's register state, prefixed with "[VCPU <ptr>]".
760 void print_vcpu_state()
763 snprintf(pref, 32, "[VCPU %p] ", vcpu());
764 vcpu()->print_state(pref);
// Checksum over the vCPU state; defined out of line.
767 l4_umword_t csum_state();
// Halt path: stop the vCPU thread via the scheduler unless we are
// already running on that very thread.
// NOTE(review): run_thread() with l4_sched_param(0) presumably parks the
// thread at the lowest priority — confirm the scheduler semantics.
772 INFO() << " Halting VCPU " << std::hex << vcpu();
773 l4_sched_param_t sp = l4_sched_param(0);
774 if (pthread_l4_cap(pthread_self()) != vcpu_cap().cap()) {
775 chksys(L4Re::Env::env()->scheduler()->run_thread(vcpu_cap(), sp));
// Emulate a function return in the replica: pop the return-address slot
// off the stack and continue execution at `ret`.
780 void return_to(l4_addr_t ret)
782 vcpu()->r()->sp += sizeof(l4_umword_t); // RET: inc. ESP
783 vcpu()->r()->ip = ret; // RET: return addr
791 * Common prolog to be executed upon entry to exception handler function. It
792 * restores this VCPU's ES, DS, FS, and GS registers before continuing
793 * execution in the handler address space.
// NOTE(review): macro body partially elided in this view; the inline-asm
// input operands below feed the thread's saved ds/fs/gs values.
795 #define handler_prolog(app_thread) \
803 "r"((app_thread)->ds()), "r"((app_thread)->fs()), \
804 "r"((app_thread)->gs())); \