4 #include <csignal> // for siginfo_t
24 #include <asm/unistd.h>
26 #include <sys/ptrace.h>
27 #include <sys/types.h>
30 #include "undef_page.h"
32 #include "boot_info.h"
34 #include "config_tcbsize.h"
36 #include "emulation.h"
39 #include "mem_layout.h"
41 #include "processor.h"
45 #include "thread_state.h"
// Read up to sizeof(Mword) bytes of text at 'addr' in the traced host
// process via PTRACE_PEEKTEXT, returning only the low 'n' bytes.
// NOTE(review): this excerpt elides the return type and the declaration
// of 'val' — the full definition is not visible here.
49 Usermode::peek_at_addr (pid_t pid, Address addr, unsigned n)
// If the requested bytes straddle a word boundary, peek at 'addr' directly.
53 if ((addr & sizeof (Mword) - 1) + n > sizeof (Mword))
54 val = ptrace (PTRACE_PEEKTEXT, pid, addr, NULL);
// Otherwise peek the aligned word and shift the wanted bytes down to bit 0.
56 val = ptrace (PTRACE_PEEKTEXT, pid, addr & ~(sizeof (Mword) - 1), NULL) >>
57 CHAR_BIT * (addr & sizeof (Mword) - 1);
// Mask off everything above the requested 'n' bytes.
59 return val & (Mword) -1 >> CHAR_BIT * (sizeof (Mword) - n);
63  * Wait for host process to stop.
64  * @param pid process id to wait for.
65  * @return signal the host process stopped with.
67 PRIVATE static inline NOEXPORT
69 Usermode::wait_for_stop (pid_t pid)
// Block until 'pid' enters a ptrace stop; any other outcome (exit, error,
// wrong pid) is fatal and caught by check().
73 check (waitpid (pid, &status, 0) == pid && WIFSTOPPED (status));
// Report which signal caused the stop.
75 return WSTOPSIG (status);
79  * Set emulated internal processor interrupt state.
80  * @param mask signal mask to modify
81  * @param eflags processor flags register
83 PRIVATE static inline NOEXPORT
85 Usermode::sync_interrupt_state (sigset_t *mask, Mword eflags)
// Mirror the emulated EFLAGS.IF state into the virtual processor state.
87 Proc::ux_set_virtual_processor_state (eflags);
// SIGIO models hardware interrupt delivery: IF set -> unblock SIGIO,
// IF clear -> block SIGIO.
// NOTE(review): some callers pass mask == 0 (see user_exception); a null
// guard is presumably in the elided lines between 87 and 92 — confirm.
92 if (eflags & EFLAGS_IF)
93 sigdelset (mask, SIGIO);
95 sigaddset (mask, SIGIO);
99  * Cancel a native system call in the host.
100  * @param pid process id of the host process.
101  * @param regs register set at the time of the system call.
103 PRIVATE static inline NOEXPORT
105 Usermode::cancel_syscall (pid_t pid, struct user_regs_struct *regs)
// Overwrite the pending syscall number (orig_eax) with -1 so the host
// kernel executes an invalid syscall instead of the real one, then resume
// to the syscall-exit stop.
107 ptrace (PTRACE_POKEUSER, pid, offsetof (struct user, regs.orig_eax), -1);
108 ptrace (PTRACE_SYSCALL, pid, NULL, NULL);
// Restore EAX from the saved syscall number so the task still sees the
// original value rather than the -ENOSYS result.
112 regs->eax = regs->orig_eax;
113 ptrace (PTRACE_POKEUSER, pid, offsetof (struct user, regs.eax), regs->eax);
117  * Read debug register
118  * @param pid process id of the host process.
119  * @param reg number of debug register (0..7)
120  * @param value reference to register value
125 Usermode::read_debug_register (pid_t pid, Mword reg, Mword &value)
// Null-pointer arithmetic computes the offset of u_debugreg[reg] within
// struct user (offsetof-style idiom) for PTRACE_PEEKUSER.
130 int ret = ptrace (PTRACE_PEEKUSER, pid,
131 ((struct user *) 0)->u_debugreg + reg, NULL);
// NOTE(review): errno is never -1; since PEEKUSER returns the peeked data
// (where -1 can be a valid value), the conventional check is to set
// errno = 0 before the call and test 'errno != 0' here — confirm intent.
133 if (ret == -1 && errno == -1)
142  * Write debug register
143  * @param pid process id of the host process.
144  * @param reg number of debug register (0..7)
145  * @param value register value to be written.
150 Usermode::write_debug_register (pid_t pid, Mword reg, Mword value)
// Write 'value' into u_debugreg[reg] of the traced process; the
// null-pointer arithmetic yields the user-area offset for POKEUSER.
155 if (ptrace (PTRACE_POKEUSER, pid,
156 ((struct user *) 0)->u_debugreg + reg, value) == -1)
163  * Set up kernel stack for kernel entry through an interrupt gate.
164  * We are running on the signal stack and modifying the interrupted context
165  * on the signal stack to allow us to return anywhere.
166  * Depending on the value of the code segment (CS) we set up a processor
167  * context of 3 (kernel mode) or 5 (user mode) words on the respective kernel
168  * stack, which is the current stack (kernel mode) or the stack determined by
169  * the Task State Segment (user mode). For some traps we need to put an
170  * additional error code on the stack.
171  * This is precisely what an ia32 processor does in hardware.
172  * @param context Interrupted context on signal stack
173  * @param trap Trap number that caused kernel entry (0xffffffff == shutdown)
174  * @param xss Stack Segment
175  * @param esp Stack Pointer
177  * @param xcs Code Segment
178  * @param eip Instruction Pointer
179  * @param err Error Code
180  * @param cr2 Page Fault Address (if applicable)
184 Usermode::kernel_entry (unsigned _cpu,
185 struct ucontext *context,
// Entry from user mode (CPL 3): build a 5-word frame (EIP,CS,EFL,ESP,SS)
// on the per-CPU kernel stack. Entry from kernel mode: build a 3-word
// frame (EIP,CS,EFL) below the interrupted kernel ESP.
195 Mword *kesp = (xcs & 3) == 3
196 ? (Mword *) Cpu::cpus.cpu(_cpu).kernel_sp() - 5
197 : (Mword *) context->uc_mcontext.gregs[REG_ESP] - 3;
// Diagnostic dump if the computed frame pointer is not inside any TCB.
199 if (!Thread::is_tcb_address((Address)kesp))
201 printf("KERNEL BUG at EIP:%08x ESP:%08x -- PFA:%08lx kesp=%p trap=%lx xcs=%lx @ %p %lx\n",
202 context->uc_mcontext.gregs[REG_EIP],
203 context->uc_mcontext.gregs[REG_ESP],
204 context->uc_mcontext.cr2, kesp, trap, xcs, &Cpu::cpus.cpu(_cpu).kernel_sp(), Cpu::cpus.cpu(_cpu).kernel_sp());
208 // Make sure the kernel stack is sane
209 assert (Thread::is_tcb_address((Address)kesp));
211 // Make sure the kernel stack has enough space
212 if ((Mword) kesp % THREAD_BLOCK_SIZE <= 512)
214 printf("KERNEL BUG: Kernel stack of thread ");
215 printf("DBGID=%lx\n", static_cast<Thread*>(context_of(kesp))->dbg_id());
216 panic(" exceeded (%p, %c). \n"
217 " As a workaround, please make sure that you built \n"
218 " Fiasco-UX with enabled CONTEXT_4K.",
219 kesp, (xcs & 3) == 2 ? 'k' : 'u');
// Saved EFLAGS gets the current virtual IF state merged in; CS bit 0 is
// cleared so the later emulated iret traps (see comment).
231 *(kesp + 2) = efl | (Proc::processor_state() & EFLAGS_IF);
232 *(kesp + 1) = xcs & ~1; // trap on iret
// Traps that carry a hardware error code; page fault additionally latches
// CR2 first, then falls through to the shared error-code handling
// (the push itself is in elided lines — confirm).
238 case 0xe: // Page Fault
239 Emulation::set_page_fault_addr (cr2);
240 case 0x8: // Double Fault
241 case 0xa: // Invalid TSS
242 case 0xb: // Segment Not Present
243 case 0xc: // Stack Fault
244 case 0xd: // General Protection Fault
245 case 0x11: // Alignment Check
// Redirect the interrupted context so that returning from the signal
// handler enters the kernel's IDT handler on the prepared frame,
// with trace/nested-task/resume/VM86 flags stripped.
249 context->uc_mcontext.gregs[REG_ESP] = (Mword) kesp;
250 context->uc_mcontext.gregs[REG_EIP] = Emulation::idt_vector (trap, false);
251 context->uc_mcontext.gregs[REG_EFL] = efl & ~(EFLAGS_TF | EFLAGS_NT | EFLAGS_RF | EFLAGS_VM);
252 sync_interrupt_state (&context->uc_sigmask, efl);
254 // Make sure interrupts are off
255 assert (!Proc::interrupts());
// Map an absolute EIP inside the KIP syscall page to its trap number.
// Returns 0 if EIP is not a valid syscall entry.
258 PRIVATE static inline NOEXPORT
260 Usermode::kip_syscall (Address eip)
// Must be inside the syscall page and 256-byte aligned (each syscall
// entry occupies one 0x100-byte slot).
262 if ((eip & Config::PAGE_MASK) != Mem_layout::Syscalls || eip & 0xff)
// Slot index selects IDT vectors starting at 0x30.
265 Mword trap = 0x30 + (eip - Mem_layout::Syscalls >> 8);
// Vector 0x38 is remapped to 0x39 — reason not visible in this excerpt.
266 if (trap == 0x38) trap = 0x39;
// Only report the trap if a user-reachable IDT vector is installed.
268 return Emulation::idt_vector (trap, true) ? trap : 0;
// Decode an "int imm8" instruction (0xCD imm8) into its trap number.
// Returns 0 if the opcode is not an int instruction or the vector is not
// user-reachable.
271 PRIVATE static inline NOEXPORT
273 Usermode::l4_syscall (Mword opcode)
// Low byte must be the x86 'int' opcode.
275 if (EXPECT_FALSE ((opcode & 0xff) != 0xcd))
// The immediate byte is the interrupt vector.
278 Mword trap = opcode >> 8;
280 return Emulation::idt_vector (trap, true) ? trap : 0;
// Handle an exception raised by the user task: classify it as a KIP
// absolute syscall, an 'int X' L4 syscall, or a native exception whose
// details must be fetched via the trampoline sighandler, then enter the
// kernel through kernel_entry().
283 PRIVATE static inline NOEXPORT NEEDS["thread_state.h"]
285 Usermode::user_exception (unsigned _cpu, pid_t pid, struct ucontext *context,
286 struct user_regs_struct *regs)
288 Mword trap, error = 0, addr = 0;
// Case 1: EIP points at a KIP syscall page entry.
290 if (EXPECT_FALSE ((trap = kip_syscall (regs->eip))))
// Current thread = context at the top of this CPU's kernel stack.
292 Context *t = context_of(((Mword *)Cpu::cpus.cpu(_cpu).kernel_sp()) - 1);
294 /* The alien syscall code in entry-*.S subtracts 2 bytes from the
295  * EIP to put the EIP back on the instruction to reexecute it.
296  * 'int X' and sysenter etc. are 2 byte instructions.
297  * So we add 2 here to have the EIP in the right position afterwards.
299  * Furthermore we leave ESP and EIP (with the adjustment) where they
300  * are so that the syscall can be re-executed.
302  * This is not a problem for native as it does not trap on the
303  * 'call 0xea......' itself there but on the real int/sysenter/etc.
304  * instructions in the syscall page.
306 if (EXPECT_FALSE((t->state() & (Thread_alien | Thread_dis_alien))
// Non-alien path: pop the return address the 'call' pushed so EIP
// continues after the syscall invocation.
311 regs->eip = peek_at_addr (pid, regs->esp, 4);
// Case 2: EIP points at an 'int X' instruction -> L4 syscall trap.
316 else if ((trap = l4_syscall (peek_at_addr (pid, regs->eip, 2))))
// Case 3: native exception — replay it through the in-task trampoline
// signal handler to recover trap number, error code and fault address.
321 struct ucontext *exception_context;
// Install the trampoline sighandler code into the shared page.
323 memcpy ((void *) Mem_layout::kernel_trampoline_page,
324 (void *) &Mem_layout::task_sighandler_start,
325 &Mem_layout::task_sighandler_end -
326 &Mem_layout::task_sighandler_start);
// Deliver SIGSEGV to the task; its handler saves the full context.
328 ptrace (PTRACE_CONT, pid, NULL, SIGSEGV);
332 // See corresponding code in sighandler.S
// Offset 0x100 in the trampoline page holds the page-relative pointer
// to the saved ucontext.
333 exception_context = reinterpret_cast<struct ucontext *>
334 (Mem_layout::kernel_trampoline_page +
335 *reinterpret_cast<Address *>
336 (Mem_layout::kernel_trampoline_page + 0x100));
338 addr = exception_context->uc_mcontext.cr2;
339 trap = exception_context->uc_mcontext.gregs[REG_TRAPNO];
340 error = exception_context->uc_mcontext.gregs[REG_ERR];
// Optional cli/sti emulation: peek the faulting opcode and virtualize
// the interrupt flag instead of reflecting the fault.
345 if (Boot_info::emulate_clisti())
346 switch (peek_at_addr (pid, regs->eip, 1))
// cli: interrupts now owned by Fiasco-UX itself.
349 Pic::set_owner (Boot_info::pid());
351 regs->eflags &= ~EFLAGS_IF;
352 sync_interrupt_state (0, regs->eflags);
353 check(ptrace (PTRACE_SETREGS, pid, NULL, regs));
// sti: hand interrupt ownership to the task.
357 Pic::set_owner (pid);
359 regs->eflags |= EFLAGS_IF;
360 sync_interrupt_state (0, regs->eflags);
361 check(ptrace (PTRACE_SETREGS, pid, NULL, regs));
// Mark the fault as coming from a user-mode address.
367 error |= PF_ERR_USERADDR;
// Enter the kernel with the assembled trap state.
372 kernel_entry (_cpu, context, trap,
375 regs->eflags, /* EFL */
// Dispatch a ptrace stop of the user task: exceptions go to
// user_exception(); otherwise map a pending interrupt, breakpoint, or
// host 'int $0x80' syscall attempt to a kernel entry.
384 PRIVATE static inline NOEXPORT
386 Usermode::user_emulation (unsigned _cpu, int stop, pid_t pid,
387 struct ucontext *context,
388 struct user_regs_struct *regs)
390 Mword trap, error = 0;
// Exception stop: full classification in user_exception().
395 return user_exception (_cpu, pid, context, regs);
// No interrupt pending — nothing to deliver (guard path).
399 if ((irq_pend = Pic::irq_pending()) == -1)
// Pending IRQ becomes its IDT gate number.
402 trap = Pic::map_irq_to_gate (irq_pend);
// 0xcc at EIP-1: we stopped just after an 'int3' breakpoint.
406 if (peek_at_addr (pid, regs->eip - 1, 1) == 0xcc)
// 0x80cd at EIP-2: task executed 'int $0x80' (a host Linux syscall);
// cancel it in the host and reflect it as a GP fault whose error code
// encodes the vector (0x80 << 3 | IDT-entry flag 2).
411 else if (peek_at_addr (pid, regs->eip - 2, 2) == 0x80cd)
413 cancel_syscall (pid, regs);
415 error = 0x80 << 3 | 2;
435 kernel_entry (_cpu, context, trap,
438 regs->eflags, /* EFL */
448  * IRET to a user context.
449  * We restore the saved context on the stack, namely EIP, CS, EFLAGS, ESP, SS.
450  * Additionally all register values are transferred to the task's register set.
451  * @param ctx Kern context during iret
453 PRIVATE static inline NOEXPORT
455 Usermode::iret_to_user_mode (unsigned _cpu,
456 struct ucontext *context, Mword *kesp)
458 struct user_regs_struct regs;
// The thread owning this kernel stack and the host pid of its task.
460 Context *t = context_of (kesp);
461 pid_t pid = t->vcpu_aware_space()->pid();
// While the task runs, it owns interrupt delivery (SIGIO).
463 Pic::set_owner (pid);
466  * If there are any interrupts pending up to this point, don't start the task
467  * but let it enter kernel immediately. Any interrupts occurring beyond this
468  * point will go directly to the task.
// Late-pending IRQ: revert ownership and re-enter the kernel with the
// iret frame as the interrupted user state.
470 if ((irq_pend = Pic::irq_pending()) != -1)
474 Pic::set_owner (Boot_info::pid());
476 kernel_entry (_cpu, context,
477 Pic::map_irq_to_gate (irq_pend),
478 *(kesp + 4), /* XSS */
479 *(kesp + 3), /* ESP */
480 *(kesp + 2), /* EFL */
481 *(kesp + 1) | 3,/* XCS */
482 *(kesp + 0), /* EIP */
488 // Restore these from the kernel stack (iret context)
489 regs.eip = *(kesp + 0);
// Force CPL 3 in the restored CS.
490 regs.xcs = *(kesp + 1) | 3;
491 regs.eflags = *(kesp + 2);
492 regs.esp = *(kesp + 3);
493 regs.xss = *(kesp + 4);
495 // Copy these from the kernel
496 regs.eax = context->uc_mcontext.gregs[REG_EAX];
497 regs.ebx = context->uc_mcontext.gregs[REG_EBX];
498 regs.ecx = context->uc_mcontext.gregs[REG_ECX];
499 regs.edx = context->uc_mcontext.gregs[REG_EDX];
500 regs.esi = context->uc_mcontext.gregs[REG_ESI];
501 regs.edi = context->uc_mcontext.gregs[REG_EDI];
502 regs.ebp = context->uc_mcontext.gregs[REG_EBP];
503 regs.xds = context->uc_mcontext.gregs[REG_DS];
504 regs.xes = context->uc_mcontext.gregs[REG_ES];
505 regs.xfs = context->uc_mcontext.gregs[REG_FS];
506 regs.xgs = Cpu::get_gs();
508 // ptrace will return with an error if we try to load invalid values to
// Segment registers may hold stale selectors; fall back to kernel
// DS/ES and retry once before giving up.
510 int r = ptrace (PTRACE_SETREGS, pid, NULL, &regs);
511 if (EXPECT_FALSE(r == -EPERM))
513 WARN("Failure setting registers, probably invalid segment values.\n"
515 regs.xds = Cpu::kern_ds();
516 regs.xes = Cpu::kern_es();
518 check(ptrace (PTRACE_SETREGS, pid, NULL, &regs));
// Load the thread's FPU state into the host before running it.
523 Fpu::restore_state (t->fpu_state());
// Native threads run freely until a signal; non-native (traced syscall)
// threads stop at syscall boundaries.
527 ptrace (t->is_native() ? PTRACE_CONT : PTRACE_SYSCALL, pid, NULL, NULL);
529 stop = wait_for_stop (pid);
// Host-side control signals are not task events — handled elsewhere
// (handling lines elided in this excerpt).
531 if (EXPECT_FALSE (stop == SIGWINCH || stop == SIGTERM || stop == SIGINT))
533 check(ptrace (PTRACE_GETREGS, pid, NULL, &regs) == 0);
536 if (EXPECT_TRUE (user_emulation (_cpu, stop, pid, context, &regs)))
// Task stopped for the kernel: reclaim interrupt ownership and re-raise
// any IRQ that arrived while the task owned SIGIO.
540 Pic::set_owner (Boot_info::pid());
542 if (Pic::irq_pending() != -1)
543 kill (Boot_info::pid(), SIGIO);
// Propagate the task's register file back into the kernel's signal
// context so the kernel sees the user state on its next entry.
545 context->uc_mcontext.gregs[REG_EAX] = regs.eax;
546 context->uc_mcontext.gregs[REG_EBX] = regs.ebx;
547 context->uc_mcontext.gregs[REG_ECX] = regs.ecx;
548 context->uc_mcontext.gregs[REG_EDX] = regs.edx;
549 context->uc_mcontext.gregs[REG_ESI] = regs.esi;
550 context->uc_mcontext.gregs[REG_EDI] = regs.edi;
551 context->uc_mcontext.gregs[REG_EBP] = regs.ebp;
552 context->uc_mcontext.gregs[REG_DS] = regs.xds;
553 context->uc_mcontext.gregs[REG_ES] = regs.xes;
554 context->uc_mcontext.gregs[REG_FS] = regs.xfs;
555 Cpu::set_gs(regs.xgs);
// Save the task's FPU state back into the thread.
557 Fpu::save_state (t->fpu_state());
561  * IRET to a kernel context.
562  * We restore the saved context on the stack, namely EIP and EFLAGS.
563  * We do NOT restore CS, because the kernel thinks it has privilege level 0
564  * but in usermode it has to have privilege level 3. We also adjust ESP by
565  * 3 words, thus clearing the context from the stack.
566  * @param ctx Kern context during iret
568 PRIVATE static inline NOEXPORT
570 Usermode::iret_to_kern_mode (struct ucontext *context, Mword *kesp)
// Frame layout: kesp[0]=EIP, kesp[1]=CS (ignored), kesp[2]=EFLAGS.
572 context->uc_mcontext.gregs[REG_EIP] = *(kesp + 0);
573 context->uc_mcontext.gregs[REG_EFL] = *(kesp + 2);
// Pop the 3-word iret frame off the kernel stack.
574 context->uc_mcontext.gregs[REG_ESP] += 3 * sizeof (Mword);
578  * Emulate IRET instruction.
579  * Depending on the value of the saved code segment (CS) on the kernel stack
580  * we return to kernel mode (CPL == 0) or user mode (CPL == 2).
581  * @param ctx Kern context during iret
583 PRIVATE static inline NOEXPORT
585 Usermode::iret (unsigned _cpu, struct ucontext *context)
// The iret frame sits at the current (kernel) ESP.
587 Mword *kesp = (Mword *) context->uc_mcontext.gregs[REG_ESP];
// Re-sync the virtual interrupt state from the EFLAGS being restored.
589 sync_interrupt_state (&context->uc_sigmask, *(kesp + 2));
// RPL of the saved CS selects the destination mode (CPL 2 marks user
// here because kernel_entry cleared CS bit 0 of a CPL-3 selector).
591 switch (*(kesp + 1) & 3)
593 case 0: /* CPL 0 -> Kernel */
594 iret_to_kern_mode (context, kesp);
597 case 2: /* CPL 2 -> User */
598 iret_to_user_mode (_cpu, context, kesp);
// SIGSEGV handler for the kernel itself: emulates the privileged
// instructions cli/sti/iret that fault at CPL 3, and reflects all other
// traps into the kernel via kernel_entry().
608 Usermode::emu_handler (int, siginfo_t *, void *ctx)
610 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
611 unsigned int trap = context->uc_mcontext.gregs[REG_TRAPNO];
613 unsigned _cpu = Cpu::p2l(Cpu::phys_id_direct());
615 if (trap == 0xd) /* General protection fault */
// Fetch the faulting opcode directly (kernel address space).
617 unsigned char opcode = *reinterpret_cast<unsigned char *>
618 (context->uc_mcontext.gregs[REG_EIP]);
// Emulated cli (case label elided — presumably 0xfa): skip the 1-byte
// instruction and clear the virtual IF.
623 context->uc_mcontext.gregs[REG_EIP]++;
624 context->uc_mcontext.gregs[REG_EFL] &= ~EFLAGS_IF;
625 sync_interrupt_state (&context->uc_sigmask,
626 context->uc_mcontext.gregs[REG_EFL]);
// Emulated sti (case label elided — presumably 0xfb): skip and set IF.
630 context->uc_mcontext.gregs[REG_EIP]++;
631 context->uc_mcontext.gregs[REG_EFL] |= EFLAGS_IF;
632 sync_interrupt_state (&context->uc_sigmask,
633 context->uc_mcontext.gregs[REG_EFL]);
636 case 0xcf: /* iret */
637 iret (_cpu, context);
// Not an emulated instruction: reflect the trap into the kernel,
// stripping the user-mode bit from the error code and forcing CPL 0
// in CS.
642 kernel_entry (_cpu, context, trap,
643 context->uc_mcontext.gregs[REG_SS],
644 context->uc_mcontext.gregs[REG_ESP],
645 context->uc_mcontext.gregs[REG_EFL],
646 context->uc_mcontext.gregs[REG_CS] & ~3,
647 context->uc_mcontext.gregs[REG_EIP],
648 context->uc_mcontext.gregs[REG_ERR] & ~PF_ERR_USERMODE,
649 context->uc_mcontext.cr2);
// SIGIO handler: deliver a pending emulated hardware interrupt to the
// kernel by entering through the IRQ's IDT gate.
654 Usermode::int_handler (int, siginfo_t *, void *ctx)
656 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
// Spurious SIGIO: nothing pending (guard path).
660 if ((irq = Pic::irq_pending()) == -1)
// IPIs have dedicated gates; ordinary IRQs are mapped via the PIC.
663 if (Pic::get_ipi_gate(irq, gate) == false)
664 gate = Pic::map_irq_to_gate(irq);
// Enter the kernel with the interrupted (kernel) context; CS forced to
// CPL 0.
668 kernel_entry (Cpu::p2l(Cpu::phys_id_direct()),
671 context->uc_mcontext.gregs[REG_SS], /* XSS */
672 context->uc_mcontext.gregs[REG_ESP], /* ESP */
673 context->uc_mcontext.gregs[REG_EFL], /* EFL */
674 context->uc_mcontext.gregs[REG_CS] & ~3,/* XCS */
675 context->uc_mcontext.gregs[REG_EIP], /* EIP */
// Handler for debugger-related signals (SIGTRAP and others routed here by
// set_signal): enters the kernel debugger via trap 3 (breakpoint) or
// trap 1 (debug).
682 Usermode::jdb_handler (int sig, siginfo_t *, void *ctx)
684 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
// Only meaningful when the interrupted ESP is inside a TCB, i.e. we
// interrupted kernel code.
686 if (!Thread::is_tcb_address(context->uc_mcontext.gregs[REG_ESP]))
690  * If a SIGSEGV is pending at the same time as SIGINT, i.e. because
691  * someone pressed Ctrl-C on an sti instruction, SIGINT will be delivered
692  * first. Since we warp to a different execution path the pending SIGSEGV
693  * will then hit an innocent instruction elsewhere with fatal consequences.
694  * Therefore a pending SIGSEGV must be cancelled - it will later reoccur.
697 signal (SIGSEGV, SIG_IGN); // Cancel signal
698 set_signal (SIGSEGV); // Reinstall handler
// SIGTRAP maps to the breakpoint trap (3), everything else to the debug
// trap (1); CS forced to CPL 0.
700 kernel_entry (Cpu::p2l(Cpu::phys_id_direct()),
701 context, sig == SIGTRAP ? 3 : 1,
702 context->uc_mcontext.gregs[REG_SS], /* XSS */
703 context->uc_mcontext.gregs[REG_ESP], /* ESP */
704 context->uc_mcontext.gregs[REG_EFL], /* EFL */
705 context->uc_mcontext.gregs[REG_CS] & ~3,/* XCS */
706 context->uc_mcontext.gregs[REG_EIP], /* EIP */
// Install the appropriate Usermode handler for 'sig': SIGIO -> interrupt
// delivery, SIGSEGV -> instruction emulation, everything else -> the
// debugger entry. Handlers run on the alternate stack with all signals
// masked.
713 Usermode::set_signal (int sig)
715 void (*func)(int, siginfo_t *, void *);
716 struct sigaction action;
720 case SIGIO: func = int_handler; break;
721 case SIGSEGV: func = emu_handler; break;
722 default: func = jdb_handler; break;
725 sigfillset (&action.sa_mask); /* No other signals while we run */
726 action.sa_sigaction = func;
// SA_ONSTACK: run on the sigaltstack set up in init(); SA_SIGINFO: get
// the ucontext third argument the handlers rely on.
727 action.sa_flags = SA_RESTART | SA_ONSTACK | SA_SIGINFO;
729 check (sigaction (sig, &action, NULL) == 0);
// Per-CPU initialization: set up the alternate signal stack, ignore
// uninteresting host signals, and install the Usermode handlers.
732 PUBLIC static FIASCO_INIT_CPU
734 Usermode::init(unsigned cpu)
738 /* We want signals, aka interrupts to be delivered on an alternate stack */
// Two ss_sp assignments are visible; presumably one is for the boot CPU
// (fixed frame) and one for secondary CPUs (allocated), selected by an
// elided conditional — confirm against the full source.
740 stack.ss_sp = (void *) Mem_layout::phys_to_pmem
741 (Mem_layout::Sigstack_cpu0_start_frame);
743 stack.ss_sp = Mapped_allocator::allocator()->alloc(Mem_layout::Sigstack_log2_size);
744 stack.ss_size = Mem_layout::Sigstack_size;
747 check (sigaltstack (&stack, NULL) == 0);
// Host signals that carry no meaning for the emulation are ignored.
749 signal (SIGWINCH, SIG_IGN);
750 signal (SIGPROF, SIG_IGN);
751 signal (SIGHUP, SIG_IGN);
752 signal (SIGUSR1, SIG_IGN);
753 signal (SIGUSR2, SIG_IGN);
// Install the emulation, debugger and termination handlers.
755 set_signal (SIGSEGV);
760 signal (SIGINT, SIG_IGN);
761 set_signal (SIGTRAP);
762 set_signal (SIGTERM);
763 set_signal (SIGXCPU);