4 #include <csignal> // for siginfo_t
24 #include <asm/unistd.h>
26 #include <sys/ptrace.h>
27 #include <sys/types.h>
30 #include "undef_page.h"
32 #include "boot_info.h"
34 #include "config_tcbsize.h"
36 #include "emulation.h"
39 #include "mem_layout.h"
41 #include "processor.h"
45 #include "thread_state.h"
49 Usermode::peek_at_addr (pid_t pid, Address addr, unsigned n)
// Read up to sizeof(Mword) bytes (n) at 'addr' in the traced host process.
// Precedence note: "addr & sizeof (Mword) - 1" parses as
// "addr & (sizeof (Mword) - 1)" because binary '-' binds tighter than '&'.
53 if ((addr & sizeof (Mword) - 1) + n > sizeof (Mword))
// Requested bytes cross a word boundary: peek the word starting at addr.
54 val = ptrace (PTRACE_PEEKTEXT, pid, addr, NULL);
// Otherwise peek the aligned word containing addr and shift the wanted
// bytes down to bit 0.
56 val = ptrace (PTRACE_PEEKTEXT, pid, addr & ~(sizeof (Mword) - 1), NULL) >>
57 CHAR_BIT * (addr & sizeof (Mword) - 1);
// Mask the result down to the requested n bytes.
59 return val & (Mword) -1 >> CHAR_BIT * (sizeof (Mword) - n);
63 * Wait for host process to stop.
64 * @param pid process id to wait for.
65 * @return signal the host process stopped with.
67 PRIVATE static inline NOEXPORT
69 Usermode::wait_for_stop (pid_t pid)
// Block until the traced host process enters a ptrace-stop. check() fires
// if waitpid fails or the process terminated instead of stopping.
73 check (waitpid (pid, &status, 0) == pid && WIFSTOPPED (status));
75 return WSTOPSIG (status);
79 * Set emulated internal processor interrupt state.
80 * @param mask signal mask to modify
81 * @param eflags processor flags register
83 PRIVATE static inline NOEXPORT
85 Usermode::sync_interrupt_state (sigset_t *mask, Mword eflags)
// Propagate the emulated EFLAGS to the virtual-CPU state first.
87 Proc::ux_set_virtual_processor_state (eflags);
// NOTE(review): some callers pass mask == 0 (see the cli/sti emulation);
// presumably a NULL-check guards the sigdelset/sigaddset calls on a line
// not shown in this view -- confirm before relying on it.
92 if (eflags & EFLAGS_IF)
// Virtual interrupts enabled: allow SIGIO (the emulated IRQ signal).
93 sigdelset (mask, SIGIO);
// Virtual interrupts disabled: block SIGIO.
95 sigaddset (mask, SIGIO);
99 * Cancel a native system call in the host.
100 * @param pid process id of the host process.
101 * @param regs register set at the time of the system call.
103 PRIVATE static inline NOEXPORT
105 Usermode::cancel_syscall (pid_t pid, struct user_regs_struct *regs)
// Overwrite the pending syscall number (orig_eax) with -1 so the host
// kernel executes no system call, then step past the syscall stop.
107 ptrace (PTRACE_POKEUSER, pid, offsetof (struct user, regs.orig_eax), -1);
108 ptrace (PTRACE_SYSCALL, pid, NULL, NULL);
// Put the original EAX value back into the host's register image (the
// cancelled syscall clobbered it -- presumably with an error return).
112 regs->eax = regs->orig_eax;
113 ptrace (PTRACE_POKEUSER, pid, offsetof (struct user, regs.eax), regs->eax);
117 * Read debug register
118 * @param pid process id of the host process.
119 * @param reg number of debug register (0..7)
120 * @param value reference to register value
125 Usermode::read_debug_register (pid_t pid, Mword reg, Mword &value)
// Peek u_debugreg[reg] from the traced process; the null-pointer idiom
// turns the struct-user field address into a PTRACE_PEEKUSER offset.
130 int ret = ptrace (PTRACE_PEEKUSER, pid,
131 ((struct user *) 0)->u_debugreg + reg, NULL);
// NOTE(review): errno is never -1 on Linux. Since PTRACE_PEEKUSER returns
// the register value (which can itself be -1), the intended error test is
// most likely "ret == -1 && errno != 0" (with errno cleared beforehand) --
// verify against the lines not shown here.
133 if (ret == -1 && errno == -1)
142 * Write debug register
143 * @param pid process id of the host process.
144 * @param reg number of debug register (0..7)
145 * @param value register value to be written.
150 Usermode::write_debug_register (pid_t pid, Mword reg, Mword value)
// Poke 'value' into u_debugreg[reg] of the traced process; the offset is
// computed with the same null-pointer struct-user idiom as the read path.
155 if (ptrace (PTRACE_POKEUSER, pid,
156 ((struct user *) 0)->u_debugreg + reg, value) == -1)
163 * Set up kernel stack for kernel entry through an interrupt gate.
164 * We are running on the signal stack and modifying the interrupted context
165 * on the signal stack to allow us to return anywhere.
166 * Depending on the value of the code segment (CS) we set up a processor
167 * context of 3 (kernel mode) or 5 (user mode) words on the respective kernel
168 * stack, which is the current stack (kernel mode) or the stack determined by
169 * the Task State Segment (user mode). For some traps we need to put an
170 * additional error code on the stack.
171 * This is precisely what an ia32 processor does in hardware.
172 * @param context Interrupted context on signal stack
173 * @param trap Trap number that caused kernel entry (0xffffffff == shutdown)
174 * @param xss Stack Segment
175 * @param esp Stack Pointer
177 * @param xcs Code Segment
178 * @param eip Instruction Pointer
179 * @param err Error Code
180 * @param cr2 Page Fault Address (if applicable)
184 Usermode::kernel_entry (unsigned _cpu,
185 struct ucontext *context,
// Pick the kernel stack for the emulated interrupt frame: entry from user
// mode (CPL 3) builds a 5-word frame below the per-CPU kernel SP; entry
// from kernel mode builds a 3-word frame on the interrupted stack.
195 Mword *kesp = (xcs & 3) == 3
196 ? (Mword *) Cpu::cpus.cpu(_cpu).kernel_sp() - 5
197 : (Mword *) context->uc_mcontext.gregs[REG_ESP] - 3;
199 if (!Thread::is_tcb_address((Address)kesp))
201 printf("KERNEL BUG at EIP:%08x ESP:%08x -- PFA:%08lx kesp=%p trap=%lx xcs=%lx @ %p %lx\n",
202 context->uc_mcontext.gregs[REG_EIP],
203 context->uc_mcontext.gregs[REG_ESP],
204 context->uc_mcontext.cr2, kesp, trap, xcs, &Cpu::cpus.cpu(_cpu).kernel_sp(), Cpu::cpus.cpu(_cpu).kernel_sp());
208 // Make sure the kernel stack is sane
209 assert (Thread::is_tcb_address((Address)kesp));
211 // Make sure the kernel stack has enough space
212 if ((Mword) kesp % THREAD_BLOCK_SIZE <= 512)
214 printf("KERNEL BUG: Kernel stack of thread ");
215 printf("DBGID=%lx\n", static_cast<Thread*>(context_of(kesp))->dbg_info()->dbg_id());
216 panic(" exceeded (%p, %c). \n"
217 " As a workaround, please make sure that you built \n"
218 " Fiasco-UX with enabled CONTEXT_4K.",
219 kesp, (xcs & 3) == 2 ? 'k' : 'u');
// Build the hardware-style frame: EFLAGS with the emulated IF state
// merged in, and CS with bit 0 cleared so the emulated iret traps.
231 *(kesp + 2) = efl | (Proc::processor_state() & EFLAGS_IF);
232 *(kesp + 1) = xcs & ~1; // trap on iret
// Traps that push an error code on real ia32 hardware:
238 case 0xe: // Page Fault
239 Emulation::set_page_fault_addr (cr2);
// FALLTHROUGH intended: a page fault also pushes an error code.
240 case 0x8: // Double Fault
241 case 0xa: // Invalid TSS
242 case 0xb: // Segment Not Present
243 case 0xc: // Stack Fault
244 case 0xd: // General Protection Fault
245 case 0x11: // Alignment Check
// Redirect the signal-return context into the kernel's IDT handler with
// TF/NT/RF/VM cleared, and resync the emulated interrupt state.
249 context->uc_mcontext.gregs[REG_ESP] = (Mword) kesp;
250 context->uc_mcontext.gregs[REG_EIP] = Emulation::idt_vector (trap, false);
251 context->uc_mcontext.gregs[REG_EFL] = efl & ~(EFLAGS_TF | EFLAGS_NT | EFLAGS_RF | EFLAGS_VM);
252 sync_interrupt_state (&context->uc_sigmask, efl);
254 // Make sure interrupts are off
255 assert (!Proc::interrupts());
258 PRIVATE static inline NOEXPORT
260 Usermode::kip_syscall (Address eip)
// Recognize a KIP syscall: EIP must lie in the syscall page and be
// 256-byte aligned within it; otherwise this is no syscall (return 0).
262 if ((eip & Config::PAGE_MASK) != Mem_layout::Syscalls || eip & 0xff)
// Precedence: "eip - Mem_layout::Syscalls >> 8" is
// "(eip - Mem_layout::Syscalls) >> 8" -- slot index within the page.
265 Mword trap = 0x30 + (eip - Mem_layout::Syscalls >> 8);
// Report the trap only if an IDT entry is actually installed for it.
267 return Emulation::idt_vector (trap, true) ? trap : 0;
270 PRIVATE static inline NOEXPORT
272 Usermode::l4_syscall (Mword opcode)
// Decode a two-byte "int imm8" instruction: low byte must be the 0xCD
// opcode (x86 is little endian), the next byte is the vector.
274 if (EXPECT_FALSE ((opcode & 0xff) != 0xcd))
277 Mword trap = opcode >> 8;
// Report the trap only if an IDT entry is actually installed for it.
279 return Emulation::idt_vector (trap, true) ? trap : 0;
282 PRIVATE static inline NOEXPORT NEEDS["thread_state.h"]
284 Usermode::user_exception (unsigned _cpu, pid_t pid, struct ucontext *context,
285 struct user_regs_struct *regs)
287 Mword trap, error = 0, addr = 0;
// Case 1: EIP points into the KIP syscall page -> emulated L4 syscall.
289 if (EXPECT_FALSE ((trap = kip_syscall (regs->eip))))
291 Context *t = context_of(((Mword *)Cpu::cpus.cpu(_cpu).kernel_sp()) - 1);
293 /* The alien syscall code in entry-*.S subtracts 2 bytes from the
294 * EIP to put the EIP back on the instruction to reexecute it.
295 * 'int X' and sysenter etc. are 2 byte instructions.
296 * So we add 2 here to have the EIP in the right position afterwards.
298 * Furthermore we leave ESP and EIP (with the adjustment) where they
299 * are so that the syscall can be re-executed.
301 * This is not a problem for native as it does not trap on the
302 * 'call 0xea......' itself there but on the real int/sysenter/etc.
303 * instructions in the syscall page.
305 if (EXPECT_FALSE((t->state() & (Thread_alien | Thread_dis_alien))
306 == Thread_alien || t->space_ref()->user_mode()))
// Read the 4-byte word at ESP -- presumably the return address pushed
// by the 'call' into the syscall page -- and resume there.
310 regs->eip = peek_at_addr (pid, regs->esp, 4);
// Case 2: a direct "int N" L4 syscall at the faulting EIP.
315 else if ((trap = l4_syscall (peek_at_addr (pid, regs->eip, 2))))
// Case 3: genuine user exception -- replay it inside the task to
// capture the full fault context.
320 struct ucontext *exception_context;
// Install the in-task signal trampoline (sighandler.S code) on the
// trampoline page, then deliver the SIGSEGV into the task itself.
322 memcpy ((void *) Mem_layout::kernel_trampoline_page,
323 (void *) &Mem_layout::task_sighandler_start,
324 &Mem_layout::task_sighandler_end -
325 &Mem_layout::task_sighandler_start);
327 ptrace (PTRACE_CONT, pid, NULL, SIGSEGV);
331 // See corresponding code in sighandler.S
332 exception_context = reinterpret_cast<struct ucontext *>
333 (Mem_layout::kernel_trampoline_page +
334 *reinterpret_cast<Address *>
335 (Mem_layout::kernel_trampoline_page + 0x100))
// Extract fault address, trap number and error code from the context
// the trampoline saved on the trampoline page.
337 addr = exception_context->uc_mcontext.cr2;
338 trap = exception_context->uc_mcontext.gregs[REG_TRAPNO];
339 error = exception_context->uc_mcontext.gregs[REG_ERR];
// Optional cli/sti emulation: handle the 1-byte opcodes here instead of
// reflecting the fault into the kernel.
344 if (Boot_info::emulate_clisti())
345 switch (peek_at_addr (pid, regs->eip, 1))
// cli: kernel takes IRQ ownership, virtual IF is cleared.
348 Pic::set_owner (Boot_info::pid());
350 regs->eflags &= ~EFLAGS_IF;
351 sync_interrupt_state (0, regs->eflags);
352 check(ptrace (PTRACE_SETREGS, pid, NULL, regs));
// sti: task takes IRQ ownership, virtual IF is set.
356 Pic::set_owner (pid);
358 regs->eflags |= EFLAGS_IF;
359 sync_interrupt_state (0, regs->eflags);
360 check(ptrace (PTRACE_SETREGS, pid, NULL, regs));
// Mark the fault as coming from a user address.
366 error |= PF_ERR_USERADDR;
371 kernel_entry (_cpu, context, trap,
374 regs->eflags, /* EFL */
383 PRIVATE static inline NOEXPORT
385 Usermode::user_emulation (unsigned _cpu, int stop, pid_t pid,
386 struct ucontext *context,
387 struct user_regs_struct *regs)
389 Mword trap, error = 0;
// Fault-type stop: reflect as a user exception (the dispatch on 'stop'
// surrounding this line is not visible in this view).
394 return user_exception (_cpu, pid, context, regs);
// SIGIO-style stop: translate the pending IRQ into its IDT gate.
398 if ((irq_pend = Pic::irq_pending()) == -1)
401 trap = Pic::map_irq_to_gate (irq_pend);
// int3 (0xcc) is one byte; EIP already points just past it.
405 if (peek_at_addr (pid, regs->eip - 1, 1) == 0xcc)
// "int 0x80" (bytes 0xcd 0x80): a native Linux syscall attempt -- cancel
// it in the host and report it to the kernel instead.
410 else if (peek_at_addr (pid, regs->eip - 2, 2) == 0x80cd)
412 cancel_syscall (pid, regs);
// ia32-style error code: selector index 0x80, IDT bit (bit 1) set.
414 error = 0x80 << 3 | 2;
434 kernel_entry (_cpu, context, trap,
437 regs->eflags, /* EFL */
447 * IRET to a user context.
448 * We restore the saved context on the stack, namely EIP, CS, EFLAGS, ESP, SS.
449 * Additionally all register values are transferred to the task's register set.
450 * @param ctx Kern context during iret
452 PRIVATE static inline NOEXPORT
454 Usermode::iret_to_user_mode (unsigned _cpu,
455 struct ucontext *context, Mword *kesp)
457 struct user_regs_struct regs;
459 Context *t = context_of (kesp);
460 pid_t pid = t->vcpu_aware_space()->pid();
// Hand IRQ ownership to the task for the duration of its run.
462 Pic::set_owner (pid);
465 * If there are any interrupts pending up to this point, don't start the task
466 * but let it enter kernel immediately. Any interrupts occurring beyond this
467 * point will go directly to the task.
469 if ((irq_pend = Pic::irq_pending()) != -1)
473 Pic::set_owner (Boot_info::pid());
475 kernel_entry (_cpu, context,
476 Pic::map_irq_to_gate (irq_pend),
477 *(kesp + 4), /* XSS */
478 *(kesp + 3), /* ESP */
479 *(kesp + 2), /* EFL */
480 *(kesp + 1) | 3,/* XCS */
481 *(kesp + 0), /* EIP */
487 // Restore these from the kernel stack (iret context)
488 regs.eip = *(kesp + 0);
489 regs.xcs = *(kesp + 1) | 3;
490 regs.eflags = *(kesp + 2);
491 regs.esp = *(kesp + 3);
492 regs.xss = *(kesp + 4);
494 // Copy these from the kernel
495 regs.eax = context->uc_mcontext.gregs[REG_EAX];
496 regs.ebx = context->uc_mcontext.gregs[REG_EBX];
497 regs.ecx = context->uc_mcontext.gregs[REG_ECX];
498 regs.edx = context->uc_mcontext.gregs[REG_EDX];
499 regs.esi = context->uc_mcontext.gregs[REG_ESI];
500 regs.edi = context->uc_mcontext.gregs[REG_EDI];
501 regs.ebp = context->uc_mcontext.gregs[REG_EBP];
502 regs.xds = context->uc_mcontext.gregs[REG_DS];
503 regs.xes = context->uc_mcontext.gregs[REG_ES];
504 regs.xfs = Cpu::get_fs();
505 regs.xgs = Cpu::get_gs();
507 // ptrace will return with an error if we try to load invalid values to
// segment registers; fall back to the kernel's DS/ES selectors below.
// (FIX: "&regs" was mojibake-mangled to "\u00aes" -- an HTML-entity
// corruption of '&reg' -- in the four ptrace calls of this function.)
509 int r = ptrace (PTRACE_SETREGS, pid, NULL, &regs);
// NOTE(review): ptrace(2) returns -1 and sets errno on failure; comparing
// the return value against -EPERM looks suspect -- verify the intended
// check is "r == -1 && errno == EPERM".
510 if (EXPECT_FALSE(r == -EPERM))
512 WARN("Failure setting registers, probably invalid segment values.\n"
514 regs.xds = Cpu::kern_ds();
515 regs.xes = Cpu::kern_es();
517 check(ptrace (PTRACE_SETREGS, pid, NULL, &regs));
522 Fpu::restore_state (t->fpu_state());
// Resume the task; non-native (alien) threads stop at syscall boundaries.
526 ptrace (t->is_native() ? PTRACE_CONT : PTRACE_SYSCALL, pid, NULL, NULL);
528 int stop = wait_for_stop (pid);
530 if (EXPECT_FALSE (stop == SIGWINCH || stop == SIGTERM || stop == SIGINT))
533 check(ptrace (PTRACE_GETREGS, pid, NULL, &regs) == 0);
535 if (EXPECT_TRUE (user_emulation (_cpu, stop, pid, context, &regs)))
// Back in kernel context: reclaim IRQ ownership and re-raise any IRQ
// that arrived while the task owned interrupts.
539 Pic::set_owner (Boot_info::pid());
541 if (Pic::irq_pending() != -1)
542 kill (Boot_info::pid(), SIGIO);
// Mirror the task's register state back into the kernel's signal context.
544 context->uc_mcontext.gregs[REG_EAX] = regs.eax;
545 context->uc_mcontext.gregs[REG_EBX] = regs.ebx;
546 context->uc_mcontext.gregs[REG_ECX] = regs.ecx;
547 context->uc_mcontext.gregs[REG_EDX] = regs.edx;
548 context->uc_mcontext.gregs[REG_ESI] = regs.esi;
549 context->uc_mcontext.gregs[REG_EDI] = regs.edi;
550 context->uc_mcontext.gregs[REG_EBP] = regs.ebp;
551 context->uc_mcontext.gregs[REG_DS] = regs.xds;
552 context->uc_mcontext.gregs[REG_ES] = regs.xes;
553 Cpu::set_fs(regs.xfs);
554 Cpu::set_gs(regs.xgs);
556 Fpu::save_state (t->fpu_state());
560 * IRET to a kernel context.
561 * We restore the saved context on the stack, namely EIP and EFLAGS.
562 * We do NOT restore CS, because the kernel thinks it has privilege level 0
563 * but in usermode it has to have privilege level 3. We also adjust ESP by
564 * 3 words, thus clearing the context from the stack.
565 * @param ctx Kern context during iret
567 PRIVATE static inline NOEXPORT
569 Usermode::iret_to_kern_mode (struct ucontext *context, Mword *kesp)
// Restore EIP and EFLAGS from the 3-word iret frame, then pop the frame
// by advancing ESP. CS is deliberately left alone (see function header).
571 context->uc_mcontext.gregs[REG_EIP] = *(kesp + 0);
572 context->uc_mcontext.gregs[REG_EFL] = *(kesp + 2);
573 context->uc_mcontext.gregs[REG_ESP] += 3 * sizeof (Mword);
577 * Emulate IRET instruction.
578 * Depending on the value of the saved code segment (CS) on the kernel stack
579 * we return to kernel mode (CPL == 0) or user mode (CPL == 2).
580 * @param ctx Kern context during iret
582 PRIVATE static inline NOEXPORT
584 Usermode::iret (unsigned _cpu, struct ucontext *context)
// ESP points at the emulated iret frame: EIP, CS, EFLAGS (and for user
// returns also ESP, SS).
586 Mword *kesp = (Mword *) context->uc_mcontext.gregs[REG_ESP];
// Resync the emulated interrupt state with the saved EFLAGS image.
588 sync_interrupt_state (&context->uc_sigmask, *(kesp + 2));
// Dispatch on the privilege level stored in the frame's CS slot.
590 switch (*(kesp + 1) & 3)
592 case 0: /* CPL 0 -> Kernel */
593 iret_to_kern_mode (context, kesp);
596 case 2: /* CPL 2 -> User */
597 iret_to_user_mode (_cpu, context, kesp);
607 Usermode::emu_handler (int, siginfo_t *, void *ctx)
609 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
610 unsigned int trap = context->uc_mcontext.gregs[REG_TRAPNO];
612 unsigned _cpu = Cpu::cpus.find_cpu(Cpu::By_phys_id(Cpu::phys_id_direct()));
614 if (trap == 0xd) /* General protection fault */
// Fetch the faulting opcode so privileged instructions executed by the
// kernel itself can be emulated in place.
616 unsigned char opcode = *reinterpret_cast<unsigned char *>
617 (context->uc_mcontext.gregs[REG_EIP]);
// cli: skip the 1-byte opcode and clear the virtual IF.
622 context->uc_mcontext.gregs[REG_EIP]++;
623 context->uc_mcontext.gregs[REG_EFL] &= ~EFLAGS_IF;
624 sync_interrupt_state (&context->uc_sigmask,
625 context->uc_mcontext.gregs[REG_EFL]);
// sti: skip the 1-byte opcode and set the virtual IF.
629 context->uc_mcontext.gregs[REG_EIP]++;
630 context->uc_mcontext.gregs[REG_EFL] |= EFLAGS_IF;
631 sync_interrupt_state (&context->uc_sigmask,
632 context->uc_mcontext.gregs[REG_EFL]);
635 case 0xcf: /* iret */
636 iret (_cpu, context);
// Not an emulatable instruction: reflect the trap into the kernel with a
// kernel-mode CS (low bits cleared) and the user-mode error bit removed.
641 kernel_entry (_cpu, context, trap,
642 context->uc_mcontext.gregs[REG_SS],
643 context->uc_mcontext.gregs[REG_ESP],
644 context->uc_mcontext.gregs[REG_EFL],
645 context->uc_mcontext.gregs[REG_CS] & ~3,
646 context->uc_mcontext.gregs[REG_EIP],
647 context->uc_mcontext.gregs[REG_ERR] & ~PF_ERR_USERMODE,
648 context->uc_mcontext.cr2);
653 Usermode::int_handler (int, siginfo_t *, void *ctx)
655 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
// No IRQ actually pending: spurious SIGIO (the early-return for this case
// is on a line not shown here).
659 if ((irq = Pic::irq_pending()) == -1)
// IPIs have dedicated gates; ordinary IRQs go through the PIC mapping.
662 if (Pic::get_ipi_gate(irq, gate) == false)
663 gate = Pic::map_irq_to_gate(irq);
// Enter the kernel through the IRQ's gate, with a kernel-mode CS.
667 kernel_entry (Cpu::cpus.find_cpu(Cpu::By_phys_id(Cpu::phys_id_direct())),
670 context->uc_mcontext.gregs[REG_SS], /* XSS */
671 context->uc_mcontext.gregs[REG_ESP], /* ESP */
672 context->uc_mcontext.gregs[REG_EFL], /* EFL */
673 context->uc_mcontext.gregs[REG_CS] & ~3,/* XCS */
674 context->uc_mcontext.gregs[REG_EIP], /* EIP */
681 Usermode::jdb_handler (int sig, siginfo_t *, void *ctx)
683 struct ucontext *context = reinterpret_cast<struct ucontext *>(ctx);
// Only act when the interrupted ESP lies inside a TCB, i.e. we were
// running kernel code (the guarded branch body is not fully shown).
685 if (!Thread::is_tcb_address(context->uc_mcontext.gregs[REG_ESP]))
689 * If a SIGSEGV is pending at the same time as SIGINT, i.e. because
690 * someone pressed Ctrl-C on an sti instruction, SIGINT will be delivered
691 * first. Since we warp to a different execution path the pending SIGSEGV
692 * will then hit an innocent instruction elsewhere with fatal consequences.
693 * Therefore a pending SIGSEGV must be cancelled - it will later reoccur.
696 signal (SIGSEGV, SIG_IGN); // Cancel signal
697 set_signal (SIGSEGV); // Reinstall handler
// Enter the kernel debugger: vector 3 (breakpoint) for SIGTRAP, vector 1
// (debug) for everything else.
699 kernel_entry (Cpu::cpus.find_cpu(Cpu::By_phys_id(Cpu::phys_id_direct())),
700 context, sig == SIGTRAP ? 3 : 1,
701 context->uc_mcontext.gregs[REG_SS], /* XSS */
702 context->uc_mcontext.gregs[REG_ESP], /* ESP */
703 context->uc_mcontext.gregs[REG_EFL], /* EFL */
704 context->uc_mcontext.gregs[REG_CS] & ~3,/* XCS */
705 context->uc_mcontext.gregs[REG_EIP], /* EIP */
712 Usermode::set_signal (int sig)
714 void (*func)(int, siginfo_t *, void *);
715 struct sigaction action;
// Route each signal to its dedicated handler: SIGIO -> interrupt path,
// SIGSEGV -> instruction emulation, everything else -> kernel debugger.
719 case SIGIO: func = int_handler; break;
720 case SIGSEGV: func = emu_handler; break;
721 default: func = jdb_handler; break;
724 sigfillset (&action.sa_mask); /* No other signals while we run */
725 action.sa_sigaction = func;
// Deliver on the alternate stack, restart syscalls, pass full siginfo.
726 action.sa_flags = SA_RESTART | SA_ONSTACK | SA_SIGINFO;
728 check (sigaction (sig, &action, NULL) == 0);
731 PUBLIC static FIASCO_INIT_CPU
733 Usermode::init(unsigned cpu)
737 /* We want signals, aka interrupts to be delivered on an alternate stack */
// Boot CPU uses the statically reserved signal-stack frame; other CPUs
// allocate theirs dynamically (the branch selecting between these two
// assignments is on lines not shown in this view -- confirm).
739 stack.ss_sp = (void *) Mem_layout::phys_to_pmem
740 (Mem_layout::Sigstack_cpu0_start_frame);
742 stack.ss_sp = Kmem_alloc::allocator()->alloc(Mem_layout::Sigstack_log2_size);
743 stack.ss_size = Mem_layout::Sigstack_size;
746 check (sigaltstack (&stack, NULL) == 0);
// Signals that are deliberately ignored.
748 signal (SIGWINCH, SIG_IGN);
749 signal (SIGPROF, SIG_IGN);
750 signal (SIGHUP, SIG_IGN);
751 signal (SIGUSR1, SIG_IGN);
752 signal (SIGUSR2, SIG_IGN);
// Signals handled via set_signal (SIGSEGV -> emulation, the rest -> jdb).
754 set_signal (SIGSEGV);
759 signal (SIGINT, SIG_IGN);
760 set_signal (SIGTRAP);
761 set_signal (SIGTERM);
762 set_signal (SIGXCPU);