]> err.no Git - linux-2.6/blob - arch/x86/kernel/ptrace_64.c
7373a99facf39ff132fc1be4e89b5fda09c0f02c
[linux-2.6] / arch / x86 / kernel / ptrace_64.c
1 /* By Ross Biro 1/23/92 */
2 /*
3  * Pentium III FXSR, SSE support
4  *      Gareth Hughes <gareth@valinux.com>, May 2000
5  * 
6  * x86-64 port 2000-2002 Andi Kleen
7  */
8
9 #include <linux/kernel.h>
10 #include <linux/sched.h>
11 #include <linux/mm.h>
12 #include <linux/smp.h>
13 #include <linux/errno.h>
14 #include <linux/ptrace.h>
15 #include <linux/user.h>
16 #include <linux/security.h>
17 #include <linux/audit.h>
18 #include <linux/seccomp.h>
19 #include <linux/signal.h>
20
21 #include <asm/uaccess.h>
22 #include <asm/pgtable.h>
23 #include <asm/system.h>
24 #include <asm/processor.h>
25 #include <asm/prctl.h>
26 #include <asm/i387.h>
27 #include <asm/debugreg.h>
28 #include <asm/ldt.h>
29 #include <asm/desc.h>
30 #include <asm/proto.h>
31 #include <asm/ia32.h>
32
33 /*
34  * does not yet catch signals sent when the child dies.
35  * in exit.c or in signal.c.
36  */
37
38 /*
39  * Determines which flags the user has access to [1 = access, 0 = no access].
40  * Prohibits changing ID(21), VIP(20), VIF(19), VM(17), IOPL(12-13), IF(9).
41  * Also masks reserved bits (63-22, 15, 5, 3, 1).
42  */
43 #define FLAG_MASK 0x54dd5UL
44
45 /*
46  * eflags and offset of eflags on child stack..
47  */
48 #define EFLAGS offsetof(struct pt_regs, eflags)
49 #define EFL_OFFSET ((int)(EFLAGS-sizeof(struct pt_regs)))
50
51 /*
52  * this routine will get a word off of the processes privileged stack. 
53  * the offset is how far from the base addr as stored in the TSS.  
54  * this routine assumes that all the privileged stacks are in our
55  * data space.
56  */   
57 static inline unsigned long get_stack_long(struct task_struct *task, int offset)
58 {
59         unsigned char *stack;
60
61         stack = (unsigned char *)task->thread.rsp0;
62         stack += offset;
63         return (*((unsigned long *)stack));
64 }
65
66 /*
67  * this routine will put a word on the processes privileged stack. 
68  * the offset is how far from the base addr as stored in the TSS.  
69  * this routine assumes that all the privileged stacks are in our
70  * data space.
71  */
72 static inline long put_stack_long(struct task_struct *task, int offset,
73         unsigned long data)
74 {
75         unsigned char * stack;
76
77         stack = (unsigned char *) task->thread.rsp0;
78         stack += offset;
79         *(unsigned long *) stack = data;
80         return 0;
81 }
82
83 #define LDT_SEGMENT 4
84
85 unsigned long convert_rip_to_linear(struct task_struct *child, struct pt_regs *regs)
86 {
87         unsigned long addr, seg;
88
89         addr = regs->rip;
90         seg = regs->cs & 0xffff;
91
92         /*
93          * We'll assume that the code segments in the GDT
94          * are all zero-based. That is largely true: the
95          * TLS segments are used for data, and the PNPBIOS
96          * and APM bios ones we just ignore here.
97          */
98         if (seg & LDT_SEGMENT) {
99                 u32 *desc;
100                 unsigned long base;
101
102                 seg &= ~7UL;
103
104                 mutex_lock(&child->mm->context.lock);
105                 if (unlikely((seg >> 3) >= child->mm->context.size))
106                         addr = -1L; /* bogus selector, access would fault */
107                 else {
108                         desc = child->mm->context.ldt + seg;
109                         base = ((desc[0] >> 16) |
110                                 ((desc[1] & 0xff) << 16) |
111                                 (desc[1] & 0xff000000));
112
113                         /* 16-bit code segment? */
114                         if (!((desc[1] >> 22) & 1))
115                                 addr &= 0xffff;
116                         addr += base;
117                 }
118                 mutex_unlock(&child->mm->context.lock);
119         }
120
121         return addr;
122 }
123
124 static int is_setting_trap_flag(struct task_struct *child, struct pt_regs *regs)
125 {
126         int i, copied;
127         unsigned char opcode[15];
128         unsigned long addr = convert_rip_to_linear(child, regs);
129
130         copied = access_process_vm(child, addr, opcode, sizeof(opcode), 0);
131         for (i = 0; i < copied; i++) {
132                 switch (opcode[i]) {
133                 /* popf and iret */
134                 case 0x9d: case 0xcf:
135                         return 1;
136
137                         /* CHECKME: 64 65 */
138
139                 /* opcode and address size prefixes */
140                 case 0x66: case 0x67:
141                         continue;
142                 /* irrelevant prefixes (segment overrides and repeats) */
143                 case 0x26: case 0x2e:
144                 case 0x36: case 0x3e:
145                 case 0x64: case 0x65:
146                 case 0xf2: case 0xf3:
147                         continue;
148
149                 case 0x40 ... 0x4f:
150                         if (regs->cs != __USER_CS)
151                                 /* 32-bit mode: register increment */
152                                 return 0;
153                         /* 64-bit mode: REX prefix */
154                         continue;
155
156                         /* CHECKME: f2, f3 */
157
158                 /*
159                  * pushf: NOTE! We should probably not let
160                  * the user see the TF bit being set. But
161                  * it's more pain than it's worth to avoid
162                  * it, and a debugger could emulate this
163                  * all in user space if it _really_ cares.
164                  */
165                 case 0x9c:
166                 default:
167                         return 0;
168                 }
169         }
170         return 0;
171 }
172
173 void user_enable_single_step(struct task_struct *child)
174 {
175         struct pt_regs *regs = task_pt_regs(child);
176
177         /*
178          * Always set TIF_SINGLESTEP - this guarantees that
179          * we single-step system calls etc..  This will also
180          * cause us to set TF when returning to user mode.
181          */
182         set_tsk_thread_flag(child, TIF_SINGLESTEP);
183
184         /*
185          * If TF was already set, don't do anything else
186          */
187         if (regs->eflags & X86_EFLAGS_TF)
188                 return;
189
190         /* Set TF on the kernel stack.. */
191         regs->eflags |= X86_EFLAGS_TF;
192
193         /*
194          * ..but if TF is changed by the instruction we will trace,
195          * don't mark it as being "us" that set it, so that we
196          * won't clear it by hand later.
197          */
198         if (is_setting_trap_flag(child, regs))
199                 return;
200
201         child->ptrace |= PT_DTRACE;
202 }
203
204 void user_disable_single_step(struct task_struct *child)
205 {
206         /* Always clear TIF_SINGLESTEP... */
207         clear_tsk_thread_flag(child, TIF_SINGLESTEP);
208
209         /* But touch TF only if it was set by us.. */
210         if (child->ptrace & PT_DTRACE) {
211                 struct pt_regs *regs = task_pt_regs(child);
212                 regs->eflags &= ~X86_EFLAGS_TF;
213                 child->ptrace &= ~PT_DTRACE;
214         }
215 }
216
/*
 * Called by kernel/ptrace.c when detaching..
 *
 * Make sure the single step bit is not set.
 *
 * @child: the task being detached from
 */
void ptrace_disable(struct task_struct *child)
{
        user_disable_single_step(child);
}
226
227 static int putreg(struct task_struct *child,
228         unsigned long regno, unsigned long value)
229 {
230         unsigned long tmp; 
231         
232         switch (regno) {
233                 case offsetof(struct user_regs_struct,fs):
234                         if (value && (value & 3) != 3)
235                                 return -EIO;
236                         child->thread.fsindex = value & 0xffff; 
237                         return 0;
238                 case offsetof(struct user_regs_struct,gs):
239                         if (value && (value & 3) != 3)
240                                 return -EIO;
241                         child->thread.gsindex = value & 0xffff;
242                         return 0;
243                 case offsetof(struct user_regs_struct,ds):
244                         if (value && (value & 3) != 3)
245                                 return -EIO;
246                         child->thread.ds = value & 0xffff;
247                         return 0;
248                 case offsetof(struct user_regs_struct,es): 
249                         if (value && (value & 3) != 3)
250                                 return -EIO;
251                         child->thread.es = value & 0xffff;
252                         return 0;
253                 case offsetof(struct user_regs_struct,ss):
254                         if ((value & 3) != 3)
255                                 return -EIO;
256                         value &= 0xffff;
257                         return 0;
258                 case offsetof(struct user_regs_struct,fs_base):
259                         if (value >= TASK_SIZE_OF(child))
260                                 return -EIO;
261                         /*
262                          * When changing the segment base, use do_arch_prctl
263                          * to set either thread.fs or thread.fsindex and the
264                          * corresponding GDT slot.
265                          */
266                         if (child->thread.fs != value)
267                                 return do_arch_prctl(child, ARCH_SET_FS, value);
268                         return 0;
269                 case offsetof(struct user_regs_struct,gs_base):
270                         /*
271                          * Exactly the same here as the %fs handling above.
272                          */
273                         if (value >= TASK_SIZE_OF(child))
274                                 return -EIO;
275                         if (child->thread.gs != value)
276                                 return do_arch_prctl(child, ARCH_SET_GS, value);
277                         return 0;
278                 case offsetof(struct user_regs_struct, eflags):
279                         value &= FLAG_MASK;
280                         tmp = get_stack_long(child, EFL_OFFSET); 
281                         tmp &= ~FLAG_MASK; 
282                         value |= tmp;
283                         break;
284                 case offsetof(struct user_regs_struct,cs): 
285                         if ((value & 3) != 3)
286                                 return -EIO;
287                         value &= 0xffff;
288                         break;
289         }
290         put_stack_long(child, regno - sizeof(struct pt_regs), value);
291         return 0;
292 }
293
294 static unsigned long getreg(struct task_struct *child, unsigned long regno)
295 {
296         unsigned long val;
297         switch (regno) {
298                 case offsetof(struct user_regs_struct, fs):
299                         return child->thread.fsindex;
300                 case offsetof(struct user_regs_struct, gs):
301                         return child->thread.gsindex;
302                 case offsetof(struct user_regs_struct, ds):
303                         return child->thread.ds;
304                 case offsetof(struct user_regs_struct, es):
305                         return child->thread.es; 
306                 case offsetof(struct user_regs_struct, fs_base):
307                         /*
308                          * do_arch_prctl may have used a GDT slot instead of
309                          * the MSR.  To userland, it appears the same either
310                          * way, except the %fs segment selector might not be 0.
311                          */
312                         if (child->thread.fs != 0)
313                                 return child->thread.fs;
314                         if (child->thread.fsindex != FS_TLS_SEL)
315                                 return 0;
316                         return get_desc_base(&child->thread.tls_array[FS_TLS]);
317                 case offsetof(struct user_regs_struct, gs_base):
318                         /*
319                          * Exactly the same here as the %fs handling above.
320                          */
321                         if (child->thread.gs != 0)
322                                 return child->thread.gs;
323                         if (child->thread.gsindex != GS_TLS_SEL)
324                                 return 0;
325                         return get_desc_base(&child->thread.tls_array[GS_TLS]);
326                 default:
327                         regno = regno - sizeof(struct pt_regs);
328                         val = get_stack_long(child, regno);
329                         if (test_tsk_thread_flag(child, TIF_IA32))
330                                 val &= 0xffffffff;
331                         return val;
332         }
333
334 }
335
336 long arch_ptrace(struct task_struct *child, long request, long addr, long data)
337 {
338         long i, ret;
339         unsigned ui;
340
341         switch (request) {
342         /* when I and D space are separate, these will need to be fixed. */
343         case PTRACE_PEEKTEXT: /* read word at location addr. */ 
344         case PTRACE_PEEKDATA:
345                 ret = generic_ptrace_peekdata(child, addr, data);
346                 break;
347
348         /* read the word at location addr in the USER area. */
349         case PTRACE_PEEKUSR: {
350                 unsigned long tmp;
351
352                 ret = -EIO;
353                 if ((addr & 7) ||
354                     addr > sizeof(struct user) - 7)
355                         break;
356
357                 switch (addr) { 
358                 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
359                         tmp = getreg(child, addr);
360                         break;
361                 case offsetof(struct user, u_debugreg[0]):
362                         tmp = child->thread.debugreg0;
363                         break;
364                 case offsetof(struct user, u_debugreg[1]):
365                         tmp = child->thread.debugreg1;
366                         break;
367                 case offsetof(struct user, u_debugreg[2]):
368                         tmp = child->thread.debugreg2;
369                         break;
370                 case offsetof(struct user, u_debugreg[3]):
371                         tmp = child->thread.debugreg3;
372                         break;
373                 case offsetof(struct user, u_debugreg[6]):
374                         tmp = child->thread.debugreg6;
375                         break;
376                 case offsetof(struct user, u_debugreg[7]):
377                         tmp = child->thread.debugreg7;
378                         break;
379                 default:
380                         tmp = 0;
381                         break;
382                 }
383                 ret = put_user(tmp,(unsigned long __user *) data);
384                 break;
385         }
386
387         /* when I and D space are separate, this will have to be fixed. */
388         case PTRACE_POKETEXT: /* write the word at location addr. */
389         case PTRACE_POKEDATA:
390                 ret = generic_ptrace_pokedata(child, addr, data);
391                 break;
392
393         case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
394         {
395                 int dsize = test_tsk_thread_flag(child, TIF_IA32) ? 3 : 7;
396                 ret = -EIO;
397                 if ((addr & 7) ||
398                     addr > sizeof(struct user) - 7)
399                         break;
400
401                 switch (addr) { 
402                 case 0 ... sizeof(struct user_regs_struct) - sizeof(long):
403                         ret = putreg(child, addr, data);
404                         break;
405                 /* Disallows to set a breakpoint into the vsyscall */
406                 case offsetof(struct user, u_debugreg[0]):
407                         if (data >= TASK_SIZE_OF(child) - dsize) break;
408                         child->thread.debugreg0 = data;
409                         ret = 0;
410                         break;
411                 case offsetof(struct user, u_debugreg[1]):
412                         if (data >= TASK_SIZE_OF(child) - dsize) break;
413                         child->thread.debugreg1 = data;
414                         ret = 0;
415                         break;
416                 case offsetof(struct user, u_debugreg[2]):
417                         if (data >= TASK_SIZE_OF(child) - dsize) break;
418                         child->thread.debugreg2 = data;
419                         ret = 0;
420                         break;
421                 case offsetof(struct user, u_debugreg[3]):
422                         if (data >= TASK_SIZE_OF(child) - dsize) break;
423                         child->thread.debugreg3 = data;
424                         ret = 0;
425                         break;
426                 case offsetof(struct user, u_debugreg[6]):
427                                   if (data >> 32)
428                                 break; 
429                         child->thread.debugreg6 = data;
430                         ret = 0;
431                         break;
432                 case offsetof(struct user, u_debugreg[7]):
433                         /* See arch/i386/kernel/ptrace.c for an explanation of
434                          * this awkward check.*/
435                         data &= ~DR_CONTROL_RESERVED;
436                         for(i=0; i<4; i++)
437                                 if ((0x5554 >> ((data >> (16 + 4*i)) & 0xf)) & 1)
438                                         break;
439                         if (i == 4) {
440                           child->thread.debugreg7 = data;
441                           if (data)
442                                 set_tsk_thread_flag(child, TIF_DEBUG);
443                           else
444                                 clear_tsk_thread_flag(child, TIF_DEBUG);
445                           ret = 0;
446                         }
447                   break;
448                 }
449                 break;
450         }
451         case PTRACE_SYSCALL: /* continue and stop at next (return from) syscall */
452         case PTRACE_CONT:    /* restart after signal. */
453
454                 ret = -EIO;
455                 if (!valid_signal(data))
456                         break;
457                 if (request == PTRACE_SYSCALL)
458                         set_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
459                 else
460                         clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
461                 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
462                 child->exit_code = data;
463                 /* make sure the single step bit is not set. */
464                 user_disable_single_step(child);
465                 wake_up_process(child);
466                 ret = 0;
467                 break;
468
469 #ifdef CONFIG_IA32_EMULATION
470                 /* This makes only sense with 32bit programs. Allow a
471                    64bit debugger to fully examine them too. Better
472                    don't use it against 64bit processes, use
473                    PTRACE_ARCH_PRCTL instead. */
474         case PTRACE_GET_THREAD_AREA:
475                 if (addr < 0)
476                         return -EIO;
477                 ret = do_get_thread_area(child, addr,
478                                          (struct user_desc __user *) data);
479
480                 break;
481         case PTRACE_SET_THREAD_AREA:
482                 if (addr < 0)
483                         return -EIO;
484                 ret = do_set_thread_area(child, addr,
485                                          (struct user_desc __user *) data, 0);
486                 break;
487 #endif
488                 /* normal 64bit interface to access TLS data. 
489                    Works just like arch_prctl, except that the arguments
490                    are reversed. */
491         case PTRACE_ARCH_PRCTL: 
492                 ret = do_arch_prctl(child, data, addr);
493                 break;
494
495 /*
496  * make the child exit.  Best I can do is send it a sigkill. 
497  * perhaps it should be put in the status that it wants to 
498  * exit.
499  */
500         case PTRACE_KILL:
501                 ret = 0;
502                 if (child->exit_state == EXIT_ZOMBIE)   /* already dead */
503                         break;
504                 clear_tsk_thread_flag(child, TIF_SINGLESTEP);
505                 child->exit_code = SIGKILL;
506                 /* make sure the single step bit is not set. */
507                 user_disable_single_step(child);
508                 wake_up_process(child);
509                 break;
510
511         case PTRACE_SINGLESTEP:    /* set the trap flag. */
512                 ret = -EIO;
513                 if (!valid_signal(data))
514                         break;
515                 clear_tsk_thread_flag(child,TIF_SYSCALL_TRACE);
516                 user_enable_single_step(child);
517                 child->exit_code = data;
518                 /* give it a chance to run. */
519                 wake_up_process(child);
520                 ret = 0;
521                 break;
522
523         case PTRACE_GETREGS: { /* Get all gp regs from the child. */
524                 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
525                                sizeof(struct user_regs_struct))) {
526                         ret = -EIO;
527                         break;
528                 }
529                 ret = 0;
530                 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
531                         ret |= __put_user(getreg(child, ui),(unsigned long __user *) data);
532                         data += sizeof(long);
533                 }
534                 break;
535         }
536
537         case PTRACE_SETREGS: { /* Set all gp regs in the child. */
538                 unsigned long tmp;
539                 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
540                                sizeof(struct user_regs_struct))) {
541                         ret = -EIO;
542                         break;
543                 }
544                 ret = 0;
545                 for (ui = 0; ui < sizeof(struct user_regs_struct); ui += sizeof(long)) {
546                         ret = __get_user(tmp, (unsigned long __user *) data);
547                         if (ret)
548                                 break;
549                         ret = putreg(child, ui, tmp);
550                         if (ret)
551                                 break;
552                         data += sizeof(long);
553                 }
554                 break;
555         }
556
557         case PTRACE_GETFPREGS: { /* Get the child extended FPU state. */
558                 if (!access_ok(VERIFY_WRITE, (unsigned __user *)data,
559                                sizeof(struct user_i387_struct))) {
560                         ret = -EIO;
561                         break;
562                 }
563                 ret = get_fpregs((struct user_i387_struct __user *)data, child);
564                 break;
565         }
566
567         case PTRACE_SETFPREGS: { /* Set the child extended FPU state. */
568                 if (!access_ok(VERIFY_READ, (unsigned __user *)data,
569                                sizeof(struct user_i387_struct))) {
570                         ret = -EIO;
571                         break;
572                 }
573                 set_stopped_child_used_math(child);
574                 ret = set_fpregs(child, (struct user_i387_struct __user *)data);
575                 break;
576         }
577
578         default:
579                 ret = ptrace_request(child, request, addr, data);
580                 break;
581         }
582         return ret;
583 }
584
585 static void syscall_trace(struct pt_regs *regs)
586 {
587
588 #if 0
589         printk("trace %s rip %lx rsp %lx rax %d origrax %d caller %lx tiflags %x ptrace %x\n",
590                current->comm,
591                regs->rip, regs->rsp, regs->rax, regs->orig_rax, __builtin_return_address(0),
592                current_thread_info()->flags, current->ptrace); 
593 #endif
594
595         ptrace_notify(SIGTRAP | ((current->ptrace & PT_TRACESYSGOOD)
596                                 ? 0x80 : 0));
597         /*
598          * this isn't the same as continuing with a signal, but it will do
599          * for normal use.  strace only continues with a signal if the
600          * stopping signal is not SIGTRAP.  -brl
601          */
602         if (current->exit_code) {
603                 send_sig(current->exit_code, current, 1);
604                 current->exit_code = 0;
605         }
606 }
607
608 asmlinkage void syscall_trace_enter(struct pt_regs *regs)
609 {
610         /* do the secure computing check first */
611         secure_computing(regs->orig_rax);
612
613         if (test_thread_flag(TIF_SYSCALL_TRACE)
614             && (current->ptrace & PT_PTRACED))
615                 syscall_trace(regs);
616
617         if (unlikely(current->audit_context)) {
618                 if (test_thread_flag(TIF_IA32)) {
619                         audit_syscall_entry(AUDIT_ARCH_I386,
620                                             regs->orig_rax,
621                                             regs->rbx, regs->rcx,
622                                             regs->rdx, regs->rsi);
623                 } else {
624                         audit_syscall_entry(AUDIT_ARCH_X86_64,
625                                             regs->orig_rax,
626                                             regs->rdi, regs->rsi,
627                                             regs->rdx, regs->r10);
628                 }
629         }
630 }
631
632 asmlinkage void syscall_trace_leave(struct pt_regs *regs)
633 {
634         if (unlikely(current->audit_context))
635                 audit_syscall_exit(AUDITSC_RESULT(regs->rax), regs->rax);
636
637         if ((test_thread_flag(TIF_SYSCALL_TRACE)
638              || test_thread_flag(TIF_SINGLESTEP))
639             && (current->ptrace & PT_PTRACED))
640                 syscall_trace(regs);
641 }