Commit | Line | Data |
---|---|---|
0ba6fbbf MD |
1 | /*- |
2 | * Copyright (c) 2003 Kip Macy | |
3 | * All rights reserved. | |
4 | * | |
5 | * Redistribution and use in source and binary forms, with or without | |
6 | * modification, are permitted provided that the following conditions | |
7 | * are met: | |
8 | * 1. Redistributions of source code must retain the above copyright | |
9 | * notice, this list of conditions and the following disclaimer. | |
10 | * 2. Redistributions in binary form must reproduce the above copyright | |
11 | * notice, this list of conditions and the following disclaimer in the | |
12 | * documentation and/or other materials provided with the distribution. | |
13 | * | |
14 | * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND | |
15 | * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |
16 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |
17 | * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE | |
18 | * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |
19 | * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS | |
20 | * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) | |
21 | * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT | |
22 | * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY | |
23 | * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF | |
24 | * SUCH DAMAGE. | |
25 | * | |
08f2f1bb | 26 | * $DragonFly: src/sys/kern/kern_checkpoint.c,v 1.16 2007/02/03 17:05:57 corecode Exp $ |
0ba6fbbf MD |
27 | */ |
28 | ||
29 | #include <sys/types.h> | |
30 | #include <sys/param.h> | |
31 | #include <sys/proc.h> | |
32 | #include <sys/module.h> | |
33 | #include <sys/sysent.h> | |
34 | #include <sys/kernel.h> | |
35 | #include <sys/systm.h> | |
36 | #include <sys/nlookup.h> | |
37 | ||
38 | #include <sys/file.h> | |
39 | /* only on dragonfly */ | |
40 | #include <sys/file2.h> | |
41 | #include <sys/fcntl.h> | |
42 | #include <sys/signal.h> | |
43 | #include <vm/vm_param.h> | |
44 | #include <vm/vm.h> | |
45 | #include <sys/imgact_elf.h> | |
46 | #include <sys/procfs.h> | |
47 | ||
48 | #include <sys/lock.h> | |
49 | #include <vm/pmap.h> | |
50 | #include <vm/vm_map.h> | |
51 | #include <vm/vm_extern.h> | |
52 | #include <sys/mman.h> | |
53 | #include <sys/sysent.h> | |
54 | #include <sys/sysproto.h> | |
55 | #include <sys/resource.h> | |
56 | #include <sys/resourcevar.h> | |
57 | #include <sys/malloc.h> | |
58 | #include <sys/stat.h> | |
59 | #include <sys/uio.h> | |
60 | #include <sys/namei.h> | |
61 | #include <sys/vnode.h> | |
62 | #include <machine/limits.h> | |
f8334305 | 63 | #include <machine/frame.h> |
0ba6fbbf MD |
64 | #include <sys/signalvar.h> |
65 | #include <sys/syslog.h> | |
66 | #include <sys/sysctl.h> | |
f8334305 | 67 | #include <machine/sigframe.h> |
0ba6fbbf MD |
68 | #include <sys/exec.h> |
69 | #include <sys/unistd.h> | |
70 | #include <sys/time.h> | |
71 | #include <sys/kern_syscall.h> | |
72 | #include <sys/checkpoint.h> | |
73 | #include <sys/mount.h> | |
74 | #include <sys/ckpt.h> | |
75 | ||
76 | ||
77 | static int elf_loadphdrs(struct file *fp, Elf_Phdr *phdr, int numsegs); | |
78 | static int elf_getnotes(struct proc *p, struct file *fp, size_t notesz); | |
79 | static int elf_demarshalnotes(void *src, prpsinfo_t *psinfo, | |
80 | prstatus_t *status, prfpregset_t *fpregset, int nthreads); | |
81 | static int elf_loadnotes(struct proc *, prpsinfo_t *, prstatus_t *, | |
82 | prfpregset_t *); | |
83 | static int elf_getsigs(struct proc *p, struct file *fp); | |
84 | static int elf_getfiles(struct proc *p, struct file *fp); | |
85 | static int elf_gettextvp(struct proc *p, struct file *fp); | |
86 | static char *ckpt_expand_name(const char *name, uid_t uid, pid_t pid); | |
87 | ||
88 | static int ckptgroup = 0; /* wheel only, -1 for any group */ | |
89 | SYSCTL_INT(_kern, OID_AUTO, ckptgroup, CTLFLAG_RW, &ckptgroup, 0, ""); | |
90 | ||
91 | /* ref count to see how many processes that are being checkpointed */ | |
92 | static int chptinuse = 0; | |
93 | ||
94 | static __inline | |
95 | int | |
96 | read_check(struct file *fp, void *buf, size_t nbyte) | |
97 | { | |
98 | size_t nread; | |
99 | int error; | |
100 | ||
374a52ac | 101 | PRINTF(("reading %d bytes\n", nbyte)); |
e7440b28 | 102 | error = fp_read(fp, buf, nbyte, &nread, 1, UIO_SYSSPACE); |
0ba6fbbf MD |
103 | if (error) { |
104 | PRINTF(("read failed - %d", error)); | |
105 | } else if (nread != nbyte) { | |
106 | PRINTF(("wanted to read %d - read %d\n", nbyte, nread)); | |
107 | error = EINVAL; | |
108 | } | |
109 | return error; | |
110 | } | |
111 | ||
112 | static int | |
113 | elf_gethdr(struct file *fp, Elf_Ehdr *ehdr) | |
114 | { | |
115 | size_t nbyte = sizeof(Elf_Ehdr); | |
116 | int error; | |
117 | ||
118 | if ((error = read_check(fp, ehdr, nbyte)) != 0) | |
119 | goto done; | |
120 | if (!(ehdr->e_ehsize == sizeof(Elf_Ehdr))) { | |
121 | PRINTF(("wrong elf header size: %d\n" | |
122 | "expected size : %d\n", | |
123 | ehdr->e_ehsize, sizeof(Elf_Ehdr))); | |
124 | return EINVAL; | |
125 | } | |
126 | if (!(ehdr->e_phentsize == sizeof(Elf_Phdr))) { | |
127 | PRINTF(("wrong program header size: %d\n" | |
128 | "expected size : %d\n", | |
129 | ehdr->e_phentsize, sizeof(Elf_Phdr))); | |
130 | return EINVAL; | |
131 | } | |
132 | ||
133 | if (!(ehdr->e_ident[EI_MAG0] == ELFMAG0 && | |
134 | ehdr->e_ident[EI_MAG1] == ELFMAG1 && | |
135 | ehdr->e_ident[EI_MAG2] == ELFMAG2 && | |
136 | ehdr->e_ident[EI_MAG3] == ELFMAG3 && | |
137 | ehdr->e_ident[EI_CLASS] == ELF_CLASS && | |
138 | ehdr->e_ident[EI_DATA] == ELF_DATA && | |
139 | ehdr->e_ident[EI_VERSION] == EV_CURRENT && | |
140 | ehdr->e_ident[EI_OSABI] == ELFOSABI_FREEBSD && | |
141 | ehdr->e_ident[EI_ABIVERSION] == 0)) { | |
142 | PRINTF(("bad elf header\n there are %d segments\n", | |
143 | ehdr->e_phnum)); | |
144 | return EINVAL; | |
145 | ||
146 | } | |
147 | PRINTF(("Elf header size: %d\n", ehdr->e_ehsize)); | |
148 | PRINTF(("Program header size: %d\n", ehdr->e_phentsize)); | |
149 | PRINTF(("Number of Program headers: %d\n", ehdr->e_phnum)); | |
150 | done: | |
151 | return error; | |
152 | } | |
153 | ||
154 | static int | |
155 | elf_getphdrs(struct file *fp, Elf_Phdr *phdr, size_t nbyte) | |
156 | { | |
157 | int i; | |
158 | int error; | |
159 | int nheaders = nbyte/sizeof(Elf_Phdr); | |
160 | ||
161 | PRINTF(("reading phdrs section\n")); | |
162 | if ((error = read_check(fp, phdr, nbyte)) != 0) | |
163 | goto done; | |
6ea70f76 | 164 | kprintf("headers section:\n"); |
0ba6fbbf | 165 | for (i = 0; i < nheaders; i++) { |
6ea70f76 SW |
166 | kprintf("entry type: %d\n", phdr[i].p_type); |
167 | kprintf("file offset: %d\n", phdr[i].p_offset); | |
168 | kprintf("virt address: %p\n", (uint32_t *)phdr[i].p_vaddr); | |
169 | kprintf("file size: %d\n", phdr[i].p_filesz); | |
170 | kprintf("memory size: %d\n", phdr[i].p_memsz); | |
171 | kprintf("\n"); | |
0ba6fbbf MD |
172 | } |
173 | done: | |
174 | return error; | |
175 | } | |
176 | ||
177 | ||
178 | static int | |
179 | elf_getnotes(struct proc *p, struct file *fp, size_t notesz) | |
180 | { | |
181 | int error; | |
182 | int nthreads; | |
183 | char *note; | |
184 | prpsinfo_t *psinfo; | |
185 | prstatus_t *status; | |
186 | prfpregset_t *fpregset; | |
187 | ||
188 | nthreads = (notesz - sizeof(prpsinfo_t))/(sizeof(prstatus_t) + | |
189 | sizeof(prfpregset_t)); | |
190 | PRINTF(("reading notes header nthreads=%d\n", nthreads)); | |
191 | if (nthreads <= 0 || nthreads > CKPT_MAXTHREADS) | |
192 | return EINVAL; | |
193 | ||
efda3bd0 MD |
194 | psinfo = kmalloc(sizeof(prpsinfo_t), M_TEMP, M_ZERO | M_WAITOK); |
195 | status = kmalloc(nthreads*sizeof(prstatus_t), M_TEMP, M_WAITOK); | |
196 | fpregset = kmalloc(nthreads*sizeof(prfpregset_t), M_TEMP, M_WAITOK); | |
197 | note = kmalloc(notesz, M_TEMP, M_WAITOK); | |
0ba6fbbf MD |
198 | |
199 | ||
200 | PRINTF(("reading notes section\n")); | |
201 | if ((error = read_check(fp, note, notesz)) != 0) | |
202 | goto done; | |
203 | error = elf_demarshalnotes(note, psinfo, status, fpregset, nthreads); | |
204 | if (error) | |
205 | goto done; | |
206 | /* fetch register state from notes */ | |
207 | error = elf_loadnotes(p, psinfo, status, fpregset); | |
208 | done: | |
209 | if (psinfo) | |
efda3bd0 | 210 | kfree(psinfo, M_TEMP); |
0ba6fbbf | 211 | if (status) |
efda3bd0 | 212 | kfree(status, M_TEMP); |
0ba6fbbf | 213 | if (fpregset) |
efda3bd0 | 214 | kfree(fpregset, M_TEMP); |
0ba6fbbf | 215 | if (note) |
efda3bd0 | 216 | kfree(note, M_TEMP); |
0ba6fbbf MD |
217 | return error; |
218 | } | |
219 | ||
220 | static int | |
221 | ckpt_thaw_proc(struct proc *p, struct file *fp) | |
222 | { | |
223 | ||
224 | Elf_Phdr *phdr = NULL; | |
225 | Elf_Ehdr *ehdr = NULL; | |
226 | int error; | |
227 | size_t nbyte; | |
228 | ||
229 | TRACE_ENTER; | |
230 | ||
efda3bd0 | 231 | ehdr = kmalloc(sizeof(Elf_Ehdr), M_TEMP, M_ZERO | M_WAITOK); |
0ba6fbbf MD |
232 | |
233 | if ((error = elf_gethdr(fp, ehdr)) != 0) | |
234 | goto done; | |
235 | nbyte = sizeof(Elf_Phdr) * ehdr->e_phnum; | |
efda3bd0 | 236 | phdr = kmalloc(nbyte, M_TEMP, M_WAITOK); |
0ba6fbbf MD |
237 | |
238 | /* fetch description of program writable mappings */ | |
239 | if ((error = elf_getphdrs(fp, phdr, nbyte)) != 0) | |
240 | goto done; | |
241 | ||
242 | /* fetch notes section containing register state */ | |
243 | if ((error = elf_getnotes(p, fp, phdr->p_filesz)) != 0) | |
244 | goto done; | |
245 | ||
246 | /* fetch program text vnodes */ | |
247 | if ((error = elf_gettextvp(p, fp)) != 0) | |
248 | goto done; | |
249 | ||
250 | /* fetch signal disposition */ | |
4f12bfd3 | 251 | if ((error = elf_getsigs(p, fp)) != 0) { |
6ea70f76 | 252 | kprintf("failure in recovering signals\n"); |
0ba6fbbf | 253 | goto done; |
4f12bfd3 | 254 | } |
0ba6fbbf MD |
255 | |
256 | /* fetch open files */ | |
257 | if ((error = elf_getfiles(p, fp)) != 0) | |
258 | goto done; | |
259 | ||
260 | /* handle mappings last in case we are reading from a socket */ | |
261 | error = elf_loadphdrs(fp, phdr, ehdr->e_phnum); | |
262 | ||
263 | /* | |
264 | * Set the textvp to the checkpoint file and mark the vnode so | |
265 | * a future checkpointing of this checkpoint-restored program | |
266 | * will copy out the contents of the mappings rather then trying | |
267 | * to record the vnode info related to the checkpoint file, which | |
268 | * is likely going to be destroyed when the program is re-checkpointed. | |
269 | */ | |
270 | if (error == 0 && fp->f_data && fp->f_type == DTYPE_VNODE) { | |
271 | if (p->p_textvp) | |
272 | vrele(p->p_textvp); | |
273 | p->p_textvp = (struct vnode *)fp->f_data; | |
274 | p->p_textvp->v_flag |= VCKPT; | |
275 | vref(p->p_textvp); | |
276 | } | |
277 | done: | |
278 | if (ehdr) | |
efda3bd0 | 279 | kfree(ehdr, M_TEMP); |
0ba6fbbf | 280 | if (phdr) |
efda3bd0 | 281 | kfree(phdr, M_TEMP); |
0ba6fbbf MD |
282 | TRACE_EXIT; |
283 | return error; | |
284 | } | |
285 | ||
286 | static int | |
287 | elf_loadnotes(struct proc *p, prpsinfo_t *psinfo, prstatus_t *status, | |
288 | prfpregset_t *fpregset) | |
289 | { | |
08f2f1bb | 290 | struct lwp *lp; |
0ba6fbbf MD |
291 | int error; |
292 | ||
293 | /* validate status and psinfo */ | |
294 | TRACE_ENTER; | |
295 | if (status->pr_version != PRSTATUS_VERSION || | |
296 | status->pr_statussz != sizeof(prstatus_t) || | |
297 | status->pr_gregsetsz != sizeof(gregset_t) || | |
298 | status->pr_fpregsetsz != sizeof(fpregset_t) || | |
299 | psinfo->pr_version != PRPSINFO_VERSION || | |
300 | psinfo->pr_psinfosz != sizeof(prpsinfo_t)) { | |
301 | PRINTF(("status check failed\n")); | |
302 | error = EINVAL; | |
303 | goto done; | |
304 | } | |
08f2f1bb SS |
305 | /* XXX lwp */ |
306 | lp = FIRST_LWP_IN_PROC(p); | |
307 | if ((error = set_regs(lp, &status->pr_reg)) != 0) | |
0ba6fbbf | 308 | goto done; |
08f2f1bb | 309 | error = set_fpregs(lp, fpregset); |
0ba6fbbf MD |
310 | strlcpy(p->p_comm, psinfo->pr_fname, sizeof(p->p_comm)); |
311 | /* XXX psinfo->pr_psargs not yet implemented */ | |
312 | done: | |
313 | TRACE_EXIT; | |
314 | return error; | |
315 | } | |
316 | ||
317 | static int | |
318 | elf_getnote(void *src, size_t *off, const char *name, unsigned int type, | |
319 | void **desc, size_t descsz) | |
320 | { | |
321 | Elf_Note note; | |
322 | int error; | |
323 | ||
324 | TRACE_ENTER; | |
325 | if (src == NULL) { | |
326 | error = EFAULT; | |
327 | goto done; | |
328 | } | |
329 | bcopy((char *)src + *off, ¬e, sizeof note); | |
330 | ||
331 | PRINTF(("at offset: %d expected note of type: %d - got: %d\n", | |
332 | *off, type, note.n_type)); | |
333 | *off += sizeof note; | |
334 | if (type != note.n_type) { | |
335 | TRACE_ERR; | |
336 | error = EINVAL; | |
337 | goto done; | |
338 | } | |
339 | if (strncmp(name, (char *) src + *off, note.n_namesz) != 0) { | |
340 | error = EINVAL; | |
341 | goto done; | |
342 | } | |
343 | *off += roundup2(note.n_namesz, sizeof(Elf_Size)); | |
344 | if (note.n_descsz != descsz) { | |
345 | TRACE_ERR; | |
346 | error = EINVAL; | |
347 | goto done; | |
348 | } | |
349 | if (desc) | |
350 | bcopy((char *)src + *off, *desc, note.n_descsz); | |
351 | *off += roundup2(note.n_descsz, sizeof(Elf_Size)); | |
352 | error = 0; | |
353 | done: | |
354 | TRACE_EXIT; | |
355 | return error; | |
356 | } | |
357 | ||
358 | static int | |
359 | elf_demarshalnotes(void *src, prpsinfo_t *psinfo, prstatus_t *status, | |
360 | prfpregset_t *fpregset, int nthreads) | |
361 | { | |
362 | int i; | |
363 | int error; | |
364 | int off = 0; | |
365 | ||
366 | TRACE_ENTER; | |
367 | error = elf_getnote(src, &off, "FreeBSD", NT_PRSTATUS, | |
368 | (void **)&status, sizeof(prstatus_t)); | |
369 | if (error) | |
370 | goto done; | |
371 | error = elf_getnote(src, &off, "FreeBSD", NT_FPREGSET, | |
372 | (void **)&fpregset, sizeof(prfpregset_t)); | |
373 | if (error) | |
374 | goto done; | |
375 | error = elf_getnote(src, &off, "FreeBSD", NT_PRPSINFO, | |
376 | (void **)&psinfo, sizeof(prpsinfo_t)); | |
377 | if (error) | |
378 | goto done; | |
379 | ||
380 | /* | |
381 | * The remaining portion needs to be an integer multiple | |
382 | * of prstatus_t and prfpregset_t | |
383 | */ | |
384 | for (i = 0 ; i < nthreads - 1; i++) { | |
385 | status++; fpregset++; | |
386 | error = elf_getnote(src, &off, "FreeBSD", NT_PRSTATUS, | |
387 | (void **)&status, sizeof (prstatus_t)); | |
388 | if (error) | |
389 | goto done; | |
390 | error = elf_getnote(src, &off, "FreeBSD", NT_FPREGSET, | |
391 | (void **)&fpregset, sizeof(prfpregset_t)); | |
392 | if (error) | |
393 | goto done; | |
394 | } | |
395 | ||
396 | done: | |
397 | TRACE_EXIT; | |
398 | return error; | |
399 | } | |
400 | ||
401 | ||
402 | static int | |
403 | mmap_phdr(struct file *fp, Elf_Phdr *phdr) | |
404 | { | |
405 | int error; | |
406 | size_t len; | |
407 | int prot; | |
408 | void *addr; | |
409 | int flags; | |
410 | off_t pos; | |
411 | ||
412 | TRACE_ENTER; | |
413 | pos = phdr->p_offset; | |
414 | len = phdr->p_filesz; | |
415 | addr = (void *)phdr->p_vaddr; | |
416 | flags = MAP_FIXED | MAP_NOSYNC | MAP_PRIVATE; | |
417 | prot = 0; | |
418 | if (phdr->p_flags & PF_R) | |
419 | prot |= PROT_READ; | |
420 | if (phdr->p_flags & PF_W) | |
421 | prot |= PROT_WRITE; | |
422 | if (phdr->p_flags & PF_X) | |
423 | prot |= PROT_EXEC; | |
424 | if ((error = fp_mmap(addr, len, prot, flags, fp, pos, &addr)) != 0) { | |
425 | PRINTF(("mmap failed: %d\n", error); ); | |
426 | } | |
374a52ac HP |
427 | PRINTF(("map @%08x-%08x fileoff %08x-%08x\n", (int)addr, |
428 | (int)((char *)addr + len), (int)pos, (int)(pos + len))); | |
0ba6fbbf MD |
429 | TRACE_EXIT; |
430 | return error; | |
431 | } | |
432 | ||
433 | ||
434 | static int | |
435 | elf_loadphdrs(struct file *fp, Elf_Phdr *phdr, int numsegs) | |
436 | { | |
437 | int i; | |
438 | int error = 0; | |
439 | ||
440 | TRACE_ENTER; | |
441 | for (i = 1; i < numsegs; i++) { | |
442 | if ((error = mmap_phdr(fp, &phdr[i])) != 0) | |
443 | break; | |
444 | } | |
445 | TRACE_EXIT; | |
446 | return error; | |
447 | } | |
448 | ||
449 | static int | |
450 | elf_getsigs(struct proc *p, struct file *fp) | |
451 | { | |
452 | int error; | |
453 | struct ckpt_siginfo *csi; | |
454 | struct sigacts *tmpsigacts; | |
08f2f1bb | 455 | struct lwp *lp; |
0ba6fbbf MD |
456 | |
457 | TRACE_ENTER; | |
efda3bd0 | 458 | csi = kmalloc(sizeof(struct ckpt_siginfo), M_TEMP, M_ZERO | M_WAITOK); |
0ba6fbbf MD |
459 | if ((error = read_check(fp, csi, sizeof(struct ckpt_siginfo))) != 0) |
460 | goto done; | |
461 | ||
462 | if (csi->csi_ckptpisz != sizeof(struct ckpt_siginfo)) { | |
463 | TRACE_ERR; | |
464 | error = EINVAL; | |
465 | goto done; | |
466 | } | |
467 | tmpsigacts = p->p_procsig->ps_sigacts; | |
468 | bcopy(&csi->csi_procsig, p->p_procsig, sizeof(struct procsig)); | |
469 | p->p_procsig->ps_sigacts = tmpsigacts; | |
470 | bcopy(&csi->csi_sigacts, p->p_procsig->ps_sigacts, sizeof(struct sigacts)); | |
471 | bcopy(&csi->csi_itimerval, &p->p_realtimer, sizeof(struct itimerval)); | |
4f12bfd3 | 472 | SIG_CANTMASK(csi->csi_sigmask); |
08f2f1bb SS |
473 | /* XXX lwp */ |
474 | lp = FIRST_LWP_IN_PROC(p); | |
475 | bcopy(&csi->csi_sigmask, &lp->lwp_sigmask, sizeof(sigset_t)); | |
0ba6fbbf MD |
476 | p->p_sigparent = csi->csi_sigparent; |
477 | done: | |
478 | if (csi) | |
efda3bd0 | 479 | kfree(csi, M_TEMP); |
0ba6fbbf MD |
480 | TRACE_EXIT; |
481 | return error; | |
482 | } | |
483 | ||
484 | /* | |
485 | * Returns a locked, refd vnode | |
486 | */ | |
487 | static int | |
488 | ckpt_fhtovp(fhandle_t *fh, struct vnode **vpp) | |
489 | { | |
490 | struct mount *mp; | |
491 | int error; | |
492 | ||
493 | TRACE_ENTER; | |
494 | mp = vfs_getvfs(&fh->fh_fsid); | |
495 | ||
496 | if (!mp) { | |
497 | TRACE_ERR; | |
498 | PRINTF(("failed to get mount - ESTALE\n")); | |
499 | TRACE_EXIT; | |
500 | return ESTALE; | |
501 | } | |
502 | error = VFS_FHTOVP(mp, &fh->fh_fid, vpp); | |
503 | if (error) { | |
504 | PRINTF(("failed with: %d\n", error)); | |
505 | TRACE_ERR; | |
506 | TRACE_EXIT; | |
507 | return error; | |
508 | } | |
509 | TRACE_EXIT; | |
510 | return 0; | |
511 | } | |
512 | ||
513 | static int | |
514 | mmap_vp(struct vn_hdr *vnh) | |
515 | { | |
516 | struct vnode *vp; | |
517 | Elf_Phdr *phdr; | |
518 | struct file *fp; | |
519 | int error; | |
520 | TRACE_ENTER; | |
521 | ||
522 | phdr = &vnh->vnh_phdr; | |
523 | ||
524 | if ((error = ckpt_fhtovp(&vnh->vnh_fh, &vp)) != 0) | |
525 | return error; | |
526 | /* | |
527 | * XXX O_RDONLY -> or O_RDWR if file is PROT_WRITE, MAP_SHARED | |
528 | */ | |
529 | if ((error = fp_vpopen(vp, O_RDONLY, &fp)) != 0) { | |
530 | vput(vp); | |
531 | return error; | |
532 | } | |
533 | error = mmap_phdr(fp, phdr); | |
534 | fp_close(fp); | |
535 | TRACE_EXIT; | |
536 | return error; | |
537 | } | |
538 | ||
539 | ||
540 | static int | |
541 | elf_gettextvp(struct proc *p, struct file *fp) | |
542 | { | |
543 | int i; | |
544 | int error; | |
545 | int vpcount; | |
546 | struct ckpt_vminfo vminfo; | |
547 | struct vn_hdr *vnh = NULL; | |
548 | ||
549 | TRACE_ENTER; | |
550 | if ((error = read_check(fp, &vminfo, sizeof(vminfo))) != 0) | |
551 | goto done; | |
552 | if (vminfo.cvm_dsize < 0 || | |
553 | vminfo.cvm_dsize > p->p_rlimit[RLIMIT_DATA].rlim_cur || | |
554 | vminfo.cvm_tsize < 0 || | |
555 | (u_quad_t)vminfo.cvm_tsize > maxtsiz || | |
88181b08 MD |
556 | vminfo.cvm_daddr >= (caddr_t)VM_MAX_USER_ADDRESS || |
557 | vminfo.cvm_taddr >= (caddr_t)VM_MAX_USER_ADDRESS | |
0ba6fbbf MD |
558 | ) { |
559 | error = ERANGE; | |
560 | goto done; | |
561 | } | |
562 | ||
563 | vmspace_exec(p, NULL); | |
564 | p->p_vmspace->vm_daddr = vminfo.cvm_daddr; | |
565 | p->p_vmspace->vm_dsize = vminfo.cvm_dsize; | |
566 | p->p_vmspace->vm_taddr = vminfo.cvm_taddr; | |
567 | p->p_vmspace->vm_tsize = vminfo.cvm_tsize; | |
568 | if ((error = read_check(fp, &vpcount, sizeof(int))) != 0) | |
569 | goto done; | |
efda3bd0 | 570 | vnh = kmalloc(sizeof(struct vn_hdr) * vpcount, M_TEMP, M_WAITOK); |
0ba6fbbf MD |
571 | if ((error = read_check(fp, vnh, sizeof(struct vn_hdr)*vpcount)) != 0) |
572 | goto done; | |
573 | for (i = 0; i < vpcount; i++) { | |
574 | if ((error = mmap_vp(&vnh[i])) != 0) | |
575 | goto done; | |
576 | } | |
577 | ||
578 | done: | |
579 | if (vnh) | |
efda3bd0 | 580 | kfree(vnh, M_TEMP); |
0ba6fbbf MD |
581 | TRACE_EXIT; |
582 | return error; | |
583 | } | |
584 | ||
585 | ||
586 | ||
587 | /* place holder */ | |
588 | static int | |
589 | elf_getfiles(struct proc *p, struct file *fp) | |
590 | { | |
591 | int error; | |
592 | int i; | |
593 | int filecount; | |
594 | int fd; | |
595 | struct ckpt_filehdr filehdr; | |
596 | struct ckpt_fileinfo *cfi_base = NULL; | |
597 | struct vnode *vp; | |
598 | struct file *tempfp; | |
599 | struct file *ofp; | |
600 | ||
601 | TRACE_ENTER; | |
602 | if ((error = read_check(fp, &filehdr, sizeof(filehdr))) != 0) | |
603 | goto done; | |
604 | filecount = filehdr.cfh_nfiles; | |
efda3bd0 | 605 | cfi_base = kmalloc(filecount*sizeof(struct ckpt_fileinfo), M_TEMP, M_WAITOK); |
0ba6fbbf MD |
606 | error = read_check(fp, cfi_base, filecount*sizeof(struct ckpt_fileinfo)); |
607 | if (error) | |
608 | goto done; | |
609 | ||
610 | /* | |
4f12bfd3 | 611 | * Close all file descriptors >= 3. These descriptors are from the |
0ba6fbbf | 612 | * checkpt(1) program itself and should not be retained. |
4f12bfd3 MD |
613 | * |
614 | * XXX we need a flag so a checkpoint restore can opt to supply the | |
615 | * descriptors, or the non-regular-file descripors. | |
0ba6fbbf MD |
616 | */ |
617 | for (i = 3; i < p->p_fd->fd_nfiles; ++i) | |
618 | kern_close(i); | |
619 | ||
620 | /* | |
621 | * Scan files to load | |
622 | */ | |
623 | for (i = 0; i < filecount; i++) { | |
624 | struct ckpt_fileinfo *cfi= &cfi_base[i]; | |
625 | /* | |
626 | * Ignore placeholder entries where cfi_index is less then | |
627 | * zero. This will occur if the elf core dump code thinks | |
628 | * it can save a vnode but winds up not being able to. | |
629 | */ | |
630 | if (cfi->cfi_index < 0) | |
631 | continue; | |
632 | ||
633 | if ((error = ckpt_fhtovp(&cfi->cfi_fh, &vp)) != 0) | |
634 | break; | |
635 | if ((error = fp_vpopen(vp, OFLAGS(cfi->cfi_flags), &tempfp)) != 0) { | |
636 | vput(vp); | |
637 | break; | |
638 | } | |
639 | tempfp->f_offset = cfi->cfi_offset; | |
640 | ||
641 | /* | |
642 | * If overwriting a descriptor close the old descriptor. This | |
643 | * only occurs if the saved core saved descriptors that we | |
644 | * have not already closed. | |
645 | */ | |
646 | if (cfi->cfi_index < p->p_fd->fd_nfiles && | |
0679adc4 | 647 | (ofp = p->p_fd->fd_files[cfi->cfi_index].fp) != NULL) { |
0ba6fbbf MD |
648 | kern_close(cfi->cfi_index); |
649 | } | |
650 | ||
651 | /* | |
652 | * Allocate the descriptor we want. | |
653 | */ | |
654 | if (fdalloc(p, cfi->cfi_index, &fd) != 0) { | |
655 | PRINTF(("can't currently restore fd: %d\n", | |
656 | cfi->cfi_index)); | |
657 | fp_close(fp); | |
658 | goto done; | |
659 | } | |
660 | KKASSERT(fd == cfi->cfi_index); | |
259b8ea0 MD |
661 | fsetfd(p, tempfp, fd); |
662 | fdrop(tempfp); | |
0ba6fbbf MD |
663 | cfi++; |
664 | PRINTF(("restoring %d\n", cfi->cfi_index)); | |
665 | } | |
666 | ||
667 | done: | |
668 | if (cfi_base) | |
efda3bd0 | 669 | kfree(cfi_base, M_TEMP); |
0ba6fbbf MD |
670 | TRACE_EXIT; |
671 | return error; | |
672 | } | |
673 | ||
674 | static int | |
675 | ckpt_freeze_proc (struct proc *p, struct file *fp) | |
676 | { | |
677 | rlim_t limit; | |
678 | int error; | |
679 | ||
680 | PRINTF(("calling generic_elf_coredump\n")); | |
681 | limit = p->p_rlimit[RLIMIT_CORE].rlim_cur; | |
682 | if (limit) { | |
683 | error = generic_elf_coredump(p, fp, limit); | |
684 | } else { | |
685 | error = ERANGE; | |
686 | } | |
687 | return error; | |
688 | } | |
689 | ||
690 | int | |
753fd850 | 691 | sys_sys_checkpoint(struct sys_checkpoint_args *uap) |
0ba6fbbf MD |
692 | { |
693 | int error = 0; | |
694 | struct proc *p = curthread->td_proc; | |
695 | struct file *fp; | |
696 | ||
697 | /* | |
698 | * Only certain groups (to reduce our security exposure). -1 | |
699 | * allows any group. | |
700 | */ | |
701 | if (ckptgroup >= 0 && groupmember(ckptgroup, p->p_ucred) == 0) | |
702 | return (EPERM); | |
703 | ||
704 | /* | |
705 | * For now we can only checkpoint the current process | |
706 | */ | |
707 | if (uap->pid != -1 && uap->pid != p->p_pid) | |
708 | return (EINVAL); | |
709 | ||
710 | switch (uap->type) { | |
711 | case CKPT_FREEZE: | |
712 | fp = NULL; | |
713 | if (uap->fd == -1 && uap->pid == (pid_t)-1) | |
714 | error = checkpoint_signal_handler(p); | |
715 | else if ((fp = holdfp(p->p_fd, uap->fd, FWRITE)) == NULL) | |
716 | error = EBADF; | |
717 | else | |
718 | error = ckpt_freeze_proc(p, fp); | |
719 | if (fp) | |
9f87144f | 720 | fdrop(fp); |
0ba6fbbf MD |
721 | break; |
722 | case CKPT_THAW: | |
723 | if (uap->pid != -1) | |
724 | return EINVAL; | |
725 | if ((fp = holdfp(p->p_fd, uap->fd, FREAD)) == NULL) | |
726 | return EBADF; | |
727 | uap->sysmsg_result = uap->retval; | |
728 | error = ckpt_thaw_proc(p, fp); | |
9f87144f | 729 | fdrop(fp); |
0ba6fbbf MD |
730 | break; |
731 | default: | |
732 | error = EOPNOTSUPP; | |
733 | break; | |
734 | } | |
735 | return error; | |
736 | } | |
737 | ||
738 | int | |
739 | checkpoint_signal_handler(struct proc *p) | |
740 | { | |
741 | char *buf; | |
742 | struct file *fp; | |
743 | struct nlookupdata nd; | |
744 | int error; | |
745 | ||
746 | chptinuse++; | |
747 | ||
748 | /* | |
749 | * Being able to checkpoint an suid or sgid program is not a good | |
750 | * idea. | |
751 | */ | |
752 | if (sugid_coredump == 0 && (p->p_flag & P_SUGID)) { | |
753 | chptinuse--; | |
754 | return (EPERM); | |
755 | } | |
756 | ||
757 | buf = ckpt_expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); | |
758 | if (buf == NULL) { | |
759 | chptinuse--; | |
760 | return (ENOMEM); | |
761 | } | |
762 | ||
763 | log(LOG_INFO, "pid %d (%s), uid %d: checkpointing to %s\n", | |
764 | p->p_pid, p->p_comm, | |
765 | (p->p_ucred ? p->p_ucred->cr_uid : -1), | |
766 | buf); | |
767 | ||
768 | PRINTF(("ckpt handler called, using '%s'\n", buf)); | |
769 | ||
770 | /* | |
771 | * Use the same safety flags that the coredump code uses. Remove | |
772 | * any previous checkpoint file before writing out the new one in | |
773 | * case we are re-checkpointing a program that had been checkpt | |
774 | * restored. Otherwise we will corrupt the program space (which is | |
775 | * made up of mmap()ings of the previous checkpoint file) while we | |
776 | * write out the new one. | |
777 | */ | |
778 | error = nlookup_init(&nd, buf, UIO_SYSSPACE, 0); | |
779 | if (error == 0) | |
780 | error = kern_unlink(&nd); | |
781 | nlookup_done(&nd); | |
782 | error = fp_open(buf, O_WRONLY|O_CREAT|O_TRUNC|O_NOFOLLOW, 0600, &fp); | |
783 | if (error == 0) { | |
784 | error = ckpt_freeze_proc(p, fp); | |
785 | fp_close(fp); | |
786 | } else { | |
6ea70f76 | 787 | kprintf("checkpoint failed with open - error: %d\n", error); |
0ba6fbbf | 788 | } |
efda3bd0 | 789 | kfree(buf, M_TEMP); |
0ba6fbbf MD |
790 | chptinuse--; |
791 | return (error); | |
792 | } | |
793 | ||
794 | static char ckptfilename[MAXPATHLEN] = {"%N.ckpt"}; | |
795 | SYSCTL_STRING(_kern, OID_AUTO, ckptfile, CTLFLAG_RW, ckptfilename, | |
796 | sizeof(ckptfilename), "process checkpoint name format string"); | |
797 | ||
798 | /* | |
799 | * expand_name(name, uid, pid) | |
800 | * Expand the name described in corefilename, using name, uid, and pid. | |
6ea70f76 | 801 | * corefilename is a kprintf-like string, with three format specifiers: |
0ba6fbbf MD |
802 | * %N name of process ("name") |
803 | * %P process id (pid) | |
804 | * %U user id (uid) | |
805 | * For example, "%N.core" is the default; they can be disabled completely | |
806 | * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". | |
807 | * This is controlled by the sysctl variable kern.corefile (see above). | |
808 | * | |
809 | * -- taken from the coredump code | |
810 | */ | |
811 | ||
812 | static | |
813 | char * | |
814 | ckpt_expand_name(const char *name, uid_t uid, pid_t pid) | |
815 | { | |
816 | char *temp; | |
817 | char *bp; | |
818 | char buf[11]; /* Buffer for pid/uid -- max 4B */ | |
819 | int error; | |
820 | int i; | |
821 | int n; | |
822 | char *format = ckptfilename; | |
823 | size_t namelen; | |
824 | ||
efda3bd0 | 825 | temp = kmalloc(MAXPATHLEN + 1, M_TEMP, M_NOWAIT); |
0ba6fbbf MD |
826 | if (temp == NULL) |
827 | return NULL; | |
828 | namelen = strlen(name); | |
829 | n = 0; | |
830 | if (ckptfilename[0] != '/') { | |
831 | if ((bp = kern_getcwd(temp, MAXPATHLEN - 1, &error)) == NULL) { | |
efda3bd0 | 832 | kfree(temp, M_TEMP); |
0ba6fbbf MD |
833 | return NULL; |
834 | } | |
835 | n = strlen(bp); | |
836 | bcopy(bp, temp, n + 1); /* normalize location of the path */ | |
837 | temp[n++] = '/'; | |
838 | temp[n] = '\0'; | |
839 | } | |
840 | for (i= 0; n < MAXPATHLEN && format[i]; i++) { | |
841 | int l; | |
842 | switch (format[i]) { | |
843 | case '%': /* Format character */ | |
844 | i++; | |
845 | switch (format[i]) { | |
846 | case '%': | |
847 | temp[n++] = '%'; | |
848 | break; | |
849 | case 'N': /* process name */ | |
850 | if ((n + namelen) > MAXPATHLEN) { | |
851 | log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n", | |
852 | pid, name, uid, temp, name); | |
efda3bd0 | 853 | kfree(temp, M_TEMP); |
0ba6fbbf MD |
854 | return NULL; |
855 | } | |
856 | memcpy(temp+n, name, namelen); | |
857 | n += namelen; | |
858 | break; | |
859 | case 'P': /* process id */ | |
f8c7a42d | 860 | l = ksprintf(buf, "%u", pid); |
0ba6fbbf MD |
861 | if ((n + l) > MAXPATHLEN) { |
862 | log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n", | |
863 | pid, name, uid, temp, name); | |
efda3bd0 | 864 | kfree(temp, M_TEMP); |
0ba6fbbf MD |
865 | return NULL; |
866 | } | |
867 | memcpy(temp+n, buf, l); | |
868 | n += l; | |
869 | break; | |
870 | case 'U': /* user id */ | |
f8c7a42d | 871 | l = ksprintf(buf, "%u", uid); |
0ba6fbbf MD |
872 | if ((n + l) > MAXPATHLEN) { |
873 | log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n", | |
874 | pid, name, uid, temp, name); | |
efda3bd0 | 875 | kfree(temp, M_TEMP); |
0ba6fbbf MD |
876 | return NULL; |
877 | } | |
878 | memcpy(temp+n, buf, l); | |
879 | n += l; | |
880 | break; | |
881 | default: | |
882 | log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); | |
883 | } | |
884 | break; | |
885 | default: | |
886 | temp[n++] = format[i]; | |
887 | } | |
888 | } | |
889 | temp[n] = '\0'; | |
890 | return temp; | |
891 | } | |
892 |