nrelease - fix/improve livecd
[dragonfly.git] / sys / kern / kern_checkpoint.c
... / ...
CommitLineData
1/*-
2 * Copyright (c) 2003 Kip Macy
3 * All rights reserved.
4 *
5 * Redistribution and use in source and binary forms, with or without
6 * modification, are permitted provided that the following conditions
7 * are met:
8 * 1. Redistributions of source code must retain the above copyright
9 * notice, this list of conditions and the following disclaimer.
10 * 2. Redistributions in binary form must reproduce the above copyright
11 * notice, this list of conditions and the following disclaimer in the
12 * documentation and/or other materials provided with the distribution.
13 *
14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
24 * SUCH DAMAGE.
25 */
26
27#include <sys/types.h>
28#include <sys/param.h>
29#include <sys/proc.h>
30#include <sys/module.h>
31#include <sys/sysent.h>
32#include <sys/kernel.h>
33#include <sys/systm.h>
34#include <sys/nlookup.h>
35
36#include <sys/file.h>
37#include <sys/fcntl.h>
38#include <sys/signal.h>
39#include <vm/vm_param.h>
40#include <vm/vm.h>
41#include <sys/imgact_elf.h>
42#include <sys/procfs.h>
43
44#include <sys/lock.h>
45#include <vm/pmap.h>
46#include <vm/vm_map.h>
47#include <vm/vm_extern.h>
48#include <sys/mman.h>
49#include <sys/sysmsg.h>
50#include <sys/resource.h>
51#include <sys/resourcevar.h>
52#include <sys/malloc.h>
53#include <sys/stat.h>
54#include <sys/uio.h>
55#include <sys/vnode.h>
56#include <machine/inttypes.h>
57#include <machine/limits.h>
58#include <machine/frame.h>
59#include <sys/signalvar.h>
60#include <sys/syslog.h>
61#include <sys/sysctl.h>
62#include <machine/sigframe.h>
63#include <sys/exec.h>
64#include <sys/unistd.h>
65#include <sys/time.h>
66#include <sys/kern_syscall.h>
67#include <sys/checkpoint.h>
68#include <sys/mount.h>
69#include <sys/ckpt.h>
70
71#include <sys/mplock2.h>
72#include <sys/file2.h>
73
74static int elf_loadphdrs(struct file *fp, Elf_Phdr *phdr, int numsegs);
75static int elf_getnotes(struct lwp *lp, struct file *fp, size_t notesz);
76static int elf_demarshalnotes(void *src, prpsinfo_t *psinfo,
77 prstatus_t *status, prfpregset_t *fpregset, int nthreads);
78static int elf_loadnotes(struct lwp *, prpsinfo_t *, prstatus_t *,
79 prfpregset_t *);
80static int elf_getsigs(struct lwp *lp, struct file *fp);
81static int elf_getfiles(struct lwp *lp, struct file *fp);
82static int elf_gettextvp(struct proc *p, struct file *fp);
83static char *ckpt_expand_name(const char *name, uid_t uid, pid_t pid);
84
/*
 * Group id the caller must be a member of in order to use
 * sys_sys_checkpoint(); a negative value disables the group check.
 * Exported read/write through sysctl as "ckptgroup" under the kern node.
 */
85static int ckptgroup = 0; /* wheel only, -1 for any group */
86SYSCTL_INT(_kern, OID_AUTO, ckptgroup, CTLFLAG_RW, &ckptgroup, 0, "");
87
88/*
 * Reference count of checkpoints currently in progress.  It is incremented
 * on entry to checkpoint_signal_handler() and decremented on each of its
 * return paths.
 */
89static int chptinuse = 0;
90
91static __inline
92int
93read_check(struct file *fp, void *buf, size_t nbyte)
94{
95 size_t nread;
96 int error;
97
98 PRINTF(("reading %zd bytes\n", nbyte));
99 error = fp_read(fp, buf, nbyte, &nread, 1, UIO_SYSSPACE);
100 if (error) {
101 PRINTF(("read failed - %d", error));
102 } else if (nread != nbyte) {
103 PRINTF(("wanted to read %zd - read %zd\n", nbyte, nread));
104 error = EINVAL;
105 }
106 return error;
107}
108
109static int
110elf_gethdr(struct file *fp, Elf_Ehdr *ehdr)
111{
112 size_t nbyte = sizeof(Elf_Ehdr);
113 int error;
114
115 if ((error = read_check(fp, ehdr, nbyte)) != 0)
116 goto done;
117 if (!(ehdr->e_ehsize == sizeof(Elf_Ehdr))) {
118 PRINTF(("wrong elf header size: %d\n"
119 "expected size : %zd\n",
120 ehdr->e_ehsize, sizeof(Elf_Ehdr)));
121 return EINVAL;
122 }
123 if (!(ehdr->e_phentsize == sizeof(Elf_Phdr))) {
124 PRINTF(("wrong program header size: %d\n"
125 "expected size : %zd\n",
126 ehdr->e_phentsize, sizeof(Elf_Phdr)));
127 return EINVAL;
128 }
129
130 if (!(ehdr->e_ident[EI_MAG0] == ELFMAG0 &&
131 ehdr->e_ident[EI_MAG1] == ELFMAG1 &&
132 ehdr->e_ident[EI_MAG2] == ELFMAG2 &&
133 ehdr->e_ident[EI_MAG3] == ELFMAG3 &&
134 ehdr->e_ident[EI_CLASS] == ELF_CLASS &&
135 ehdr->e_ident[EI_DATA] == ELF_DATA &&
136 ehdr->e_ident[EI_VERSION] == EV_CURRENT &&
137 ehdr->e_ident[EI_OSABI] == ELFOSABI_NONE &&
138 ehdr->e_ident[EI_ABIVERSION] == 0)) {
139 PRINTF(("bad elf header\n there are %d segments\n",
140 ehdr->e_phnum));
141 return EINVAL;
142
143 }
144 PRINTF(("Elf header size: %d\n", ehdr->e_ehsize));
145 PRINTF(("Program header size: %d\n", ehdr->e_phentsize));
146 PRINTF(("Number of Program headers: %d\n", ehdr->e_phnum));
147 done:
148 return error;
149}
150
151static int
152elf_getphdrs(struct file *fp, Elf_Phdr *phdr, size_t nbyte)
153{
154 int i;
155 int error;
156 int nheaders = nbyte/sizeof(Elf_Phdr);
157
158 PRINTF(("reading phdrs section\n"));
159 if ((error = read_check(fp, phdr, nbyte)) != 0)
160 goto done;
161 PRINTF(("headers section:\n"));
162 for (i = 0; i < nheaders; i++) {
163 PRINTF(("entry type: %d\n", phdr[i].p_type));
164 PRINTF(("file offset: %jd\n", (intmax_t)phdr[i].p_offset));
165 PRINTF(("virt address: %p\n", (uint32_t *)phdr[i].p_vaddr));
166 PRINTF(("file size: %jd\n", (intmax_t)phdr[i].p_filesz));
167 PRINTF(("memory size: %jd\n", (intmax_t)phdr[i].p_memsz));
168 PRINTF(("\n"));
169 }
170 done:
171 return error;
172}
173
174
175static int
176elf_getnotes(struct lwp *lp, struct file *fp, size_t notesz)
177{
178 int error;
179 int nthreads;
180 char *note;
181 prpsinfo_t *psinfo;
182 prstatus_t *status;
183 prfpregset_t *fpregset;
184
185 nthreads = (notesz - sizeof(prpsinfo_t))/(sizeof(prstatus_t) +
186 sizeof(prfpregset_t));
187 PRINTF(("reading notes header nthreads=%d\n", nthreads));
188 if (nthreads <= 0 || nthreads > CKPT_MAXTHREADS)
189 return EINVAL;
190
191 psinfo = kmalloc(sizeof(prpsinfo_t), M_TEMP, M_ZERO | M_WAITOK);
192 status = kmalloc(nthreads*sizeof(prstatus_t), M_TEMP, M_WAITOK);
193 fpregset = kmalloc(nthreads*sizeof(prfpregset_t), M_TEMP, M_WAITOK);
194 note = kmalloc(notesz, M_TEMP, M_WAITOK);
195
196
197 PRINTF(("reading notes section\n"));
198 if ((error = read_check(fp, note, notesz)) != 0)
199 goto done;
200 error = elf_demarshalnotes(note, psinfo, status, fpregset, nthreads);
201 if (error)
202 goto done;
203 /* fetch register state from notes */
204 error = elf_loadnotes(lp, psinfo, status, fpregset);
205 done:
206 if (psinfo)
207 kfree(psinfo, M_TEMP);
208 if (status)
209 kfree(status, M_TEMP);
210 if (fpregset)
211 kfree(fpregset, M_TEMP);
212 if (note)
213 kfree(note, M_TEMP);
214 return error;
215}
216
217static int
218ckpt_thaw_proc(struct lwp *lp, struct file *fp)
219{
220 struct proc *p = lp->lwp_proc;
221 Elf_Phdr *phdr = NULL;
222 Elf_Ehdr *ehdr = NULL;
223 int error;
224 size_t nbyte;
225
226 TRACE_ENTER;
227
228 ehdr = kmalloc(sizeof(Elf_Ehdr), M_TEMP, M_ZERO | M_WAITOK);
229
230 if ((error = elf_gethdr(fp, ehdr)) != 0)
231 goto done;
232 nbyte = sizeof(Elf_Phdr) * ehdr->e_phnum;
233 phdr = kmalloc(nbyte, M_TEMP, M_WAITOK);
234
235 /* fetch description of program writable mappings */
236 if ((error = elf_getphdrs(fp, phdr, nbyte)) != 0)
237 goto done;
238
239 /* fetch notes section containing register state */
240 if ((error = elf_getnotes(lp, fp, phdr->p_filesz)) != 0)
241 goto done;
242
243 /* fetch program text vnodes */
244 if ((error = elf_gettextvp(p, fp)) != 0)
245 goto done;
246
247 /* fetch signal disposition */
248 if ((error = elf_getsigs(lp, fp)) != 0) {
249 kprintf("failure in recovering signals\n");
250 goto done;
251 }
252
253 /* fetch open files */
254 if ((error = elf_getfiles(lp, fp)) != 0)
255 goto done;
256
257 /* handle mappings last in case we are reading from a socket */
258 error = elf_loadphdrs(fp, phdr, ehdr->e_phnum);
259
260 /*
261 * Set the textvp to the checkpoint file and mark the vnode so
262 * a future checkpointing of this checkpoint-restored program
263 * will copy out the contents of the mappings rather then trying
264 * to record the vnode info related to the checkpoint file, which
265 * is likely going to be destroyed when the program is re-checkpointed.
266 */
267 if (error == 0 && fp->f_data && fp->f_type == DTYPE_VNODE) {
268 if (p->p_textvp)
269 vrele(p->p_textvp);
270 p->p_textvp = (struct vnode *)fp->f_data;
271 vsetflags(p->p_textvp, VCKPT);
272 vref(p->p_textvp);
273 }
274done:
275 if (ehdr)
276 kfree(ehdr, M_TEMP);
277 if (phdr)
278 kfree(phdr, M_TEMP);
279 TRACE_EXIT;
280 return error;
281}
282
283static int
284elf_loadnotes(struct lwp *lp, prpsinfo_t *psinfo, prstatus_t *status,
285 prfpregset_t *fpregset)
286{
287 struct proc *p = lp->lwp_proc;
288 int error;
289
290 /* validate status and psinfo */
291 TRACE_ENTER;
292 if (status->pr_version != PRSTATUS_VERSION ||
293 status->pr_statussz != sizeof(prstatus_t) ||
294 status->pr_gregsetsz != sizeof(gregset_t) ||
295 status->pr_fpregsetsz != sizeof(fpregset_t) ||
296 psinfo->pr_version != PRPSINFO_VERSION ||
297 psinfo->pr_psinfosz != sizeof(prpsinfo_t)) {
298 PRINTF(("status check failed\n"));
299 error = EINVAL;
300 goto done;
301 }
302 /* XXX lwp handle more than one lwp*/
303 if ((error = set_regs(lp, &status->pr_reg)) != 0)
304 goto done;
305 error = set_fpregs(lp, fpregset);
306 strlcpy(p->p_comm, psinfo->pr_fname, sizeof(p->p_comm));
307 /* XXX psinfo->pr_psargs not yet implemented */
308 done:
309 TRACE_EXIT;
310 return error;
311}
312
313static int
314elf_getnote(void *src, size_t *off, const char *name, unsigned int type,
315 void **desc, size_t descsz)
316{
317 Elf_Note note;
318 int error;
319
320 TRACE_ENTER;
321 if (src == NULL) {
322 error = EFAULT;
323 goto done;
324 }
325 bcopy((char *)src + *off, &note, sizeof note);
326
327 PRINTF(("at offset: %zd expected note of type: %d - got: %d\n",
328 *off, type, note.n_type));
329 *off += sizeof note;
330 if (type != note.n_type) {
331 TRACE_ERR;
332 error = EINVAL;
333 goto done;
334 }
335 if (strncmp(name, (char *) src + *off, note.n_namesz) != 0) {
336 error = EINVAL;
337 goto done;
338 }
339 *off += roundup2(note.n_namesz, sizeof(Elf_Size));
340 if (note.n_descsz != descsz) {
341 TRACE_ERR;
342 error = EINVAL;
343 goto done;
344 }
345 if (desc)
346 bcopy((char *)src + *off, *desc, note.n_descsz);
347 *off += roundup2(note.n_descsz, sizeof(Elf_Size));
348 error = 0;
349 done:
350 TRACE_EXIT;
351 return error;
352}
353
354static int
355elf_demarshalnotes(void *src, prpsinfo_t *psinfo, prstatus_t *status,
356 prfpregset_t *fpregset, int nthreads)
357{
358 int i;
359 int error;
360 size_t off = 0;
361
362 TRACE_ENTER;
363 error = elf_getnote(src, &off, "CORE", NT_PRPSINFO,
364 (void **)&psinfo, sizeof(prpsinfo_t));
365 if (error)
366 goto done;
367 error = elf_getnote(src, &off, "CORE", NT_PRSTATUS,
368 (void **)&status, sizeof(prstatus_t));
369 if (error)
370 goto done;
371 error = elf_getnote(src, &off, "CORE", NT_FPREGSET,
372 (void **)&fpregset, sizeof(prfpregset_t));
373 if (error)
374 goto done;
375
376 /*
377 * The remaining portion needs to be an integer multiple
378 * of prstatus_t and prfpregset_t
379 */
380 for (i = 0 ; i < nthreads - 1; i++) {
381 status++; fpregset++;
382 error = elf_getnote(src, &off, "CORE", NT_PRSTATUS,
383 (void **)&status, sizeof (prstatus_t));
384 if (error)
385 goto done;
386 error = elf_getnote(src, &off, "CORE", NT_FPREGSET,
387 (void **)&fpregset, sizeof(prfpregset_t));
388 if (error)
389 goto done;
390 }
391
392 done:
393 TRACE_EXIT;
394 return error;
395}
396
397
398static int
399mmap_phdr(struct file *fp, Elf_Phdr *phdr)
400{
401 int error;
402 size_t len;
403 int prot;
404 void *addr;
405 int flags;
406 off_t pos;
407
408 TRACE_ENTER;
409 pos = phdr->p_offset;
410 len = phdr->p_filesz;
411 addr = (void *)phdr->p_vaddr;
412 flags = MAP_FIXED | MAP_NOSYNC | MAP_PRIVATE;
413 prot = 0;
414 if (phdr->p_flags & PF_R)
415 prot |= PROT_READ;
416 if (phdr->p_flags & PF_W)
417 prot |= PROT_WRITE;
418 if (phdr->p_flags & PF_X)
419 prot |= PROT_EXEC;
420 if ((error = fp_mmap(addr, len, prot, flags, fp, pos, &addr)) != 0) {
421 PRINTF(("mmap failed: %d\n", error); );
422 }
423 PRINTF(("map @%08"PRIxPTR"-%08"PRIxPTR" fileoff %08x-%08x\n", (uintptr_t)addr,
424 (uintptr_t)((char *)addr + len), (int)pos, (int)(pos + len)));
425 TRACE_EXIT;
426 return error;
427}
428
429/*
430 * Load memory mapped segments. The segments are backed by the checkpoint
431 * file.
432 */
433static int
434elf_loadphdrs(struct file *fp, Elf_Phdr *phdr, int numsegs)
435{
436 int i;
437 int error = 0;
438
439 TRACE_ENTER;
440 for (i = 1; i < numsegs; i++) {
441 if ((error = mmap_phdr(fp, &phdr[i])) != 0)
442 break;
443 }
444 TRACE_EXIT;
445 return error;
446}
447
448static int
449elf_getsigs(struct lwp *lp, struct file *fp)
450{
451 struct proc *p = lp->lwp_proc;
452 int error;
453 struct ckpt_siginfo *csi;
454
455 TRACE_ENTER;
456 csi = kmalloc(sizeof(struct ckpt_siginfo), M_TEMP, M_ZERO | M_WAITOK);
457 if ((error = read_check(fp, csi, sizeof(struct ckpt_siginfo))) != 0)
458 goto done;
459
460 if (csi->csi_ckptpisz != sizeof(struct ckpt_siginfo)) {
461 TRACE_ERR;
462 error = EINVAL;
463 goto done;
464 }
465 bcopy(&csi->csi_sigacts, p->p_sigacts, sizeof(struct sigacts));
466 bcopy(&csi->csi_itimerval, &p->p_realtimer, sizeof(struct itimerval));
467 SIG_CANTMASK(csi->csi_sigmask);
468 /* XXX lwp handle more than one lwp */
469 bcopy(&csi->csi_sigmask, &lp->lwp_sigmask, sizeof(sigset_t));
470 p->p_sigparent = csi->csi_sigparent;
471 done:
472 if (csi)
473 kfree(csi, M_TEMP);
474 TRACE_EXIT;
475 return error;
476}
477
478/*
479 * Returns a locked, refd vnode
480 */
481static int
482ckpt_fhtovp(fhandle_t *fh, struct vnode **vpp)
483{
484 struct mount *mp;
485 int error;
486
487 TRACE_ENTER;
488 mp = vfs_getvfs(&fh->fh_fsid);
489
490 if (!mp) {
491 TRACE_ERR;
492 PRINTF(("failed to get mount - ESTALE\n"));
493 TRACE_EXIT;
494 return ESTALE;
495 }
496 error = VFS_FHTOVP(mp, NULL, &fh->fh_fid, vpp);
497 mount_drop(mp);
498 if (error) {
499 PRINTF(("failed with: %d\n", error));
500 TRACE_ERR;
501 }
502 TRACE_EXIT;
503 return error;
504}
505
506static int
507mmap_vp(struct vn_hdr *vnh)
508{
509 struct vnode *vp;
510 Elf_Phdr *phdr;
511 struct file *fp;
512 int error;
513 TRACE_ENTER;
514
515 phdr = &vnh->vnh_phdr;
516
517 if ((error = ckpt_fhtovp(&vnh->vnh_fh, &vp)) != 0)
518 return error;
519 /*
520 * XXX O_RDONLY -> or O_RDWR if file is PROT_WRITE, MAP_SHARED
521 */
522 if ((error = fp_vpopen(vp, O_RDONLY, &fp)) != 0) {
523 vput(vp);
524 return error;
525 }
526 error = mmap_phdr(fp, phdr);
527 fp_close(fp);
528 TRACE_EXIT;
529 return error;
530}
531
532
533static int
534elf_gettextvp(struct proc *p, struct file *fp)
535{
536 int i;
537 int error;
538 int vpcount;
539 struct ckpt_vminfo vminfo;
540 struct vn_hdr *vnh = NULL;
541
542 TRACE_ENTER;
543 if ((error = read_check(fp, &vminfo, sizeof(vminfo))) != 0)
544 goto done;
545 if (vminfo.cvm_dsize < 0 ||
546 vminfo.cvm_dsize > p->p_rlimit[RLIMIT_DATA].rlim_cur ||
547 vminfo.cvm_tsize < 0 ||
548 (u_quad_t)vminfo.cvm_tsize > maxtsiz ||
549 vminfo.cvm_daddr >= (caddr_t)VM_MAX_USER_ADDRESS ||
550 vminfo.cvm_taddr >= (caddr_t)VM_MAX_USER_ADDRESS
551 ) {
552 error = ERANGE;
553 goto done;
554 }
555
556 vmspace_exec(p, NULL);
557 p->p_vmspace->vm_daddr = vminfo.cvm_daddr;
558 p->p_vmspace->vm_dsize = ctob(vminfo.cvm_dsize); /* in bytes */
559 p->p_vmspace->vm_taddr = vminfo.cvm_taddr;
560 p->p_vmspace->vm_tsize = ctob(vminfo.cvm_tsize); /* in bytes */
561 if ((error = read_check(fp, &vpcount, sizeof(int))) != 0)
562 goto done;
563 vnh = kmalloc(sizeof(struct vn_hdr) * vpcount, M_TEMP, M_WAITOK);
564 if ((error = read_check(fp, vnh, sizeof(struct vn_hdr)*vpcount)) != 0)
565 goto done;
566 for (i = 0; i < vpcount; i++) {
567 if ((error = mmap_vp(&vnh[i])) != 0)
568 goto done;
569 }
570
571 done:
572 if (vnh)
573 kfree(vnh, M_TEMP);
574 TRACE_EXIT;
575 return error;
576}
577
578
579
580/* place holder */
581static int
582elf_getfiles(struct lwp *lp, struct file *fp)
583{
584 int error;
585 int i;
586 int filecount;
587 int fd;
588 struct ckpt_filehdr filehdr;
589 struct ckpt_fileinfo *cfi_base = NULL;
590 struct filedesc *fdp = lp->lwp_proc->p_fd;
591 struct vnode *vp;
592 struct file *tempfp;
593 struct file *ofp;
594
595 TRACE_ENTER;
596 if ((error = read_check(fp, &filehdr, sizeof(filehdr))) != 0)
597 goto done;
598 filecount = filehdr.cfh_nfiles;
599 cfi_base = kmalloc(filecount*sizeof(struct ckpt_fileinfo), M_TEMP, M_WAITOK);
600 error = read_check(fp, cfi_base, filecount*sizeof(struct ckpt_fileinfo));
601 if (error)
602 goto done;
603
604 /*
605 * Close all file descriptors >= 3. These descriptors are from the
606 * checkpt(1) program itself and should not be retained.
607 *
608 * XXX we need a flag so a checkpoint restore can opt to supply the
609 * descriptors, or the non-regular-file descripors.
610 */
611 for (i = 3; i < fdp->fd_nfiles; ++i)
612 kern_close(i);
613
614 /*
615 * Scan files to load
616 */
617 for (i = 0; i < filecount; i++) {
618 struct ckpt_fileinfo *cfi= &cfi_base[i];
619 /*
620 * Ignore placeholder entries where cfi_index is less then
621 * zero. This will occur if the elf core dump code thinks
622 * it can save a vnode but winds up not being able to.
623 */
624 if (cfi->cfi_index < 0)
625 continue;
626
627 /*
628 * Restore a saved file descriptor. If CKFIF_ISCKPTFD is
629 * set the descriptor represents the checkpoint file itself,
630 * probably due to the user calling sys_checkpoint(). We
631 * want to use the fp being used to restore the checkpoint
632 * instead of trying to restore the original filehandle.
633 */
634 if (cfi->cfi_ckflags & CKFIF_ISCKPTFD) {
635 fhold(fp);
636 tempfp = fp;
637 error = 0;
638 } else {
639 error = ckpt_fhtovp(&cfi->cfi_fh, &vp);
640 if (error == 0) {
641 error = fp_vpopen(vp, OFLAGS(cfi->cfi_flags),
642 &tempfp);
643 if (error)
644 vput(vp);
645 }
646 }
647 if (error)
648 break;
649 tempfp->f_offset = cfi->cfi_offset;
650
651 /*
652 * If overwriting a descriptor close the old descriptor. This
653 * only occurs if the saved core saved descriptors that we
654 * have not already closed.
655 */
656 if (cfi->cfi_index < fdp->fd_nfiles &&
657 (ofp = fdp->fd_files[cfi->cfi_index].fp) != NULL) {
658 kern_close(cfi->cfi_index);
659 }
660
661 /*
662 * Allocate the descriptor we want.
663 */
664 if (fdalloc(lp->lwp_proc, cfi->cfi_index, &fd) != 0) {
665 PRINTF(("can't currently restore fd: %d\n",
666 cfi->cfi_index));
667 fp_close(fp);
668 goto done;
669 }
670 KKASSERT(fd == cfi->cfi_index);
671 fsetfd(fdp, tempfp, fd);
672 fdrop(tempfp);
673 cfi++;
674 PRINTF(("restoring %d\n", cfi->cfi_index));
675 }
676
677 done:
678 if (cfi_base)
679 kfree(cfi_base, M_TEMP);
680 TRACE_EXIT;
681 return error;
682}
683
684static int
685ckpt_freeze_proc(struct lwp *lp, struct file *fp)
686{
687 struct proc *p = lp->lwp_proc;
688 rlim_t limit;
689 int error;
690
691 lwkt_gettoken(&p->p_token); /* needed for proc_*() calls */
692
693 PRINTF(("calling generic_elf_coredump\n"));
694 limit = p->p_rlimit[RLIMIT_CORE].rlim_cur;
695 if (limit) {
696 if (p->p_stat != SCORE) {
697 proc_stop(p, SCORE);
698 while (p->p_nstopped < p->p_nthreads - 1)
699 tsleep(&p->p_nstopped, 0, "freeze", 1);
700 error = generic_elf_coredump(lp, SIGCKPT, fp, limit);
701 proc_unstop(p, SCORE);
702 } else {
703 error = ERANGE;
704 }
705 } else {
706 error = ERANGE;
707 }
708 lwkt_reltoken(&p->p_token);
709 return error;
710}
711
712/*
713 * MPALMOSTSAFE
714 */
715int
716sys_sys_checkpoint(struct sysmsg *sysmsg,
717 const struct sys_checkpoint_args *uap)
718{
719 int error = 0;
720 struct thread *td = curthread;
721 struct proc *p = td->td_proc;
722 struct file *fp;
723
724 /*
725 * Only certain groups (to reduce our security exposure). -1
726 * allows any group.
727 */
728 if (ckptgroup >= 0 && groupmember(ckptgroup, td->td_ucred) == 0)
729 return (EPERM);
730
731 /*
732 * For now we can only checkpoint the current process
733 */
734 if (uap->pid != -1 && uap->pid != p->p_pid)
735 return (EINVAL);
736
737 get_mplock();
738
739 switch (uap->type) {
740 case CKPT_FREEZE:
741 fp = NULL;
742 if (uap->fd == -1 && uap->pid == (pid_t)-1)
743 error = checkpoint_signal_handler(td->td_lwp);
744 else if ((fp = holdfp(td, uap->fd, FWRITE)) == NULL)
745 error = EBADF;
746 else
747 error = ckpt_freeze_proc(td->td_lwp, fp);
748 if (fp)
749 dropfp(td, uap->fd, fp);
750 break;
751 case CKPT_THAW:
752 if (uap->pid != -1) {
753 error = EINVAL;
754 break;
755 }
756 if ((fp = holdfp(td, uap->fd, FREAD)) == NULL) {
757 error = EBADF;
758 break;
759 }
760 sysmsg->sysmsg_result = uap->retval;
761 error = ckpt_thaw_proc(td->td_lwp, fp);
762 dropfp(td, uap->fd, fp);
763 break;
764 default:
765 error = EOPNOTSUPP;
766 break;
767 }
768 rel_mplock();
769 return error;
770}
771
772int
773checkpoint_signal_handler(struct lwp *lp)
774{
775 struct thread *td = lp->lwp_thread;
776 struct proc *p = lp->lwp_proc;
777 char *buf;
778 struct file *fp;
779 struct nlookupdata nd;
780 int error;
781
782 chptinuse++;
783
784 /*
785 * Being able to checkpoint an suid or sgid program is not a good
786 * idea.
787 */
788 if (sugid_coredump == 0 && (p->p_flags & P_SUGID)) {
789 chptinuse--;
790 return (EPERM);
791 }
792
793 buf = ckpt_expand_name(p->p_comm, td->td_ucred->cr_uid, p->p_pid);
794 if (buf == NULL) {
795 chptinuse--;
796 return (ENOMEM);
797 }
798
799 log(LOG_INFO, "pid %d (%s), uid %d: checkpointing to %s\n",
800 p->p_pid, p->p_comm,
801 (td->td_ucred ? td->td_ucred->cr_uid : -1),
802 buf);
803
804 PRINTF(("ckpt handler called, using '%s'\n", buf));
805
806 /*
807 * Use the same safety flags that the coredump code uses. Remove
808 * any previous checkpoint file before writing out the new one in
809 * case we are re-checkpointing a program that had been checkpt
810 * restored. Otherwise we will corrupt the program space (which is
811 * made up of mmap()ings of the previous checkpoint file) while we
812 * write out the new one.
813 */
814 error = nlookup_init(&nd, buf, UIO_SYSSPACE, 0);
815 if (error == 0)
816 error = kern_unlink(&nd);
817 nlookup_done(&nd);
818 error = fp_open(buf, O_WRONLY|O_CREAT|O_TRUNC|O_NOFOLLOW, 0600, &fp);
819 if (error == 0) {
820 error = ckpt_freeze_proc(lp, fp);
821 fp_close(fp);
822 } else {
823 kprintf("checkpoint failed with open - error: %d\n", error);
824 }
825 kfree(buf, M_TEMP);
826 chptinuse--;
827 return (error);
828}
829
/*
 * Format string used by ckpt_expand_name() to build the checkpoint file
 * name; the default is "%N.ckpt".  Exported read/write through sysctl as
 * "ckptfile" under the kern node.
 */
830static char ckptfilename[MAXPATHLEN] = {"%N.ckpt"};
831SYSCTL_STRING(_kern, OID_AUTO, ckptfile, CTLFLAG_RW, ckptfilename,
 832 sizeof(ckptfilename), "process checkpoint name format string");
833
834/*
835 * expand_name(name, uid, pid)
836 * Expand the name described in corefilename, using name, uid, and pid.
837 * corefilename is a kprintf-like string, with three format specifiers:
838 * %N name of process ("name")
839 * %P process id (pid)
840 * %U user id (uid)
841 * For example, "%N.core" is the default; they can be disabled completely
842 * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P".
843 * This is controlled by the sysctl variable kern.corefile (see above).
844 *
845 * -- taken from the coredump code
846 */
847
848static
849char *
850ckpt_expand_name(const char *name, uid_t uid, pid_t pid)
851{
852 char *temp;
853 char *bp;
854 char buf[11]; /* Buffer for pid/uid -- max 4B */
855 int error;
856 int i;
857 int n;
858 char *format = ckptfilename;
859 size_t namelen;
860
861 temp = kmalloc(MAXPATHLEN + 1, M_TEMP, M_NOWAIT);
862 if (temp == NULL)
863 return NULL;
864 namelen = strlen(name);
865 n = 0;
866 if (ckptfilename[0] != '/') {
867 if ((bp = kern_getcwd(temp, MAXPATHLEN - 1, &error)) == NULL) {
868 kfree(temp, M_TEMP);
869 return NULL;
870 }
871 n = strlen(bp);
872 bcopy(bp, temp, n + 1); /* normalize location of the path */
873 temp[n++] = '/';
874 temp[n] = '\0';
875 }
876 for (i= 0; n < MAXPATHLEN && format[i]; i++) {
877 int l;
878 switch (format[i]) {
879 case '%': /* Format character */
880 i++;
881 switch (format[i]) {
882 case '%':
883 temp[n++] = '%';
884 break;
885 case 'N': /* process name */
886 if ((n + namelen) > MAXPATHLEN) {
887 log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
888 pid, name, uid, temp, name);
889 kfree(temp, M_TEMP);
890 return NULL;
891 }
892 memcpy(temp+n, name, namelen);
893 n += namelen;
894 break;
895 case 'P': /* process id */
896 l = ksprintf(buf, "%u", pid);
897 if ((n + l) > MAXPATHLEN) {
898 log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
899 pid, name, uid, temp, name);
900 kfree(temp, M_TEMP);
901 return NULL;
902 }
903 memcpy(temp+n, buf, l);
904 n += l;
905 break;
906 case 'U': /* user id */
907 l = ksprintf(buf, "%u", uid);
908 if ((n + l) > MAXPATHLEN) {
909 log(LOG_ERR, "pid %d (%s), uid (%u): Path `%s%s' is too long\n",
910 pid, name, uid, temp, name);
911 kfree(temp, M_TEMP);
912 return NULL;
913 }
914 memcpy(temp+n, buf, l);
915 n += l;
916 break;
917 default:
918 log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format);
919 }
920 break;
921 default:
922 temp[n++] = format[i];
923 }
924 }
925 temp[n] = '\0';
926 return temp;
927}
928