page-cache: use the PAGEMAP_SCAN ioctl when it is available
Signed-off-by: Andrei Vagin <[email protected]>
avagin committed Oct 25, 2023
1 parent e635fe1 commit bcf96f2
Showing 6 changed files with 149 additions and 73 deletions.
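
For context: PAGEMAP_SCAN is an ioctl on /proc/<pid>/pagemap, added during the Linux 6.7 development cycle, that returns ranges of pages matching a caller-supplied category filter, so the dumper gets a compact list of page regions instead of reading and decoding one 64-bit pagemap entry per page. The standalone program below is not part of this commit; it is a minimal sketch of the same filter this commit installs in pmc_fill_cache(), and it assumes uapi headers that define PAGEMAP_SCAN (kernel 6.7 or later).

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <linux/fs.h>	/* PAGEMAP_SCAN, struct pm_scan_arg, struct page_region */

int main(void)
{
	long psize = sysconf(_SC_PAGESIZE);
	size_t len = 16 * (size_t)psize;
	struct page_region regs[16];

	/* Anonymous private mapping; touch two pages so they become present. */
	char *mem = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	if (mem == MAP_FAILED)
		return 1;
	mem[0] = 1;
	mem[5 * psize] = 1;

	int fd = open("/proc/self/pagemap", O_RDONLY);
	if (fd < 0)
		return 1;

	struct pm_scan_arg args = {
		.size = sizeof(args),
		.start = (uintptr_t)mem,
		.end = (uintptr_t)mem + len,
		.vec = (uintptr_t)regs,
		.vec_len = sizeof(regs) / sizeof(regs[0]),
		/* Same filter as pmc_fill_cache(): pages that are neither
		 * file-backed nor the shared zero pfn ... */
		.category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE,
		.category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE,
		/* ... and that are present or swapped. */
		.category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
		.return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
	};

	long n = ioctl(fd, PAGEMAP_SCAN, &args);
	if (n < 0) {
		perror("PAGEMAP_SCAN");	/* e.g. ENOTTY/EINVAL on older kernels */
		return 1;
	}

	/* Each region covers pages that matched the filter, i.e. pages a
	 * dumper would have to copy; expect the two touched pages here. */
	for (long i = 0; i < n; i++)
		printf("dumpable: %llx-%llx\n",
		       (unsigned long long)regs[i].start,
		       (unsigned long long)regs[i].end);

	close(fd);
	return 0;
}
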
4 changes: 3 additions & 1 deletion criu/include/mem.h
@@ -7,6 +7,7 @@
#include "pid.h"
#include "proc_parse.h"
#include "inventory.pb-c.h"
#include "pagemap-cache.h"

struct parasite_ctl;
struct vm_area_list;
@@ -47,5 +48,6 @@ int open_vmas(struct pstree_item *t);
int prepare_vmas(struct pstree_item *t, struct task_restore_args *ta);
int unmap_guard_pages(struct pstree_item *t);
int prepare_mappings(struct pstree_item *t);
bool should_dump_page(VmaEntry *vmae, u64 pme);

u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, u64 *ppme);
#endif /* __CR_MEM_H__ */
12 changes: 10 additions & 2 deletions criu/include/pagemap-cache.h
@@ -1,10 +1,12 @@
#ifndef __CR_PAGEMAP_H__
#define __CR_PAGEMAP_H__

#include <stdbool.h>
#include <sys/types.h>
#include "int.h"

#include "common/list.h"
#include "pagemap_scan.h"

struct vma_area;

@@ -15,9 +17,15 @@ typedef struct {
unsigned long start; /* start of area */
unsigned long end; /* end of area */
const struct list_head *vma_head; /* list head of VMAs we're serving */
int fd; /* file to read PMs from */

u64 *map; /* local buffer */
size_t map_len; /* length of a buffer */
int fd; /* file to read PMs from */

struct page_region *regs; /* buffer for the PAGEMAP_SCAN ioctl */
size_t regs_len; /* length of regs */
size_t regs_max_len; /* maximum length of regs */
size_t regs_idx; /* current index in the regs array */
} pmc_t;

#define PMC_INIT \
@@ -26,7 +34,7 @@ typedef struct {
}

extern int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t size);
extern u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma);
extern int pmc_get_map(pmc_t *pmc, const struct vma_area *vma, bool soft_dirty);
extern void pmc_fini(pmc_t *pmc);

#endif /* __CR_PAGEMAP_H__ */
3 changes: 2 additions & 1 deletion criu/include/shmem.h
@@ -4,13 +4,14 @@
#include "int.h"
#include "common/lock.h"
#include "images/vma.pb-c.h"
#include "pagemap-cache.h"

struct vma_area;

extern int collect_shmem(int pid, struct vma_area *vma);
extern int collect_sysv_shmem(unsigned long shmid, unsigned long size);
extern int cr_dump_shmem(void);
extern int add_shmem_area(pid_t pid, VmaEntry *vma, u64 *map);
extern int add_shmem_area(pid_t pid, VmaEntry *vma, pmc_t *pmc);
extern int fixup_sysv_shmems(void);
extern int dump_one_memfd_shmem(int fd, unsigned long shmid, unsigned long size);
extern int dump_one_sysv_shmem(void *addr, unsigned long size, unsigned long shmid);
111 changes: 70 additions & 41 deletions criu/mem.c
@@ -99,38 +99,56 @@ static inline bool __page_in_parent(bool dirty)
return opts.track_mem && opts.img_parent && !dirty;
}

bool should_dump_page(VmaEntry *vmae, u64 pme)
static bool should_dump_entire_vma(VmaEntry *vmae)
{
/*
* vDSO area must be always dumped because on restore
* we might need to generate a proxy.
*/
if (vma_entry_is(vmae, VMA_AREA_VDSO))
if (vma_entry_is(vmae, VMA_AREA_VDSO)) {
return true;
/*
* In turn VVAR area is special and referenced from
* vDSO area by IP addressing (at least on x86) thus
* never ever dump its content but always use one provided
* by the kernel on restore, ie runtime VVAR area must
* be remapped into proper place..
*/
if (vma_entry_is(vmae, VMA_AREA_VVAR))
return false;

/*
* Optimisation for private mapping pages, that haven't
* yet being COW-ed
*/
if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
return false;
}
if (vma_entry_is(vmae, VMA_AREA_AIORING))
return true;
if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme))
return true;

return false;
}

/*
* should_dump_page returns vaddr if an addressed page has to be dumped.
* Otherwise, it returns an address that has to be inspected next.
*/
u64 should_dump_page(pmc_t *pmc, VmaEntry *vmae, u64 vaddr, u64 *ppme)
{
if (pmc->regs) {
while (1) {
if (pmc->regs_idx == pmc->regs_len)
return vmae->end;
if (vaddr < pmc->regs[pmc->regs_idx].end)
break;
pmc->regs_idx++;
}
if (vaddr < pmc->regs[pmc->regs_idx].start)
return pmc->regs[pmc->regs_idx].start;
return vaddr;
} else {
u64 pme = pmc->map[PAGE_PFN(vaddr - pmc->start)];

*ppme = pme;

/*
* Optimisation for private mapping pages, that haven't
* yet being COW-ed
*/
if (vma_entry_is(vmae, VMA_FILE_PRIVATE) && (pme & PME_FILE))
return vaddr + PAGE_SIZE;
if ((pme & (PME_PRESENT | PME_SWAP)) && !__page_is_zero(pme))
return vaddr;

return vaddr + PAGE_SIZE;
}
}

bool page_is_zero(u64 pme)
{
return __page_is_zero(pme);
@@ -164,25 +182,29 @@ static bool is_stack(struct pstree_item *item, unsigned long vaddr)
* the memory contents is present in the parent image set.
*/

static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, u64 *map, u64 *off,
static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct page_pipe *pp, pmc_t *pmc, u64 *pvaddr,
bool has_parent)
{
u64 *at = &map[PAGE_PFN(*off)];
unsigned long pfn, nr_to_scan;
unsigned long nr_scanned;
unsigned long pages[3] = {};
unsigned long vaddr;
bool dump_all_pages;
int ret = 0;

nr_to_scan = (vma_area_len(vma) - *off) / PAGE_SIZE;

for (pfn = 0; pfn < nr_to_scan; pfn++) {
unsigned long vaddr;
dump_all_pages = should_dump_entire_vma(vma->e);
nr_scanned = 0;
for (vaddr = *pvaddr; vaddr < vma->e->end; vaddr += PAGE_SIZE, nr_scanned++) {
unsigned int ppb_flags = 0;
u64 pme = 0, next;
int st;

if (!should_dump_page(vma->e, at[pfn]))
/* If dump_all_pages is true, should_dump_page is called to get pme. */
next = should_dump_page(pmc, vma->e, vaddr, &pme);
if (!dump_all_pages && next != vaddr) {
vaddr = next - PAGE_SIZE;
continue;

vaddr = vma->e->start + *off + pfn * PAGE_SIZE;
}

if (vma_entry_can_be_lazy(vma->e) && !is_stack(item, vaddr))
ppb_flags |= PPB_LAZY;
@@ -194,7 +216,7 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
* page. The latter would be checked in page-xfer.
*/

if (has_parent && page_in_parent(at[pfn] & PME_SOFT_DIRTY)) {
if (has_parent && page_in_parent(pme & PME_SOFT_DIRTY)) {
ret = page_pipe_add_hole(pp, vaddr, PP_HOLE_PARENT);
st = 0;
} else {
@@ -214,9 +236,9 @@ static int generate_iovs(struct pstree_item *item, struct vma_area *vma, struct
pages[st]++;
}

*off += pfn * PAGE_SIZE;

cnt_add(CNT_PAGES_SCANNED, nr_to_scan);
*pvaddr = vaddr;
cnt_add(CNT_PAGES_SCANNED, nr_scanned);
cnt_add(CNT_PAGES_SKIPPED_PARENT, pages[0]);
cnt_add(CNT_PAGES_LAZY, pages[1]);
cnt_add(CNT_PAGES_WRITTEN, pages[2]);
@@ -356,12 +378,21 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
struct page_xfer *xfer, struct parasite_dump_pages_args *args, struct parasite_ctl *ctl,
pmc_t *pmc, bool has_parent, bool pre_dump, int parent_predump_mode)
{
u64 off = 0;
u64 *map;
bool soft_dirty = has_parent;
u64 vaddr;
int ret;

if (!vma_area_is_private(vma, kdat.task_size) && !vma_area_is(vma, VMA_ANON_SHARED))
return 0;
/*
* In turn VVAR area is special and referenced from
* vDSO area by IP addressing (at least on x86) thus
* never ever dump its content but always use one provided
* by the kernel on restore, ie runtime VVAR area must
* be remapped into proper place..
*/
if (vma_entry_is(vma->e, VMA_AREA_VVAR))
return 0;

/*
* To facilitate any combination of pre-dump modes to run after
@@ -390,7 +421,6 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
* pre-dump cycle has skipped processing & pagemap generation for
* non-PROT_READ regions. So SPLICE mode throws error of missing
* pagemap entry for encountered non-PROT_READ mapping.
*
* To resolve this, the pre-dump-mode is stored in current pre-dump's
* inventoy file. This pre-dump mode is read back from this file
* (present in parent pre-dump dir) as parent-pre-dump-mode during
@@ -421,15 +451,14 @@ static int generate_vma_iovs(struct pstree_item *item, struct vma_area *vma, str
has_parent = false;
}

map = pmc_get_map(pmc, vma);
if (!map)
if (pmc_get_map(pmc, vma, soft_dirty))
return -1;

if (vma_area_is(vma, VMA_ANON_SHARED))
return add_shmem_area(item->pid->real, vma->e, map);

return add_shmem_area(item->pid->real, vma->e, pmc);
vaddr = vma->e->start;
again:
ret = generate_iovs(item, vma, pp, map, &off, has_parent);
ret = generate_iovs(item, vma, pp, pmc, &vaddr, has_parent);
if (ret == -EAGAIN) {
BUG_ON(!(pp->flags & PP_CHUNK_MODE));

70 changes: 50 additions & 20 deletions criu/pagemap-cache.c
@@ -1,5 +1,6 @@
#include <unistd.h>
#include <fcntl.h>
#include <sys/ioctl.h>

#include "page.h"
#include "pagemap-cache.h"
@@ -50,10 +51,19 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz
pmc->pid = pid;
pmc->map_len = PAGEMAP_LEN(map_size);
pmc->vma_head = vma_head;
pmc->regs_max_len = (map_size / PAGE_SIZE + 1) / 2;
pmc->regs_len = 0;
pmc->regs_idx = 0;

pmc->map = xmalloc(pmc->map_len);
if (!pmc->map)
goto err;
if (kdat.has_pagemap_scan) {
pmc->regs = xmalloc(pmc->regs_max_len * sizeof(struct page_region));
if (!pmc->regs)
goto err;
} else {
pmc->map = xmalloc(pmc->map_len);
if (!pmc->map)
goto err;
}

if (pagemap_cache_disabled)
pr_warn_once("The pagemap cache is disabled\n");
@@ -87,12 +97,7 @@ int pmc_init(pmc_t *pmc, pid_t pid, const struct list_head *vma_head, size_t siz
return -1;
}

static inline u64 *__pmc_get_map(pmc_t *pmc, unsigned long addr)
{
return &pmc->map[PAGE_PFN(addr - pmc->start)];
}

static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma, bool soft_dirty)
{
unsigned long low = vma->e->start & PMC_MASK;
unsigned long high = low + PMC_SIZE;
@@ -153,35 +158,60 @@ static int pmc_fill_cache(pmc_t *pmc, const struct vma_area *vma)
BUG_ON(pmc->map_len < size_map);
BUG_ON(pmc->fd < 0);

if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
pmc_zap(pmc);
pr_perror("Can't read %d's pagemap file", pmc->pid);
return -1;
if (kdat.has_pagemap_scan && !soft_dirty) {
struct pm_scan_arg args = {
.size = sizeof(struct pm_scan_arg),
.flags = 0,
.start = pmc->start,
.end = pmc->end,
.vec = (long)pmc->regs,
.vec_len = pmc->regs_max_len,
.max_pages = 0,
.category_inverted = PAGE_IS_PFNZERO | PAGE_IS_FILE,
.category_mask = PAGE_IS_PFNZERO | PAGE_IS_FILE,
.category_anyof_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
.return_mask = PAGE_IS_PRESENT | PAGE_IS_SWAPPED,
};
long ret;

ret = ioctl(pmc->fd, PAGEMAP_SCAN, &args);
if (ret == -1) {
pr_perror("PAGEMAP_SCAN");
pmc_zap(pmc);
return -1;
}
pmc->regs_len = ret;
pmc->regs_idx = 0;
} else {
if (pread(pmc->fd, pmc->map, size_map, PAGEMAP_PFN_OFF(pmc->start)) != size_map) {
pmc_zap(pmc);
pr_perror("Can't read %d's pagemap file", pmc->pid);
return -1;
}
}

return 0;
}

u64 *pmc_get_map(pmc_t *pmc, const struct vma_area *vma)
int pmc_get_map(pmc_t *pmc, const struct vma_area *vma, bool soft_dirty)
{
/* Hit */
if (likely(pmc->start <= vma->e->start && pmc->end >= vma->e->end))
return __pmc_get_map(pmc, vma->e->start);
return 0;

/* Miss, refill the cache */
if (pmc_fill_cache(pmc, vma)) {
if (pmc_fill_cache(pmc, vma, soft_dirty)) {
pr_err("Failed to fill cache for %d (%lx-%lx)\n", pmc->pid, (long)vma->e->start, (long)vma->e->end);
return NULL;
return -1;
}

/* Hit for sure */
return __pmc_get_map(pmc, vma->e->start);
return 0;
}

void pmc_fini(pmc_t *pmc)
{
close_safe(&pmc->fd);
xfree(pmc->map);
xfree(pmc->regs);
pmc_reset(pmc);
}

(diff for the remaining changed file not loaded)
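
A note on how the category fields used in pmc_fill_cache() above combine (this summarizes the kernel's matching rule as I read it; the commit itself does not define it): each page's category bits are XORed with category_inverted, the result must contain every bit of category_mask and, if category_anyof_mask is non-zero, at least one of its bits; return_mask only controls which category bits come back in page_region.categories. Expressed as a hypothetical userspace predicate:

#include <stdbool.h>
#include <stdint.h>

/*
 * Illustration only: the PAGEMAP_SCAN matching rule applied to one page.
 * With the masks from pmc_fill_cache() this selects pages that are not
 * file-backed, not the shared zero page, and are present or swapped,
 * i.e. the same pages the pagemap-reading path picks in should_dump_page().
 */
bool pm_scan_match(uint64_t categories, uint64_t inverted,
		   uint64_t mask, uint64_t anyof_mask)
{
	categories ^= inverted;				/* flip the inverted bits */
	if ((categories & mask) != mask)		/* need all of mask */
		return false;
	if (anyof_mask && !(categories & anyof_mask))	/* and any of anyof_mask */
		return false;
	return true;
}

This is why the PAGEMAP_SCAN branch of should_dump_page() only has to check whether vaddr falls inside a returned region: the kernel has already applied the filter, while the pread() path still decodes each pagemap entry itself.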
