1559 lines
37 KiB
C
1559 lines
37 KiB
C
|
#define _GNU_SOURCE
|
||
|
#include <ctype.h>
|
||
|
#include <errno.h>
|
||
|
#include <fcntl.h>
|
||
|
#include <limits.h>
|
||
|
#include <dirent.h>
|
||
|
#include <signal.h>
|
||
|
#include <stdio.h>
|
||
|
#include <stdlib.h>
|
||
|
#include <stdbool.h>
|
||
|
#include <string.h>
|
||
|
#include <unistd.h>
|
||
|
|
||
|
#include <sys/mman.h>
|
||
|
#include <sys/wait.h>
|
||
|
#include <sys/types.h>
|
||
|
#include <sys/stat.h>
|
||
|
#include <sys/sysmacros.h>
|
||
|
#include <sys/vfs.h>
|
||
|
|
||
|
#include "linux/magic.h"
|
||
|
|
||
|
#include "vm_util.h"
|
||
|
|
||
|
/*
 * Fallback values for madvise() advice constants, used only when the
 * included system headers do not already define them.
 */
#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT		21
#endif

#ifndef MADV_POPULATE_READ
#define MADV_POPULATE_READ	22
#endif

#ifndef MADV_COLLAPSE
#define MADV_COLLAPSE		25
#endif
|
||
|
|
||
|
/* Fixed virtual address (1 GiB) at which every test mapping is requested. */
#define BASE_ADDR ((void *)(1UL << 30))

/*
 * Geometry used by the tests.  These are not assigned anywhere in this part
 * of the file; they are consumed as the byte size of one huge page, the byte
 * size of one base page, and the number of base pages per huge page.
 */
static unsigned long hpage_pmd_size;
static unsigned long page_size;
static int hpage_pmd_nr;
|
||
|
|
||
|
/* Directory prefix for all transparent hugepage sysfs knobs (note trailing '/'). */
#define THP_SYSFS	"/sys/kernel/mm/transparent_hugepage/"
/* smaps file of the running process, scanned by check_swap(). */
#define PID_SMAPS	"/proc/self/smaps"
/* Name of the scratch file created inside the user-supplied directory. */
#define TEST_FILE	"collapse_test_file"

/* Size of the line and pattern buffers used when parsing smaps. */
#define MAX_LINE_LENGTH	500
|
||
|
|
||
|
/* Kind of memory backing a test mapping. */
enum vma_type {
	VMA_ANON,	/* anonymous memory */
	VMA_FILE,	/* regular file on a non-tmpfs filesystem */
	VMA_SHMEM,	/* shared memory / file on tmpfs */
};
|
||
|
|
||
|
/*
 * Operations that abstract over the memory type under test, so that each
 * collapse test can be run against anon, file and shmem mappings.
 */
struct mem_ops {
	/* Create a mapping large enough for nr_hpages huge pages. */
	void *(*setup_area)(int nr_hpages);
	/* Tear down a mapping of the given byte size created by setup_area(). */
	void (*cleanup_area)(void *p, unsigned long size);
	/* Populate the byte range [start, end) relative to p. */
	void (*fault)(void *p, unsigned long start, unsigned long end);
	/* Report whether addr is backed by exactly nr_hpages huge pages. */
	bool (*check_huge)(void *addr, int nr_hpages);
	/* Short human-readable name of the memory type. */
	const char *name;
};

/* Selected operation sets; not assigned in this part of the file. */
static struct mem_ops *file_ops;
static struct mem_ops *anon_ops;
static struct mem_ops *shmem_ops;
|
||
|
|
||
|
/*
 * A way of triggering a collapse (khugepaged scan or MADV_COLLAPSE) together
 * with the properties the tests need to know about it.
 */
struct collapse_context {
	/*
	 * Attempt to collapse nr_hpages huge pages at p and report
	 * success/failure against the expectation in @expect.
	 */
	void (*collapse)(const char *msg, char *p, int nr_hpages,
			 struct mem_ops *ops, bool expect);
	/* True when this context honours the khugepaged max_ptes_* limits. */
	bool enforce_pte_scan_limits;
	/* Short human-readable name of the context. */
	const char *name;
};

/* Selected contexts; not assigned in this part of the file. */
static struct collapse_context *khugepaged_context;
static struct collapse_context *madvise_context;
|
||
|
|
||
|
struct file_info {
|
||
|
const char *dir;
|
||
|
char path[PATH_MAX];
|
||
|
enum vma_type type;
|
||
|
int fd;
|
||
|
char dev_queue_read_ahead_path[PATH_MAX];
|
||
|
};
|
||
|
|
||
|
static struct file_info finfo;
|
||
|
|
||
|
/* Values of the THP "enabled" sysfs knob. */
enum thp_enabled {
	THP_ALWAYS,
	THP_MADVISE,
	THP_NEVER,
};

/*
 * sysfs spelling of each enum thp_enabled value, indexed by the enum and
 * terminated by NULL so read_string() can walk it.
 */
static const char *thp_enabled_strings[] = {
	"always",
	"madvise",
	"never",
	NULL
};
|
||
|
|
||
|
/* Values of the THP "defrag" sysfs knob. */
enum thp_defrag {
	THP_DEFRAG_ALWAYS,
	THP_DEFRAG_DEFER,
	THP_DEFRAG_DEFER_MADVISE,
	THP_DEFRAG_MADVISE,
	THP_DEFRAG_NEVER,
};

/*
 * sysfs spelling of each enum thp_defrag value, indexed by the enum and
 * terminated by NULL.
 */
static const char *thp_defrag_strings[] = {
	"always",
	"defer",
	"defer+madvise",
	"madvise",
	"never",
	NULL
};
|
||
|
|
||
|
/* Values of the THP "shmem_enabled" sysfs knob. */
enum shmem_enabled {
	SHMEM_ALWAYS,
	SHMEM_WITHIN_SIZE,
	SHMEM_ADVISE,
	SHMEM_NEVER,
	SHMEM_DENY,
	SHMEM_FORCE,
};

/*
 * sysfs spelling of each enum shmem_enabled value, indexed by the enum and
 * terminated by NULL.
 */
static const char *shmem_enabled_strings[] = {
	"always",
	"within_size",
	"advise",
	"never",
	"deny",
	"force",
	NULL
};
|
||
|
|
||
|
/* Snapshot of the knobs under THP_SYSFS "khugepaged/". */
struct khugepaged_settings {
	bool defrag;				/* khugepaged/defrag */
	unsigned int alloc_sleep_millisecs;	/* khugepaged/alloc_sleep_millisecs */
	unsigned int scan_sleep_millisecs;	/* khugepaged/scan_sleep_millisecs */
	unsigned int max_ptes_none;		/* khugepaged/max_ptes_none */
	unsigned int max_ptes_swap;		/* khugepaged/max_ptes_swap */
	unsigned int max_ptes_shared;		/* khugepaged/max_ptes_shared */
	unsigned long pages_to_scan;		/* khugepaged/pages_to_scan */
};
|
||
|
|
||
|
struct settings {
|
||
|
enum thp_enabled thp_enabled;
|
||
|
enum thp_defrag thp_defrag;
|
||
|
enum shmem_enabled shmem_enabled;
|
||
|
bool use_zero_page;
|
||
|
struct khugepaged_settings khugepaged;
|
||
|
unsigned long read_ahead_kb;
|
||
|
};
|
||
|
|
||
|
/* System settings captured by save_settings() and written back on exit. */
static struct settings saved_settings;
/* When set, restore_settings() leaves the system settings untouched. */
static bool skip_settings_restore;

/* Number of failed checks so far; used as the process exit status. */
static int exit_status;
|
||
|
|
||
|
/* Print @msg in green after a leading space and end the current output line. */
static void success(const char *msg)
{
	printf(" \e[32m%s\e[0m\n", msg);
}
|
||
|
|
||
|
static void fail(const char *msg)
|
||
|
{
|
||
|
printf(" \e[31m%s\e[0m\n", msg);
|
||
|
exit_status++;
|
||
|
}
|
||
|
|
||
|
/* Print @msg in yellow after a leading space and end the current output line. */
static void skip(const char *msg)
{
	printf(" \e[33m%s\e[0m\n", msg);
}
|
||
|
|
||
|
/*
 * Read at most @buflen - 1 bytes from the beginning of @path into @buf and
 * NUL-terminate the result.
 *
 * Returns the number of bytes read, or 0 when nothing was read: @buflen has
 * no room for data plus the terminator, the file cannot be opened, read()
 * fails, or the file is empty.  Callers must treat 0 as failure; @buf is not
 * guaranteed to be terminated in that case.
 */
static int read_file(const char *path, char *buf, size_t buflen)
{
	ssize_t numread;
	int fd;

	/*
	 * Without this guard "buflen - 1" wraps to SIZE_MAX for an empty
	 * buffer and read() would be allowed to overrun @buf.
	 */
	if (buflen < 2)
		return 0;

	fd = open(path, O_RDONLY);
	if (fd == -1)
		return 0;

	numread = read(fd, buf, buflen - 1);
	close(fd);
	if (numread < 1)
		return 0;

	buf[numread] = '\0';
	return (unsigned int) numread;
}
|
||
|
|
||
|
/*
 * Write the first @buflen - 1 bytes of @buf to the existing file @path
 * (callers pass strlen() + 1, so the terminating NUL is not written).
 *
 * Returns the number of bytes written.  Never returns on failure: if the
 * file cannot be opened, or fewer than one byte is written, a message is
 * printed and the process exits with EXIT_FAILURE.
 */
static int write_file(const char *path, const char *buf, size_t buflen)
{
	ssize_t written;
	int fd = open(path, O_WRONLY);

	if (fd == -1) {
		printf("open(%s)\n", path);
		exit(EXIT_FAILURE);
	}

	written = write(fd, buf, buflen - 1);
	close(fd);
	if (written >= 1)
		return (unsigned int) written;

	printf("write(%s)\n", buf);
	exit(EXIT_FAILURE);
}
|
||
|
|
||
|
static int read_string(const char *name, const char *strings[])
|
||
|
{
|
||
|
char path[PATH_MAX];
|
||
|
char buf[256];
|
||
|
char *c;
|
||
|
int ret;
|
||
|
|
||
|
ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
|
||
|
if (ret >= PATH_MAX) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
if (!read_file(path, buf, sizeof(buf))) {
|
||
|
perror(path);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
c = strchr(buf, '[');
|
||
|
if (!c) {
|
||
|
printf("%s: Parse failure\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
c++;
|
||
|
memmove(buf, c, sizeof(buf) - (c - buf));
|
||
|
|
||
|
c = strchr(buf, ']');
|
||
|
if (!c) {
|
||
|
printf("%s: Parse failure\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
*c = '\0';
|
||
|
|
||
|
ret = 0;
|
||
|
while (strings[ret]) {
|
||
|
if (!strcmp(strings[ret], buf))
|
||
|
return ret;
|
||
|
ret++;
|
||
|
}
|
||
|
|
||
|
printf("Failed to parse %s\n", name);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
static void write_string(const char *name, const char *val)
|
||
|
{
|
||
|
char path[PATH_MAX];
|
||
|
int ret;
|
||
|
|
||
|
ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
|
||
|
if (ret >= PATH_MAX) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
if (!write_file(path, val, strlen(val) + 1)) {
|
||
|
perror(path);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/*
 * Read the file at @path and return its contents parsed as a base-10
 * unsigned long.  Exits the process if nothing could be read.
 */
static const unsigned long _read_num(const char *path)
{
	char buf[21];

	/*
	 * read_file() reports failure by returning 0 (it never returns a
	 * negative value), so a "< 0" test would let an uninitialised buffer
	 * reach strtoul().
	 */
	if (!read_file(path, buf, sizeof(buf))) {
		perror("read_file(read_num)");
		exit(EXIT_FAILURE);
	}

	return strtoul(buf, NULL, 10);
}
|
||
|
|
||
|
static const unsigned long read_num(const char *name)
|
||
|
{
|
||
|
char path[PATH_MAX];
|
||
|
int ret;
|
||
|
|
||
|
ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
|
||
|
if (ret >= PATH_MAX) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
return _read_num(path);
|
||
|
}
|
||
|
|
||
|
/*
 * Write @num in decimal to the file at @path.  Exits the process if the
 * write does not succeed.
 */
static void _write_num(const char *path, unsigned long num)
{
	/* Up to 20 decimal digits for a 64-bit unsigned long, plus the NUL. */
	char buf[21];

	/*
	 * @num is unsigned: "%lu" is the matching conversion ("%ld" would
	 * print values above LONG_MAX as negative), and snprintf() keeps the
	 * output bounded by the buffer.
	 */
	snprintf(buf, sizeof(buf), "%lu", num);
	if (!write_file(path, buf, strlen(buf) + 1)) {
		perror(path);
		exit(EXIT_FAILURE);
	}
}
|
||
|
|
||
|
static void write_num(const char *name, unsigned long num)
|
||
|
{
|
||
|
char path[PATH_MAX];
|
||
|
int ret;
|
||
|
|
||
|
ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name);
|
||
|
if (ret >= PATH_MAX) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
_write_num(path, num);
|
||
|
}
|
||
|
|
||
|
static void write_settings(struct settings *settings)
|
||
|
{
|
||
|
struct khugepaged_settings *khugepaged = &settings->khugepaged;
|
||
|
|
||
|
write_string("enabled", thp_enabled_strings[settings->thp_enabled]);
|
||
|
write_string("defrag", thp_defrag_strings[settings->thp_defrag]);
|
||
|
write_string("shmem_enabled",
|
||
|
shmem_enabled_strings[settings->shmem_enabled]);
|
||
|
write_num("use_zero_page", settings->use_zero_page);
|
||
|
|
||
|
write_num("khugepaged/defrag", khugepaged->defrag);
|
||
|
write_num("khugepaged/alloc_sleep_millisecs",
|
||
|
khugepaged->alloc_sleep_millisecs);
|
||
|
write_num("khugepaged/scan_sleep_millisecs",
|
||
|
khugepaged->scan_sleep_millisecs);
|
||
|
write_num("khugepaged/max_ptes_none", khugepaged->max_ptes_none);
|
||
|
write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap);
|
||
|
write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared);
|
||
|
write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
|
||
|
|
||
|
if (file_ops && finfo.type == VMA_FILE)
|
||
|
_write_num(finfo.dev_queue_read_ahead_path,
|
||
|
settings->read_ahead_kb);
|
||
|
}
|
||
|
|
||
|
#define MAX_SETTINGS_DEPTH 4
|
||
|
static struct settings settings_stack[MAX_SETTINGS_DEPTH];
|
||
|
static int settings_index;
|
||
|
|
||
|
static struct settings *current_settings(void)
|
||
|
{
|
||
|
if (!settings_index) {
|
||
|
printf("Fail: No settings set");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
return settings_stack + settings_index - 1;
|
||
|
}
|
||
|
|
||
|
static void push_settings(struct settings *settings)
|
||
|
{
|
||
|
if (settings_index >= MAX_SETTINGS_DEPTH) {
|
||
|
printf("Fail: Settings stack exceeded");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
settings_stack[settings_index++] = *settings;
|
||
|
write_settings(current_settings());
|
||
|
}
|
||
|
|
||
|
static void pop_settings(void)
|
||
|
{
|
||
|
if (settings_index <= 0) {
|
||
|
printf("Fail: Settings stack empty");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
--settings_index;
|
||
|
write_settings(current_settings());
|
||
|
}
|
||
|
|
||
|
static void restore_settings(int sig)
|
||
|
{
|
||
|
if (skip_settings_restore)
|
||
|
goto out;
|
||
|
|
||
|
printf("Restore THP and khugepaged settings...");
|
||
|
write_settings(&saved_settings);
|
||
|
success("OK");
|
||
|
if (sig)
|
||
|
exit(EXIT_FAILURE);
|
||
|
out:
|
||
|
exit(exit_status);
|
||
|
}
|
||
|
|
||
|
static void save_settings(void)
|
||
|
{
|
||
|
printf("Save THP and khugepaged settings...");
|
||
|
saved_settings = (struct settings) {
|
||
|
.thp_enabled = read_string("enabled", thp_enabled_strings),
|
||
|
.thp_defrag = read_string("defrag", thp_defrag_strings),
|
||
|
.shmem_enabled =
|
||
|
read_string("shmem_enabled", shmem_enabled_strings),
|
||
|
.use_zero_page = read_num("use_zero_page"),
|
||
|
};
|
||
|
saved_settings.khugepaged = (struct khugepaged_settings) {
|
||
|
.defrag = read_num("khugepaged/defrag"),
|
||
|
.alloc_sleep_millisecs =
|
||
|
read_num("khugepaged/alloc_sleep_millisecs"),
|
||
|
.scan_sleep_millisecs =
|
||
|
read_num("khugepaged/scan_sleep_millisecs"),
|
||
|
.max_ptes_none = read_num("khugepaged/max_ptes_none"),
|
||
|
.max_ptes_swap = read_num("khugepaged/max_ptes_swap"),
|
||
|
.max_ptes_shared = read_num("khugepaged/max_ptes_shared"),
|
||
|
.pages_to_scan = read_num("khugepaged/pages_to_scan"),
|
||
|
};
|
||
|
if (file_ops && finfo.type == VMA_FILE)
|
||
|
saved_settings.read_ahead_kb =
|
||
|
_read_num(finfo.dev_queue_read_ahead_path);
|
||
|
|
||
|
success("OK");
|
||
|
|
||
|
signal(SIGTERM, restore_settings);
|
||
|
signal(SIGINT, restore_settings);
|
||
|
signal(SIGHUP, restore_settings);
|
||
|
signal(SIGQUIT, restore_settings);
|
||
|
}
|
||
|
|
||
|
static void get_finfo(const char *dir)
|
||
|
{
|
||
|
struct stat path_stat;
|
||
|
struct statfs fs;
|
||
|
char buf[1 << 10];
|
||
|
char path[PATH_MAX];
|
||
|
char *str, *end;
|
||
|
|
||
|
finfo.dir = dir;
|
||
|
stat(finfo.dir, &path_stat);
|
||
|
if (!S_ISDIR(path_stat.st_mode)) {
|
||
|
printf("%s: Not a directory (%s)\n", __func__, finfo.dir);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE,
|
||
|
finfo.dir) >= sizeof(finfo.path)) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (statfs(finfo.dir, &fs)) {
|
||
|
perror("statfs()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE;
|
||
|
if (finfo.type == VMA_SHMEM)
|
||
|
return;
|
||
|
|
||
|
/* Find owning device's queue/read_ahead_kb control */
|
||
|
if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent",
|
||
|
major(path_stat.st_dev), minor(path_stat.st_dev))
|
||
|
>= sizeof(path)) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (read_file(path, buf, sizeof(buf)) < 0) {
|
||
|
perror("read_file(read_num)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (strstr(buf, "DEVTYPE=disk")) {
|
||
|
/* Found it */
|
||
|
if (snprintf(finfo.dev_queue_read_ahead_path,
|
||
|
sizeof(finfo.dev_queue_read_ahead_path),
|
||
|
"/sys/dev/block/%d:%d/queue/read_ahead_kb",
|
||
|
major(path_stat.st_dev), minor(path_stat.st_dev))
|
||
|
>= sizeof(finfo.dev_queue_read_ahead_path)) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
if (!strstr(buf, "DEVTYPE=partition")) {
|
||
|
printf("%s: Unknown device type: %s\n", __func__, path);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
/*
|
||
|
* Partition of block device - need to find actual device.
|
||
|
* Using naming convention that devnameN is partition of
|
||
|
* device devname.
|
||
|
*/
|
||
|
str = strstr(buf, "DEVNAME=");
|
||
|
if (!str) {
|
||
|
printf("%s: Could not read: %s", __func__, path);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
str += 8;
|
||
|
end = str;
|
||
|
while (*end) {
|
||
|
if (isdigit(*end)) {
|
||
|
*end = '\0';
|
||
|
if (snprintf(finfo.dev_queue_read_ahead_path,
|
||
|
sizeof(finfo.dev_queue_read_ahead_path),
|
||
|
"/sys/block/%s/queue/read_ahead_kb",
|
||
|
str) >= sizeof(finfo.dev_queue_read_ahead_path)) {
|
||
|
printf("%s: Pathname is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
return;
|
||
|
}
|
||
|
++end;
|
||
|
}
|
||
|
printf("%s: Could not read: %s\n", __func__, path);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
static bool check_swap(void *addr, unsigned long size)
|
||
|
{
|
||
|
bool swap = false;
|
||
|
int ret;
|
||
|
FILE *fp;
|
||
|
char buffer[MAX_LINE_LENGTH];
|
||
|
char addr_pattern[MAX_LINE_LENGTH];
|
||
|
|
||
|
ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-",
|
||
|
(unsigned long) addr);
|
||
|
if (ret >= MAX_LINE_LENGTH) {
|
||
|
printf("%s: Pattern is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
|
||
|
fp = fopen(PID_SMAPS, "r");
|
||
|
if (!fp) {
|
||
|
printf("%s: Failed to open file %s\n", __func__, PID_SMAPS);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer)))
|
||
|
goto err_out;
|
||
|
|
||
|
ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB",
|
||
|
size >> 10);
|
||
|
if (ret >= MAX_LINE_LENGTH) {
|
||
|
printf("%s: Pattern is too long\n", __func__);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
/*
|
||
|
* Fetch the Swap: in the same block and check whether it got
|
||
|
* the expected number of hugeepages next.
|
||
|
*/
|
||
|
if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer)))
|
||
|
goto err_out;
|
||
|
|
||
|
if (strncmp(buffer, addr_pattern, strlen(addr_pattern)))
|
||
|
goto err_out;
|
||
|
|
||
|
swap = true;
|
||
|
err_out:
|
||
|
fclose(fp);
|
||
|
return swap;
|
||
|
}
|
||
|
|
||
|
static void *alloc_mapping(int nr)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
|
||
|
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||
|
if (p != BASE_ADDR) {
|
||
|
printf("Failed to allocate VMA at %p\n", BASE_ADDR);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
return p;
|
||
|
}
|
||
|
|
||
|
static void fill_memory(int *p, unsigned long start, unsigned long end)
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
for (i = start / page_size; i < end / page_size; i++)
|
||
|
p[i * page_size / sizeof(*p)] = i + 0xdead0000;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* MADV_COLLAPSE is a best-effort request and may fail if an internal
|
||
|
* resource is temporarily unavailable, in which case it will set errno to
|
||
|
* EAGAIN. In such a case, immediately reattempt the operation one more
|
||
|
* time.
|
||
|
*/
|
||
|
static int madvise_collapse_retry(void *p, unsigned long size)
|
||
|
{
|
||
|
bool retry = true;
|
||
|
int ret;
|
||
|
|
||
|
retry:
|
||
|
ret = madvise(p, size, MADV_COLLAPSE);
|
||
|
if (ret && errno == EAGAIN && retry) {
|
||
|
retry = false;
|
||
|
goto retry;
|
||
|
}
|
||
|
return ret;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
|
||
|
* validate_memory()'able contents.
|
||
|
*/
|
||
|
static void *alloc_hpage(struct mem_ops *ops)
|
||
|
{
|
||
|
void *p = ops->setup_area(1);
|
||
|
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
|
||
|
/*
|
||
|
* VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE.
|
||
|
* The latter is ineligible for collapse by MADV_COLLAPSE
|
||
|
* while the former might cause MADV_COLLAPSE to race with
|
||
|
* khugepaged on low-load system (like a test machine), which
|
||
|
* would cause MADV_COLLAPSE to fail with EAGAIN.
|
||
|
*/
|
||
|
printf("Allocate huge page...");
|
||
|
if (madvise_collapse_retry(p, hpage_pmd_size)) {
|
||
|
perror("madvise(MADV_COLLAPSE)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (!ops->check_huge(p, 1)) {
|
||
|
perror("madvise(MADV_COLLAPSE)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) {
|
||
|
perror("madvise(MADV_HUGEPAGE)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
success("OK");
|
||
|
return p;
|
||
|
}
|
||
|
|
||
|
static void validate_memory(int *p, unsigned long start, unsigned long end)
|
||
|
{
|
||
|
int i;
|
||
|
|
||
|
for (i = start / page_size; i < end / page_size; i++) {
|
||
|
if (p[i * page_size / sizeof(*p)] != i + 0xdead0000) {
|
||
|
printf("Page %d is corrupted: %#x\n",
|
||
|
i, p[i * page_size / sizeof(*p)]);
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
/* mem_ops.setup_area for anonymous memory: a fresh mapping at BASE_ADDR. */
static void *anon_setup_area(int nr_hpages)
{
	void *area = alloc_mapping(nr_hpages);

	return area;
}
|
||
|
|
||
|
/* mem_ops.cleanup_area for anonymous memory: unmap @size bytes at @p. */
static void anon_cleanup_area(void *p, unsigned long size)
{
	munmap(p, size);
}
|
||
|
|
||
|
/*
 * mem_ops.fault for writable mappings: populate [@start, @end) by writing
 * the fill_memory() markers.
 */
static void anon_fault(void *p, unsigned long start, unsigned long end)
{
	int *words = p;

	fill_memory(words, start, end);
}
|
||
|
|
||
|
static bool anon_check_huge(void *addr, int nr_hpages)
|
||
|
{
|
||
|
return check_huge_anon(addr, nr_hpages, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void *file_setup_area(int nr_hpages)
|
||
|
{
|
||
|
int fd;
|
||
|
void *p;
|
||
|
unsigned long size;
|
||
|
|
||
|
unlink(finfo.path); /* Cleanup from previous failed tests */
|
||
|
printf("Creating %s for collapse%s...", finfo.path,
|
||
|
finfo.type == VMA_SHMEM ? " (tmpfs)" : "");
|
||
|
fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL,
|
||
|
777);
|
||
|
if (fd < 0) {
|
||
|
perror("open()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
size = nr_hpages * hpage_pmd_size;
|
||
|
p = alloc_mapping(nr_hpages);
|
||
|
fill_memory(p, 0, size);
|
||
|
write(fd, p, size);
|
||
|
close(fd);
|
||
|
munmap(p, size);
|
||
|
success("OK");
|
||
|
|
||
|
printf("Opening %s read only for collapse...", finfo.path);
|
||
|
finfo.fd = open(finfo.path, O_RDONLY, 777);
|
||
|
if (finfo.fd < 0) {
|
||
|
perror("open()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC,
|
||
|
MAP_PRIVATE, finfo.fd, 0);
|
||
|
if (p == MAP_FAILED || p != BASE_ADDR) {
|
||
|
perror("mmap()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
/* Drop page cache */
|
||
|
write_file("/proc/sys/vm/drop_caches", "3", 2);
|
||
|
success("OK");
|
||
|
return p;
|
||
|
}
|
||
|
|
||
|
static void file_cleanup_area(void *p, unsigned long size)
|
||
|
{
|
||
|
munmap(p, size);
|
||
|
close(finfo.fd);
|
||
|
unlink(finfo.path);
|
||
|
}
|
||
|
|
||
|
static void file_fault(void *p, unsigned long start, unsigned long end)
|
||
|
{
|
||
|
if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) {
|
||
|
perror("madvise(MADV_POPULATE_READ");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static bool file_check_huge(void *addr, int nr_hpages)
|
||
|
{
|
||
|
switch (finfo.type) {
|
||
|
case VMA_FILE:
|
||
|
return check_huge_file(addr, nr_hpages, hpage_pmd_size);
|
||
|
case VMA_SHMEM:
|
||
|
return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
|
||
|
default:
|
||
|
exit(EXIT_FAILURE);
|
||
|
return false;
|
||
|
}
|
||
|
}
|
||
|
|
||
|
static void *shmem_setup_area(int nr_hpages)
|
||
|
{
|
||
|
void *p;
|
||
|
unsigned long size = nr_hpages * hpage_pmd_size;
|
||
|
|
||
|
finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0);
|
||
|
if (finfo.fd < 0) {
|
||
|
perror("memfd_create()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (ftruncate(finfo.fd, size)) {
|
||
|
perror("ftruncate()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd,
|
||
|
0);
|
||
|
if (p != BASE_ADDR) {
|
||
|
perror("mmap()");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
return p;
|
||
|
}
|
||
|
|
||
|
static void shmem_cleanup_area(void *p, unsigned long size)
|
||
|
{
|
||
|
munmap(p, size);
|
||
|
close(finfo.fd);
|
||
|
}
|
||
|
|
||
|
static bool shmem_check_huge(void *addr, int nr_hpages)
|
||
|
{
|
||
|
return check_huge_shmem(addr, nr_hpages, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static struct mem_ops __anon_ops = {
|
||
|
.setup_area = &anon_setup_area,
|
||
|
.cleanup_area = &anon_cleanup_area,
|
||
|
.fault = &anon_fault,
|
||
|
.check_huge = &anon_check_huge,
|
||
|
.name = "anon",
|
||
|
};
|
||
|
|
||
|
static struct mem_ops __file_ops = {
|
||
|
.setup_area = &file_setup_area,
|
||
|
.cleanup_area = &file_cleanup_area,
|
||
|
.fault = &file_fault,
|
||
|
.check_huge = &file_check_huge,
|
||
|
.name = "file",
|
||
|
};
|
||
|
|
||
|
static struct mem_ops __shmem_ops = {
|
||
|
.setup_area = &shmem_setup_area,
|
||
|
.cleanup_area = &shmem_cleanup_area,
|
||
|
.fault = &anon_fault,
|
||
|
.check_huge = &shmem_check_huge,
|
||
|
.name = "shmem",
|
||
|
};
|
||
|
|
||
|
static void __madvise_collapse(const char *msg, char *p, int nr_hpages,
|
||
|
struct mem_ops *ops, bool expect)
|
||
|
{
|
||
|
int ret;
|
||
|
struct settings settings = *current_settings();
|
||
|
|
||
|
printf("%s...", msg);
|
||
|
|
||
|
/*
|
||
|
* Prevent khugepaged interference and tests that MADV_COLLAPSE
|
||
|
* ignores /sys/kernel/mm/transparent_hugepage/enabled
|
||
|
*/
|
||
|
settings.thp_enabled = THP_NEVER;
|
||
|
settings.shmem_enabled = SHMEM_NEVER;
|
||
|
push_settings(&settings);
|
||
|
|
||
|
/* Clear VM_NOHUGEPAGE */
|
||
|
madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
|
||
|
ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size);
|
||
|
if (((bool)ret) == expect)
|
||
|
fail("Fail: Bad return value");
|
||
|
else if (!ops->check_huge(p, expect ? nr_hpages : 0))
|
||
|
fail("Fail: check_huge()");
|
||
|
else
|
||
|
success("OK");
|
||
|
|
||
|
pop_settings();
|
||
|
}
|
||
|
|
||
|
static void madvise_collapse(const char *msg, char *p, int nr_hpages,
|
||
|
struct mem_ops *ops, bool expect)
|
||
|
{
|
||
|
/* Sanity check */
|
||
|
if (!ops->check_huge(p, 0)) {
|
||
|
printf("Unexpected huge page\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
__madvise_collapse(msg, p, nr_hpages, ops, expect);
|
||
|
}
|
||
|
|
||
|
#define TICK 500000
|
||
|
static bool wait_for_scan(const char *msg, char *p, int nr_hpages,
|
||
|
struct mem_ops *ops)
|
||
|
{
|
||
|
int full_scans;
|
||
|
int timeout = 6; /* 3 seconds */
|
||
|
|
||
|
/* Sanity check */
|
||
|
if (!ops->check_huge(p, 0)) {
|
||
|
printf("Unexpected huge page\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
|
||
|
|
||
|
/* Wait until the second full_scan completed */
|
||
|
full_scans = read_num("khugepaged/full_scans") + 2;
|
||
|
|
||
|
printf("%s...", msg);
|
||
|
while (timeout--) {
|
||
|
if (ops->check_huge(p, nr_hpages))
|
||
|
break;
|
||
|
if (read_num("khugepaged/full_scans") >= full_scans)
|
||
|
break;
|
||
|
printf(".");
|
||
|
usleep(TICK);
|
||
|
}
|
||
|
|
||
|
madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
|
||
|
|
||
|
return timeout == -1;
|
||
|
}
|
||
|
|
||
|
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
|
||
|
struct mem_ops *ops, bool expect)
|
||
|
{
|
||
|
if (wait_for_scan(msg, p, nr_hpages, ops)) {
|
||
|
if (expect)
|
||
|
fail("Timeout");
|
||
|
else
|
||
|
success("OK");
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* For file and shmem memory, khugepaged only retracts pte entries after
|
||
|
* putting the new hugepage in the page cache. The hugepage must be
|
||
|
* subsequently refaulted to install the pmd mapping for the mm.
|
||
|
*/
|
||
|
if (ops != &__anon_ops)
|
||
|
ops->fault(p, 0, nr_hpages * hpage_pmd_size);
|
||
|
|
||
|
if (ops->check_huge(p, expect ? nr_hpages : 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
}
|
||
|
|
||
|
static struct collapse_context __khugepaged_context = {
|
||
|
.collapse = &khugepaged_collapse,
|
||
|
.enforce_pte_scan_limits = true,
|
||
|
.name = "khugepaged",
|
||
|
};
|
||
|
|
||
|
static struct collapse_context __madvise_context = {
|
||
|
.collapse = &madvise_collapse,
|
||
|
.enforce_pte_scan_limits = false,
|
||
|
.name = "madvise",
|
||
|
};
|
||
|
|
||
|
static bool is_tmpfs(struct mem_ops *ops)
|
||
|
{
|
||
|
return ops == &__file_ops && finfo.type == VMA_SHMEM;
|
||
|
}
|
||
|
|
||
|
static void alloc_at_fault(void)
|
||
|
{
|
||
|
struct settings settings = *current_settings();
|
||
|
char *p;
|
||
|
|
||
|
settings.thp_enabled = THP_ALWAYS;
|
||
|
push_settings(&settings);
|
||
|
|
||
|
p = alloc_mapping(1);
|
||
|
*p = 1;
|
||
|
printf("Allocate huge page on fault...");
|
||
|
if (check_huge_anon(p, 1, hpage_pmd_size))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
pop_settings();
|
||
|
|
||
|
madvise(p, page_size, MADV_DONTNEED);
|
||
|
printf("Split huge PMD on MADV_DONTNEED...");
|
||
|
if (check_huge_anon(p, 0, hpage_pmd_size))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
munmap(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_full(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
int nr_hpages = 4;
|
||
|
unsigned long size = nr_hpages * hpage_pmd_size;
|
||
|
|
||
|
p = ops->setup_area(nr_hpages);
|
||
|
ops->fault(p, 0, size);
|
||
|
c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
|
||
|
ops, true);
|
||
|
validate_memory(p, 0, size);
|
||
|
ops->cleanup_area(p, size);
|
||
|
}
|
||
|
|
||
|
static void collapse_empty(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
c->collapse("Do not collapse empty PTE table", p, 1, ops, false);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
ops->fault(p, 0, page_size);
|
||
|
c->collapse("Collapse PTE table with single PTE entry present", p,
|
||
|
1, ops, true);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
int max_ptes_none = hpage_pmd_nr / 2;
|
||
|
struct settings settings = *current_settings();
|
||
|
void *p;
|
||
|
|
||
|
settings.khugepaged.max_ptes_none = max_ptes_none;
|
||
|
push_settings(&settings);
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
|
||
|
if (is_tmpfs(ops)) {
|
||
|
/* shmem pages always in the page cache */
|
||
|
printf("tmpfs...");
|
||
|
skip("Skip");
|
||
|
goto skip;
|
||
|
}
|
||
|
|
||
|
ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
|
||
|
c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
|
||
|
ops, !c->enforce_pte_scan_limits);
|
||
|
validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
|
||
|
|
||
|
if (c->enforce_pte_scan_limits) {
|
||
|
ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
|
||
|
c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops,
|
||
|
true);
|
||
|
validate_memory(p, 0,
|
||
|
(hpage_pmd_nr - max_ptes_none) * page_size);
|
||
|
}
|
||
|
skip:
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
pop_settings();
|
||
|
}
|
||
|
|
||
|
static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
|
||
|
printf("Swapout one page...");
|
||
|
if (madvise(p, page_size, MADV_PAGEOUT)) {
|
||
|
perror("madvise(MADV_PAGEOUT)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (check_swap(p, page_size)) {
|
||
|
success("OK");
|
||
|
} else {
|
||
|
fail("Fail");
|
||
|
goto out;
|
||
|
}
|
||
|
|
||
|
c->collapse("Collapse with swapping in single PTE entry", p, 1, ops,
|
||
|
true);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
out:
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
|
||
|
printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
|
||
|
if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) {
|
||
|
perror("madvise(MADV_PAGEOUT)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (check_swap(p, (max_ptes_swap + 1) * page_size)) {
|
||
|
success("OK");
|
||
|
} else {
|
||
|
fail("Fail");
|
||
|
goto out;
|
||
|
}
|
||
|
|
||
|
c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops,
|
||
|
!c->enforce_pte_scan_limits);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
|
||
|
if (c->enforce_pte_scan_limits) {
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
printf("Swapout %d of %d pages...", max_ptes_swap,
|
||
|
hpage_pmd_nr);
|
||
|
if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
|
||
|
perror("madvise(MADV_PAGEOUT)");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
if (check_swap(p, max_ptes_swap * page_size)) {
|
||
|
success("OK");
|
||
|
} else {
|
||
|
fail("Fail");
|
||
|
goto out;
|
||
|
}
|
||
|
|
||
|
c->collapse("Collapse with max_ptes_swap pages swapped out", p,
|
||
|
1, ops, true);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
}
|
||
|
out:
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = alloc_hpage(ops);
|
||
|
|
||
|
if (is_tmpfs(ops)) {
|
||
|
/* MADV_DONTNEED won't evict tmpfs pages */
|
||
|
printf("tmpfs...");
|
||
|
skip("Skip");
|
||
|
goto skip;
|
||
|
}
|
||
|
|
||
|
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||
|
printf("Split huge page leaving single PTE mapping compound page...");
|
||
|
madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
c->collapse("Collapse PTE table with single PTE mapping compound page",
|
||
|
p, 1, ops, true);
|
||
|
validate_memory(p, 0, page_size);
|
||
|
skip:
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = alloc_hpage(ops);
|
||
|
printf("Split huge page leaving single PTE page table full of compound pages...");
|
||
|
madvise(p, page_size, MADV_NOHUGEPAGE);
|
||
|
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
c->collapse("Collapse PTE table full of compound pages", p, 1, ops,
|
||
|
true);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
int i;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
for (i = 0; i < hpage_pmd_nr; i++) {
|
||
|
printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
|
||
|
i + 1, hpage_pmd_nr);
|
||
|
|
||
|
madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
|
||
|
ops->fault(BASE_ADDR, 0, hpage_pmd_size);
|
||
|
if (!ops->check_huge(BASE_ADDR, 1)) {
|
||
|
printf("Failed to allocate huge page\n");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
madvise(BASE_ADDR, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||
|
|
||
|
p = mremap(BASE_ADDR - i * page_size,
|
||
|
i * page_size + hpage_pmd_size,
|
||
|
(i + 1) * page_size,
|
||
|
MREMAP_MAYMOVE | MREMAP_FIXED,
|
||
|
BASE_ADDR + 2 * hpage_pmd_size);
|
||
|
if (p == MAP_FAILED) {
|
||
|
perror("mremap+unmap");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
|
||
|
p = mremap(BASE_ADDR + 2 * hpage_pmd_size,
|
||
|
(i + 1) * page_size,
|
||
|
(i + 1) * page_size + hpage_pmd_size,
|
||
|
MREMAP_MAYMOVE | MREMAP_FIXED,
|
||
|
BASE_ADDR - (i + 1) * page_size);
|
||
|
if (p == MAP_FAILED) {
|
||
|
perror("mremap+alloc");
|
||
|
exit(EXIT_FAILURE);
|
||
|
}
|
||
|
}
|
||
|
|
||
|
ops->cleanup_area(BASE_ADDR, hpage_pmd_size);
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
if (!ops->check_huge(p, 1))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
c->collapse("Collapse PTE table full of different compound pages", p, 1,
|
||
|
ops, true);
|
||
|
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_fork(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
int wstatus;
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
|
||
|
printf("Allocate small page...");
|
||
|
ops->fault(p, 0, page_size);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
printf("Share small page over fork()...");
|
||
|
if (!fork()) {
|
||
|
/* Do not touch settings on child exit */
|
||
|
skip_settings_restore = true;
|
||
|
exit_status = 0;
|
||
|
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
ops->fault(p, page_size, 2 * page_size);
|
||
|
c->collapse("Collapse PTE table with single page shared with parent process",
|
||
|
p, 1, ops, true);
|
||
|
|
||
|
validate_memory(p, 0, page_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
exit(exit_status);
|
||
|
}
|
||
|
|
||
|
wait(&wstatus);
|
||
|
exit_status += WEXITSTATUS(wstatus);
|
||
|
|
||
|
printf("Check if parent still has small page...");
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
validate_memory(p, 0, page_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
int wstatus;
|
||
|
void *p;
|
||
|
|
||
|
p = alloc_hpage(ops);
|
||
|
printf("Share huge page over fork()...");
|
||
|
if (!fork()) {
|
||
|
/* Do not touch settings on child exit */
|
||
|
skip_settings_restore = true;
|
||
|
exit_status = 0;
|
||
|
|
||
|
if (ops->check_huge(p, 1))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
printf("Split huge page PMD in child process...");
|
||
|
madvise(p, page_size, MADV_NOHUGEPAGE);
|
||
|
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
ops->fault(p, 0, page_size);
|
||
|
|
||
|
write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
|
||
|
c->collapse("Collapse PTE table full of compound pages in child",
|
||
|
p, 1, ops, true);
|
||
|
write_num("khugepaged/max_ptes_shared",
|
||
|
current_settings()->khugepaged.max_ptes_shared);
|
||
|
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
exit(exit_status);
|
||
|
}
|
||
|
|
||
|
wait(&wstatus);
|
||
|
exit_status += WEXITSTATUS(wstatus);
|
||
|
|
||
|
printf("Check if parent still has huge page...");
|
||
|
if (ops->check_huge(p, 1))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops)
|
||
|
{
|
||
|
int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
|
||
|
int wstatus;
|
||
|
void *p;
|
||
|
|
||
|
p = alloc_hpage(ops);
|
||
|
printf("Share huge page over fork()...");
|
||
|
if (!fork()) {
|
||
|
/* Do not touch settings on child exit */
|
||
|
skip_settings_restore = true;
|
||
|
exit_status = 0;
|
||
|
|
||
|
if (ops->check_huge(p, 1))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
printf("Trigger CoW on page %d of %d...",
|
||
|
hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
|
||
|
ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
|
||
|
1, ops, !c->enforce_pte_scan_limits);
|
||
|
|
||
|
if (c->enforce_pte_scan_limits) {
|
||
|
printf("Trigger CoW on page %d of %d...",
|
||
|
hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
|
||
|
ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) *
|
||
|
page_size);
|
||
|
if (ops->check_huge(p, 0))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
|
||
|
c->collapse("Collapse with max_ptes_shared PTEs shared",
|
||
|
p, 1, ops, true);
|
||
|
}
|
||
|
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
exit(exit_status);
|
||
|
}
|
||
|
|
||
|
wait(&wstatus);
|
||
|
exit_status += WEXITSTATUS(wstatus);
|
||
|
|
||
|
printf("Check if parent still has huge page...");
|
||
|
if (ops->check_huge(p, 1))
|
||
|
success("OK");
|
||
|
else
|
||
|
fail("Fail");
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
static void madvise_collapse_existing_thps(struct collapse_context *c,
|
||
|
struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
|
||
|
p = ops->setup_area(1);
|
||
|
ops->fault(p, 0, hpage_pmd_size);
|
||
|
c->collapse("Collapse fully populated PTE table...", p, 1, ops, true);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
|
||
|
/* c->collapse() will find a hugepage and complain - call directly. */
|
||
|
__madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true);
|
||
|
validate_memory(p, 0, hpage_pmd_size);
|
||
|
ops->cleanup_area(p, hpage_pmd_size);
|
||
|
}
|
||
|
|
||
|
/*
|
||
|
* Test race with khugepaged where page tables have been retracted and
|
||
|
* pmd cleared.
|
||
|
*/
|
||
|
static void madvise_retracted_page_tables(struct collapse_context *c,
|
||
|
struct mem_ops *ops)
|
||
|
{
|
||
|
void *p;
|
||
|
int nr_hpages = 1;
|
||
|
unsigned long size = nr_hpages * hpage_pmd_size;
|
||
|
|
||
|
p = ops->setup_area(nr_hpages);
|
||
|
ops->fault(p, 0, size);
|
||
|
|
||
|
/* Let khugepaged collapse and leave pmd cleared */
|
||
|
if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages,
|
||
|
ops)) {
|
||
|
fail("Timeout");
|
||
|
return;
|
||
|
}
|
||
|
success("OK");
|
||
|
c->collapse("Install huge PMD from page cache", p, nr_hpages, ops,
|
||
|
true);
|
||
|
validate_memory(p, 0, size);
|
||
|
ops->cleanup_area(p, size);
|
||
|
}
|
||
|
|
||
|
/*
 * Print the command-line help text to stderr and terminate the program with
 * exit status 1.
 */
static void usage(void)
{
	/* Emitted in order; none of the entries contains a format specifier. */
	static const char *const help[] = {
		"\nUsage: ./khugepaged <test type> [dir]\n\n",
		"\t<test type>\t: <context>:<mem_type>\n",
		"\t<context>\t: [all|khugepaged|madvise]\n",
		"\t<mem_type>\t: [all|anon|file|shmem]\n",
		"\n\t\"file,all\" mem_type requires [dir] argument\n",
		"\n\t\"file,all\" mem_type requires kernel built with\n",
		"\tCONFIG_READ_ONLY_THP_FOR_FS=y\n",
		"\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n",
		"\tmounted with huge=madvise option for khugepaged tests to work\n",
	};
	size_t idx;

	for (idx = 0; idx < sizeof(help) / sizeof(help[0]); idx++)
		fputs(help[idx], stderr);

	exit(1);
}
|
||
|
|
||
|
static void parse_test_type(int argc, const char **argv)
|
||
|
{
|
||
|
char *buf;
|
||
|
const char *token;
|
||
|
|
||
|
if (argc == 1) {
|
||
|
/* Backwards compatibility */
|
||
|
khugepaged_context = &__khugepaged_context;
|
||
|
madvise_context = &__madvise_context;
|
||
|
anon_ops = &__anon_ops;
|
||
|
return;
|
||
|
}
|
||
|
|
||
|
buf = strdup(argv[1]);
|
||
|
token = strsep(&buf, ":");
|
||
|
|
||
|
if (!strcmp(token, "all")) {
|
||
|
khugepaged_context = &__khugepaged_context;
|
||
|
madvise_context = &__madvise_context;
|
||
|
} else if (!strcmp(token, "khugepaged")) {
|
||
|
khugepaged_context = &__khugepaged_context;
|
||
|
} else if (!strcmp(token, "madvise")) {
|
||
|
madvise_context = &__madvise_context;
|
||
|
} else {
|
||
|
usage();
|
||
|
}
|
||
|
|
||
|
if (!buf)
|
||
|
usage();
|
||
|
|
||
|
if (!strcmp(buf, "all")) {
|
||
|
file_ops = &__file_ops;
|
||
|
anon_ops = &__anon_ops;
|
||
|
shmem_ops = &__shmem_ops;
|
||
|
} else if (!strcmp(buf, "anon")) {
|
||
|
anon_ops = &__anon_ops;
|
||
|
} else if (!strcmp(buf, "file")) {
|
||
|
file_ops = &__file_ops;
|
||
|
} else if (!strcmp(buf, "shmem")) {
|
||
|
shmem_ops = &__shmem_ops;
|
||
|
} else {
|
||
|
usage();
|
||
|
}
|
||
|
|
||
|
if (!file_ops)
|
||
|
return;
|
||
|
|
||
|
if (argc != 3)
|
||
|
usage();
|
||
|
}
|
||
|
|
||
|
int main(int argc, const char **argv)
|
||
|
{
|
||
|
struct settings default_settings = {
|
||
|
.thp_enabled = THP_MADVISE,
|
||
|
.thp_defrag = THP_DEFRAG_ALWAYS,
|
||
|
.shmem_enabled = SHMEM_ADVISE,
|
||
|
.use_zero_page = 0,
|
||
|
.khugepaged = {
|
||
|
.defrag = 1,
|
||
|
.alloc_sleep_millisecs = 10,
|
||
|
.scan_sleep_millisecs = 10,
|
||
|
},
|
||
|
/*
|
||
|
* When testing file-backed memory, the collapse path
|
||
|
* looks at how many pages are found in the page cache, not
|
||
|
* what pages are mapped. Disable read ahead optimization so
|
||
|
* pages don't find their way into the page cache unless
|
||
|
* we mem_ops->fault() them in.
|
||
|
*/
|
||
|
.read_ahead_kb = 0,
|
||
|
};
|
||
|
|
||
|
parse_test_type(argc, argv);
|
||
|
|
||
|
if (file_ops)
|
||
|
get_finfo(argv[2]);
|
||
|
|
||
|
setbuf(stdout, NULL);
|
||
|
|
||
|
page_size = getpagesize();
|
||
|
hpage_pmd_size = read_pmd_pagesize();
|
||
|
hpage_pmd_nr = hpage_pmd_size / page_size;
|
||
|
|
||
|
default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1;
|
||
|
default_settings.khugepaged.max_ptes_swap = hpage_pmd_nr / 8;
|
||
|
default_settings.khugepaged.max_ptes_shared = hpage_pmd_nr / 2;
|
||
|
default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
|
||
|
|
||
|
save_settings();
|
||
|
push_settings(&default_settings);
|
||
|
|
||
|
alloc_at_fault();
|
||
|
|
||
|
#define TEST(t, c, o) do { \
|
||
|
if (c && o) { \
|
||
|
printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \
|
||
|
t(c, o); \
|
||
|
} \
|
||
|
} while (0)
|
||
|
|
||
|
TEST(collapse_full, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_full, khugepaged_context, file_ops);
|
||
|
TEST(collapse_full, khugepaged_context, shmem_ops);
|
||
|
TEST(collapse_full, madvise_context, anon_ops);
|
||
|
TEST(collapse_full, madvise_context, file_ops);
|
||
|
TEST(collapse_full, madvise_context, shmem_ops);
|
||
|
|
||
|
TEST(collapse_empty, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_empty, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_single_pte_entry, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_single_pte_entry, khugepaged_context, file_ops);
|
||
|
TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops);
|
||
|
TEST(collapse_single_pte_entry, madvise_context, anon_ops);
|
||
|
TEST(collapse_single_pte_entry, madvise_context, file_ops);
|
||
|
TEST(collapse_single_pte_entry, madvise_context, shmem_ops);
|
||
|
|
||
|
TEST(collapse_max_ptes_none, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_max_ptes_none, khugepaged_context, file_ops);
|
||
|
TEST(collapse_max_ptes_none, madvise_context, anon_ops);
|
||
|
TEST(collapse_max_ptes_none, madvise_context, file_ops);
|
||
|
|
||
|
TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops);
|
||
|
TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops);
|
||
|
TEST(collapse_single_pte_entry_compound, madvise_context, file_ops);
|
||
|
|
||
|
TEST(collapse_full_of_compound, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_full_of_compound, khugepaged_context, file_ops);
|
||
|
TEST(collapse_full_of_compound, khugepaged_context, shmem_ops);
|
||
|
TEST(collapse_full_of_compound, madvise_context, anon_ops);
|
||
|
TEST(collapse_full_of_compound, madvise_context, file_ops);
|
||
|
TEST(collapse_full_of_compound, madvise_context, shmem_ops);
|
||
|
|
||
|
TEST(collapse_compound_extreme, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_compound_extreme, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_swapin_single_pte, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_max_ptes_swap, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_fork, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_fork, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_fork_compound, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_fork_compound, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops);
|
||
|
TEST(collapse_max_ptes_shared, madvise_context, anon_ops);
|
||
|
|
||
|
TEST(madvise_collapse_existing_thps, madvise_context, anon_ops);
|
||
|
TEST(madvise_collapse_existing_thps, madvise_context, file_ops);
|
||
|
TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops);
|
||
|
|
||
|
TEST(madvise_retracted_page_tables, madvise_context, file_ops);
|
||
|
TEST(madvise_retracted_page_tables, madvise_context, shmem_ops);
|
||
|
|
||
|
restore_settings(0);
|
||
|
}
|