arm64 linux内核中memcpy是如何运作的？

起因：做某个内核漏洞利用时，需要控制 copy_from_user() 的返回值为类似 0x7e 这种数，于是想当然构造了如下用户态代码

/*
 * Map a single anonymous read/write page at the fixed address 0x60000000.
 * mmap() takes a pointer as its first argument, so the address is cast
 * explicitly instead of passing a bare integer.
 * NOTE(review): the experiment relies on the page that follows
 * (0x60001000) being unmapped -- confirm for the target process.
 */
char* buf = mmap((void *)0x60000000, 0x1000, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0);
/* Start 0x180 bytes before the end of the mapped page. */
char* write_buffer = buf + (0x1000-0x180);
/* 0x180 bytes inside the page plus 0x7e bytes that run past its end. */
int write_size = 0x180+0x7e;
k_stackof_write(write_buffer,write_size);
/* In the kernel this ends up as copy_from_user(k_addr, write_buffer, write_size). */

按理说，内核态会有0x7e个字节拷贝失败，copy_from_user()的返回值应当为0x7e。然而，调试发现copy_from_user()的返回值为0xae（0xae-0x7e=0x30）。试了0x63，返回值是0x93（0x93-0x63=0x30），试了0x23，返回值是0x23（符合预期）。

很奇怪的现象，为什么返回值会出现跟预期不一样的情况？copy_from_user()底层实际拷贝时究竟是怎么做的呢？

涉及到一段汇编代码：copy_template.S（位于 arch/arm64/lib/ 下，memcpy 和 copy_from_user() 等拷贝例程共用这份模板，所以弄清它也就弄清了 memcpy 的运作方式）

以arm64架构、linux 5.4.50 为例，探索下内核中的拷贝过程是怎样的。以待拷贝长度大于等于128字节的情况为例，主要的拷贝逻辑在 .Lcpy_body_large 中。

/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Copyright (C) 2013 ARM Ltd.
 * Copyright (C) 2013 Linaro.
 *
 * This code is based on glibc cortex strings work originally authored by Linaro
 * be found @
 *
 * <http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/>
 * files/head:/src/aarch64/
 */

/*
 * Copy a buffer from src to dest (alignment handled by the hardware)
 *
 * Parameters:
 *    x0 - dest
 *    x1 - src
 *    x2 - n
 * Returns:
 *    x0 - dest
 */
/*
 * Symbolic register names used by the copy code below.
 * dstin, src and count alias x0, x1 and x2, i.e. the three parameters
 * listed in the header comment above.
 */
dstin    .req    x0
src    .req    x1
count    .req    x2
/* Scratch registers; tmp1w/tmp2w are the 32-bit views of tmp1/tmp2. */
tmp1    .req    x3
tmp1w    .req    w3
tmp2    .req    x4
tmp2w    .req    w4
/* Working destination pointer; the code copies dstin into it first. */
dst    .req    x6

/*
 * Four low/high register pairs (A..D) used as staging for the 16-byte
 * ldp1/stp1 transfers, 64 bytes in total.
 */
A_l    .req    x7
A_h    .req    x8
B_l    .req    x9
B_h    .req    x10
C_l    .req    x11
C_h    .req    x12
D_l    .req    x13
D_h    .req    x14

    /*
    * Copy body: moves count bytes from src to dst in increasing address
    * order, working on a copy of dstin so that x0 itself is not advanced.
    *
    * NOTE(review): ldrb1/strb1, ldrh1/strh1, ldr1/str1 and ldp1/stp1 are
    * not defined in this excerpt; presumably they are macros supplied by
    * the file that includes this template and they advance src/dst by the
    * trailing immediate -- confirm against the including file.
    */
    mov    dst, dstin
    cmp    count, #16
    /* When the length is less than 16, the accesses are not aligned. */
    b.lo    .Ltiny15          /* count < 16 (unsigned): handle everything in .Ltiny15 */

    neg    tmp2, src           /* tmp2 = -src, used to check whether src is 16-byte aligned */
    ands    tmp2, tmp2, #15/* Bytes to reach alignment. */
    b.eq    .LSrcAligned      /* already aligned: jump to .LSrcAligned */
    sub    count, count, tmp2
    /*
    * Copy the leading memory data from src to dst in an increasing
    * address order. By this way, the risk of overwriting the source
    * memory data is eliminated when the distance between src and
    * dst is less than 16. The memory accesses here are aligned.
    */
    /* Each set bit of tmp2 (1, 2, 4, 8) selects one leading copy of that size. */
    tbz    tmp2, #0, 1f
    ldrb1    tmp1w, src, #1
    strb1    tmp1w, dst, #1
1:
    tbz    tmp2, #1, 2f
    ldrh1    tmp1w, src, #2
    strh1    tmp1w, dst, #2
2:
    tbz    tmp2, #2, 3f
    ldr1    tmp1w, src, #4
    str1    tmp1w, dst, #4
3:
    tbz    tmp2, #3, .LSrcAligned
    ldr1    tmp1, src, #8
    str1    tmp1, dst, #8

.LSrcAligned:
    cmp    count, #64       
    b.ge    .Lcpy_over64       /* count >= 64: jump to .Lcpy_over64 */
    /*
    * Deal with small copies quickly by dropping straight into the
    * exit block.
    */
.Ltail63:
    /*
    * Copy up to 48 bytes of data. At this point we only need the
    * bottom 6 bits of count to be accurate.
    */
    /*
    * tmp1 = count & 0x30 selects the number of 16-byte chunks:
    * 0x30 falls through all three copies, 0x20 enters at 1 (two copies),
    * 0x10 enters at 2 (one copy), 0 skips to .Ltiny15.
    */
    ands    tmp1, count, #0x30
    b.eq    .Ltiny15
    cmp    tmp1w, #0x20
    b.eq    1f
    b.lt    2f
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
1:
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
2:
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
.Ltiny15:
    /*
    * Prefer to break one ldp/stp into several load/store to access
    * memory in an increasing address order,rather than to load/store 16
    * bytes from (src-16) to (dst-16) and to backward the src to aligned
    * address,which way is used in original cortex memcpy. If keeping
    * the original memcpy process here, memmove need to satisfy the
    * precondition that src address is at least 16 bytes bigger than dst
    * address,otherwise some source data will be overwritten when memove
    * call memcpy directly. To make memmove simpler and decouple the
    * memcpy's dependency on memmove, withdrew the original process.
    */
    /* Bits 3..0 of count select the trailing 8-, 4-, 2- and 1-byte copies. */
    tbz    count, #3, 1f
    ldr1    tmp1, src, #8
    str1    tmp1, dst, #8
1:
    tbz    count, #2, 2f
    ldr1    tmp1w, src, #4
    str1    tmp1w, dst, #4
2:
    tbz    count, #1, 3f
    ldrh1    tmp1w, src, #2
    strh1    tmp1w, dst, #2
3:
    tbz    count, #0, .Lexitfunc
    ldrb1    tmp1w, src, #1
    strb1    tmp1w, dst, #1

    b    .Lexitfunc

.Lcpy_over64:
    /*
    * From here on count holds (remaining - 128). Subtracting multiples of
    * 64 leaves its low 6 bits unchanged, so they still give the tail length.
    */
    subs    count, count, #128
    b.ge    .Lcpy_body_large           /* at least 128 bytes to copy: jump to .Lcpy_body_large */
    /*
    * Less than 128 bytes to copy, so handle 64 here and then jump
    * to the tail.
    */
    ldp1    A_l, A_h, src, #16
    stp1    A_l, A_h, dst, #16
    ldp1    B_l, B_h, src, #16
    ldp1    C_l, C_h, src, #16
    stp1    B_l, B_h, dst, #16
    stp1    C_l, C_h, dst, #16
    ldp1    D_l, D_h, src, #16
    stp1    D_l, D_h, dst, #16

    /* Any of the low 6 bits set: a tail of 1..63 bytes is left for .Ltail63. */
    tst    count, #0x3f
    b.ne    .Ltail63
    b    .Lexitfunc

    /*
    * Critical loop.  Start at a new cache line boundary.  Assuming
    * 64 bytes per line this ensures the entire loop is in one line.
    */
    .p2align    L1_CACHE_SHIFT
.Lcpy_body_large:
    /* pre-get 64 bytes data. */
    ldp1    A_l, A_h, src, #16
    ldp1    B_l, B_h, src, #16
    ldp1    C_l, C_h, src, #16
    ldp1    D_l, D_h, src, #16             /* the first 64 bytes are now loaded into registers */
1:
    /*
    * interlace the load of next 64 bytes data block with store of the last
    * loaded 64 bytes data.
    */
    stp1    A_l, A_h, dst, #16         /* store the 16 bytes held in A to the destination */
    ldp1    A_l, A_h, src, #16         /* load 16 bytes of the next 64-byte group from the source into A */
    stp1    B_l, B_h, dst, #16         /* B, C and D repeat the same store-then-load step in turn */
    ldp1    B_l, B_h, src, #16
    stp1    C_l, C_h, dst, #16
    ldp1    C_l, C_h, src, #16
    stp1    D_l, D_h, dst, #16
    ldp1    D_l, D_h, src, #16        
    subs    count, count, #64          /* 64 fewer bytes remain; the next pass loads another 64 bytes while storing the 64 bytes previously held in A/B/C/D */
    b.ge    1b                         /* loop back to 1 while count >= 0 */
    /* Loop finished: flush the last 64 bytes still held in A..D. */
    stp1    A_l, A_h, dst, #16
    stp1    B_l, B_h, dst, #16
    stp1    C_l, C_h, dst, #16
    stp1    D_l, D_h, dst, #16

    /* Any of the low 6 bits set: a tail of 1..63 bytes is left for .Ltail63. */
    tst    count, #0x3f
    b.ne    .Ltail63
.Lexitfunc:

Lcpy_body_large 中主要逻辑如下图示。64字节为一组，上一轮中，已将源地址（用户态）64字节内容分别加载到A（A_l/A_h）B（B_l/B_h）C（C_l/C_h）D（D_l/D_h）对应的8个寄存器中。此轮中：

第 1 步：先将A存到目的地址（内核态）
第 2 步：然后从下一个分组读取16个字节到A寄存器（此时完成16字节从源地址到目的地址的写入）
第 3～8 步：对 B、C、D 依次重复第 1、2 步的操作（3/4 对应 B，5/6 对应 C，7/8 对应 D），完成共64字节的拷贝

理解汇编后，就能明白开头那段代码中为什么内核在 copy_from_user() 时返回值会出现 0x93(0x63+0x30) 和 0x23(0x23+0) 这两种情况。

第一种情况：本质是需要访问未映射页面的size大于等于0x40

假设左侧是已映射区域，右侧是未映射区域。那么当执行第 2 步时，内核访问未映射区域会进入缺页异常处理。此时保存在寄存器 B C D 中的内容还未来得及写入目的地址。于是未拷贝的长度实际是右侧未映射区域的大小（0x63），加上 B C D 寄存器中已从左侧读出但尚未写入的内容大小（0x30）。所以 copy_from_user() 返回的未拷贝长度是0x93。

第二种情况：本质是需要访问未映射页面的size小于0x40

剩余count小于0x40的情况下，不会再进入前面的循环拷贝过程，而是一次性将 A B C D 的内容写入目标地址，然后处理剩下的小于0x40的部分。访问这部分内容必然触发异常，于是未拷贝的长度就是未映射页面的size。

第三种情况：左侧已映射部分不是0x40的整数倍

前两种情况都是以左侧已映射部分是0x40的整数倍为前提讨论的。假设左侧已映射部分是0x1b0，右侧是0x10，那么当执行到第8步时访问到非法内存，此时第 2、4、6 步读入寄存器的0x30字节内容尚未写入目的地址，所以最终copy_from_user() 的返回值是 0x10+0x30=0x40。其他情况不枚举了。

总之，copy_from_user() 的返回值跟预期不一致，是因为实际拷贝过程中异常发生时，寄存器中的内容还未来得及写入目标地址。这部分尚未写入目标地址的内容也会被计入未拷贝长度，这是用户态不容易感知到的部分。

参考：

ARM64架构下memcpy实现原理

ARM64-memcpy.S 汇编源码分析