arm64 linux内核中memcpy是如何运作的?

起因:做某个内核漏洞利用时,需要控制 copy_from_user() 的返回值为类似 0x7e 这种数,于是想当然构造了如下用户态代码

用户态代码(C):
/*
 * Map a single anonymous page at a fixed address. mmap() takes a void* hint,
 * so the address literal must be cast; passing a bare integer is a constraint
 * violation that newer compilers reject.
 * NOTE(review): the result is not compared against MAP_FAILED, and the
 * experiment presumably relies on the page right after this one
 * (0x60001000) being unmapped -- nothing here enforces that.
 */
char* buf = mmap((void*)0x60000000, 0x1000, PROT_READ | PROT_WRITE, MAP_FIXED | MAP_SHARED | MAP_ANONYMOUS, -1, 0);
/* Start 0x180 bytes before the end of the mapped page. */
char* write_buffer = buf + (0x1000-0x180);
/* 0x180 bytes inside the page plus 0x7e bytes that run past its end. */
int write_size = 0x180+0x7e;
k_stackof_write(write_buffer,write_size);
/* In the kernel this ends up as copy_from_user(k_addr, write_buffer, write_size); */

按理说,内核态会有0x7e个字节拷贝失败,copy_from_user()的返回值应当为0x7e。然而,调试发现copy_from_user()的返回值为0xae(0xae-0x7e=0x30)。把越界部分的长度换成0x63,返回值是0x93(0x93-0x63=0x30);换成0x23,返回值是0x23(符合预期)。

很奇怪的现象,为什么返回值会出现跟预期不一样的情况?copy_from_user()底层实际拷贝时究竟是怎么做的呢?

涉及到一段汇编代码:copy_template.S

以arm64架构、linux 5.4.50 为例,探索下内核中的拷贝过程是怎样的。以待拷贝长度大于等于128字节的情况为例,主要的拷贝逻辑在 .Lcpy_body_large 中。

arch/arm64/lib/copy_template.S(Linux 5.4.50)源码如下:
/* SPDX-License-Identifier: GPL-2.0-only */
/*
* Copyright (C) 2013 ARM Ltd.
* Copyright (C) 2013 Linaro.
*
* This code is based on glibc cortex strings work originally authored by Linaro
* be found @
*
* <http://bazaar.launchpad.net/~linaro-toolchain-dev/cortex-strings/trunk/>
* files/head:/src/aarch64/
*/

/*
* Copy a buffer from src to dest (alignment handled by the hardware)
*
* Parameters:
* x0 - dest
* x1 - src
* x2 - n
* Returns:
* x0 - dest
*
* The lines below only define register aliases (.req) for the rest of
* the template; they emit no instructions.
*/
dstin .req x0 /* destination pointer as passed in */
src .req x1 /* source pointer */
count .req x2 /* byte count */
tmp1 .req x3 /* scratch, 64-bit view */
tmp1w .req w3 /* scratch, 32-bit view of the same register as tmp1 */
tmp2 .req x4 /* scratch, 64-bit view */
tmp2w .req w4 /* scratch, 32-bit view of the same register as tmp2 */
dst .req x6 /* working destination pointer, initialised from dstin */

/* Four register pairs; each pair carries one 16-byte chunk in the bulk paths. */
A_l .req x7
A_h .req x8
B_l .req x9
B_h .req x10
C_l .req x11
C_h .req x12
D_l .req x13
D_h .req x14

/*
* Entry point of the template body.
*
* NOTE(review): ldrb1/strb1/ldrh1/strh1/ldr1/str1 (and ldp1/stp1 later on)
* are macros that are not defined in this excerpt; presumably the file that
* includes this template supplies them and each one advances its pointer
* operand by the given immediate -- confirm against the includer.
*/
mov dst, dstin /* work on a copy of the destination pointer */
cmp count, #16
/* With fewer than 16 bytes in total, no attempt is made to align the accesses. */
b.lo .Ltiny15 /* count < 16 (unsigned): go straight to the small-tail code */

neg tmp2, src /* tmp2 = -src, used to find the distance to a 16-byte boundary */
ands tmp2, tmp2, #15/* Bytes to reach alignment. */
b.eq .LSrcAligned /* zero: src is already 16-byte aligned */
sub count, count, tmp2
/*
* Copy the leading memory data from src to dst in an increasing
* address order. This way, the risk of overwriting the source
* memory data is eliminated when the distance between src and
* dst is less than 16. The memory accesses here are aligned.
*
* Each set bit of tmp2 (1, 2, 4, 8) selects one copy of that many bytes.
*/
tbz tmp2, #0, 1f
ldrb1 tmp1w, src, #1
strb1 tmp1w, dst, #1
1:
tbz tmp2, #1, 2f
ldrh1 tmp1w, src, #2
strh1 tmp1w, dst, #2
2:
tbz tmp2, #2, 3f
ldr1 tmp1w, src, #4
str1 tmp1w, dst, #4
3:
tbz tmp2, #3, .LSrcAligned
ldr1 tmp1, src, #8
str1 tmp1, dst, #8

.LSrcAligned:
cmp count, #64
b.ge .Lcpy_over64 /* count >= 64 (signed compare): take the bulk-copy path */
/*
* Deal with small copies quickly by dropping straight into the
* exit block.
*/
.Ltail63:
/*
* Copy up to 48 bytes of data. At this point we only need the
* bottom 6 bits of count to be accurate.
*
* tmp1 = count & 0x30 decides how many of the three 16-byte ldp1/stp1
* pairs below are executed: 0x30 falls through all three, 0x20 enters
* at 1:, 0x10 enters at 2:, and 0 skips them and goes to .Ltiny15.
*/
ands tmp1, count, #0x30
b.eq .Ltiny15
cmp tmp1w, #0x20
b.eq 1f
b.lt 2f
ldp1 A_l, A_h, src, #16
stp1 A_l, A_h, dst, #16
1:
ldp1 A_l, A_h, src, #16
stp1 A_l, A_h, dst, #16
2:
ldp1 A_l, A_h, src, #16
stp1 A_l, A_h, dst, #16
.Ltiny15:
/*
* Copy the last 0..15 bytes: bits 3, 2, 1 and 0 of count select an
* 8-, 4-, 2- and 1-byte copy respectively, in increasing address order.
*
* Prefer to break one ldp/stp into several load/store to access
* memory in an increasing address order, rather than to load/store 16
* bytes from (src-16) to (dst-16) and to backward the src to aligned
* address, which way is used in original cortex memcpy. If keeping
* the original memcpy process here, memmove need to satisfy the
* precondition that src address is at least 16 bytes bigger than dst
* address, otherwise some source data will be overwritten when memmove
* call memcpy directly. To make memmove simpler and decouple the
* memcpy's dependency on memmove, withdrew the original process.
*/
tbz count, #3, 1f
ldr1 tmp1, src, #8
str1 tmp1, dst, #8
1:
tbz count, #2, 2f
ldr1 tmp1w, src, #4
str1 tmp1w, dst, #4
2:
tbz count, #1, 3f
ldrh1 tmp1w, src, #2
strh1 tmp1w, dst, #2
3:
tbz count, #0, .Lexitfunc
ldrb1 tmp1w, src, #1
strb1 tmp1w, dst, #1

b .Lexitfunc

.Lcpy_over64:
/*
* Bias count by -128. After this point only the sign of count and its
* low 6 bits are examined; subtracting a multiple of 64 leaves those
* low 6 bits unchanged.
*/
subs count, count, #128
b.ge .Lcpy_body_large /* count was >= 128: use the pipelined 64-byte loop */
/*
* Less than 128 bytes to copy, so handle 64 here and then jump
* to the tail.
*/
ldp1 A_l, A_h, src, #16
stp1 A_l, A_h, dst, #16
ldp1 B_l, B_h, src, #16
ldp1 C_l, C_h, src, #16
stp1 B_l, B_h, dst, #16
stp1 C_l, C_h, dst, #16
ldp1 D_l, D_h, src, #16
stp1 D_l, D_h, dst, #16

/* Any remainder below 64 bytes is finished by .Ltail63. */
tst count, #0x3f
b.ne .Ltail63
b .Lexitfunc

/*
* Critical loop. Start at a new cache line boundary. Assuming
* 64 bytes per line this ensures the entire loop is in one line.
*
* On entry count has already been reduced by 128 (see .Lcpy_over64).
*/
.p2align L1_CACHE_SHIFT
.Lcpy_body_large:
/* pre-get 64 bytes data. */
ldp1 A_l, A_h, src, #16
ldp1 B_l, B_h, src, #16
ldp1 C_l, C_h, src, #16
ldp1 D_l, D_h, src, #16 /* the first 64 source bytes are now held in A..D */
1:
/*
* interlace the load of next 64 bytes data block with store of the last
* loaded 64 bytes data.
*
* NOTE(review): because stores and loads alternate, a load that faults
* part-way through an iteration leaves the later pairs of the previous
* block still in registers and not yet stored. The fault handling itself
* is not part of this excerpt.
*/
stp1 A_l, A_h, dst, #16 /* store the 16 bytes held in A */
ldp1 A_l, A_h, src, #16 /* refill A with the first 16 bytes of the next 64-byte block */
stp1 B_l, B_h, dst, #16 /* B, C and D repeat the same store-then-refill step */
ldp1 B_l, B_h, src, #16
stp1 C_l, C_h, dst, #16
ldp1 C_l, C_h, src, #16
stp1 D_l, D_h, dst, #16
ldp1 D_l, D_h, src, #16
subs count, count, #64 /* one more 64-byte block has been loaded; count stays >= 0 while another full block remains */
b.ge 1b /* count >= 0: run another store/refill iteration */
/* Loop finished: store the last 64 bytes that were loaded into A..D. */
stp1 A_l, A_h, dst, #16
stp1 B_l, B_h, dst, #16
stp1 C_l, C_h, dst, #16
stp1 D_l, D_h, dst, #16

/* Any remainder below 64 bytes (low 6 bits of count) is finished by .Ltail63. */
tst count, #0x3f
b.ne .Ltail63
.Lexitfunc:

Lcpy_body_large 中主要逻辑如下图示。64字节为一组,上一轮中,已将源地址(用户态)64字节内容分别加载到A(A_l/A_h)B(B_l/B_h)C(C_l/C_h)D(D_l/D_h)对应的8个寄存器中。此轮中:

  1. 先将A存到目的地址(内核态)
  2. 然后从下一个分组读取16个字节到A寄存器(此时完成16字节从源地址到目的地址的写入)
  3. 后面的 3 4 5 6 7 8 依次重复 1 2的操作,完成共64字节的拷贝

image-20230908172757504

理解汇编后,就能明白开头那段代码中为什么内核在 copy_from_user() 时返回值会出现 0x93(0x63+0x30)和 0x23(0x23+0)这两种情况。

第一种情况:本质是需要访问未映射页面的size大于等于0x40

假设左侧是已映射区域,右侧是未映射区域。那么当执行 2 时,内核访问未映射区域会触发缺页异常,进入异常处理流程。此时寄存器 B C D 中的内容还未来得及写入目的地址。于是未拷贝的长度实际是右侧未映射区域的大小(0x63),加上左侧 B C D 寄存器中尚未写入的内容大小(0x30)。所以 copy_from_user() 返回的未拷贝长度是0x93。

image-20230908172831999

第二种情况:本质是需要访问未映射页面的size小于0x40

剩余count小于0x40的情况下,不会再进入前面的循环拷贝过程,而是一次性将 A B C D 的内容写入目标地址,然后处理剩下的小于0x40的部分。访问这部分内容必然触发异常,于是未拷贝的长度就是未映射页面的size。

第三种情况:左侧已映射部分不是0x40的整数倍

前两种情况都是以左侧已映射部分是0x40的整数倍为前提讨论的。假设左侧已映射部分是0x1b0,右侧是0x10,那么当执行到第8步时会访问到非法内存,此时第2、4、6步加载到寄存器中的0x30字节内容尚未写入目的地址,所以最终 copy_from_user() 的返回值是 0x10+0x30=0x40。其他情况不再一一枚举。

image-20230908172857815

总之,copy_from_user() 的返回值跟预期不一致,是因为实际拷贝过程中异常发生时,部分已加载到寄存器的内容还未来得及写入目标地址。这部分尚未写入目标地址的内容也会被计入未拷贝长度,这是用户态不容易感知到的部分。

参考:

ARM64架构下memcpy实现原理

ARM64-memcpy.S 汇编源码分析