2 * Copyright (C) 2008-2009 Michal Simek <monstr@monstr.eu>
3 * Copyright (C) 2008-2009 PetaLogix
4 * Copyright (C) 2008 Jim Law - Iris LP All rights reserved.
6 * This file is subject to the terms and conditions of the GNU General
7 * Public License. See the file COPYING in the main directory of this
8 * archive for more details.
10 * Written by Jim Law <jlaw@irispower.com>
12 * intended to replace:
13 * memcpy in memcpy.c and
14 * memmove in memmove.c
15 * ... in arch/microblaze/lib
20 * Attempt at quicker memcpy and memmove for MicroBlaze
21 * Input : Operand1 in Reg r5 - destination address
22 * Operand2 in Reg r6 - source address
23 * Operand3 in Reg r7 - number of bytes to transfer
24 * Output: Result in Reg r3 - starting destination address
28 * Perform (possibly unaligned) copy of a block of memory
29 * between mem locations with size of xfer spec'd in bytes
/*
 * memmove for MicroBlaze (little-endian path, see #ifdef below).
 * In:  r5 = destination d, r6 = source s, r7 = byte count c
 * Out: r3 = original destination (per the header comment above)
 *
 * If the source lies at or above the destination, the copy cannot
 * clobber unread source bytes, so it is delegated to memcpy.
 * Otherwise the regions may overlap with d above s, and the copy is
 * done descending (highest address first).
 *
 * NOTE(review): this chunk is missing several lines of the original
 * file — label definitions referenced below (d_xfer_first_loop,
 * d_xfer_end, d_block_aligned, d_block_unaligned, d_bu1/2/3_loop,
 * d_word_aligned, d_word_unaligned, d_wu1/2/3_loop, d_xfer_end_loop,
 * a_done, and the memmove entry label itself) are not visible here.
 */
33 .type memmove, @function
36 #ifdef __MICROBLAZEEL__
45 cmpu r4, r5, r6 /* n = s - d */
46 bgei r4, HIDDEN_JUMPTARGET(memcpy)
/*
 * d > s and the areas may overlap: copy backwards so each byte is
 * read before it can be overwritten.
 */
48 fast_memcpy_descending:
49 /* move d to return register as value of function */
52 add r5, r5, r7 /* d = d + c */
53 add r6, r6, r7 /* s = s + c */
55 addi r4, r0, 4 /* n = 4 */
56 cmpu r4, r4, r7 /* n = c - n (unsigned) */
57 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
59 /* transfer first 0~3 bytes to get aligned dest address */
60 andi r4, r5, 3 /* n = d & 3 */
61 /* if zero, destination already aligned */
63 rsub r7, r4, r7 /* c = c - n adjust c */
66 /* if no bytes left to transfer, transfer the bulk */
68 addi r6, r6, -1 /* s-- */
69 addi r5, r5, -1 /* d-- */
70 lbui r11, r6, 0 /* h = *s */
71 sbi r11, r5, 0 /* *d = h */
72 brid d_xfer_first_loop /* loop */
73 addi r4, r4, -1 /* n-- (IN DELAY SLOT) */
/* Decide whether at least one 32-byte block can be transferred. */
76 addi r4, r0, 32 /* n = 32 */
77 cmpu r4, r4, r7 /* n = c - n (unsigned) */
78 /* if n < 0, less than one block to transfer */
82 andi r4, r7, 0xffffffe0 /* n = c & ~31 */
83 rsub r7, r4, r7 /* c = c - n */
85 andi r9, r6, 3 /* t1 = s & 3 */
86 /* if temp != 0, unaligned transfers needed */
87 bnei r9, d_block_unaligned
/* Both pointers word-aligned: unrolled 32-byte descending copy. */
90 addi r6, r6, -32 /* s = s - 32 */
91 addi r5, r5, -32 /* d = d - 32 */
92 lwi r9, r6, 28 /* t1 = *(s + 28) */
93 lwi r10, r6, 24 /* t2 = *(s + 24) */
94 lwi r11, r6, 20 /* t3 = *(s + 20) */
95 lwi r12, r6, 16 /* t4 = *(s + 16) */
96 swi r9, r5, 28 /* *(d + 28) = t1 */
97 swi r10, r5, 24 /* *(d + 24) = t2 */
98 swi r11, r5, 20 /* *(d + 20) = t3 */
99 swi r12, r5, 16 /* *(d + 16) = t4 */
100 lwi r9, r6, 12 /* t1 = *(s + 12) */
101 lwi r10, r6, 8 /* t2 = *(s + 8) */
102 lwi r11, r6, 4 /* t3 = *(s + 4) */
103 lwi r12, r6, 0 /* t4 = *(s + 0) */
104 swi r9, r5, 12 /* *(d + 12) = t1 */
105 swi r10, r5, 8 /* *(d + 8) = t2 */
106 swi r11, r5, 4 /* *(d + 4) = t3 */
107 addi r4, r4, -32 /* n = n - 32 */
108 bneid r4, d_block_aligned /* while (n) loop */
109 swi r12, r5, 0 /* *(d + 0) = t4 (IN DELAY SLOT) */
/*
 * Unaligned source: read whole words from the aligned-down address
 * as = s & ~3 and splice neighbouring words with shifts.  Dispatch on
 * the source misalignment (s & 3) in r9.
 */
113 andi r8, r6, 0xfffffffc /* as = s & ~3 */
114 rsub r6, r4, r6 /* s = s - n */
115 lwi r11, r8, 0 /* h = *(as + 0) */
118 beqi r9,d_block_u1 /* t1 was 1 => 1 byte offset */
120 beqi r9,d_block_u2 /* t1 was 2 => 2 byte offset */
/* Fall-through: source 3 bytes past alignment (d_bu3_loop below). */
123 BSRLI r11, r11, 8 /* h = h >> 8 */
125 addi r8, r8, -32 /* as = as - 32 */
126 addi r5, r5, -32 /* d = d - 32 */
127 lwi r12, r8, 28 /* v = *(as + 28) */
128 BSLLI r9, r12, 24 /* t1 = v << 24 */
129 or r9, r11, r9 /* t1 = h | t1 */
130 swi r9, r5, 28 /* *(d + 28) = t1 */
131 BSRLI r11, r12, 8 /* h = v >> 8 */
132 lwi r12, r8, 24 /* v = *(as + 24) */
133 BSLLI r9, r12, 24 /* t1 = v << 24 */
134 or r9, r11, r9 /* t1 = h | t1 */
135 swi r9, r5, 24 /* *(d + 24) = t1 */
136 BSRLI r11, r12, 8 /* h = v >> 8 */
137 lwi r12, r8, 20 /* v = *(as + 20) */
138 BSLLI r9, r12, 24 /* t1 = v << 24 */
139 or r9, r11, r9 /* t1 = h | t1 */
140 swi r9, r5, 20 /* *(d + 20) = t1 */
141 BSRLI r11, r12, 8 /* h = v >> 8 */
142 lwi r12, r8, 16 /* v = *(as + 16) */
143 BSLLI r9, r12, 24 /* t1 = v << 24 */
144 or r9, r11, r9 /* t1 = h | t1 */
145 swi r9, r5, 16 /* *(d + 16) = t1 */
146 BSRLI r11, r12, 8 /* h = v >> 8 */
147 lwi r12, r8, 12 /* v = *(as + 12) */
148 BSLLI r9, r12, 24 /* t1 = v << 24 */
149 or r9, r11, r9 /* t1 = h | t1 */
150 swi r9, r5, 12 /* *(d + 12) = t1 */
151 BSRLI r11, r12, 8 /* h = v >> 8 */
152 lwi r12, r8, 8 /* v = *(as + 8) */
153 BSLLI r9, r12, 24 /* t1 = v << 24 */
154 or r9, r11, r9 /* t1 = h | t1 */
155 swi r9, r5, 8 /* *(d + 8) = t1 */
156 BSRLI r11, r12, 8 /* h = v >> 8 */
157 lwi r12, r8, 4 /* v = *(as + 4) */
158 BSLLI r9, r12, 24 /* t1 = v << 24 */
159 or r9, r11, r9 /* t1 = h | t1 */
160 swi r9, r5, 4 /* *(d + 4) = t1 */
161 BSRLI r11, r12, 8 /* h = v >> 8 */
162 lwi r12, r8, 0 /* v = *(as + 0) */
163 BSLLI r9, r12, 24 /* t1 = v << 24 */
164 or r9, r11, r9 /* t1 = h | t1 */
165 swi r9, r5, 0 /* *(d + 0) = t1 */
166 addi r4, r4, -32 /* n = n - 32 */
167 bneid r4, d_bu3_loop /* while (n) loop */
168 BSRLI r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
/* Source 1 byte past word alignment (d_bu1_loop below). */
172 BSRLI r11, r11, 24 /* h = h >> 24 */
174 addi r8, r8, -32 /* as = as - 32 */
175 addi r5, r5, -32 /* d = d - 32 */
176 lwi r12, r8, 28 /* v = *(as + 28) */
177 BSLLI r9, r12, 8 /* t1 = v << 8 */
178 or r9, r11, r9 /* t1 = h | t1 */
179 swi r9, r5, 28 /* *(d + 28) = t1 */
180 BSRLI r11, r12, 24 /* h = v >> 24 */
181 lwi r12, r8, 24 /* v = *(as + 24) */
182 BSLLI r9, r12, 8 /* t1 = v << 8 */
183 or r9, r11, r9 /* t1 = h | t1 */
184 swi r9, r5, 24 /* *(d + 24) = t1 */
185 BSRLI r11, r12, 24 /* h = v >> 24 */
186 lwi r12, r8, 20 /* v = *(as + 20) */
187 BSLLI r9, r12, 8 /* t1 = v << 8 */
188 or r9, r11, r9 /* t1 = h | t1 */
189 swi r9, r5, 20 /* *(d + 20) = t1 */
190 BSRLI r11, r12, 24 /* h = v >> 24 */
191 lwi r12, r8, 16 /* v = *(as + 16) */
192 BSLLI r9, r12, 8 /* t1 = v << 8 */
193 or r9, r11, r9 /* t1 = h | t1 */
194 swi r9, r5, 16 /* *(d + 16) = t1 */
195 BSRLI r11, r12, 24 /* h = v >> 24 */
196 lwi r12, r8, 12 /* v = *(as + 12) */
197 BSLLI r9, r12, 8 /* t1 = v << 8 */
198 or r9, r11, r9 /* t1 = h | t1 */
199 swi r9, r5, 12 /* *(d + 12) = t1 */
200 BSRLI r11, r12, 24 /* h = v >> 24 */
201 lwi r12, r8, 8 /* v = *(as + 8) */
202 BSLLI r9, r12, 8 /* t1 = v << 8 */
203 or r9, r11, r9 /* t1 = h | t1 */
204 swi r9, r5, 8 /* *(d + 8) = t1 */
205 BSRLI r11, r12, 24 /* h = v >> 24 */
206 lwi r12, r8, 4 /* v = *(as + 4) */
207 BSLLI r9, r12, 8 /* t1 = v << 8 */
208 or r9, r11, r9 /* t1 = h | t1 */
209 swi r9, r5, 4 /* *(d + 4) = t1 */
210 BSRLI r11, r12, 24 /* h = v >> 24 */
211 lwi r12, r8, 0 /* v = *(as + 0) */
212 BSLLI r9, r12, 8 /* t1 = v << 8 */
213 or r9, r11, r9 /* t1 = h | t1 */
214 swi r9, r5, 0 /* *(d + 0) = t1 */
215 addi r4, r4, -32 /* n = n - 32 */
216 bneid r4, d_bu1_loop /* while (n) loop */
217 BSRLI r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
/* Source 2 bytes past word alignment (d_bu2_loop below). */
221 BSRLI r11, r11, 16 /* h = h >> 16 */
223 addi r8, r8, -32 /* as = as - 32 */
224 addi r5, r5, -32 /* d = d - 32 */
225 lwi r12, r8, 28 /* v = *(as + 28) */
226 BSLLI r9, r12, 16 /* t1 = v << 16 */
227 or r9, r11, r9 /* t1 = h | t1 */
228 swi r9, r5, 28 /* *(d + 28) = t1 */
229 BSRLI r11, r12, 16 /* h = v >> 16 */
230 lwi r12, r8, 24 /* v = *(as + 24) */
231 BSLLI r9, r12, 16 /* t1 = v << 16 */
232 or r9, r11, r9 /* t1 = h | t1 */
233 swi r9, r5, 24 /* *(d + 24) = t1 */
234 BSRLI r11, r12, 16 /* h = v >> 16 */
235 lwi r12, r8, 20 /* v = *(as + 20) */
236 BSLLI r9, r12, 16 /* t1 = v << 16 */
237 or r9, r11, r9 /* t1 = h | t1 */
238 swi r9, r5, 20 /* *(d + 20) = t1 */
239 BSRLI r11, r12, 16 /* h = v >> 16 */
240 lwi r12, r8, 16 /* v = *(as + 16) */
241 BSLLI r9, r12, 16 /* t1 = v << 16 */
242 or r9, r11, r9 /* t1 = h | t1 */
243 swi r9, r5, 16 /* *(d + 16) = t1 */
244 BSRLI r11, r12, 16 /* h = v >> 16 */
245 lwi r12, r8, 12 /* v = *(as + 12) */
246 BSLLI r9, r12, 16 /* t1 = v << 16 */
247 or r9, r11, r9 /* t1 = h | t1 */
248 swi r9, r5, 12 /* *(d + 12) = t1 */
249 BSRLI r11, r12, 16 /* h = v >> 16 */
250 lwi r12, r8, 8 /* v = *(as + 8) */
251 BSLLI r9, r12, 16 /* t1 = v << 16 */
252 or r9, r11, r9 /* t1 = h | t1 */
253 swi r9, r5, 8 /* *(d + 8) = t1 */
254 BSRLI r11, r12, 16 /* h = v >> 16 */
255 lwi r12, r8, 4 /* v = *(as + 4) */
256 BSLLI r9, r12, 16 /* t1 = v << 16 */
257 or r9, r11, r9 /* t1 = h | t1 */
258 swi r9, r5, 4 /* *(d + 4) = t1 */
259 BSRLI r11, r12, 16 /* h = v >> 16 */
260 lwi r12, r8, 0 /* v = *(as + 0) */
261 BSLLI r9, r12, 16 /* t1 = v << 16 */
262 or r9, r11, r9 /* t1 = h | t1 */
263 swi r9, r5, 0 /* *(d + 0) = t1 */
264 addi r4, r4, -32 /* n = n - 32 */
265 bneid r4, d_bu2_loop /* while (n) loop */
266 BSRLI r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
/* Fewer than 32 bytes remain: word-at-a-time descending transfers. */
269 addi r4, r0, 4 /* n = 4 */
270 cmpu r4, r4, r7 /* n = c - n (unsigned) */
271 blti r4,d_xfer_end /* if n < 0, less than one word to transfer */
274 andi r4, r7, 0xfffffffc /* n = c & ~3 */
275 rsub r5, r4, r5 /* d = d - n */
276 rsub r6, r4, r6 /* s = s - n */
277 rsub r7, r4, r7 /* c = c - n */
279 andi r9, r6, 3 /* t1 = s & 3 */
280 /* if temp != 0, unaligned transfers needed */
281 bnei r9, d_word_unaligned
284 addi r4, r4,-4 /* n-- */
285 lw r9, r6, r4 /* t1 = *(s+n) */
286 bneid r4, d_word_aligned /* loop */
287 sw r9, r5, r4 /* *(d+n) = t1 (IN DELAY SLOT) */
/* Unaligned word transfers: same splicing scheme as the block loops. */
292 andi r8, r6, 0xfffffffc /* as = s & ~3 */
293 lw r11, r8, r4 /* h = *(as + n) */
296 beqi r9,d_word_u1 /* t1 was 1 => 1 byte offset */
298 beqi r9,d_word_u2 /* t1 was 2 => 2 byte offset */
/* Fall-through: source 3 bytes past alignment (d_wu3_loop below). */
301 BSRLI r11, r11, 8 /* h = h >> 8 */
303 addi r4, r4,-4 /* n = n - 4 */
304 lw r12, r8, r4 /* v = *(as + n) */
305 BSLLI r9, r12, 24 /* t1 = v << 24 */
306 or r9, r11, r9 /* t1 = h | t1 */
307 sw r9, r5, r4 /* *(d + n) = t1 */
308 bneid r4, d_wu3_loop /* while (n) loop */
309 BSRLI r11, r12, 8 /* h = v >> 8 (IN DELAY SLOT) */
/* Source 1 byte past alignment (d_wu1_loop below). */
314 BSRLI r11, r11, 24 /* h = h >> 24 */
316 addi r4, r4,-4 /* n = n - 4 */
317 lw r12, r8, r4 /* v = *(as + n) */
318 BSLLI r9, r12, 8 /* t1 = v << 8 */
319 or r9, r11, r9 /* t1 = h | t1 */
320 sw r9, r5, r4 /* *(d + n) = t1 */
321 bneid r4, d_wu1_loop /* while (n) loop */
322 BSRLI r11, r12, 24 /* h = v >> 24 (IN DELAY SLOT) */
/* Source 2 bytes past alignment (d_wu2_loop below). */
327 BSRLI r11, r11, 16 /* h = h >> 16 */
329 addi r4, r4,-4 /* n = n - 4 */
330 lw r12, r8, r4 /* v = *(as + n) */
331 BSLLI r9, r12, 16 /* t1 = v << 16 */
332 or r9, r11, r9 /* t1 = h | t1 */
333 sw r9, r5, r4 /* *(d + n) = t1 */
334 bneid r4, d_wu2_loop /* while (n) loop */
335 BSRLI r11, r12, 16 /* h = v >> 16 (IN DELAY SLOT) */
/* Trailing 0~3 bytes, copied one at a time, descending. */
341 beqi r7, a_done /* while (c) */
342 addi r6, r6, -1 /* s-- */
343 lbui r9, r6, 0 /* t1 = *s */
344 addi r5, r5, -1 /* d-- */
345 sbi r9, r5, 0 /* *d = t1 */
346 brid d_xfer_end_loop /* loop */
347 addi r7, r7, -1 /* c-- (IN DELAY SLOT) */
354 .size memmove, . - memmove
356 libc_hidden_def(memmove)