2 /* A program to test SSE4.1/SSE4.2 instructions.
3 Revisions: Nov.208 - wrote this file
4 Apr.10.2010 - added PEXTR* tests
5 Apr.16.2010 - added PINS* tests
9 gcc -m64 -g -O -Wall -o sse4-64 sse4-64.c
15 //#include "tests/malloc.h" // reenable when reintegrated
20 // rmme ("remove me") when reintegrated
21 // Allocates a 16-aligned block. Asserts if the allocation fails.
// szB is the requested size; it is passed straight through to memalign()
// below.  (The declaration of x, the Darwin branch of the #if and the
// return statement are not visible in this excerpt.)
27 __attribute__((unused))
28 static void* memalign16(size_t szB)
31 #if defined(VGO_darwin)
32 // Darwin lacks memalign, but its malloc is always 16-aligned anyway.
35 x = memalign(16, szB);
// Sanity check: the low four bits of the address must be clear, i.e. the
// block really is 16-byte aligned.
38 assert(0 == ((16-1) & (unsigned long)x));
/* Basic types used throughout this test program. */
/* A 128-bit vector value, held as 16 raw bytes. */
44 typedef unsigned char V128[16];
45 typedef unsigned int UInt;
46 typedef signed int Int;
47 typedef unsigned char UChar;
/* Treated as a 64-bit quantity by this file: it is memcpy'd in 8-byte units
   and printed with "%016llx". */
48 typedef unsigned long long int ULong;
/* Boolean stored in a single byte; False and True are its two values. */
50 typedef unsigned char Bool;
51 #define False ((Bool)0)
52 #define True ((Bool)1)
/* Assemble a 128-bit value in *res from two 64-bit halves: the 8 bytes of
   wLo are copied to byte offsets 0..7 and the 8 bytes of wHi to byte
   offsets 8..15. */
70 static void do64HLtoV128 ( /*OUT*/V128* res, ULong wHi, ULong wLo )
72 // try to sidestep strict-aliasing snafus by memcpying explicitly
73 UChar* p = (UChar*)res;
74 memcpy(&p[8], (UChar*)&wHi, 8);
75 memcpy(&p[0], (UChar*)&wLo, 8);
/* Return a pseudo-random byte.  The generator state is a function-local
   static that always starts at 80021, so every run of the program sees the
   same sequence of bytes. */
78 static UChar randUChar ( void )
80 static UInt seed = 80021;
// Linear congruential step; seed is unsigned, so the arithmetic wraps.
81 seed = 1103515245 * seed + 12345;
// The result is taken from bits 17..24 of the state, not the lowest bits.
82 return (seed >> 17) & 0xFF;
/* Build a pseudo-random 64-bit value one byte at a time from randUChar().
   (The declarations of i and r and the return statement are not visible in
   this excerpt; presumably r starts at zero and is returned.) */
85 static ULong randULong ( void )
89 for (i = 0; i < 8; i++) {
// Shift the accumulated value up one byte and place a fresh byte at the bottom.
90 r = (r << 8) | (ULong)(0xFF & randUChar());
/* Overwrite all 16 bytes of *v with successive values from randUChar(),
   filling byte 0 first and byte 15 last. */
95 static void randV128 ( V128* v )
98 for (i = 0; i < 16; i++)
99 (*v)[i] = randUChar();
/* Print the 16 bytes of *v to stdout as two hex digits each, starting with
   byte 15 and ending with byte 0.  The loop shown emits no separators.
   Note: the index must be a signed type for "i >= 0" to terminate the loop;
   its declaration is not visible in this excerpt. */
102 static void showV128 ( V128* v )
105 for (i = 15; i >= 0; i--)
106 printf("%02x", (Int)(*v)[i]);
/* Like showV128, but each byte of *v is ANDed with the corresponding byte of
   *mask before being printed, so bits cleared in the mask always print as
   zero.  Bytes are printed from index 15 down to index 0. */
109 static void showMaskedV128 ( V128* v, V128* mask )
112 for (i = 15; i >= 0; i--)
113 printf("%02x", (Int)( ((*v)[i]) & ((*mask)[i]) ));
/* Print one result line for a test that takes an immediate, a 64-bit scalar
   source and a vector destination.
     rOrM  - tag string printed first ("r" or "m" at the call sites in this file)
     op    - instruction mnemonic, right-aligned in a 10-character field
     imm   - the immediate operand, printed in decimal after a '$'
     src64 - the scalar source, printed as 16 hex digits
     dst, res - vector before/after values; the code that prints them is not
                visible in this excerpt. */
116 static void showIGVV( char* rOrM, char* op, Int imm,
117 ULong src64, V128* dst, V128* res )
119 printf("%s %10s $%d ", rOrM, op, imm);
120 printf("%016llx", src64);
/* Print one result line for a test that takes an immediate and a vector
   source and produces a 64-bit scalar.
     rOrM  - tag string printed first ("r" or "m" at the call sites in this file)
     op    - instruction mnemonic, right-aligned in a 10-character field
     imm   - the immediate operand, printed in decimal after a '$'
     argL  - vector source; the code that prints it is not visible in this excerpt
     argR  - scalar value printed as 16 hex digits
     res   - scalar value printed as 16 hex digits, after argR */
128 static void showIAG ( char* rOrM, char* op, Int imm,
129 V128* argL, ULong argR, ULong res )
131 printf("%s %10s $%d ", rOrM, op, imm);
134 printf("%016llx", argR);
136 printf("%016llx", res);
/* Print one result line for a vector op with an immediate: the tag, the
   mnemonic and the immediate, then rra->arg1 and rra->arg2 in full, then
   rra->res filtered through *rmask so that masked-off bits print as zero.
   (Any separators or newline printed between the fields are not visible in
   this excerpt.) */
140 static void showIAA ( char* rOrM, char* op, Int imm, RRArgs* rra, V128* rmask )
142 printf("%s %10s $%d ", rOrM, op, imm);
143 showV128(&rra->arg1);
145 showV128(&rra->arg2);
147 showMaskedV128(&rra->res, rmask);
/* Print one result line for a vector op without an immediate: the tag and the
   mnemonic, then rra->arg1 and rra->arg2 in full, then rra->res filtered
   through *rmask so that masked-off bits print as zero.
   (Any separators or newline printed between the fields are not visible in
   this excerpt.) */
151 static void showAA ( char* rOrM, char* op, RRArgs* rra, V128* rmask )
153 printf("%s %10s ", rOrM, op);
154 showV128(&rra->arg1);
156 showV128(&rra->arg2);
158 showMaskedV128(&rra->res, rmask);
162 /* Note: these are little endian. Hence first byte is the least
163 significant byte of lane zero. */
165 /* Mask for insns where all result bits are non-approximated. */
166 static V128 AllMask = { 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF,
167 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
169 /* Mask for insns which produce approximated vector short results. */
/* Read little-endian, every 32-bit lane of this mask is 0xFF800000: the top
   nine bits of each lane are kept and the low 23 bits are masked off (for an
   IEEE-754 single that is sign + exponent kept, fraction dropped). */
170 __attribute__((unused))
171 static V128 ApproxPS = { 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF,
172 0x00,0x00,0x80,0xFF, 0x00,0x00,0x80,0xFF };
174 /* Mask for insns which produce approximated scalar short results. */
/* Lane zero is masked as in ApproxPS (0xFF800000); the remaining twelve
   bytes are kept in full. */
175 __attribute__((unused))
176 static V128 ApproxSS = { 0x00,0x00,0x80,0xFF, 0xFF,0xFF,0xFF,0xFF,
177 0xFF,0xFF,0xFF,0xFF, 0xFF,0xFF,0xFF,0xFF };
/* Fill pattern with every byte 0x55; copied into the destination vector by
   the scalar-to-vector test macros in this file. */
179 static V128 fives = { 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55,
180 0x55,0x55,0x55,0x55, 0x55,0x55,0x55,0x55 };
/* All-zero vector; copied into the result vector by the scalar-to-vector
   test macros before the instruction under test runs. */
182 static V128 zeroes = { 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,
183 0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00 };
/* Helpers producing special double values: infinities via 1.0/0.0 and NaNs
   via 0.0/0.0, with the "Neg" variants negating the "Pos" ones.
   NOTE(review): the C standard does not specify the sign bit of the NaN that
   0.0/0.0 yields, so "Pos"/"Neg" in the NaN helper names should be treated as
   nominal -- confirm on the target compiler if the sign matters. */
185 double mkPosInf ( void ) { return 1.0 / 0.0; }
186 double mkNegInf ( void ) { return -mkPosInf(); }
187 double mkPosNan ( void ) { return 0.0 / 0.0; }
188 double mkNegNan ( void ) { return -mkPosNan(); }
/* Read the SSE control/status register (MXCSR).
   The asm lowers %rsp by 8 to make a scratch slot, has STMXCSR store the
   register into it, and then loads 8 bytes from that slot into w64.
   NOTE(review): STMXCSR writes only 32 bits, so the upper half of w64 is
   whatever was already in the stack slot; the debug print below truncates
   with (UInt), and the return presumably does the same.  The declaration of
   w64, the instruction that restores %rsp and the return statement are not
   visible in this excerpt. */
190 __attribute__((noinline))
191 UInt get_mxcsr ( void )
194 __asm__ __volatile__(
195 "subq $8, %%rsp" "\n\t"
196 "stmxcsr (%%rsp)" "\n\t"
197 "movq (%%rsp), %0" "\n"
199 : /*OUT*/"=r"(w64) : /*IN*/ : "memory","cc"
// Debug trace; the if (0) means it is never executed.
201 if (0) printf("get %08x\n", (UInt)w64);
/* Load the SSE control/status register (MXCSR) with w32.
   The value is widened to 64 bits, written to a scratch slot made by
   lowering %rsp by 8, and LDMXCSR then loads the register from that slot.
   The "mxcsr" clobber is commented out in the clobber list below.  (The
   instruction that restores %rsp is not visible in this excerpt.) */
205 __attribute__((noinline))
206 void set_mxcsr ( UInt w32 )
// Debug trace; the if (0) means it is never executed.
208 if (0) printf("set %08x\n", w32);
209 ULong w64 = (ULong)w32;
210 __asm__ __volatile__(
211 "subq $8, %%rsp" "\n\t"
212 "movq %0, (%%rsp)" "\n\t"
213 "ldmxcsr (%%rsp)" "\n\t"
215 : /*OUT*/ : /*IN*/"r"(w64) : "memory",/*"mxcsr",*/"cc"
/* Return the current SSE rounding mode as a value in 0..3, taken from
   bits 13 and 14 of MXCSR (the rounding-control field). */
219 UInt get_sse_roundingmode ( void )
221 UInt w = get_mxcsr();
222 return (w >> 13) & 3;
/* Set the SSE rounding mode to m.  Only the values 0..3 are accepted; any
   other value trips the assert.  (The code that actually updates MXCSR is
   not visible in this excerpt.) */
225 void set_sse_roundingmode ( UInt m )
228 assert(0 == (m & ~3));
/* DO_imm_r_r: run "<_opname> $<_imm>, %xmm2, %xmm11" (register source) and
   print the result.
     _opname - string literal; it is pasted into the asm template by
               adjacent-literal concatenation and also passed to showIAA
     _imm    - integer literal; it is stringized with # into the asm text and
               also passed to showIAA as a number
     _src    - lvalue loaded into %xmm2 (source operand)
     _dst    - lvalue loaded into %xmm11 (destination operand); not modified,
               because the result is stored to _tmp instead
   The operands and result are copied into rra and printed with showIAA using
   the tag "r" and AllMask.  (The declarations of _tmp and rra are not
   visible in this excerpt.) */
236 #define DO_imm_r_r(_opname, _imm, _src, _dst) \
239 __asm__ __volatile__( \
240 "movupd (%0), %%xmm2" "\n\t" \
241 "movupd (%1), %%xmm11" "\n\t" \
242 _opname " $" #_imm ", %%xmm2, %%xmm11" "\n\t" \
243 "movupd %%xmm11, (%2)" "\n" \
244 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \
245 : "cc", "memory", "xmm2", "xmm11" \
248 memcpy(&rra.arg1, &(_src), sizeof(V128)); \
249 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
250 memcpy(&rra.res, &(_tmp), sizeof(V128)); \
251 showIAA("r", (_opname), (_imm), &rra, &AllMask); \
/* DO_imm_m_r: memory-source counterpart of DO_imm_r_r.  Runs
   "<_opname> $<_imm>, (mem), %xmm11" and prints the result.
   _src is first copied into a fresh 16-aligned block obtained from
   memalign16(), and that block is used as the instruction's memory operand
   (presumably because the tested instructions need an aligned memory
   operand).  _dst is loaded into %xmm11 and the result is stored to _tmp, so
   _dst itself is not modified.  The operands and result are printed with
   showIAA using the tag "m" and AllMask.
   _opname must be a string literal and _imm an integer literal, as both are
   pasted into the asm text.  (The declarations of _tmp and rra, and any
   release of _srcM, are not visible in this excerpt.) */
254 #define DO_imm_m_r(_opname, _imm, _src, _dst) \
257 V128* _srcM = memalign16(sizeof(V128)); \
258 memcpy(_srcM, &(_src), sizeof(V128)); \
259 __asm__ __volatile__( \
260 "movupd (%1), %%xmm11" "\n\t" \
261 _opname " $" #_imm ", (%0), %%xmm11" "\n\t" \
262 "movupd %%xmm11, (%2)" "\n" \
263 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \
264 : "cc", "memory", "xmm11" \
267 memcpy(&rra.arg1, &(_src), sizeof(V128)); \
268 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
269 memcpy(&rra.res, &(_tmp), sizeof(V128)); \
270 showIAA("m", (_opname), (_imm), &rra, &AllMask); \
/* DO_imm_mandr_r: run the same instruction/immediate in both forms, first
   with a register source (DO_imm_r_r) and then with a memory source
   (DO_imm_m_r).  The expansion is the two invocations back to back and is
   not wrapped in do { } while (0), so use it only as a full statement inside
   a braced block. */
274 #define DO_imm_mandr_r(_opname, _imm, _src, _dst) \
275 DO_imm_r_r( _opname, _imm, _src, _dst ) \
276 DO_imm_m_r( _opname, _imm, _src, _dst )
/* DO_r_r: run "<_opname> %xmm2, %xmm11" (register source, no immediate) and
   print the result.
     _opname - string literal; pasted into the asm template and passed to showAA
     _src    - lvalue loaded into %xmm2 (source operand)
     _dst    - lvalue loaded into %xmm11 (destination operand); not modified,
               because the result is stored to _tmp instead
   The operands and result are copied into rra and printed with showAA using
   the tag "r" and AllMask.  (The declarations of _tmp and rra are not
   visible in this excerpt.) */
282 #define DO_r_r(_opname, _src, _dst) \
285 __asm__ __volatile__( \
286 "movupd (%0), %%xmm2" "\n\t" \
287 "movupd (%1), %%xmm11" "\n\t" \
288 _opname " %%xmm2, %%xmm11" "\n\t" \
289 "movupd %%xmm11, (%2)" "\n" \
290 : /*out*/ : /*in*/ "r"(&(_src)), "r"(&(_dst)), "r"(&(_tmp)) \
291 : "cc", "memory", "xmm2", "xmm11" \
294 memcpy(&rra.arg1, &(_src), sizeof(V128)); \
295 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
296 memcpy(&rra.res, &(_tmp), sizeof(V128)); \
297 showAA("r", (_opname), &rra, &AllMask); \
/* DO_m_r: memory-source counterpart of DO_r_r.  Runs
   "<_opname> (mem), %xmm11" and prints the result.
   _src is first copied into a fresh 16-aligned block obtained from
   memalign16(), and that block is used as the instruction's memory operand.
   _dst is loaded into %xmm11 and the result is stored to _tmp, so _dst itself
   is not modified.  The operands and result are printed with showAA using
   the tag "m" and AllMask.  _opname must be a string literal.
   (The declarations of _tmp and rra, and any release of _srcM, are not
   visible in this excerpt.) */
300 #define DO_m_r(_opname, _src, _dst) \
303 V128* _srcM = memalign16(sizeof(V128)); \
304 memcpy(_srcM, &(_src), sizeof(V128)); \
305 __asm__ __volatile__( \
306 "movupd (%1), %%xmm11" "\n\t" \
307 _opname " (%0), %%xmm11" "\n\t" \
308 "movupd %%xmm11, (%2)" "\n" \
309 : /*out*/ : /*in*/ "r"(_srcM), "r"(&(_dst)), "r"(&(_tmp)) \
310 : "cc", "memory", "xmm11" \
313 memcpy(&rra.arg1, &(_src), sizeof(V128)); \
314 memcpy(&rra.arg2, &(_dst), sizeof(V128)); \
315 memcpy(&rra.res, &(_tmp), sizeof(V128)); \
316 showAA("m", (_opname), &rra, &AllMask); \
/* DO_mandr_r: run the same instruction in both forms, first with a register
   source (DO_r_r) and then with a memory source (DO_m_r).  The expansion is
   the two invocations back to back and is not wrapped in do { } while (0),
   so use it only as a full statement inside a braced block. */
320 #define DO_mandr_r(_opname, _src, _dst) \
321 DO_r_r(_opname, _src, _dst) \
322 DO_m_r(_opname, _src, _dst)
/* DO_imm_r_to_rscalar: test an instruction that extracts from a vector
   register into a general-purpose register.
   Runs "<_opname> $<_imm>, %xmm2, %r11<_dstsuffix>" and prints the result.
     _opname    - string literal; pasted into the asm template
     _imm       - integer literal; stringized into the asm text
     _src       - vector lvalue loaded into %xmm2
     _dstsuffix - string literal appended directly after "%r11" in the asm
                  text (presumably a register-width suffix such as "d")
   %r11 is preloaded with _scbefore (0x5555...), so any bits of the register
   that the instruction leaves untouched show up as that pattern; the whole
   register is then stored over _scafter.  The result is printed with showIAG
   using the tag "r". */
327 #define DO_imm_r_to_rscalar(_opname, _imm, _src, _dstsuffix) \
329 ULong _scbefore = 0x5555555555555555ULL; \
330 ULong _scafter = 0xAAAAAAAAAAAAAAAAULL; \
331 /* This assumes that gcc won't make any of %0, %1, %2 */ \
332 /* be r11. That should be ensured (cough, cough) */ \
333 /* by declaring r11 to be clobbered. */ \
334 __asm__ __volatile__( \
335 "movupd (%0), %%xmm2" "\n\t" \
336 "movq (%1), %%r11" "\n\t" \
337 _opname " $" #_imm ", %%xmm2, %%r11" _dstsuffix "\n\t" \
338 "movq %%r11, (%2)" "\n" \
340 : /*in*/ "r"(&(_src)), "r"(&(_scbefore)), "r"(&(_scafter)) \
341 : "cc", "memory", "xmm2", "r11" \
343 showIAG("r", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \
/* DO_imm_r_to_mscalar: memory-destination counterpart of
   DO_imm_r_to_rscalar.  Runs "<_opname> $<_imm>, %xmm2, (mem)" where the
   memory operand is _scafter, then prints the result with showIAG using the
   tag "m".
   _scafter starts out equal to _scbefore (0x5555...), so any bytes the
   instruction does not write still hold that pattern in the printed result.
   _opname must be a string literal and _imm an integer literal, as both are
   pasted into the asm text. */
346 #define DO_imm_r_to_mscalar(_opname, _imm, _src) \
348 ULong _scbefore = 0x5555555555555555ULL; \
349 ULong _scafter = _scbefore; \
350 __asm__ __volatile__( \
351 "movupd (%0), %%xmm2" "\n\t" \
352 _opname " $" #_imm ", %%xmm2, (%1)" "\n\t" \
354 : /*in*/ "r"(&(_src)), "r"(&(_scafter)) \
355 : "cc", "memory", "xmm2" \
357 showIAG("m", (_opname), (_imm), &(_src), (_scbefore), (_scafter)); \
/* DO_imm_r_to_mandrscalar: run the same vector-to-scalar instruction with a
   register destination (DO_imm_r_to_rscalar) and then with a memory
   destination (DO_imm_r_to_mscalar).  _dstsuffix is only used by the
   register form.  The expansion is the two invocations back to back and is
   not wrapped in do { } while (0), so use it only as a full statement inside
   a braced block. */
360 #define DO_imm_r_to_mandrscalar(_opname, _imm, _src, _dstsuffix) \
361 DO_imm_r_to_rscalar( _opname, _imm, _src, _dstsuffix ) \
362 DO_imm_r_to_mscalar( _opname, _imm, _src )
/* DO_imm_rscalar_to_r: test an instruction that inserts from a
   general-purpose register into a vector register.
   Runs "<_opname> $<_imm>, %r11<_srcsuffix>, %xmm2" and prints the result.
     _opname    - string literal; pasted into the asm template
     _imm       - integer literal; stringized into the asm text
     _src       - scalar value, converted to ULong and loaded into %r11
     _srcsuffix - string literal appended directly after "%r11" in the asm
                  text (presumably a register-width suffix such as "d")
   The destination vector dstv is preset to the 0x55 fill pattern and res to
   all zeroes; %xmm2 is loaded from dstv and stored to res after the
   instruction, so dstv keeps the "before" value.  The result is printed with
   showIGVV using the tag "r".  (The declarations of dstv and res are not
   visible in this excerpt.) */
371 #define DO_imm_rscalar_to_r(_opname, _imm, _src, _srcsuffix) \
375 ULong src64 = (ULong)(_src); \
376 memcpy(dstv, fives, sizeof(dstv)); \
377 memcpy(res, zeroes, sizeof(res)); \
378 /* This assumes that gcc won't make any of %0, %1, %2 */ \
379 /* be r11. That should be ensured (cough, cough) */ \
380 /* by declaring r11 to be clobbered. */ \
381 __asm__ __volatile__( \
382 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \
383 "movq (%1), %%r11" "\n\t" /*src64*/ \
384 _opname " $" #_imm ", %%r11" _srcsuffix ", %%xmm2" "\n\t" \
385 "movupd %%xmm2, (%2)" "\n" /*res*/ \
387 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \
388 : "cc", "memory", "xmm2", "r11" \
390 showIGVV("r", (_opname), (_imm), src64, &dstv, &res); \
/* DO_imm_mscalar_to_r: memory-source counterpart of DO_imm_rscalar_to_r.
   Runs "<_opname> $<_imm>, (mem), %xmm2" where the memory operand is the
   local src64 (a ULong copy of _src), then prints the result with showIGVV
   using the tag "m".
   The destination vector dstv is preset to the 0x55 fill pattern and res to
   all zeroes; %xmm2 is loaded from dstv and stored to res after the
   instruction.  _opname must be a string literal and _imm an integer
   literal, as both are pasted into the asm text.  (The declarations of dstv
   and res are not visible in this excerpt.) */
392 #define DO_imm_mscalar_to_r(_opname, _imm, _src) \
396 ULong src64 = (ULong)(_src); \
397 memcpy(dstv, fives, sizeof(dstv)); \
398 memcpy(res, zeroes, sizeof(res)); \
399 __asm__ __volatile__( \
400 "movupd (%0), %%xmm2" "\n\t" /*dstv*/ \
401 _opname " $" #_imm ", (%1), %%xmm2" "\n\t" \
402 "movupd %%xmm2, (%2)" "\n" /*res*/ \
404 : /*in*/ "r"(&dstv), "r"(&src64), "r"(&res) \
405 : "cc", "memory", "xmm2" \
407 showIGVV("m", (_opname), (_imm), src64, &dstv, &res); \
/* DO_imm_mandrscalar_to_r: run the same scalar-to-vector instruction with a
   register source (DO_imm_rscalar_to_r) and then with a memory source
   (DO_imm_mscalar_to_r).
   Note: the fourth parameter is named _dstsuffix here, but it is forwarded
   as the _srcsuffix argument of DO_imm_rscalar_to_r, i.e. it is the suffix
   applied to the scalar *source* register.
   The expansion is the two invocations back to back and is not wrapped in
   do { } while (0), so use it only as a full statement inside a braced
   block. */
410 #define DO_imm_mandrscalar_to_r(_opname, _imm, _src, _dstsuffix) \
411 DO_imm_rscalar_to_r( _opname, _imm, _src, _dstsuffix ) \
412 DO_imm_mscalar_to_r( _opname, _imm, _src )
/* Exercise BLENDPD: ten rounds, each running the instruction with every
   immediate from 0 to 3 in both register-source and memory-source form and
   printing the results.  (How src and dst are declared and filled for each
   round is not visible in this excerpt.) */
418 void test_BLENDPD ( void )
422 for (i = 0; i < 10; i++) {
425 DO_imm_mandr_r("blendpd", 0, src, dst);
426 DO_imm_mandr_r("blendpd", 1, src, dst);
427 DO_imm_mandr_r("blendpd", 2, src, dst);
428 DO_imm_mandr_r("blendpd", 3, src, dst);
/* Exercise BLENDPS: ten rounds, each running the instruction with every
   immediate from 0 to 15 in both register-source and memory-source form and
   printing the results.  (How src and dst are declared and filled for each
   round is not visible in this excerpt.) */
432 void test_BLENDPS ( void )
436 for (i = 0; i < 10; i++) {
439 DO_imm_mandr_r("blendps", 0, src, dst);
440 DO_imm_mandr_r("blendps", 1, src, dst);
441 DO_imm_mandr_r("blendps", 2, src, dst);
442 DO_imm_mandr_r("blendps", 3, src, dst);
443 DO_imm_mandr_r("blendps", 4, src, dst);
444 DO_imm_mandr_r("blendps", 5, src, dst);
445 DO_imm_mandr_r("blendps", 6, src, dst);
446 DO_imm_mandr_r("blendps", 7, src, dst);
447 DO_imm_mandr_r("blendps", 8, src, dst);
448 DO_imm_mandr_r("blendps", 9, src, dst);
449 DO_imm_mandr_r("blendps", 10, src, dst);
450 DO_imm_mandr_r("blendps", 11, src, dst);
451 DO_imm_mandr_r("blendps", 12, src, dst);
452 DO_imm_mandr_r("blendps", 13, src, dst);
453 DO_imm_mandr_r("blendps", 14, src, dst);
454 DO_imm_mandr_r("blendps", 15, src, dst);
/* Exercise DPPD with fixed operands and every 8-bit immediate (0..255), in
   both register-source and memory-source form, printing each result.
   The immediate has to be a literal in each call because the test macro
   stringizes it into the instruction text, hence the unrolled list.
   (The declarations of src and dst are not visible in this excerpt.) */
458 void test_DPPD ( void )
// Operands: two doubles in src and two in dst, at byte offsets 0 and 8.
// NOTE(review): if src/dst are V128 byte arrays, these stores through
// double* are type punning; do64HLtoV128 in this file uses memcpy to avoid
// exactly that -- confirm whether the same is wanted here.
462 *(double*)(&src[0]) = 1.2345;
463 *(double*)(&src[8]) = -6.78910;
464 *(double*)(&dst[0]) = -11.121314;
465 *(double*)(&dst[8]) = 15.161718;
// Sweep of all 256 immediate values.
466 DO_imm_mandr_r("dppd", 0, src, dst);
467 DO_imm_mandr_r("dppd", 1, src, dst);
468 DO_imm_mandr_r("dppd", 2, src, dst);
469 DO_imm_mandr_r("dppd", 3, src, dst);
470 DO_imm_mandr_r("dppd", 4, src, dst);
471 DO_imm_mandr_r("dppd", 5, src, dst);
472 DO_imm_mandr_r("dppd", 6, src, dst);
473 DO_imm_mandr_r("dppd", 7, src, dst);
474 DO_imm_mandr_r("dppd", 8, src, dst);
475 DO_imm_mandr_r("dppd", 9, src, dst);
476 DO_imm_mandr_r("dppd", 10, src, dst);
477 DO_imm_mandr_r("dppd", 11, src, dst);
478 DO_imm_mandr_r("dppd", 12, src, dst);
479 DO_imm_mandr_r("dppd", 13, src, dst);
480 DO_imm_mandr_r("dppd", 14, src, dst);
481 DO_imm_mandr_r("dppd", 15, src, dst);
482 DO_imm_mandr_r("dppd", 16, src, dst);
483 DO_imm_mandr_r("dppd", 17, src, dst);
484 DO_imm_mandr_r("dppd", 18, src, dst);
485 DO_imm_mandr_r("dppd", 19, src, dst);
486 DO_imm_mandr_r("dppd", 20, src, dst);
487 DO_imm_mandr_r("dppd", 21, src, dst);
488 DO_imm_mandr_r("dppd", 22, src, dst);
489 DO_imm_mandr_r("dppd", 23, src, dst);
490 DO_imm_mandr_r("dppd", 24, src, dst);
491 DO_imm_mandr_r("dppd", 25, src, dst);
492 DO_imm_mandr_r("dppd", 26, src, dst);
493 DO_imm_mandr_r("dppd", 27, src, dst);
494 DO_imm_mandr_r("dppd", 28, src, dst);
495 DO_imm_mandr_r("dppd", 29, src, dst);
496 DO_imm_mandr_r("dppd", 30, src, dst);
497 DO_imm_mandr_r("dppd", 31, src, dst);
498 DO_imm_mandr_r("dppd", 32, src, dst);
499 DO_imm_mandr_r("dppd", 33, src, dst);
500 DO_imm_mandr_r("dppd", 34, src, dst);
501 DO_imm_mandr_r("dppd", 35, src, dst);
502 DO_imm_mandr_r("dppd", 36, src, dst);
503 DO_imm_mandr_r("dppd", 37, src, dst);
504 DO_imm_mandr_r("dppd", 38, src, dst);
505 DO_imm_mandr_r("dppd", 39, src, dst);
506 DO_imm_mandr_r("dppd", 40, src, dst);
507 DO_imm_mandr_r("dppd", 41, src, dst);
508 DO_imm_mandr_r("dppd", 42, src, dst);
509 DO_imm_mandr_r("dppd", 43, src, dst);
510 DO_imm_mandr_r("dppd", 44, src, dst);
511 DO_imm_mandr_r("dppd", 45, src, dst);
512 DO_imm_mandr_r("dppd", 46, src, dst);
513 DO_imm_mandr_r("dppd", 47, src, dst);
514 DO_imm_mandr_r("dppd", 48, src, dst);
515 DO_imm_mandr_r("dppd", 49, src, dst);
516 DO_imm_mandr_r("dppd", 50, src, dst);
517 DO_imm_mandr_r("dppd", 51, src, dst);
518 DO_imm_mandr_r("dppd", 52, src, dst);
519 DO_imm_mandr_r("dppd", 53, src, dst);
520 DO_imm_mandr_r("dppd", 54, src, dst);
521 DO_imm_mandr_r("dppd", 55, src, dst);
522 DO_imm_mandr_r("dppd", 56, src, dst);
523 DO_imm_mandr_r("dppd", 57, src, dst);
524 DO_imm_mandr_r("dppd", 58, src, dst);
525 DO_imm_mandr_r("dppd", 59, src, dst);
526 DO_imm_mandr_r("dppd", 60, src, dst);
527 DO_imm_mandr_r("dppd", 61, src, dst);
528 DO_imm_mandr_r("dppd", 62, src, dst);
529 DO_imm_mandr_r("dppd", 63, src, dst);
530 DO_imm_mandr_r("dppd", 64, src, dst);
531 DO_imm_mandr_r("dppd", 65, src, dst);
532 DO_imm_mandr_r("dppd", 66, src, dst);
533 DO_imm_mandr_r("dppd", 67, src, dst);
534 DO_imm_mandr_r("dppd", 68, src, dst);
535 DO_imm_mandr_r("dppd", 69, src, dst);
536 DO_imm_mandr_r("dppd", 70, src, dst);
537 DO_imm_mandr_r("dppd", 71, src, dst);
538 DO_imm_mandr_r("dppd", 72, src, dst);
539 DO_imm_mandr_r("dppd", 73, src, dst);
540 DO_imm_mandr_r("dppd", 74, src, dst);
541 DO_imm_mandr_r("dppd", 75, src, dst);
542 DO_imm_mandr_r("dppd", 76, src, dst);
543 DO_imm_mandr_r("dppd", 77, src, dst);
544 DO_imm_mandr_r("dppd", 78, src, dst);
545 DO_imm_mandr_r("dppd", 79, src, dst);
546 DO_imm_mandr_r("dppd", 80, src, dst);
547 DO_imm_mandr_r("dppd", 81, src, dst);
548 DO_imm_mandr_r("dppd", 82, src, dst);
549 DO_imm_mandr_r("dppd", 83, src, dst);
550 DO_imm_mandr_r("dppd", 84, src, dst);
551 DO_imm_mandr_r("dppd", 85, src, dst);
552 DO_imm_mandr_r("dppd", 86, src, dst);
553 DO_imm_mandr_r("dppd", 87, src, dst);
554 DO_imm_mandr_r("dppd", 88, src, dst);
555 DO_imm_mandr_r("dppd", 89, src, dst);
556 DO_imm_mandr_r("dppd", 90, src, dst);
557 DO_imm_mandr_r("dppd", 91, src, dst);
558 DO_imm_mandr_r("dppd", 92, src, dst);
559 DO_imm_mandr_r("dppd", 93, src, dst);
560 DO_imm_mandr_r("dppd", 94, src, dst);
561 DO_imm_mandr_r("dppd", 95, src, dst);
562 DO_imm_mandr_r("dppd", 96, src, dst);
563 DO_imm_mandr_r("dppd", 97, src, dst);
564 DO_imm_mandr_r("dppd", 98, src, dst);
565 DO_imm_mandr_r("dppd", 99, src, dst);
566 DO_imm_mandr_r("dppd", 100, src, dst);
567 DO_imm_mandr_r("dppd", 101, src, dst);
568 DO_imm_mandr_r("dppd", 102, src, dst);
569 DO_imm_mandr_r("dppd", 103, src, dst);
570 DO_imm_mandr_r("dppd", 104, src, dst);
571 DO_imm_mandr_r("dppd", 105, src, dst);
572 DO_imm_mandr_r("dppd", 106, src, dst);
573 DO_imm_mandr_r("dppd", 107, src, dst);
574 DO_imm_mandr_r("dppd", 108, src, dst);
575 DO_imm_mandr_r("dppd", 109, src, dst);
576 DO_imm_mandr_r("dppd", 110, src, dst);
577 DO_imm_mandr_r("dppd", 111, src, dst);
578 DO_imm_mandr_r("dppd", 112, src, dst);
579 DO_imm_mandr_r("dppd", 113, src, dst);
580 DO_imm_mandr_r("dppd", 114, src, dst);
581 DO_imm_mandr_r("dppd", 115, src, dst);
582 DO_imm_mandr_r("dppd", 116, src, dst);
583 DO_imm_mandr_r("dppd", 117, src, dst);
584 DO_imm_mandr_r("dppd", 118, src, dst);
585 DO_imm_mandr_r("dppd", 119, src, dst);
586 DO_imm_mandr_r("dppd", 120, src, dst);
587 DO_imm_mandr_r("dppd", 121, src, dst);
588 DO_imm_mandr_r("dppd", 122, src, dst);
589 DO_imm_mandr_r("dppd", 123, src, dst);
590 DO_imm_mandr_r("dppd", 124, src, dst);
591 DO_imm_mandr_r("dppd", 125, src, dst);
592 DO_imm_mandr_r("dppd", 126, src, dst);
593 DO_imm_mandr_r("dppd", 127, src, dst);
594 DO_imm_mandr_r("dppd", 128, src, dst);
595 DO_imm_mandr_r("dppd", 129, src, dst);
596 DO_imm_mandr_r("dppd", 130, src, dst);
597 DO_imm_mandr_r("dppd", 131, src, dst);
598 DO_imm_mandr_r("dppd", 132, src, dst);
599 DO_imm_mandr_r("dppd", 133, src, dst);
600 DO_imm_mandr_r("dppd", 134, src, dst);
601 DO_imm_mandr_r("dppd", 135, src, dst);
602 DO_imm_mandr_r("dppd", 136, src, dst);
603 DO_imm_mandr_r("dppd", 137, src, dst);
604 DO_imm_mandr_r("dppd", 138, src, dst);
605 DO_imm_mandr_r("dppd", 139, src, dst);
606 DO_imm_mandr_r("dppd", 140, src, dst);
607 DO_imm_mandr_r("dppd", 141, src, dst);
608 DO_imm_mandr_r("dppd", 142, src, dst);
609 DO_imm_mandr_r("dppd", 143, src, dst);
610 DO_imm_mandr_r("dppd", 144, src, dst);
611 DO_imm_mandr_r("dppd", 145, src, dst);
612 DO_imm_mandr_r("dppd", 146, src, dst);
613 DO_imm_mandr_r("dppd", 147, src, dst);
614 DO_imm_mandr_r("dppd", 148, src, dst);
615 DO_imm_mandr_r("dppd", 149, src, dst);
616 DO_imm_mandr_r("dppd", 150, src, dst);
617 DO_imm_mandr_r("dppd", 151, src, dst);
618 DO_imm_mandr_r("dppd", 152, src, dst);
619 DO_imm_mandr_r("dppd", 153, src, dst);
620 DO_imm_mandr_r("dppd", 154, src, dst);
621 DO_imm_mandr_r("dppd", 155, src, dst);
622 DO_imm_mandr_r("dppd", 156, src, dst);
623 DO_imm_mandr_r("dppd", 157, src, dst);
624 DO_imm_mandr_r("dppd", 158, src, dst);
625 DO_imm_mandr_r("dppd", 159, src, dst);
626 DO_imm_mandr_r("dppd", 160, src, dst);
627 DO_imm_mandr_r("dppd", 161, src, dst);
628 DO_imm_mandr_r("dppd", 162, src, dst);
629 DO_imm_mandr_r("dppd", 163, src, dst);
630 DO_imm_mandr_r("dppd", 164, src, dst);
631 DO_imm_mandr_r("dppd", 165, src, dst);
632 DO_imm_mandr_r("dppd", 166, src, dst);
633 DO_imm_mandr_r("dppd", 167, src, dst);
634 DO_imm_mandr_r("dppd", 168, src, dst);
635 DO_imm_mandr_r("dppd", 169, src, dst);
636 DO_imm_mandr_r("dppd", 170, src, dst);
637 DO_imm_mandr_r("dppd", 171, src, dst);
638 DO_imm_mandr_r("dppd", 172, src, dst);
639 DO_imm_mandr_r("dppd", 173, src, dst);
640 DO_imm_mandr_r("dppd", 174, src, dst);
641 DO_imm_mandr_r("dppd", 175, src, dst);
642 DO_imm_mandr_r("dppd", 176, src, dst);
643 DO_imm_mandr_r("dppd", 177, src, dst);
644 DO_imm_mandr_r("dppd", 178, src, dst);
645 DO_imm_mandr_r("dppd", 179, src, dst);
646 DO_imm_mandr_r("dppd", 180, src, dst);
647 DO_imm_mandr_r("dppd", 181, src, dst);
648 DO_imm_mandr_r("dppd", 182, src, dst);
649 DO_imm_mandr_r("dppd", 183, src, dst);
650 DO_imm_mandr_r("dppd", 184, src, dst);
651 DO_imm_mandr_r("dppd", 185, src, dst);
652 DO_imm_mandr_r("dppd", 186, src, dst);
653 DO_imm_mandr_r("dppd", 187, src, dst);
654 DO_imm_mandr_r("dppd", 188, src, dst);
655 DO_imm_mandr_r("dppd", 189, src, dst);
656 DO_imm_mandr_r("dppd", 190, src, dst);
657 DO_imm_mandr_r("dppd", 191, src, dst);
658 DO_imm_mandr_r("dppd", 192, src, dst);
659 DO_imm_mandr_r("dppd", 193, src, dst);
660 DO_imm_mandr_r("dppd", 194, src, dst);
661 DO_imm_mandr_r("dppd", 195, src, dst);
662 DO_imm_mandr_r("dppd", 196, src, dst);
663 DO_imm_mandr_r("dppd", 197, src, dst);
664 DO_imm_mandr_r("dppd", 198, src, dst);
665 DO_imm_mandr_r("dppd", 199, src, dst);
666 DO_imm_mandr_r("dppd", 200, src, dst);
667 DO_imm_mandr_r("dppd", 201, src, dst);
668 DO_imm_mandr_r("dppd", 202, src, dst);
669 DO_imm_mandr_r("dppd", 203, src, dst);
670 DO_imm_mandr_r("dppd", 204, src, dst);
671 DO_imm_mandr_r("dppd", 205, src, dst);
672 DO_imm_mandr_r("dppd", 206, src, dst);
673 DO_imm_mandr_r("dppd", 207, src, dst);
674 DO_imm_mandr_r("dppd", 208, src, dst);
675 DO_imm_mandr_r("dppd", 209, src, dst);
676 DO_imm_mandr_r("dppd", 210, src, dst);
677 DO_imm_mandr_r("dppd", 211, src, dst);
678 DO_imm_mandr_r("dppd", 212, src, dst);
679 DO_imm_mandr_r("dppd", 213, src, dst);
680 DO_imm_mandr_r("dppd", 214, src, dst);
681 DO_imm_mandr_r("dppd", 215, src, dst);
682 DO_imm_mandr_r("dppd", 216, src, dst);
683 DO_imm_mandr_r("dppd", 217, src, dst);
684 DO_imm_mandr_r("dppd", 218, src, dst);
685 DO_imm_mandr_r("dppd", 219, src, dst);
686 DO_imm_mandr_r("dppd", 220, src, dst);
687 DO_imm_mandr_r("dppd", 221, src, dst);
688 DO_imm_mandr_r("dppd", 222, src, dst);
689 DO_imm_mandr_r("dppd", 223, src, dst);
690 DO_imm_mandr_r("dppd", 224, src, dst);
691 DO_imm_mandr_r("dppd", 225, src, dst);
692 DO_imm_mandr_r("dppd", 226, src, dst);
693 DO_imm_mandr_r("dppd", 227, src, dst);
694 DO_imm_mandr_r("dppd", 228, src, dst);
695 DO_imm_mandr_r("dppd", 229, src, dst);
696 DO_imm_mandr_r("dppd", 230, src, dst);
697 DO_imm_mandr_r("dppd", 231, src, dst);
698 DO_imm_mandr_r("dppd", 232, src, dst);
699 DO_imm_mandr_r("dppd", 233, src, dst);
700 DO_imm_mandr_r("dppd", 234, src, dst);
701 DO_imm_mandr_r("dppd", 235, src, dst);
702 DO_imm_mandr_r("dppd", 236, src, dst);
703 DO_imm_mandr_r("dppd", 237, src, dst);
704 DO_imm_mandr_r("dppd", 238, src, dst);
705 DO_imm_mandr_r("dppd", 239, src, dst);
706 DO_imm_mandr_r("dppd", 240, src, dst);
707 DO_imm_mandr_r("dppd", 241, src, dst);
708 DO_imm_mandr_r("dppd", 242, src, dst);
709 DO_imm_mandr_r("dppd", 243, src, dst);
710 DO_imm_mandr_r("dppd", 244, src, dst);
711 DO_imm_mandr_r("dppd", 245, src, dst);
712 DO_imm_mandr_r("dppd", 246, src, dst);
713 DO_imm_mandr_r("dppd", 247, src, dst);
714 DO_imm_mandr_r("dppd", 248, src, dst);
715 DO_imm_mandr_r("dppd", 249, src, dst);
716 DO_imm_mandr_r("dppd", 250, src, dst);
717 DO_imm_mandr_r("dppd", 251, src, dst);
718 DO_imm_mandr_r("dppd", 252, src, dst);
719 DO_imm_mandr_r("dppd", 253, src, dst);
720 DO_imm_mandr_r("dppd", 254, src, dst);
721 DO_imm_mandr_r("dppd", 255, src, dst);
/* Exercise DPPS with fixed operands and every 8-bit immediate (0..255), in
   both register-source and memory-source form, printing each result.
   The immediate has to be a literal in each call because the test macro
   stringizes it into the instruction text, hence the unrolled list.
   (The declarations of src and dst are not visible in this excerpt.) */
725 void test_DPPS ( void )
// Operands: four floats in src and four in dst, at byte offsets 0, 4, 8, 12.
// The constants are double literals converted to float on assignment.
// NOTE(review): if src/dst are V128 byte arrays, these stores through
// float* are type punning; do64HLtoV128 in this file uses memcpy to avoid
// exactly that -- confirm whether the same is wanted here.
729 *(float*)(&src[0]) = 1.2;
730 *(float*)(&src[4]) = -3.4;
731 *(float*)(&src[8]) = -6.7;
732 *(float*)(&src[12]) = 8.9;
733 *(float*)(&dst[0]) = -10.11;
734 *(float*)(&dst[4]) = 12.13;
735 *(float*)(&dst[8]) = 14.15;
736 *(float*)(&dst[12]) = -16.17;
// Sweep of all 256 immediate values.
737 DO_imm_mandr_r("dpps", 0, src, dst);
738 DO_imm_mandr_r("dpps", 1, src, dst);
739 DO_imm_mandr_r("dpps", 2, src, dst);
740 DO_imm_mandr_r("dpps", 3, src, dst);
741 DO_imm_mandr_r("dpps", 4, src, dst);
742 DO_imm_mandr_r("dpps", 5, src, dst);
743 DO_imm_mandr_r("dpps", 6, src, dst);
744 DO_imm_mandr_r("dpps", 7, src, dst);
745 DO_imm_mandr_r("dpps", 8, src, dst);
746 DO_imm_mandr_r("dpps", 9, src, dst);
747 DO_imm_mandr_r("dpps", 10, src, dst);
748 DO_imm_mandr_r("dpps", 11, src, dst);
749 DO_imm_mandr_r("dpps", 12, src, dst);
750 DO_imm_mandr_r("dpps", 13, src, dst);
751 DO_imm_mandr_r("dpps", 14, src, dst);
752 DO_imm_mandr_r("dpps", 15, src, dst);
753 DO_imm_mandr_r("dpps", 16, src, dst);
754 DO_imm_mandr_r("dpps", 17, src, dst);
755 DO_imm_mandr_r("dpps", 18, src, dst);
756 DO_imm_mandr_r("dpps", 19, src, dst);
757 DO_imm_mandr_r("dpps", 20, src, dst);
758 DO_imm_mandr_r("dpps", 21, src, dst);
759 DO_imm_mandr_r("dpps", 22, src, dst);
760 DO_imm_mandr_r("dpps", 23, src, dst);
761 DO_imm_mandr_r("dpps", 24, src, dst);
762 DO_imm_mandr_r("dpps", 25, src, dst);
763 DO_imm_mandr_r("dpps", 26, src, dst);
764 DO_imm_mandr_r("dpps", 27, src, dst);
765 DO_imm_mandr_r("dpps", 28, src, dst);
766 DO_imm_mandr_r("dpps", 29, src, dst);
767 DO_imm_mandr_r("dpps", 30, src, dst);
768 DO_imm_mandr_r("dpps", 31, src, dst);
769 DO_imm_mandr_r("dpps", 32, src, dst);
770 DO_imm_mandr_r("dpps", 33, src, dst);
771 DO_imm_mandr_r("dpps", 34, src, dst);
772 DO_imm_mandr_r("dpps", 35, src, dst);
773 DO_imm_mandr_r("dpps", 36, src, dst);
774 DO_imm_mandr_r("dpps", 37, src, dst);
775 DO_imm_mandr_r("dpps", 38, src, dst);
776 DO_imm_mandr_r("dpps", 39, src, dst);
777 DO_imm_mandr_r("dpps", 40, src, dst);
778 DO_imm_mandr_r("dpps", 41, src, dst);
779 DO_imm_mandr_r("dpps", 42, src, dst);
780 DO_imm_mandr_r("dpps", 43, src, dst);
781 DO_imm_mandr_r("dpps", 44, src, dst);
782 DO_imm_mandr_r("dpps", 45, src, dst);
783 DO_imm_mandr_r("dpps", 46, src, dst);
784 DO_imm_mandr_r("dpps", 47, src, dst);
785 DO_imm_mandr_r("dpps", 48, src, dst);
786 DO_imm_mandr_r("dpps", 49, src, dst);
787 DO_imm_mandr_r("dpps", 50, src, dst);
788 DO_imm_mandr_r("dpps", 51, src, dst);
789 DO_imm_mandr_r("dpps", 52, src, dst);
790 DO_imm_mandr_r("dpps", 53, src, dst);
791 DO_imm_mandr_r("dpps", 54, src, dst);
792 DO_imm_mandr_r("dpps", 55, src, dst);
793 DO_imm_mandr_r("dpps", 56, src, dst);
794 DO_imm_mandr_r("dpps", 57, src, dst);
795 DO_imm_mandr_r("dpps", 58, src, dst);
796 DO_imm_mandr_r("dpps", 59, src, dst);
797 DO_imm_mandr_r("dpps", 60, src, dst);
798 DO_imm_mandr_r("dpps", 61, src, dst);
799 DO_imm_mandr_r("dpps", 62, src, dst);
800 DO_imm_mandr_r("dpps", 63, src, dst);
801 DO_imm_mandr_r("dpps", 64, src, dst);
802 DO_imm_mandr_r("dpps", 65, src, dst);
803 DO_imm_mandr_r("dpps", 66, src, dst);
804 DO_imm_mandr_r("dpps", 67, src, dst);
805 DO_imm_mandr_r("dpps", 68, src, dst);
806 DO_imm_mandr_r("dpps", 69, src, dst);
807 DO_imm_mandr_r("dpps", 70, src, dst);
808 DO_imm_mandr_r("dpps", 71, src, dst);
809 DO_imm_mandr_r("dpps", 72, src, dst);
810 DO_imm_mandr_r("dpps", 73, src, dst);
811 DO_imm_mandr_r("dpps", 74, src, dst);
812 DO_imm_mandr_r("dpps", 75, src, dst);
813 DO_imm_mandr_r("dpps", 76, src, dst);
814 DO_imm_mandr_r("dpps", 77, src, dst);
815 DO_imm_mandr_r("dpps", 78, src, dst);
816 DO_imm_mandr_r("dpps", 79, src, dst);
817 DO_imm_mandr_r("dpps", 80, src, dst);
818 DO_imm_mandr_r("dpps", 81, src, dst);
819 DO_imm_mandr_r("dpps", 82, src, dst);
820 DO_imm_mandr_r("dpps", 83, src, dst);
821 DO_imm_mandr_r("dpps", 84, src, dst);
822 DO_imm_mandr_r("dpps", 85, src, dst);
823 DO_imm_mandr_r("dpps", 86, src, dst);
824 DO_imm_mandr_r("dpps", 87, src, dst);
825 DO_imm_mandr_r("dpps", 88, src, dst);
826 DO_imm_mandr_r("dpps", 89, src, dst);
827 DO_imm_mandr_r("dpps", 90, src, dst);
828 DO_imm_mandr_r("dpps", 91, src, dst);
829 DO_imm_mandr_r("dpps", 92, src, dst);
830 DO_imm_mandr_r("dpps", 93, src, dst);
831 DO_imm_mandr_r("dpps", 94, src, dst);
832 DO_imm_mandr_r("dpps", 95, src, dst);
833 DO_imm_mandr_r("dpps", 96, src, dst);
834 DO_imm_mandr_r("dpps", 97, src, dst);
835 DO_imm_mandr_r("dpps", 98, src, dst);
836 DO_imm_mandr_r("dpps", 99, src, dst);
837 DO_imm_mandr_r("dpps", 100, src, dst);
838 DO_imm_mandr_r("dpps", 101, src, dst);
839 DO_imm_mandr_r("dpps", 102, src, dst);
840 DO_imm_mandr_r("dpps", 103, src, dst);
841 DO_imm_mandr_r("dpps", 104, src, dst);
842 DO_imm_mandr_r("dpps", 105, src, dst);
843 DO_imm_mandr_r("dpps", 106, src, dst);
844 DO_imm_mandr_r("dpps", 107, src, dst);
845 DO_imm_mandr_r("dpps", 108, src, dst);
846 DO_imm_mandr_r("dpps", 109, src, dst);
847 DO_imm_mandr_r("dpps", 110, src, dst);
848 DO_imm_mandr_r("dpps", 111, src, dst);
849 DO_imm_mandr_r("dpps", 112, src, dst);
850 DO_imm_mandr_r("dpps", 113, src, dst);
851 DO_imm_mandr_r("dpps", 114, src, dst);
852 DO_imm_mandr_r("dpps", 115, src, dst);
853 DO_imm_mandr_r("dpps", 116, src, dst);
854 DO_imm_mandr_r("dpps", 117, src, dst);
855 DO_imm_mandr_r("dpps", 118, src, dst);
856 DO_imm_mandr_r("dpps", 119, src, dst);
857 DO_imm_mandr_r("dpps", 120, src, dst);
858 DO_imm_mandr_r("dpps", 121, src, dst);
859 DO_imm_mandr_r("dpps", 122, src, dst);
860 DO_imm_mandr_r("dpps", 123, src, dst);
861 DO_imm_mandr_r("dpps", 124, src, dst);
862 DO_imm_mandr_r("dpps", 125, src, dst);
863 DO_imm_mandr_r("dpps", 126, src, dst);
864 DO_imm_mandr_r("dpps", 127, src, dst);
865 DO_imm_mandr_r("dpps", 128, src, dst);
866 DO_imm_mandr_r("dpps", 129, src, dst);
867 DO_imm_mandr_r("dpps", 130, src, dst);
868 DO_imm_mandr_r("dpps", 131, src, dst);
869 DO_imm_mandr_r("dpps", 132, src, dst);
870 DO_imm_mandr_r("dpps", 133, src, dst);
871 DO_imm_mandr_r("dpps", 134, src, dst);
872 DO_imm_mandr_r("dpps", 135, src, dst);
873 DO_imm_mandr_r("dpps", 136, src, dst);
874 DO_imm_mandr_r("dpps", 137, src, dst);
875 DO_imm_mandr_r("dpps", 138, src, dst);
876 DO_imm_mandr_r("dpps", 139, src, dst);
877 DO_imm_mandr_r("dpps", 140, src, dst);
878 DO_imm_mandr_r("dpps", 141, src, dst);
879 DO_imm_mandr_r("dpps", 142, src, dst);
880 DO_imm_mandr_r("dpps", 143, src, dst);
881 DO_imm_mandr_r("dpps", 144, src, dst);
882 DO_imm_mandr_r("dpps", 145, src, dst);
883 DO_imm_mandr_r("dpps", 146, src, dst);
884 DO_imm_mandr_r("dpps", 147, src, dst);
885 DO_imm_mandr_r("dpps", 148, src, dst);
886 DO_imm_mandr_r("dpps", 149, src, dst);
887 DO_imm_mandr_r("dpps", 150, src, dst);
888 DO_imm_mandr_r("dpps", 151, src, dst);
889 DO_imm_mandr_r("dpps", 152, src, dst);
890 DO_imm_mandr_r("dpps", 153, src, dst);
891 DO_imm_mandr_r("dpps", 154, src, dst);
892 DO_imm_mandr_r("dpps", 155, src, dst);
893 DO_imm_mandr_r("dpps", 156, src, dst);
894 DO_imm_mandr_r("dpps", 157, src, dst);
895 DO_imm_mandr_r("dpps", 158, src, dst);
896 DO_imm_mandr_r("dpps", 159, src, dst);
897 DO_imm_mandr_r("dpps", 160, src, dst);
898 DO_imm_mandr_r("dpps", 161, src, dst);
899 DO_imm_mandr_r("dpps", 162, src, dst);
900 DO_imm_mandr_r("dpps", 163, src, dst);
901 DO_imm_mandr_r("dpps", 164, src, dst);
902 DO_imm_mandr_r("dpps", 165, src, dst);
903 DO_imm_mandr_r("dpps", 166, src, dst);
904 DO_imm_mandr_r("dpps", 167, src, dst);
905 DO_imm_mandr_r("dpps", 168, src, dst);
906 DO_imm_mandr_r("dpps", 169, src, dst);
907 DO_imm_mandr_r("dpps", 170, src, dst);
908 DO_imm_mandr_r("dpps", 171, src, dst);
909 DO_imm_mandr_r("dpps", 172, src, dst);
910 DO_imm_mandr_r("dpps", 173, src, dst);
911 DO_imm_mandr_r("dpps", 174, src, dst);
912 DO_imm_mandr_r("dpps", 175, src, dst);
913 DO_imm_mandr_r("dpps", 176, src, dst);
914 DO_imm_mandr_r("dpps", 177, src, dst);
915 DO_imm_mandr_r("dpps", 178, src, dst);
916 DO_imm_mandr_r("dpps", 179, src, dst);
917 DO_imm_mandr_r("dpps", 180, src, dst);
918 DO_imm_mandr_r("dpps", 181, src, dst);
919 DO_imm_mandr_r("dpps", 182, src, dst);
920 DO_imm_mandr_r("dpps", 183, src, dst);
921 DO_imm_mandr_r("dpps", 184, src, dst);
922 DO_imm_mandr_r("dpps", 185, src, dst);
923 DO_imm_mandr_r("dpps", 186, src, dst);
924 DO_imm_mandr_r("dpps", 187, src, dst);
925 DO_imm_mandr_r("dpps", 188, src, dst);
926 DO_imm_mandr_r("dpps", 189, src, dst);
927 DO_imm_mandr_r("dpps", 190, src, dst);
928 DO_imm_mandr_r("dpps", 191, src, dst);
929 DO_imm_mandr_r("dpps", 192, src, dst);
930 DO_imm_mandr_r("dpps", 193, src, dst);
931 DO_imm_mandr_r("dpps", 194, src, dst);
932 DO_imm_mandr_r("dpps", 195, src, dst);
933 DO_imm_mandr_r("dpps", 196, src, dst);
934 DO_imm_mandr_r("dpps", 197, src, dst);
935 DO_imm_mandr_r("dpps", 198, src, dst);
936 DO_imm_mandr_r("dpps", 199, src, dst);
937 DO_imm_mandr_r("dpps", 200, src, dst);
938 DO_imm_mandr_r("dpps", 201, src, dst);
939 DO_imm_mandr_r("dpps", 202, src, dst);
940 DO_imm_mandr_r("dpps", 203, src, dst);
941 DO_imm_mandr_r("dpps", 204, src, dst);
942 DO_imm_mandr_r("dpps", 205, src, dst);
943 DO_imm_mandr_r("dpps", 206, src, dst);
944 DO_imm_mandr_r("dpps", 207, src, dst);
945 DO_imm_mandr_r("dpps", 208, src, dst);
946 DO_imm_mandr_r("dpps", 209, src, dst);
947 DO_imm_mandr_r("dpps", 210, src, dst);
948 DO_imm_mandr_r("dpps", 211, src, dst);
949 DO_imm_mandr_r("dpps", 212, src, dst);
950 DO_imm_mandr_r("dpps", 213, src, dst);
951 DO_imm_mandr_r("dpps", 214, src, dst);
952 DO_imm_mandr_r("dpps", 215, src, dst);
953 DO_imm_mandr_r("dpps", 216, src, dst);
954 DO_imm_mandr_r("dpps", 217, src, dst);
955 DO_imm_mandr_r("dpps", 218, src, dst);
956 DO_imm_mandr_r("dpps", 219, src, dst);
957 DO_imm_mandr_r("dpps", 220, src, dst);
958 DO_imm_mandr_r("dpps", 221, src, dst);
959 DO_imm_mandr_r("dpps", 222, src, dst);
960 DO_imm_mandr_r("dpps", 223, src, dst);
961 DO_imm_mandr_r("dpps", 224, src, dst);
962 DO_imm_mandr_r("dpps", 225, src, dst);
963 DO_imm_mandr_r("dpps", 226, src, dst);
964 DO_imm_mandr_r("dpps", 227, src, dst);
965 DO_imm_mandr_r("dpps", 228, src, dst);
966 DO_imm_mandr_r("dpps", 229, src, dst);
967 DO_imm_mandr_r("dpps", 230, src, dst);
968 DO_imm_mandr_r("dpps", 231, src, dst);
969 DO_imm_mandr_r("dpps", 232, src, dst);
970 DO_imm_mandr_r("dpps", 233, src, dst);
971 DO_imm_mandr_r("dpps", 234, src, dst);
972 DO_imm_mandr_r("dpps", 235, src, dst);
973 DO_imm_mandr_r("dpps", 236, src, dst);
974 DO_imm_mandr_r("dpps", 237, src, dst);
975 DO_imm_mandr_r("dpps", 238, src, dst);
976 DO_imm_mandr_r("dpps", 239, src, dst);
977 DO_imm_mandr_r("dpps", 240, src, dst);
978 DO_imm_mandr_r("dpps", 241, src, dst);
979 DO_imm_mandr_r("dpps", 242, src, dst);
980 DO_imm_mandr_r("dpps", 243, src, dst);
981 DO_imm_mandr_r("dpps", 244, src, dst);
982 DO_imm_mandr_r("dpps", 245, src, dst);
983 DO_imm_mandr_r("dpps", 246, src, dst);
984 DO_imm_mandr_r("dpps", 247, src, dst);
985 DO_imm_mandr_r("dpps", 248, src, dst);
986 DO_imm_mandr_r("dpps", 249, src, dst);
987 DO_imm_mandr_r("dpps", 250, src, dst);
988 DO_imm_mandr_r("dpps", 251, src, dst);
989 DO_imm_mandr_r("dpps", 252, src, dst);
990 DO_imm_mandr_r("dpps", 253, src, dst);
991 DO_imm_mandr_r("dpps", 254, src, dst);
992 DO_imm_mandr_r("dpps", 255, src, dst);
/* Exercises the insertps instruction with every 8-bit immediate, 0 through
   255, through the DO_imm_mandr_r macro (defined outside this function).
   Before the runs, src is loaded with the four floats 1.2, -3.4, -6.7, 8.9
   and dst with -10.11, 12.13, 14.15, -16.17, stored at byte offsets 0, 4, 8
   and 12 of each vector.
   NOTE(review): the stores go through a float* cast of the src/dst bytes;
   if src/dst are V128 (unsigned char arrays -- declaration not visible
   here, TODO confirm) this is the kind of type punning that do64HLtoV128
   sidesteps with memcpy. */
996 void test_INSERTPS ( void )
1000 *(float*)(&src[0]) = 1.2;
1001 *(float*)(&src[4]) = -3.4;
1002 *(float*)(&src[8]) = -6.7;
1003 *(float*)(&src[12]) = 8.9;
1004 *(float*)(&dst[0]) = -10.11;
1005 *(float*)(&dst[4]) = 12.13;
1006 *(float*)(&dst[8]) = 14.15;
1007 *(float*)(&dst[12]) = -16.17;
/* One invocation per immediate value; the immediate is written as a
   literal in each call. */
1008 DO_imm_mandr_r("insertps", 0, src, dst);
1009 DO_imm_mandr_r("insertps", 1, src, dst);
1010 DO_imm_mandr_r("insertps", 2, src, dst);
1011 DO_imm_mandr_r("insertps", 3, src, dst);
1012 DO_imm_mandr_r("insertps", 4, src, dst);
1013 DO_imm_mandr_r("insertps", 5, src, dst);
1014 DO_imm_mandr_r("insertps", 6, src, dst);
1015 DO_imm_mandr_r("insertps", 7, src, dst);
1016 DO_imm_mandr_r("insertps", 8, src, dst);
1017 DO_imm_mandr_r("insertps", 9, src, dst);
1018 DO_imm_mandr_r("insertps", 10, src, dst);
1019 DO_imm_mandr_r("insertps", 11, src, dst);
1020 DO_imm_mandr_r("insertps", 12, src, dst);
1021 DO_imm_mandr_r("insertps", 13, src, dst);
1022 DO_imm_mandr_r("insertps", 14, src, dst);
1023 DO_imm_mandr_r("insertps", 15, src, dst);
1024 DO_imm_mandr_r("insertps", 16, src, dst);
1025 DO_imm_mandr_r("insertps", 17, src, dst);
1026 DO_imm_mandr_r("insertps", 18, src, dst);
1027 DO_imm_mandr_r("insertps", 19, src, dst);
1028 DO_imm_mandr_r("insertps", 20, src, dst);
1029 DO_imm_mandr_r("insertps", 21, src, dst);
1030 DO_imm_mandr_r("insertps", 22, src, dst);
1031 DO_imm_mandr_r("insertps", 23, src, dst);
1032 DO_imm_mandr_r("insertps", 24, src, dst);
1033 DO_imm_mandr_r("insertps", 25, src, dst);
1034 DO_imm_mandr_r("insertps", 26, src, dst);
1035 DO_imm_mandr_r("insertps", 27, src, dst);
1036 DO_imm_mandr_r("insertps", 28, src, dst);
1037 DO_imm_mandr_r("insertps", 29, src, dst);
1038 DO_imm_mandr_r("insertps", 30, src, dst);
1039 DO_imm_mandr_r("insertps", 31, src, dst);
1040 DO_imm_mandr_r("insertps", 32, src, dst);
1041 DO_imm_mandr_r("insertps", 33, src, dst);
1042 DO_imm_mandr_r("insertps", 34, src, dst);
1043 DO_imm_mandr_r("insertps", 35, src, dst);
1044 DO_imm_mandr_r("insertps", 36, src, dst);
1045 DO_imm_mandr_r("insertps", 37, src, dst);
1046 DO_imm_mandr_r("insertps", 38, src, dst);
1047 DO_imm_mandr_r("insertps", 39, src, dst);
1048 DO_imm_mandr_r("insertps", 40, src, dst);
1049 DO_imm_mandr_r("insertps", 41, src, dst);
1050 DO_imm_mandr_r("insertps", 42, src, dst);
1051 DO_imm_mandr_r("insertps", 43, src, dst);
1052 DO_imm_mandr_r("insertps", 44, src, dst);
1053 DO_imm_mandr_r("insertps", 45, src, dst);
1054 DO_imm_mandr_r("insertps", 46, src, dst);
1055 DO_imm_mandr_r("insertps", 47, src, dst);
1056 DO_imm_mandr_r("insertps", 48, src, dst);
1057 DO_imm_mandr_r("insertps", 49, src, dst);
1058 DO_imm_mandr_r("insertps", 50, src, dst);
1059 DO_imm_mandr_r("insertps", 51, src, dst);
1060 DO_imm_mandr_r("insertps", 52, src, dst);
1061 DO_imm_mandr_r("insertps", 53, src, dst);
1062 DO_imm_mandr_r("insertps", 54, src, dst);
1063 DO_imm_mandr_r("insertps", 55, src, dst);
1064 DO_imm_mandr_r("insertps", 56, src, dst);
1065 DO_imm_mandr_r("insertps", 57, src, dst);
1066 DO_imm_mandr_r("insertps", 58, src, dst);
1067 DO_imm_mandr_r("insertps", 59, src, dst);
1068 DO_imm_mandr_r("insertps", 60, src, dst);
1069 DO_imm_mandr_r("insertps", 61, src, dst);
1070 DO_imm_mandr_r("insertps", 62, src, dst);
1071 DO_imm_mandr_r("insertps", 63, src, dst);
1072 DO_imm_mandr_r("insertps", 64, src, dst);
1073 DO_imm_mandr_r("insertps", 65, src, dst);
1074 DO_imm_mandr_r("insertps", 66, src, dst);
1075 DO_imm_mandr_r("insertps", 67, src, dst);
1076 DO_imm_mandr_r("insertps", 68, src, dst);
1077 DO_imm_mandr_r("insertps", 69, src, dst);
1078 DO_imm_mandr_r("insertps", 70, src, dst);
1079 DO_imm_mandr_r("insertps", 71, src, dst);
1080 DO_imm_mandr_r("insertps", 72, src, dst);
1081 DO_imm_mandr_r("insertps", 73, src, dst);
1082 DO_imm_mandr_r("insertps", 74, src, dst);
1083 DO_imm_mandr_r("insertps", 75, src, dst);
1084 DO_imm_mandr_r("insertps", 76, src, dst);
1085 DO_imm_mandr_r("insertps", 77, src, dst);
1086 DO_imm_mandr_r("insertps", 78, src, dst);
1087 DO_imm_mandr_r("insertps", 79, src, dst);
1088 DO_imm_mandr_r("insertps", 80, src, dst);
1089 DO_imm_mandr_r("insertps", 81, src, dst);
1090 DO_imm_mandr_r("insertps", 82, src, dst);
1091 DO_imm_mandr_r("insertps", 83, src, dst);
1092 DO_imm_mandr_r("insertps", 84, src, dst);
1093 DO_imm_mandr_r("insertps", 85, src, dst);
1094 DO_imm_mandr_r("insertps", 86, src, dst);
1095 DO_imm_mandr_r("insertps", 87, src, dst);
1096 DO_imm_mandr_r("insertps", 88, src, dst);
1097 DO_imm_mandr_r("insertps", 89, src, dst);
1098 DO_imm_mandr_r("insertps", 90, src, dst);
1099 DO_imm_mandr_r("insertps", 91, src, dst);
1100 DO_imm_mandr_r("insertps", 92, src, dst);
1101 DO_imm_mandr_r("insertps", 93, src, dst);
1102 DO_imm_mandr_r("insertps", 94, src, dst);
1103 DO_imm_mandr_r("insertps", 95, src, dst);
1104 DO_imm_mandr_r("insertps", 96, src, dst);
1105 DO_imm_mandr_r("insertps", 97, src, dst);
1106 DO_imm_mandr_r("insertps", 98, src, dst);
1107 DO_imm_mandr_r("insertps", 99, src, dst);
1108 DO_imm_mandr_r("insertps", 100, src, dst);
1109 DO_imm_mandr_r("insertps", 101, src, dst);
1110 DO_imm_mandr_r("insertps", 102, src, dst);
1111 DO_imm_mandr_r("insertps", 103, src, dst);
1112 DO_imm_mandr_r("insertps", 104, src, dst);
1113 DO_imm_mandr_r("insertps", 105, src, dst);
1114 DO_imm_mandr_r("insertps", 106, src, dst);
1115 DO_imm_mandr_r("insertps", 107, src, dst);
1116 DO_imm_mandr_r("insertps", 108, src, dst);
1117 DO_imm_mandr_r("insertps", 109, src, dst);
1118 DO_imm_mandr_r("insertps", 110, src, dst);
1119 DO_imm_mandr_r("insertps", 111, src, dst);
1120 DO_imm_mandr_r("insertps", 112, src, dst);
1121 DO_imm_mandr_r("insertps", 113, src, dst);
1122 DO_imm_mandr_r("insertps", 114, src, dst);
1123 DO_imm_mandr_r("insertps", 115, src, dst);
1124 DO_imm_mandr_r("insertps", 116, src, dst);
1125 DO_imm_mandr_r("insertps", 117, src, dst);
1126 DO_imm_mandr_r("insertps", 118, src, dst);
1127 DO_imm_mandr_r("insertps", 119, src, dst);
1128 DO_imm_mandr_r("insertps", 120, src, dst);
1129 DO_imm_mandr_r("insertps", 121, src, dst);
1130 DO_imm_mandr_r("insertps", 122, src, dst);
1131 DO_imm_mandr_r("insertps", 123, src, dst);
1132 DO_imm_mandr_r("insertps", 124, src, dst);
1133 DO_imm_mandr_r("insertps", 125, src, dst);
1134 DO_imm_mandr_r("insertps", 126, src, dst);
1135 DO_imm_mandr_r("insertps", 127, src, dst);
1136 DO_imm_mandr_r("insertps", 128, src, dst);
1137 DO_imm_mandr_r("insertps", 129, src, dst);
1138 DO_imm_mandr_r("insertps", 130, src, dst);
1139 DO_imm_mandr_r("insertps", 131, src, dst);
1140 DO_imm_mandr_r("insertps", 132, src, dst);
1141 DO_imm_mandr_r("insertps", 133, src, dst);
1142 DO_imm_mandr_r("insertps", 134, src, dst);
1143 DO_imm_mandr_r("insertps", 135, src, dst);
1144 DO_imm_mandr_r("insertps", 136, src, dst);
1145 DO_imm_mandr_r("insertps", 137, src, dst);
1146 DO_imm_mandr_r("insertps", 138, src, dst);
1147 DO_imm_mandr_r("insertps", 139, src, dst);
1148 DO_imm_mandr_r("insertps", 140, src, dst);
1149 DO_imm_mandr_r("insertps", 141, src, dst);
1150 DO_imm_mandr_r("insertps", 142, src, dst);
1151 DO_imm_mandr_r("insertps", 143, src, dst);
1152 DO_imm_mandr_r("insertps", 144, src, dst);
1153 DO_imm_mandr_r("insertps", 145, src, dst);
1154 DO_imm_mandr_r("insertps", 146, src, dst);
1155 DO_imm_mandr_r("insertps", 147, src, dst);
1156 DO_imm_mandr_r("insertps", 148, src, dst);
1157 DO_imm_mandr_r("insertps", 149, src, dst);
1158 DO_imm_mandr_r("insertps", 150, src, dst);
1159 DO_imm_mandr_r("insertps", 151, src, dst);
1160 DO_imm_mandr_r("insertps", 152, src, dst);
1161 DO_imm_mandr_r("insertps", 153, src, dst);
1162 DO_imm_mandr_r("insertps", 154, src, dst);
1163 DO_imm_mandr_r("insertps", 155, src, dst);
1164 DO_imm_mandr_r("insertps", 156, src, dst);
1165 DO_imm_mandr_r("insertps", 157, src, dst);
1166 DO_imm_mandr_r("insertps", 158, src, dst);
1167 DO_imm_mandr_r("insertps", 159, src, dst);
1168 DO_imm_mandr_r("insertps", 160, src, dst);
1169 DO_imm_mandr_r("insertps", 161, src, dst);
1170 DO_imm_mandr_r("insertps", 162, src, dst);
1171 DO_imm_mandr_r("insertps", 163, src, dst);
1172 DO_imm_mandr_r("insertps", 164, src, dst);
1173 DO_imm_mandr_r("insertps", 165, src, dst);
1174 DO_imm_mandr_r("insertps", 166, src, dst);
1175 DO_imm_mandr_r("insertps", 167, src, dst);
1176 DO_imm_mandr_r("insertps", 168, src, dst);
1177 DO_imm_mandr_r("insertps", 169, src, dst);
1178 DO_imm_mandr_r("insertps", 170, src, dst);
1179 DO_imm_mandr_r("insertps", 171, src, dst);
1180 DO_imm_mandr_r("insertps", 172, src, dst);
1181 DO_imm_mandr_r("insertps", 173, src, dst);
1182 DO_imm_mandr_r("insertps", 174, src, dst);
1183 DO_imm_mandr_r("insertps", 175, src, dst);
1184 DO_imm_mandr_r("insertps", 176, src, dst);
1185 DO_imm_mandr_r("insertps", 177, src, dst);
1186 DO_imm_mandr_r("insertps", 178, src, dst);
1187 DO_imm_mandr_r("insertps", 179, src, dst);
1188 DO_imm_mandr_r("insertps", 180, src, dst);
1189 DO_imm_mandr_r("insertps", 181, src, dst);
1190 DO_imm_mandr_r("insertps", 182, src, dst);
1191 DO_imm_mandr_r("insertps", 183, src, dst);
1192 DO_imm_mandr_r("insertps", 184, src, dst);
1193 DO_imm_mandr_r("insertps", 185, src, dst);
1194 DO_imm_mandr_r("insertps", 186, src, dst);
1195 DO_imm_mandr_r("insertps", 187, src, dst);
1196 DO_imm_mandr_r("insertps", 188, src, dst);
1197 DO_imm_mandr_r("insertps", 189, src, dst);
1198 DO_imm_mandr_r("insertps", 190, src, dst);
1199 DO_imm_mandr_r("insertps", 191, src, dst);
1200 DO_imm_mandr_r("insertps", 192, src, dst);
1201 DO_imm_mandr_r("insertps", 193, src, dst);
1202 DO_imm_mandr_r("insertps", 194, src, dst);
1203 DO_imm_mandr_r("insertps", 195, src, dst);
1204 DO_imm_mandr_r("insertps", 196, src, dst);
1205 DO_imm_mandr_r("insertps", 197, src, dst);
1206 DO_imm_mandr_r("insertps", 198, src, dst);
1207 DO_imm_mandr_r("insertps", 199, src, dst);
1208 DO_imm_mandr_r("insertps", 200, src, dst);
1209 DO_imm_mandr_r("insertps", 201, src, dst);
1210 DO_imm_mandr_r("insertps", 202, src, dst);
1211 DO_imm_mandr_r("insertps", 203, src, dst);
1212 DO_imm_mandr_r("insertps", 204, src, dst);
1213 DO_imm_mandr_r("insertps", 205, src, dst);
1214 DO_imm_mandr_r("insertps", 206, src, dst);
1215 DO_imm_mandr_r("insertps", 207, src, dst);
1216 DO_imm_mandr_r("insertps", 208, src, dst);
1217 DO_imm_mandr_r("insertps", 209, src, dst);
1218 DO_imm_mandr_r("insertps", 210, src, dst);
1219 DO_imm_mandr_r("insertps", 211, src, dst);
1220 DO_imm_mandr_r("insertps", 212, src, dst);
1221 DO_imm_mandr_r("insertps", 213, src, dst);
1222 DO_imm_mandr_r("insertps", 214, src, dst);
1223 DO_imm_mandr_r("insertps", 215, src, dst);
1224 DO_imm_mandr_r("insertps", 216, src, dst);
1225 DO_imm_mandr_r("insertps", 217, src, dst);
1226 DO_imm_mandr_r("insertps", 218, src, dst);
1227 DO_imm_mandr_r("insertps", 219, src, dst);
1228 DO_imm_mandr_r("insertps", 220, src, dst);
1229 DO_imm_mandr_r("insertps", 221, src, dst);
1230 DO_imm_mandr_r("insertps", 222, src, dst);
1231 DO_imm_mandr_r("insertps", 223, src, dst);
1232 DO_imm_mandr_r("insertps", 224, src, dst);
1233 DO_imm_mandr_r("insertps", 225, src, dst);
1234 DO_imm_mandr_r("insertps", 226, src, dst);
1235 DO_imm_mandr_r("insertps", 227, src, dst);
1236 DO_imm_mandr_r("insertps", 228, src, dst);
1237 DO_imm_mandr_r("insertps", 229, src, dst);
1238 DO_imm_mandr_r("insertps", 230, src, dst);
1239 DO_imm_mandr_r("insertps", 231, src, dst);
1240 DO_imm_mandr_r("insertps", 232, src, dst);
1241 DO_imm_mandr_r("insertps", 233, src, dst);
1242 DO_imm_mandr_r("insertps", 234, src, dst);
1243 DO_imm_mandr_r("insertps", 235, src, dst);
1244 DO_imm_mandr_r("insertps", 236, src, dst);
1245 DO_imm_mandr_r("insertps", 237, src, dst);
1246 DO_imm_mandr_r("insertps", 238, src, dst);
1247 DO_imm_mandr_r("insertps", 239, src, dst);
1248 DO_imm_mandr_r("insertps", 240, src, dst);
1249 DO_imm_mandr_r("insertps", 241, src, dst);
1250 DO_imm_mandr_r("insertps", 242, src, dst);
1251 DO_imm_mandr_r("insertps", 243, src, dst);
1252 DO_imm_mandr_r("insertps", 244, src, dst);
1253 DO_imm_mandr_r("insertps", 245, src, dst);
1254 DO_imm_mandr_r("insertps", 246, src, dst);
1255 DO_imm_mandr_r("insertps", 247, src, dst);
1256 DO_imm_mandr_r("insertps", 248, src, dst);
1257 DO_imm_mandr_r("insertps", 249, src, dst);
1258 DO_imm_mandr_r("insertps", 250, src, dst);
1259 DO_imm_mandr_r("insertps", 251, src, dst);
1260 DO_imm_mandr_r("insertps", 252, src, dst);
1261 DO_imm_mandr_r("insertps", 253, src, dst);
1262 DO_imm_mandr_r("insertps", 254, src, dst);
1263 DO_imm_mandr_r("insertps", 255, src, dst);
/* Exercises mpsadbw with immediates 0 through 7 via DO_imm_mandr_r on
   src/dst, alongside a loop that counts i from 0 to 9 -- presumably all
   eight immediates are run on each iteration (TODO confirm; the loop body
   boundaries are not visible here). */
1267 void test_MPSADBW ( void )
1271 for (i = 0; i < 10; i++) {
1274 DO_imm_mandr_r("mpsadbw", 0, src, dst);
1275 DO_imm_mandr_r("mpsadbw", 1, src, dst);
1276 DO_imm_mandr_r("mpsadbw", 2, src, dst);
1277 DO_imm_mandr_r("mpsadbw", 3, src, dst);
1278 DO_imm_mandr_r("mpsadbw", 4, src, dst);
1279 DO_imm_mandr_r("mpsadbw", 5, src, dst);
1280 DO_imm_mandr_r("mpsadbw", 6, src, dst);
1281 DO_imm_mandr_r("mpsadbw", 7, src, dst);
/* Exercises packusdw on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9.  One of the cases uses a fixed pattern instead of whatever
   the other iterations use: both vectors are zeroed and then only bytes
   0-1, 4-5, 8-9 and 12-13 are set, so (on little-endian x86, and assuming
   16-byte vectors) every 32-bit lane holds a value below 0x10000. */
1285 void test_PACKUSDW ( void )
1289 for (i = 0; i < 10; i++) {
1294 memset(&src, 0, sizeof(src));
/* Size the clear by the object actually being cleared. */
1295 memset(&dst, 0, sizeof(dst));
1296 src[0] = 0x11; src[1] = 0x22;
1297 src[4] = 0x33; src[5] = 0x44;
1298 src[8] = 0x55; src[9] = 0x66;
1299 src[12] = 0x77; src[13] = 0x88;
1300 dst[0] = 0xaa; dst[1] = 0xbb;
1301 dst[4] = 0xcc; dst[5] = 0xdd;
1302 dst[8] = 0xee; dst[9] = 0xff;
1303 dst[12] = 0xa1; dst[13] = 0xb2;
1305 DO_mandr_r("packusdw", src, dst);
/* Exercises the pblendw instruction with every 8-bit immediate, 0 through
   255, through the DO_imm_mandr_r macro (defined outside this function)
   on src/dst.  The immediate is written as a literal in each call. */
1309 void test_PBLENDW ( void )
1315 DO_imm_mandr_r("pblendw", 0, src, dst);
1316 DO_imm_mandr_r("pblendw", 1, src, dst);
1317 DO_imm_mandr_r("pblendw", 2, src, dst);
1318 DO_imm_mandr_r("pblendw", 3, src, dst);
1319 DO_imm_mandr_r("pblendw", 4, src, dst);
1320 DO_imm_mandr_r("pblendw", 5, src, dst);
1321 DO_imm_mandr_r("pblendw", 6, src, dst);
1322 DO_imm_mandr_r("pblendw", 7, src, dst);
1323 DO_imm_mandr_r("pblendw", 8, src, dst);
1324 DO_imm_mandr_r("pblendw", 9, src, dst);
1325 DO_imm_mandr_r("pblendw", 10, src, dst);
1326 DO_imm_mandr_r("pblendw", 11, src, dst);
1327 DO_imm_mandr_r("pblendw", 12, src, dst);
1328 DO_imm_mandr_r("pblendw", 13, src, dst);
1329 DO_imm_mandr_r("pblendw", 14, src, dst);
1330 DO_imm_mandr_r("pblendw", 15, src, dst);
1331 DO_imm_mandr_r("pblendw", 16, src, dst);
1332 DO_imm_mandr_r("pblendw", 17, src, dst);
1333 DO_imm_mandr_r("pblendw", 18, src, dst);
1334 DO_imm_mandr_r("pblendw", 19, src, dst);
1335 DO_imm_mandr_r("pblendw", 20, src, dst);
1336 DO_imm_mandr_r("pblendw", 21, src, dst);
1337 DO_imm_mandr_r("pblendw", 22, src, dst);
1338 DO_imm_mandr_r("pblendw", 23, src, dst);
1339 DO_imm_mandr_r("pblendw", 24, src, dst);
1340 DO_imm_mandr_r("pblendw", 25, src, dst);
1341 DO_imm_mandr_r("pblendw", 26, src, dst);
1342 DO_imm_mandr_r("pblendw", 27, src, dst);
1343 DO_imm_mandr_r("pblendw", 28, src, dst);
1344 DO_imm_mandr_r("pblendw", 29, src, dst);
1345 DO_imm_mandr_r("pblendw", 30, src, dst);
1346 DO_imm_mandr_r("pblendw", 31, src, dst);
1347 DO_imm_mandr_r("pblendw", 32, src, dst);
1348 DO_imm_mandr_r("pblendw", 33, src, dst);
1349 DO_imm_mandr_r("pblendw", 34, src, dst);
1350 DO_imm_mandr_r("pblendw", 35, src, dst);
1351 DO_imm_mandr_r("pblendw", 36, src, dst);
1352 DO_imm_mandr_r("pblendw", 37, src, dst);
1353 DO_imm_mandr_r("pblendw", 38, src, dst);
1354 DO_imm_mandr_r("pblendw", 39, src, dst);
1355 DO_imm_mandr_r("pblendw", 40, src, dst);
1356 DO_imm_mandr_r("pblendw", 41, src, dst);
1357 DO_imm_mandr_r("pblendw", 42, src, dst);
1358 DO_imm_mandr_r("pblendw", 43, src, dst);
1359 DO_imm_mandr_r("pblendw", 44, src, dst);
1360 DO_imm_mandr_r("pblendw", 45, src, dst);
1361 DO_imm_mandr_r("pblendw", 46, src, dst);
1362 DO_imm_mandr_r("pblendw", 47, src, dst);
1363 DO_imm_mandr_r("pblendw", 48, src, dst);
1364 DO_imm_mandr_r("pblendw", 49, src, dst);
1365 DO_imm_mandr_r("pblendw", 50, src, dst);
1366 DO_imm_mandr_r("pblendw", 51, src, dst);
1367 DO_imm_mandr_r("pblendw", 52, src, dst);
1368 DO_imm_mandr_r("pblendw", 53, src, dst);
1369 DO_imm_mandr_r("pblendw", 54, src, dst);
1370 DO_imm_mandr_r("pblendw", 55, src, dst);
1371 DO_imm_mandr_r("pblendw", 56, src, dst);
1372 DO_imm_mandr_r("pblendw", 57, src, dst);
1373 DO_imm_mandr_r("pblendw", 58, src, dst);
1374 DO_imm_mandr_r("pblendw", 59, src, dst);
1375 DO_imm_mandr_r("pblendw", 60, src, dst);
1376 DO_imm_mandr_r("pblendw", 61, src, dst);
1377 DO_imm_mandr_r("pblendw", 62, src, dst);
1378 DO_imm_mandr_r("pblendw", 63, src, dst);
1379 DO_imm_mandr_r("pblendw", 64, src, dst);
1380 DO_imm_mandr_r("pblendw", 65, src, dst);
1381 DO_imm_mandr_r("pblendw", 66, src, dst);
1382 DO_imm_mandr_r("pblendw", 67, src, dst);
1383 DO_imm_mandr_r("pblendw", 68, src, dst);
1384 DO_imm_mandr_r("pblendw", 69, src, dst);
1385 DO_imm_mandr_r("pblendw", 70, src, dst);
1386 DO_imm_mandr_r("pblendw", 71, src, dst);
1387 DO_imm_mandr_r("pblendw", 72, src, dst);
1388 DO_imm_mandr_r("pblendw", 73, src, dst);
1389 DO_imm_mandr_r("pblendw", 74, src, dst);
1390 DO_imm_mandr_r("pblendw", 75, src, dst);
1391 DO_imm_mandr_r("pblendw", 76, src, dst);
1392 DO_imm_mandr_r("pblendw", 77, src, dst);
1393 DO_imm_mandr_r("pblendw", 78, src, dst);
1394 DO_imm_mandr_r("pblendw", 79, src, dst);
1395 DO_imm_mandr_r("pblendw", 80, src, dst);
1396 DO_imm_mandr_r("pblendw", 81, src, dst);
1397 DO_imm_mandr_r("pblendw", 82, src, dst);
1398 DO_imm_mandr_r("pblendw", 83, src, dst);
1399 DO_imm_mandr_r("pblendw", 84, src, dst);
1400 DO_imm_mandr_r("pblendw", 85, src, dst);
1401 DO_imm_mandr_r("pblendw", 86, src, dst);
1402 DO_imm_mandr_r("pblendw", 87, src, dst);
1403 DO_imm_mandr_r("pblendw", 88, src, dst);
1404 DO_imm_mandr_r("pblendw", 89, src, dst);
1405 DO_imm_mandr_r("pblendw", 90, src, dst);
1406 DO_imm_mandr_r("pblendw", 91, src, dst);
1407 DO_imm_mandr_r("pblendw", 92, src, dst);
1408 DO_imm_mandr_r("pblendw", 93, src, dst);
1409 DO_imm_mandr_r("pblendw", 94, src, dst);
1410 DO_imm_mandr_r("pblendw", 95, src, dst);
1411 DO_imm_mandr_r("pblendw", 96, src, dst);
1412 DO_imm_mandr_r("pblendw", 97, src, dst);
1413 DO_imm_mandr_r("pblendw", 98, src, dst);
1414 DO_imm_mandr_r("pblendw", 99, src, dst);
1415 DO_imm_mandr_r("pblendw", 100, src, dst);
1416 DO_imm_mandr_r("pblendw", 101, src, dst);
1417 DO_imm_mandr_r("pblendw", 102, src, dst);
1418 DO_imm_mandr_r("pblendw", 103, src, dst);
1419 DO_imm_mandr_r("pblendw", 104, src, dst);
1420 DO_imm_mandr_r("pblendw", 105, src, dst);
1421 DO_imm_mandr_r("pblendw", 106, src, dst);
1422 DO_imm_mandr_r("pblendw", 107, src, dst);
1423 DO_imm_mandr_r("pblendw", 108, src, dst);
1424 DO_imm_mandr_r("pblendw", 109, src, dst);
1425 DO_imm_mandr_r("pblendw", 110, src, dst);
1426 DO_imm_mandr_r("pblendw", 111, src, dst);
1427 DO_imm_mandr_r("pblendw", 112, src, dst);
1428 DO_imm_mandr_r("pblendw", 113, src, dst);
1429 DO_imm_mandr_r("pblendw", 114, src, dst);
1430 DO_imm_mandr_r("pblendw", 115, src, dst);
1431 DO_imm_mandr_r("pblendw", 116, src, dst);
1432 DO_imm_mandr_r("pblendw", 117, src, dst);
1433 DO_imm_mandr_r("pblendw", 118, src, dst);
1434 DO_imm_mandr_r("pblendw", 119, src, dst);
1435 DO_imm_mandr_r("pblendw", 120, src, dst);
1436 DO_imm_mandr_r("pblendw", 121, src, dst);
1437 DO_imm_mandr_r("pblendw", 122, src, dst);
1438 DO_imm_mandr_r("pblendw", 123, src, dst);
1439 DO_imm_mandr_r("pblendw", 124, src, dst);
1440 DO_imm_mandr_r("pblendw", 125, src, dst);
1441 DO_imm_mandr_r("pblendw", 126, src, dst);
1442 DO_imm_mandr_r("pblendw", 127, src, dst);
1443 DO_imm_mandr_r("pblendw", 128, src, dst);
1444 DO_imm_mandr_r("pblendw", 129, src, dst);
1445 DO_imm_mandr_r("pblendw", 130, src, dst);
1446 DO_imm_mandr_r("pblendw", 131, src, dst);
1447 DO_imm_mandr_r("pblendw", 132, src, dst);
1448 DO_imm_mandr_r("pblendw", 133, src, dst);
1449 DO_imm_mandr_r("pblendw", 134, src, dst);
1450 DO_imm_mandr_r("pblendw", 135, src, dst);
1451 DO_imm_mandr_r("pblendw", 136, src, dst);
1452 DO_imm_mandr_r("pblendw", 137, src, dst);
1453 DO_imm_mandr_r("pblendw", 138, src, dst);
1454 DO_imm_mandr_r("pblendw", 139, src, dst);
1455 DO_imm_mandr_r("pblendw", 140, src, dst);
1456 DO_imm_mandr_r("pblendw", 141, src, dst);
1457 DO_imm_mandr_r("pblendw", 142, src, dst);
1458 DO_imm_mandr_r("pblendw", 143, src, dst);
1459 DO_imm_mandr_r("pblendw", 144, src, dst);
1460 DO_imm_mandr_r("pblendw", 145, src, dst);
1461 DO_imm_mandr_r("pblendw", 146, src, dst);
1462 DO_imm_mandr_r("pblendw", 147, src, dst);
1463 DO_imm_mandr_r("pblendw", 148, src, dst);
1464 DO_imm_mandr_r("pblendw", 149, src, dst);
1465 DO_imm_mandr_r("pblendw", 150, src, dst);
1466 DO_imm_mandr_r("pblendw", 151, src, dst);
1467 DO_imm_mandr_r("pblendw", 152, src, dst);
1468 DO_imm_mandr_r("pblendw", 153, src, dst);
1469 DO_imm_mandr_r("pblendw", 154, src, dst);
1470 DO_imm_mandr_r("pblendw", 155, src, dst);
1471 DO_imm_mandr_r("pblendw", 156, src, dst);
1472 DO_imm_mandr_r("pblendw", 157, src, dst);
1473 DO_imm_mandr_r("pblendw", 158, src, dst);
1474 DO_imm_mandr_r("pblendw", 159, src, dst);
1475 DO_imm_mandr_r("pblendw", 160, src, dst);
1476 DO_imm_mandr_r("pblendw", 161, src, dst);
1477 DO_imm_mandr_r("pblendw", 162, src, dst);
1478 DO_imm_mandr_r("pblendw", 163, src, dst);
1479 DO_imm_mandr_r("pblendw", 164, src, dst);
1480 DO_imm_mandr_r("pblendw", 165, src, dst);
1481 DO_imm_mandr_r("pblendw", 166, src, dst);
1482 DO_imm_mandr_r("pblendw", 167, src, dst);
1483 DO_imm_mandr_r("pblendw", 168, src, dst);
1484 DO_imm_mandr_r("pblendw", 169, src, dst);
1485 DO_imm_mandr_r("pblendw", 170, src, dst);
1486 DO_imm_mandr_r("pblendw", 171, src, dst);
1487 DO_imm_mandr_r("pblendw", 172, src, dst);
1488 DO_imm_mandr_r("pblendw", 173, src, dst);
1489 DO_imm_mandr_r("pblendw", 174, src, dst);
1490 DO_imm_mandr_r("pblendw", 175, src, dst);
1491 DO_imm_mandr_r("pblendw", 176, src, dst);
1492 DO_imm_mandr_r("pblendw", 177, src, dst);
1493 DO_imm_mandr_r("pblendw", 178, src, dst);
1494 DO_imm_mandr_r("pblendw", 179, src, dst);
1495 DO_imm_mandr_r("pblendw", 180, src, dst);
1496 DO_imm_mandr_r("pblendw", 181, src, dst);
1497 DO_imm_mandr_r("pblendw", 182, src, dst);
1498 DO_imm_mandr_r("pblendw", 183, src, dst);
1499 DO_imm_mandr_r("pblendw", 184, src, dst);
1500 DO_imm_mandr_r("pblendw", 185, src, dst);
1501 DO_imm_mandr_r("pblendw", 186, src, dst);
1502 DO_imm_mandr_r("pblendw", 187, src, dst);
1503 DO_imm_mandr_r("pblendw", 188, src, dst);
1504 DO_imm_mandr_r("pblendw", 189, src, dst);
1505 DO_imm_mandr_r("pblendw", 190, src, dst);
1506 DO_imm_mandr_r("pblendw", 191, src, dst);
1507 DO_imm_mandr_r("pblendw", 192, src, dst);
1508 DO_imm_mandr_r("pblendw", 193, src, dst);
1509 DO_imm_mandr_r("pblendw", 194, src, dst);
1510 DO_imm_mandr_r("pblendw", 195, src, dst);
1511 DO_imm_mandr_r("pblendw", 196, src, dst);
1512 DO_imm_mandr_r("pblendw", 197, src, dst);
1513 DO_imm_mandr_r("pblendw", 198, src, dst);
1514 DO_imm_mandr_r("pblendw", 199, src, dst);
1515 DO_imm_mandr_r("pblendw", 200, src, dst);
1516 DO_imm_mandr_r("pblendw", 201, src, dst);
1517 DO_imm_mandr_r("pblendw", 202, src, dst);
1518 DO_imm_mandr_r("pblendw", 203, src, dst);
1519 DO_imm_mandr_r("pblendw", 204, src, dst);
1520 DO_imm_mandr_r("pblendw", 205, src, dst);
1521 DO_imm_mandr_r("pblendw", 206, src, dst);
1522 DO_imm_mandr_r("pblendw", 207, src, dst);
1523 DO_imm_mandr_r("pblendw", 208, src, dst);
1524 DO_imm_mandr_r("pblendw", 209, src, dst);
1525 DO_imm_mandr_r("pblendw", 210, src, dst);
1526 DO_imm_mandr_r("pblendw", 211, src, dst);
1527 DO_imm_mandr_r("pblendw", 212, src, dst);
1528 DO_imm_mandr_r("pblendw", 213, src, dst);
1529 DO_imm_mandr_r("pblendw", 214, src, dst);
1530 DO_imm_mandr_r("pblendw", 215, src, dst);
1531 DO_imm_mandr_r("pblendw", 216, src, dst);
1532 DO_imm_mandr_r("pblendw", 217, src, dst);
1533 DO_imm_mandr_r("pblendw", 218, src, dst);
1534 DO_imm_mandr_r("pblendw", 219, src, dst);
1535 DO_imm_mandr_r("pblendw", 220, src, dst);
1536 DO_imm_mandr_r("pblendw", 221, src, dst);
1537 DO_imm_mandr_r("pblendw", 222, src, dst);
1538 DO_imm_mandr_r("pblendw", 223, src, dst);
1539 DO_imm_mandr_r("pblendw", 224, src, dst);
1540 DO_imm_mandr_r("pblendw", 225, src, dst);
1541 DO_imm_mandr_r("pblendw", 226, src, dst);
1542 DO_imm_mandr_r("pblendw", 227, src, dst);
1543 DO_imm_mandr_r("pblendw", 228, src, dst);
1544 DO_imm_mandr_r("pblendw", 229, src, dst);
1545 DO_imm_mandr_r("pblendw", 230, src, dst);
1546 DO_imm_mandr_r("pblendw", 231, src, dst);
1547 DO_imm_mandr_r("pblendw", 232, src, dst);
1548 DO_imm_mandr_r("pblendw", 233, src, dst);
1549 DO_imm_mandr_r("pblendw", 234, src, dst);
1550 DO_imm_mandr_r("pblendw", 235, src, dst);
1551 DO_imm_mandr_r("pblendw", 236, src, dst);
1552 DO_imm_mandr_r("pblendw", 237, src, dst);
1553 DO_imm_mandr_r("pblendw", 238, src, dst);
1554 DO_imm_mandr_r("pblendw", 239, src, dst);
1555 DO_imm_mandr_r("pblendw", 240, src, dst);
1556 DO_imm_mandr_r("pblendw", 241, src, dst);
1557 DO_imm_mandr_r("pblendw", 242, src, dst);
1558 DO_imm_mandr_r("pblendw", 243, src, dst);
1559 DO_imm_mandr_r("pblendw", 244, src, dst);
1560 DO_imm_mandr_r("pblendw", 245, src, dst);
1561 DO_imm_mandr_r("pblendw", 246, src, dst);
1562 DO_imm_mandr_r("pblendw", 247, src, dst);
1563 DO_imm_mandr_r("pblendw", 248, src, dst);
1564 DO_imm_mandr_r("pblendw", 249, src, dst);
1565 DO_imm_mandr_r("pblendw", 250, src, dst);
1566 DO_imm_mandr_r("pblendw", 251, src, dst);
1567 DO_imm_mandr_r("pblendw", 252, src, dst);
1568 DO_imm_mandr_r("pblendw", 253, src, dst);
1569 DO_imm_mandr_r("pblendw", 254, src, dst);
1570 DO_imm_mandr_r("pblendw", 255, src, dst);
/* Exercises pcmpeqq on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9.  Two special cases force equal data into both operands: case 0
   fills bytes 0-7 of src and dst with 0x55, case 1 fills bytes 8-15 of
   both with 0x55 -- presumably so the compare sees at least one matching
   quadword, which random data would be unlikely to produce.  The
   expression that selects the case is not visible here. */
1575 void test_PCMPEQQ ( void )
1579 for (i = 0; i < 10; i++) {
1583 case 0: memset(&src[0], 0x55, 8);
1584 memset(&dst[0], 0x55, 8); break;
1585 case 1: memset(&src[8], 0x55, 8);
1586 memset(&dst[8], 0x55, 8); break;
1590 DO_mandr_r("pcmpeqq", src, dst);
/* Exercises pextrb with immediates 0 through 15 on src via the
   DO_imm_r_to_mandrscalar macro.  The trailing "d" argument is presumably
   a register-name suffix selecting a 32-bit destination -- confirm against
   the macro definition. */
1595 void test_PEXTRB ( void )
1599 DO_imm_r_to_mandrscalar("pextrb", 0, src, "d");
1600 DO_imm_r_to_mandrscalar("pextrb", 1, src, "d");
1601 DO_imm_r_to_mandrscalar("pextrb", 2, src, "d");
1602 DO_imm_r_to_mandrscalar("pextrb", 3, src, "d");
1603 DO_imm_r_to_mandrscalar("pextrb", 4, src, "d");
1604 DO_imm_r_to_mandrscalar("pextrb", 5, src, "d");
1605 DO_imm_r_to_mandrscalar("pextrb", 6, src, "d");
1606 DO_imm_r_to_mandrscalar("pextrb", 7, src, "d");
1607 DO_imm_r_to_mandrscalar("pextrb", 8, src, "d");
1608 DO_imm_r_to_mandrscalar("pextrb", 9, src, "d");
1609 DO_imm_r_to_mandrscalar("pextrb", 10, src, "d");
1610 DO_imm_r_to_mandrscalar("pextrb", 11, src, "d");
1611 DO_imm_r_to_mandrscalar("pextrb", 12, src, "d");
1612 DO_imm_r_to_mandrscalar("pextrb", 13, src, "d");
1613 DO_imm_r_to_mandrscalar("pextrb", 14, src, "d");
1614 DO_imm_r_to_mandrscalar("pextrb", 15, src, "d");
/* Exercises pinsrb with immediates 0 through 15 on src via the
   DO_imm_mandrscalar_to_r macro.  The trailing "d" argument is presumably
   a register-name suffix selecting a 32-bit scalar source -- confirm
   against the macro definition. */
1617 void test_PINSRB ( void )
1621 DO_imm_mandrscalar_to_r("pinsrb", 0, src, "d");
1623 DO_imm_mandrscalar_to_r("pinsrb", 1, src, "d");
1625 DO_imm_mandrscalar_to_r("pinsrb", 2, src, "d");
1627 DO_imm_mandrscalar_to_r("pinsrb", 3, src, "d");
1629 DO_imm_mandrscalar_to_r("pinsrb", 4, src, "d");
1631 DO_imm_mandrscalar_to_r("pinsrb", 5, src, "d");
1633 DO_imm_mandrscalar_to_r("pinsrb", 6, src, "d");
1635 DO_imm_mandrscalar_to_r("pinsrb", 7, src, "d");
1637 DO_imm_mandrscalar_to_r("pinsrb", 8, src, "d");
1639 DO_imm_mandrscalar_to_r("pinsrb", 9, src, "d");
1641 DO_imm_mandrscalar_to_r("pinsrb", 10, src, "d");
1643 DO_imm_mandrscalar_to_r("pinsrb", 11, src, "d");
1645 DO_imm_mandrscalar_to_r("pinsrb", 12, src, "d");
1647 DO_imm_mandrscalar_to_r("pinsrb", 13, src, "d");
1649 DO_imm_mandrscalar_to_r("pinsrb", 14, src, "d");
1651 DO_imm_mandrscalar_to_r("pinsrb", 15, src, "d");
/* Exercises pextrw with immediates 0 through 7 on src via the
   DO_imm_r_to_mandrscalar macro.  The trailing "d" argument is presumably
   a register-name suffix -- confirm against the macro definition. */
1655 void test_PEXTRW ( void )
1659 DO_imm_r_to_mandrscalar("pextrw", 0, src, "d");
1660 DO_imm_r_to_mandrscalar("pextrw", 1, src, "d");
1661 DO_imm_r_to_mandrscalar("pextrw", 2, src, "d");
1662 DO_imm_r_to_mandrscalar("pextrw", 3, src, "d");
1663 DO_imm_r_to_mandrscalar("pextrw", 4, src, "d");
1664 DO_imm_r_to_mandrscalar("pextrw", 5, src, "d");
1665 DO_imm_r_to_mandrscalar("pextrw", 6, src, "d");
1666 DO_imm_r_to_mandrscalar("pextrw", 7, src, "d");
/* Exercises pinsrw with immediates 0 through 7 on src via the
   DO_imm_mandrscalar_to_r macro.  The trailing "d" argument is presumably
   a register-name suffix -- confirm against the macro definition. */
1669 void test_PINSRW ( void )
1673 DO_imm_mandrscalar_to_r("pinsrw", 0, src, "d");
1675 DO_imm_mandrscalar_to_r("pinsrw", 1, src, "d");
1677 DO_imm_mandrscalar_to_r("pinsrw", 2, src, "d");
1679 DO_imm_mandrscalar_to_r("pinsrw", 3, src, "d");
1681 DO_imm_mandrscalar_to_r("pinsrw", 4, src, "d");
1683 DO_imm_mandrscalar_to_r("pinsrw", 5, src, "d");
1685 DO_imm_mandrscalar_to_r("pinsrw", 6, src, "d");
1687 DO_imm_mandrscalar_to_r("pinsrw", 7, src, "d");
/* Exercises pextrd with immediates 0 through 3 on src via the
   DO_imm_r_to_mandrscalar macro.  The trailing "d" argument is presumably
   a register-name suffix -- confirm against the macro definition. */
1691 void test_PEXTRD ( void )
1695 DO_imm_r_to_mandrscalar("pextrd", 0, src, "d");
1696 DO_imm_r_to_mandrscalar("pextrd", 1, src, "d");
1697 DO_imm_r_to_mandrscalar("pextrd", 2, src, "d");
1698 DO_imm_r_to_mandrscalar("pextrd", 3, src, "d");
/* Exercises pinsrd with immediates 0 through 3 on src via the
   DO_imm_mandrscalar_to_r macro.  The trailing "d" argument is presumably
   a register-name suffix -- confirm against the macro definition. */
1701 void test_PINSRD ( void )
1705 DO_imm_mandrscalar_to_r("pinsrd", 0, src, "d");
1707 DO_imm_mandrscalar_to_r("pinsrd", 1, src, "d");
1709 DO_imm_mandrscalar_to_r("pinsrd", 2, src, "d");
1711 DO_imm_mandrscalar_to_r("pinsrd", 3, src, "d");
/* Exercises pextrq with immediates 0 and 1 on src via the
   DO_imm_r_to_mandrscalar macro.  The trailing argument is the empty
   string here; presumably that selects the full 64-bit register name --
   confirm against the macro definition. */
1715 void test_PEXTRQ ( void )
1719 DO_imm_r_to_mandrscalar("pextrq", 0, src, "");
1720 DO_imm_r_to_mandrscalar("pextrq", 1, src, "");
/* Exercises pinsrq with immediates 0 and 1 on src via the
   DO_imm_mandrscalar_to_r macro.  The trailing argument is the empty
   string here; presumably that selects the full 64-bit register name --
   confirm against the macro definition. */
1723 void test_PINSRQ ( void )
1727 DO_imm_mandrscalar_to_r("pinsrq", 0, src, "");
1729 DO_imm_mandrscalar_to_r("pinsrq", 1, src, "");
/* Exercises extractps with immediates 0 through 3 on src via the
   DO_imm_r_to_mandrscalar macro.  The trailing "d" argument is presumably
   a register-name suffix -- confirm against the macro definition. */
1733 void test_EXTRACTPS ( void )
1737 DO_imm_r_to_mandrscalar("extractps", 0, src, "d");
1738 DO_imm_r_to_mandrscalar("extractps", 1, src, "d");
1739 DO_imm_r_to_mandrscalar("extractps", 2, src, "d");
1740 DO_imm_r_to_mandrscalar("extractps", 3, src, "d");
/* Exercises phminposuw on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1744 void test_PHMINPOSUW ( void )
1748 for (i = 0; i < 10; i++) {
1751 DO_mandr_r("phminposuw", src, dst);
/* Exercises pmaxsb on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1755 void test_PMAXSB ( void )
1759 for (i = 0; i < 10; i++) {
1762 DO_mandr_r("pmaxsb", src, dst);
/* Exercises pmaxsd on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1766 void test_PMAXSD ( void )
1770 for (i = 0; i < 10; i++) {
1773 DO_mandr_r("pmaxsd", src, dst);
/* Exercises pmaxud on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1777 void test_PMAXUD ( void )
1781 for (i = 0; i < 10; i++) {
1784 DO_mandr_r("pmaxud", src, dst);
/* Exercises pmaxuw on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1788 void test_PMAXUW ( void )
1792 for (i = 0; i < 10; i++) {
1795 DO_mandr_r("pmaxuw", src, dst);
/* Exercises pminsb on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1799 void test_PMINSB ( void )
1803 for (i = 0; i < 10; i++) {
1806 DO_mandr_r("pminsb", src, dst);
/* Exercises pminsd on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1810 void test_PMINSD ( void )
1814 for (i = 0; i < 10; i++) {
1817 DO_mandr_r("pminsd", src, dst);
/* Exercises pminud on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1821 void test_PMINUD ( void )
1825 for (i = 0; i < 10; i++) {
1828 DO_mandr_r("pminud", src, dst);
/* Exercises pminuw on src/dst via DO_mandr_r, with a loop counting i from
   0 to 9 -- presumably one invocation per iteration. */
1832 void test_PMINUW ( void )
1836 for (i = 0; i < 10; i++) {
1839 DO_mandr_r("pminuw", src, dst);
/* Exercises pmovsxbw on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1843 void test_PMOVSXBW ( void )
1847 for (i = 0; i < 10; i++) {
1850 DO_mandr_r("pmovsxbw", src, dst);
/* Exercises pmovsxbd on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1854 void test_PMOVSXBD ( void )
1858 for (i = 0; i < 10; i++) {
1861 DO_mandr_r("pmovsxbd", src, dst);
/* Exercises pmovsxbq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1865 void test_PMOVSXBQ ( void )
1869 for (i = 0; i < 10; i++) {
1872 DO_mandr_r("pmovsxbq", src, dst);
/* Exercises pmovsxwd on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1876 void test_PMOVSXWD ( void )
1880 for (i = 0; i < 10; i++) {
1883 DO_mandr_r("pmovsxwd", src, dst);
/* Exercises pmovsxwq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1887 void test_PMOVSXWQ ( void )
1891 for (i = 0; i < 10; i++) {
1894 DO_mandr_r("pmovsxwq", src, dst);
/* Exercises pmovsxdq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1898 void test_PMOVSXDQ ( void )
1902 for (i = 0; i < 10; i++) {
1905 DO_mandr_r("pmovsxdq", src, dst);
/* Exercises pmovzxbw on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1909 void test_PMOVZXBW ( void )
1913 for (i = 0; i < 10; i++) {
1916 DO_mandr_r("pmovzxbw", src, dst);
/* Exercises pmovzxbd on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1920 void test_PMOVZXBD ( void )
1924 for (i = 0; i < 10; i++) {
1927 DO_mandr_r("pmovzxbd", src, dst);
/* Exercises pmovzxbq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1931 void test_PMOVZXBQ ( void )
1935 for (i = 0; i < 10; i++) {
1938 DO_mandr_r("pmovzxbq", src, dst);
/* Exercises pmovzxwd on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1942 void test_PMOVZXWD ( void )
1946 for (i = 0; i < 10; i++) {
1949 DO_mandr_r("pmovzxwd", src, dst);
/* Exercises pmovzxwq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1953 void test_PMOVZXWQ ( void )
1957 for (i = 0; i < 10; i++) {
1960 DO_mandr_r("pmovzxwq", src, dst);
/* Exercises pmovzxdq on src/dst via DO_mandr_r, with a loop counting i
   from 0 to 9 -- presumably one invocation per iteration. */
1964 void test_PMOVZXDQ ( void )
1968 for (i = 0; i < 10; i++) {
1971 DO_mandr_r("pmovzxdq", src, dst);
/* Driver for PMULDQ (signed multiply of packed dwords giving qword
   products): ten iterations, each handing src and dst to the DO_mandr_r
   macro under the mnemonic "pmuldq".  How src and dst are filled is not
   visible in this listing. */
1975 void test_PMULDQ ( void )
1979 for (i = 0; i < 10; i++) {
1982 DO_mandr_r("pmuldq", src, dst);
/* Driver for PMULLD (packed dword multiply keeping the low 32 bits of each
   product): ten iterations, each handing src and dst to the DO_mandr_r
   macro under the mnemonic "pmulld".  How src and dst are filled is not
   visible in this listing. */
1987 void test_PMULLD ( void )
1991 for (i = 0; i < 10; i++) {
1994 DO_mandr_r("pmulld", src, dst);
/* Driver for the 64-bit POPCNT.  Each of ten iterations runs a
   register-source case and then a memory-source case over a four-word
   scratch array:
     block[0]  source operand (forced to 0 on the first iteration),
     block[1]  value preloaded into the destination register r11,
     block[2]  r11 as stored after the instruction,
     block[3]  r12 as stored by the asm, printed through oszacp_mask.
   NOTE(review): the instructions that load r12 are not visible in this
   listing; presumably they capture rflags -- confirm.
   NOTE(review): both asm statements overwrite rax, but the visible trash
   lists do not name it -- confirm whether that is intended. */
1999 void test_POPCNTQ ( void )
/* 0x8D5 = bits 11,7,6,4,2,0, i.e. the O, S, Z, A, P and C flag positions. */
2003 ULong oszacp_mask = 0x8D5;
2004 for (i = 0; i < 10; i++) {
/* --- register source: operand is loaded into rdi first --- */
2005 block[0] = i == 0 ? 0 : randULong();
2006 block[1] = randULong();
2007 block[2] = randULong();
2008 block[3] = randULong();
2009 __asm__ __volatile__(
2010 "movq %0, %%rax" "\n\t"
2011 "movq 0(%%rax), %%rdi" "\n\t"
2012 "movq 8(%%rax), %%r11" "\n\t"
2013 #ifndef VGP_amd64_darwin
2014 "popcntq %%rdi, %%r11" "\n\t"
2016 "popcnt %%rdi, %%r11" "\n\t"
2018 "movq %%r11, 16(%%rax)" "\n\t"
2021 "movq %%r12, 24(%%rax)" "\n"
2023 : /*in*/"r"(&block[0])
2024 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2026 printf("r popcntq %016llx %016llx %016llx %016llx\n",
2027 block[0], block[1], block[2], block[3] & oszacp_mask);
/* --- memory source: operand is read directly from block[0] --- */
2029 block[0] = i == 0 ? 0 : randULong();
2030 block[1] = randULong();
2031 block[2] = randULong();
2032 block[3] = randULong();
2033 __asm__ __volatile__(
2034 "movq %0, %%rax" "\n\t"
2035 "movq 8(%%rax), %%r11" "\n\t"
2036 #ifndef VGP_amd64_darwin
2037 "popcntq 0(%%rax), %%r11" "\n\t"
2039 "popcnt 0(%%rax), %%r11" "\n\t"
2041 "movq %%r11, 16(%%rax)" "\n\t"
2044 "movq %%r12, 24(%%rax)" "\n"
2046 : /*in*/"r"(&block[0])
2047 : /*trash*/ "cc", "memory", "r11", "r12"
2049 printf("m popcntq %016llx %016llx %016llx %016llx\n",
2050 block[0], block[1], block[2], block[3] & oszacp_mask);
/* Driver for the 32-bit POPCNT.  Each of ten iterations runs a
   register-source case and then a memory-source case over a four-word
   scratch array:
     block[0]  source operand (forced to 0 on the first iteration),
     block[1]  64-bit value preloaded into r11,
     block[2]  the whole of r11 as stored after the instruction, so the
               printed value also shows what the 32-bit write left in the
               upper half,
     block[3]  r12 as stored by the asm, printed through oszacp_mask.
   NOTE(review): the instructions that load r12 are not visible in this
   listing; presumably they capture rflags -- confirm.
   NOTE(review): both asm statements overwrite rax, but the visible trash
   lists do not name it -- confirm whether that is intended. */
2055 void test_POPCNTL ( void )
/* 0x8D5 = bits 11,7,6,4,2,0, i.e. the O, S, Z, A, P and C flag positions. */
2059 ULong oszacp_mask = 0x8D5;
2060 for (i = 0; i < 10; i++) {
/* --- register source: operand is loaded into rdi, edi is counted --- */
2061 block[0] = i == 0 ? 0 : randULong();
2062 block[1] = randULong();
2063 block[2] = randULong();
2064 block[3] = randULong();
2065 __asm__ __volatile__(
2066 "movq %0, %%rax" "\n\t"
2067 "movq 0(%%rax), %%rdi" "\n\t"
2068 "movq 8(%%rax), %%r11" "\n\t"
2069 #ifndef VGP_amd64_darwin
2070 "popcntl %%edi, %%r11d" "\n\t"
2072 "popcnt %%edi, %%r11d" "\n\t"
2074 "movq %%r11, 16(%%rax)" "\n\t"
2077 "movq %%r12, 24(%%rax)" "\n"
2079 : /*in*/"r"(&block[0])
2080 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2082 printf("r popcntl %016llx %016llx %016llx %016llx\n",
2083 block[0], block[1], block[2], block[3] & oszacp_mask);
/* --- memory source: operand is read directly from block[0] --- */
2085 block[0] = i == 0 ? 0 : randULong();
2086 block[1] = randULong();
2087 block[2] = randULong();
2088 block[3] = randULong();
2089 __asm__ __volatile__(
2090 "movq %0, %%rax" "\n\t"
2091 "movq 8(%%rax), %%r11" "\n\t"
2092 #ifndef VGP_amd64_darwin
2093 "popcntl 0(%%rax), %%r11d" "\n\t"
2095 "popcnt 0(%%rax), %%r11d" "\n\t"
2097 "movq %%r11, 16(%%rax)" "\n\t"
2100 "movq %%r12, 24(%%rax)" "\n"
2102 : /*in*/"r"(&block[0])
2103 : /*trash*/ "cc", "memory", "r11", "r12"
2105 printf("m popcntl %016llx %016llx %016llx %016llx\n",
2106 block[0], block[1], block[2], block[3] & oszacp_mask);
/* Driver for the 16-bit POPCNT.  Each of ten iterations runs a
   register-source case and then a memory-source case over a four-word
   scratch array:
     block[0]  source operand (forced to 0 on the first iteration),
     block[1]  64-bit value preloaded into r11,
     block[2]  the whole of r11 as stored after the instruction, so the
               printed value also shows the bits above the 16-bit result,
     block[3]  r12 as stored by the asm, printed through oszacp_mask.
   NOTE(review): the instructions that load r12 are not visible in this
   listing; presumably they capture rflags -- confirm.
   NOTE(review): both asm statements overwrite rax, but the visible trash
   lists do not name it -- confirm whether that is intended. */
2111 void test_POPCNTW ( void )
/* 0x8D5 = bits 11,7,6,4,2,0, i.e. the O, S, Z, A, P and C flag positions. */
2115 ULong oszacp_mask = 0x8D5;
2116 for (i = 0; i < 10; i++) {
/* --- register source: operand is loaded into rdi, di is counted --- */
2117 block[0] = i == 0 ? 0 : randULong();
2118 block[1] = randULong();
2119 block[2] = randULong();
2120 block[3] = randULong();
2121 __asm__ __volatile__(
2122 "movq %0, %%rax" "\n\t"
2123 "movq 0(%%rax), %%rdi" "\n\t"
2124 "movq 8(%%rax), %%r11" "\n\t"
2125 #ifndef VGP_amd64_darwin
2126 "popcntw %%di, %%r11w" "\n\t"
2128 "popcnt %%di, %%r11w" "\n\t"
2130 "movq %%r11, 16(%%rax)" "\n\t"
2133 "movq %%r12, 24(%%rax)" "\n"
2135 : /*in*/"r"(&block[0])
2136 : /*trash*/ "cc", "memory", "rdi", "r11", "r12"
2138 printf("r popcntw %016llx %016llx %016llx %016llx\n",
2139 block[0], block[1], block[2], block[3] & oszacp_mask);
/* --- memory source: operand is read directly from block[0] --- */
2141 block[0] = i == 0 ? 0 : randULong();
2142 block[1] = randULong();
2143 block[2] = randULong();
2144 block[3] = randULong();
2145 __asm__ __volatile__(
2146 "movq %0, %%rax" "\n\t"
2147 "movq 8(%%rax), %%r11" "\n\t"
2148 #ifndef VGP_amd64_darwin
2149 "popcntw 0(%%rax), %%r11w" "\n\t"
2151 "popcnt 0(%%rax), %%r11w" "\n\t"
2153 "movq %%r11, 16(%%rax)" "\n\t"
2156 "movq %%r12, 24(%%rax)" "\n"
2158 : /*in*/"r"(&block[0])
2159 : /*trash*/ "cc", "memory", "r11", "r12"
2161 printf("m popcntw %016llx %016llx %016llx %016llx\n",
2162 block[0], block[1], block[2], block[3] & oszacp_mask);
/* Driver for PCMPGTQ (packed signed 64-bit greater-than compare).
   Two phases are visible: a ten-iteration loop over src/dst (their setup
   is not shown in this listing), followed by all 7x7 ordered pairs of the
   hand-picked vectors in spec[].  The spec values sit at and next to the
   64-bit extremes: 0, 1, 0x7fff..., 0x8000..., 0x8000...1, 0xffff...e,
   0xffff...f. */
2167 void test_PCMPGTQ ( void )
/* do64HLtoV128(res, hi, lo): high qword first, low qword second. */
2170 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0xffffffffffffffffULL );
2171 do64HLtoV128( &spec[1], 0x0000000000000001ULL, 0xfffffffffffffffeULL );
2172 do64HLtoV128( &spec[2], 0x7fffffffffffffffULL, 0x8000000000000001ULL );
2173 do64HLtoV128( &spec[3], 0x8000000000000000ULL, 0x8000000000000000ULL );
2174 do64HLtoV128( &spec[4], 0x8000000000000001ULL, 0x7fffffffffffffffULL );
2175 do64HLtoV128( &spec[5], 0xfffffffffffffffeULL, 0x0000000000000001ULL );
2176 do64HLtoV128( &spec[6], 0xffffffffffffffffULL, 0x0000000000000000ULL );
2180 for (i = 0; i < 10; i++) {
2183 DO_mandr_r("pcmpgtq", src, dst);
/* Exhaustive cross product of the special values: src = spec[i],
   dst = spec[j]. */
2185 for (i = 0; i < 7; i++) {
2186 for (j = 0; j < 7; j++) {
2187 memcpy(&src, &spec[i], 16);
2188 memcpy(&dst, &spec[j], 16);
2189 DO_mandr_r("pcmpgtq", src, dst);
2194 /* ------------ ROUNDSD ------------ */
/* ROUNDSD with immediate 0 (imm8[1:0]=00: round to nearest, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2196 void do_ROUNDSD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2199 __asm__ __volatile__(
2200 "movupd (%1), %%xmm11" "\n\t"
2201 "roundsd $0, (%0), %%xmm11" "\n\t"
2202 "movupd %%xmm11, (%1)" "\n"
2204 : /*IN*/ "r"(src), "r"(dst)
2208 __asm__ __volatile__(
2209 "movupd (%1), %%xmm11" "\n\t"
2210 "movupd (%0), %%xmm2" "\n\t"
2211 "roundsd $0, %%xmm2, %%xmm11" "\n\t"
2212 "movupd %%xmm11, (%1)" "\n"
2214 : /*IN*/ "r"(src), "r"(dst)
2215 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSD with immediate 1 (imm8[1:0]=01: round toward -infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2220 void do_ROUNDSD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2223 __asm__ __volatile__(
2224 "movupd (%1), %%xmm11" "\n\t"
2225 "roundsd $1, (%0), %%xmm11" "\n\t"
2226 "movupd %%xmm11, (%1)" "\n"
2228 : /*IN*/ "r"(src), "r"(dst)
2232 __asm__ __volatile__(
2233 "movupd (%1), %%xmm11" "\n\t"
2234 "movupd (%0), %%xmm2" "\n\t"
2235 "roundsd $1, %%xmm2, %%xmm11" "\n\t"
2236 "movupd %%xmm11, (%1)" "\n"
2238 : /*IN*/ "r"(src), "r"(dst)
2239 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSD with immediate 2 (imm8[1:0]=10: round toward +infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2244 void do_ROUNDSD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2247 __asm__ __volatile__(
2248 "movupd (%1), %%xmm11" "\n\t"
2249 "roundsd $2, (%0), %%xmm11" "\n\t"
2250 "movupd %%xmm11, (%1)" "\n"
2252 : /*IN*/ "r"(src), "r"(dst)
2256 __asm__ __volatile__(
2257 "movupd (%1), %%xmm11" "\n\t"
2258 "movupd (%0), %%xmm2" "\n\t"
2259 "roundsd $2, %%xmm2, %%xmm11" "\n\t"
2260 "movupd %%xmm11, (%1)" "\n"
2262 : /*IN*/ "r"(src), "r"(dst)
2263 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSD with immediate 3 (imm8[1:0]=11: round toward zero, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2268 void do_ROUNDSD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2271 __asm__ __volatile__(
2272 "movupd (%1), %%xmm11" "\n\t"
2273 "roundsd $3, (%0), %%xmm11" "\n\t"
2274 "movupd %%xmm11, (%1)" "\n"
2276 : /*IN*/ "r"(src), "r"(dst)
2280 __asm__ __volatile__(
2281 "movupd (%1), %%xmm11" "\n\t"
2282 "movupd (%0), %%xmm2" "\n\t"
2283 "roundsd $3, %%xmm2, %%xmm11" "\n\t"
2284 "movupd %%xmm11, (%1)" "\n"
2286 : /*IN*/ "r"(src), "r"(dst)
2287 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSD with immediate 4 (imm8[2]=1: take the rounding mode from MXCSR
   rather than from the immediate, per the Intel encoding).  Both asm
   statements load *dst into xmm11, apply the instruction, and store xmm11
   back to *dst.  The first reads the source straight from memory at src;
   the second first copies *src into xmm2.  Callers pass mem=True for the
   memory form and mem=False for the register form; the branch on `mem`
   itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2292 void do_ROUNDSD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2295 __asm__ __volatile__(
2296 "movupd (%1), %%xmm11" "\n\t"
2297 "roundsd $4, (%0), %%xmm11" "\n\t"
2298 "movupd %%xmm11, (%1)" "\n"
2300 : /*IN*/ "r"(src), "r"(dst)
2304 __asm__ __volatile__(
2305 "movupd (%1), %%xmm11" "\n\t"
2306 "movupd (%0), %%xmm2" "\n\t"
2307 "roundsd $4, %%xmm2, %%xmm11" "\n\t"
2308 "movupd %%xmm11, (%1)" "\n"
2310 : /*IN*/ "r"(src), "r"(dst)
2311 : /*TRASH*/ "xmm11","xmm2"
/* Runs ROUNDSD with each immediate-selected rounding mode (helpers 000,
   001, 010, 011) over every entry of vals[], register form first and then
   memory form.  For each case the low 8 bytes of src receive vals[i], the
   helper is called, and the input is printed next to the low double of
   dst.  Only part of the vals[] initialisation (infinities, NaNs and
   values just either side of +/-0.5) appears in this listing.
   NOTE(review): the result is read via *(double*)&dst[0] on a byte array;
   a memcpy into a double would avoid aliasing/alignment questions. */
2316 void test_ROUNDSD_w_immediate_rounding ( void )
2322 vals[i++] = mkPosInf();
2323 vals[i++] = mkNegInf();
2324 vals[i++] = mkPosNan();
2325 vals[i++] = mkNegNan();
2330 vals[i++] = -0.50001;
2331 vals[i++] = -0.49999;
2336 vals[i++] = 0.49999;
2337 vals[i++] = 0.50001;
2344 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
/* --- immediate 000 --- */
2349 memcpy(&src[0], &vals[i], 8);
2350 do_ROUNDSD_000(False/*reg*/, &src, &dst);
2351 printf("r roundsd_000 ");
2355 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2360 memcpy(&src[0], &vals[i], 8);
2361 do_ROUNDSD_000(True/*mem*/, &src, &dst);
2362 printf("m roundsd_000 ");
2366 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
/* --- immediate 001 --- */
2372 memcpy(&src[0], &vals[i], 8);
2373 do_ROUNDSD_001(False/*reg*/, &src, &dst);
2374 printf("r roundsd_001 ");
2378 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2383 memcpy(&src[0], &vals[i], 8);
2384 do_ROUNDSD_001(True/*mem*/, &src, &dst);
2385 printf("m roundsd_001 ");
2389 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
/* --- immediate 010 --- */
2395 memcpy(&src[0], &vals[i], 8);
2396 do_ROUNDSD_010(False/*reg*/, &src, &dst);
2397 printf("r roundsd_010 ");
2401 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2406 memcpy(&src[0], &vals[i], 8);
2407 do_ROUNDSD_010(True/*mem*/, &src, &dst);
2408 printf("m roundsd_010 ");
2412 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
/* --- immediate 011 --- */
2418 memcpy(&src[0], &vals[i], 8);
2419 do_ROUNDSD_011(False/*reg*/, &src, &dst);
2420 printf("r roundsd_011 ");
2424 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2429 memcpy(&src[0], &vals[i], 8);
2430 do_ROUNDSD_011(True/*mem*/, &src, &dst);
2431 printf("m roundsd_011 ");
2435 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
/* Runs ROUNDSD in its MXCSR-controlled form (do_ROUNDSD_1XX) for every
   entry of vals[] under each rounding-mode setting 0..3 installed with
   set_sse_roundingmode, register form first and then memory form.  The
   function asserts that the mode is 0 on entry, and before returning sets
   it back to 0 and asserts that again.  Only part of the vals[]
   initialisation appears in this listing. */
2440 void test_ROUNDSD_w_mxcsr_rounding ( void )
2447 vals[i++] = mkPosInf();
2448 vals[i++] = mkNegInf();
2449 vals[i++] = mkPosNan();
2450 vals[i++] = mkNegNan();
2455 vals[i++] = -0.50001;
2456 vals[i++] = -0.49999;
2461 vals[i++] = 0.49999;
2462 vals[i++] = 0.50001;
/* Precondition: the test starts from the default rounding mode. */
2469 rm = get_sse_roundingmode();
2470 assert(rm == 0); // 0 == RN == default
2472 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2475 for (rm = 0; rm <= 3; rm++) {
2476 set_sse_roundingmode(rm);
2480 memcpy(&src[0], &vals[i], 8);
2481 do_ROUNDSD_1XX(False/*reg*/, &src, &dst);
2482 printf("r (rm=%u) roundsd_1XX ", rm);
2486 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
2491 memcpy(&src[0], &vals[i], 8);
2492 do_ROUNDSD_1XX(True/*mem*/, &src, &dst);
2493 printf("m (rm=%u) roundsd_1XX ", rm);
2497 printf(" %10f %10f", vals[i], *(double*)(&dst[0]));
/* Leave the rounding mode as it was found. */
2502 rm = get_sse_roundingmode();
2504 set_sse_roundingmode(0);
2505 rm = get_sse_roundingmode();
2506 assert(rm == 0); // 0 == RN == default
2510 /* ------------ ROUNDSS ------------ */
/* ROUNDSS with immediate 0 (imm8[1:0]=00: round to nearest, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2512 void do_ROUNDSS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2515 __asm__ __volatile__(
2516 "movupd (%1), %%xmm11" "\n\t"
2517 "roundss $0, (%0), %%xmm11" "\n\t"
2518 "movupd %%xmm11, (%1)" "\n"
2520 : /*IN*/ "r"(src), "r"(dst)
2524 __asm__ __volatile__(
2525 "movupd (%1), %%xmm11" "\n\t"
2526 "movupd (%0), %%xmm2" "\n\t"
2527 "roundss $0, %%xmm2, %%xmm11" "\n\t"
2528 "movupd %%xmm11, (%1)" "\n"
2530 : /*IN*/ "r"(src), "r"(dst)
2531 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSS with immediate 1 (imm8[1:0]=01: round toward -infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2536 void do_ROUNDSS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2539 __asm__ __volatile__(
2540 "movupd (%1), %%xmm11" "\n\t"
2541 "roundss $1, (%0), %%xmm11" "\n\t"
2542 "movupd %%xmm11, (%1)" "\n"
2544 : /*IN*/ "r"(src), "r"(dst)
2548 __asm__ __volatile__(
2549 "movupd (%1), %%xmm11" "\n\t"
2550 "movupd (%0), %%xmm2" "\n\t"
2551 "roundss $1, %%xmm2, %%xmm11" "\n\t"
2552 "movupd %%xmm11, (%1)" "\n"
2554 : /*IN*/ "r"(src), "r"(dst)
2555 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSS with immediate 2 (imm8[1:0]=10: round toward +infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2560 void do_ROUNDSS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2563 __asm__ __volatile__(
2564 "movupd (%1), %%xmm11" "\n\t"
2565 "roundss $2, (%0), %%xmm11" "\n\t"
2566 "movupd %%xmm11, (%1)" "\n"
2568 : /*IN*/ "r"(src), "r"(dst)
2572 __asm__ __volatile__(
2573 "movupd (%1), %%xmm11" "\n\t"
2574 "movupd (%0), %%xmm2" "\n\t"
2575 "roundss $2, %%xmm2, %%xmm11" "\n\t"
2576 "movupd %%xmm11, (%1)" "\n"
2578 : /*IN*/ "r"(src), "r"(dst)
2579 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSS with immediate 3 (imm8[1:0]=11: round toward zero, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2584 void do_ROUNDSS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2587 __asm__ __volatile__(
2588 "movupd (%1), %%xmm11" "\n\t"
2589 "roundss $3, (%0), %%xmm11" "\n\t"
2590 "movupd %%xmm11, (%1)" "\n"
2592 : /*IN*/ "r"(src), "r"(dst)
2596 __asm__ __volatile__(
2597 "movupd (%1), %%xmm11" "\n\t"
2598 "movupd (%0), %%xmm2" "\n\t"
2599 "roundss $3, %%xmm2, %%xmm11" "\n\t"
2600 "movupd %%xmm11, (%1)" "\n"
2602 : /*IN*/ "r"(src), "r"(dst)
2603 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDSS with immediate 4 (imm8[2]=1: take the rounding mode from MXCSR
   rather than from the immediate, per the Intel encoding).  Both asm
   statements load *dst into xmm11, apply the instruction, and store xmm11
   back to *dst.  The first reads the source straight from memory at src;
   the second first copies *src into xmm2.  Callers pass mem=True for the
   memory form and mem=False for the register form; the branch on `mem`
   itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm. */
2608 void do_ROUNDSS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2611 __asm__ __volatile__(
2612 "movupd (%1), %%xmm11" "\n\t"
2613 "roundss $4, (%0), %%xmm11" "\n\t"
2614 "movupd %%xmm11, (%1)" "\n"
2616 : /*IN*/ "r"(src), "r"(dst)
2620 __asm__ __volatile__(
2621 "movupd (%1), %%xmm11" "\n\t"
2622 "movupd (%0), %%xmm2" "\n\t"
2623 "roundss $4, %%xmm2, %%xmm11" "\n\t"
2624 "movupd %%xmm11, (%1)" "\n"
2626 : /*IN*/ "r"(src), "r"(dst)
2627 : /*TRASH*/ "xmm11","xmm2"
/* Runs ROUNDSS with each immediate-selected rounding mode (helpers 000,
   001, 010, 011) over every entry of vals[], register form first and then
   memory form.  For each case the low 4 bytes of src receive vals[i], the
   helper is called, and the input is printed next to the low float of
   dst (both widened to double for printf).  Only part of the vals[]
   initialisation appears in this listing.
   NOTE(review): the result is read via *(float*)&dst[0] on a byte array;
   a memcpy into a float would avoid aliasing/alignment questions. */
2632 void test_ROUNDSS_w_immediate_rounding ( void )
2638 vals[i++] = mkPosInf();
2639 vals[i++] = mkNegInf();
2640 vals[i++] = mkPosNan();
2641 vals[i++] = mkNegNan();
2646 vals[i++] = -0.50001;
2647 vals[i++] = -0.49999;
2652 vals[i++] = 0.49999;
2653 vals[i++] = 0.50001;
2660 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
/* --- immediate 000 --- */
2665 memcpy(&src[0], &vals[i], 4);
2666 do_ROUNDSS_000(False/*reg*/, &src, &dst);
2667 printf("r roundss_000 ");
2671 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2676 memcpy(&src[0], &vals[i], 4);
2677 do_ROUNDSS_000(True/*mem*/, &src, &dst);
2678 printf("m roundss_000 ");
2682 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
/* --- immediate 001 --- */
2688 memcpy(&src[0], &vals[i], 4);
2689 do_ROUNDSS_001(False/*reg*/, &src, &dst);
2690 printf("r roundss_001 ");
2694 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2699 memcpy(&src[0], &vals[i], 4);
2700 do_ROUNDSS_001(True/*mem*/, &src, &dst);
2701 printf("m roundss_001 ");
2705 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
/* --- immediate 010 --- */
2711 memcpy(&src[0], &vals[i], 4);
2712 do_ROUNDSS_010(False/*reg*/, &src, &dst);
2713 printf("r roundss_010 ");
2717 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2722 memcpy(&src[0], &vals[i], 4);
2723 do_ROUNDSS_010(True/*mem*/, &src, &dst);
2724 printf("m roundss_010 ");
2728 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
/* --- immediate 011 --- */
2734 memcpy(&src[0], &vals[i], 4);
2735 do_ROUNDSS_011(False/*reg*/, &src, &dst);
2736 printf("r roundss_011 ");
2740 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2745 memcpy(&src[0], &vals[i], 4);
2746 do_ROUNDSS_011(True/*mem*/, &src, &dst);
2747 printf("m roundss_011 ");
2751 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
/* Runs ROUNDSS in its MXCSR-controlled form (do_ROUNDSS_1XX) for every
   entry of vals[] under each rounding-mode setting 0..3 installed with
   set_sse_roundingmode, register form first and then memory form.  The
   function asserts that the mode is 0 on entry, and before returning sets
   it back to 0 and asserts that again.  Only part of the vals[]
   initialisation appears in this listing. */
2756 void test_ROUNDSS_w_mxcsr_rounding ( void )
2763 vals[i++] = mkPosInf();
2764 vals[i++] = mkNegInf();
2765 vals[i++] = mkPosNan();
2766 vals[i++] = mkNegNan();
2771 vals[i++] = -0.50001;
2772 vals[i++] = -0.49999;
2777 vals[i++] = 0.49999;
2778 vals[i++] = 0.50001;
/* Precondition: the test starts from the default rounding mode. */
2785 rm = get_sse_roundingmode();
2786 assert(rm == 0); // 0 == RN == default
2788 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
2791 for (rm = 0; rm <= 3; rm++) {
2792 set_sse_roundingmode(rm);
2796 memcpy(&src[0], &vals[i], 4);
2797 do_ROUNDSS_1XX(False/*reg*/, &src, &dst);
2798 printf("r (rm=%u) roundss_1XX ", rm);
2802 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
2807 memcpy(&src[0], &vals[i], 4);
2808 do_ROUNDSS_1XX(True/*mem*/, &src, &dst);
2809 printf("m (rm=%u) roundss_1XX ", rm);
2813 printf(" %10f %10f", (double)vals[i], (double)*(float*)(&dst[0]));
/* Leave the rounding mode as it was found. */
2818 rm = get_sse_roundingmode();
2820 set_sse_roundingmode(0);
2821 rm = get_sse_roundingmode();
2822 assert(rm == 0); // 0 == RN == default
2825 /* ------------ ROUNDPD ------------ */
/* ROUNDPD with immediate 0 (imm8[1:0]=00: round to nearest, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
2827 void do_ROUNDPD_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
2830 __asm__ __volatile__(
2831 "movupd (%1), %%xmm11" "\n\t"
2832 "roundpd $0, (%0), %%xmm11" "\n\t"
2833 "movupd %%xmm11, (%1)" "\n"
2835 : /*IN*/ "r"(src), "r"(dst)
2839 __asm__ __volatile__(
2840 "movupd (%1), %%xmm11" "\n\t"
2841 "movupd (%0), %%xmm2" "\n\t"
2842 "roundpd $0, %%xmm2, %%xmm11" "\n\t"
2843 "movupd %%xmm11, (%1)" "\n"
2845 : /*IN*/ "r"(src), "r"(dst)
2846 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPD with immediate 1 (imm8[1:0]=01: round toward -infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
2851 void do_ROUNDPD_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
2854 __asm__ __volatile__(
2855 "movupd (%1), %%xmm11" "\n\t"
2856 "roundpd $1, (%0), %%xmm11" "\n\t"
2857 "movupd %%xmm11, (%1)" "\n"
2859 : /*IN*/ "r"(src), "r"(dst)
2863 __asm__ __volatile__(
2864 "movupd (%1), %%xmm11" "\n\t"
2865 "movupd (%0), %%xmm2" "\n\t"
2866 "roundpd $1, %%xmm2, %%xmm11" "\n\t"
2867 "movupd %%xmm11, (%1)" "\n"
2869 : /*IN*/ "r"(src), "r"(dst)
2870 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPD with immediate 2 (imm8[1:0]=10: round toward +infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
2875 void do_ROUNDPD_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
2878 __asm__ __volatile__(
2879 "movupd (%1), %%xmm11" "\n\t"
2880 "roundpd $2, (%0), %%xmm11" "\n\t"
2881 "movupd %%xmm11, (%1)" "\n"
2883 : /*IN*/ "r"(src), "r"(dst)
2887 __asm__ __volatile__(
2888 "movupd (%1), %%xmm11" "\n\t"
2889 "movupd (%0), %%xmm2" "\n\t"
2890 "roundpd $2, %%xmm2, %%xmm11" "\n\t"
2891 "movupd %%xmm11, (%1)" "\n"
2893 : /*IN*/ "r"(src), "r"(dst)
2894 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPD with immediate 3 (imm8[1:0]=11: round toward zero, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
2899 void do_ROUNDPD_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
2902 __asm__ __volatile__(
2903 "movupd (%1), %%xmm11" "\n\t"
2904 "roundpd $3, (%0), %%xmm11" "\n\t"
2905 "movupd %%xmm11, (%1)" "\n"
2907 : /*IN*/ "r"(src), "r"(dst)
2911 __asm__ __volatile__(
2912 "movupd (%1), %%xmm11" "\n\t"
2913 "movupd (%0), %%xmm2" "\n\t"
2914 "roundpd $3, %%xmm2, %%xmm11" "\n\t"
2915 "movupd %%xmm11, (%1)" "\n"
2917 : /*IN*/ "r"(src), "r"(dst)
2918 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPD with immediate 4 (imm8[2]=1: take the rounding mode from MXCSR
   rather than from the immediate, per the Intel encoding).  Both asm
   statements load *dst into xmm11, apply the instruction, and store xmm11
   back to *dst.  The first reads the source straight from memory at src;
   the second first copies *src into xmm2.  Callers pass mem=True for the
   memory form and mem=False for the register form; the branch on `mem`
   itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
2923 void do_ROUNDPD_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
2926 __asm__ __volatile__(
2927 "movupd (%1), %%xmm11" "\n\t"
2928 "roundpd $4, (%0), %%xmm11" "\n\t"
2929 "movupd %%xmm11, (%1)" "\n"
2931 : /*IN*/ "r"(src), "r"(dst)
2935 __asm__ __volatile__(
2936 "movupd (%1), %%xmm11" "\n\t"
2937 "movupd (%0), %%xmm2" "\n\t"
2938 "roundpd $4, %%xmm2, %%xmm11" "\n\t"
2939 "movupd %%xmm11, (%1)" "\n"
2941 : /*IN*/ "r"(src), "r"(dst)
2942 : /*TRASH*/ "xmm11","xmm2"
/* Runs ROUNDPD with each immediate-selected rounding mode (helpers 000,
   001, 010, 011) over every entry of vals[], register form first and then
   memory form.  Each case packs two doubles into src -- vals[i] in bytes
   0..7 and vals[(i+11)%22] in bytes 8..15 -- calls the helper, and prints
   each input next to the matching double of dst.  Only part of the vals[]
   initialisation appears in this listing.
   NOTE(review): the modulus 22 is hard-coded; presumably it equals the
   element count of vals[] -- confirm against the declaration.
   NOTE(review): results are read via *(double*)&dst[k] on a byte array;
   a memcpy into a double would avoid aliasing/alignment questions. */
2947 void test_ROUNDPD_w_immediate_rounding ( void )
2953 vals[i++] = mkPosInf();
2954 vals[i++] = mkNegInf();
2955 vals[i++] = mkPosNan();
2956 vals[i++] = mkNegNan();
2961 vals[i++] = -0.50001;
2962 vals[i++] = -0.49999;
2967 vals[i++] = 0.49999;
2968 vals[i++] = 0.50001;
2975 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
/* --- immediate 000 --- */
2980 memcpy(&src[0], &vals[i], 8);
2981 memcpy(&src[8], &vals[(i+11)%22], 8);
2982 do_ROUNDPD_000(False/*reg*/, &src, &dst);
2983 printf("r roundpd_000 ");
2987 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
2988 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
2993 memcpy(&src[0], &vals[i], 8);
2994 memcpy(&src[8], &vals[(i+11)%22], 8);
2995 do_ROUNDPD_000(True/*mem*/, &src, &dst);
2996 printf("m roundpd_000 ");
3000 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3001 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
/* --- immediate 001 --- */
3007 memcpy(&src[0], &vals[i], 8);
3008 memcpy(&src[8], &vals[(i+11)%22], 8);
3009 do_ROUNDPD_001(False/*reg*/, &src, &dst);
3010 printf("r roundpd_001 ");
3014 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3015 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3020 memcpy(&src[0], &vals[i], 8);
3021 memcpy(&src[8], &vals[(i+11)%22], 8);
3022 do_ROUNDPD_001(True/*mem*/, &src, &dst);
3023 printf("m roundpd_001 ");
3027 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3028 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
/* --- immediate 010 --- */
3034 memcpy(&src[0], &vals[i], 8);
3035 memcpy(&src[8], &vals[(i+11)%22], 8);
3036 do_ROUNDPD_010(False/*reg*/, &src, &dst);
3037 printf("r roundpd_010 ");
3041 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3042 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3047 memcpy(&src[0], &vals[i], 8);
3048 memcpy(&src[8], &vals[(i+11)%22], 8);
3049 do_ROUNDPD_010(True/*mem*/, &src, &dst);
3050 printf("m roundpd_010 ");
3054 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3055 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
/* --- immediate 011 --- */
3061 memcpy(&src[0], &vals[i], 8);
3062 memcpy(&src[8], &vals[(i+11)%22], 8);
3063 do_ROUNDPD_011(False/*reg*/, &src, &dst);
3064 printf("r roundpd_011 ");
3068 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3069 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3074 memcpy(&src[0], &vals[i], 8);
3075 memcpy(&src[8], &vals[(i+11)%22], 8);
3076 do_ROUNDPD_011(True/*mem*/, &src, &dst);
3077 printf("m roundpd_011 ");
3081 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3082 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
/* Runs ROUNDPD in its MXCSR-controlled form (do_ROUNDPD_1XX) for every
   entry of vals[] under each rounding-mode setting 0..3 installed with
   set_sse_roundingmode, register form first and then memory form.  Each
   case packs vals[i] into bytes 0..7 of src and vals[(i+11)%22] into
   bytes 8..15.  The function asserts that the mode is 0 on entry, and
   before returning sets it back to 0 and asserts that again.  Only part
   of the vals[] initialisation appears in this listing.
   NOTE(review): the modulus 22 is hard-coded; presumably it equals the
   element count of vals[] -- confirm against the declaration. */
3087 void test_ROUNDPD_w_mxcsr_rounding ( void )
3094 vals[i++] = mkPosInf();
3095 vals[i++] = mkNegInf();
3096 vals[i++] = mkPosNan();
3097 vals[i++] = mkNegNan();
3102 vals[i++] = -0.50001;
3103 vals[i++] = -0.49999;
3108 vals[i++] = 0.49999;
3109 vals[i++] = 0.50001;
/* Precondition: the test starts from the default rounding mode. */
3116 rm = get_sse_roundingmode();
3117 assert(rm == 0); // 0 == RN == default
3119 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3122 for (rm = 0; rm <= 3; rm++) {
3123 set_sse_roundingmode(rm);
3127 memcpy(&src[0], &vals[i], 8);
3128 memcpy(&src[8], &vals[(i+11)%22], 8);
3129 do_ROUNDPD_1XX(False/*reg*/, &src, &dst);
3130 printf("r (rm=%u) roundpd_1XX ", rm);
3134 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3135 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
3140 memcpy(&src[0], &vals[i], 8);
3141 memcpy(&src[8], &vals[(i+11)%22], 8);
3142 do_ROUNDPD_1XX(True/*mem*/, &src, &dst);
3143 printf("m (rm=%u) roundpd_1XX ", rm);
3147 printf(" %10f -> %10f", vals[i], *(double*)(&dst[0]));
3148 printf(" %10f -> %10f", vals[(i+11)%22], *(double*)(&dst[8]));
/* Leave the rounding mode as it was found. */
3153 rm = get_sse_roundingmode();
3155 set_sse_roundingmode(0);
3156 rm = get_sse_roundingmode();
3157 assert(rm == 0); // 0 == RN == default
3160 /* ------------ ROUNDPS ------------ */
/* ROUNDPS with immediate 0 (imm8[1:0]=00: round to nearest, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
3162 void do_ROUNDPS_000 ( Bool mem, V128* src, /*OUT*/V128* dst )
3165 __asm__ __volatile__(
3166 "movupd (%1), %%xmm11" "\n\t"
3167 "roundps $0, (%0), %%xmm11" "\n\t"
3168 "movupd %%xmm11, (%1)" "\n"
3170 : /*IN*/ "r"(src), "r"(dst)
3174 __asm__ __volatile__(
3175 "movupd (%1), %%xmm11" "\n\t"
3176 "movupd (%0), %%xmm2" "\n\t"
3177 "roundps $0, %%xmm2, %%xmm11" "\n\t"
3178 "movupd %%xmm11, (%1)" "\n"
3180 : /*IN*/ "r"(src), "r"(dst)
3181 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPS with immediate 1 (imm8[1:0]=01: round toward -infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
3186 void do_ROUNDPS_001 ( Bool mem, V128* src, /*OUT*/V128* dst )
3189 __asm__ __volatile__(
3190 "movupd (%1), %%xmm11" "\n\t"
3191 "roundps $1, (%0), %%xmm11" "\n\t"
3192 "movupd %%xmm11, (%1)" "\n"
3194 : /*IN*/ "r"(src), "r"(dst)
3198 __asm__ __volatile__(
3199 "movupd (%1), %%xmm11" "\n\t"
3200 "movupd (%0), %%xmm2" "\n\t"
3201 "roundps $1, %%xmm2, %%xmm11" "\n\t"
3202 "movupd %%xmm11, (%1)" "\n"
3204 : /*IN*/ "r"(src), "r"(dst)
3205 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPS with immediate 2 (imm8[1:0]=10: round toward +infinity, per the
   Intel encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
3210 void do_ROUNDPS_010 ( Bool mem, V128* src, /*OUT*/V128* dst )
3213 __asm__ __volatile__(
3214 "movupd (%1), %%xmm11" "\n\t"
3215 "roundps $2, (%0), %%xmm11" "\n\t"
3216 "movupd %%xmm11, (%1)" "\n"
3218 : /*IN*/ "r"(src), "r"(dst)
3222 __asm__ __volatile__(
3223 "movupd (%1), %%xmm11" "\n\t"
3224 "movupd (%0), %%xmm2" "\n\t"
3225 "roundps $2, %%xmm2, %%xmm11" "\n\t"
3226 "movupd %%xmm11, (%1)" "\n"
3228 : /*IN*/ "r"(src), "r"(dst)
3229 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPS with immediate 3 (imm8[1:0]=11: round toward zero, per the Intel
   encoding).  Both asm statements load *dst into xmm11, apply the
   instruction, and store xmm11 back to *dst.  The first reads the source
   straight from memory at src; the second first copies *src into xmm2.
   Callers pass mem=True for the memory form and mem=False for the register
   form; the branch on `mem` itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
3234 void do_ROUNDPS_011 ( Bool mem, V128* src, /*OUT*/V128* dst )
3237 __asm__ __volatile__(
3238 "movupd (%1), %%xmm11" "\n\t"
3239 "roundps $3, (%0), %%xmm11" "\n\t"
3240 "movupd %%xmm11, (%1)" "\n"
3242 : /*IN*/ "r"(src), "r"(dst)
3246 __asm__ __volatile__(
3247 "movupd (%1), %%xmm11" "\n\t"
3248 "movupd (%0), %%xmm2" "\n\t"
3249 "roundps $3, %%xmm2, %%xmm11" "\n\t"
3250 "movupd %%xmm11, (%1)" "\n"
3252 : /*IN*/ "r"(src), "r"(dst)
3253 : /*TRASH*/ "xmm11","xmm2"
/* ROUNDPS with immediate 4 (imm8[2]=1: take the rounding mode from MXCSR
   rather than from the immediate, per the Intel encoding).  Both asm
   statements load *dst into xmm11, apply the instruction, and store xmm11
   back to *dst.  The first reads the source straight from memory at src;
   the second first copies *src into xmm2.  Callers pass mem=True for the
   memory form and mem=False for the register form; the branch on `mem`
   itself is not visible in this listing.
   NOTE(review): the asm stores through dst, yet the visible trash list
   names only xmm registers (no "memory") -- confirm.
   NOTE(review): the memory form uses src as a 128-bit memory operand;
   confirm callers pass a 16-byte-aligned V128. */
3258 void do_ROUNDPS_1XX ( Bool mem, V128* src, /*OUT*/V128* dst )
3261 __asm__ __volatile__(
3262 "movupd (%1), %%xmm11" "\n\t"
3263 "roundps $4, (%0), %%xmm11" "\n\t"
3264 "movupd %%xmm11, (%1)" "\n"
3266 : /*IN*/ "r"(src), "r"(dst)
3270 __asm__ __volatile__(
3271 "movupd (%1), %%xmm11" "\n\t"
3272 "movupd (%0), %%xmm2" "\n\t"
3273 "roundps $4, %%xmm2, %%xmm11" "\n\t"
3274 "movupd %%xmm11, (%1)" "\n"
3276 : /*IN*/ "r"(src), "r"(dst)
3277 : /*TRASH*/ "xmm11","xmm2"
/* Runs ROUNDPS with each immediate-selected rounding mode (helpers 000,
   001, 010, 011) over every entry of vals[], register form first and then
   memory form.  Each case packs four 4-byte values into src at byte
   offsets 0, 4, 8 and 12, taken from vals[i], vals[(i+5)%22],
   vals[(i+11)%22] and vals[(i+17)%22], calls the helper, and prints each
   input next to the matching float lane of dst.  Only part of the vals[]
   initialisation appears in this listing.
   NOTE(review): the modulus 22 is hard-coded; presumably it equals the
   element count of vals[] -- confirm against the declaration.
   NOTE(review): results are read via *(float*)&dst[k] on a byte array;
   a memcpy into a float would avoid aliasing/alignment questions. */
3282 void test_ROUNDPS_w_immediate_rounding ( void )
3288 vals[i++] = mkPosInf();
3289 vals[i++] = mkNegInf();
3290 vals[i++] = mkPosNan();
3291 vals[i++] = mkNegNan();
3296 vals[i++] = -0.50001;
3297 vals[i++] = -0.49999;
3302 vals[i++] = 0.49999;
3303 vals[i++] = 0.50001;
3310 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
/* --- immediate 000 --- */
3315 memcpy(&src[0], &vals[i], 4);
3316 memcpy(&src[4], &vals[(i+5)%22], 4);
3317 memcpy(&src[8], &vals[(i+11)%22], 4);
3318 memcpy(&src[12], &vals[(i+17)%22], 4);
3319 do_ROUNDPS_000(False/*reg*/, &src, &dst);
3320 printf("r roundps_000 ");
3324 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3325 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3326 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3327 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3332 memcpy(&src[0], &vals[i], 4);
3333 memcpy(&src[4], &vals[(i+5)%22], 4);
3334 memcpy(&src[8], &vals[(i+11)%22], 4);
3335 memcpy(&src[12], &vals[(i+17)%22], 4);
3336 do_ROUNDPS_000(True/*mem*/, &src, &dst);
3337 printf("m roundps_000 ");
3341 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3342 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3343 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3344 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
/* --- immediate 001 --- */
3350 memcpy(&src[0], &vals[i], 4);
3351 memcpy(&src[4], &vals[(i+5)%22], 4);
3352 memcpy(&src[8], &vals[(i+11)%22], 4);
3353 memcpy(&src[12], &vals[(i+17)%22], 4);
3354 do_ROUNDPS_001(False/*reg*/, &src, &dst);
3355 printf("r roundps_001 ");
3359 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3360 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3361 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3362 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3367 memcpy(&src[0], &vals[i], 4);
3368 memcpy(&src[4], &vals[(i+5)%22], 4);
3369 memcpy(&src[8], &vals[(i+11)%22], 4);
3370 memcpy(&src[12], &vals[(i+17)%22], 4);
3371 do_ROUNDPS_001(True/*mem*/, &src, &dst);
3372 printf("m roundps_001 ");
3376 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3377 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3378 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3379 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
/* --- immediate 010 --- */
3385 memcpy(&src[0], &vals[i], 4);
3386 memcpy(&src[4], &vals[(i+5)%22], 4);
3387 memcpy(&src[8], &vals[(i+11)%22], 4);
3388 memcpy(&src[12], &vals[(i+17)%22], 4);
3389 do_ROUNDPS_010(False/*reg*/, &src, &dst);
3390 printf("r roundps_010 ");
3394 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3395 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3396 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3397 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3402 memcpy(&src[0], &vals[i], 4);
3403 memcpy(&src[4], &vals[(i+5)%22], 4);
3404 memcpy(&src[8], &vals[(i+11)%22], 4);
3405 memcpy(&src[12], &vals[(i+17)%22], 4);
3406 do_ROUNDPS_010(True/*mem*/, &src, &dst);
3407 printf("m roundps_010 ");
3411 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3412 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3413 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3414 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
/* --- immediate 011 --- */
3420 memcpy(&src[0], &vals[i], 4);
3421 memcpy(&src[4], &vals[(i+5)%22], 4);
3422 memcpy(&src[8], &vals[(i+11)%22], 4);
3423 memcpy(&src[12], &vals[(i+17)%22], 4);
3424 do_ROUNDPS_011(False/*reg*/, &src, &dst);
3425 printf("r roundps_011 ");
3429 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3430 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3431 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3432 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3437 memcpy(&src[0], &vals[i], 4);
3438 memcpy(&src[4], &vals[(i+5)%22], 4);
3439 memcpy(&src[8], &vals[(i+11)%22], 4);
3440 memcpy(&src[12], &vals[(i+17)%22], 4);
3441 do_ROUNDPS_011(True/*mem*/, &src, &dst);
3442 printf("m roundps_011 ");
3446 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3447 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3448 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3449 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
/* Runs ROUNDPS in its MXCSR-controlled form (do_ROUNDPS_1XX) for every
   entry of vals[] under each rounding-mode setting 0..3 installed with
   set_sse_roundingmode, register form first and then memory form.  Each
   case packs four 4-byte values into src at byte offsets 0, 4, 8 and 12,
   taken from vals[i], vals[(i+5)%22], vals[(i+11)%22] and
   vals[(i+17)%22].  The function asserts that the mode is 0 on entry, and
   before returning sets it back to 0 and asserts that again.  Only part
   of the vals[] initialisation appears in this listing.
   NOTE(review): the modulus 22 is hard-coded; presumably it equals the
   element count of vals[] -- confirm against the declaration. */
3454 void test_ROUNDPS_w_mxcsr_rounding ( void )
3461 vals[i++] = mkPosInf();
3462 vals[i++] = mkNegInf();
3463 vals[i++] = mkPosNan();
3464 vals[i++] = mkNegNan();
3469 vals[i++] = -0.50001;
3470 vals[i++] = -0.49999;
3475 vals[i++] = 0.49999;
3476 vals[i++] = 0.50001;
/* Precondition: the test starts from the default rounding mode. */
3483 rm = get_sse_roundingmode();
3484 assert(rm == 0); // 0 == RN == default
3486 for (i = 0; i < sizeof(vals)/sizeof(vals[0]); i++) {
3489 for (rm = 0; rm <= 3; rm++) {
3490 set_sse_roundingmode(rm);
3494 memcpy(&src[0], &vals[i], 4);
3495 memcpy(&src[4], &vals[(i+5)%22], 4);
3496 memcpy(&src[8], &vals[(i+11)%22], 4);
3497 memcpy(&src[12], &vals[(i+17)%22], 4);
3498 do_ROUNDPS_1XX(False/*reg*/, &src, &dst);
3499 printf("r (rm=%u) roundps_1XX ", rm);
3503 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3504 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3505 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3506 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
3511 memcpy(&src[0], &vals[i], 4);
3512 memcpy(&src[4], &vals[(i+5)%22], 4);
3513 memcpy(&src[8], &vals[(i+11)%22], 4);
3514 memcpy(&src[12], &vals[(i+17)%22], 4);
3515 do_ROUNDPS_1XX(True/*mem*/, &src, &dst);
3516 printf("m (rm=%u) roundps_1XX ", rm);
3520 printf(" %9f:%9f", vals[i], (double)*(float*)(&dst[0]));
3521 printf(" %9f:%9f", vals[(i+5)%22], (double)*(float*)(&dst[4]));
3522 printf(" %9f:%9f", vals[(i+11)%22], (double)*(float*)(&dst[8]));
3523 printf(" %9f:%9f", vals[(i+17)%22], (double)*(float*)(&dst[12]));
/* Leave the rounding mode as it was found. */
3528 rm = get_sse_roundingmode();
3530 set_sse_roundingmode(0);
3531 rm = get_sse_roundingmode();
3532 assert(rm == 0); // 0 == RN == default
3535 /* ------------ PTEST ------------ */
/* Driver for PTEST.  Eight hand-picked vectors (all-zero, all-one and
   single-bit variations of each) are tried in all 8x8 ordered pairs:
   spec[i] is copied to block[0] and loaded into xmm2, spec[j] is copied
   to block[1] and used as the memory operand.  The asm delivers a value
   in `flags`, of which only the O/S/Z/A/C/P bits (mask 0x8D5) are printed
   after both operands.
   NOTE(review): the instructions that move the flags into %0 are not
   visible in this listing; the 256-byte rsp adjustment around them
   presumably keeps a stack push clear of the red zone -- confirm. */
3537 void test_PTEST ( void )
3539 const Int ntests = 8;
/* do64HLtoV128(res, hi, lo): high qword first, low qword second. */
3541 do64HLtoV128( &spec[0], 0x0000000000000000ULL, 0x0000000000000000ULL );
3542 do64HLtoV128( &spec[1], 0x0000000000000000ULL, 0x0000000000000001ULL );
3543 do64HLtoV128( &spec[2], 0x0000000000000001ULL, 0x0000000000000000ULL );
3544 do64HLtoV128( &spec[3], 0x0000000000000001ULL, 0x0000000000000001ULL );
3545 do64HLtoV128( &spec[4], 0xffffffffffffffffULL, 0xffffffffffffffffULL );
3546 do64HLtoV128( &spec[5], 0xffffffffffffffffULL, 0xfffffffffffffffeULL );
3547 do64HLtoV128( &spec[6], 0xfffffffffffffffeULL, 0xffffffffffffffffULL );
3548 do64HLtoV128( &spec[7], 0xfffffffffffffffeULL, 0xfffffffffffffffeULL );
3552 for (i = 0; i < ntests; i++) {
3553 for (j = 0; j < ntests; j++) {
3554 memcpy(&block[0], &spec[i], 16);
3555 memcpy(&block[1], &spec[j], 16);
3556 __asm__ __volatile__(
3557 "subq $256, %%rsp" "\n\t"
3558 "movupd 0(%1), %%xmm2" "\n\t"
3559 "ptest 16(%1), %%xmm2" "\n\t"
3562 "addq $256, %%rsp" "\n\t"
3563 : /*out*/"=r"(flags) : /*in*/ "r"(&block[0]) :
3564 "xmm2", "memory", "cc"
3567 showV128(&block[0]);
3569 showV128(&block[1]);
3570 printf(" -> eflags %04x\n", (UInt)flags & 0x8D5);
3575 /* ------------ PBLENDVB ------------ */
/* Executes PBLENDVB once.  Both asm statements load *xmm0 into register
   xmm0 (the instruction's implicit mask operand) and *dst into xmm11,
   apply pblendvb, and store xmm11 back to *dst.  The first takes the
   source straight from memory at src; the second first copies *src into
   xmm2.  Callers pass mem=True for the memory form and mem=False for the
   register form; the branch on `mem` itself is not visible in this
   listing.
   NOTE(review): the asm stores through dst, yet the visible trash lists
   name only xmm registers (no "memory") -- confirm. */
3577 void do_PBLENDVB ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
3580 __asm__ __volatile__(
3581 "movupd (%2), %%xmm0" "\n\t"
3582 "movupd (%1), %%xmm11" "\n\t"
3583 "pblendvb (%0), %%xmm11" "\n\t"
3584 "movupd %%xmm11, (%1)" "\n"
3586 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3587 : /*TRASH*/ "xmm11","xmm0"
3590 __asm__ __volatile__(
3591 "movupd (%2), %%xmm0" "\n\t"
3592 "movupd (%1), %%xmm11" "\n\t"
3593 "movupd (%0), %%xmm2" "\n\t"
3594 "pblendvb %%xmm2, %%xmm11" "\n\t"
3595 "movupd %%xmm11, (%1)" "\n"
3597 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3598 : /*TRASH*/ "xmm11","xmm2","xmm0"
/* Driver for PBLENDVB: ten iterations, each running the register form and
   then the memory form of do_PBLENDVB.  Before each call the working
   operands xmm0/src/dst are refreshed from the templates
   t_xmm0/t_src/t_dst, so both forms see the same inputs even though dst
   is modified by the call.  How the templates are filled and how results
   are printed is not visible in this listing. */
3603 void test_PBLENDVB ( void )
3605 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3607 for (i = 0; i < 10; i++) {
/* --- register form --- */
3612 memcpy(&xmm0, &t_xmm0, 16);
3613 memcpy(&src, &t_src, 16);
3614 memcpy(&dst, &t_dst, 16);
3615 do_PBLENDVB(False/*reg*/, &xmm0, &src, &dst);
3616 printf("r pblendvb ");
/* --- memory form --- */
3626 memcpy(&xmm0, &t_xmm0, 16);
3627 memcpy(&src, &t_src, 16);
3628 memcpy(&dst, &t_dst, 16);
3629 do_PBLENDVB(True/*mem*/, &xmm0, &src, &dst);
3630 printf("m pblendvb ");
3642 /* ------------ BLENDVPD ------------ */
/* Run one BLENDVPD: *xmm0 is loaded into %xmm0, *dst into %xmm11, the
   instruction blends *src into %xmm11, and %xmm11 is stored back to *dst.

   mem chooses between the two asm statements below; call sites tag True as
   the mem case and False as the reg case.  One statement passes *src as a
   memory operand, the other loads it into %xmm2 and uses the register form.
   Per the instruction set reference, %xmm0 is the implicit selector of this
   encoding (sign bit of each 64-bit lane).

   NOTE(review): both templates store through dst, yet the clobber lists
   name only xmm registers -- confirm whether a memory clobber is needed.
   NOTE(review): the memory-operand form of this SSE instruction requires a
   16-byte aligned address -- confirm that callers guarantee it for src. */
3644 void do_BLENDVPD ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
/* Memory-operand form: (%0) is read directly by blendvpd. */
3647 __asm__ __volatile__(
3648 "movupd (%2), %%xmm0" "\n\t"
3649 "movupd (%1), %%xmm11" "\n\t"
3650 "blendvpd (%0), %%xmm11" "\n\t"
3651 "movupd %%xmm11, (%1)" "\n"
/* As used by the template: %0 = src, %1 = dst, %2 = xmm0. */
3653 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3654 : /*TRASH*/ "xmm11","xmm0"
/* Register form: (%0) is first loaded into xmm2, which then feeds blendvpd. */
3657 __asm__ __volatile__(
3658 "movupd (%2), %%xmm0" "\n\t"
3659 "movupd (%1), %%xmm11" "\n\t"
3660 "movupd (%0), %%xmm2" "\n\t"
3661 "blendvpd %%xmm2, %%xmm11" "\n\t"
3662 "movupd %%xmm11, (%1)" "\n"
/* Same operand mapping as above; xmm2 is additionally listed as trashed. */
3664 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3665 : /*TRASH*/ "xmm11","xmm2","xmm0"
/* Driver for do_BLENDVPD.  Runs ten iterations; in each one the register
   form (mem = False) and then the memory form (mem = True) are executed on
   working copies of t_xmm0, t_src and t_dst, and each result line is
   introduced by an r or m tag. */
3670 void test_BLENDVPD ( void )
/* xmm0/src/dst are the working copies handed to do_BLENDVPD; the t_*
   variables hold the values they are refreshed from. */
3672 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3674 for (i = 0; i < 10; i++) {
/* Register-operand run. */
3679 memcpy(&xmm0, &t_xmm0, 16);
3680 memcpy(&src, &t_src, 16);
3681 memcpy(&dst, &t_dst, 16);
3682 do_BLENDVPD(False/*reg*/, &xmm0, &src, &dst);
3683 printf("r blendvpd ");
/* Memory-operand run.  The working copies are refreshed first because
   do_BLENDVPD writes its result back into dst. */
3693 memcpy(&xmm0, &t_xmm0, 16);
3694 memcpy(&src, &t_src, 16);
3695 memcpy(&dst, &t_dst, 16);
3696 do_BLENDVPD(True/*mem*/, &xmm0, &src, &dst);
3697 printf("m blendvpd ");
3709 /* ------------ BLENDVPS ------------ */
/* Run one BLENDVPS: *xmm0 is loaded into %xmm0, *dst into %xmm11, the
   instruction blends *src into %xmm11, and %xmm11 is stored back to *dst.

   mem chooses between the two asm statements below; call sites tag True as
   the mem case and False as the reg case.  One statement passes *src as a
   memory operand, the other loads it into %xmm2 and uses the register form.
   Per the instruction set reference, %xmm0 is the implicit selector of this
   encoding (sign bit of each 32-bit lane).

   NOTE(review): both templates store through dst, yet the clobber lists
   name only xmm registers -- confirm whether a memory clobber is needed.
   NOTE(review): the memory-operand form of this SSE instruction requires a
   16-byte aligned address -- confirm that callers guarantee it for src. */
3711 void do_BLENDVPS ( Bool mem, V128* xmm0, V128* src, /*MOD*/V128* dst )
/* Memory-operand form: (%0) is read directly by blendvps. */
3714 __asm__ __volatile__(
3715 "movupd (%2), %%xmm0" "\n\t"
3716 "movupd (%1), %%xmm11" "\n\t"
3717 "blendvps (%0), %%xmm11" "\n\t"
3718 "movupd %%xmm11, (%1)" "\n"
/* As used by the template: %0 = src, %1 = dst, %2 = xmm0. */
3720 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3721 : /*TRASH*/ "xmm11","xmm0"
/* Register form: (%0) is first loaded into xmm2, which then feeds blendvps. */
3724 __asm__ __volatile__(
3725 "movupd (%2), %%xmm0" "\n\t"
3726 "movupd (%1), %%xmm11" "\n\t"
3727 "movupd (%0), %%xmm2" "\n\t"
3728 "blendvps %%xmm2, %%xmm11" "\n\t"
3729 "movupd %%xmm11, (%1)" "\n"
/* Same operand mapping as above; xmm2 is additionally listed as trashed. */
3731 : /*IN*/ "r"(src), "r"(dst), "r"(xmm0)
3732 : /*TRASH*/ "xmm11","xmm2","xmm0"
/* Driver for do_BLENDVPS.  Runs ten iterations; in each one the register
   form (mem = False) and then the memory form (mem = True) are executed on
   working copies of t_xmm0, t_src and t_dst, and each result line is
   introduced by an r or m tag. */
3737 void test_BLENDVPS ( void )
/* xmm0/src/dst are the working copies handed to do_BLENDVPS; the t_*
   variables hold the values they are refreshed from. */
3739 V128 xmm0, src, dst, t_xmm0, t_src, t_dst;
3741 for (i = 0; i < 10; i++) {
/* Register-operand run. */
3746 memcpy(&xmm0, &t_xmm0, 16);
3747 memcpy(&src, &t_src, 16);
3748 memcpy(&dst, &t_dst, 16);
3749 do_BLENDVPS(False/*reg*/, &xmm0, &src, &dst);
3750 printf("r blendvps ");
/* Memory-operand run.  The working copies are refreshed first because
   do_BLENDVPS writes its result back into dst. */
3760 memcpy(&xmm0, &t_xmm0, 16);
3761 memcpy(&src, &t_src, 16);
3762 memcpy(&dst, &t_dst, 16);
3763 do_BLENDVPS(True/*mem*/, &xmm0, &src, &dst);
3764 printf("m blendvps ");
3776 /* ------------ main ------------ */
3778 int main ( int argc, char** argv )
3781 // ------ SSE 4.1 ------
3782 test_BLENDPD(); // done Apr.01.2010
3783 test_BLENDPS(); // done Apr.02.2010
3788 test_DPPD(); // done Apr.08.2010
3789 test_DPPS(); // done Apr.09.2010
3791 test_INSERTPS(); // done Apr.01.2010
3796 test_PEXTRB(); // done Apr.15.2010
3797 test_PEXTRD(); // done Apr.14.2010
3798 test_PEXTRQ(); // done Apr.14.2010
3799 test_PEXTRW(); // done Apr.14.2010
3800 test_PINSRQ(); // done Apr.16.2010
3801 test_PINSRD(); // todo
3802 test_PINSRW(); /* Umm, this is SSE2, not SSE4. Right? */
3803 test_PINSRB(); // todo
3804 //test_PHMINPOSUW();
3806 test_PMAXSD(); // done Apr.09.2010
3807 test_PMAXUD(); // done Apr.16.2010
3810 test_PMINSD(); // done Apr.09.2010
3813 test_PMOVSXBW(); // done Apr.02.2010
3814 test_PMOVSXBD(); // done Mar.30.2010
3815 test_PMOVSXBQ(); // done Mar.30.2010
3816 test_PMOVSXWD(); // done Mar.31.2010
3817 test_PMOVSXWQ(); // done Mar.31.2010
3818 test_PMOVSXDQ(); // done Mar.31.2010
3819 test_PMOVZXBW(); // done Mar.28.2010
3820 test_PMOVZXBD(); // done Mar.29.2010
3821 test_PMOVZXBQ(); // done Mar.29.2010
3822 test_PMOVZXWD(); // done Mar.28.2010
3823 test_PMOVZXWQ(); // done Mar.29.2010
3824 test_PMOVZXDQ(); // done Mar.29.2010
3831 test_ROUNDSD_w_immediate_rounding();
3832 test_ROUNDSS_w_immediate_rounding();
3833 test_ROUNDPD_w_immediate_rounding();
3834 test_ROUNDPS_w_immediate_rounding();
3835 test_ROUNDSD_w_mxcsr_rounding();
3836 test_ROUNDSS_w_mxcsr_rounding();
3837 test_ROUNDPD_w_mxcsr_rounding();
3838 test_ROUNDPS_w_mxcsr_rounding();
3839 // ------ SSE 4.2 ------