l4/pkg/valgrind/src/valgrind-3.6.0-svn/VEX/priv/guest_generic_x87.c

   1
   2 /*---------------------------------------------------------------*/
   3 /*--- begin                               guest_generic_x87.c ---*/
   4 /*---------------------------------------------------------------*/
   5
   6 /*
   7    This file is part of Valgrind, a dynamic binary instrumentation
   8    framework.
   9
  10    Copyright (C) 2004-2010 OpenWorks LLP
  11       info@open-works.net
  12
  13    This program is free software; you can redistribute it and/or
  14    modify it under the terms of the GNU General Public License as
  15    published by the Free Software Foundation; either version 2 of the
  16    License, or (at your option) any later version.
  17
  18    This program is distributed in the hope that it will be useful, but
  19    WITHOUT ANY WARRANTY; without even the implied warranty of
  20    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  21    General Public License for more details.
  22
  23    You should have received a copy of the GNU General Public License
  24    along with this program; if not, write to the Free Software
  25    Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  26    02110-1301, USA.
  27
  28    The GNU General Public License is contained in the file COPYING.
  29
  30    Neither the names of the U.S. Department of Energy nor the
  31    University of California nor the names of its contributors may be
  32    used to endorse or promote products derived from this software
  33    without prior written permission.
  34 */
  35
  36 /* This file contains functions for doing some x87-specific
  37    operations.  Both the amd64 and x86 front ends (guests) indirectly
  38    call these functions via guest helper calls.  By putting them here,
  39    code duplication is avoided.  Some of these functions are tricky
  40    and hard to verify, so there is much to be said for only having one
  41    copy thereof.
  42 */
  43
  44 #include "libvex_basictypes.h"
  45
  46 #include "main_util.h"
  47 #include "guest_generic_x87.h"
  48
  49
  50 /* 80 and 64-bit floating point formats:
  51
  52    80-bit:
  53
  54     S  0       0-------0      zero
  55     S  0       0X------X      denormals
  56     S  1-7FFE  1X------X      normals (all normals have leading 1)
  57     S  7FFF    10------0      infinity
  58     S  7FFF    10X-----X      snan
  59     S  7FFF    11X-----X      qnan
  60
  61    S is the sign bit.  For runs X----X, at least one of the Xs must be
  62    nonzero.  Exponent is 15 bits, fractional part is 63 bits, and
  63    there is an explicitly represented leading 1, and a sign bit,
  64    giving 80 in total.
  65
  66    64-bit avoids the confusion of an explicitly represented leading 1
  67    and so is simpler:
  68
  69     S  0      0------0   zero
  70     S  0      X------X   denormals
  71     S  1-7FE  any        normals
  72     S  7FF    0------0   infinity
  73     S  7FF    0X-----X   snan
  74     S  7FF    1X-----X   qnan
  75
  76    Exponent is 11 bits, fractional part is 52 bits, and there is a
  77    sign bit, giving 64 in total.
  78 */
  79
  80
  81 static inline UInt read_bit_array ( UChar* arr, UInt n )
  82 {
  83    UChar c = arr[n >> 3];
  84    c >>= (n&7);
  85    return c & 1;
  86 }
  87
  88 static inline void write_bit_array ( UChar* arr, UInt n, UInt b )
  89 {
  90    UChar c = arr[n >> 3];
  91    c = toUChar( c & ~(1 << (n&7)) );
  92    c = toUChar( c | ((b&1) << (n&7)) );
  93    arr[n >> 3] = c;
  94 }
  95
  96 /* Convert an IEEE754 double (64-bit) into an x87 extended double
  97    (80-bit), mimicing the hardware fairly closely.  Both numbers are
  98    stored little-endian.  Limitations, all of which could be fixed,
  99    given some level of hassle:
 100
 101    * Identity of NaNs is not preserved.
 102
 103    See comments in the code for more details.
 104 */
 105 void convert_f64le_to_f80le ( /*IN*/UChar* f64, /*OUT*/UChar* f80 )
 106 {
 107    Bool  mantissaIsZero;
 108    Int   bexp, i, j, shift;
 109    UChar sign;
 110
 111    sign = toUChar( (f64[7] >> 7) & 1 );
 112    bexp = (f64[7] << 4) | ((f64[6] >> 4) & 0x0F);
 113    bexp &= 0x7FF;
 114
 115    mantissaIsZero = False;
 116    if (bexp == 0 || bexp == 0x7FF) {
 117       /* We'll need to know whether or not the mantissa (bits 51:0) is
 118          all zeroes in order to handle these cases.  So figure it
 119          out. */
 120       mantissaIsZero
 121          = toBool(
 122               (f64[6] & 0x0F) == 0
 123               && f64[5] == 0 && f64[4] == 0 && f64[3] == 0
 124               && f64[2] == 0 && f64[1] == 0 && f64[0] == 0
 125            );
 126    }
 127
 128    /* If the exponent is zero, either we have a zero or a denormal.
 129       Produce a zero.  This is a hack in that it forces denormals to
 130       zero.  Could do better. */
 131    if (bexp == 0) {
 132       f80[9] = toUChar( sign << 7 );
 133       f80[8] = f80[7] = f80[6] = f80[5] = f80[4]
 134              = f80[3] = f80[2] = f80[1] = f80[0] = 0;
 135
 136       if (mantissaIsZero)
 137          /* It really is zero, so that's all we can do. */
 138          return;
 139
 140       /* There is at least one 1-bit in the mantissa.  So it's a
 141          potentially denormalised double -- but we can produce a
 142          normalised long double.  Count the leading zeroes in the
 143          mantissa so as to decide how much to bump the exponent down
 144          by.  Note, this is SLOW. */
 145       shift = 0;
 146       for (i = 51; i >= 0; i--) {
 147         if (read_bit_array(f64, i))
 148            break;
 149         shift++;
 150       }
 151
 152       /* and copy into place as many bits as we can get our hands on. */
 153       j = 63;
 154       for (i = 51 - shift; i >= 0; i--) {
 155          write_bit_array( f80, j,
 156          read_bit_array( f64, i ) );
 157          j--;
 158       }
 159
 160       /* Set the exponent appropriately, and we're done. */
 161       bexp -= shift;
 162       bexp += (16383 - 1023);
 163       f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
 164       f80[8] = toUChar( bexp & 0xFF );
 165       return;
 166    }
 167
 168    /* If the exponent is 7FF, this is either an Infinity, a SNaN or
 169       QNaN, as determined by examining bits 51:0, thus:
 170           0  ... 0    Inf
 171           0X ... X    SNaN
 172           1X ... X    QNaN
 173       where at least one of the Xs is not zero.
 174    */
 175    if (bexp == 0x7FF) {
 176       if (mantissaIsZero) {
 177          /* Produce an appropriately signed infinity:
 178             S 1--1 (15)  1  0--0 (63)
 179          */
 180          f80[9] = toUChar( (sign << 7) | 0x7F );
 181          f80[8] = 0xFF;
 182          f80[7] = 0x80;
 183          f80[6] = f80[5] = f80[4] = f80[3]
 184                 = f80[2] = f80[1] = f80[0] = 0;
 185          return;
 186       }
 187       /* So it's either a QNaN or SNaN.  Distinguish by considering
 188          bit 51.  Note, this destroys all the trailing bits
 189          (identity?) of the NaN.  IEEE754 doesn't require preserving
 190          these (it only requires that there be one QNaN value and one
 191          SNaN value), but x87 does seem to have some ability to
 192          preserve them.  Anyway, here, the NaN's identity is
 193          destroyed.  Could be improved. */
 194       if (f64[6] & 8) {
 195          /* QNaN.  Make a QNaN:
 196             S 1--1 (15)  1  1--1 (63)
 197          */
 198          f80[9] = toUChar( (sign << 7) | 0x7F );
 199          f80[8] = 0xFF;
 200          f80[7] = 0xFF;
 201          f80[6] = f80[5] = f80[4] = f80[3]
 202                 = f80[2] = f80[1] = f80[0] = 0xFF;
 203       } else {
 204          /* SNaN.  Make a SNaN:
 205             S 1--1 (15)  0  1--1 (63)
 206          */
 207          f80[9] = toUChar( (sign << 7) | 0x7F );
 208          f80[8] = 0xFF;
 209          f80[7] = 0x7F;
 210          f80[6] = f80[5] = f80[4] = f80[3]
 211                 = f80[2] = f80[1] = f80[0] = 0xFF;
 212       }
 213       return;
 214    }
 215
 216    /* It's not a zero, denormal, infinity or nan.  So it must be a
 217       normalised number.  Rebias the exponent and build the new
 218       number.  */
 219    bexp += (16383 - 1023);
 220
 221    f80[9] = toUChar( (sign << 7) | ((bexp >> 8) & 0xFF) );
 222    f80[8] = toUChar( bexp & 0xFF );
 223    f80[7] = toUChar( (1 << 7) | ((f64[6] << 3) & 0x78)
 224                               | ((f64[5] >> 5) & 7) );
 225    f80[6] = toUChar( ((f64[5] << 3) & 0xF8) | ((f64[4] >> 5) & 7) );
 226    f80[5] = toUChar( ((f64[4] << 3) & 0xF8) | ((f64[3] >> 5) & 7) );
 227    f80[4] = toUChar( ((f64[3] << 3) & 0xF8) | ((f64[2] >> 5) & 7) );
 228    f80[3] = toUChar( ((f64[2] << 3) & 0xF8) | ((f64[1] >> 5) & 7) );
 229    f80[2] = toUChar( ((f64[1] << 3) & 0xF8) | ((f64[0] >> 5) & 7) );
 230    f80[1] = toUChar( ((f64[0] << 3) & 0xF8) );
 231    f80[0] = toUChar( 0 );
 232 }
 233
 234
 235 /* Convert an x87 extended double (80-bit) into an IEEE 754 double
 236    (64-bit), mimicking the hardware fairly closely.  Both numbers are
 237    stored little-endian.  Limitations, both of which could be fixed,
 238    given some level of hassle:
 239
 240    * Rounding following truncation could be a bit better.
 241
 242    * Identity of NaNs is not preserved.
 243
 244    See comments in the code for more details.
 245 */
 246 void convert_f80le_to_f64le ( /*IN*/UChar* f80, /*OUT*/UChar* f64 )
 247 {
 248    Bool  isInf;
 249    Int   bexp, i, j;
 250    UChar sign;
 251
 252    sign = toUChar((f80[9] >> 7) & 1);
 253    bexp = (((UInt)f80[9]) << 8) | (UInt)f80[8];
 254    bexp &= 0x7FFF;
 255
 256    /* If the exponent is zero, either we have a zero or a denormal.
 257       But an extended precision denormal becomes a double precision
 258       zero, so in either case, just produce the appropriately signed
 259       zero. */
 260    if (bexp == 0) {
 261       f64[7] = toUChar(sign << 7);
 262       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
 263       return;
 264    }
 265
 266    /* If the exponent is 7FFF, this is either an Infinity, a SNaN or
 267       QNaN, as determined by examining bits 62:0, thus:
 268           0  ... 0    Inf
 269           0X ... X    SNaN
 270           1X ... X    QNaN
 271       where at least one of the Xs is not zero.
 272    */
 273    if (bexp == 0x7FFF) {
 274       isInf = toBool(
 275                  (f80[7] & 0x7F) == 0
 276                  && f80[6] == 0 && f80[5] == 0 && f80[4] == 0
 277                  && f80[3] == 0 && f80[2] == 0 && f80[1] == 0
 278                  && f80[0] == 0
 279               );
 280       if (isInf) {
 281          if (0 == (f80[7] & 0x80))
 282             goto wierd_NaN;
 283          /* Produce an appropriately signed infinity:
 284             S 1--1 (11)  0--0 (52)
 285          */
 286          f64[7] = toUChar((sign << 7) | 0x7F);
 287          f64[6] = 0xF0;
 288          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
 289          return;
 290       }
 291       /* So it's either a QNaN or SNaN.  Distinguish by considering
 292          bit 62.  Note, this destroys all the trailing bits
 293          (identity?) of the NaN.  IEEE754 doesn't require preserving
 294          these (it only requires that there be one QNaN value and one
 295          SNaN value), but x87 does seem to have some ability to
 296          preserve them.  Anyway, here, the NaN's identity is
 297          destroyed.  Could be improved. */
 298       if (f80[8] & 0x40) {
 299          /* QNaN.  Make a QNaN:
 300             S 1--1 (11)  1  1--1 (51)
 301          */
 302          f64[7] = toUChar((sign << 7) | 0x7F);
 303          f64[6] = 0xFF;
 304          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
 305       } else {
 306          /* SNaN.  Make a SNaN:
 307             S 1--1 (11)  0  1--1 (51)
 308          */
 309          f64[7] = toUChar((sign << 7) | 0x7F);
 310          f64[6] = 0xF7;
 311          f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0xFF;
 312       }
 313       return;
 314    }
 315
 316    /* If it's not a Zero, NaN or Inf, and the integer part (bit 62) is
 317       zero, the x87 FPU appears to consider the number denormalised
 318       and converts it to a QNaN. */
 319    if (0 == (f80[7] & 0x80)) {
 320       wierd_NaN:
 321       /* Strange hardware QNaN:
 322          S 1--1 (11)  1  0--0 (51)
 323       */
 324       /* On a PIII, these QNaNs always appear with sign==1.  I have
 325          no idea why. */
 326       f64[7] = (1 /*sign*/ << 7) | 0x7F;
 327       f64[6] = 0xF8;
 328       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
 329       return;
 330    }
 331
 332    /* It's not a zero, denormal, infinity or nan.  So it must be a
 333       normalised number.  Rebias the exponent and consider. */
 334    bexp -= (16383 - 1023);
 335    if (bexp >= 0x7FF) {
 336       /* It's too big for a double.  Construct an infinity. */
 337       f64[7] = toUChar((sign << 7) | 0x7F);
 338       f64[6] = 0xF0;
 339       f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
 340       return;
 341    }
 342
 343    if (bexp <= 0) {
 344       /* It's too small for a normalised double.  First construct a
 345          zero and then see if it can be improved into a denormal.  */
 346       f64[7] = toUChar(sign << 7);
 347       f64[6] = f64[5] = f64[4] = f64[3] = f64[2] = f64[1] = f64[0] = 0;
 348
 349       if (bexp < -52)
 350          /* Too small even for a denormal. */
 351          return;
 352
 353       /* Ok, let's make a denormal.  Note, this is SLOW. */
 354       /* Copy bits 63, 62, 61, etc of the src mantissa into the dst,
 355          indexes 52+bexp, 51+bexp, etc, until k+bexp < 0. */
 356       /* bexp is in range -52 .. 0 inclusive */
 357       for (i = 63; i >= 0; i--) {
 358          j = i - 12 + bexp;
 359          if (j < 0) break;
 360          /* We shouldn't really call vassert from generated code. */
 361          vassert(j >= 0 && j < 52);
 362          write_bit_array ( f64,
 363                            j,
 364                            read_bit_array ( f80, i ) );
 365       }
 366       /* and now we might have to round ... */
 367       if (read_bit_array(f80, 10+1 - bexp) == 1)
 368          goto do_rounding;
 369
 370       return;
 371    }
 372
 373    /* Ok, it's a normalised number which is representable as a double.
 374       Copy the exponent and mantissa into place. */
 375    /*
 376    for (i = 0; i < 52; i++)
 377       write_bit_array ( f64,
 378                         i,
 379                         read_bit_array ( f80, i+11 ) );
 380    */
 381    f64[0] = toUChar( (f80[1] >> 3) | (f80[2] << 5) );
 382    f64[1] = toUChar( (f80[2] >> 3) | (f80[3] << 5) );
 383    f64[2] = toUChar( (f80[3] >> 3) | (f80[4] << 5) );
 384    f64[3] = toUChar( (f80[4] >> 3) | (f80[5] << 5) );
 385    f64[4] = toUChar( (f80[5] >> 3) | (f80[6] << 5) );
 386    f64[5] = toUChar( (f80[6] >> 3) | (f80[7] << 5) );
 387
 388    f64[6] = toUChar( ((bexp << 4) & 0xF0) | ((f80[7] >> 3) & 0x0F) );
 389
 390    f64[7] = toUChar( (sign << 7) | ((bexp >> 4) & 0x7F) );
 391
 392    /* Now consider any rounding that needs to happen as a result of
 393       truncating the mantissa. */
 394    if (f80[1] & 4) /* read_bit_array(f80, 10) == 1) */ {
 395
 396       /* If the bottom bits of f80 are "100 0000 0000", then the
 397          infinitely precise value is deemed to be mid-way between the
 398          two closest representable values.  Since we're doing
 399          round-to-nearest (the default mode), in that case it is the
 400          bit immediately above which indicates whether we should round
 401          upwards or not -- if 0, we don't.  All that is encapsulated
 402          in the following simple test. */
 403       if ((f80[1] & 0xF) == 4/*0100b*/ && f80[0] == 0)
 404          return;
 405
 406       do_rounding:
 407       /* Round upwards.  This is a kludge.  Once in every 2^24
 408          roundings (statistically) the bottom three bytes are all 0xFF
 409          and so we don't round at all.  Could be improved. */
 410       if (f64[0] != 0xFF) {
 411          f64[0]++;
 412       }
 413       else
 414       if (f64[0] == 0xFF && f64[1] != 0xFF) {
 415          f64[0] = 0;
 416          f64[1]++;
 417       }
 418       else
 419       if (f64[0] == 0xFF && f64[1] == 0xFF && f64[2] != 0xFF) {
 420          f64[0] = 0;
 421          f64[1] = 0;
 422          f64[2]++;
 423       }
 424       /* else we don't round, but we should. */
 425    }
 426 }
 427
 428
 429 /* CALLED FROM GENERATED CODE: CLEAN HELPER */
 430 /* Extract the signed significand or exponent component as per
 431    fxtract.  Arg and result are doubles travelling under the guise of
 432    ULongs.  Returns significand when getExp is zero and exponent
 433    otherwise. */
 434 ULong x86amd64g_calculate_FXTRACT ( ULong arg, HWord getExp )
 435 {
 436    ULong  uSig, uExp;
 437    /* Long   sSig; */
 438    Int    sExp, i;
 439    UInt   sign, expExp;
 440
 441    /*
 442     S  7FF    0------0   infinity
 443     S  7FF    0X-----X   snan
 444     S  7FF    1X-----X   qnan
 445    */
 446    const ULong posInf  = 0x7FF0000000000000ULL;
 447    const ULong negInf  = 0xFFF0000000000000ULL;
 448    const ULong nanMask = 0x7FF0000000000000ULL;
 449    const ULong qNan    = 0x7FF8000000000000ULL;
 450    const ULong posZero = 0x0000000000000000ULL;
 451    const ULong negZero = 0x8000000000000000ULL;
 452    const ULong bit51   = 1ULL << 51;
 453    const ULong bit52   = 1ULL << 52;
 454    const ULong sigMask = bit52 - 1;
 455
 456    /* Mimic Core i5 behaviour for special cases. */
 457    if (arg == posInf)
 458       return getExp ? posInf : posInf;
 459    if (arg == negInf)
 460       return getExp ? posInf : negInf;
 461    if ((arg & nanMask) == nanMask)
 462       return qNan | (arg & (1ULL << 63));
 463    if (arg == posZero)
 464       return getExp ? negInf : posZero;
 465    if (arg == negZero)
 466       return getExp ? negInf : negZero;
 467
 468    /* Split into sign, exponent and significand. */
 469    sign = ((UInt)(arg >> 63)) & 1;
 470
 471    /* Mask off exponent & sign. uSig is in range 0 .. 2^52-1. */
 472    uSig = arg & sigMask;
 473
 474    /* Get the exponent. */
 475    sExp = ((Int)(arg >> 52)) & 0x7FF;
 476
 477    /* Deal with denormals: if the exponent is zero, then the
 478       significand cannot possibly be zero (negZero/posZero are handled
 479       above).  Shift the significand left until bit 51 of it becomes
 480       1, and decrease the exponent accordingly.
 481    */
 482    if (sExp == 0) {
 483       for (i = 0; i < 52; i++) {
 484          if (uSig & bit51)
 485             break;
 486          uSig <<= 1;
 487          sExp--;
 488       }
 489       uSig <<= 1;
 490    } else {
 491       /* Add the implied leading-1 in the significand. */
 492       uSig |= bit52;
 493    }
 494
 495    /* Roll in the sign. */
 496    /* sSig = uSig; */
 497    /* if (sign) sSig =- sSig; */
 498
 499    /* Convert sig into a double.  This should be an exact conversion.
 500       Then divide by 2^52, which should give a value in the range 1.0
 501       to 2.0-epsilon, at least for normalised args. */
 502    /* dSig = (Double)sSig; */
 503    /* dSig /= 67108864.0;  */ /* 2^26 */
 504    /* dSig /= 67108864.0;  */ /* 2^26 */
 505    uSig &= sigMask;
 506    uSig |= 0x3FF0000000000000ULL;
 507    if (sign)
 508       uSig ^= negZero;
 509
 510    /* Convert exp into a double.  Also an exact conversion. */
 511    /* dExp = (Double)(sExp - 1023); */
 512    sExp -= 1023;
 513    if (sExp == 0) {
 514       uExp = 0;
 515    } else {
 516       uExp   = sExp < 0 ? -sExp : sExp;
 517       expExp = 0x3FF +52;
 518       /* 1 <= uExp <= 1074 */
 519       /* Skip first 42 iterations of normalisation loop as we know they
 520          will always happen */
 521       uExp <<= 42;
 522       expExp -= 42;
 523       for (i = 0; i < 52-42; i++) {
 524          if (uExp & bit52)
 525             break;
 526          uExp <<= 1;
 527          expExp--;
 528       }
 529       uExp &= sigMask;
 530       uExp |= ((ULong)expExp) << 52;
 531       if (sExp < 0) uExp ^= negZero;
 532    }
 533
 534    return getExp ? uExp : uSig;
 535 }
 536
 537
 538
 539 /*---------------------------------------------------------*/
 540 /*--- SSE4.2 PCMP{E,I}STR{I,M} helpers                  ---*/
 541 /*---------------------------------------------------------*/
 542
 543 /* We need the definitions for OSZACP eflags/rflags offsets.
 544    #including guest_{amd64,x86}_defs.h causes chaos, so just copy the
 545    required values directly.  They are not going to change in the
 546    foreseeable future :-)
 547 */
 548
 549 #define SHIFT_O   11
 550 #define SHIFT_S   7
 551 #define SHIFT_Z   6
 552 #define SHIFT_A   4
 553 #define SHIFT_C   0
 554 #define SHIFT_P   2
 555
 556 #define MASK_O    (1 << SHIFT_O)
 557 #define MASK_S    (1 << SHIFT_S)
 558 #define MASK_Z    (1 << SHIFT_Z)
 559 #define MASK_A    (1 << SHIFT_A)
 560 #define MASK_C    (1 << SHIFT_C)
 561 #define MASK_P    (1 << SHIFT_P)
 562
 563
 564 /* Count leading zeroes, w/ 0-produces-32 semantics, a la Hacker's
 565    Delight. */
 566 static UInt clz32 ( UInt x )
 567 {
 568    Int y, m, n;
 569    y = -(x >> 16);
 570    m = (y >> 16) & 16;
 571    n = 16 - m;
 572    x = x >> m;
 573    y = x - 0x100;
 574    m = (y >> 16) & 8;
 575    n = n + m;
 576    x = x << m;
 577    y = x - 0x1000;
 578    m = (y >> 16) & 4;
 579    n = n + m;
 580    x = x << m;
 581    y = x - 0x4000;
 582    m = (y >> 16) & 2;
 583    n = n + m;
 584    x = x << m;
 585    y = x >> 14;
 586    m = y & ~(y >> 1);
 587    return n + 2 - m;
 588 }
 589
 590 static UInt ctz32 ( UInt x )
 591 {
 592    return 32 - clz32((~x) & (x-1));
 593 }
 594
 595 /* Convert a 4-bit value to a 32-bit value by cloning each bit 8
 596    times.  There's surely a better way to do this, but I don't know
 597    what it is. */
 598 static UInt bits4_to_bytes4 ( UInt bits4 )
 599 {
 600    UInt r = 0;
 601    r |= (bits4 & 1) ? 0x000000FF : 0;
 602    r |= (bits4 & 2) ? 0x0000FF00 : 0;
 603    r |= (bits4 & 4) ? 0x00FF0000 : 0;
 604    r |= (bits4 & 8) ? 0xFF000000 : 0;
 605    return r;
 606 }
 607
 608
 609 /* Given partial results from a pcmpXstrX operation (intRes1,
 610    basically), generate an I- or M-format output value, also the new
 611    OSZACP flags.  */
 612 static
 613 void compute_PCMPxSTRx_gen_output (/*OUT*/V128* resV,
 614                                    /*OUT*/UInt* resOSZACP,
 615                                    UInt intRes1,
 616                                    UInt zmaskL, UInt zmaskR,
 617                                    UInt validL,
 618                                    UInt pol, UInt idx,
 619                                    Bool isxSTRM )
 620 {
 621    vassert((pol >> 2) == 0);
 622    vassert((idx >> 1) == 0);
 623
 624    UInt intRes2 = 0;
 625    switch (pol) {
 626       case 0: intRes2 = intRes1;          break; // pol +
 627       case 1: intRes2 = ~intRes1;         break; // pol -
 628       case 2: intRes2 = intRes1;          break; // pol m+
 629       case 3: intRes2 = intRes1 ^ validL; break; // pol m-
 630    }
 631    intRes2 &= 0xFFFF;
 632
 633    if (isxSTRM) {
 634
 635       // generate M-format output (a bit or byte mask in XMM0)
 636       if (idx) {
 637          resV->w32[0] = bits4_to_bytes4( (intRes2 >>  0) & 0xF );
 638          resV->w32[1] = bits4_to_bytes4( (intRes2 >>  4) & 0xF );
 639          resV->w32[2] = bits4_to_bytes4( (intRes2 >>  8) & 0xF );
 640          resV->w32[3] = bits4_to_bytes4( (intRes2 >> 12) & 0xF );
 641       } else {
 642          resV->w32[0] = intRes2 & 0xFFFF;
 643          resV->w32[1] = 0;
 644          resV->w32[2] = 0;
 645          resV->w32[3] = 0;
 646       }
 647
 648    } else {
 649
 650       // generate I-format output (an index in ECX)
 651       // generate ecx value
 652       UInt newECX = 0;
 653       if (idx) {
 654          // index of ms-1-bit
 655          newECX = intRes2 == 0 ? 16 : (31 - clz32(intRes2));
 656       } else {
 657          // index of ls-1-bit
 658          newECX = intRes2 == 0 ? 16 : ctz32(intRes2);
 659       }
 660
 661       resV->w32[0] = newECX;
 662       resV->w32[1] = 0;
 663       resV->w32[2] = 0;
 664       resV->w32[3] = 0;
 665
 666    }
 667
 668    // generate new flags, common to all ISTRI and ISTRM cases
 669    *resOSZACP    // A, P are zero
 670      = ((intRes2 == 0) ? 0 : MASK_C) // C == 0 iff intRes2 == 0
 671      | ((zmaskL == 0)  ? 0 : MASK_Z) // Z == 1 iff any in argL is 0
 672      | ((zmaskR == 0)  ? 0 : MASK_S) // S == 1 iff any in argR is 0
 673      | ((intRes2 & 1) << SHIFT_O);   // O == IntRes2[0]
 674 }
 675
 676
 677 /* Compute result and new OSZACP flags for all PCMP{E,I}STR{I,M}
 678    variants.
 679
 680    For xSTRI variants, the new ECX value is placed in the 32 bits
 681    pointed to by *resV, and the top 96 bits are zeroed.  For xSTRM
 682    variants, the result is a 128 bit value and is placed at *resV in
 683    the obvious way.
 684
 685    For all variants, the new OSZACP value is placed at *resOSZACP.
 686
 687    argLV and argRV are the vector args.  The caller must prepare a
 688    16-bit mask for each, zmaskL and zmaskR.  For ISTRx variants this
 689    must be 1 for each zero byte of of the respective arg.  For ESTRx
 690    variants this is derived from the explicit length indication, and
 691    must be 0 in all places except at the bit index corresponding to
 692    the valid length (0 .. 16).  If the valid length is 16 then the
 693    mask must be all zeroes.  In all cases, bits 31:16 must be zero.
 694
 695    imm8 is the original immediate from the instruction.  isSTRM
 696    indicates whether this is a xSTRM or xSTRI variant, which controls
 697    how much of *res is written.
 698
 699    If the given imm8 case can be handled, the return value is True.
 700    If not, False is returned, and neither *res not *resOSZACP are
 701    altered.
 702 */
 703
 704 Bool compute_PCMPxSTRx ( /*OUT*/V128* resV,
 705                          /*OUT*/UInt* resOSZACP,
 706                          V128* argLV,  V128* argRV,
 707                          UInt zmaskL, UInt zmaskR,
 708                          UInt imm8,   Bool isxSTRM )
 709 {
 710    vassert(imm8 < 0x80);
 711    vassert((zmaskL >> 16) == 0);
 712    vassert((zmaskR >> 16) == 0);
 713
 714    /* Explicitly reject any imm8 values that haven't been validated,
 715       even if they would probably work.  Life is too short to have
 716       unvalidated cases in the code base. */
 717    switch (imm8) {
 718       case 0x00:
 719       case 0x02: case 0x08: case 0x0A: case 0x0C: case 0x12:
 720       case 0x1A: case 0x3A: case 0x44: case 0x4A:
 721          break;
 722       default:
 723          return False;
 724    }
 725
 726    UInt fmt = (imm8 >> 0) & 3; // imm8[1:0]  data format
 727    UInt agg = (imm8 >> 2) & 3; // imm8[3:2]  aggregation fn
 728    UInt pol = (imm8 >> 4) & 3; // imm8[5:4]  polarity
 729    UInt idx = (imm8 >> 6) & 1; // imm8[6]    1==msb/bytemask
 730
 731    /*----------------------------------------*/
 732    /*-- strcmp on byte data                --*/
 733    /*----------------------------------------*/
 734
 735    if (agg == 2/*equal each, aka strcmp*/
 736        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
 737       Int    i;
 738       UChar* argL = (UChar*)argLV;
 739       UChar* argR = (UChar*)argRV;
 740       UInt boolResII = 0;
 741       for (i = 15; i >= 0; i--) {
 742          UChar cL  = argL[i];
 743          UChar cR  = argR[i];
 744          boolResII = (boolResII << 1) | (cL == cR ? 1 : 0);
 745       }
 746       UInt validL = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
 747       UInt validR = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
 748
 749       // do invalidation, common to all equal-each cases
 750       UInt intRes1
 751          = (boolResII & validL & validR)  // if both valid, use cmpres
 752            | (~ (validL | validR));       // if both invalid, force 1
 753                                           // else force 0
 754       intRes1 &= 0xFFFF;
 755
 756       // generate I-format output
 757       compute_PCMPxSTRx_gen_output(
 758          resV, resOSZACP,
 759          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
 760       );
 761
 762       return True;
 763    }
 764
 765    /*----------------------------------------*/
 766    /*-- set membership on byte data        --*/
 767    /*----------------------------------------*/
 768
 769    if (agg == 0/*equal any, aka find chars in a set*/
 770        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
 771       /* argL: the string,  argR: charset */
 772       UInt   si, ci;
 773       UChar* argL    = (UChar*)argLV;
 774       UChar* argR    = (UChar*)argRV;
 775       UInt   boolRes = 0;
 776       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
 777       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
 778
 779       for (si = 0; si < 16; si++) {
 780          if ((validL & (1 << si)) == 0)
 781             // run off the end of the string.
 782             break;
 783          UInt m = 0;
 784          for (ci = 0; ci < 16; ci++) {
 785             if ((validR & (1 << ci)) == 0) break;
 786             if (argR[ci] == argL[si]) { m = 1; break; }
 787          }
 788          boolRes |= (m << si);
 789       }
 790
 791       // boolRes is "pre-invalidated"
 792       UInt intRes1 = boolRes & 0xFFFF;
 793
 794       // generate I-format output
 795       compute_PCMPxSTRx_gen_output(
 796          resV, resOSZACP,
 797          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
 798       );
 799
 800       return True;
 801    }
 802
 803    /*----------------------------------------*/
 804    /*-- substring search on byte data      --*/
 805    /*----------------------------------------*/
 806
 807    if (agg == 3/*equal ordered, aka substring search*/
 808        && (fmt == 0/*ub*/ || fmt == 2/*sb*/)) {
 809
 810       /* argL: haystack,  argR: needle */
 811       UInt   ni, hi;
 812       UChar* argL    = (UChar*)argLV;
 813       UChar* argR    = (UChar*)argRV;
 814       UInt   boolRes = 0;
 815       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
 816       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
 817       for (hi = 0; hi < 16; hi++) {
 818          if ((validL & (1 << hi)) == 0)
 819             // run off the end of the haystack
 820             break;
 821          UInt m = 1;
 822          for (ni = 0; ni < 16; ni++) {
 823             if ((validR & (1 << ni)) == 0) break;
 824             UInt i = ni + hi;
 825             if (i >= 16) break;
 826             if (argL[i] != argR[ni]) { m = 0; break; }
 827          }
 828          boolRes |= (m << hi);
 829       }
 830
 831       // boolRes is "pre-invalidated"
 832       UInt intRes1 = boolRes & 0xFFFF;
 833
 834       // generate I-format output
 835       compute_PCMPxSTRx_gen_output(
 836          resV, resOSZACP,
 837          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
 838       );
 839
 840       return True;
 841    }
 842
 843    /*----------------------------------------*/
 844    /*-- ranges, unsigned byte data         --*/
 845    /*----------------------------------------*/
 846
 847    if (agg == 1/*ranges*/
 848        && fmt == 0/*ub*/) {
 849
 850       /* argL: string,  argR: range-pairs */
 851       UInt   ri, si;
 852       UChar* argL    = (UChar*)argLV;
 853       UChar* argR    = (UChar*)argRV;
 854       UInt   boolRes = 0;
 855       UInt   validL  = ~(zmaskL | -zmaskL);  // not(left(zmaskL))
 856       UInt   validR  = ~(zmaskR | -zmaskR);  // not(left(zmaskR))
 857       for (si = 0; si < 16; si++) {
 858          if ((validL & (1 << si)) == 0)
 859             // run off the end of the string
 860             break;
 861          UInt m = 0;
 862          for (ri = 0; ri < 16; ri += 2) {
 863             if ((validR & (3 << ri)) != (3 << ri)) break;
 864             if (argR[ri] <= argL[si] && argL[si] <= argR[ri+1]) {
 865                m = 1; break;
 866             }
 867          }
 868          boolRes |= (m << si);
 869       }
 870
 871       // boolRes is "pre-invalidated"
 872       UInt intRes1 = boolRes & 0xFFFF;
 873
 874       // generate I-format output
 875       compute_PCMPxSTRx_gen_output(
 876          resV, resOSZACP,
 877          intRes1, zmaskL, zmaskR, validL, pol, idx, isxSTRM
 878       );
 879
 880       return True;
 881    }
 882
 883    return False;
 884 }
 885
 886
 887 /*---------------------------------------------------------------*/
 888 /*--- end                                 guest_generic_x87.c ---*/
 889 /*---------------------------------------------------------------*/