l4/pkg/libstdc++-v3/contrib/libstdc++-v3-4.9/include/bits/regex_scanner.tcc

   1 // class template regex -*- C++ -*-
   2
   3 // Copyright (C) 2013-2014 Free Software Foundation, Inc.
   4 //
   5 // This file is part of the GNU ISO C++ Library.  This library is free
   6 // software; you can redistribute it and/or modify it under the
   7 // terms of the GNU General Public License as published by the
   8 // Free Software Foundation; either version 3, or (at your option)
   9 // any later version.
  10
  11 // This library is distributed in the hope that it will be useful,
  12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
  13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  14 // GNU General Public License for more details.
  15
  16 // Under Section 7 of GPL version 3, you are granted additional
  17 // permissions described in the GCC Runtime Library Exception, version
  18 // 3.1, as published by the Free Software Foundation.
  19
  20 // You should have received a copy of the GNU General Public License and
  21 // a copy of the GCC Runtime Library Exception along with this program;
  22 // see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
  23 // <http://www.gnu.org/licenses/>.
  24
  25 /**
  26  *  @file bits/regex_scanner.tcc
  27  *  This is an internal header file, included by other library headers.
  28  *  Do not attempt to use it directly. @headername{regex}
  29  */
  30
  31 // FIXME make comments doxygen format.
  32
  33 // N3376 specified 6 regex styles: ECMAScript, basic, extended, grep, egrep
  34 // and awk
  35 // 1) grep is basic except '\n' is treated as '|'
  36 // 2) egrep is extended except '\n' is treated as '|'
  37 // 3) awk is extended except special escaping rules, and there's no
  38 //    back-reference.
  39 //
  40 // References:
  41 //
  42 // ECMAScript: ECMA-262 15.10
  43 //
  44 // basic, extended:
  45 // http://pubs.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap09.html
  46 //
  47 // awk: http://pubs.opengroup.org/onlinepubs/000095399/utilities/awk.html
  48
  49 namespace std _GLIBCXX_VISIBILITY(default)
  50 {
  51 namespace __detail
  52 {
  53 _GLIBCXX_BEGIN_NAMESPACE_VERSION
  54
  55   template<typename _CharT>
  56     _Scanner<_CharT>::
  57     _Scanner(typename _Scanner::_IterT __begin,
  58              typename _Scanner::_IterT __end,
  59              _FlagT __flags, std::locale __loc)
  60     : _ScannerBase(__flags),
  61       _M_current(__begin), _M_end(__end),
  62       _M_ctype(std::use_facet<_CtypeT>(__loc)),
  63       _M_eat_escape(_M_is_ecma()
  64                     ? &_Scanner::_M_eat_escape_ecma
  65                     : &_Scanner::_M_eat_escape_posix)
  66     { _M_advance(); }
  67
  68   template<typename _CharT>
  69     void
  70     _Scanner<_CharT>::
  71     _M_advance()
  72     {
  73       if (_M_current == _M_end)
  74         {
  75           _M_token = _S_token_eof;
  76           return;
  77         }
  78
  79       if (_M_state == _S_state_normal)
  80         _M_scan_normal();
  81       else if (_M_state == _S_state_in_bracket)
  82         _M_scan_in_bracket();
  83       else if (_M_state == _S_state_in_brace)
  84         _M_scan_in_brace();
  85       else
  86         _GLIBCXX_DEBUG_ASSERT(false);
  87     }
  88
  89   // Differences between styles:
  90   // 1) "\(", "\)", "\{" in basic. It's not escaping.
  91   // 2) "(?:", "(?=", "(?!" in ECMAScript.
  92   template<typename _CharT>
  93     void
  94     _Scanner<_CharT>::
  95     _M_scan_normal()
  96     {
  97       auto __c = *_M_current++;
  98       const char* __pos;
  99
 100       if (__c == '\\')
 101         {
 102           if (_M_current == _M_end)
 103             __throw_regex_error(regex_constants::error_escape);
 104
 105           if (!_M_is_basic()
 106               || (*_M_current != '('
 107                   && *_M_current != ')'
 108                   && *_M_current != '{'))
 109             {
 110               (this->*_M_eat_escape)();
 111               return;
 112             }
 113           __c = *_M_current++;
 114         }
 115       if (__c == '(')
 116         {
 117           if (_M_is_ecma() && *_M_current == '?')
 118             {
 119               if (++_M_current == _M_end)
 120                 __throw_regex_error(regex_constants::error_paren);
 121
 122               if (*_M_current == ':')
 123                 {
 124                   ++_M_current;
 125                   _M_token = _S_token_subexpr_no_group_begin;
 126                 }
 127               else if (*_M_current == '=')
 128                 {
 129                   ++_M_current;
 130                   _M_token = _S_token_subexpr_lookahead_begin;
 131                   _M_value.assign(1, 'p');
 132                 }
 133               else if (*_M_current == '!')
 134                 {
 135                   ++_M_current;
 136                   _M_token = _S_token_subexpr_lookahead_begin;
 137                   _M_value.assign(1, 'n');
 138                 }
 139               else
 140                 __throw_regex_error(regex_constants::error_paren);
 141             }
 142           else if (_M_flags & regex_constants::nosubs)
 143             _M_token = _S_token_subexpr_no_group_begin;
 144           else
 145             _M_token = _S_token_subexpr_begin;
 146         }
 147       else if (__c == ')')
 148         _M_token = _S_token_subexpr_end;
 149       else if (__c == '[')
 150         {
 151           _M_state = _S_state_in_bracket;
 152           _M_at_bracket_start = true;
 153           if (_M_current != _M_end && *_M_current == '^')
 154             {
 155               _M_token = _S_token_bracket_neg_begin;
 156               ++_M_current;
 157             }
 158           else
 159             _M_token = _S_token_bracket_begin;
 160         }
 161       else if (__c == '{')
 162         {
 163           _M_state = _S_state_in_brace;
 164           _M_token = _S_token_interval_begin;
 165         }
 166       else if (((__pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0')))
 167                   != nullptr
 168                 && *__pos != '\0'
 169                 && __c != ']'
 170                 && __c != '}')
 171                || (_M_is_grep() && __c == '\n'))
 172         {
 173           auto __it = _M_token_tbl;
 174           auto __narrowc = _M_ctype.narrow(__c, '\0');
 175           for (; __it->first != '\0'; ++__it)
 176             if (__it->first == __narrowc)
 177               {
 178                 _M_token = __it->second;
 179                 return;
 180               }
 181           _GLIBCXX_DEBUG_ASSERT(false);
 182         }
 183       else
 184         {
 185           _M_token = _S_token_ord_char;
 186           _M_value.assign(1, __c);
 187         }
 188     }
 189
 190   // Differences between styles:
 191   // 1) different semantics of "[]" and "[^]".
 192   // 2) Escaping in bracket expr.
 193   template<typename _CharT>
 194     void
 195     _Scanner<_CharT>::
 196     _M_scan_in_bracket()
 197     {
 198       if (_M_current == _M_end)
 199         __throw_regex_error(regex_constants::error_brack);
 200
 201       auto __c = *_M_current++;
 202
 203       if (__c == '[')
 204         {
 205           if (_M_current == _M_end)
 206             __throw_regex_error(regex_constants::error_brack);
 207
 208           if (*_M_current == '.')
 209             {
 210               _M_token = _S_token_collsymbol;
 211               _M_eat_class(*_M_current++);
 212             }
 213           else if (*_M_current == ':')
 214             {
 215               _M_token = _S_token_char_class_name;
 216               _M_eat_class(*_M_current++);
 217             }
 218           else if (*_M_current == '=')
 219             {
 220               _M_token = _S_token_equiv_class_name;
 221               _M_eat_class(*_M_current++);
 222             }
 223           else
 224             {
 225               _M_token = _S_token_ord_char;
 226               _M_value.assign(1, __c);
 227             }
 228         }
 229       // In POSIX, when encountering "[]" or "[^]", the ']' is interpreted
 230       // literally. So "[]]" or "[^]]" is valid regex. See the testcases
 231       // `*/empty_range.cc`.
 232       else if (__c == ']' && (_M_is_ecma() || !_M_at_bracket_start))
 233         {
 234           _M_token = _S_token_bracket_end;
 235           _M_state = _S_state_normal;
 236         }
 237       // ECMAScirpt and awk permmits escaping in bracket.
 238       else if (__c == '\\' && (_M_is_ecma() || _M_is_awk()))
 239         (this->*_M_eat_escape)();
 240       else
 241         {
 242           _M_token = _S_token_ord_char;
 243           _M_value.assign(1, __c);
 244         }
 245       _M_at_bracket_start = false;
 246     }
 247
 248   // Differences between styles:
 249   // 1) "\}" in basic style.
 250   template<typename _CharT>
 251     void
 252     _Scanner<_CharT>::
 253     _M_scan_in_brace()
 254     {
 255       if (_M_current == _M_end)
 256         __throw_regex_error(regex_constants::error_brace);
 257
 258       auto __c = *_M_current++;
 259
 260       if (_M_ctype.is(_CtypeT::digit, __c))
 261         {
 262           _M_token = _S_token_dup_count;
 263           _M_value.assign(1, __c);
 264           while (_M_current != _M_end
 265                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 266             _M_value += *_M_current++;
 267         }
 268       else if (__c == ',')
 269         _M_token = _S_token_comma;
 270       // basic use \}.
 271       else if (_M_is_basic())
 272         {
 273           if (__c == '\\' && _M_current != _M_end && *_M_current == '}')
 274             {
 275               _M_state = _S_state_normal;
 276               _M_token = _S_token_interval_end;
 277               ++_M_current;
 278             }
 279           else
 280             __throw_regex_error(regex_constants::error_badbrace);
 281         }
 282       else if (__c == '}')
 283         {
 284           _M_state = _S_state_normal;
 285           _M_token = _S_token_interval_end;
 286         }
 287       else
 288         __throw_regex_error(regex_constants::error_badbrace);
 289     }
 290
 291   template<typename _CharT>
 292     void
 293     _Scanner<_CharT>::
 294     _M_eat_escape_ecma()
 295     {
 296       if (_M_current == _M_end)
 297         __throw_regex_error(regex_constants::error_escape);
 298
 299       auto __c = *_M_current++;
 300       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 301
 302       if (__pos != nullptr && (__c != 'b' || _M_state == _S_state_in_bracket))
 303         {
 304           _M_token = _S_token_ord_char;
 305           _M_value.assign(1, *__pos);
 306         }
 307       else if (__c == 'b')
 308         {
 309           _M_token = _S_token_word_bound;
 310           _M_value.assign(1, 'p');
 311         }
 312       else if (__c == 'B')
 313         {
 314           _M_token = _S_token_word_bound;
 315           _M_value.assign(1, 'n');
 316         }
 317       // N3376 28.13
 318       else if (__c == 'd'
 319                || __c == 'D'
 320                || __c == 's'
 321                || __c == 'S'
 322                || __c == 'w'
 323                || __c == 'W')
 324         {
 325           _M_token = _S_token_quoted_class;
 326           _M_value.assign(1, __c);
 327         }
 328       else if (__c == 'c')
 329         {
 330           if (_M_current == _M_end)
 331             __throw_regex_error(regex_constants::error_escape);
 332           _M_token = _S_token_ord_char;
 333           _M_value.assign(1, *_M_current++);
 334         }
 335       else if (__c == 'x' || __c == 'u')
 336         {
 337           _M_value.erase();
 338           for (int i = 0; i < (__c == 'x' ? 2 : 4); i++)
 339             {
 340               if (_M_current == _M_end
 341                   || !_M_ctype.is(_CtypeT::xdigit, *_M_current))
 342                 __throw_regex_error(regex_constants::error_escape);
 343               _M_value += *_M_current++;
 344             }
 345           _M_token = _S_token_hex_num;
 346         }
 347       // ECMAScript recongnizes multi-digit back-references.
 348       else if (_M_ctype.is(_CtypeT::digit, __c))
 349         {
 350           _M_value.assign(1, __c);
 351           while (_M_current != _M_end
 352                  && _M_ctype.is(_CtypeT::digit, *_M_current))
 353             _M_value += *_M_current++;
 354           _M_token = _S_token_backref;
 355         }
 356       else
 357         {
 358           _M_token = _S_token_ord_char;
 359           _M_value.assign(1, __c);
 360         }
 361     }
 362
 363   // Differences between styles:
 364   // 1) Extended doesn't support backref, but basic does.
 365   template<typename _CharT>
 366     void
 367     _Scanner<_CharT>::
 368     _M_eat_escape_posix()
 369     {
 370       if (_M_current == _M_end)
 371         __throw_regex_error(regex_constants::error_escape);
 372
 373       auto __c = *_M_current;
 374       auto __pos = std::strchr(_M_spec_char, _M_ctype.narrow(__c, '\0'));
 375
 376       if (__pos != nullptr && *__pos != '\0')
 377         {
 378           _M_token = _S_token_ord_char;
 379           _M_value.assign(1, __c);
 380         }
 381       // We MUST judge awk before handling backrefs. There's no backref in awk.
 382       else if (_M_is_awk())
 383         {
 384           _M_eat_escape_awk();
 385           return;
 386         }
 387       else if (_M_is_basic() && _M_ctype.is(_CtypeT::digit, __c) && __c != '0')
 388         {
 389           _M_token = _S_token_backref;
 390           _M_value.assign(1, __c);
 391         }
 392       else
 393         {
 394 #ifdef __STRICT_ANSI__
 395           __throw_regex_error(regex_constants::error_escape);
 396 #else
 397           _M_token = _S_token_ord_char;
 398           _M_value.assign(1, __c);
 399 #endif
 400         }
 401       ++_M_current;
 402     }
 403
 404   template<typename _CharT>
 405     void
 406     _Scanner<_CharT>::
 407     _M_eat_escape_awk()
 408     {
 409       auto __c = *_M_current++;
 410       auto __pos = _M_find_escape(_M_ctype.narrow(__c, '\0'));
 411
 412       if (__pos != nullptr)
 413         {
 414           _M_token = _S_token_ord_char;
 415           _M_value.assign(1, *__pos);
 416         }
 417       // \ddd for oct representation
 418       else if (_M_ctype.is(_CtypeT::digit, __c)
 419                && __c != '8'
 420                && __c != '9')
 421         {
 422           _M_value.assign(1,  __c);
 423           for (int __i = 0;
 424                __i < 2
 425                && _M_current != _M_end
 426                && _M_ctype.is(_CtypeT::digit, *_M_current)
 427                && *_M_current != '8'
 428                && *_M_current != '9';
 429                __i++)
 430             _M_value += *_M_current++;
 431           _M_token = _S_token_oct_num;
 432           return;
 433         }
 434       else
 435         __throw_regex_error(regex_constants::error_escape);
 436     }
 437
 438   // Eats a character class or throwns an exception.
 439   // __ch cound be ':', '.' or '=', _M_current is the char after ']' when
 440   // returning.
 441   template<typename _CharT>
 442     void
 443     _Scanner<_CharT>::
 444     _M_eat_class(char __ch)
 445     {
 446       for (_M_value.clear(); _M_current != _M_end && *_M_current != __ch;)
 447         _M_value += *_M_current++;
 448       if (_M_current == _M_end
 449           || *_M_current++ != __ch
 450           || _M_current == _M_end // skip __ch
 451           || *_M_current++ != ']') // skip ']'
 452         {
 453           if (__ch == ':')
 454             __throw_regex_error(regex_constants::error_ctype);
 455           else
 456             __throw_regex_error(regex_constants::error_collate);
 457         }
 458     }
 459
 460 #ifdef _GLIBCXX_DEBUG
 461   template<typename _CharT>
 462     std::ostream&
 463     _Scanner<_CharT>::
 464     _M_print(std::ostream& ostr)
 465     {
 466       switch (_M_token)
 467       {
 468       case _S_token_anychar:
 469         ostr << "any-character\n";
 470         break;
 471       case _S_token_backref:
 472         ostr << "backref\n";
 473         break;
 474       case _S_token_bracket_begin:
 475         ostr << "bracket-begin\n";
 476         break;
 477       case _S_token_bracket_neg_begin:
 478         ostr << "bracket-neg-begin\n";
 479         break;
 480       case _S_token_bracket_end:
 481         ostr << "bracket-end\n";
 482         break;
 483       case _S_token_char_class_name:
 484         ostr << "char-class-name \"" << _M_value << "\"\n";
 485         break;
 486       case _S_token_closure0:
 487         ostr << "closure0\n";
 488         break;
 489       case _S_token_closure1:
 490         ostr << "closure1\n";
 491         break;
 492       case _S_token_collsymbol:
 493         ostr << "collsymbol \"" << _M_value << "\"\n";
 494         break;
 495       case _S_token_comma:
 496         ostr << "comma\n";
 497         break;
 498       case _S_token_dup_count:
 499         ostr << "dup count: " << _M_value << "\n";
 500         break;
 501       case _S_token_eof:
 502         ostr << "EOF\n";
 503         break;
 504       case _S_token_equiv_class_name:
 505         ostr << "equiv-class-name \"" << _M_value << "\"\n";
 506         break;
 507       case _S_token_interval_begin:
 508         ostr << "interval begin\n";
 509         break;
 510       case _S_token_interval_end:
 511         ostr << "interval end\n";
 512         break;
 513       case _S_token_line_begin:
 514         ostr << "line begin\n";
 515         break;
 516       case _S_token_line_end:
 517         ostr << "line end\n";
 518         break;
 519       case _S_token_opt:
 520         ostr << "opt\n";
 521         break;
 522       case _S_token_or:
 523         ostr << "or\n";
 524         break;
 525       case _S_token_ord_char:
 526         ostr << "ordinary character: \"" << _M_value << "\"\n";
 527         break;
 528       case _S_token_subexpr_begin:
 529         ostr << "subexpr begin\n";
 530         break;
 531       case _S_token_subexpr_no_group_begin:
 532         ostr << "no grouping subexpr begin\n";
 533         break;
 534       case _S_token_subexpr_lookahead_begin:
 535         ostr << "lookahead subexpr begin\n";
 536         break;
 537       case _S_token_subexpr_end:
 538         ostr << "subexpr end\n";
 539         break;
 540       case _S_token_unknown:
 541         ostr << "-- unknown token --\n";
 542         break;
 543       case _S_token_oct_num:
 544         ostr << "oct number " << _M_value << "\n";
 545         break;
 546       case _S_token_hex_num:
 547         ostr << "hex number " << _M_value << "\n";
 548         break;
 549       case _S_token_quoted_class:
 550         ostr << "quoted class " << "\\" << _M_value << "\n";
 551         break;
 552       default:
 553         _GLIBCXX_DEBUG_ASSERT(false);
 554       }
 555       return ostr;
 556     }
 557 #endif
 558
 559 _GLIBCXX_END_NAMESPACE_VERSION
 560 } // namespace __detail
 561 } // namespace