doc/ip-cref.tex

   1 \documentstyle[12pt,twoside]{article}
   2 \def\TITLE{IP Command Reference}
   3 \input preamble
   4 \begin{center}
   5 \Large\bf IP Command Reference.
   6 \end{center}
   7
   8
   9 \begin{center}
  10 { \large Alexey~N.~Kuznetsov } \\
  11 \em Institute for Nuclear Research, Moscow \\
  12 \verb|kuznet@ms2.inr.ac.ru| \\
  13 \rm April 14, 1999
  14 \end{center}
  15
  16 \vspace{5mm}
  17
  18 \tableofcontents
  19
  20 \newpage
  21
  22 \section{About this document}
  23
  24 This document presents a comprehensive description of the \verb|ip| utility
  25 from the \verb|iproute2| package. It is not a tutorial or user's guide.
  26 It is a {\em dictionary\/}, not explaining terms,
  27 but translating them into other terms, which may also be unknown to the reader.
  28 However, the document is self-contained and the reader, provided they have a
  29 basic networking background, will find enough information
  30 and examples to understand and configure Linux-2.2 IP and IPv6
  31 networking.
  32
  33 This document is split into sections explaining \verb|ip| commands
  34 and options, interpreting \verb|ip| output and providing a few examples.
  35 More voluminous examples and some topics, which require more elaborate
  36 discussion, are in the appendix.
  37
  38 The paragraphs beginning with NB contain side notes, warnings about
  39 bugs and design drawbacks. They may be skipped at the first reading.
  40
  41 \section{{\tt ip} --- command syntax}
  42
  43 The generic form of an \verb|ip| command is:
  44 \begin{verbatim}
  45 ip [ OPTIONS ] OBJECT [ COMMAND [ ARGUMENTS ]]
  46 \end{verbatim}
  47 where \verb|OPTIONS| is a set of optional modifiers affecting the
  48 general behaviour of the \verb|ip| utility or changing its output. All options
  49 begin with the character \verb|'-'| and may be used in either long or abbreviated
  50 forms. Currently, the following options are available:
  51
  52 \begin{itemize}
  53 \item \verb|-V|, \verb|-Version|
  54
  55 --- print the version of the \verb|ip| utility and exit.
  56
  57
  58 \item \verb|-s|, \verb|-stats|, \verb|-statistics|
  59
  60 --- output more information. If the option
  61 appears twice or more, the amount of information increases.
  62 As a rule, the information is statistics or some time values.
  63
  64
  65 \item \verb|-f|, \verb|-family| followed by a protocol family
  66 identifier: \verb|inet|, \verb|inet6| or \verb|link|.
  67
  68 --- enforce the protocol family to use. If the option is not present,
  69 the protocol family is guessed from other arguments. If the rest of the command
  70 line does not give enough information to guess the family, \verb|ip| falls back to the default
  71 one, usually \verb|inet| or \verb|any|. \verb|link| is a special family
  72 identifier meaning that no networking protocol is involved.
  73
  74 \item \verb|-4|
  75
  76 --- shortcut for \verb|-family inet|.
  77
  78 \item \verb|-6|
  79
  80 --- shortcut for \verb|-family inet6|.
  81
  82 \item \verb|-0|
  83
  84 --- shortcut for \verb|-family link|.
  85
  86
  87 \item \verb|-o|, \verb|-oneline|
  88
  89 --- output each record on a single line, replacing line feeds
  90 with the \verb|'\'| character. This is convenient when you want to
  91 count records with \verb|wc| or to \verb|grep| the output. The trivial
  92 script \verb|rtpr| converts the output back into readable form.
  93
  94 \item \verb|-r|, \verb|-resolve|
  95
  96 --- use the system's name resolver to print DNS names instead of
  97 host addresses.
  98
  99 \begin{NB}
 100  Do not use this option when reporting bugs or asking for advice.
 101 \end{NB}
 102 \begin{NB}
 103  \verb|ip| never uses DNS to resolve names to addresses.
 104 \end{NB}
 105
 106 \end{itemize}
 107
 108 \verb|OBJECT| is the object to manage or to get information about.
 109 The object types currently understood by \verb|ip| are:
 110
 111 \begin{itemize}
 112 \item \verb|link| --- network device
 113 \item \verb|address| --- protocol (IP or IPv6) address on a device
 114 \item \verb|neighbour| --- ARP or NDISC cache entry
 115 \item \verb|route| --- routing table entry
 116 \item \verb|rule| --- rule in routing policy database
 117 \item \verb|maddress| --- multicast address
 118 \item \verb|mroute| --- multicast routing cache entry
 119 \item \verb|tunnel| --- tunnel over IP
 120 \end{itemize}
 121
 122 Again, the names of all objects may be written in full or
 123 abbreviated form, f.e.\ \verb|address| is abbreviated as \verb|addr|
 124 or just \verb|a|.
 125
 126 \verb|COMMAND| specifies the action to perform on the object.
 127 The set of possible actions depends on the object type.
 128 As a rule, it is possible to \verb|add|, \verb|delete| and
 129 \verb|show| (or \verb|list|) objects, but some objects
 130 do not allow all of these operations or have some additional commands.
 131 The \verb|help| command is available for all objects. It prints
 132 out a list of available commands and argument syntax conventions.
 133
 134 If no command is given, some default command is assumed.
 135 Usually it is \verb|list| or, if the objects of this class
 136 cannot be listed, \verb|help|.
 137
 138 \verb|ARGUMENTS| is a list of arguments to the command.
 139 The arguments depend on the command and object. There are two types of arguments:
 140 {\em flags\/}, consisting of a single keyword, and {\em parameters\/},
 141 consisting of a keyword followed by a value. For convenience,
 142 each command has some {\em default parameter\/}
 143 which may be omitted. F.e.\ parameter \verb|dev| is the default
 144 for the {\tt ip link} command, so {\tt ip link ls eth0} is equivalent
 145 to {\tt ip link ls dev eth0}.
 146 In the command descriptions below such parameters
 147 are distinguished with the marker: ``(default)''.
 148
 149 Almost all keywords may be abbreviated with several first (or even single)
 150 letters. The shortcuts are convenient when \verb|ip| is used interactively,
 151 but they are not recommended in scripts or when reporting bugs
 152 or asking for advice. ``Officially'' allowed abbreviations are listed
 153 in the document body.
 154
 155
 156
 157 \section{{\tt ip} --- error messages}
 158
 159 \verb|ip| may fail for one of the following reasons:
 160
 161 \begin{itemize}
 162 \item
 163 A syntax error on the command line: an unknown keyword, incorrectly formatted
 164 IP address {\em et al\/}. In this case \verb|ip| prints an error message
 165 and exits. As a rule, the error message will contain information
 166 about the reason for the failure. Sometimes it also prints a help page.
 167
 168 \item
 169 The arguments did not pass verification for self-consistency.
 170
 171 \item
 172 \verb|ip| failed to compile a kernel request from the arguments
 173 because the user didn't give enough information.
 174
 175 \item
 176 The kernel returned an error to some syscall. In this case \verb|ip|
 177 prints the error message, as it is output with \verb|perror(3)|,
 178 prefixed with a comment and a syscall identifier.
 179
 180 \item
 181 The kernel returned an error to some RTNETLINK request.
 182 In this case \verb|ip| prints the error message, as it is output
 183 with \verb|perror(3)| prefixed with ``RTNETLINK answers:''.
 184
 185 \end{itemize}
 186
 187 All the operations are atomic, i.e.\
 188 if the \verb|ip| utility fails, it does not change anything
 189 in the system. One harmful exception is the \verb|ip link| command
 190 (Sec.~\ref{IP-LINK}, p.~\pageref{IP-LINK}),
 191 which may change only some of the device parameters given
 192 on the command line.
 193
 194 It is difficult to list all the error messages (especially
 195 syntax errors). However, as a rule, their meaning is clear
 196 from the context of the command.
 197
 198 The most common mistakes are:
 199
 200 \begin{enumerate}
 201 \item Netlink is not configured in the kernel. The message is:
 202 \begin{verbatim}
 203 Cannot open netlink socket: Invalid value
 204 \end{verbatim}
 205
 206 \item RTNETLINK is not configured in the kernel. In this case
 207 one of the following messages may be printed, depending on the command:
 208 \begin{verbatim}
 209 Cannot talk to rtnetlink: Connection refused
 210 Cannot send dump request: Connection refused
 211 \end{verbatim}
 212
 213 \item The \verb|CONFIG_IP_MULTIPLE_TABLES| option was not selected
 214 when configuring the kernel. In this case any attempt to use the
 215 \verb|ip| \verb|rule| command will fail, f.e.
 216 \begin{verbatim}
 217 kuznet@kaiser $ ip rule list
 218 RTNETLINK error: Invalid argument
 219 dump terminated
 220 \end{verbatim}
 221
 222 \end{enumerate}
 223
 224
 225 \section{{\tt ip link} --- network device configuration}
 226 \label{IP-LINK}
 227
 228 \paragraph{Object:} A \verb|link| is a network device and the corresponding
 229 commands display and change the state of devices.
 230
 231 \paragraph{Commands:} \verb|set| and \verb|show| (or \verb|list|).
 232
 233 \subsection{{\tt ip link set} --- change device attributes}
 234
 235 \paragraph{Abbreviations:} \verb|set|, \verb|s|.
 236
 237 \paragraph{Arguments:}
 238
 239 \begin{itemize}
 240 \item \verb|dev NAME| (default)
 241
 242 --- \verb|NAME| specifies the network device on which to operate.
 243
 244 \item \verb|up| and \verb|down|
 245
 246 --- change the state of the device to \verb|UP| or \verb|DOWN|.
 247
 248 \item \verb|arp on| or \verb|arp off|
 249
 250 --- change the \verb|NOARP| flag on the device.
 251
 252 \begin{NB}
 253 This operation is {\em not allowed\/} if the device is in state \verb|UP|.
 254 However, neither the \verb|ip| utility nor the kernel checks for this condition.
 255 You can get unpredictable results changing this flag while the
 256 device is running.
 257 \end{NB}
 258
 259 \item \verb|multicast on| or \verb|multicast off|
 260
 261 --- change the \verb|MULTICAST| flag on the device.
 262
 263 \item \verb|dynamic on| or \verb|dynamic off|
 264
 265 --- change the \verb|DYNAMIC| flag on the device.
 266
 267 \item \verb|name NAME|
 268
 269 --- change the name of the device. This operation is not
 270 recommended if the device is running or has some addresses
 271 already configured.
 272
 273 \item \verb|txqueuelen NUMBER| or \verb|txqlen NUMBER|
 274
 275 --- change the transmit queue length of the device.
 276
 277 \item \verb|mtu NUMBER|
 278
 279 --- change the MTU of the device.
 280
 281 \item \verb|address LLADDRESS|
 282
 283 --- change the station address of the interface.
 284
 285 \item \verb|broadcast LLADDRESS|, \verb|brd LLADDRESS| or \verb|peer LLADDRESS|
 286
 287 --- change the link layer broadcast address or the peer address when
 288 the interface is \verb|POINTOPOINT|.
 289
 290 \vskip 1mm
 291 \begin{NB}
 292 For most devices (f.e.\ for Ethernet) changing the link layer
 293 broadcast address will break networking.
 294 Do not use it, if you do not understand what this operation really does.
 295 \end{NB}
 296
 297 \item \verb|netns PID|
 298
 299 --- move the device to the network namespace associated with the process PID.
 300
 301 \end{itemize}
 302
 303 \vskip 1mm
 304 \begin{NB}
 305 The \verb|PROMISC| and \verb|ALLMULTI| flags are considered
 306 obsolete and should not be changed administratively, though
 307 the {\tt ip} utility will allow that.
 308 \end{NB}
 309
 310 \paragraph{Warning:} If multiple parameter changes are requested,
 311 \verb|ip| aborts immediately after any of the changes has failed.
 312 This is the only case when \verb|ip| can move the system to
 313 an unpredictable state. The solution is to avoid changing
 314 several parameters with one {\tt ip link set} call.
 315
 316 \paragraph{Examples:}
 317 \begin{itemize}
 318 \item \verb|ip link set dummy address 00:00:00:00:00:01|
 319
 320 --- change the station address of the interface \verb|dummy|.
 321
 322 \item \verb|ip link set dummy up|
 323
 324 --- start the interface \verb|dummy|.
 325
 326 \end{itemize}
 327
 328
 329 \subsection{{\tt ip link show} --- display device attributes}
 330 \label{IP-LINK-SHOW}
 331
 332 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|,
 333 \verb|l|.
 334
 335 \paragraph{Arguments:}
 336 \begin{itemize}
 337 \item \verb|dev NAME| (default)
 338
 339 --- \verb|NAME| specifies the network device to show.
 340 If this argument is omitted all devices are listed.
 341
 342 \item \verb|up|
 343
 344 --- only display running interfaces.
 345
 346 \end{itemize}
 347
 348
 349 \paragraph{Output format:}
 350
 351 \begin{verbatim}
 352 kuznet@alisa:~ $ ip link ls eth0
 353 3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
 354     link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
 355 kuznet@alisa:~ $ ip link ls sit0
 356 5: sit0@NONE: <NOARP,UP> mtu 1480 qdisc noqueue
 357     link/sit 0.0.0.0 brd 0.0.0.0
 358 kuznet@alisa:~ $ ip link ls dummy
 359 2: dummy: <BROADCAST,NOARP> mtu 1500 qdisc noop
 360     link/ether 00:00:00:00:00:00 brd ff:ff:ff:ff:ff:ff
 361 kuznet@alisa:~ $
 362 \end{verbatim}
 363
 364
 365 The number before each colon is an {\em interface index\/} or {\em ifindex\/}.
 366 This number uniquely identifies the interface. This is followed by the {\em interface name\/}
 367 (\verb|eth0|, \verb|sit0| etc.). The interface name is also
 368 unique at every given moment. However, the interface may disappear from the
 369 list (f.e.\ when the corresponding driver module is unloaded) and another
 370 one with the same name may be created later. Besides that,
 371 the administrator may change the name of any device with
 372 \verb|ip| \verb|link| \verb|set| \verb|name|
 373 to make it more intelligible.
 374
 375 The interface name may have another name or \verb|NONE| appended
 376 after the \verb|@| sign. This means that this device is bound to some other
 377 device,
 378 i.e.\ packets sent through it are encapsulated and sent via the ``master''
 379 device. If the name is \verb|NONE|, the master is unknown.
 380
 381 Then we see the interface {\em mtu\/} (``maximum transmission unit''). This determines
 382 the maximal size of data which can be sent as a single packet over this interface.
 383
 384 {\em qdisc\/} (``queuing discipline'') shows the queuing algorithm used
 385 on the interface. Particularly, \verb|noqueue| means that this interface
 386 does not queue anything and \verb|noop| means that the interface is in blackhole
 387 mode i.e.\ all packets sent to it are immediately discarded.
 388 {\em qlen\/} is the default transmit queue length of the device measured
 389 in packets.
 390
 391 The interface flags are summarized in the angle brackets.
 392
 393 \begin{itemize}
 394 \item \verb|UP| --- the device is turned on. It is ready to accept
 395 packets for transmission and it may inject into the kernel packets received
 396 from other nodes on the network.
 397
 398 \item \verb|LOOPBACK| --- the interface does not communicate with other
 399 hosts. All packets sent through it will be returned
 400 and nothing but bounced packets can be received.
 401
 402 \item \verb|BROADCAST| --- the device has the facility to send packets
 403 to all hosts sharing the same link. A typical example is an Ethernet link.
 404
 405 \item \verb|POINTOPOINT| --- the link has only two ends with one node
 406 attached to each end. All packets sent to this link will reach the peer
 407 and all packets received by us came from this single peer.
 408
 409 If neither \verb|LOOPBACK| nor \verb|BROADCAST| nor \verb|POINTOPOINT|
 410 is set, the interface is assumed to be NBMA (Non-Broadcast Multi-Access).
 411 This is the most generic type of device and the most complicated one, because
 412 the host attached to a NBMA link has no means to send to anyone
 413 without additionally configured information.
 414
 415 \item \verb|MULTICAST| --- is an advisory flag indicating that the interface
 416 is aware of multicasting i.e.\ sending packets to some subset of neighbouring
 417 nodes. Broadcasting is a particular case of multicasting, where the multicast
 418 group consists of all nodes on the link. It is important to emphasize
 419 that software {\em must not\/} interpret the absence of this flag as the inability
 420 to use multicasting on this interface. Any \verb|POINTOPOINT| and
 421 \verb|BROADCAST| link is multicasting by definition, because we have
 422 direct access to all the neighbours and, hence, to any part of them.
 423 Certainly, the use of high bandwidth multicast transfers is not recommended
 424 on broadcast-only links because of high expense, but it is not strictly
 425 prohibited.
 426
 427 \item \verb|PROMISC| --- the device listens to and feeds to the kernel all
 428 traffic on the link even if it is not destined for us, not broadcast
 429 and not destined for a multicast group of which we are a member. Usually
 430 this mode exists only on broadcast links and is used by bridges and for network
 431 monitoring.
 432
 433 \item \verb|ALLMULTI| --- the device receives all multicast packets
 434 wandering on the link. This mode is used by multicast routers.
 435
 436 \item \verb|NOARP| --- this flag is different from the other ones. It has
 437 no invariant value and its interpretation depends on the network protocols
 438 involved. As a rule, it indicates that the device needs no address
 439 resolution and that the software or hardware knows how to deliver packets
 440 without any help from the protocol stacks.
 441
 442 \item \verb|DYNAMIC| --- is an advisory flag indicating that the interface is
 443 dynamically created and destroyed.
 444
 445 \item \verb|SLAVE| --- this interface is bonded to some other interfaces
 446 to share link capacities.
 447
 448 \end{itemize}
 449
 450 \vskip 1mm
 451 \begin{NB}
 452 There are other flags but they are either obsolete (\verb|NOTRAILERS|)
 453 or not implemented (\verb|DEBUG|) or specific to some devices
 454 (\verb|MASTER|, \verb|AUTOMEDIA| and \verb|PORTSEL|). We do not discuss
 455 them here.
 456 \end{NB}
 457
 458
 459 The second line contains information on the link layer addresses
 460 associated with the device. The first word (\verb|ether|, \verb|sit|)
 461 defines the interface hardware type. This type determines the format and semantics
 462 of the addresses and is logically part of the address.
 463 The default format of the station address and the broadcast address
 464 (or the peer address for pointopoint links) is a
 465 sequence of hexadecimal bytes separated by colons, but some link
 466 types may have their natural address format, f.e.\ addresses
 467 of tunnels over IP are printed as dotted-quad IP addresses.
 468
 469 \vskip 1mm
 470 \begin{NB}
 471   NBMA links have no well-defined broadcast or peer address,
 472   however this field may contain useful information, f.e.\
 473   about the address of a broadcast relay or about the address of the ARP server.
 474 \end{NB}
 475 \begin{NB}
 476 Multicast addresses are not shown by this command, see
 477 \verb|ip maddr ls| in~Sec.\ref{IP-MADDR} (p.\pageref{IP-MADDR} of this
 478 document).
 479 \end{NB}
 480
 481
 482 \paragraph{Statistics:} With the \verb|-statistics| option, \verb|ip| also
 483 prints interface statistics:
 484
 485 \begin{verbatim}
 486 kuznet@alisa:~ $ ip -s link ls eth0
 487 3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
 488     link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
 489     RX: bytes  packets  errors  dropped overrun mcast
 490     2449949362 2786187  0       0       0       0
 491     TX: bytes  packets  errors  dropped carrier collsns
 492     178558497  1783945  332     0       332     35172
 493 kuznet@alisa:~ $
 494 \end{verbatim}
 495 \verb|RX:| and \verb|TX:| lines summarize receiver and transmitter
 496 statistics. They contain:
 497 \begin{itemize}
 498 \item \verb|bytes| --- the total number of bytes received or transmitted
 499 on the interface. This number wraps when the maximal length of the data type
 500 natural for the architecture is exceeded, so continuous monitoring requires
 501 a user level daemon snapping it periodically.
 502 \item \verb|packets| --- the total number of packets received or transmitted
 503 on the interface.
 504 \item \verb|errors| --- the total number of receiver or transmitter errors.
 505 \item \verb|dropped| --- the total number of packets dropped due to lack
 506 of resources.
 507 \item \verb|overrun| --- the total number of receiver overruns resulting
 508 in dropped packets. As a rule, if the interface is overrun, it means
 509 serious problems in the kernel or that your machine is too slow
 510 for this interface.
 511 \item \verb|mcast| --- the total number of received multicast packets. This statistic
 512 is only supported by a few devices.
 513 \item \verb|carrier| --- the total number of link media failures, f.e.\ because
 514 of lost carrier.
 515 \item \verb|collsns| --- the total number of collision events
 516 on Ethernet-like media. This number may have a different sense on other
 517 link types.
 518 \item \verb|compressed| --- the total number of compressed packets. This is
 519 available only for links using VJ header compression.
 520 \end{itemize}
 521
 522
 523 If the \verb|-s| option is entered twice or more,
 524 \verb|ip| prints more detailed statistics on receiver
 525 and transmitter errors.
 526
 527 \begin{verbatim}
 528 kuznet@alisa:~ $ ip -s -s link ls eth0
 529 3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
 530     link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
 531     RX: bytes  packets  errors  dropped overrun mcast
 532     2449949362 2786187  0       0       0       0
 533     RX errors: length   crc     frame   fifo    missed
 534                0        0       0       0       0
 535     TX: bytes  packets  errors  dropped carrier collsns
 536     178558497  1783945  332     0       332     35172
 537     TX errors: aborted  fifo    window  heartbeat
 538                0        0       0       332
 539 kuznet@alisa:~ $
 540 \end{verbatim}
 541 These error names are pure Ethernetisms. Other devices
 542 may have non-zero values in these fields but they may be
 543 interpreted differently.
 544
 545
 546 \section{{\tt ip address} --- protocol address management}
 547
 548 \paragraph{Abbreviations:} \verb|address|, \verb|addr|, \verb|a|.
 549
 550 \paragraph{Object:} The \verb|address| is a protocol (IP or IPv6) address attached
 551 to a network device. Each device must have at least one address
 552 to use the corresponding protocol. It is possible to have several
 553 different addresses attached to one device. These addresses are not
 554 discriminated, so that the term {\em alias\/} is not quite appropriate
 555 for them and we do not use it in this document.
 556
 557 The \verb|ip addr| command displays addresses and their properties,
 558 adds new addresses and deletes old ones.
 559
 560 \paragraph{Commands:} \verb|add|, \verb|delete|, \verb|flush| and \verb|show|
 561 (or \verb|list|).
 562
 563
 564 \subsection{{\tt ip address add} --- add a new protocol address}
 565 \label{IP-ADDR-ADD}
 566
 567 \paragraph{Abbreviations:} \verb|add|, \verb|a|.
 568
 569 \paragraph{Arguments:}
 570
 571 \begin{itemize}
 572 \item \verb|dev NAME|
 573
 574 \noindent--- the name of the device to add the address to.
 575
 576 \item \verb|local ADDRESS| (default)
 577
 578 --- the address of the interface. The format of the address depends
 579 on the protocol. It is a dotted quad for IP and a sequence of hexadecimal halfwords
 580 separated by colons for IPv6. The \verb|ADDRESS| may be followed by
 581 a slash and a decimal number which encodes the network prefix length.
 582
 583
 584 \item \verb|peer ADDRESS|
 585
 586 --- the address of the remote endpoint for pointopoint interfaces.
 587 Again, the \verb|ADDRESS| may be followed by a slash and a decimal number,
 588 encoding the network prefix length. If a peer address is specified,
 589 the local address {\em cannot\/} have a prefix length. The network prefix is associated
 590 with the peer rather than with the local address.
 591
 592
 593 \item \verb|broadcast ADDRESS|
 594
 595 --- the broadcast address on the interface.
 596
 597 It is possible to use the special symbols \verb|'+'| and \verb|'-'|
 598 instead of the broadcast address. In this case, the broadcast address
 599 is derived by setting/resetting the host bits of the interface prefix.
 600
 601 \vskip 1mm
 602 \begin{NB}
 603 Unlike \verb|ifconfig|, the \verb|ip| utility {\em does not\/} set any broadcast
 604 address unless explicitly requested.
 605 \end{NB}
 606
 607
 608 \item \verb|label NAME|
 609
 610 --- Each address may be tagged with a label string.
 611 In order to preserve compatibility with Linux-2.0 net aliases,
 612 this string must coincide with the name of the device or must be prefixed
 613 with the device name followed by a colon.
 614
 615
 616 \item \verb|scope SCOPE_VALUE|
 617
 618 --- the scope of the area where this address is valid.
 619 The available scopes are listed in file \verb|/etc/iproute2/rt_scopes|.
 620 Predefined scope values are:
 621
 622  \begin{itemize}
 623         \item \verb|global| --- the address is globally valid.
 624         \item \verb|site| --- (IPv6 only) the address is site local,
 625         i.e.\ it is valid inside this site.
 626         \item \verb|link| --- the address is link local, i.e.\
 627         it is valid only on this device.
 628         \item \verb|host| --- the address is valid only inside this host.
 629  \end{itemize}
 630
 631 Appendix~\ref{ADDR-SEL} (p.\pageref{ADDR-SEL} of this document)
 632 contains more details on address scopes.
 633
 634 \end{itemize}
 635
 636 \paragraph{Examples:}
 637 \begin{itemize}
 638 \item \verb|ip addr add 127.0.0.1/8 dev lo brd + scope host|
 639
 640 --- add the usual loopback address to the loopback device.
 641
 642 \item \verb|ip addr add 10.0.0.1/24 brd + dev eth0 label eth0:Alias|
 643
 644 --- add the address 10.0.0.1 with prefix length 24 (i.e.\ netmask
 645 \verb|255.255.255.0|), standard broadcast and label \verb|eth0:Alias|
 646 to the interface \verb|eth0|.
 647 \end{itemize}
 648
 649
 650 \subsection{{\tt ip address delete} --- delete a protocol address}
 651
 652 \paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
 653
 654 \paragraph{Arguments:} coincide with the arguments of \verb|ip addr add|.
 655 The device name is a required argument. The rest are optional.
 656 If no other arguments are given, the first address is deleted.
 657
 658 \paragraph{Examples:}
 659 \begin{itemize}
 660 \item \verb|ip addr del 127.0.0.1/8 dev lo|
 661
 662 --- deletes the loopback address from the loopback device.
 663 It would be best not to repeat this experiment.
 664
 665 \item Disable IP on the interface \verb|eth0|:
 666 \begin{verbatim}
 667   while ip -f inet addr del dev eth0; do
 668     : nothing
 669   done
 670 \end{verbatim}
 671 Another method to disable IP on an interface using {\tt ip addr flush}
 672 may be found in Sec.~\ref{IP-ADDR-FLUSH}, p.~\pageref{IP-ADDR-FLUSH}.
 673
 674 \end{itemize}
 675
 676
 677 \subsection{{\tt ip address show} --- display protocol addresses}
 678
 679 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|lst|, \verb|sh|, \verb|ls|,
 680 \verb|l|.
 681
 682 \paragraph{Arguments:}
 683
 684 \begin{itemize}
 685 \item \verb|dev NAME| (default)
 686
 687 --- the name of the device.
 688
 689 \item \verb|scope SCOPE_VAL|
 690
 691 --- only list addresses with this scope.
 692
 693 \item \verb|to PREFIX|
 694
 695 --- only list addresses matching this prefix.
 696
 697 \item \verb|label PATTERN|
 698
 699 --- only list addresses with labels matching the \verb|PATTERN|.
 700 \verb|PATTERN| is a usual shell style pattern.
 701
 702
 703 \item \verb|dynamic| and \verb|permanent|
 704
 705 --- (IPv6 only) only list addresses installed due to stateless
 706 address configuration or only list permanent (not dynamic) addresses.
 707
 708 \item \verb|tentative|
 709
 710 --- (IPv6 only) only list addresses which did not pass duplicate
 711 address detection.
 712
 713 \item \verb|deprecated|
 714
 715 --- (IPv6 only) only list deprecated addresses.
 716
 717
 718 \item  \verb|primary| and \verb|secondary|
 719
 720 --- only list primary (or secondary) addresses.
 721
 722 \end{itemize}
 723
 724
 725 \paragraph{Output format:}
 726
 727 \begin{verbatim}
 728 kuznet@alisa:~ $ ip addr ls eth0
 729 3: eth0: <BROADCAST,MULTICAST,UP> mtu 1500 qdisc cbq qlen 100
 730     link/ether 00:a0:cc:66:18:78 brd ff:ff:ff:ff:ff:ff
 731     inet 193.233.7.90/24 brd 193.233.7.255 scope global eth0
 732     inet6 3ffe:2400:0:1:2a0:ccff:fe66:1878/64 scope global dynamic
 733        valid_lft forever preferred_lft 604746sec
 734     inet6 fe80::2a0:ccff:fe66:1878/10 scope link
 735 kuznet@alisa:~ $
 736 \end{verbatim}
 737
 738 The first two lines coincide with the output of \verb|ip link ls|.
 739 It is natural to interpret link layer addresses
 740 as addresses of the protocol family \verb|AF_PACKET|.
 741
 742 Then the list of IP and IPv6 addresses follows, accompanied by
 743 additional address attributes: scope value (see Sec.\ref{IP-ADDR-ADD},
 744 p.\pageref{IP-ADDR-ADD} above), flags and the address label.
 745
 746 Address flags are set by the kernel and cannot be changed
 747 administratively. Currently, the following flags are defined:
 748
 749 \begin{enumerate}
 750 \item \verb|secondary|
 751
 752 --- the address is not used when selecting the default source address
 753 of outgoing packets (cf.\ Appendix~\ref{ADDR-SEL}, p.~\pageref{ADDR-SEL}).
 754 An IP address becomes secondary if another address with the same
 755 prefix bits already exists. The first address is primary.
 756 It is the leader of the group of all secondary addresses. When the leader
 757 is deleted, all secondaries are purged too.
 758 There is a tweak in \verb|/proc/sys/net/ipv4/conf/<dev>/promote_secondaries|
 759 which activates promotion of secondaries when a primary is deleted.
 760 To permanently enable this feature on all devices add
 761 \verb|net.ipv4.conf.all.promote_secondaries=1| to \verb|/etc/sysctl.conf|.
 762 This tweak is available in Linux 2.6.15 and later.
 763
 764
 765 \item \verb|dynamic|
 766
 767 --- the address was created due to stateless autoconfiguration~\cite{RFC-ADDRCONF}.
 768 In this case the output also contains information on how long
 769 the address remains valid. After \verb|preferred_lft| expires the address is
 770 moved to the deprecated state. After \verb|valid_lft| expires the address
 771 is finally invalidated.
 772
 773 \item \verb|deprecated|
 774
 775 --- the address is deprecated, i.e.\ it is still valid, but cannot
 776 be used by newly created connections.
 777
 778 \item \verb|tentative|
 779
 780 --- the address is not used because duplicate address detection~\cite{RFC-ADDRCONF}
 781 is still not complete or failed.
 782
 783 \end{enumerate}
 784
 785
 786 \subsection{{\tt ip address flush} --- flush protocol addresses}
 787 \label{IP-ADDR-FLUSH}
 788
 789 \paragraph{Abbreviations:} \verb|flush|, \verb|f|.
 790
 791 \paragraph{Description:}This command flushes the protocol addresses
 792 selected by some criteria.
 793
 794 \paragraph{Arguments:} This command has the same arguments as \verb|show|.
 795 The difference is that it does not run when no arguments are given.
 796
 797 \paragraph{Warning:} This command (and other \verb|flush| commands
 798 described below) is pretty dangerous. If you make a mistake, it will
 799 not forgive it, but will cruelly purge all the addresses.
 800
 801 \paragraph{Statistics:} With the \verb|-statistics| option, the command
 802 becomes verbose. It prints out the number of deleted addresses and the number
 803 of rounds made to flush the address list. If this option is given
 804 twice, \verb|ip addr flush| also dumps all the deleted addresses
 805 in the format described in the previous subsection.
 806
 807 \paragraph{Example:} Delete all the addresses from the private network
 808 10.0.0.0/8:
 809 \begin{verbatim}
 810 netadm@amber:~ # ip -s -s a f to 10/8
 811 2: dummy    inet 10.7.7.7/16 brd 10.7.255.255 scope global dummy
 812 3: eth0    inet 10.10.7.7/16 brd 10.10.255.255 scope global eth0
 813 4: eth1    inet 10.8.7.7/16 brd 10.8.255.255 scope global eth1
 814
 815 *** Round 1, deleting 3 addresses ***
 816 *** Flush is complete after 1 round ***
 817 netadm@amber:~ #
 818 \end{verbatim}
 819 Another instructive example is disabling IP on all the Ethernets:
 820 \begin{verbatim}
 821 netadm@amber:~ # ip -4 addr flush label "eth*"
 822 \end{verbatim}
 823 And the last example shows how to flush all the IPv6 addresses
 824 acquired by the host from stateless address autoconfiguration
 825 after you enabled forwarding or disabled autoconfiguration.
 826 \begin{verbatim}
 827 netadm@amber:~ # ip -6 addr flush dynamic
 828 \end{verbatim}
 829
 830
 831
 832 \section{{\tt ip neighbour} --- neighbour/arp tables management}
 833
 834 \paragraph{Abbreviations:} \verb|neighbour|, \verb|neighbor|, \verb|neigh|,
 835 \verb|n|.
 836
 837 \paragraph{Object:} \verb|neighbour| objects establish bindings between protocol
 838 addresses and link layer addresses for hosts sharing the same link.
 839 Neighbour entries are organized into tables. The IPv4 neighbour table
 840 is known by another name --- the ARP table.
 841
 842 The corresponding commands display neighbour bindings
 843 and their properties, add new neighbour entries and delete old ones.
 844
 845 \paragraph{Commands:} \verb|add|, \verb|change|, \verb|replace|,
 846 \verb|delete|, \verb|flush| and \verb|show| (or \verb|list|).
 847
 848 \paragraph{See also:} Appendix~\ref{PROXY-NEIGH}, p.\pageref{PROXY-NEIGH}
 849 describes how to manage proxy ARP/NDISC with the \verb|ip| utility.
 850
 851
 852 \subsection{{\tt ip neighbour add} --- add a new neighbour entry\\
 853         {\tt ip neighbour change} --- change an existing entry\\
 854         {\tt ip neighbour replace} --- add a new entry or change an existing one}
 855
 856 \paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
 857 \verb|replace|, \verb|repl|.
 858
 859 \paragraph{Description:} These commands create new neighbour records
 860 or update existing ones.
 861
 862 \paragraph{Arguments:}
 863
 864 \begin{itemize}
 865 \item \verb|to ADDRESS| (default)
 866
 867 --- the protocol address of the neighbour. It is either an IPv4 or IPv6 address.
 868
 869 \item \verb|dev NAME|
 870
 871 --- the interface to which this neighbour is attached.
 872
 873
 874 \item \verb|lladdr LLADDRESS|
 875
 876 --- the link layer address of the neighbour. \verb|LLADDRESS| can also be
 877 \verb|null|.
 878
 879 \item \verb|nud NUD_STATE|
 880
 881 --- the state of the neighbour entry. \verb|nud| is an abbreviation for ``Neighbour
 882 Unreachability Detection''. The state can take one of the following values:
 883
 884 \begin{enumerate}
 885 \item \verb|permanent| --- the neighbour entry is valid forever and can only be removed
 886 administratively.
 887 \item \verb|noarp| --- the neighbour entry is valid. No attempts to validate
 888 this entry will be made but it can be removed when its lifetime expires.
 889 \item \verb|reachable| --- the neighbour entry is valid until the reachability
 890 timeout expires.
 891 \item \verb|stale| --- the neighbour entry is valid but suspicious.
 892 This option to \verb|ip neigh| does not change the neighbour state if
 893 it was valid and the address is not changed by this command.
 894 \end{enumerate}
 895
 896 \end{itemize}
 897
 898 \paragraph{Examples:}
 899 \begin{itemize}
 900 \item \verb|ip neigh add 10.0.0.3 lladdr 0:0:0:0:0:1 dev eth0 nud perm|
 901
 902 --- add a permanent ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|.
 903
 904 \item \verb|ip neigh chg 10.0.0.3 dev eth0 nud reachable|
 905
 906 --- change its state to \verb|reachable|.
 907 \end{itemize}
 908
 909
 910 \subsection{{\tt ip neighbour delete} --- delete a neighbour entry}
 911
 912 \paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
 913
 914 \paragraph{Description:} This command invalidates a neighbour entry.
 915
 916 \paragraph{Arguments:} The arguments are the same as with \verb|ip neigh add|,
 917 except that \verb|lladdr| and \verb|nud| are ignored.
 918
 919
 920 \paragraph{Example:}
 921 \begin{itemize}
 922 \item \verb|ip neigh del 10.0.0.3 dev eth0|
 923
 924 --- invalidate an ARP entry for the neighbour 10.0.0.3 on the device \verb|eth0|.
 925
 926 \end{itemize}
 927
 928 \begin{NB}
 929  The deleted neighbour entry will not disappear from the tables
 930  immediately. If it is in use it cannot be deleted until the last
 931  client releases it. Otherwise it will be destroyed during
 932  the next garbage collection.
 933 \end{NB}
 934
 935
 936 \paragraph{Warning:} Attempts to delete or manually change
 937 a \verb|noarp| entry created by the kernel may result in unpredictable behaviour.
 938 Particularly, the kernel may try to resolve this address even
 939 on a \verb|NOARP| interface or if the address is multicast or broadcast.
 940
 941
 942 \subsection{{\tt ip neighbour show} --- list neighbour entries}
 943
 944 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|.
 945
 946 \paragraph{Description:}This command displays neighbour tables.
 947
 948 \paragraph{Arguments:}
 949
 950 \begin{itemize}
 951
 952 \item \verb|to ADDRESS| (default)
 953
 954 --- the prefix selecting the neighbours to list.
 955
 956 \item \verb|dev NAME|
 957
 958 --- only list the neighbours attached to this device.
 959
 960 \item \verb|unused|
 961
 962 --- only list neighbours which are not currently in use.
 963
 964 \item \verb|nud NUD_STATE|
 965
 966 --- only list neighbour entries in this state. \verb|NUD_STATE| takes
 967 values listed below or the special value \verb|all| which means all states.
 968 This option may occur more than once. If this option is absent, \verb|ip|
 969 lists all entries except for \verb|none| and \verb|noarp|.
 970
 971 \end{itemize}
 972
 973
 974 \paragraph{Output format:}
 975
 976 \begin{verbatim}
 977 kuznet@alisa:~ $ ip neigh ls
 978 :: dev lo lladdr 00:00:00:00:00:00 nud noarp
 979 fe80::200:cff:fe76:3f85 dev eth0 lladdr 00:00:0c:76:3f:85 router \
 980     nud stale
 981 0.0.0.0 dev lo lladdr 00:00:00:00:00:00 nud noarp
 982 193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 nud reachable
 983 193.233.7.85 dev eth0 lladdr 00:e0:1e:63:39:00 nud stale
 984 kuznet@alisa:~ $
 985 \end{verbatim}
 986
 987 The first word of each line is the protocol address of the neighbour.
 988 Then the device name follows. The rest of the line describes the contents of
 989 the neighbour entry identified by the pair (device, address).
 990
 991 \verb|lladdr| is the link layer address of the neighbour.
 992
 993 \verb|nud| is the state of the ``neighbour unreachability detection'' machine
 994 for this entry. The detailed description of the neighbour
 995 state machine can be found in~\cite{RFC-NDISC}. Here is the full list
 996 of the states with short descriptions:
 997
 998 \begin{enumerate}
 999 \item\verb|none| --- the state of the neighbour is void.
1000 \item\verb|incomplete| --- the neighbour is in the process of resolution.
1001 \item\verb|reachable| --- the neighbour is valid and apparently reachable.
1002 \item\verb|stale| --- the neighbour is valid, but is probably already
1003 unreachable, so the kernel will try to check it at the first transmission.
1004 \item\verb|delay| --- a packet has been sent to the stale neighbour and the kernel is waiting
1005 for confirmation.
1006 \item\verb|probe| --- the delay timer expired but no confirmation was received.
1007 The kernel has started to probe the neighbour with ARP/NDISC messages.
1008 \item\verb|failed| --- resolution has failed.
1009 \item\verb|noarp| --- the neighbour is valid. No attempts to check the entry
1010 will be made.
1011 \item\verb|permanent| --- it is a \verb|noarp| entry, but only the administrator
1012 may remove the entry from the neighbour table.
1013 \end{enumerate}
1014
1015 The link layer address is valid in all states except for \verb|none|,
1016 \verb|failed| and \verb|incomplete|.
1017
1018 IPv6 neighbours can be marked with the additional flag \verb|router|
1019 which means that the neighbour introduced itself as an IPv6 router~\cite{RFC-NDISC}.
1020
1021 \paragraph{Statistics:} The \verb|-statistics| option displays some usage
1022 statistics, f.e.\
1023
1024 \begin{verbatim}
1025 kuznet@alisa:~ $ ip -s n ls 193.233.7.254
1026 193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \
1027     nud reachable
1028 kuznet@alisa:~ $
1029 \end{verbatim}
1030
1031 Here \verb|ref| is the number of users of this entry
1032 and \verb|used| is a triplet of time intervals in seconds
1033 separated by slashes. In this case they show that:
1034
1035 \begin{enumerate}
1036 \item the entry was used 12 seconds ago.
1037 \item the entry was confirmed 13 seconds ago.
1038 \item the entry was updated 20 seconds ago.
1039 \end{enumerate}
1040
1041 \subsection{{\tt ip neighbour flush} --- flush neighbour entries}
1042
1043 \paragraph{Abbreviations:} \verb|flush|, \verb|f|.
1044
1045 \paragraph{Description:}This command flushes neighbour tables, selecting
1046 entries to flush by some criteria.
1047
1048 \paragraph{Arguments:} This command has the same arguments as \verb|show|.
1049 The differences are that it does not run when no arguments are given,
1050 and that the default neighbour states to be flushed do not include
1051 \verb|permanent| and \verb|noarp|.
1052
1053
1054 \paragraph{Statistics:} With the \verb|-statistics| option, the command
1055 becomes verbose. It prints out the number of deleted neighbours and the number
1056 of rounds made to flush the neighbour table. If the option is given
1057 twice, \verb|ip neigh flush| also dumps all the deleted neighbours
1058 in the format described in the previous subsection.
1059
1060 \paragraph{Example:}
1061 \begin{verbatim}
1062 netadm@alisa:~ # ip -s -s n f 193.233.7.254
1063 193.233.7.254 dev eth0 lladdr 00:00:0c:76:3f:85 ref 5 used 12/13/20 \
1064     nud reachable
1065
1066 *** Round 1, deleting 1 entries ***
1067 *** Flush is complete after 1 round ***
1068 netadm@alisa:~ #
1069 \end{verbatim}
1070
1071
1072 \section{{\tt ip route} --- routing table management}
1073 \label{IP-ROUTE}
1074
1075 \paragraph{Abbreviations:} \verb|route|, \verb|ro|, \verb|r|.
1076
1077 \paragraph{Object:} \verb|route| entries in the kernel routing tables keep
1078 information about paths to other networked nodes.
1079
1080 Each route entry has a {\em key\/} consisting of a {\em prefix\/}
1081 (i.e.\ a pair containing a network address and the length of its mask) and,
1082 optionally, the TOS value. An IP packet matches the route if the highest
1083 bits of its destination address are equal to the route prefix at least
1084 up to the prefix length and if the TOS of the route is zero or equal to
1085 the TOS of the packet.
1086
1087 If several routes match the packet, the following pruning rules
1088 are used to select the best one (see~\cite{RFC1812}):
1089 \begin{enumerate}
1090 \item The longest matching prefix is selected. All shorter ones
1091 are dropped.
1092
1093 \item If the TOS of some route with the longest prefix is equal to the TOS
1094 of the packet, the routes with different TOS are dropped.
1095
1096 If no exact TOS match was found and routes with TOS=0 exist,
1097 the rest of the routes are pruned.
1098
1099 Otherwise, the route lookup fails.
1100
1101 \item If several routes remain after the previous steps, then
1102 the routes with the best preference values are selected.
1103
1104 \item If we still have several routes, then the {\em first\/} of them
1105 is selected.
1106
1107 \begin{NB}
1108  Note the ambiguity of the last step. Unfortunately, Linux
1109  historically allows such a bizarre situation. The sense of the
1110 word ``first'' depends on the order of route additions and it is practically
1111 impossible to maintain a bundle of such routes in this order.
1112 \end{NB}
1113
1114 For simplicity we will limit ourselves to the case where such a situation
1115 is impossible and routes are uniquely identified by the triplet
1116 \{prefix, tos, preference\}. Actually, it is impossible to create
1117 non-unique routes with \verb|ip| commands described in this section.
1118
1119 One useful exception to this rule is the default route on non-forwarding
1120 hosts. It is ``officially'' allowed to have several fallback routes
1121 when several routers are present on directly connected networks.
1122 In this case, Linux-2.2 makes ``dead gateway detection''~\cite{RFC1122}
1123 controlled by neighbour unreachability detection and by advice
1124 from transport protocols to select a working router, so the order
1125 of the routes is not essential. However, in this case,
1126 fiddling with default routes manually is not recommended. Use the Router Discovery
1127 protocol (see Appendix~\ref{EXAMPLE-SETUP}, p.\pageref{EXAMPLE-SETUP})
1128 instead. Actually, Linux-2.2 IPv6 does not give user level applications
1129 any access to default routes.
1130 \end{enumerate}
1131
1132 Certainly, the steps above are not performed exactly
1133 in this sequence. Instead, the routing table in the kernel is kept
1134 in some data structure to achieve the final result
1135 with minimal cost. However, not depending on a particular
1136 routing algorithm implemented in the kernel, we can summarize
1137 the statements above as: a route is identified by the triplet
1138 \{prefix, tos, preference\}. This {\em key\/} lets us locate
1139 the route in the routing table.
1140
1141 \paragraph{Route attributes:} Each route key refers to a routing
1142 information record containing
1143 the data required to deliver IP packets (f.e.\ output device and
1144 next hop router) and some optional attributes (f.e.\ the path MTU or
1145 the preferred source address when communicating with this destination).
1146 These attributes are described in the following subsection.
1147
1148 \paragraph{Route types:} \label{IP-ROUTE-TYPES}
1149 It is important that the set
1150 of required and optional attributes depends on the route {\em type\/}.
1151 The most important route type
1152 is \verb|unicast|. It describes real paths to other hosts.
1153 As a rule, common routing tables contain only such routes. However,
1154 there are other types of routes with different semantics. The
1155 full list of types understood by Linux-2.2 is:
1156 \begin{itemize}
1157 \item \verb|unicast| --- the route entry describes real paths to the
1158 destinations covered by the route prefix.
1159 \item \verb|unreachable| --- these destinations are unreachable. Packets
1160 are discarded and the ICMP message {\em host unreachable\/} is generated.
1161 The local senders get an \verb|EHOSTUNREACH| error.
1162 \item \verb|blackhole| --- these destinations are unreachable. Packets
1163 are discarded silently. The local senders get an \verb|EINVAL| error.
1164 \item \verb|prohibit| --- these destinations are unreachable. Packets
1165 are discarded and the ICMP message {\em communication administratively
1166 prohibited\/} is generated. The local senders get an \verb|EACCES| error.
1167 \item \verb|local| --- the destinations are assigned to this
1168 host. The packets are looped back and delivered locally.
1169 \item \verb|broadcast| --- the destinations are broadcast addresses.
1170 The packets are sent as link broadcasts.
1171 \item \verb|throw| --- a special control route used together with policy
1172 rules (see sec.\ref{IP-RULE}, p.\pageref{IP-RULE}). If such a route is selected, lookup
1173 in this table is terminated pretending that no route was found.
1174 Without policy routing it is equivalent to the absence of the route in the routing
1175 table. The packets are dropped and the ICMP message {\em net unreachable\/}
1176 is generated. The local senders get an \verb|ENETUNREACH| error.
1177 \item \verb|nat| --- a special NAT route. Destinations covered by the prefix
1178 are considered to be dummy (or external) addresses which require translation
1179 to real (or internal) ones before forwarding. The addresses to translate to
1180 are selected with the attribute \verb|via|. More about NAT is
1181 in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}.
1182 \item \verb|anycast| --- ({\em not implemented\/}) the destinations are
1183 {\em anycast\/} addresses assigned to this host. They are mainly equivalent
1184 to \verb|local| with one difference: such addresses are invalid when used
1185 as the source address of any packet.
1186 \item \verb|multicast| --- a special type used for multicast routing.
1187 It is not present in normal routing tables.
1188 \end{itemize}
1189
1190 \paragraph{Route tables:} Linux-2.2 can pack routes into several routing
1191 tables identified by a number in the range from 1 to 255 or by
1192 name from the file \verb|/etc/iproute2/rt_tables|. By default all normal
1193 routes are inserted into the \verb|main| table (ID 254) and the kernel only uses
1194 this table when calculating routes.
1195
1196 Actually, one other table always exists, which is invisible but
1197 even more important. It is the \verb|local| table (ID 255). This table
1198 consists of routes for local and broadcast addresses. The kernel maintains
1199 this table automatically and the administrator usually need not modify it
1200 or even look at it.
1201
1202 The multiple routing tables enter the game when {\em policy routing\/}
1203 is used. See sec.\ref{IP-RULE}, p.\pageref{IP-RULE}.
1204 In this case, the table identifier effectively becomes
1205 one more parameter, which should be added to the triplet
1206 \{prefix, tos, preference\} to uniquely identify the route.
1207
1208
1209 \subsection{{\tt ip route add} --- add a new route\\
1210         {\tt ip route change} --- change a route\\
1211         {\tt ip route replace} --- change a route or add a new one}
1212 \label{IP-ROUTE-ADD}
1213
1214 \paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
1215         \verb|replace|, \verb|repl|.
1216
1217
1218 \paragraph{Arguments:}
1219 \begin{itemize}
1220 \item \verb|to PREFIX| or \verb|to TYPE PREFIX| (default)
1221
1222 --- the destination prefix of the route. If \verb|TYPE| is omitted,
1223 \verb|ip| assumes type \verb|unicast|. Other values of \verb|TYPE|
1224 are listed above. \verb|PREFIX| is an IP or IPv6 address optionally followed
1225 by a slash and the prefix length. If the length of the prefix is missing,
1226 \verb|ip| assumes a full-length host route. There is also a special
1227 \verb|PREFIX| --- \verb|default| --- which is equivalent to IP \verb|0/0| or
1228 to IPv6 \verb|::/0|.
1229
1230 \item \verb|tos TOS| or \verb|dsfield TOS|
1231
1232 --- the Type Of Service (TOS) key. This key has no associated mask and
1233 the longest match is understood as: First, compare the TOS
1234 of the route and of the packet. If they are not equal, then the packet
1235 may still match a route with a zero TOS. \verb|TOS| is either an 8 bit hexadecimal
1236 number or an identifier from {\tt /etc/iproute2/rt\_dsfield}.
1237
1238
1239 \item \verb|metric NUMBER| or \verb|preference NUMBER|
1240
1241 --- the preference value of the route. \verb|NUMBER| is an arbitrary 32bit number.
1242
1243 \item \verb|table TABLEID|
1244
1245 --- the table to add this route to.
1246 \verb|TABLEID| may be a number or a string from the file
1247 \verb|/etc/iproute2/rt_tables|. If this parameter is omitted,
1248 \verb|ip| assumes the \verb|main| table, with the exception of
1249 \verb|local|, \verb|broadcast| and \verb|nat| routes, which are
1250 put into the \verb|local| table by default.
1251
1252 \item \verb|dev NAME|
1253
1254 --- the output device name.
1255
1256 \item \verb|via ADDRESS|
1257
1258 --- the address of the nexthop router. Actually, the sense of this field depends
1259 on the route type. For normal \verb|unicast| routes it is either the true nexthop
1260 router or, if it is a direct route installed in BSD compatibility mode,
1261 it can be a local address of the interface.
1262 For NAT routes it is the first address of the block of translated IP destinations.
1263
1264 \item \verb|src ADDRESS|
1265
1266 --- the source address to prefer when sending to the destinations
1267 covered by the route prefix.
1268
1269 \item \verb|realm REALMID|
1270
1271 --- the realm to which this route is assigned.
1272 \verb|REALMID| may be a number or a string from the file
1273 \verb|/etc/iproute2/rt_realms|. Sec.\ref{RT-REALMS} (p.\pageref{RT-REALMS})
1274 contains more information on realms.
1275
1276 \item \verb|mtu MTU| or \verb|mtu lock MTU|
1277
1278 --- the MTU along the path to the destination. If the modifier \verb|lock| is
1279 not used, the MTU may be updated by the kernel due to Path MTU Discovery.
1280 If the modifier \verb|lock| is used, no path MTU discovery will be tried,
1281 all packets will be sent without the DF bit in the IPv4 case
1282 or fragmented to MTU for IPv6.
1283
1284 \item \verb|window NUMBER|
1285
1286 --- the maximal window for TCP to advertise to these destinations,
1287 measured in bytes. It limits maximal data bursts that our TCP
1288 peers are allowed to send to us.
1289
1290 \item \verb|rtt NUMBER|
1291
1292 --- the initial RTT (``Round Trip Time'') estimate.
1293
1294
1295 \item \verb|rttvar NUMBER|
1296
1297 --- \threeonly the initial RTT variance estimate.
1298
1299
1300 \item \verb|ssthresh NUMBER|
1301
1302 --- \threeonly an estimate for the initial slow start threshold.
1303
1304
1305 \item \verb|cwnd NUMBER|
1306
1307 --- \threeonly the clamp for congestion window. It is ignored if the \verb|lock|
1308     flag is not used.
1309
1310
1311 \item \verb|advmss NUMBER|
1312
1313 --- \threeonly the MSS (``Maximal Segment Size'') to advertise to these
1314     destinations when establishing TCP connections. If it is not given,
1315     Linux uses a default value calculated from the first hop device MTU.
1316
1317 \begin{NB}
1318   If the path to these destinations is asymmetric, this guess may be wrong.
1319 \end{NB}
1320
1321 \item \verb|reordering NUMBER|
1322
1323 --- \threeonly Maximal reordering on the path to this destination.
1324     If it is not given, Linux uses the value selected with \verb|sysctl|
1325     variable \verb|net/ipv4/tcp_reordering|.
1326
1327 \item \verb|hoplimit NUMBER|
1328
1329 --- [2.5.74+ only] Maximum number of hops on the path to this destination.
1330     The default is the value selected with the \verb|sysctl| variable
1331     \verb|net/ipv4/ip_default_ttl|.
1332
1333 \item \verb|initcwnd NUMBER|
1334 --- [2.5.70+ only] Initial congestion window size for connections to
1335     this destination. Actual window size is this value multiplied by the
1336     MSS (``Maximal Segment Size'') for the same connection. The default is
1337     zero, meaning to use the values specified in~\cite{RFC2414}.
1338
1339 \item \verb|initrwnd NUMBER|
1340
1341 --- [2.6.33+ only] Initial receive window size for connections to
1342     this destination. The actual window size is this value multiplied
1343     by the MSS (``Maximal Segment Size'') of the connection. The default
1344     value is zero, meaning to use the Slow Start value.
1345
1346 \item \verb|nexthop NEXTHOP|
1347
1348 --- the nexthop of a multipath route. \verb|NEXTHOP| is a complex value
1349 with its own syntax similar to the top level argument lists:
1350 \begin{itemize}
1351 \item \verb|via ADDRESS| is the nexthop router.
1352 \item \verb|dev NAME| is the output device.
1353 \item \verb|weight NUMBER| is a weight for this element of a multipath
1354 route reflecting its relative bandwidth or quality.
1355 \end{itemize}
1356
1357 \item \verb|scope SCOPE_VAL|
1358
1359 --- the scope of the destinations covered by the route prefix.
1360 \verb|SCOPE_VAL| may be a number or a string from the file
1361 \verb|/etc/iproute2/rt_scopes|.
1362 If this parameter is omitted,
1363 \verb|ip| assumes scope \verb|global| for all gatewayed \verb|unicast|
1364 routes, scope \verb|link| for direct \verb|unicast| and \verb|broadcast| routes
1365 and scope \verb|host| for \verb|local| routes.
1366
1367 \item \verb|protocol RTPROTO|
1368
1369 --- the routing protocol identifier of this route.
1370 \verb|RTPROTO| may be a number or a string from the file
1371 \verb|/etc/iproute2/rt_protos|. If the routing protocol ID is
1372 not given, \verb|ip| assumes protocol \verb|boot| (i.e.\
1373 it assumes the route was added by someone who doesn't
1374 understand what they are doing). Several protocol values have a fixed interpretation.
1375 Namely:
1376 \begin{itemize}
1377 \item \verb|redirect| --- the route was installed due to an ICMP redirect.
1378 \item \verb|kernel| --- the route was installed by the kernel during
1379 autoconfiguration.
1380 \item \verb|boot| --- the route was installed during the bootup sequence.
1381 If a routing daemon starts, it will purge all of them.
1382 \item \verb|static| --- the route was installed by the administrator
1383 to override dynamic routing. Routing daemons will respect them
1384 and, probably, even advertise them to their peers.
1385 \item \verb|ra| --- the route was installed by Router Discovery protocol.
1386 \end{itemize}
1387 The rest of the values are not reserved and the administrator is free
1388 to assign (or not to assign) protocol tags. At least, routing
1389 daemons should take care of setting some unique protocol values,
1390 f.e.\ as they are assigned in \verb|rtnetlink.h| or in \verb|rt_protos|
1391 database.
1392
1393
1394 \item \verb|onlink|
1395
1396 --- pretend that the nexthop is directly attached to this link,
1397 even if it does not match any interface prefix. One application of this
1398 option may be found in~\cite{IP-TUNNELS}.
1399
1400 \end{itemize}
1401
1402
1403 \begin{NB}
1404   Actually there are more commands: \verb|prepend| does the same
1405   thing as classic \verb|route add|, i.e.\ adds a route, even if another
1406   route to the same destination exists. Its opposite case is \verb|append|,
1407   which adds the route to the end of the list. Avoid these
1408   features.
1409 \end{NB}
1410 \begin{NB}
1411   More sad news, IPv6 only understands the \verb|append| command correctly.
1412   All the others are translated into \verb|append| commands. Certainly,
1413   this will change in the future.
1414 \end{NB}
1415
1416 \paragraph{Examples:}
1417 \begin{itemize}
1418 \item add a plain route to network 10.0.0/24 via gateway 193.233.7.65
1419 \begin{verbatim}
1420   ip route add 10.0.0/24 via 193.233.7.65
1421 \end{verbatim}
1422 \item change it to a direct route via the \verb|dummy| device
1423 \begin{verbatim}
1424   ip ro chg 10.0.0/24 dev dummy
1425 \end{verbatim}
1426 \item add a default multipath route splitting the load between \verb|ppp0|
1427 and \verb|ppp1|
1428 \begin{verbatim}
1429   ip route add default scope global nexthop dev ppp0 \
1430                                     nexthop dev ppp1
1431 \end{verbatim}
1432 Note the scope value. It is not necessary but it informs the kernel
1433 that this route is gatewayed rather than direct. Actually, if you
1434 know the addresses of remote endpoints it would be better to use the
1435 \verb|via| parameter.
1436 \item announce that the address 192.203.80.144 is not a real one, but
1437 should be translated to 193.233.7.83 before forwarding
1438 \begin{verbatim}
1439   ip route add nat 192.203.80.144 via 193.233.7.83
1440 \end{verbatim}
1441 Backward translation is set up with policy rules described
1442 in the following section (sec.\ref{IP-RULE}, p.\pageref{IP-RULE}).
1443 \end{itemize}
1444
1445 \subsection{{\tt ip route delete} --- delete a route}
1446
1447 \paragraph{Abbreviations:} \verb|delete|, \verb|del|, \verb|d|.
1448
1449 \paragraph{Arguments:} \verb|ip route del| has the same arguments as
1450 \verb|ip route add|, but their semantics are a bit different.
1451
1452 Key values (\verb|to|, \verb|tos|, \verb|preference| and \verb|table|)
1453 select the route to delete. If optional attributes are present, \verb|ip|
1454 verifies that they coincide with the attributes of the route to delete.
1455 If no route with the given key and attributes was found, \verb|ip route del|
1456 fails.
1457 \begin{NB}
1458 Linux-2.0 had the option to delete a route selected only by prefix address,
1459 ignoring its length (i.e.\ netmask). This option no longer exists
1460 because it was ambiguous. However, look at {\tt ip route flush}
1461 (sec.\ref{IP-ROUTE-FLUSH}, p.\pageref{IP-ROUTE-FLUSH}) which
1462 provides similar and even richer functionality.
1463 \end{NB}
1464
1465 \paragraph{Example:}
1466 \begin{itemize}
1467 \item delete the multipath route created by the command in the previous subsection
1468 \begin{verbatim}
1469   ip route del default scope global nexthop dev ppp0 \
1470                                     nexthop dev ppp1
1471 \end{verbatim}
1472 \end{itemize}
1473
1474
1475
1476 \subsection{{\tt ip route show} --- list routes}
1477
1478 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
1479
1480 \paragraph{Description:} The command displays the contents of the routing tables
1481 or the route(s) selected by some criteria.
1482
1483
1484 \paragraph{Arguments:}
1485 \begin{itemize}
1486 \item \verb|to SELECTOR| (default)
1487
1488 --- only select routes from the given range of destinations. \verb|SELECTOR|
1489 consists of an optional modifier (\verb|root|, \verb|match| or \verb|exact|)
1490 and a prefix. \verb|root PREFIX| selects routes with prefixes not shorter
1491 than \verb|PREFIX|. F.e.\ \verb|root 0/0| selects the entire routing table.
1492 \verb|match PREFIX| selects routes with prefixes not longer than
1493 \verb|PREFIX|. F.e.\ \verb|match 10.0/16| selects \verb|10.0/16|,
1494 \verb|10/8| and \verb|0/0|, but it does not select \verb|10.1/16| and
1495 \verb|10.0.0/24|. And \verb|exact PREFIX| (or just \verb|PREFIX|)
1496 selects routes with this exact prefix. If none of these options
1497 is present, \verb|ip| assumes \verb|root 0/0| i.e.\ it lists the entire table.
1498
1499
1500 \item \verb|tos TOS| or \verb|dsfield TOS|
1501
1502  --- only select routes with the given TOS.
1503
1504
1505 \item \verb|table TABLEID|
1506
1507  --- show the routes from the given table(s). The default setting is to show
1508 \verb|table| \verb|main|. \verb|TABLEID| may either be the ID of a real table
1509 or one of the special values:
1510   \begin{itemize}
1511   \item \verb|all| --- list all of the tables.
1512   \item \verb|cache| --- dump the routing cache.
1513   \end{itemize}
1514 \begin{NB}
1515   IPv6 has a single table. However, splitting it into \verb|main|, \verb|local|
1516   and \verb|cache| is emulated by the \verb|ip| utility.
1517 \end{NB}
1518
1519 \item \verb|cloned| or \verb|cached|
1520
1521 --- list cloned routes i.e.\ routes which were dynamically forked from
1522 other routes because some route attribute (f.e.\ MTU) was updated.
1523 Actually, it is equivalent to \verb|table cache|.
1524
1525 \item \verb|from SELECTOR|
1526
1527 --- the same syntax as for \verb|to|, but it binds the source address range
1528 rather than destinations. Note that the \verb|from| option only works with
1529 cloned routes.
1530
1531 \item \verb|protocol RTPROTO|
1532
1533 --- only list routes of this protocol.
1534
1535
1536 \item \verb|scope SCOPE_VAL|
1537
1538 --- only list routes with this scope.
1539
1540 \item \verb|type TYPE|
1541
1542 --- only list routes of this type.
1543
1544 \item \verb|dev NAME|
1545
1546 --- only list routes going via this device.
1547
1548 \item \verb|via PREFIX|
1549
1550 --- only list routes going via the nexthop routers selected by \verb|PREFIX|.
1551
1552 \item \verb|src PREFIX|
1553
1554 --- only list routes with preferred source addresses selected
1555 by \verb|PREFIX|.
1556
1557 \item \verb|realm REALMID| or \verb|realms FROMREALM/TOREALM|
1558
1559 --- only list routes with these realms.
1560
1561 \end{itemize}
1562
1563 \paragraph{Examples:} Let us count routes of protocol \verb|gated/bgp|
1564 on a router:
1565 \begin{verbatim}
1566 kuznet@amber:~ $ ip ro ls proto gated/bgp | wc
1567    1413    9891    79010
1568 kuznet@amber:~ $
1569 \end{verbatim}
1570 To count the size of the routing cache, we have to use the \verb|-o| option
1571 because cached attributes can take more than one line of output:
1572 \begin{verbatim}
1573 kuznet@amber:~ $ ip -o ro ls cloned | wc
1574    159    2543    18707
1575 kuznet@amber:~ $
1576 \end{verbatim}
1577
1578
1579 \paragraph{Output format:} The output of this command consists
1580 of per route records separated by line feeds.
1581 However, some records may consist
1582 of more than one line: particularly, this is the case when the route
1583 is cloned or you requested additional statistics. If the
1584 \verb|-o| option was given, then line feeds separating lines inside
1585 records are replaced with the backslash sign.
1586
1587 The output has the same syntax as arguments given to {\tt ip route add},
1588 so that it can be understood easily. F.e.\
1589 \begin{verbatim}
1590 kuznet@amber:~ $ ip ro ls 193.233.7/24
1591 193.233.7.0/24 dev eth0  proto gated/conn  scope link \
1592     src 193.233.7.65 realms inr.ac
1593 kuznet@amber:~ $
1594 \end{verbatim}
1595
1596 If you list cloned entries, the output contains other attributes which
1597 are evaluated during route calculation and updated during route
1598 lifetime. An example of the output is:
1599 \begin{verbatim}
1600 kuznet@amber:~ $ ip ro ls 193.233.7.82 tab cache
1601 193.233.7.82 from 193.233.7.82 dev eth0  src 193.233.7.65 \
1602   realms inr.ac/inr.ac
1603     cache <src-direct,redirect>  mtu 1500 rtt 300 iif eth0
1604 193.233.7.82 dev eth0  src 193.233.7.65 realms inr.ac
1605     cache  mtu 1500 rtt 300
1606 kuznet@amber:~ $
1607 \end{verbatim}
1608 \begin{NB}
1609   \label{NB-strange-route}
1610   The route looks a bit strange, doesn't it? Did you notice that
1611   it is a path from 193.233.7.82 back to 193.233.7.82? Well, you will
1612   see in the section on \verb|ip route get| (p.\pageref{NB-nature-of-strangeness})
1613   how it appeared.
1614 \end{NB}
1615 The second line, starting with the word \verb|cache|, shows
1616 additional attributes which normal routes do not possess.
1617 Cached flags are summarized in angle brackets:
1618 \begin{itemize}
1619 \item \verb|local| --- packets are delivered locally.
1620 It stands for loopback unicast routes, for broadcast routes
1621 and for multicast routes, if this host is a member of the corresponding
1622 group.
1623
1624 \item \verb|reject| --- the path is bad. Any attempt to use it results
1625 in an error. See attribute \verb|error| below (p.\pageref{IP-ROUTE-GET-error}).
1626
1627 \item \verb|mc| --- the destination is multicast.
1628
1629 \item \verb|brd| --- the destination is broadcast.
1630
1631 \item \verb|src-direct| --- the source is on a directly connected
1632 interface.
1633
1634 \item \verb|redirected| --- the route was created by an ICMP Redirect.
1635
1636 \item \verb|redirect| --- packets going via this route will
1637 trigger an ICMP redirect.
1638
1639 \item \verb|fastroute| --- the route is eligible to be used for fastroute.
1640
1641 \item \verb|equalize| --- make packet by packet randomization
1642 along this path.
1643
1644 \item \verb|dst-nat| --- the destination address requires translation.
1645
1646 \item \verb|src-nat| --- the source address requires translation.
1647
1648 \item \verb|masq| --- the source address requires masquerading.
1649 This feature disappeared in linux-2.4.
1650
1651 \item \verb|notify| --- ({\em not implemented}) change/deletion
1652 of this route will trigger RTNETLINK notification.
1653 \end{itemize}
1654
1655 Then some optional attributes follow:
1656 \begin{itemize}
1657 \item \verb|error| --- on \verb|reject| routes it is the error code
1658 returned to local senders when they try to use this route.
1659 These error codes are translated into ICMP error codes, sent to remote
1660 senders, according to the rules described above in the subsection
1661 devoted to route types (p.\pageref{IP-ROUTE-TYPES}).
1662 \label{IP-ROUTE-GET-error}
1663
1664 \item \verb|expires| --- this entry will expire after this timeout.
1665
1666 \item \verb|iif| --- the packets for this path are expected to arrive
1667 on this interface.
1668 \end{itemize}
1669
1670 \paragraph{Statistics:} With the \verb|-statistics| option, more
1671 information about this route is shown:
1672 \begin{itemize}
1673 \item \verb|users| --- the number of users of this entry.
1674 \item \verb|age| --- shows when this route was last used.
1675 \item \verb|used| --- the number of lookups of this route since its creation.
1676 \end{itemize}
1677
1678
1679 \subsection{{\tt ip route flush} --- flush routing tables}
1680 \label{IP-ROUTE-FLUSH}
1681
1682 \paragraph{Abbreviations:} \verb|flush|, \verb|f|.
1683
1684 \paragraph{Description:} this command flushes routes selected
1685 by some criteria.
1686
1687 \paragraph{Arguments:} the arguments have the same syntax and semantics
1688 as the arguments of \verb|ip route show|, but routing tables are not
1689 listed but purged. The only difference is the default action: \verb|show|
1690 dumps the entire IP main routing table but \verb|flush| prints the helper page.
1691 The reason for this difference does not require any explanation, does it?
1692
1693
1694 \paragraph{Statistics:} With the \verb|-statistics| option, the command
1695 becomes verbose. It prints out the number of deleted routes and the number
1696 of rounds made to flush the routing table. If the option is given
1697 twice, \verb|ip route flush| also dumps all the deleted routes
1698 in the format described in the previous subsection.
1699
1700 \paragraph{Examples:} The first example flushes all the
1701 gatewayed routes from the main table (f.e.\ after a routing daemon crash).
1702 \begin{verbatim}
1703 netadm@amber:~ # ip -4 ro flush scope global type unicast
1704 \end{verbatim}
1705 This option deserves to be put into a scriptlet \verb|routef|.
1706 \begin{NB}
1707 This option was described in the \verb|route(8)| man page borrowed
1708 from BSD, but was never implemented in Linux.
1709 \end{NB}
1710
1711 The second example flushes all IPv6 cloned routes:
1712 \begin{verbatim}
1713 netadm@amber:~ # ip -6 -s -s ro flush cache
1714 3ffe:2400::220:afff:fef4:c5d1 via 3ffe:2400::220:afff:fef4:c5d1 \
1715   dev eth0  metric 0
1716     cache  used 2 age 12sec mtu 1500 rtt 300
1717 3ffe:2400::280:adff:feb7:8034 via 3ffe:2400::280:adff:feb7:8034 \
1718   dev eth0  metric 0
1719     cache  used 2 age 15sec mtu 1500 rtt 300
1720 3ffe:2400::280:c8ff:fe59:5bcc via 3ffe:2400::280:c8ff:fe59:5bcc \
1721   dev eth0  metric 0
1722     cache  users 1 used 1 age 23sec mtu 1500 rtt 300
1723 3ffe:2400:0:1:2a0:ccff:fe66:1878 via 3ffe:2400:0:1:2a0:ccff:fe66:1878 \
1724   dev eth1  metric 0
1725     cache  used 2 age 20sec mtu 1500 rtt 300
1726 3ffe:2400:0:1:a00:20ff:fe71:fb30 via 3ffe:2400:0:1:a00:20ff:fe71:fb30 \
1727   dev eth1  metric 0
1728     cache  used 2 age 33sec mtu 1500 rtt 300
1729 ff02::1 via ff02::1 dev eth1  metric 0
1730     cache  users 1 used 1 age 45sec mtu 1500 rtt 300
1731
1732 *** Round 1, deleting 6 entries ***
1733 *** Flush is complete after 1 round ***
1734 netadm@amber:~ # ip -6 -s -s ro flush cache
1735 Nothing to flush.
1736 netadm@amber:~ #
1737 \end{verbatim}
1738
1739 The third example flushes BGP routing tables after a \verb|gated|
1740 death.
1741 \begin{verbatim}
1742 netadm@amber:~ # ip ro ls proto gated/bgp | wc
1743    1408    9856    78730
1744 netadm@amber:~ # ip -s ro f proto gated/bgp
1745
1746 *** Round 1, deleting 1408 entries ***
1747 *** Flush is complete after 1 round ***
1748 netadm@amber:~ # ip ro f proto gated/bgp
1749 Nothing to flush.
1750 netadm@amber:~ # ip ro ls proto gated/bgp
1751 netadm@amber:~ #
1752 \end{verbatim}
1753
1754
1755 \subsection{{\tt ip route get} --- get a single route}
1756 \label{IP-ROUTE-GET}
1757
1758 \paragraph{Abbreviations:} \verb|get|, \verb|g|.
1759
1760 \paragraph{Description:} this command gets a single route to a destination
1761 and prints its contents exactly as the kernel sees it.
1762
1763 \paragraph{Arguments:}
1764 \begin{itemize}
1765 \item \verb|to ADDRESS| (default)
1766
1767 --- the destination address.
1768
1769 \item \verb|from ADDRESS|
1770
1771 --- the source address.
1772
1773 \item \verb|tos TOS| or \verb|dsfield TOS|
1774
1775 --- the Type Of Service.
1776
1777 \item \verb|iif NAME|
1778
1779 --- the device from which this packet is expected to arrive.
1780
1781 \item \verb|oif NAME|
1782
1783 --- force the output device on which this packet will be routed.
1784
1785 \item \verb|connected|
1786
1787 --- if no source address (option \verb|from|) was given, relookup
1788 the route with the source set to the preferred address received from the first lookup.
1789 If policy routing is used, it may be a different route.
1790
1791 \end{itemize}
1792
1793 Note that this operation is not equivalent to \verb|ip route show|.
1794 \verb|show| shows existing routes. \verb|get| resolves them and
1795 creates new clones if necessary. Essentially, \verb|get|
1796 is equivalent to sending a packet along this path.
1797 If the \verb|iif| argument is not given, the kernel creates a route
1798 to output packets towards the requested destination.
1799 This is equivalent to pinging the destination
1800 with a subsequent {\tt ip route ls cache}, however, no packets are
1801 actually sent. With the \verb|iif| argument, the kernel pretends
1802 that a packet arrived from this interface and searches for
1803 a path to forward the packet.
1804
1805 \paragraph{Output format:} This command outputs routes in the same
1806 format as \verb|ip route ls|.
1807
1808 \paragraph{Examples:}
1809 \begin{itemize}
1810 \item Find a route to output packets to 193.233.7.82:
1811 \begin{verbatim}
1812 kuznet@amber:~ $ ip route get 193.233.7.82
1813 193.233.7.82 dev eth0  src 193.233.7.65 realms inr.ac
1814     cache  mtu 1500 rtt 300
1815 kuznet@amber:~ $
1816 \end{verbatim}
1817
1818 \item Find a route to forward packets arriving on \verb|eth0|
1819 from 193.233.7.82 and destined for 193.233.7.82:
1820 \begin{verbatim}
1821 kuznet@amber:~ $ ip r g 193.233.7.82 from 193.233.7.82 iif eth0
1822 193.233.7.82 from 193.233.7.82 dev eth0  src 193.233.7.65 \
1823   realms inr.ac/inr.ac
1824     cache <src-direct,redirect>  mtu 1500 rtt 300 iif eth0
1825 kuznet@amber:~ $
1826 \end{verbatim}
1827 \begin{NB}
1828   \label{NB-nature-of-strangeness}
1829   This is the command that created the funny route from 193.233.7.82
1830   looped back to 193.233.7.82 (cf.\ NB on~p.\pageref{NB-strange-route}).
1831   Note the \verb|redirect| flag on it.
1832 \end{NB}
1833
1834 \item Find a multicast route for packets arriving on \verb|eth0|
1835 from host 193.233.7.82 and destined for multicast group 224.2.127.254
1836 (it is assumed that a multicast routing daemon is running.
1837 In this case, it is \verb|pimd|):
1838 \begin{verbatim}
1839 kuznet@amber:~ $ ip r g 224.2.127.254 from 193.233.7.82 iif eth0
1840 multicast 224.2.127.254 from 193.233.7.82 dev lo  \
1841   src 193.233.7.65 realms inr.ac/cosmos
1842     cache <mc> iif eth0 Oifs: eth1 pimreg
1843 kuznet@amber:~ $
1844 \end{verbatim}
1845 This route differs from the ones seen before. It contains a ``normal'' part
1846 and a ``multicast'' part. The normal part is used to deliver (or not to
1847 deliver) the packet to local IP listeners. In this case the router
1848 is not a member
1849 of this group, so that route has no \verb|local| flag and only
1850 forwards packets. The output device for such entries is always loopback.
1851 The multicast part consists of an additional \verb|Oifs:| list showing
1852 the output interfaces.
1853 \end{itemize}
1854
1855
1856 It is time for a more complicated example. Let us add an invalid
1857 gatewayed route for a destination which is really directly connected:
1858 \begin{verbatim}
1859 netadm@alisa:~ # ip route add 193.233.7.98 via 193.233.7.254
1860 netadm@alisa:~ # ip route get 193.233.7.98
1861 193.233.7.98 via 193.233.7.254 dev eth0  src 193.233.7.90
1862     cache  mtu 1500 rtt 3072
1863 netadm@alisa:~ #
1864 \end{verbatim}
1865 and probe it with ping:
1866 \begin{verbatim}
1867 netadm@alisa:~ # ping -n 193.233.7.98
1868 PING 193.233.7.98 (193.233.7.98) from 193.233.7.90 : 56 data bytes
1869 From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98)
1870 64 bytes from 193.233.7.98: icmp_seq=0 ttl=255 time=3.5 ms
1871 From 193.233.7.254: Redirect Host(New nexthop: 193.233.7.98)
1872 64 bytes from 193.233.7.98: icmp_seq=1 ttl=255 time=2.2 ms
1873 64 bytes from 193.233.7.98: icmp_seq=2 ttl=255 time=0.4 ms
1874 64 bytes from 193.233.7.98: icmp_seq=3 ttl=255 time=0.4 ms
1875 64 bytes from 193.233.7.98: icmp_seq=4 ttl=255 time=0.4 ms
1876 ^C
1877 --- 193.233.7.98 ping statistics ---
1878 5 packets transmitted, 5 packets received, 0% packet loss
1879 round-trip min/avg/max = 0.4/1.3/3.5 ms
1880 netadm@alisa:~ #
1881 \end{verbatim}
1882 What happened? Router 193.233.7.254 understood that we have a much
1883 better path to the destination and sent us an ICMP redirect message.
1884 We may retry \verb|ip route get| to see what we have in the routing
1885 tables now:
1886 \begin{verbatim}
1887 netadm@alisa:~ # ip route get 193.233.7.98
1888 193.233.7.98 dev eth0  src 193.233.7.90
1889     cache <redirected>  mtu 1500 rtt 3072
1890 netadm@alisa:~ #
1891 \end{verbatim}
1892
1893
1894
1895 \section{{\tt ip rule} --- routing policy database management}
1896 \label{IP-RULE}
1897
1898 \paragraph{Abbreviations:} \verb|rule|, \verb|ru|.
1899
1900 \paragraph{Object:} \verb|rule|s in the routing policy database control
1901 the route selection algorithm.
1902
1903 Classic routing algorithms used in the Internet make routing decisions
1904 based only on the destination address of packets (and in theory,
1905 but not in practice, on the TOS field). The seminal review of classic
1906 routing algorithms and their modifications can be found in~\cite{RFC1812}.
1907
1908 In some circumstances we want to route packets differently depending not only
1909 on destination addresses, but also on other packet fields: source address,
1910 IP protocol, transport protocol ports or even packet payload.
1911 This task is called ``policy routing''.
1912
1913 \begin{NB}
1914   ``policy routing'' $\neq$ ``routing policy''.
1915
1916 \noindent       ``policy routing'' $=$ ``cunning routing''.
1917
1918 \noindent       ``routing policy'' $=$ ``routing tactics'' or ``routing plan''.
1919 \end{NB}
1920
1921 To solve this task, the conventional destination based routing table, ordered
1922 according to the longest match rule, is replaced with a ``routing policy
1923 database'' (or RPDB), which selects routes
1924 by executing some set of rules. The rules may have lots of keys of different
1925 natures and therefore they have no natural ordering, but one imposed
1926 by the administrator. Linux-2.2 RPDB is a linear list of rules
1927 ordered by numeric priority value.
1928 RPDB explicitly allows matching a few packet fields:
1929
1930 \begin{itemize}
1931 \item packet source address.
1932 \item packet destination address.
1933 \item TOS.
1934 \item incoming interface (which is packet metadata, rather than a packet field).
1935 \end{itemize}
1936
1937 Matching IP protocols and transport ports is also possible,
1938 indirectly, via \verb|ipchains|, by exploiting their ability
1939 to mark some classes of packets with \verb|fwmark|. Therefore,
1940 \verb|fwmark| is also included in the set of keys checked by rules.
1941
1942 Each policy routing rule consists of a {\em selector\/} and an {\em action\/}
1943 predicate. The RPDB is scanned in the order of increasing priority. The selector
1944 of each rule is applied to \{source address, destination address, incoming
1945 interface, tos, fwmark\} and, if the selector matches the packet,
1946 the action is performed.  The action predicate may return with success.
1947 In this case, it will either give a route or failure indication
1948 and the RPDB lookup is terminated. Otherwise, the RPDB program
1949 continues on the next rule.
1950
1951 What is the action, semantically? The natural action is to select the
1952 nexthop and the output device. This is what
1953 Cisco IOS~\cite{IOS} does. Let us call it ``match \& set''.
1954 The Linux-2.2 approach is more flexible. The action includes
1955 lookups in destination-based routing tables and selecting
1956 a route from these tables according to the classic longest match algorithm.
1957 The ``match \& set'' approach is the simplest case of the Linux one. It is realized
1958 when a second level routing table contains a single default route.
1959 Recall that Linux-2.2 supports multiple tables
1960 managed with the \verb|ip route| command, described in the previous section.
1961
1962 At startup time the kernel configures the default RPDB consisting of three
1963 rules:
1964
1965 \begin{enumerate}
1966 \item Priority: 0, Selector: match anything, Action: lookup routing
1967 table \verb|local| (ID 255).
1968 The \verb|local| table is a special routing table containing
1969 high priority control routes for local and broadcast addresses.
1970
1971 Rule 0 is special. It cannot be deleted or overridden.
1972
1973
1974 \item Priority: 32766, Selector: match anything, Action: lookup routing
1975 table \verb|main| (ID 254).
1976 The \verb|main| table is the normal routing table containing all non-policy
1977 routes. This rule may be deleted and/or overridden with other
1978 ones by the administrator.
1979
1980 \item Priority: 32767, Selector: match anything, Action: lookup routing
1981 table \verb|default| (ID 253).
1982 The \verb|default| table is empty. It is reserved for some
1983 post-processing if no previous default rules selected the packet.
1984 This rule may also be deleted.
1985
1986 \end{enumerate}
1987
1988 Do not confuse routing tables with rules: rules point to routing tables,
1989 several rules may refer to one routing table and some routing tables
1990 may have no rules pointing to them. If the administrator deletes all the rules
1991 referring to a table, the table is not used, but it still exists
1992 and will disappear only after all the routes contained in it are deleted.
1993
1994
1995 \paragraph{Rule attributes:} Each RPDB entry has additional
1996 attributes. F.e.\ each rule has a pointer to some routing
1997 table. NAT and masquerading rules have an attribute to select a new IP
1998 address to translate/masquerade. Besides that, rules have some
1999 optional attributes, which routes have, namely \verb|realms|.
2000 These values do not override those contained in the routing tables. They
2001 are only used if the route did not select any attributes.
2002
2003
2004 \paragraph{Rule types:} The RPDB may contain rules of the following
2005 types:
2006 \begin{itemize}
2007 \item \verb|unicast| --- the rule prescribes to return the route found
2008 in the routing table referenced by the rule.
2009 \item \verb|blackhole| --- the rule prescribes to silently drop the packet.
2010 \item \verb|unreachable| --- the rule prescribes to generate a ``Network
2011 is unreachable'' error.
2012 \item \verb|prohibit| --- the rule prescribes to generate
2013 a ``Communication is administratively prohibited'' error.
2014 \item \verb|nat| --- the rule prescribes to translate the source address
2015 of the IP packet into some other value. More about NAT is
2016 in Appendix~\ref{ROUTE-NAT}, p.\pageref{ROUTE-NAT}.
2017 \end{itemize}
2018
2019
2020 \paragraph{Commands:} \verb|add|, \verb|delete| and \verb|show|
2021 (or \verb|list|).
2022
2023 \subsection{{\tt ip rule add} --- insert a new rule\\
2024         {\tt ip rule delete} --- delete a rule}
2025 \label{IP-RULE-ADD}
2026
2027 \paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|,
2028         \verb|d|.
2029
2030 \paragraph{Arguments:}
2031
2032 \begin{itemize}
2033 \item \verb|type TYPE| (default)
2034
2035 --- the type of this rule. The list of valid types was given in the previous
2036 subsection.
2037
2038 \item \verb|from PREFIX|
2039
2040 --- select the source prefix to match.
2041
2042 \item \verb|to PREFIX|
2043
2044 --- select the destination prefix to match.
2045
2046 \item \verb|iif NAME|
2047
2048 --- select the incoming device to match. If the interface is loopback,
2049 the rule only matches packets originating from this host. This means that you
2050 may create separate routing tables for forwarded and local packets and,
2051 hence, completely segregate them.
2052
2053 \item \verb|tos TOS| or \verb|dsfield TOS|
2054
2055 --- select the TOS value to match.
2056
2057 \item \verb|fwmark MARK|
2058
2059 --- select the \verb|fwmark| value to match.
2060
2061 \item \verb|priority PREFERENCE|
2062
2063 --- the priority of this rule. Each rule should have an explicitly
2064 set {\em unique\/} priority value.
2065 \begin{NB}
2066   Really, for historical reasons \verb|ip rule add| does not require a
2067   priority value and allows them to be non-unique.
2068   If the user does not supply a priority, it is selected by the kernel.
2069   If the user creates a rule with a priority value that
2070   already exists, the kernel does not reject the request. It adds
2071   the new rule before all old rules of the same priority.
2072
2073   It is a mistake in design, no more. And it will be fixed one day,
2074   so do not rely on this feature. Use explicit priorities.
2075 \end{NB}
2076
2077
2078 \item \verb|table TABLEID|
2079
2080 --- the routing table identifier to look up if the rule selector matches.
2081
2082 \item \verb|realms FROM/TO|
2083
2084 --- Realms to select if the rule matched and the routing table lookup
2085 succeeded. Realm \verb|TO| is only used if the route did not select
2086 any realm.
2087
2088 \item \verb|nat ADDRESS|
2089
2090 --- The base of the IP address block to translate (for source addresses).
2091 The \verb|ADDRESS| may be either the start of the block of NAT addresses
2092 (selected by NAT routes) or in linux-2.2 a local host address (or even zero).
2093 In the last case the router does not translate the packets,
2094 but masquerades them to this address; this feature disappeared in 2.4.
2095 More about NAT is in Appendix~\ref{ROUTE-NAT},
2096 p.\pageref{ROUTE-NAT}.
2097
2098 \end{itemize}
2099
2100 \paragraph{Warning:} Changes to the RPDB made with these commands
2101 do not become active immediately. It is assumed that after
2102 a script finishes a batch of updates, it flushes the routing cache
2103 with \verb|ip route flush cache|.
2104
2105 \paragraph{Examples:}
2106 \begin{itemize}
2107 \item Route packets with source addresses from 192.203.80/24
2108 according to routing table \verb|inr.ruhep|:
2109 \begin{verbatim}
2110 ip ru add from 192.203.80.0/24 table inr.ruhep prio 220
2111 \end{verbatim}
2112
2113 \item Translate packet source address 193.233.7.83 into 192.203.80.144
2114 and route it according to table \#1 (actually, it is \verb|inr.ruhep|):
2115 \begin{verbatim}
2116 ip ru add from 193.233.7.83 nat 192.203.80.144 table 1 prio 320
2117 \end{verbatim}
2118
2119 \item Delete the unused default rule:
2120 \begin{verbatim}
2121 ip ru del prio 32767
2122 \end{verbatim}
2123
2124 \end{itemize}
2125
2126
2127
2128 \subsection{{\tt ip rule show} --- list rules}
2129 \label{IP-RULE-SHOW}
2130
2131 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
2132
2133
2134 \paragraph{Arguments:} Good news, this is one command that has no arguments.
2135
2136 \paragraph{Output format:}
2137
2138 \begin{verbatim}
2139 kuznet@amber:~ $ ip ru ls
2140 0:      from all lookup local
2141 200:    from 192.203.80.0/24 to 193.233.7.0/24 lookup main
2142 210:    from 192.203.80.0/24 to 192.203.80.0/24 lookup main
2143 220:    from 192.203.80.0/24 lookup inr.ruhep realms inr.ruhep/radio-msu
2144 300:    from 193.233.7.83 to 193.233.7.0/24 lookup main
2145 310:    from 193.233.7.83 to 192.203.80.0/24 lookup main
2146 320:    from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144
2147 32766:  from all lookup main
2148 kuznet@amber:~ $
2149 \end{verbatim}
2150
2151 In the first column is the rule priority value followed
2152 by a colon. Then the selectors follow. Each key is prefixed
2153 with the same keyword that was used to create the rule.
2154
2155 The keyword \verb|lookup| is followed by a routing table identifier,
2156 as it is recorded in the file \verb|/etc/iproute2/rt_tables|.
2157
2158 If the rule does NAT (f.e.\ rule \#320), it is shown by the keyword
2159 \verb|map-to| followed by the start of the block of addresses to map.
2160
2161 The sense of this example is pretty simple. The prefixes
2162 192.203.80.0/24 and 193.233.7.0/24 form the internal network, but
2163 they are routed differently when the packets leave it.
2164 Besides that, the host 193.233.7.83 is translated into
2165 another prefix to look like 192.203.80.144 when talking
2166 to the outer world.
2167
2168
2169
2170 \section{{\tt ip maddress} --- multicast addresses management}
2171 \label{IP-MADDR}
2172
2173 \paragraph{Object:} \verb|maddress| objects are multicast addresses.
2174
2175 \paragraph{Commands:} \verb|add|, \verb|delete|, \verb|show| (or \verb|list|).
2176
2177 \subsection{{\tt ip maddress show} --- list multicast addresses}
2178
2179 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
2180
2181 \paragraph{Arguments:}
2182
2183 \begin{itemize}
2184
2185 \item \verb|dev NAME| (default)
2186
2187 --- the device name.
2188
2189 \end{itemize}
2190
2191 \paragraph{Output format:}
2192
2193 \begin{verbatim}
2194 kuznet@alisa:~ $ ip maddr ls dummy
2195 2:  dummy
2196     link  33:33:00:00:00:01
2197     link  01:00:5e:00:00:01
2198     inet  224.0.0.1 users 2
2199     inet6 ff02::1
2200 kuznet@alisa:~ $
2201 \end{verbatim}
2202
2203 The first line of the output shows the interface index and its name.
2204 Then the multicast address list follows. Each line starts with the
2205 protocol identifier. The word \verb|link| denotes a link layer
2206 multicast address.
2207
2208 If a multicast address has more than one user, the number
2209 of users is shown after the \verb|users| keyword.
2210
2211 One additional feature not present in the example above
2212 is the \verb|static| flag, which indicates that the address was joined
2213 with \verb|ip maddr add|. See the following subsection.
2214
2215
2216
2217 \subsection{{\tt ip maddress add} --- add a multicast address\\
2218             {\tt ip maddress delete} --- delete a multicast address}
2219
2220 \paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|delete|, \verb|del|, \verb|d|.
2221
2222 \paragraph{Description:} these commands attach/detach
2223 a static link layer multicast address to listen on the interface.
2224 Note that it is impossible to join protocol multicast groups
2225 statically. This command only manages link layer addresses.
2226
2227
2228 \paragraph{Arguments:}
2229
2230 \begin{itemize}
2231 \item \verb|address LLADDRESS| (default)
2232
2233 --- the link layer multicast address.
2234
2235 \item \verb|dev NAME|
2236
2237 --- the device to join/leave this multicast address.
2238
2239 \end{itemize}
2240
2241
2242 \paragraph{Example:} Let us continue with the example from the previous subsection.
2243
2244 \begin{verbatim}
2245 netadm@alisa:~ # ip maddr add 33:33:00:00:00:01 dev dummy
2246 netadm@alisa:~ # ip -0 maddr ls dummy
2247 2:  dummy
2248     link  33:33:00:00:00:01 users 2 static
2249     link  01:00:5e:00:00:01
2250 netadm@alisa:~ # ip maddr del 33:33:00:00:00:01 dev dummy
2251 \end{verbatim}
2252
2253 \begin{NB}
2254  Neither \verb|ip| nor the kernel checks for multicast address validity.
2255  Particularly, this means that you can try to load a unicast address
2256  instead of a multicast address. Most drivers will ignore such addresses,
2257  but several (f.e.\ Tulip) will intern them to their on-board filter.
2258  The effects may be strange. Namely, the addresses become additional
2259  local link addresses and, if you loaded the address of another host
2260  to the router, wait for duplicated packets on the wire.
2261  It is not a bug, but rather a hole in the API and intra-kernel interfaces.
2262  This feature is really more useful for traffic monitoring, but using it
2263  with Linux-2.2 you {\em have to\/} be sure that the host is not
2264  a router and, especially, that it is not a transparent proxy or masquerading
2265  agent.
2266 \end{NB}
2267
2268
2269
2270 \section{{\tt ip mroute} --- multicast routing cache management}
2271 \label{IP-MROUTE}
2272
2273 \paragraph{Abbreviations:} \verb|mroute|, \verb|mr|.
2274
2275 \paragraph{Object:} \verb|mroute| objects are multicast routing cache
2276 entries created by a user level mrouting daemon
2277 (f.e.\ \verb|pimd| or \verb|mrouted|).
2278
2279 Due to the limitations of the current interface to the multicast routing
2280 engine, it is impossible to change \verb|mroute| objects administratively,
2281 so we may only display them. This limitation will be removed
2282 in the future.
2283
2284 \paragraph{Commands:} \verb|show| (or \verb|list|).
2285
2286
2287 \subsection{{\tt ip mroute show} --- list mroute cache entries}
2288
2289 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
2290
2291 \paragraph{Arguments:}
2292
2293 \begin{itemize}
2294 \item \verb|to PREFIX| (default)
2295
2296 --- the prefix selecting the destination multicast addresses to list.
2297
2298
2299 \item \verb|iif NAME|
2300
2301 --- the interface on which multicast packets are received.
2302
2303
2304 \item \verb|from PREFIX|
2305
2306 --- the prefix selecting the IP source addresses of the multicast route.
2307
2308
2309 \end{itemize}
2310
2311 \paragraph{Output format:}
2312
2313 \begin{verbatim}
2314 kuznet@amber:~ $ ip mroute ls
2315 (193.232.127.6, 224.0.1.39)      Iif: unresolved
2316 (193.232.244.34, 224.0.1.40)     Iif: unresolved
2317 (193.233.7.65, 224.66.66.66)     Iif: eth0       Oifs: pimreg
2318 kuznet@amber:~ $
2319 \end{verbatim}
2320
2321 Each line shows one (S,G) entry in the multicast routing cache,
2322 where S is the source address and G is the multicast group. \verb|Iif| is
2323 the interface on which multicast packets are expected to arrive.
2324 If the word \verb|unresolved| is there instead of the interface name,
2325 it means that the routing daemon still hasn't resolved this entry.
2326 The keyword \verb|Oifs| is followed by a list of output interfaces, separated
2327 by spaces. If a multicast routing entry is created with non-trivial
2328 TTL scope, administrative distances are appended to the device names
2329 in the \verb|Oifs| list.
2330
2331 \paragraph{Statistics:} The \verb|-statistics| option also prints the
2332 number of packets and bytes forwarded along this route and
2333 the number of packets that arrived on the wrong interface, if this number is not zero.
2334
2335 \begin{verbatim}
2336 kuznet@amber:~ $ ip -s mr ls 224.66/16
2337 (193.233.7.65, 224.66.66.66)     Iif: eth0       Oifs: pimreg
2338   9383 packets, 300256 bytes
2339 kuznet@amber:~ $
2340 \end{verbatim}
2341
2342
2343 \section{{\tt ip tunnel} --- tunnel configuration}
2344 \label{IP-TUNNEL}
2345
2346 \paragraph{Abbreviations:} \verb|tunnel|, \verb|tunl|.
2347
2348 \paragraph{Object:} \verb|tunnel| objects are tunnels, encapsulating
2349 packets in IPv4 packets and then sending them over the IP infrastructure.
2350
2351 \paragraph{Commands:} \verb|add|, \verb|delete|, \verb|change|, \verb|show|
2352 (or \verb|list|).
2353
2354 \paragraph{See also:} A more informal discussion of tunneling
2355 over IP and the \verb|ip tunnel| command can be found in~\cite{IP-TUNNELS}.
2356
2357 \subsection{{\tt ip tunnel add} --- add a new tunnel\\
2358         {\tt ip tunnel change} --- change an existing tunnel\\
2359         {\tt ip tunnel delete} --- destroy a tunnel}
2360
2361 \paragraph{Abbreviations:} \verb|add|, \verb|a|; \verb|change|, \verb|chg|;
2362 \verb|delete|, \verb|del|, \verb|d|.
2363
2364
2365 \paragraph{Arguments:}
2366
2367 \begin{itemize}
2368
2369 \item \verb|name NAME| (default)
2370
2371 --- select the tunnel device name.
2372
2373 \item \verb|mode MODE|
2374
2375 --- set the tunnel mode. Three modes are currently available:
2376         \verb|ipip|, \verb|sit| and \verb|gre|.
2377
2378 \item \verb|remote ADDRESS|
2379
2380 --- set the remote endpoint of the tunnel.
2381
2382 \item \verb|local ADDRESS|
2383
2384 --- set the fixed local address for tunneled packets.
2385 It must be an address on another interface of this host.
2386
2387 \item \verb|ttl N|
2388
2389 --- set a fixed TTL \verb|N| on tunneled packets.
2390         \verb|N| is a number in the range 1--255. 0 is a special value
2391         meaning that packets inherit the TTL value.
2392                 The default value is: \verb|inherit|.
2393
2394 \item \verb|tos T| or \verb|dsfield T|
2395
2396 --- set a fixed TOS \verb|T| on tunneled packets.
2397                 The default value is: \verb|inherit|.
2398
2399
2400
2401 \item \verb|dev NAME|
2402
2403 --- bind the tunnel to the device \verb|NAME| so that
2404         tunneled packets will only be routed via this device and will
2405         not be able to escape to another device when the route to the endpoint changes.
2406
2407 \item \verb|nopmtudisc|
2408
2409 --- disable Path MTU Discovery on this tunnel.
2410         It is enabled by default. Note that a fixed ttl is incompatible
2411         with this option: tunnelling with a fixed ttl always performs Path MTU Discovery.
2412
2413 \item \verb|key K|, \verb|ikey K|, \verb|okey K|
2414
2415 --- (only GRE tunnels) use keyed GRE with key \verb|K|. \verb|K| is
2416         either a number or an IP address-like dotted quad.
2417    The \verb|key| parameter sets the key to use in both directions.
2418    The \verb|ikey| and \verb|okey| parameters set different keys for input and output.
2419
2420
2421 \item \verb|csum|, \verb|icsum|, \verb|ocsum|
2422
2423 --- (only GRE tunnels) generate/require checksums for tunneled packets.
2424    The \verb|ocsum| flag calculates checksums for outgoing packets.
2425    The \verb|icsum| flag requires that all input packets have the correct
2426    checksum. The \verb|csum| flag is equivalent to the combination
2427   ``\verb|icsum| \verb|ocsum|''.
2428
2429 \item \verb|seq|, \verb|iseq|, \verb|oseq|
2430
2431 --- (only GRE tunnels) serialize packets.
2432    The \verb|oseq| flag enables sequencing of outgoing packets.
2433    The \verb|iseq| flag requires that all input packets are serialized.
2434    The \verb|seq| flag is equivalent to the combination ``\verb|iseq| \verb|oseq|''.
2435
2436 \begin{NB}
2437  I think this option does not
2438         work. At least, I did not test it, did not debug it and
2439         do not even understand how it is supposed to work or for what
2440         purpose Cisco planned to use it. Do not use it.
2441 \end{NB}
2442
2443
2444 \end{itemize}
2445
2446 \paragraph{Example:} Create a pointopoint IPv6 tunnel with maximal TTL of 32.
2447 \begin{verbatim}
2448 netadm@amber:~ # ip tunl add Cisco mode sit remote 192.31.7.104 \
2449     local 192.203.80.142 ttl 32
2450 \end{verbatim}
2451
2452 \subsection{{\tt ip tunnel show} --- list tunnels}
2453
2454 \paragraph{Abbreviations:} \verb|show|, \verb|list|, \verb|sh|, \verb|ls|, \verb|l|.
2455
2456
2457 \paragraph{Arguments:} None.
2458
2459 \paragraph{Output format:}
2460 \begin{verbatim}
2461 kuznet@amber:~ $ ip tunl ls Cisco
2462 Cisco: ipv6/ip  remote 192.31.7.104  local 192.203.80.142  ttl 32
2463 kuznet@amber:~ $
2464 \end{verbatim}
2465 The line starts with the tunnel device name followed by a colon.
2466 Then the tunnel mode follows. The parameters of the tunnel are listed
2467 with the same keywords that were used when creating the tunnel.
2468
2469 \paragraph{Statistics:}
2470
2471 \begin{verbatim}
2472 kuznet@amber:~ $ ip -s tunl ls Cisco
2473 Cisco: ipv6/ip  remote 192.31.7.104  local 192.203.80.142  ttl 32
2474 RX: Packets    Bytes        Errors CsumErrs OutOfSeq Mcasts
2475     12566      1707516      0      0        0        0
2476 TX: Packets    Bytes        Errors DeadLoop NoRoute  NoBufs
2477     13445      1879677      0      0        0        0
2478 kuznet@amber:~ $
2479 \end{verbatim}
2480 Essentially, these numbers are the same as the numbers
2481 printed with {\tt ip -s link show}
2482 (sec.\ref{IP-LINK-SHOW}, p.\pageref{IP-LINK-SHOW}) but the tags are different
2483 to reflect that they are tunnel specific.
2484 \begin{itemize}
2485 \item \verb|CsumErrs| --- the total number of packets dropped
2486 because of checksum failures for a GRE tunnel with checksumming enabled.
2487 \item \verb|OutOfSeq| --- the total number of packets dropped
2488 because they arrived out of sequence for a GRE tunnel with
2489 serialization enabled.
2490 \item \verb|Mcasts| --- the total number of multicast packets
2491 received on a broadcast GRE tunnel.
2492 \item \verb|DeadLoop| --- the total number of packets which were not
2493 transmitted because the tunnel is looped back to itself.
2494 \item \verb|NoRoute| --- the total number of packets which were not
2495 transmitted because there is no IP route to the remote endpoint.
2496 \item \verb|NoBufs| --- the total number of packets which were not
2497 transmitted because the kernel failed to allocate a buffer.
2498 \end{itemize}
2499
2500
2501 \section{{\tt ip monitor} and {\tt rtmon} --- state monitoring}
2502 \label{IP-MONITOR}
2503
2504 The \verb|ip| utility can monitor the state of devices, addresses
2505 and routes continuously. This option has a slightly different format.
2506 Namely,
2507 the \verb|monitor| command is the first in the command line and then
2508 the object list follows:
2509 \begin{verbatim}
2510   ip monitor [ file FILE ] [ all | OBJECT-LIST ]
2511 \end{verbatim}
2512 \verb|OBJECT-LIST| is the list of object types that we want to monitor.
2513 It may contain \verb|link|, \verb|address| and \verb|route|.
2514 If no \verb|file| argument is given, \verb|ip| opens RTNETLINK,
2515 listens on it and dumps state changes in the format described
2516 in previous sections.
2517
2518 If a file name is given, it does not listen on RTNETLINK,
2519 but opens the file containing RTNETLINK messages saved in binary format
2520 and dumps them. Such a history file can be generated with the
2521 \verb|rtmon| utility. This utility has a command line syntax similar to
2522 \verb|ip monitor|.
2523 Ideally, \verb|rtmon| should be started before
2524 the first network configuration command is issued. F.e.\ if
2525 you insert:
2526 \begin{verbatim}
2527   rtmon file /var/log/rtmon.log
2528 \end{verbatim}
2529 in a startup script, you will be able to view the full history
2530 later.
2531
2532 Certainly, it is possible to start \verb|rtmon| at any time.
2533 It prepends the history with the state snapshot dumped at the moment
2534 of starting.
2535
2536
2537 \section{Route realms and policy propagation, {\tt rtacct}}
2538 \label{RT-REALMS}
2539
2540 On routers using OSPF ASE or, especially, the BGP protocol, routing
2541 tables may be huge. If we want to classify or to account for the packets
2542 per route, we will have to keep lots of information. Even worse, if we
2543 want to distinguish the packets not only by their destination, but
2544 also by their source, the task gets quadratic complexity and its solution
2545 is physically impossible.
2546
2547 One approach to propagating the policy from routing protocols
2548 to the forwarding engine has been proposed in~\cite{IOS-BGP-PP}.
2549 Essentially, Cisco Policy Propagation via BGP is based on the fact
2550 that dedicated routers all have the RIB (Routing Information Base)
2551 close to the forwarding engine, so policy routing rules can
2552 check all the route attributes, including ASPATH information
2553 and community strings.
2554
2555 The Linux architecture, splitting the RIB (maintained by a user level
2556 daemon) and the kernel based FIB (Forwarding Information Base),
2557 does not allow such a simple approach.
2558
2559 This is fortunate, because there is another solution
2560 which allows even more flexible policy and richer semantics.
2561
2562 Namely, routes can be clustered together in user space, based on their
2563 attributes.  F.e.\ a BGP router knows the route's ASPATH and its community;
2564 an OSPF router knows the route tag or its area. The administrator, when adding
2565 routes manually, also knows their nature. Provided that the number of such
2566 aggregates (we call them {\em realms\/}) is low, the task of full
2567 classification both by source and destination becomes quite manageable.
2568
2569 So each route may be assigned to a realm. It is assumed that
2570 this identification is made by a routing daemon, but static routes
2571 can also be handled manually with \verb|ip route| (see sec.\ref{IP-ROUTE},
2572 p.\pageref{IP-ROUTE}).
2573 \begin{NB}
2574   There is a patch to \verb|gated|, allowing classification of routes
2575   to realms with all the set of policy rules implemented in \verb|gated|:
2576   by prefix, by ASPATH, by origin, by tag etc.
2577 \end{NB}
2578
2579 To facilitate the construction (f.e.\ in case the routing
2580 daemon is not aware of realms), missing realms may be completed
2581 with routing policy rules, see sec.~\ref{IP-RULE}, p.\pageref{IP-RULE}.
2582
2583 For each packet the kernel calculates a tuple of realms: source realm
2584 and destination realm, using the following algorithm:
2585
2586 \begin{enumerate}
2587 \item If the route has a realm, the destination realm of the packet is set to it.
2588 \item If the rule has a source realm, the source realm of the packet is set to it.
2589 If the destination realm was not inherited from the route and the rule has a destination realm,
2590 it is also set.
2591 \item If at least one of the realms is still unknown, the kernel finds
2592 the reversed route to the source of the packet.
2593 \item If the source realm is still unknown, get it from the reversed route.
2594 \item If one of the realms is still unknown, swap the realms of reversed
2595 routes and apply step 2 again.
2596 \end{enumerate}
2597
2598 After this procedure is completed we know what realm the packet
2599 arrived from and the realm where it is going to propagate to.
2600 If some of the realms are unknown, they are initialized to zero
2601 (or realm \verb|unknown|).
2602
2603 The main application of realms is the TC \verb|route| classifier~\cite{TC-CREF},
2604 where they are used to help assign packets to traffic classes,
2605 to account, police and schedule them according to this
2606 classification.
2607
2608 A much simpler but still very useful application is incoming packet
2609 accounting by realms. The kernel gathers a packet statistics summary
2610 which can be viewed with the \verb|rtacct| utility.
2611 \begin{verbatim}
2612 kuznet@amber:~ $ rtacct russia
2613 Realm      BytesTo    PktsTo     BytesFrom  PktsFrom
2614 russia     20576778   169176     47080168   153805
2615 kuznet@amber:~ $
2616 \end{verbatim}
2617 This shows that this router received 153805 packets from
2618 the realm \verb|russia| and forwarded 169176 packets to \verb|russia|.
2619 The realm \verb|russia| consists of routes with ASPATHs not leaving
2620 Russia.
2621
2622 Note that locally originating packets are not accounted here,
2623 \verb|rtacct| shows incoming packets only. Using the \verb|route|
2624 classifier (see~\cite{TC-CREF}) you can get even more detailed
2625 accounting information about outgoing packets, optionally
2626 summarizing traffic not only by source or destination, but
2627 by any pair of source and destination realms.
2628
2629
2630 \begin{thebibliography}{99}
2631 \addcontentsline{toc}{section}{References}
2632 \bibitem{RFC-NDISC} T.~Narten, E.~Nordmark, W.~Simpson.
2633 ``Neighbor Discovery for IP Version 6 (IPv6)'', RFC-2461.
2634
2635 \bibitem{RFC-ADDRCONF} S.~Thomson, T.~Narten.
2636 ``IPv6 Stateless Address Autoconfiguration'', RFC-2462.
2637
2638 \bibitem{RFC1812} F.~Baker.
2639 ``Requirements for IP Version 4 Routers'', RFC-1812.
2640
2641 \bibitem{RFC1122} R.~T.~Braden.
2642 ``Requirements for Internet hosts --- communication layers'', RFC-1122.
2643
2644 \bibitem{IOS} ``Cisco IOS Release 12.0 Network Protocols
2645 Command Reference, Part 1'' and
2646 ``Cisco IOS Release 12.0 Quality of Service Solutions
2647 Configuration Guide: Configuring Policy-Based Routing'',\\
2648 http://www.cisco.com/univercd/cc/td/doc/product/software/ios120.
2649
2650 \bibitem{IP-TUNNELS} A.~N.~Kuznetsov.
2651 ``Tunnels over IP in Linux-2.2'', \\
2652 In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}.
2653
2654 \bibitem{TC-CREF} A.~N.~Kuznetsov. ``TC Command Reference'',\\
2655 In: {\tt ftp://ftp.inr.ac.ru/ip-routing/iproute2-current.tar.gz}.
2656
2657 \bibitem{IOS-BGP-PP} ``Cisco IOS Release 12.0 Quality of Service Solutions
2658 Configuration Guide: Configuring QoS Policy Propagation via
2659 Border Gateway Protocol'',\\
2660 http://www.cisco.com/univercd/cc/td/doc/product/software/ios120.
2661
2662 \bibitem{RFC-DHCP} R.~Droms.
2663 ``Dynamic Host Configuration Protocol'', RFC-2131.
2664
2665 \bibitem{RFC2414}  M.~Allman, S.~Floyd, C.~Partridge.
2666 ``Increasing TCP's Initial Window'', RFC-2414.
2667
2668 \end{thebibliography}
2669
2670
2671
2672
2673 \appendix
2674 \addcontentsline{toc}{section}{Appendix}
2675
2676 \section{Source address selection}
2677 \label{ADDR-SEL}
2678
2679 When a host creates an IP packet, it must select some source
2680 address. Correct source address selection is a critical procedure,
2681 because it gives the receiver the information needed to deliver a
2682 reply. If the source is selected incorrectly, in the best case,
2683 the backward path may appear different to the forward one which
2684 is harmful for performance. In the worst case, when the addresses
2685 are administratively scoped, the reply may be lost entirely.
2686
2687 Linux-2.2 selects source addresses using the following algorithm:
2688
2689 \begin{itemize}
2690 \item
2691 The application may select a source address explicitly with the \verb|bind(2)|
2692 syscall or by supplying it to \verb|sendmsg(2)| via the ancillary data object
2693 \verb|IP_PKTINFO|. In this case the kernel only checks the validity
2694 of the address and never tries to ``improve'' an incorrect user choice,
2695 generating an error instead.
2696 \begin{NB}
2697  Never say ``Never''. The sysctl option \verb|ip_dynaddr| breaks
2698  this axiom. It has been made deliberately with the purpose
2699  of automatically reselecting the address on hosts with dynamic dial-out interfaces.
2700  However, this hack {\em must not\/} be used on multihomed hosts
2701  and especially on routers: it would break them.
2702 \end{NB}
2703
2704
2705 \item Otherwise, IP routing tables can contain an explicit source
2706 address hint for this destination. The hint is set with the \verb|src| parameter
2707 to the \verb|ip route| command, sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}.
2708
2709
2710 \item Otherwise, the kernel searches through the list of addresses
2711 attached to the interface through which the packets will be routed.
2712 The search strategies are different for IP and IPv6. Namely:
2713
2714 \begin{itemize}
2715 \item IPv6 searches for the first valid, not deprecated address
2716 with the same scope as the destination.
2717
2718 \item IP searches for the first valid address with a scope wider
2719 than the scope of the destination but it prefers addresses
2720 which fall into the same subnet as the nexthop of the route
2721 to the destination. Unlike IPv6, the scopes of IPv4 destinations
2722 are not encoded in their addresses but are supplied
2723 in routing tables instead (the \verb|scope| parameter to the \verb|ip route| command,
2724 sec.\ref{IP-ROUTE}, p.\pageref{IP-ROUTE}).
2725
2726 \end{itemize}
2727
2728
2729 \item Otherwise, if the scope of the destination is \verb|link| or \verb|host|,
2730 the algorithm fails and returns a zero source address.
2731
2732 \item Otherwise, all interfaces are scanned to search for an address
2733 with an appropriate scope. The loopback device \verb|lo| is always the first
2734 in the search list, so that if an address with global scope (not 127.0.0.1!)
2735 is configured on loopback, it is always preferred.
2736
2737 \end{itemize}
2738
2739
2740 \section{Proxy ARP/NDISC}
2741 \label{PROXY-NEIGH}
2742
2743 Routers may answer ARP/NDISC solicitations on behalf of other hosts.
2744 In Linux-2.2 proxy ARP on an interface may be enabled
2745 by setting the kernel \verb|sysctl| variable
2746 \verb|/proc/sys/net/ipv4/conf/<dev>/proxy_arp| to 1. After this, the router
2747 starts to answer ARP requests on the interface \verb|<dev>|, provided
2748 the route to the requested destination does {\em not\/} go back via the same
2749 device.
2750
2751 The variable \verb|/proc/sys/net/ipv4/conf/all/proxy_arp| enables proxy
2752 ARP on all the IP devices.
2753
2754 However, this approach fails in the case of IPv6 because the router
2755 must join the solicited node multicast address to listen for the corresponding
2756 NDISC queries. It means that proxy NDISC is possible only on a per destination
2757 basis.
2758
2759 Logically, proxy ARP/NDISC is not a kernel task. It can easily be implemented
2760 in user space. However, similar functionality was present in BSD kernels
2761 and in Linux-2.0, so we have to preserve it at least to the extent that
2762 is standardized in BSD.
2763 \begin{NB}
2764   Linux-2.0 ARP had a feature called {\em subnet\/} proxy ARP.
2765   It is replaced with the sysctl flag in Linux-2.2.
2766 \end{NB}
2767
2768
2769 The \verb|ip| utility provides a way to manage proxy ARP/NDISC
2770 with the \verb|ip neigh| command, namely:
2771 \begin{verbatim}
2772   ip neigh add proxy ADDRESS [ dev NAME ]
2773 \end{verbatim}
2774 adds a new proxy ARP/NDISC record and
2775 \begin{verbatim}
2776   ip neigh del proxy ADDRESS [ dev NAME ]
2777 \end{verbatim}
2778 deletes it.
2779
2780 If the name of the device is not given, the router will answer solicitations
2781 for address \verb|ADDRESS| on all devices, otherwise it will only serve
2782 the device \verb|NAME|. Even if the proxy entry is created with
2783 \verb|ip neigh|, the router {\em will not\/} answer a query if the route
2784 to the destination goes back via the interface from which the solicitation
2785 was received.
2786
2787 It is important to emphasize that proxy entries have {\em no\/}
2788 parameters other than these (IP/IPv6 address and optional device).
2789 Particularly, the entry does not store any link layer address.
2790 It always advertises the station address of the interface
2791 on which it sends advertisements (i.e.\ its own station address).
2792
2793 \section{Route NAT status}
2794 \label{ROUTE-NAT}
2795
2796 NAT (or ``Network Address Translation'') remaps some parts
2797 of the IP address space into other ones. Linux-2.2 route NAT is supposed
2798 to be used to facilitate policy routing by rewriting addresses
2799 to other routing domains or to help while renumbering sites
2800 to another prefix.
2801
2802 \paragraph{What it is not:}
2803 It is necessary to emphasize that {\em it is not supposed\/}
2804 to be used to compress address space or to split load.
2805 This is not missing functionality but a design principle.
2806 Route NAT is {\em stateless\/}. It does not hold any state
2807 about translated sessions. This means that it handles any number
2808 of sessions flawlessly. But it also means that it is {\em static\/}.
2809 It cannot detect the moment when the last TCP client stops
2810 using an address. For the same reason, it will not help to split
2811 load between several servers.
2812 \begin{NB}
2813 It is a pretty commonly held belief that it is useful to split load between
2814 several servers with NAT. This is a mistake. All you get from this
2815 is the requirement that the router keep the state of all the TCP connections
2816 going via it. Well, if the router is so powerful, run apache on it. 8)
2817 \end{NB}
2818
2819 The second feature: it does not touch packet payload,
2820 does not try to ``improve'' broken protocols by looking
2821 through its data and mangling it. It mangles IP addresses,
2822 only IP addresses and nothing but IP addresses.
2823 This, too, is not missing functionality.
2824
2825 To summarize: if you need to compress address space or keep
2826 active FTP clients happy, your choice is not route NAT but masquerading,
2827 port forwarding, NAPT etc.
2828 \begin{NB}
2829 By the way, you may also want to look at
2830 http://www.suse.com/\~{}mha/HyperNews/get/linux-ip-nat.html
2831 \end{NB}
2832
2833
2834 \paragraph{How it works.}
2835 Some part of the address space is reserved for dummy addresses
2836 which will look for all the world like some host addresses
2837 inside your network. No other hosts may use these addresses,
2838 however other routers may also be configured to translate them.
2839 \begin{NB}
2840 A great advantage of route NAT is that it may be used not
2841 only in stub networks but in environments with arbitrarily complicated
2842 structure. It does not firewall, it {\em forwards.}
2843 \end{NB}
2844 These addresses are selected by the \verb|ip route| command
2845 (sec.\ref{IP-ROUTE-ADD}, p.\pageref{IP-ROUTE-ADD}). F.e.\
2846 \begin{verbatim}
2847   ip route add nat 192.203.80.144 via 193.233.7.83
2848 \end{verbatim}
2849 states that the single address 192.203.80.144 is a dummy NAT address.
2850 For all the world it looks like a host address inside our network.
2851 For neighbouring hosts and routers it looks like the local address
2852 of the translating router. The router answers ARP for it, advertises
2853 this address as routed via it, {\em et al\/}. When the router
2854 receives a packet destined for 192.203.80.144, it replaces
2855 this address with 193.233.7.83 which is the address of some real
2856 host and forwards the packet. If you need to remap
2857 blocks of addresses, you may use a command like:
2858 \begin{verbatim}
2859   ip route add nat 192.203.80.192/26 via 193.233.7.64
2860 \end{verbatim}
2861 This command will map a block of 64 addresses 192.203.80.192--255 to
2862 193.233.7.64--127.
2863
2864 When an internal host (193.233.7.83 in the example above)
2865 sends something to the outer world and these packets are forwarded
2866 by our router, it should translate the source address 193.233.7.83
2867 into 192.203.80.144. This task is solved by setting a special
2868 policy rule (sec.\ref{IP-RULE-ADD}, p.\pageref{IP-RULE-ADD}):
2869 \begin{verbatim}
2870   ip rule add prio 320 from 193.233.7.83 nat 192.203.80.144
2871 \end{verbatim}
2872 This rule says that the source address 193.233.7.83
2873 should be translated into 192.203.80.144 before forwarding.
2874 It is important that the address after the \verb|nat| keyword
2875 is some NAT address, declared by {\tt ip route add nat}.
2876 If it is just a random address the router will not map to it.
2877 \begin{NB}
2878 The exception is when the address is a local address of this
2879 router (or 0.0.0.0) and masquerading is configured in the linux-2.2
2880 kernel. In this case the router will masquerade the packets as this address.
2881 If 0.0.0.0 is selected, the result is equivalent to one
2882 obtained with firewalling rules. Otherwise, you have a way
2883 to order Linux to masquerade to this fixed address.
2884 The NAT mechanism used in linux-2.4 is more flexible than
2885 masquerading, so this feature has lost its meaning and is disabled.
2886 \end{NB}
2887
2888 If the network has non-trivial internal structure, it is
2889 useful and even necessary to add rules disabling translation
2890 when a packet does not leave this network. Let us return to the
2891 example from sec.\ref{IP-RULE-SHOW} (p.\pageref{IP-RULE-SHOW}).
2892 \begin{verbatim}
2893 300:    from 193.233.7.83 to 193.233.7.0/24 lookup main
2894 310:    from 193.233.7.83 to 192.203.80.0/24 lookup main
2895 320:    from 193.233.7.83 lookup inr.ruhep map-to 192.203.80.144
2896 \end{verbatim}
2897 This block of rules causes normal forwarding when
2898 packets from 193.233.7.83 do not leave networks 193.233.7/24
2899 and 192.203.80/24. Also, if the \verb|inr.ruhep| table does not
2900 contain a route to the destination (which means that the routing
2901 domain owning addresses from 192.203.80/24 is dead), no translation
2902 will occur. Otherwise, the packets are translated.
2903
2904 \paragraph{How to only translate selected ports:}
2905 If you only want to translate selected ports (f.e.\ http)
2906 and leave the rest intact, you may use \verb|ipchains|
2907 to \verb|fwmark| a class of packets.
2908 Suppose you did and all the packets from 193.233.7.83
2909 destined for port 80 are marked with marker 0x1234 in input fwchain.
2910 In this case you may replace rule \#320 with:
2911 \begin{verbatim}
2912 320:    from 193.233.7.83 fwmark 1234 lookup main map-to 192.203.80.144
2913 \end{verbatim}
2914 and translation will only be enabled for outgoing http requests.
2915
2916 \section{Example: minimal host setup}
2917 \label{EXAMPLE-SETUP}
2918
2919 The following script gives an example of a fault safe
2920 setup of IP (and IPv6, if it is compiled into the kernel)
2921 in the common case of a node attached to a single broadcast
2922 network. A more advanced script, which may be used both on multihomed
2923 hosts and on routers, is described in the following
2924 section.
2925
2926 The utilities used in the script may be found in the
2927 directory ftp://ftp.inr.ac.ru/ip-routing/:
2928 \begin{enumerate}
2929 \item \verb|ip| --- package \verb|iproute2|.
2930 \item \verb|arping| --- package \verb|iputils|.
2931 \item \verb|rdisc| --- package \verb|iputils|.
2932 \end{enumerate}
2933 \begin{NB}
2934 It also refers to a DHCP client, \verb|dhcpcd|. I should refrain from
2935 recommending a good DHCP client to use. All that I can
2936 say is that ISC \verb|dhcp-2.0b1pl6| patched with the patch that
2937 can be found in the \verb|dhcp.bootp.rarp| subdirectory of
2938 the same ftp site {\em does\/} work,
2939 at least on Ethernet and Token Ring.
2940 \end{NB}
2941
2942 \begin{verbatim}
2943 #! /bin/bash
2944 \end{verbatim}
2945 \begin{flushleft}
2946 \# {\bf Usage: \verb|ifone ADDRESS[/PREFIX-LENGTH] [DEVICE]|}\\
2947 \# {\bf Parameters:}\\
2948 \# \$1 --- Static IP address, optionally followed by prefix length.\\
2949 \# \$2 --- Device name. If it is missing, \verb|eth0| is assumed.\\
2950 \# F.e.\ \verb|ifone 193.233.7.90|
2951 \end{flushleft}
2952 \begin{verbatim}
2953 dev=$2
2954 : ${dev:=eth0}
2955 ipaddr=
2956 \end{verbatim}
2957 \# Parse IP address, splitting prefix length.
2958 \begin{verbatim}
2959 if [ "$1" != "" ]; then
2960   ipaddr=${1%/*}
2961   if [ "$1" != "$ipaddr" ]; then
2962     pfxlen=${1#*/}
2963   fi
2964   : ${pfxlen:=24}
2965 fi
2966 pfx="${ipaddr}/${pfxlen}"
2967 \end{verbatim}
2968
2969 \begin{flushleft}
2970 \# {\bf Step 0} --- enable loopback.\\
2971 \#\\
2972 \# This step is necessary on any networked box before attempting\\
2973 \# to configure any other device.\\
2974 \end{flushleft}
2975 \begin{verbatim}
2976 ip link set up dev lo
2977 ip addr add 127.0.0.1/8 dev lo brd + scope host
2978 \end{verbatim}
2979 \begin{flushleft}
2980 \# IPv6 autoconfigures itself on loopback.\\
2981 \#\\
2982 \# If the user gave loopback as the device, we add the address as an alias and exit.
2983 \end{flushleft}
2984 \begin{verbatim}
2985 if [ "$dev" = "lo" ]; then
2986   if [ "$ipaddr" != "" -a  "$ipaddr" != "127.0.0.1" ]; then
2987     ip address add $ipaddr dev $dev
2988     exit $?
2989   fi
2990   exit 0
2991 fi
2992 \end{verbatim}
2993
2994 \noindent\# {\bf Step 1} --- enable device \verb|$dev|
2995
2996 \begin{verbatim}
2997 if ! ip link set up dev $dev ; then
2998   echo "Cannot enable interface $dev. Aborting." 1>&2
2999   exit 1
3000 fi
3001 \end{verbatim}
3002 \begin{flushleft}
3003 \# The interface is \verb|UP|. IPv6 started stateless autoconfiguration itself,\\
3004 \# and its configuration finishes here. However,\\
3005 \# IP still needs some static preconfigured address.
3006 \end{flushleft}
3007 \begin{verbatim}
3008 if [ "$ipaddr" = "" ]; then
3009   echo "No address for $dev is configured, trying DHCP..." 1>&2
3010   dhcpcd
3011   exit $?
3012 fi
3013 \end{verbatim}
3014
3015 \begin{flushleft}
3016 \# {\bf Step 2} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\
3017 \# Send two probes and wait for result for 3 seconds.\\
3018 \# If the interface opens slower f.e.\ due to long media detection,\\
3019 \# you want to increase the timeout.\\
3020 \end{flushleft}
3021 \begin{verbatim}
3022 if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then
3023   echo "Address $ipaddr is busy, trying DHCP..." 1>&2
3024   dhcpcd
3025   exit $?
3026 fi
3027 \end{verbatim}
3028 \begin{flushleft}
3029 \# OK, the address is unique, we may add it on the interface.\\
3030 \#\\
3031 \# {\bf Step 3} --- Configure the address on the interface.
3032 \end{flushleft}
3033
3034 \begin{verbatim}
3035 if ! ip address add $pfx brd + dev $dev; then
3036   echo "Failed to add $pfx on $dev, trying DHCP..." 1>&2
3037   dhcpcd
3038   exit $?
3039 fi
3040 \end{verbatim}
3041
3042 \noindent\# {\bf Step 4} --- Announce our presence on the link.
3043 \begin{verbatim}
3044 arping -A -c 1 -I $dev $ipaddr
3045 noarp=$?
3046 ( sleep 2;
3047   arping -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null &
3048 \end{verbatim}
3049
3050 \begin{flushleft}
3051 \# {\bf Step 5} (optional) --- Add some control routes.\\
3052 \#\\
3053 \# 1. Prohibit link local multicast addresses.\\
3054 \# 2. Prohibit link local (alias, limited) broadcast.\\
3055 \# 3. Add default multicast route.
3056 \end{flushleft}
3057 \begin{verbatim}
3058 ip route add unreachable 224.0.0.0/24
3059 ip route add unreachable 255.255.255.255
3060 if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then
3061   ip route add 224.0.0.0/4 dev $dev scope global
3062 fi
3063 \end{verbatim}
3064
3065 \begin{flushleft}
3066 \# {\bf Step 6} --- Add fallback default route with huge metric.\\
3067 \# If a proxy ARP server is present on the interface, we will be\\
3068 \# able to talk to all the Internet without further configuration.\\
3069 \# It is not so cheap though and we still hope that this route\\
3070 \# will be overridden by a more correct one from rdisc.\\
3071 \# Do not perform this step if the device is not ARPable,\\
3072 \# because dead nexthop detection does not work on such devices.
3073 \end{flushleft}
3074 \begin{verbatim}
3075 if [ "$noarp" = "0" ]; then
3076   ip ro add default dev $dev metric 30000 scope global
3077 fi
3078 \end{verbatim}
3079
3080 \begin{flushleft}
3081 \# {\bf Step 7} --- Restart router discovery and exit.
3082 \end{flushleft}
3083 \begin{verbatim}
3084 killall -HUP rdisc || rdisc -fs
3085 exit 0
3086 \end{verbatim}
3087
3088
3089 \section{Example: {\protect\tt ifcfg} --- interface address management}
3090 \label{EXAMPLE-IFCFG}
3091
3092 This is a simplistic script replacing one option of \verb|ifconfig|,
3093 namely, IP address management. It not only adds
3094 addresses, but also carries out Duplicate Address Detection~\cite{RFC-DHCP},
3095 sends unsolicited ARP to update the caches of other hosts sharing
3096 the interface, adds some control routes and restarts Router Discovery
3097 when it is necessary.
3098
3099 I strongly recommend using it {\em instead\/} of \verb|ifconfig| both
3100 on hosts and on routers.
3101
3102 \begin{verbatim}
3103 #! /bin/bash
3104 \end{verbatim}
3105 \begin{flushleft}
3106 \# {\bf Usage: \verb?ifcfg DEVICE[:ALIAS] [add|del] ADDRESS[/LENGTH] [PEER]?}\\
3107 \# {\bf Parameters:}\\
3108 \# ---Device name. It may have an alias suffix, separated by a colon.\\
3109 \# ---Command: add, delete or stop.\\
3110 \# ---IP address, optionally followed by a prefix length.\\
3111 \# ---Optional peer address for pointopoint interfaces.\\
3112 \# Example: \verb|ifcfg eth0 193.233.7.90/24|
3113
3114 \noindent\# This function determines whether the machine is a router or a host.\\
3115 \# It returns 0 if the host is apparently not a router.
3116 \end{flushleft}
3117 \begin{verbatim}
3118 CheckForwarding () {  # exit status = sum of forwarding flags
3119   local sbase fwd
3120   sbase=/proc/sys/net/ipv4/conf
3121   fwd=0
3122   if [ -d $sbase ]; then
3123     for dir in $sbase/*/forwarding; do  # NB: dir is not local
3124       fwd=$[$fwd + `cat $dir`]  # $[..] is old-style $((..))
3125     done
3126   else
3127     fwd=2  # directory missing: return non-zero
3128   fi
3129   return $fwd  # NOTE(review): status wraps modulo 256
3130 }
3131 \end{verbatim}
3132 \begin{flushleft}
3133 \# This function restarts Router Discovery.
3134 \end{flushleft}
3135 \begin{verbatim}
3136 RestartRDISC () {
3137   killall -HUP rdisc || rdisc -fs  # if the HUP fails, start rdisc
3138 }
3139 \end{verbatim}
3140 \begin{flushleft}
3141 \# Calculate ABC ``natural'' mask length\\
3142 \# Arg: \$1 = dotquad address
3143 \end{flushleft}
3144 \begin{verbatim}
3145 ABCMaskLen () {  # mask length is passed back as exit status
3146   local class;
3147   class=${1%%.*}  # first octet: cut from the first dot on
3148   if [ $class -eq 0 -o $class -ge 224 ]; then return 0  # 0, 224+
3149   elif [ $class -ge 192 ]; then return 24  # 192..223 (class C)
3150   elif [ $class -ge 128 ]; then return 16  # 128..191 (class B)
3151   else  return 8 ; fi  # 1..127 (class A)
3152 }
3153 \end{verbatim}
3154
3155
3156 \begin{flushleft}
3157 \# {\bf MAIN()}\\
3158 \#\\
3159 \# Strip alias suffix separated by colon.
3160 \end{flushleft}
3161 \begin{verbatim}
3162 label="label $1"
3163 ldev=$1
3164 dev=${1%:*}
3165 if [ "$dev" = "" -o "$1" = "help" ]; then
3166   echo "Usage: ifcfg DEV [[add|del [ADDR[/LEN]] [PEER] | stop]" 1>&2
3167   echo "       add - add new address" 1>&2
3168   echo "       del - delete address" 1>&2
3169   echo "       stop - completely disable IP" 1>&2
3170   exit 1
3171 fi
3172 shift
3173
3174 CheckForwarding
3175 fwd=$?
3176 \end{verbatim}
3177 \begin{flushleft}
3178 \# Parse command. If it is ``stop'', flush and exit.
3179 \end{flushleft}
3180 \begin{verbatim}
3181 deleting=0
3182 case "$1" in
3183 add) shift ;;
3184 stop)
3185   if [ "$ldev" != "$dev" ]; then  # an alias suffix was given
3186     echo "Cannot stop alias $ldev" 1>&2
3187     exit 1;
3188   fi
3189   ip -4 addr flush dev $dev $label || exit 1
3190   if [ $fwd -eq 0 ]; then RestartRDISC; fi  # hosts only
3191   exit 0 ;;
3192 del*)  # any word starting with "del"
3193   deleting=1; shift ;;
3194 *)  # no command word: $1 is left in place
3195 esac
3196 \end{verbatim}
3197 \begin{flushleft}
3198 \# Parse the prefix and split off the prefix length, which is separated by a slash.
3199 \end{flushleft}
3200 \begin{verbatim}
3201 ipaddr=
3202 pfxlen=
3203 if [ "$1" != "" ]; then
3204   ipaddr=${1%/*}
3205   if [ "$1" != "$ipaddr" ]; then
3206     pfxlen=${1#*/}
3207   fi
3208   if [ "$ipaddr" = "" ]; then
3209     echo "$1 is bad IP address." 1>&2
3210     exit 1
3211   fi
3212 fi
3213 shift
3214 \end{verbatim}
3215 \begin{flushleft}
3216 \# If peer address is present, prefix length is 32.\\
3217 \# Otherwise, if prefix length was not given, guess it.
3218 \end{flushleft}
3219 \begin{verbatim}
3220 peer=$1
3221 if [ "$peer" != "" ]; then
3222   if [ "$pfxlen" != "" -a "$pfxlen" != "32" ]; then
3223     echo "Peer address with non-trivial netmask." 1>&2
3224     exit 1
3225   fi
3226   pfx="$ipaddr peer $peer"
3227 else
3228   if [ "$pfxlen" = "" ]; then
3229     ABCMaskLen $ipaddr
3230     pfxlen=$?
3231   fi
3232   pfx="$ipaddr/$pfxlen"
3233 fi
3234 if [ "$ldev" = "$dev" -a "$ipaddr" != "" ]; then
3235   label=
3236 fi
3237 \end{verbatim}
3238 \begin{flushleft}
3239 \# If deletion was requested, delete the address and restart RDISC
3240 \end{flushleft}
3241 \begin{verbatim}
3242 if [ $deleting -ne 0 ]; then
3243   ip addr del $pfx dev $dev $label || exit 1
3244   if [ $fwd -eq 0 ]; then RestartRDISC; fi
3245   exit 0
3246 fi
3247 \end{verbatim}
3248 \begin{flushleft}
3249 \# Start interface initialization.\\
3250 \#\\
3251 \# {\bf Step 0} --- enable device \verb|$dev|
3252 \end{flushleft}
3253 \begin{verbatim}
3254 if ! ip link set up dev $dev ; then
3255   echo "Error: cannot enable interface $dev." 1>&2
3256   exit 1
3257 fi
3258 if [ "$ipaddr" = "" ]; then exit 0; fi
3259 \end{verbatim}
3260 \begin{flushleft}
3261 \# {\bf Step 1} --- IP Duplicate Address Detection~\cite{RFC-DHCP}.\\
3262 \# Send two probes and wait up to 3 seconds for the result.\\
3263 \# If the interface comes up slowly, e.g.\ due to long media detection,\\
3264 \# you may want to increase the timeout.
3265 \end{flushleft}
3266 \begin{verbatim}
3267 if ! arping -q -c 2 -w 3 -D -I $dev $ipaddr ; then
3268   echo "Error: some host already uses address $ipaddr on $dev." 1>&2
3269   exit 1
3270 fi
3271 \end{verbatim}
3272 \begin{flushleft}
3273 \# OK, the address is unique. We may add it to the interface.\\
3274 \#\\
3275 \# {\bf Step 2} --- Configure the address on the interface.
3276 \end{flushleft}
3277 \begin{verbatim}
3278 if ! ip address add $pfx brd + dev $dev $label; then
3279   echo "Error: failed to add $pfx on $dev." 1>&2
3280   exit 1
3281 fi
3282 \end{verbatim}
3283 \noindent\# {\bf Step 3} --- Announce our presence on the link
3284 \begin{verbatim}
3285 arping -q -A -c 1 -I $dev $ipaddr
3286 noarp=$?  # arping status; 0 is later taken to mean ARP works
3287 ( sleep 2 ;  # second announcement, 2 s later, in the background
3288   arping -q -U -c 1 -I $dev $ipaddr ) >& /dev/null </dev/null &
3289 \end{verbatim}
3290 \begin{flushleft}
3291 \# {\bf Step 4} (optional) --- Add some control routes.\\
3292 \#\\
3293 \# 1. Prohibit link-local multicast addresses.\\
3294 \# 2. Prohibit link-local (also known as limited) broadcast.\\
3295 \# 3. Add default multicast route.
3296 \end{flushleft}
3297 \begin{verbatim}
3298 ip route add unreachable 224.0.0.0/24 >& /dev/null
3299 ip route add unreachable 255.255.255.255 >& /dev/null
3300 if [ `ip link ls $dev | grep -c MULTICAST` -ge 1 ]; then
3301   ip route add 224.0.0.0/4 dev $dev scope global >& /dev/null
3302 fi
3303 \end{verbatim}
3304 \begin{flushleft}
3305 \# {\bf Step 5} --- Add fallback default route with huge metric.\\
3306 \# If a proxy ARP server is present on the interface, we will be\\
3307 \# able to talk to the whole Internet without further configuration.\\
3308 \# Do not perform this step on a router or if the device is not ARPable,\\
3309 \# because dead nexthop detection does not work on them.
3310 \end{flushleft}
3311 \begin{verbatim}
3312 if [ $fwd -eq 0 ]; then
3313   if [ $noarp -eq 0 ]; then
3314     ip ro append default dev $dev metric 30000 scope global
3315   elif [ "$peer" != "" ]; then
3316     if ping -q -c 2 -w 4 $peer ; then
3317       ip ro append default via $peer dev $dev metric 30001
3318     fi
3319   fi
3320   RestartRDISC
3321 fi
3322
3323 exit 0
3324 \end{verbatim}
3325 \begin{flushleft}
3326 \# End of {\bf MAIN()}
3327 \end{flushleft}
3328
3329
3330 \end{document}