Document ENOBUFS problem
authorMichal Sojka <sojkam1@fel.cvut.cz>
Sat, 4 Dec 2010 12:20:25 +0000 (13:20 +0100)
committerMichal Sojka <sojkam1@fel.cvut.cz>
Sat, 4 Dec 2010 12:20:25 +0000 (13:20 +0100)
doc/notes.org

index 08038b6..254ada5 100644 (file)
@@ -81,3 +81,231 @@ Call Trace:
 [c03f3fc0] [c039f918] start_kernel+0x2e4/0x2fc
 [c03f3ff0] [00003438] 0x3438
 Rebooting in 180 seconds..
+
+* write returns ENOBUFS
+The reason is the queuing discipline - the queue is full and
+pfifo_fast_enqueue returns failure. This failure is propagated back up
+the call chain through qdisc_enqueue_root(), __dev_xmit_skb(),
+dev_queue_xmit() and can_send().
+
+** See the following trace
+
+# tracer: function_graph
+#
+# CPU  DURATION                  FUNCTION CALLS
+# |     |   |                     |   |   |   |
+ 1)               |  can_send() {
+ 1)               |    dev_queue_xmit() {
+ 1)               |      rt_spin_lock() {
+ 1)   0.894 us    |        __might_sleep();
+ 1)   2.750 us    |      }
+ 1)               |      pfifo_fast_enqueue() {
+ 1)               |        kfree_skb() {
+ 1)               |          __kfree_skb() {
+ 1)               |            skb_release_head_state() {
+ 1)               |              sock_wfree() {
+ 1)               |                sock_def_write_space() {
+ 1)               |                  rt_read_lock() {
+ 1)               |                    __rt_spin_lock() {
+ 1)   0.677 us    |                      __might_sleep();
+ 1)   2.386 us    |                    }
+ 1)   4.150 us    |                  }
+ 1)               |                  rt_read_unlock() {
+ 1)   0.918 us    |                    __rt_spin_unlock();
+ 1)   2.644 us    |                  }
+ 1)   9.375 us    |                }
+ 1) + 10.999 us   |              }
+ 1) + 12.708 us   |            }
+ 1)               |            skb_release_data() {
+ 1)               |              kfree() {
+ 1)               |                _slab_irq_disable() {
+ 1)               |                  rt_spin_lock() {
+ 1)   0.849 us    |                    __might_sleep();
+ 1)   2.588 us    |                  }
+ 1)   4.242 us    |                }
+ 1)   0.864 us    |                __cache_free();
+ 1)   0.888 us    |                rt_spin_unlock();
+ 1)   9.456 us    |              }
+ 1) + 11.185 us   |            }
+ 1)               |            kmem_cache_free() {
+ 1)               |              _slab_irq_disable() {
+ 1)               |                rt_spin_lock() {
+ 1)   0.812 us    |                  __might_sleep();
+ 1)   2.542 us    |                }
+ 1)   4.240 us    |              }
+ 1)   0.813 us    |              __cache_free();
+ 1)   0.894 us    |              rt_spin_unlock();
+ 1)   9.329 us    |            }
+ 1) + 36.444 us   |          }
+ 1) + 38.136 us   |        }
+ 1) + 39.974 us   |      }
+ 1)               |      /* qdisc_enqueue_root = 1 */
+ 1)               |      __qdisc_run() {
+ 1)               |        sch_direct_xmit() {
+ 1)   0.855 us    |          rt_spin_unlock();
+ 1)               |          rt_spin_lock() {
+ 1)   0.827 us    |            __might_sleep();
+ 1)   2.521 us    |          }
+ 1)               |          dev_hard_start_xmit() {
+ 1)               |            sja1000_start_xmit() {
+ 1)   1.646 us    |              kvaser_pci_write_reg();
+ 1)   1.460 us    |              kvaser_pci_write_reg();
+ 1)   1.691 us    |              kvaser_pci_write_reg();
+ 1)   1.450 us    |              kvaser_pci_write_reg();
+ 1)   1.416 us    |              kvaser_pci_write_reg();
+ 1)               |              can_put_echo_skb() {
+ 1)               |                sock_wfree() {
+ 1)               |                  sock_def_write_space() {
+ 1)               |                    rt_read_lock() {
+ 1)               |                      __rt_spin_lock() {
+ 1)   0.939 us    |                        __might_sleep();
+ 1)   2.699 us    |                      }
+ 1)   4.337 us    |                    }
+ 1)               |                    rt_read_unlock() {
+ 1)   0.844 us    |                      __rt_spin_unlock();
+ 1)   2.507 us    |                    }
+ 1)   9.242 us    |                  }
+ 1) + 11.047 us   |                }
+ 1) + 12.791 us   |              }
+ 1)               |              sja1000_write_cmdreg() {
+ 1)               |                rt_spin_lock() {
+ 1)   0.802 us    |                  __might_sleep();
+ 1)   2.532 us    |                }
+ 1)   1.481 us    |                kvaser_pci_write_reg();
+ 1)   2.155 us    |                kvaser_pci_read_reg();
+ 1)   0.843 us    |                rt_spin_unlock();
+ 1) + 11.100 us   |              }
+ 1) + 38.263 us   |            }
+ 1)               |            /* ops->ndo_start_xmit(skb, dev) = 0 */
+ 1) + 41.508 us   |          }
+ 1)               |          /* dev_hard_start_xmit(skb, dev) = 0 */
+ 1)   0.884 us    |          rt_spin_unlock();
+ 1)               |          rt_spin_lock() {
+ 1)   0.817 us    |            __might_sleep();
+ 1)   2.583 us    |          }
+ 1)               |          /* qdisc_len = 9 */
+ 1)               |          /* sch_direct_xmit returns 0 */
+ 1) + 57.502 us   |        }
+ 1) + 59.288 us   |      }
+ 1)   0.877 us    |      rt_spin_unlock();
+ 1)               |      /* __dev_xmit_skb = 1 */
+ 1) ! 111.265 us  |    }
+ 1)               |  /* dev_queue_xmit = 1 */
+ 1)               |  /* net_xmit_errno = -105 */
+ 1)   0.956 us    |    kfree_skb();
+ 1) ! 117.340 us  |  }
+ 1)               |  /* write returned -1 */
+
+** The trace was generated by kernel v2.6.33.7-rt29-4-g92487e4 with the following patch applied:
+
+diff --git a/net/can/af_can.c b/net/can/af_can.c
+index 4d8479d7..c91968d 100644
+--- a/net/can/af_can.c
++++ b/net/can/af_can.c
+@@ -344,8 +344,11 @@ int can_send(struct sk_buff *skb, int loop)
+       /* send to netdevice */
+       err = dev_queue_xmit(skb);
+-      if (err > 0)
++      trace_printk("dev_queue_xmit = %d\n", err);
++      if (err > 0) {
+               err = net_xmit_errno(err);
++              trace_printk("net_xmit_errno = %d\n", err);
++      }
+       if (err) {
+ #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,17)
+diff --git a/net/core/dev.c b/net/core/dev.c
+index 7a5412e..10ca9b4 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -1828,6 +1828,8 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
+                       skb_dst_drop(skb);
+               rc = ops->ndo_start_xmit(skb, dev);
++              trace_printk("ops->ndo_start_xmit(skb, dev) = %d\n", rc);
++
+               if (rc == NETDEV_TX_OK)
+                       txq_trans_update(txq);
+               /*
+@@ -1854,6 +1856,7 @@ gso:
+               skb->next = nskb->next;
+               nskb->next = NULL;
+               rc = ops->ndo_start_xmit(nskb, dev);
++              trace_printk("gso: ops->ndo_start_xmit(skb, dev) = %d\n", rc);
+               if (unlikely(rc != NETDEV_TX_OK)) {
+                       if (rc & ~NETDEV_TX_MASK)
+                               goto out_kfree_gso_skb;
+@@ -1949,6 +1952,8 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+       spin_lock(root_lock);
+       if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) {
+               kfree_skb(skb);
++              trace_printk("QDISC_STATE_DEACTIVATED\n");
++
+               rc = NET_XMIT_DROP;
+       } else if ((q->flags & TCQ_F_CAN_BYPASS) && !qdisc_qlen(q) &&
+                  !test_and_set_bit(__QDISC_STATE_RUNNING, &q->state)) {
+@@ -1966,6 +1971,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
+               rc = NET_XMIT_SUCCESS;
+       } else {
+               rc = qdisc_enqueue_root(skb, q);
++              trace_printk("qdisc_enqueue_root = %d\n", rc);
+               qdisc_run(q);
+       }
+       spin_unlock(root_lock);
+@@ -2047,6 +2053,7 @@ gso:
+ #endif
+       if (q->enqueue) {
+               rc = __dev_xmit_skb(skb, q, dev, txq);
++              trace_printk("__dev_xmit_skb = %d\n", rc);
+               goto out;
+       }
+@@ -2072,6 +2079,8 @@ gso:
+                       if (!netif_tx_queue_stopped(txq)) {
+                               rc = dev_hard_start_xmit(skb, dev, txq);
++                              trace_printk("dev_hard_start_xmit = %d\n", rc);
++
+                               if (dev_xmit_complete(rc)) {
+                                       HARD_TX_UNLOCK(dev, txq);
+                                       goto out;
+diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
+index bf7f50e..aea0833 100644
+--- a/net/sched/sch_generic.c
++++ b/net/sched/sch_generic.c
+@@ -125,6 +125,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+       HARD_TX_LOCK(dev, txq);
+       if (!netif_tx_queue_stopped(txq) && !netif_tx_queue_frozen(txq))
+               ret = dev_hard_start_xmit(skb, dev, txq);
++      trace_printk("dev_hard_start_xmit(skb, dev) = %d\n", ret);
+       HARD_TX_UNLOCK(dev, txq);
+@@ -133,9 +134,11 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+       if (dev_xmit_complete(ret)) {
+               /* Driver sent out skb successfully or skb was consumed */
+               ret = qdisc_qlen(q);
++              trace_printk("qdisc_len = %d\n", ret);
+       } else if (ret == NETDEV_TX_LOCKED) {
+               /* Driver try lock failed */
+               ret = handle_dev_cpu_collision(skb, txq, q);
++              trace_printk("handle_dev_cpu_collision = %d\n", ret);
+       } else {
+               /* Driver returned NETDEV_TX_BUSY - requeue skb */
+               if (unlikely (ret != NETDEV_TX_BUSY && net_ratelimit()))
+@@ -143,12 +146,13 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
+                              dev->name, ret, q->q.qlen);
+               ret = dev_requeue_skb(skb, q);
++              trace_printk("dev_requeue_skb = %d\n", ret);
+       }
+       if (ret && (netif_tx_queue_stopped(txq) ||
+                   netif_tx_queue_frozen(txq)))
+               ret = 0;
+-
++      trace_printk("%s returns %d\n", __FUNCTION__, ret);
+       return ret;
+ }