From: Andy Gospodarek <gospo@redhat.com> Subject: [RHEL5.1 PATCH] e1000: fix watchdog timeout panics Date: Wed, 2 May 2007 14:01:19 -0400 Bugzilla: 217483 Message-Id: <20070502180119.GB9293@gospo.rdu.redhat.com> Changelog: [e1000] fix watchdog timeout panics This resolves a problem that we've seen for quite a while on rhel and upstream with the e1000 driver. The e1000 watchdog task times out (and panics the box) if a perfectly timed ethtool request is made. This resolves BZ 217483. It has been customer tested and verified that is resolves the reported issue. This patch is currently in -mm and I expect it will make it to netdev (if not Linus' tree shortly). The following 3 commits were pulled and backported from Auke Kok's e1000 git tree: git://lost.foo-projects.org/~ahkok/git/netdev-2.6 commit 1314bbf3a3d911218fc153e14873e2e384d08084 Author: Auke Kok <auke\-jan h kok intel com> e1000: driver state fixes (race fix) commit 313340de0cb03b0cef8cd5b91cfb384792d15a3b Author: Auke Kok <auke-jan h kok intel com> e1000: introduce watchdog task commit 38fe2f8adb14761600d571f3ec269c55060e0da3 Author: Auke Kok <auke-jan h kok intel com> e1000: skip unneeded PHY reads in watchdog when link is OK --- e1000.h | 4 ++- e1000_ethtool.c | 6 ++--- e1000_main.c | 67 ++++++++++++++++++++++++++++++++++++++++++-------------- 3 files changed, 57 insertions(+), 20 deletions(-) --- linux-2.6.18.x86_64/drivers/net/e1000/e1000.h.gospo 2007-05-07 08:09:29.701964000 -0400 +++ linux-2.6.18.x86_64/drivers/net/e1000/e1000.h 2007-05-07 08:20:31.732878000 -0400 @@ -256,6 +256,7 @@ struct e1000_adapter { #endif atomic_t irq_sem; struct work_struct reset_task; + struct work_struct watchdog_task; uint8_t fc_autoneg; struct timer_list blink_timer; @@ -346,8 +347,9 @@ struct e1000_adapter { }; enum e1000_state_t { - __E1000_DRIVER_TESTING, + __E1000_TESTING, __E1000_RESETTING, + __E1000_DOWN }; /* e1000_main.c */ --- linux-2.6.18.x86_64/drivers/net/e1000/e1000_ethtool.c.gospo 2007-05-07 08:09:06.318372000 -0400 +++ linux-2.6.18.x86_64/drivers/net/e1000/e1000_ethtool.c 2007-05-07 08:20:49.086500000 -0400 @@ -1606,7 +1606,7 @@ e1000_diag_test(struct net_device *netde struct e1000_adapter *adapter = netdev_priv(netdev); boolean_t if_running = netif_running(netdev); - set_bit(__E1000_DRIVER_TESTING, &adapter->flags); + set_bit(__E1000_TESTING, &adapter->flags); if (eth_test->flags == ETH_TEST_FL_OFFLINE) { /* Offline tests */ @@ -1651,7 +1651,7 @@ e1000_diag_test(struct net_device *netde adapter->hw.autoneg = autoneg; e1000_reset(adapter); - clear_bit(__E1000_DRIVER_TESTING, &adapter->flags); + clear_bit(__E1000_TESTING, &adapter->flags); if (if_running) dev_open(netdev); } else { @@ -1666,7 +1666,7 @@ e1000_diag_test(struct net_device *netde data[2] = 0; data[3] = 0; - clear_bit(__E1000_DRIVER_TESTING, &adapter->flags); + clear_bit(__E1000_TESTING, &adapter->flags); } msleep_interruptible(4 * 1000); } --- linux-2.6.18.x86_64/drivers/net/e1000/e1000_main.c.gospo 2007-05-07 08:09:17.271409000 -0400 +++ linux-2.6.18.x86_64/drivers/net/e1000/e1000_main.c 2007-05-07 08:20:31.753861000 -0400 @@ -144,6 +144,7 @@ static void e1000_clean_rx_ring(struct e static void e1000_set_multi(struct net_device *netdev); static void e1000_update_phy_info(unsigned long data); static void e1000_watchdog(unsigned long data); +static void e1000_watchdog_task(struct net_device *netdev); static void e1000_82547_tx_fifo_stall(unsigned long data); static int e1000_xmit_frame(struct sk_buff *skb, struct net_device *netdev); static struct net_device_stats * e1000_get_stats(struct net_device *netdev); @@ -469,13 +470,14 @@ e1000_up(struct e1000_adapter *adapter) adapter->tx_queue_len = netdev->tx_queue_len; - mod_timer(&adapter->watchdog_timer, jiffies); - #ifdef CONFIG_E1000_NAPI netif_poll_enable(netdev); #endif e1000_irq_enable(adapter); + clear_bit(__E1000_DOWN, &adapter->flags); + + mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ); return 0; } @@ -531,6 +533,10 @@ e1000_down(struct e1000_adapter *adapter { struct net_device *netdev = adapter->netdev; + /* signal that we're down so the interrupt handler does not + * reschedule our watchdog timer */ + set_bit(__E1000_DOWN, &adapter->flags); + e1000_irq_disable(adapter); del_timer_sync(&adapter->tx_fifo_stall_timer); @@ -865,11 +871,8 @@ e1000_probe(struct pci_dev *pdev, INIT_WORK(&adapter->reset_task, (void (*)(void *))e1000_reset_task, netdev); - - /* we're going to reset, so assume we have no link for now */ - - netif_carrier_off(netdev); - netif_stop_queue(netdev); + INIT_WORK(&adapter->watchdog_task, + (void (*)(void *))e1000_watchdog_task, netdev); e1000_check_options(adapter); @@ -978,6 +981,10 @@ e1000_probe(struct pci_dev *pdev, if ((err = register_netdev(netdev))) goto err_register; + /* tell the stack to leave us alone until e1000_open() is called */ + netif_carrier_off(netdev); + netif_stop_queue(netdev); + DPRINTK(PROBE, INFO, "Intel(R) PRO/1000 Network Connection\n"); cards_found++; @@ -1034,6 +1041,13 @@ e1000_remove(struct pci_dev *pdev) int i; #endif + /* flush_scheduled work may reschedule our watchdog task, so + * explicitly disable watchdog tasks from being rescheduled */ + set_bit(__E1000_DOWN, &adapter->flags); + del_timer_sync(&adapter->tx_fifo_stall_timer); + del_timer_sync(&adapter->watchdog_timer); + del_timer_sync(&adapter->phy_info_timer); + flush_scheduled_work(); if (adapter->hw.mac_type >= e1000_82540 && @@ -1165,6 +1179,8 @@ e1000_sw_init(struct e1000_adapter *adap atomic_set(&adapter->irq_sem, 1); spin_lock_init(&adapter->stats_lock); + set_bit(__E1000_DOWN, &adapter->flags); + return 0; } @@ -1230,7 +1246,7 @@ e1000_open(struct net_device *netdev) int err; /* disallow open during test */ - if (test_bit(__E1000_DRIVER_TESTING, &adapter->flags)) + if (test_bit(__E1000_TESTING, &adapter->flags)) return -EBUSY; /* allocate transmit descriptors */ @@ -2353,9 +2369,8 @@ e1000_82547_tx_fifo_stall(unsigned long adapter->tx_fifo_head = 0; atomic_set(&adapter->tx_fifo_stall, 0); netif_wake_queue(netdev); - } else { + } else if (!test_bit(__E1000_DOWN, &adapter->flags)) mod_timer(&adapter->tx_fifo_stall_timer, jiffies + 1); - } } } @@ -2367,11 +2382,23 @@ static void e1000_watchdog(unsigned long data) { struct e1000_adapter *adapter = (struct e1000_adapter *) data; - struct net_device *netdev = adapter->netdev; + + /* Do the rest outside of interrupt context */ + schedule_work(&adapter->watchdog_task); +} + +static void +e1000_watchdog_task(struct net_device *netdev) +{ + struct e1000_adapter *adapter = netdev_priv(netdev); struct e1000_tx_ring *txdr = adapter->tx_ring; uint32_t link, tctl; int32_t ret_val; + if ((netif_carrier_ok(netdev)) && + (E1000_READ_REG(&adapter->hw, STATUS) & E1000_STATUS_LU)) + goto link_up; + ret_val = e1000_check_for_link(&adapter->hw); if ((ret_val == E1000_ERR_PHY) && (adapter->hw.phy_type == e1000_phy_igp_3) && @@ -2461,7 +2488,8 @@ e1000_watchdog(unsigned long data) netif_carrier_on(netdev); netif_wake_queue(netdev); - mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ); + if (!test_bit(__E1000_DOWN, &adapter->flags)) + mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ); adapter->smartspeed = 0; } } else { @@ -2471,7 +2499,8 @@ e1000_watchdog(unsigned long data) DPRINTK(LINK, INFO, "NIC Link is Down\n"); netif_carrier_off(netdev); netif_stop_queue(netdev); - mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ); + if (!test_bit(__E1000_DOWN, &adapter->flags)) + mod_timer(&adapter->phy_info_timer, jiffies + 2 * HZ); /* 80003ES2LAN workaround-- * For packet buffer work-around on link down event; @@ -2486,6 +2515,7 @@ e1000_watchdog(unsigned long data) e1000_smartspeed(adapter); } +link_up: e1000_update_stats(adapter); adapter->hw.tx_packet_delta = adapter->stats.tpt - adapter->tpt_old; @@ -2536,7 +2566,8 @@ e1000_watchdog(unsigned long data) e1000_rar_set(&adapter->hw, adapter->hw.mac_addr, 0); /* Reset the timer */ - mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ); + if (!test_bit(__E1000_DOWN, &adapter->flags)) + mod_timer(&adapter->watchdog_timer, jiffies + 2 * HZ); } #define E1000_TX_FLAGS_CSUM 0x00000001 @@ -3028,7 +3059,9 @@ e1000_xmit_frame(struct sk_buff *skb, st if (unlikely(adapter->hw.mac_type == e1000_82547)) { if (unlikely(e1000_82547_fifo_workaround(adapter, skb))) { netif_stop_queue(netdev); - mod_timer(&adapter->tx_fifo_stall_timer, jiffies); + if (!test_bit(__E1000_DOWN, &adapter->flags)) + mod_timer(&adapter->tx_fifo_stall_timer, + jiffies + 1); spin_unlock_irqrestore(&tx_ring->tx_lock, flags); return NETDEV_TX_BUSY; } @@ -3422,7 +3455,9 @@ e1000_intr(int irq, void *data, struct p rctl = E1000_READ_REG(hw, RCTL); E1000_WRITE_REG(hw, RCTL, rctl & ~E1000_RCTL_EN); } - mod_timer(&adapter->watchdog_timer, jiffies); + /* guard against interrupt when we're going down */ + if (!test_bit(__E1000_DOWN, &adapter->flags)) + mod_timer(&adapter->watchdog_timer, jiffies + 1); } #ifdef CONFIG_E1000_NAPI