static int handle_en_event(struct hns_roce_dev *hr_dev, u8 port,
unsigned long event)
{
struct device *dev = &hr_dev->pdev->dev;
struct net_device *netdev;
netdev = hr_dev->iboe.netdevs[port];
if (!netdev) {
dev_err(dev, "port(%d) can't find netdev\n", port);
return -ENODEV;
}
spin_lock_bh(&hr_dev->iboe.lock);
switch (event) {
case NETDEV_UP:
case NETDEV_CHANGE:
case NETDEV_REGISTER:
case NETDEV_CHANGEADDR:
hns_roce_set_mac(hr_dev, port, netdev->dev_addr);
break;
case NETDEV_DOWN:
/*
* In v1 engine, only support all ports closed together.
*/
break;
default:
dev_dbg(dev, "NETDEV event = 0x%x!\n", (u32)(event));
break;
}
spin_unlock_bh(&hr_dev->iboe.lock);
return 0;
}
In this code, hns_roce_set_mac() is called between spin_lock_bh() and spin_unlock_bh(), and the call chain continues as hns_roce_set_mac -> hns_roce_v1_set_mac:
void hns_roce_v1_set_mac(struct hns_roce_dev *hr_dev, u8 phy_port, u8 *addr)
{
u32 reg_smac_l;
u16 reg_smac_h;
u16 *p_h;
u32 *p;
u32 val;
/*
* When mac changed, loopback may fail
* because of smac not equal to dmac.
* We Need to release and create reserved qp again.
*/
if (hr_dev->hw->dereg_mr && hns_roce_v1_recreate_lp_qp(hr_dev))
dev_warn(&hr_dev->pdev->dev, "recreate lp qp timeout!\n");
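/* ... programming of the SMAC low/high registers (which uses the locals above) is elided in this excerpt ... */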
}
hns_roce_v1_set_mac() in turn calls hns_roce_v1_recreate_lp_qp():
static int hns_roce_v1_recreate_lp_qp(struct hns_roce_dev *hr_dev)
{
struct device *dev = &hr_dev->pdev->dev;
struct hns_roce_recreate_lp_qp_work *lp_qp_work;
struct hns_roce_free_mr *free_mr;
struct hns_roce_v1_priv *priv;
struct completion comp;
unsigned long end =
msecs_to_jiffies(HNS_ROCE_V1_RECREATE_LP_QP_TIMEOUT_MSECS) + jiffies;
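/* ... allocation of lp_qp_work, initialization of the completion and queuing of the work are elided in this excerpt ... */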
while (time_before_eq(jiffies, end)) {
if (try_wait_for_completion(&comp))
return 0;
msleep(HNS_ROCE_V1_RECREATE_LP_QP_WAIT_VALUE);
}
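/* ... timeout handling (warn and return an error to the caller) is elided in this excerpt ... */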
}
hns_roce_v1_recreate_lp_qp() calls msleep(), which violates the rule that code must not sleep in an atomic context. When the sleep eventually reaches __schedule() -> schedule_debug():
static inline void schedule_debug(struct task_struct *prev)
{
#ifdef CONFIG_SCHED_STACK_END_CHECK
if (task_stack_end_corrupted(prev))
panic("corrupted stack end detected inside scheduler\n");
#endif
// this condition is met here, so __schedule_bug() is called
if (unlikely(in_atomic_preempt_off())) {
__schedule_bug(prev);
preempt_count_set(PREEMPT_DISABLED);
}
rcu_sleep_check();
profile_hit(SCHED_PROFILING, __builtin_return_address(0));
schedstat_inc(this_rq()->sched_count);
}
static noinline void __schedule_bug(struct task_struct *prev)
{
/* Save this before calling printk(), since that will clobber it */
unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
if (oops_in_progress)
return;
printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
prev->comm, prev->pid, preempt_count());
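/* ... remainder elided: the full function also reports where preemption was disabled and calls dump_stack() ... */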
}
This is what prints the "BUG: scheduling while atomic" message, along with the name, pid and preempt count of the current process.
The check itself is:
#define in_atomic_preempt_off() (preempt_count() != PREEMPT_DISABLE_OFFSET)
That is, it tests whether preempt_count() is exactly PREEMPT_DISABLE_OFFSET, i.e. whether the only contribution to the count is the single preempt_disable() the scheduler itself takes before calling __schedule(). Any additional count means the task was already in an atomic context when it tried to schedule.
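For reference, PREEMPT_DISABLE_OFFSET comes from include/linux/preempt.h, where it is defined along these lines (it is only zero when CONFIG_PREEMPT_COUNT is disabled):
#if defined(CONFIG_PREEMPT_COUNT)
# define PREEMPT_DISABLE_OFFSET PREEMPT_OFFSET
#else
# define PREEMPT_DISABLE_OFFSET 0
#endif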
So why is the region between spin_lock_bh() and spin_unlock_bh() an atomic context?
static __always_inline void spin_lock_bh(spinlock_t *lock)
{
raw_spin_lock_bh(&lock->rlock);
}
#define raw_spin_lock_bh(lock) _raw_spin_lock_bh(lock)
void __lockfunc _raw_spin_lock_bh(raw_spinlock_t *lock)
{
__raw_spin_lock_bh(lock);
}
static inline void __raw_spin_lock_bh(raw_spinlock_t *lock)
{
	__local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET);
	spin_acquire(&lock->dep_map, 0, 0, _RET_IP_);
	LOCK_CONTENDED(lock, do_raw_spin_trylock, do_raw_spin_lock);
}
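The call that matters is __local_bh_disable_ip(). In the common configuration (CONFIG_TRACE_IRQFLAGS disabled) it is essentially just a preempt-count update, along the lines of include/linux/bottom_half.h:
static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt)
{
	preempt_count_add(cnt);
	barrier();
}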
So spin_lock_bh() adds SOFTIRQ_LOCK_OFFSET to the preempt count, in the same way that an ordinary preempt_disable() adds one:
#define preempt_disable() \
do { \
preempt_count_inc(); \
barrier(); \
} while (0)
preempt_count_inc() is simply preempt_count_add(1). Either way, once the count has been raised, preempt_count() is no longer equal to the bare PREEMPT_DISABLE_OFFSET that schedule_debug() expects, in_atomic_preempt_off() evaluates to true, and the kernel prints "BUG: scheduling while atomic".
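To make the arithmetic concrete, here is a small standalone sketch (userspace C, not kernel code) that mimics the preempt-count bookkeeping on this path. The constant values are assumptions that mirror the usual include/linux/preempt.h layout with CONFIG_PREEMPT_COUNT enabled:

#include <stdio.h>

/* assumed values, mirroring include/linux/preempt.h with CONFIG_PREEMPT_COUNT=y */
#define PREEMPT_OFFSET          (1UL << 0)
#define SOFTIRQ_OFFSET          (1UL << 8)
#define SOFTIRQ_DISABLE_OFFSET  (2 * SOFTIRQ_OFFSET)
#define PREEMPT_DISABLE_OFFSET  PREEMPT_OFFSET
#define SOFTIRQ_LOCK_OFFSET     (SOFTIRQ_DISABLE_OFFSET + PREEMPT_DISABLE_OFFSET)

static unsigned long preempt_count; /* stands in for the real per-task/per-cpu counter */

int main(void)
{
	printf("process context:      0x%03lx\n", preempt_count);

	/* spin_lock_bh() -> __local_bh_disable_ip(_RET_IP_, SOFTIRQ_LOCK_OFFSET) */
	preempt_count += SOFTIRQ_LOCK_OFFSET;
	printf("after spin_lock_bh(): 0x%03lx\n", preempt_count);

	/* msleep() -> schedule(); the scheduler disables preemption once itself */
	preempt_count += PREEMPT_DISABLE_OFFSET;
	printf("inside __schedule():  0x%03lx\n", preempt_count);

	/* schedule_debug(): in_atomic_preempt_off() */
	if (preempt_count != PREEMPT_DISABLE_OFFSET)
		printf("BUG: scheduling while atomic (0x%03lx != 0x%03lx)\n",
		       preempt_count, (unsigned long)PREEMPT_DISABLE_OFFSET);

	return 0;
}

Had the lock been dropped before anything that can sleep, the count seen by schedule_debug() would have been exactly PREEMPT_DISABLE_OFFSET and no warning would fire.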
This post has looked at what goes wrong when a sleeping function is called from an atomic context in the Linux kernel, and analyzed in detail why the region between spin_lock_bh() and spin_unlock_bh() is atomic and how that triggers the "BUG: scheduling while atomic" report.