This post is a record of one debugging session.
Symptom:
When the application layer calls receive, the call occasionally takes more than 1 ms (the cycle frequency is 1000 Hz, so a receive normally completes within 100 us), and sometimes even 4-5 ms.
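For reference, the application-side measurement looked roughly like the following. This is a minimal sketch, assuming a user-space cyclic task built on the ecrt_ API; the function name, variable names, and the 100 us threshold are illustrative, not taken from the production code.

#include <stdio.h>
#include <time.h>
#include <ecrt.h>

static void cyclic_task(ec_master_t *master)
{
    struct timespec t0, t1;
    long ns;

    clock_gettime(CLOCK_MONOTONIC, &t0);
    ecrt_master_receive(master);        /* the call that occasionally stalls */
    clock_gettime(CLOCK_MONOTONIC, &t1);

    ns = (t1.tv_sec - t0.tv_sec) * 1000000000L + (t1.tv_nsec - t0.tv_nsec);
    if (ns > 100000)                    /* > 100 us is abnormal at 1000 Hz */
        fprintf(stderr, "receive took %ld ns\n", ns);

    /* ... ecrt_domain_process(), PDO handling, ecrt_master_send() ... */
}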
Environment:
RT Linux + IgH EtherCAT Master 1.5.2
Investigation:
1 First added timing instrumentation in the driver and in the user-space library, confirming that the timeouts were indeed caused by EtherCAT.
2 Narrowed the timed region step by step, down to the receive ioctl returning slowly because of a lock.
3 Rewrote the IgH driver: the lock helpers were changed from the original inline functions into macros with built-in timing, which also record the source line of the last lock operation.
The original locks, master/lock.h:
static inline void ec_lock_init(ec_lock_t *sem) { sema_init(sem, 1); }
static inline void ec_lock_down(ec_lock_t *sem) { down(sem); }
static inline int ec_lock_down_interruptible(ec_lock_t *sem) { return down_interruptible(sem); }
static inline void ec_lock_up(ec_lock_t *sem) { up(sem);}
Rewritten as:
extern ktime_t start_time_ns_test;
extern ktime_t end_time_ns_test;
extern s64 elapsed_time_ns_test;
extern ktime_t start_time_ns_test2;
extern ktime_t end_time_ns_test2;
extern s64 elapsed_time_ns_test2;
extern volatile ktime_t start_time_ns_test3;
extern volatile ktime_t end_time_ns_test3;
extern volatile ktime_t lock_time[1024];
extern s64 elapsed_time_ns_test3;
extern int temporaryRet;
extern volatile int lock_array[1024];
extern volatile int master_fsm;
extern volatile int master_fsm_reason;
extern volatile s64 schedule_elapsed_time;
#define OVERTIME 300000
//#define ec_lock_init(sem) do { start_time_ns_test=ktime_get(); sema_init((sem), 1); end_time_ns_test=ktime_get();if((elapsed_time_ns_test=ktime_to_ns(ktime_sub(end_time_ns_test, start_time_ns_test)))>OVERTIME){printk("xiuwen%s.c: %d %s too long cost %lld\n",__FILE__,__LINE__,__FUNCTION__,elapsed_time_ns_test);}} while (0)
#define ec_lock_init(sem) do{ sema_init(sem, 1); printk("xiuwen%s: %d %s idx %lu\n",__FILE__,__LINE__,__FUNCTION__,(uintptr_t)sem%1024);} while(0)
#define ec_lock_down(sem) do \
{\
start_time_ns_test2=ktime_get(); down(sem);end_time_ns_test2=ktime_get();\
if((elapsed_time_ns_test2=ktime_to_ns(ktime_sub(end_time_ns_test2, start_time_ns_test2)))>OVERTIME)\
{\
printk("xiuwen%s: %d %s lock down too long cost %lld lastLine %d idx %ld pid %d tid %d master %d reason %dn",\
__FILE__,__LINE__,__FUNCTION__,elapsed_time_ns_test2,lock_array[(uintptr_t)sem%1024],(uintptr_t)sem%1024,current->pid,current->tgid,master_fsm,master_fsm_reason);\
}\
lock_array[(uintptr_t)sem%1024]=__LINE__;\
lock_time[(uintptr_t)sem%1024]=end_time_ns_test2;\
} while (0)
#define ec_lock_down_interruptible(sem) ((((start_time_ns_test=ktime_get())&&(0==(temporaryRet=down_interruptible((sem)))))&&(end_time_ns_test=ktime_get()))&&\
((elapsed_time_ns_test=ktime_to_ns(ktime_sub(end_time_ns_test, start_time_ns_test)))>OVERTIME?\
(NULL!=(1+(printk("xiuwen%s: %d %s lock_down_interrupt too long %lld lastLine %d idx %ld pid %d tid %d master %d reason %d\n",\
__FILE__,__LINE__,__FUNCTION__,elapsed_time_ns_test,lock_array[(uintptr_t)sem%1024],(uintptr_t)sem%1024,current->pid,current->tgid,master_fsm,master_fsm_reason)\
))):1),(lock_array[(uintptr_t)sem%1024]=__LINE__),(lock_time[(uintptr_t)sem%1024]=end_time_ns_test),temporaryRet)
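Unlike the other three helpers, ec_lock_down_interruptible() has to stay a single expression because callers test its return value (see the if () around it in ec_master_operation_thread below), which is why it is written as a comma-operator chain instead of a do { } while (0) block.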
#define ec_lock_up(sem) do \
{ start_time_ns_test3=ktime_get(); \
up(sem);\
end_time_ns_test3=ktime_get();\
if((elapsed_time_ns_test3=ktime_to_ns(ktime_sub(end_time_ns_test3, start_time_ns_test3)))>OVERTIME)\
{\
printk("xiuwen%s: %d %s lock up too long cost %lld lastLine %d idx %ld pid %d tid %d master %d reason %d\n",\
__FILE__,__LINE__,__FUNCTION__,elapsed_time_ns_test3,lock_array[(uintptr_t)sem%1024],(uintptr_t)sem%1024,current->pid,current->tgid ,master_fsm,master_fsm_reason);\
}\
if((elapsed_time_ns_test3=ktime_to_ns(ktime_sub(end_time_ns_test3, lock_time[(uintptr_t)sem%1024])))>OVERTIME)\
{\
printk("xiuwen%s: %d %s down lock spend to long time %lld,",__FILE__,__LINE__,__FUNCTION__,elapsed_time_ns_test3);\
}\
lock_array[(uintptr_t)sem%1024]=__LINE__;\
} while (0)
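The switch from inline functions to macros is what makes the lastLine bookkeeping possible: __FILE__, __LINE__ and __FUNCTION__ expand at the macro's use site, so every lock operation records where it was called from, whereas inside an inline function they would always name lock.h itself. A minimal standalone illustration (hypothetical names, not part of the driver):

#include <stdio.h>

static inline void log_inline(void) { printf("%s:%d\n", __FILE__, __LINE__); } /* always prints this line */
#define LOG_MACRO() printf("%s:%d\n", __FILE__, __LINE__)                      /* prints the caller's line */

int main(void)
{
    log_inline(); /* reports the definition site */
    LOG_MACRO();  /* reports this call site */
    return 0;
}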
4 Running the test again showed that the lock was being held for too long by the EtherCAT driver's operation thread:
master/master.c
5 Reading the code showed that the lock is held while the slave state machines execute, and occasionally an execution runs very slow, so the lock is not released in time and the application-layer receive times out. I wanted to trace into the state-machine functions, but the sub-state machine that overran was different every time and the code is fairly complex, so I gave up on that and looked for another way in.
6 I noticed that the application-layer task runs under the FIFO policy at the highest priority, while the master's OP thread runs under the normal policy. It looked as if the state machine simply runs too rarely: something goes wrong in between, and handling that on the next state-machine pass is what makes it take so long. So I changed the OP thread to SCHED_FIFO at the highest priority as well.
master/master.c : ec_master_operation_thread
static int ec_master_operation_thread(void *priv_data)
{
ec_master_t *master = (ec_master_t *) priv_data;
// struct cpumask mask;
struct sched_param param;
param.sched_priority = 99; // raise the OP thread to the highest RT priority
if (sched_setscheduler(current, SCHED_FIFO, &param))
{
pr_err("Failed to set scheduler\n");
return -1;
}
EC_MASTER_DBG(master, 1, "Operation thread running"
" with fsm interval = %u us, max data size=%zu\n",
master->send_interval, master->max_queue_size);
while (!kthread_should_stop()) {
ec_datagram_output_stats(&master->fsm_datagram);
if (master->injection_seq_rt == master->injection_seq_fsm) {
// output statistics
ec_master_output_stats(master);
// execute master & slave state machines
if (ec_lock_down_interruptible(&master->master_sem)) {
break;
}
if (ec_fsm_master_exec(&master->fsm)) {
// Inject datagrams (let the RT thread queue them, see
// ecrt_master_send())
master->injection_seq_fsm++;
}
// if rt_slave_requests is true and the slaves are available
// this will be handled by the app explicitly calling
// ecrt_master_exec_slave_request()
if (!master->rt_slave_requests || !master->rt_slaves_available) {
ec_master_exec_slave_fsms(master);
}
ec_lock_up(&master->master_sem);
}
#ifdef EC_USE_HRTIMER
// the op thread should not work faster than the sending RT thread
ec_master_nanosleep(master->send_interval * 1000);
#else
if (ec_fsm_master_idle(&master->fsm)) {
set_current_state(TASK_INTERRUPTIBLE);
schedule_timeout(1);
}
else {
schedule();
}
#endif
}
EC_MASTER_DBG(master, 1, "Master OP thread exiting...\n");
return 0;
}
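A portability note on the sched_setscheduler() call above: it worked on the kernel used here, but since v5.9 it is no longer exported to kernel modules; the in-tree replacement for this pattern is sched_set_fifo(). A sketch of the equivalent change, assuming such a newer kernel:

#include <linux/sched.h>

sched_set_fifo(current); /* SCHED_FIFO at a fixed mid-range RT priority, v5.9+; cannot fail */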
7 After this change the timeout never appeared again, and a full-load memory and CPU stress test with stress-ng (see the example invocation below) also ran normally for more than 24 hours.
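The exact stress-ng options were not recorded; the invocation was along these lines (all flags below are standard stress-ng options, the worker counts are illustrative):

stress-ng --cpu 0 --vm 4 --vm-bytes 75% --timeout 24h --metrics-brief

--cpu 0 starts one CPU stressor per online core; --vm with --vm-bytes applies the memory pressure.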
Conclusion:
The EtherCAT protocol is fairly complex and the IgH state machines are hard to trace, so for now the investigation only got as far as the scheduling policy: the master's OP thread was scheduled too rarely, or was preempted, and that caused the overruns (though the IgH OP thread holds the lock as a spinlock, which in theory should keep it from being preempted). If I pick this up again, there are two directions:
1 Disable interrupts while the OP thread runs one loop iteration and re-enable them when the iteration ends, to verify whether preemption is the cause (a sketch follows this list).
2 Trace the slave state-machine code.
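For direction 1, a sketch of the intended experiment. Big caveat: this is hypothetical and unverified; down_interruptible() and the nanosleep can sleep, so interrupts could only be masked around the state-machine step itself, and it assumes ec_fsm_master_exec() never sleeps on this path:

/* hypothetical experiment inside the while (!kthread_should_stop()) loop */
unsigned long flags;

if (ec_lock_down_interruptible(&master->master_sem))
    break;
local_irq_save(flags);    /* mask IRQs for this FSM step only           */
if (ec_fsm_master_exec(&master->fsm))
    master->injection_seq_fsm++;
local_irq_restore(flags); /* re-enable before anything that might sleep */
ec_lock_up(&master->master_sem);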
PS:
If anyone knows the exact root cause, please leave a comment.