enum netdev_state_t {
__LINK_STATE_START,
__LINK_STATE_PRESENT,
__LINK_STATE_NOCARRIER,
__LINK_STATE_LINKWATCH_PENDING,
__LINK_STATE_DORMANT,
};
22.6.14 Enhanced buffer descriptors
The i.MX6ULL uses the descriptor layout below to describe a buffer.
The data buffer pointer field holds the address of the buffer, which must reside in external memory.
An enhanced buffer descriptor corresponds to struct bufdesc in the driver.
struct bufdesc {
__fec16 cbd_sc; /* Control and status info */
__fec16 cbd_datlen; /* Data length */
__fec32 cbd_bufaddr; /* Buffer address */
};
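When extended descriptors are enabled (fep->bufdesc_ex), the driver uses the larger struct bufdesc_ex instead, which embeds struct bufdesc and adds extended control/status, checksum and timestamp fields; its layout in fec.h is roughly:
struct bufdesc_ex {
	struct bufdesc desc;
	__fec32 cbd_esc;	/* extended control and status */
	__fec32 cbd_prot;	/* protocol / checksum info */
	__fec32 cbd_bdu;
	__fec32 ts;		/* IEEE 1588 timestamp */
	__fec16 res0[4];
};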
How the i.MX6ULL stores packets
On receive, the NIC DMAs incoming packets into a ring buffer in memory and raises an interrupt to tell the driver that data has arrived; on transmit, the driver writes packets into a ring buffer and tells the NIC to copy them out. Both directions therefore stage packets through a ring buffer, so how are packets actually laid out in it?
As the figure shows, a ring buffer is made up of many struct bufdesc entries.
For the receive ring, the NIC keeps filling bufdescs with data until the ring is full, while the driver keeps draining them until the ring is empty.
The NIC and the driver thus form a producer/consumer pair, as sketched below.
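A minimal sketch of that handshake on a single RX descriptor, using the BD_ENET_RX_EMPTY bit in cbd_sc (simplified from the real RX path shown later; process_frame is a hypothetical helper):
/* Simplified ownership handshake on one RX descriptor:
 * the driver marks the BD empty (hardware owns it and may DMA a frame in);
 * the hardware clears EMPTY when a frame lands (driver owns it again).
 */
if (!(fec16_to_cpu(bdp->cbd_sc) & BD_ENET_RX_EMPTY)) {
	/* a frame is waiting: hand the skb bound to this BD to the stack */
	process_frame(rxq->rx_skbuff[index]);	/* hypothetical helper */
	/* give the descriptor back to the hardware */
	bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
}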
On the i.MX6ULL, entries stored in a queue take the form of struct bufdesc, and the ring buffer size is also computed in units of this structure: a queue holds a fixed number of struct bufdesc entries.
struct sk_buff is the kernel's structure for describing a packet. The i.MX6ULL driver maps sk_buff->data and bufdesc->cbd_bufaddr to the same memory, so once the NIC has DMAed data to cbd_bufaddr the kernel can read the packet through sk_buff->data; one skb corresponds to one bufdesc.
In the i.MX6ULL MAC driver each rxq keeps an skb array whose element count matches the number of bufdescs in the ring buffer, so a descriptor's position in the ring also indexes its skb (see the helper sketched below).
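The driver recovers that index from a descriptor pointer with fec_enet_get_bd_index(); the helper in fec_main.c is roughly:
static int fec_enet_get_bd_index(struct bufdesc *bdp,
				 struct bufdesc_prop *bd)
{
	/* offset from the ring base, divided by the descriptor size */
	return ((const char *)bdp - (const char *)bd->base) >> bd->dsize_log2;
}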
fec_probe
static int
fec_probe(struct platform_device *pdev)
{
struct fec_enet_private *fep;
struct fec_platform_data *pdata;
struct net_device *ndev;
int i, irq, ret = 0;
struct resource *r;
const struct of_device_id *of_id;
static int dev_id;
struct device_node *np = pdev->dev.of_node, *phy_node;
int num_tx_qs;
int num_rx_qs;
fec_enet_get_queue_num(pdev, &num_tx_qs, &num_rx_qs);
/* Init network device */
ndev = alloc_etherdev_mqs(sizeof(struct fec_enet_private) +
FEC_STATS_SIZE, num_tx_qs, num_rx_qs);
if (!ndev)
return -ENOMEM;
SET_NETDEV_DEV(ndev, &pdev->dev);
/* setup board info structure */
fep = netdev_priv(ndev);
of_id = of_match_device(fec_dt_ids, &pdev->dev); //look up fec_dt_ids->data, which records the SoC-specific quirks
if (of_id)
pdev->id_entry = of_id->data;
fep->quirks = pdev->id_entry->driver_data; //save the quirks
fep->netdev = ndev; //fill in the private data structure fep
fep->num_rx_queues = num_rx_qs;
fep->num_tx_queues = num_tx_qs;
#if !defined(CONFIG_M5272)
/* default enable pause frame auto negotiation */
if (fep->quirks & FEC_QUIRK_HAS_GBIT)
fep->pause_flag |= FEC_PAUSE_FLAG_AUTONEG;
#endif
/* Select default pin state */
pinctrl_pm_select_default_state(&pdev->dev);
r = platform_get_resource(pdev, IORESOURCE_MEM, 0); //get the reg resource; it is mapped into fep->hwp below
fep->hwp = devm_ioremap_resource(&pdev->dev, r);
if (IS_ERR(fep->hwp)) {
ret = PTR_ERR(fep->hwp);
goto failed_ioremap;
}
fep->pdev = pdev;
fep->dev_id = dev_id++;
platform_set_drvdata(pdev, ndev); //set the driver data
if ((of_machine_is_compatible("fsl,imx6q") ||
of_machine_is_compatible("fsl,imx6dl")) &&
!of_property_read_bool(np, "fsl,err006687-workaround-present"))
fep->quirks |= FEC_QUIRK_ERR006687;
if (of_get_property(np, "fsl,magic-packet", NULL))
fep->wol_flag |= FEC_WOL_HAS_MAGIC_PACKET;
phy_node = of_parse_phandle(np, "phy-handle", 0); //get the phy node
if (!phy_node && of_phy_is_fixed_link(np)) {
ret = of_phy_register_fixed_link(np);
if (ret < 0) {
dev_err(&pdev->dev,
"broken fixed-link specification\n");
goto failed_phy;
}
phy_node = of_node_get(np);
}
fep->phy_node = phy_node;
ret = of_get_phy_mode(pdev->dev.of_node); //get the phy interface mode
if (ret < 0) {
pdata = dev_get_platdata(&pdev->dev);
if (pdata)
fep->phy_interface = pdata->phy;
else
fep->phy_interface = PHY_INTERFACE_MODE_MII;
} else {
fep->phy_interface = ret;
}
fep->clk_ipg = devm_clk_get(&pdev->dev, "ipg"); //get the various clocks
if (IS_ERR(fep->clk_ipg)) {
ret = PTR_ERR(fep->clk_ipg);
goto failed_clk;
}
...... //some clock-related code omitted
ret = fec_enet_clk_enable(ndev, true); //enable the clocks
if (ret)
goto failed_clk;
ret = clk_prepare_enable(fep->clk_ipg);
if (ret)
goto failed_clk_ipg;
fep->reg_phy = devm_regulator_get(&pdev->dev, "phy");
if (!IS_ERR(fep->reg_phy)) {
ret = regulator_enable(fep->reg_phy);
if (ret) {
dev_err(&pdev->dev,
"Failed to enable phy regulator: %d\n", ret);
clk_disable_unprepare(fep->clk_ipg);
goto failed_regulator;
}
} else {
if (PTR_ERR(fep->reg_phy) == -EPROBE_DEFER) {
ret = -EPROBE_DEFER;
goto failed_regulator;
}
fep->reg_phy = NULL;
}
pm_runtime_set_autosuspend_delay(&pdev->dev, FEC_MDIO_PM_TIMEOUT);
pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_get_noresume(&pdev->dev);
pm_runtime_set_active(&pdev->dev);
pm_runtime_enable(&pdev->dev);
ret = fec_reset_phy(pdev); //reset the PHY chip via GPIO
if (ret)
goto failed_reset;
if (fep->bufdesc_ex)
fec_ptp_init(pdev);
ret = fec_enet_init(ndev); //allocate txq/rxq and their bufdescs, initialize the bufdescs, set netdev_ops/ethtool_ops, add the NAPI poll handler
if (ret) //fec_restart(ndev) inside: program some MAC registers, initialize the rxq bufdescs, free the txq bufdescs, write the rxq ring buffer's DDR address to the hardware
goto failed_init;
for (i = 0; i < FEC_IRQ_NUM; i++) {
irq = platform_get_irq(pdev, i);
if (irq < 0) {
if (i)
break;
ret = irq;
goto failed_irq;
}
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt, //register the interrupt handler
0, pdev->name, ndev);
if (ret)
goto failed_irq;
fep->irq[i] = irq;
}
init_completion(&fep->mdio_done);
ret = fec_enet_mii_init(pdev); //read the dts mdio node and register the mdio bus (described by a struct mii_bus, which carries the mdio read/write functions used to access the PHY)
if (ret) //then read the phy child nodes under the dts mdio node and register a phy_device for each
goto failed_mii_init;
/* Carrier starts down, phylib will bring it up */
netif_carrier_off(ndev); //carrier is down for now, so the kernel will not transmit
fec_enet_clk_enable(ndev, false);
pinctrl_pm_select_sleep_state(&pdev->dev);
ret = register_netdev(ndev); //register the network device
if(ret)
goto failed_register;
device_init_wakeup(&ndev->dev, fep->wol_flag &
FEC_WOL_HAS_MAGIC_PACKET);
if (fep->bufdesc_ex && fep->ptp_clock)
netdev_info(ndev, "registered PHC device %d\n", fep->dev_id);
fep->rx_copybreak = COPYBREAK_DEFAULT;
INIT_WORK(&fep->tx_timeout_work, fec_enet_timeout_work);
pm_runtime_mark_last_busy(&pdev->dev);
pm_runtime_put_autosuspend(&pdev->dev);
return 0;
}
alloc_etherdev_mqs
This mainly does the following:
① allocate the netdev and apply the common Ethernet defaults in the ether_setup() callback
② allocate the private data area (struct fec_enet_private)
③ allocate the net_device RX/TX queue arrays (struct netdev_rx_queue / struct netdev_queue); exactly what these core tx/rx queues are used for is not covered here;
the i.MX6ULL driver describes its own transmit/receive queues with struct fec_enet_priv_tx_q / struct fec_enet_priv_rx_q (fep->tx_queue[] / fep->rx_queue[]).
The netdev and the private area are allocated from a single block of memory, with 32-byte alignment applied both before and after the netdev.
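Because of that layout, netdev_priv() can find the private area purely by pointer arithmetic; the core helper is essentially:
static inline void *netdev_priv(const struct net_device *dev)
{
	/* the private area starts right after the 32-byte-aligned net_device */
	return (char *)dev + ALIGN(sizeof(struct net_device), NETDEV_ALIGN);
}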
//linux-5.4.47\net\ethernet\eth.c
struct net_device *alloc_etherdev_mqs(int sizeof_priv, unsigned int txqs,
unsigned int rxqs)
{
return alloc_netdev_mqs(sizeof_priv, "eth%d", NET_NAME_UNKNOWN,
ether_setup, txqs, rxqs); //ether_setup: set common netdev defaults
}
struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
unsigned char name_assign_type,
void (*setup)(struct net_device *),
unsigned int txqs, unsigned int rxqs)
{
struct net_device *dev;
unsigned int alloc_size;
struct net_device *p;
BUG_ON(strlen(name) >= sizeof(dev->name)); //BUG if the requested name is longer than netdev->name can hold
if (txqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero queues\n");
return NULL;
}
if (rxqs < 1) {
pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n");
return NULL;
}
alloc_size = sizeof(struct net_device); //alloc_size starts as the size of struct net_device
if (sizeof_priv) {
/* ensure 32-byte alignment of private area */
alloc_size = ALIGN(alloc_size, NETDEV_ALIGN); //round the netdev size up to 32-byte alignment
alloc_size += sizeof_priv; //add the private data size
}
/* ensure 32-byte alignment of whole construct */
alloc_size += NETDEV_ALIGN - 1; //reserve alloc_size + 32 - 1 so the start can be realigned
p = kvzalloc(alloc_size, GFP_KERNEL | __GFP_RETRY_MAYFAIL);
if (!p)
return NULL;
dev = PTR_ALIGN(p, NETDEV_ALIGN); //align p to 32 bytes; this becomes the netdev address
dev->padded = (char *)dev - (char *)p; //padded records how many bytes of padding sit between the allocation start and the netdev
dev->pcpu_refcnt = alloc_percpu(int);
if (!dev->pcpu_refcnt)
goto free_dev;
if (dev_addr_init(dev)) //initialize the mac address
goto free_pcpu;
dev_mc_init(dev);
dev_uc_init(dev);
dev_net_set(dev, &init_net);
netdev_register_lockdep_key(dev);
/*
TSO: TCP Segmentation Offload - the NIC itself segments large packets, reducing CPU load.
GSO: Generic Segmentation Offload - before transmitting, check whether the NIC supports TSO;
if it does, let the NIC do the segmentation, otherwise the protocol stack segments the data
itself and only then hands the segments to the NIC.
*/
dev->gso_max_size = GSO_MAX_SIZE;
dev->gso_max_segs = GSO_MAX_SEGS;
dev->upper_level = 1;
dev->lower_level = 1;
INIT_LIST_HEAD(&dev->napi_list); //initialize the napi list
INIT_LIST_HEAD(&dev->unreg_list);
INIT_LIST_HEAD(&dev->close_list);
INIT_LIST_HEAD(&dev->link_watch_list);
INIT_LIST_HEAD(&dev->adj_list.upper);
INIT_LIST_HEAD(&dev->adj_list.lower);
INIT_LIST_HEAD(&dev->ptype_all); //initialize the ptype_all list, used on the protocol-stack receive path
INIT_LIST_HEAD(&dev->ptype_specific); //initialize the ptype_specific list
#ifdef CONFIG_NET_SCHED
hash_init(dev->qdisc_hash);
#endif
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
if (!dev->tx_queue_len) { //default tx queue length
dev->priv_flags |= IFF_NO_QUEUE;
dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
}
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
if (netif_alloc_netdev_queues(dev)) //allocate the tx queues
goto free_all;
dev->num_rx_queues = rxqs;
dev->real_num_rx_queues = rxqs;
if (netif_alloc_rx_queues(dev)) //allocate the rx queues
goto free_all;
strcpy(dev->name, name);
dev->name_assign_type = name_assign_type;
dev->group = INIT_NETDEV_GROUP;
if (!dev->ethtool_ops)
dev->ethtool_ops = &default_ethtool_ops; //set the default ethtool ops
nf_hook_ingress_init(dev);
return dev;
free_all:
free_netdev(dev);
return NULL;
free_pcpu:
free_percpu(dev->pcpu_refcnt);
free_dev:
netdev_freemem(dev);
return NULL;
}
fec_enet_init
The main work here:
① allocate the bufdesc memory for the txq/rxq ring buffers
② set netdev_ops and ethtool_ops
③ add the NAPI poll handler
④ apply some feature flags / quirks
static int fec_enet_init(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct bufdesc *cbd_base; //buffer descriptor base
dma_addr_t bd_dma;
int bd_size;
unsigned int i;
unsigned dsize = fep->bufdesc_ex ? sizeof(struct bufdesc_ex) : sizeof(struct bufdesc); //size of one buffer descriptor
unsigned dsize_log2 = __fls(dsize);
WARN_ON(dsize != (1 << dsize_log2));
#if defined(CONFIG_ARM)
fep->rx_align = 0xf;
fep->tx_align = 0xf;
#else
fep->rx_align = 0x3;
fep->tx_align = 0x3;
#endif
fec_enet_alloc_queue(ndev); //allocate fec_enet_priv_tx_q / fec_enet_priv_rx_q and fill them in; DMA-allocate the TSO header area txq->tso_hdrs
//this mainly fixes the txq/rxq ring buffer sizes; a ring buffer stores bufdescs, so its size is counted in struct bufdesc units
bd_size = (fep->total_tx_ring_size + fep->total_rx_ring_size) * dsize; //total bufdesc size for txq + rxq
/* Allocate memory for buffer descriptors. */
cbd_base = dmam_alloc_coherent(&fep->pdev->dev, bd_size, &bd_dma, //coherent DMA allocation for all txq/rxq bufdescs
GFP_KERNEL);
if (!cbd_base) {
return -ENOMEM;
}
memset(cbd_base, 0, bd_size);
/* Get the Ethernet address */
fec_get_mac(ndev);
/* make sure MAC we just acquired is programmed into the hw */
fec_set_mac_address(ndev, NULL);
/* Set receive and transmit descriptor base. */
for (i = 0; i < fep->num_rx_queues; i++) {
struct fec_enet_priv_rx_q *rxq = fep->rx_queue[i];
unsigned size = dsize * rxq->bd.ring_size; //size of this rxq's ring buffer
rxq->bd.qid = i; //queue id
rxq->bd.base = cbd_base; //descriptor base: a ring buffer holds many bufdescs, cbd_base is the address of the first one
rxq->bd.cur = cbd_base; //the currently available bufdesc
rxq->bd.dma = bd_dma; //dma (bus) address
rxq->bd.dsize = dsize; //size of one bufdesc
rxq->bd.dsize_log2 = dsize_log2;
rxq->bd.reg_desc_active = fep->hwp + offset_des_active_rxq[i]; //address of the ENETx_RDAR register
bd_dma += size; //advance the dma address past this rxq
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size); //advance cbd_base past the end of this ring buffer
rxq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize); //last bufdesc of the rxq
}
for (i = 0; i < fep->num_tx_queues; i++) {
struct fec_enet_priv_tx_q *txq = fep->tx_queue[i];
unsigned size = dsize * txq->bd.ring_size;
txq->bd.qid = i;
txq->bd.base = cbd_base;
txq->bd.cur = cbd_base;
txq->bd.dma = bd_dma;
txq->bd.dsize = dsize;
txq->bd.dsize_log2 = dsize_log2;
txq->bd.reg_desc_active = fep->hwp + offset_des_active_txq[i]; //address of the ENETx_TDAR register
bd_dma += size;
cbd_base = (struct bufdesc *)(((void *)cbd_base) + size);
txq->bd.last = (struct bufdesc *)(((void *)cbd_base) - dsize);
}
/* The FEC Ethernet specific entries in the device structure */
ndev->watchdog_timeo = TX_TIMEOUT; //transmit timeout
ndev->netdev_ops = &fec_netdev_ops; //set netdev_ops
ndev->ethtool_ops = &fec_enet_ethtool_ops; //set ethtool_ops
writel(FEC_RX_DISABLED_IMASK, fep->hwp + FEC_IMASK); //program the interrupt mask register (RX interrupts left masked for now)
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT); //register the NAPI poll handler
if (fep->quirks & FEC_QUIRK_HAS_VLAN) //VLAN feature
/* enable hw VLAN support */
ndev->features |= NETIF_F_HW_VLAN_CTAG_RX;
if (fep->quirks & FEC_QUIRK_HAS_CSUM) {
ndev->gso_max_segs = FEC_MAX_TSO_SEGS;
/* enable hw accelerator */ //hardware offloads: checksumming, TSO, etc.
ndev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM
| NETIF_F_RXCSUM | NETIF_F_SG | NETIF_F_TSO);
fep->csum_flags |= FLAG_RX_CSUM_ENABLED;
}
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
fep->tx_align = 0;
fep->rx_align = 0x3f;
}
ndev->hw_features = ndev->features; //hardware features
fec_restart(ndev); //program some MAC registers, initialize the rxq bufdescs, free the txq bufdescs, write the rxq ring buffer's DDR address to the hardware
if (fep->quirks & FEC_QUIRK_MIB_CLEAR)
fec_enet_clear_ethtool_stats(ndev);
else
fec_enet_update_ethtool_stats(ndev);
return 0;
}
fec_enet_alloc_queue
The DMA allocations here must be physically contiguous, and the descriptor memory uses a coherent DMA allocation.
This function fixes the ring buffer sizes (how many struct bufdesc entries each ring can hold), as illustrated below.
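A rough worked example of the descriptor memory this leads to later in fec_enet_init (assuming TX_RING_SIZE == RX_RING_SIZE == 512, the usual values in fec.h for this kernel generation - check your tree):
/* sketch: how much coherent memory fec_enet_init will ask for */
unsigned dsize   = sizeof(struct bufdesc);	/* 8 bytes; bufdesc_ex is 32 */
unsigned bd_size = (512 + 512) * dsize;		/* about 8 KiB of descriptor memory */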
struct bufdesc_prop {
int qid;
/* Address of Rx and Tx buffers */
struct bufdesc *base; //address of the first bufdesc in the ring buffer
struct bufdesc *last; //address of the last bufdesc in the ring buffer
struct bufdesc *cur; //address of the next free bufdesc in the ring buffer
void __iomem *reg_desc_active;
dma_addr_t dma;
unsigned short ring_size;
unsigned char dsize;
unsigned char dsize_log2;
};
struct fec_enet_priv_tx_q {
struct bufdesc_prop bd;
unsigned char *tx_bounce[TX_RING_SIZE];
struct sk_buff *tx_skbuff[TX_RING_SIZE];
unsigned short tx_stop_threshold;
unsigned short tx_wake_threshold;
struct bufdesc *dirty_tx;
char *tso_hdrs;
dma_addr_t tso_hdrs_dma;
};
struct fec_enet_priv_rx_q {
struct bufdesc_prop bd;
struct sk_buff *rx_skbuff[RX_RING_SIZE];
};
static int fec_enet_alloc_queue(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int i;
int ret = 0;
struct fec_enet_priv_tx_q *txq;
for (i = 0; i < fep->num_tx_queues; i++) {
txq = kzalloc(sizeof(*txq), GFP_KERNEL); //fec_enet_priv_tx_q describes one FEC transmit queue
if (!txq) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->tx_queue[i] = txq; //one MAC may have several tx queues
txq->bd.ring_size = TX_RING_SIZE; //ring size of this queue
fep->total_tx_ring_size += fep->tx_queue[i]->bd.ring_size; //sum of all tx ring sizes
txq->tx_stop_threshold = FEC_MAX_SKB_DESCS; //threshold at which the queue is stopped
txq->tx_wake_threshold = (txq->bd.ring_size - txq->tx_stop_threshold) / 2;
txq->tso_hdrs = dma_alloc_coherent(&fep->pdev->dev, //coherent DMA allocation for the TSO headers (txq->tso_hdrs)
txq->bd.ring_size * TSO_HEADER_SIZE,
&txq->tso_hdrs_dma,
GFP_KERNEL);
if (!txq->tso_hdrs) {
ret = -ENOMEM;
goto alloc_failed;
}
}
for (i = 0; i < fep->num_rx_queues; i++) {
fep->rx_queue[i] = kzalloc(sizeof(*fep->rx_queue[i]), //fec_enet_priv_rx_q describes one FEC receive queue
GFP_KERNEL);
if (!fep->rx_queue[i]) {
ret = -ENOMEM;
goto alloc_failed;
}
fep->rx_queue[i]->bd.ring_size = RX_RING_SIZE;
fep->total_rx_ring_size += fep->rx_queue[i]->bd.ring_size;
}
return ret;
alloc_failed:
fec_enet_free_queue(ndev);
return ret;
}
fec_restart
static void
fec_restart(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
u32 val;
u32 temp_mac[2];
u32 rcntl = OPT_FRAME_SIZE | 0x04;
u32 ecntl = 0x2; /* ETHEREN */
if (fep->quirks & FEC_QUIRK_HAS_AVB) {
writel(0, fep->hwp + FEC_ECNTRL);
} else {
writel(1, fep->hwp + FEC_ECNTRL);
udelay(10);
}
/*
* enet-mac reset will reset mac address registers too,
* so need to reconfigure it.
*/
memcpy(&temp_mac, ndev->dev_addr, ETH_ALEN);
writel((__force u32)cpu_to_be32(temp_mac[0]),
fep->hwp + FEC_ADDR_LOW);
writel((__force u32)cpu_to_be32(temp_mac[1]),
fep->hwp + FEC_ADDR_HIGH);
/* Clear any outstanding interrupt. */
writel(0xffffffff, fep->hwp + FEC_IEVENT); //clear all pending interrupt events
fec_enet_bd_init(ndev); //initialize the rxq bufdescs and free the txq bufdescs
fec_enet_enable_ring(ndev); //write the rxq/txq ring buffer DDR addresses into the hardware
/* Reset tx SKB buffers. */
fec_enet_reset_skb(ndev);
/* Enable MII mode */
if (fep->full_duplex == DUPLEX_FULL) { //full duplex
/* FD enable */
writel(0x04, fep->hwp + FEC_X_CNTRL);
} else {
/* No Rcv on Xmit */
rcntl |= 0x02;
writel(0x0, fep->hwp + FEC_X_CNTRL);
}
/* Set MII speed */
writel(fep->phy_speed, fep->hwp + FEC_MII_SPEED); //set the MII speed
#if !defined(CONFIG_M5272)
if (fep->quirks & FEC_QUIRK_HAS_RACC) {
val = readl(fep->hwp + FEC_RACC);
/* align IP header */
val |= FEC_RACC_SHIFT16;
if (fep->csum_flags & FLAG_RX_CSUM_ENABLED)
/* set RX checksum */
val |= FEC_RACC_OPTIONS;
else
val &= ~FEC_RACC_OPTIONS;
writel(val, fep->hwp + FEC_RACC);
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_FTRL);
}
#endif
if (fep->quirks & FEC_QUIRK_ENET_MAC) { //set the interface mode and speed
/* Enable flow control and length check */
rcntl |= 0x40000000 | 0x00000020;
/* RGMII, RMII or MII */
if (fep->phy_interface == PHY_INTERFACE_MODE_RGMII ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_ID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_RXID ||
fep->phy_interface == PHY_INTERFACE_MODE_RGMII_TXID)
rcntl |= (1 << 6);
else if (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
rcntl |= (1 << 8);
else
rcntl &= ~(1 << 8);
/* 1G, 100M or 10M */
if (ndev->phydev) {
if (ndev->phydev->speed == SPEED_1000)
ecntl |= (1 << 5);
else if (ndev->phydev->speed == SPEED_100)
rcntl &= ~(1 << 9);
else
rcntl |= (1 << 9);
}
} else {
#ifdef FEC_MIIGSK_ENR
if (fep->quirks & FEC_QUIRK_USE_GASKET) {
u32 cfgr;
/* disable the gasket and wait */
writel(0, fep->hwp + FEC_MIIGSK_ENR);
while (readl(fep->hwp + FEC_MIIGSK_ENR) & 4)
udelay(1);
/*
* configure the gasket:
* RMII, 50 MHz, no loopback, no echo
* MII, 25 MHz, no loopback, no echo
*/
cfgr = (fep->phy_interface == PHY_INTERFACE_MODE_RMII)
? BM_MIIGSK_CFGR_RMII : BM_MIIGSK_CFGR_MII;
if (ndev->phydev && ndev->phydev->speed == SPEED_10)
cfgr |= BM_MIIGSK_CFGR_FRCONT_10M;
writel(cfgr, fep->hwp + FEC_MIIGSK_CFGR);
/* re-enable the gasket */
writel(2, fep->hwp + FEC_MIIGSK_ENR);
}
#endif
}
#if !defined(CONFIG_M5272)
/* enable pause frame*/
if ((fep->pause_flag & FEC_PAUSE_FLAG_ENABLE) ||
((fep->pause_flag & FEC_PAUSE_FLAG_AUTONEG) &&
ndev->phydev && ndev->phydev->pause)) {
rcntl |= FEC_ENET_FCE;
/* set FIFO threshold parameter to reduce overrun */
writel(FEC_ENET_RSEM_V, fep->hwp + FEC_R_FIFO_RSEM);
writel(FEC_ENET_RSFL_V, fep->hwp + FEC_R_FIFO_RSFL);
writel(FEC_ENET_RAEM_V, fep->hwp + FEC_R_FIFO_RAEM);
writel(FEC_ENET_RAFL_V, fep->hwp + FEC_R_FIFO_RAFL);
/* OPD */
writel(FEC_ENET_OPD_V, fep->hwp + FEC_OPD);
} else {
rcntl &= ~FEC_ENET_FCE;
}
#endif /* !defined(CONFIG_M5272) */
writel(rcntl, fep->hwp + FEC_R_CNTRL);
/* Setup multicast filter. */
set_multicast_list(ndev);
#ifndef CONFIG_M5272
writel(0, fep->hwp + FEC_HASH_TABLE_HIGH);
writel(0, fep->hwp + FEC_HASH_TABLE_LOW);
#endif
if (fep->quirks & FEC_QUIRK_ENET_MAC) {
/* enable ENET endian swap */
ecntl |= (1 << 8);
/* enable ENET store and forward mode */
writel(1 << 8, fep->hwp + FEC_X_WMRK);
}
if (fep->bufdesc_ex)
ecntl |= (1 << 4);
#ifndef CONFIG_M5272
/* Enable the MIB statistic event counters */
writel(0 << 31, fep->hwp + FEC_MIB_CTRLSTAT);
#endif
/* And last, enable the transmit and receive processing */
writel(ecntl, fep->hwp + FEC_ECNTRL);
fec_enet_active_rxring(ndev); //kick the receive descriptors (Receive Descriptor Active)
if (fep->bufdesc_ex)
fec_ptp_start_cyclecounter(ndev);
/* Enable interrupts we wish to service */
if (fep->link)
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
else
writel(FEC_ENET_MII, fep->hwp + FEC_IMASK);
/* Init the interrupt coalescing */
fec_enet_itr_coal_init(ndev);
}
fec_enet_bd_init
Initialize the buffer descriptors in each queue.
/* Init RX & TX buffer descriptors
*/
static void fec_enet_bd_init(struct net_device *dev)
{
struct fec_enet_private *fep = netdev_priv(dev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
struct bufdesc *bdp;
unsigned int i;
unsigned int q;
for (q = 0; q < fep->num_rx_queues; q++) { //iterate over all rxqs
/* Initialize the receive buffer descriptors. */
rxq = fep->rx_queue[q];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) { //iterate over every bufdesc in the rxq
/* Initialize the BD for every fragment in the page. */
if (bdp->cbd_bufaddr) //initialize the descriptor control and status field
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY); //mark the descriptor empty (hardware owned)
else
bdp->cbd_sc = cpu_to_fec16(0);
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd); //set the WRAP flag on the last rxq bufdesc
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
rxq->bd.cur = rxq->bd.base; //reset the current (free) bufdesc pointer to the ring base
}
for (q = 0; q < fep->num_tx_queues; q++) {
/* ...and the same for transmit */
txq = fep->tx_queue[q];
bdp = txq->bd.base;
txq->bd.cur = bdp;
for (i = 0; i < txq->bd.ring_size; i++) { //release the txq buffer descriptors
/* Initialize the BD for every fragment in the page. */
bdp->cbd_sc = cpu_to_fec16(0);
if (bdp->cbd_bufaddr &&
!IS_TSO_HEADER(txq, fec32_to_cpu(bdp->cbd_bufaddr)))
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
fec16_to_cpu(bdp->cbd_datlen),
DMA_TO_DEVICE);
if (txq->tx_skbuff[i]) {
dev_kfree_skb_any(txq->tx_skbuff[i]);
txq->tx_skbuff[i] = NULL;
}
bdp->cbd_bufaddr = cpu_to_fec32(0);
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
txq->dirty_tx = bdp;
}
}
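The ring walk above is done with fec_enet_get_nextdesc()/fec_enet_get_prevdesc(), which step by bd.dsize and wrap at the ring boundaries; the upstream helpers look roughly like this:
static struct bufdesc *fec_enet_get_nextdesc(struct bufdesc *bdp,
					     struct bufdesc_prop *bd)
{
	/* step forward one descriptor, wrapping from last back to base */
	return (bdp >= bd->last) ? bd->base
			: (struct bufdesc *)(((void *)bdp) + bd->dsize);
}

static struct bufdesc *fec_enet_get_prevdesc(struct bufdesc *bdp,
					     struct bufdesc_prop *bd)
{
	/* step backward one descriptor, wrapping from base to last */
	return (bdp <= bd->base) ? bd->last
			: (struct bufdesc *)(((void *)bdp) - bd->dsize);
}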
fec_enet_enable_ring enables the queues: it writes each queue's base address into the ENETx_RDSR register.
The i.MX family supports up to three queues; the i.MX6ULL supports only one.
static void fec_enet_enable_ring(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct fec_enet_priv_tx_q *txq;
struct fec_enet_priv_rx_q *rxq;
int i;
for (i = 0; i < fep->num_rx_queues; i++) { //the i.MX family supports up to 3 receive queues, the i.MX6ULL only 1; queue 0's base address lives in FEC_R_DES_START_0 (ENETx_RDSR)
rxq = fep->rx_queue[i];
writel(rxq->bd.dma, fep->hwp + FEC_R_DES_START(i)); //program the rxq ring buffer's DDR (dma/bus) address into ENETx_RDSR
writel(PKT_MAXBUF_SIZE, fep->hwp + FEC_R_BUFF_SIZE(i));
/* enable DMA1/2 */
if (i)
writel(RCMR_MATCHEN | RCMR_CMP(i), //with multiple queues, enable the other dma channels
fep->hwp + FEC_RCMR(i));
}
for (i = 0; i < fep->num_tx_queues; i++) {
txq = fep->tx_queue[i];
writel(txq->bd.dma, fep->hwp + FEC_X_DES_START(i)); //program the txq ring buffer's DDR address into ENETx_TDSR
/* enable DMA1/2 */
if (i)
writel(DMA_CLASS_EN | IDLE_SLOPE(i),
fep->hwp + FEC_DMA_CFG(i));
}
}
fec_enet_mii_init
Registers the MDIO bus; Linux describes an MDIO bus with struct mii_bus.
The most important members of mii_bus are mii_bus->read and mii_bus->write, which read and write PHY registers. These two functions are written by NXP, because every SoC's controller differs and the vendor has to implement them itself.
The mdio device is read from the device tree and registered; the PHY devices under the mdio node are read and registered as well, each described by a struct phy_device.
static int fec_enet_mii_init(struct platform_device *pdev)
{
static struct mii_bus *fec0_mii_bus;
static bool *fec_mii_bus_share;
struct net_device *ndev = platform_get_drvdata(pdev);
struct fec_enet_private *fep = netdev_priv(ndev);
struct device_node *node;
int err = -ENXIO;
u32 mii_speed, holdtime;
...... some code omitted
fep->mii_bus = mdiobus_alloc();
if (fep->mii_bus == NULL) {
err = -ENOMEM;
goto err_out;
}
fep->mii_bus->name = "fec_enet_mii_bus";
fep->mii_bus->read = fec_enet_mdio_read; //set the mii read/write functions (they access the PHY chip)
fep->mii_bus->write = fec_enet_mdio_write;
snprintf(fep->mii_bus->id, MII_BUS_ID_SIZE, "%s-%x",
pdev->name, fep->dev_id + 1);
fep->mii_bus->priv = fep;
fep->mii_bus->parent = &pdev->dev;
node = of_get_child_by_name(pdev->dev.of_node, "mdio"); //get the mdio node
err = of_mdiobus_register(fep->mii_bus, node); //register the mdio bus with the kernel
of_node_put(node);
if (err)
goto err_out_free_mdiobus;
mii_cnt++;
/* save fec0 mii_bus */
if (fep->quirks & FEC_QUIRK_SINGLE_MDIO) {
fec0_mii_bus = fep->mii_bus;
fec_mii_bus_share = &fep->mii_bus_share;
}
return 0;
err_out_free_mdiobus:
mdiobus_free(fep->mii_bus);
err_out:
return err;
}
of_mdiobus_register
mdiobus_register(mdio); registers the mii_bus with the kernel;
for_each_available_child_of_node(np, child) {} walks every child node (the PHY nodes) under the device-tree mdio node.
addr = of_mdio_parse_addr(&mdio->dev, child); parses a PHY node and reads its reg property - the PHY address.
of_mdiobus_register_phy(mdio, child, addr); registers the PHY device.
//linux-5.4.47\drivers\of\of_mdio.c
int of_mdiobus_register(struct mii_bus *mdio, struct device_node *np)
{
struct device_node *child;
bool scanphys = false;
int addr, rc;
if (!np)
return mdiobus_register(mdio);
/* Do not continue if the node is disabled */
if (!of_device_is_available(np))
return -ENODEV;
/*Register the MDIO bus */
rc = mdiobus_register(mdio);
if (rc)
return rc;
/* Loop over the child nodes and register a phy_device for each phy */
for_each_available_child_of_node(np, child) {
addr = of_mdio_parse_addr(&mdio->dev, child);
if (addr < 0) {
scanphys = true;
continue;
}
if (of_mdiobus_child_is_phy(child))
rc = of_mdiobus_register_phy(mdio, child, addr);
else
rc = of_mdiobus_register_device(mdio, child, addr);
if (rc == -ENODEV)
dev_err(&mdio->dev,
"MDIO device at address %d is missing.\n",
addr);
else if (rc)
goto unregister;
}
if (!scanphys)
return 0;
/* auto scan for PHYs with empty reg property */
for_each_available_child_of_node(np, child) {
/* Skip PHYs with reg property set */
if (of_find_property(child, "reg", NULL))
continue;
for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
/* skip already registered PHYs */
if (mdiobus_is_registered_device(mdio, addr))
continue;
/* be noisy to encourage people to set reg property */
dev_info(&mdio->dev, "scan phy %pOFn at address %i\n",
child, addr);
if (of_mdiobus_child_is_phy(child)) {
rc = of_mdiobus_register_phy(mdio, child, addr);
if (rc && rc != -ENODEV)
goto unregister;
break;
}
}
}
return 0;
unregister:
mdiobus_unregister(mdio);
return rc;
}
EXPORT_SYMBOL(of_mdiobus_register);
of_mdiobus_register_phy
Registers a PHY device.
static int of_mdiobus_register_phy(struct mii_bus *mdio,
struct device_node *child, u32 addr)
{
struct phy_device *phy;
bool is_c45;
int rc;
u32 phy_id;
is_c45 = of_device_is_compatible(child,
"ethernet-phy-ieee802.3-c45");
if (!is_c45 && !of_get_phy_id(child, &phy_id))
phy = phy_device_create(mdio, addr, phy_id, 0, NULL);
else
phy = get_phy_device(mdio, addr, is_c45);
if (IS_ERR(phy))
return PTR_ERR(phy);
rc = of_irq_get(child, 0);
if (rc == -EPROBE_DEFER) {
phy_device_free(phy);
return rc;
}
if (rc > 0) {
phy->irq = rc;
mdio->irq[addr] = rc;
} else {
phy->irq = mdio->irq[addr];
}
if (of_property_read_bool(child, "broken-turn-around"))
mdio->phy_ignore_ta_mask |= 1 << addr;
of_property_read_u32(child, "reset-assert-us",
&phy->mdio.reset_assert_delay);
/* Associate the OF node with the device structure so it
* can be looked up later */
of_node_get(child);
phy->mdio.dev.of_node = child;
phy->mdio.dev.fwnode = of_fwnode_handle(child);
/* All data is now stored in the phy struct;
* register it */
rc = phy_device_register(phy);
if(rc) {
phy_device_free(phy);
of_node_put(child);
return rc;
}
dev_dbg(&mdio->dev, "registered phy %pOFn at address %i\n",
child, addr);
return 0;
}
phy = phy_device_create(mdio, addr, phy_id, 0, NULL); creates the PHY device and returns a struct phy_device, the structure the kernel uses to describe a PHY.
rc = phy_device_register(phy); registers the PHY device.
net_device_ops
A Linux Ethernet driver exposes a net_device_ops to the layers above so that user space can control the NIC.
static const struct net_device_ops fec_netdev_ops = {
.ndo_open = fec_enet_open,
.ndo_stop = fec_enet_close,
.ndo_start_xmit = fec_enet_start_xmit,
.ndo_set_rx_mode = set_multicast_list,
.ndo_validate_addr = eth_validate_addr,
.ndo_tx_timeout = fec_timeout,
.ndo_set_mac_address = fec_set_mac_address,
.ndo_do_ioctl = fec_enet_ioctl,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = fec_poll_controller,
#endif
.ndo_set_features = fec_set_features,
};
fec_enet_open
fec_enet_open mainly does the following:
- allocate the rxq bufdesc cbd_bufaddr buffers and the rxq skb array; allocate the txq->tx_bounce array
- initialize the MAC: program some MAC registers, initialize the rxq bufdescs (marking every buffer empty again), clear all pending interrupt events, and so on
- connect the PHY: attach the phy_device to the netdev, after which phy_init_hw() calls into the PHY driver to initialize the PHY; start the PHY state machine
- enable NAPI so it can be scheduled
- start all transmit queues so transmission is allowed
static int
fec_enet_open(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int ret;
ret = pm_runtime_get_sync(&fep->pdev->dev);
if (ret < 0)
return ret;
pinctrl_pm_select_default_state(&fep->pdev->dev);
ret = fec_enet_clk_enable(ndev, true);
if (ret)
goto clk_enable;
/* I should reset the ring buffers here, but I don't yet know
* a simple way to do that.
*/
ret = fec_enet_alloc_buffers(ndev); //streaming-DMA map the rxq cbd_bufaddr buffers and allocate the rxq skb array; allocate txq->tx_bounce[] - the memory that actually holds packet data on receive and transmit
if (ret)
goto err_enet_alloc;
/* Init MAC prior to mii bus probe */
fec_restart(ndev); //initialize the MAC: program some MAC registers, initialize the rxq bufdescs (all buffers marked empty again), clear pending interrupt events, etc.
/* Probe and connect to PHY when open the interface */
ret = fec_enet_mii_probe(ndev); //connect the PHY: attach the phy_device to the netdev; phy_init_hw() then calls the PHY driver to initialize the PHY, and the PHY state machine is started
if (ret)
goto err_enet_mii_probe;
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_used();
napi_enable(&fep->napi); //enable NAPI so it can be scheduled
phy_start(ndev->phydev);
netif_tx_start_all_queues(ndev); //start all tx queues so transmission is allowed
device_set_wakeup_enable(&ndev->dev, fep->wol_flag &
FEC_WOL_FLAG_ENABLE);
return 0;
}
fec_enet_alloc_buffers
static int fec_enet_alloc_buffers(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
for (i = 0; i < fep->num_rx_queues; i++)
if (fec_enet_alloc_rxq_buffers(ndev, i)) //allocate/map rxq bufdesc->cbd_bufaddr and fill rxq->rx_skbuff[]
return -ENOMEM;
for (i = 0; i < fep->num_tx_queues; i++)
if (fec_enet_alloc_txq_buffers(ndev, i)) //txq->tx_bounce[]: presumably the memory that holds packet data on transmit
return -ENOMEM;
return 0;
}
fec_enet_alloc_rxq_buffers
Buffers that carry actual packet data should use streaming DMA mappings; the driver and the NIC's DMA engine typically communicate through in-memory descriptors (arranged as a ring or a chain), and the memory holding those descriptors normally uses a coherent DMA mapping.
Here the packet buffer behind skb->data is streaming-DMA mapped and the resulting handle is stored in bufdesc->cbd_bufaddr: cbd_bufaddr holds the DMA (bus) address used by the device, while skb->data remains the kernel virtual address used by the driver. A small sketch of the two mapping styles follows.
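A minimal sketch of the two DMA styles using the generic dma-mapping API (illustrative, not FEC-specific code):
/* Coherent mapping: long-lived, CPU and device share a consistent view.
 * Used here for the descriptor rings.
 */
dma_addr_t ring_dma;
void *ring = dma_alloc_coherent(dev, ring_bytes, &ring_dma, GFP_KERNEL);

/* Streaming mapping: per-buffer, with an explicit direction.
 * Used here for the packet data behind skb->data; the returned handle
 * is what gets written into cbd_bufaddr for the hardware.
 */
dma_addr_t buf_dma = dma_map_single(dev, skb->data, buf_len, DMA_FROM_DEVICE);
if (dma_mapping_error(dev, buf_dma))
	/* handle the mapping error */;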
static int
fec_enet_alloc_rxq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct sk_buff *skb;
struct bufdesc *bdp;
struct fec_enet_priv_rx_q *rxq;
rxq = fep->rx_queue[queue];
bdp = rxq->bd.base;
for (i = 0; i < rxq->bd.ring_size; i++) {
skb = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE); //allocate an skb
if (!skb)
goto err_alloc;
if (fec_enet_new_rxbdp(ndev, bdp, skb)) { //streaming-DMA map skb->data and store the handle in bufdesc->cbd_bufaddr; one skb per bufdesc
dev_kfree_skb(skb); //once the device has DMAed data to cbd_bufaddr, the kernel can read the packet through skb->data
goto err_alloc;
}
rxq->rx_skbuff[i] = skb; //rx_skbuff[] has as many entries as the ring has bufdescs
bdp->cbd_sc = cpu_to_fec16(BD_ENET_RX_EMPTY);
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &rxq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
fec_enet_new_rxbdp
static int
fec_enet_new_rxbdp(struct net_device *ndev, struct bufdesc *bdp, struct sk_buff *skb)
{
struct fec_enet_private *fep = netdev_priv(ndev);
int off;
off = ((unsigned long)skb->data) & fep->rx_align;
if (off)
skb_reserve(skb, fep->rx_align + 1 - off); //adjust the skb headroom so skb->data is aligned (headroom = skb->data - skb->head)
//map the memory behind skb->data and store the handle in bdp->cbd_bufaddr; the bufdesc is now bound to the skb, one skb per bufdesc
//a streaming DMA mapping needs a direction: DMA_FROM_DEVICE, i.e. the device DMAs into DDR
bdp->cbd_bufaddr = cpu_to_fec32(dma_map_single(&fep->pdev->dev, skb->data, FEC_ENET_RX_FRSIZE - fep->rx_align, DMA_FROM_DEVICE));
if (dma_mapping_error(&fep->pdev->dev, fec32_to_cpu(bdp->cbd_bufaddr))) {
if (net_ratelimit())
netdev_err(ndev, "Rx DMA memory map failed\n");
return -ENOMEM;
}
return 0;
}
fec_enet_alloc_txq_buffers
static int
fec_enet_alloc_txq_buffers(struct net_device *ndev, unsigned int queue)
{
struct fec_enet_private *fep = netdev_priv(ndev);
unsigned int i;
struct bufdesc *bdp;
struct fec_enet_priv_tx_q *txq;
txq = fep->tx_queue[queue];
bdp = txq->bd.base;
for (i = 0; i < txq->bd.ring_size; i++) {
txq->tx_bounce[i] = kmalloc(FEC_ENET_TX_FRSIZE, GFP_KERNEL); //tx_bounce[]: presumably the memory that holds packet data on transmit
if (!txq->tx_bounce[i])
goto err_alloc;
bdp->cbd_sc = cpu_to_fec16(0);
bdp->cbd_bufaddr = cpu_to_fec32(0);
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_TX_INT);
}
bdp = fec_enet_get_nextdesc(bdp, &txq->bd);
}
/* Set the last buffer to wrap. */
bdp = fec_enet_get_prevdesc(bdp, &txq->bd);
bdp->cbd_sc |= cpu_to_fec16(BD_SC_WRAP);
return 0;
err_alloc:
fec_enet_free_buffers(ndev);
return -ENOMEM;
}
fec_enet_mii_probe
The of_phy_connect call
of_phy_connect ends up in phy_attach_direct, which binds the phydev to the netdev; phy_init_hw then calls into the PHY driver.
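For context, the third argument to of_phy_connect is a link-change callback that phylib's state machine invokes whenever link, speed or duplex changes; a minimal sketch of such a callback (illustrative only - the FEC's real handler is fec_enet_adjust_link):
/* Illustrative link-change handler of the shape of_phy_connect() expects.
 * phylib calls it from the PHY state machine on every link state change.
 */
static void example_adjust_link(struct net_device *ndev)
{
	struct phy_device *phydev = ndev->phydev;

	if (phydev->link)
		netdev_info(ndev, "link up, %d Mbps, %s duplex\n",
			    phydev->speed,
			    phydev->duplex == DUPLEX_FULL ? "full" : "half");
	else
		netdev_info(ndev, "link down\n");
}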
static int fec_enet_mii_probe(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct phy_device *phy_dev = NULL;
char mdio_bus_id[MII_BUS_ID_SIZE];
char phy_name[MII_BUS_ID_SIZE + 3];
int phy_id;
int dev_id = fep->dev_id;
if (fep->phy_node) {
phy_dev = of_phy_connect(ndev, fep->phy_node, //connect the PHY: attach the phy_device to the netdev; phy_init_hw() then calls the PHY driver to initialize it;
&fec_enet_adjust_link, 0, //the PHY state machine is also started (essentially a delayed work that keeps monitoring the PHY link state)
fep->phy_interface);
if (!phy_dev) {
netdev_err(ndev, "Unable to connect to phy\n");
return -ENODEV;
}
} else {
/* check for attached phy */
for (phy_id = 0; (phy_id < PHY_MAX_ADDR); phy_id++) {
if (!mdiobus_is_registered_device(fep->mii_bus, phy_id))
continue;
if (dev_id--)
continue;
strlcpy(mdio_bus_id, fep->mii_bus->id, MII_BUS_ID_SIZE);
break;
}
if (phy_id >= PHY_MAX_ADDR) {
netdev_info(ndev, "no PHY, assuming direct connection to switch\n");
strlcpy(mdio_bus_id, "fixed-0", MII_BUS_ID_SIZE);
phy_id = 0;
}
snprintf(phy_name, sizeof(phy_name),
PHY_ID_FMT, mdio_bus_id, phy_id);
phy_dev = phy_connect(ndev, phy_name, &fec_enet_adjust_link,
fep->phy_interface);
}
if (IS_ERR(phy_dev)) {
netdev_err(ndev, "could not attach to PHY\n");
return PTR_ERR(phy_dev);
}
/* mask with MAC supported features */
if (fep->quirks & FEC_QUIRK_HAS_GBIT) {
phy_set_max_speed(phy_dev, 1000);
phy_remove_link_mode(phy_dev,
ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
#if !defined(CONFIG_M5272)
phy_support_sym_pause(phy_dev);
#endif
}
else
phy_set_max_speed(phy_dev, 100);
fep->link = 0;
fep->full_duplex = 0;
phy_attached_info(phy_dev);
return 0;
}
fec_enet_close
fec_enet_close is the mirror image of open:
- disable NAPI
- stop the transmit queues (netif_tx_disable)
- fec_stop programs some MAC registers to shut the MAC down
- phy_disconnect detaches the phydev from the netdev
- fec_enet_free_buffers frees the txq/rxq buffers
static int
fec_enet_close(struct net_device *ndev)
{
struct fec_enet_private *fep = netdev_priv(ndev);
phy_stop(ndev->phydev);
if (netif_device_present(ndev)) {
napi_disable(&fep->napi); //disable napi
netif_tx_disable(ndev); //disable the tx queues
fec_stop(ndev); //stop the MAC
}
phy_disconnect(ndev->phydev); //stop the PHY state machine and detach the phy_device from the netdev
if (fep->quirks & FEC_QUIRK_ERR006687)
imx6q_cpuidle_fec_irqs_unused();
fec_enet_update_ethtool_stats(ndev);
fec_enet_clk_enable(ndev, false); //disable the MAC clocks
pinctrl_pm_select_sleep_state(&fep->pdev->dev);
pm_runtime_mark_last_busy(&fep->pdev->dev);
pm_runtime_put_autosuspend(&fep->pdev->dev);
fec_enet_free_buffers(ndev); //free the rxq cbd_bufaddr mappings and the rxq skb array; free txq->tx_bounce[]
return 0;
}
The network receive path
Linux receives packets with NAPI - interrupts plus polling.
After the NIC has moved data into the ring buffer in memory, it raises a hardware interrupt to tell the CPU that data has arrived. The CPU enters the interrupt handler and dispatches on the event type; for a receive event it starts NAPI scheduling, which adds the driver's napi poll function (every NIC design differs, so every driver's receive routine differs) to the CPU's per-CPU network softirq data and marks the network-receive softirq as pending.
The softirq thread keeps checking whether any softirq bit has been raised; each bit corresponds to an action function (bound to the bit at boot time). Once it sees a raised bit it jumps into that handler, and it is inside that handler that the driver's napi poll is called and the real receive work begins.
NAPI call flow
//the NIC driver registers its interrupt handler and NAPI poll (usually at init time, in the probe function)
ret = devm_request_irq(&pdev->dev, irq, fec_enet_interrupt,
0, pdev->name, ndev);
netif_napi_add(ndev, &fep->napi, fec_enet_rx_napi, NAPI_POLL_WEIGHT);
/*
What the interrupt handler does:
1. read the interrupt events
2. clear the interrupt flags
3. mask the NAPI interrupts
4. add napi_struct->poll_list to softnet_data->poll_list
5. schedule NAPI
*/
static irqreturn_t fec_enet_interrupt(int irq, void *dev_id)
{
struct net_device *ndev = dev_id;
struct fec_enet_private *fep = netdev_priv(ndev);
uint int_events;
irqreturn_t ret = IRQ_NONE;
int_events = readl(fep->hwp + FEC_IEVENT); //read the interrupt events
writel(int_events, fep->hwp + FEC_IEVENT); //clear the interrupt flags
fec_enet_collect_events(fep, int_events); //mark fep->work_rx or fep->work_tx according to the events
if ((fep->work_tx || fep->work_rx) && fep->link) {
ret = IRQ_HANDLED;
if (napi_schedule_prep(&fep->napi)) { //check whether a NAPI schedule is allowed
/* Disable the NAPI interrupts */
writel(FEC_NAPI_IMASK, fep->hwp + FEC_IMASK);
__napi_schedule(&fep->napi); //start the NAPI schedule
}
}
if (int_events & FEC_ENET_MII) { //an MII (MDIO) transfer has completed
ret = IRQ_HANDLED;
complete(&fep->mdio_done);
}
return ret;
}
void __napi_schedule(struct napi_struct *n)
{
unsigned long flags;
local_irq_save(flags);
____napi_schedule(this_cpu_ptr(&softnet_data), n); //this_cpu_ptr(&softnet_data) gets the current CPU's per-CPU network softirq data
local_irq_restore(flags);
}
static inline void ____napi_schedule(struct softnet_data *sd,
struct napi_struct *napi)
{ //sd->poll_list holds the poll_list entries of every network device (wifi, eth, ...); the napi_struct is found from the list entry
list_add_tail(&napi->poll_list, &sd->poll_list); //add this napi's list node to softnet_data.poll_list (the softirq later walks this list to find the driver's poll function)
__raise_softirq_irqoff(NET_RX_SOFTIRQ); //raise the NET_RX_SOFTIRQ pending bit; every network device's receive interrupt ends up raising this bit
}
//__raise_softirq_irqoff simply ORs the corresponding pending bit into the per-CPU bitmap
void __raise_softirq_irqoff(unsigned int nr)
{
trace_softirq_raise(nr);
or_softirq_pending(1UL << nr);
}
Softirqs
Softirq: on a multi-core system every CPU runs a softirq thread that handles the work the hard interrupt did not have time for; it is the hard interrupt handler that raises the softirq. Whichever CPU handled the hard interrupt also handles the corresponding softirq. A single-core CPU has only one softirq thread.
Why NAPI helps: it improves receive efficiency. Early kernels received packets purely via interrupts, so under heavy traffic the CPU spent long stretches inside the interrupt handler, starving other work and hurting performance. Pure polling, on the other hand, is also inefficient when only a trickle of small packets arrives.
Softirq initialization
At boot, spawn_ksoftirqd calls smpboot_register_percpu_thread to create one softirq thread per CPU.
//file: kernel/softirq.c
static struct smp_hotplug_thread softirq_threads = {
.store = &ksoftirqd,
.thread_should_run = ksoftirqd_should_run,
.thread_fn = run_ksoftirqd,
.thread_comm = "ksoftirqd/%u",
};
static __init int spawn_ksoftirqd(void)
{
register_cpu_notifier(&cpu_nfb);
BUG_ON(smpboot_register_percpu_thread(&softirq_threads));
return 0;
}
early_initcall(spawn_ksoftirqd);
smpboot_register_percpu_thread is defined in kernel/smpboot.c.
//kernel/smpboot.c
int smpboot_register_percpu_thread(struct smp_hotplug_thread *plug_thread)
{
unsigned int cpu;
int ret = 0;
get_online_cpus();
mutex_lock(&smpboot_threads_lock);
for_each_online_cpu(cpu) {
ret = __smpboot_create_thread(plug_thread, cpu);
if (ret) {
smpboot_destroy_threads(plug_thread);
goto out;
}
smpboot_unpark_thread(plug_thread, cpu);
}
list_add(&plug_thread->list, &hotplug_threads);
out:
mutex_unlock(&smpboot_threads_lock);
put_online_cpus();
return ret;
}
When the kernel boots it calls the network subsystem init function net_dev_init.
There it registers the network transmit and receive actions and binds them to their softirq identifiers.
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
//file: net/core/dev.c
static int __init net_dev_init(void)
{
......
for_each_possible_cpu(i) {
struct softnet_data *sd = &per_cpu(softnet_data, i);
memset(sd, 0, sizeof(*sd));
skb_queue_head_init(&sd->input_pkt_queue);
skb_queue_head_init(&sd->process_queue);
sd->completion_queue = NULL;
INIT_LIST_HEAD(&sd->poll_list);
......
}
......
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
}
subsys_initcall(net_dev_init);
The kernel has more softirq types than just the networking ones.
//file: include/linux/interrupt.h
enum
{
HI_SOFTIRQ=0,
TIMER_SOFTIRQ,
NET_TX_SOFTIRQ,
NET_RX_SOFTIRQ,
BLOCK_SOFTIRQ,
BLOCK_IOPOLL_SOFTIRQ,
TASKLET_SOFTIRQ,
SCHED_SOFTIRQ,
HRTIMER_SOFTIRQ,
RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */
NR_SOFTIRQS
};
Softirq dispatch
softirq_threads describes the softirq thread; once the thread is registered it keeps calling ksoftirqd_should_run to check whether any softirq is pending.
When the hard interrupt scheduled NAPI it called or_softirq_pending to raise the softirq bit, so local_softirq_pending now reads back non-zero and run_ksoftirqd is invoked.
#define local_softirq_pending() (__this_cpu_read(local_softirq_pending_ref))
#define set_softirq_pending(x) (__this_cpu_write(local_softirq_pending_ref, (x)))
#define or_softirq_pending(x) (__this_cpu_or(local_softirq_pending_ref, (x)))
static int ksoftirqd_should_run(unsigned int cpu)
{
return local_softirq_pending();
}
Hardware interrupts are disabled before entering the softirq code. (The original note was unsure why, since per Wei Dongshan's tutorials softirqs can be preempted by hard interrupts; the local_irq_disable() here only protects the check of the pending bitmap - __do_softirq re-enables interrupts while the actions themselves run.)
static void run_ksoftirqd(unsigned int cpu)
{
local_irq_disable(); //disable hard interrupts
if (local_softirq_pending()) {
/*
* We can safely run softirq on inline stack, as we are not deep
* in the task stack here.
*/
__do_softirq();
local_irq_enable();
cond_resched();
return;
}
local_irq_enable();
}
__do_softirq
h->action(h); runs the registered action - here that is net_rx_action, the network receive softirq handler.
asmlinkage void __do_softirq(void)
{
do {
if (pending & 1) {
unsigned int vec_nr = h - softirq_vec;
int prev_count = preempt_count();
...
trace_softirq_entry(vec_nr);
h->action(h);
trace_softirq_exit(vec_nr);
...
}
h++;
pending >>= 1;
} while (pending);
}
The time_limit and budget at the top of the function are there to make net_rx_action exit voluntarily, so that packet reception does not monopolize the CPU; whatever is left over is handled the next time the NIC raises a hard interrupt. budget can be tuned via a kernel parameter. The remaining core logic fetches the current CPU's softnet_data, walks its poll_list, and calls the poll function each driver registered. For the i.MX6ULL that poll function is fec_enet_rx_napi.
static void net_rx_action(struct softirq_action *h)
{
struct softnet_data *sd = &__get_cpu_var(softnet_data);
unsigned long time_limit = jiffies + 2;
int budget = netdev_budget;
void *have;
local_irq_disable();
while (!list_empty(&sd->poll_list)) {
......
n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list); //take the first node off the list
work = 0;
if (test_bit(NAPI_STATE_SCHED, &n->state)) {
work = n->poll(n, weight); //call the driver's registered poll
trace_napi_poll(n);
}
budget -= work;
}
}
fec_enet_rx_napi
As shown above, the softirq handler net_rx_action calls the napi poll function registered by the NIC driver; for the i.MX6ULL that is fec_enet_rx_napi. The receive call chain in the driver is:
fec_enet_rx_napi
->fec_enet_rx
->fec_enet_rx_queue //the key function: it copies packets out of the ring buffer into sk_buffs and hands them to the protocol stack
static int fec_enet_rx_napi(struct napi_struct *napi, int budget)
{
struct net_device *ndev = napi->dev;
struct fec_enet_private *fep = netdev_priv(ndev);
int pkts; //number of received packets
pkts = fec_enet_rx(ndev, budget); //budget bounds the work done per poll so receive processing does not hog the CPU
fec_enet_tx(ndev);
if (pkts < budget) {
napi_complete_done(napi, pkts);
writel(FEC_DEFAULT_IMASK, fep->hwp + FEC_IMASK);
}
return pkts;
}
static int
fec_enet_rx(struct net_device *ndev, int budget)
{
int pkt_received = 0;
u16 queue_id; //queue id
struct fec_enet_private *fep = netdev_priv(ndev);
/*
#define for_each_set_bit(bit, addr, size) \
for ((bit) = find_first_bit((addr), (size)); (bit) < (size); (bit) = find_next_bit((addr), (size), (bit) + 1))

the loop below yields the queue_id of each rx queue with pending work
*/
for_each_set_bit(queue_id, &fep->work_rx, FEC_ENET_MAX_RX_QS) {
int ret;
ret = fec_enet_rx_queue(ndev,
budget - pkt_received, queue_id);
if (ret < budget - pkt_received)
clear_bit(queue_id, &fep->work_rx);
pkt_received += ret; //pkt_received: packets received so far; budget: the maximum allowed this poll
}
return pkt_received;
}
static int
fec_enet_rx_queue(struct net_device *ndev, int budget, u16 queue_id)
{
struct fec_enet_private *fep = netdev_priv(ndev);
struct fec_enet_priv_rx_q *rxq;
struct bufdesc *bdp;
unsigned short status;
struct sk_buff *skb_new = NULL;
struct sk_buff *skb;
ushort pkt_len;
__u8 *data;
int pkt_received = 0;
struct bufdesc_ex *ebdp = NULL;
bool vlan_packet_rcvd = false;
u16 vlan_tag;
int index = 0;
bool is_copybreak;
bool need_swap = fep->quirks & FEC_QUIRK_SWAP_FRAME;
#ifdef CONFIG_M532x
flush_cache_all();
#endif
queue_id = FEC_ENET_GET_QUQUE(queue_id);
rxq = fep->rx_queue[queue_id]; //get the queue
/* First, grab all of the stats for the incoming packet.
* These get messed up if we get called due to a busy condition.
*/
bdp = rxq->bd.cur; //the descriptor the driver will consume next
while (!((status = fec16_to_cpu(bdp->cbd_sc)) & BD_ENET_RX_EMPTY)) { //keep receiving while the descriptor holds data (EMPTY bit clear)
if (pkt_received >= budget) //budget reached, stop receiving
break;
pkt_received++;
writel(FEC_ENET_RXF, fep->hwp + FEC_IEVENT); //clear the frame-received event
/* Check for errors. */
status ^= BD_ENET_RX_LAST; //BD_ENET_RX_LAST: whether this buffer is the last one of the frame (a frame may span several buffers)
if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH | BD_ENET_RX_NO | //error checks: frame too long/short, non-octet-aligned frame, CRC error,
BD_ENET_RX_CR | BD_ENET_RX_OV | BD_ENET_RX_LAST | //overrun (the received frame overflowed the FIFO)
BD_ENET_RX_CL)) {
ndev->stats.rx_errors++;
if (status & BD_ENET_RX_OV) { //FIFO overrun
/* FIFO overrun */
ndev->stats.rx_fifo_errors++;
goto rx_processing_done;
}
if (status & (BD_ENET_RX_LG | BD_ENET_RX_SH
| BD_ENET_RX_LAST)) {
/* Frame too long or too short. */
ndev->stats.rx_length_errors++;
if (status & BD_ENET_RX_LAST)
netdev_err(ndev, "rcv is not +last\n");
}
if (status & BD_ENET_RX_CR) /* CRC Error */
ndev->stats.rx_crc_errors++;
/* Report late collisions as a frame error. */
if (status & (BD_ENET_RX_NO | BD_ENET_RX_CL))
ndev->stats.rx_frame_errors++;
goto rx_processing_done;
}
/* Process the incoming frame. */
ndev->stats.rx_packets++; //bump the netdev rx packet counter
pkt_len = fec16_to_cpu(bdp->cbd_datlen); //frame length in bytes
ndev->stats.rx_bytes += pkt_len;
index = fec_enet_get_bd_index(bdp, &rxq->bd); //index of this buffer descriptor within the ring
skb = rxq->rx_skbuff[index]; //the skb bound to this descriptor
is_copybreak = fec_enet_copybreak(ndev, &skb, bdp, pkt_len - 4,
need_swap); //copy the data from the buffer into a fresh sk_buff, excluding the FCS
if (!is_copybreak) {
skb_new = netdev_alloc_skb(ndev, FEC_ENET_RX_FRSIZE);
if (unlikely(!skb_new)) {
ndev->stats.rx_dropped++;
goto rx_processing_done;
}
dma_unmap_single(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
FEC_ENET_RX_FRSIZE - fep->rx_align,
DMA_FROM_DEVICE);
}
prefetch(skb->data - NET_IP_ALIGN);
skb_put(skb, pkt_len - 4); //grow the skb data area; data..tail is the payload region, of length pkt_len - 4
data = skb->data; //the -4 strips the FCS length
if (!is_copybreak && need_swap)
swap_buffer(data, pkt_len);
#if !defined(CONFIG_M5272)
if (fep->quirks & FEC_QUIRK_HAS_RACC)
data = skb_pull_inline(skb, 2);
#endif
/* Extract the enhanced buffer descriptor */
ebdp = NULL;
if (fep->bufdesc_ex)
ebdp = (struct bufdesc_ex *)bdp;
/* If this is a VLAN packet remove the VLAN Tag */
vlan_packet_rcvd = false;
if ((ndev->features & NETIF_F_HW_VLAN_CTAG_RX) &&
fep->bufdesc_ex &&
(ebdp->cbd_esc & cpu_to_fec32(BD_ENET_RX_VLAN))) {
/* Push and remove the vlan tag */
struct vlan_hdr *vlan_header = //locate the packet's vlan header
(struct vlan_hdr *) (data + ETH_HLEN);
vlan_tag = ntohs(vlan_header->h_vlan_TCI);
vlan_packet_rcvd = true; //vlan tag extracted
memmove(skb->data + VLAN_HLEN, data, ETH_ALEN * 2); //remove the vlan header from the Ethernet frame
skb_pull(skb, VLAN_HLEN);
}
skb->protocol = eth_type_trans(skb, ndev); //determine the protocol
/* Get receive timestamp from the skb */
if (fep->hwts_rx_en && fep->bufdesc_ex)
fec_enet_hwtstamp(fep, fec32_to_cpu(ebdp->ts),
skb_hwtstamps(skb));
if (fep->bufdesc_ex &&
(fep->csum_flags & FLAG_RX_CSUM_ENABLED)) {
if (!(ebdp->cbd_esc & cpu_to_fec32(FLAG_RX_CSUM_ERROR))) {
/* don't check it */
skb->ip_summed = CHECKSUM_UNNECESSARY;
} else {
skb_checksum_none_assert(skb);
}
}
/* Handle received VLAN packets */
if (vlan_packet_rcvd)
__vlan_hwaccel_put_tag(skb,
htons(ETH_P_8021Q),
vlan_tag);
/* napi_gro_receive calls dev_gro_receive, which implements the GRO feature:
roughly, related small packets are merged into one larger packet, reducing the
number of packets handed up to the network stack and therefore the CPU usage.
*/
napi_gro_receive(&fep->napi, skb); //from here on the packet is handed to the protocol stack
if (is_copybreak) {
dma_sync_single_for_device(&fep->pdev->dev,
fec32_to_cpu(bdp->cbd_bufaddr),
FEC_ENET_RX_FRSIZE - fep->rx_align,
DMA_FROM_DEVICE);
} else {
rxq->rx_skbuff[index] = skb_new;
fec_enet_new_rxbdp(ndev, bdp, skb_new);
}
rx_processing_done:
/* Clear the status flags for this buffer */
status &= ~BD_ENET_RX_STATS;
/* Mark the buffer empty */
status |= BD_ENET_RX_EMPTY;
if (fep->bufdesc_ex) {
struct bufdesc_ex *ebdp = (struct bufdesc_ex *)bdp;
ebdp->cbd_esc = cpu_to_fec32(BD_ENET_RX_INT);
ebdp->cbd_prot = 0;
ebdp->cbd_bdu = 0;
}
/* Make sure the updates to rest of the descriptor are
* performed before transferring ownership.
*/
wmb();
bdp->cbd_sc = cpu_to_fec16(status);
/* Update BD pointer to next entry */
bdp = fec_enet_get_nextdesc(bdp, &rxq->bd);
/* Doing this here will keep the FEC running while we process
* incoming frames. On a heavily loaded network, we should be
* able to keep up at the expense of system resources.
*/
writel(0, rxq->bd.reg_desc_active);
}
rxq->bd.cur = bdp;
return pkt_received;
}
Summary
fec_probe mainly does the following:
- allocate the netdev
- DMA-allocate the txq/rxq descriptor rings
- hardware initialization
- set net_device_ops, ethtool_ops, the transmit timeout, and so on
- NAPI initialization
- fill in the netdev and fec_enet_private
- register the interrupts
- MDIO initialization
- register the netdev