<Linux Network 2.6.38> net_rx_action

sunzixun

浏览: 76023 次
性别:
来自: 苏州

最近访客更多访客>>

844700118

wd1282988143

sparktyy

xiangfirst

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

.net Linux Socket UP

这个函数在2.6.35之后改动还是很大的

先回顾几个可以通过 sysctl（/proc/sys/net/core/ 下的文件）调整的变量，下面的代码里会用到它们

netdev_max_backlog

------------------

Maximum number of packets, queued on the INPUT side, when the interface

receives packets faster than kernel can process them.

The input queue is managed by softnet_data->input_pkt_queue. Each input queue has a maximum length given by the global variable netdev_max_backlog, whose value is 300. This means that each CPU can have up to 300 frames in its input queue waiting to be processed, regardless of the number of devices in the system.[*]（注：300 是原书所针对的较早内核中的默认值；2.6.38 中 netdev_max_backlog 的默认值为 1000，并且可以像下面这样通过 sysctl 调大。）

net.core.netdev_max_backlog = 300000

偷偷的看一眼在什么地方

/*
 * Excerpt of netif_rx() (elided parts are marked with "//...").
 * The part shown hands the skb to enqueue_to_backlog(), passing the value
 * returned by get_cpu() as the cpu argument and the address of qtail.
 */
int netif_rx(struct sk_buff *skb){

//...

enqueue_to_backlog(skb, get_cpu(), &qtail);

//....

}

/*
 * Excerpt of enqueue_to_backlog() (elided parts are marked with "//...").
 *
 * As long as sd->input_pkt_queue holds no more than netdev_max_backlog
 * entries, control reaches the "enqueue" label: directly when the queue is
 * already non-empty, or through the "goto enqueue" after the elided code
 * when it is empty.  Otherwise the skb is not queued and sd->dropped is
 * incremented.
 */
static int enqueue_to_backlog(struct sk_buff *skb, int cpu,
			      unsigned int *qtail)
{
	//....

	if (skb_queue_len(&sd->input_pkt_queue) <= netdev_max_backlog) {
		if (skb_queue_len(&sd->input_pkt_queue)) {
enqueue:
			//...
		}

		goto enqueue;
	}

	/* Queue is over the netdev_max_backlog limit: count the drop. */
	sd->dropped++;

	//...
}

netdev_budget

----------------------

一次轮询周期（一次 NET_RX 软中断）中，所有设备合计可以处理的最大 packets 数；单个设备每次 poll 的上限则由它的 weight 决定

Maximum number of packets taken from all interfaces in one polling cycle (NAPI

poll). In one polling cycle interfaces which are registered to polling are

probed in a round-robin manner. The limit of packets in one such probe can be

set per-device via sysfs class/net/<device>/weight .

weight_p

-----------------

它在 sysctl 里换了个名字（dev_weight，位于 /proc/sys/net/core/ 下），够重要吧；记住文档里说它也是一个 per-CPU 变量

dev_weight - INTEGER

The maximum number of packets that kernel can handle on a NAPI

interrupt, it's a Per-CPU variable.

Default: 64

继续偷看一下这里

/*
 * Excerpt of net_dev_init() (elided parts are marked with "//...").
 * The part shown initialises one softnet_data instance (sd): its RPS
 * call_single_data and its built-in "backlog" napi_struct.
 */
static int __init net_dev_init(void)

{

//...

#ifdef CONFIG_RPS

/* Callback and argument stored in sd->csd; the csd is later passed to
 * __smp_call_function_single() in net_rps_action_and_irq_enable(). */
sd->csd.func = rps_trigger_softirq;

sd->csd.info = sd;

sd->csd.flags = 0;

/* NOTE(review): i is presumably the CPU index of an enclosing per-CPU loop
 * that is elided here - confirm against the full source. */
sd->cpu = i;

#endif

/* The backlog NAPI instance is polled by process_backlog() and starts with
 * weight_p as its weight and an empty GRO list. */
sd->backlog.poll = process_backlog;

sd->backlog.weight = weight_p;

sd->backlog.gro_list = NULL;

sd->backlog.gro_count = 0;

//...

}

好了相信你还记得吧

/* Excerpt of net_dev_init(): only the softirq registration is shown. */
static int __init net_dev_init(void){

/* Register net_rx_action as the action for the NET_RX_SOFTIRQ softirq. */

open_softirq(NET_RX_SOFTIRQ, net_rx_action);

}

原文这个不变:

Frames can wait in two places for net_rx_action to process them:

A shared CPU-specific queue

Non-NAPI devices' interrupt handlers, which call netif_rx, place frames into the softnet_data->input_pkt_queue of the CPU on which the interrupt handlers run.

也就是由 netif_rx 处理后放入 softnet_data->input_pkt_queue 的帧

Device memory

The poll method used by NAPI drivers extracts frames directly from the device

再看一下和 NAPI 调度有关的标志

/*
 * Bit numbers used with the napi_struct state word; net_rx_action() tests
 * NAPI_STATE_SCHED with test_bit() on n->state before polling.
 */
enum {
	NAPI_STATE_SCHED,	/* Poll is scheduled */
	NAPI_STATE_DISABLE,	/* Disable pending */
	NAPI_STATE_NPSVC,	/* Netpoll - don't dequeue from poll_list */
};

struct napi_struct synchronization rules

========================================

napi->poll:

Synchronization: NAPI_STATE_SCHED bit in napi->state. Device

driver's dev->close method will invoke napi_disable() on

all NAPI instances which will do a sleeping poll on the

NAPI_STATE_SCHED napi->state bit, waiting for all pending

NAPI activity to cease.

Context: softirq

will be called with interrupts disabled by netconsole.

明白了吗，设备驱动如果调用dev->close(stop napi)方法就会引起

NAPI_STATE_DISABLE 被 (napi_disable)设置, 这是外部做的

一般是 struct net_device_ops 里面的

* int (*ndo_stop)(struct net_device *dev);

* This function is called when network device transitions to the down

* state.

下面就是主要的 net_rx_action

/*
 * net_rx_action - action registered for NET_RX_SOFTIRQ.
 *
 * Walks this CPU's softnet_data poll_list and calls each NAPI instance's
 * ->poll() with that instance's weight.  The loop stops when the list is
 * empty, when the budget (initialised from netdev_budget) is used up, or
 * when jiffies has moved past the limit taken at entry (jiffies + 2).  In
 * the latter two cases time_squeeze is incremented and the softirq is
 * raised again.
 *
 * @h: softirq action descriptor; not referenced in this function.
 */
static void net_rx_action(struct softirq_action *h)
{
	struct softnet_data *sd = &__get_cpu_var(softnet_data);
	unsigned long time_limit = jiffies + 2;
	int budget = netdev_budget;
	void *have;

	local_irq_disable();

	while (!list_empty(&sd->poll_list)) {
		struct napi_struct *n;
		int work, weight;

		/* If softirq window is exhausted then punt.
		 * Allow this to run for 2 jiffies since which will allow
		 * an average latency of 1.5/HZ.
		 */
		/* Budget used up or time limit passed: jump to softnet_break,
		 * which increments time_squeeze and re-raises the softirq.
		 */
		if (unlikely(budget <= 0 || time_after(jiffies, time_limit)))
			goto softnet_break;

		/* Interrupts are enabled again from here on.  Author's note:
		 * an interrupt only appends to the tail of poll_list while
		 * this loop only takes entries from the head - another way of
		 * avoiding a lock.
		 */
		local_irq_enable();

		/* Take the first napi_struct queued on this softnet_data. */
		n = list_first_entry(&sd->poll_list, struct napi_struct, poll_list);
		/* Lock this napi_struct against netpoll; "have" is handed back
		 * to netpoll_poll_unlock() below.  NOTE(review): the original
		 * note says this also records the polling CPU - confirm
		 * against netpoll_poll_lock().
		 */
		have = netpoll_poll_lock(n);

		weight = n->weight;

		/* Poll only if NAPI_STATE_SCHED is still set; the check guards
		 * against racing with the instance being linked to or unlinked
		 * from the list.
		 */
		work = 0;
		if (test_bit(NAPI_STATE_SCHED, &n->state)) {
			work = n->poll(n, weight);
			trace_napi_poll(n);
		}
		WARN_ON_ONCE(work > weight);
		/* Charge the work reported by ->poll() against the budget. */
		budget -= work;

		local_irq_disable();

		/* Drivers must not modify the NAPI state if they
		 * consume the entire weight.  In such cases this code
		 * still "owns" the NAPI instance and therefore can
		 * move the instance around on the list at-will.
		 */
		/* work == weight: the poll used up its whole weight. */
		if (unlikely(work == weight)) {
			/* A disable is pending for this instance (presumably
			 * NAPI_STATE_DISABLE is set - see napi_disable_pending()).
			 */
			if (unlikely(napi_disable_pending(n))) {
				local_irq_enable();
				/* No more polling allowed: complete the instance,
				 * with interrupts enabled around the call.
				 */
				napi_complete(n);
				local_irq_disable();
			} else
				/* More work may remain: rotate this instance to
				 * the tail of poll_list.
				 */
				list_move_tail(&n->poll_list, &sd->poll_list);
		}

		netpoll_poll_unlock(have);
	}
out:
	/* Re-enables interrupts; with CONFIG_RPS it first sends any pending
	 * RPS IPIs (see net_rps_action_and_irq_enable()).
	 */
	net_rps_action_and_irq_enable(sd);

#ifdef CONFIG_NET_DMA
	/*
	 * There may not be any more sk_buffs coming right now, so push
	 * any pending DMA copies to hardware
	 */
	/* Related tunable, quoted for reference:
	 * tcp_dma_copybreak - INTEGER
	 * Lower limit, in bytes, of the size of socket reads that will be
	 * offloaded to a DMA copy engine, if one is present in the system
	 * and CONFIG_NET_DMA is enabled.
	 * Default: 4096
	 */
	dma_issue_pending_all();
#endif

	return;

softnet_break:
	/* Leave the unprocessed work for the next run: count the squeeze and
	 * raise NET_RX_SOFTIRQ again (interrupts are still disabled here).
	 */
	sd->time_squeeze++;
	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
	goto out;
}

/*
 * net_rps_action_and_irq_enable - enable interrupts and, with CONFIG_RPS,
 * send the IPIs queued on sd->rps_ipi_list.
 *
 * Both visible callers (net_rx_action() and process_backlog()) call this
 * right after local_irq_disable(); every path through the function ends
 * with interrupts enabled.
 *
 * @sd: the softnet_data whose rps_ipi_list is drained.
 */
static void net_rps_action_and_irq_enable(struct softnet_data *sd)

{
#ifdef CONFIG_RPS
	struct softnet_data *remsd = sd->rps_ipi_list;
	/* rps_ipi_list is a chain of softnet_data structures (linked through
	 * rps_ipi_next); each entry gets an IPI sent to its remsd->cpu below.
	 */
	if (remsd) {
		sd->rps_ipi_list = NULL;

		local_irq_enable();

		/* Send pending IPI's to kick RPS processing on remote cpus. */
		while (remsd) {
			struct softnet_data *next = remsd->rps_ipi_next;
	/* remsd->cpu is the value stored in net_dev_init() (sd->cpu = i). */
			if (cpu_online(remsd->cpu))
			/* csd is the member of struct softnet_data declared as
	 * struct call_single_data	csd ____cacheline_aligned_in_smp;
	 * (author's note: the attribute aligns the member on SMP builds).
	 */
	/* Use this call when a function has to run on a specific CPU.
	 * Going further would mean covering all of smp.c; it is enough to
	 * know that the chosen CPU ends up running the csd callback, which
	 * net_dev_init() set to rps_trigger_softirq.
	 */
	__smp_call_function_single(remsd->cpu,
							   &remsd->csd, 0);
			remsd = next;
		}
	} else
#endif
		local_irq_enable();
}

而

/*
 * rps_trigger_softirq - callback installed in sd->csd.func by net_dev_init().
 *
 * @data: the softnet_data pointer stored in sd->csd.info.
 *
 * Schedules that softnet_data's backlog NAPI instance and bumps its
 * received_rps counter.
 */
static void rps_trigger_softirq(void *data)

{

struct softnet_data *sd = data;

____napi_schedule(sd, &sd->backlog);

sd->received_rps++;

}

而

/*
 * ____napi_schedule - queue a NAPI instance for polling.
 *
 * @sd:   softnet_data whose poll_list receives the instance.
 * @napi: NAPI instance to queue.
 *
 * Appends napi to the tail of sd->poll_list and raises NET_RX_SOFTIRQ, so
 * that net_rx_action() will find the instance on the list.
 * NOTE(review): __raise_softirq_irqoff() suggests the caller must have
 * interrupts disabled - confirm.
 */
static inline void ____napi_schedule(struct softnet_data *sd,

struct napi_struct *napi)

{

list_add_tail(&napi->poll_list, &sd->poll_list);

__raise_softirq_irqoff(NET_RX_SOFTIRQ);

}

是不是有点搞笑。。。

好了回过头去看看那个napi的 poll吧

同样你还记得 net_dev_init 里面的

sd->backlog.poll = process_backlog;吧

这是内核自带的通用 poll 函数 process_backlog（挂在每个 CPU 的 backlog napi 上）；支持 NAPI 的驱动程序则会注册自己实现的 poll 函数

/*
 * process_backlog - poll callback of the per-softnet_data backlog NAPI
 * instance (installed as sd->backlog.poll in net_dev_init()).
 *
 * @napi:  the backlog napi_struct embedded in a softnet_data.
 * @quota: upper bound on the number of skbs to process in this call.
 *
 * Drains sd->process_queue, refilling it from sd->input_pkt_queue, and
 * passes each skb to __netif_receive_skb().  Returns the number of skbs
 * processed.  Interrupts are enabled on every return path.
 */
static int process_backlog(struct napi_struct *napi, int quota)
{
	int work = 0;
	/* Recover the enclosing softnet_data from its embedded backlog napi_struct. */
	struct softnet_data *sd = container_of(napi, struct softnet_data, backlog);

#ifdef CONFIG_RPS
	/* If RPS IPIs are already pending on rps_ipi_list, send them first. */
	if (sd->rps_ipi_list) {
		local_irq_disable();
		net_rps_action_and_irq_enable(sd);
	}
#endif
	/* The weight is reloaded from weight_p on every poll. */
	napi->weight = weight_p;
	local_irq_disable();
	/* Keep going while the quota has not been reached. */
	while (work < quota) {
		struct sk_buff *skb;
		unsigned int qlen;
	/* Dequeue skbs one at a time and hand them up for L2->L3 processing;
	 * interrupts are enabled only around __netif_receive_skb().
	 */
		while ((skb = __skb_dequeue(&sd->process_queue))) {
			local_irq_enable();

			__netif_receive_skb(skb);
			local_irq_disable();
	/* Advance the input-queue head counter kept in sd (softnet_data, not
	 * the napi_struct).  NOTE(review): the original note says it feeds
	 * CPU load accounting - presumably RPS bookkeeping; confirm against
	 * input_queue_head_incr().
	 */
			input_queue_head_incr(sd);
	/* Stop as soon as the quota is reached. */
			if (++work >= quota) {
				local_irq_enable();
				return work;
			}
		}
	/* Take the lock protecting this softnet_data's input_pkt_queue. */
		rps_lock(sd);
	/* process_queue is now empty: move everything waiting on
	 * input_pkt_queue onto the tail of process_queue (leaving
	 * input_pkt_queue empty) so the next pass of the loop handles it.
	 */
		qlen = skb_queue_len(&sd->input_pkt_queue);
		if (qlen)
			skb_queue_splice_tail_init(&sd->input_pkt_queue,
						   &sd->process_queue);
	/* Fewer packets were waiting than the quota still allows, so this
	 * call will finish the backlog.
	 */
		if (qlen < quota - work) {

	/* Only the current CPU operates on this napi instance, so unlinking
	 * it from poll_list and clearing its state (NAPI_STATE_SCHED
	 * included) is safe here and needs no smp_mb().
	 */
			list_del(&napi->poll_list);
			napi->state = 0;

	/* Shrink the quota so the outer loop ends once the packets just
	 * spliced over have been processed.
	 */
			quota = work + qlen;
		}
		rps_unlock(sd);
	}
	local_irq_enable();
	/* Return the amount of work done. */
	return work;
}

分享到：