<linux net>tcp optimize

sunzixun

浏览: 76842 次
性别:
来自: 苏州

最近访客更多访客>>

844700118

wd1282988143

sparktyy

xiangfirst

博主相关

博客

微博

相册

留言

关于我

文章分类

社区版块

存档分类

博客分类：

linux kernel

tcp_sack :
tcp_sack - BOOLEAN Enable selective acknowledgments (SACKs). 默认值：1
减小重复包。

/*This is what the send packet queuing engine uses to pass
* TCP per-packet control information to the transmission code.*/

/* Per-packet TCP control block: sequence range, timestamp, TCP header
 * flags and SACK/FACK state for one segment (see the field comments).
 */
struct tcp_skb_cb {
	union {
		struct inet_skb_parm	h4; /* presumably the IPv4 per-skb parameters; the IP options live in its opt field -- confirm */
	} header;	/* For incoming frames		*/
	__u32		seq;		/* Starting sequence number	*/
	__u32		end_seq;	/* SEQ + FIN + SYN + datalen	*/
	__u32		when;		/* used to compute rtt's	*/
	__u8		flags;		/* TCP header flags.		*/
	__u8		sacked;		/* State flags for SACK/FACK.	*/
	__u32		ack_seq;	/* Sequence number ACK'd	*/
};

To access IP options through IPCB, we need the opt field of struct inet_skb_parm.
The opt field is a struct ip_options embedded in struct inet_skb_parm.

/* NOTE(review): this looks like an abridged excerpt -- only the TCP_LISTEN
 * branch is shown and no value is returned from this int function.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb){
/* A listening socket hands the segment to tcp_v4_hnd_req(). */
if (sk->sk_state == TCP_LISTEN) {
struct sock *nsk = tcp_v4_hnd_req(sk, skb); }

}

Tcp_v4_hnd_req() looks for any connection request in the SYN queue of the listening socket

Create a new socket for this connection and return the pointer to the new socket
/* NOTE(review): abridged excerpt -- prev, th and iph are not declared here
 * and nothing is returned when no request is found.
 */
static struct sock *tcp_v4_hnd_req(struct sock *sk, struct sk_buff *skb){
/* Search sk for a request_sock matching the segment's source port and
 * its source/destination addresses.
 */
struct request_sock *req = inet_csk_search_req(sk, &prev, th->source,
iph->saddr, iph->daddr);
/* A matching request is passed on to tcp_check_req(). */
if (req)
return tcp_check_req(sk, skb, req, prev);

}

/* Socket lookup tables: ehash for sockets with full identity, bhash for
 * local bindings, and listening_hash for sockets in TCP_LISTEN state.
 */
struct inet_hashinfo {
	/* This is for sockets with full identity only.  Sockets here will
	 * always be without wildcards and will have the following invariant:
	 *
	 *          TCP_ESTABLISHED <= sk->sk_state < TCP_CLOSE
	 *
	 * TIME_WAIT sockets use a separate chain (twchain).
	 */
	struct inet_ehash_bucket	*ehash;
	spinlock_t			*ehash_locks;
	unsigned int			ehash_mask;
	unsigned int			ehash_locks_mask;
	/* Ok, let's try this, I give up, we do need a local binding
	 * TCP hash as well as the others for fast bind/connect.
	 */
	struct inet_bind_hashbucket	*bhash;
	unsigned int			bhash_size;
	/* 4 bytes hole on 64 bit */

	struct kmem_cache		*bind_bucket_cachep;

	/* All the above members are written once at bootup and
	 * never written again _or_ are predominantly read-access.
	 *
	 * Now align to a new cache line as all the following members
	 * might be often dirty.
	 */
	/* All sockets in TCP_LISTEN state will be in here.  This is the only
	 * table where wildcard'd TCP sockets can exist.  Hash function here
	 * is just local port number.
	 */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE]
					____cacheline_aligned_in_smp;

	atomic_t			bsockets;
};

/* One entry of the local-binding hash, keyed by port.
 * NOTE(review): the field meanings below are inferred from their names;
 * confirm against the bind-hash code.
 */
struct inet_bind_bucket {
#ifdef CONFIG_NET_NS
struct net  *ib_net;	/* only present when CONFIG_NET_NS is defined */
#endif
unsigned short  port;	/* presumably the local port number -- confirm */
signed short  fastreuse;	/* presumably a port-reuse hint -- confirm */
int   num_owners;	/* presumably the number of entries on owners -- confirm */
struct hlist_node node;	/* list linkage for this bucket */
struct hlist_head owners;	/* head of a list; presumably the sockets bound to port -- confirm */
};
------------------------------------
sock.h
   Arnaldo C. Melo : removed net_pinfo, tp_pinfo and made
*                 protinfo be just a void pointer, as the
*                 protocol specific parts were moved to
*                 respective headers and ipv4/v6, etc now
*                 use private slabcaches for its socks

/*
* The three way handshake has completed - we got a valid synack -
* now create the new socket.
*/
struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb,
struct request_sock *req,
struct dst_entry *dst)
本地端口复用：

Socket 绑定在不同的设备上共享一个 TCP端口

sk->sk_reuse 端口复用标记被设置, 而且没有处在 TCP_LISTEN状态的，端口可以共享
如ftp 的数据端口

Socket绑定到一个多个特别的本地地址上inet_sk(sk)->rcv_saddr，他们可以共享

tcp_sack - BOOLEAN Enable selective acknowledgments (SACKs). 默认值：1
减小重复包。

拥塞避免算法和慢启动算法，需要对每个连接维持两个变量；

一个拥塞窗口cwnd 和一个慢启动门限

tcp_abc

tcp_abc - INTEGER Controls Appropriate Byte Count (ABC) defined in RFC3465.
ABC is a way of increasing congestion window (cwnd) more slowly
in response to partial acknowledgments.
Possible values are:
0 increase cwnd once per acknowledgment (no ABC)
1 increase cwnd once per acknowledgment of full sized segment
2 allow increase cwnd by two if acknowledgment is of two segments to compensate for delayed acknowledgments. Default: 0 (off)

/*
* Slow start is used when congestion window is less than slow start
* threshold. This version implements the basic RFC2581 version
* and optionally supports:
* RFC3742 Limited Slow Start - growth limited to max_ssthresh
* RFC3465 Appropriate Byte Counting - growth limited by bytes acknowledged
*/

/*
 * Grow the congestion window during slow start.
 *
 * Credit is accumulated in tp->snd_cwnd_cnt and then converted into
 * whole increments of tp->snd_cwnd; snd_cwnd is never raised once it
 * has reached tp->snd_cwnd_clamp.
 */
void tcp_slow_start(struct tcp_sock *tp)
{
	int credit;	/* amount added to snd_cwnd_cnt by this call */

	/* RFC3465 (ABC) slow start: with ABC enabled, do nothing until a
	 * full MSS worth of bytes has been acknowledged.
	 */
	if (sysctl_tcp_abc && tp->bytes_acked < tp->mss_cache)
		return;

	/* Normal case: exponential increase, credit equals the window. */
	credit = tp->snd_cwnd;

	/* Limited slow start: once cwnd exceeds the configured
	 * max_ssthresh, the credit is half of max_ssthresh instead.
	 */
	if (sysctl_tcp_max_ssthresh > 0 && tp->snd_cwnd > sysctl_tcp_max_ssthresh)
		credit = sysctl_tcp_max_ssthresh >> 1;

	/* RFC3465 (ABC): we MAY increase by 2 when a delayed ACK is
	 * detected, i.e. at least two MSS were acknowledged.
	 */
	if (sysctl_tcp_abc > 1 && tp->bytes_acked >= 2 * tp->mss_cache)
		credit <<= 1;

	tp->bytes_acked = 0;

	/* Add the credit on top of what was carried over. */
	tp->snd_cwnd_cnt += credit;

	/* Spend the credit one window at a time; cwnd itself stops
	 * growing once it reaches snd_cwnd_clamp.
	 */
	while (tp->snd_cwnd_cnt >= tp->snd_cwnd) {
		tp->snd_cwnd_cnt -= tp->snd_cwnd;
		if (tp->snd_cwnd >= tp->snd_cwnd_clamp)
			continue;
		tp->snd_cwnd++;
	}
}

snd_ssthresh Slow start threshold. We are in slow start if snd_cwnd is less than this.
snd_cwnd_cnt A counter used to slow down the rate of increase once we exceed slow start threshold.
snd_cwnd_clamp This is the maximum size that snd_cwnd can grow to.

RFC3742 :

Slow-Start cwnd <= max_ssthresh（RFC 2581）

cwnd 增长根据one MSS per ACK

Limited Slow-Start Max_ssthresh < cwnd <= ssthresh

Max_ssthresh/2 MSS per RTT

tcp_max_ssthresh
SACK
新的 SACK 选项的第一个块需要包含最近接收到的段；同时重复先前发送过的 SACK 块，以增强对 ACK 丢失的健壮性。

tcp_fack - BOOLEAN Enable FACK congestion avoidance and fast retransmission.
The value is not used, if tcp_sack is not enabled.

接收方：
“收到失序报文段”，接收方TCP需要立刻产生重复ACK
: 不应该被延迟
：让对方知道收到一个失序的报文段
：告诉对方自己想收到的序号

“收到重复的ACK”
如果是一些报文段重排序，等待少量的ACK带来。 1~2个正常

3个以上，说明可能是报文段丢失了。启动快速重传算法，无需等待重传定时器超时（随后进入快速恢复算法）。
快速恢复算法（如果不是ACK超时就不进入慢启动），因为进入慢启动会突然减少数据流
Fast Recovery
通过计算重复ACK 数目，来检查有多少数据还在网络中
他自动的增加cwnd 每次收到重复的ACK后，为了让数据能充分传输
它允许在减半的窗口大小下来快速重传数据。

当收到3个以上重复的ack 将ssthresh 设置为当前 cwnd 一半
并且 cwnd 为ssthresh加上3*MSS
每次收到一个重复ack ，cwnd 增加一个MSS并且发送一个分组
当下一个确认新的数据 ack 到了 cwnd= ssthresh 这样可以达到拥塞避免(因为拥塞看起来已恢复了)

Congestion avoidance: As long as non-duplicate ACKs are received, the congestion window is additively increased by one MSS every round trip time. When a packet is lost, the likelihood of duplicate ACKs being received is very high (it's possible though unlikely that the stream just underwent extreme packet reordering, which would also prompt duplicate ACKs). The behavior of Tahoe and Reno differ in how they detect and react to packet loss:
Tahoe: Triple duplicate ACKS are treated the same as a timeout. Tahoe will perform "fast retransmit", reduce congestion window to 1 MSS, and reset to slow-start state.[8]
把三个重复acks当成超时，却表现快速重传，减少 cw 到一个MSS 并且重置 s s状态

Reno: If three duplicate ACKs are received (i.e., four ACKs acknowledging the same packet, which are not piggybacked on data, and do not change the receiver's advertised window), Reno will halve the congestion window, perform a fast retransmit, and enter a phase called Fast Recovery. If an ACK times out, slow start is used as it is with Tahoe.[8]

将拥塞窗口减半，执行快速重传，并且进入快速恢复。如果ack超时同样重置进入 s s 状态

/* Decide whether recovery should be started for this socket.
 *
 * Returns 0 while tp->frto_counter is non-zero, 1 as soon as any of the
 * checks below fires, and 0 if none of them does.
 */
static int tcp_time_to_recover(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;

/* Do not perform any recovery during F-RTO algorithm */
if (tp->frto_counter)
return 0;

/* Trick#1: The loss is proven. */
if (tp->lost_out)
return 1;

/* Not-A-Trick#2 : Classic rule... */
if (tcp_dupack_heuristics(tp) > tp->reordering)
return 1;

/* Trick#3 : when we use RFC2988 timer restart, fast
* retransmit can be triggered by timeout of queue head.
*/
if (tcp_is_fack(tp) && tcp_head_timedout(sk))
return 1;

/* Trick#4: It is still not OK... But will it be useful to delay
* recovery more?
*/
packets_out = tp->packets_out;
/* Fires when packets_out is within the reordering limit, sacked_out is at
* least max(packets_out/2, sysctl_tcp_reordering), and
* tcp_may_send_now() reports false.
*/
if (packets_out <= tp->reordering &&
     tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
     !tcp_may_send_now(sk)) {
  /* We have nothing to send. This connection is limited
   * either by receiver window or by application.
   */
  return 1;
}

/* If a thin stream is detected, retransmit after first
* received dupack. Employ only if SACK is supported in order
* to avoid possible corner-case series of spurious retransmissions
* Use only if there are no unsent data.
*/
if ((tp->thin_dupack || sysctl_tcp_thin_dupack) &&
     tcp_stream_is_thin(tp) && tcp_dupack_heuristics(tp) > 1 &&
     tcp_is_sack(tp) && !tcp_send_head(sk))
  return 1;

/* None of the conditions above held. */
return 0;
}
u32 lost_out; /* Lost packets   */
u32 fackets_out; /* FACK'd packets   */
u32 sacked_out; /* SACK'd packets   */
left_out is number of segments left network, but not ACKed yet.
FACK :

简单的启发式算法

只要我们发现一些报文丢失了，就认为所有没有被 sack 确认范围的包都丢失了
也就是 lost_out = fackets_out - sacked_out left_out = fackets_out
*  FACK: It is the simplest heuristics. As soon as we decided
*  that something is lost, we decide that _all_ not SACKed
*  packets until the most forward SACK are lost. I.e.
*  lost_out = fackets_out - sacked_out and left_out = fackets_out.
*  It is absolutely correct estimate, if network does not reorder
*  packets. And it loses any connection to reality when reordering
*  takes place. We use FACK by default until reordering
*  is suspected on the path to this destination.

Fack 算法基于流控制原理，并且设计和 TCP SACK选项一起使用

在简单的段丢失下， Reno体现了流量控制的理念。但是在成倍丢失的情况下， reno由于对网络数据丢包数缺少一个足够统计数据从而达不到它的效果。
Reno fails to meet the ideal principles because it lacks a sufficiently accurate estimate of the data outstanding in the network

需要的网络状态信息可以获得通过精确的确认receiver 保持的最被转发数据

最被转发数据（forward-most data），指的是正确接收的序号最大的数据。FACK算法的目标是在拥塞恢复时期，通过对网络上的数据量保持精确的估计，来实现精确的拥塞控制
The goal of fack algorithm is to perform precise congestion control during recovery by keeping an accurate estimate of the amount of data
Outstanding in the network
为了实现这个 Fack 试图保存 TCP的时钟并且减少总体上的突发性事件

Fack 使用 SACK选项提供额外的信息用来保存一个详细的关于在网络中飘的总数据量

Reno 和reno sack 都企图统计这个值，通过假设每一重复的ACK 接受表现一个在网络中飘的段

而Fack 算法可以做到这个直接通过引入两个新的状态变量

Snd.fack 和 retran_data . 同样发送方必须维护关于接收方拥有的报文段的信息，目的是为了用sack信息去正确的重传数据包

snd.fack 是 fack 拥塞控制算法的核心变量。它被不断更新，从而反映接收方拥有的最被转发的数据

在非恢复状态下 snd.fack 变量被更新通过 TCP头部中的确认号 = as.snd.una

在恢复模式中，发送方仍然根据确认号持续更新 snd.una

拥塞避免

拥塞避免算法和慢启动算法是两个目的不同，独立的算法。但是发生拥塞时候，期望通过降低分组进入网络的速率
可以通过结合慢启动做到。

拥塞发生时（RTO; 3次重复ACK）： ssthresh 被设置为当前窗口大小的一半 min(cwnd, advice) 最小为2个MSS, 此外如果是超时引起的话， cwnd 设置为1个MSS

Cwnd< ssthresh 进行慢启动
Cwnd> ssthresh 进入拥塞避免

tcp_retries1

sysctl_tcp_retries1

重传的次数，之后需要检查是否中间路由出问题了

如果超过就调用 negative_advice (dst_negative_advice)

Ipv4: ipv4_negative_advice();

Sk_dst = null

tcp_retries2

sysctl_tcp_retries2

重传最多的次数，之后就需要放弃这个链接了。默认15次，长达900多秒

通过指数退避原则

Sysctl_tcp_syn_retries

重传SYN段的最多次数，之后就放弃此链接

Sysctl_tcp_orphan_retries

Kill an orphaned socket in two cases even when

1 :Sysctl_tcp_max_orphans

2: tcp_memory_allocated> sysctl_tcp_mem

SYN-ACK timer

When we get a connection request and there is no pending connection request in the listening socket's

SYN queue to be processed .

Tcp_synq_added() ：new connection moves from SYN queue to accept queue 3 h-h

tcp_synack_retries

Number of times SYNACKs for a passive TCP connection attempt will

be retransmitted. Should not be higher than 255. Default value

is 5, which corresponds to ~180seconds.

tcp_vegas_cong_avoid

Enable TCP Vegas congestion avoidance algorithm.

发送方通过检查带宽来预知拥塞从而调整cwnd改变发送速率。

TCP Vegas should provide less packet loss, but it is not as aggressive as TCP Reno.

TCP_CORK

	case TCP_NODELAY:
		if (val) {
			/* TCP_NODELAY is weaker than TCP_CORK, so that
			 * this option on corked socket is remembered, but
			 * it is not activated until cork is cleared.
			 *
			 * However, when TCP_NODELAY is set we make
			 * an explicit push, which overrides even TCP_CORK
			 * for currently queued segments.
			 */
			tp->nonagle |= TCP_NAGLE_OFF|TCP_NAGLE_PUSH;

wmem_max rmem_max

The maximum sizes for socket buffers declared via the SO_SNDBUF and SO_RCVBUF mechanisms are limited by the values in the

/proc/sys/net/core/rmem_max and /proc/sys/net/core/wmem_max files. Note that TCP actually allocates twice the size of the buffer

requested in the setsockopt(2) call, and so a succeeding getsockopt(2) call will not return the same size of buffer as requested in

the setsockopt(2) call

代码如下

case SO_RCVBUF:
		/* Don't error on this BSD doesn't and if you think
		   about it this is right. Otherwise apps have to
		   play 'guess the biggest size' games. RCVBUF/SNDBUF
		   are treated in BSD as hints */

		if (val > sysctl_rmem_max)
			val = sysctl_rmem_max;
set_rcvbuf:
		sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
		/*
		 * We double it on the way in to account for
		 * "struct sk_buff" etc. overhead.   Applications
		 * assume that the SO_RCVBUF setting they make will
		 * allow that much actual data to be received on that
		 * socket.
		 *
		 * Applications are unaware that "struct sk_buff" and
		 * other overheads allocate from the receive buffer
		 * during socket buffer allocation.
		 *
		 * And after considering the possible alternatives,
		 * returning the value we actually used in getsockopt
		 * is the most desirable behavior.
		 */
		if ((val * 2) < SOCK_MIN_RCVBUF)
			sk->sk_rcvbuf = SOCK_MIN_RCVBUF;
		else
			sk->sk_rcvbuf = val * 2;

tcp_low_latency

默认是保持较高的吞吐量，如果设置为1 则较低的延迟，包会被立刻处理

int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg,
		size_t len, int nonblock, int flags, int *addr_len)
{
/* 这里本来应该通过直接设置用户空间缓存接收的，但是如果设置了low_latency便失去了prequeue 的处理能力 */
		if (!sysctl_tcp_low_latency && tp->ucopy.task == user_recv) {
			/* Install new reader */
			if (!user_recv && !(flags & (MSG_TRUNC | MSG_PEEK))) {
				user_recv = current;
				tp->ucopy.task = user_recv;
				tp->ucopy.iov = msg->msg_iov;

官方说建议在Beowulf compute cluster 中设置。我感觉是不是在MPI 的并行计算环境中，需要设置这个。。