Linux 网络协议栈

最后更新于 9 天前 23260 字 预计阅读时间: 2 小时


课设有个选题是巩固Linux网络协议栈,刚好最近在看Linux源码,就想试试能不能写出来,下面以Linux 4.9中IPv4的TCP协议为例开始分析。

初始化

为了排版好看点,把初始化放在前面,在后续用到的时候再来看

net_families

关于net_families数组,该数组的元素是在调用sock_register函数时注册(填入)的

sock_register函数
/*
 * sock_register - install a protocol family handler in net_families[].
 * @ops: family description; ops->family selects the slot to fill.
 *
 * Returns 0 on success, -ENOBUFS when ops->family does not fit in the
 * table (>= NPROTO), or -EEXIST when the slot is already occupied.
 */
int sock_register(const struct net_proto_family *ops)
{
	int err;

	/* Reject family numbers that would index past net_families[]. */
	if (ops->family >= NPROTO) {
		pr_crit("protocol %d >= NPROTO(%d)\n", ops->family, NPROTO);
		return -ENOBUFS;
	}

	/* The slot is tested and filled while holding net_family_lock. */
	spin_lock(&net_family_lock);
	if (rcu_dereference_protected(net_families[ops->family],
				      lockdep_is_held(&net_family_lock)))
		err = -EEXIST;
	else {
		rcu_assign_pointer(net_families[ops->family], ops);
		err = 0;
	}
	spin_unlock(&net_family_lock);

	/* Note: this message is printed unconditionally, also when err is -EEXIST. */
	pr_info("NET: Registered protocol family %d\n", ops->family);
	return err;
}
EXPORT_SYMBOL(sock_register);

简化后

int sock_register(const struct net_proto_family *ops)
{
	...................
		rcu_assign_pointer(net_families[ops->family], ops);
        ...................
	
}

实现了net_families[ops->family]=ops,而在/net/ipv4/af_inet.c中注册了inet_family_ops

static int __init inet_init(void)
{
	.........................
	(void)sock_register(&inet_family_ops);
        .........................
}

inet_family_ops结构体如下

#define PF_INET		AF_INET
static const struct net_proto_family inet_family_ops = {
	.family = PF_INET,
	.create = inet_create,
	.owner	= THIS_MODULE,
};

inetsw

inet_init函数(而不是sock_register函数)中遍历inetsw_array数组,并对其中每个元素执行inet_register_protosw函数。

static int __init inet_init(void)
{
	...................
	for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
		inet_register_protosw(q);
	...................
}

inetsw_array数组如下,里面的成员都是inet_protosw类型的。


static struct inet_protosw inetsw_array[] =
{
	{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		.prot =       &tcp_prot,
		.ops =        &inet_stream_ops,
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
	},

	{
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_UDP,
		.prot =       &udp_prot,
		.ops =        &inet_dgram_ops,
		.flags =      INET_PROTOSW_PERMANENT,
       },

       {
		.type =       SOCK_DGRAM,
		.protocol =   IPPROTO_ICMP,
		.prot =       &ping_prot,
		.ops =        &inet_sockraw_ops,
		.flags =      INET_PROTOSW_REUSE,
       },

       {
	       .type =       SOCK_RAW,
	       .protocol =   IPPROTO_IP,	/* wild card */
	       .prot =       &raw_prot,
	       .ops =        &inet_sockraw_ops,
	       .flags =      INET_PROTOSW_REUSE,
       }
};

inet_register_protosw函数

static struct list_head inetsw[SOCK_MAX];
/*
 * inet_register_protosw - link @p into the inetsw[] list of its socket type.
 * @p: protocol switch entry to register; p->type selects the list and
 *     p->protocol is the protocol number checked for conflicts.
 *
 * Simplified excerpt: the locking and the p->type range check done by the
 * upstream function (net/ipv4/af_inet.c) are left out here.
 *
 * The walk covers the leading permanent entries of the list. If one of
 * them already uses the same protocol number the registration is refused;
 * otherwise @p is inserted right after the last permanent entry.
 */
void inet_register_protosw(struct inet_protosw *p)
{
	struct list_head *lh;
	struct inet_protosw *answer;
	int protocol = p->protocol;
	struct list_head *last_perm;

	last_perm = &inetsw[p->type];
	list_for_each(lh, &inetsw[p->type]) {
		answer = list_entry(lh, struct inet_protosw, list);
		/* Only permanent entries are protected from being overridden. */
		if ((INET_PROTOSW_PERMANENT & answer->flags) == 0)
			break;
		if (protocol == answer->protocol)
			goto out_permanent;
		last_perm = lh;
	}

	/* Reached only when no permanent entry conflicts with @p. */
	list_add_rcu(&p->list, last_perm);
	return;

out_permanent:
	pr_err("Attempt to override permanent protocol %d\n", protocol);
}

inetsw数组的成员类型为list_head,即数组里装的是各个type对应链表的头节点。inet_protosw结构体内嵌了一个list_head成员(list),各节点通过它串成链表;遍历当前type的链表时,用list_entry由list成员的地址反推出它所在的inet_protosw结构体,并赋值给answer。

struct inet_protosw {
	struct list_head list;
..........................   
};
answer = list_entry(lh, struct inet_protosw, list);

判断协议号(protocol)是否与链表中已注册的协议重复

if (protocol == answer->protocol)
	goto out_permanent;

将该inet_protosw结构体插入list中

list_add_rcu(&p->list, last_perm);

sock

sk_alloc创建了一个sock结构体以供操作系统使用

sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);

/*
 * sk_alloc - allocate a sock and fill in its basic identity (simplified).
 * @net:      network namespace (not used in this excerpt)
 * @family:   protocol family stored in sk_family
 * @priority: allocation flags; __GFP_ZERO is always added
 * @prot:     struct proto stored in both sk_prot and sk_prot_creator
 * @kern:     non-zero for a kernel socket, which gets sk_net_refcnt = 0
 *
 * Returns the new sock, or the NULL result of sk_prot_alloc() on failure.
 */
struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
		      struct proto *prot, int kern)
{
	struct sock *newsk = sk_prot_alloc(prot, priority | __GFP_ZERO, family);

	/* Allocation failed: hand the NULL straight back to the caller. */
	if (!newsk)
		return newsk;

	newsk->sk_family = family;
	newsk->sk_prot_creator = prot;
	newsk->sk_prot = prot;
	sock_lock_init(newsk);
	newsk->sk_net_refcnt = kern ? 0 : 1;

	return newsk;
}

在sk_alloc函数中sk_prot_alloc创建了一个sock结构体,在后续我们会经常使用他,通过结构体嵌套在C语言中实现了C++的多态和继承功能,在后面中会讲到

sk_alloc函数以及inet_create的后续代码完成了以下赋值,其中最重要的是前两条

  • sk->sk_family = PF_INET
  • sk->sk_prot = sk->sk_prot_creator = answer_prot=tcp_prot
  • sk->sk_destruct = inet_sock_destruct;
  • sk->sk_protocol = protocol
  • sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

sock是最基本的结构体也就是父类,在其之下有很多子类

inet_sock(注意:下面贴出的定义含有inet_flags、local_port_range等成员,来自比4.9更新的内核版本,4.9中这些标志是以位域成员的形式存在的)
struct inet_sock {
	/* sk and pinet6 has to be the first two members of inet_sock */
	struct sock		sk;
#if IS_ENABLED(CONFIG_IPV6)
	struct ipv6_pinfo	*pinet6;
#endif
	/* Socket demultiplex comparisons on incoming packets. */
#define inet_daddr		sk.__sk_common.skc_daddr
#define inet_rcv_saddr		sk.__sk_common.skc_rcv_saddr
#define inet_dport		sk.__sk_common.skc_dport
#define inet_num		sk.__sk_common.skc_num

	unsigned long		inet_flags;
	__be32			inet_saddr;
	__s16			uc_ttl;
	__be16			inet_sport;
	struct ip_options_rcu __rcu	*inet_opt;
	atomic_t		inet_id;

	__u8			tos;
	__u8			min_ttl;
	__u8			mc_ttl;
	__u8			pmtudisc;
	__u8			rcv_tos;
	__u8			convert_csum;
	int			uc_index;
	int			mc_index;
	__be32			mc_addr;
	u32			local_port_range;	/* high << 16 | low */

	struct ip_mc_socklist __rcu	*mc_list;
	struct inet_cork_full	cork;
};

#define IPCORK_OPT		1	/* ip-options has been held in ipcork.opt */
#define IPCORK_TS_OPT_ID	2	/* ts_opt_id field is valid, overriding sk_tskey */

enum {
	INET_FLAGS_PKTINFO	= 0,
	INET_FLAGS_TTL		= 1,
	INET_FLAGS_TOS		= 2,
	INET_FLAGS_RECVOPTS	= 3,
	INET_FLAGS_RETOPTS	= 4,
	INET_FLAGS_PASSSEC	= 5,
	INET_FLAGS_ORIGDSTADDR	= 6,
	INET_FLAGS_CHECKSUM	= 7,
	INET_FLAGS_RECVFRAGSIZE	= 8,

	INET_FLAGS_RECVERR	= 9,
	INET_FLAGS_RECVERR_RFC4884 = 10,
	INET_FLAGS_FREEBIND	= 11,
	INET_FLAGS_HDRINCL	= 12,
	INET_FLAGS_MC_LOOP	= 13,
	INET_FLAGS_MC_ALL	= 14,
	INET_FLAGS_TRANSPARENT	= 15,
	INET_FLAGS_IS_ICSK	= 16,
	INET_FLAGS_NODEFRAG	= 17,
	INET_FLAGS_BIND_ADDRESS_NO_PORT = 18,
	INET_FLAGS_DEFER_CONNECT = 19,
	INET_FLAGS_MC6_LOOP	= 20,
	INET_FLAGS_RECVERR6_RFC4884 = 21,
	INET_FLAGS_MC6_ALL	= 22,
	INET_FLAGS_AUTOFLOWLABEL_SET = 23,
	INET_FLAGS_AUTOFLOWLABEL = 24,
	INET_FLAGS_DONTFRAG	= 25,
	INET_FLAGS_RECVERR6	= 26,
	INET_FLAGS_REPFLOW	= 27,
	INET_FLAGS_RTALERT_ISOLATE = 28,
	INET_FLAGS_SNDFLOW	= 29,
	INET_FLAGS_RTALERT	= 30,
};
inet_connection_sock
struct inet_connection_sock {
	/* inet_sock has to be the first member! */
	struct inet_sock	  icsk_inet;
	struct request_sock_queue icsk_accept_queue;
	struct inet_bind_bucket	  *icsk_bind_hash;
	unsigned long		  icsk_timeout;
 	struct timer_list	  icsk_retransmit_timer;
 	struct timer_list	  icsk_delack_timer;
	__u32			  icsk_rto;
	__u32			  icsk_pmtu_cookie;
	const struct tcp_congestion_ops *icsk_ca_ops;
	const struct inet_connection_sock_af_ops *icsk_af_ops;
	unsigned int		  (*icsk_sync_mss)(struct sock *sk, u32 pmtu);
	__u8			  icsk_ca_state:6,
				  icsk_ca_setsockopt:1,
				  icsk_ca_dst_locked:1;
	__u8			  icsk_retransmits;
	__u8			  icsk_pending;
	__u8			  icsk_backoff;
	__u8			  icsk_syn_retries;
	__u8			  icsk_probes_out;
	__u16			  icsk_ext_hdr_len;
	struct {
		__u8		  pending;	 /* ACK is pending			   */
		__u8		  quick;	 /* Scheduled number of quick acks	   */
		__u8		  pingpong;	 /* The session is interactive		   */
		__u8		  blocked;	 /* Delayed ACK was blocked by socket lock */
		__u32		  ato;		 /* Predicted tick of soft clock	   */
		unsigned long	  timeout;	 /* Currently scheduled timeout		   */
		__u32		  lrcvtime;	 /* timestamp of last received data packet */
		__u16		  last_seg_size; /* Size of last incoming segment	   */
		__u16		  rcv_mss;	 /* MSS used for delayed ACK decisions	   */ 
	} icsk_ack;
	struct {
		int		  enabled;

		/* Range of MTUs to search */
		int		  search_high;
		int		  search_low;

		/* Information on the current probe. */
		int		  probe_size;

		u32		  probe_timestamp;
	} icsk_mtup;
	u32			  icsk_user_timeout;

	u64			  icsk_ca_priv[88 / sizeof(u64)];
#define ICSK_CA_PRIV_SIZE      (11 * sizeof(u64))
};
tcp_sock
struct tcp_sock {
	/* inet_connection_sock has to be the first member of tcp_sock */
	struct inet_connection_sock	inet_conn;
	u16	tcp_header_len;	/* Bytes of tcp header to send		*/
	u16	gso_segs;	/* Max number of segs per GSO packet	*/

/*
 *	Header prediction flags
 *	0x5?10 << 16 + snd_wnd in net byte order
 */
	__be32	pred_flags;

/*
 *	RFC793 variables by their proper names. This means you can
 *	read the code and the spec side by side (and laugh ...)
 *	See RFC793 and RFC1122. The RFC writes these in capitals.
 */
	u64	bytes_received;	/* RFC4898 tcpEStatsAppHCThruOctetsReceived
				 * sum(delta(rcv_nxt)), or how many bytes
				 * were acked.
				 */
	u32	segs_in;	/* RFC4898 tcpEStatsPerfSegsIn
				 * total number of segments in.
				 */
	u32	data_segs_in;	/* RFC4898 tcpEStatsPerfDataSegsIn
				 * total number of data segments in.
				 */
 	u32	rcv_nxt;	/* What we want to receive next 	*/
	u32	copied_seq;	/* Head of yet unread data		*/
	u32	rcv_wup;	/* rcv_nxt on last window update sent	*/
 	u32	snd_nxt;	/* Next sequence we send		*/
	u32	segs_out;	/* RFC4898 tcpEStatsPerfSegsOut
				 * The total number of segments sent.
				 */
	u32	data_segs_out;	/* RFC4898 tcpEStatsPerfDataSegsOut
				 * total number of data segments sent.
				 */
	u64	bytes_acked;	/* RFC4898 tcpEStatsAppHCThruOctetsAcked
				 * sum(delta(snd_una)), or how many bytes
				 * were acked.
				 */
	struct u64_stats_sync syncp; /* protects 64bit vars (cf tcp_get_info()) */

 	u32	snd_una;	/* First byte we want an ack for	*/
 	u32	snd_sml;	/* Last byte of the most recently transmitted small packet */
	u32	rcv_tstamp;	/* timestamp of last received ACK (for keepalives) */
	u32	lsndtime;	/* timestamp of last sent data packet (for restart window) */
	u32	last_oow_ack_time;  /* timestamp of last out-of-window ACK */

	u32	tsoffset;	/* timestamp offset */

	struct list_head tsq_node; /* anchor in tsq_tasklet.head list */
	unsigned long	tsq_flags;

	/* Data for direct copy to user */
	struct {
		struct sk_buff_head	prequeue;
		struct task_struct	*task;
		struct msghdr		*msg;
		int			memory;
		int			len;
	} ucopy;

	u32	snd_wl1;	/* Sequence for window update		*/
	u32	snd_wnd;	/* The window we expect to receive	*/
	u32	max_window;	/* Maximal window ever seen from peer	*/
	u32	mss_cache;	/* Cached effective mss, not including SACKS */

	u32	window_clamp;	/* Maximal window to advertise		*/
	u32	rcv_ssthresh;	/* Current window clamp			*/

	/* Information of the most recently (s)acked skb */
	struct tcp_rack {
		struct skb_mstamp mstamp; /* (Re)sent time of the skb */
		u8 advanced; /* mstamp advanced since last lost marking */
		u8 reord;    /* reordering detected */
	} rack;
	u16	advmss;		/* Advertised MSS			*/
	u8	tlp_retrans:1,	/* TLP is a retransmission */
		unused_1:7;
	u8	rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
		is_sack_reneg:1,    /* in recovery from loss with SACK reneg? */
		unused:6;
	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
		thin_lto    : 1,/* Use linear timeouts for thin streams */
		thin_dupack : 1,/* Fast retransmit on first dupack      */
		repair      : 1,
		frto        : 1;/* F-RTO (RFC5682) activated in CA_Loss */
	u8	repair_queue;
	u8	do_early_retrans:1,/* Enable RFC5827 early-retransmit  */
		syn_data:1,	/* SYN includes data */
		syn_fastopen:1,	/* SYN includes Fast Open option */
		syn_fastopen_exp:1,/* SYN includes Fast Open exp. option */
		syn_data_acked:1,/* data in SYN is acked by SYN-ACK */
		save_syn:1,	/* Save headers of SYN packet */
		is_cwnd_limited:1;/* forward progress limited by snd_cwnd? */
	u32	tlp_high_seq;	/* snd_nxt at the time of TLP */

/* RTT measurement */
	u32	srtt_us;	/* smoothed round trip time << 3 in usecs */
	u32	mdev_us;	/* medium deviation			*/
	u32	mdev_max_us;	/* maximal mdev for the last rtt period	*/
	u32	rttvar_us;	/* smoothed mdev_max			*/
	u32	rtt_seq;	/* sequence number to update rttvar	*/
	struct  minmax rtt_min;

	u32	packets_out;	/* Packets which are "in flight"	*/
	u32	retrans_out;	/* Retransmitted packets out		*/
	u32	max_packets_out;  /* max packets_out in last window */
	u32	cwnd_usage_seq;  /* right edge of cwnd usage tracking flight */

	u16	urg_data;	/* Saved octet of OOB data and control flags */
	u8	ecn_flags;	/* ECN status bits.			*/
	u8	keepalive_probes; /* num of allowed keep alive probes	*/
	u32	reordering;	/* Packet reordering metric.		*/
	u32	snd_up;		/* Urgent pointer		*/

/*
 *      Options received (usually on last packet, some only on SYN packets).
 */
	struct tcp_options_received rx_opt;

/*
 *	Slow start and congestion control (see also Nagle, and Karn & Partridge)
 */
 	u32	snd_ssthresh;	/* Slow start size threshold		*/
 	u32	snd_cwnd;	/* Sending congestion window		*/
	u32	snd_cwnd_cnt;	/* Linear increase counter		*/
	u32	snd_cwnd_clamp; /* Do not allow snd_cwnd to grow above this */
	u32	snd_cwnd_used;
	u32	snd_cwnd_stamp;
	u32	prior_cwnd;	/* Congestion window at start of Recovery. */
	u32	prr_delivered;	/* Number of newly delivered packets to
				 * receiver in Recovery. */
	u32	prr_out;	/* Total number of pkts sent during Recovery. */
	u32	delivered;	/* Total data packets delivered incl. rexmits */
	u32	lost;		/* Total data packets lost incl. rexmits */
	u32	app_limited;	/* limited until "delivered" reaches this val */
	struct skb_mstamp first_tx_mstamp;  /* start of window send phase */
	struct skb_mstamp delivered_mstamp; /* time we reached "delivered" */
	u32	rate_delivered;    /* saved rate sample: packets delivered */
	u32	rate_interval_us;  /* saved rate sample: time elapsed */

 	u32	rcv_wnd;	/* Current receiver window		*/
	u32	write_seq;	/* Tail(+1) of data held in tcp send buffer */
	u32	notsent_lowat;	/* TCP_NOTSENT_LOWAT */
	u32	pushed_seq;	/* Last pushed seq, required to talk to windows */
	u32	lost_out;	/* Lost packets			*/
	u32	sacked_out;	/* SACK'd packets			*/
	u32	fackets_out;	/* FACK'd packets			*/

	/* from STCP, retrans queue hinting */
	struct sk_buff* lost_skb_hint;
	struct sk_buff *retransmit_skb_hint;

	/* OOO segments go in this rbtree. Socket lock must be held. */
	struct rb_root	out_of_order_queue;
	struct sk_buff	*ooo_last_skb; /* cache rb_last(out_of_order_queue) */

	/* SACKs data, these 2 need to be together (see tcp_options_write) */
	struct tcp_sack_block duplicate_sack[1]; /* D-SACK block */
	struct tcp_sack_block selective_acks[4]; /* The SACKS themselves*/

	struct tcp_sack_block recv_sack_cache[4];

	struct sk_buff *highest_sack;   /* skb just after the highest
					 * skb with SACKed bit set
					 * (validity guaranteed only if
					 * sacked_out > 0)
					 */

	int     lost_cnt_hint;
	u32     retransmit_high;	/* L-bits may be on up to this seqno */

	u32	prior_ssthresh; /* ssthresh saved at recovery start	*/
	u32	high_seq;	/* snd_nxt at onset of congestion	*/

	u32	retrans_stamp;	/* Timestamp of the last retransmit,
				 * also used in SYN-SENT to remember stamp of
				 * the first SYN. */
	u32	undo_marker;	/* snd_una upon a new recovery episode. */
	int	undo_retrans;	/* number of undoable retransmissions. */
	u32	total_retrans;	/* Total retransmits for entire connection */

	u32	urg_seq;	/* Seq of received urgent pointer */
	unsigned int		keepalive_time;	  /* time before keep alive takes place */
	unsigned int		keepalive_intvl;  /* time interval between keep alive probes */

	int			linger2;

/* Receiver side RTT estimation */
	struct {
		u32	rtt;
		u32	seq;
		u32	time;
	} rcv_rtt_est;

/* Receiver queue space */
	struct {
		u32	space;
		u32	seq;
		u32	time;
	} rcvq_space;

/* TCP-specific MTU probe information. */
	struct {
		u32		  probe_seq_start;
		u32		  probe_seq_end;
	} mtu_probe;
	u32	mtu_info; /* We received an ICMP_FRAG_NEEDED / ICMPV6_PKT_TOOBIG
			   * while socket was owned by user.
			   */

#ifdef CONFIG_TCP_MD5SIG
/* TCP AF-Specific parts; only used by MD5 Signature support so far */
	const struct tcp_sock_af_ops	*af_specific;

/* TCP MD5 Signature Option information */
	struct tcp_md5sig_info	__rcu *md5sig_info;
#endif

/* TCP fastopen related information */
	struct tcp_fastopen_request *fastopen_req;
	/* fastopen_rsk points to request_sock that resulted in this big
	 * socket. Used to retransmit SYNACKs etc.
	 */
	struct request_sock *fastopen_rsk;
	u32	*saved_syn;
};

我们可以看到tcp_sock嵌套了inet_connection_sock结构体,而inet_connection_sock又嵌套了inet_sock结构体,inet_sock嵌套了sock结构体。通过这种把父结构体放在第一个成员位置的嵌套方式,在C语言中实现了类似C++的继承特性。

在sk_alloc中调用了sk_prot_alloc为sock结构体分配空间,其中sk_prot_alloc函数调用了kmem_cache_alloc,用prot->slab(这里即tcp_prot的slab缓存)进行sock结构体的创建

/*
 * sk_prot_alloc - allocate the memory for a sock from @prot's slab cache
 * (simplified excerpt).
 * @prot:     protocol whose ->slab cache provides the object
 * @priority: allocation flags; __GFP_ZERO is masked off before the call
 * @family:   not used in this excerpt
 *
 * Returns the allocated object, or NULL when kmem_cache_alloc() fails.
 *
 * NOTE(review): the upstream function clears the object itself when
 * __GFP_ZERO was requested; that part is elided here.
 */
static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,int family)
{
	struct sock *sk;
	struct kmem_cache *slab;
	slab = prot->slab;
	sk = kmem_cache_alloc(slab, priority & ~__GFP_ZERO);
	/* The result must be returned: sk_alloc() uses it as the new sock. */
	return sk;
}

prot->slab的初始化在proto_register函数中的

/*
 * proto_register - excerpt showing only how prot->slab is created.
 * @prot:       protocol being registered; its name and obj_size describe
 *              the slab cache
 * @alloc_slab: when non-zero, a dedicated slab cache is created for @prot
 */
int proto_register(struct proto *prot, int alloc_slab)
{
	if (alloc_slab) {
		/* Each object in the cache is prot->obj_size bytes large. */
		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
					SLAB_HWCACHE_ALIGN | prot->slab_flags,
					NULL);
	}
	/* ...... (rest of the function elided in this excerpt) */
}

slab的大小由prot->obj_size确定,如下图prot->obj_size=sizeof(tcp_sock)

由此可见我们执行sk_alloc创建sock结构体时并不是开辟一个sock结构体的空间,而是tcp_sock结构体的空间。

所以之后我们所见的各种指针转换都是基于此基础上的

struct inet_sock *inet = inet_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);

由于我们得到的sock在最顶层和tcp_sock地址相同,我们可以通过指针转换来访问同一个sock的子类

icsk->icsk_af_ops

icsk->icsk_af_ops的初始化在tcp_v4_init_sock函数中完成,而tcp_v4_init_sock函数是在tcp_prot中,在inet_create函数中调用的tcp_v4_init_sock。

/* Excerpt of tcp_prot: only the .init hook is shown, other members elided. */
struct proto tcp_prot = {
	.init			= tcp_v4_init_sock,
};

inet_create:

if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}

tcp_v4_init_sock代码如下,icsk->icsk_af_ops被赋值了ipv4_specific

static int tcp_v4_init_sock(struct sock *sk)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	tcp_init_sock(sk);
	icsk->icsk_af_ops = &ipv4_specific;
	return 0;
}

ipv4_specific

const struct inet_connection_sock_af_ops ipv4_specific = {
	.queue_xmit	   = ip_queue_xmit,
	.send_check	   = tcp_v4_send_check,
	.rebuild_header	   = inet_sk_rebuild_header,
	.sk_rx_dst_set	   = inet_sk_rx_dst_set,
	.conn_request	   = tcp_v4_conn_request,
	.syn_recv_sock	   = tcp_v4_syn_recv_sock,
	.net_header_len	   = sizeof(struct iphdr),
	.setsockopt	   = ip_setsockopt,
	.getsockopt	   = ip_getsockopt,
	.addr2sockaddr	   = inet_csk_addr2sockaddr,
	.sockaddr_len	   = sizeof(struct sockaddr_in),
	.bind_conflict	   = inet_csk_bind_conflict,
#ifdef CONFIG_COMPAT
	.compat_setsockopt = compat_ip_setsockopt,
	.compat_getsockopt = compat_ip_getsockopt,
#endif
	.mtu_reduced	   = tcp_v4_mtu_reduced,
};

必要知识

该模块介绍某些结构体和通用的操作。。。。。

inet_hashinfo

inet_hashinfo结构体专门用来放置连接信息的,inet_hashinfo结构体如下,只截取了关键成员

/* Excerpt of struct inet_hashinfo: only the three lookup tables are shown. */
struct inet_hashinfo {
	struct inet_ehash_bucket	*ehash;		/* established connections */
	struct inet_bind_hashbucket	*bhash;		/* sockets bound to a port */
	struct inet_listen_hashbucket	listening_hash[INET_LHTABLE_SIZE];	/* listening sockets */
};
  • ehash(established hash):用于存放已建立连接的完整连接信息的字典,{{saddr,daddr,sport,dport}:sock}
  • bhash(bind hash):用于存放绑定某端口号的连接的字典。{port:sock}
  • listening_hash:用于存放处于监听状态下的sock的字典,{port:sock}

bhash

bhash的类型是一个inet_bind_hashbucket结构体,结构体如下

struct inet_bind_hashbucket {
	spinlock_t		lock;
	struct hlist_head	chain;
};

bhash中实际存放数据的结构体叫做inet_bind_bucket,里面记录了绑定的端口号(port)、拥有者链表(owners),以及用于把各个inet_bind_bucket串接起来的node。

struct inet_bind_bucket {
	possible_net_t		ib_net;
	unsigned short		port;
	signed char		fastreuse;
	signed char		fastreuseport;
	kuid_t			fastuid;
	int			num_owners;
	struct hlist_node	node;
	struct hlist_head	owners;
};

在sock结构体里有个sock_common结构体,里面存储着skc_bind_node/skc_portaddr_node成员,inet_bind_bucket结构体中的owners成员就指向它。

struct sock_common
{
......................
union {
		struct hlist_node	skc_bind_node;
		struct hlist_node	skc_portaddr_node;
	};
......................
}

listening_hash

和bindhash相同有两个成员

struct inet_listen_hashbucket {
	spinlock_t		lock;
	union {
		struct hlist_head	head;
		struct hlist_nulls_head	nulls_head;
	};
};

nulls_head指向sock结构体的sock_common中的skc_node/skc_nulls_node

ehash

ehash指向一个inet_ehash_bucket的数组,inet_ehash_bucket 结构体如下,只有一个成员。

struct inet_ehash_bucket {
	struct hlist_nulls_head chain;
};

chain指向sock结构体的sock_common中的skc_node/skc_nulls_node

union {
		struct hlist_node	skc_node;
		struct hlist_nulls_node skc_nulls_node;
	};

sock结构体的状态

在sock结构体里存在一个成员来表示当前sock的状态

#define sk_state		__sk_common.skc_state

sock的各个状态对应以下标志位(TCPF_xxx是位掩码形式,检查时把状态值state左移成1<<state后再与之相与)

/*
 * Bitmask form of the sock states: every TCPF_* constant is a single bit,
 * so a state number s can be tested against a set of states with
 * (1 << s) & (TCPF_A | TCPF_B).
 */
enum {
	TCPF_ESTABLISHED = (1 << 1),
	TCPF_SYN_SENT	 = (1 << 2),
	TCPF_SYN_RECV	 = (1 << 3),
	TCPF_FIN_WAIT1	 = (1 << 4),
	TCPF_FIN_WAIT2	 = (1 << 5),
	TCPF_TIME_WAIT	 = (1 << 6),
	TCPF_CLOSE	 = (1 << 7),
	TCPF_CLOSE_WAIT	 = (1 << 8),
	TCPF_LAST_ACK	 = (1 << 9),
	TCPF_LISTEN	 = (1 << 10),
	TCPF_CLOSING	 = (1 << 11),
	TCPF_NEW_SYN_RECV = (1 << 12),
};

socket有以下几个状态

typedef enum {
	SS_FREE = 0,			/* not allocated		*/
	SS_UNCONNECTED,			/* unconnected to any socket	*/
	SS_CONNECTING,			/* in process of connecting	*/
	SS_CONNECTED,			/* connected to socket		*/
	SS_DISCONNECTING		/* in process of disconnecting	*/
} socket_state;

当要检查sock结构体状态时通常采用以下代码

if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		goto out;

用(1<<state)& (TCPF_LISTEN|TCPF_CLOSE )就可以判断当前状态是否是这两个状态之一。若是其中之一,则相与的结果不为0,反之为0。

#TCPF_LISTEN
(10000000000&10010000000)=10000000000
#TCPF_CLOSE 
(00010000000&10010000000)=00010000000

sk_buff

sk_buff是Linux网络协议栈中用于管理和控制接收或发送数据包的关键数据结构。结构体如下

sk_buff
struct sk_buff {
	union {
		struct {
			/* These two members must be first. */
			struct sk_buff		*next;
			struct sk_buff		*prev;

			union {
				ktime_t		tstamp;
				struct skb_mstamp skb_mstamp;
			};
		};
		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
	};

	union {
		struct sock		*sk;
		int			ip_defrag_offset;
	};

	struct net_device	*dev;

	/*
	 * This is the control buffer. It is free to use for every
	 * layer. Please put your private variables there. If you
	 * want to keep them across layers you have to do a skb_clone()
	 * first. This is owned by whoever has the skb queued ATM.
	 */
	char			cb[48] __aligned(8);

	unsigned long		_skb_refdst;
	void			(*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
	struct	sec_path	*sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
	struct nf_conntrack	*nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
	struct nf_bridge_info	*nf_bridge;
#endif
	unsigned int		len,
				data_len;
	__u16			mac_len,
				hdr_len;

	/* Following fields are _not_ copied in __copy_skb_header()
	 * Note that queue_mapping is here mostly to fill a hole.
	 */
	kmemcheck_bitfield_begin(flags1);
	__u16			queue_mapping;

/* if you move cloned around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define CLONED_MASK	(1 << 7)
#else
#define CLONED_MASK	1
#endif
#define CLONED_OFFSET()		offsetof(struct sk_buff, __cloned_offset)

	__u8			__cloned_offset[0];
	__u8			cloned:1,
				nohdr:1,
				fclone:2,
				peeked:1,
				head_frag:1,
				xmit_more:1,
				pfmemalloc:1;
	kmemcheck_bitfield_end(flags1);

	/* fields enclosed in headers_start/headers_end are copied
	 * using a single memcpy() in __copy_skb_header()
	 */
	/* private: */
	__u32			headers_start[0];
	/* public: */

/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX	(7 << 5)
#else
#define PKT_TYPE_MAX	7
#endif
#define PKT_TYPE_OFFSET()	offsetof(struct sk_buff, __pkt_type_offset)

	__u8			__pkt_type_offset[0];
	__u8			pkt_type:3;
	__u8			ignore_df:1;
	__u8			nfctinfo:3;
	__u8			nf_trace:1;

	__u8			ip_summed:2;
	__u8			ooo_okay:1;
	__u8			l4_hash:1;
	__u8			sw_hash:1;
	__u8			wifi_acked_valid:1;
	__u8			wifi_acked:1;
	__u8			no_fcs:1;

	/* Indicates the inner headers are valid in the skbuff. */
	__u8			encapsulation:1;
	__u8			encap_hdr_csum:1;
	__u8			csum_valid:1;
	__u8			csum_complete_sw:1;
	__u8			csum_level:2;
	__u8			csum_bad:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
	__u8			ndisc_nodetype:2;
#endif
	__u8			ipvs_property:1;

	__u8			inner_protocol_type:1;
	__u8			remcsum_offload:1;
#ifdef CONFIG_NET_SWITCHDEV
	__u8			offload_fwd_mark:1;
#endif
	/* 2, 4 or 5 bit hole */

#ifdef CONFIG_NET_SCHED
	__u16			tc_index;	/* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
	__u16			tc_verd;	/* traffic control verdict */
#endif
#endif

	union {
		__wsum		csum;
		struct {
			__u16	csum_start;
			__u16	csum_offset;
		};
	};
	__u32			priority;
	int			skb_iif;
	__u32			hash;
	__be16			vlan_proto;
	__u16			vlan_tci;
#if defined(CONFIG_NET_RX_BUSY_POLL) || defined(CONFIG_XPS)
	union {
		unsigned int	napi_id;
		unsigned int	sender_cpu;
	};
#endif
#ifdef CONFIG_NETWORK_SECMARK
	__u32		secmark;
#endif

	union {
		__u32		mark;
		__u32		reserved_tailroom;
	};

	union {
		__be16		inner_protocol;
		__u8		inner_ipproto;
	};

	__u16			inner_transport_header;
	__u16			inner_network_header;
	__u16			inner_mac_header;

	__be16			protocol;
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;

	/* private: */
	__u32			headers_end[0];
	/* public: */

	/* These elements must be at the end, see alloc_skb() for details.  */
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
	unsigned int		truesize;
	atomic_t		users;
};

在sk_buff中有几个重要成员,为了让作用差不多的成员放在一起故打乱了成员的顺序。

struct sk_buff {
	char			cb[48] __aligned(8);

        unsigned int		len;
        unsigned int            data_len;
 
	__be16			protocol;
	__u16			transport_header;
	__u16			network_header;
	__u16			mac_header;

        unsigned int		truesize;
	sk_buff_data_t		tail;
	sk_buff_data_t		end;
	unsigned char		*head,
				*data;
};
  • cb:控制块,用于存放各层的控制数据
  • len:skb中数据的总长度,包括线性数据区和分片(非线性)部分
  • data_len:仅分片(非线性)部分的数据长度,数据全部位于线性区时为0
  • truesize:表示整个sk_buff所占用的大小,包括sk_buff结构体大小,数据区大小,以及skb_shared_info结构体和内存对齐的大小
  • protocol:当前的协议
  • transport_header,network_header,mac_header:数据区到各层头部的偏移
  • tail:指向存放各层数据的尾部
  • end:指向数据区的尾部
  • head:指向数据区的头部
  • data:指向存放各层数据的头部

在sk_buff刚创建时,结构如下

sk_buff初始化后,data和tail被放到实际存储数据的中间

需要为各层协议头预留空间时会调用skb_reserve函数,使data和tail同时向下移动len字节;tail到end的这片区域被称为tailroom,head到data的这片区域被称为headroom。真正存放应用数据时则调用skb_put使tail继续向下移动,得到一片区域存放应用数据。

/*
 * skb_reserve - shift both the data and tail pointers forward by @len.
 * @skb: buffer to adjust
 * @len: number of bytes to advance
 *
 * Both pointers move by the same amount, so the distance between them
 * is unchanged.
 */
static inline void skb_reserve(struct sk_buff *skb, int len)
{
	skb->tail += len;
	skb->data += len;
}

当各层想要往sk_buff添加协议数据时,会调用skb_push将data指针向上移动得到一片区域存放协议数据,通常向上扩展的长度是各层头部的大小。

在传输层中,tcp会在tcp_transmit_skb函数中调用skb_push向skb添加tcp头部信息,data上移的大小正好是tcp_header_size

/*
 * Excerpt of __tcp_transmit_skb(): only the part that prepends and fills
 * the TCP header is shown; declarations of th, inet, tcb and
 * tcp_header_size are elided.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	/* Make room for the TCP header in front of the current data. */
	skb_push(skb, tcp_header_size);
        skb_reset_transport_header(skb);
	skb->sk = sk;
	/* skb->data now points at the start of the TCP header. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	/*
	 * Seventh 16-bit word of the header (byte offset 12): header length
	 * in 32-bit words in the top four bits, OR-ed with the TCP flags.
	 */
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;
.....................
}

skb_reset_transport_header函数把传输层头部相对于缓冲区起始地址(head)的偏移记录到sk_buff的transport_header成员中。

/*
 * Record where the transport header starts: the offset of the current
 * skb->data from skb->head is stored in skb->transport_header.
 */
static inline void skb_reset_transport_header(struct sk_buff *skb)
{
	skb->transport_header = skb->data - skb->head;
}

request_sock_queue

request_sock_queue即连接请求队列。

Socket

socket与sock

socket结构体是Linux网络协议栈最为重要的结构体。其结构体如下所示

/* General socket object as seen from the system-call side. */
struct socket {
	socket_state		state;	/* one of the SS_* values of socket_state */

	kmemcheck_bitfield_begin(type);
	short			type;	/* socket type (SOCK_STREAM, SOCK_DGRAM, ...) */
	kmemcheck_bitfield_end(type);

	unsigned long		flags;

	struct socket_wq __rcu	*wq;

	struct file		*file;
	struct sock		*sk;	/* the kernel-side sock that belongs to this socket */
	const struct proto_ops	*ops;	/* protocol operations table */
};

socket是面向上层用户一侧的结构体:我们调用socket函数得到的是一个文件描述符,它在内核中对应的结构体就是socket;与socket对应的下层结构体为sock,sock是给内核底层使用的,包括了很多信息。在socket结构体中记录着与之对应的sock结构体。

struct socket {
.......................
	struct sock		*sk;
.......................
};

sock结构体如下

sock结构体
struct sock {
	/*
	 * Now struct inet_timewait_sock also uses sock_common, so please just
	 * don't add nothing before this first member (__sk_common) --acme
	 */
	struct sock_common	__sk_common;
#define sk_node			__sk_common.skc_node
#define sk_nulls_node		__sk_common.skc_nulls_node
#define sk_refcnt		__sk_common.skc_refcnt
#define sk_tx_queue_mapping	__sk_common.skc_tx_queue_mapping

#define sk_dontcopy_begin	__sk_common.skc_dontcopy_begin
#define sk_dontcopy_end		__sk_common.skc_dontcopy_end
#define sk_hash			__sk_common.skc_hash
#define sk_portpair		__sk_common.skc_portpair
#define sk_num			__sk_common.skc_num
#define sk_dport		__sk_common.skc_dport
#define sk_addrpair		__sk_common.skc_addrpair
#define sk_daddr		__sk_common.skc_daddr
#define sk_rcv_saddr		__sk_common.skc_rcv_saddr
#define sk_family		__sk_common.skc_family
#define sk_state		__sk_common.skc_state
#define sk_reuse		__sk_common.skc_reuse
#define sk_reuseport		__sk_common.skc_reuseport
#define sk_ipv6only		__sk_common.skc_ipv6only
#define sk_net_refcnt		__sk_common.skc_net_refcnt
#define sk_bound_dev_if		__sk_common.skc_bound_dev_if
#define sk_bind_node		__sk_common.skc_bind_node
#define sk_prot			__sk_common.skc_prot
#define sk_net			__sk_common.skc_net
#define sk_v6_daddr		__sk_common.skc_v6_daddr
#define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
#define sk_cookie		__sk_common.skc_cookie
#define sk_incoming_cpu		__sk_common.skc_incoming_cpu
#define sk_flags		__sk_common.skc_flags
#define sk_rxhash		__sk_common.skc_rxhash

	socket_lock_t		sk_lock;
	struct sk_buff_head	sk_receive_queue;
	/*
	 * The backlog queue is special, it is always used with
	 * the per-socket spinlock held and requires low latency
	 * access. Therefore we special case it's implementation.
	 * Note : rmem_alloc is in this structure to fill a hole
	 * on 64bit arches, not because its logically part of
	 * backlog.
	 */
	struct {
		atomic_t	rmem_alloc;
		int		len;
		struct sk_buff	*head;
		struct sk_buff	*tail;
	} sk_backlog;
#define sk_rmem_alloc sk_backlog.rmem_alloc
	int			sk_forward_alloc;

	__u32			sk_txhash;
#ifdef CONFIG_NET_RX_BUSY_POLL
	unsigned int		sk_napi_id;
	unsigned int		sk_ll_usec;
#endif
	atomic_t		sk_drops;
	int			sk_rcvbuf;

	struct sk_filter __rcu	*sk_filter;
	union {
		struct socket_wq __rcu	*sk_wq;
		struct socket_wq	*sk_wq_raw;
	};
#ifdef CONFIG_XFRM
	struct xfrm_policy __rcu *sk_policy[2];
#endif
	struct dst_entry __rcu	*sk_rx_dst;
	struct dst_entry __rcu	*sk_dst_cache;
	/* Note: 32bit hole on 64bit arches */
	atomic_t		sk_wmem_alloc;
	atomic_t		sk_omem_alloc;
	int			sk_sndbuf;
	struct sk_buff_head	sk_write_queue;

	/*
	 * Because of non atomicity rules, all
	 * changes are protected by socket lock.
	 */
	kmemcheck_bitfield_begin(flags);
	unsigned int		sk_padding : 2,
				sk_no_check_tx : 1,
				sk_no_check_rx : 1,
				sk_userlocks : 4,
				sk_protocol  : 8,
				sk_type      : 16;
#define SK_PROTOCOL_MAX U8_MAX
	kmemcheck_bitfield_end(flags);

	int			sk_wmem_queued;
	gfp_t			sk_allocation;
	u32			sk_pacing_rate; /* bytes per second */
	u32			sk_max_pacing_rate;
	netdev_features_t	sk_route_caps;
	netdev_features_t	sk_route_nocaps;
	int			sk_gso_type;
	unsigned int		sk_gso_max_size;
	u16			sk_gso_max_segs;
	int			sk_rcvlowat;
	unsigned long	        sk_lingertime;
	struct sk_buff_head	sk_error_queue;
	struct proto		*sk_prot_creator;
	rwlock_t		sk_callback_lock;
	int			sk_err,
				sk_err_soft;
	u32			sk_ack_backlog;
	u32			sk_max_ack_backlog;
	__u32			sk_priority;
	__u32			sk_mark;
	spinlock_t		sk_peer_lock;
	struct pid		*sk_peer_pid;
	const struct cred	*sk_peer_cred;

	long			sk_rcvtimeo;
	long			sk_sndtimeo;
	struct timer_list	sk_timer;
	ktime_t			sk_stamp;
#if BITS_PER_LONG==32
	seqlock_t		sk_stamp_seq;
#endif
	u16			sk_tsflags;
	u8			sk_shutdown;
	u32			sk_tskey;
	struct socket		*sk_socket;
	void			*sk_user_data;
	struct page_frag	sk_frag;
	struct sk_buff		*sk_send_head;
	__s32			sk_peek_off;
	int			sk_write_pending;
#ifdef CONFIG_SECURITY
	void			*sk_security;
#endif
	struct sock_cgroup_data	sk_cgrp_data;
	struct mem_cgroup	*sk_memcg;
	void			(*sk_state_change)(struct sock *sk);
	void			(*sk_data_ready)(struct sock *sk);
	void			(*sk_write_space)(struct sock *sk);
	void			(*sk_error_report)(struct sock *sk);
	int			(*sk_backlog_rcv)(struct sock *sk,
						  struct sk_buff *skb);
	void                    (*sk_destruct)(struct sock *sk);
	struct sock_reuseport __rcu	*sk_reuseport_cb;
	struct rcu_head		sk_rcu;
};

socket结构体的创建

先看一下socket函数的原型

int socket(int domain, int type, int protocol);

domain(协议族/地址族)

domain表示socket如何解释地址以及它可以通信的网络范围。

  • AF_INET-----IPv4网络协议
  • AF_INET6----IPv6网络协议
  • AF_BLUETOOTH-----蓝牙通信

type(Socket类型)

type表示socket的通讯类型,传输方式等

  • SOCK_STREAM-----面向连接的可靠字节流 TCP(ssh,http)
  • SOCK_DGRAM -----无连接的消息传递 UDP(DNS)

protocol(具体协议)

protocol表示选择协议族中的具体协议。protocol是int类型,传0(而不是NULL)表示使用该协议族中与type对应的默认协议

  • IPPROTO_TCP----传输控制协议
  • IPPROTO_UDP----用户数据报协议

在/net/socket.c中可以看见socket的系统调用。

Socket系统调用
/*
 * socket(2) entry point: split the SOCK_* flag bits out of @type, create
 * the socket, then attach it to a new file descriptor.
 *
 * Returns the new descriptor on success or a negative errno.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags;
	int retval;

	/* Check the SOCK_* constants for consistency.  */
	BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
	BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
	BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

	/* Everything outside the type mask is a flag; only two are allowed. */
	flags = type & ~SOCK_TYPE_MASK;
	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;
	type &= SOCK_TYPE_MASK;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	retval = sock_create(family, type, protocol, &sock);
	if (retval < 0)
		return retval;

	retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
	if (retval < 0)
		sock_release(sock);

	/* It may be already another descriptor 8) Not kernel problem. */
	return retval;
}

主要函数如下

  • sock_create
  • sock_map_fd

sock_create

sock_create原型如下

int sock_create(int family, int type, int proto, struct socket **res);

sock_create->__sock_create

__sock_create
/*
 * __sock_create - allocate a struct socket and let the address family
 * initialise it.
 * @net: network namespace, handed through to the family's ->create()
 * @family: address family, used as the index into net_families[]
 * @type: socket type, stored in sock->type
 * @protocol: protocol number, handed through to ->create()
 * @res: receives the new socket on success
 * @kern: handed through to the security hooks and ->create()
 *
 * Returns 0 on success or a negative errno.  On every failure after
 * sock_alloc() the socket is dropped again with sock_release().
 */
int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	int err;
	struct socket *sock;
	const struct net_proto_family *pf;

	/*
	 *      Check protocol is in range
	 */
	if (family < 0 || family >= NPROTO)
		return -EAFNOSUPPORT;
	if (type < 0 || type >= SOCK_MAX)
		return -EINVAL;

	/* Compatibility.

	   This uglymoron is moved from INET layer to here to avoid
	   deadlock in module load.
	 */
	if (family == PF_INET && type == SOCK_PACKET) {
		pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
			     current->comm);
		family = PF_PACKET;
	}

	err = security_socket_create(family, type, protocol, kern);
	if (err)
		return err;

	/*
	 *	Allocate the socket and allow the family to set things up. if
	 *	the protocol is 0, the family is instructed to select an appropriate
	 *	default.
	 */
	sock = sock_alloc();
	if (!sock) {
		net_warn_ratelimited("socket: no more sockets\n");
		return -ENFILE;	/* Not exactly a match, but its the
				   closest posix thing */
	}

	sock->type = type;

#ifdef CONFIG_MODULES
	/* Attempt to load a protocol module if the find failed.
	 *
	 * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
	 * requested real, full-featured networking support upon configuration.
	 * Otherwise module support will break!
	 */
	if (rcu_access_pointer(net_families[family]) == NULL)
		request_module("net-pf-%d", family);
#endif

	/* Look up the family ops under RCU; err is preset for the failure gotos. */
	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = -EAFNOSUPPORT;
	if (!pf)
		goto out_release;

	/*
	 * We will call the ->create function, that possibly is in a loadable
	 * module, so we have to bump that loadable module refcnt first.
	 */
	if (!try_module_get(pf->owner))
		goto out_release;

	/* Now protected by module ref count */
	rcu_read_unlock();

	/* For PF_INET this is inet_create() (see inet_family_ops). */
	err = pf->create(net, sock, protocol, kern);
	if (err < 0)
		goto out_module_put;

	/*
	 * Now to bump the refcnt of the [loadable] module that owns this
	 * socket at sock_release time we decrement its refcnt.
	 */
	if (!try_module_get(sock->ops->owner))
		goto out_module_busy;

	/*
	 * Now that we're done with the ->create function, the [loadable]
	 * module can have its refcnt decremented
	 */
	module_put(pf->owner);
	err = security_socket_post_create(sock, family, type, protocol, kern);
	if (err)
		goto out_sock_release;
	/* Hand the finished socket back to the caller. */
	*res = sock;

	return 0;

out_module_busy:
	err = -EAFNOSUPPORT;
out_module_put:
	/* Clear ->ops before releasing the socket on this path. */
	sock->ops = NULL;
	module_put(pf->owner);
out_sock_release:
	sock_release(sock);
	return err;

out_release:
	/* Failure while still inside the RCU read section. */
	rcu_read_unlock();
	goto out_sock_release;
}

简化后如以下所示

int __sock_create(struct net *net, int family, int type, int protocol,
			 struct socket **res, int kern)
{
	sock =sock_alloc();
	sock->type = type;
	rcu_read_lock();
	pf = rcu_dereference(net_families[family]);
	err = pf->create(net, sock, protocol, kern);
}
  • 调用sock_alloc函数创建一个socket结构体
  • 将socket结构体的type赋值为参数的type
  • 将net_families[family]赋值给pf
  • 调用pf的create函数

net_families在上述初始化中已经讲到了。这样我们的pf->create就执行了inet_family_ops ->create即inet_create,完整代码如下

inet_create

/*
 * inet_create - PF_INET ->create() hook: pick the inet_protosw entry that
 * matches (sock->type, protocol), allocate the struct sock and initialise it.
 * @net: network namespace the sock is allocated in
 * @sock: socket being set up; its ->type selects the inetsw[] list
 * @protocol: requested protocol; IPPROTO_IP acts as a wildcard
 * @kern: passed to sk_alloc(); also skips the CAP_NET_RAW check when set
 *
 * Returns 0 on success or a negative errno.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
	struct sock *sk;
	struct inet_protosw *answer;
	struct inet_sock *inet;
	struct proto *answer_prot;
	unsigned char answer_flags;
	int try_loading_module = 0;
	int err;

	if (protocol < 0 || protocol >= IPPROTO_MAX)
		return -EINVAL;

	sock->state = SS_UNCONNECTED;

	/* Look for the requested type/protocol pair. */
lookup_protocol:
	err = -ESOCKTNOSUPPORT;
	rcu_read_lock();
	list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

		err = 0;
		/* Check the non-wild match. */
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
		} else {
			/* Check for the two wild cases. */
			if (IPPROTO_IP == protocol) {
				protocol = answer->protocol;
				break;
			}
			if (IPPROTO_IP == answer->protocol)
				break;
		}
		err = -EPROTONOSUPPORT;
	}

	/* No match: try loading a module (at most twice) and look again. */
	if (unlikely(err)) {
		if (try_loading_module < 2) {
			rcu_read_unlock();
			/*
			 * Be more specific, e.g. net-pf-2-proto-132-type-1
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
			 */
			if (++try_loading_module == 1)
				request_module("net-pf-%d-proto-%d-type-%d",
					       PF_INET, protocol, sock->type);
			/*
			 * Fall back to generic, e.g. net-pf-2-proto-132
			 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
			 */
			else
				request_module("net-pf-%d-proto-%d",
					       PF_INET, protocol);
			goto lookup_protocol;
		} else
			goto out_rcu_unlock;
	}

	err = -EPERM;
	if (sock->type == SOCK_RAW && !kern &&
	    !ns_capable(net->user_ns, CAP_NET_RAW))
		goto out_rcu_unlock;

	/* Copy what is needed from the matched entry before leaving RCU. */
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	rcu_read_unlock();

	WARN_ON(!answer_prot->slab);

	err = -ENOBUFS;
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
	if (!sk)
		goto out;

	err = 0;
	if (INET_PROTOSW_REUSE & answer_flags)
		sk->sk_reuse = SK_CAN_REUSE;

	inet = inet_sk(sk);
	inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

	inet->nodefrag = 0;

	if (SOCK_RAW == sock->type) {
		inet->inet_num = protocol;
		if (IPPROTO_RAW == protocol)
			inet->hdrincl = 1;
	}

	if (net->ipv4.sysctl_ip_no_pmtu_disc)
		inet->pmtudisc = IP_PMTUDISC_DONT;
	else
		inet->pmtudisc = IP_PMTUDISC_WANT;

	inet->inet_id = 0;

	sock_init_data(sock, sk);

	sk->sk_destruct	   = inet_sock_destruct;
	sk->sk_protocol	   = protocol;
	sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

	inet->uc_ttl	= -1;
	inet->mc_loop	= 1;
	inet->mc_ttl	= 1;
	inet->mc_all	= 1;
	inet->mc_index	= 0;
	inet->mc_list	= NULL;
	inet->rcv_tos	= 0;

	sk_refcnt_debug_inc(sk);

	/* In this function inet_num is only set for SOCK_RAW (see above). */
	if (inet->inet_num) {
		/* It assumes that any protocol which allows
		 * the user to assign a number at socket
		 * creation time automatically
		 * shares.
		 */
		inet->inet_sport = htons(inet->inet_num);
		/* Add to protocol hash chains. */
		err = sk->sk_prot->hash(sk);
		if (err) {
			sk_common_release(sk);
			goto out;
		}
	}

	/* Protocol-specific initialisation, e.g. tcp_prot.init. */
	if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
out:
	return err;
out_rcu_unlock:
	rcu_read_unlock();
	goto out;
}

inet_create简化后

static int inet_create(struct net *net, struct socket *sock, int protocol,
		       int kern)
{
        struct sock *sk;
	struct inet_protosw *answer;
        struct proto *answer_prot;

list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
		if (protocol == answer->protocol) {
			if (protocol != IPPROTO_IP)
				break;
	}
	sock->ops = answer->ops;
	answer_prot = answer->prot;
	answer_flags = answer->flags;
	
	sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);


        if (sk->sk_prot->init) {
		err = sk->sk_prot->init(sk);
		if (err)
			sk_common_release(sk);
	}
}

list_for_each_entry_rcu用于遍历inetsw[sock->type]这条链表(inetsw是按socket类型索引的链表头数组),依次把链表上的inet_protosw节点取到answer中,并判断socket函数传进来的protocol是否等于answer->protocol。tcp的inet_protosw结构体如下。

/* inetsw_array[] entry for TCP, matched by inet_create() on type/protocol. */
{
		.type =       SOCK_STREAM,
		.protocol =   IPPROTO_TCP,
		/* Becomes answer_prot, which inet_create() passes to sk_alloc(). */
		.prot =       &tcp_prot,
		/* Becomes sock->ops in inet_create(). */
		.ops =        &inet_stream_ops,
		/* INET_PROTOSW_ICSK makes inet_create() set inet->is_icsk. */
		.flags =      INET_PROTOSW_PERMANENT |
			      INET_PROTOSW_ICSK,
}

sock->ops = answer->ops=inet_stream_ops
answer_prot=tcp_prot
answer_flags =INET_PROTOSW_PERMANENT |INET_PROTOSW_ICSK

tcp_prot
/*
 * Protocol operations for TCP over IPv4.  A struct sock created for
 * SOCK_STREAM/IPPROTO_TCP reaches these through sk->sk_prot.
 */
struct proto tcp_prot = {
	.name			= "TCP",
	.owner			= THIS_MODULE,
	.close			= tcp_close,
	/* Called as sk->sk_prot->connect() from __inet_stream_connect(). */
	.connect		= tcp_v4_connect,
	.disconnect		= tcp_disconnect,
	.accept			= inet_csk_accept,
	.ioctl			= tcp_ioctl,
	/* Called as sk->sk_prot->init() at the end of inet_create(). */
	.init			= tcp_v4_init_sock,
	.destroy		= tcp_v4_destroy_sock,
	.shutdown		= tcp_shutdown,
	.setsockopt		= tcp_setsockopt,
	.getsockopt		= tcp_getsockopt,
	.recvmsg		= tcp_recvmsg,
	.sendmsg		= tcp_sendmsg,
	.sendpage		= tcp_sendpage,
	/* Copied into sk->sk_backlog_rcv by inet_create(). */
	.backlog_rcv		= tcp_v4_do_rcv,
	.release_cb		= tcp_release_cb,
	/* Called as sk->sk_prot->hash() from inet_csk_listen_start(). */
	.hash			= inet_hash,
	.unhash			= inet_unhash,
	/* Called as sk->sk_prot->get_port() from __inet_bind(). */
	.get_port		= inet_csk_get_port,
	.enter_memory_pressure	= tcp_enter_memory_pressure,
	.stream_memory_free	= tcp_stream_memory_free,
	.sockets_allocated	= &tcp_sockets_allocated,
	.orphan_count		= &tcp_orphan_count,
	.memory_allocated	= &tcp_memory_allocated,
	.memory_pressure	= &tcp_memory_pressure,
	.sysctl_mem		= sysctl_tcp_mem,
	.sysctl_wmem		= sysctl_tcp_wmem,
	.sysctl_rmem		= sysctl_tcp_rmem,
	.max_header		= MAX_TCP_HEADER,
	.obj_size		= sizeof(struct tcp_sock),
	.slab_flags		= SLAB_DESTROY_BY_RCU,
	.twsk_prot		= &tcp_timewait_sock_ops,
	.rsk_prot		= &tcp_request_sock_ops,
	/* Hash tables read via sk->sk_prot->h.hashinfo (bhash, listening_hash). */
	.h.hashinfo		= &tcp_hashinfo,
	.no_autobind		= true,
#ifdef CONFIG_COMPAT
	.compat_setsockopt	= compat_tcp_setsockopt,
	.compat_getsockopt	= compat_tcp_getsockopt,
#endif
	.diag_destroy		= tcp_abort,
};

sk_alloc创建了一个sock结构体,详见初始化的sock模块

在后续的inet_create中判断了sk->sk_prot->init即tcp_prot->init是否存在,若存在则执行,不存在则不执行。

sock_map_fd

sock_create执行结束后我们得到了一个socket结构体和一个sock结构体,在socket系统调用时,我们传进去了一个socket结构体的引用。

retval = sock_create(family, type, protocol, &sock);

在__sock_create结尾(而不是inet_create中)将这个引用解引用并赋值为我们创建的socket结构体。

*res = sock;

sock_map_fd函数如下,其作用是将socket结构体与文件描述符进行绑定,就像0表示标准输入,1表示标准输出,2表示标准错误输出,在没打开其他文件的情况下,我们新创建的socket结构体的文件描述符为3。

retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));

/*
 * sock_map_fd - bind @sock to a newly reserved file descriptor.
 * @sock: socket to wrap in a struct file
 * @flags: O_CLOEXEC / O_NONBLOCK bits for the descriptor and the file
 *
 * Returns the descriptor on success.  Returns a negative errno when no
 * descriptor could be reserved or when sock_alloc_file() fails; in the
 * latter case the reserved descriptor is given back first so it is not
 * leaked.
 */
static int sock_map_fd(struct socket *sock, int flags)
{
	struct file *newfile;
	int fd = get_unused_fd_flags(flags);

	if (unlikely(fd < 0))
		return fd;

	newfile = sock_alloc_file(sock, flags, NULL);
	if (likely(!IS_ERR(newfile))) {
		fd_install(fd, newfile);
		return fd;
	}

	put_unused_fd(fd);
	return PTR_ERR(newfile);
}

Bind

bind函数原型,

int bind(int sockfd, const struct sockaddr *addr, socklen_t addrlen);
  • sockfd:socket对应的文件描述符
  • addr:一个sockaddr结构体,里面包含协议簇和端口
  • addrlen:addr的长度

在net/socket.c中可以找到对应的系统调用

/*
 * bind(2) entry point: resolve @fd to its socket, copy the user address
 * into kernel space and hand it to the socket's ->bind() operation.
 *
 * Returns 0 on success or a negative errno (set by sockfd_lookup_light()
 * when @fd does not refer to a socket).
 */
SYSCALL_DEFINE3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (sock) {
		err = move_addr_to_kernel(umyaddr, addrlen, &address);
		if (err >= 0) {
			err = security_socket_bind(sock,
						   (struct sockaddr *)&address,
						   addrlen);
			if (!err)
				err = sock->ops->bind(sock,
						      (struct sockaddr *)
						      &address, addrlen);
		}
		/* Drop the file reference taken by sockfd_lookup_light(). */
		fput_light(sock->file, fput_needed);
	}
	return err;
}
  • 通过sockfd_lookup_light用传进来的文件描述符fd找到对应的socket结构体。
  • move_addr_to_kernel用copy_from_user把用户态的sockaddr结构体复制到内核态空间的sockaddr_storage结构体中。
  • 调用sock->ops->bind

inet_bind

在sock_create函数中我们知道了sock->ops=answer->ops=inet_stream_ops,inet_stream_ops结构体如下

inet_stream_ops
/*
 * Socket-level operations for PF_INET stream sockets.  inet_create()
 * installs this table as sock->ops, so the system calls reach these
 * handlers through sock->ops-><op>().
 */
const struct proto_ops inet_stream_ops = {
	.family		   = PF_INET,
	.owner		   = THIS_MODULE,
	.release	   = inet_release,
	/* Called by the bind system call. */
	.bind		   = inet_bind,
	/* Called by the connect system call. */
	.connect	   = inet_stream_connect,
	.socketpair	   = sock_no_socketpair,
	.accept		   = inet_accept,
	.getname	   = inet_getname,
	.poll		   = tcp_poll,
	.ioctl		   = inet_ioctl,
	.gettstamp	   = sock_gettstamp,
	/* Called by the listen system call. */
	.listen		   = inet_listen,
	.shutdown	   = inet_shutdown,
	.setsockopt	   = sock_common_setsockopt,
	.getsockopt	   = sock_common_getsockopt,
	.sendmsg	   = inet_sendmsg,
	.recvmsg	   = inet_recvmsg,
#ifdef CONFIG_MMU
	.mmap		   = tcp_mmap,
#endif
	.splice_eof	   = inet_splice_eof,
	.splice_read	   = tcp_splice_read,
	.set_peek_off      = sk_set_peek_off,
	.read_sock	   = tcp_read_sock,
	.read_skb	   = tcp_read_skb,
	.sendmsg_locked    = tcp_sendmsg_locked,
	.peek_len	   = tcp_peek_len,
#ifdef CONFIG_COMPAT
	.compat_ioctl	   = inet_compat_ioctl,
#endif
	.set_rcvlowat	   = tcp_set_rcvlowat,
};

所以sock->ops->bind=inet_bind

inet_bind又调用了inet_bind_sk

/* Socket-level bind: forward to inet_bind_sk() on the underlying sock. */
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
	struct sock *sk = sock->sk;

	return inet_bind_sk(sk, uaddr, addr_len);
}

inet_bind_sk。这里就有点多态的影子了,判断是否重写了bind方法,如果重写了就执行sk->sk_prot->bind,没重写就执行父类的bind方法。

int inet_bind_sk(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	if (sk->sk_prot->bind) {
		return sk->sk_prot->bind(sk, uaddr, addr_len);
	return __inet_bind(sk, uaddr, addr_len, flags);
}

在tcp_prot中并没有重写bind方法,执行__inet_bind

__inet_bind函数
/*
 * __inet_bind - generic IPv4 bind: validate the address, record it in the
 * inet_sock and reserve the port through sk->sk_prot->get_port().
 * @sk: socket being bound
 * @uaddr: requested address, read as a struct sockaddr_in
 * @addr_len: length of @uaddr (not read in this function)
 * @flags: BIND_* bits; BIND_WITH_LOCK makes this function take and release
 *         the socket lock itself
 *
 * Returns 0 on success or a negative errno.
 */
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
		u32 flags)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct net *net = sock_net(sk);
	unsigned short snum;
	int chk_addr_ret;
	u32 tb_id = RT_TABLE_LOCAL;
	int err;

	if (addr->sin_family != AF_INET) {
		/* Compatibility games : accept AF_UNSPEC (mapped to AF_INET)
		 * only if s_addr is INADDR_ANY.
		 */
		err = -EAFNOSUPPORT;
		if (addr->sin_family != AF_UNSPEC ||
		    addr->sin_addr.s_addr != htonl(INADDR_ANY))
			goto out;
	}

	tb_id = l3mdev_fib_table_by_index(net, sk->sk_bound_dev_if) ? : tb_id;
	chk_addr_ret = inet_addr_type_table(net, addr->sin_addr.s_addr, tb_id);

	/* Not specified by any standard per-se, however it breaks too
	 * many applications when removed.  It is unfortunate since
	 * allowing applications to make a non-local bind solves
	 * several problems with systems using dynamic addressing.
	 * (ie. your servers still start up even if your ISDN link
	 *  is temporarily down)
	 */
	err = -EADDRNOTAVAIL;
	if (!inet_addr_valid_or_nonlocal(net, inet, addr->sin_addr.s_addr,
	                                 chk_addr_ret))
		goto out;

	/* Requested port in host byte order; 0 means none was specified. */
	snum = ntohs(addr->sin_port);
	err = -EACCES;
	if (!(flags & BIND_NO_CAP_NET_BIND_SERVICE) &&
	    snum && inet_port_requires_bind_service(net, snum) &&
	    !ns_capable(net->user_ns, CAP_NET_BIND_SERVICE))
		goto out;

	/*      We keep a pair of addresses. rcv_saddr is the one
	 *      used by hash lookups, and saddr is used for transmit.
	 *
	 *      In the BSD API these are the same except where it
	 *      would be illegal to use them (multicast/broadcast) in
	 *      which case the sending device address is used.
	 */
	if (flags & BIND_WITH_LOCK)
		lock_sock(sk);

	/* Check these errors (active socket, double bind). */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out_release_sock;

	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
	if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
		inet->inet_saddr = 0;  /* Use device */

	/* Make sure we are allowed to bind here. */
	if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
		      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
		/* For TCP this is inet_csk_get_port() (tcp_prot.get_port). */
		err = sk->sk_prot->get_port(sk, snum);
		if (err) {
			inet->inet_saddr = inet->inet_rcv_saddr = 0;
			goto out_release_sock;
		}
		if (!(flags & BIND_FROM_BPF)) {
			err = BPF_CGROUP_RUN_PROG_INET4_POST_BIND(sk);
			if (err) {
				/* Hook rejected the bind: undo address and port. */
				inet->inet_saddr = inet->inet_rcv_saddr = 0;
				if (sk->sk_prot->put_port)
					sk->sk_prot->put_port(sk);
				goto out_release_sock;
			}
		}
	}

	if (inet->inet_rcv_saddr)
		sk->sk_userlocks |= SOCK_BINDADDR_LOCK;
	if (snum)
		sk->sk_userlocks |= SOCK_BINDPORT_LOCK;
	/* Store the bound port in network byte order and clear the peer. */
	inet->inet_sport = htons(inet->inet_num);
	inet->inet_daddr = 0;
	inet->inet_dport = 0;
	sk_dst_reset(sk);
	err = 0;
out_release_sock:
	if (flags & BIND_WITH_LOCK)
		release_sock(sk);
out:
	return err;
}

简化后如下

/*
 * Simplified __inet_bind(): record the requested address, reserve the
 * port through sk->sk_prot->get_port() and store the resulting source
 * port.  Address validation, capability checks, socket locking and the
 * BPF post-bind hook of the full listing are left out.
 *
 * Returns 0 on success or a negative errno.
 */
int __inet_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len,
		u32 flags)
{
	struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	unsigned short snum;
	int err;

	snum = ntohs(addr->sin_port);

	/* Refuse an active socket or one that already has a port. */
	err = -EINVAL;
	if (sk->sk_state != TCP_CLOSE || inet->inet_num)
		goto out;

	inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;

	if (snum || !(inet_test_bit(BIND_ADDRESS_NO_PORT, sk) ||
		      (flags & BIND_FORCE_ADDRESS_NO_PORT))) {
		err = sk->sk_prot->get_port(sk, snum);
		if (err) {
			/* Port not available: forget the address again. */
			inet->inet_saddr = inet->inet_rcv_saddr = 0;
			goto out;
		}
	}

	inet->inet_sport = htons(inet->inet_num);
	err = 0;
out:
	return err;
}
  • snum=我们设定的端口
  • inet->inet_rcv_saddr=inet->inet_saddr=设定的地址,sock到inet_sock的转化具体见sock的初始化
  • 如果我们指定了端口则进入sk->sk_prot->get_port(sk, snum);中
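  • 若sk->sk_prot->get_port返回错误,则把inet_saddr和inet_rcv_saddr清零后退出;按上面的完整源码,sk->sk_prot->put_port只在get_port成功、但随后的BPF_CGROUP_RUN_PROG_INET4_POST_BIND钩子失败时才会被调用,用来释放刚占用的端口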

inet_csk_get_port

sk->sk_prot->get_port即tcp_prot->get_port=inet_csk_get_port。该部分比较长,源码如下

inet_csk_get_port
/*
 * inet_csk_get_port - bind @sk to local port @snum, or to an automatically
 * chosen port from the local port range when @snum is 0.
 * @sk: socket to bind
 * @snum: requested port in host byte order, 0 for automatic selection
 *
 * Finds or creates the inet_bind_bucket for the port in hinfo->bhash and,
 * unless a bind conflict is reported, attaches @sk to it with
 * inet_bind_hash().
 *
 * Returns 0 on success and 1 on failure (ret starts at 1 and is only
 * cleared on the success path).
 */
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
	bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
	struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
	int ret = 1, attempts = 5, port = snum;
	int smallest_size = -1, smallest_port;
	struct inet_bind_hashbucket *head;
	struct net *net = sock_net(sk);
	int i, low, high, attempt_half;
	struct inet_bind_bucket *tb;
	kuid_t uid = sock_i_uid(sk);
	u32 remaining, offset;

	/* A specific port was requested (or chosen below via have_port). */
	if (port) {
have_port:
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
	/* Automatic selection; re-entered from tb_found while attempts remain. */
again:
	attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
other_half_scan:
	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	if (high - low < 4)
		attempt_half = 0;
	if (attempt_half) {
		int half = low + (((high - low) >> 2) << 1);

		if (attempt_half == 1)
			high = half;
		else
			low = half;
	}
	/* Size of the range to scan, forced even when larger than 1. */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	offset = prandom_u32() % remaining;
	/* __inet_hash_connect() favors ports having @low parity
	 * We do the opposite to not pollute connect() users.
	 */
	offset |= 1U;
	smallest_size = -1;
	smallest_port = low; /* avoid compiler warning */

other_parity_scan:
	port = low + offset;
	/* Step by 2 so one pass only visits ports of a single parity. */
	for (i = 0; i < remaining; i += 2, port += 2) {
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				/* Remember the reusable bucket with the fewest owners. */
				if (((tb->fastreuse > 0 && reuse) ||
				     (tb->fastreuseport > 0 &&
				      sk->sk_reuseport &&
				      !rcu_access_pointer(sk->sk_reuseport_cb) &&
				      uid_eq(tb->fastuid, uid))) &&
				    (tb->num_owners < smallest_size || smallest_size == -1)) {
					smallest_size = tb->num_owners;
					smallest_port = port;
				}
				if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
					goto tb_found;
				goto next_port;
			}
		/* No bucket exists for this port yet; head->lock is still held. */
		goto tb_not_found;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	if (smallest_size != -1) {
		port = smallest_port;
		goto have_port;
	}
	/* Nothing found with this parity: flip the low bit and rescan once. */
	offset--;
	if (!(offset & 1))
		goto other_parity_scan;

	if (attempt_half == 1) {
		/* OK we now try the upper half of the range */
		attempt_half = 2;
		goto other_half_scan;
	}
	/* No usable port; no bucket lock is held on this path. */
	return ret;

tb_not_found:
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
				     net, head, port);
	if (!tb)
		goto fail_unlock;
tb_found:
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;

		if (((tb->fastreuse > 0 && reuse) ||
		     (tb->fastreuseport > 0 &&
		      !rcu_access_pointer(sk->sk_reuseport_cb) &&
		      sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
		    smallest_size == -1)
			goto success;
		if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
			if ((reuse ||
			     (tb->fastreuseport > 0 &&
			      sk->sk_reuseport &&
			      !rcu_access_pointer(sk->sk_reuseport_cb) &&
			      uid_eq(tb->fastuid, uid))) &&
			    smallest_size != -1 && --attempts >= 0) {
				spin_unlock_bh(&head->lock);
				goto again;
			}
			goto fail_unlock;
		}
	}

	inet_csk_update_fastreuse(tb, sk);

success:
	/* Attach sk to the bucket unless it already has one. */
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
	ret = 0;

fail_unlock:
	spin_unlock_bh(&head->lock);
	return ret;
}

好像都挺重要的没啥可以删掉的。。

/* Excerpt: a port is known, so look for its bucket in the bind hash table. */
if (port) {
have_port:
		/* Hash (net, port) to select the bucket list, then lock it. */
		head = &hinfo->bhash[inet_bhashfn(net, port,hinfo->bhash_size)];
		spin_lock_bh(&head->lock);
		/* Walk the chain looking for a bucket with the same netns and port. */
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port)
				goto tb_found;

		goto tb_not_found;
	}
  • 判断是否指定了端口
  • 如果指定了端口则将端口作hash处理,并找到对应hash桶的inet_bind_hashbucket结构体
  • 从对应的inet_bind_hashbucket结构体遍历inet_bind_bucket,具体详见必要知识
  • 并对比找到的inet_bind_bucket结构体的端口是否等于传进来的端口
  • 若找到对应的inet_bind_bucket结构体,则跳转到tb_found,未找到则跳转到tb_not_found

若未指定端口则执行以下函数

/* Excerpt: derive the scan range and a random starting offset. */
inet_get_local_port_range(net, &low, &high);
	/* Make high exclusive. */
	high++;
	if (high - low < 4)
		attempt_half = 0;
	/* Size of the range, forced even when larger than 1. */
	remaining = high - low;
	if (likely(remaining > 1))
		remaining &= ~1U;

	/* Random offset into the range; the first port tried is low + offset. */
	offset = prandom_u32() % remaining;
  • 取到本地最小端口和最大端口,一般设置为(32768, 60999)
  • 计算可选端口范围的大小remaining(即high - low,大于1时清掉最低位取偶数),注意它并不是“剩余空闲端口数”
  • 用随机数对remaining取模得到随机偏移offset,实际尝试的第一个端口为low + offset

接着进行端口的检查,以下代码经过简化,排除了端口复用的情况

other_half_scan:
port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		head = &hinfo->bhash[inet_bhashfn(net, port,hinfo->bhash_size)];
		inet_bind_bucket_for_each(tb, &head->chain)
			if (net_eq(ib_net(tb), net) && tb->port == port) {
                           if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
				goto tb_found;
			goto next_port;
			}
		goto tb_not_found;
next_port:
		offset--;
                goto other_half_scan;
	} 
  • 用随机得到的端口进行hash计算并取得对应的inet_bind_hashbucket结构体
  • 遍历inet_bind_hashbucket结构体找到所有inet_bind_bucket结构体对比端口号
  • 若找到则判断该端口是否被占用,若没被占用则跳转至tb_found,若被占用则跳转到next_port
  • 若没找到对应inet_bind_bucket结构体则跳转至tb_not_found
  • next_port处只是释放当前桶的锁,然后for循环继续尝试下一个端口(port += 2);按完整源码,整轮扫描都没有找到可用端口时才执行offset--,换另一种奇偶性从other_parity_scan重新扫描

将对应的inet_bind_bucket加入bhash中

/* Excerpt (simplified): create the bucket if needed, then attach sk to it. */
tb_not_found:
	/* NOTE(review): the full listing also handles a NULL return here. */
	tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,net, head, port);
tb_found:
	/* Bucket already has owners: the full listing checks reuse/conflicts here. */
	if (!hlist_empty(&tb->owners)) {
		if (sk->sk_reuse == SK_FORCE_REUSE)
			goto success;
	}
success:
	/* Attach sk to the bucket unless it is already bound to one. */
	if (!inet_csk(sk)->icsk_bind_hash)
		inet_bind_hash(sk, tb, port);
	ret = 0;
  • 若没找到对应的inet_bind_bucket结构体则创建一个并初始化(inet_bind_bucket_create)
  • 若找到了inet_bind_bucket结构体,则判断inet_bind_bucket结构体对应的sock结构体是否允许复用
  • 若允许或者inet_bind_bucket结构体对应的sock结构体为空则将inet_bind_bucket与sock结构体进行绑定

inet_bind_hash

void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb,const unsigned short snum)
{
	inet_sk(sk)->inet_num = snum;
	sk_add_bind_node(sk, &tb->owners);
	tb->num_owners++;
	inet_csk(sk)->icsk_bind_hash = tb;
}
  • 将sock转换成inet_sk并将指定的端口保存在inet_num成员中
  • 将inet_bind_bucket结构体的owners成员指向sock结构体,并将num_owners++
  • 将sock转换成inet_connection_sock结构体并将inet_bind_bucket保存在icsk_bind_hash成员中

至此bind函数结束

总结

listen

listen函数原型如下

int listen(int sockfd, int backlog);
  • sockfd:想要监听socket的文件描述符
  • backlog:进入队列中等待被处理的连接的最大数量,当有新的客户端连接请求时,如果已经有 backlog 个连接处于等待状态,新的连接请求将被拒绝

listen系统调用如下

/*
 * listen(2) entry point: resolve @fd to its socket, clamp @backlog to the
 * namespace's somaxconn sysctl and call the socket's ->listen() operation.
 *
 * Returns 0 on success or a negative errno.
 */
SYSCALL_DEFINE2(listen, int, fd, int, backlog)
{
	struct socket *sock;
	int err, fput_needed;
	int somaxconn;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/* The unsigned compare also clamps a negative backlog. */
	somaxconn = READ_ONCE(sock_net(sock->sk)->core.sysctl_somaxconn);
	if ((unsigned int)backlog > somaxconn)
		backlog = somaxconn;

	err = security_socket_listen(sock, backlog);
	if (err == 0)
		err = sock->ops->listen(sock, backlog);

	fput_light(sock->file, fput_needed);
	return err;
}
  • sockfd_lookup_light根据对应fd找到socket结构体
  • 执行sock->ops->listen即inet_stream_ops->listen=inet_listen

inet_listen

inet_listen函数如下

int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char old_state;
	int err;
	if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
		goto out;
	old_state = sk->sk_state;
	if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
		goto out;
	if (old_state != TCP_LISTEN) {
		err = inet_csk_listen_start(sk, backlog);
		if (err)
			goto out;
	}
	sk->sk_max_ack_backlog = backlog;
	err = 0;
	return err;
}
  • 判断当前socket的状态,如果当前socket处于SS_UNCONNECTED状态且类型是SOCK_STREAM就往下执行。
  • 判断sock的状态,如果当前sock处于TCPF_CLOSE | TCPF_LISTEN 状态就往下继续执行,否则都终止。
  • 执行inet_csk_listen_start
  • 更新sk->sk_max_ack_backlog = backlog;

inet_csk_listen_start

inet_csk_listen_start如下

int inet_csk_listen_start(struct sock *sk, int backlog)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);

	reqsk_queue_alloc(&icsk->icsk_accept_queue);

	sk->sk_max_ack_backlog = backlog;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	sk_state_store(sk, TCP_LISTEN);
	if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
		inet->inet_sport = htons(inet->inet_num);
		sk_dst_reset(sk);
		err = sk->sk_prot->hash(sk);
                if (likely(!err))
			return 0;
	sk->sk_state = TCP_CLOSE;
	return err;
}
  • 用参数初始化sk_max_ack_backlog
  • 将当前sock的状态设置为TCP_LISTEN
  • 因为bind() 与 listen() 之间可能存在时间差,其他进程可能已占用该端口。所以需要重新调用sk->sk_prot->get_port检查端口是否冲突。
  • 调用sk->sk_prot->hash将sock加入listening_hash

inet_hash

sk->sk_prot->hash=inet_hash,inet_hash内部再调用__inet_hash,__inet_hash如下

/*
 * __inet_hash - insert @sk into the protocol's hash tables.
 * @sk: socket to insert
 * @osk: passed through to inet_ehash_nolisten() for non-listening sockets
 * @saddr_same: address comparison callback handed to
 *              inet_reuseport_add_sock()
 *
 * A sock that is not in TCP_LISTEN is handed to inet_ehash_nolisten() and
 * 0 is returned.  A listening sock is added to its listening_hash bucket
 * under the bucket lock.
 *
 * Returns 0 on success or the error from inet_reuseport_add_sock().
 */
int __inet_hash(struct sock *sk, struct sock *osk,
		 int (*saddr_same)(const struct sock *sk1,
				   const struct sock *sk2,
				   bool match_wildcard))
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct inet_listen_hashbucket *ilb;
	int err = 0;

	if (sk->sk_state != TCP_LISTEN) {
		inet_ehash_nolisten(sk, osk);
		return 0;
	}
	/* Select the listening bucket for this sock. */
	ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];

	spin_lock(&ilb->lock);
	if (sk->sk_reuseport) {
		err = inet_reuseport_add_sock(sk, ilb, saddr_same);
		if (err)
			goto unlock;
	}
	/* IPv6 reuseport socks go to the tail, everything else to the head. */
	if (IS_ENABLED(CONFIG_IPV6) && sk->sk_reuseport &&
		sk->sk_family == AF_INET6)
		__sk_nulls_add_node_tail_rcu(sk, &ilb->nulls_head);
	else
		__sk_nulls_add_node_rcu(sk, &ilb->nulls_head);
	sock_set_flag(sk, SOCK_RCU_FREE);
	sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
unlock:
	spin_unlock(&ilb->lock);

	return err;
}
  • 判断当前sock是否处于TCP_LISTEN(listening)状态,若不是则调用inet_ehash_nolisten处理并直接返回0
  • 对端口号进行hash(inet_sk_listen_hashfn),找到listening_hash中对应的inet_listen_hashbucket
  • 判断当前协议簇调用不同函数将当前sock加入listening_hash

看一下__sk_nulls_add_node_rcu--->hlist_nulls_add_head_rcu

/* Insert @sk at the head of the nulls list @list via its sk_nulls_node. */
static inline void __sk_nulls_add_node_rcu(struct sock *sk, struct hlist_nulls_head *list)
{
	struct hlist_nulls_node *node = &sk->sk_nulls_node;

	hlist_nulls_add_head_rcu(node, list);
}
/*
 * hlist_nulls_add_head_rcu - insert node @n at the head of nulls list @h.
 * @n: node to insert
 * @h: list head
 *
 * @n is fully linked (next and pprev) before it is published as the new
 * first element with rcu_assign_pointer().  The old first node's pprev is
 * fixed up afterwards, unless the list was empty (first is a nulls marker).
 */
static inline void hlist_nulls_add_head_rcu(struct hlist_nulls_node *n,
					    struct hlist_nulls_head *h)
{
	struct hlist_nulls_node *first = h->first;

	n->next = first;
	WRITE_ONCE(n->pprev, &h->first);
	rcu_assign_pointer(hlist_nulls_first_rcu(h), n);
	if (!is_a_nulls(first))
		WRITE_ONCE(first->pprev, &n->next);
}
  • 即sk->sk_nulls_node.next指向桶中原来的第一个节点(inet_listen_hashbucket的nulls_head.first),随后nulls_head.first指向sk->sk_nulls_node,也就是把sock插入到该链表的头部

总结

Connect

connect函数原型如下

int connect(int sockfd, const struct sockaddr *addr ,socklen_t addrlen);
  • sockfd:socket的文件描述符
  • addr :连接的地址和端口以及协议簇
  • addrlen:addr的长度

系统调用如下


/*
 * connect(2) entry point: resolve @fd to its socket, copy the peer address
 * into kernel space and call the socket's ->connect() operation.
 *
 * Returns 0 on success or a negative errno (set by sockfd_lookup_light()
 * when @fd does not refer to a socket).
 */
SYSCALL_DEFINE3(connect, int, fd, struct sockaddr __user *, uservaddr,int, addrlen)
{
	struct socket *sock;
	struct sockaddr_storage address;
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = move_addr_to_kernel(uservaddr, addrlen, &address);
	if (err < 0)
		goto out_put;

	err = sock->ops->connect(sock, (struct sockaddr *)&address, addrlen,
				 sock->file->f_flags);
out_put:
	/* Drop the file reference taken by sockfd_lookup_light(). */
	fput_light(sock->file, fput_needed);
out:
	return err;
}
  • 调用sockfd_lookup_light寻找与fd相关的socket
  • 将addr移动到内核
  • 调用inet_stream_ops->connect=inet_stream_connect

inet_stream_connect

简化代码如下,不考虑报错以及已连接状态

int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
			  int addr_len, int flags)
{
	struct sock *sk = sock->sk;
	int err;
	long timeo;
	switch (sock->state) {
	case SS_UNCONNECTED:
		if (sk->sk_state != TCP_CLOSE)
			goto out;
		err = sk->sk_prot->connect(sk, uaddr, addr_len);
		sock->state = SS_CONNECTING;
		break;
	}
	timeo = sock_sndtimeo(sk, flags & O_NONBLOCK);

	if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
		if (!timeo || !inet_wait_for_connect(sk, timeo, writebias))
			goto out;
	}
	sock->state = SS_CONNECTED;
	err = 0;
}
  • 根据sock->state选择执行的代码
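  • 在SS_UNCONNECTED状态下,若sock的状态不是TCP_CLOSE则直接跳到out退出;只有处于TCP_CLOSE状态时才调用sk->sk_prot->connect(即tcp_v4_connect),并把socket状态置为SS_CONNECTING
  • 调用sock_sndtimeo取得等待超时:若设置了O_NONBLOCK(非阻塞)则返回0,阻塞则返回发送超时sk->sk_sndtimeo(单位是jiffies而不是秒)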
  • 判断若sock是非阻塞的,则退出,若为阻塞则执行inet_wait_for_connect

tcp_v4_connect

源代码如下

tcp_v4_connect
/*
 * tcp_v4_connect - start an outgoing IPv4 TCP connection.
 * @sk: socket to connect
 * @uaddr: destination, read as a struct sockaddr_in
 * @addr_len: length of @uaddr; must cover a struct sockaddr_in
 *
 * Resolves a route to the destination, fills in the source and destination
 * address/port on the socket, moves it to TCP_SYN_SENT, hashes it with
 * inet_hash_connect() and finally calls tcp_connect().
 *
 * Returns 0 on success or a negative errno.  On a failure after the state
 * change the socket is put back into TCP_CLOSE and inet_dport is cleared.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	struct tcp_sock *tp = tcp_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;
	struct ip_options_rcu *inet_opt;

	if (addr_len < sizeof(struct sockaddr_in))
		return -EINVAL;

	if (usin->sin_family != AF_INET)
		return -EAFNOSUPPORT;

	nexthop = daddr = usin->sin_addr.s_addr;
	inet_opt = rcu_dereference_protected(inet->inet_opt,
					     lockdep_sock_is_held(sk));
	/* With a source-route option the first hop comes from the option. */
	if (inet_opt && inet_opt->opt.srr) {
		if (!daddr)
			return -EINVAL;
		nexthop = inet_opt->opt.faddr;
	}

	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;
	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP,
			      orig_sport, orig_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		if (err == -ENETUNREACH)
			IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES);
		return err;
	}

	if (rt->rt_flags & (RTCF_MULTICAST | RTCF_BROADCAST)) {
		ip_rt_put(rt);
		return -ENETUNREACH;
	}

	if (!inet_opt || !inet_opt->opt.srr)
		daddr = fl4->daddr;

	/* No source address bound yet: take the one chosen by routing. */
	if (!inet->inet_saddr)
		inet->inet_saddr = fl4->saddr;
	sk_rcv_saddr_set(sk, inet->inet_saddr);

	if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
		/* Reset inherited state */
		tp->rx_opt.ts_recent	   = 0;
		tp->rx_opt.ts_recent_stamp = 0;
		if (likely(!tp->repair))
			tp->write_seq	   = 0;
	}

	if (tcp_death_row.sysctl_tw_recycle &&
	    !tp->rx_opt.ts_recent_stamp && fl4->daddr == daddr)
		tcp_fetch_timewait_stamp(sk, &rt->dst);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	inet_csk(sk)->icsk_ext_hdr_len = 0;
	if (inet_opt)
		inet_csk(sk)->icsk_ext_hdr_len = inet_opt->opt.optlen;

	tp->rx_opt.mss_clamp = TCP_MSS_DEFAULT;

	/* Socket identity is still unknown (sport may be zero).
	 * However we set state to SYN-SENT and not releasing socket
	 * lock select source port, enter ourselves into the hash tables and
	 * complete initialization after this.
	 */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		goto failure;

	sk_set_txhash(sk);

	/* Update the route now that the final source port is known. */
	rt = ip_route_newports(fl4, rt, orig_sport, orig_dport,
			       inet->inet_sport, inet->inet_dport, sk);
	if (IS_ERR(rt)) {
		err = PTR_ERR(rt);
		rt = NULL;
		goto failure;
	}
	/* OK, now commit destination to socket.  */
	sk->sk_gso_type = SKB_GSO_TCPV4;
	sk_setup_caps(sk, &rt->dst);

	/* Pick an initial sequence number unless one is already set. */
	if (!tp->write_seq && likely(!tp->repair))
		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
							   inet->inet_daddr,
							   inet->inet_sport,
							   usin->sin_port);

	inet->inet_id = prandom_u32();

	err = tcp_connect(sk);

	/* rt is cleared so the failure path below does not put it again. */
	rt = NULL;
	if (err)
		goto failure;

	return 0;

failure:
	/*
	 * This unhashes the socket and releases the local port,
	 * if necessary.
	 */
	tcp_set_state(sk, TCP_CLOSE);
	ip_rt_put(rt);
	sk->sk_route_caps = 0;
	inet->inet_dport = 0;
	return err;
}

简化后

/*
 * Trimmed walk-through of tcp_v4_connect(): resolve a route to the
 * destination, record the destination on the socket, switch to SYN-SENT,
 * choose/hash the local port and finally send the SYN.
 *
 * IP-option handling, timestamp recycling, the route re-lookup after the
 * source port is known and the "failure:" unwinding of the full listing
 * are intentionally left out.
 *
 * Returns 0 on success or a negative errno from routing, port selection
 * or tcp_connect().
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
	struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
	struct inet_sock *inet = inet_sk(sk);
	__be16 orig_sport, orig_dport;
	__be32 daddr, nexthop;
	struct flowi4 *fl4;
	struct rtable *rt;
	int err;

	/* Destination address/port come from the caller; the source port is
	 * whatever the socket already has (may still be zero).
	 */
	nexthop = daddr = usin->sin_addr.s_addr;
	orig_sport = inet->inet_sport;
	orig_dport = usin->sin_port;

	fl4 = &inet->cork.fl.u.ip4;
	rt = ip_route_connect(fl4, nexthop, inet->inet_saddr,
			      RT_CONN_FLAGS(sk), sk->sk_bound_dev_if,
			      IPPROTO_TCP, orig_sport, orig_dport, sk);
	if (IS_ERR(rt))
		return PTR_ERR(rt);

	inet->inet_dport = usin->sin_port;
	sk_daddr_set(sk, daddr);

	/* Enter SYN-SENT before the port is chosen, then hash the socket. */
	tcp_set_state(sk, TCP_SYN_SENT);
	err = inet_hash_connect(&tcp_death_row, sk);
	if (err)
		return err;	/* full version unwinds via "failure:" */

	return tcp_connect(sk);
}
  • 从参数中获取目的IP,目的端口,分别赋值给nexthop(以及daddr)和orig_dport。orig_sport取自inet->inet_sport,即已绑定端口的网络字节序形式(绑定时inet_bind_hash把端口记录在inet_sk(sk)->inet_num中);如果还没有绑定端口,该值为0。
  • 调用ip_route_connect根据目的IP进行路由规划
  • 将目的端口赋值给inet->inet_dport,并设置该sock的状态为TCP_SYN_SENT
  • 执行inet_hash_connect
  • 执行tcp_connect

inet_hash_connect

代码如下

inet_hash_connect
/*
 * Choose (or validate) the local port of a connecting socket and insert the
 * socket into the established hash.
 *
 * If the socket already has a port (inet_num != 0) and is the only owner of
 * its bind bucket, it is hashed directly; otherwise @check_established
 * decides.  Without a port, the local port range is scanned starting at a
 * perturbed offset, one parity at a time.
 *
 * Returns 0 on success, the result of @check_established for the pre-bound
 * case, -ENOMEM if a bind bucket cannot be created, or -EADDRNOTAVAIL when
 * no port in the range is usable.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	u32 index;

	/* Socket was already bound to a port. */
	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		spin_lock_bh(&head->lock);
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			spin_unlock_bh(&head->lock);
			return 0;
		}
		/* Only the spinlock is dropped here; bottom halves stay
		 * disabled until the local_bh_enable() below.
		 */
		spin_unlock(&head->lock);
		/* No definite answer... Walk to established hash table */
		ret = check_established(death_row, sk, port, NULL);
		local_bh_enable();
		return ret;
	}

	inet_get_local_port_range(net, &low, &high);
	high++; /* [32768, 60999] -> [32768, 61000[ */
	remaining = high - low;
	/* Round the range size down to an even count. */
	if (likely(remaining > 1))
		remaining &= ~1U;

	net_get_random_once(table_perturb,
			    INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);

	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;

	/* In first pass we try ports of @low parity.
	 * inet_csk_get_port() does the opposite choice.
	 */
	offset &= ~1U;
other_parity_scan:
	port = low + offset;
	for (i = 0; i < remaining; i += 2, port += 2) {
		/* Wrap around to the start of the range. */
		if (unlikely(port >= high))
			port -= remaining;
		if (inet_is_local_reserved_port(net, port))
			continue;
		head = &hinfo->bhash[inet_bhashfn(net, port,
						  hinfo->bhash_size)];
		spin_lock_bh(&head->lock);

		/* Does not bother with rcv_saddr checks, because
		 * the established check is already unique enough.
		 */
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				/* Buckets created below get fastreuse and
				 * fastreuseport set to -1, so a value >= 0
				 * marks a bucket this path did not create.
				 */
				if (tb->fastreuse >= 0 ||
				    tb->fastreuseport >= 0)
					goto next_port;
				WARN_ON(hlist_empty(&tb->owners));
				if (!check_established(death_row, sk,
						       port, &tw))
					goto ok;
				goto next_port;
			}
		}

		/* No bucket for this port yet: create one and take it. */
		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
					     net, head, port);
		if (!tb) {
			spin_unlock_bh(&head->lock);
			return -ENOMEM;
		}
		tb->fastreuse = -1;
		tb->fastreuseport = -1;
		goto ok;
next_port:
		spin_unlock_bh(&head->lock);
		cond_resched();
	}

	/* First pass exhausted: flip parity and scan once more. */
	offset++;
	if ((offset & 1) && remaining > 1)
		goto other_parity_scan;

	return -EADDRNOTAVAIL;

ok:
	/* Here we want to add a little bit of randomness to the next source
	 * port that will be chosen. We use a max() with a random here so that
	 * on low contention the randomness is maximal and on high contention
	 * it may be inexistent.
	 */
	i = max_t(int, i, (prandom_u32() & 7) * 2);
	WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2);

	/* Head lock still held and bh's disabled */
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	if (tw)
		inet_twsk_bind_unhash(tw, hinfo);
	spin_unlock(&head->lock);
	if (tw)
		inet_twsk_deschedule_put(tw);
	local_bh_enable();
	return 0;
}

简化后

/*
 * Trimmed version of __inet_hash_connect() that keeps only the control
 * flow: locking, reserved-port checks, the second (other parity) scan and
 * time-wait recycling bookkeeping are left out.
 *
 * Returns 0 once the socket is bound and hashed, the result of
 * @check_established for an already-bound socket, or -EADDRNOTAVAIL when
 * the scan finds no usable port.
 */
int __inet_hash_connect(struct inet_timewait_death_row *death_row,
		struct sock *sk, u64 port_offset,
		int (*check_established)(struct inet_timewait_death_row *,
			struct sock *, __u16, struct inet_timewait_sock **))
{
	struct inet_hashinfo *hinfo = death_row->hashinfo;
	struct inet_timewait_sock *tw = NULL;
	struct inet_bind_hashbucket *head;
	int port = inet_sk(sk)->inet_num;
	struct net *net = sock_net(sk);
	struct inet_bind_bucket *tb;
	u32 remaining, offset;
	int ret, i, low, high;
	u32 index;

	/* Case 1: the socket already owns a port. */
	if (port) {
		head = &hinfo->bhash[inet_bhashfn(net, port,hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			return 0;
		}
		ret = check_established(death_row, sk, port, NULL);
		return ret;
	}

	/* Case 2: pick a port from the local range, starting at a
	 * perturbed offset so consecutive connects do not collide.
	 */
	inet_get_local_port_range(net, &low, &high);
	high++;
	remaining = high - low;
	net_get_random_once(table_perturb,INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb));
	index = port_offset & (INET_TABLE_PERTURB_SIZE - 1);
	offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32);
	offset %= remaining;
	port = low + offset;

	for (i = 0; i < remaining; i += 2, port += 2) {
		if (port >= high)
			port -= remaining;	/* wrap inside the range */
		head = &hinfo->bhash[inet_bhashfn(net, port,hinfo->bhash_size)];
		inet_bind_bucket_for_each(tb, &head->chain) {
			if (net_eq(ib_net(tb), net) && tb->port == port) {
				if (!check_established(death_row, sk,port, &tw))
					goto ok;
				goto next_port;
			}
		}
		/* Nobody uses this port yet: create its bucket. */
		tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,net, head, port);
		goto ok;
next_port:
		continue;
	}
	return -EADDRNOTAVAIL;

ok:
	inet_bind_hash(sk, tb, port);
	if (sk_unhashed(sk)) {
		inet_sk(sk)->inet_sport = htons(port);
		inet_ehash_nolisten(sk, (struct sock *)tw);
	}
	return 0;
}

一部分一部分看。


	/* Excerpt: branch taken when the socket already has a local port. */
	if (port) {
		/* Bind-hash bucket head for this port. */
		head = &hinfo->bhash[inet_bhashfn(net, port,hinfo->bhash_size)];
		tb = inet_csk(sk)->icsk_bind_hash;
		/* sk is the first owner of tb and has no next node, i.e. it
		 * is the only socket on this bind bucket.
		 */
		if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
			inet_ehash_nolisten(sk, NULL);
			return 0;
		}
		/* Otherwise let the caller-supplied check decide. */
		ret = check_established(death_row, sk, port, NULL);
		return ret;
	}
  • 判断是否绑定了端口,如果绑定则找到对应的inet_bind_hashbucket
  • 判断inet_bind_bucket(即tb)->owners链表的第一个sock是否就是当前sock,并且该bucket的拥有者是否只有当前sock一个,若为真则执行inet_ehash_nolisten
  • 否则执行check_established
inet_ehash_nolisten

inet_ehash_nolisten代码如下,调用了inet_ehash_insert

/* Excerpt of inet_ehash_nolisten(): only the call that inserts @sk into the
 * established hash is shown; the rest of the body is elided.
 */
bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
{
	bool ok = inet_ehash_insert(sk, osk);
................................
}
inet_ehash_insert

简化后

bool inet_ehash_insert(struct sock *sk, struct sock *osk)
{
	struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
	struct hlist_nulls_head *list;
	struct inet_ehash_bucket *head;
	bool ret = true;

	sk->sk_hash = sk_ehashfn(sk);
	head = inet_ehash_bucket(hashinfo, sk->sk_hash);
	list = &head->chain;
	if (ret)
		__sk_nulls_add_node_rcu(sk, list);
	return ret;
}
  • 调用sk_ehashfn用sk生成对应ehash对应四元组的hash赋值给sk->sk_hash
  • 调用__sk_nulls_add_node_rcu插入对应hash桶

我们不考虑端口复用的情况,所以每个bucket对应的sock都是唯一的,所以不执行check_established,未指定端口则和bind中的逻辑一致,在此不多叙述,详见bind部分

最后仍然调用inet_ehash_nolisten进行ehash的插入

tcp_connect

代码如下

tcp_connect
/*
 * Build and send the initial SYN of an active open.
 *
 * Returns 0 on success (also in repair mode, where no SYN is sent),
 * -EHOSTUNREACH if the header cannot be rebuilt, -ENOBUFS if no skb could
 * be allocated, or -ECONNREFUSED when the transmit path reports it.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk))
		return -EHOSTUNREACH; /* Routing failure or similar. */

	tcp_connect_init(sk);

	/* Repair mode: finish the connection without sending anything. */
	if (unlikely(tp->repair)) {
		tcp_finish_connect(sk, NULL);
		return 0;
	}

	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* The SYN consumes one sequence number, hence write_seq++. */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tp->retrans_stamp = tcp_time_stamp;
	tcp_connect_queue_skb(sk, buff);
	tcp_ecn_send_syn(sk, buff);

	/* Send off SYN; include data in Fast Open. */
	err = tp->fastopen_req ? tcp_send_syn_data(sk, buff) :
	      tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	/* Only -ECONNREFUSED aborts here; other errors fall through and the
	 * retransmit timer armed below repeats the SYN.
	 */
	if (err == -ECONNREFUSED)
		return err;

	/* We change tp->snd_nxt after the tcp_transmit_skb() call
	 * in order to make this packet get counted in tcpOutSegs.
	 */
	tp->snd_nxt = tp->write_seq;
	tp->pushed_seq = tp->write_seq;
	buff = tcp_send_head(sk);
	if (unlikely(buff)) {
		tp->snd_nxt	= TCP_SKB_CB(buff)->seq;
		tp->pushed_seq	= TCP_SKB_CB(buff)->seq;
	}
	TCP_INC_STATS(sock_net(sk), TCP_MIB_ACTIVEOPENS);

	/* Timer for repeating the SYN until an answer. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}

主要逻辑如下

/*
 * Main path of tcp_connect(): allocate an skb, turn it into a SYN, queue
 * it on the write queue, transmit it and arm the retransmit timer.
 * Header rebuild, repair mode, ECN and Fast Open are left out.
 *
 * Returns 0 on success, -ENOBUFS if no skb could be allocated, or
 * -ECONNREFUSED when the transmit path reports it.
 */
int tcp_connect(struct sock *sk)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct sk_buff *buff;
	int err;

	buff = sk_stream_alloc_skb(sk, 0, sk->sk_allocation, true);
	if (unlikely(!buff))
		return -ENOBUFS;

	/* The SYN consumes one sequence number, hence write_seq++. */
	tcp_init_nondata_skb(buff, tp->write_seq++, TCPHDR_SYN);
	tp->retrans_stamp = tcp_time_stamp;
	tcp_connect_queue_skb(sk, buff);

	err = tcp_transmit_skb(sk, buff, 1, sk->sk_allocation);
	if (err == -ECONNREFUSED)
		return err;

	/* Timer for repeating the SYN until an answer arrives. */
	inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS,
				  inet_csk(sk)->icsk_rto, TCP_RTO_MAX);
	return 0;
}
  • 调用sk_stream_alloc_skb生成一个sk_buff结构体,该结构体具体看必要知识
  • 调用tcp_init_nondata_skb初始化sk_buff结构体
  • 初始化tp->retrans_stamp为tcp_time_stamp,记录发送syn分包的时间
  • 调用tcp_connect_queue_skb将skb放入缓存队列中
  • 调用tcp_transmit_skb封装sk_buff(填充TCP首部),并将sk_buff交给下层(网络层),发送第一个syn包
  • 调用inet_csk_reset_xmit_timer重启一个计时器,tcp_transmit_skb发送syn分包后开始计时
tcp_init_nondata_skb

tcp_init_nondata_skb主要是初始化skb,该函数将生成的序列号seq,flag(syn)复制到sk_buff->cb中

/*
 * Initialise the control block of a data-less segment: record @flags and
 * the sequence range [@seq, end_seq), where SYN and FIN each take up one
 * unit of sequence space.  The skb is marked as a single packet with
 * partial checksum.
 */
static void tcp_init_nondata_skb(struct sk_buff *skb, u32 seq, u8 flags)
{
	struct tcp_skb_cb *cb = TCP_SKB_CB(skb);
	u32 end = (flags & (TCPHDR_SYN | TCPHDR_FIN)) ? seq + 1 : seq;

	skb->ip_summed = CHECKSUM_PARTIAL;
	skb->csum = 0;

	cb->tcp_flags = flags;
	cb->sacked = 0;

	tcp_skb_pcount_set(skb, 1);

	cb->seq = seq;
	cb->end_seq = end;
}
tcp_connect_queue_skb

tcp_connect_queue_skb将skb放入了sk->sk_write_queue队列中,在传输数据时,会从该队列中取出发送。

sk->sk_wmem_queued记录了队列里skb的总大小。

/*
 * Append the SYN skb to the socket's write queue and account for it.
 * NOTE(review): @tp is declared but unused in this trimmed listing; the
 * statements that used it appear to have been elided.
 */
static void tcp_connect_queue_skb(struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct tcp_skb_cb *tcb = TCP_SKB_CB(skb);

	/* Extend the sequence range by the payload length. */
	tcb->end_seq += skb->len;
	__tcp_add_write_queue_tail(sk, skb);
	/* Account the skb's truesize against the queued write memory. */
	sk->sk_wmem_queued += skb->truesize;

}
/* Link @skb at the tail of sk->sk_write_queue. */
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
	__skb_queue_tail(&sk->sk_write_queue, skb);
}
tcp_transmit_skb

tcp_transmit_skb还是比较长的。。

tcp_transmit_skb
/*
 * Prepend a TCP header (plus options) to @skb and hand it to the address
 * family's queue_xmit hook.
 *
 * @clone_it: when non-zero, a clone (or a copy, if @skb is already cloned)
 *            is transmitted and the original stays with the caller.
 * @gfp_mask: allocation flags for that clone/copy.
 * @rcv_nxt:  value written into the header's ack_seq field.
 *
 * Returns -ENOBUFS if cloning fails, otherwise the (normalised) result of
 * queue_xmit.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet;
	struct tcp_sock *tp;
	struct tcp_skb_cb *tcb;
	struct tcp_out_options opts;
	unsigned int tcp_options_size, tcp_header_size;
	struct sk_buff *oskb = NULL;
	struct tcp_md5sig_key *md5;
	struct tcphdr *th;
	int err;

	BUG_ON(!skb || !tcp_skb_pcount(skb));
	tp = tcp_sk(sk);

	if (clone_it) {
		TCP_SKB_CB(skb)->tx.in_flight = TCP_SKB_CB(skb)->end_seq
			- tp->snd_una;
		/* Remember the original; from here on skb is the clone. */
		oskb = skb;
		if (unlikely(skb_cloned(skb)))
			skb = pskb_copy(skb, gfp_mask);
		else
			skb = skb_clone(skb, gfp_mask);
		if (unlikely(!skb))
			return -ENOBUFS;
	}
	skb_mstamp_get(&skb->skb_mstamp);

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	/* SYN segments use a different option builder than the rest. */
	if (unlikely(tcb->tcp_flags & TCPHDR_SYN))
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* if no packet is in qdisc/device queue, then allow XPS to select
	 * another queue. We can be called from tcp_tsq_handler()
	 * which holds one reference to sk_wmem_alloc.
	 *
	 * TODO: Ideally, in-flight pure ACK packets should not matter here.
	 * One way to get this would be to set skb->truesize = 2 on them.
	 */
	skb->ooo_okay = sk_wmem_alloc_get(sk) < SKB_TRUESIZE(1);

	/* Make room for the TCP header in front of the payload. */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);

	skb_orphan(skb);
	skb->sk = sk;
	skb->destructor = skb_is_tcp_pure_ack(skb) ? __sock_wfree : tcp_wfree;
	skb_set_hash_from_sk(skb, sk);
	atomic_add(skb->truesize, &sk->sk_wmem_alloc);

	/* Build TCP header and checksum it. */
	th = (struct tcphdr *)skb->data;
	th->source		= inet->inet_sport;
	th->dest		= inet->inet_dport;
	th->seq			= htonl(tcb->seq);
	th->ack_seq		= htonl(rcv_nxt);
	/* 16-bit word at byte offset 12: header length (in 32-bit words)
	 * in the top 4 bits, TCP flags in the low bits.
	 */
	*(((__be16 *)th) + 6)	= htons(((tcp_header_size >> 2) << 12) |
					tcb->tcp_flags);

	th->check		= 0;
	th->urg_ptr		= 0;

	/* The urg_mode check is necessary during a below snd_una win probe */
	if (unlikely(tcp_urg_mode(tp) && before(tcb->seq, tp->snd_up))) {
		if (before(tp->snd_up, tcb->seq + 0x10000)) {
			th->urg_ptr = htons(tp->snd_up - tcb->seq);
			th->urg = 1;
		} else if (after(tcb->seq + 0xFFFF, tp->snd_nxt)) {
			th->urg_ptr = htons(0xFFFF);
			th->urg = 1;
		}
	}

	tcp_options_write((__be32 *)(th + 1), tp, &opts);
	skb_shinfo(skb)->gso_type = sk->sk_gso_type;
	if (likely(!(tcb->tcp_flags & TCPHDR_SYN))) {
		th->window      = htons(tcp_select_window(sk));
		tcp_ecn_send(sk, skb, th, tcp_header_size);
	} else {
		/* RFC1323: The window in SYN & SYN/ACK segments
		 * is never scaled.
		 */
		th->window	= htons(min(tp->rcv_wnd, 65535U));
	}
#ifdef CONFIG_TCP_MD5SIG
	/* Calculate the MD5 hash, as we have all we need now */
	if (md5) {
		sk_nocaps_add(sk, NETIF_F_GSO_MASK);
		tp->af_specific->calc_md5_hash(opts.hash_location,
					       md5, sk, skb);
	}
#endif

	icsk->icsk_af_ops->send_check(sk, skb);

	if (likely(tcb->tcp_flags & TCPHDR_ACK))
		tcp_event_ack_sent(sk, tcp_skb_pcount(skb), rcv_nxt);

	/* skb carries payload beyond the header. */
	if (skb->len != tcp_header_size) {
		tcp_event_data_sent(tp, sk);
		tp->data_segs_out += tcp_skb_pcount(skb);
	}

	if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
		TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
			      tcp_skb_pcount(skb));

	tp->segs_out += tcp_skb_pcount(skb);
	/* OK, its time to fill skb_shinfo(skb)->gso_{segs|size} */
	skb_shinfo(skb)->gso_segs = tcp_skb_pcount(skb);
	skb_shinfo(skb)->gso_size = tcp_skb_mss(skb);

	/* Our usage of tstamp should remain private */
	skb->tstamp.tv64 = 0;

	/* Cleanup our debris for IP stacks */
	memset(skb->cb, 0, max(sizeof(struct inet_skb_parm),
			       sizeof(struct inet6_skb_parm)));

	/* Hand the segment to the network layer. */
	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

	if (unlikely(err > 0)) {
		tcp_enter_cwr(sk);
		err = net_xmit_eval(err);
	}
	/* On success, timestamp the original skb as well. */
	if (!err && oskb) {
		skb_mstamp_get(&oskb->skb_mstamp);
		tcp_rate_skb_sent(sk, oskb);
	}
	return err;
}

emmmm,简化完后一个是这么个流程...,有关sk_buff的在必要知识讲完了。

/*
 * Trimmed __tcp_transmit_skb(): size the header, push it in front of the
 * payload, fill in the basic fields, checksum it and pass the skb to the
 * network layer through queue_xmit.
 * Cloning, urgent data, window/ECN, MD5 and statistics are left out.
 *
 * Returns the result of queue_xmit.
 */
static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
			      int clone_it, gfp_t gfp_mask, u32 rcv_nxt)
{
	const struct inet_connection_sock *icsk = inet_csk(sk);
	unsigned int tcp_options_size, tcp_header_size;
	struct tcp_out_options opts;
	struct tcp_md5sig_key *md5;
	struct inet_sock *inet;
	struct tcp_skb_cb *tcb;
	struct tcphdr *th;
	int err;

	inet = inet_sk(sk);
	tcb = TCP_SKB_CB(skb);
	memset(&opts, 0, sizeof(opts));

	/* SYN segments use a different option builder than the rest. */
	if (tcb->tcp_flags & TCPHDR_SYN)
		tcp_options_size = tcp_syn_options(sk, skb, &opts, &md5);
	else
		tcp_options_size = tcp_established_options(sk, skb, &opts,
							   &md5);
	tcp_header_size = tcp_options_size + sizeof(struct tcphdr);

	/* Make room for the TCP header and fill in the basic fields. */
	skb_push(skb, tcp_header_size);
	skb_reset_transport_header(skb);
	th = (struct tcphdr *)skb->data;
	th->source	= inet->inet_sport;
	th->dest	= inet->inet_dport;
	th->seq		= htonl(tcb->seq);
	th->ack_seq	= htonl(rcv_nxt);

	icsk->icsk_af_ops->send_check(sk, skb);

	/* Step into the network layer. */
	err = icsk->icsk_af_ops->queue_xmit(sk, skb, &inet->cork.fl);

	return err;
}

关于icsk->icsk_af_ops->queue_xmit=ip_queue_xmit,向网络层步入

Tcp的重传

Accept

源函数如下

int accept(int sockfd, struct sockaddr *addr, socklen_t *addrlen);
  • sockfd:是一个已经通过 socket 函数创建并绑定到特定地址的监听套接字
  • addr:是一个指向 struct sockaddr 类型的指针,用于存储连接的对端地址信息。
  • addrlen:用于指定 addr 缓冲区的大小

accept的返回值是一个新的文件描述符(fd),它对应内核中新建的socket结构体;失败时返回-1。

系统调用如下

/* accept(2) is accept4(2) called with flags == 0. */
SYSCALL_DEFINE3(accept, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen)
{
	return sys_accept4(fd, upeer_sockaddr, upeer_addrlen, 0);
}

sys_accept4源码如下

sys_accept4
/*
 * accept4(2): take one connection from the listening socket @fd and return
 * a new file descriptor for it.
 *
 * @upeer_sockaddr/@upeer_addrlen: optional user buffer that receives the
 *                                 peer address.
 * @flags: only SOCK_CLOEXEC and SOCK_NONBLOCK are accepted.
 *
 * Returns the new fd, or a negative errno.
 */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, len, newfd, fput_needed;
	struct sockaddr_storage address;

	if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
		return -EINVAL;

	/* Translate SOCK_NONBLOCK to O_NONBLOCK where the values differ. */
	if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
		flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		goto out;

	err = -ENFILE;
	newsock = sock_alloc();
	if (!newsock)
		goto out_put;

	/* The new socket inherits type and ops from the listener. */
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/*
	 * We don't need try_module_get here, as the listening socket (sock)
	 * has the protocol module (sock->ops->owner) held.
	 */
	__module_get(newsock->ops->owner);

	newfd = get_unused_fd_flags(flags);
	if (unlikely(newfd < 0)) {
		err = newfd;
		sock_release(newsock);
		goto out_put;
	}
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);
	if (IS_ERR(newfile)) {
		err = PTR_ERR(newfile);
		put_unused_fd(newfd);
		sock_release(newsock);
		goto out_put;
	}

	err = security_socket_accept(sock, newsock);
	if (err)
		goto out_fd;

	/* Protocol-specific accept fills in newsock. */
	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0)
		goto out_fd;

	if (upeer_sockaddr) {
		if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
					  &len, 2) < 0) {
			err = -ECONNABORTED;
			goto out_fd;
		}
		err = move_addr_to_user(&address,
					len, upeer_sockaddr, upeer_addrlen);
		if (err < 0)
			goto out_fd;
	}

	/* File flags are not inherited via accept() unlike another OSes. */

	/* Publish the file under the reserved descriptor. */
	fd_install(newfd, newfile);
	err = newfd;

out_put:
	fput_light(sock->file, fput_needed);
out:
	return err;
out_fd:
	/* Undo the file and the reserved descriptor. */
	fput(newfile);
	put_unused_fd(newfd);
	goto out_put;
}

简化后

/*
 * Trimmed accept4(): look up the listening socket, allocate a new socket
 * that inherits its type/ops, reserve a descriptor and a file for it, let
 * the protocol's accept hook fill it in and publish the descriptor.
 * Flag validation, security hooks and copying the peer address to user
 * space are left out.
 *
 * Returns the new fd, or a negative errno.
 */
SYSCALL_DEFINE4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
		int __user *, upeer_addrlen, int, flags)
{
	struct socket *sock, *newsock;
	struct file *newfile;
	int err, newfd, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;
	newsock = sock_alloc();

	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/* Reserve the descriptor that will be returned to user space. */
	newfd = get_unused_fd_flags(flags);
	newfile = sock_alloc_file(newsock, flags, sock->sk->sk_prot_creator->name);

	err = sock->ops->accept(sock, newsock, sock->file->f_flags);
	if (err < 0) {
		/* Undo the file and the reserved descriptor. */
		fput(newfile);
		put_unused_fd(newfd);
	} else {
		fd_install(newfd, newfile);
		err = newfd;
	}

	fput_light(sock->file, fput_needed);
	return err;
}
  • sockfd_lookup_light寻找对应fd的socket结构体
  • sock_alloc创建一个新的socket结构体
  • 将newsock的type和ops都赋值为fd对应socket的type和ops
  • sock_alloc_file创建一个新的文件
  • 调用sock->ops->accept初始化newsock
  • fd_install将新的文件描述符newfd和新的文件newfile建立联系(socket结构体与文件的联系已经在sock_alloc_file中建立)

inet_accept

inet_accept的主要逻辑如下

int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1 = sock->sk;
	struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err);
	sock_graft(sk2, newsock);
	newsock->state = SS_CONNECTED;
	err = 0;
}
  • 调用sk1->sk_prot->accept得到一个sock结构体
  • sock_graft将sock结构体和socket结构体建立联系

inet_csk_accept

struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct request_sock_queue *queue = &icsk->icsk_accept_queue;
	struct request_sock *req;
	struct sock *newsk;
	int error;

	if (sk->sk_state != TCP_LISTEN)
		goto out_err;
	if (reqsk_queue_empty(queue)) {
		long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
		if (!timeo)
			goto out_err;
		error = inet_csk_wait_for_connect(sk, timeo);
		if (error)
			goto out_err;
	}
	req = reqsk_queue_remove(queue, sk);
	newsk = req->sk;

	return newsk;

}
  • 判断icsk_accept_queue是否为空,若为空则进入if语句,调用sock_rcvtimeo判断当前sock是否为阻塞,若为阻塞,则执行inet_csk_wait_for_connect,若不阻塞则直接返回
  • 若不为空,则从队列中取出一个sock并返回

inet_csk_wait_for_connect

可以看到inet_csk_wait_for_connect里面是一个死循环,并调用了schedule_timeout判断是否超时

也调用reqsk_queue_empty判断队列是否为空

/*
 * Sleep until the listener's accept queue is non-empty, the socket leaves
 * TCP_LISTEN, a signal is pending, or @timeo runs out.
 *
 * The socket lock is released around the sleep and re-taken afterwards.
 *
 * Returns 0 when a connection is available, -EINVAL if the socket is no
 * longer listening, sock_intr_errno(timeo) on a pending signal, or -EAGAIN
 * on timeout.
 */
static int inet_csk_wait_for_connect(struct sock *sk, long timeo)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	DEFINE_WAIT(wait);
	int err;

	for (;;) {
		prepare_to_wait_exclusive(sk_sleep(sk), &wait,
					  TASK_INTERRUPTIBLE);
		release_sock(sk);
		/* Sleep only if nothing arrived in the meantime; the
		 * remaining time is written back to timeo.
		 */
		if (reqsk_queue_empty(&icsk->icsk_accept_queue))
			timeo = schedule_timeout(timeo);
		sched_annotate_sleep();
		lock_sock(sk);
		/* err is preset before each exit condition is tested. */
		err = 0;
		if (!reqsk_queue_empty(&icsk->icsk_accept_queue))
			break;
		err = -EINVAL;
		if (sk->sk_state != TCP_LISTEN)
			break;
		err = sock_intr_errno(timeo);
		if (signal_pending(current))
			break;
		err = -EAGAIN;
		if (!timeo)
			break;
	}
	finish_wait(sk_sleep(sk), &wait);
	return err;
}
schedule_timeout
/*
 * Trimmed schedule_timeout(): give up the CPU and report how many jiffies
 * of @timeout are left afterwards (never negative).
 *
 * NOTE(review): @timer is declared but never armed in this trimmed
 * listing; the code that wakes the task on expiry appears to be elided.
 */
signed long __sched schedule_timeout(signed long timeout)
{
	struct timer_list timer;
	unsigned long expire;

	switch (timeout)
	{
	case MAX_SCHEDULE_TIMEOUT:
		/* No deadline: just schedule away. */
		schedule();
		goto out;
	default:
		/* A negative timeout is a caller bug: dump the stack and
		 * return without sleeping.
		 */
		if (timeout < 0) {
			dump_stack();
			current->state = TASK_RUNNING;
			goto out;
		}
	}
	/* Absolute expiry time in jiffies. */
	expire = timeout + jiffies;
	schedule();
	/* Remaining time after waking up. */
	timeout = expire - jiffies;

 out:
	return timeout < 0 ? 0 : timeout;
}
  • jiffies是内核用于记录时间的滴答数,switch什么都不执行,计算expire得出超时的时间,并且在最后计算出剩余的时间timeout 并返回

timeo每次执行schedule_timeout都会减少,直到变为0(schedule_timeout的返回值不会小于0),此时inet_csk_wait_for_connect的死循环以err = -EAGAIN退出;当icsk_accept_queue队列不为空时也会退出,但此时err为0,不会执行inet_csk_accept的goto out_err。至于什么时候放sock到icsk_accept_queue队列中后面会讲到。

接收syn包

当一个syn包通过网络传到本机网卡,网卡会调用函数将其解封,在处理传输层的数据时会调用tcp_v4_rcv函数处理。

tcp_v4_rcv

我们主要探讨在listen状态下的sock接受到syn包,故简化成以下状态

/*
 * Trimmed tcp_v4_rcv(), reduced to the case discussed here: a segment that
 * arrives for a socket in TCP_LISTEN.
 * Header validation, checksum, the no-socket path and all other socket
 * states are left out.
 *
 * Returns the result of tcp_v4_do_rcv() for a listening socket, else 0.
 */
int tcp_v4_rcv(struct sk_buff *skb)
{
	const struct tcphdr *th;
	bool refcounted;
	struct sock *sk;
	int ret = 0;

	/* skb->data already points at the TCP header at this layer. */
	th = (const struct tcphdr *)skb->data;

	sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
			       th->source, th->dest, &refcounted);
	if (!sk)
		return 0;	/* no matching socket: handling elided */

	if (sk->sk_state == TCP_LISTEN) {
		ret = tcp_v4_do_rcv(sk, skb);
		goto put_and_return;
	}

put_and_return:
	/* Drop the reference only if the lookup took one. */
	if (refcounted)
		sock_put(sk);
	return ret;
}
  • 在网络数据包沿着各层拆解完成后,各层会调用skb_pull将sk_buff->data变成指向下一层的指针,故当数据包到达tcp_v4_rcv时,sk_buff->data指向tcp的协议数据,故可以th = (const struct tcphdr *)skb->data;获取tcp的协议数据
  • 调用__inet_lookup_skb用skb中的数据在hashbucket中寻找符合条件的sock并返回
  • 调用tcp_v4_do_rcv

六级标题有bug,没七级标题了(悲),后面每个函数的递归就按照顺序来写吧。

__inet_lookup_skb

__inet_lookup_skb最终会调用下面的__inet_lookup。可以看到,它先在ehash(已建立连接的hash表)中寻找,找不到再到listening_hash(监听hash表)中寻找

/*
 * Find the socket for an incoming segment: first among established
 * sockets, then among listeners.
 *
 * *@refcounted is set to true when the result comes from the established
 * lookup and to false when the listener lookup is used.
 */
static inline struct sock *__inet_lookup(struct net *net,
					 struct inet_hashinfo *hashinfo,
					 struct sk_buff *skb, int doff,
					 const __be32 saddr, const __be16 sport,
					 const __be32 daddr, const __be16 dport,
					 const int dif, bool *refcounted)
{
	u16 hnum = ntohs(dport);
	struct sock *match;

	match = __inet_lookup_established(net, hashinfo, saddr, sport,
					  daddr, hnum, dif);
	if (match) {
		*refcounted = true;
		return match;
	}

	*refcounted = false;
	return __inet_lookup_listener(net, hashinfo, skb, doff, saddr,
				      sport, daddr, hnum, dif);
}
__inet_lookup_established

逻辑很简单,就是拿sk_buff中记录的四元组来计算hash找到对应的hash头节点,并遍历该链表找到四元组hash相同的sock,至于sk->sk_hash,早在inet_ehash_insert函数将sock插入ehash已经赋值
sk->sk_hash = sk_ehashfn(sk);

/*
 * Trimmed __inet_lookup_established(): hash the 4-tuple, pick the bucket
 * and return the first socket on its chain whose sk_hash equals that hash.
 *
 * This version compares only the hash value; 4-tuple verification and
 * reference counting are left out.
 *
 * Returns the matching sock, or NULL when the chain holds no match.
 */
struct sock *__inet_lookup_established(struct net *net,
				  struct inet_hashinfo *hashinfo,
				  const __be32 saddr, const __be16 sport,
				  const __be32 daddr, const u16 hnum,
				  const int dif)
{
	unsigned int hash = inet_ehashfn(net, daddr, hnum, saddr, sport);
	unsigned int slot = hash & hashinfo->ehash_mask;
	struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
	const struct hlist_nulls_node *node;
	struct sock *sk;

	sk_nulls_for_each_rcu(sk, node, &head->chain) {
		if (sk->sk_hash == hash)
			return sk;
	}
	/* After the loop sk no longer points at a real socket. */
	return NULL;
}
__inet_lookup_listener

至于__inet_lookup_listener,原理上和__inet_lookup_established相同,不过引入了评分机制:对该hash桶内的每个sock都会计算一个评分,最后返回评分最高的sock

/*
 * Find the best listening socket for a segment addressed to @daddr:@hnum.
 *
 * Every socket in the listening bucket is scored with compute_score(); the
 * highest scorer wins.  For SO_REUSEPORT sockets, reuseport_select_sock()
 * is tried first and ties are broken pseudo-randomly from a hash of the
 * 4-tuple.
 *
 * Returns the chosen sock, or NULL if no socket scores above 0.
 */
struct sock *__inet_lookup_listener(struct net *net,
				    struct inet_hashinfo *hashinfo,
				    struct sk_buff *skb, int doff,
				    const __be32 saddr, __be16 sport,
				    const __be32 daddr, const unsigned short hnum,
				    const int dif)
{
	unsigned int hash = inet_lhashfn(net, hnum);
	struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
	int score, hiscore = 0, matches = 0, reuseport = 0;
	bool exact_dif = inet_exact_dif_match(net, skb);
	struct sock *sk, *result = NULL;
	struct hlist_nulls_node *node;
	u32 phash = 0;

	sk_nulls_for_each_rcu(sk, node, &ilb->nulls_head) {
		score = compute_score(sk, net, hnum, daddr, dif, exact_dif);
		if (score > hiscore) {
			/* New best candidate. */
			reuseport = sk->sk_reuseport;
			if (reuseport) {
				phash = inet_ehashfn(net, daddr, hnum,saddr, sport);
				result = reuseport_select_sock(sk, phash,skb, doff);
				if (result)
					return result;
				matches = 1;
			}
			result = sk;
			hiscore = score;
		} else if (score == hiscore && reuseport) {
			/* Equal score within a reuseport set: pick among the
			 * candidates using phash.
			 */
			matches++;
			if (reciprocal_scale(phash, matches) == 0)
				result = sk;
			phash = next_pseudo_random32(phash);
		}
	}
	return result;
}
tcp_v4_do_rcv

tcp_v4_do_rcv前面的函数都没做什么主要看tcp_rcv_state_process

/*
 * Trimmed tcp_v4_do_rcv(): run the TCP state machine for @skb on @sk.
 * When tcp_rcv_state_process() returns non-zero, a reset is sent and the
 * skb is freed.  Always returns 0.
 */
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;

	if (tcp_rcv_state_process(sk, skb)) {
		rsk = sk;
		goto reset;
	}
	return 0;

reset:
	tcp_v4_send_reset(rsk, skb);
	kfree_skb(skb);
	return 0;
}
tcp_rcv_state_process

tcp_rcv_state_process会根据sock的状态(sk->sk_state)来做出不同的行为,当前sock是listen状态且当前收到的是syn包时,会执行icsk->icsk_af_ops->conn_request(IPv4下为tcp_v4_conn_request),它最终会带上rsk_ops和af_ops两个参数调用tcp_conn_request

/*
 * Trimmed tcp_rcv_state_process(), reduced to TCP_LISTEN: a SYN is passed
 * to the address family's conn_request hook.
 *
 * Returns 1 when the hook rejects the SYN (the caller then sends a reset),
 * otherwise 0.
 */
int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	const struct tcphdr *th = tcp_hdr(skb);
	bool acceptable;

	switch (sk->sk_state) {
	case TCP_LISTEN:
		if (th->syn) {
			acceptable = icsk->icsk_af_ops->conn_request(sk, skb) >= 0;
			if (!acceptable)
				return 1;
			consume_skb(skb);
			return 0;
		}
		break;
	default:
		/* Other states are not covered here. */
		break;
	}
	return 0;
}
tcp_conn_request

tcp_conn_request代码很长,挑重要的来看

tcp_conn_request
/*
 * Handle an incoming SYN on listening socket @sk: allocate a request_sock,
 * parse the TCP options, choose the initial sequence number and answer
 * with a SYN-ACK, optionally through syncookies or Fast Open.
 *
 * @rsk_ops: request_sock operations used to allocate the request.
 * @af_ops:  address-family specific helpers (init_req, route_req,
 *           init_seq, send_synack, ...).
 *
 * Always returns 0; a rejected SYN is counted through tcp_listendrop().
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,
		     const struct tcp_request_sock_ops *af_ops,
		     struct sock *sk, struct sk_buff *skb)
{
	struct tcp_fastopen_cookie foc = { .len = -1 };
	__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
	struct tcp_options_received tmp_opt;
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct sock *fastopen_sk = NULL;
	struct dst_entry *dst = NULL;
	struct request_sock *req;
	bool want_cookie = false;
	struct flowi fl;

	/* TW buckets are converted to open requests without
	 * limitations, they conserve resources and peer is
	 * evidently real one.
	 */
	if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
	     inet_csk_reqsk_queue_is_full(sk)) && !isn) {
		want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
		if (!want_cookie)
			goto drop;
	}

	/* Accept queue is full: drop the SYN. */
	if (sk_acceptq_is_full(sk)) {
		NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
		goto drop;
	}

	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);
	if (!req)
		goto drop;

	tcp_rsk(req)->af_specific = af_ops;

	tcp_clear_options(&tmp_opt);
	tmp_opt.mss_clamp = af_ops->mss_clamp;
	tmp_opt.user_mss  = tp->rx_opt.user_mss;
	tcp_parse_options(skb, &tmp_opt, 0, want_cookie ? NULL : &foc);

	if (want_cookie && !tmp_opt.saw_tstamp)
		tcp_clear_options(&tmp_opt);

	tmp_opt.tstamp_ok = tmp_opt.saw_tstamp;
	tcp_openreq_init(req, &tmp_opt, skb, sk);
	inet_rsk(req)->no_srccheck = inet_sk(sk)->transparent;

	/* Note: tcp_v6_init_req() might override ir_iif for link locals */
	inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);

	af_ops->init_req(req, sk, skb);

	if (security_inet_conn_request(sk, skb, req))
		goto drop_and_free;

	if (!want_cookie && !isn) {
		/* VJ's idea. We save last timestamp seen
		 * from the destination in peer table, when entering
		 * state TIME-WAIT, and check against it before
		 * accepting new connection request.
		 *
		 * If "isn" is not zero, this request hit alive
		 * timewait bucket, so that all the necessary checks
		 * are made in the function processing timewait state.
		 */
		if (tcp_death_row.sysctl_tw_recycle) {
			bool strict;

			dst = af_ops->route_req(sk, &fl, req, &strict);

			if (dst && strict &&
			    !tcp_peer_is_proven(req, dst, true,
						tmp_opt.saw_tstamp)) {
				NET_INC_STATS(sock_net(sk), LINUX_MIB_PAWSPASSIVEREJECTED);
				goto drop_and_release;
			}
		}
		/* Kill the following clause, if you dislike this way. */
		else if (!net->ipv4.sysctl_tcp_syncookies &&
			 (sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
			  (sysctl_max_syn_backlog >> 2)) &&
			 !tcp_peer_is_proven(req, dst, false,
					     tmp_opt.saw_tstamp)) {
			/* Without syncookies last quarter of
			 * backlog is filled with destinations,
			 * proven to be alive.
			 * It means that we continue to communicate
			 * to destinations, already remembered
			 * to the moment of synflood.
			 */
			pr_drop_req(req, ntohs(tcp_hdr(skb)->source),
				    rsk_ops->family);
			goto drop_and_release;
		}

		isn = af_ops->init_seq(skb);
	}
	/* Route the reply if that has not happened above. */
	if (!dst) {
		dst = af_ops->route_req(sk, &fl, req, NULL);
		if (!dst)
			goto drop_and_free;
	}

	tcp_ecn_create_request(req, skb, sk, dst);

	if (want_cookie) {
		isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
		req->cookie_ts = tmp_opt.tstamp_ok;
		if (!tmp_opt.tstamp_ok)
			inet_rsk(req)->ecn_ok = 0;
	}

	tcp_rsk(req)->snt_isn = isn;
	tcp_rsk(req)->txhash = net_tx_rndhash();
	tcp_openreq_init_rwin(req, sk, dst);
	if (!want_cookie) {
		tcp_reqsk_record_syn(sk, req, skb);
		fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
	}
	if (fastopen_sk) {
		af_ops->send_synack(fastopen_sk, dst, &fl, req,
				    &foc, TCP_SYNACK_FASTOPEN);
		/* Add the child socket directly into the accept queue */
		if (!inet_csk_reqsk_queue_add(sk, req, fastopen_sk)) {
			reqsk_fastopen_remove(fastopen_sk, req, false);
			bh_unlock_sock(fastopen_sk);
			sock_put(fastopen_sk);
			reqsk_put(req);
			goto drop;
		}
		sk->sk_data_ready(sk);
		bh_unlock_sock(fastopen_sk);
		sock_put(fastopen_sk);
	} else {
		tcp_rsk(req)->tfo_listener = false;
		/* Normal path: hash the request with the initial SYN-ACK
		 * timeout, then send the SYN-ACK.
		 */
		if (!want_cookie)
			inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
		af_ops->send_synack(sk, dst, &fl, req, &foc,
				    !want_cookie ? TCP_SYNACK_NORMAL :
						   TCP_SYNACK_COOKIE);
		/* With syncookies the request is freed right after the
		 * SYN-ACK is sent.
		 */
		if (want_cookie) {
			reqsk_free(req);
			return 0;
		}
	}
	reqsk_put(req);
	return 0;

drop_and_release:
	dst_release(dst);
drop_and_free:
	reqsk_free(req);
drop:
	tcp_listendrop(sk);
	return 0;
}

在这里把一些我们不讨论的对结构体初始化的步骤删掉了,不考虑fastopen和其他错误信息

/*
 * Outline of tcp_conn_request() for the plain (no syncookie, no Fast Open)
 * case.  This is illustrative pseudo-code: declarations of want_cookie and
 * tmp_opt, the send_synack() arguments and all error handling are elided.
 */
int tcp_conn_request(struct request_sock_ops *rsk_ops,const struct tcp_request_sock_ops *af_ops,struct sock *sk, struct sk_buff *skb)
{
	struct tcp_sock *tp = tcp_sk(sk);
	struct net *net = sock_net(sk);
	struct request_sock *req;

	/* 1. Allocate the request_sock. */
	req = inet_reqsk_alloc(rsk_ops, sk, !want_cookie);

	tcp_rsk(req)->af_specific = af_ops;

	/* 2. Initialise it from the SYN. */
	tcp_openreq_init(req, &tmp_opt, skb, sk);

	/* 3. Address-family specific initialisation. */
	af_ops->init_req(req, sk, skb);

	/* 4. Hash the request, then answer with a SYN-ACK. */
	inet_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT);
	af_ops->send_synack(.........);
}
  • inet_reqsk_alloc创建一个request_sock 结构体。
  • tcp_openreq_init用sk和skb来初始化刚才创建的req结构体
  • af_ops->init_req用于初始化req的ip地址
  • inet_csk_reqsk_queue_hash_add用于将req结构体放入ehash中并放入半连接队列中
  • 调用af_ops->send_synack向对方发送syn+ack报文
inet_reqsk_alloc

inet_reqsk_alloc调用reqsk_alloc alloc一片空间后进行了一些赋值并初始化。

/*
 * Allocate a request_sock through reqsk_alloc() and fill in the inet
 * fields: state TCP_NEW_SYN_RECV plus the listener's network namespace and
 * address family.  Returns NULL if the allocation fails.
 */
struct request_sock *inet_reqsk_alloc(const struct request_sock_ops *ops,struct sock *sk_listener,bool attach_listener)
{
	struct inet_request_sock *ireq;
	struct request_sock *req;

	req = reqsk_alloc(ops, sk_listener, attach_listener);
	if (!req)
		return NULL;

	ireq = inet_rsk(req);
	ireq->ireq_state = TCP_NEW_SYN_RECV;
	write_pnet(&ireq->ireq_net, sock_net(sk_listener));
	ireq->ireq_family = sk_listener->sk_family;

	return req;
}

/*
 * Trimmed reqsk_alloc(): take a request_sock from the slab cache of @ops
 * and give it minimal initial state.
 *
 * @attach_listener is not consulted in this trimmed version; rsk_listener
 * is always left NULL.
 *
 * Returns the new request_sock, or NULL if the allocation fails.
 */
static inline struct request_sock *
reqsk_alloc(const struct request_sock_ops *ops, struct sock *sk_listener,
	    bool attach_listener)
{
	struct request_sock *req;

	req = kmem_cache_alloc(ops->slab, GFP_ATOMIC | __GFP_NOWARN);
	if (!req)
		return NULL;

	req->rsk_listener = NULL;
	req->rsk_ops = ops;
	req_to_sk(req)->sk_prot = sk_listener->sk_prot;
	req->saved_syn = NULL;
	return req;
}
tcp_openreq_init

对req的进一步初始化,包括将源端口目的端口赋值到ireq->ir_rmt_port和ireq->ir_num

static void tcp_openreq_init(struct request_sock *req,
			     const struct tcp_options_received *rx_opt,
			     struct sk_buff *skb, const struct sock *sk)
{
	struct inet_request_sock *ireq = inet_rsk(req);

	req->rsk_rcv_wnd = 0;		/* So that tcp_send_synack() knows! */
	req->cookie_ts = 0;
	tcp_rsk(req)->rcv_isn = TCP_SKB_CB(skb)->seq;
	tcp_rsk(req)->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
	skb_mstamp_get(&tcp_rsk(req)->snt_synack);
	tcp_rsk(req)->last_oow_ack_time = 0;
	req->mss = rx_opt->mss_clamp;
	req->ts_recent = rx_opt->saw_tstamp ? rx_opt->rcv_tsval : 0;
	ireq->tstamp_ok = rx_opt->tstamp_ok;
	ireq->sack_ok = rx_opt->sack_ok;
	ireq->snd_wscale = rx_opt->snd_wscale;
	ireq->wscale_ok = rx_opt->wscale_ok;
	ireq->acked = 0;
	ireq->ecn_ok = 0;
	ireq->ir_rmt_port = tcp_hdr(skb)->source;
	ireq->ir_num = ntohs(tcp_hdr(skb)->dest);
	ireq->ir_mark = inet_request_mark(sk, skb);
}
af_ops->init_req

设置req的源目的ip地址

/*
 * Record the IPv4 addresses of a connection request: the packet's
 * destination address is stored through sk_rcv_saddr_set() and the packet's
 * source address through sk_daddr_set().
 *
 * @req:         request sock whose addresses are set
 * @sk_listener: not used in this excerpt
 * @skb:         the received SYN segment
 */
static void tcp_v4_init_req(struct request_sock *req,
			    const struct sock *sk_listener,
			    struct sk_buff *skb)
{
	/* The unused inet_request_sock local from the excerpt was dropped. */
	sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
	sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
}

关于抵御synflood的思路

6.17,最近在复习考研,没学啥新技术,搞了好久终于把课设搞好了,调试内核有点麻烦,有符号表的情况下不能单步调试,还要考虑多CPU多核,锁,RCU的使用,还有考虑使用内核结构体,内核版本代码标准,写了三四百行,写的不是很好还有优化的空间,这些就留给研究生的我吧。

synflood往往会伪造源IP进行攻击,每个伪造的源IP只发送一次或少数几次syn。如果我们把每个源IP的第一个syn丢弃并记录该IP,伪造的源IP通常不会重传,而正常客户端会重传syn并在重传时被放行,这样就能以降低首次连接成功率为代价过滤掉大部分伪造流量,从而提升服务的可用性。如果攻击者不伪造源IP,则可以记录该IP的syn发包速率,并把它限制在10个syn包/s。

关于数据结构的选择

布隆过滤器

关于第一次syn包的记录,有什么东西是能够记录元素是否被记录,并且内存消耗还少的呢,有的喵

布隆过滤器(Bloom Filter)是1970年由布隆提出的。它实际上是一个很长的二进制向量和一系列随机映射函数。布隆过滤器可以用于检索一个元素是否在一个集合中。它的优点是空间效率和查询时间都比一般的算法要好得多,缺点是有一定的误识别率和删除困难。

我们可以创建一个位图(bitmap),相当于一连串全0的二进制位(000000000000000),对于一个IP,它存储在内存中是一个u32的类型,我们对它进行多次hash计算得到多个hash值,这些hash值我们对它取余,并在位图上将对应index的值置为1,如果我们要对其进行验证,只需要再次将IP进行相同hash操作,并在位图上验证是否都为1,若都为1则代表可能存在,若有一位不为1,则代表肯定不存在

虽然对于百万甚至千万的流量来说有一定误报率,但是对于其本身大小以及实现难易度,可以说利大于弊。

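我选择了4MB大小的位图,5个hash函数,在百万流量的情况下可以有0.005%的误报率。需要注意:4MB对应的是 1 << 25 位;而下面代码中的 BLOOM_FILTER_BITS 是 1 << 22 位(4Mbit,约512KB),按同样的公式计算,在记录一百万个IP时误报率约为16%。如果要达到0.005%的指标,需要把位数调整为 1 << 25。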

/* Number of bits in the Bloom filter bitmap: 1 << 22 = 4,194,304 bits (512 KiB). */
#define BLOOM_FILTER_BITS   (1 << 22)  
/* Number of hash functions (one seed each) applied to every IP. */
#define BLOOM_FUNCS 5

关于hash函数,我选择了内核提供的jhash函数进行,该函数可以传入种子对同一IP生成不同的hash值,并且保证在位图上分布均匀

/* One jhash seed per Bloom hash function; calu_hash() passes seeds[i] to bloom_hash(). */
size_t seeds[BLOOM_FUNCS] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0, 0xfdb97531};
/*
 * Hash one IPv4 address with jhash() using the given seed and reduce the
 * result modulo bf->size, yielding a bit index into the Bloom bitmap.
 */
static uint32_t bloom_hash(uint32_t ip, uint32_t seed)
{
    uint32_t digest = jhash(&ip, sizeof(ip), seed);

    return digest % bf->size;
}
/*
 * Compute the BLOOM_FUNCS bitmap indexes for one IP.
 *
 * @ip:  IPv4 address; only the low 32 bits are hashed
 * @arr: output array, receives one index per seed in seeds[]
 *
 * bloom_hash() already reduces its result modulo bf->size, so the value is
 * stored as-is instead of being reduced a second time.
 */
void calu_hash(size_t ip,uint32_t arr[BLOOM_FUNCS])
{
	int i;

	for (i = 0; i < BLOOM_FUNCS; i++)
		arr[i] = bloom_hash((uint32_t)ip, (uint32_t)seeds[i]);
}

布隆过滤器的数据结构如下,成员不是很多(对比于一些内核结构体。。。。)

/*
 * Traffic classes chosen by check_traffic() from the measured rate.
 * change_state() maps each class to a Bloom-filter clear interval.
 */
enum traffic_level {
    TRAFFIC_LOW,     /* clear interval 30000 ms */
    TRAFFIC_NORMAL,  /* clear interval 10000 ms */
    TRAFFIC_HIGH, /* clear interval 3000 ms */
    TRAFFIC_ATTACK   /* clear interval 1000 ms */
};
/* State of the Bloom filter that remembers which source IPs were already seen. */
struct bloom_filter {
    /* Bit array; bits are set with set_bit() and queried with test_bit(). */
    unsigned long *bitmap;
    /* IPs recorded since the last clear; check_traffic() derives the rate from it. */
    atomic_t  count;
    /* Number of bits in bitmap; hash values are reduced modulo this. */
    uint32_t size;
    /* Number of hash functions; set to BLOOM_FUNCS by init_bloom(). */
    u_int8_t funcs;
    /* Timer that periodically runs clear_bloom(). */
    struct timer_list timer;
    /* Spinlock taken around bitmap reads, writes and clearing. */
    spinlock_t my_lock;
    /* Level chosen by the latest check_traffic() run. */
    enum traffic_level newlevel;
    /* Level last applied by change_state(); the interval changes only when the two differ. */
    enum traffic_level oldlevel;
};
  • bitmap:布隆过滤器的位图
  • count:计时器中记录IP的数量,用于计算当前流量速率
  • size:大小,用于对hash值取余操作
  • funcs:使用hash函数的数量
  • timer:计时器,用于在不同状况下定时清空布隆过滤器
  • my_lock:自旋锁,防止条件竞争
  • newlevel,oldlevel:新旧状态,只有在这两个状态不一样的情况下改变计时器的定时
初始化函数
/*
 * Allocate and initialise the global Bloom filter `bf` and arm its clear
 * timer.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails. On failure `bf`
 * is reset to NULL so that no dangling pointer to freed memory is left
 * behind.
 */
int init_bloom(void)
{
    size_t longs = BITS_TO_LONGS(BLOOM_FILTER_BITS);

    bf = kmalloc(sizeof(*bf), GFP_KERNEL);
    if (!bf)
        return -ENOMEM;

    /* kcalloc zeroes the array and checks the count * size multiplication. */
    bf->bitmap = kcalloc(longs, sizeof(unsigned long), GFP_KERNEL);
    if (!bf->bitmap) {
        kfree(bf);
        bf = NULL;
        return -ENOMEM;
    }

    spin_lock_init(&bf->my_lock);
    /* Keep the interval used for rate computation equal to the first timer period. */
    atomic_set(&clear_time_ms, CLEAR_INTERVAL_MS);
    bf->size = BLOOM_FILTER_BITS;
    bf->funcs = BLOOM_FUNCS;
    atomic_set(&bf->count, 0);
    bf->newlevel = TRAFFIC_LOW;
    bf->oldlevel = TRAFFIC_LOW;

    setup_timer(&bf->timer, clear_bloom, (unsigned long)bf);
    mod_timer(&bf->timer, jiffies + msecs_to_jiffies(CLEAR_INTERVAL_MS));
    return 0;
}
记录和验证函数

用内核的set_bit和test_bit函数可以实现原子操作

/*
 * Mark an IP as seen: compute its BLOOM_FUNCS bit indexes and set each of
 * them in bf->bitmap while holding bf->my_lock.
 */
void record_ip_into_bloom(uint32_t ip)
{
	uint32_t bit_idx[BLOOM_FUNCS] = {0};
	unsigned long irq_flags;
	int k = 0;

	calu_hash(ip, bit_idx);

	spin_lock_irqsave(&bf->my_lock, irq_flags);
	while (k < BLOOM_FUNCS) {
		set_bit(bit_idx[k], bf->bitmap);
		k++;
	}
	spin_unlock_irqrestore(&bf->my_lock, irq_flags);
}

/*
 * Test whether an IP may already be recorded in the Bloom filter.
 *
 * Returns 0 when every bit selected by the hash functions is set (the IP
 * was possibly seen before) and -1 as soon as one bit is clear (the IP was
 * definitely not recorded).
 */
int bloom_check(uint32_t ip)
{
	uint32_t bloom_arr[BLOOM_FUNCS] = {0};
	unsigned long flags;
	int ret = 0;
	int a;

	calu_hash(ip, bloom_arr);

	spin_lock_irqsave(&bf->my_lock, flags);
	for (a = 0; a < BLOOM_FUNCS; a++) {
		if (!test_bit(bloom_arr[a], bf->bitmap)) {
			ret = -1;
			break;
		}
	}
	/* Single unlock on the one exit path. */
	spin_unlock_irqrestore(&bf->my_lock, flags);
	return ret;
}
布隆过滤器的状态转化
/*
 * Apply a traffic-level change: when bf->newlevel differs from
 * bf->oldlevel, store the clear interval that belongs to the new level in
 * clear_time_ms and remember the level as applied. Does nothing when the
 * level is unchanged.
 */
void change_state(void)
{
	if (bf->newlevel == bf->oldlevel)
		return;

	if (bf->newlevel == TRAFFIC_LOW)
		atomic_set(&clear_time_ms, 30000);
	else if (bf->newlevel == TRAFFIC_NORMAL)
		atomic_set(&clear_time_ms, 10000);
	else if (bf->newlevel == TRAFFIC_HIGH)
		atomic_set(&clear_time_ms, 3000);
	else if (bf->newlevel == TRAFFIC_ATTACK)
		atomic_set(&clear_time_ms, 1000);

	bf->oldlevel = bf->newlevel;
}
/*
 * Classify the current traffic and apply the result through change_state().
 *
 * The rate is the number of IPs recorded in the current period (bf->count)
 * divided by the period length in whole seconds (clear_time_ms / 1000).
 *
 * The ranges are contiguous so every rate maps to exactly one level:
 *   rate < 10      -> TRAFFIC_LOW
 *   rate < 1000    -> TRAFFIC_NORMAL
 *   rate < 10000   -> TRAFFIC_HIGH
 *   otherwise      -> TRAFFIC_ATTACK
 * The earlier open-interval checks left rates in [10, 100] and the exact
 * values 1000 and 10000 unclassified, so a stale level was kept.
 */
void check_traffic(void)
{
	int interval_s = atomic_read(&clear_time_ms) / 1000;
	size_t rate;

	/* A sub-second interval would truncate to 0; avoid dividing by zero. */
	if (interval_s < 1)
		interval_s = 1;
	rate = atomic_read(&bf->count) / interval_s;

	if (rate < 10)
		bf->newlevel = TRAFFIC_LOW;
	else if (rate < 1000)
		bf->newlevel = TRAFFIC_NORMAL;
	else if (rate < 10000)
		bf->newlevel = TRAFFIC_HIGH;
	else
		bf->newlevel = TRAFFIC_ATTACK;

	change_state();
}
  • 每当倒计时结束后,用 count/(clear_time_ms/1000)(即该周期内平均每秒新记录的IP数)来得到当前网络流量速率,并据此更新下一次清空过滤器的时间
  • 只有当bf->newlevel!=bf->oldlevel才进行clear_time_ms的更新

哈希桶

受到tcp_hashinfo的启发,对于同一IP的synflood来说我们可以构造一个hash bucket记录当前IP的发包速率,并且链入hash表。

/* Sliding window of recent SYN arrival times for one source IP. */
struct syn_window{
    /* Arrival times in jiffies, oldest first; only the first `count` entries are valid. */
    unsigned long timestamps[WINDOW_SIZE]; 
    /* Number of valid entries in timestamps. */
    atomic_t count; 
};

/* Per-source-IP rate-limiting record, linked into one chain of ip_hash_buckets. */
struct ip_hash_bucket
{
	/*
	 * IPv4 source address as found in the IP header. Kept as a 32-bit
	 * value because its address is handed to printk("%pI4"), which reads
	 * exactly four bytes.
	 */
	uint32_t ip;
	/* Sliding window of recent SYN arrival times for this IP. */
	struct syn_window syn_win;
	/* Linkage into the ip_hash chain. */
	struct hlist_nulls_node ip_node;
	/* Time in jiffies compared by the cleanup timer to expire the bucket. */
	unsigned long syn_time;
	/* Non-zero when packets from this IP are currently let through. */
	atomic_t is_allow;
	/* Per-bucket spinlock (initialised when the bucket is created). */
	spinlock_t lock;
};

/* One chain (slot) of the per-IP hash table. */
struct ip_hash{
	/* Head of the nulls-terminated list of ip_hash_bucket entries. */
	struct hlist_nulls_head chain;
	/* Number of buckets on this chain; the cleanup timer skips chains where it is 0. */
	atomic_t count;
	/* Spinlock taken by the cleanup timer while it walks the chain. */
	spinlock_t lock;
};

/* The hash table itself: hash_bucket_num chains indexed by a jhash of the source IP. */
struct ip_hash ip_hash_buckets[hash_bucket_num];

和tcp_hashinfo很像,因为就是搬过来的(。

syn_window

  • timestamps:时间戳数组,用于记录每个syn包到达的时间
  • count:时间戳数组元素的个数,用于判断是否刷新该数组

ip_hash_bucket

  • ip:当前hash桶对应的IP。
  • syn_win:用于记录当前IP发包速率的滑动窗口
  • ip_node:哈希桶节点
  • syn_time:最后一次接受到syn包的时间,可以根据这个删除释放节点
  • is_allow:当前IP是否可以通行
  • lock:自旋锁

ip_hash

  • chain:链表头
  • count:该哈希桶链表的个数,用于判断是否遍历该链表以摘除释放不使用的哈希桶
  • lock:还是自旋锁
初始化函数

初始化每个链表的链表头,不然使用hlist_nulls_for_each_entry_rcu或hlist_nulls_for_each_entry遍历链表时会出错。设置一个定时器定时调用clear_ip_hash_buckets清空不用的哈希桶。

void init_ip_hash_buckets(void)
{
	int i;
	for (i = 0; i < hash_bucket_num; i++) {
		spin_lock_init(&ip_hash_buckets[i].lock);
		INIT_HLIST_NULLS_HEAD(&ip_hash_buckets[i].chain, 0);
		atomic_set(&ip_hash_buckets[i].count, 0);
	}
	setup_timer(&bucket_timer, clear_ip_hash_buckets, (long unsigned int)0);
        mod_timer(&bucket_timer, jiffies + msecs_to_jiffies(BUCKET_TIMEOUT_MS));
	printk("init ip hash buckets\n");
	return;
}
插入和创建哈希桶

使用hlist_nulls_for_each_entry_rcu遍历链表,自动判断是否到表尾,若没初始化链表头无法自动识别表尾

/*
 * Find the rate bucket of the packet's source IP. If one exists, update its
 * sliding window and either discard or handle the packet. Otherwise create
 * a new bucket for this IP and fall through to normal handling.
 */
size_t hash_index = jhash(&iph->saddr, sizeof(iph->saddr), seeds[0]) % hash_bucket_num;
struct ip_hash *ip_hash = &ip_hash_buckets[hash_index];
struct ip_hash_bucket *bucket=NULL;	
struct hlist_nulls_node *pos;
rcu_read_lock();
hlist_nulls_for_each_entry_rcu(bucket,pos,&ip_hash->chain,ip_node) {
		int allowed;

		if (bucket->ip != iph->saddr)
			continue;

		check_ip_is_allow(bucket);
		/* Read the verdict before leaving the RCU read-side section. */
		allowed = atomic_read(&bucket->is_allow);
		rcu_read_unlock();
		if (!allowed) {
			printk("recive tcp packet from %pI4,not allow,discard the packet\n",&iph->saddr);
			goto discard;
		}
		printk("recive tcp packet from %pI4,allow,handle the packet\n",&iph->saddr);
		goto handle;
	}
	rcu_read_unlock();

	/* No bucket for this IP yet: create one. GFP_ATOMIC because this path must not sleep. */
	bucket = kmalloc(sizeof(*bucket), GFP_ATOMIC);	
	if (!bucket) {
		goto discard;
	}
	bucket->ip = iph->saddr;
	spin_lock_init(&bucket->lock);
	bucket->syn_time= jiffies;
	atomic_set(&bucket->is_allow,1);
	atomic_set(&bucket->syn_win.count,0);
	/*
	 * List writers must serialise on the chain lock, the same lock
	 * clear_ip_hash_buckets() holds while unlinking entries. The _bh
	 * variant is used because that function runs from a timer.
	 */
	spin_lock_bh(&ip_hash->lock);
	hlist_nulls_add_head_rcu(&bucket->ip_node,&ip_hash->chain);
	atomic_inc(&ip_hash->count);
	spin_unlock_bh(&ip_hash->lock);
清除函数

根据最后一次收到syn包的时间(bucket->syn_time)判断是否超时,若超时则摘除释放该节点。记得加锁。。

void clear_ip_hash_buckets(long unsigned int a)
{
	int i;
	for (i = 0; i < hash_bucket_num; i++) {
		if (!atomic_read(&ip_hash_buckets[i].count))
			continue;
		struct ip_hash *ip_hash = &ip_hash_buckets[i];
		struct ip_hash_bucket *bucket;
		struct hlist_nulls_node *pos;

		spin_lock(&ip_hash->lock);
		hlist_nulls_for_each_entry(bucket, pos, &ip_hash->chain, ip_node) {
			if(jiffies-bucket->syn_time > BUCKET_MAX_TIME*HZ)
			{
			hlist_nulls_del_rcu(&bucket->ip_node);
			kfree(bucket);
			atomic_dec(&ip_hash->count);
			}
		}
		spin_unlock(&ip_hash->lock);
	}
	mod_timer(&bucket_timer, jiffies + msecs_to_jiffies(BUCKET_TIMEOUT_MS));
	printk("clear ip hash buckets\n");
	return;
}
判断函数

利用了滑动窗口。win->timestamps数组按到达时间的先后顺序保存时间戳,第一个元素是最早到达的。每次收到指定IP的包后,先从数组头部开始移除所有已超时(超过SYN_WINDOW_MAX_TIME秒)的时间戳;然后如果数组未满,就把当前时间戳追加到数组末尾;最后如果数组已满(达到WINDOW_SIZE),说明该IP在窗口时间内发包过多,将is_allow置0禁止通行,否则置1允许通行。

/*
 * Update the sliding SYN window of one source IP and decide whether the
 * IP may pass.
 *
 * Steps: refresh the bucket's last-seen time; drop timestamps older than
 * SYN_WINDOW_MAX_TIME seconds from the front of the window; append the
 * current time if there is room; then set bucket->is_allow to 0 when the
 * window is full (WINDOW_SIZE entries) and to 1 otherwise.
 */
void check_ip_is_allow(struct ip_hash_bucket *bucket)
{
	struct syn_window *win = &bucket->syn_win;
	unsigned long now = jiffies;

	/*
	 * syn_time is what the cleanup timer compares against; refresh it on
	 * every packet so a bucket that is still receiving SYNs is not expired.
	 */
	bucket->syn_time = now;

	/* Evict expired timestamps; the oldest entry is always at index 0. */
	while (atomic_read(&win->count) > 0 &&
	       now - win->timestamps[0] > SYN_WINDOW_MAX_TIME * HZ) {
		memmove(&win->timestamps[0], &win->timestamps[1],
			(atomic_read(&win->count) - 1) * sizeof(unsigned long));
		atomic_dec(&win->count);
	}

	if (atomic_read(&win->count) < WINDOW_SIZE) {
		win->timestamps[atomic_read(&win->count)] = now;
		atomic_inc(&win->count);
	}

	if (atomic_read(&win->count) == WINDOW_SIZE) {
		atomic_set(&bucket->is_allow, 0);
		printk("ip %pI4 syn times %d, time diff %ld, not allow\n",&bucket->ip,atomic_read(&win->count),now-win->timestamps[0]);
	} else {
		atomic_set(&bucket->is_allow, 1);
		printk("ip %pI4 syn times %d, time diff %ld, allow\n",&bucket->ip,atomic_read(&win->count),now-win->timestamps[0]);
	}
}

完整代码

/* Number of bits in the Bloom filter bitmap: 1 << 22 = 4,194,304 bits (512 KiB). */
#define BLOOM_FILTER_BITS   (1 << 22)  
/* Number of hash functions (one seed each) applied to every IP. */
#define BLOOM_FUNCS 5
/* Number of chains in ip_hash_buckets (0x100 = 256). */
#define hash_bucket_num 0x100  
/* Capacity of syn_window.timestamps; a full window marks the IP as not allowed. */
#define WINDOW_SIZE 10
/* Seconds (multiplied by HZ) a timestamp stays in the sliding window. */
#define SYN_WINDOW_MAX_TIME 1
/* Delay in ms of the first Bloom-filter clear armed by init_bloom(). */
#define CLEAR_INTERVAL_MS 30000
/* Seconds (multiplied by HZ) after which clear_ip_hash_buckets() frees a bucket. */
#define BUCKET_MAX_TIME 3
/* Period in ms of the bucket cleanup timer. */
#define BUCKET_TIMEOUT_MS 3000
/* jhash seeds: one per Bloom hash function; seeds[0] also indexes the IP hash table. */
size_t seeds[BLOOM_FUNCS] = {0x12345678, 0x9abcdef0, 0x13579bdf, 0x2468ace0, 0xfdb97531};
/* Timer that runs clear_ip_hash_buckets(). */
struct timer_list bucket_timer;
/* Current Bloom-filter clear interval in ms; updated by change_state(). */
atomic_t clear_time_ms;
/* NOTE(review): the functions below use a global `bf` whose definition is not visible in this listing. */
/*
 * Traffic classes chosen by check_traffic() from the measured rate.
 * change_state() maps each class to a Bloom-filter clear interval.
 */
enum traffic_level {
    TRAFFIC_LOW,     /* clear interval 30000 ms */
    TRAFFIC_NORMAL,  /* clear interval 10000 ms */
    TRAFFIC_HIGH, /* clear interval 3000 ms */
    TRAFFIC_ATTACK   /* clear interval 1000 ms */
};

/* State of the Bloom filter that remembers which source IPs were already seen. */
struct bloom_filter {
    /* Bit array; bits are set with set_bit() and queried with test_bit(). */
    unsigned long *bitmap;
    /* IPs recorded since the last clear; check_traffic() derives the rate from it. */
    atomic_t  count;
    /* Number of bits in bitmap; hash values are reduced modulo this. */
    uint32_t size;
	/* Number of hash functions; set to BLOOM_FUNCS by init_bloom(). */
	u_int8_t funcs;
	/* Timer that periodically runs clear_bloom(). */
	struct timer_list timer;
	/* Spinlock taken around bitmap reads, writes and clearing. */
	spinlock_t my_lock;
	/* Level chosen by the latest check_traffic() run. */
	enum traffic_level newlevel;
	/* Level last applied by change_state(); the interval changes only when the two differ. */
	enum traffic_level oldlevel;

};

/*
 * Apply a traffic-level change: when bf->newlevel differs from
 * bf->oldlevel, store the clear interval that belongs to the new level in
 * clear_time_ms and remember the level as applied. Does nothing when the
 * level is unchanged.
 */
void change_state(void)
{
	if (bf->newlevel == bf->oldlevel)
		return;

	if (bf->newlevel == TRAFFIC_LOW)
		atomic_set(&clear_time_ms, 30000);
	else if (bf->newlevel == TRAFFIC_NORMAL)
		atomic_set(&clear_time_ms, 10000);
	else if (bf->newlevel == TRAFFIC_HIGH)
		atomic_set(&clear_time_ms, 3000);
	else if (bf->newlevel == TRAFFIC_ATTACK)
		atomic_set(&clear_time_ms, 1000);

	bf->oldlevel = bf->newlevel;
}

/*
 * Classify the current traffic and apply the result through change_state().
 *
 * The rate is the number of IPs recorded in the current period (bf->count)
 * divided by the period length in whole seconds (clear_time_ms / 1000).
 *
 * The ranges are contiguous so every rate maps to exactly one level:
 *   rate < 10      -> TRAFFIC_LOW
 *   rate < 1000    -> TRAFFIC_NORMAL
 *   rate < 10000   -> TRAFFIC_HIGH
 *   otherwise      -> TRAFFIC_ATTACK
 * The earlier open-interval checks left rates in [10, 100] and the exact
 * values 1000 and 10000 unclassified, so a stale level was kept.
 */
void check_traffic(void)
{
	int interval_s = atomic_read(&clear_time_ms) / 1000;
	size_t rate;

	/* A sub-second interval would truncate to 0; avoid dividing by zero. */
	if (interval_s < 1)
		interval_s = 1;
	rate = atomic_read(&bf->count) / interval_s;

	if (rate < 10)
		bf->newlevel = TRAFFIC_LOW;
	else if (rate < 1000)
		bf->newlevel = TRAFFIC_NORMAL;
	else if (rate < 10000)
		bf->newlevel = TRAFFIC_HIGH;
	else
		bf->newlevel = TRAFFIC_ATTACK;

	change_state();
}

/*
 * Timer callback: periodically reset the Bloom filter.
 *
 * @data: the struct bloom_filter pointer registered with setup_timer().
 *
 * When at least one IP was recorded in the elapsed period, the traffic
 * level is re-evaluated, the bitmap is zeroed and the counter reset. The
 * timer is always re-armed with the current clear_time_ms.
 *
 * The lock is taken unconditionally so that the unlock at the end always
 * pairs with a lock and `flags` is always initialised.
 */
void clear_bloom(unsigned long data)
{
    struct bloom_filter *bf = (struct bloom_filter *)data;
    unsigned long flags;

    spin_lock_irqsave(&bf->my_lock, flags);
    if (atomic_read(&bf->count)) {
        size_t longs = BITS_TO_LONGS(bf->size);

        /* Must run before the counter is reset: the rate is derived from it. */
        check_traffic();
        memset(bf->bitmap, 0, longs * sizeof(unsigned long));
        atomic_set(&bf->count, 0);
        printk("clear_time_ms=%d,network state=%d\n",atomic_read(&clear_time_ms),bf->newlevel);
        printk(KERN_INFO "Bloom filter cleared. Count reset to 0\n");
    }
    mod_timer(&bf->timer, jiffies + msecs_to_jiffies(atomic_read(&clear_time_ms)));
    spin_unlock_irqrestore(&bf->my_lock, flags);
}

/*
 * Hash one IPv4 address with jhash() using the given seed and reduce the
 * result modulo bf->size, yielding a bit index into the Bloom bitmap.
 */
static uint32_t bloom_hash(uint32_t ip, uint32_t seed)
{
    uint32_t digest = jhash(&ip, sizeof(ip), seed);

    return digest % bf->size;
}

/*
 * Allocate and initialise the global Bloom filter `bf` and arm its clear
 * timer.
 *
 * Returns 0 on success or -ENOMEM when an allocation fails. On failure `bf`
 * is reset to NULL so that no dangling pointer to freed memory is left
 * behind.
 */
int init_bloom(void)
{
    size_t longs = BITS_TO_LONGS(BLOOM_FILTER_BITS);

    bf = kmalloc(sizeof(*bf), GFP_KERNEL);
    if (!bf)
        return -ENOMEM;

    /* kcalloc zeroes the array and checks the count * size multiplication. */
    bf->bitmap = kcalloc(longs, sizeof(unsigned long), GFP_KERNEL);
    if (!bf->bitmap) {
        kfree(bf);
        bf = NULL;
        return -ENOMEM;
    }

    spin_lock_init(&bf->my_lock);
    /* Keep the interval used for rate computation equal to the first timer period. */
    atomic_set(&clear_time_ms, CLEAR_INTERVAL_MS);
    bf->size = BLOOM_FILTER_BITS;
    bf->funcs = BLOOM_FUNCS;
    atomic_set(&bf->count, 0);
    bf->newlevel = TRAFFIC_LOW;
    bf->oldlevel = TRAFFIC_LOW;

    setup_timer(&bf->timer, clear_bloom, (unsigned long)bf);
    mod_timer(&bf->timer, jiffies + msecs_to_jiffies(CLEAR_INTERVAL_MS));

    return 0;
}


/*
 * Compute the BLOOM_FUNCS bitmap indexes for one IP.
 *
 * @ip:  IPv4 address; only the low 32 bits are hashed
 * @arr: output array, receives one index per seed in seeds[]
 *
 * bloom_hash() already reduces its result modulo bf->size, so the value is
 * stored as-is instead of being reduced a second time.
 */
void calu_hash(size_t ip,uint32_t arr[BLOOM_FUNCS])
{
	int i;

	for (i = 0; i < BLOOM_FUNCS; i++)
		arr[i] = bloom_hash((uint32_t)ip, (uint32_t)seeds[i]);
}

/*
 * Mark an IP as seen: compute its BLOOM_FUNCS bit indexes and set each of
 * them in bf->bitmap while holding bf->my_lock.
 */
void record_ip_into_bloom(uint32_t ip)
{
	uint32_t bit_idx[BLOOM_FUNCS] = {0};
	unsigned long irq_flags;
	int k = 0;

	calu_hash(ip, bit_idx);

	spin_lock_irqsave(&bf->my_lock, irq_flags);
	while (k < BLOOM_FUNCS) {
		set_bit(bit_idx[k], bf->bitmap);
		k++;
	}
	spin_unlock_irqrestore(&bf->my_lock, irq_flags);
}

/*
 * Test whether an IP may already be recorded in the Bloom filter.
 *
 * Returns 0 when every bit selected by the hash functions is set (the IP
 * was possibly seen before) and -1 as soon as one bit is clear (the IP was
 * definitely not recorded).
 */
int bloom_check(uint32_t ip)
{
	uint32_t bloom_arr[BLOOM_FUNCS] = {0};
	unsigned long flags;
	int ret = 0;
	int a;

	calu_hash(ip, bloom_arr);

	spin_lock_irqsave(&bf->my_lock, flags);
	for (a = 0; a < BLOOM_FUNCS; a++) {
		if (!test_bit(bloom_arr[a], bf->bitmap)) {
			ret = -1;
			break;
		}
	}
	/* Single unlock on the one exit path. */
	spin_unlock_irqrestore(&bf->my_lock, flags);
	return ret;
}




/*-------------------------------------------------------------------------------------*/
/* Sliding window of recent SYN arrival times for one source IP. */
struct syn_window{
    /* Arrival times in jiffies, oldest first; only the first `count` entries are valid. */
    unsigned long timestamps[WINDOW_SIZE]; 
    /* Number of valid entries in timestamps. */
    atomic_t count; 
};

struct ip_hash_bucket
{
	size_t ip;
	struct syn_window syn_win;
	struct hlist_nulls_node ip_node;
	size_t syn_time;
	atomic_t is_allow;
	spinlock_t lock;

};

struct ip_hash{
	struct hlist_nulls_head chain;
	atomic_t count;
	spinlock_t lock;
};

struct ip_hash ip_hash_buckets[hash_bucket_num];

void clear_ip_hash_buckets(long unsigned int a)
{
	int i;
	for (i = 0; i < hash_bucket_num; i++) {
		if (!atomic_read(&ip_hash_buckets[i].count))
			continue;
		struct ip_hash *ip_hash = &ip_hash_buckets[i];
		struct ip_hash_bucket *bucket;
		struct hlist_nulls_node *pos;

		spin_lock(&ip_hash->lock);
		hlist_nulls_for_each_entry(bucket, pos, &ip_hash->chain, ip_node) {
			if(jiffies-bucket->syn_time > BUCKET_MAX_TIME*HZ)
			{
			hlist_nulls_del_rcu(&bucket->ip_node);
			kfree(bucket);
			atomic_dec(&ip_hash->count);
			}
		}
		spin_unlock(&ip_hash->lock);
	}
	mod_timer(&bucket_timer, jiffies + msecs_to_jiffies(BUCKET_TIMEOUT_MS));
	printk("clear ip hash buckets\n");
	return;
}

void init_ip_hash_buckets(void)
{
	int i;
	for (i = 0; i < hash_bucket_num; i++) {
		spin_lock_init(&ip_hash_buckets[i].lock);
		INIT_HLIST_NULLS_HEAD(&ip_hash_buckets[i].chain, 0);
		atomic_set(&ip_hash_buckets[i].count, 0);
	}
	setup_timer(&bucket_timer, clear_ip_hash_buckets, (long unsigned int)0);
    mod_timer(&bucket_timer, jiffies + msecs_to_jiffies(BUCKET_TIMEOUT_MS));
	printk("init ip hash buckets\n");
	return;
}

/*
 * Update the sliding SYN window of one source IP and decide whether the
 * IP may pass.
 *
 * Steps: refresh the bucket's last-seen time; drop timestamps older than
 * SYN_WINDOW_MAX_TIME seconds from the front of the window; append the
 * current time if there is room; then set bucket->is_allow to 0 when the
 * window is full (WINDOW_SIZE entries) and to 1 otherwise.
 */
void check_ip_is_allow(struct ip_hash_bucket *bucket)
{
	struct syn_window *win = &bucket->syn_win;
	unsigned long now = jiffies;

	/*
	 * syn_time is what the cleanup timer compares against; refresh it on
	 * every packet so a bucket that is still receiving SYNs is not expired.
	 */
	bucket->syn_time = now;

	/* Evict expired timestamps; the oldest entry is always at index 0. */
	while (atomic_read(&win->count) > 0 &&
	       now - win->timestamps[0] > SYN_WINDOW_MAX_TIME * HZ) {
		memmove(&win->timestamps[0], &win->timestamps[1],
			(atomic_read(&win->count) - 1) * sizeof(unsigned long));
		atomic_dec(&win->count);
	}

	if (atomic_read(&win->count) < WINDOW_SIZE) {
		win->timestamps[atomic_read(&win->count)] = now;
		atomic_inc(&win->count);
	}

	if (atomic_read(&win->count) == WINDOW_SIZE) {
		atomic_set(&bucket->is_allow, 0);
		printk("ip %pI4 syn times %d, time diff %ld, not allow\n",&bucket->ip,atomic_read(&win->count),now-win->timestamps[0]);
	} else {
		atomic_set(&bucket->is_allow, 1);
		printk("ip %pI4 syn times %d, time diff %ld, allow\n",&bucket->ip,atomic_read(&win->count),now-win->timestamps[0]);
	}
}


/*-------------------------------------------------------------------------------------*/
int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
{
	struct sock *rsk;
	const struct iphdr *iph= ip_hdr(skb);
	//printk("recive tcp packet from %pI4 to %pI4\n",&iph->saddr, &iph->daddr);
	if(inet_csk_reqsk_queue_len(sk) >= (sk->sk_max_ack_backlog>>1))
	{
		struct net *net = sock_net(sk);
		struct inet_connection_sock *icsk = inet_csk(sk);
		icsk->icsk_syn_retries=3;
		net->ipv4.sysctl_tcp_synack_retries=3;
		//printk("tcp_v4_do_rcv: backlog queue is half full, decrease synack_retries to 3\n");
	}
		
		if(bloom_check(iph->saddr)!=0)
		{
			//printk("recive tcp packet from %pI4,no in bloom filter,discard the first packet\n",&iph->saddr);
			record_ip_into_bloom(iph->saddr);
			atomic_inc(&bf->count);
			goto discard;
		}
		printk("recive tcp packet from %pI4,In bloom filter,handle the packet and record in hash buckets\n",&iph->saddr);
		
	
size_t hash_index = jhash(&iph->saddr, sizeof(iph->saddr), seeds[0]) % hash_bucket_num;
struct ip_hash *ip_hash = &ip_hash_buckets[hash_index];
struct ip_hash_bucket *bucket=NULL;	
struct hlist_nulls_node *pos;
rcu_read_lock();
hlist_nulls_for_each_entry_rcu(bucket,pos,&ip_hash->chain,ip_node) {
		if (bucket->ip == iph->saddr) {
			check_ip_is_allow(bucket);
			if (!atomic_read(&bucket->is_allow)) {
	printk("recive tcp packet from %pI4,not allow,discard the packet\n",&iph->saddr);
				rcu_read_unlock();
				goto discard;
			}
			rcu_read_unlock();
printk("recive tcp packet from %pI4,allow,handle the packet\n",&iph->saddr);
			goto handle;
					}
				}
rcu_read_unlock();
bucket = kmalloc(sizeof(*bucket), GFP_ATOMIC);	
if (!bucket) {
	goto discard;
}
bucket->ip = iph->saddr;
spin_lock_init(&bucket->lock);
bucket->syn_time= jiffies;
atomic_set(&bucket->is_allow,1);
atomic_set(&bucket->syn_win.count,0);
hlist_nulls_add_head_rcu(&bucket->ip_node,&ip_hash->chain);
atomic_inc(&ip_hash->count);

handle:
此作者没有提供个人介绍。
最后更新于 2025-06-18