Some notes on the Linux kernel 2.4.5 IPv4 socket layer


  1. Creating a socket
  Function prototype:
  static int inet_create(struct socket *sock, int protocol)
  Defined in net/ipv4/af_inet.c
  Detailed walkthrough:
  static int inet_create(struct socket *sock, int protocol)
  {
  struct sock *sk;
  struct proto *prot;
  sock->state = SS_UNCONNECTED; /* mark the socket as not yet connected */
  sk = sk_alloc(PF_INET, GFP_KERNEL, 1); /* allocate the struct sock */
     /* sk_alloc() is in net/core/sock.c */
  if (sk == NULL)
   goto do_oom;
  switch (sock->type) {
  case SOCK_STREAM:  /* TCP */
   if (protocol && protocol != IPPROTO_TCP)
    goto free_and_noproto;
   protocol = IPPROTO_TCP;
   prot = &tcp_prot; /* tcp_prot is defined in net/ipv4/tcp_ipv4.c */
   sock->ops = &inet_stream_ops; /* socket operations for stream sockets */
   break;
  case SOCK_SEQPACKET:  /* not supported */
   goto free_and_badtype;
  case SOCK_DGRAM:  /* UDP */
   if (protocol && protocol != IPPROTO_UDP)
    goto free_and_noproto;
   protocol = IPPROTO_UDP;
   sk->no_check = UDP_CSUM_DEFAULT;
   prot = &udp_prot;  /* udp_prot is defined in net/ipv4/udp.c */
   sock->ops = &inet_dgram_ops; /* socket operations for datagram sockets */
   break;
  case SOCK_RAW:  /* RAW */
   if (!capable(CAP_NET_RAW)) /* check permission to create a SOCK_RAW socket */
    goto free_and_badperm;
   if (!protocol)  /* protocol must not be 0 */
    goto free_and_noproto;
   prot = &raw_prot; /* raw_prot is defined in net/ipv4/raw.c */
   sk->reuse = 1;  /* allow address reuse */
   sk->num = protocol;
   sock->ops = &inet_dgram_ops; /* raw sockets share some behavior with datagram sockets */
   if (protocol == IPPROTO_RAW)
    sk->protinfo.af_inet.hdrincl = 1;
     /* the caller supplies its own IP header */
   break;
  default:
   goto free_and_badtype;
  }
  if (ipv4_config.no_pmtu_disc)
   sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_DONT;
  else
   sk->protinfo.af_inet.pmtudisc = IP_PMTUDISC_WANT;
  sk->protinfo.af_inet.id = 0;
  sock_init_data(sock, sk); /* generic initialization of struct sock */
     /* sock_init_data() is in net/core/sock.c */
  sk->destruct = inet_sock_destruct; /* called when the socket is destroyed */
  sk->zapped = 0;
  sk->family = PF_INET;
  sk->protocol = protocol;
  sk->prot = prot;
  sk->backlog_rcv = prot->backlog_rcv; /* prot->backlog_rcv(): see each protocol's definition */
  sk->protinfo.af_inet.ttl = sysctl_ip_default_ttl; /* default TTL */
     /* tunable via /proc/sys/net/ipv4/ip_default_ttl */
  sk->protinfo.af_inet.mc_loop = 1;
  sk->protinfo.af_inet.mc_ttl = 1;
  sk->protinfo.af_inet.mc_index = 0;
  sk->protinfo.af_inet.mc_list = NULL;
  #ifdef INET_REFCNT_DEBUG
  atomic_inc(&inet_sock_nr);
  #endif
  if (sk->num) {
   /* It assumes that any protocol which allows
    * the user to assign a number at socket
    * creation time automatically
    * shares.
    */
   sk->sport = htons(sk->num); /* set the local port */
   /* Add to protocol hash chains. */
   sk->prot->hash(sk);
  }
  if (sk->prot->init) {
   int err = sk->prot->init(sk); /* protocol-specific initialization */
   if (err != 0) {
    inet_sock_release(sk);
    return(err);
   }
  }
  return(0);
  free_and_badtype:
  sk_free(sk);  /* free the struct sock */
  return -ESOCKTNOSUPPORT;
  free_and_badperm:
  sk_free(sk);
  return -EPERM;
  free_and_noproto:
  sk_free(sk);
  return -EPROTONOSUPPORT;
  do_oom:
  return -ENOBUFS;
  }
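  From user space, the three branches of the switch above map directly onto the arguments of socket(2). A minimal sketch (error handling trimmed; the SOCK_RAW call needs CAP_NET_RAW, so it typically only succeeds as root):
  #include <stdio.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <unistd.h>

  int main(void)
  {
      /* SOCK_STREAM: protocol 0 is forced to IPPROTO_TCP by inet_create() */
      int tcp = socket(PF_INET, SOCK_STREAM, 0);

      /* SOCK_DGRAM: protocol 0 is forced to IPPROTO_UDP */
      int udp = socket(PF_INET, SOCK_DGRAM, 0);

      /* SOCK_RAW: protocol must be non-zero and CAP_NET_RAW is required,
       * otherwise inet_create() fails with -EPROTONOSUPPORT / -EPERM */
      int raw = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);

      printf("tcp=%d udp=%d raw=%d\n", tcp, udp, raw);
      if (tcp >= 0) close(tcp);
      if (udp >= 0) close(udp);
      if (raw >= 0) close(raw);
      return 0;
  }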
  In net/core/sock.c:
  void sock_init_data(struct socket *sock, struct sock *sk)
  {
  skb_queue_head_init(&sk->receive_queue); /* initialize the three queues: receive, write, error */
  skb_queue_head_init(&sk->write_queue);
  skb_queue_head_init(&sk->error_queue);
  init_timer(&sk->timer);  /* initialize the socket timer */
  
  sk->allocation = GFP_KERNEL;
  sk->rcvbuf = sysctl_rmem_default;
  sk->sndbuf = sysctl_wmem_default;
  sk->state = TCP_CLOSE;
  sk->zapped = 1;
  sk->socket = sock;
  if(sock)
  {
   sk->type = sock->type;
   sk->sleep = &sock->wait;
   sock->sk = sk;
  } else
   sk->sleep = NULL;
  sk->dst_lock = RW_LOCK_UNLOCKED;
  sk->callback_lock = RW_LOCK_UNLOCKED;
     /* sock_def_wakeup(), sock_def_readable(),
      sock_def_write_space(), sock_def_error_report(),
      sock_def_destruct() are in net/core/sock.c */
  sk->state_change = sock_def_wakeup;
  sk->data_ready = sock_def_readable;
  sk->write_space = sock_def_write_space;
  sk->error_report = sock_def_error_report;
  sk->destruct      =    sock_def_destruct;
  sk->peercred.pid = 0;
  sk->peercred.uid = -1;
  sk->peercred.gid = -1;
  sk->rcvlowat = 1;
  sk->rcvtimeo = MAX_SCHEDULE_TIMEOUT; /* default receive/send timeouts */
  sk->sndtimeo = MAX_SCHEDULE_TIMEOUT;
  atomic_set(&sk->refcnt, 1);
  }
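  The rcvbuf/sndbuf defaults assigned above come from sysctl_rmem_default and sysctl_wmem_default, exposed as /proc/sys/net/core/rmem_default and wmem_default. A small sketch that reads back what a fresh UDP socket inherited; the exact numbers depend on the system:
  #include <stdio.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <unistd.h>

  int main(void)
  {
      int fd = socket(PF_INET, SOCK_DGRAM, 0);
      int rcv = 0, snd = 0;
      socklen_t len = sizeof(rcv);

      if (fd < 0)
          return 1;
      /* these report sk->rcvbuf / sk->sndbuf as set by sock_init_data() */
      getsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcv, &len);
      len = sizeof(snd);
      getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd, &len);
      printf("SO_RCVBUF=%d SO_SNDBUF=%d\n", rcv, snd);
      close(fd);
      return 0;
  }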
  1.1 SOCK_STREAM initialization
  In net/ipv4/tcp_ipv4.c:
  static int tcp_v4_init_sock(struct sock *sk)
  {
  struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  skb_queue_head_init(&tp->out_of_order_queue);
  tcp_init_xmit_timers(sk);
  tcp_prequeue_init(tp);
  tp->rto = TCP_TIMEOUT_INIT;
  tp->mdev = TCP_TIMEOUT_INIT;
     
  /* So many TCP implementations out there (incorrectly) count the
   * initial SYN frame in their delayed-ACK and congestion control
   * algorithms that we must have the following bandaid to talk
   * efficiently to them. -DaveM
   */
  tp->snd_cwnd = 2;
  /* See draft-stevens-tcpca-spec-01 for discussion of the
   * initialization of these values.
   */
  tp->snd_ssthresh = 0x7fffffff; /* Infinity */
  tp->snd_cwnd_clamp = ~0;
  tp->mss_cache = 536;
  tp->reordering = sysctl_tcp_reordering;
  sk->state = TCP_CLOSE;
  sk->write_space = tcp_write_space; /* tcp_write_space() is in net/ipv4/tcp.c */
  sk->use_write_queue = 1;
  sk->tp_pinfo.af_tcp.af_specific = &ipv4_specific;
     /* ipv4_specific is in net/ipv4/tcp_ipv4.c */
  sk->sndbuf = sysctl_tcp_wmem[1]; /* default send/receive buffer sizes */
  sk->rcvbuf = sysctl_tcp_rmem[1]; /* sysctl_tcp_* are in net/ipv4/tcp.c */
  atomic_inc(&tcp_sockets_allocated); /* tcp_sockets_allocated counts live TCP sockets */
  return 0;
  }
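  Note how tcp_v4_init_sock() overrides the generic defaults from sock_init_data(): a new TCP socket starts with the middle values of the tcp_wmem/tcp_rmem triples. A quick sketch that prints those sysctl triples, assuming the usual /proc paths are available:
  #include <stdio.h>

  /* print the three values of a TCP buffer sysctl: "min default max" */
  static void show(const char *path)
  {
      FILE *f = fopen(path, "r");
      int min, def, max;
      if (f && fscanf(f, "%d %d %d", &min, &def, &max) == 3)
          printf("%s: min=%d default=%d max=%d\n", path, min, def, max);
      if (f)
          fclose(f);
  }

  int main(void)
  {
      show("/proc/sys/net/ipv4/tcp_rmem"); /* sysctl_tcp_rmem[0..2] */
      show("/proc/sys/net/ipv4/tcp_wmem"); /* sysctl_tcp_wmem[0..2] */
      return 0;
  }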
  SOCK_DGRAM has no init function.
  1.2 SOCK_RAW initialization
  In net/ipv4/raw.c:
  static int raw_init(struct sock *sk)
  {
  struct raw_opt *tp = &(sk->tp_pinfo.tp_raw4);
  if (sk->num == IPPROTO_ICMP)
   memset(&tp->filter, 0, sizeof(tp->filter));
  return 0;
  }
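  The filter that raw_init() clears for IPPROTO_ICMP sockets is the per-socket ICMP type filter. User space controls it through the Linux-specific ICMP_FILTER option at level SOL_RAW; a hedged sketch (a set bit drops that ICMP type, so this keeps only echo replies):
  #include <stdio.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <linux/icmp.h>   /* struct icmp_filter, ICMP_FILTER, ICMP_ECHOREPLY */
  #include <unistd.h>

  int main(void)
  {
      int fd = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);
      struct icmp_filter filt;

      if (fd < 0) {
          perror("socket (needs CAP_NET_RAW)");
          return 1;
      }
      /* a set bit means "drop this ICMP type"; accept only echo replies */
      filt.data = ~(1U << ICMP_ECHOREPLY);
      if (setsockopt(fd, SOL_RAW, ICMP_FILTER, &filt, sizeof(filt)) < 0)
          perror("setsockopt ICMP_FILTER");
      close(fd);
      return 0;
  }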
  2. Server
  2.1 bind
  static int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
  {
  struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
  struct sock *sk=sock->sk;
  unsigned short snum;
  int chk_addr_ret;
  int err;
  /* If the socket has its own bind function then use it. (RAW) */
  if(sk->prot->bind)
   return sk->prot->bind(sk, uaddr, addr_len);
      /* only SOCK_RAW defines its own bind function */
  if (addr_len < sizeof(struct sockaddr_in))
   return -EINVAL;
  chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
      /* inet_addr_type() returns the address type */
      /* it is in net/ipv4/fib_frontend.c */
  /* Not specified by any standard per-se, however it breaks too
   * many applications when removed. It is unfortunate since
   * allowing applications to make a non-local bind solves
   * several problems with systems using dynamic addressing.
   * (ie. your servers still start up even if your ISDN link
   * is temporarily down)
   */
  if (sysctl_ip_nonlocal_bind == 0 &&
     sk->protinfo.af_inet.freebind == 0 &&
     addr->sin_addr.s_addr != INADDR_ANY &&
     chk_addr_ret != RTN_LOCAL &&
     chk_addr_ret != RTN_MULTICAST &&
     chk_addr_ret != RTN_BROADCAST)
   return -EADDRNOTAVAIL;
  snum = ntohs(addr->sin_port);
  if (snum && snum < PROT_SOCK && !capable(CAP_NET_BIND_SERVICE))
     /* binding a port below 1024 requires CAP_NET_BIND_SERVICE */
   return -EACCES;
  /*   We keep a pair of addresses. rcv_saddr is the one
   *   used by hash lookups, and saddr is used for transmit.
   *
   *   In the BSD API these are the same except where it
   *   would be illegal to use them (multicast/broadcast) in
   *   which case the sending device address is used.
   */
  lock_sock(sk);
  /* Check these errors (active socket, double bind). */
  err = -EINVAL;
  if ((sk->state != TCP_CLOSE)  ||
     (sk->num != 0))
   goto out;
  sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
  if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
   sk->saddr = 0; /* Use device */
  /* Make sure we are allowed to bind here. */
  if (sk->prot->get_port(sk, snum) != 0) { /* get_port() checks whether the port is available */
   sk->saddr = sk->rcv_saddr = 0;
   err = -EADDRINUSE;
   goto out;
  }
  if (sk->rcv_saddr)
   sk->userlocks |= SOCK_BINDADDR_LOCK;
  if (snum)
   sk->userlocks |= SOCK_BINDPORT_LOCK;
  sk->sport = htons(sk->num);
  sk->daddr = 0;
  sk->dport = 0;
  sk_dst_reset(sk);
  err = 0;
  out:
  release_sock(sk);
  return err;
  }
  SOCK_STREAM and SOCK_DGRAM use the default bind.
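  From user space the checks in inet_bind() show up as distinct errno values: EACCES for a privileged port without CAP_NET_BIND_SERVICE, EADDRNOTAVAIL for a non-local address (unless ip_nonlocal_bind is set), and EADDRINUSE when get_port() finds the port taken. A minimal sketch:
  #include <stdio.h>
  #include <string.h>
  #include <errno.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <unistd.h>

  int main(void)
  {
      int fd = socket(PF_INET, SOCK_STREAM, 0);
      struct sockaddr_in addr;

      memset(&addr, 0, sizeof(addr));
      addr.sin_family = AF_INET;
      addr.sin_addr.s_addr = htonl(INADDR_ANY);
      addr.sin_port = htons(80); /* privileged: needs CAP_NET_BIND_SERVICE */

      if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0) {
          if (errno == EACCES)
              printf("no CAP_NET_BIND_SERVICE for a port below 1024\n");
          else if (errno == EADDRINUSE)
              printf("get_port() found the port taken\n");
          else
              perror("bind");
      }
      close(fd);
      return 0;
  }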
  2.1.1 SOCK_RAW bind
  In net/ipv4/raw.c:
  static int raw_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len)
  {
  struct sockaddr_in *addr = (struct sockaddr_in *) uaddr;
  int ret = -EINVAL;
  int chk_addr_ret;
  if (sk->state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in))
   goto out;
  chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr);
      /* inet_addr_type() returns the address type */
      /* it is in net/ipv4/fib_frontend.c */
  ret = -EADDRNOTAVAIL;
  if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL &&
     chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST)
   goto out;
  sk->rcv_saddr = sk->saddr = addr->sin_addr.s_addr;
      /* sk->rcv_saddr: the bound local address */
      /* sk->saddr: the source address */
  if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST)
   sk->saddr = 0; /* Use device */ /* for multicast/broadcast the source address stays 0 */
  sk_dst_reset(sk);
  ret = 0;
  out: return ret;
  }
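  So binding a raw socket only pins the local/source address; there is no port. A short sketch; 127.0.0.1 is just an example of an address that inet_addr_type() classifies as RTN_LOCAL:
  #include <stdio.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <arpa/inet.h>
  #include <unistd.h>

  int main(void)
  {
      int fd = socket(PF_INET, SOCK_RAW, IPPROTO_ICMP);
      struct sockaddr_in addr;

      if (fd < 0) {
          perror("socket (needs CAP_NET_RAW)");
          return 1;
      }
      memset(&addr, 0, sizeof(addr));
      addr.sin_family = AF_INET;
      /* must be RTN_LOCAL/MULTICAST/BROADCAST, else bind fails with EADDRNOTAVAIL */
      addr.sin_addr.s_addr = inet_addr("127.0.0.1");

      if (bind(fd, (struct sockaddr *)&addr, sizeof(addr)) < 0)
          perror("bind");
      else
          printf("raw socket now sends from and receives on 127.0.0.1\n");
      close(fd);
      return 0;
  }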
  2.2 listen
  2.2.1 SOCK_STREAM listen
  In net/ipv4/af_inet.c:
  int inet_listen(struct socket *sock, int backlog)
  {
  struct sock *sk = sock->sk;
  unsigned char old_state;
  int err;
  lock_sock(sk);
  err = -EINVAL;
  if (sock->state != SS_UNCONNECTED || sock->type != SOCK_STREAM)
   goto out;
  old_state = sk->state;
  if (!((1 << old_state) & (TCPF_CLOSE | TCPF_LISTEN)))
   goto out;
  /* Really, if the socket is already in listen state
   * we can only allow the backlog to be adjusted.
   */
  if (old_state != TCP_LISTEN) {
   err = tcp_listen_start(sk); /* the real TCP listen work */
   if (err)
    goto out;
  }
  sk->max_ack_backlog = backlog;
  err = 0;
  out:
  release_sock(sk);
  return err;
  }
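  As the comment above says, calling listen() on a socket that is already in TCP_LISTEN only adjusts sk->max_ack_backlog; tcp_listen_start() is not run again. So this user-space sequence is legal:
  #include <string.h>
  #include <sys/socket.h>
  #include <netinet/in.h>
  #include <unistd.h>

  int main(void)
  {
      int fd = socket(PF_INET, SOCK_STREAM, 0);
      struct sockaddr_in addr;

      memset(&addr, 0, sizeof(addr));
      addr.sin_family = AF_INET;
      addr.sin_addr.s_addr = htonl(INADDR_ANY);
      addr.sin_port = 0;                /* let get_port() pick a port */
      bind(fd, (struct sockaddr *)&addr, sizeof(addr));

      listen(fd, 5);   /* enters TCP_LISTEN via tcp_listen_start() */
      listen(fd, 128); /* already listening: only max_ack_backlog changes */
      close(fd);
      return 0;
  }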
  tcp_listen_start() is defined in net/ipv4/tcp.c:
  int tcp_listen_start(struct sock *sk)
  {
  struct tcp_opt *tp = &(sk->tp_pinfo.af_tcp);
  struct tcp_listen_opt *lopt;
  sk->max_ack_backlog = 0;
  sk->ack_backlog = 0;
  tp->accept_queue = tp->accept_queue_tail = NULL;
  tp->syn_wait_lock = RW_LOCK_UNLOCKED;
  tcp_delack_init(tp);  /* zero the delayed-ACK state in tp */
     /* in include/net/tcp.h */
  lopt = kmalloc(sizeof(struct tcp_listen_opt), GFP_KERNEL);
  if (!lopt)
   return -ENOMEM;
  memset(lopt, 0, sizeof(struct tcp_listen_opt));
  for (lopt->max_qlen_log = 6; ; lopt->max_qlen_log++)
   if ((1 << lopt->max_qlen_log) >= sysctl_max_syn_backlog)
    break;
  write_lock_bh(&tp->syn_wait_lock);
  tp->listen_opt = lopt;
  write_unlock_bh(&tp->syn_wait_lock);
  /* There is race window here: we announce ourselves listening,
   * but this transition is still not validated by get_port().
   * It is OK, because this socket enters to hash table only
   * after validation is complete.
   */
  sk->state = TCP_LISTEN;
  if (sk->prot->get_port(sk, sk->num) == 0) { /* make sure the port is still available */
   sk->sport = htons(sk->num); /* set the source port */
   sk_dst_reset(sk);
   sk->prot->hash(sk);  /* add the socket to the protocol hash table */
   return 0;
  }
  sk->state = TCP_CLOSE;
  write_lock_bh(&tp->syn_wait_lock);
  tp->listen_opt = NULL;
  write_unlock_bh(&tp->syn_wait_lock);
  kfree(lopt);
  return -EADDRINUSE;
  }
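  The for loop above merely rounds sysctl_max_syn_backlog up to a power of two (at least 2^6 = 64) so the SYN queue can be sized and indexed cheaply. Isolated, the computation looks like this (1024 is a stand-in value):
  #include <stdio.h>

  int main(void)
  {
      int max_syn_backlog = 1024; /* stand-in for sysctl_max_syn_backlog */
      int max_qlen_log;

      /* same loop as tcp_listen_start(): smallest n >= 6 with 2^n >= backlog */
      for (max_qlen_log = 6; ; max_qlen_log++)
          if ((1 << max_qlen_log) >= max_syn_backlog)
              break;

      printf("SYN queue: 2^%d = %d slots\n", max_qlen_log, 1 << max_qlen_log);
      return 0;
  }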
  SOCK_DGRAM and SOCK_RAW do not support listen.
  2.3 accept
  2.3.1 SOCK_STREAM accept
  In net/ipv4/af_inet.c:
  int inet_accept(struct socket *sock, struct socket *newsock, int flags)
  {
  struct sock *sk1 = sock->sk;
  struct sock *sk2;
  int err = -EINVAL;
  if((sk2 = sk1->prot->accept(sk1,flags,&err)) == NULL)
   goto do_err;
  lock_sock(sk2);
  BUG_TRAP((1 << sk2->state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT | TCPF_CLOSE));
  sock_graft(sk2, newsock); /* graft sk2 onto newsock */
     /* sock_graft() is in include/net/sock.h */
  newsock->state = SS_CONNECTED;
  release_sock(sk2);
  return 0;
  do_err:
  return err;
  }
  SOCK_DGRAM and SOCK_RAW do not support accept.
  2.3.1.1 TCP accept
  In net/ipv4/tcp.c:
  struct sock *tcp_accept(struct sock *sk, int flags, int *err)
  {
  struct tcp_opt *tp = &sk->tp_pinfo.af_tcp;
  struct open_request *req;
  struct sock *newsk;
  int error;
  lock_sock(sk);
  /* We need to make sure that this socket is listening,
   * and that it has something pending.
   */
  error = -EINVAL;
  if (sk->state != TCP_LISTEN) /* the socket must be listening */
   goto out;
  /* Find already established connection */
  if (!tp->accept_queue) { /* is there a pending connection in the accept queue? */
   long timeo = sock_rcvtimeo(sk, flags & O_NONBLOCK);
     /* timeout; 0 means non-blocking */
     /* sock_rcvtimeo() is in include/net/sock.h */
   /* If this is a non blocking socket don't sleep */
   error = -EAGAIN;
   if (!timeo)  /* non-blocking: return immediately */
    goto out;
   error = wait_for_connect(sk, timeo); /* sleep until a connection arrives */
   if (error)
    goto out;
  }
  req = tp->accept_queue;
  if ((tp->accept_queue = req->dl_next) == NULL)
   tp->accept_queue_tail = NULL;
  newsk = req->sk;
  tcp_acceptq_removed(sk);  /* one fewer pending connection on sk */
      /* in include/net/tcp.h */
  tcp_openreq_fastfree(req);  /* free the open_request */
      /* in include/net/tcp.h */
  BUG_TRAP(newsk->state != TCP_SYN_RECV); 
  release_sock(sk);
  return newsk;
  out:
  release_sock(sk);
  *err = error;
  return NULL;
  }
  /* this function is called only when the socket is in blocking mode */
  /* in net/ipv4/tcp.c */
  static int wait_for_connect(struct sock * sk, long timeo)
  {
  DECLARE_WAITQUEUE(wait, current);
  int err;
  /*
   * True wake-one mechanism for incoming connections: only
   * one process gets woken up, not the 'whole herd'.
   * Since we do not 'race & poll' for established sockets
   * anymore, the common case will execute the loop only once.
   *
   * Subtle issue: "add_wait_queue_exclusive()" will be added
   * after any current non-exclusive waiters, and we know that
   * it will always _stay_ after any new non-exclusive waiters
   * because all non-exclusive waiters are added at the
   * beginning of the wait-queue. As such, it's ok to "drop"
   * our exclusiveness temporarily when we get woken up without
   * having to remove and re-insert us on the wait queue.
   */
  add_wait_queue_exclusive(sk->sleep, &wait);
  for (;;) {
   current->state = TASK_INTERRUPTIBLE;
   release_sock(sk);
   if (sk->tp_pinfo.af_tcp.accept_queue == NULL)
    timeo = schedule_timeout(timeo); /* sleep for up to timeo */
   lock_sock(sk);
   err = 0;
   if (sk->tp_pinfo.af_tcp.accept_queue) /* accept queue non-empty */
      /* i.e. a connection has arrived */
    break;
   err = -EINVAL;
   if (sk->state != TCP_LISTEN)
    break;
   err = sock_intr_errno(timeo);
   if (signal_pending(current))
    break;
   err = -EAGAIN;
   if (!timeo)
    break;
  }
  current->state = TASK_RUNNING;
  remove_wait_queue(sk->sleep, &wait);
  return err;
  }
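  Conversely, when O_NONBLOCK is set, sock_rcvtimeo() returns 0, wait_for_connect() is never entered, and accept(2) fails immediately with EAGAIN. A user-space sketch of the resulting pattern (listen_fd is assumed to be a bound, listening, non-blocking socket; a real program would poll/select instead of spinning):
  #include <stddef.h>
  #include <errno.h>
  #include <sys/socket.h>

  int accept_one(int listen_fd)
  {
      for (;;) {
          int fd = accept(listen_fd, NULL, NULL);
          if (fd >= 0)
              return fd;          /* tp->accept_queue had an entry */
          if (errno != EAGAIN)
              return -1;          /* real error */
          /* EAGAIN: accept queue empty and timeo == 0; wait with
           * poll/select here instead of busy-looping */
      }
  }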
  3. Client
  3.1 connect
  3.1.1 SOCK_STREAM connect
  In net/ipv4/af_inet.c:
  
  int inet_stream_connect(struct socket *sock, struct sockaddr * uaddr,
    int addr_len, int flags)
  {
  struct sock *sk=sock->sk;
  int err;
  long timeo;
  lock_sock(sk);
  if (uaddr->sa_family == AF_UNSPEC) {
   err = sk->prot->disconnect(sk, flags); /* tear down the connection */
   sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
   goto out;
  }
  switch (sock->state) {
  default:
   err = -EINVAL;
   goto out;
  case SS_CONNECTED:
   err = -EISCONN;
   goto out;
  case SS_CONNECTING:
   err = -EALREADY;
   /* Fall out of switch with err, set for this state */
   break;
  case SS_UNCONNECTED:
   err = -EISCONN;
   if (sk->state != TCP_CLOSE)
    goto out;
   err = -EAGAIN;
   if (sk->num == 0) {
    if (sk->prot->get_port(sk, 0) != 0) /* pick a local port, checking for conflicts */
     goto out;
    sk->sport = htons(sk->num);
   }
   err = sk->prot->connect(sk, uaddr, addr_len); /* call the protocol's connect */
   if (err < 0)
    goto out;
   sock->state = SS_CONNECTING;  /* mark the socket as connecting */
   /* Just entered SS_CONNECTING state; the only
    * difference is that return value in non-blocking
    * case is EINPROGRESS, rather than EALREADY.
    */
   err = -EINPROGRESS;
   break;
  }
  timeo = sock_sndtimeo(sk, flags&O_NONBLOCK); /* timeout; 0 means non-blocking */
      /* sock_sndtimeo() is in include/net/sock.h */
  if ((1 << sk->state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
   /* Error code is set above */
   if (!timeo || !inet_wait_for_connect(sk, timeo))
      /* non-blocking: return immediately */
      /* blocking: sleep in inet_wait_for_connect() */
    goto out;
   err = sock_intr_errno(timeo);
   if (signal_pending(current))
    goto out;
  }
  /* Connection was closed by RST, timeout, ICMP error
   * or another process disconnected us.
   */
  if (sk->state == TCP_CLOSE)
   goto sock_error;
  /* sk->err may be not zero now, if RECVERR was ordered by user
   * and error was received after socket entered established state.
   * Hence, it is handled normally after connect() return successfully.
   */
  sock->state = SS_CONNECTED;  /* mark the socket as connected */
  err = 0;
  out:
  release_sock(sk);
  return err;
  sock_error:
  err = sock_error(sk) ? : -ECONNABORTED;
  sock->state = SS_UNCONNECTED;
  if (sk->prot->disconnect(sk, flags))
   sock->state = SS_DISCONNECTING;
  goto out;
  }
  /* this function is called only when the socket is in blocking mode */
  /* in net/ipv4/af_inet.c */
  static long inet_wait_for_connect(struct sock *sk, long timeo)
  {
  DECLARE_WAITQUEUE(wait, current);
  __set_current_state(TASK_INTERRUPTIBLE);
  add_wait_queue(sk->sleep, &wait);
  /* Basic assumption: if someone sets sk->err, he _must_
   * change state of the socket from TCP_SYN_*.
   * Connect() does not allow to get error notifications
   * without closing the socket.
   */
  while ((1 << sk->state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) {
   release_sock(sk);
   timeo = schedule_timeout(timeo); /* sleep */
   lock_sock(sk);
   if (signal_pending(current) || !timeo)
    break;
   set_current_state(TASK_INTERRUPTIBLE);
  }
  __set_current_state(TASK_RUNNING);
  remove_wait_queue(sk->sleep, &wait);
  return timeo;
  }
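  For a non-blocking socket, inet_stream_connect() returns -EINPROGRESS instead of sleeping here; user space then waits for writability and fetches the handshake result with SO_ERROR. A common sketch of that pattern (fd is assumed to be an O_NONBLOCK TCP socket and addr/addrlen describe the peer):
  #include <errno.h>
  #include <sys/socket.h>
  #include <sys/select.h>

  int finish_connect(int fd, const struct sockaddr *addr, socklen_t addrlen)
  {
      fd_set wfds;
      int err = 0;
      socklen_t len = sizeof(err);

      if (connect(fd, addr, addrlen) == 0)
          return 0;                 /* connected at once (e.g. loopback) */
      if (errno != EINPROGRESS)
          return -1;                /* immediate failure */

      FD_ZERO(&wfds);
      FD_SET(fd, &wfds);
      if (select(fd + 1, NULL, &wfds, NULL, NULL) <= 0)
          return -1;                /* select error */

      /* the handshake result ends up in sk->err; SO_ERROR retrieves it */
      if (getsockopt(fd, SOL_SOCKET, SO_ERROR, &err, &len) < 0 || err != 0)
          return -1;
      return 0;                     /* socket is now SS_CONNECTED */
  }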

Originally from: http://www.ltesting.net