err.no Git - linux-2.6/blob - fs/dlm/lowcomms-tcp.c

   1 /******************************************************************************
   2 *******************************************************************************
   3 **
   4 **  Copyright (C) Sistina Software, Inc.  1997-2003  All rights reserved.
   5 **  Copyright (C) 2004-2007 Red Hat, Inc.  All rights reserved.
   6 **
   7 **  This copyrighted material is made available to anyone wishing to use,
   8 **  modify, copy, or redistribute it subject to the terms and conditions
   9 **  of the GNU General Public License v.2.
  10 **
  11 *******************************************************************************
  12 ******************************************************************************/
  13
  14 /*
  15  * lowcomms.c
  16  *
  17  * This is the "low-level" comms layer.
  18  *
  19  * It is responsible for sending/receiving messages
  20  * from other nodes in the cluster.
  21  *
  22  * Cluster nodes are referred to by their nodeids. nodeids are
  23  * simply 32 bit numbers to the locking module - if they need to
  24  * be expanded for the cluster infrastructure then that is its
  25  * responsibility. It is this layer's
  26  * responsibility to resolve these into IP address or
  27  * whatever it needs for inter-node communication.
  28  *
  29  * The comms level is two kernel threads that deal mainly with
  30  * the receiving of messages from other nodes and passing them
  31  * up to the mid-level comms layer (which understands the
  32  * message format) for execution by the locking core, and
  33  * a send thread which does all the setting up of connections
  34  * to remote nodes and the sending of data. Threads are not allowed
  35  * to send their own data because it may cause them to wait in times
  36  * of high load. Also, this way, the sending thread can collect together
  37  * messages bound for one node and send them in one block.
  38  *
  39  * I don't see any problem with the recv thread executing the locking
  40  * code on behalf of remote processes as the locking code is
  41  * short, efficient and never waits.
  42  *
  43  */
  44
  45
  46 #include <asm/ioctls.h>
  47 #include <net/sock.h>
  48 #include <net/tcp.h>
  49 #include <linux/pagemap.h>
  50
  51 #include "dlm_internal.h"
  52 #include "lowcomms.h"
  53 #include "midcomms.h"
  54 #include "config.h"
  55
  56 struct cbuf {
  57         unsigned int base;
  58         unsigned int len;
  59         unsigned int mask;
  60 };
  61
  62 #define NODE_INCREMENT 32
  63 static void cbuf_add(struct cbuf *cb, int n)
  64 {
  65         cb->len += n;
  66 }
  67
  68 static int cbuf_data(struct cbuf *cb)
  69 {
  70         return ((cb->base + cb->len) & cb->mask);
  71 }
  72
  73 static void cbuf_init(struct cbuf *cb, int size)
  74 {
  75         cb->base = cb->len = 0;
  76         cb->mask = size-1;
  77 }
  78
  79 static void cbuf_eat(struct cbuf *cb, int n)
  80 {
  81         cb->len  -= n;
  82         cb->base += n;
  83         cb->base &= cb->mask;
  84 }
  85
  86 static bool cbuf_empty(struct cbuf *cb)
  87 {
  88         return cb->len == 0;
  89 }
  90
  91 /* Maximum number of incoming messages to process before
  92    doing a cond_resched()
  93 */
  94 #define MAX_RX_MSG_COUNT 25
  95
/*
 * Per-node connection state.  One of these exists for every nodeid that
 * has been looked up with allocation enabled, plus an optional "othercon"
 * hanging off it for an incoming socket that crossed with our own connect.
 */
struct connection {
        struct socket *sock;    /* NULL if not connected */
        uint32_t nodeid;        /* So we know who we are in the list */
        struct mutex sock_mutex;        /* taken around use and replacement of sock */
        unsigned long flags;    /* bit numbers below, used with the atomic bitops */
#define CF_READ_PENDING 1       /* rwork has been queued */
#define CF_WRITE_PENDING 2      /* swork has been queued to send data */
#define CF_CONNECT_PENDING 3    /* swork has been queued to (re)connect */
#define CF_IS_OTHERCON 4        /* set on othercon and on the listening connection */
        struct list_head writequeue;  /* List of outgoing writequeue_entries */
        struct list_head listenlist;  /* List of allocated listening sockets;
                                         not referenced by the code in this file */
        spinlock_t writequeue_lock;   /* protects writequeue and its entries */
        int (*rx_action) (struct connection *); /* What to do when active:
                                                   receive_from_sock or accept_from_sock */
        struct page *rx_page;   /* receive buffer page, allocated on demand */
        struct cbuf cb;         /* circular-buffer offsets into rx_page */
        int retries;            /* connect attempts since the last close */
#define MAX_CONNECT_RETRIES 3
        struct connection *othercon;    /* holds a second, incoming socket for this node */
        struct work_struct rwork; /* Receive work item, run on recv_workqueue */
        struct work_struct swork; /* Send/connect work item, run on send_workqueue */
};
/* Recover the connection stored in a struct sock's sk_user_data */
#define sock2con(x) ((struct connection *)(x)->sk_user_data)
 118
/*
 * An entry waiting to be sent.  Each entry owns one page; callers reserve
 * space in it with dlm_lowcomms_get_buffer() and release it with
 * dlm_lowcomms_commit_buffer().
 */
struct writequeue_entry {
        struct list_head list;  /* link in connection->writequeue */
        struct page *page;      /* page holding the message data */
        int offset;             /* start of the bytes not yet sent */
        int len;                /* committed bytes ready to send from offset */
        int end;                /* end of the space reserved so far */
        int users;              /* reservations handed out but not yet committed */
        struct connection *con; /* connection this entry is queued on */
};
 129
 130 static struct sockaddr_storage dlm_local_addr;
 131
 132 /* Work queues */
 133 static struct workqueue_struct *recv_workqueue;
 134 static struct workqueue_struct *send_workqueue;
 135
 136 /* An array of pointers to connections, indexed by NODEID */
 137 static struct connection **connections;
 138 static DECLARE_MUTEX(connections_lock);
 139 static struct kmem_cache *con_cache;
 140 static int conn_array_size;
 141
 142 static void process_recv_sockets(struct work_struct *work);
 143 static void process_send_sockets(struct work_struct *work);
 144
 145 static struct connection *nodeid2con(int nodeid, gfp_t allocation)
 146 {
 147         struct connection *con = NULL;
 148
 149         down(&connections_lock);
 150         if (nodeid >= conn_array_size) {
 151                 int new_size = nodeid + NODE_INCREMENT;
 152                 struct connection **new_conns;
 153
 154                 new_conns = kzalloc(sizeof(struct connection *) *
 155                                     new_size, allocation);
 156                 if (!new_conns)
 157                         goto finish;
 158
 159                 memcpy(new_conns, connections,  sizeof(struct connection *) * conn_array_size);
 160                 conn_array_size = new_size;
 161                 kfree(connections);
 162                 connections = new_conns;
 163
 164         }
 165
 166         con = connections[nodeid];
 167         if (con == NULL && allocation) {
 168                 con = kmem_cache_zalloc(con_cache, allocation);
 169                 if (!con)
 170                         goto finish;
 171
 172                 con->nodeid = nodeid;
 173                 mutex_init(&con->sock_mutex);
 174                 INIT_LIST_HEAD(&con->writequeue);
 175                 spin_lock_init(&con->writequeue_lock);
 176                 INIT_WORK(&con->swork, process_send_sockets);
 177                 INIT_WORK(&con->rwork, process_recv_sockets);
 178
 179                 connections[nodeid] = con;
 180         }
 181
 182 finish:
 183         up(&connections_lock);
 184         return con;
 185 }
 186
 187 /* Data available on socket or listen socket received a connect */
 188 static void lowcomms_data_ready(struct sock *sk, int count_unused)
 189 {
 190         struct connection *con = sock2con(sk);
 191
 192         if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
 193                 queue_work(recv_workqueue, &con->rwork);
 194 }
 195
 196 static void lowcomms_write_space(struct sock *sk)
 197 {
 198         struct connection *con = sock2con(sk);
 199
 200         if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags))
 201                 queue_work(send_workqueue, &con->swork);
 202 }
 203
 204 static inline void lowcomms_connect_sock(struct connection *con)
 205 {
 206         if (!test_and_set_bit(CF_CONNECT_PENDING, &con->flags))
 207                 queue_work(send_workqueue, &con->swork);
 208 }
 209
 210 static void lowcomms_state_change(struct sock *sk)
 211 {
 212         if (sk->sk_state == TCP_ESTABLISHED)
 213                 lowcomms_write_space(sk);
 214 }
 215
 216 /* Make a socket active */
 217 static int add_sock(struct socket *sock, struct connection *con)
 218 {
 219         con->sock = sock;
 220
 221         /* Install a data_ready callback */
 222         con->sock->sk->sk_data_ready = lowcomms_data_ready;
 223         con->sock->sk->sk_write_space = lowcomms_write_space;
 224         con->sock->sk->sk_state_change = lowcomms_state_change;
 225
 226         return 0;
 227 }
 228
 229 /* Add the port number to an IP6 or 4 sockaddr and return the address
 230    length */
 231 static void make_sockaddr(struct sockaddr_storage *saddr, uint16_t port,
 232                           int *addr_len)
 233 {
 234         saddr->ss_family =  dlm_local_addr.ss_family;
 235         if (saddr->ss_family == AF_INET) {
 236                 struct sockaddr_in *in4_addr = (struct sockaddr_in *)saddr;
 237                 in4_addr->sin_port = cpu_to_be16(port);
 238                 *addr_len = sizeof(struct sockaddr_in);
 239         } else {
 240                 struct sockaddr_in6 *in6_addr = (struct sockaddr_in6 *)saddr;
 241                 in6_addr->sin6_port = cpu_to_be16(port);
 242                 *addr_len = sizeof(struct sockaddr_in6);
 243         }
 244 }
 245
 246 /* Close a remote connection and tidy up */
 247 static void close_connection(struct connection *con, bool and_other)
 248 {
 249         mutex_lock(&con->sock_mutex);
 250
 251         if (con->sock) {
 252                 sock_release(con->sock);
 253                 con->sock = NULL;
 254         }
 255         if (con->othercon && and_other) {
 256                 /* Will only re-enter once. */
 257                 close_connection(con->othercon, false);
 258         }
 259         if (con->rx_page) {
 260                 __free_page(con->rx_page);
 261                 con->rx_page = NULL;
 262         }
 263         con->retries = 0;
 264         mutex_unlock(&con->sock_mutex);
 265 }
 266
 267 /* Data received from remote end */
 268 static int receive_from_sock(struct connection *con)
 269 {
 270         int ret = 0;
 271         struct msghdr msg;
 272         struct iovec iov[2];
 273         mm_segment_t fs;
 274         unsigned len;
 275         int r;
 276         int call_again_soon = 0;
 277
 278         mutex_lock(&con->sock_mutex);
 279
 280         if (con->sock == NULL) {
 281                 ret = -EAGAIN;
 282                 goto out_close;
 283         }
 284
 285         if (con->rx_page == NULL) {
 286                 /*
 287                  * This doesn't need to be atomic, but I think it should
 288                  * improve performance if it is.
 289                  */
 290                 con->rx_page = alloc_page(GFP_ATOMIC);
 291                 if (con->rx_page == NULL)
 292                         goto out_resched;
 293                 cbuf_init(&con->cb, PAGE_CACHE_SIZE);
 294         }
 295
 296         msg.msg_control = NULL;
 297         msg.msg_controllen = 0;
 298         msg.msg_iovlen = 1;
 299         msg.msg_iov = iov;
 300         msg.msg_name = NULL;
 301         msg.msg_namelen = 0;
 302         msg.msg_flags = 0;
 303
 304         /*
 305          * iov[0] is the bit of the circular buffer between the current end
 306          * point (cb.base + cb.len) and the end of the buffer.
 307          */
 308         iov[0].iov_len = con->cb.base - cbuf_data(&con->cb);
 309         iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb);
 310         iov[1].iov_len = 0;
 311
 312         /*
 313          * iov[1] is the bit of the circular buffer between the start of the
 314          * buffer and the start of the currently used section (cb.base)
 315          */
 316         if (cbuf_data(&con->cb) >= con->cb.base) {
 317                 iov[0].iov_len = PAGE_CACHE_SIZE - cbuf_data(&con->cb);
 318                 iov[1].iov_len = con->cb.base;
 319                 iov[1].iov_base = page_address(con->rx_page);
 320                 msg.msg_iovlen = 2;
 321         }
 322         len = iov[0].iov_len + iov[1].iov_len;
 323
 324         fs = get_fs();
 325         set_fs(get_ds());
 326         r = ret = sock_recvmsg(con->sock, &msg, len,
 327                                MSG_DONTWAIT | MSG_NOSIGNAL);
 328         set_fs(fs);
 329
 330         if (ret <= 0)
 331                 goto out_close;
 332         if (ret == -EAGAIN)
 333                 goto out_resched;
 334
 335         if (ret == len)
 336                 call_again_soon = 1;
 337         cbuf_add(&con->cb, ret);
 338         ret = dlm_process_incoming_buffer(con->nodeid,
 339                                           page_address(con->rx_page),
 340                                           con->cb.base, con->cb.len,
 341                                           PAGE_CACHE_SIZE);
 342         if (ret == -EBADMSG) {
 343                 printk(KERN_INFO "dlm: lowcomms: addr=%p, base=%u, len=%u, "
 344                        "iov_len=%u, iov_base[0]=%p, read=%d\n",
 345                        page_address(con->rx_page), con->cb.base, con->cb.len,
 346                        len, iov[0].iov_base, r);
 347         }
 348         if (ret < 0)
 349                 goto out_close;
 350         cbuf_eat(&con->cb, ret);
 351
 352         if (cbuf_empty(&con->cb) && !call_again_soon) {
 353                 __free_page(con->rx_page);
 354                 con->rx_page = NULL;
 355         }
 356
 357         if (call_again_soon)
 358                 goto out_resched;
 359         mutex_unlock(&con->sock_mutex);
 360         return 0;
 361
 362 out_resched:
 363         if (!test_and_set_bit(CF_READ_PENDING, &con->flags))
 364                 queue_work(recv_workqueue, &con->rwork);
 365         mutex_unlock(&con->sock_mutex);
 366         return -EAGAIN;
 367
 368 out_close:
 369         mutex_unlock(&con->sock_mutex);
 370         if (ret != -EAGAIN && !test_bit(CF_IS_OTHERCON, &con->flags)) {
 371                 close_connection(con, false);
 372                 /* Reconnect when there is something to send */
 373         }
 374         /* Don't return success if we really got EOF */
 375         if (ret == 0)
 376                 ret = -EAGAIN;
 377
 378         return ret;
 379 }
 380
 381 /* Listening socket is busy, accept a connection */
 382 static int accept_from_sock(struct connection *con)
 383 {
 384         int result;
 385         struct sockaddr_storage peeraddr;
 386         struct socket *newsock;
 387         int len;
 388         int nodeid;
 389         struct connection *newcon;
 390         struct connection *addcon;
 391
 392         memset(&peeraddr, 0, sizeof(peeraddr));
 393         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
 394                                   IPPROTO_TCP, &newsock);
 395         if (result < 0)
 396                 return -ENOMEM;
 397
 398         mutex_lock_nested(&con->sock_mutex, 0);
 399
 400         result = -ENOTCONN;
 401         if (con->sock == NULL)
 402                 goto accept_err;
 403
 404         newsock->type = con->sock->type;
 405         newsock->ops = con->sock->ops;
 406
 407         result = con->sock->ops->accept(con->sock, newsock, O_NONBLOCK);
 408         if (result < 0)
 409                 goto accept_err;
 410
 411         /* Get the connected socket's peer */
 412         memset(&peeraddr, 0, sizeof(peeraddr));
 413         if (newsock->ops->getname(newsock, (struct sockaddr *)&peeraddr,
 414                                   &len, 2)) {
 415                 result = -ECONNABORTED;
 416                 goto accept_err;
 417         }
 418
 419         /* Get the new node's NODEID */
 420         make_sockaddr(&peeraddr, 0, &len);
 421         if (dlm_addr_to_nodeid(&peeraddr, &nodeid)) {
 422                 printk("dlm: connect from non cluster node\n");
 423                 sock_release(newsock);
 424                 mutex_unlock(&con->sock_mutex);
 425                 return -1;
 426         }
 427
 428         log_print("got connection from %d", nodeid);
 429
 430         /*  Check to see if we already have a connection to this node. This
 431          *  could happen if the two nodes initiate a connection at roughly
 432          *  the same time and the connections cross on the wire.
 433          * TEMPORARY FIX:
 434          *  In this case we store the incoming one in "othercon"
 435          */
 436         newcon = nodeid2con(nodeid, GFP_KERNEL);
 437         if (!newcon) {
 438                 result = -ENOMEM;
 439                 goto accept_err;
 440         }
 441         mutex_lock_nested(&newcon->sock_mutex, 1);
 442         if (newcon->sock) {
 443                 struct connection *othercon = newcon->othercon;
 444
 445                 if (!othercon) {
 446                         othercon = kmem_cache_zalloc(con_cache, GFP_KERNEL);
 447                         if (!othercon) {
 448                                 printk("dlm: failed to allocate incoming socket\n");
 449                                 mutex_unlock(&newcon->sock_mutex);
 450                                 result = -ENOMEM;
 451                                 goto accept_err;
 452                         }
 453                         othercon->nodeid = nodeid;
 454                         othercon->rx_action = receive_from_sock;
 455                         mutex_init(&othercon->sock_mutex);
 456                         INIT_WORK(&othercon->swork, process_send_sockets);
 457                         INIT_WORK(&othercon->rwork, process_recv_sockets);
 458                         set_bit(CF_IS_OTHERCON, &othercon->flags);
 459                         newcon->othercon = othercon;
 460                 }
 461                 othercon->sock = newsock;
 462                 newsock->sk->sk_user_data = othercon;
 463                 add_sock(newsock, othercon);
 464                 addcon = othercon;
 465         }
 466         else {
 467                 newsock->sk->sk_user_data = newcon;
 468                 newcon->rx_action = receive_from_sock;
 469                 add_sock(newsock, newcon);
 470                 addcon = newcon;
 471         }
 472
 473         mutex_unlock(&newcon->sock_mutex);
 474
 475         /*
 476          * Add it to the active queue in case we got data
 477          * beween processing the accept adding the socket
 478          * to the read_sockets list
 479          */
 480         if (!test_and_set_bit(CF_READ_PENDING, &addcon->flags))
 481                 queue_work(recv_workqueue, &addcon->rwork);
 482         mutex_unlock(&con->sock_mutex);
 483
 484         return 0;
 485
 486 accept_err:
 487         mutex_unlock(&con->sock_mutex);
 488         sock_release(newsock);
 489
 490         if (result != -EAGAIN)
 491                 printk("dlm: error accepting connection from node: %d\n", result);
 492         return result;
 493 }
 494
 495 /* Connect a new socket to its peer */
 496 static void connect_to_sock(struct connection *con)
 497 {
 498         int result = -EHOSTUNREACH;
 499         struct sockaddr_storage saddr;
 500         int addr_len;
 501         struct socket *sock;
 502
 503         if (con->nodeid == 0) {
 504                 log_print("attempt to connect sock 0 foiled");
 505                 return;
 506         }
 507
 508         mutex_lock(&con->sock_mutex);
 509         if (con->retries++ > MAX_CONNECT_RETRIES)
 510                 goto out;
 511
 512         /* Some odd races can cause double-connects, ignore them */
 513         if (con->sock) {
 514                 result = 0;
 515                 goto out;
 516         }
 517
 518         /* Create a socket to communicate with */
 519         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM,
 520                                   IPPROTO_TCP, &sock);
 521         if (result < 0)
 522                 goto out_err;
 523
 524         memset(&saddr, 0, sizeof(saddr));
 525         if (dlm_nodeid_to_addr(con->nodeid, &saddr))
 526                 goto out_err;
 527
 528         sock->sk->sk_user_data = con;
 529         con->rx_action = receive_from_sock;
 530
 531         make_sockaddr(&saddr, dlm_config.ci_tcp_port, &addr_len);
 532
 533         add_sock(sock, con);
 534
 535         log_print("connecting to %d", con->nodeid);
 536         result =
 537                 sock->ops->connect(sock, (struct sockaddr *)&saddr, addr_len,
 538                                    O_NONBLOCK);
 539         if (result == -EINPROGRESS)
 540                 result = 0;
 541         if (result == 0)
 542                 goto out;
 543
 544 out_err:
 545         if (con->sock) {
 546                 sock_release(con->sock);
 547                 con->sock = NULL;
 548         }
 549         /*
 550          * Some errors are fatal and this list might need adjusting. For other
 551          * errors we try again until the max number of retries is reached.
 552          */
 553         if (result != -EHOSTUNREACH && result != -ENETUNREACH &&
 554             result != -ENETDOWN && result != EINVAL
 555             && result != -EPROTONOSUPPORT) {
 556                 lowcomms_connect_sock(con);
 557                 result = 0;
 558         }
 559 out:
 560         mutex_unlock(&con->sock_mutex);
 561         return;
 562 }
 563
 564 static struct socket *create_listen_sock(struct connection *con,
 565                                          struct sockaddr_storage *saddr)
 566 {
 567         struct socket *sock = NULL;
 568         mm_segment_t fs;
 569         int result = 0;
 570         int one = 1;
 571         int addr_len;
 572
 573         if (dlm_local_addr.ss_family == AF_INET)
 574                 addr_len = sizeof(struct sockaddr_in);
 575         else
 576                 addr_len = sizeof(struct sockaddr_in6);
 577
 578         /* Create a socket to communicate with */
 579         result = sock_create_kern(dlm_local_addr.ss_family, SOCK_STREAM, IPPROTO_TCP, &sock);
 580         if (result < 0) {
 581                 printk("dlm: Can't create listening comms socket\n");
 582                 goto create_out;
 583         }
 584
 585         fs = get_fs();
 586         set_fs(get_ds());
 587         result = sock_setsockopt(sock, SOL_SOCKET, SO_REUSEADDR,
 588                                  (char *)&one, sizeof(one));
 589         set_fs(fs);
 590         if (result < 0) {
 591                 printk("dlm: Failed to set SO_REUSEADDR on socket: result=%d\n",
 592                        result);
 593         }
 594         sock->sk->sk_user_data = con;
 595         con->rx_action = accept_from_sock;
 596         con->sock = sock;
 597
 598         /* Bind to our port */
 599         make_sockaddr(saddr, dlm_config.ci_tcp_port, &addr_len);
 600         result = sock->ops->bind(sock, (struct sockaddr *) saddr, addr_len);
 601         if (result < 0) {
 602                 printk("dlm: Can't bind to port %d\n", dlm_config.ci_tcp_port);
 603                 sock_release(sock);
 604                 sock = NULL;
 605                 con->sock = NULL;
 606                 goto create_out;
 607         }
 608
 609         fs = get_fs();
 610         set_fs(get_ds());
 611
 612         result = sock_setsockopt(sock, SOL_SOCKET, SO_KEEPALIVE,
 613                                  (char *)&one, sizeof(one));
 614         set_fs(fs);
 615         if (result < 0) {
 616                 printk("dlm: Set keepalive failed: %d\n", result);
 617         }
 618
 619         result = sock->ops->listen(sock, 5);
 620         if (result < 0) {
 621                 printk("dlm: Can't listen on port %d\n", dlm_config.ci_tcp_port);
 622                 sock_release(sock);
 623                 sock = NULL;
 624                 goto create_out;
 625         }
 626
 627 create_out:
 628         return sock;
 629 }
 630
 631
 632 /* Listen on all interfaces */
 633 static int listen_for_all(void)
 634 {
 635         struct socket *sock = NULL;
 636         struct connection *con = nodeid2con(0, GFP_KERNEL);
 637         int result = -EINVAL;
 638
 639         /* We don't support multi-homed hosts */
 640         set_bit(CF_IS_OTHERCON, &con->flags);
 641
 642         sock = create_listen_sock(con, &dlm_local_addr);
 643         if (sock) {
 644                 add_sock(sock, con);
 645                 result = 0;
 646         }
 647         else {
 648                 result = -EADDRINUSE;
 649         }
 650
 651         return result;
 652 }
 653
 654
 655
 656 static struct writequeue_entry *new_writequeue_entry(struct connection *con,
 657                                                      gfp_t allocation)
 658 {
 659         struct writequeue_entry *entry;
 660
 661         entry = kmalloc(sizeof(struct writequeue_entry), allocation);
 662         if (!entry)
 663                 return NULL;
 664
 665         entry->page = alloc_page(allocation);
 666         if (!entry->page) {
 667                 kfree(entry);
 668                 return NULL;
 669         }
 670
 671         entry->offset = 0;
 672         entry->len = 0;
 673         entry->end = 0;
 674         entry->users = 0;
 675         entry->con = con;
 676
 677         return entry;
 678 }
 679
 680 void *dlm_lowcomms_get_buffer(int nodeid, int len,
 681                               gfp_t allocation, char **ppc)
 682 {
 683         struct connection *con;
 684         struct writequeue_entry *e;
 685         int offset = 0;
 686         int users = 0;
 687
 688         con = nodeid2con(nodeid, allocation);
 689         if (!con)
 690                 return NULL;
 691
 692         spin_lock(&con->writequeue_lock);
 693         e = list_entry(con->writequeue.prev, struct writequeue_entry, list);
 694         if ((&e->list == &con->writequeue) ||
 695             (PAGE_CACHE_SIZE - e->end < len)) {
 696                 e = NULL;
 697         } else {
 698                 offset = e->end;
 699                 e->end += len;
 700                 users = e->users++;
 701         }
 702         spin_unlock(&con->writequeue_lock);
 703
 704         if (e) {
 705         got_one:
 706                 if (users == 0)
 707                         kmap(e->page);
 708                 *ppc = page_address(e->page) + offset;
 709                 return e;
 710         }
 711
 712         e = new_writequeue_entry(con, allocation);
 713         if (e) {
 714                 spin_lock(&con->writequeue_lock);
 715                 offset = e->end;
 716                 e->end += len;
 717                 users = e->users++;
 718                 list_add_tail(&e->list, &con->writequeue);
 719                 spin_unlock(&con->writequeue_lock);
 720                 goto got_one;
 721         }
 722         return NULL;
 723 }
 724
 725 void dlm_lowcomms_commit_buffer(void *mh)
 726 {
 727         struct writequeue_entry *e = (struct writequeue_entry *)mh;
 728         struct connection *con = e->con;
 729         int users;
 730
 731         spin_lock(&con->writequeue_lock);
 732         users = --e->users;
 733         if (users)
 734                 goto out;
 735         e->len = e->end - e->offset;
 736         kunmap(e->page);
 737         spin_unlock(&con->writequeue_lock);
 738
 739         if (!test_and_set_bit(CF_WRITE_PENDING, &con->flags)) {
 740                 queue_work(send_workqueue, &con->swork);
 741         }
 742         return;
 743
 744 out:
 745         spin_unlock(&con->writequeue_lock);
 746         return;
 747 }
 748
 749 static void free_entry(struct writequeue_entry *e)
 750 {
 751         __free_page(e->page);
 752         kfree(e);
 753 }
 754
/*
 * Send a message.
 *
 * Walks the connection's write queue from the head and pushes each
 * entry's committed bytes through the socket's sendpage operation.
 * Fully sent entries with no outstanding users are unlinked and freed.
 *
 * Stops quietly when sendpage returns 0 or -EAGAIN.  On any other
 * sendpage error the connection is closed and a reconnect is queued.  If
 * the connection has no socket, a connect is attempted instead.
 */
static void send_to_sock(struct connection *con)
{
        int ret = 0;
        ssize_t(*sendpage) (struct socket *, struct page *, int, size_t, int);
        const int msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL;
        struct writequeue_entry *e;
        int len, offset;

        mutex_lock(&con->sock_mutex);
        if (con->sock == NULL)
                goto out_connect;

        sendpage = con->sock->ops->sendpage;

        spin_lock(&con->writequeue_lock);
        for (;;) {
                /* Always look at the head; an empty list ends the loop */
                e = list_entry(con->writequeue.next, struct writequeue_entry,
                               list);
                if ((struct list_head *) e == &con->writequeue)
                        break;

                /* Snapshot under the lock; sendpage runs with it dropped */
                len = e->len;
                offset = e->offset;
                BUG_ON(len == 0 && e->users == 0);
                spin_unlock(&con->writequeue_lock);
                kmap(e->page);

                ret = 0;
                if (len) {
                        ret = sendpage(con->sock, e->page, offset, len,
                                       msg_flags);
                        /* writequeue_lock is not held on either exit below */
                        if (ret == -EAGAIN || ret == 0)
                                goto out;
                        if (ret <= 0)
                                goto send_error;
                }
                else {
                        /* Don't starve people filling buffers */
                        cond_resched();
                }

                spin_lock(&con->writequeue_lock);
                /* Advance past the bytes just sent (ret is 0 if none) */
                e->offset += ret;
                e->len -= ret;

                if (e->len == 0 && e->users == 0) {
                        list_del(&e->list);
                        kunmap(e->page);
                        free_entry(e);
                        continue;
                }
        }
        spin_unlock(&con->writequeue_lock);
out:
        mutex_unlock(&con->sock_mutex);
        return;

send_error:
        mutex_unlock(&con->sock_mutex);
        close_connection(con, false);
        lowcomms_connect_sock(con);
        return;

out_connect:
        mutex_unlock(&con->sock_mutex);
        connect_to_sock(con);
        return;
}
 824
 825 static void clean_one_writequeue(struct connection *con)
 826 {
 827         struct list_head *list;
 828         struct list_head *temp;
 829
 830         spin_lock(&con->writequeue_lock);
 831         list_for_each_safe(list, temp, &con->writequeue) {
 832                 struct writequeue_entry *e =
 833                         list_entry(list, struct writequeue_entry, list);
 834                 list_del(&e->list);
 835                 free_entry(e);
 836         }
 837         spin_unlock(&con->writequeue_lock);
 838 }
 839
 840 /* Called from recovery when it knows that a node has
 841    left the cluster */
 842 int dlm_lowcomms_close(int nodeid)
 843 {
 844         struct connection *con;
 845
 846         if (!connections)
 847                 goto out;
 848
 849         log_print("closing connection to node %d", nodeid);
 850         con = nodeid2con(nodeid, 0);
 851         if (con) {
 852                 clean_one_writequeue(con);
 853                 close_connection(con, true);
 854         }
 855         return 0;
 856
 857 out:
 858         return -1;
 859 }
 860
 861 /* Look for activity on active sockets */
 862 static void process_recv_sockets(struct work_struct *work)
 863 {
 864         struct connection *con = container_of(work, struct connection, rwork);
 865         int err;
 866
 867         clear_bit(CF_READ_PENDING, &con->flags);
 868         do {
 869                 err = con->rx_action(con);
 870         } while (!err);
 871 }
 872
 873
 874 static void process_send_sockets(struct work_struct *work)
 875 {
 876         struct connection *con = container_of(work, struct connection, swork);
 877
 878         if (test_and_clear_bit(CF_CONNECT_PENDING, &con->flags)) {
 879                 connect_to_sock(con);
 880         }
 881
 882         clear_bit(CF_WRITE_PENDING, &con->flags);
 883         send_to_sock(con);
 884 }
 885
 886
 887 /* Discard all entries on the write queues */
 888 static void clean_writequeues(void)
 889 {
 890         int nodeid;
 891
 892         for (nodeid = 1; nodeid < conn_array_size; nodeid++) {
 893                 struct connection *con = nodeid2con(nodeid, 0);
 894
 895                 if (con)
 896                         clean_one_writequeue(con);
 897         }
 898 }
 899
 900 static void work_stop(void)
 901 {
 902         destroy_workqueue(recv_workqueue);
 903         destroy_workqueue(send_workqueue);
 904 }
 905
 906 static int work_start(void)
 907 {
 908         int error;
 909         recv_workqueue = create_workqueue("dlm_recv");
 910         error = IS_ERR(recv_workqueue);
 911         if (error) {
 912                 log_print("can't start dlm_recv %d", error);
 913                 return error;
 914         }
 915
 916         send_workqueue = create_singlethread_workqueue("dlm_send");
 917         error = IS_ERR(send_workqueue);
 918         if (error) {
 919                 log_print("can't start dlm_send %d", error);
 920                 destroy_workqueue(recv_workqueue);
 921                 return error;
 922         }
 923
 924         return 0;
 925 }
 926
 927 void dlm_lowcomms_stop(void)
 928 {
 929         int i;
 930
 931         /* Set all the flags to prevent any
 932            socket activity.
 933         */
 934         for (i = 0; i < conn_array_size; i++) {
 935                 if (connections[i])
 936                         connections[i]->flags |= 0xFF;
 937         }
 938
 939         work_stop();
 940         clean_writequeues();
 941
 942         for (i = 0; i < conn_array_size; i++) {
 943                 if (connections[i]) {
 944                         close_connection(connections[i], true);
 945                         if (connections[i]->othercon)
 946                                 kmem_cache_free(con_cache, connections[i]->othercon);
 947                         kmem_cache_free(con_cache, connections[i]);
 948                 }
 949         }
 950
 951         kfree(connections);
 952         connections = NULL;
 953
 954         kmem_cache_destroy(con_cache);
 955 }
 956
 957 /* This is quite likely to sleep... */
 958 int dlm_lowcomms_start(void)
 959 {
 960         int error = 0;
 961
 962         error = -ENOMEM;
 963         connections = kzalloc(sizeof(struct connection *) *
 964                               NODE_INCREMENT, GFP_KERNEL);
 965         if (!connections)
 966                 goto out;
 967
 968         conn_array_size = NODE_INCREMENT;
 969
 970         if (dlm_our_addr(&dlm_local_addr, 0)) {
 971                 log_print("no local IP address has been set");
 972                 goto fail_free_conn;
 973         }
 974         if (!dlm_our_addr(&dlm_local_addr, 1)) {
 975                 log_print("This dlm comms module does not support multi-homed clustering");
 976                 goto fail_free_conn;
 977         }
 978
 979         con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection),
 980                                       __alignof__(struct connection), 0,
 981                                       NULL, NULL);
 982         if (!con_cache)
 983                 goto fail_free_conn;
 984
 985
 986         /* Start listening */
 987         error = listen_for_all();
 988         if (error)
 989                 goto fail_unlisten;
 990
 991         error = work_start();
 992         if (error)
 993                 goto fail_unlisten;
 994
 995         return 0;
 996
 997 fail_unlisten:
 998         close_connection(connections[0], false);
 999         kmem_cache_free(con_cache, connections[0]);
1000         kmem_cache_destroy(con_cache);
1001
1002 fail_free_conn:
1003         kfree(connections);
1004
1005 out:
1006         return error;
1007 }
1008
1009 /*
1010  * Overrides for Emacs so that we follow Linus's tabbing style.
1011  * Emacs will notice this stuff at the end of the file and automatically
1012  * adjust the settings for this buffer only.  This must remain at the end
1013  * of the file.
1014  * ---------------------------------------------------------------------------
1015  * Local variables:
1016  * c-file-style: "linux"
1017  * End:
1018  */