]> err.no Git - linux-2.6/commitdiff
[PATCH] S2io: Large Receive Offload (LRO) feature(v2) for Neterion (s2io) 10GbE Xfram...
authorRavinandan Arakali <Ravinandan.Arakali@neterion.com>
Wed, 25 Jan 2006 19:53:07 +0000 (14:53 -0500)
committerJeff Garzik <jgarzik@pobox.com>
Fri, 27 Jan 2006 15:34:38 +0000 (10:34 -0500)
Hi,
Below is a patch for the Large Receive Offload feature.
Please review and let us know your comments.

LRO algorithm was described in an OLS 2005 presentation, located at
ftp.s2io.com
user: linuxdocs
password: HALdocs

The same ftp site has Programming Manual for Xframe-I ASIC.
LRO feature is supported on Neterion Xframe-I, Xframe-II and
Xframe-Express 10GbE NICs.

Brief description:
The Large Receive Offload (LRO) feature is a stateless offload
that is complementary to the TSO feature, but on the receive path.
The idea is to combine and collapse (up to 64K maximum), in the
driver, in-sequence TCP packets belonging to the same session.
It is mainly designed to improve 1500 MTU receive performance,
since jumbo frame performance is already close to 10GbE line
rate. Some performance numbers are attached below.

Implementation details:
1. Handle packet chains from multiple sessions (current default
MAX_LRO_SESSIONS=32).
2. Examine each packet for eligibility to aggregate. A packet is
considered eligible if it meets all the below criteria.
  a. It is a TCP/IP packet and L2 type is not LLC or SNAP.
  b. The packet has no checksum errors(L3 and L4).
  c. There are no IP options. The only TCP option supported is timestamps.
  d. Search and locate the LRO object corresponding to this
     socket and ensure packet is in TCP sequence.
  e. It's not a special packet(SYN, FIN, RST, URG, PSH etc. flags are not set).
  f. TCP payload is non-zero(It's not a pure ACK).
  g. It's not an IP-fragmented packet.
3. If a packet is found eligible, the LRO object is updated with
   information such as next sequence number expected, current length
   of aggregated packet and so on. If not eligible or max packets
   reached, update IP and TCP headers of first packet in the chain
   and pass it up to stack.
4. The frag_list in skb structure is used to chain packets into one
   large packet.

Kernel changes required: None

Performance results:
Main focus of the initial testing was on 1500 mtu receiver, since this
is a bottleneck not covered by the existing stateless offloads.

There are a couple of disclaimers about the performance results below:
1. Your mileage will vary!!!! We initially concentrated on a couple of PCI-X
2.0 platforms that are powerful enough to push a 10 GbE NIC and do not
have bottlenecks other than cpu%;  testing on other platforms is still
in progress. On some lower end systems we are seeing lower gains.

2. Current LRO implementation is still (for the most part) software based,
and therefore performance potential of the feature is far from being realized.
Full hw implementation of LRO is expected in the next version of Xframe ASIC.

Performance delta(with MTU=1500) going from LRO disabled to enabled:
IBM 2-way Xeon (x366) : 3.5 to 7.1 Gbps
2-way Opteron : 4.5 to 6.1 Gbps

Signed-off-by: Ravinandan Arakali <ravinandan.arakali@neterion.com>
Signed-off-by: Jeff Garzik <jgarzik@pobox.com>
drivers/net/s2io.c
drivers/net/s2io.h

index 49b597cbc19a076e1ed2e737ff68b7b653922044..4e392914971e2389654859b6ca78df93ed6b6d63 100644 (file)
@@ -57,6 +57,9 @@
 #include <linux/ethtool.h>
 #include <linux/workqueue.h>
 #include <linux/if_vlan.h>
+#include <linux/ip.h>
+#include <linux/tcp.h>
+#include <net/tcp.h>
 
 #include <asm/system.h>
 #include <asm/uaccess.h>
@@ -66,7 +69,7 @@
 #include "s2io.h"
 #include "s2io-regs.h"
 
-#define DRV_VERSION "Version 2.0.9.4"
+#define DRV_VERSION "2.0.11.2"
 
 /* S2io Driver name & version. */
 static char s2io_driver_name[] = "Neterion";
@@ -168,6 +171,11 @@ static char ethtool_stats_keys[][ETH_GSTRING_LEN] = {
        {"\n DRIVER STATISTICS"},
        {"single_bit_ecc_errs"},
        {"double_bit_ecc_errs"},
+       ("lro_aggregated_pkts"),
+       ("lro_flush_both_count"),
+       ("lro_out_of_sequence_pkts"),
+       ("lro_flush_due_to_max_pkts"),
+       ("lro_avg_aggr_pkts"),
 };
 
 #define S2IO_STAT_LEN sizeof(ethtool_stats_keys)/ ETH_GSTRING_LEN
@@ -317,6 +325,12 @@ static unsigned int indicate_max_pkts;
 static unsigned int rxsync_frequency = 3;
 /* Interrupt type. Values can be 0(INTA), 1(MSI), 2(MSI_X) */
 static unsigned int intr_type = 0;
+/* Large receive offload feature */
+static unsigned int lro = 0;
+/* Max pkts to be aggregated by LRO at one time. If not specified,
+ * aggregation happens until we hit max IP pkt size(64K)
+ */
+static unsigned int lro_max_pkts = 0xFFFF;
 
 /*
  * S2IO device table.
@@ -1476,6 +1490,19 @@ static int init_nic(struct s2io_nic *nic)
        writel((u32) (val64 >> 32), (add + 4));
        val64 = readq(&bar0->mac_cfg);
 
+       /* Enable FCS stripping by adapter */
+       add = &bar0->mac_cfg;
+       val64 = readq(&bar0->mac_cfg);
+       val64 |= MAC_CFG_RMAC_STRIP_FCS;
+       if (nic->device_type == XFRAME_II_DEVICE)
+               writeq(val64, &bar0->mac_cfg);
+       else {
+               writeq(RMAC_CFG_KEY(0x4C0D), &bar0->rmac_cfg_key);
+               writel((u32) (val64), add);
+               writeq(RMAC_CFG_KEY(0x4C0D), &bar0->rmac_cfg_key);
+               writel((u32) (val64 >> 32), (add + 4));
+       }
+
        /*
         * Set the time value to be inserted in the pause frame
         * generated by xena.
@@ -2569,6 +2596,8 @@ static void rx_intr_handler(ring_info_t *ring_data)
 #ifndef CONFIG_S2IO_NAPI
        int pkt_cnt = 0;
 #endif
+       int i;
+
        spin_lock(&nic->rx_lock);
        if (atomic_read(&nic->card_state) == CARD_DOWN) {
                DBG_PRINT(INTR_DBG, "%s: %s going down for reset\n",
@@ -2661,6 +2690,18 @@ static void rx_intr_handler(ring_info_t *ring_data)
                        break;
 #endif
        }
+       if (nic->lro) {
+               /* Clear all LRO sessions before exiting */
+               for (i=0; i<MAX_LRO_SESSIONS; i++) {
+                       lro_t *lro = &nic->lro0_n[i];
+                       if (lro->in_use) {
+                               update_L3L4_header(nic, lro);
+                               queue_rx_frame(lro->parent);
+                               clear_lro_session(lro);
+                       }
+               }
+       }
+
        spin_unlock(&nic->rx_lock);
 }
 
@@ -3668,23 +3709,32 @@ s2io_msi_handle(int irq, void *dev_id, struct pt_regs *regs)
         * else schedule a tasklet to reallocate the buffers.
         */
        for (i = 0; i < config->rx_ring_num; i++) {
-               int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
-               int level = rx_buffer_level(sp, rxb_size, i);
-
-               if ((level == PANIC) && (!TASKLET_IN_USE)) {
-                       DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", dev->name);
-                       DBG_PRINT(INTR_DBG, "PANIC levels\n");
-                       if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
-                               DBG_PRINT(ERR_DBG, "%s:Out of memory",
-                                         dev->name);
-                               DBG_PRINT(ERR_DBG, " in ISR!!\n");
+               if (!sp->lro) {
+                       int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
+                       int level = rx_buffer_level(sp, rxb_size, i);
+
+                       if ((level == PANIC) && (!TASKLET_IN_USE)) {
+                               DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", 
+                                                       dev->name);
+                               DBG_PRINT(INTR_DBG, "PANIC levels\n");
+                               if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
+                                       DBG_PRINT(ERR_DBG, "%s:Out of memory",
+                                                 dev->name);
+                                       DBG_PRINT(ERR_DBG, " in ISR!!\n");
+                                       clear_bit(0, (&sp->tasklet_status));
+                                       atomic_dec(&sp->isr_cnt);
+                                       return IRQ_HANDLED;
+                               }
                                clear_bit(0, (&sp->tasklet_status));
-                               atomic_dec(&sp->isr_cnt);
-                               return IRQ_HANDLED;
+                       } else if (level == LOW) {
+                               tasklet_schedule(&sp->task);
                        }
-                       clear_bit(0, (&sp->tasklet_status));
-               } else if (level == LOW) {
-                       tasklet_schedule(&sp->task);
+               }
+               else if (fill_rx_buffers(sp, i) == -ENOMEM) {
+                               DBG_PRINT(ERR_DBG, "%s:Out of memory",
+                                                       dev->name);
+                               DBG_PRINT(ERR_DBG, " in Rx Intr!!\n");
+                               break;
                }
        }
 
@@ -3697,29 +3747,37 @@ s2io_msix_ring_handle(int irq, void *dev_id, struct pt_regs *regs)
 {
        ring_info_t *ring = (ring_info_t *)dev_id;
        nic_t *sp = ring->nic;
+       struct net_device *dev = (struct net_device *) dev_id;
        int rxb_size, level, rng_n;
 
        atomic_inc(&sp->isr_cnt);
        rx_intr_handler(ring);
 
        rng_n = ring->ring_no;
-       rxb_size = atomic_read(&sp->rx_bufs_left[rng_n]);
-       level = rx_buffer_level(sp, rxb_size, rng_n);
-
-       if ((level == PANIC) && (!TASKLET_IN_USE)) {
-               int ret;
-               DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", __FUNCTION__);
-               DBG_PRINT(INTR_DBG, "PANIC levels\n");
-               if ((ret = fill_rx_buffers(sp, rng_n)) == -ENOMEM) {
-                       DBG_PRINT(ERR_DBG, "Out of memory in %s",
-                                 __FUNCTION__);
+       if (!sp->lro) {
+               rxb_size = atomic_read(&sp->rx_bufs_left[rng_n]);
+               level = rx_buffer_level(sp, rxb_size, rng_n);
+
+               if ((level == PANIC) && (!TASKLET_IN_USE)) {
+                       int ret;
+                       DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", __FUNCTION__);
+                       DBG_PRINT(INTR_DBG, "PANIC levels\n");
+                       if ((ret = fill_rx_buffers(sp, rng_n)) == -ENOMEM) {
+                               DBG_PRINT(ERR_DBG, "Out of memory in %s",
+                                         __FUNCTION__);
+                               clear_bit(0, (&sp->tasklet_status));
+                               return IRQ_HANDLED;
+                       }
                        clear_bit(0, (&sp->tasklet_status));
-                       return IRQ_HANDLED;
+               } else if (level == LOW) {
+                       tasklet_schedule(&sp->task);
                }
-               clear_bit(0, (&sp->tasklet_status));
-       } else if (level == LOW) {
-               tasklet_schedule(&sp->task);
        }
+       else if (fill_rx_buffers(sp, rng_n) == -ENOMEM) {
+                       DBG_PRINT(ERR_DBG, "%s:Out of memory", dev->name);
+                       DBG_PRINT(ERR_DBG, " in Rx Intr!!\n");
+       }
+
        atomic_dec(&sp->isr_cnt);
 
        return IRQ_HANDLED;
@@ -3875,24 +3933,33 @@ static irqreturn_t s2io_isr(int irq, void *dev_id, struct pt_regs *regs)
         */
 #ifndef CONFIG_S2IO_NAPI
        for (i = 0; i < config->rx_ring_num; i++) {
-               int ret;
-               int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
-               int level = rx_buffer_level(sp, rxb_size, i);
-
-               if ((level == PANIC) && (!TASKLET_IN_USE)) {
-                       DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", dev->name);
-                       DBG_PRINT(INTR_DBG, "PANIC levels\n");
-                       if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
-                               DBG_PRINT(ERR_DBG, "%s:Out of memory",
-                                         dev->name);
-                               DBG_PRINT(ERR_DBG, " in ISR!!\n");
+               if (!sp->lro) {
+                       int ret;
+                       int rxb_size = atomic_read(&sp->rx_bufs_left[i]);
+                       int level = rx_buffer_level(sp, rxb_size, i);
+
+                       if ((level == PANIC) && (!TASKLET_IN_USE)) {
+                               DBG_PRINT(INTR_DBG, "%s: Rx BD hit ", 
+                                                       dev->name);
+                               DBG_PRINT(INTR_DBG, "PANIC levels\n");
+                               if ((ret = fill_rx_buffers(sp, i)) == -ENOMEM) {
+                                       DBG_PRINT(ERR_DBG, "%s:Out of memory",
+                                                 dev->name);
+                                       DBG_PRINT(ERR_DBG, " in ISR!!\n");
+                                       clear_bit(0, (&sp->tasklet_status));
+                                       atomic_dec(&sp->isr_cnt);
+                                       return IRQ_HANDLED;
+                               }
                                clear_bit(0, (&sp->tasklet_status));
-                               atomic_dec(&sp->isr_cnt);
-                               return IRQ_HANDLED;
+                       } else if (level == LOW) {
+                               tasklet_schedule(&sp->task);
                        }
-                       clear_bit(0, (&sp->tasklet_status));
-               } else if (level == LOW) {
-                       tasklet_schedule(&sp->task);
+               }
+               else if (fill_rx_buffers(sp, i) == -ENOMEM) {
+                               DBG_PRINT(ERR_DBG, "%s:Out of memory",
+                                                       dev->name);
+                               DBG_PRINT(ERR_DBG, " in Rx intr!!\n");
+                               break;
                }
        }
 #endif
@@ -5134,6 +5201,16 @@ static void s2io_get_ethtool_stats(struct net_device *dev,
        tmp_stats[i++] = 0;
        tmp_stats[i++] = stat_info->sw_stat.single_ecc_errs;
        tmp_stats[i++] = stat_info->sw_stat.double_ecc_errs;
+       tmp_stats[i++] = stat_info->sw_stat.clubbed_frms_cnt;
+       tmp_stats[i++] = stat_info->sw_stat.sending_both;
+       tmp_stats[i++] = stat_info->sw_stat.outof_sequence_pkts;
+       tmp_stats[i++] = stat_info->sw_stat.flush_max_pkts;
+       if (stat_info->sw_stat.num_aggregations)
+               tmp_stats[i++] = stat_info->sw_stat.sum_avg_pkts_aggregated /
+                                       stat_info->sw_stat.num_aggregations;
+       else
+               tmp_stats[i++] = 0;
+
 }
 
 static int s2io_ethtool_get_regs_len(struct net_device *dev)
@@ -5515,6 +5592,14 @@ static int s2io_card_up(nic_t * sp)
        /* Setting its receive mode */
        s2io_set_multicast(dev);
 
+       if (sp->lro) {
+               /* Initialize max aggregatable pkts based on MTU */
+               sp->lro_max_aggr_per_sess = ((1<<16) - 1) / dev->mtu;
+               /* Check if we can use(if specified) user provided value */
+               if (lro_max_pkts < sp->lro_max_aggr_per_sess)
+                       sp->lro_max_aggr_per_sess = lro_max_pkts;
+       }
+
        /* Enable tasklet for the device */
        tasklet_init(&sp->task, s2io_tasklet, (unsigned long) dev);
 
@@ -5607,6 +5692,7 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
                ((unsigned long) rxdp->Host_Control);
        int ring_no = ring_data->ring_no;
        u16 l3_csum, l4_csum;
+       lro_t *lro;
 
        skb->dev = dev;
        if (rxdp->Control_1 & RXD_T_CODE) {
@@ -5655,7 +5741,8 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
                        skb_put(skb, buf2_len);
        }
 
-       if ((rxdp->Control_1 & TCP_OR_UDP_FRAME) &&
+       if ((rxdp->Control_1 & TCP_OR_UDP_FRAME) && ((!sp->lro) ||
+           (sp->lro && (!(rxdp->Control_1 & RXD_FRAME_IP_FRAG)))) &&
            (sp->rx_csum)) {
                l3_csum = RXD_GET_L3_CKSUM(rxdp->Control_1);
                l4_csum = RXD_GET_L4_CKSUM(rxdp->Control_1);
@@ -5666,6 +5753,54 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
                         * a flag in the RxD.
                         */
                        skb->ip_summed = CHECKSUM_UNNECESSARY;
+                       if (sp->lro) {
+                               u32 tcp_len;
+                               u8 *tcp;
+                               int ret = 0;
+
+                               ret = s2io_club_tcp_session(skb->data, &tcp,
+                                               &tcp_len, &lro, rxdp, sp);
+                               switch (ret) {
+                                       case 3: /* Begin anew */
+                                               lro->parent = skb;
+                                               goto aggregate;
+                                       case 1: /* Aggregate */
+                                       {
+                                               lro_append_pkt(sp, lro,
+                                                       skb, tcp_len);
+                                               goto aggregate;
+                                       }
+                                       case 4: /* Flush session */
+                                       {
+                                               lro_append_pkt(sp, lro,
+                                                       skb, tcp_len);
+                                               queue_rx_frame(lro->parent);
+                                               clear_lro_session(lro);
+                                               sp->mac_control.stats_info->
+                                                   sw_stat.flush_max_pkts++;
+                                               goto aggregate;
+                                       }
+                                       case 2: /* Flush both */
+                                               lro->parent->data_len =
+                                                       lro->frags_len;
+                                               sp->mac_control.stats_info->
+                                                    sw_stat.sending_both++;
+                                               queue_rx_frame(lro->parent);
+                                               clear_lro_session(lro);
+                                               goto send_up;
+                                       case 0: /* sessions exceeded */
+                                       case 5: /*
+                                                * First pkt in session not
+                                                * L3/L4 aggregatable
+                                                */
+                                               break;
+                                       default:
+                                               DBG_PRINT(ERR_DBG,
+                                                       "%s: Samadhana!!\n",
+                                                        __FUNCTION__);
+                                               BUG();
+                               }
+                       }
                } else {
                        /*
                         * Packet with erroneous checksum, let the
@@ -5677,25 +5812,31 @@ static int rx_osm_handler(ring_info_t *ring_data, RxD_t * rxdp)
                skb->ip_summed = CHECKSUM_NONE;
        }
 
-       skb->protocol = eth_type_trans(skb, dev);
+       if (!sp->lro) {
+               skb->protocol = eth_type_trans(skb, dev);
 #ifdef CONFIG_S2IO_NAPI
-       if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
-               /* Queueing the vlan frame to the upper layer */
-               vlan_hwaccel_receive_skb(skb, sp->vlgrp,
-                       RXD_GET_VLAN_TAG(rxdp->Control_2));
-       } else {
-               netif_receive_skb(skb);
-       }
+               if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
+                       /* Queueing the vlan frame to the upper layer */
+                       vlan_hwaccel_receive_skb(skb, sp->vlgrp,
+                               RXD_GET_VLAN_TAG(rxdp->Control_2));
+               } else {
+                       netif_receive_skb(skb);
+               }
 #else
-       if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
-               /* Queueing the vlan frame to the upper layer */
-               vlan_hwaccel_rx(skb, sp->vlgrp,
-                       RXD_GET_VLAN_TAG(rxdp->Control_2));
-       } else {
-               netif_rx(skb);
-       }
+               if (sp->vlgrp && RXD_GET_VLAN_TAG(rxdp->Control_2)) {
+                       /* Queueing the vlan frame to the upper layer */
+                       vlan_hwaccel_rx(skb, sp->vlgrp,
+                               RXD_GET_VLAN_TAG(rxdp->Control_2));
+               } else {
+                       netif_rx(skb);
+               }
 #endif
+       } else {
+send_up:
+               queue_rx_frame(skb);
+       }               
        dev->last_rx = jiffies;
+aggregate:
        atomic_dec(&sp->rx_bufs_left[ring_no]);
        return SUCCESS;
 }
@@ -5807,6 +5948,8 @@ module_param(indicate_max_pkts, int, 0);
 #endif
 module_param(rxsync_frequency, int, 0);
 module_param(intr_type, int, 0);
+module_param(lro, int, 0);
+module_param(lro_max_pkts, int, 0);
 
 /**
  *  s2io_init_nic - Initialization of the adapter .
@@ -5938,6 +6081,7 @@ Defaulting to INTA\n");
        else
                sp->device_type = XFRAME_I_DEVICE;
 
+       sp->lro = lro;
                
        /* Initialize some PCI/PCI-X fields of the NIC. */
        s2io_init_pci(sp);
@@ -6241,6 +6385,10 @@ Defaulting to INTA\n");
                DBG_PRINT(ERR_DBG, "%s: 3-Buffer mode support has been "
                          "enabled\n",dev->name);
 
+       if (sp->lro)
+               DBG_PRINT(ERR_DBG, "%s: Large receive offload enabled\n",
+                       dev->name);
+
        /* Initialize device name */
        strcpy(sp->name, dev->name);
        if (sp->device_type & XFRAME_II_DEVICE)
@@ -6351,3 +6499,317 @@ void s2io_closer(void)
 
 module_init(s2io_starter);
 module_exit(s2io_closer);
+
+/*
+ * check_L2_lro_capable - L2-level eligibility check for LRO.
+ * @buffer: start of the received frame (L2 header)
+ * @ip:     out; set to the IP header located inside @buffer
+ * @tcp:    out; set to the TCP header that follows the IP header
+ * @rxdp:   Rx descriptor; Control_1 supplies the TCP flag and the L2 type
+ *
+ * Returns 0 and fills in @ip and @tcp when the descriptor marks the frame
+ * as TCP and its L2 type is DIX (with or without VLAN).  Returns -1
+ * otherwise, in which case @ip and @tcp are left untouched.
+ */
+static int check_L2_lro_capable(u8 *buffer, struct iphdr **ip,
+               struct tcphdr **tcp, RxD_t *rxdp)
+{
+       u8 l2_type = (u8)((rxdp->Control_1 >> 37) & 0x7);
+       struct iphdr *iph;
+
+       if (!(rxdp->Control_1 & RXD_FRAME_PROTO_TCP)) {
+               DBG_PRINT(INIT_DBG,"%s: Non-TCP frames not supported for LRO\n",
+                         __FUNCTION__);
+               return -1;
+       }
+
+       /* TODO:
+        * By default the VLAN field in the MAC is stripped by the card, if this
+        * feature is turned off in rx_pa_cfg register, then the ip_off field
+        * has to be shifted by a further 2 bytes
+        */
+
+       /*
+        * Only DIX (0) and DIX with VLAN (4) are mergeable; LLC, SNAP etc
+        * are considered non-mergeable.
+        */
+       if (l2_type != 0 && l2_type != 4)
+               return -1;
+
+       iph = (struct iphdr *)(buffer + HEADER_ETHERNET_II_802_3_SIZE);
+       *ip = iph;
+       /* ihl counts 32-bit words, so shift to get the header size in bytes */
+       *tcp = (struct tcphdr *)((u8 *)iph + (iph->ihl << 2));
+
+       return 0;
+}
+
+/*
+ * check_for_socket_match - compare a packet's 4-tuple with an LRO session.
+ * @lro: session whose stored IP/TCP headers are compared against
+ * @ip:  IP header of the incoming packet
+ * @tcp: TCP header of the incoming packet
+ *
+ * Returns 0 when source/destination address and source/destination port
+ * all match the session, -1 otherwise.
+ */
+static int check_for_socket_match(lro_t *lro, struct iphdr *ip,
+                                 struct tcphdr *tcp)
+{
+       int same_addrs, same_ports;
+
+       DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
+
+       same_addrs = (lro->iph->saddr == ip->saddr) &&
+                    (lro->iph->daddr == ip->daddr);
+       same_ports = (lro->tcph->source == tcp->source) &&
+                    (lro->tcph->dest == tcp->dest);
+
+       return (same_addrs && same_ports) ? 0 : -1;
+}
+
+/*
+ * get_l4_pyld_length - TCP payload size of a packet, in bytes.
+ *
+ * Computed as the IP total length minus the IP and TCP header lengths
+ * (ihl and doff are both expressed in 32-bit words).
+ */
+static inline int get_l4_pyld_length(struct iphdr *ip, struct tcphdr *tcp)
+{
+       int ip_hdr_len = ip->ihl << 2;
+       int tcp_hdr_len = tcp->doff << 2;
+
+       return ntohs(ip->tot_len) - ip_hdr_len - tcp_hdr_len;
+}
+
+/*
+ * initiate_new_session - start an LRO session from its first packet.
+ * @lro:          free session slot to populate
+ * @l2h:          start of the first packet's L2 header
+ * @ip:           IP header of the first packet
+ * @tcp:          TCP header of the first packet
+ * @tcp_pyld_len: TCP payload length of the first packet, in bytes
+ *
+ * Records the header pointers, the next expected sequence number (host
+ * byte order) and the values that update_L3L4_header() later writes back
+ * into the first packet's headers, then marks the slot in use.
+ */
+static void initiate_new_session(lro_t *lro, u8 *l2h,
+                    struct iphdr *ip, struct tcphdr *tcp, u32 tcp_pyld_len)
+{
+       DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
+       lro->l2h = l2h;
+       lro->iph = ip;
+       lro->tcph = tcp;
+       lro->tcp_next_seq = tcp_pyld_len + ntohl(tcp->seq);
+       /*
+        * tcp_ack and window are copied verbatim into the TCP header by
+        * update_L3L4_header(), and aggregate_new_rx() stores them exactly
+        * as they appear on the wire.  Seed both the same way here so that
+        * a session flushed after a single packet gets its own ack number
+        * and window written back, rather than a byte-swapped ack and a
+        * zero window.
+        */
+       lro->tcp_ack = tcp->ack_seq;
+       lro->window = tcp->window;
+       lro->sg_num = 1;
+       lro->total_len = ntohs(ip->tot_len);
+       lro->frags_len = 0;
+       /* 
+        * check if we saw TCP timestamp. Other consistency checks have
+        * already been done.
+        */
+       if (tcp->doff == 8) {
+               u32 *ptr;
+               ptr = (u32 *)(tcp+1);
+               lro->saw_ts = 1;
+               /* words 1 and 2 of the option area: tsval, then tsecr */
+               lro->cur_tsval = *(ptr+1);
+               lro->cur_tsecr = *(ptr+2);
+       }
+       lro->in_use = 1;
+}
+
+/*
+ * update_L3L4_header - finalize the headers of an aggregated packet.
+ * @sp:  adapter private data (for the software statistics block)
+ * @lro: session about to be flushed
+ *
+ * Rewrites the first packet's IP total length and checksum, copies the
+ * session's ack number, window and (if timestamps are in use) tsecr into
+ * its TCP header, and accounts the flush in the aggregation statistics.
+ */
+static void update_L3L4_header(nic_t *sp, lro_t *lro)
+{
+       StatInfo_t *stats = sp->mac_control.stats_info;
+       struct tcphdr *tcph = lro->tcph;
+       struct iphdr *iph = lro->iph;
+
+       DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
+
+       /* L3: new total length, then checksum over the zeroed check field */
+       iph->tot_len = htons(lro->total_len);
+       iph->check = 0;
+       iph->check = ip_fast_csum((u8 *)iph, iph->ihl);
+
+       /* L4: values recorded for this session */
+       tcph->window = lro->window;
+       tcph->ack_seq = lro->tcp_ack;
+
+       /* tsecr lives in word 2 of the option area when timestamps are on */
+       if (lro->saw_ts) {
+               u32 *tsopt = (u32 *)(tcph + 1);
+
+               tsopt[2] = lro->cur_tsecr;
+       }
+
+       /* Inputs for the average-packets-per-aggregation statistic */
+       stats->sw_stat.num_aggregations++;
+       stats->sw_stat.sum_avg_pkts_aggregated += lro->sg_num;
+}
+
+/*
+ * aggregate_new_rx - account an in-sequence packet in an LRO session.
+ * @lro:     session the packet belongs to
+ * @ip:      IP header of the new packet (not used here)
+ * @tcp:     TCP header of the new packet
+ * @l4_pyld: TCP payload length of the new packet, in bytes
+ *
+ * Only the session bookkeeping is updated; chaining the skb is done
+ * separately by lro_append_pkt().
+ */
+static void aggregate_new_rx(lro_t *lro, struct iphdr *ip,
+               struct tcphdr *tcp, u32 l4_pyld)
+{
+       DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
+
+       /* One more packet, l4_pyld more bytes of payload */
+       lro->sg_num++;
+       lro->tcp_next_seq += l4_pyld;
+       lro->frags_len += l4_pyld;
+       lro->total_len += l4_pyld;
+
+       /* The latest packet supplies the ack number and window */
+       lro->window = tcp->window;
+       lro->tcp_ack = tcp->ack_seq;
+
+       if (lro->saw_ts) {
+               /* Track tsval and tsecr of the latest packet */
+               u32 *tsopt = (u32 *)(tcp + 1);
+
+               lro->cur_tsecr = tsopt[2];
+               lro->cur_tsval = tsopt[1];
+       }
+}
+
+/*
+ * verify_l3_l4_lro_capable - L3/L4 eligibility check for LRO.
+ * @l_lro:        matching session, or NULL when checking a packet that
+ *                would start a new session
+ * @ip:           IP header of the packet
+ * @tcp:          TCP header of the packet
+ * @tcp_pyld_len: TCP payload length of the packet, in bytes
+ *
+ * Returns 0 when the packet may be aggregated, -1 otherwise.  A packet is
+ * rejected when it carries no payload, has IP options, has any TCP flag
+ * other than ACK set (or ACK clear), or carries TCP options other than a
+ * single timestamp option.
+ */
+static int verify_l3_l4_lro_capable(lro_t *l_lro, struct iphdr *ip,
+                                   struct tcphdr *tcp, u32 tcp_pyld_len)
+{
+       u8 *opt;
+
+       DBG_PRINT(INFO_DBG,"%s: Been here...\n", __FUNCTION__);
+
+       if (!tcp_pyld_len) {
+               /* Runt frame or a pure ack */
+               return -1;
+       }
+
+       if (ip->ihl != 5) /* IP has options */
+               return -1;
+
+       if (tcp->urg || tcp->psh || tcp->rst || tcp->syn || tcp->fin ||
+                                                               !tcp->ack) {
+               /*
+                * Currently recognize only the ack control word and
+                * any other control field being set would result in
+                * flushing the LRO session
+                */
+               return -1;
+       }
+
+       /* 
+        * Allow only one TCP timestamp option. Don't aggregate if
+        * any other options are detected.
+        */
+       if (tcp->doff != 5 && tcp->doff != 8)
+               return -1;
+
+       /*
+        * The session helpers read and rewrite the timestamp words of a
+        * session only according to saw_ts, so a packet whose timestamp
+        * presence differs from the session's must not be merged into it.
+        */
+       if (l_lro && (l_lro->saw_ts != 0) != (tcp->doff == 8))
+               return -1;
+
+       if (tcp->doff == 8) {
+               u32 tsval, tsecr;
+
+               /*
+                * Accept only the NOP, NOP, TIMESTAMP layout.  The session
+                * helpers access tsval and tsecr at fixed word offsets 1 and
+                * 2 of the 12-byte option area, which is only right for this
+                * layout; checking it directly also keeps every read inside
+                * the option area.
+                */
+               opt = (u8 *)(tcp + 1);
+               if (opt[0] != TCPOPT_NOP || opt[1] != TCPOPT_NOP ||
+                   opt[2] != TCPOPT_TIMESTAMP ||
+                   opt[3] != TCPOLEN_TIMESTAMP)
+                       return -1;
+
+               tsval = *((u32 *)(opt + 4));
+               tsecr = *((u32 *)(opt + 8));
+
+               /*
+                * Ensure timestamp value increases monotonically.  Both
+                * values are held as read from the wire, so convert to host
+                * order before comparing magnitudes.
+                */
+               if (l_lro && ntohl(l_lro->cur_tsval) > ntohl(tsval))
+                       return -1;
+
+               /* timestamp echo reply should be non-zero */
+               if (tsecr == 0)
+                       return -1;
+       }
+
+       return 0;
+}
+
+/*
+ * s2io_club_tcp_session - decide how a received frame is handled by LRO.
+ * @buffer:  start of the received frame (L2 header)
+ * @tcp:     out; set to the frame's TCP header
+ * @tcp_len: out; set to the frame's TCP payload length in bytes
+ * @lro:     out; session selected for the frame (NULL when none is free)
+ * @rxdp:    Rx descriptor of the frame
+ * @sp:      adapter private data holding the session table
+ *
+ * Return value:
+ *   0 - no matching session and no free slot; *lro is set to NULL
+ *   1 - frame accounted into an existing session
+ *   2 - frame matched a session but is out of sequence or not
+ *       aggregatable; the session's headers have been finalized and both
+ *       the session and this frame are to be sent up
+ *   3 - a new session was started with this frame
+ *   4 - as 1, and the session reached its packet limit; its headers have
+ *       been finalized and it is to be flushed
+ *   5 - frame is not aggregatable and matches no session; send it up
+ *       as is (*lro is not written)
+ */
+static int
+s2io_club_tcp_session(u8 *buffer, u8 **tcp, u32 *tcp_len, lro_t **lro,
+                     RxD_t *rxdp, nic_t *sp)
+{
+       struct iphdr *ip;
+       struct tcphdr *tcph;
+       int ret = 0, i;
+
+       if (check_L2_lro_capable(buffer, &ip, (struct tcphdr **)tcp, rxdp)) {
+               /*
+                * Non-TCP frame or non-DIX L2 type.  Report it as "not
+                * aggregatable" (5) instead of passing the helper's -1
+                * through: the switch in rx_osm_handler() has no case for
+                * -1 and would hit BUG() on, e.g., every UDP frame with a
+                * good checksum.
+                */
+               return 5;
+       }
+       DBG_PRINT(INFO_DBG,"IP Saddr: %x Daddr: %x\n",
+                 ip->saddr, ip->daddr);
+
+       tcph = (struct tcphdr *)*tcp;
+       *tcp_len = get_l4_pyld_length(ip, tcph);
+       for (i=0; i<MAX_LRO_SESSIONS; i++) {
+               lro_t *l_lro = &sp->lro0_n[i];
+               if (l_lro->in_use) {
+                       if (check_for_socket_match(l_lro, ip, tcph))
+                               continue;
+                       /* Sock pair matched */
+                       *lro = l_lro;
+
+                       if ((*lro)->tcp_next_seq != ntohl(tcph->seq)) {
+                               DBG_PRINT(INFO_DBG, "%s:Out of order. expected "
+                                         "0x%x, actual 0x%x\n", __FUNCTION__,
+                                         (*lro)->tcp_next_seq,
+                                         ntohl(tcph->seq));
+
+                               sp->mac_control.stats_info->
+                                  sw_stat.outof_sequence_pkts++;
+                               ret = 2;
+                               break;
+                       }
+
+                       if (!verify_l3_l4_lro_capable(l_lro, ip, tcph,*tcp_len))
+                               ret = 1; /* Aggregate */
+                       else
+                               ret = 2; /* Flush both */
+                       break;
+               }
+       }
+
+       if (ret == 0) {
+               /* Before searching for available LRO objects,
+                * check if the pkt is L3/L4 aggregatable. If not
+                * don't create new LRO session. Just send this
+                * packet up.
+                */
+               if (verify_l3_l4_lro_capable(NULL, ip, tcph, *tcp_len)) {
+                       return 5;
+               }
+
+               for (i=0; i<MAX_LRO_SESSIONS; i++) {
+                       lro_t *l_lro = &sp->lro0_n[i];
+                       if (!(l_lro->in_use)) {
+                               *lro = l_lro;
+                               ret = 3; /* Begin anew */
+                               break;
+                       }
+               }
+       }
+
+       if (ret == 0) { /* sessions exceeded */
+               DBG_PRINT(INFO_DBG,"%s:All LRO sessions already in use\n",
+                         __FUNCTION__);
+               *lro = NULL;
+               return ret;
+       }
+
+       switch (ret) {
+               case 3:
+                       initiate_new_session(*lro, buffer, ip, tcph, *tcp_len);
+                       break;
+               case 2:
+                       update_L3L4_header(sp, *lro);
+                       break;
+               case 1:
+                       aggregate_new_rx(*lro, ip, tcph, *tcp_len);
+                       /*
+                        * ">=" rather than "==": sg_num is already 2 after
+                        * the first aggregation, so with a configured limit
+                        * below 2 an equality test would never fire and the
+                        * session would keep growing.
+                        */
+                       if ((*lro)->sg_num >= sp->lro_max_aggr_per_sess) {
+                               update_L3L4_header(sp, *lro);
+                               ret = 4; /* Flush the LRO */
+                       }
+                       break;
+               default:
+                       DBG_PRINT(ERR_DBG,"%s:Dont know, can't say!!\n",
+                               __FUNCTION__);
+                       break;
+       }
+
+       return ret;
+}
+
+/*
+ * clear_lro_session - return an LRO session slot to the free state.
+ *
+ * Zeroes the whole object, which also clears in_use and saw_ts.
+ */
+static void clear_lro_session(lro_t *lro)
+{
+       memset(lro, 0, sizeof(*lro));
+}
+
+/*
+ * queue_rx_frame - hand a received skb to the network stack.
+ *
+ * Sets skb->protocol via eth_type_trans() using skb->dev, then delivers
+ * the skb with netif_receive_skb() under CONFIG_S2IO_NAPI and with
+ * netif_rx() otherwise.
+ */
+static void queue_rx_frame(struct sk_buff *skb)
+{
+       skb->protocol = eth_type_trans(skb, skb->dev);
+
+#ifdef CONFIG_S2IO_NAPI
+       netif_receive_skb(skb);
+#else
+       netif_rx(skb);
+#endif
+}
+
+/*
+ * lro_append_pkt - chain a packet's payload onto an LRO session.
+ * @sp:      adapter private data (for the software statistics block)
+ * @lro:     session being extended; lro->parent is the head skb
+ * @skb:     newly received packet
+ * @tcp_len: TCP payload length of @skb, in bytes
+ *
+ * Grows the head skb's length accounting, strips everything in front of
+ * the TCP payload from @skb and links @skb at the tail of the head skb's
+ * frag_list.
+ */
+static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb,
+                          u32 tcp_len)
+{
+       struct sk_buff *head = lro->parent;
+       struct sk_buff **link;
+
+       head->len += tcp_len;
+       head->data_len = lro->frags_len;
+
+       /* Leave only the TCP payload in the appended skb */
+       skb_pull(skb, (skb->len - tcp_len));
+
+       /* Find the first empty link in the chain and hook the skb there */
+       for (link = &skb_shinfo(head)->frag_list; *link;
+            link = &(*link)->next)
+               ;
+       *link = skb;
+
+       sp->mac_control.stats_info->sw_stat.clubbed_frms_cnt++;
+}
index 852a6a899d07cd5d1d9a79b7b724ce447daf5991..65cc59ac71f01e4f7241cb6634b4eb6b3b4fee3b 100644 (file)
@@ -78,6 +78,13 @@ int debug_level = ERR_DBG;   /* Default level. */
 typedef struct {
        unsigned long long single_ecc_errs;
        unsigned long long double_ecc_errs;
+       /* LRO statistics */
+       unsigned long long clubbed_frms_cnt;    /* packets appended to a session */
+       unsigned long long sending_both;        /* "flush both" events */
+       unsigned long long outof_sequence_pkts; /* session matches with unexpected seq */
+       unsigned long long flush_max_pkts;      /* flushes on reaching the packet limit */
+       unsigned long long sum_avg_pkts_aggregated; /* sum of sg_num over header updates */
+       unsigned long long num_aggregations;    /* number of header updates */
 } swStat_t;
 
 /* The statistics block of Xena */
@@ -680,6 +687,24 @@ struct msix_info_st {
        u64 data;
 };
 
+/*
+ * Data structure to represent a LRO session.
+ * A zeroed object (see clear_lro_session()) is a free slot.
+ */
+typedef struct lro {
+       struct sk_buff  *parent;        /* skb of the session's first packet */
+       u8              *l2h;           /* L2 header of the first packet */
+       struct iphdr    *iph;           /* IP header of the first packet */
+       struct tcphdr   *tcph;          /* TCP header of the first packet */
+       u32             tcp_next_seq;   /* next expected seq no., host order */
+       u32             tcp_ack;        /* ack no. written to tcph on flush */
+       int             total_len;      /* IP total length of the aggregate */
+       int             frags_len;      /* payload bytes of appended packets */
+       int             sg_num;         /* packets in the session so far */
+       int             in_use;         /* non-zero while the slot is occupied */
+       u16             window;         /* window written to tcph on flush */
+       u32             cur_tsval;      /* tsval of the latest packet */
+       u32             cur_tsecr;      /* tsecr of the latest packet */
+       u8              saw_ts;         /* first packet had a timestamp option */
+}lro_t;
+
 /* Structure representing one instance of the NIC */
 struct s2io_nic {
        int rxd_mode;
@@ -784,6 +809,13 @@ struct s2io_nic {
 #define XFRAME_II_DEVICE       2
        u8 device_type;
 
+#define MAX_LRO_SESSIONS       32
+       lro_t lro0_n[MAX_LRO_SESSIONS];
+       unsigned long   clubbed_frms_cnt;
+       unsigned long   sending_both;
+       u8              lro;
+       u16             lro_max_aggr_per_sess;
+
 #define INTA   0
 #define MSI    1
 #define MSI_X  2
@@ -940,4 +972,10 @@ static void s2io_card_down(nic_t *nic);
 static int s2io_card_up(nic_t *nic);
 int get_xena_rev_id(struct pci_dev *pdev);
 void restore_xmsi_data(nic_t *nic);
+
+static int s2io_club_tcp_session(u8 *buffer, u8 **tcp, u32 *tcp_len, lro_t **lro, RxD_t *rxdp, nic_t *sp);
+static void clear_lro_session(lro_t *lro);
+static void queue_rx_frame(struct sk_buff *skb);
+static void update_L3L4_header(nic_t *sp, lro_t *lro);
+static void lro_append_pkt(nic_t *sp, lro_t *lro, struct sk_buff *skb, u32 tcp_len);
 #endif                         /* _S2IO_H */