1 /* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
4 * Filesystem request handling methods
7 #include <linux/hdreg.h>
8 #include <linux/blkdev.h>
9 #include <linux/skbuff.h>
10 #include <linux/netdevice.h>
11 #include <linux/genhd.h>
12 #include <linux/moduleparam.h>
13 #include <net/net_namespace.h>
14 #include <asm/unaligned.h>
/*
 * Module parameters, both registered through module_param() as int
 * with 0644 as the permission argument.
 *
 * aoe_deadsecs defaults to 60 * 3 and aoe_maxout defaults to 16; their
 * meaning is given by the MODULE_PARM_DESC strings below.
 */
17 static int aoe_deadsecs = 60 * 3;
18 module_param(aoe_deadsecs, int, 0644);
19 MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
21 static int aoe_maxout = 16;
22 module_param(aoe_maxout, int, 0644);
23 MODULE_PARM_DESC(aoe_maxout,
24 "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
/*
 * Part of an skb allocation helper.  The line carrying its name and
 * parameter list is not visible here (presumably new_skb, which other
 * code in this file calls with a length -- TODO confirm).
 *
 * Visible steps: allocate len bytes with GFP_ATOMIC, reset the MAC and
 * network header offsets, set the protocol to ETH_P_AOE, clear the
 * next/prev links and set ip_summed to CHECKSUM_NONE.  Handling of an
 * allocation failure and the return statement are on hidden lines.
 */
26 static struct sk_buff *
31 skb = alloc_skb(len, GFP_ATOMIC);
33 skb_reset_mac_header(skb);
34 skb_reset_network_header(skb);
35 skb->protocol = __constant_htons(ETH_P_AOE);
37 skb->next = skb->prev = NULL;
39 /* tell the network layer not to perform IP checksums
40 * or to get the NIC to do it
42 skb->ip_summed = CHECKSUM_NONE;
/*
 * getframe takes a target t and a tag.  Its return type and body are
 * not visible here.
 * NOTE(review): presumably returns the frame of t carrying this tag --
 * confirm against the full source.
 */
48 getframe(struct aoetgt *t, int tag)
/*
 * newtag - produce the next tag value for target t.
 *
 * The visible return statement pre-increments t->lasttag, keeps its low
 * 15 bits (mask 0x7fff), shifts them into bits 16..30 and ORs them into
 * n, so bit 31 is never set by this expression.  The initial value of n
 * is assigned on a line not visible here; the text below says the
 * bottom 16 bits hold the transmit tick.
 */
61 * Leave the top bit clear so we have tagspace for userland.
62 * The bottom 16 bits are the xmit tick for rexmit/rttavg processing.
63 * This driver reserves tag -1 to mean "unused frame."
66 newtag(struct aoetgt *t)
71 return n |= (++t->lasttag & 0x7fff) << 16;
/*
 * aoehdr_atainit - fill in the AoE header h for device d and target t.
 *
 * Visible writes: a fresh tag is obtained from newtag(t); the source
 * address is copied from t->ifp->nd->dev_addr and the destination from
 * t->addr; type is ETH_P_AOE in big-endian form; major is d->aoemajor
 * converted to big-endian 16 bit; minor is d->aoeminor; tag is the new
 * tag converted to big-endian 32 bit.
 *
 * The return statement is not visible here; callers in this file store
 * the result in a frame's tag field, so it presumably returns host_tag
 * -- confirm.
 */
75 aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
77 u32 host_tag = newtag(t);
79 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
80 memcpy(h->dst, t->addr, sizeof h->dst);
81 h->type = __constant_cpu_to_be16(ETH_P_AOE);
83 h->major = cpu_to_be16(d->aoemajor);
84 h->minor = d->aoeminor;
86 h->tag = cpu_to_be32(host_tag);
/*
 * put_lba takes an ATA header and a sector number.  Its body is not
 * visible here.
 * NOTE(review): presumably stores lba into the header's LBA byte fields
 * -- confirm against the full source.
 */
92 put_lba(struct aoe_atahdr *ah, sector_t lba)
/*
 * ifrotate - move target t on to another of its network interfaces.
 *
 * Visible logic: t->ifp is tested against the end of t->ifs
 * (NAOEIFS entries) and for an entry without a net device; the action
 * taken in that case is on a hidden line (presumably a wrap to the
 * first entry -- confirm).  If the current entry still has no net
 * device, "no interface to rotate to" is logged.
 */
103 ifrotate(struct aoetgt *t)
106 if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
108 if (t->ifp->nd == NULL) {
109 printk(KERN_INFO "aoe: no interface to rotate to\n");
/*
 * skb_pool_put - hand skb to the skb pool of device d.
 *
 * The one visible statement links skb after the current pool tail
 * (d->skbpool_tl).  Handling of an empty pool and the update of the
 * tail pointer are on lines not visible here.
 */
115 skb_pool_put(struct aoedev *d, struct sk_buff *skb)
120 d->skbpool_tl->next = skb;
/*
 * skb_pool_get - obtain an skb for device d.
 *
 * Visible logic: an skb is taken from the pool only when its shared
 * info dataref count reads 1, in which case the pool head advances to
 * skb->next.  Otherwise, while d->nskbpool is below NSKBPOOLMAX, a new
 * skb of ETH_ZLEN bytes is requested from new_skb().  Where skb is
 * first loaded from and what is returned are on hidden lines.
 */
124 static struct sk_buff *
125 skb_pool_get(struct aoedev *d)
130 if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
131 d->skbpool_hd = skb->next;
135 if (d->nskbpool < NSKBPOOLMAX
136 && (skb = new_skb(ETH_ZLEN))) {
143 /* freeframe is where we do our load balancing so it's a little hairy. */
/*
 * Only part of the body is visible here.  What the visible lines show:
 *  - an empty d->targets[0] is reported with "NULL TARGETS!";
 *  - the target cursor t is tested against &d->targets[NTARGETS] and
 *    against an empty slot (the action taken is on hidden lines), and
 *    the search stops once t comes back to d->tgt;
 *  - a target is examined only while its nout is below its maxout;
 *  - frames whose tag is not FREETAG are passed over;
 *  - a new ETH_ZLEN skb is requested for the frame via new_skb();
 *  - the skb's dataref count is read, and at the gotone label the
 *    fragment count and data_len are reset to zero;
 *  - otherwise skb_pool_get() and skb_pool_put() are used to swap the
 *    frame's skb with one from the pool;
 *  - DEVFL_KICKME is set on d on two of the visible paths.
 * Return values are on hidden lines.
 */
144 static struct frame *
145 freeframe(struct aoedev *d)
147 struct frame *f, *e, *rf;
151 if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
152 printk(KERN_ERR "aoe: NULL TARGETS!\n");
157 if (t >= &d->targets[NTARGETS] || !*t)
160 if ((*t)->nout < (*t)->maxout
165 e = f + (*t)->nframes;
167 if (f->tag != FREETAG)
171 && !(f->skb = skb = new_skb(ETH_ZLEN)))
173 if (atomic_read(&skb_shinfo(skb)->dataref)
179 gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
185 /* Work can be done, but the network layer is
186 holding our precious packets. Try to grab
187 one from the pool. */
189 if (f == NULL) { /* more paranoia */
191 "aoe: freeframe: %s.\n",
192 "unexpected null rf");
193 d->flags |= DEVFL_KICKME;
196 skb = skb_pool_get(d);
198 skb_pool_put(d, f->skb);
204 d->flags |= DEVFL_KICKME;
206 if (t == d->tgt) /* we've looped and found nada */
209 if (t >= &d->targets[NTARGETS] || !*t)
/*
 * aoecmd_ata_rw - build one ATA read/write frame for device d and queue
 * it for transmission.
 *
 * Visible steps (several lines of the body are hidden):
 *  - the byte count starts at the interface's maxbcnt and is capped at
 *    buf->bv_resid;
 *  - the AoE and ATA headers sit at the skb's MAC header, are reserved
 *    with skb_put() and zeroed, and the frame tag comes from
 *    aoehdr_atainit();
 *  - the frame records the data address and starting sector;
 *  - scnt is the byte count in 512-byte units and the LBA is written
 *    with put_lba();
 *  - DEVFL_EXT devices get AOEAFL_EXT; another visible line ORs 0xe0
 *    into lba3;
 *  - for WRITE bios the data page is attached with
 *    skb_fill_page_desc(), AOEAFL_WRITE is set and data_len is set;
 *  - cmdstat is WIN_READ combined with writebit and extbit;
 *  - buf bookkeeping advances (nframesout, bv_resid, sector), and the
 *    bv_resid/bv_off fields are reloaded from bv when the current
 *    bio_vec is used up;
 *  - the skb is bound to the interface's net device, cloned with
 *    GFP_ATOMIC and linked after d->sendq_tl.
 * The return value is on a hidden line; aoecmd_work() tests it as a
 * boolean.
 */
216 aoecmd_ata_rw(struct aoedev *d)
220 struct aoe_atahdr *ah;
226 char writebit, extbit;
237 bcnt = t->ifp->maxbcnt;
240 if (bcnt > buf->bv_resid)
241 bcnt = buf->bv_resid;
242 /* initialize the headers & frame */
244 h = (struct aoe_hdr *) skb_mac_header(skb);
245 ah = (struct aoe_atahdr *) (h+1);
246 skb_put(skb, sizeof *h + sizeof *ah);
247 memset(h, 0, skb->len);
248 f->tag = aoehdr_atainit(d, t, h);
252 f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
254 f->lba = buf->sector;
256 /* set up ata header */
257 ah->scnt = bcnt >> 9;
258 put_lba(ah, buf->sector);
259 if (d->flags & DEVFL_EXT) {
260 ah->aflags |= AOEAFL_EXT;
264 ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
266 if (bio_data_dir(buf->bio) == WRITE) {
267 skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
268 ah->aflags |= AOEAFL_WRITE;
270 skb->data_len = bcnt;
277 ah->cmdstat = WIN_READ | writebit | extbit;
279 /* mark all tracking fields and load out */
280 buf->nframesout += 1;
282 buf->bv_resid -= bcnt;
284 buf->sector += bcnt >> 9;
285 if (buf->resid == 0) {
287 } else if (buf->bv_resid == 0) {
289 buf->bv_resid = bv->bv_len;
290 WARN_ON(buf->bv_resid == 0);
291 buf->bv_off = bv->bv_offset;
294 skb->dev = t->ifp->nd;
295 skb = skb_clone(skb, GFP_ATOMIC);
298 d->sendq_tl->next = skb;
/*
 * aoecmd_cfg_pkts - build AoE config query packets for the given shelf
 * (aoemajor) and slot (aoeminor) without sending them.
 *
 * Visible steps: with dev_base_lock read-held, every net device in
 * init_net is visited and those failing is_aoe_netif() are skipped.
 * For each remaining one an skb sized for an AoE header plus config
 * header is requested (a failure is logged), the headers are zeroed,
 * the destination is set to all 0xff bytes, the source to the device
 * address, the type to ETH_P_AOE and the major to aoemajor in
 * big-endian form.  How the packets are chained, how tail is used and
 * what is returned are on hidden lines.
 */
306 /* some callers cannot sleep, and they can call this function,
307 * transmitting the packets later, when interrupts are on
309 static struct sk_buff *
310 aoecmd_cfg_pkts(ushort aoemajor, unsigned char aoeminor, struct sk_buff **tail)
313 struct aoe_cfghdr *ch;
314 struct sk_buff *skb, *sl, *sl_tail;
315 struct net_device *ifp;
319 read_lock(&dev_base_lock);
320 for_each_netdev(&init_net, ifp) {
322 if (!is_aoe_netif(ifp))
325 skb = new_skb(sizeof *h + sizeof *ch);
327 printk(KERN_INFO "aoe: skb alloc failure\n");
330 skb_put(skb, sizeof *h + sizeof *ch);
334 h = (struct aoe_hdr *) skb_mac_header(skb);
335 memset(h, 0, sizeof *h + sizeof *ch);
337 memset(h->dst, 0xff, sizeof h->dst);
338 memcpy(h->src, ifp->dev_addr, sizeof h->src);
339 h->type = __constant_cpu_to_be16(ETH_P_AOE);
341 h->major = cpu_to_be16(aoemajor);
350 read_unlock(&dev_base_lock);
/*
 * resend - retransmit frame f of device d through target t.
 *
 * Visible steps: the headers are located at the MAC header of the
 * frame's skb; a "retransmit" trace line with the old tag, jiffies, the
 * new tag n, MAC addresses and t->nout is formatted into buf; the
 * header gets the new tag, t->addr as destination and the current
 * interface address as source; the ATA command is switched on; for
 * frames flagged AOEAFL_WRITE the data page is attached again from
 * f->bufaddr and skb->len is recomputed; finally the skb is bound to
 * the interface's net device, cloned with GFP_ATOMIC and linked after
 * d->sendq_tl.  Where n comes from and what the switch cases do are on
 * hidden lines.
 */
358 resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
362 struct aoe_atahdr *ah;
369 h = (struct aoe_hdr *) skb_mac_header(skb);
370 ah = (struct aoe_atahdr *) (h+1);
372 snprintf(buf, sizeof buf,
373 "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x "
374 "s=%012llx d=%012llx nout=%d\n",
375 "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
377 mac_addr(h->dst), t->nout);
381 h->tag = cpu_to_be32(n);
382 memcpy(h->dst, t->addr, sizeof h->dst);
383 memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
385 switch (ah->cmdstat) {
398 if (ah->aflags & AOEAFL_WRITE) {
399 skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
400 offset_in_page(f->bufaddr), n);
401 skb->len = sizeof *h + sizeof *ah + n;
405 skb->dev = t->ifp->nd;
406 skb = skb_clone(skb, GFP_ATOMIC);
410 d->sendq_tl->next = skb;
/*
 * Single visible statement of a function whose header is not shown
 * (presumably tsince, which this file calls with a tag -- TODO
 * confirm).  It keeps the low 16 bits of jiffies in n.
 */
421 n = jiffies & 0xffff;
/*
 * getif takes a target and a net device and returns a struct aoeif
 * pointer.  The body is not visible here; callers in this file test
 * the result before dereferencing it.
 * NOTE(review): presumably looks nd up in t->ifs -- confirm.
 */
428 static struct aoeif *
429 getif(struct aoetgt *t, struct net_device *nd)
/*
 * addif takes a target and a net device and returns a struct aoeif
 * pointer.  The only visible statement sets the entry's maxbcnt to
 * DEFAULTBCNT; how the entry p is chosen and what is returned on
 * failure are on hidden lines.
 */
441 static struct aoeif *
442 addif(struct aoetgt *t, struct net_device *nd)
450 p->maxbcnt = DEFAULTBCNT;
/*
 * ejectif - drop interface entry ifp from target t.
 *
 * Visible steps: e points at the last slot of t->ifs, n is the byte
 * distance from ifp to e, and memmove() slides the entries after ifp
 * down by one slot (memmove because the ranges overlap).  What happens
 * to the vacated last slot is on hidden lines.
 */
457 ejectif(struct aoetgt *t, struct aoeif *ifp)
462 e = t->ifs + NAOEIFS - 1;
463 n = (e - ifp) * sizeof *ifp;
464 memmove(ifp, ifp+1, n);
/*
 * sthtith - work on behalf of the target that d->htgt points at.
 *
 * Visible steps: ht is the target referenced by d->htgt; frames whose
 * tag is FREETAG are passed over; a frame nf is resent through the
 * target *d->tgt; and at the point marked by the comment below all
 * interface entries of ht are zeroed.  How nf is obtained and the
 * return value are on hidden lines; aoecmd_work() tests the result as
 * a boolean.
 */
469 sthtith(struct aoedev *d)
471 struct frame *f, *e, *nf;
473 struct aoetgt *ht = *d->htgt;
478 if (f->tag == FREETAG)
490 resend(d, *d->tgt, nf);
492 /* he's clean, he's useless. take away his interfaces */
493 memset(ht->ifs, 0, sizeof ht->ifs);
/*
 * ata_scnt - read a field of the ATA header inside a raw packet.
 *
 * packet is treated as an AoE header immediately followed by an ATA
 * header.  The return statement is not visible here (presumably
 * ah->scnt, going by the function name -- confirm).
 */
498 static inline unsigned char
499 ata_scnt(unsigned char *packet) {
501 struct aoe_atahdr *ah;
503 h = (struct aoe_hdr *) packet;
504 ah = (struct aoe_atahdr *) (h+1);
/*
 * rexmit_timer - timer callback that looks for overdue frames.
 * @vp: the struct aoedev pointer, passed as an unsigned long.
 *
 * Visible steps (parts of the body are hidden):
 *  - timeout is increased by half of itself;
 *  - d->lock is taken with interrupts saved; if DEVFL_TKILL is set the
 *    lock is released again straight away;
 *  - every non-NULL target is visited; frames that are free or whose
 *    tag is younger than timeout are passed over;
 *  - f->waited grows by timeout and the result is compared with
 *    aoe_deadsecs and with HELPWAIT (the latter only matters when the
 *    device has more than one target);
 *  - when a target has nout equal to maxout, t->lastwadj is stamped
 *    with jiffies;
 *  - the interface the frame was sent on is looked up; its lost
 *    counter is bumped and compared with twice nframes, provided the
 *    target has more than one interface;
 *  - frames larger than DEFAULTBCNT bump lostjumbo the same way and
 *    lead to the "too many lost jumbo" message;
 *  - a target that has been at nout == maxout below nframes for more
 *    than 10 seconds gets lastwadj stamped again;
 *  - d->rttavg is set to MAXTIMER on a visible line;
 *  - DEVFL_KICKME is cleared when it or d->htgt is set;
 *  - the send queue pointers are reset, the timer is re-armed
 *    TIMERTICK jiffies ahead and the lock is dropped.
 */
509 rexmit_timer(ulong vp)
512 struct aoetgt *t, **tt, **te;
516 register long timeout;
519 d = (struct aoedev *) vp;
522 /* timeout is always ~150% of the moving average */
524 timeout += timeout >> 1;
526 spin_lock_irqsave(&d->lock, flags);
528 if (d->flags & DEVFL_TKILL) {
529 spin_unlock_irqrestore(&d->lock, flags);
534 for (; tt < te && *tt; tt++) {
539 if (f->tag == FREETAG
540 || tsince(f->tag) < timeout)
542 n = f->waited += timeout;
544 if (n > aoe_deadsecs) {
545 /* waited too long. device failure. */
550 if (n > HELPWAIT /* see if another target can help */
551 && (tt != d->targets || d->targets[1]))
554 if (t->nout == t->maxout) {
557 t->lastwadj = jiffies;
560 ifp = getif(t, f->skb->dev);
561 if (ifp && ++ifp->lost > (t->nframes << 1)
562 && (ifp != t->ifs || t->ifs[1].nd)) {
567 if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
568 && ifp && ++ifp->lostjumbo > (t->nframes << 1)
569 && ifp->maxbcnt != DEFAULTBCNT) {
572 "too many lost jumbo on "
574 "falling back to %d frames.\n",
575 d->aoemajor, d->aoeminor,
576 ifp->nd->name, mac_addr(t->addr),
584 if (t->nout == t->maxout
585 && t->maxout < t->nframes
586 && (jiffies - t->lastwadj)/HZ > 10) {
588 t->lastwadj = jiffies;
595 d->rttavg = MAXTIMER;
598 if (d->flags & DEVFL_KICKME || d->htgt) {
599 d->flags &= ~DEVFL_KICKME;
604 d->sendq_hd = d->sendq_tl = NULL;
606 d->timer.expires = jiffies + TIMERTICK;
607 add_timer(&d->timer);
609 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_work - push queued I/O for device d towards the wire.
 *
 * Visible steps: when a helper target is set (d->htgt), sthtith() is
 * called first and its result tested; when no buffer is in process the
 * first entry of d->bufq is picked and unlinked from the list (an
 * empty queue is tested for first); aoecmd_ata_rw() is then called and
 * its result tested.  The actions taken on each test are on hidden
 * lines.
 */
614 /* enters with d->lock held */
616 aoecmd_work(struct aoedev *d)
620 if (d->htgt && !sthtith(d))
622 if (d->inprocess == NULL) {
623 if (list_empty(&d->bufq))
625 buf = container_of(d->bufq.next, struct buf, bufs);
626 list_del(d->bufq.next);
629 if (aoecmd_ata_rw(d))
/*
 * aoecmd_sleepwork - work item handler; the device is recovered from
 * the embedded work member with container_of().
 *
 * Visible steps: DEVFL_GDALLOC is tested (the action is on a hidden
 * line).  When DEVFL_NEWSIZE is set, the capacity of d->gd is read,
 * the block device for partition index 0 is looked up with
 * bdget_disk(), and its inode size is written as capacity << 9 while
 * holding the inode's i_mutex.  Afterwards, under d->lock, DEVFL_UP is
 * set and DEVFL_NEWSIZE cleared.
 */
633 /* this function performs work that has been deferred until sleeping is OK
636 aoecmd_sleepwork(struct work_struct *work)
638 struct aoedev *d = container_of(work, struct aoedev, work);
640 if (d->flags & DEVFL_GDALLOC)
643 if (d->flags & DEVFL_NEWSIZE) {
644 struct block_device *bd;
648 ssize = d->gd->capacity;
649 bd = bdget_disk(d->gd, 0);
652 mutex_lock(&bd->bd_inode->i_mutex);
653 i_size_write(bd->bd_inode, (loff_t)ssize<<9);
654 mutex_unlock(&bd->bd_inode->i_mutex);
657 spin_lock_irqsave(&d->lock, flags);
658 d->flags |= DEVFL_UP;
659 d->flags &= ~DEVFL_NEWSIZE;
660 spin_unlock_irqrestore(&d->lock, flags);
/*
 * ataid_complete - digest the identify data id for device d.
 *
 * id is read as an array of little-endian 16-bit words (word w lives
 * at byte offset w << 1).
 *
 * Visible steps:
 *  - words 83 and 86 are ORed together; if bit 10 is set the device is
 *    flagged DEVFL_EXT, the sector count is the 64-bit value at word
 *    100, and cylinders is that count divided by 255 * 63;
 *  - on the other visible path DEVFL_EXT is cleared, the sector count
 *    is the 32-bit value at word 60, and cylinders, heads and sectors
 *    come from words 54, 55 and 56;
 *  - a change of size relative to d->ssize is reported;
 *  - DEVFL_GDALLOC and DEVFL_NEWSIZE are tested together (the action
 *    is on a hidden line);
 *  - one visible path stores the size in d->gd->capacity and sets
 *    DEVFL_NEWSIZE, another sets DEVFL_GDALLOC; d->work is scheduled.
 */
665 ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
670 /* word 83: command set supported */
671 n = get_unaligned_le16(&id[83 << 1]);
673 /* word 86: command set/feature enabled */
674 n |= get_unaligned_le16(&id[86 << 1]);
676 if (n & (1<<10)) { /* bit 10: LBA 48 */
677 d->flags |= DEVFL_EXT;
679 /* word 100: number lba48 sectors */
680 ssize = get_unaligned_le64(&id[100 << 1]);
682 /* set as in ide-disk.c:init_idedisk_capacity */
683 d->geo.cylinders = ssize;
684 d->geo.cylinders /= (255 * 63);
688 d->flags &= ~DEVFL_EXT;
690 /* number lba28 sectors */
691 ssize = get_unaligned_le32(&id[60 << 1]);
693 /* NOTE: obsolete in ATA 6 */
694 d->geo.cylinders = get_unaligned_le16(&id[54 << 1]);
695 d->geo.heads = get_unaligned_le16(&id[55 << 1]);
696 d->geo.sectors = get_unaligned_le16(&id[56 << 1]);
699 if (d->ssize != ssize)
701 "aoe: %012llx e%ld.%d v%04x has %llu sectors\n",
703 d->aoemajor, d->aoeminor,
704 d->fw_ver, (long long)ssize);
707 if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
710 d->gd->capacity = ssize;
711 d->flags |= DEVFL_NEWSIZE;
713 d->flags |= DEVFL_GDALLOC;
714 schedule_work(&d->work);
/*
 * calc_rttavg - fold a new round-trip sample rtt into device d's
 * timing state.
 *
 * Only the comparisons are visible here: a working value n is compared
 * with MAXTIMER and with d->mintimer, and on one path d->mintimer
 * moves half of the way towards n.  The clamping assignments and the
 * averaging statement that the closing comment refers to are on hidden
 * lines.
 */
718 calc_rttavg(struct aoedev *d, int rtt)
727 else if (n > MAXTIMER)
729 d->mintimer += (n - d->mintimer) >> 1;
730 } else if (n < d->mintimer)
732 else if (n > MAXTIMER)
735 /* g == .25; cf. Congestion Avoidance and Control, Jacobson & Karels; 1988 */
/*
 * gettgt - find the target of device d with the given address.
 *
 * Visible logic: a linear scan that stops at the end pointer or at the
 * first NULL slot, comparing each target's addr with addr over
 * sizeof addr bytes.  The range setup and the return statements are on
 * hidden lines; callers in this file check the result before use.
 */
740 static struct aoetgt *
741 gettgt(struct aoedev *d, char *addr)
743 struct aoetgt **t, **e;
747 for (; t < e && *t; t++)
748 if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
/*
 * diskstats - account a finished bio in the statistics of disk.
 *
 * The sector count is bio->bi_size in 512-byte units and rw is the
 * bio's data direction.  One I/O is counted for that direction,
 * duration is added to that direction's ticks and to io_ticks, and the
 * sector count is added to that direction's sectors.  sector is passed
 * through to each statistics macro.
 */
754 diskstats(struct gendisk *disk, struct bio *bio, ulong duration, sector_t sector)
756 unsigned long n_sect = bio->bi_size >> 9;
757 const int rw = bio_data_dir(bio);
759 all_stat_inc(disk, ios[rw], sector);
760 all_stat_add(disk, ticks[rw], duration, sector);
761 all_stat_add(disk, sectors[rw], n_sect, sector);
762 all_stat_add(disk, io_ticks, duration, sector);
/*
 * aoecmd_ata_rsp - handle an ATA response packet skb.
 *
 * Visible steps (parts of the body are hidden):
 *  - the AoE header is read from the MAC header; the device is looked
 *    up by major/minor and an "unknown device" message is formatted
 *    when the lookup fails;
 *  - under d->lock the tag is read and the target is looked up by the
 *    source address; a missing target is logged and the lock dropped;
 *  - on one path the round-trip average is updated with -tsince(n),
 *    the lock is dropped and a trace line is formatted; otherwise the
 *    average is updated with tsince(f->tag);
 *  - ahin is the ATA header of the response, ahout that of the frame
 *    that was sent;
 *  - a response status with any bit of 0xa9 set is logged as an ATA
 *    error and the buffer is flagged BUFFL_FAIL;
 *  - n is the sent sector count in bytes; the sent command selects the
 *    handling: data shorter than n is reported as a runt, otherwise n
 *    bytes after the response header are copied to f->bufaddr;
 *    identify data shorter than 512 bytes is reported as a runt,
 *    otherwise it goes to ataid_complete(); unknown commands are
 *    logged;
 *  - when the buffer's last outstanding frame completes and nothing
 *    remains, statistics are recorded, the bio is ended with -EIO or 0
 *    and the buffer is returned to d->bufpool;
 *  - the send queue pointers are reset and the lock is dropped.
 */
766 aoecmd_ata_rsp(struct sk_buff *skb)
769 struct aoe_hdr *hin, *hout;
770 struct aoe_atahdr *ahin, *ahout;
781 hin = (struct aoe_hdr *) skb_mac_header(skb);
782 aoemajor = get_unaligned_be16(&hin->major);
783 d = aoedev_by_aoeaddr(aoemajor, hin->minor);
785 snprintf(ebuf, sizeof ebuf, "aoecmd_ata_rsp: ata response "
786 "for unknown device %d.%d\n",
787 aoemajor, hin->minor);
792 spin_lock_irqsave(&d->lock, flags);
794 n = get_unaligned_be32(&hin->tag);
795 t = gettgt(d, hin->src);
797 printk(KERN_INFO "aoe: can't find target e%ld.%d:%012llx\n",
798 d->aoemajor, d->aoeminor, mac_addr(hin->src));
799 spin_unlock_irqrestore(&d->lock, flags);
804 calc_rttavg(d, -tsince(n));
805 spin_unlock_irqrestore(&d->lock, flags);
806 snprintf(ebuf, sizeof ebuf,
807 "%15s e%d.%d tag=%08x@%08lx\n",
809 get_unaligned_be16(&hin->major),
811 get_unaligned_be32(&hin->tag),
817 calc_rttavg(d, tsince(f->tag));
819 ahin = (struct aoe_atahdr *) (hin+1);
820 hout = (struct aoe_hdr *) skb_mac_header(f->skb);
821 ahout = (struct aoe_atahdr *) (hout+1);
824 if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
826 "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
827 ahout->cmdstat, ahin->cmdstat,
828 d->aoemajor, d->aoeminor);
830 buf->flags |= BUFFL_FAIL;
832 if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
834 n = ahout->scnt << 9;
835 switch (ahout->cmdstat) {
838 if (skb->len - sizeof *hin - sizeof *ahin < n) {
840 "aoe: %s. skb->len=%d need=%ld\n",
841 "runt data size in read", skb->len, n);
842 /* fail frame f? just returning will rexmit. */
843 spin_unlock_irqrestore(&d->lock, flags);
846 memcpy(f->bufaddr, ahin+1, n);
849 ifp = getif(t, skb->dev);
863 if (skb->len - sizeof *hin - sizeof *ahin < 512) {
865 "aoe: runt data size in ataid. skb->len=%d\n",
867 spin_unlock_irqrestore(&d->lock, flags);
870 ataid_complete(d, t, (char *) (ahin+1));
874 "aoe: unrecognized ata command %2.2Xh for %d.%d\n",
876 get_unaligned_be16(&hin->major),
881 if (buf && --buf->nframesout == 0 && buf->resid == 0) {
882 diskstats(d->gd, buf->bio, jiffies - buf->stime, buf->sector);
883 n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
884 bio_endio(buf->bio, n);
885 mempool_free(buf, d->bufpool);
895 d->sendq_hd = d->sendq_tl = NULL;
897 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_cfg - issue config queries for the given shelf (aoemajor) and
 * slot (aoeminor).
 *
 * The visible line builds the packet list with aoecmd_cfg_pkts(),
 * passing NULL for the tail argument.  What is done with the list is
 * on hidden lines.
 */
902 aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
906 sl = aoecmd_cfg_pkts(aoemajor, aoeminor, NULL);
/*
 * aoecmd_ata_id - build an ATA identify request for device d.
 *
 * Visible steps: the AoE and ATA headers sit at the skb's MAC header,
 * are reserved with skb_put() and zeroed; the frame tag comes from
 * aoehdr_atainit(); the ATA command is WIN_IDENTIFY; the skb is bound
 * to the current interface's net device; d->rttavg is set to MAXTIMER
 * and rexmit_timer is installed as the timer function.  The value
 * returned is a GFP_ATOMIC clone of the frame's skb.  How the frame
 * and target are chosen is on hidden lines.
 */
912 aoecmd_ata_id(struct aoedev *d)
915 struct aoe_atahdr *ah;
926 /* initialize the headers & frame */
928 h = (struct aoe_hdr *) skb_mac_header(skb);
929 ah = (struct aoe_atahdr *) (h+1);
930 skb_put(skb, sizeof *h + sizeof *ah);
931 memset(h, 0, skb->len);
932 f->tag = aoehdr_atainit(d, t, h);
936 /* set up ata header */
938 ah->cmdstat = WIN_IDENTIFY;
941 skb->dev = t->ifp->nd;
943 d->rttavg = MAXTIMER;
944 d->timer.function = rexmit_timer;
946 return skb_clone(skb, GFP_ATOMIC);
/*
 * addtgt - add a target with address addr and nframes frames to
 * device d.
 *
 * Visible steps: the target table is scanned up to the first NULL
 * slot; a "too many targets" message covers the full-table case; the
 * target and its frame array are allocated zeroed with GFP_ATOMIC and
 * a failure is logged; the new target records nframes, a copy of addr,
 * and maxout equal to nframes.  Return statements and the remaining
 * initialisation are on hidden lines.
 */
949 static struct aoetgt *
950 addtgt(struct aoedev *d, char *addr, ulong nframes)
952 struct aoetgt *t, **tt, **te;
957 for (; tt < te && *tt; tt++)
962 "aoe: device addtgt failure; too many targets\n");
965 t = kcalloc(1, sizeof *t, GFP_ATOMIC);
966 f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
970 printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
974 t->nframes = nframes;
979 memcpy(t->addr, addr, sizeof t->addr);
981 t->maxout = t->nframes;
/*
 * aoecmd_cfg_rsp - handle a config response packet skb.
 *
 * Visible steps (parts of the body are hidden):
 *  - the AoE header and the config header after it are read from the
 *    MAC header;
 *  - a major of 0xfff triggers the dip-switch warning;
 *  - the system minor is derived with SYSMINOR() and rejected with a
 *    message when it does not fit under MINORMASK;
 *  - n starts as the advertised buffer count and is compared with
 *    aoe_maxout;
 *  - the device is obtained with aoedev_by_sysminor_m(); a failure is
 *    logged;
 *  - under d->lock the target is looked up by source address and
 *    created with addtgt() if needed, and likewise the interface with
 *    getif() and addif(); failures drop the lock;
 *  - n is then reduced by the AoE and ATA header sizes and turned into
 *    a byte count, n * 512, or DEFAULTBCNT when n is zero; a change
 *    relative to ifp->maxbcnt is logged;
 *  - the firmware version is stored and an identify request is built
 *    with aoecmd_ata_id() before the lock is dropped.
 */
986 aoecmd_cfg_rsp(struct sk_buff *skb)
990 struct aoe_cfghdr *ch;
993 ulong flags, sysminor, aoemajor;
997 h = (struct aoe_hdr *) skb_mac_header(skb);
998 ch = (struct aoe_cfghdr *) (h+1);
1001 * Enough people have their dip switches set backwards to
1002 * warrant a loud message for this special case.
1004 aoemajor = be16_to_cpu(get_unaligned(&h->major));
1005 if (aoemajor == 0xfff) {
1006 printk(KERN_ERR "aoe: Warning: shelf address is all ones. "
1007 "Check shelf dip switches.\n");
1011 sysminor = SYSMINOR(aoemajor, h->minor);
1012 if (sysminor * AOE_PARTITIONS + AOE_PARTITIONS > MINORMASK) {
1013 printk(KERN_INFO "aoe: e%ld.%d: minor number too large\n",
1014 aoemajor, (int) h->minor);
1018 n = be16_to_cpu(ch->bufcnt);
1019 if (n > aoe_maxout) /* keep it reasonable */
1022 d = aoedev_by_sysminor_m(sysminor);
1024 printk(KERN_INFO "aoe: device sysminor_m failure\n");
1028 spin_lock_irqsave(&d->lock, flags);
1030 t = gettgt(d, h->src);
1032 t = addtgt(d, h->src, n);
1034 spin_unlock_irqrestore(&d->lock, flags);
1038 ifp = getif(t, skb->dev);
1040 ifp = addif(t, skb->dev);
1043 "aoe: device addif failure; "
1044 "too many interfaces?\n");
1045 spin_unlock_irqrestore(&d->lock, flags);
1051 n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
1055 n = n ? n * 512 : DEFAULTBCNT;
1056 if (n != ifp->maxbcnt) {
1058 "aoe: e%ld.%d: setting %d%s%s:%012llx\n",
1059 d->aoemajor, d->aoeminor, n,
1060 " byte data frames on ", ifp->nd->name,
1066 /* don't change users' perspective */
1068 spin_unlock_irqrestore(&d->lock, flags);
1071 d->fw_ver = be16_to_cpu(ch->fwver);
1073 sl = aoecmd_ata_id(d);
1075 spin_unlock_irqrestore(&d->lock, flags);
/*
 * aoecmd_cleanslate - reset the adaptive state of device d.
 *
 * Visible steps: d->mintimer goes back to MINTIMER; for every non-NULL
 * target maxout is restored to nframes; and for every interface entry
 * in the inner loop maxbcnt is reset to DEFAULTBCNT.  Range setup and
 * any further resets are on hidden lines.
 */
1081 aoecmd_cleanslate(struct aoedev *d)
1083 struct aoetgt **t, **te;
1084 struct aoeif *p, *e;
1086 d->mintimer = MINTIMER;
1090 for (; t < te && *t; t++) {
1091 (*t)->maxout = (*t)->nframes;
1094 for (; p < e; p++) {
1097 p->maxbcnt = DEFAULTBCNT;