From: Patrick McHardy Date: Wed, 27 Sep 2006 23:45:45 +0000 (-0700) Subject: [NET_SCHED]: Fix fallout from dev->qdisc RCU change X-Git-Tag: v2.6.19-rc1~858^2~86 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=85670cc1faa2e1472e4a423cbf0b5e3d55c5ba88;p=linux-2.6 [NET_SCHED]: Fix fallout from dev->qdisc RCU change The move of qdisc destruction to a rcu callback broke locking in the entire qdisc layer by invalidating previously valid assumptions about the context in which changes to the qdisc tree occur. The two assumptions were: - since changes only happen in process context, read_lock doesn't need bottem half protection. Now invalid since destruction of inner qdiscs, classifiers, actions and estimators happens in the RCU callback unless they're manually deleted, resulting in dead-locks when read_lock in process context is interrupted by write_lock_bh in bottem half context. - since changes only happen under the RTNL, no additional locking is necessary for data not used during packet processing (f.e. u32_list). Again, since destruction now happens in the RCU callback, this assumption is not valid anymore, causing races while using this data, which can result in corruption or use-after-free. Instead of "fixing" this by disabling bottem halfs everywhere and adding new locks/refcounting, this patch makes these assumptions valid again by moving destruction back to process context. Since only the dev->qdisc pointer is protected by RCU, but ->enqueue and the qdisc tree are still protected by dev->qdisc_lock, destruction of the tree can be performed immediately and only the final free needs to happen in the rcu callback to make sure dev_queue_xmit doesn't access already freed memory. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- diff --git a/net/core/dev.c b/net/core/dev.c index 14de297d02..4d891beab1 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1480,14 +1480,16 @@ gso: if (q->enqueue) { /* Grab device queue */ spin_lock(&dev->queue_lock); + q = dev->qdisc; + if (q->enqueue) { + rc = q->enqueue(skb, q); + qdisc_run(dev); + spin_unlock(&dev->queue_lock); - rc = q->enqueue(skb, q); - - qdisc_run(dev); - + rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; + goto out; + } spin_unlock(&dev->queue_lock); - rc = rc == NET_XMIT_BYPASS ? NET_XMIT_SUCCESS : rc; - goto out; } /* The device has no queue. Common case for software devices: diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 7e14f14058..37a1840216 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -401,7 +401,7 @@ static int tc_dump_tfilter(struct sk_buff *skb, struct netlink_callback *cb) if ((dev = dev_get_by_index(tcm->tcm_ifindex)) == NULL) return skb->len; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); if (!tcm->tcm_parent) q = dev->qdisc_sleeping; else @@ -458,7 +458,7 @@ errout: if (cl) cops->put(q, cl); out: - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); dev_put(dev); return skb->len; } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index a19eff12cf..0b64892911 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -195,14 +195,14 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) { struct Qdisc *q; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (q->handle == handle) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return q; } } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); return NULL; } @@ -837,7 +837,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) continue; if (idx > s_idx) s_q_idx = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); q_idx = 0; list_for_each_entry(q, &dev->qdisc_list, list) { if (q_idx < s_q_idx) { @@ -846,12 +846,12 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) } if (tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, NLM_F_MULTI, RTM_NEWQDISC) <= 0) { - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); goto done; } q_idx++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); } done: @@ -1074,7 +1074,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) s_t = cb->args[0]; t = 0; - read_lock_bh(&qdisc_tree_lock); + read_lock(&qdisc_tree_lock); list_for_each_entry(q, &dev->qdisc_list, list) { if (t < s_t || !q->ops->cl_ops || (tcm->tcm_parent && @@ -1096,7 +1096,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) break; t++; } - read_unlock_bh(&qdisc_tree_lock); + read_unlock(&qdisc_tree_lock); cb->args[0] = t; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 6f91518997..88c6a99ce5 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -45,11 +45,10 @@ The idea is the following: - enqueue, dequeue are serialized via top level device spinlock dev->queue_lock. - - tree walking is protected by read_lock_bh(qdisc_tree_lock) + - tree walking is protected by read_lock(qdisc_tree_lock) and this lock is used only in process context. - - updates to tree are made under rtnl semaphore or - from softirq context (__qdisc_destroy rcu-callback) - hence this lock needs local bh disabling. + - updates to tree are made only under rtnl semaphore, + hence this lock may be made without local bh disabling. qdisc_tree_lock must be grabbed BEFORE dev->queue_lock! */ @@ -57,14 +56,14 @@ DEFINE_RWLOCK(qdisc_tree_lock); void qdisc_lock_tree(struct net_device *dev) { - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); spin_lock_bh(&dev->queue_lock); } void qdisc_unlock_tree(struct net_device *dev) { spin_unlock_bh(&dev->queue_lock); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } /* @@ -483,20 +482,6 @@ void qdisc_reset(struct Qdisc *qdisc) static void __qdisc_destroy(struct rcu_head *head) { struct Qdisc *qdisc = container_of(head, struct Qdisc, q_rcu); - struct Qdisc_ops *ops = qdisc->ops; - -#ifdef CONFIG_NET_ESTIMATOR - gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); -#endif - write_lock(&qdisc_tree_lock); - if (ops->reset) - ops->reset(qdisc); - if (ops->destroy) - ops->destroy(qdisc); - write_unlock(&qdisc_tree_lock); - module_put(ops->owner); - - dev_put(qdisc->dev); kfree((char *) qdisc - qdisc->padded); } @@ -504,32 +489,23 @@ static void __qdisc_destroy(struct rcu_head *head) void qdisc_destroy(struct Qdisc *qdisc) { - struct list_head cql = LIST_HEAD_INIT(cql); - struct Qdisc *cq, *q, *n; + struct Qdisc_ops *ops = qdisc->ops; if (qdisc->flags & TCQ_F_BUILTIN || - !atomic_dec_and_test(&qdisc->refcnt)) + !atomic_dec_and_test(&qdisc->refcnt)) return; - if (!list_empty(&qdisc->list)) { - if (qdisc->ops->cl_ops == NULL) - list_del(&qdisc->list); - else - list_move(&qdisc->list, &cql); - } - - /* unlink inner qdiscs from dev->qdisc_list immediately */ - list_for_each_entry(cq, &cql, list) - list_for_each_entry_safe(q, n, &qdisc->dev->qdisc_list, list) - if (TC_H_MAJ(q->parent) == TC_H_MAJ(cq->handle)) { - if (q->ops->cl_ops == NULL) - list_del_init(&q->list); - else - list_move_tail(&q->list, &cql); - } - list_for_each_entry_safe(cq, n, &cql, list) - list_del_init(&cq->list); + list_del(&qdisc->list); +#ifdef CONFIG_NET_ESTIMATOR + gen_kill_estimator(&qdisc->bstats, &qdisc->rate_est); +#endif + if (ops->reset) + ops->reset(qdisc); + if (ops->destroy) + ops->destroy(qdisc); + module_put(ops->owner); + dev_put(qdisc->dev); call_rcu(&qdisc->q_rcu, __qdisc_destroy); } @@ -549,15 +525,15 @@ void dev_activate(struct net_device *dev) printk(KERN_INFO "%s: activation failed\n", dev->name); return; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); list_add_tail(&qdisc->list, &dev->qdisc_list); - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } else { qdisc = &noqueue_qdisc; } - write_lock_bh(&qdisc_tree_lock); + write_lock(&qdisc_tree_lock); dev->qdisc_sleeping = qdisc; - write_unlock_bh(&qdisc_tree_lock); + write_unlock(&qdisc_tree_lock); } if (!netif_carrier_ok(dev))