[PATCH] percpu_counters: create lib/percpu_counter.c

[linux-2.6] / Documentation / memory-barriers.txt
diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt

index f8550310a6d5d6481ac15417dac04fcfaf0e910a..4710845dbac4fb663e35fcf0056499614a06e56b 100644 (file)
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -19,6 +19,7 @@ Contents:
       - Control dependencies.
       - SMP barrier pairing.
       - Examples of memory barrier sequences.
+     - Read memory barriers vs load speculation.
  
   (*) Explicit kernel barriers.
  
@@ -248,7 +249,7 @@ And there are a number of things that _must_ or _must_not_ be assumed:
       we may get either of:
  
         STORE *A = X; Y = LOAD *A;
-       STORE *A = Y;
+       STORE *A = Y = X;
  
  
  =========================
@@ -344,9 +345,12 @@ Memory barriers come in four basic varieties:
  
   (4) General memory barriers.
  
-     A general memory barrier is a combination of both a read memory barrier
-     and a write memory barrier.  It is a partial ordering over both loads and
-     stores.
+     A general memory barrier gives a guarantee that all the LOAD and STORE
+     operations specified before the barrier will appear to happen before all
+     the LOAD and STORE operations specified after the barrier with respect to
+     the other components of the system.
+
+     A general memory barrier is a partial ordering over both loads and stores.
  
       General memory barriers imply both read and write memory barriers, and so
       can substitute for either.
@@ -546,9 +550,9 @@ write barrier, though, again, a general barrier is viable:
         =============== ===============
         a = 1;
         <write barrier>
-       b = 2;          x = a;
+       b = 2;          x = b;
                         <read barrier>
-                       y = b;
+                       y = a;
  
  Or:
  
@@ -563,6 +567,18 @@ Or:
  Basically, the read barrier always has to be there, even though it can be of
  the "weaker" type.
  
+[!] Note that the stores before the write barrier would normally be expected to
+match the loads after the read barrier or data dependency barrier, and vice
+versa:
+
+       CPU 1                           CPU 2
+       ===============                 ===============
+       a = 1;           }----   --->{  v = c
+       b = 2;           }    \ /    {  w = d
+       <write barrier>        \        <read barrier>
+       c = 3;           }    / \    {  x = a;
+       d = 4;           }----   --->{  y = b;
+
  
  EXAMPLES OF MEMORY BARRIER SEQUENCES
  ------------------------------------
@@ -600,8 +616,8 @@ STORE B, STORE C } all occuring before the unordered set of { STORE D, STORE E
         |       |       +------+
         +-------+       :      :
                            |
-                          | Sequence in which stores committed to memory system
-                          | by CPU 1
+                          | Sequence in which stores are committed to the
+                          | memory system by CPU 1
                            V
  
  
@@ -610,6 +626,7 @@ loads.  Consider the following sequence of events:
  
         CPU 1                   CPU 2
         ======================= =======================
+               { B = 7; X = 9; Y = 8; C = &Y }
         STORE A = 1
         STORE B = 2
         <write barrier>
@@ -651,7 +668,20 @@ In the above example, CPU 2 perceives that B is 7, despite the load of *C
  (which would be B) coming after the the LOAD of C.
  
  If, however, a data dependency barrier were to be placed between the load of C
-and the load of *C (ie: B) on CPU 2, then the following will occur:
+and the load of *C (ie: B) on CPU 2:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { B = 7; X = 9; Y = 8; C = &Y }
+       STORE A = 1
+       STORE B = 2
+       <write barrier>
+       STORE C = &B            LOAD X
+       STORE D = 4             LOAD C (gets &B)
+                               <data dependency barrier>
+                               LOAD *C (reads B)
+
+then the following will occur:
  
         +-------+       :      :                :       :
         |       |       +------+                +-------+
@@ -669,14 +699,12 @@ and the load of *C (ie: B) on CPU 2, then the following will occur:
                                        |        :       :       |       |
                                        |        :       :       | CPU 2 |
                                        |        +-------+       |       |
-                                       \       | X->9  |------>|       |
-                                        \      +-------+       |       |
-                                         ----->| B->2  |       |       |
-                                               +-------+       |       |
-            Makes sure all effects --->    ddddddddddddddddd   |       |
-            prior to the store of C            +-------+       |       |
-            are perceptible to                 | B->2  |------>|       |
-            successive loads                   +-------+       |       |
+                                      |        | X->9  |------>|       |
+                                      |        +-------+       |       |
+         Makes sure all effects --->   \   ddddddddddddddddd   |       |
+         prior to the store of C        \      +-------+       |       |
+         are perceptible to              ----->| B->2  |------>|       |
+         subsequent loads                      +-------+       |       |
                                                 :       :       +-------+
  
  
@@ -685,73 +713,239 @@ following sequence of events:
  
         CPU 1                   CPU 2
         ======================= =======================
+               { A = 0, B = 9 }
         STORE A=1
-       STORE B=2
-       STORE C=3
         <write barrier>
-       STORE D=4
-       STORE E=5
-                               LOAD A
+       STORE B=2
                                 LOAD B
-                               LOAD C
-                               LOAD D
-                               LOAD E
+                               LOAD A
  
  Without intervention, CPU 2 may then choose to perceive the events on CPU 1 in
  some effectively random order, despite the write barrier issued by CPU 1:
  
-       +-------+       :      :
-       |       |       +------+
-       |       |------>| C=3  | }
-       |       |  :    +------+ }
-       |       |  :    | A=1  | }
-       |       |  :    +------+ }
-       | CPU 1 |  :    | B=2  | }---
-       |       |       +------+ }   \
-       |       |   wwwwwwwwwwwww}    \
-       |       |       +------+ }     \          :       :       +-------+
-       |       |  :    | E=5  | }      \         +-------+       |       |
-       |       |  :    +------+ }       \      { | C->3  |------>|       |
-       |       |------>| D=4  | }        \     { +-------+    :  |       |
-       |       |       +------+           \    { | E->5  |    :  |       |
-       +-------+       :      :            \   { +-------+    :  |       |
-                                  Transfer  -->{ | A->1  |    :  | CPU 2 |
-                                 from CPU 1    { +-------+    :  |       |
-                                  to CPU 2     { | D->4  |    :  |       |
-                                               { +-------+    :  |       |
-                                               { | B->2  |------>|       |
-                                                 +-------+       |       |
-                                                 :       :       +-------+
-
-
-If, however, a read barrier were to be placed between the load of C and the
-load of D on CPU 2, then the partial ordering imposed by CPU 1 will be
-perceived correctly by CPU 2.
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       | A->0  |------>|       |
+                                       |       +-------+       |       |
+                                       |       :       :       +-------+
+                                        \      :       :
+                                         \     +-------+
+                                          ---->| A->1  |
+                                               +-------+
+                                               :       :
  
-       +-------+       :      :
-       |       |       +------+
-       |       |------>| C=3  | }
-       |       |  :    +------+ }
-       |       |  :    | A=1  | }---
-       |       |  :    +------+ }   \
-       | CPU 1 |  :    | B=2  | }    \
-       |       |       +------+       \
-       |       |   wwwwwwwwwwwwwwww    \
-       |       |       +------+         \        :       :       +-------+
-       |       |  :    | E=5  | }        \       +-------+       |       |
-       |       |  :    +------+ }---      \    { | C->3  |------>|       |
-       |       |------>| D=4  | }   \      \   { +-------+    :  |       |
-       |       |       +------+      \      -->{ | B->2  |    :  |       |
-       +-------+       :      :       \        { +-------+    :  |       |
-                                       \       { | A->1  |    :  | CPU 2 |
-                                        \        +-------+       |       |
-          At this point the read ---->   \   rrrrrrrrrrrrrrrrr   |       |
-          barrier causes all effects      \      +-------+       |       |
-          prior to the storage of C        \   { | E->5  |    :  |       |
-          to be perceptible to CPU 2        -->{ +-------+    :  |       |
-                                               { | D->4  |------>|       |
-                                                 +-------+       |       |
-                                                 :       :       +-------+
+
+If, however, a read barrier were to be placed between the load of E and the
+load of A on CPU 2:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { A = 0, B = 9 }
+       STORE A=1
+       <write barrier>
+       STORE B=2
+                               LOAD B
+                               <read barrier>
+                               LOAD A
+
+then the partial ordering imposed by CPU 1 will be perceived correctly by CPU
+2:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                       |       :       :       |       |
+         At this point the read ---->   \  rrrrrrrrrrrrrrrrr   |       |
+         barrier causes all effects      \     +-------+       |       |
+         prior to the storage of B        ---->| A->1  |------>|       |
+         to be perceptible to CPU 2            +-------+       |       |
+                                               :       :       +-------+
+
+
+To illustrate this more completely, consider what could happen if the code
+contained a load of A either side of the read barrier:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+               { A = 0, B = 9 }
+       STORE A=1
+       <write barrier>
+       STORE B=2
+                               LOAD B
+                               LOAD A [first load of A]
+                               <read barrier>
+                               LOAD A [second load of A]
+
+Even though the two loads of A both occur after the load of B, they may both
+come up with different values:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                       |       :       :       |       |
+                                       |       +-------+       |       |
+                                       |       | A->0  |------>| 1st   |
+                                       |       +-------+       |       |
+         At this point the read ---->   \  rrrrrrrrrrrrrrrrr   |       |
+         barrier causes all effects      \     +-------+       |       |
+         prior to the storage of B        ---->| A->1  |------>| 2nd   |
+         to be perceptible to CPU 2            +-------+       |       |
+                                               :       :       +-------+
+
+
+But it may be that the update to A from CPU 1 becomes perceptible to CPU 2
+before the read barrier completes anyway:
+
+       +-------+       :      :                :       :
+       |       |       +------+                +-------+
+       |       |------>| A=1  |------      --->| A->0  |
+       |       |       +------+      \         +-------+
+       | CPU 1 |   wwwwwwwwwwwwwwww   \    --->| B->9  |
+       |       |       +------+        |       +-------+
+       |       |------>| B=2  |---     |       :       :
+       |       |       +------+   \    |       :       :       +-------+
+       +-------+       :      :    \   |       +-------+       |       |
+                                    ---------->| B->2  |------>|       |
+                                       |       +-------+       | CPU 2 |
+                                       |       :       :       |       |
+                                        \      :       :       |       |
+                                         \     +-------+       |       |
+                                          ---->| A->1  |------>| 1st   |
+                                               +-------+       |       |
+                                           rrrrrrrrrrrrrrrrr   |       |
+                                               +-------+       |       |
+                                               | A->1  |------>| 2nd   |
+                                               +-------+       |       |
+                                               :       :       +-------+
+
+
+The guarantee is that the second load will always come up with A == 1 if the
+load of B came up with B == 2.  No such guarantee exists for the first load of
+A; that may come up with either A == 0 or A == 1.
+
+
+READ MEMORY BARRIERS VS LOAD SPECULATION
+----------------------------------------
+
+Many CPUs speculate with loads: that is they see that they will need to load an
+item from memory, and they find a time where they're not using the bus for any
+other loads, and so do the load in advance - even though they haven't actually
+got to that point in the instruction execution flow yet.  This permits the
+actual load instruction to potentially complete immediately because the CPU
+already has the value to hand.
+
+It may turn out that the CPU didn't actually need the value - perhaps because a
+branch circumvented the load - in which case it can discard the value or just
+cache it for later use.
+
+Consider:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+                               LOAD B
+                               DIVIDE          } Divide instructions generally
+                               DIVIDE          } take a long time to perform
+                               LOAD A
+
+Which might appear as this:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       The CPU being busy doing a --->     --->| A->0  |~~~~   |       |
+       division speculates on the              +-------+   ~   |       |
+       LOAD of A                               :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+       Once the divisions are complete -->     :       :   ~-->|       |
+       the CPU can then perform the            :       :       |       |
+       LOAD with immediate effect              :       :       +-------+
+
+
+Placing a read barrier or a data dependency barrier just before the second
+load:
+
+       CPU 1                   CPU 2
+       ======================= =======================
+                               LOAD B
+                               DIVIDE
+                               DIVIDE
+                               <read barrier>
+                               LOAD A
+
+will force any value speculatively obtained to be reconsidered to an extent
+dependent on the type of barrier used.  If there was no change made to the
+speculated memory location, then the speculated value will just be used:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       The CPU being busy doing a --->     --->| A->0  |~~~~   |       |
+       division speculates on the              +-------+   ~   |       |
+       LOAD of A                               :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+                                               :       :   ~   |       |
+                                           rrrrrrrrrrrrrrrr~   |       |
+                                               :       :   ~   |       |
+                                               :       :   ~-->|       |
+                                               :       :       |       |
+                                               :       :       +-------+
+
+
+but if there was an update or an invalidation from another CPU pending, then
+the speculation will be cancelled and the value reloaded:
+
+                                               :       :       +-------+
+                                               +-------+       |       |
+                                           --->| B->2  |------>|       |
+                                               +-------+       | CPU 2 |
+                                               :       :DIVIDE |       |
+                                               +-------+       |       |
+       The CPU being busy doing a --->     --->| A->0  |~~~~   |       |
+       division speculates on the              +-------+   ~   |       |
+       LOAD of A                               :       :   ~   |       |
+                                               :       :DIVIDE |       |
+                                               :       :   ~   |       |
+                                               :       :   ~   |       |
+                                           rrrrrrrrrrrrrrrrr   |       |
+                                               +-------+       |       |
+       The speculation is discarded --->   --->| A->1  |------>|       |
+       and an updated value is                 +-------+       |       |
+       retrieved                               :       :       +-------+
  
  
  ========================
@@ -829,8 +1023,8 @@ There are some more advanced barrier functions:
   (*) smp_mb__after_atomic_inc();
  
       These are for use with atomic add, subtract, increment and decrement
-     functions, especially when used for reference counting.  These functions
-     do not imply memory barriers.
+     functions that don't return a value, especially when used for reference
+     counting.  These functions do not imply memory barriers.
  
       As an example, consider a piece of code that marks an object as being dead
       and then decrements the object's reference count:
@@ -887,7 +1081,7 @@ IMPLICIT KERNEL MEMORY BARRIERS
  ===============================
  
  Some of the other functions in the linux kernel imply memory barriers, amongst
-which are locking, scheduling and memory allocation functions.
+which are locking and scheduling functions.
  
  This specification is a _minimum_ guarantee; any particular architecture may
  provide more substantial guarantees, but these may not be relied upon outside
@@ -952,6 +1146,20 @@ equivalent to a full barrier, but a LOCK followed by an UNLOCK is not.
      barriers is that the effects instructions outside of a critical section may
      seep into the inside of the critical section.
  
+A LOCK followed by an UNLOCK may not be assumed to be full memory barrier
+because it is possible for an access preceding the LOCK to happen after the
+LOCK, and an access following the UNLOCK to happen before the UNLOCK, and the
+two accesses can themselves then cross:
+
+       *A = a;
+       LOCK
+       UNLOCK
+       *B = b;
+
+may occur as:
+
+       LOCK, STORE *B, STORE *A, UNLOCK
+
  Locks and semaphores may not provide any guarantee of ordering on UP compiled
  systems, and so cannot be counted on in such a situation to actually achieve
  anything at all - especially with respect to I/O accesses - unless combined
@@ -1002,8 +1210,6 @@ Other functions that imply barriers:
  
   (*) schedule() and similar imply full memory barriers.
  
- (*) Memory allocation and release functions imply full memory barriers.
-
  
  =================================
  INTER-CPU LOCKING BARRIER EFFECTS
@@ -1017,7 +1223,7 @@ conflict on any particular lock.
  LOCKS VS MEMORY ACCESSES
  ------------------------
  
-Consider the following: the system has a pair of spinlocks (N) and (Q), and
+Consider the following: the system has a pair of spinlocks (M) and (Q), and
  three CPUs; then should the following sequence of events occur:
  
         CPU 1                           CPU 2
@@ -1263,15 +1469,17 @@ else.
  ATOMIC OPERATIONS
  -----------------
  
-Though they are technically interprocessor interaction considerations, atomic
-operations are noted specially as they do _not_ generally imply memory
-barriers.  The possible offenders include:
+Whilst they are technically interprocessor interaction considerations, atomic
+operations are noted specially as some of them imply full memory barriers and
+some don't, but they're very heavily relied on as a group throughout the
+kernel.
+
+Any atomic operation that modifies some state in memory and returns information
+about the state (old or new) implies an SMP-conditional general memory barrier
+(smp_mb()) on each side of the actual operation.  These include:
  
         xchg();
         cmpxchg();
-       test_and_set_bit();
-       test_and_clear_bit();
-       test_and_change_bit();
         atomic_cmpxchg();
         atomic_inc_return();
         atomic_dec_return();
@@ -1282,21 +1490,31 @@ barriers.  The possible offenders include:
         atomic_sub_and_test();
         atomic_add_negative();
         atomic_add_unless();
+       test_and_set_bit();
+       test_and_clear_bit();
+       test_and_change_bit();
+
+These are used for such things as implementing LOCK-class and UNLOCK-class
+operations and adjusting reference counters towards object destruction, and as
+such the implicit memory barrier effects are necessary.
  
-These may be used for such things as implementing LOCK operations or controlling
-the lifetime of objects by decreasing their reference counts.  In such cases
-they need preceding memory barriers.
  
-The following may also be possible offenders as they may be used as UNLOCK
-operations.
+The following operation are potential problems as they do _not_ imply memory
+barriers, but might be used for implementing such things as UNLOCK-class
+operations:
  
+       atomic_set();
         set_bit();
         clear_bit();
         change_bit();
-       atomic_set();
  
+With these the appropriate explicit memory barrier should be used if necessary
+(smp_mb__before_clear_bit() for instance).
  
-The following are a little tricky:
+
+The following also do _not_ imply memory barriers, and so may require explicit
+memory barriers under some circumstances (smp_mb__before_atomic_dec() for
+instance)):
  
         atomic_add();
         atomic_sub();
@@ -1317,10 +1535,12 @@ specific order.
  
  
  Basically, each usage case has to be carefully considered as to whether memory
-barriers are needed or not.  The simplest rule is probably: if the atomic
-operation is protected by a lock, then it does not require a barrier unless
-there's another operation within the critical section with respect to which an
-ordering must be maintained.
+barriers are needed or not.
+
+[!] Note that special memory barrier primitives are available for these
+situations because on some CPUs the atomic instructions used imply full memory
+barriers, and so barrier instructions are superfluous in conjunction with them,
+and in such cases the special barrier primitives will be no-ops.
  
  See Documentation/atomic_ops.txt for more information.
  
@@ -1650,7 +1870,7 @@ CPU's caches by some other cache event:
         smp_wmb();
         <A:modify v=2>  <C:busy>
                         <C:queue v=2>
-       p = &b;         q = p;
+       p = &v;         q = p;
                         <D:request p>
         <B:modify p=&v> <D:commit p=&v>
                         <D:read p>