atomic: introduce smp_mb_acquire and smp_mb_release
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
parent 0781dd6e79
commit f1ee86963b
@@ -15,7 +15,8 @@ Macros defined by qemu/atomic.h fall in three camps:
 - compiler barriers: barrier();
 
 - weak atomic access and manual memory barriers: atomic_read(),
-  atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_read_barrier_depends();
+  atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_mb_acquire(),
+  smp_mb_release(), smp_read_barrier_depends();
 
 - sequentially consistent atomic access: everything else.
 
@@ -111,8 +112,8 @@ consistent primitives.
 
 When using this model, variables are accessed with atomic_read() and
 atomic_set(), and restrictions to the ordering of accesses is enforced
-using the smp_rmb(), smp_wmb(), smp_mb() and smp_read_barrier_depends()
-memory barriers.
+using the memory barrier macros: smp_rmb(), smp_wmb(), smp_mb(),
+smp_mb_acquire(), smp_mb_release(), smp_read_barrier_depends().
 
 atomic_read() and atomic_set() prevents the compiler from using
 optimizations that might otherwise optimize accesses out of existence
@@ -124,7 +125,7 @@ other threads, and which are local to the current thread or protected
 by other, more mundane means.
 
 Memory barriers control the order of references to shared memory.
-They come in four kinds:
+They come in six kinds:
 
 - smp_rmb() guarantees that all the LOAD operations specified before
   the barrier will appear to happen before all the LOAD operations
@@ -142,6 +143,16 @@ They come in four kinds:
   In other words, smp_wmb() puts a partial ordering on stores, but is not
   required to have any effect on loads.
 
+- smp_mb_acquire() guarantees that all the LOAD operations specified before
+  the barrier will appear to happen before all the LOAD or STORE operations
+  specified after the barrier with respect to the other components of
+  the system.
+
+- smp_mb_release() guarantees that all the STORE operations specified *after*
+  the barrier will appear to happen after all the LOAD or STORE operations
+  specified *before* the barrier with respect to the other components of
+  the system.
+
 - smp_mb() guarantees that all the LOAD and STORE operations specified
   before the barrier will appear to happen before all the LOAD and
   STORE operations specified after the barrier with respect to the other
@@ -149,8 +160,9 @@ They come in four kinds:
 
   smp_mb() puts a partial ordering on both loads and stores. It is
   stronger than both a read and a write memory barrier; it implies both
-  smp_rmb() and smp_wmb(), but it also prevents STOREs coming before the
-  barrier from overtaking LOADs coming after the barrier and vice versa.
+  smp_mb_acquire() and smp_mb_release(), but it also prevents STOREs
+  coming before the barrier from overtaking LOADs coming after the
+  barrier and vice versa.
 
 - smp_read_barrier_depends() is a weaker kind of read barrier. On
   most processors, whenever two loads are performed such that the
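
As a minimal illustration of the acquire/release barriers documented in the
hunks above, here is a sketch (not part of the patch; the variables msg and
ready and the helper functions are hypothetical, and the snippet assumes the
qemu/atomic.h macros touched by this commit):

    #include "qemu/atomic.h"

    static int msg;
    static int ready;

    /* Writer: make the msg store visible before the ready store. */
    static void publish(void)
    {
        atomic_set(&msg, 42);
        smp_mb_release();   /* stores after the barrier happen after stores/loads before it */
        atomic_set(&ready, 1);
    }

    /* Reader: once ready is observed, the msg load is ordered after it. */
    static int consume(void)
    {
        while (!atomic_read(&ready)) {
            /* spin */
        }
        smp_mb_acquire();   /* loads before the barrier happen before later loads/stores */
        return atomic_read(&msg);
    }
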
@@ -173,24 +185,21 @@ They come in four kinds:
 This is the set of barriers that is required *between* two atomic_read()
 and atomic_set() operations to achieve sequential consistency:
 
-                 | 2nd operation                           |
-                 |-----------------------------------------|
-  1st operation  | (after last) | atomic_read | atomic_set |
-  ---------------+--------------+-------------+------------|
-  (before first) |              | none        | smp_wmb()  |
-  ---------------+--------------+-------------+------------|
-  atomic_read    | smp_rmb()    | smp_rmb()*  | **         |
-  ---------------+--------------+-------------+------------|
-  atomic_set     | none         | smp_mb()*** | smp_wmb()  |
-  ---------------+--------------+-------------+------------|
+                 | 2nd operation                                 |
+                 |-----------------------------------------------|
+  1st operation  | (after last)   | atomic_read | atomic_set     |
+  ---------------+----------------+-------------+----------------|
+  (before first) |                | none        | smp_mb_release |
+  ---------------+----------------+-------------+----------------|
+  atomic_read    | smp_mb_acquire | smp_rmb     | **             |
+  ---------------+----------------+-------------+----------------|
+  atomic_set     | none           | smp_mb()*** | smp_wmb()      |
+  ---------------+----------------+-------------+----------------|
 
 * Or smp_read_barrier_depends().
 
-** This requires a load-store barrier.  How to achieve this varies
-   depending on the machine, but in practice smp_rmb()+smp_wmb()
-   should have the desired effect.  For example, on PowerPC the
-   lwsync instruction is a combined load-load, load-store and
-   store-store barrier.
+** This requires a load-store barrier.  This is achieved by
+   either smp_mb_acquire() or smp_mb_release().
 
 *** This requires a store-load barrier.  On most machines, the only
     way to achieve this is a full barrier.
@@ -199,11 +208,11 @@ and atomic_set() operations to achieve sequential consistency:
 You can see that the two possible definitions of atomic_mb_read()
 and atomic_mb_set() are the following:
 
-  1) atomic_mb_read(p)   = atomic_read(p); smp_rmb()
-     atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); smp_mb()
+  1) atomic_mb_read(p)   = atomic_read(p); smp_mb_acquire()
+     atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v); smp_mb()
 
-  2) atomic_mb_read(p)   = smp_mb() atomic_read(p); smp_rmb()
-     atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v);
+  2) atomic_mb_read(p)   = smp_mb() atomic_read(p); smp_mb_acquire()
+     atomic_mb_set(p, v) = smp_mb_release(); atomic_set(p, v);
 
 Usually the former is used, because smp_mb() is expensive and a program
 normally has more reads than writes.  Therefore it makes more sense to
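
Written out as C, definition 1) above corresponds roughly to the following
sketch (for an int only; the helper names are illustrative, the real macros
are the atomic_mb_read()/atomic_mb_set() in qemu/atomic.h changed later in
this patch):

    #include "qemu/atomic.h"

    /* 1) read: relaxed load followed by an acquire barrier. */
    static inline int mb_read_sketch(int *p)
    {
        int val = atomic_read(p);
        smp_mb_acquire();
        return val;
    }

    /* 1) write: release barrier, relaxed store, then a full barrier. */
    static inline void mb_set_sketch(int *p, int v)
    {
        smp_mb_release();
        atomic_set(p, v);
        smp_mb();
    }
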
@@ -222,7 +231,7 @@ place barriers instead:
     thread 1                                thread 1
     -------------------------               ------------------------
     (other writes)
-                                            smp_wmb()
+                                            smp_mb_release()
     atomic_mb_set(&a, x)                    atomic_set(&a, x)
                                             smp_wmb()
     atomic_mb_set(&b, y)                    atomic_set(&b, y)
@@ -233,7 +242,13 @@ place barriers instead:
     y = atomic_mb_read(&b)                  y = atomic_read(&b)
                                             smp_rmb()
     x = atomic_mb_read(&a)                  x = atomic_read(&a)
-                                            smp_rmb()
+                                            smp_mb_acquire()
 
+  Note that the barrier between the stores in thread 1, and between
+  the loads in thread 2, has been optimized here to a write or a
+  read memory barrier respectively.  On some architectures, notably
+  ARMv7, smp_mb_acquire and smp_mb_release are just as expensive as
+  smp_mb, but smp_rmb and/or smp_wmb are more efficient.
+
 - sometimes, a thread is accessing many variables that are otherwise
   unrelated to each other (for example because, apart from the current
@@ -246,12 +261,12 @@ place barriers instead:
     n = 0;                                  n = 0;
     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
       n += atomic_mb_read(&a[i]);             n += atomic_read(&a[i]);
-                                            smp_rmb();
+                                            smp_mb_acquire();
 
   Similarly, atomic_mb_set() can be transformed as follows:
   smp_mb():
 
-                                            smp_wmb();
+                                            smp_mb_release();
     for (i = 0; i < 10; i++)          =>    for (i = 0; i < 10; i++)
       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
                                             smp_mb();
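
The hoisted-barrier form of the read loop in the hunk above, written as
compilable C (a sketch, not part of the patch; the array a and its size are
hypothetical):

    #include "qemu/atomic.h"

    static int a[10];

    static int sum_all(void)
    {
        int n = 0, i;

        for (i = 0; i < 10; i++) {
            n += atomic_read(&a[i]);
        }
        smp_mb_acquire();   /* one acquire barrier instead of one per iteration */
        return n;
    }
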
@@ -261,7 +276,7 @@ The two tricks can be combined.  In this case, splitting a loop in
 two lets you hoist the barriers out of the loops _and_ eliminate the
 expensive smp_mb():
 
-                                            smp_wmb();
+                                            smp_mb_release();
     for (i = 0; i < 10; i++) {        =>    for (i = 0; i < 10; i++)
       atomic_mb_set(&a[i], false);            atomic_set(&a[i], false);
       atomic_mb_set(&b[i], false);          smb_wmb();
@@ -312,8 +327,8 @@ access and for data dependency barriers:
       smp_read_barrier_depends();
       z = b[y];
 
-smp_wmb() also pairs with atomic_mb_read(), and smp_rmb() also pairs
-with atomic_mb_set().
+smp_wmb() also pairs with atomic_mb_read() and smp_mb_acquire(),
+and smp_rmb() also pairs with atomic_mb_set() and smp_mb_release().
 
 
 COMPARISON WITH LINUX KERNEL MEMORY BARRIERS
@@ -72,16 +72,16 @@
  * Add one here, and similarly in smp_rmb() and smp_read_barrier_depends().
  */
 
 #define smp_mb()    ({ barrier(); __atomic_thread_fence(__ATOMIC_SEQ_CST); })
-#define smp_wmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); })
-#define smp_rmb()   ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); })
+#define smp_mb_release() ({ barrier(); __atomic_thread_fence(__ATOMIC_RELEASE); })
+#define smp_mb_acquire() ({ barrier(); __atomic_thread_fence(__ATOMIC_ACQUIRE); })
 
 /* Most compilers currently treat consume and acquire the same, but really
  * no processors except Alpha need a barrier here.  Leave it in if
  * using Thread Sanitizer to avoid warnings, otherwise optimize it away.
  */
 #if defined(__SANITIZE_THREAD__)
 #define smp_read_barrier_depends() ({ barrier(); __atomic_thread_fence(__ATOMIC_CONSUME); })
 #elif defined(__alpha__)
 #define smp_read_barrier_depends() asm volatile("mb":::"memory")
 #else
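
For readers more used to standard C11 atomics: modulo the extra compiler
barrier, the two new macros above are ordinary acquire and release fences.
A rough standalone equivalent (a sketch, not QEMU code):

    #include <stdatomic.h>

    static inline void mb_acquire_sketch(void)
    {
        atomic_thread_fence(memory_order_acquire);   /* ~ __atomic_thread_fence(__ATOMIC_ACQUIRE) */
    }

    static inline void mb_release_sketch(void)
    {
        atomic_thread_fence(memory_order_release);   /* ~ __atomic_thread_fence(__ATOMIC_RELEASE) */
    }
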
@@ -149,13 +149,13 @@
     QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
     typeof_strip_qual(*ptr) _val;               \
     __atomic_load(ptr, &_val, __ATOMIC_RELAXED); \
-    smp_rmb();                                  \
+    smp_mb_acquire();                           \
     _val;                                       \
     })
 
 #define atomic_mb_set(ptr, i)  do {             \
     QEMU_BUILD_BUG_ON(sizeof(*ptr) > sizeof(void *)); \
-    smp_wmb();                                  \
+    smp_mb_release();                           \
     __atomic_store_n(ptr, i, __ATOMIC_RELAXED); \
     smp_mb();                                   \
 } while(0)
@@ -238,8 +238,8 @@
  * here (a compiler barrier only).  QEMU doesn't do accesses to write-combining
  * qemu memory or non-temporal load/stores from C code.
  */
-#define smp_wmb()   barrier()
-#define smp_rmb()   barrier()
+#define smp_mb_release()   barrier()
+#define smp_mb_acquire()   barrier()
 
 /*
  * __sync_lock_test_and_set() is documented to be an acquire barrier only,
@@ -263,13 +263,15 @@
  * smp_mb has the same problem as on x86 for not-very-new GCC
  * (http://patchwork.ozlabs.org/patch/126184/, Nov 2011).
  */
 #define smp_wmb()   ({ asm volatile("eieio" ::: "memory"); (void)0; })
 #if defined(__powerpc64__)
-#define smp_rmb()   ({ asm volatile("lwsync" ::: "memory"); (void)0; })
+#define smp_mb_release() ({ asm volatile("lwsync" ::: "memory"); (void)0; })
+#define smp_mb_acquire() ({ asm volatile("lwsync" ::: "memory"); (void)0; })
 #else
-#define smp_rmb()   ({ asm volatile("sync" ::: "memory"); (void)0; })
+#define smp_mb_release() ({ asm volatile("sync" ::: "memory"); (void)0; })
+#define smp_mb_acquire() ({ asm volatile("sync" ::: "memory"); (void)0; })
 #endif
 #define smp_mb()    ({ asm volatile("sync" ::: "memory"); (void)0; })
 
 #endif /* _ARCH_PPC */
 
@@ -277,18 +279,18 @@
  * For (host) platforms we don't have explicit barrier definitions
  * for, we use the gcc __sync_synchronize() primitive to generate a
  * full barrier.  This should be safe on all platforms, though it may
- * be overkill for smp_wmb() and smp_rmb().
+ * be overkill for smp_mb_acquire() and smp_mb_release().
  */
 #ifndef smp_mb
 #define smp_mb()   __sync_synchronize()
 #endif
 
-#ifndef smp_wmb
-#define smp_wmb()   __sync_synchronize()
+#ifndef smp_mb_acquire
+#define smp_mb_acquire()   __sync_synchronize()
 #endif
 
-#ifndef smp_rmb
-#define smp_rmb()   __sync_synchronize()
+#ifndef smp_mb_release
+#define smp_mb_release()   __sync_synchronize()
 #endif
 
 #ifndef smp_read_barrier_depends
@@ -365,13 +367,13 @@
  */
 #define atomic_mb_read(ptr)    ({           \
     typeof(*ptr) _val = atomic_read(ptr);   \
-    smp_rmb();                              \
+    smp_mb_acquire();                       \
     _val;                                   \
 })
 
 #ifndef atomic_mb_set
 #define atomic_mb_set(ptr, i)  do {         \
-    smp_wmb();                              \
+    smp_mb_release();                       \
     atomic_set(ptr, i);                     \
     smp_mb();                               \
 } while (0)
@@ -404,4 +406,12 @@
 #define atomic_or(ptr, n)      ((void) __sync_fetch_and_or(ptr, n))
 
 #endif /* __ATOMIC_RELAXED */
 
+#ifndef smp_wmb
+#define smp_wmb()   smp_mb_release()
+#endif
+#ifndef smp_rmb
+#define smp_rmb()   smp_mb_acquire()
+#endif
+
 #endif /* QEMU_ATOMIC_H */