@@ -1348,14 +1348,73 @@ pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
     simd_shuffle!(a, b, [4, 1, 2, 3])
 }
 
-/// Performs a serializing operation on all store-to-memory instructions that
-/// were issued prior to this instruction.
+/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
+/// were issued by the current thread prior to this instruction.
 ///
-/// Guarantees that every store instruction that precedes, in program order, is
-/// globally visible before any store instruction which follows the fence in
-/// program order.
+/// Guarantees that every non-temporal store instruction that precedes this fence, in program
+/// order, is ordered before any load or store instruction which follows the fence in
+/// synchronization order.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
+/// (but note that Intel is only documenting the hardware-level concerns related to this
+/// instruction; the Intel documentation does not take into account the extra concerns that arise
+/// because the Rust memory model is different from the x86 memory model.)
+///
+/// # Safety of non-temporal stores
+///
+/// After using any non-temporal store intrinsic, but before any other access to the memory that the
+/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
+/// intrinsic.
+///
+/// Non-temporal stores behave very differently from regular stores. For the purpose of the Rust
+/// memory model, these stores are happening asynchronously in a background thread. This means a
+/// non-temporal store can cause data races with other accesses, even other accesses on the same
+/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
+/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
+/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
+/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
+/// with all the non-temporal stores previously started on this thread, which means in particular
+/// that subsequent synchronization with other threads will then work as intended again.
+///
+/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
+/// code jumps back to code outside your library. This ensures all stores inside your function
+/// are synchronized-before the return, and thus transitively synchronized-before everything
+/// the caller does after your function returns. An example of this pattern is sketched below.
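+///
+/// As a sketch, a hypothetical helper that fills a buffer with non-temporal stores and fences
+/// just before returning could look like this (the name, alignment, and length requirements here
+/// are illustrative assumptions, not part of any API):
+///
+/// ```
+/// use core::arch::x86_64::*;
+///
+/// // SAFETY: `dst` must be valid for `len` writes, 16-byte aligned, and `len`
+/// // must be a multiple of 4; requires the `sse` target feature.
+/// #[target_feature(enable = "sse")]
+/// unsafe fn fill_streaming(dst: *mut f32, len: usize, v: __m128) {
+///     for i in (0..len).step_by(4) {
+///         // Issue a non-temporal store of 4 floats.
+///         _mm_stream_ps(dst.add(i), v);
+///     }
+///     // Wait for all the stores above; they are now synchronized-before
+///     // everything the caller does after this function returns.
+///     _mm_sfence();
+/// }
+/// ```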
+//
+// The following is not a doc comment since it's not clear whether we want to put this into the
+// docs, but it should be written out somewhere.
+//
+// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
+// inspect, and that behave like the following functions. This explains where the docs above come
+// from.
+// ```
+// #[thread_local]
+// static mut PENDING_NONTEMP_WRITES: AtomicUsize = AtomicUsize::new(0);
+//
+// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
+//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
+//     // Spawn a thread that will eventually do our write.
+//     // We need to fetch a pointer to this thread's pending-write
+//     // counter, so that we can access it from the background thread.
+//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
+//     // If this was actual Rust code we'd have to do some extra work
+//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
+//     std::thread::spawn(move || {
+//         // Do the write in the background thread.
+//         ptr.write(val);
+//         // Register the write as done. Crucially, this is `Release`, so it
+//         // syncs-with the `Acquire` in `sfence`.
+//         (&*pending_writes).fetch_sub(1, Release);
+//     });
+// }
+//
+// pub fn sfence() {
+//     unsafe {
+//         // Wait until there are no more pending writes.
+//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
+//     }
+// }
+// ```
 #[inline]
 #[target_feature(enable = "sse")]
 #[cfg_attr(test, assert_instr(sfence))]
@@ -1938,6 +1997,15 @@ extern "C" {
 /// exception _may_ be generated.
 ///
 /// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
+///
+/// # Safety of non-temporal stores
+///
+/// After using this intrinsic, but before any other access to the memory that this intrinsic
+/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
+/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
+/// return.
+///
+/// See [`_mm_sfence`] for details.
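+///
+/// A minimal usage sketch, with a hypothetical `store_one` wrapper (the pointer validity and
+/// 16-byte alignment are assumptions the caller must uphold):
+///
+/// ```
+/// use core::arch::x86_64::*;
+///
+/// #[target_feature(enable = "sse")]
+/// unsafe fn store_one(p: *mut f32, v: __m128) {
+///     _mm_stream_ps(p, v); // non-temporal store of 4 floats
+///     _mm_sfence(); // required before any other access to that memory
+/// }
+/// ```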
 #[inline]
 #[target_feature(enable = "sse")]
 #[cfg_attr(test, assert_instr(movntps))]