Revamped portable atomic layer

Two main improvements:
1. The memory order is an explicit parameter, mirroring the C11 `*_explicit`
   atomics API.
2. The operations are type-generic macros rather than uintptr_t-only
   functions, so they also work with pointer-typed values.
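
For example, a load that previously hard-coded its ordering now takes the
order as an argument (taken from the arena changes below):

  // Before:
  uintptr_t poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
  // After:
  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);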

PiperOrigin-RevId: 520074974
diff --git a/upb/mem/arena.c b/upb/mem/arena.c
index 4161cb0..2fc814c 100644
--- a/upb/mem/arena.c
+++ b/upb/mem/arena.c
@@ -60,10 +60,11 @@
     UPB_ALIGN_UP(sizeof(_upb_MemBlock), UPB_MALLOC_ALIGN);
 
 static upb_Arena* _upb_Arena_FindRoot(upb_Arena* a) {
-  uintptr_t poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
+  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
   while (_upb_Arena_IsTaggedPointer(poc)) {
     upb_Arena* next = _upb_Arena_PointerFromTagged(poc);
-    uintptr_t next_poc = upb_Atomic_LoadAcquire(&next->parent_or_count);
+    uintptr_t next_poc =
+        upb_Atomic_Load(&next->parent_or_count, memory_order_acquire);
 
     if (_upb_Arena_IsTaggedPointer(next_poc)) {
       // To keep complexity down, we lazily collapse levels of the tree.  This
@@ -85,7 +86,7 @@
       // further away over time, but the path towards that root will continue to
       // be valid and the creation of the path carries all the memory orderings
       // required.
-      upb_Atomic_StoreRelaxed(&a->parent_or_count, next_poc);
+      upb_Atomic_Store(&a->parent_or_count, next_poc, memory_order_relaxed);
     }
     a = next;
     poc = next_poc;
@@ -110,10 +111,10 @@
 uint32_t upb_Arena_DebugRefCount(upb_Arena* a) {
   // These loads could probably be relaxed, but given that this is debug-only,
   // it's not worth introducing a new variant for it.
-  uintptr_t poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
+  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
   while (_upb_Arena_IsTaggedPointer(poc)) {
     a = _upb_Arena_PointerFromTagged(poc);
-    poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
+    poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
   }
   return _upb_Arena_RefCountFromTagged(poc);
 }
@@ -236,11 +237,11 @@
 }
 
 void upb_Arena_Free(upb_Arena* a) {
-  uintptr_t poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
+  uintptr_t poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
 retry:
   while (_upb_Arena_IsTaggedPointer(poc)) {
     a = _upb_Arena_PointerFromTagged(poc);
-    poc = upb_Atomic_LoadAcquire(&a->parent_or_count);
+    poc = upb_Atomic_Load(&a->parent_or_count, memory_order_acquire);
   }
 
   // compare_exchange or fetch_sub are RMW operations, which are more
@@ -251,10 +252,10 @@
     return;
   }
 
-  if (upb_Atomic_CompareExchangeStrongAcqRel(
+  if (upb_Atomic_CompareExchangeStrong(
           &a->parent_or_count, &poc,
-          _upb_Arena_TaggedFromRefcount(_upb_Arena_RefCountFromTagged(poc) -
-                                        1))) {
+          _upb_Arena_TaggedFromRefcount(_upb_Arena_RefCountFromTagged(poc) - 1),
+          memory_order_release, memory_order_acquire)) {
     // We were >1 and we decremented it successfully, so we are done.
     return;
   }
@@ -319,8 +320,10 @@
   // Only allow fuse with a common allocator
   if (r1->block_alloc != r2->block_alloc) return false;
 
-  uintptr_t r1_poc = upb_Atomic_LoadAcquire(&r1->parent_or_count);
-  uintptr_t r2_poc = upb_Atomic_LoadAcquire(&r2->parent_or_count);
+  uintptr_t r1_poc =
+      upb_Atomic_Load(&r1->parent_or_count, memory_order_acquire);
+  uintptr_t r2_poc =
+      upb_Atomic_Load(&r2->parent_or_count, memory_order_acquire);
   UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r1_poc));
   UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r2_poc));
 
@@ -348,18 +351,20 @@
   // immediately begin decrementing `r1`'s refcount.  So we must install all the
   // refcounts that we know about first to prevent a premature unref to zero.
   uint32_t r2_refcount = _upb_Arena_RefCountFromTagged(r2_poc);
-  upb_Atomic_AddRelease(&r1->parent_or_count, ((uintptr_t)r2_refcount) << 1);
+  upb_Atomic_Add(&r1->parent_or_count, ((uintptr_t)r2_refcount) << 1,
+                 memory_order_release);
 
   // When installing `r1` as the parent for `r2` racing frees may have changed
   // the refcount for `r2` so we need to capture the old value to fix up `r1`'s
   // refcount based on the delta from what we saw the first time.
-  r2_poc = upb_Atomic_ExchangeAcqRel(&r2->parent_or_count,
-                                     _upb_Arena_TaggedFromPointer(r1));
+  r2_poc = upb_Atomic_Exchange(&r2->parent_or_count,
+                               _upb_Arena_TaggedFromPointer(r1),
+                               memory_order_acq_rel);
   UPB_ASSERT(_upb_Arena_IsTaggedRefcount(r2_poc));
   uint32_t delta_refcount = r2_refcount - _upb_Arena_RefCountFromTagged(r2_poc);
   if (delta_refcount != 0) {
-    upb_Atomic_SubRelease(&r1->parent_or_count, ((uintptr_t)delta_refcount)
-                                                    << 1);
+    upb_Atomic_Sub(&r1->parent_or_count, ((uintptr_t)delta_refcount) << 1,
+                   memory_order_release);
   }
   return true;
 }
diff --git a/upb/port/atomic.h b/upb/port/atomic.h
index 3406575..9ad7fde 100644
--- a/upb/port/atomic.h
+++ b/upb/port/atomic.h
@@ -35,66 +35,54 @@
 #include <stdatomic.h>
 #include <stdbool.h>
 
-UPB_INLINE void upb_Atomic_Init(_Atomic uintptr_t* addr, uintptr_t val) {
-  atomic_init(addr, val);
-}
-
-UPB_INLINE uintptr_t upb_Atomic_LoadAcquire(_Atomic uintptr_t* addr) {
-  return atomic_load_explicit(addr, memory_order_acquire);
-}
-
-UPB_INLINE void upb_Atomic_StoreRelaxed(_Atomic uintptr_t* addr,
-                                        uintptr_t val) {
-  atomic_store_explicit(addr, val, memory_order_relaxed);
-}
-
-UPB_INLINE void upb_Atomic_AddRelease(_Atomic uintptr_t* addr, uintptr_t val) {
-  atomic_fetch_add_explicit(addr, val, memory_order_release);
-}
-
-UPB_INLINE void upb_Atomic_SubRelease(_Atomic uintptr_t* addr, uintptr_t val) {
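+// Thin wrappers over the C11 `*_explicit` atomics; the caller supplies the
+// memory order directly.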
+#define upb_Atomic_Init(addr, val) atomic_init(addr, val)
+#define upb_Atomic_Load(addr, order) atomic_load_explicit(addr, order)
+#define upb_Atomic_Store(addr, val, order) \
+  atomic_store_explicit(addr, val, order)
+#define upb_Atomic_Add(addr, val, order) \
+  atomic_fetch_add_explicit(addr, val, order)
-  atomic_fetch_sub_explicit(addr, val, memory_order_release);
-}
+#define upb_Atomic_Sub(addr, val, order) \
+  atomic_fetch_sub_explicit(addr, val, order)
-
-UPB_INLINE uintptr_t upb_Atomic_ExchangeAcqRel(_Atomic uintptr_t* addr,
-                                               uintptr_t val) {
-  return atomic_exchange_explicit(addr, val, memory_order_acq_rel);
-}
-
-UPB_INLINE bool upb_Atomic_CompareExchangeStrongAcqRel(_Atomic uintptr_t* addr,
-                                                       uintptr_t* expected,
-                                                       uintptr_t desired) {
-  return atomic_compare_exchange_strong_explicit(
-      addr, expected, desired, memory_order_release, memory_order_acquire);
-}
+#define upb_Atomic_Exchange(addr, val, order) \
+  atomic_exchange_explicit(addr, val, order)
+#define upb_Atomic_CompareExchangeStrong(addr, expected, desired,      \
+                                         success_order, failure_order) \
+  atomic_compare_exchange_strong_explicit(addr, expected, desired,     \
+                                          success_order, failure_order)
 
 #else  // !UPB_USE_C11_ATOMICS
 
-UPB_INLINE void upb_Atomic_Init(uintptr_t* addr, uintptr_t val) { *addr = val; }
+#include <string.h>
 
-UPB_INLINE uintptr_t upb_Atomic_LoadAcquire(uintptr_t* addr) { return *addr; }
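+// Fallback for targets without C11 atomics: plain, unsynchronized operations.
+// The `order` argument is accepted for signature compatibility but ignored.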
+#define upb_Atomic_Init(addr, val) (*(addr) = val)
+#define upb_Atomic_Load(addr, order) (*(addr))
+#define upb_Atomic_Store(addr, val, order) (*(addr) = val)
+#define upb_Atomic_Add(addr, val, order) (*(addr) += val)
+#define upb_Atomic_Sub(addr, val, order) (*(addr) -= val)
 
-UPB_INLINE void upb_Atomic_StoreRelaxed(uintptr_t* addr, uintptr_t val) {
-  *addr = val;
-}
-
-UPB_INLINE void upb_Atomic_AddRelease(uintptr_t* addr, uintptr_t val) {
-  *addr += val;
-}
-
-UPB_INLINE void upb_Atomic_SubRelease(uintptr_t* addr, uintptr_t val) {
-  *addr -= val;
-}
-
-UPB_INLINE uintptr_t upb_Atomic_ExchangeAcqRel(uintptr_t* addr, uintptr_t val) {
+UPB_INLINE uintptr_t _upb_NonAtomic_ExchangeU(uintptr_t* addr, uintptr_t val) {
   uintptr_t ret = *addr;
   *addr = val;
   return ret;
 }
 
-UPB_INLINE bool upb_Atomic_CompareExchangeStrongAcqRel(uintptr_t* addr,
-                                                       uintptr_t* expected,
-                                                       uintptr_t desired) {
+// `addr` should logically be `void**`, but `void*` allows for more convenient
+// implicit conversions.
+UPB_INLINE void* _upb_NonAtomic_ExchangeP(void* addr, void* val) {
+  void* ret;
+  memcpy(&ret, addr, sizeof(val));
+  memcpy(addr, &val, sizeof(val));
+  return ret;
+}
+
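+// `_Generic` dispatches on the static type of `val`, so the same macro works
+// for both uintptr_t- and pointer-typed values; `order` is ignored here.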
+#define upb_Atomic_Exchange(addr, val, order) \
+  _Generic((val),                             \
+      uintptr_t: _upb_NonAtomic_ExchangeU,    \
+      void*: _upb_NonAtomic_ExchangeP)(addr, val)
+
+UPB_INLINE bool _upb_NonAtomic_CompareExchangeStrongU(uintptr_t* addr,
+                                                      uintptr_t* expected,
+                                                      uintptr_t desired) {
   if (*addr == *expected) {
     *addr = desired;
     return true;
@@ -104,6 +92,25 @@
   }
 }
 
+// `addr` and `expected` should logically be `void**`, but `void*` allows for
+// more convenient implicit conversions.
+UPB_INLINE bool _upb_NonAtomic_CompareExchangeStrongP(void* addr,
+                                                      void* expected,
+                                                      void* desired) {
+  if (memcmp(addr, expected, sizeof(desired)) == 0) {
+    memcpy(addr, &desired, sizeof(desired));
+    return true;
+  } else {
+    memcpy(expected, addr, sizeof(desired));
+    return false;
+  }
+}
+
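+// As with upb_Atomic_Exchange, `_Generic` dispatches on the type of
+// `desired`; both memory orders are ignored in this non-atomic fallback.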
+#define upb_Atomic_CompareExchangeStrong(addr, expected, desired,      \
+                                         success_order, failure_order) \
+  _Generic((desired),                                                  \
+      uintptr_t: _upb_NonAtomic_CompareExchangeStrongU,                \
+      void*: _upb_NonAtomic_CompareExchangeStrongP)(addr, expected, desired)
+
 #endif
 
 #include "upb/port/undef.inc"