From 67ccda16de2ef6a27f74b94c330f2b89cf4057f5 Mon Sep 17 00:00:00 2001
From: Ed Schouten <ed@FreeBSD.org>
Date: Thu, 13 Jun 2013 18:46:49 +0000
Subject: [PATCH] Add C11 atomic fallbacks for ARM.

Basically the situation is as follows:

- When using Clang + armv6, we should not need any intrinsics; Clang
  should support these atomics natively, but due to a target
  misconfiguration it currently does not. We should fix this in Clang.
- When using Clang + noarmv6, provide __atomic_* functions that disable
  interrupts.
- When using GCC + armv6, we can provide __sync_* intrinsics, similar
  to what we did for MIPS. As ARM and MIPS are quite similar in this
  respect, base this implementation on the one I did for MIPS.
- When using GCC + noarmv6, disable interrupts, like we do for Clang.

This implementation still lacks functions for noarmv6 userspace. To be
done.
---
 sys/arm/arm/stdatomic.c | 540 ++++++++++++++++++++++++++++++++++++++++
 sys/conf/files.arm      |   2 +
 2 files changed, 542 insertions(+)
 create mode 100644 sys/arm/arm/stdatomic.c

diff --git a/sys/arm/arm/stdatomic.c b/sys/arm/arm/stdatomic.c
new file mode 100644
index 00000000000..6921272b60b
--- /dev/null
+++ b/sys/arm/arm/stdatomic.c
@@ -0,0 +1,540 @@
/*-
 * Copyright (c) 2013 Ed Schouten <ed@FreeBSD.org>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

#include <sys/param.h>
#include <sys/stdatomic.h>
#include <sys/types.h>

#include <machine/cpufunc.h>

#ifdef _KERNEL
#include "opt_global.h"
#endif

/*
 * Executing statements with interrupts disabled.
 */

#ifndef SMP
#define WITHOUT_INTERRUPTS(s) do {              \
    register_t regs;                            \
                                                \
    regs = intr_disable();                      \
    do s while (0);                             \
    intr_restore(regs);                         \
} while (0)
#endif /* !SMP */
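
As an illustration (an editor's sketch, not part of the patch), this is
how a uniprocessor kernel routine would use WITHOUT_INTERRUPTS() to make
a read-modify-write sequence atomic; the counter and the function name
are hypothetical:

    static uint32_t counter;

    static uint32_t
    increment_counter(void)
    {
        uint32_t ret;

        /*
         * No interrupt handler can run between the load and the
         * store, so the update cannot be torn or lost.
         */
        WITHOUT_INTERRUPTS({
            ret = counter;
            counter = ret + 1;
        });
        return (ret);
    }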

/*
 * Memory barriers.
 *
 * It turns out __sync_synchronize() does not emit any code when used
 * with GCC 4.2. Implement our own version that does work reliably.
 *
 * Although __sync_lock_test_and_set() should only perform an acquire
 * barrier, make it do a full barrier like the other functions. This
 * should make <stdatomic.h>'s atomic_exchange_explicit() work reliably.
 */

static inline void
do_sync(void)
{

#if defined(_KERNEL) && !defined(SMP)
    __asm volatile ("" : : : "memory");
#elif defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)
    __asm volatile ("dmb" : : : "memory");
#else /* __ARM_ARCH_6__ */
    __asm volatile ("mcr p15, 0, %0, c7, c10, 5" : : "r" (0) : "memory");
#endif
}

#if defined(__CLANG_ATOMICS) || defined(__GNUC_ATOMICS)

/*
 * New C11 __atomic_* API.
 */

#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__) || \
    defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)

/* These systems should be supported by the compiler. */

#elif defined(_KERNEL)

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/* Clang doesn't allow us to reimplement builtins without this. */
#ifdef __clang__
#pragma redefine_extname __sync_synchronize_ext __sync_synchronize
#define __sync_synchronize __sync_synchronize_ext
#endif

void
__sync_synchronize(void)
{

    do_sync();
}

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */

#define EMIT_LOAD_N(N, uintN_t)                                     \
uintN_t                                                             \
__atomic_load_##N(uintN_t *mem, int model __unused)                 \
{                                                                   \
    uintN_t ret;                                                    \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        ret = *mem;                                                 \
    });                                                             \
    return (ret);                                                   \
}

#define EMIT_STORE_N(N, uintN_t)                                    \
void                                                                \
__atomic_store_##N(uintN_t *mem, uintN_t val, int model __unused)   \
{                                                                   \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        *mem = val;                                                 \
    });                                                             \
}

#define EMIT_COMPARE_EXCHANGE_N(N, uintN_t)                         \
_Bool                                                               \
__atomic_compare_exchange_##N(uintN_t *mem, uintN_t *expected,      \
    uintN_t desired, int success __unused, int failure __unused)    \
{                                                                   \
    _Bool ret;                                                      \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        if (*mem == *expected) {                                    \
            *mem = desired;                                         \
            ret = 1;                                                \
        } else {                                                    \
            *expected = *mem;                                       \
            ret = 0;                                                \
        }                                                           \
    });                                                             \
    return (ret);                                                   \
}

#define EMIT_FETCH_OP_N(N, uintN_t, name, op)                       \
uintN_t                                                             \
__atomic_##name##_##N(uintN_t *mem, uintN_t val, int model __unused) \
{                                                                   \
    uintN_t ret;                                                    \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        ret = *mem;                                                 \
        *mem op val;                                                \
    });                                                             \
    return (ret);                                                   \
}

#define EMIT_ALL_OPS_N(N, uintN_t)                                  \
EMIT_LOAD_N(N, uintN_t)                                             \
EMIT_STORE_N(N, uintN_t)                                            \
EMIT_COMPARE_EXCHANGE_N(N, uintN_t)                                 \
EMIT_FETCH_OP_N(N, uintN_t, exchange, =)                            \
EMIT_FETCH_OP_N(N, uintN_t, fetch_add, +=)                          \
EMIT_FETCH_OP_N(N, uintN_t, fetch_and, &=)                          \
EMIT_FETCH_OP_N(N, uintN_t, fetch_or, |=)                           \
EMIT_FETCH_OP_N(N, uintN_t, fetch_sub, -=)                          \
EMIT_FETCH_OP_N(N, uintN_t, fetch_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)

#else /* !__ARM_ARCH_6__ && !__ARM_ARCH_7__ && !_KERNEL */

/* XXX: Implement intrinsics for ARMv5 userspace. */

#endif

#endif /* __CLANG_ATOMICS || __GNUC_ATOMICS */

/*
 * Old __sync_* API.
 */

#if defined(__SYNC_ATOMICS)

#if defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || \
    defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || \
    defined(__ARM_ARCH_6ZK__) || \
    defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__)

/* Implementations for old GCC versions, lacking support for atomics. */

typedef union {
    uint8_t     v8[4];
    uint32_t    v32;
} reg_t;

/*
 * Given a memory address pointing to an 8-bit or 16-bit integer, return
 * the address of the 32-bit word containing it.
 */

static inline uint32_t *
round_to_word(void *ptr)
{

    return ((uint32_t *)((intptr_t)ptr & ~3));
}
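
To make the address arithmetic concrete, here is a tiny sanity check
(an editor's sketch built on the definitions above; the buffer and the
function are hypothetical, and assert() would of course not be used in
the kernel itself). Masking off the low two address bits maps every
byte to the aligned 32-bit word that contains it:

    #include <assert.h>

    static void
    example_round_to_word(void)
    {
        static uint8_t buf[8] __aligned(4);

        /* Bytes 4..7 all live in the word starting at &buf[4]. */
        assert(round_to_word(&buf[6]) == (uint32_t *)&buf[4]);
        /* Bytes 0..3 share the word starting at &buf[0]. */
        assert(round_to_word(&buf[3]) == (uint32_t *)&buf[0]);
    }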

/*
 * Utility functions for loading and storing 8-bit and 16-bit integers
 * in 32-bit words at an offset corresponding with the location of the
 * atomic variable.
 */

static inline void
put_1(reg_t *r, const uint8_t *offset_ptr, uint8_t val)
{
    size_t offset;

    offset = (intptr_t)offset_ptr & 3;
    r->v8[offset] = val;
}

static inline uint8_t
get_1(const reg_t *r, const uint8_t *offset_ptr)
{
    size_t offset;

    offset = (intptr_t)offset_ptr & 3;
    return (r->v8[offset]);
}

static inline void
put_2(reg_t *r, const uint16_t *offset_ptr, uint16_t val)
{
    size_t offset;
    union {
        uint16_t in;
        uint8_t out[2];
    } bytes;

    offset = (intptr_t)offset_ptr & 3;
    bytes.in = val;
    r->v8[offset] = bytes.out[0];
    r->v8[offset + 1] = bytes.out[1];
}

static inline uint16_t
get_2(const reg_t *r, const uint16_t *offset_ptr)
{
    size_t offset;
    union {
        uint8_t in[2];
        uint16_t out;
    } bytes;

    offset = (intptr_t)offset_ptr & 3;
    bytes.in[0] = r->v8[offset];
    bytes.in[1] = r->v8[offset + 1];
    return (bytes.out);
}

/*
 * 8-bit and 16-bit routines.
 *
 * These operations are not natively supported by the CPU, so we use
 * some shifting and bitmasking on top of the 32-bit instructions.
 */

#define EMIT_LOCK_TEST_AND_SET_N(N, uintN_t)                        \
uintN_t                                                             \
__sync_lock_test_and_set_##N(uintN_t *mem, uintN_t val)             \
{                                                                   \
    uint32_t *mem32;                                                \
    reg_t val32, negmask, old;                                      \
    uint32_t temp1, temp2;                                          \
                                                                    \
    mem32 = round_to_word(mem);                                     \
    val32.v32 = 0x00000000;                                         \
    put_##N(&val32, mem, val);                                      \
    negmask.v32 = 0xffffffff;                                       \
    put_##N(&negmask, mem, 0);                                      \
                                                                    \
    do_sync();                                                      \
    __asm volatile (                                                \
        "1:"                                                        \
        "\tldrex %0, %6\n"      /* Load old value. */               \
        "\tand %2, %5, %0\n"    /* Remove the old value. */         \
        "\torr %2, %2, %4\n"    /* Put in the new value. */         \
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */             \
        "\tcmp %3, #0\n"        /* Did it succeed? */               \
        "\tbne 1b\n"            /* Spin if failed. */               \
        : "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),            \
          "=&r" (temp2)                                             \
        : "r" (val32.v32), "r" (negmask.v32), "m" (*mem32));        \
    return (get_##N(&old, mem));                                    \
}

EMIT_LOCK_TEST_AND_SET_N(1, uint8_t)
EMIT_LOCK_TEST_AND_SET_N(2, uint16_t)
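
The mask dance above is easier to follow in plain C. As an
illustrative sketch (not part of the patch), one iteration of the
ldrex/strex loop computes the following; the real code must do it
between the exclusive load and the exclusive store, so that the store
fails and the loop retries if the containing word changed in the
meantime:

    static uint32_t
    test_and_set_step(uint32_t old, uint32_t val32, uint32_t negmask)
    {
        uint32_t new;

        new = old & negmask;    /* Clear the target byte(s). */
        new |= val32;           /* Merge in the new sub-word value. */
        return (new);
    }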

#define EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)                     \
uintN_t                                                             \
__sync_val_compare_and_swap_##N(uintN_t *mem, uintN_t expected,     \
    uintN_t desired)                                                \
{                                                                   \
    uint32_t *mem32;                                                \
    reg_t expected32, desired32, posmask, negmask, old;             \
    uint32_t temp1, temp2;                                          \
                                                                    \
    mem32 = round_to_word(mem);                                     \
    expected32.v32 = 0x00000000;                                    \
    put_##N(&expected32, mem, expected);                            \
    desired32.v32 = 0x00000000;                                     \
    put_##N(&desired32, mem, desired);                              \
    posmask.v32 = 0x00000000;                                       \
    put_##N(&posmask, mem, ~0);                                     \
    negmask.v32 = ~posmask.v32;                                     \
                                                                    \
    do_sync();                                                      \
    __asm volatile (                                                \
        "1:"                                                        \
        "\tldrex %0, %8\n"      /* Load old value. */               \
        "\tand %2, %6, %0\n"    /* Isolate the old value. */        \
        "\tcmp %2, %4\n"        /* Compare to expected value. */    \
        "\tbne 2f\n"            /* Values are unequal. */           \
        "\tand %2, %7, %0\n"    /* Remove the old value. */         \
        "\torr %2, %5\n"        /* Put in the new value. */         \
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */             \
        "\tcmp %3, #0\n"        /* Did it succeed? */               \
        "\tbne 1b\n"            /* Spin if failed. */               \
        "2:"                                                        \
        : "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),            \
          "=&r" (temp2)                                             \
        : "r" (expected32.v32), "r" (desired32.v32),                \
          "r" (posmask.v32), "r" (negmask.v32), "m" (*mem32));      \
    return (get_##N(&old, mem));                                    \
}

EMIT_VAL_COMPARE_AND_SWAP_N(1, uint8_t)
EMIT_VAL_COMPARE_AND_SWAP_N(2, uint16_t)

#define EMIT_ARITHMETIC_FETCH_AND_OP_N(N, uintN_t, name, op)        \
uintN_t                                                             \
__sync_##name##_##N(uintN_t *mem, uintN_t val)                      \
{                                                                   \
    uint32_t *mem32;                                                \
    reg_t val32, posmask, negmask, old;                             \
    uint32_t temp1, temp2;                                          \
                                                                    \
    mem32 = round_to_word(mem);                                     \
    val32.v32 = 0x00000000;                                         \
    put_##N(&val32, mem, val);                                      \
    posmask.v32 = 0x00000000;                                       \
    put_##N(&posmask, mem, ~0);                                     \
    negmask.v32 = ~posmask.v32;                                     \
                                                                    \
    do_sync();                                                      \
    __asm volatile (                                                \
        "1:"                                                        \
        "\tldrex %0, %7\n"      /* Load old value. */               \
        "\t"op" %2, %0, %4\n"   /* Calculate new value. */          \
        "\tand %2, %5\n"        /* Isolate the new value. */        \
        "\tand %3, %6, %0\n"    /* Remove the old value. */         \
        "\torr %2, %2, %3\n"    /* Put in the new value. */         \
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */             \
        "\tcmp %3, #0\n"        /* Did it succeed? */               \
        "\tbne 1b\n"            /* Spin if failed. */               \
        : "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),            \
          "=&r" (temp2)                                             \
        : "r" (val32.v32), "r" (posmask.v32),                       \
          "r" (negmask.v32), "m" (*mem32));                         \
    return (get_##N(&old, mem));                                    \
}

EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(1, uint8_t, fetch_and_sub, "sub")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_add, "add")
EMIT_ARITHMETIC_FETCH_AND_OP_N(2, uint16_t, fetch_and_sub, "sub")

#define EMIT_BITWISE_FETCH_AND_OP_N(N, uintN_t, name, op, idempotence) \
uintN_t                                                             \
__sync_##name##_##N(uintN_t *mem, uintN_t val)                      \
{                                                                   \
    uint32_t *mem32;                                                \
    reg_t val32, old;                                               \
    uint32_t temp1, temp2;                                          \
                                                                    \
    mem32 = round_to_word(mem);                                     \
    val32.v32 = idempotence ? 0xffffffff : 0x00000000;              \
    put_##N(&val32, mem, val);                                      \
                                                                    \
    do_sync();                                                      \
    __asm volatile (                                                \
        "1:"                                                        \
        "\tldrex %0, %5\n"      /* Load old value. */               \
        "\t"op" %2, %4, %0\n"   /* Calculate new value. */          \
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */             \
        "\tcmp %3, #0\n"        /* Did it succeed? */               \
        "\tbne 1b\n"            /* Spin if failed. */               \
        : "=&r" (old.v32), "=m" (*mem32), "=&r" (temp1),            \
          "=&r" (temp2)                                             \
        : "r" (val32.v32), "m" (*mem32));                           \
    return (get_##N(&old, mem));                                    \
}

EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(1, uint8_t, fetch_and_xor, "eor", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_and, "and", 1)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_or, "orr", 0)
EMIT_BITWISE_FETCH_AND_OP_N(2, uint16_t, fetch_and_xor, "eor", 0)
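
For callers, these functions keep the semantics of GCC's __sync_*
builtins; the only difference is that, for 8-bit and 16-bit operands,
the update is carried out on the containing 32-bit word. A hedged
usage sketch (the flag variable is hypothetical, not from the patch;
callers would normally write the generic __sync_val_compare_and_swap()
and let the compiler pick the sized variant):

    static uint8_t flag;

    static int
    try_set_flag(void)
    {

        /*
         * The return value is the value witnessed before the swap,
         * so 0 means we won the race and set the flag ourselves.
         */
        return (__sync_val_compare_and_swap_1(&flag, 0, 1) == 0);
    }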

/*
 * 32-bit routines.
 */

uint32_t
__sync_val_compare_and_swap_4(uint32_t *mem, uint32_t expected,
    uint32_t desired)
{
    uint32_t old, temp1, temp2;

    do_sync();
    __asm volatile (
        "1:"
        "\tldrex %0, %6\n"      /* Load old value. */
        "\tcmp %0, %4\n"        /* Compare to expected value. */
        "\tbne 2f\n"            /* Values are unequal. */
        "\tmov %2, %5\n"        /* Value to store. */
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */
        "\tcmp %3, #0\n"        /* Did it succeed? */
        "\tbne 1b\n"            /* Spin if failed. */
        "2:"
        : "=&r" (old), "=m" (*mem), "=&r" (temp1), "=&r" (temp2)
        : "r" (expected), "r" (desired), "m" (*mem));
    return (old);
}

#define EMIT_FETCH_AND_OP_4(name, op)                               \
uint32_t                                                            \
__sync_##name##_4(uint32_t *mem, uint32_t val)                      \
{                                                                   \
    uint32_t old, temp1, temp2;                                     \
                                                                    \
    do_sync();                                                      \
    __asm volatile (                                                \
        "1:"                                                        \
        "\tldrex %0, %5\n"      /* Load old value. */               \
        "\t"op"\n"              /* Calculate new value. */          \
        "\tstrex %3, %2, %1\n"  /* Attempt to store. */             \
        "\tcmp %3, #0\n"        /* Did it succeed? */               \
        "\tbne 1b\n"            /* Spin if failed. */               \
        : "=&r" (old), "=m" (*mem), "=&r" (temp1),                  \
          "=&r" (temp2)                                             \
        : "r" (val), "m" (*mem));                                   \
    return (old);                                                   \
}

EMIT_FETCH_AND_OP_4(lock_test_and_set, "mov %2, %4")
EMIT_FETCH_AND_OP_4(fetch_and_add, "add %2, %0, %4")
EMIT_FETCH_AND_OP_4(fetch_and_and, "and %2, %0, %4")
EMIT_FETCH_AND_OP_4(fetch_and_or, "orr %2, %0, %4")
EMIT_FETCH_AND_OP_4(fetch_and_sub, "sub %2, %0, %4")
EMIT_FETCH_AND_OP_4(fetch_and_xor, "eor %2, %0, %4")

#elif defined(_KERNEL)

#ifdef SMP
#error "On SMP systems we should have proper atomic operations."
#endif

/*
 * On uniprocessor systems, we can perform the atomic operations by
 * disabling interrupts.
 */

#define EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)                     \
uintN_t                                                             \
__sync_val_compare_and_swap_##N(uintN_t *mem, uintN_t expected,     \
    uintN_t desired)                                                \
{                                                                   \
    uintN_t ret;                                                    \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        ret = *mem;                                                 \
        if (*mem == expected)                                       \
            *mem = desired;                                         \
    });                                                             \
    return (ret);                                                   \
}

#define EMIT_FETCH_AND_OP_N(N, uintN_t, name, op)                   \
uintN_t                                                             \
__sync_##name##_##N(uintN_t *mem, uintN_t val)                      \
{                                                                   \
    uintN_t ret;                                                    \
                                                                    \
    WITHOUT_INTERRUPTS({                                            \
        ret = *mem;                                                 \
        *mem op val;                                                \
    });                                                             \
    return (ret);                                                   \
}

#define EMIT_ALL_OPS_N(N, uintN_t)                                  \
EMIT_VAL_COMPARE_AND_SWAP_N(N, uintN_t)                             \
EMIT_FETCH_AND_OP_N(N, uintN_t, lock_test_and_set, =)               \
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_add, +=)                  \
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_and, &=)                  \
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_or, |=)                   \
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_sub, -=)                  \
EMIT_FETCH_AND_OP_N(N, uintN_t, fetch_and_xor, ^=)

EMIT_ALL_OPS_N(1, uint8_t)
EMIT_ALL_OPS_N(2, uint16_t)
EMIT_ALL_OPS_N(4, uint32_t)
EMIT_ALL_OPS_N(8, uint64_t)

#else /* !__ARM_ARCH_6__ && !__ARM_ARCH_7__ && !_KERNEL */

/* XXX: Implement intrinsics for ARMv5 userspace. */

#endif

#endif /* __SYNC_ATOMICS */
diff --git a/sys/conf/files.arm b/sys/conf/files.arm
index ade34e6c13a..938386d70bf 100644
--- a/sys/conf/files.arm
+++ b/sys/conf/files.arm
@@ -40,6 +40,8 @@ arm/arm/sc_machdep.c optional sc
 arm/arm/setcpsr.S		standard
 arm/arm/setstack.s		standard
 arm/arm/stack_machdep.c		optional ddb | stack
+arm/arm/stdatomic.c		standard \
+	compile-with "${NORMAL_C:N-Wmissing-prototypes}"
 arm/arm/support.S		standard
 arm/arm/swtch.S			standard
 arm/arm/sys_machdep.c		standard
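
To see how this file gets exercised, consider ordinary C11 code. When
the compiler cannot, or is configured not to, expand an atomic
operation inline, it emits a library call that these fallbacks
satisfy. A sketch (the counter is hypothetical; which call is emitted
depends on the compiler and target flags):

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint32_t nrequests;

    void
    count_request(void)
    {

        /*
         * Compilers using the new C11 lowering emit a call to
         * __atomic_fetch_add_4(); older GCC emits
         * __sync_fetch_and_add_4(). Both are provided above.
         */
        atomic_fetch_add_explicit(&nrequests, 1, memory_order_relaxed);
    }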