/*	$NetBSD: octeon_cop2var.h,v 1.2 2020/06/18 13:52:08 simonb Exp $	*/

/*
 * TODO:
 *
 * - Utilize prefetch.
 *
 * - Implement loop in CBC operations.  Take an argument of the number of
 *   blocks.  Better if prefetch is used too.
 *
 * - In AES and DES buffer block loop, merge encrypt / decrypt.  Take a
 *   direction argument (int dir, 0 => encrypt, 1 => decrypt) then branch.
 */

#ifndef _OCTEON_COP2VAR_H_
#define _OCTEON_COP2VAR_H_

/*
 * Unaligned 64/32-bit load/store helpers used by the asm bodies below.
 * With __OCTEON_USEUN__ the Octeon-specific unaligned instructions
 * (uld/usd/ulw/usw) are emitted directly; otherwise each access is
 * synthesized from the classic MIPS left/right instruction pairs
 * (ldl/ldr, sdl/sdr, lwl/lwr, swl/swr).  The "l" half uses byte offset
 * +0 and the "r" half +7 (doubleword) or +3 (word) -- big-endian byte
 * order assumed.
 */
#ifdef __OCTEON_USEUN__
#define CNASM_ULD(r, o, b)		"uld	%["#r"], "#o"(%["#b"])	\n\t"
#define CNASM_USD(r, o, b)		"usd	%["#r"], "#o"(%["#b"])	\n\t"
#define CNASM_ULW(r, o, b)		"ulw	%["#r"], "#o"(%["#b"])	\n\t"
#define CNASM_USW(r, o, b)		"usw	%["#r"], "#o"(%["#b"])	\n\t"
#else
#define __CNASM_ULH(i, r, o, x, b)	i"	%["#r"], ("#o" + "#x")(%["#b"]) \n\t"
#define __CNASM_ULS(p, r, o, l, h, b)	__CNASM_ULH(#p"l", r, o, l, b) \
					__CNASM_ULH(#p"r", r, o, h, b)
#define CNASM_ULD(r, o, b)		__CNASM_ULS(ld, r, o, 0, 7, b)
#define CNASM_USD(r, o, b)		__CNASM_ULS(sd, r, o, 0, 7, b)
#define CNASM_ULW(r, o, b)		__CNASM_ULS(lw, r, o, 0, 3, b)
#define CNASM_USW(r, o, b)		__CNASM_ULS(sw, r, o, 0, 3, b)
#endif

#define CNASM_ALD(r, o, b)		"ld	%["#r"], "#o"(%["#b"])	\n\t"
#define CNASM_ASD(r, o, b)		"sd	%["#r"], "#o"(%["#b"])	\n\t"

#undef	__s
#define	__s(s)	#s	/* stringify */
#define CNASM_MT2(r, n, o)		"dmtc2	%["#r"], ("__s(n)" + "#o") \n\t"
#define CNASM_MF2(r, n, o)		"dmfc2	%["#r"], ("__s(n)" + "#o") \n\t"
#define CNASM_MT2ZERO(n, o)		"dmtc2	$0, ("__s(n)" + "#o")	\n\t"
#define CNASM_MT2ZERO(n, o)		"dmtc2	$0, ("__s(n)" + "#o")	\n\t"

#define CNASM_START()			".set	push			\n\t" \
					".set	mips64			\n\t" \
					".set	arch=octeon		\n\t" \
					".set	noreorder		\n\t"
#define CNASM_END()			".set	pop"

#define	__aligned_t	uint64_t
#define	__unaligned_t	uint8_t

/* -------------------------------------------------------------------------- */

/* AES */

/*
 * Generate octeon_cop2_aes_set_key_{aligned,unaligned}_vaddr64(key, klen):
 * load an AES key of klen bits (128/192/256) from virtual address "key"
 * into the cop2 AES key registers.  klen >> 6 gives the number of 64-bit
 * key words (2/3/4); loading stops after that many words.  Per the
 * comment below, each dmtc2 to CVM_MT_AES_KEY also updates the cop2
 * AESKEYLEN state.  The branches use the MT2 as their delay slot.
 */
#define	__octeon_cop2_aes_set_key_au_vaddr64(au, AU)		\
static inline void						\
octeon_cop2_aes_set_key_##au##_vaddr64(uint64_t key, uint32_t klen) \
{								\
	uint64_t tmp0, tmp1, tmp2, tmp3;			\
								\
	asm volatile (						\
		CNASM_START()					\
		/* %[cnt] is either 4 (256), 3 (192), or 2 (128) */ \
		/* Each operation set AESKEYLEN of cop2 also */ \
		/* >= 64 */					\
		CNASM_##AU##LD(tmp0, 0, key)			\
	"	subu	%[cnt], %[cnt], 1		\n"	\
	"	beqz	%[cnt], 1f			\n"	\
		 CNASM_MT2(tmp0, CVM_MT_AES_KEY, 0)	/* delay slot */ \
		/* >= 128 */					\
		CNASM_##AU##LD(tmp1, 8, key)			\
	"	subu	%[cnt], %[cnt], 1		\n"	\
	"	beqz	%[cnt], 1f			\n"	\
		 CNASM_MT2(tmp1, CVM_MT_AES_KEY, 1)	/* delay slot */ \
		/* >= 192 */					\
		CNASM_##AU##LD(tmp2, 16, key)			\
	"	subu	%[cnt], %[cnt], 1		\n"	\
	"	beqz	%[cnt], 1f			\n"	\
		 CNASM_MT2(tmp2, CVM_MT_AES_KEY, 2)	/* delay slot */ \
		/* >= 256 */					\
		CNASM_##AU##LD(tmp3, 24, key)			\
		CNASM_MT2(tmp3, CVM_MT_AES_KEY, 3)		\
		/* done */					\
	"1:						\n"	\
		CNASM_END()					\
		: [tmp0] "=&r" (tmp0),				\
		  [tmp1] "=&r" (tmp1),				\
		  [tmp2] "=&r" (tmp2),				\
		  [tmp3] "=&r" (tmp3)				\
		: [key] "d" (key),				\
		  [cnt] "d" (klen >> 6));			\
}

/*
 * Typed wrapper: octeon_cop2_aes_set_key_{aligned,unaligned}(ptr, klen)
 * forwards to the _vaddr64 variant with the pointer cast to a 64-bit
 * virtual address.  Aligned variants take uint64_t *, unaligned uint8_t *
 * (see __aligned_t/__unaligned_t above).
 */
#define	__octeon_cop2_aes_set_key_au_ptr(au, AU, ptr)		\
static inline void						\
octeon_cop2_aes_set_key_##au(ptr key, uint32_t klen)		\
{								\
	octeon_cop2_aes_set_key_##au##_vaddr64((intptr_t)key, klen); \
}

#define	__octeon_cop2_aes_set_key_au(au, AU)			\
__octeon_cop2_aes_set_key_au_vaddr64(au, AU)			\
__octeon_cop2_aes_set_key_au_ptr(au, AU, __##au##_t *)

#define	__octeon_cop2_aes_set_key				\
__octeon_cop2_aes_set_key_au(aligned, A)				\
__octeon_cop2_aes_set_key_au(unaligned, U)

/* Instantiate the aligned and unaligned AES set-key functions. */
__octeon_cop2_aes_set_key

/*
 * Load a 16-byte AES IV from (possibly unaligned) virtual address "iv"
 * into the cop2 AES IV registers.
 */
static inline void
octeon_cop2_aes_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1;

	asm volatile (
		CNASM_START()
		/* Store the IV to cop2 */
		CNASM_ULD(tmp0, 0, iv)
		CNASM_ULD(tmp1, 8, iv)
		CNASM_MT2(tmp0, CVM_MT_AES_IV, 0)
		CNASM_MT2(tmp1, CVM_MT_AES_IV, 1)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_aes_set_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_aes_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Generate octeon_cop2_aes_{en,de}crypt[_cbc]_16_{aligned,unaligned}():
 * run one 16-byte AES block from source "s" through the cop2 AES unit
 * (writing both MT registers of CVM_MT_AES_<ED> triggers the operation)
 * and store the result read from CVM_MF_AES_RESINP at destination "d".
 * The CBC variants rely on the IV having been set beforehand via
 * octeon_cop2_aes_set_iv_unaligned().
 */
#define	__octeon_cop2_aes_ed_16_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_aes_##ed##_16_##au##_vaddr64(uint64_t d, uint64_t s) \
{								\
	uint64_t tmp0, tmp1;					\
								\
	asm volatile (						\
		CNASM_START()					\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_##AU##LD(tmp1, 8, s)			\
		CNASM_MT2(tmp0, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp1, CVM_MT_AES_##ED##1, 0)		\
		CNASM_MF2(tmp0, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp1, CVM_MF_AES_RESINP, 1)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
		CNASM_##AU##SD(tmp1, 8, d)			\
		CNASM_END()					\
		: [tmp0] "=&r" (tmp0),				\
		  [tmp1] "=&r" (tmp1)				\
		: [d] "d" (d),					\
		  [s] "d" (s));					\
}

/* Pointer wrapper generator for the 16-byte AES operations. */
#define	__octeon_cop2_aes_ed_16_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_aes_##ed##_16_##au(ptr d, ptr s)			\
{								\
	octeon_cop2_aes_##ed##_16_##au##_vaddr64((intptr_t)d, (intptr_t)s); \
}

#define	__octeon_cop2_aes_ed_16_au(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_16_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_16_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_aes_ed_16(ed, ED)				\
__octeon_cop2_aes_ed_16_au(ed, ED, aligned, A)			\
__octeon_cop2_aes_ed_16_au(ed, ED, unaligned, U)

#define	__octeon_cop2_aes_16					\
__octeon_cop2_aes_ed_16(encrypt, ENC)				\
__octeon_cop2_aes_ed_16(decrypt, DEC)				\
__octeon_cop2_aes_ed_16(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_aes_ed_16(cbc_decrypt, DEC_CBC)

/* Instantiate ECB and CBC, encrypt and decrypt, aligned and unaligned. */
__octeon_cop2_aes_16

/*
 * Generate octeon_cop2_aes_*_block_*(d, s, n): process n consecutive
 * 16-byte AES blocks from "s" to "d".  The loop is do-while shaped
 * (body runs before the termination test against x = d + 16*n), so the
 * caller must pass n >= 1.  The final daddu of %[s] sits in the branch
 * delay slot.  Only the CBC variants are instantiated below.
 */
#define	__octeon_cop2_aes_ed_block_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_aes_##ed##_block_##au##_vaddr64(uint64_t d, uint64_t s, int n) \
{								\
	uint64_t tmp0, tmp1;					\
	uint64_t x = d + 16 * n;				\
								\
	asm volatile (						\
		CNASM_START()					\
	"1:						\n"	\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_##AU##LD(tmp1, 8, s)			\
		CNASM_MT2(tmp0, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp1, CVM_MT_AES_##ED##1, 0)		\
		CNASM_MF2(tmp0, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp1, CVM_MF_AES_RESINP, 1)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
		CNASM_##AU##SD(tmp1, 8, d)			\
	"	daddu	%[d], %[d], 16			\n"	\
	"	bne	%[d], %[x], 1b			\n"	\
	"	 daddu	%[s], %[s], 16			\n" /* delay slot */ \
		CNASM_END()					\
		: [d] "=d" (d),					\
		  [s] "=d" (s),					\
		  [tmp0] "=&r" (tmp0),				\
		  [tmp1] "=&r" (tmp1)				\
		: "0" (d),					\
		  "1" (s),					\
		  [x] "d" (x));					\
}

/* Pointer wrapper generator for the block-loop AES operations. */
#define	__octeon_cop2_aes_ed_block_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_aes_##ed##_block_##au(ptr d, ptr s, int n)		\
{								\
	octeon_cop2_aes_##ed##_block_##au##_vaddr64((intptr_t)d, (intptr_t)s, n); \
}

#define	__octeon_cop2_aes_ed_block_au(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_block_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_block_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_aes_ed_block(ed, ED)			\
__octeon_cop2_aes_ed_block_au(ed, ED, aligned, A)		\
__octeon_cop2_aes_ed_block_au(ed, ED, unaligned, U)

#define	__octeon_cop2_aes_block					\
/* __octeon_cop2_aes_ed_block(encrypt, ENC) */			\
/* __octeon_cop2_aes_ed_block(decrypt, DEC)	*/		\
__octeon_cop2_aes_ed_block(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_aes_ed_block(cbc_decrypt, DEC_CBC)

/* Instantiate the CBC block-loop functions. */
__octeon_cop2_aes_block

/*
 * Generate octeon_cop2_aes_*_64_*(d, s): process 64 bytes (four AES
 * blocks), unrolled.  Loads/stores are interleaved with the cop2
 * MT2/MF2 pairs so memory traffic overlaps the AES operation latency.
 * Only the CBC variants are instantiated below.
 */
#define	__octeon_cop2_aes_ed_64_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_aes_##ed##_64_##au##_vaddr64(uint64_t d, uint64_t s) \
{								\
	uint64_t tmp0, tmp1, tmp2, tmp3;			\
								\
	asm volatile (						\
		CNASM_START()					\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_##AU##LD(tmp1, 8, s)			\
		CNASM_MT2(tmp0, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp1, CVM_MT_AES_##ED##1, 0)		\
		CNASM_##AU##LD(tmp2, 16, s)			\
		CNASM_##AU##LD(tmp3, 24, s)			\
		CNASM_MF2(tmp0, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp1, CVM_MF_AES_RESINP, 1)		\
		CNASM_MT2(tmp2, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp3, CVM_MT_AES_##ED##1, 0)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
		CNASM_##AU##SD(tmp1, 8, d)			\
		CNASM_MF2(tmp2, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp3, CVM_MF_AES_RESINP, 1)		\
		CNASM_##AU##SD(tmp2, 16, d)			\
		CNASM_##AU##SD(tmp3, 24, d)			\
		CNASM_##AU##LD(tmp0, 32, s)			\
		CNASM_##AU##LD(tmp1, 40, s)			\
		CNASM_MT2(tmp0, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp1, CVM_MT_AES_##ED##1, 0)		\
		CNASM_##AU##LD(tmp2, 48, s)			\
		CNASM_##AU##LD(tmp3, 56, s)			\
		CNASM_MF2(tmp0, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp1, CVM_MF_AES_RESINP, 1)		\
		CNASM_MT2(tmp2, CVM_MT_AES_##ED##0, 0)		\
		CNASM_MT2(tmp3, CVM_MT_AES_##ED##1, 0)		\
		CNASM_##AU##SD(tmp0, 32, d)			\
		CNASM_##AU##SD(tmp1, 40, d)			\
		CNASM_MF2(tmp2, CVM_MF_AES_RESINP, 0)		\
		CNASM_MF2(tmp3, CVM_MF_AES_RESINP, 1)		\
		CNASM_##AU##SD(tmp2, 48, d)			\
		CNASM_##AU##SD(tmp3, 56, d)			\
		CNASM_END()					\
		: [tmp0] "=&r" (tmp0),				\
		  [tmp1] "=&r" (tmp1),				\
		  [tmp2] "=&r" (tmp2),				\
		  [tmp3] "=&r" (tmp3)				\
		: [d] "d" (d),					\
		  [s] "d" (s));					\
}

/* Pointer wrapper generator for the 64-byte AES operations. */
#define	__octeon_cop2_aes_ed_64_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_aes_##ed##_64_##au(ptr d, ptr s)			\
{								\
	octeon_cop2_aes_##ed##_64_##au##_vaddr64((intptr_t)d, (intptr_t)s); \
}

#define	__octeon_cop2_aes_ed_64_au(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_64_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_aes_ed_64_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_aes_ed_64(ed, ED)				\
__octeon_cop2_aes_ed_64_au(ed, ED, aligned, A)			\
__octeon_cop2_aes_ed_64_au(ed, ED, unaligned, U)

#define	__octeon_cop2_aes_64					\
/* __octeon_cop2_aes_ed_64(encrypt, ENC) */			\
/* __octeon_cop2_aes_ed_64(decrypt, DEC)	*/			\
__octeon_cop2_aes_ed_64(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_aes_ed_64(cbc_decrypt, DEC_CBC)

/* Instantiate the CBC 64-byte functions. */
__octeon_cop2_aes_64

/* -------------------------------------------------------------------------- */

/* DES */

/*
 * Load the three 8-byte 3DES keys from virtual addresses k1/k2/k3
 * (unaligned loads are used, despite the uint64_t * wrapper below)
 * into the cop2 3DES key registers.
 */
static inline void
octeon_cop2_des_set_key_unaligned_vaddr64(uint64_t k1, uint64_t k2, uint64_t k3)
{
	uint64_t tmp0, tmp1, tmp2;

	asm volatile (
		CNASM_START()
		/* Set key */
		CNASM_ULD(tmp0, 0, k1)
		CNASM_ULD(tmp1, 0, k2)
		CNASM_ULD(tmp2, 0, k3)
		CNASM_MT2(tmp0, CVM_MT_3DES_KEY, 0)
		CNASM_MT2(tmp1, CVM_MT_3DES_KEY, 1)
		CNASM_MT2(tmp2, CVM_MT_3DES_KEY, 2)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2)
		: [k1] "d" (k1),
		  [k2] "d" (k2),
		  [k3] "d" (k3));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_des_set_key_unaligned(uint64_t *k1, uint64_t *k2, uint64_t *k3)
{
	octeon_cop2_des_set_key_unaligned_vaddr64((intptr_t)k1, (intptr_t)k2, (intptr_t)k3);
}

/*
 * Load the 8-byte 3DES IV from (possibly unaligned) virtual address
 * "iv" into the cop2 3DES IV register.
 */
static inline void
octeon_cop2_des_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0;

	asm volatile (
		CNASM_START()
		/* Load IV to a register */
		CNASM_ULD(tmp0, 0, iv)
		/* Store the IV to cop2 */
		CNASM_MT2(tmp0, CVM_MT_3DES_IV, 0)
		CNASM_END()
		: [tmp0] "=&r" (tmp0)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_des_set_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_des_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Generate octeon_cop2_des_{en,de}crypt[_cbc]_8_{aligned,unaligned}():
 * run one 8-byte DES block from source address "s" through the cop2
 * 3DES unit and store the result at destination address "d".
 * BUG FIX: the result was previously stored back through "s",
 * clobbering the source and leaving "d" untouched; it must go to "d"
 * (cf. the AES 16-byte variant above and the DES block loop below).
 */
#define	__octeon_cop2_des_ed_8_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_des_##ed##_8_##au##_vaddr64(uint64_t d, uint64_t s)	\
{								\
	uint64_t tmp0;						\
								\
	asm volatile (						\
		CNASM_START()					\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_MT2(tmp0, CVM_MT_3DES_##ED, 0)		\
		CNASM_MF2(tmp0, CVM_MF_3DES_RESULT, 0)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
		CNASM_END()					\
		: [tmp0] "=&r" (tmp0)				\
		: [d] "d" (d),					\
		  [s] "d" (s));					\
}

/* Pointer wrapper generator for the 8-byte DES operations. */
#define	__octeon_cop2_des_ed_8_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_des_##ed##_8_##au(ptr d, ptr s)			\
{								\
	octeon_cop2_des_##ed##_8_##au##_vaddr64((intptr_t)d, (intptr_t)s); \
}

#define	__octeon_cop2_des_ed_8_au(ed, ED, au, AU)		\
__octeon_cop2_des_ed_8_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_des_ed_8_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_des_ed_8(ed, ED)				\
__octeon_cop2_des_ed_8_au(ed, ED, aligned, A)			\
__octeon_cop2_des_ed_8_au(ed, ED, unaligned, U)

#define	__octeon_cop2_des_8					\
__octeon_cop2_des_ed_8(encrypt, ENC)				\
__octeon_cop2_des_ed_8(decrypt, DEC)				\
__octeon_cop2_des_ed_8(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_des_ed_8(cbc_decrypt, DEC_CBC)

/* Instantiate ECB and CBC, encrypt and decrypt, aligned and unaligned. */
__octeon_cop2_des_8

/*
 * Generate octeon_cop2_des_*_block_*(d, s, n): process n consecutive
 * 8-byte DES blocks from "s" to "d".  Do-while shaped loop (terminates
 * when d reaches x = d + 8*n), so the caller must pass n >= 1; the
 * source increment sits in the branch delay slot.  Only the CBC
 * variants are instantiated below.
 */
#define	__octeon_cop2_des_ed_block_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_des_##ed##_block_##au##_vaddr64(uint64_t d, uint64_t s, int n) \
{								\
	uint64_t tmp0;						\
	uint64_t x = d + 8 * n;					\
								\
	asm volatile (						\
		CNASM_START()					\
	"1:						\n"	\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_MT2(tmp0, CVM_MT_3DES_##ED, 0)		\
		CNASM_MF2(tmp0, CVM_MF_3DES_RESULT, 0)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
	"	daddu	%[d], %[d], 8			\n"	\
	"	bne	%[d], %[x], 1b			\n"	\
	"	 daddu	%[s], %[s], 8			\n"	\
		CNASM_END()					\
		: [d] "=d" (d),					\
		  [s] "=d" (s),					\
		  [tmp0] "=&r" (tmp0)				\
		: "0" (d),					\
		  "1" (s),					\
		  [x] "d" (x));					\
}

/* Pointer wrapper generator for the block-loop DES operations. */
#define	__octeon_cop2_des_ed_block_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_des_##ed##_block_##au(ptr d, ptr s, int n)		\
{								\
	octeon_cop2_des_##ed##_block_##au##_vaddr64((intptr_t)d, (intptr_t)s, n); \
}

#define	__octeon_cop2_des_ed_block_au(ed, ED, au, AU)		\
__octeon_cop2_des_ed_block_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_des_ed_block_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_des_ed_block(ed, ED)			\
__octeon_cop2_des_ed_block_au(ed, ED, aligned, A)		\
__octeon_cop2_des_ed_block_au(ed, ED, unaligned, U)

#define	__octeon_cop2_des_block					\
/* __octeon_cop2_des_ed_block(encrypt, ENC) */			\
/* __octeon_cop2_des_ed_block(decrypt, DEC) */			\
__octeon_cop2_des_ed_block(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_des_ed_block(cbc_decrypt, DEC_CBC)

/* Instantiate the CBC block-loop functions. */
__octeon_cop2_des_block

/*
 * Generate octeon_cop2_des_*_64_*(d, s): process 64 bytes (eight DES
 * blocks), unrolled, with loads/stores interleaved between the cop2
 * MT2/MF2 pairs to hide operation latency.  Only the CBC variants are
 * instantiated below.
 */
#define	__octeon_cop2_des_ed_64_au_vaddr64(ed, ED, au, AU)	\
static inline void						\
octeon_cop2_des_##ed##_64_##au##_vaddr64(uint64_t d, uint64_t s) \
{								\
	uint64_t tmp0, tmp1, tmp2, tmp3;			\
								\
	asm volatile (						\
		CNASM_START()					\
		CNASM_##AU##LD(tmp0, 0, s)			\
		CNASM_##AU##LD(tmp1, 8, s)			\
		CNASM_MT2(tmp0, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##LD(tmp2, 16, s)			\
		CNASM_MF2(tmp0, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp1, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##LD(tmp3, 24, s)			\
		CNASM_MF2(tmp1, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp2, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##SD(tmp0, 0, d)			\
		CNASM_MF2(tmp2, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp3, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##SD(tmp1, 8, d)			\
		CNASM_MF2(tmp3, CVM_MF_3DES_RESULT, 0)		\
		CNASM_##AU##SD(tmp2, 16, d)			\
		CNASM_##AU##SD(tmp3, 24, d)			\
		CNASM_##AU##LD(tmp0, 32, s)			\
		CNASM_##AU##LD(tmp1, 40, s)			\
		CNASM_MT2(tmp0, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##LD(tmp2, 48, s)			\
		CNASM_MF2(tmp0, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp1, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##LD(tmp3, 56, s)			\
		CNASM_MF2(tmp1, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp2, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##SD(tmp0, 32, d)			\
		CNASM_MF2(tmp2, CVM_MF_3DES_RESULT, 0)		\
		CNASM_MT2(tmp3, CVM_MT_3DES_##ED, 0)		\
		CNASM_##AU##SD(tmp1, 40, d)			\
		CNASM_MF2(tmp3, CVM_MF_3DES_RESULT, 0)		\
		CNASM_##AU##SD(tmp2, 48, d)			\
		CNASM_##AU##SD(tmp3, 56, d)			\
		CNASM_END()					\
		: [tmp0] "=&r" (tmp0),				\
		  [tmp1] "=&r" (tmp1),				\
		  [tmp2] "=&r" (tmp2),				\
		  [tmp3] "=&r" (tmp3)				\
		: [d] "d" (d),					\
		  [s] "d" (s));					\
}

/* Pointer wrapper generator for the 64-byte DES operations. */
#define	__octeon_cop2_des_ed_64_au_ptr(ed, ED, au, AU, ptr)	\
static inline void						\
octeon_cop2_des_##ed##_64_##au(ptr d, ptr s)			\
{								\
	octeon_cop2_des_##ed##_64_##au##_vaddr64((intptr_t)d, (intptr_t)s); \
}

#define	__octeon_cop2_des_ed_64_au(ed, ED, au, AU)		\
__octeon_cop2_des_ed_64_au_vaddr64(ed, ED, au, AU)		\
__octeon_cop2_des_ed_64_au_ptr(ed, ED, au, AU, __##au##_t *)

#define	__octeon_cop2_des_ed_64(ed, ED)				\
__octeon_cop2_des_ed_64_au(ed, ED, aligned, A)			\
__octeon_cop2_des_ed_64_au(ed, ED, unaligned, U)

#define	__octeon_cop2_des_64					\
/* __octeon_cop2_des_ed_64(encrypt, ENC) */			\
/* __octeon_cop2_des_ed_64(decrypt, DEC) */			\
__octeon_cop2_des_ed_64(cbc_encrypt, ENC_CBC)			\
__octeon_cop2_des_ed_64(cbc_decrypt, DEC_CBC)

/* Instantiate the CBC 64-byte functions. */
__octeon_cop2_des_64

/* -------------------------------------------------------------------------- */

/* MD5 */

/*
 * Load the 16-byte MD5 state from virtual address "iv" into the first
 * two cop2 hash IV registers; the upper two are zeroed (the hash unit's
 * IV block is 32 bytes, shared with SHA-1/SHA-256).
 */
static inline void
octeon_cop2_md5_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1;

	asm volatile (
		CNASM_START()
		/* Load IV from context */
		CNASM_ULD(tmp0, 0, iv)
		CNASM_ULD(tmp1, 8, iv)
		CNASM_MT2(tmp0, CVM_MT_HSH_IV, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_IV, 1)
		CNASM_MT2ZERO(  CVM_MT_HSH_IV, 2)
		CNASM_MT2ZERO(  CVM_MT_HSH_IV, 3)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_md5_set_iv_unaligned(uint64_t *iv)
{
	octeon_cop2_md5_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Store the 16-byte MD5 state from the cop2 hash IV registers to
 * (possibly unaligned) virtual address "iv".
 */
static inline void
octeon_cop2_md5_get_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1;

	asm volatile (
		CNASM_START()
		/* Store IV to context */
		CNASM_MF2(tmp0, CVM_MF_HSH_IV, 0)
		CNASM_MF2(tmp1, CVM_MF_HSH_IV, 1)
		CNASM_USD(tmp0, 0, iv)
		CNASM_USD(tmp1, 8, iv)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_md5_get_iv_unaligned(uint64_t *iv)
{
	octeon_cop2_md5_get_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Feed one 64-byte message block at virtual address "src" into the cop2
 * hash data registers and run an MD5 round; the final dmtc2 both loads
 * the last data word and starts the transform.
 * BUG FIX: this previously wrote the last word to CVM_MT_HSH_STANDARD5,
 * which is not a cop2 hash register name; the MD5 start register is
 * CVM_MT_HSH_STARTMD5 (cf. STARTSHA / STARTSHA256 / STARTSHA512 in the
 * sibling update functions).
 */
static inline void
octeon_cop2_md5_update_unaligned_vaddr64(uint64_t src)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Update HASH */
		CNASM_ULD(tmp0, 0, src)
		CNASM_ULD(tmp1, 8, src)
		CNASM_ULD(tmp2, 16, src)
		CNASM_ULD(tmp3, 24, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_DAT, 3)
		CNASM_ULD(tmp0, 32, src)
		CNASM_ULD(tmp1, 40, src)
		CNASM_ULD(tmp2, 48, src)
		CNASM_ULD(tmp3, 56, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 4)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 5)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 6)
		CNASM_MT2(tmp3, CVM_MT_HSH_STARTMD5, 0)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [src] "d" (src));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_md5_update_unaligned(uint64_t *src)
{
	octeon_cop2_md5_update_unaligned_vaddr64((intptr_t)src);
}

/* -------------------------------------------------------------------------- */

/* SHA1 */

/*
 * Load the 20-byte SHA-1 state from virtual address "iv": two 64-bit
 * words plus one 32-bit word.  The 32-bit tail is shifted into the
 * upper half of its 64-bit cop2 IV register; the remaining IV register
 * is zeroed.
 */
static inline void
octeon_cop2_sha1_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2;

	asm volatile (
		CNASM_START()
		/* Load IV from context */
		CNASM_ULD(tmp0, 0, iv)
		CNASM_ULD(tmp1, 8, iv)
		CNASM_ULW(tmp2, 16, iv)
		/* Place the 32-bit word in the upper half of the register. */
		"dsll	%[tmp2], %[tmp2], 32	\n\t"
		CNASM_MT2(tmp0, CVM_MT_HSH_IV, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_IV, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_IV, 2)
		CNASM_MT2ZERO(  CVM_MT_HSH_IV, 3)
		CNASM_END()
	: [tmp0] "=&r" (tmp0),
	  [tmp1] "=&r" (tmp1),
	  [tmp2] "=&r" (tmp2)
	: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha1_set_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha1_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Store the 20-byte SHA-1 state from the cop2 hash IV registers to
 * virtual address "iv" (8 + 8 + 4 bytes; the 32-bit tail is shifted
 * back down from the upper register half before the word store).
 */
static inline void
octeon_cop2_sha1_get_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2;

	asm volatile (
		CNASM_START()
		/* Store IV to context */
		CNASM_MF2(tmp0, CVM_MF_HSH_IV, 0)
		CNASM_MF2(tmp1, CVM_MF_HSH_IV, 1)
		CNASM_MF2(tmp2, CVM_MF_HSH_IV, 2)
		CNASM_USD(tmp0, 0, iv)
		CNASM_USD(tmp1, 8, iv)
		"dsrl	%[tmp2], %[tmp2], 32	\n\t"
		CNASM_USW(tmp2, 16, iv)
		CNASM_END()
	: [tmp0] "=&r" (tmp0),
	  [tmp1] "=&r" (tmp1),
	  [tmp2] "=&r" (tmp2)
	: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha1_get_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha1_get_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Feed one 64-byte message block at "src" into the cop2 hash data
 * registers; the final dmtc2 (HSH_STARTSHA) loads the last word and
 * starts the SHA-1 transform.
 */
static inline void
octeon_cop2_sha1_update_unaligned_vaddr64(uint64_t src)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Update HASH */
		CNASM_ULD(tmp0, 0, src)
		CNASM_ULD(tmp1, 8, src)
		CNASM_ULD(tmp2, 16, src)
		CNASM_ULD(tmp3, 24, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_DAT, 3)
		CNASM_ULD(tmp0, 32, src)
		CNASM_ULD(tmp1, 40, src)
		CNASM_ULD(tmp2, 48, src)
		CNASM_ULD(tmp3, 56, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 4)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 5)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 6)
		CNASM_MT2(tmp3, CVM_MT_HSH_STARTSHA, 0)
		CNASM_END()
	: [tmp0] "=&r" (tmp0),
	  [tmp1] "=&r" (tmp1),
	  [tmp2] "=&r" (tmp2),
	  [tmp3] "=&r" (tmp3)
	: [src] "d" (src));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha1_update_unaligned(uint8_t *src)
{
	octeon_cop2_sha1_update_unaligned_vaddr64((intptr_t)src);
}

/* -------------------------------------------------------------------------- */

/* SHA256 */

/*
 * Load the 32-byte SHA-256 state from virtual address "iv" into the
 * four cop2 hash IV registers.
 */
static inline void
octeon_cop2_sha256_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Load IV from context */
		CNASM_ULD(tmp0, 0, iv)
		CNASM_ULD(tmp1, 8, iv)
		CNASM_ULD(tmp2, 16, iv)
		CNASM_ULD(tmp3, 24, iv)
		CNASM_MT2(tmp0, CVM_MT_HSH_IV, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_IV, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_IV, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_IV, 3)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha256_set_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha256_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Store the 32-byte SHA-256 state from the cop2 hash IV registers to
 * virtual address "iv".
 */
static inline void
octeon_cop2_sha256_get_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Store IV to context */
		CNASM_MF2(tmp0, CVM_MF_HSH_IV, 0)
		CNASM_MF2(tmp1, CVM_MF_HSH_IV, 1)
		CNASM_MF2(tmp2, CVM_MF_HSH_IV, 2)
		CNASM_MF2(tmp3, CVM_MF_HSH_IV, 3)
		CNASM_USD(tmp0, 0, iv)
		CNASM_USD(tmp1, 8, iv)
		CNASM_USD(tmp2, 16, iv)
		CNASM_USD(tmp3, 24, iv)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha256_get_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha256_get_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Feed one 64-byte message block at "src" into the cop2 hash data
 * registers; the final dmtc2 (HSH_STARTSHA256) loads the last word and
 * starts the SHA-256 transform.
 */
static inline void
octeon_cop2_sha256_update_unaligned_vaddr64(uint64_t src)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Update HASH */
		CNASM_ULD(tmp0, 0, src)
		CNASM_ULD(tmp1, 8, src)
		CNASM_ULD(tmp2, 16, src)
		CNASM_ULD(tmp3, 24, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_DAT, 3)
		CNASM_ULD(tmp0, 32, src)
		CNASM_ULD(tmp1, 40, src)
		CNASM_ULD(tmp2, 48, src)
		CNASM_ULD(tmp3, 56, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DAT, 4)
		CNASM_MT2(tmp1, CVM_MT_HSH_DAT, 5)
		CNASM_MT2(tmp2, CVM_MT_HSH_DAT, 6)
		CNASM_MT2(tmp3, CVM_MT_HSH_STARTSHA256, 0)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [src] "d" (src));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha256_update_unaligned(uint8_t *src)
{
	octeon_cop2_sha256_update_unaligned_vaddr64((intptr_t)src);
}

/* -------------------------------------------------------------------------- */

/* SHA512 */

/*
 * Load the 64-byte SHA-512 state from virtual address "iv" into the
 * eight wide cop2 hash IV registers (HSH_IVW).
 */
static inline void
octeon_cop2_sha512_set_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Load IV from context */
		CNASM_ULD(tmp0, 0, iv)
		CNASM_ULD(tmp1, 8, iv)
		CNASM_ULD(tmp2, 16, iv)
		CNASM_ULD(tmp3, 24, iv)
		CNASM_MT2(tmp0, CVM_MT_HSH_IVW, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_IVW, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_IVW, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_IVW, 3)
		CNASM_ULD(tmp0, 32, iv)
		CNASM_ULD(tmp1, 40, iv)
		CNASM_ULD(tmp2, 48, iv)
		CNASM_ULD(tmp3, 56, iv)
		CNASM_MT2(tmp0, CVM_MT_HSH_IVW, 4)
		CNASM_MT2(tmp1, CVM_MT_HSH_IVW, 5)
		CNASM_MT2(tmp2, CVM_MT_HSH_IVW, 6)
		CNASM_MT2(tmp3, CVM_MT_HSH_IVW, 7)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha512_set_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha512_set_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Store the 64-byte SHA-512 state from the eight wide cop2 hash IV
 * registers (HSH_IVW) to virtual address "iv".
 */
static inline void
octeon_cop2_sha512_get_iv_unaligned_vaddr64(uint64_t iv)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Store IV to context */
		CNASM_MF2(tmp0, CVM_MF_HSH_IVW, 0)
		CNASM_MF2(tmp1, CVM_MF_HSH_IVW, 1)
		CNASM_MF2(tmp2, CVM_MF_HSH_IVW, 2)
		CNASM_MF2(tmp3, CVM_MF_HSH_IVW, 3)
		CNASM_USD(tmp0, 0, iv)
		CNASM_USD(tmp1, 8, iv)
		CNASM_USD(tmp2, 16, iv)
		CNASM_USD(tmp3, 24, iv)
		CNASM_MF2(tmp0, CVM_MF_HSH_IVW, 4)
		CNASM_MF2(tmp1, CVM_MF_HSH_IVW, 5)
		CNASM_MF2(tmp2, CVM_MF_HSH_IVW, 6)
		CNASM_MF2(tmp3, CVM_MF_HSH_IVW, 7)
		CNASM_USD(tmp0, 32, iv)
		CNASM_USD(tmp1, 40, iv)
		CNASM_USD(tmp2, 48, iv)
		CNASM_USD(tmp3, 56, iv)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [iv] "d" (iv));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha512_get_iv_unaligned(uint8_t *iv)
{
	octeon_cop2_sha512_get_iv_unaligned_vaddr64((intptr_t)iv);
}

/*
 * Feed one 128-byte message block at "src" into the wide cop2 hash data
 * registers (HSH_DATW); the final dmtc2 (HSH_STARTSHA512) loads the
 * last word and starts the SHA-512 transform.
 */
static inline void
octeon_cop2_sha512_update_unaligned_vaddr64(uint64_t src)
{
	uint64_t tmp0, tmp1, tmp2, tmp3;

	asm volatile (
		CNASM_START()
		/* Update HASH */
		CNASM_ULD(tmp0, 0, src)
		CNASM_ULD(tmp1, 8, src)
		CNASM_ULD(tmp2, 16, src)
		CNASM_ULD(tmp3, 24, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DATW, 0)
		CNASM_MT2(tmp1, CVM_MT_HSH_DATW, 1)
		CNASM_MT2(tmp2, CVM_MT_HSH_DATW, 2)
		CNASM_MT2(tmp3, CVM_MT_HSH_DATW, 3)
		CNASM_ULD(tmp0, 32, src)
		CNASM_ULD(tmp1, 40, src)
		CNASM_ULD(tmp2, 48, src)
		CNASM_ULD(tmp3, 56, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DATW, 4)
		CNASM_MT2(tmp1, CVM_MT_HSH_DATW, 5)
		CNASM_MT2(tmp2, CVM_MT_HSH_DATW, 6)
		CNASM_MT2(tmp3, CVM_MT_HSH_DATW, 7)
		CNASM_ULD(tmp0, 64, src)
		CNASM_ULD(tmp1, 72, src)
		CNASM_ULD(tmp2, 80, src)
		CNASM_ULD(tmp3, 88, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DATW, 8)
		CNASM_MT2(tmp1, CVM_MT_HSH_DATW, 9)
		CNASM_MT2(tmp2, CVM_MT_HSH_DATW, 10)
		CNASM_MT2(tmp3, CVM_MT_HSH_DATW, 11)
		CNASM_ULD(tmp0, 96, src)
		CNASM_ULD(tmp1, 104, src)
		CNASM_ULD(tmp2, 112, src)
		CNASM_ULD(tmp3, 120, src)
		CNASM_MT2(tmp0, CVM_MT_HSH_DATW, 12)
		CNASM_MT2(tmp1, CVM_MT_HSH_DATW, 13)
		CNASM_MT2(tmp2, CVM_MT_HSH_DATW, 14)
		CNASM_MT2(tmp3, CVM_MT_HSH_STARTSHA512, 0)
		CNASM_END()
		: [tmp0] "=&r" (tmp0),
		  [tmp1] "=&r" (tmp1),
		  [tmp2] "=&r" (tmp2),
		  [tmp3] "=&r" (tmp3)
		: [src] "d" (src));
}

/* Pointer wrapper for the above. */
static inline void
octeon_cop2_sha512_update_unaligned(uint8_t *src)
{
	octeon_cop2_sha512_update_unaligned_vaddr64((intptr_t)src);
}

/* -------------------------------------------------------------------------- */

/* CRC */

/* XXX */

#ifdef notyet
/*
 * XXX CRC support is an untested stub, kept under "notyet".  The
 * original draft did not even parse: octeon_cop2_crc_polynomial had a
 * K&R untyped parameter and a missing semicolon, and
 * octeon_cop2_crc_reflect had a placeholder "XXX" parameter, undeclared
 * locals, and a missing closing brace.  Made syntactically valid here;
 * the algorithm itself still needs review before enabling (e.g. the
 * 16-byte loop falls through to a 2-byte loop, so 2..15-byte remainders
 * look mishandled).
 */

/* Load the CRC polynomial into cop2 (register 0x4200). */
static inline void
octeon_cop2_crc_polynomial(uint64_t val)
{
	__asm __volatile (
		CNASM_START()
	"	dmtc2	%[val], 0x4200			\n"
		CNASM_END()
		:
		: [val] "d" (val));
}

#define CVMX_MT_CRC_IV(val) \
            __asm __volatile (__PUSH "dmtc2 %0,0x0201" __POP :: "d"(val))
#define CVMX_MT_CRC_BYTE_REFLECT(val) \
            __asm __volatile (__PUSH "dmtc2 %0,0x0214" __POP :: "d"(val))
#define CVMX_MT_CRC_HALF_REFLECT(val) \
            __asm __volatile (__PUSH "dmtc2 %0,0x0215" __POP :: "d"(val))
#define CVMX_MT_CRC_DWORD_REFLECT(val) \
            __asm __volatile (__PUSH "dmtc2 %0,0x1217" __POP :: "d"(val))
#define CVMX_MF_CRC_IV_REFLECT(val) \
            __asm __volatile (__PUSH "dmfc2 %0,0x0203" __POP : "=d"(val))

/*
 * Reflect "len" bytes at virtual address "buf" into the cop2 CRC unit:
 * 16 bytes, then 2 bytes, then 1 byte at a time.  XXX unreviewed stub.
 */
static inline void
octeon_cop2_crc_reflect(uint64_t buf, uint64_t len)
{
	uint64_t val, tmp;

	__asm __volatile (
		CNASM_START()
	"	and	%[val], %[len], 15		\n"
	"	beq	%[val], %[len], 2f		\n"
	"	 subu	%[tmp], %[len], %[val]		\n"
	"	move	%[len], %[val]			\n"
	"	addu	%[tmp], %[buf]			\n"

	"	.align	3				\n"
	"1:						\n"
		CNASM_ULD(val, 0, buf)
	"	addu	%[buf], 16			\n"
		CNASM_MT2(val, CVM_MT_CRC_DWORD_REFLECT, 0)
		CNASM_ULD(val, -8, buf)
	"	bne	%[buf], %[tmp], 1b		\n"
		CNASM_MT2(val, CVM_MT_CRC_DWORD_REFLECT, 0)

	"	.align	3				\n"
	"2:	and	%[val], %[len], 1		\n"
	"	beq	%[val], %[len], 4f		\n"
	"	 subu	%[tmp], %[len], %[val]		\n"
	"	move	%[len], %[val]			\n"
	"	addu	%[tmp], %[buf]			\n"

	"	.align	3				\n"
	"3:	addu	%[buf], 2			\n"
	"	lhu	%[val], -2(%[buf])		\n"
	"	bne	%[buf], %[tmp], 3b		\n"
		CNASM_MT2(val, CVM_MT_CRC_HALF_REFLECT, 0)

	"	.align	3				\n"
	"4:	beqz	%[len], 5f			\n"
	"	 nop					\n"
	"	lbu	%[val], 0(%[buf])		\n"
		CNASM_MT2(val, CVM_MT_CRC_BYTE_REFLECT, 0)

	"	.align	3				\n"
	"5:						\n"
		CNASM_END()
		: [len] "=d" (len),
		  [buf] "=d" (buf),
		  [val] "=d" (val),
		  [tmp] "=d" (tmp)
		: "0" (len),
		  "1" (buf)
	);
}
#endif

/* -------------------------------------------------------------------------- */

/* GFM */

/* XXX */

#endif	/* _OCTEON_COP2VAR_H_ */
