Make IV/buf in prov_cipher_ctx_st aligned

Make IV/buf aligned will drastically improve performance
as some architecture performs badly on misaligned memory
access.

Ref to
https://gist.github.com/ZenithalHourlyRate/7b5175734f87acb73d0bbc53391d7140#file-2-openssl-long-md
Ref to
openssl#18197

Signed-off-by: Hongren (Zenithal) Zheng <i@zenithal.me>

Reviewed-by: Paul Dale <pauli@openssl.org>
Reviewed-by: Tomas Mraz <tomas@openssl.org>
(Merged from https://github.com/openssl/openssl/pull/18267)
diff --git a/providers/implementations/include/prov/ciphercommon.h b/providers/implementations/include/prov/ciphercommon.h
index 2404960..f474128 100644
--- a/providers/implementations/include/prov/ciphercommon.h
+++ b/providers/implementations/include/prov/ciphercommon.h
@@ -46,6 +46,13 @@
 # define PROV_CIPHER_FLAG_INVERSE_CIPHER   0x0200
 
 struct prov_cipher_ctx_st {
+    /* place buffer at the beginning for memory alignment */
+    /* The original value of the iv */
+    unsigned char oiv[GENERIC_BLOCK_SIZE];
+    /* Buffer of partial blocks processed via update calls */
+    unsigned char buf[GENERIC_BLOCK_SIZE];
+    unsigned char iv[GENERIC_BLOCK_SIZE];
+
     block128_f block;
     union {
         cbc128_f cbc;
@@ -86,12 +93,6 @@
      * manage partial blocks themselves.
      */
     unsigned int num;
-
-    /* The original value of the iv */
-    unsigned char oiv[GENERIC_BLOCK_SIZE];
-    /* Buffer of partial blocks processed via update calls */
-    unsigned char buf[GENERIC_BLOCK_SIZE];
-    unsigned char iv[GENERIC_BLOCK_SIZE];
     const PROV_CIPHER_HW *hw; /* hardware specific functions */
     const void *ks; /* Pointer to algorithm specific key data */
     OSSL_LIB_CTX *libctx;