diff --git a/powerpc64/p8/ghash-update.asm b/powerpc64/p8/ghash-update.asm
index 515705b7ad95b4b17ad12e179396d6ea73f2c7ef..31bddead1218a9a4e8e2c1e35476994fab7aeee7 100644
--- a/powerpc64/p8/ghash-update.asm
+++ b/powerpc64/p8/ghash-update.asm
@@ -92,6 +92,10 @@ IF_LE(`
 IF_LE(`
     vperm          R,R,R,LE_MASK
 ')
+    C Used as offsets for load/store, throughout this function
+    li             r8,1*16
+    li             r9,2*16
+    li             r10,3*16
 
     C --- process 4 blocks '128-bit each' per one loop ---
 
@@ -101,31 +105,21 @@ IF_LE(`
     mtctr          r7                            C assign counter register to loop count
 
     C store non-volatile vector registers
-    addi           r8,SP,-32
-    stvx           v20,0,r8
-    addi           r8,r8,16
-    stvx           v21,0,r8
+    addi           r7,SP,-32
+    stvx           v20,0,r7
+    stvx           v21,r8,r7
 
     C load table elements
-    li             r8,1*16
-    li             r9,2*16
-    li             r10,3*16
     lxvd2x         VSR(H1M),0,CTX
     lxvd2x         VSR(H1L),r8,CTX
     lxvd2x         VSR(H2M),r9,CTX
     lxvd2x         VSR(H2L),r10,CTX
-    li             r7,4*16
-    li             r8,5*16
-    li             r9,6*16
-    li             r10,7*16
-    lxvd2x         VSR(H3M),r7,CTX
-    lxvd2x         VSR(H3L),r8,CTX
-    lxvd2x         VSR(H4M),r9,CTX
-    lxvd2x         VSR(H4L),r10,CTX
-
-    li             r8,0x10
-    li             r9,0x20
-    li             r10,0x30
+    addi           r7,CTX,64
+    lxvd2x         VSR(H3M),0,r7
+    lxvd2x         VSR(H3L),r8,r7
+    lxvd2x         VSR(H4M),r9,r7
+    lxvd2x         VSR(H4L),r10,r7
+
 .align 5
 L4x_loop:
     C input loading
@@ -168,10 +162,9 @@ IF_LE(`
     bdnz           L4x_loop
 
     C restore non-volatile vector registers
-    addi           r8,SP,-32
-    lvx            v20,0,r8
-    addi           r8,r8,16
-    lvx            v21,0,r8
+    addi           r7,SP,-32
+    lvx            v20,0,r7
+    lvx            v21,r8,r7                     C r8 is still 1*16 here, mirrors the stvx
 
     clrldi         BLOCKS,BLOCKS,62              C 'set the high-order 62 bits to zeros'
 L2x:
@@ -181,18 +174,14 @@ L2x:
     beq            L1x
 
     C load table elements
-    li             r8,1*16
-    li             r9,2*16
-    li             r10,3*16
     lxvd2x         VSR(H1M),0,CTX
     lxvd2x         VSR(H1L),r8,CTX
     lxvd2x         VSR(H2M),r9,CTX
     lxvd2x         VSR(H2L),r10,CTX
 
     C input loading
-    li             r10,0x10
     lxvd2x         VSR(C0),0,DATA                C load C0
-    lxvd2x         VSR(C1),r10,DATA              C load C1
+    lxvd2x         VSR(C1),r8,DATA               C load C1
 
 IF_LE(`
     vperm          C0,C0,C0,LE_MASK
@@ -223,7 +212,6 @@ L1x:
     beq            Ldone
 
     C load table elements
-    li             r8,1*16
     lxvd2x         VSR(H1M),0,CTX
     lxvd2x         VSR(H1L),r8,CTX
 