diff --git a/common/arm/cpu-a.S b/common/arm/cpu-a.S
index a459553..bc91505 100644
--- a/common/arm/cpu-a.S
+++ b/common/arm/cpu-a.S
@@ -26,7 +26,7 @@
 #include "asm.S"
 
 .fpu neon
-.align
+// .align
 
 // done in gas because .fpu neon overrides the refusal to assemble
 // instructions the selected -march/-mcpu doesn't support
@@ -95,7 +95,7 @@ average_loop:
     sub         r2, r2, r1
     cmpgt       r2, #30 << 3    // assume context switch if it took over 30 cycles
     addle       r3, r3, r2
-    subles      ip, ip, #1
+    suble       ip, ip, #1
     bgt         average_loop
 
     // disable counters if we enabled them
diff --git a/common/arm/mc-a.S b/common/arm/mc-a.S
index 507bbba..d5554be 100644
--- a/common/arm/mc-a.S
+++ b/common/arm/mc-a.S
@@ -166,7 +166,7 @@ function x264_pixel_avg_\w\()x\h\()_neon
     ldr         ip, [sp, #8]
     push        {r4-r6,lr}
     cmp         ip, #32
-    ldrd        r4, [sp, #16]
+    ldrd        r4, r5, [sp, #16]
     mov         lr, #\h
     beq         x264_pixel_avg_w\w\()_neon
     rsbs        r6,  ip,  #64
@@ -446,7 +446,7 @@ avg2_w20_loop:
 .ifc \type, full
     ldr         lr,  [r4, #32]      // denom
 .endif
-    ldrd        r4,  [r4, #32+4]    // scale, offset
+    ldrd        r4,  r5,  [r4, #32+4]    // scale, offset
     vdup.16     q0,  r4
     vdup.16     q1,  r5
 .ifc \type, full
@@ -815,7 +815,7 @@ copy_w16_aligned_loop:
 //                           int dx, int dy, int i_width, int i_height );
 function x264_mc_chroma_neon
     push            {r4-r6, lr}
-    ldrd            r4,  [sp, #16]
+    ldrd            r4,  r5,  [sp, #16]
     ldr             r6,  [sp, #24]
 
     asr             lr,  r5,  #3
@@ -1271,8 +1271,8 @@ filter_h_loop:
 function x264_frame_init_lowres_core_neon
     push            {r4-r10,lr}
     vpush           {d8-d15}
-    ldrd            r4,  [sp, #96]
-    ldrd            r6,  [sp, #104]
+    ldrd            r4,  r5,  [sp, #96]
+    ldrd            r6,  r7,  [sp, #104]
     ldr             lr,  [sp, #112]
     sub             r10, r6,  r7            // dst_stride - width
     and             r10, r10, #~15
diff --git a/common/arm/pixel-a.S b/common/arm/pixel-a.S
index 8bce3b6..0784ae6 100644
--- a/common/arm/pixel-a.S
+++ b/common/arm/pixel-a.S
@@ -328,9 +328,9 @@ SAD_FUNC_DUAL  16, 16
 function x264_pixel_sad_x\x\()_\w\()x\h\()_neon
     push        {r6-r7,lr}
 .if \x == 3
-    ldrd        r6,  [sp, #12]
+    ldrd        r6,  r7, [sp, #12]
 .else
-    ldrd        r6,  [sp, #16]
+    ldrd        r6,  r7, [sp, #16]
     ldr         r12, [sp, #12]
 .endif
     mov         lr,  #FENC_STRIDE
@@ -596,7 +596,7 @@ function x264_pixel_var2_8x8_neon
     vadd.s32        d1,  d2,  d3
     vpadd.s32       d0,  d0,  d1
 
-    vmov.32         r0,  r1,  d0
+    vmov            r0,  r1,  d0
     vst1.32         {d0[1]}, [ip,:32]
     mul             r0,  r0,  r0
     sub             r0,  r1,  r0,  lsr #6
diff --git a/common/arm/predict-a.S b/common/arm/predict-a.S
index af65bd7..8cdaf50 100644
--- a/common/arm/predict-a.S
+++ b/common/arm/predict-a.S
@@ -181,9 +181,9 @@ function x264_predict_4x4_ddl_neon
 
 function x264_predict_8x8_dc_neon
     mov     ip, #0
-    ldrd    r2, [r1, #8]
+    ldrd    r2, r3, [r1, #8]
     push    {r4-r5,lr}
-    ldrd    r4, [r1, #16]
+    ldrd    r4, r5, [r1, #16]
     lsl     r3, r3, #8
     ldrb    lr, [r1, #7]
     usad8   r2, r2, ip
diff --git a/common/arm/quant-a.S b/common/arm/quant-a.S
index e851562..c159f9e 100644
--- a/common/arm/quant-a.S
+++ b/common/arm/quant-a.S
@@ -271,7 +271,7 @@ dequant_4x4_dc_rshift:
 
 // int coeff_last( int16_t *l )
 function x264_coeff_last4_arm
-    ldrd        r2,  [r0]
+    ldrd        r2,  r3, [r0]
     subs        r0,  r3,  #0
     movne       r0,  #2
     movne       r2,  r3
@@ -300,7 +300,7 @@ function x264_coeff_last\size\()_neon
 
     subs        r1,  ip,  r1,  lsr #2
     addge       r0,  r1,  #\size - 8
-    sublts      r0,  r3,  r0,  lsr #2
+    sublt       r0,  r3,  r0,  lsr #2
     movlt       r0,  #0
     bx          lr
 .endfunc
@@ -349,7 +349,7 @@ function x264_coeff_last64_neon
 
     subs        r1,  ip,  r1
     addge       r0,  r1,  #32
-    sublts      r0,  ip,  r0
+    sublt       r0,  ip,  r0
     movlt       r0,  #0
     bx          lr
 .endfunc