2022-08-22 22:21:23 +02:00

1088 lines
47 KiB
ArmAsm

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///*****************************************************************************/
///* */
///* File Name : ih264_deblk_luma_av8.s */
///* */
///* Description : Contains function definitions for deblocking luma */
///* edge. Functions are coded in NEON assembly and can */
///* be compiled using ARM RVDS. */
///* */
///* List of Functions : ih264_deblk_luma_vert_bs4_av8() */
///* ih264_deblk_luma_vert_bslt4_av8() */
///* ih264_deblk_luma_horz_bs4_av8() */
///* ih264_deblk_luma_horz_bslt4_av8() */
///* */
///* Issues / Problems : None */
///* */
///* Revision History : */
///* */
///* DD MM YYYY Author(s) Changes (Describe the changes made) */
///* 28 11 2013 Ittiam Draft */
///* */
///*****************************************************************************/
.text
.p2align 2
.include "ih264_neon_macros.s"
///**
//*******************************************************************************
//*
//* @brief
//* Performs filtering of a luma block horizontal edge for cases where the
//* boundary strength is less than 4
//*
//* @par Description:
//* This operation is described in Sec. 8.7.2.3 under the title
//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//* Source stride
//*
//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//* Packed Boundary strength array
//*
//* @param[in] x5 - pu1_cliptab
//* tc0_table
//*
//* @returns
//* None
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
.global ih264_deblk_luma_horz_bslt4_av8
ih264_deblk_luma_horz_bslt4_av8:
// Filters 16 pixels across one horizontal luma edge for bS < 4.
// Rows p2,p1,p0 (above the edge) and q0,q1,q2 (below) are read; p1,p0,q0,q1
// are always written back (unfiltered pixels get a zero delta).
//
// Register roles inside this routine:
//   x0        running row pointer, starts at p2 (= pu1_src - 3*src_strd)
//   x1        src_strd sign-extended to 64 bits
//   x6 / x7   pointers to the p1 / p0 rows, kept for the stores
//   v10=p2 v8=p1 v6=p0 v0=q0 v2=q1 v4=q2 (16 bytes each once the halves are merged;
//             the odd register v11/v9/v7/v1/v3/v5 keeps the upper 8 bytes)
//   v12       per-pixel "filter this pixel" mask
//   v14       per-pixel C0 taken from pu1_cliptab
//
// Legacy ARMv7 prologue kept for reference:
// STMFD sp!,{x4-x7,x14}
// push_v_regs comes from ih264_neon_macros.s (not visible here); presumably it
// saves the callee-saved SIMD registers v8-v15 that are used below - confirm.
push_v_regs
sxtw x1, w1
// NOTE(review): x19/x20 are saved and restored but never used in this routine.
stp x19, x20, [sp, #-16]!
//LDRD x4,x5,[SP,#0x14] //legacy ARMv7: x4 = ui_Bs , x5 = *puc_ClpTab (now passed in w4 / x5)
sub x0, x0, x1, lsl #1 //x0 = q0 - 2*src_strd
sub x0, x0, x1 //x0 = q0 - 3*src_strd, pointer to p2
rev w4, w4 //byte-reverse u4_bs so its most significant byte ends up in byte lane 0
ld1 {v10.8b, v11.8b}, [x0], x1 //p2 row -> v10 (low 8 bytes) / v11 (high 8 bytes)
mov v12.s[0], w4 //v12.s[0] = byte-reversed u4_bs
mov x6, x0 //keeping backup of pointer to p1
ld1 {v8.8b, v9.8b}, [x0], x1 //p1 row -> v8 / v9
mov x7, x0 //keeping backup of pointer to p0
ld1 {v6.8b, v7.8b}, [x0], x1 //p0 row -> v6 / v7
uxtl v12.8h, v12.8b //each bS byte widened to a 16-bit lane
ld1 {v0.8b, v1.8b}, [x0], x1 //q0 row -> v0 / v1
mov v10.d[1], v11.d[0] //v10 = full 16-byte p2
mov v8.d[1], v9.d[0] //v8 = full 16-byte p1
mov v6.d[1], v7.d[0] //v6 = full 16-byte p0
uabd v26.16b, v8.16b, v6.16b //v26 = ABS(p1 - p0)
ld1 {v2.8b, v3.8b}, [x0], x1 //q1 row -> v2 / v3
mov v0.d[1], v1.d[0] //v0 = full 16-byte q0
mov v2.d[1], v3.d[0] //v2 = full 16-byte q1
uabd v22.16b, v6.16b, v0.16b //v22 = ABS(p0 - q0)
ld1 {v16.s}[0], [x5] //first 4 bytes of pu1_cliptab -> v16.s[0]
uabd v24.16b, v2.16b, v0.16b //v24 = ABS(q1 - q0)
ld1 {v4.8b, v5.8b}, [x0], x1 //q2 row -> v4 / v5
tbl v14.8b, {v16.16b}, v12.8b //cliptab[bS] in the even bytes; odd bytes index entry 0 and are overwritten by the sli pair below
mov v4.d[1], v5.d[0] //v4 = full 16-byte q2
dup v20.16b, w2 //v20 = alpha in every byte
dup v16.16b, w3 //v16 = beta in every byte
uxtl v12.4s, v12.4h //one bS value per 32-bit lane (one lane = 4 pixels)
uxtl v14.4s, v14.4h //one C0 value per 32-bit lane
uabd v28.16b, v10.16b, v6.16b //v28 = Ap = ABS(p2 - p0)
uabd v30.16b, v4.16b, v0.16b //v30 = Aq = ABS(q2 - q0)
cmgt v12.4s, v12.4s, #0 //mask: bS > 0
sli v14.4s, v14.4s, #8 //replicate C0 into bytes 0-1 of each 32-bit lane
cmhs v18.16b, v22.16b, v20.16b //ABS(p0 - q0) >= Alpha
cmhs v24.16b, v24.16b, v16.16b //ABS(q1 - q0) >= Beta
cmhs v26.16b, v26.16b, v16.16b //ABS(p1 - p0) >= Beta
cmhi v20.16b, v16.16b , v28.16b //v20 = (Ap < Beta)
cmhi v22.16b, v16.16b , v30.16b //v22 = (Aq < Beta)
sli v14.4s, v14.4s, #16 //C0 now replicated into all 4 bytes of each lane
orr v18.16b, v18.16b , v24.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
usubl v30.8h, v1.8b, v7.8b //(q0 - p0) high half
usubl v24.8h, v0.8b, v6.8b //(q0 - p0) low half
orr v18.16b, v18.16b , v26.16b //v18 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
usubl v28.8h, v8.8b, v2.8b //(p1 - q1) low half
shl v26.8h, v30.8h, #2 //(q0 - p0)<<2 high half
shl v24.8h, v24.8h, #2 //(q0 - p0)<<2 low half
usubl v30.8h, v9.8b, v3.8b //(p1 - q1) high half
bic v12.16b, v12.16b , v18.16b //final condition: bS > 0 and no threshold exceeded
add v24.8h, v24.8h , v28.8h //low half
add v26.8h, v26.8h , v30.8h //v26,v24 = [ (q0 - p0)<<2 ] + (p1 - q1)
sub v18.16b, v14.16b , v20.16b //v18 = C0 + (Ap < Beta); subtracting the all-ones mask adds 1
urhadd v16.16b, v6.16b , v0.16b //v16 = ((p0+q0+1) >> 1)
mov v17.d[0], v16.d[1] //v17 = high half of the average
sqrshrn v24.8b, v24.8h, #3 //low half, rounded and saturated to 8 bits
sqrshrn v25.8b, v26.8h, #3 //i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
mov v24.d[1], v25.d[0] //v24 = 16 x i_macro
sub v18.16b, v18.16b , v22.16b //v18 = C = C0 + (Ap < Beta) + (Aq < Beta)
and v20.16b, v20.16b , v12.16b //p1 is modified only where (Ap < Beta) and the pixel is filtered
and v22.16b, v22.16b , v12.16b //q1 is modified only where (Aq < Beta) and the pixel is filtered
abs v26.16b, v24.16b //v26 = ABS (i_macro)
uaddl v28.8h, v17.8b, v11.8b //high half
uaddl v10.8h, v16.8b, v10.8b //v28,v10 = p2 + (p0+q0+1)>>1
uaddl v30.8h, v17.8b, v5.8b //high half of q2 + (p0+q0+1)>>1
umin v18.16b, v26.16b , v18.16b //v18 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
ushll v26.8h, v9.8b, #1 //(p1<<1) high half
uaddl v4.8h, v16.8b, v4.8b //v30,v4 = q2 + (p0+q0+1)>>1
ushll v16.8h, v8.8b, #1 //(p1<<1) low half
and v18.16b, v18.16b , v12.16b //delta forced to zero where the pixel must not be filtered
sub v28.8h, v28.8h , v26.8h //v28,v10 = [p2 + (p0+q0+1)>>1] - (p1<<1)
sub v10.8h, v10.8h , v16.8h //low half
ushll v16.8h, v2.8b, #1 //(q1<<1) low half
ushll v26.8h, v3.8b, #1 //(q1<<1) high half
sqshrn v29.8b, v28.8h, #1 //high half
sqshrn v28.8b, v10.8h, #1 //i_macro_p1 low half
mov v28.d[1], v29.d[0] //v28 = 16 x i_macro_p1
sub v4.8h, v4.8h , v16.8h //low half
sub v30.8h, v30.8h , v26.8h //v30,v4 = [q2 + (p0+q0+1)>>1] - (q1<<1)
neg v26.16b, v14.16b //v26 = -C0
smin v28.16b, v28.16b , v14.16b //v28 = min(C0,i_macro_p1)
cmge v24.16b, v24.16b, #0 //v24 = (i_macro >= 0)
sqshrn v31.8b, v30.8h, #1 //high half
sqshrn v30.8b, v4.8h, #1 //i_macro_q1 low half
mov v30.d[1], v31.d[0] //v30 = 16 x i_macro_q1
smax v28.16b, v28.16b , v26.16b //v28 = max( - C0 , min(C0, i_macro_p1) )
uqadd v16.16b, v6.16b , v18.16b //v16 = p0 + delta (saturating)
uqsub v6.16b, v6.16b , v18.16b //v6 = p0 - delta (saturating)
smin v30.16b, v30.16b , v14.16b //v30 = min(C0,i_macro_q1)
and v28.16b, v20.16b , v28.16b //keep the p1 delta only where Ap < Beta
uqadd v14.16b, v0.16b , v18.16b //v14 = q0 + delta (saturating)
uqsub v0.16b, v0.16b , v18.16b //v0 = q0 - delta (saturating)
smax v30.16b, v30.16b , v26.16b //v30 = max( - C0 , min(C0, i_macro_q1) )
bif v16.16b, v6.16b , v24.16b //v16 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
bif v0.16b, v14.16b , v24.16b //v0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
add v28.16b, v28.16b , v8.16b //v28 = p1 + clipped p1 delta
and v30.16b, v22.16b , v30.16b //keep the q1 delta only where Aq < Beta
st1 {v16.16b}, [x7], x1 //writing back filtered value of p0
add v30.16b, v30.16b , v2.16b //v30 = q1 + clipped q1 delta
st1 {v0.16b}, [x7], x1 //writing back filtered value of q0
st1 {v28.16b}, [x6] //writing back filtered value of p1
st1 {v30.16b}, [x7], x1 //writing back filtered value of q1
// Legacy ARMv7 epilogue kept for reference:
// LDMFD sp!,{x4-x7,pc}
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Performs filtering of a luma block horizontal edge when the
//* boundary strength is set to 4
//*
//* @par Description:
//* This operation is described in Sec. 8.7.2.4 under the title
//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//* Source stride
//*
//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @returns
//* None
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
.global ih264_deblk_luma_horz_bs4_av8
ih264_deblk_luma_horz_bs4_av8:
// Filters 16 pixels across one horizontal luma edge for bS == 4.
// Rows p3..p0 (above the edge) and q0..q3 (below) are read; p2,p1,p0,q0,q1,q2
// are always written back (a pixel that fails the filter test keeps its value).
//
// Register roles inside this routine:
//   x0   running pointer over the q rows (q0 on entry, rewound to q0 before the stores)
//   x1   src_strd sign-extended to 64 bits
//   x12  pointer to p0, x14 pointer to p1, x3 pointer to p2, x2 pointer to p3
//   v18  mask of pixels that must NOT be filtered (a threshold was exceeded)
//   v20  ABS(p0 - q0) < ((Alpha >> 2) + 2), later ANDed with (Ap < Beta)
//   v22  (Aq < Beta) && ABS(p0 - q0) < ((Alpha >> 2) + 2)
// Back up necessary registers on stack
// Legacy ARMv7 prologue kept for reference:
// STMFD sp!,{x12,x14}
// push_v_regs comes from ih264_neon_macros.s (not visible here); presumably it
// saves the callee-saved SIMD registers v8-v15 that are used below - confirm.
push_v_regs
// NOTE(review): x19/x20 are saved and restored but never used in this routine.
stp x19, x20, [sp, #-16]!
sxtw x1, w1
// Init
// alpha (w2) and beta (w3) are broadcast before x2 / x3 are reused as row pointers.
dup v0.16b, w2 //duplicate alpha
sub x12, x0, x1 //pointer to p0 = q0 - src_strd
dup v2.16b, w3 //duplicate beta
sub x14, x0, x1, lsl#1 //pointer to p1 = q0 - src_strd*2
sub x2, x0, x1, lsl#2 //pointer to p3 = q0 - src_strd*4
sub x3, x14, x1 //pointer to p2 = p1 - src_strd
// Load Data
ld1 {v4.8b, v5.8b}, [x0], x1 //load q0 to v4/v5, x0 += src_strd
ld1 {v6.8b, v7.8b}, [x12] //load p0 to v6/v7
ld1 {v8.8b, v9.8b}, [x0], x1 //load q1 to v8/v9, x0 += src_strd
ld1 {v10.8b, v11.8b}, [x14] //load p1 to v10/v11
mov v4.d[1] , v5.d[0] //v4 = full 16-byte q0
mov v6.d[1] , v7.d[0] //v6 = full 16-byte p0
mov v8.d[1] , v9.d[0] //v8 = full 16-byte q1
mov v10.d[1] , v11.d[0] //v10 = full 16-byte p1
// Filter Decision
uabd v12.16b , v4.16b, v6.16b //ABS(q0 - p0)
uabd v14.16b , v8.16b, v4.16b //ABS(q1 - q0)
uabd v16.16b , v10.16b, v6.16b //ABS(p1 - p0)
cmhs v18.16b, v12.16b , v0.16b //ABS(p0 - q0) >= Alpha
cmhs v14.16b, v14.16b , v2.16b //ABS(q1 - q0) >= Beta
cmhs v16.16b, v16.16b , v2.16b //ABS(p1 - p0) >= Beta
movi v20.16b, #2 //v20 = 2, base for (Alpha >> 2) + 2
orr v18.16b, v18.16b , v14.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta
ld1 {v14.8b, v15.8b}, [x0], x1 //load q2 to v14/v15, x0 += src_strd
mov v14.d[1] , v15.d[0] //v14 = full 16-byte q2
orr v18.16b, v18.16b , v16.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
usra v20.16b, v0.16b, #2 //v20 = (alpha >> 2) + 2
uabd v22.16b , v14.16b, v4.16b //Aq = ABS(q2 - q0)
uaddl v24.8h, v4.8b, v6.8b //p0+q0 L
uaddl v26.8h, v5.8b, v7.8b //p0+q0 H
cmhi v22.16b, v2.16b , v22.16b //Aq < Beta
cmhi v20.16b, v20.16b , v12.16b //(ABS(p0 - q0) <((Alpha >>2) + 2))
// Deblock Filtering q0', q1', q2'
uaddw v28.8h, v24.8h , v8.8b //p0+q0+q1 L
uaddw v30.8h, v26.8h , v9.8b //p0+q0+q1 H
and v22.16b, v22.16b , v20.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
// q0' if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) TRUE
// v0 (alpha) is no longer needed and is reused as the high-half accumulator.
add v16.8h, v28.8h , v28.8h //2*(p0+q0+q1)L
add v0.8h, v30.8h , v30.8h //2*(p0+q0+q1)H
uaddw v16.8h, v16.8h , v14.8b //2*(p0+q0+q1)+q2 L
uaddw v0.8h, v0.8h , v15.8b //2*(p0+q0+q1)+q2 H
uaddw v16.8h, v16.8h , v10.8b //2*(p0+q0+q1)+q2 +p1 L
uaddw v0.8h, v0.8h , v11.8b //2*(p0+q0+q1)+q2 +p1 H
rshrn v12.8b, v16.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 L [q0']
rshrn v13.8b, v0.8h, #3 //(2*(p0+q0+q1)+q2 +p1 +4)>> 3 H [q0']
mov v12.d[1] , v13.d[0] //v12 = 16 x q0'
// q0" if (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)) FALSE
uaddl v16.8h, v8.8b, v8.8b //2*q1 L
uaddl v0.8h, v9.8b, v9.8b //2*q1 H
uaddw v16.8h, v16.8h , v4.8b //2*q1+q0 L
uaddw v0.8h, v0.8h , v5.8b //2*q1+q0 H
uaddw v16.8h, v16.8h , v10.8b //2*q1+q0+p1 L
uaddw v0.8h, v0.8h , v11.8b //2*q1+q0+p1 H
rshrn v16.8b, v16.8h, #2 //(2*q1+q0+p1+2)>>2 L [q0"]
rshrn v17.8b, v0.8h, #2 //(2*q1+q0+p1+2)>>2 H [q0"]
mov v16.d[1] , v17.d[0] //v16 = 16 x q0"
uaddw v28.8h, v28.8h , v14.8b //p0+q0+q1+q2 L
uaddw v30.8h, v30.8h , v15.8b //p0+q0+q1+q2 H
ld1 {v0.8b, v1.8b}, [x0], x1 //load q3 to v0/v1, x0 += src_strd
mov v0.d[1] , v1.d[0] //v0 = full 16-byte q3
bit v16.16b, v12.16b , v22.16b //v16 = condition ? q0' : q0"
sub x0, x0, x1, lsl #2 //rewind the four post-increments: x0 points to q0 again
bic v22.16b, v22.16b , v18.16b //!(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
// && (Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
rshrn v12.8b, v28.8h, #2 //(p0+q0+q1+q2+2)>>2 L [q1']
rshrn v13.8b, v30.8h, #2 //(p0+q0+q1+q2+2)>>2 H [q1']
mov v12.d[1] , v13.d[0] //v12 = 16 x q1'
bif v4.16b, v16.16b , v18.16b //keep q0 where filtering is disabled, else take the filtered q0
mov v5.d[0] , v4.d[1] //split back into two 8-byte halves for the store
uaddl v16.8h, v14.8b, v0.8b //q2+q3,L
uaddl v0.8h, v15.8b, v1.8b //q2+q3,H
add v28.8h, v28.8h , v16.8h //p0+q0+q1+2*q2+q3 L
st1 {v4.8b, v5.8b}, [x0], x1 //store q0
add v30.8h, v30.8h , v0.8h //p0+q0+q1+2*q2+q3 H
add v28.8h, v28.8h , v16.8h //p0+q0+q1+3*q2+2*q3 L
add v30.8h, v30.8h , v0.8h //p0+q0+q1+3*q2+2*q3 H
rshrn v0.8b, v28.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 L [q2']
rshrn v1.8b, v30.8h, #3 //(p0+q0+q1+3*q2+2*q3+4)>>3 H [q2']
mov v0.d[1] , v1.d[0] //v0 = 16 x q2'
ld1 {v30.8b, v31.8b}, [x3] //load p2 to v30/v31
mov v30.d[1] , v31.d[0] //v30 = full 16-byte p2
bif v12.16b, v8.16b , v22.16b //choose q1 or filtered value of q1
mov v13.d[0] , v12.d[1] //split back into two 8-byte halves for the store
uabd v16.16b , v30.16b, v6.16b //Ap = ABS(p2 - p0)
uaddw v24.8h, v24.8h , v10.8b //p0+q0+p1 L
bif v0.16b, v14.16b , v22.16b //choose q2 or filtered q2
mov v1.d[0] , v0.d[1] //split back into two 8-byte halves for the store
uaddw v26.8h, v26.8h , v11.8b //p0+q0+p1 H
st1 {v12.8b, v13.8b}, [x0], x1 //store q1
cmhi v16.16b, v2.16b , v16.16b //Ap < Beta
add v28.8h, v24.8h , v24.8h //2*(p0+q0+p1) L
add v4.8h, v26.8h , v26.8h //2*(p0+q0+p1) H
st1 {v0.8b, v1.8b}, [x0], x1 //store q2
and v20.16b, v20.16b , v16.16b //((Ap < Beta) && (ABS(p0 - q0) <((Alpha >>2) + 2)))
uaddw v28.8h, v28.8h , v30.8b //2*(p0+q0+p1)+p2 L
uaddw v4.8h, v4.8h , v31.8b //2*(p0+q0+p1)+p2 H
uaddw v28.8h, v28.8h , v8.8b //2*(p0+q0+p1)+p2+q1 L
uaddw v4.8h, v4.8h , v9.8b //2*(p0+q0+p1)+p2+q1 H
rshrn v28.8b, v28.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 L,p0'
rshrn v29.8b, v4.8h, #3 //(2*(p0+q0+p1)+p2+q1+4)>>3 H,p0'
mov v28.d[1] , v29.d[0] //v28 = 16 x p0'
movi v0.8b, #2 //byte constant 2 for the umlal below
movi v1.4h, #2 //halfword constant 2 for the mla below
// v2 (beta) is no longer needed and is reused as an accumulator.
uaddl v2.8h, v6.8b, v8.8b //p0+q1 L
umlal v2.8h, v10.8b, v0.8b //2*p1+p0+q1 L
uaddl v16.8h, v7.8b, v9.8b //p0+q1 H
umlal v16.8h, v11.8b, v0.8b //2*p1+p0+q1 H
uaddw v12.8h, v24.8h , v30.8b //(p0+q0+p1) +p2 L
ld1 {v24.8b, v25.8b}, [x2] //load p3 to v24/v25
mov v24.d[1] , v25.d[0] //v24 = full 16-byte p3
uaddw v4.8h, v26.8h , v31.8b //(p0+q0+p1) +p2 H
uaddl v8.8h, v30.8b, v24.8b //p2+p3 L
rshrn v26.8b, v12.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' L
rshrn v2.8b, v2.8h, #2 //(2*p1+p0+q1+2)>>2,p0"L
rshrn v27.8b, v4.8h, #2 //((p0+q0+p1)+p2 +2)>>2,p1' H
rshrn v3.8b, v16.8h, #2 //(2*p1+p0+q1+2)>>2,p0" H
mov v26.d[1] , v27.d[0] //v26 = 16 x p1'
mov v2.d[1] , v3.d[0] //v2 = 16 x p0"
uaddl v16.8h, v31.8b, v25.8b //p2+p3 H
mla v12.8h, v8.8h , v1.h[0] //(p0+q0+p1)+3*p2+2*p3 L
mla v4.8h, v16.8h , v1.h[0] //(p0+q0+p1)+3*p2+2*p3 H
bic v16.16b, v20.16b , v18.16b //!(ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
mov v17.d[0] , v16.d[1] //&& (Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)); v17 is not read again in this routine
bit v2.16b, v28.16b , v20.16b //choosing between p0' and p0"
mov v3.d[0] , v2.d[1] //high half copy; v3 is not read again in this routine
rshrn v12.8b, v12.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 L p2'
rshrn v13.8b, v4.8h, #3 //((p0+q0+p1)+3*p2+2*p3+4)>>3 H p2'
mov v12.d[1] , v13.d[0] //v12 = 16 x p2'
bif v6.16b, v2.16b , v18.16b //choosing between p0 and filtered value of p0
bit v10.16b, v26.16b , v16.16b //choosing between p1 and p1'
bit v30.16b, v12.16b , v16.16b //choosing between p2 and p2'
st1 {v6.16b}, [x12] //store p0
st1 {v10.16b}, [x14] //store p1
st1 {v30.16b}, [x3] //store p2
// Legacy ARMv7 epilogue kept for reference:
// LDMFD sp!,{x12,pc}
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Performs filtering of a luma block vertical edge for cases where the
//* boundary strength is less than 4
//*
//* @par Description:
//* This operation is described in Sec. 8.7.2.3 under the title
//* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//* Source stride
//*
//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @param[in] w4 - u4_bs
//* Packed Boundary strength array
//*
//* @param[in] x5 - pu1_cliptab
//* tc0_table
//*
//* @returns
//* None
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
.global ih264_deblk_luma_vert_bslt4_av8
ih264_deblk_luma_vert_bslt4_av8:
// Filters 16 rows across one vertical luma edge for bS < 4.
// For each of the 16 rows the 8 bytes p3 p2 p1 p0 q0 q1 q2 q3 around the edge
// are loaded, the 16x8 block is transposed so that each pixel position becomes
// one 16-byte vector, the filter is applied, and the block is transposed back
// and stored. All 8 bytes of every row are rewritten.
//
// Register roles inside this routine:
//   x0   running row pointer, starts at pu1_src - 4
//   x1   src_strd sign-extended to 64 bits
//   x12  u4_bs, x14 pu1_cliptab
//   after the first transpose: v0=p3 v2=p2 v4=p1 v6=p0 v8=q0 v10=q1 v12=q2 v14=q3
//        for rows 1-8, and v1,v3,...,v15 hold the same columns for rows 9-16
//   v16  per-pixel C0 (later C), v26 mask of pixels that must NOT be filtered
//
// Legacy ARMv7 prologue kept for reference:
// STMFD sp!,{x12,x14}
// push_v_regs comes from ih264_neon_macros.s (not visible here); presumably it
// saves the callee-saved SIMD registers v8-v15 that are used below - confirm.
push_v_regs
// NOTE(review): x19/x20 are saved and restored but never used in this routine.
stp x19, x20, [sp, #-16]!
sxtw x1, w1
sub x0, x0, #4 //x0 = edge pixel - 4, i.e. the p3 byte of row 1
mov x12, x4 //x12 = u4_bs
mov x14, x5 //x14 = pu1_cliptab
mov x17, x0 //copy of the start pointer; x17 is not read again in this routine
//loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
ld1 {v0.8b}, [x0], x1 //row1
ld1 {v2.8b}, [x0], x1 //row2
ld1 {v4.8b}, [x0], x1 //row3
rev w12, w12 //byte-reverse u4_bs so its most significant byte ends up in byte lane 0
ld1 {v6.8b}, [x0], x1 //row4
mov v18.s[0], w12 //v18.s[0] = byte-reversed u4_bs
ld1 {v16.s}[0], [x14] //first 4 bytes of pu1_cliptab -> v16.s[0]
ld1 {v8.8b}, [x0], x1 //row5
uxtl v18.8h, v18.8b //each bS byte widened to a 16-bit lane
ld1 {v10.8b}, [x0], x1 //row6
ld1 {v12.8b}, [x0], x1 //row7
tbl v16.8b, {v16.16b}, v18.8b //puc_ClipTab[uc_Bs] in the even bytes
ld1 {v14.8b}, [x0], x1 //row8
ld1 {v1.8b}, [x0], x1 //row9
uxtl v16.4s, v16.4h //one C0 value per 32-bit lane (one lane = 4 rows)
ld1 {v3.8b}, [x0], x1 //row10
ld1 {v5.8b}, [x0], x1 //row11
ld1 {v7.8b}, [x0], x1 //row12
sli v16.4s, v16.4s, #8 //replicate C0 into bytes 0-1 of each 32-bit lane
ld1 {v9.8b}, [x0], x1 //row13
ld1 {v11.8b}, [x0], x1 //row14
ld1 {v13.8b}, [x0], x1 //row15
sli v16.4s, v16.4s, #16 //C0 now replicated into all 4 bytes of each lane
ld1 {v15.8b}, [x0], x1 //row16
//taking two 8x8 transposes (rows 1-8 in the even registers, rows 9-16 in the odd ones)
//each trn1/trn2/mov triple uses v21 (or v31) as scratch to transpose one register pair in place
//2X2 transposes
trn1 v21.8b, v0.8b, v2.8b
trn2 v2.8b, v0.8b, v2.8b //row1 &2
mov v0.8b, v21.8b
trn1 v21.8b, v4.8b, v6.8b
trn2 v6.8b, v4.8b, v6.8b //row3&row4
mov v4.8b, v21.8b
trn1 v21.8b, v8.8b, v10.8b
trn2 v10.8b, v8.8b, v10.8b //row5&6
mov v8.8b, v21.8b
trn1 v21.8b, v12.8b, v14.8b
trn2 v14.8b, v12.8b, v14.8b //row7 & 8
mov v12.8b, v21.8b
trn1 v21.8b, v1.8b, v3.8b
trn2 v3.8b, v1.8b, v3.8b //row9 &10
mov v1.8b, v21.8b
trn1 v21.8b, v5.8b, v7.8b
trn2 v7.8b, v5.8b, v7.8b //row11 & 12
mov v5.8b, v21.8b
trn1 v21.8b, v9.8b, v11.8b
trn2 v11.8b, v9.8b, v11.8b //row13 &14
mov v9.8b, v21.8b
trn1 v21.8b, v13.8b, v15.8b
trn2 v15.8b, v13.8b, v15.8b //row15 & 16
mov v13.8b, v21.8b
//4x4 transposes
trn1 v21.4h, v2.4h, v6.4h
trn2 v6.4h, v2.4h, v6.4h //row2 & row4
mov v2.8b, v21.8b
trn1 v21.4h, v10.4h, v14.4h
trn2 v14.4h, v10.4h, v14.4h //row6 & row8
mov v10.8b, v21.8b
trn1 v21.4h, v3.4h, v7.4h
trn2 v7.4h, v3.4h, v7.4h //row10 & 12
mov v3.8b, v21.8b
trn1 v21.4h, v11.4h, v15.4h
trn2 v15.4h, v11.4h, v15.4h //row14 & row16
mov v11.8b, v21.8b
trn1 v21.2s, v6.2s, v14.2s
trn2 v14.2s, v6.2s, v14.2s //row4 & 8
mov v6.8b, v21.8b
trn1 v21.2s, v7.2s, v15.2s
trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
mov v7.8b, v21.8b
//now v6/v7 -> p0 and v14/v15 -> q3
trn1 v21.4h, v0.4h, v4.4h
trn2 v4.4h, v0.4h, v4.4h //row1 & 3
mov v0.8b, v21.8b
trn1 v21.4h, v8.4h, v12.4h
trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
mov v8.8b, v21.8b
trn1 v21.4h, v1.4h, v5.4h
trn2 v5.4h, v1.4h, v5.4h //row9 & row11
mov v1.8b, v21.8b
trn1 v21.4h, v9.4h, v13.4h
trn2 v13.4h, v9.4h, v13.4h //row13 & row15
mov v9.8b, v21.8b
trn1 v21.2s, v0.2s, v8.2s
trn2 v8.2s, v0.2s, v8.2s //row1 & row5
mov v0.8b, v21.8b
trn1 v21.2s, v1.2s, v9.2s
trn2 v9.2s, v1.2s, v9.2s //row9 & 13
mov v1.8b, v21.8b
//now v0/v1 -> p3 & v8/v9 -> q0
//starting processing as p0 and q0 are now ready
trn1 v21.2s, v2.2s, v10.2s
trn2 v10.2s, v2.2s, v10.2s //row2 &6
mov v2.8b, v21.8b
mov v6.d[1] , v7.d[0] //v6 = p0 for all 16 rows
mov v8.d[1] , v9.d[0] //v8 = q0 for all 16 rows
urhadd v20.16b, v6.16b , v8.16b //((p0 + q0 + 1) >> 1)
mov v21.d[0], v20.d[1] //v21 = high half of the average; v31 is the transpose scratch from here on
trn1 v31.2s, v3.2s, v11.2s
trn2 v11.2s, v3.2s, v11.2s //row10&row14
mov v3.8b, v31.8b
movi v19.8b, #2 //byte constant 2, used by the umlsl pair to subtract (p1 << 1)
mov v18.d[1], v19.d[0] //upper half of v18 overwritten; only v18.4h (the bS values) is read later
//now v2/v3 -> p2 & v10/v11 -> q1
trn1 v31.2s, v4.2s, v12.2s
trn2 v12.2s, v4.2s, v12.2s //row3 & 7
mov v4.8b, v31.8b
uabd v22.16b , v6.16b, v8.16b //ABS(p0 - q0)
trn1 v31.2s, v5.2s, v13.2s
trn2 v13.2s, v5.2s, v13.2s //row11 & row15
mov v5.8b, v31.8b
mov v0.d[1] , v1.d[0] //v0 = p3 for all 16 rows
mov v2.d[1] , v3.d[0] //v2 = p2 for all 16 rows
mov v4.d[1] , v5.d[0] //v4 = p1 for all 16 rows
mov v10.d[1] , v11.d[0] //v10 = q1 for all 16 rows
mov v12.d[1] , v13.d[0] //v12 = q2 for all 16 rows
mov v14.d[1] , v15.d[0] //v14 = q3 for all 16 rows
uaddl v24.8h, v20.8b, v2.8b //(p2 + ((p0 + q0 + 1) >> 1) L
//now v4/v5 -> p1, v12/v13 -> q2
uaddl v26.8h, v21.8b, v3.8b //(p2 + ((p0 + q0 + 1) >> 1) H
umlsl v24.8h, v4.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) L
umlsl v26.8h, v5.8b, v19.8b //(p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) H
dup v28.16b, w2 //alpha
cmhs v22.16b, v22.16b , v28.16b //ABS(p0 - q0) >= Alpha(Alpha <=ABS(p0 - q0))
dup v28.16b, w3 //beta
uabd v30.16b , v10.16b, v8.16b //ABS(q1 - q0)
sqshrn v24.8b, v24.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) L
sqshrn v25.8b, v26.8h, #1 //((p2 + ((p0 + q0 + 1) >> 1) - (p1 << 1)) >> 1) H
mov v24.d[1], v25.d[0] //v24 = 16 x deltap1 (unclipped)
cmhs v30.16b, v30.16b , v28.16b //ABS(q1 - q0) >= Beta
uabd v26.16b , v4.16b, v6.16b //ABS(p1 - p0)
smin v24.16b, v24.16b , v16.16b //min(deltap1 ,C0)
orr v22.16b, v22.16b , v30.16b //ABS(q1 - q0) >= Beta ||ABS(p0 - q0) >= Alpha
neg v30.16b, v16.16b //-C0
cmhs v26.16b, v26.16b , v28.16b //ABS(p1 - p0) >= Beta
smax v24.16b, v24.16b , v30.16b //max(deltap1,-C0)
orr v22.16b, v22.16b , v26.16b //ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta)
uxtl v26.4s, v18.4h //ui_bs, one value per 32-bit lane
uaddl v18.8h, v20.8b, v12.8b //q2 + ((p0 + q0 + 1) >> 1) L
cmeq v26.4s, v26.4s , #0 //mask: ui_bs == 0
usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) L
uaddl v20.8h, v21.8b, v13.8b //q2 + ((p0 + q0 + 1) >> 1) H
usubw v18.8h, v18.8h , v10.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1)L
usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - q1) H
orr v26.16b, v26.16b , v22.16b //"do not filter" mask: (ui_bs == 0) || ABS(p0 - q0) >= Alpha || ABS(q1 - q0) >= Beta || ABS(p1 - p0) >= Beta
usubw v20.8h, v20.8h , v11.8b //(q2 + ((p0 + q0 + 1) >> 1) - 2*q1) H
sqshrn v18.8b, v18.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) L
uabd v22.16b , v2.16b, v6.16b //Ap = ABS(p2 - p0)
sqshrn v19.8b, v20.8h, #1 //((q2 + ((p0 + q0 + 1) >> 1) - (q1 << 1)) >> 1) H
mov v18.d[1], v19.d[0] //v18 = 16 x deltaq1 (unclipped)
uabd v20.16b , v12.16b, v8.16b //Aq = ABS(q2 - q0)
cmhi v22.16b, v28.16b , v22.16b //Ap < Beta
smin v18.16b, v18.16b , v16.16b //min(deltaq1,C0)
cmhi v20.16b, v28.16b , v20.16b //Aq <Beta
usubl v28.8h, v8.8b, v6.8b //(q0 - p0) L
smax v18.16b, v18.16b , v30.16b //max(deltaq1,-C0)
usubl v30.8h, v9.8b, v7.8b //(q0 - p0) H
shl v28.8h, v28.8h, #2 //(q0 - p0)<<2 L
sub v16.16b, v16.16b , v22.16b //C0 + (Ap < Beta); subtracting the all-ones mask adds 1
shl v30.8h, v30.8h, #2 //(q0 - p0) << 2) H
uaddw v28.8h, v28.8h , v4.8b //((q0 - p0) << 2) + (p1 L
uaddw v30.8h, v30.8h , v5.8b //((q0 - p0) << 2) + (p1 H
usubw v28.8h, v28.8h , v10.8b //((q0 - p0) << 2) + (p1 - q1) L
usubw v30.8h, v30.8h , v11.8b //((q0 - p0) << 2) + (p1 - q1) H
bic v22.16b, v22.16b , v26.16b //final condition for p1
rshrn v28.8b, v28.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3); L
rshrn v29.8b, v30.8h, #3 //delta = ((((q0 - p0) << 2) + (p1 - q1) + 4) >> 3) H
mov v28.d[1], v29.d[0] //v28 = 16 x delta
sub v16.16b, v16.16b , v20.16b //C = C0 + (Ap < Beta) + (Aq < Beta)
bic v20.16b, v20.16b , v26.16b //final condition for q1
abs v30.16b, v28.16b //abs(delta)
and v24.16b, v24.16b , v22.16b //deltap1, zero where p1 must stay unchanged
and v18.16b, v18.16b , v20.16b //deltaq1, zero where q1 must stay unchanged
umin v30.16b, v30.16b , v16.16b //min((abs(delta),C)
add v4.16b, v4.16b , v24.16b //p1+deltap1
add v10.16b, v10.16b , v18.16b //q1+deltaq1
mov v5.d[0], v4.d[1] //split p1 back into two 8-byte halves for the inverse transpose
mov v11.d[0], v10.d[1] //split q1 back into two 8-byte halves for the inverse transpose
bic v30.16b, v30.16b , v26.16b //abs(delta) of pixels to be changed only
// VCGE.S8 Q14, Q14,#0 //legacy ARMv7 form of the next instruction: sign(delta)
cmge v28.16b, v28.16b , #0 //v28 = (delta >= 0)
uqsub v22.16b, v6.16b , v30.16b //clip(p0-delta)
// Inverse transpose, interleaved with the end of the filter. The "rowN" labels
// below name the registers by the rows they were loaded from and are stored to
// (v0=row1, v2=row2, ... v14=row8, v1=row9, ... v15=row16).
trn1 v21.8b, v0.8b, v2.8b
trn2 v2.8b, v0.8b, v2.8b //row1 &2
mov v0.8b, v21.8b
uqadd v6.16b, v6.16b , v30.16b //clip(p0+delta)
trn1 v21.8b, v1.8b, v3.8b
trn2 v3.8b, v1.8b, v3.8b //row9 &10
mov v1.8b, v21.8b
uqadd v24.16b, v8.16b , v30.16b //clip(q0+delta)
trn1 v21.8b, v12.8b, v14.8b
trn2 v14.8b, v12.8b, v14.8b //row7 & 8
mov v12.8b, v21.8b
uqsub v8.16b, v8.16b , v30.16b //clip(q0-delta)
trn1 v21.8b, v13.8b, v15.8b
trn2 v15.8b, v13.8b, v15.8b //row15 & 16
mov v13.8b, v21.8b
bif v6.16b, v22.16b , v28.16b //p0 = (delta >= 0) ? p0+delta : p0-delta
bif v8.16b, v24.16b , v28.16b //q0 = (delta >= 0) ? q0-delta : q0+delta
mov v7.d[0], v6.d[1] //split p0 back into two 8-byte halves
mov v9.d[0], v8.d[1] //split q0 back into two 8-byte halves
trn1 v21.8b, v4.8b, v6.8b
trn2 v6.8b, v4.8b, v6.8b //row3&row4
mov v4.8b, v21.8b
trn1 v21.8b, v8.8b, v10.8b
trn2 v10.8b, v8.8b, v10.8b //row5&6
mov v8.8b, v21.8b
trn1 v21.8b, v5.8b, v7.8b
trn2 v7.8b, v5.8b, v7.8b //row11 & 12
mov v5.8b, v21.8b
trn1 v21.8b, v9.8b, v11.8b
trn2 v11.8b, v9.8b, v11.8b //row13 &14
mov v9.8b, v21.8b
trn1 v21.4h, v2.4h, v6.4h
trn2 v6.4h, v2.4h, v6.4h //row2 & row4
mov v2.8b, v21.8b
trn1 v21.4h, v10.4h, v14.4h
trn2 v14.4h, v10.4h, v14.4h //row6 & row8
mov v10.8b, v21.8b
trn1 v21.4h, v3.4h, v7.4h
trn2 v7.4h, v3.4h, v7.4h //row10 & 12
mov v3.8b, v21.8b
trn1 v21.4h, v11.4h, v15.4h
trn2 v15.4h, v11.4h, v15.4h //row14 & row16
mov v11.8b, v21.8b
trn1 v21.2s, v6.2s, v14.2s
trn2 v14.2s, v6.2s, v14.2s //row4 & 8
mov v6.8b, v21.8b
trn1 v21.2s, v7.2s, v15.2s
trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
mov v7.8b, v21.8b
//v6/v7 and v14/v15 are now back in row order (rows 4, 12 and rows 8, 16)
trn1 v21.4h, v0.4h, v4.4h
trn2 v4.4h, v0.4h, v4.4h //row1 & 3
mov v0.8b, v21.8b
trn1 v21.4h, v8.4h, v12.4h
trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
mov v8.8b, v21.8b
trn1 v21.4h, v1.4h, v5.4h
trn2 v5.4h, v1.4h, v5.4h //row9 & row11
mov v1.8b, v21.8b
trn1 v21.4h, v9.4h, v13.4h
trn2 v13.4h, v9.4h, v13.4h //row13 & row15
mov v9.8b, v21.8b
sub x0, x0, x1, lsl#4 //rewind the 16 post-increments: x0 points to row 1 again
trn1 v21.2s, v0.2s, v8.2s
trn2 v8.2s, v0.2s, v8.2s //row1 & row5
mov v0.8b, v21.8b
trn1 v21.2s, v1.2s, v9.2s
trn2 v9.2s, v1.2s, v9.2s //row9 & 13
mov v1.8b, v21.8b
trn1 v21.2s, v2.2s, v10.2s
trn2 v10.2s, v2.2s, v10.2s //row2 &6
mov v2.8b, v21.8b
trn1 v21.2s, v3.2s, v11.2s
trn2 v11.2s, v3.2s, v11.2s //row10&row14
mov v3.8b, v21.8b
trn1 v21.2s, v4.2s, v12.2s
trn2 v12.2s, v4.2s, v12.2s //row3 & 7
mov v4.8b, v21.8b
trn1 v21.2s, v5.2s, v13.2s
trn2 v13.2s, v5.2s, v13.2s //row11 & row15
mov v5.8b, v21.8b
//storing p3:p2:p1:p0:q0:q1:q2:q3 for every row
st1 {v0.8b}, [x0], x1 //row1
st1 {v2.8b}, [x0], x1 //row2
st1 {v4.8b}, [x0], x1 //row3
st1 {v6.8b}, [x0], x1 //row4
st1 {v8.8b}, [x0], x1 //row5
st1 {v10.8b}, [x0], x1 //row6
st1 {v12.8b}, [x0], x1 //row7
st1 {v14.8b}, [x0], x1 //row8
st1 {v1.8b}, [x0], x1 //row9
st1 {v3.8b}, [x0], x1 //row10
st1 {v5.8b}, [x0], x1 //row11
st1 {v7.8b}, [x0], x1 //row12
st1 {v9.8b}, [x0], x1 //row13
st1 {v11.8b}, [x0], x1 //row14
st1 {v13.8b}, [x0], x1 //row15
st1 {v15.8b}, [x0], x1 //row16
// Legacy ARMv7 epilogue kept for reference:
// LDMFD sp!,{x12,pc}
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Performs filtering of a luma block vertical edge when the
//* boundary strength is set to 4
//*
//* @par Description:
//* This operation is described in Sec. 8.7.2.4 under the title
//* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
//*
//* @param[in] x0 - pu1_src
//* Pointer to the src sample q0
//*
//* @param[in] w1 - src_strd
//* Source stride
//*
//* @param[in] w2 - alpha
//* Alpha Value for the boundary
//*
//* @param[in] w3 - beta
//* Beta Value for the boundary
//*
//* @returns
//* None
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
.global ih264_deblk_luma_vert_bs4_av8
ih264_deblk_luma_vert_bs4_av8:
// STMFD sp!,{x12,x14}
push_v_regs
stp x19, x20, [sp, #-16]!
sub x0, x0, #4 //pointer uc_edgePixel-4
mov x17, x0
//loading p3:p2:p1:p0:q0:q1:q2:q3 for every row
ld1 {v0.8b}, [x0], x1 //row1
ld1 {v2.8b}, [x0], x1 //row2
ld1 {v4.8b}, [x0], x1 //row3
ld1 {v6.8b}, [x0], x1 //row4
ld1 {v8.8b}, [x0], x1 //row5
ld1 {v10.8b}, [x0], x1 //row6
ld1 {v12.8b}, [x0], x1 //row7
ld1 {v14.8b}, [x0], x1 //row8
ld1 {v1.8b}, [x0], x1 //row9
ld1 {v3.8b}, [x0], x1 //row10
ld1 {v5.8b}, [x0], x1 //row11
ld1 {v7.8b}, [x0], x1 //row12
ld1 {v9.8b}, [x0], x1 //row13
ld1 {v11.8b}, [x0], x1 //row14
ld1 {v13.8b}, [x0], x1 //row15
ld1 {v15.8b}, [x0], x1 //row16
//taking two 8x8 transposes
//2X2 transposes
trn1 v21.8b, v0.8b, v2.8b
trn2 v2.8b, v0.8b, v2.8b //row1 &2
mov v0.8b, v21.8b
trn1 v21.8b, v4.8b, v6.8b
trn2 v6.8b, v4.8b, v6.8b //row3&row4
mov v4.8b, v21.8b
trn1 v21.8b, v8.8b, v10.8b
trn2 v10.8b, v8.8b, v10.8b //row5&6
mov v8.8b, v21.8b
trn1 v21.8b, v12.8b, v14.8b
trn2 v14.8b, v12.8b, v14.8b //row7 & 8
mov v12.8b, v21.8b
trn1 v21.8b, v1.8b, v3.8b
trn2 v3.8b, v1.8b, v3.8b //row9 &10
mov v1.8b , v21.8b
trn1 v21.8b, v5.8b, v7.8b
trn2 v7.8b, v5.8b, v7.8b //row11 & 12
mov v5.8b , v21.8b
trn1 v21.8b, v9.8b, v11.8b
trn2 v11.8b, v9.8b, v11.8b //row13 &14
mov v9.8b , v21.8b
trn1 v21.8b, v13.8b, v15.8b
trn2 v15.8b, v13.8b, v15.8b //row15 & 16
mov v13.8b , v21.8b
//4x4 transposes
trn1 v21.4h, v2.4h, v6.4h
trn2 v6.4h, v2.4h, v6.4h //row2 & row4
mov v2.8b, v21.8b
trn1 v21.4h, v10.4h, v14.4h
trn2 v14.4h, v10.4h, v14.4h //row6 & row8
mov v10.8b , v21.8b
trn1 v21.4h, v3.4h, v7.4h
trn2 v7.4h, v3.4h, v7.4h //row10 & 12
mov v3.8b, v21.8b
trn1 v21.4h, v11.4h, v15.4h
trn2 v15.4h, v11.4h, v15.4h //row14 & row16
mov v11.8b, v21.8b
trn1 v21.2s, v6.2s, v14.2s
trn2 v14.2s, v6.2s, v14.2s //row4 & 8
mov v6.8b, v21.8b
trn1 v21.2s, v7.2s, v15.2s
trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
mov v7.8b, v21.8b
//now Q3 ->p0 and Q7->q3
trn1 v21.4h, v0.4h, v4.4h
trn2 v4.4h, v0.4h, v4.4h //row1 & 3
mov v0.8b , v21.8b
trn1 v21.4h, v8.4h, v12.4h
trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
mov v8.8b, v21.8b
trn1 v21.4h, v1.4h, v5.4h
trn2 v5.4h, v1.4h, v5.4h //row9 & row11
mov v1.8b, v21.8b
trn1 v21.4h, v9.4h, v13.4h
trn2 v13.4h, v9.4h, v13.4h //row13 & row15
mov v9.8b , v21.8b
trn1 v21.2s, v0.2s, v8.2s
trn2 v8.2s, v0.2s, v8.2s //row1 & row5
mov v0.8b, v21.8b
trn1 v21.2s, v1.2s, v9.2s
trn2 v9.2s, v1.2s, v9.2s //row9 & 13
mov v1.8b, v21.8b
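//now v0/v1 hold p3 and v8/v9 hold q0
//p0 (v6/v7) and q0 (v8/v9) are ready, so filter arithmetic starts here,
//interleaved with the remaining 32-bit transposes that produce
//p1 (v4/v5), q2 (v12/v13), p2 (v2/v3) and q1 (v10/v11); q3 is parked in v31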
mov v31.d[0], v14.d[0]
mov v31.d[1], v15.d[0]
trn1 v21.2s, v4.2s, v12.2s
trn2 v12.2s, v4.2s, v12.2s //row3 & 7
mov v4.8b, v21.8b
movi v28.8h, #2
trn1 v21.2s, v5.2s, v13.2s
trn2 v13.2s, v5.2s, v13.2s //row11 & row15
mov v5.8b, v21.8b
uaddl v16.8h, v6.8b, v8.8b //p0+q0 L
trn1 v21.2s, v2.2s, v10.2s
trn2 v10.2s, v2.2s, v10.2s //row2 &6
mov v2.8b, v21.8b
uaddl v18.8h, v7.8b, v9.8b //p0+q0 H
trn1 v21.2s, v3.2s, v11.2s
trn2 v11.2s, v3.2s, v11.2s //row10&row14
mov v3.8b, v21.8b
uaddw v20.8h, v16.8h , v4.8b //p0+q0+p1 L
uaddw v22.8h, v18.8h , v5.8b //p0+q0+p1 H
uaddl v24.8h, v2.8b, v10.8b //p2+q1 L
uaddl v26.8h, v3.8b, v11.8b //p2+q1 H
mla v24.8h, v20.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 L
mla v26.8h, v22.8h , v28.8h //p2 + X2(p1) + X2(p0) + X2(q0) + q1 H
movi v28.16b, #2
uaddw v16.8h, v20.8h , v2.8b //p0+q0+p1+p2 L
uaddw v18.8h, v22.8h , v3.8b //p0+q0+p1+p2 H
dup v30.16b, w2 //duplicate alpha
rshrn v20.8b, v16.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)L p1'
rshrn v21.8b, v18.8h, #2 //(p2 + p1 + p0 + q0 + 2) >> 2)H p1'
mov v20.d[1] , v21.d[0]
mov v0.d[1] , v1.d[0]
mov v2.d[1] , v3.d[0]
mov v4.d[1] , v5.d[0]
mov v6.d[1] , v7.d[0]
mov v8.d[1] , v9.d[0]
mov v10.d[1] , v11.d[0]
mov v12.d[1] , v13.d[0]
mov v14.d[1] , v15.d[0]
uabd v22.16b , v6.16b, v8.16b
usra v28.16b, v30.16b, #2 //alpha >>2 +2
uabd v30.16b , v2.16b, v6.16b
rshrn v24.8b, v24.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) L p0'
rshrn v25.8b, v26.8h, #3 //((p2 + X2(p1) + X2(p0) + X2(q0) + q1 + 4) >> 3) H p0'
mov v24.d[1] , v25.d[0]
dup v26.16b, w3 //beta
cmhi v28.16b, v28.16b , v22.16b //ABS(p0 - q0) <((Alpha >>2) + 2)
uaddl v22.8h, v6.8b, v10.8b //p0+q1 L
cmhi v14.16b, v26.16b , v30.16b //beta>Ap
uaddl v30.8h, v7.8b, v11.8b //p0+q1 H
uaddw v22.8h, v22.8h , v4.8b //p0+q1+p1 L
uaddw v30.8h, v30.8h , v5.8b //p0+q1+p1 H
uaddw v22.8h, v22.8h , v4.8b //p0+q1+2*p1 L
uaddw v30.8h, v30.8h , v5.8b //p0+q1+2*p1 H
and v14.16b, v14.16b , v28.16b //(Ap < Beta && ABS(p0 - q0) <((Alpha >>2) + 2)
rshrn v22.8b, v22.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) L p0"
rshrn v23.8b, v30.8h, #2 //((X2(p1) + p0 + q1 + 2) >> 2) H p0"
mov v22.d[1] , v23.d[0]
uaddl v30.8h, v2.8b, v0.8b //p2+p3 L
bif v24.16b, v22.16b , v14.16b //p0' or p0 "
uaddl v22.8h, v3.8b, v1.8b //p2+p3 H
add v30.8h, v30.8h , v30.8h //2*(p2+p3) L
add v22.8h, v22.8h , v22.8h //2*(p2+p3)H
add v16.8h, v16.8h , v30.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) L
add v18.8h, v18.8h , v22.8h //(X2(p3) + X3(p2) + p1 + p0 + q0) H
uabd v30.16b , v12.16b, v8.16b
uabd v22.16b , v10.16b, v8.16b
rshrn v16.8b, v16.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); L p2'
rshrn v17.8b, v18.8h, #3 //((X2(p3) + X3(p2) + p1 + p0 + q0 + 4) >> 3); H p2'
mov v16.d[1] , v17.d[0]
uabd v18.16b , v4.16b, v6.16b
cmhi v30.16b, v26.16b , v30.16b //Aq < Beta
cmhs v22.16b, v22.16b, v26.16b
cmhs v18.16b, v18.16b, v26.16b
dup v26.16b, w2 //duplicate alpha
and v30.16b, v30.16b , v28.16b //(Aq < Beta && ABS(p0 - q0) <((Alpha >>2) + 2))
uabd v28.16b , v6.16b, v8.16b
orr v22.16b, v22.16b , v18.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta
uaddl v18.8h, v6.8b, v8.8b //p0+q0 L
cmhs v28.16b, v28.16b, v26.16b
uaddl v26.8h, v7.8b, v9.8b //p0+q0 H
uaddw v18.8h, v18.8h , v10.8b //p0+q0+q1 L
orr v22.16b, v22.16b , v28.16b //ABS(p1 - p0) >= Beta || ABS(q1 - q0) >= Beta||ABS(p0 - q0) >= Alpha
uaddw v26.8h, v26.8h , v11.8b //p0+q0+q1 H
bic v14.16b, v14.16b , v22.16b //final condn for p's
movi v28.16b, #2
bif v6.16b, v24.16b , v22.16b //final p0
bit v2.16b, v16.16b , v14.16b //final p2
bif v20.16b, v4.16b , v14.16b //final p1
mov v7.d[0] , v6.d[1]
mov v3.d[0] , v2.d[1]
mov v21.d[0] , v20.d[1]
uaddl v24.8h, v8.8b, v4.8b //q0+p1 L
umlal v24.8h, v10.8b, v28.8b //X2(q1) + q0 + p1 L
uaddl v16.8h, v9.8b, v5.8b //q0+p1 H
umlal v16.8h, v11.8b, v28.8b //X2(q1) + q0 + p1 H
movi v28.8h, #2
uaddl v14.8h, v4.8b, v12.8b //p1+q2 L
mla v14.8h, v18.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2L
uaddl v4.8h, v5.8b, v13.8b //p1+q2H
mla v4.8h, v26.8h , v28.8h //p1 + X2(p0) + X2(q0) + X2(q1) + q2H
rshrn v24.8b, v24.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; L q0'
rshrn v25.8b, v16.8h, #2 //(X2(q1) + q0 + p1 + 2) >> 2; H q0'
mov v24.d[1] , v25.d[0]
uaddw v18.8h, v18.8h , v12.8b //p0 + q0 + q1 + q2 L
uaddw v26.8h, v26.8h , v13.8b //p0 + q0 + q1 + q2 H
rshrn v16.8b, v14.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 L qo"
mov v14.16b, v31.16b
rshrn v17.8b, v4.8h, #3 //(p1 + X2(p0) + X2(q0) + X2(q1) + q2 + 4) >> 3 H qo"
mov v16.d[1] , v17.d[0]
rshrn v4.8b, v18.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 L q1'
rshrn v5.8b, v26.8h, #2 //p0 + q0 + q1 + q2 + 2)>>2 H q1'
mov v4.d[1] , v5.d[0]
bit v24.16b, v16.16b , v30.16b //q0' or q0"
bic v30.16b, v30.16b , v22.16b //final condn for q's
trn1 v31.8b, v0.8b, v2.8b
trn2 v2.8b, v0.8b, v2.8b //row1 &2
mov v0.8b, v31.8b
bit v10.16b, v4.16b , v30.16b
mov v11.d[0] , v10.d[1]
mov v25.d[0] , v24.d[1]
mov v31.d[0] , v30.d[1]
trn1 v31.8b, v1.8b, v3.8b
trn2 v3.8b, v1.8b, v3.8b //row9 &10
mov v1.8b, v31.8b
uaddl v16.8h, v12.8b, v14.8b //q2+q3 L
trn1 v31.8b, v20.8b, v6.8b
trn2 v6.8b, v20.8b, v6.8b //row3&row4
mov v20.8b , v31.8b
uaddl v4.8h, v13.8b, v15.8b //q2+q3 H
trn1 v31.8b, v21.8b, v7.8b
trn2 v7.8b, v21.8b, v7.8b //row11 & 12
mov v21.8b , v31.8b
mla v18.8h, v16.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 L
trn1 v31.4h, v2.4h, v6.4h
trn2 v6.4h, v2.4h, v6.4h //row2 & row4
mov v2.8b, v31.8b
mla v26.8h, v4.8h , v28.8h //X2(q3) + X3(q2) + q1 + q0 + p0 H
trn1 v31.4h, v3.4h, v7.4h
trn2 v7.4h, v3.4h, v7.4h //row10 & 12
mov v3.8b , v31.8b
bif v8.16b, v24.16b , v22.16b //final q0
mov v9.d[0] , v8.d[1]
trn1 v31.4h, v0.4h, v20.4h
trn2 v20.4h, v0.4h, v20.4h //row1 & 3
mov v0.8b , v31.8b
rshrn v18.8b, v18.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; L
trn1 v31.4h, v1.4h, v21.4h
trn2 v21.4h, v1.4h, v21.4h //row9 & row11
mov v1.8b, v31.8b
rshrn v19.8b, v26.8h, #3 //(X2(q3) + X3(q2) + q1 + q0 + p0 + 4) >> 3; H
mov v18.d[1] , v19.d[0]
trn1 v31.8b, v8.8b, v10.8b
trn2 v10.8b, v8.8b, v10.8b //row5&6
mov v8.8b, v31.8b
bit v12.16b, v18.16b , v30.16b //final q2
mov v13.d[0] , v12.d[1]
trn1 v31.8b, v9.8b, v11.8b
trn2 v11.8b, v9.8b, v11.8b //row13 &14
mov v9.8b, v31.8b
trn1 v31.8b, v12.8b, v14.8b
trn2 v14.8b, v12.8b, v14.8b //row7 & 8
mov v12.8b, v31.8b
trn1 v31.8b, v13.8b, v15.8b
trn2 v15.8b, v13.8b, v15.8b //row15 & 16
mov v13.8b , v31.8b
trn1 v31.4h, v10.4h, v14.4h
trn2 v14.4h, v10.4h, v14.4h //row6 & row8
mov v10.8b, v31.8b
trn1 v31.4h, v11.4h, v15.4h
trn2 v15.4h, v11.4h, v15.4h //row14 & row16
mov v11.8b, v31.8b
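//reverse transpose continues: 16-bit step on v8/v12 and v9/v13
//(the former "Q3->p0, Q7->q3" note belonged to the forward transpose)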
trn1 v31.4h, v8.4h, v12.4h
trn2 v12.4h, v8.4h, v12.4h //row 5 & 7
mov v8.8b, v31.8b
trn1 v31.4h, v9.4h, v13.4h
trn2 v13.4h, v9.4h, v13.4h //row13 & row15
mov v9.8b, v31.8b
sub x0, x0, x1, lsl#4 //restore pointer
trn1 v31.2s, v6.2s, v14.2s
trn2 v14.2s, v6.2s, v14.2s //row4 & 8
mov v6.8b , v31.8b
trn1 v31.2s, v7.2s, v15.2s
trn2 v15.2s, v7.2s, v15.2s //row 12 & 16
mov v7.8b, v31.8b
trn1 v31.2s, v0.2s, v8.2s
trn2 v8.2s, v0.2s, v8.2s //row1 & row5
mov v0.8b , v31.8b
trn1 v31.2s, v1.2s, v9.2s
trn2 v9.2s, v1.2s, v9.2s //row9 & 13
mov v1.8b , v31.8b
trn1 v31.2s, v2.2s, v10.2s
trn2 v10.2s, v2.2s, v10.2s //row2 &6
mov v2.8b , v31.8b
trn1 v31.2s, v3.2s, v11.2s
trn2 v11.2s, v3.2s, v11.2s //row10&row14
mov v3.8b , v31.8b
trn1 v31.2s, v20.2s, v12.2s
trn2 v12.2s, v20.2s, v12.2s //row3 & 7
mov v20.8b , v31.8b
trn1 v31.2s, v21.2s, v13.2s
trn2 v13.2s, v21.2s, v13.2s //row11 & row15
mov v21.8b, v31.8b
st1 {v0.8b}, [x0], x1 //row1
st1 {v2.8b}, [x0], x1 //row2
st1 {v20.8b}, [x0], x1 //row3
st1 {v6.8b}, [x0], x1 //row4
st1 {v8.8b}, [x0], x1 //row5
st1 {v10.8b}, [x0], x1 //row6
st1 {v12.8b}, [x0], x1 //row7
st1 {v14.8b}, [x0], x1 //row8
st1 {v1.8b}, [x0], x1 //row9
st1 {v3.8b}, [x0], x1 //row10
st1 {v21.8b}, [x0], x1 //row11
st1 {v7.8b}, [x0], x1 //row12
st1 {v9.8b}, [x0], x1 //row13
st1 {v11.8b}, [x0], x1 //row14
st1 {v13.8b}, [x0], x1 //row15
st1 {v15.8b}, [x0], x1 //row16
// LDMFD sp!,{x12,pc}
ldp x19, x20, [sp], #16
pop_v_regs
ret