Cemu/dependencies/ih264d/common/arm/ih264_deblk_chroma_a9.s

1338 lines
49 KiB
ArmAsm
Raw Normal View History

2022-08-22 22:21:23 +02:00
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@/*****************************************************************************/
@/* */
@/* File Name : ih264_deblk_chroma_a9.s */
@/* */
@/* Description : Contains function definitions for deblocking chroma */
@/* edge. Functions are coded in NEON assembly and can */
@/* be compiled using ARM RVDS. */
@/* */
@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */
@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */
@/* ih264_deblk_chroma_horz_bs4_bp_a9() */
@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */
@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */
@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */
@/* ih264_deblk_chroma_vert_bs4_a9() */
@/* ih264_deblk_chroma_vert_bslt4_a9() */
@/* ih264_deblk_chroma_horz_bs4_a9() */
@/* ih264_deblk_chroma_horz_bslt4_a9() */
@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */
@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */
@/* */
@/* Issues / Problems : None */
@/* */
@/* Revision History : */
@/* */
@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
@/* 28 11 2013 Ittiam Draft */
@/* 05 01 2015 Kaushik Added double-call functions for */
@/* Senthoor vertical deblocking, and high */
@/* profile functions. */
@/* */
@/*****************************************************************************/
.text
.p2align 2
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block horizontal edge when the
@* boundary strength is set to 4
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_horz_bs4_bp_a9
ih264_deblk_chroma_horz_bs4_bp_a9:
@ Strong filter across a horizontal chroma edge, using a single alpha/beta
@ pair for both chroma components.
@ Four rows (p1, p0, q0, q1) of 16 bytes each are loaded, starting two rows
@ above r0; only the p0 and q0 rows are written back.
@ vld2.8 / vst2.8 split even and odd bytes into separate D registers, which
@ the annotations below label as the U and V samples.
stmfd sp!, {r4, lr} @save r4 (used as the write pointer) and the return address
vpush {d8 - d15} @save d8-d15; Q4, Q5 and Q7 are overwritten below
sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
mov r4, r0 @Keeping a backup of the pointer p0 of chroma
vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
vdup.8 q10, r2 @Q10 contains alpha
vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
vaddl.u8 q4, d6, d0 @Q4 = p1u + q0u, widened to 16 bit
vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
vmov.i8 d31, #2 @D31 = 2 in every byte, multiplier for the 2*x terms
vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
vmlal.u8 q4, d2, d31 @Q4 += 2*q1u
vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
vaddl.u8 q7, d4, d2 @Q7 = p0u + q1u, widened to 16 bit
vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
vdup.8 q8, r3 @Q8 contains beta
vmlal.u8 q7, d6, d31 @Q7 += 2*p1u
vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
vrshrn.u16 d8, q4, #2 @D8 = filtered q0u
vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 ; reads Q5 before D10/D11 (same registers) are overwritten below
vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
vrshrn.u16 d10, q7, #2 @D10 = filtered p0u
vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
vbit q5, q2, q9 @where Q9 is set (a threshold test failed) put the original p0 back
vbit q4, q0, q9 @where Q9 is set put the original q0 back
vst2.8 {d10, d11}, [r4], r1 @write the p0 row, re-interleaving D10/D11
vst2.8 {d8, d9}, [r4] @write the q0 row
vpop {d8 - d15}
ldmfd sp!, {r4, pc} @restore r4 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge when the
@* boundary strength is set to 4
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bs4_bp_a9
ih264_deblk_chroma_vert_bs4_bp_a9:
@ Strong filter across a vertical chroma edge, using a single alpha/beta pair.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, eight such loads give:
@   Q0 = p1, Q1 = p0, Q2 = q0, Q3 = q1   (two bytes per row, eight rows)
@ The matching vst4.16 sequence writes all four columns back; Q0 and Q3 are
@ never modified, so only p0 and q0 change in memory.
stmfd sp!, {r12, r14} @save r12 (used as the write pointer) and the return address
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
sub r0, r0, #4 @point r0 to p1u of row0.
mov r12, r0 @keep a back up of r0 for buffer write
vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 @row 3
vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @row 4
vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 @row 5
vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 @row 6
vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 @row 7
vdup.8 q11, r2 @Q11 = alpha
vdup.8 q12, r3 @Q12 = beta
vmov.i8 d31, #2 @D31 = 2 in every byte, multiplier for the 2*x terms
vabd.u8 q4, q1, q2 @|p0-q0|
vabd.u8 q5, q3, q2 @|q1-q0|
vabd.u8 q6, q0, q1 @|p1-p0|
vaddl.u8 q7, d2, d6
vaddl.u8 q8, d3, d7 @(p0 + q1)
vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
vmlal.u8 q7, d0, d31
vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
vaddl.u8 q9, d0, d4
vaddl.u8 q10, d1, d5 @(p1 + q0)
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
vmlal.u8 q9, d6, d31
vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
vrshrn.i16 d14, q7, #2
vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vrshrn.i16 d18, q9, #2
vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
vbit q1, q7, q4 @p0 = filtered p0 where all three tests passed
vbit q2, q9, q4 @q0 = filtered q0 where all three tests passed
vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 @row 3
vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 @row 4
vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 @row 5
vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 @row 6
vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 @row 7
vpop {d8 - d15}
ldmfd sp!, {r12, pc} @restore r12 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block horizontal edge for cases where the
@* boundary strength is less than 4
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@* tc0_table
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_horz_bslt4_bp_a9
ih264_deblk_chroma_horz_bslt4_bp_a9:
@ Normal (clipped-delta) filter across a horizontal chroma edge, using a single
@ alpha/beta pair and a single clip table for both chroma components.
@ Four rows (p1, p0, q0, q1) of 16 bytes each are loaded, starting two rows
@ above r0; only the p0 and q0 rows are written back.
@ The four bytes of u4_bs are each applied to two adjacent samples per component.
stmfd sp!, {r4-r6, lr} @16 bytes pushed, so the stack arguments start at [sp, #0x10]
ldrd r4, r5, [sp, #0x10] @r4 = u4_bs , r5 = pu1_cliptab
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U (two rows above the edge)
rev r4, r4 @byte-reverse u4_bs so its most significant byte lands in byte lane 0
vmov.32 d12[0], r4 @d12[0] = ui_Bs
vld1.32 d16[0], [r5] @D16[0] contains cliptab
vld2.8 {d6, d7}, [r0], r1 @Q3=p1
vtbl.8 d14, {d16}, d12 @D14 bytes 0-3 = cliptab[bS] (C0); the other bytes are not used
vmovl.u8 q6, d12 @q6 = uc_Bs in each 16 bit scalar
mov r6, r0 @Keeping a backup of the pointer to chroma U P0
vld2.8 {d4, d5}, [r0], r1 @Q2=p0
vmov.i8 d30, #1 @D30 = 1 in every byte
vdup.8 q10, r2 @Q10 contains alpha
vld2.8 {d0, d1}, [r0], r1 @Q0=q0
vmovl.u8 q7, d14 @D14 = C0 widened to one 16-bit lane per bS value
vld2.8 {d2, d3}, [r0] @Q1=q1
vsubl.u8 q5, d1, d5 @
vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
vsli.16 q7, q7, #8 @copy each C0 into both bytes of its 16-bit lane (one C0 per two samples)
vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
vdup.8 q8, r3 @Q8 contains beta
vadd.i16 q4, q4, q10 @
vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
vcgt.s16 d12, d12, #0 @D12 = (us_Bs > 0), one 16-bit mask per bS value
vqrshrn.s16 d8, q4, #3 @
vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
vadd.i8 d14, d14, d30 @D14 = C = C0+1
vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
vabs.s8 q3, q4 @Q3 = ABS (i_macro)
vmov.i8 d15, d14 @D15 = D14: same C for the second component
vmov.i8 d13, d12 @D13 = D12: same bS mask for the second component
vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
vbic q6, q6, q9 @final condition: bS > 0 and no threshold test failed
vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
vand q7, q7, q6 @Making delta zero in places where values should not be filtered
vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
vst2.8 {d16, d17}, [r6], r1 @write the p0 row
vst2.8 {d0, d1}, [r6] @write the q0 row
vpop {d8 - d15}
ldmfd sp!, {r4-r6, pc} @restore r4-r6 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge for cases where the
@* boundary strength is less than 4
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@* tc0_table
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bslt4_bp_a9
ih264_deblk_chroma_vert_bslt4_bp_a9:
@ Normal (clipped-delta) filter across a vertical chroma edge, using a single
@ alpha/beta pair and a single clip table.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, eight such loads give:
@   Q0 = p1, Q1 = p0, Q2 = q0, Q3 = q1   (two bytes per row, eight rows)
@ Each of the four bytes of u4_bs is expanded to four byte lanes, i.e. two rows.
stmfd sp!, {r10-r12, r14} @16 bytes pushed, so the stack arguments start at [sp, #16]
sub r0, r0, #4 @point r0 to p1u of row0.
ldr r11, [sp, #16] @r11 = ui_Bs
ldr r10, [sp, #20] @r10 = puc_ClipTab
mov r12, r0 @keep a back up of r0 for buffer write
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 @row 3
vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @row 4
vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 @row 5
vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 @row 6
vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 @row 7
vdup.8 q11, r2 @Q11 = alpha
vabd.u8 q4, q1, q2 @|p0-q0|
vdup.8 q12, r3 @Q12 = beta
vabd.u8 q5, q3, q2 @|q1-q0|
vabd.u8 q6, q0, q1 @|p1-p0|
vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
vsubl.u8 q7, d0, d6
vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
vsubl.u8 q8, d1, d7 @(p1 - q1)
vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
vsubl.u8 q9, d4, d2
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
vsubl.u8 q10, d5, d3 @(q0 - p0)
vmov.u16 q14, #4 @Q14 = 4 in every 16-bit lane
vld1.32 {d24[0]}, [r10] @Load ClipTable (first four bytes; beta in Q12 is no longer needed)
rev r11, r11 @Blocking strengths: byte-reverse so the most significant byte lands in lane 0
vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vmov.32 d10[0], r11 @D10[0] = reversed bS word (Q5 mask was already folded into Q4)
vmla.s16 q7, q9, q14
vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
vmovl.u8 q5, d10 @one bS value per 16-bit lane of D10
vsli.u16 d10, d10, #8 @each bS value now fills both bytes of its 16-bit lane
vmovl.u16 q5, d10 @one doubled bS value per 32-bit lane of Q5
vsli.u32 q5, q5, #16 @each bS value now fills all four bytes of its 32-bit lane
vtbl.8 d12, {d24}, d10
vtbl.8 d13, {d24}, d11 @tC0
vmov.u8 q12, #1 @Q12 = 1 in every byte
vadd.u8 q6, q6, q12 @tC0 + 1
vcge.u8 q5, q5, q12 @u4_bS > 0 ?
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
@ Q0 - Q3(inputs),
@ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
@ Q6 (tC)
vrshr.s16 q7, q7, #3
vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
vcgt.s16 q9, q7, #0
vcgt.s16 q10, q8, #0
vmovn.i16 d18, q9
vmovn.i16 d19, q10 @Q9 = sign(delta): all ones where delta > 0
vabs.s16 q7, q7
vabs.s16 q8, q8
vmovn.u16 d14, q7
vmovn.u16 d15, q8 @Q7 = unclipped |delta| as bytes
vmin.u8 q7, q7, q6 @Q7 = |delta| clipped to tC
vqadd.u8 q10, q1, q7 @p0+|delta|
vqadd.u8 q11, q2, q7 @q0+|delta|
vqsub.u8 q12, q1, q7 @p0-|delta|
vqsub.u8 q13, q2, q7 @q0-|delta|
vbit q12, q10, q9 @p0 + delta
vbit q11, q13, q9 @q0 - delta
vbit q1, q12, q4 @p0 = filtered p0 where the Q4 condition holds
vbit q2, q11, q4 @q0 = filtered q0 where the Q4 condition holds
vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 @row 3
vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 @row 4
vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 @row 5
vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 @row 6
vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 @row 7
vpop {d8 - d15}
ldmfd sp!, {r10-r12, pc} @restore r10-r12 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge when the
@* boundary strength is set to 4 on calling twice
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
@ Strong filter across a vertical chroma edge, processing four rows per call,
@ using a single alpha/beta pair.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, four such loads give:
@   D0 = p1, D1 = p0, D2 = q0, D3 = q1   (two bytes per row, four rows)
stmfd sp!, {r12, r14} @save r12 (used as the write pointer) and the return address
vpush {d8 - d15} @save d8-d15; D9, D11 and D12 are overwritten below
sub r0, r0, #4 @point r0 to p1u of row0.
mov r12, r0 @keep a back up of r0 for buffer write
vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 @row 3
vdup.8 d11, r2 @D11 = alpha
vdup.8 d12, r3 @D12 = beta
vmov.i8 d31, #2 @D31 = 2 in every byte, multiplier for the 2*x terms
vabd.u8 d4, d1, d2 @|p0-q0|
vabd.u8 d5, d3, d2 @|q1-q0|
vabd.u8 d6, d0, d1 @|p1-p0|
vaddl.u8 q14, d1, d3 @(p0 + q1)
vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
vaddl.u8 q13, d0, d2 @(p1 + q0)
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
vbit d1, d7, d4 @p0 = filtered p0 where all three tests passed
vbit d2, d9, d4 @q0 = filtered q0 where all three tests passed
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 @row 3
vpop {d8 - d15}
ldmfd sp!, {r12, pc} @restore r12 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge for cases where the
@* boundary strength is less than 4 on calling twice
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha
@* Alpha Value for the boundary
@*
@* @param[in] r3 - beta
@* Beta Value for the boundary
@*
@* @param[in] sp(0) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(4) - pu1_cliptab
@* tc0_table
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
@ Normal (clipped-delta) filter across a vertical chroma edge, processing four
@ rows per call, using a single alpha/beta pair and a single clip table.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, four such loads give:
@   D0 = p1, D1 = p0, D2 = q0, D3 = q1   (two bytes per row, four rows)
@ Each of the four bytes of u4_bs is expanded to two byte lanes, i.e. one row.
stmfd sp!, {r10-r12, r14} @16 bytes pushed, so the stack arguments start at [sp, #16]
sub r0, r0, #4 @point r0 to p1u of row0.
ldr r11, [sp, #16] @r11 = ui_Bs
ldr r10, [sp, #20] @r10 = puc_ClipTab
mov r12, r0 @keep a back up of r0 for buffer write
vpush {d8 - d15} @save d8-d15; D9-D13 are overwritten below
vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 @row 3
vdup.8 d11, r2 @D11 = alpha
vabd.u8 d4, d1, d2 @|p0-q0|
vdup.8 d12, r3 @D12 = beta
vabd.u8 d5, d3, d2 @|q1-q0|
vabd.u8 d6, d0, d1 @|p1-p0|
vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
vsubl.u8 q14, d0, d3 @(p1 - q1)
vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
vsubl.u8 q12, d2, d1 @(q0 - p0)
vmov.u16 q10, #4 @Q10 = 4 in every 16-bit lane
vld1.32 {d31[0]}, [r10] @Load ClipTable (first four bytes)
rev r11, r11 @Blocking strengths: byte-reverse so the most significant byte lands in lane 0
vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vmov.32 d22[0], r11 @D22[0] = reversed bS word
vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
vmovl.u8 q11, d22 @one bS value per 16-bit lane of D22
vsli.u16 d22, d22, #8 @each bS value now fills both bytes of its 16-bit lane
vtbl.8 d6, {d31}, d22 @tC0 (D6 mask was already folded into D4)
vmov.u8 d12, #1 @D12 = 1 in every byte (beta is no longer needed)
vadd.u8 d6, d6, d12 @tC0 + 1
vcge.u8 d5, d22, d12 @u4_bS > 0 ?
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
@ D0 - D3(inputs),
@ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
@ D6 (tC)
vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
vcgt.s16 q13, q14, #0
vmovn.i16 d9, q13 @D9 = sign(delta): all ones where delta > 0
vabs.s16 q14, q14
vmovn.u16 d7, q14 @D7 = unclipped |delta| as bytes
vmin.u8 d7, d7, d6 @D7 = |delta| clipped to tC
vqadd.u8 d10, d1, d7 @p0+|delta|
vqadd.u8 d11, d2, d7 @q0+|delta|
vqsub.u8 d12, d1, d7 @p0-|delta|
vqsub.u8 d13, d2, d7 @q0-|delta|
vbit d12, d10, d9 @p0 + delta
vbit d11, d13, d9 @q0 - delta
vbit d1, d12, d4 @p0 = filtered p0 where the D4 condition holds
vbit d2, d11, d4 @q0 = filtered q0 where the D4 condition holds
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 @row 3
vpop {d8 - d15}
ldmfd sp!, {r10-r12, pc} @restore r10-r12 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block horizontal edge when the
@* boundary strength is set to 4 in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_horz_bs4_a9
ih264_deblk_chroma_horz_bs4_a9:
@ Strong filter across a horizontal chroma edge with separate alpha/beta
@ thresholds for the two chroma components.
@ Four rows (p1, p0, q0, q1) of 16 bytes each are loaded, starting two rows
@ above r0; only the p0 and q0 rows are written back.
@ vld2.8 puts even bytes (labelled U) in the low D register of each pair and
@ odd bytes (labelled V) in the high one, so the low half of each threshold
@ register holds the cb value and the high half the cr value.
stmfd sp!, {r4-r6, lr} @16 bytes pushed, so the stack arguments start at [sp, #16]
ldr r5, [sp, #16] @R5 = alpha_cr
ldr r6, [sp, #20] @R6 = beta_cr
vpush {d8 - d15} @save d8-d15; Q4, Q5 and Q7 are overwritten below
sub r0, r0, r1, lsl #1 @R0 = uc_edgePixel pointing to p1 of chroma
vld2.8 {d6, d7}, [r0], r1 @D6 = p1u , D7 = p1v
mov r4, r0 @Keeping a backup of the pointer p0 of chroma
vld2.8 {d4, d5}, [r0], r1 @D4 = p0u , D5 = p0v
vdup.8 d20, r2 @D20 contains alpha_cb
vdup.8 d21, r5 @D21 contains alpha_cr
vld2.8 {d0, d1}, [r0], r1 @D0 = q0u , D1 = q0v
vaddl.u8 q4, d6, d0 @Q4 = p1u + q0u, widened to 16 bit
vaddl.u8 q5, d7, d1 @Q4,Q5 = q0 + p1
vmov.i8 d31, #2 @D31 = 2 in every byte, multiplier for the 2*x terms
vld2.8 {d2, d3}, [r0] @D2 = q1u , D3 = q1v
vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
vmlal.u8 q4, d2, d31 @Q4 += 2*q1u
vmlal.u8 q5, d3, d31 @Q5,Q4 = (X2(q1U) + q0U + p1U)
vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
vaddl.u8 q7, d4, d2 @Q7 = p0u + q1u, widened to 16 bit
vaddl.u8 q14, d5, d3 @Q14,Q7 = P0 + Q1
vdup.8 d16, r3 @D16 contains beta_cb
vdup.8 d17, r6 @D17 contains beta_cr
vmlal.u8 q7, d6, d31 @Q7 += 2*p1u
vmlal.u8 q14, d7, d31 @Q14,Q7 = (X2(p1U) + p0U + q1U)
vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
vrshrn.u16 d8, q4, #2 @D8 = filtered q0u
vrshrn.u16 d9, q5, #2 @Q4 = (X2(q1U) + q0U + p1U + 2) >> 2 ; reads Q5 before D10/D11 (same registers) are overwritten below
vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
vrshrn.u16 d10, q7, #2 @D10 = filtered p0u
vrshrn.u16 d11, q14, #2 @Q5 = (X2(p1U) + p0U + q1U + 2) >> 2
vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
vbit q5, q2, q9 @where Q9 is set (a threshold test failed) put the original p0 back
vbit q4, q0, q9 @where Q9 is set put the original q0 back
vst2.8 {d10, d11}, [r4], r1 @write the p0 row, re-interleaving D10/D11
vst2.8 {d8, d9}, [r4] @write the q0 row
vpop {d8 - d15}
ldmfd sp!, {r4-r6, pc} @restore r4-r6 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge when the
@* boundary strength is set to 4 in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bs4_a9
ih264_deblk_chroma_vert_bs4_a9:
@ Strong filter across a vertical chroma edge with separate alpha/beta
@ thresholds for the two chroma components.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, eight such loads give:
@   Q0 = p1, Q1 = p0, Q2 = q0, Q3 = q1   (two bytes per row, eight rows)
@ The cb and cr thresholds are packed into one 16-bit value (cb in the low
@ byte) and replicated with vdup.16, so they alternate byte by byte in step
@ with the two-byte samples.
@ NOTE(review): the packing with "add ... lsl #8" assumes each threshold fits
@ in 8 bits - confirm against the callers.
stmfd sp!, {r4, r5, r12, r14} @16 bytes pushed, so the stack arguments start at [sp, #16]
sub r0, r0, #4 @point r0 to p1u of row0.
mov r12, r0 @keep a back up of r0 for buffer write
ldr r4, [sp, #16] @r4 = alpha_cr
ldr r5, [sp, #20] @r5 = beta_cr
add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 @row 3
vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @row 4
vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 @row 5
vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 @row 6
vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 @row 7
vdup.16 q11, r2 @Q11 = alpha
vdup.16 q12, r3 @Q12 = beta
vmov.i8 d31, #2 @D31 = 2 in every byte, multiplier for the 2*x terms
vabd.u8 q4, q1, q2 @|p0-q0|
vabd.u8 q5, q3, q2 @|q1-q0|
vabd.u8 q6, q0, q1 @|p1-p0|
vaddl.u8 q7, d2, d6
vaddl.u8 q8, d3, d7 @(p0 + q1)
vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
vmlal.u8 q7, d0, d31
vmlal.u8 q8, d1, d31 @2*p1 + (p0 + q1)
vaddl.u8 q9, d0, d4
vaddl.u8 q10, d1, d5 @(p1 + q0)
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
vmlal.u8 q9, d6, d31
vmlal.u8 q10, d7, d31 @2*q1 + (p1 + q0)
vrshrn.i16 d14, q7, #2
vrshrn.i16 d15, q8, #2 @(2*p1 + (p0 + q1) + 2) >> 2
vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vrshrn.i16 d18, q9, #2
vrshrn.i16 d19, q10, #2 @(2*q1 + (p1 + q0) + 2) >> 2
vbit q1, q7, q4 @p0 = filtered p0 where all three tests passed
vbit q2, q9, q4 @q0 = filtered q0 where all three tests passed
vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 @row 3
vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 @row 4
vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 @row 5
vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 @row 6
vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 @row 7
vpop {d8 - d15}
ldmfd sp!, {r4, r5, r12, pc} @restore r4, r5, r12 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block horizontal edge for cases where the
@* boundary strength is less than 4 in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@* tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@* tc0_table for V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_horz_bslt4_a9
ih264_deblk_chroma_horz_bslt4_a9:
@ Normal (clipped-delta) filter across a horizontal chroma edge with separate
@ alpha/beta thresholds and separate clip tables for the two chroma components.
@ Four rows (p1, p0, q0, q1) of 16 bytes each are loaded, starting two rows
@ above r0; only the p0 and q0 rows are written back.
@ vld2.8 puts even bytes (labelled U) in the low D register of each pair and
@ odd bytes (labelled V) in the high one, so per-component values are kept in
@ the low (cb) and high (cr) halves of the Q registers.
stmfd sp!, {r4-r9, lr} @seven registers = 28 bytes pushed, so the stack arguments start at [sp, #28]
ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
ldr r7, [sp, #36] @R7 = u4_bs
ldrd r8, r9, [sp, #40] @R8 = pu1_cliptab_cb , R9 = pu1_cliptab_cr
sub r0, r0, r1, lsl #1 @R0 = uc_edgePixelU pointing to p1 of chroma U
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
rev r7, r7 @byte-reverse u4_bs so its most significant byte lands in byte lane 0
vmov.32 d12[0], r7 @D12[0] = ui_Bs
vld1.32 d16[0], [r8] @D16[0] contains cliptab_cb
vld1.32 d17[0], [r9] @D17[0] contains cliptab_cr
vld2.8 {d6, d7}, [r0], r1 @Q3=p1
vtbl.8 d14, {d16}, d12 @Retrieving cliptab values for U
vtbl.8 d28, {d17}, d12 @Retrieving cliptab values for V
vmovl.u8 q6, d12 @Q6 = uc_Bs in each 16 bit scalar
mov r6, r0 @Keeping a backup of the pointer to chroma U P0
vld2.8 {d4, d5}, [r0], r1 @Q2=p0
vmov.i8 d30, #1 @D30 = 1 in every byte
vdup.8 d20, r2 @D20 contains alpha_cb
vdup.8 d21, r4 @D21 contains alpha_cr
vld2.8 {d0, d1}, [r0], r1 @Q0=q0
vmovl.u8 q7, d14 @D14 = U clip values widened to one 16-bit lane per bS value
vmovl.u8 q14, d28 @D28 = V clip values widened to one 16-bit lane per bS value
vmov.i16 d15, d28 @D14 has cliptab values for U, D15 for V
vld2.8 {d2, d3}, [r0] @Q1=q1
vsubl.u8 q5, d1, d5 @
vsubl.u8 q4, d0, d4 @Q5,Q4 = (q0 - p0)
vabd.u8 q13, q3, q2 @Q13 = ABS(p1 - p0)
vshl.i16 q5, q5, #2 @Q5 = (q0 - p0)<<2
vabd.u8 q11, q2, q0 @Q11 = ABS(p0 - q0)
vshl.i16 q4, q4, #2 @Q4 = (q0 - p0)<<2
vsli.16 q7, q7, #8 @copy each clip value into both bytes of its 16-bit lane (one value per two samples)
vabd.u8 q12, q1, q0 @Q12 = ABS(q1 - q0)
vcge.u8 q9, q11, q10 @Q9 = ( ABS(p0 - q0) >= Alpha )
vsubl.u8 q10, d6, d2 @Q10 = (p1 - q1)L
vsubl.u8 q3, d7, d3 @Q3 = (p1 - q1)H
vdup.8 d16, r3 @D16 contains beta_cb
vdup.8 d17, r5 @D17 contains beta_cr
vadd.i16 q4, q4, q10 @
vadd.i16 q5, q5, q3 @Q5,Q4 = [ (q0 - p0)<<2 ] + (p1 - q1)
vcge.u8 q12, q12, q8 @Q12= ( ABS(q1 - q0) >= Beta )
vcgt.s16 d12, d12, #0 @D12 = (us_Bs > 0), one 16-bit mask per bS value
vqrshrn.s16 d8, q4, #3 @
vqrshrn.s16 d9, q5, #3 @Q4 = i_macro = (((q0 - p0)<<2) + (p1 - q1) + 4)>>3
vadd.i8 d14, d14, d30 @D14 = C = C0+1 for U
vcge.u8 q13, q13, q8 @Q13= ( ABS(p1 - p0) >= Beta )
vorr q9, q9, q12 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta )
vabs.s8 q3, q4 @Q3 = ABS (i_macro)
vadd.i8 d15, d15, d30 @D15 = C = C0+1 for V
vmov.i8 d13, d12 @D13 = D12: same bS mask for V
vorr q9, q9, q13 @Q9 = ( ABS(p0 - q0) >= Alpha ) | ( ABS(q1 - q0) >= Beta ) | ( ABS(p1 - p0) >= Beta )
vmin.u8 q7, q3, q7 @Q7 = delta = (ABS(i_macro) > C) ? C : ABS(i_macro)
vbic q6, q6, q9 @final condition: bS > 0 and no threshold test failed
vcge.s8 q4, q4, #0 @Q4 = (i_macro >= 0)
vand q7, q7, q6 @Making delta zero in places where values should not be filtered
vqadd.u8 q8, q2, q7 @Q8 = p0 + delta
vqsub.u8 q2, q2, q7 @Q2 = p0 - delta
vqadd.u8 q9, q0, q7 @Q9 = q0 + delta
vqsub.u8 q0, q0, q7 @Q0 = q0 - delta
vbif q8, q2, q4 @Q8 = (i_macro >= 0 ) ? (p0+delta) : (p0-delta)
vbif q0, q9, q4 @Q0 = (i_macro >= 0 ) ? (q0-delta) : (q0+delta)
vst2.8 {d16, d17}, [r6], r1 @write the p0 row
vst2.8 {d0, d1}, [r6] @write the q0 row
vpop {d8 - d15}
ldmfd sp!, {r4-r9, pc} @restore r4-r9 and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge for cases where the
@* boundary strength is less than 4 in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@* tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@* tc0_table for V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bslt4_a9
ih264_deblk_chroma_vert_bslt4_a9:
@ Normal (clipped-delta) filter across a vertical chroma edge with separate
@ alpha/beta thresholds and separate clip tables for the two chroma components.
@ Each vld4.16 reads four consecutive 16-bit elements of one row into the same
@ lane of four D registers. With r0 = pu1_src - 4, eight such loads give:
@   Q0 = p1, Q1 = p0, Q2 = q0, Q3 = q1   (two bytes per row, eight rows)
@ The cb and cr thresholds are packed into one 16-bit value (cb in the low
@ byte) and replicated with vdup.16, so they alternate byte by byte in step
@ with the two-byte samples.
@ NOTE(review): the packing with "add ... lsl #8" assumes each threshold fits
@ in 8 bits - confirm against the callers.
stmfd sp!, {r4-r7, r10-r12, r14} @eight registers = 32 bytes pushed, so the stack arguments start at [sp, #32]
sub r0, r0, #4 @point r0 to p1u of row0.
ldrd r4, r5, [sp, #32] @R4 = alpha_cr , R5 = beta_cr
add r2, r2, r4, lsl #8 @r2 = (alpha_cr,alpha_cb)
add r3, r3, r5, lsl #8 @r3 = (beta_cr,beta_cb)
ldr r6, [sp, #40] @R6 = u4_bs
ldrd r10, r11, [sp, #44] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
vpush {d8 - d15} @save d8-d15; Q4-Q7 are overwritten below
mov r12, r0 @keep a back up of R0 for buffer write
vld4.16 {d0[0], d2[0], d4[0], d6[0]}, [r0], r1 @row 0
vld4.16 {d0[1], d2[1], d4[1], d6[1]}, [r0], r1 @row 1
vld4.16 {d0[2], d2[2], d4[2], d6[2]}, [r0], r1 @row 2
vld4.16 {d0[3], d2[3], d4[3], d6[3]}, [r0], r1 @row 3
vld4.16 {d1[0], d3[0], d5[0], d7[0]}, [r0], r1 @row 4
vld4.16 {d1[1], d3[1], d5[1], d7[1]}, [r0], r1 @row 5
vld4.16 {d1[2], d3[2], d5[2], d7[2]}, [r0], r1 @row 6
vld4.16 {d1[3], d3[3], d5[3], d7[3]}, [r0], r1 @row 7
vdup.16 q11, r2 @Q11 = alpha
vabd.u8 q4, q1, q2 @|p0-q0|
vdup.16 q12, r3 @Q12 = beta
vabd.u8 q5, q3, q2 @|q1-q0|
vabd.u8 q6, q0, q1 @|p1-p0|
vclt.u8 q4, q4, q11 @|p0-q0| < alpha ?
vsubl.u8 q7, d0, d6
vclt.u8 q5, q5, q12 @|q1-q0| < beta ?
vsubl.u8 q8, d1, d7 @(p1 - q1)
vclt.u8 q6, q6, q12 @|p1-p0| < beta ?
vsubl.u8 q9, d4, d2
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta
vsubl.u8 q10, d5, d3 @(q0 - p0)
vmov.u16 q14, #4 @Q14 = 4 in every 16-bit lane
vld1.32 {d24[0]}, [r10] @Load ClipTable for U
vld1.32 {d25[0]}, [r11] @Load ClipTable for V
rev r6, r6 @Blocking strengths: byte-reverse so the most significant byte lands in lane 0
vand.u8 q4, q4, q6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vmov.32 d10[0], r6 @D10[0] = reversed bS word (Q5 mask was already folded into Q4)
vmla.s16 q7, q9, q14
vmla.s16 q8, q10, q14 @4*(q0 - p0) + (p1 - q1)
vmovl.u8 q5, d10 @one bS value per 16-bit lane of D10
vsli.u16 d10, d10, #8 @each bS value now fills both bytes of its 16-bit lane
vtbl.8 d12, {d24}, d10 @tC0 for U
vtbl.8 d13, {d25}, d10 @tC0 for V
vzip.8 d12, d13 @interleave the U and V tC0 bytes to match the two-byte sample layout
vmovl.u16 q5, d10 @one doubled bS value per 32-bit lane of Q5
vsli.u32 q5, q5, #16 @each bS value now fills all four bytes of its 32-bit lane
vmov.u8 q12, #1 @Q12 = 1 in every byte
vadd.u8 q6, q6, q12 @tC0 + 1
vcge.u8 q5, q5, q12 @u4_bS > 0 ?
vand.u8 q4, q4, q5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
@ Q0 - Q3(inputs),
@ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
@ Q6 (tC)
vrshr.s16 q7, q7, #3
vrshr.s16 q8, q8, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
vcgt.s16 q9, q7, #0
vcgt.s16 q10, q8, #0
vmovn.i16 d18, q9
vmovn.i16 d19, q10 @Q9 = sign(delta): all ones where delta > 0
vabs.s16 q7, q7
vabs.s16 q8, q8
vmovn.u16 d14, q7
vmovn.u16 d15, q8 @Q7 = unclipped |delta| as bytes
vmin.u8 q7, q7, q6 @Q7 = |delta| clipped to tC
vqadd.u8 q10, q1, q7 @p0+|delta|
vqadd.u8 q11, q2, q7 @q0+|delta|
vqsub.u8 q12, q1, q7 @p0-|delta|
vqsub.u8 q13, q2, q7 @q0-|delta|
vbit q12, q10, q9 @p0 + delta
vbit q11, q13, q9 @q0 - delta
vbit q1, q12, q4 @p0 = filtered p0 where the Q4 condition holds
vbit q2, q11, q4 @q0 = filtered q0 where the Q4 condition holds
vst4.16 {d0[0], d2[0], d4[0], d6[0]}, [r12], r1 @row 0
vst4.16 {d0[1], d2[1], d4[1], d6[1]}, [r12], r1 @row 1
vst4.16 {d0[2], d2[2], d4[2], d6[2]}, [r12], r1 @row 2
vst4.16 {d0[3], d2[3], d4[3], d6[3]}, [r12], r1 @row 3
vst4.16 {d1[0], d3[0], d5[0], d7[0]}, [r12], r1 @row 4
vst4.16 {d1[1], d3[1], d5[1], d7[1]}, [r12], r1 @row 5
vst4.16 {d1[2], d3[2], d5[2], d7[2]}, [r12], r1 @row 6
vst4.16 {d1[3], d3[3], d5[3], d7[3]}, [r12], r1 @row 7
vpop {d8 - d15}
ldmfd sp!, {r4-r7, r10-r12, pc} @restore the saved core registers and return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge when the
@* boundary strength is set to 4. Each call filters 4 rows; it is called twice in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.4 under the title
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bs4_mbaff_a9
ih264_deblk_chroma_vert_bs4_mbaff_a9:
@ Filters 4 rows across a vertical chroma edge. For every row, four 16-bit
@ elements are read starting at pu1_src - 4, the two middle ones (p0, q0) are
@ replaced by their filtered values where the alpha/beta tests pass, and all
@ four elements are written back to the same location.
@ Register roles in this routine:
@   r0      - read pointer, advanced by r1 (src_strd) after each row load
@   r12     - write pointer, copy of the initial r0
@   r2, r3  - alpha / beta with the cb value in the low byte, cr in the next byte
@   d0 - d3 - p1, p0, q0, q1 (lane n of each register belongs to row n)
@   d4      - combined filter mask, d7 / d9 - filtered p0 / q0
@ NOTE(review): each 16-bit element presumably holds one interleaved U,V byte
@ pair, which is why alpha/beta are packed as cb (low byte) and cr (high byte).
stmfd sp!, {r4, r5, r12, r14} @4 registers = 16 bytes pushed, stack arguments now start at sp + 16
sub r0, r0, #4 @point r0 to p1u of row0.
mov r12, r0 @keep a back up of r0 for buffer write
ldrd r4, r5, [sp, #16] @R4 = alpha_cr , R5 = beta_cr
add r2, r2, r4, lsl #8 @R2 = alpha_cb + (alpha_cr << 8)
add r3, r3, r5, lsl #8 @R3 = beta_cb + (beta_cr << 8)
vpush {d8 - d15} @d9, d11 and d12 are overwritten below
vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @row 0: p1, p0, q0, q1 -> lane 0 of d0..d3, then r0 += src_strd
vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 @row 1 -> lane 1
vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 @row 2 -> lane 2
vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 @row 3 -> lane 3
vdup.16 d11, r2 @D11 = alpha
vdup.16 d12, r3 @D12 = beta
vmov.i8 d31, #2 @constant 2 used as multiplier in the vmlal instructions below
vabd.u8 d4, d1, d2 @|p0-q0|
vabd.u8 d5, d3, d2 @|q1-q0|
vabd.u8 d6, d0, d1 @|p1-p0|
vaddl.u8 q14, d1, d3 @(p0 + q1)
vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
vmlal.u8 q14, d0, d31 @2*p1 + (p0 + q1)
vaddl.u8 q13, d0, d2 @(p1 + q0)
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
vmlal.u8 q13, d3, d31 @2*q1 + (p1 + q0)
vrshrn.i16 d7, q14, #2 @(2*p1 + (p0 + q1) + 2) >> 2
vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vrshrn.i16 d9, q13, #2 @(2*q1 + (p1 + q0) + 2) >> 2
vbit d1, d7, d4 @p0 = filtered p0 where the mask in d4 is set, unchanged elsewhere
vbit d2, d9, d4 @q0 = filtered q0 where the mask in d4 is set, unchanged elsewhere
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @write row 0 back, then r12 += src_strd
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 @write row 1 back
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 @write row 2 back
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 @write row 3 back
vpop {d8 - d15} @restore the NEON registers saved above
ldmfd sp!, {r4, r5, r12, pc} @restore core registers; the saved r14 is loaded into pc to return
@**
@*******************************************************************************
@*
@* @brief
@* Performs filtering of a chroma block vertical edge for cases where the
@* boundary strength is less than 4. Each call filters 4 rows; it is called twice in high profile
@*
@* @par Description:
@* This operation is described in Sec. 8.7.2.3 under the title
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@* Pointer to the src sample q0
@*
@* @param[in] r1 - src_strd
@* Source stride
@*
@* @param[in] r2 - alpha_cb
@* Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@* Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@* Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@* Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@* Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@* tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@* tc0_table for V
@*
@* @returns
@* None
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
.global ih264_deblk_chroma_vert_bslt4_mbaff_a9
ih264_deblk_chroma_vert_bslt4_mbaff_a9:
@ Filters 4 rows across a vertical chroma edge using a clipped delta. For
@ every row, four 16-bit elements are read starting at pu1_src - 4; p0 and q0
@ are moved by +/- min(|delta|, tC0 + 1) where the alpha/beta tests pass and
@ the row's boundary strength is non-zero, then all four are written back.
@ Register roles in this routine:
@   r0       - read pointer, advanced by r1 (src_strd) after each row load
@   r12      - write pointer, copy of the initial r0
@   r2, r3   - alpha / beta with the cb value in the low byte, cr in the next byte
@   r6       - u4_bs, r10 / r11 - pu1_cliptab_cb / pu1_cliptab_cr
@   d0 - d3  - p1, p0, q0, q1 (lane n of each register belongs to row n)
@   d4       - combined filter mask, d6 - tC, d7 - clipped |delta|, d9 - delta > 0 mask
@ d11, d12 and d13 are reused as scratch once alpha / beta / the constant 4 are
@ no longer needed.
@ NOTE(review): each 16-bit element presumably holds one interleaved U,V byte
@ pair, which is why per-component values are packed as cb (low byte), cr (high byte).
stmfd sp!, {r4-r6, r10-r12, r14} @7 registers = 28 bytes pushed, stack arguments now start at sp + 28
sub r0, r0, #4 @point r0 to p1u of row0.
mov r12, r0 @keep a back up of r0 for buffer write
ldrd r4, r5, [sp, #28] @R4 = alpha_cr , R5 = beta_cr
add r2, r2, r4, lsl #8 @R2 = alpha_cb + (alpha_cr << 8)
add r3, r3, r5, lsl #8 @R3 = beta_cb + (beta_cr << 8)
ldr r6, [sp, #36] @R6 = u4_bs
ldrd r10, r11, [sp, #40] @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
vpush {d8 - d15} @d9 - d13 are overwritten below
vld4.16 {d0[0], d1[0], d2[0], d3[0]}, [r0], r1 @row 0: p1, p0, q0, q1 -> lane 0 of d0..d3, then r0 += src_strd
vld4.16 {d0[1], d1[1], d2[1], d3[1]}, [r0], r1 @row 1 -> lane 1
vld4.16 {d0[2], d1[2], d2[2], d3[2]}, [r0], r1 @row 2 -> lane 2
vld4.16 {d0[3], d1[3], d2[3], d3[3]}, [r0], r1 @row 3 -> lane 3
vdup.16 d11, r2 @D11 = alpha
vabd.u8 d4, d1, d2 @|p0-q0|
vdup.16 d12, r3 @D12 = beta
vabd.u8 d5, d3, d2 @|q1-q0|
vabd.u8 d6, d0, d1 @|p1-p0|
vclt.u8 d4, d4, d11 @|p0-q0| < alpha ?
vclt.u8 d5, d5, d12 @|q1-q0| < beta ?
vsubl.u8 q14, d0, d3 @(p1 - q1)
vclt.u8 d6, d6, d12 @|p1-p0| < beta ?
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta
vsubl.u8 q12, d2, d1 @(q0 - p0)
vmov.u16 q10, #4 @constant 4 used as multiplier in the vmla below
vld1.32 {d31[1]}, [r10] @Load ClipTable for U
vld1.32 {d31[0]}, [r11] @Load ClipTable for V
rev r6, r6 @Blocking strengths
vand.u8 d4, d4, d6 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
vmov.32 d22[0], r6 @byte-reversed u4_bs -> low word of d22 (original top byte is now byte 0)
vmla.s16 q14, q12, q10 @4*(q0 - p0) + (p1 - q1)
vmovl.u8 q11, d22 @widen the bytes of d22 to 16 bit; d22 then holds one bS per 16-bit lane
vsli.u16 d22, d22, #8 @copy each bS into the high byte of its lane as well (both bytes = bS)
vmov.u16 d13, #4 @byte pattern 4,0,4,0,... : offset for the even bytes only
vadd.u8 d22, d22, d13 @even bytes index d31 bytes 4..7 (table from r10), odd bytes index bytes 0..3 (table from r11)
vtbl.8 d6, {d31}, d22 @tC0
vmov.u8 d12, #1 @constant 1 (beta in d12 is no longer needed)
vsub.u8 d22, d22, d13 @remove the table offset again so d22 holds plain bS values
vadd.u8 d6, d6, d12 @tC0 + 1
vcge.u8 d5, d22, d12 @u4_bS > 0 ?
vand.u8 d4, d4, d5 @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0
@ D0 - D3(inputs),
@ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
@ D6 (tC)
vrshr.s16 q14, q14, #3 @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)
vcgt.s16 q13, q14, #0 @all-ones in lanes where delta > 0
vmovn.i16 d9, q13 @D9 = sign(delta)
vabs.s16 q14, q14 @|delta| as 16 bit
vmovn.u16 d7, q14 @narrow |delta| to 8 bit
vmin.u8 d7, d7, d6 @D7 = |delta|
vqadd.u8 d10, d1, d7 @p0+|delta|
vqadd.u8 d11, d2, d7 @q0+|delta|
vqsub.u8 d12, d1, d7 @p0-|delta|
vqsub.u8 d13, d2, d7 @q0-|delta|
vbit d12, d10, d9 @p0 + delta
vbit d11, d13, d9 @q0 - delta
vbit d1, d12, d4 @p0 = filtered p0 where the mask in d4 is set, unchanged elsewhere
vbit d2, d11, d4 @q0 = filtered q0 where the mask in d4 is set, unchanged elsewhere
vst4.16 {d0[0], d1[0], d2[0], d3[0]}, [r12], r1 @write row 0 back, then r12 += src_strd
vst4.16 {d0[1], d1[1], d2[1], d3[1]}, [r12], r1 @write row 1 back
vst4.16 {d0[2], d1[2], d2[2], d3[2]}, [r12], r1 @write row 2 back
vst4.16 {d0[3], d1[3], d2[3], d3[3]}, [r12], r1 @write row 3 back
vpop {d8 - d15} @restore the NEON registers saved above
ldmfd sp!, {r4-r6, r10-r12, pc} @restore core registers; the saved r14 is loaded into pc to return