mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-12 18:09:10 +01:00
1338 lines
49 KiB
ArmAsm
1338 lines
49 KiB
ArmAsm
@/******************************************************************************
|
|
@ *
|
|
@ * Copyright (C) 2015 The Android Open Source Project
|
|
@ *
|
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
|
@ * you may not use this file except in compliance with the License.
|
|
@ * You may obtain a copy of the License at:
|
|
@ *
|
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
|
@ *
|
|
@ * Unless required by applicable law or agreed to in writing, software
|
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
@ * See the License for the specific language governing permissions and
|
|
@ * limitations under the License.
|
|
@ *
|
|
@ *****************************************************************************
|
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
@*/
|
|
@/*****************************************************************************/
|
|
@/* */
|
|
@/* File Name : ih264_deblk_chroma_a9.s */
|
|
@/* */
|
|
@/* Description : Contains function definitions for deblocking chroma */
|
|
@/* edge. Functions are coded in NEON assembly and can */
|
|
@/* be compiled using ARM RVDS. */
|
|
@/* */
|
|
@/* List of Functions : ih264_deblk_chroma_vert_bs4_bp_a9() */
|
|
@/* ih264_deblk_chroma_vert_bslt4_bp_a9() */
|
|
@/* ih264_deblk_chroma_horz_bs4_bp_a9() */
|
|
@/* ih264_deblk_chroma_horz_bslt4_bp_a9() */
|
|
@/* ih264_deblk_chroma_vert_bs4_mbaff_bp_a9() */
|
|
@/* ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9() */
|
|
@/* ih264_deblk_chroma_vert_bs4_a9() */
|
|
@/* ih264_deblk_chroma_vert_bslt4_a9() */
|
|
@/* ih264_deblk_chroma_horz_bs4_a9() */
|
|
@/* ih264_deblk_chroma_horz_bslt4_a9() */
|
|
@/* ih264_deblk_chroma_vert_bs4_mbaff_a9() */
|
|
@/* ih264_deblk_chroma_vert_bslt4_mbaff_a9() */
|
|
@/* */
|
|
@/* Issues / Problems : None */
|
|
@/* */
|
|
@/* Revision History : */
|
|
@/* */
|
|
@/* DD MM YYYY Author(s) Changes (Describe the changes made) */
|
|
@/* 28 11 2013 Ittiam Draft */
|
|
@/* 05 01 2015 Kaushik Added double-call functions for */
|
|
@/* Senthoor vertical deblocking, and high */
|
|
@/* profile functions. */
|
|
@/* */
|
|
@/*****************************************************************************/
|
|
|
|
|
|
.text
|
|
.p2align 2
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block horizontal edge when the
|
|
@* boundary strength is set to 4
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.4 under the title
|
|
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_horz_bs4_bp_a9
|
|
|
|
ih264_deblk_chroma_horz_bs4_bp_a9:
    @ Register roles (rows are loaded with vld2.8, which splits each row into
    @ its even bytes and its odd bytes -- U and V per the file's convention):
    @   q3 (d6,d7) = p1     q2 (d4,d5) = p0
    @   q0 (d0,d1) = q0     q1 (d2,d3) = q1
    @   q10 = alpha, q8 = beta, d31 = constant 2
    @   r4  = pointer to the p0 row, used for the two stores
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r4, lr}
    vpush       {d8 - d15}
    sub         r0, r0, r1, lsl #1      @r0 = pu1_src - 2*src_strd -> p1 row
    vld2.8      {d6, d7}, [r0], r1      @d6 = p1u, d7 = p1v
    mov         r4, r0                  @r4 -> p0 row (backup for the stores)
    vld2.8      {d4, d5}, [r0], r1      @d4 = p0u, d5 = p0v
    vdup.8      q10, r2                 @q10 = alpha in every byte
    vld2.8      {d0, d1}, [r0], r1      @d0 = q0u, d1 = q0v
    vaddl.u8    q4, d6, d0
    vaddl.u8    q5, d7, d1              @q4,q5 = p1 + q0 (16 bit)
    vmov.i8     d31, #2
    vld2.8      {d2, d3}, [r0]          @d2 = q1u, d3 = q1v
    vabd.u8     q13, q3, q2             @q13 = ABS(p1 - p0)
    vmlal.u8    q4, d2, d31
    vmlal.u8    q5, d3, d31             @q4,q5 = 2*q1 + q0 + p1
    vabd.u8     q11, q2, q0             @q11 = ABS(p0 - q0)
    vabd.u8     q12, q1, q0             @q12 = ABS(q1 - q0)
    vaddl.u8    q7, d4, d2
    vaddl.u8    q14, d5, d3             @q7,q14 = p0 + q1 (16 bit)
    vdup.8      q8, r3                  @q8 = beta in every byte
    vmlal.u8    q7, d6, d31
    vmlal.u8    q14, d7, d31            @q7,q14 = 2*p1 + p0 + q1
    vcge.u8     q9, q11, q10            @q9  = (ABS(p0 - q0) >= alpha)
    vcge.u8     q12, q12, q8            @q12 = (ABS(q1 - q0) >= beta)
    vcge.u8     q13, q13, q8            @q13 = (ABS(p1 - p0) >= beta)
    vrshrn.u16  d8, q4, #2
    vrshrn.u16  d9, q5, #2              @q4 = (2*q1 + q0 + p1 + 2) >> 2 = filtered q0
    vorr        q9, q9, q12
    vrshrn.u16  d10, q7, #2
    vrshrn.u16  d11, q14, #2            @q5 = (2*p1 + p0 + q1 + 2) >> 2 = filtered p0
    vorr        q9, q9, q13             @q9 = "leave unfiltered" mask (any threshold test failed)
    vbit        q5, q2, q9              @restore the original p0 where the mask is set
    vbit        q4, q0, q9              @restore the original q0 where the mask is set
    vst2.8      {d10, d11}, [r4], r1    @write the p0 row, re-interleaving U and V
    vst2.8      {d8, d9}, [r4]          @write the q0 row
    vpop        {d8 - d15}
    ldmfd       sp!, {r4, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block vertical edge when the
|
|
@* boundary strength is set to 4
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.4 under the title
|
|
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_vert_bs4_bp_a9
|
|
|
|
ih264_deblk_chroma_vert_bs4_bp_a9:
    @ Eight rows are processed. Each vld4.16 lane load reads four consecutive
    @ halfwords from one row and scatters them across four registers, which
    @ transposes the data so that:
    @   q0 (d0,d1) = p1     q1 (d2,d3) = p0
    @   q2 (d4,d5) = q0     q3 (d6,d7) = q1
    @ with rows 0-3 in the even d registers and rows 4-7 in the odd ones.
    @   q11 = alpha, q12 = beta, d31 = constant 2
    @   r12 = copy of the load pointer, used for the write-back
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r12, r14}
    vpush       {d8 - d15}
    sub         r0, r0, #4              @step back 4 bytes: r0 -> p1u of row 0
    mov         r12, r0                 @keep a copy of r0 for the buffer write

    vld4.16     {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16     {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16     {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16     {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16     {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16     {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16     {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16     {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.8      q11, r2                 @q11 = alpha
    vdup.8      q12, r3                 @q12 = beta
    vmov.i8     d31, #2

    vabd.u8     q4, q1, q2              @|p0-q0|
    vabd.u8     q5, q3, q2              @|q1-q0|
    vabd.u8     q6, q0, q1              @|p1-p0|
    vaddl.u8    q7, d2, d6
    vaddl.u8    q8, d3, d7              @(p0 + q1)
    vclt.u8     q4, q4, q11             @|p0-q0| < alpha ?
    vclt.u8     q5, q5, q12             @|q1-q0| < beta ?
    vclt.u8     q6, q6, q12             @|p1-p0| < beta ?
    vmlal.u8    q7, d0, d31
    vmlal.u8    q8, d1, d31             @2*p1 + (p0 + q1)
    vaddl.u8    q9, d0, d4
    vaddl.u8    q10, d1, d5             @(p1 + q0)
    vand.u8     q4, q4, q5              @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8    q9, d6, d31
    vmlal.u8    q10, d7, d31            @2*q1 + (p1 + q0)

    vrshrn.i16  d14, q7, #2
    vrshrn.i16  d15, q8, #2             @q7 = (2*p1 + (p0 + q1) + 2) >> 2 = filtered p0
    vand.u8     q4, q4, q6              @q4 = filter mask: all three threshold tests passed
    vrshrn.i16  d18, q9, #2
    vrshrn.i16  d19, q10, #2            @q9 = (2*q1 + (p1 + q0) + 2) >> 2 = filtered q0

    vbit        q1, q7, q4              @p0 <- filtered p0 where the mask is set
    vbit        q2, q9, q4              @q0 <- filtered q0 where the mask is set

    @ Write all four columns back; p1 and q1 are stored unchanged.
    vst4.16     {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16     {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16     {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16     {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16     {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16     {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16     {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16     {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop        {d8 - d15}
    ldmfd       sp!, {r12, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block horizontal edge for cases where the
|
|
@* boundary strength is less than 4
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.3 under the title
|
|
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @param[in] sp(0) - u4_bs
|
|
@* Packed Boundary strength array
|
|
@*
|
|
@* @param[in] sp(4) - pu1_cliptab
|
|
@* tc0_table
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_horz_bslt4_bp_a9
|
|
|
|
ih264_deblk_chroma_horz_bslt4_bp_a9:
    @ Register roles:
    @   r4 = u4_bs (byte-reversed below), r5 = pu1_cliptab
    @   r6 = pointer to the p0 row, used for the two stores
    @   q3 = p1, q2 = p0, q0 = q0, q1 = q1 (vld2.8: even bytes / odd bytes,
    @        i.e. U / V per the file's convention)
    @   q10 = alpha, q8 = beta (q8/d16 first holds the clip table bytes)
    @   q6  = (bS > 0) mask, q7 = C = C0 + 1, later the clipped |delta|
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r4-r6, lr}

    ldrd        r4, r5, [sp, #0x10]     @r4 = u4_bs, r5 = pu1_cliptab (stack args, above the 16 bytes just pushed)
    vpush       {d8 - d15}
    sub         r0, r0, r1, lsl #1      @r0 = pu1_src - 2*src_strd -> p1 row
    rev         r4, r4                  @reverse the byte order of the packed bS word
    vmov.32     d12[0], r4              @d12[0] = four bS bytes
    vld1.32     d16[0], [r5]            @d16[0] = first four bytes of the clip table
    vld2.8      {d6, d7}, [r0], r1      @q3 = p1
    vtbl.8      d14, {d16}, d12         @d14[i] = cliptab[bS[i]] = C0
    vmovl.u8    q6, d12                 @d12 = bS widened to one 16-bit lane each
    mov         r6, r0                  @r6 -> p0 row (backup for the stores)
    vld2.8      {d4, d5}, [r0], r1      @q2 = p0
    vmov.i8     d30, #1
    vdup.8      q10, r2                 @q10 = alpha
    vld2.8      {d0, d1}, [r0], r1      @q0 = q0
    vmovl.u8    q7, d14                 @d14 = C0 widened to one 16-bit lane each
    vld2.8      {d2, d3}, [r0]          @q1 = q1
    vsubl.u8    q5, d1, d5
    vsubl.u8    q4, d0, d4              @q5,q4 = (q0 - p0)
    vabd.u8     q13, q3, q2             @q13 = ABS(p1 - p0)
    vshl.i16    q5, q5, #2              @q5 = (q0 - p0) << 2
    vabd.u8     q11, q2, q0             @q11 = ABS(p0 - q0)
    vshl.i16    q4, q4, #2              @q4 = (q0 - p0) << 2
    vsli.16     q7, q7, #8              @copy C0 into both bytes of each 16-bit lane
    vabd.u8     q12, q1, q0             @q12 = ABS(q1 - q0)
    vcge.u8     q9, q11, q10            @q9 = (ABS(p0 - q0) >= alpha)
    vsubl.u8    q10, d6, d2             @q10 = (p1 - q1), first half (alpha no longer needed)
    vsubl.u8    q3, d7, d3              @q3  = (p1 - q1), second half
    vdup.8      q8, r3                  @q8 = beta (clip table bytes already consumed)
    vadd.i16    q4, q4, q10
    vadd.i16    q5, q5, q3              @q5,q4 = ((q0 - p0) << 2) + (p1 - q1)
    vcge.u8     q12, q12, q8            @q12 = (ABS(q1 - q0) >= beta)
    vcgt.s16    d12, d12, #0            @d12 = (bS > 0), one 16-bit lane per bS value
    vqrshrn.s16 d8, q4, #3
    vqrshrn.s16 d9, q5, #3              @q4 = i_macro = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, saturated to s8
    vadd.i8     d14, d14, d30           @d14 = C = C0 + 1
    vcge.u8     q13, q13, q8            @q13 = (ABS(p1 - p0) >= beta)
    vorr        q9, q9, q12             @q9 = (ABS(p0 - q0) >= alpha) | (ABS(q1 - q0) >= beta)
    vabs.s8     q3, q4                  @q3 = ABS(i_macro)
    vmov.i8     d15, d14                @same C for the second half of q7
    vmov.i8     d13, d12                @same bS mask for the second half of q6
    vorr        q9, q9, q13             @q9 |= (ABS(p1 - p0) >= beta)
    vmin.u8     q7, q3, q7              @q7 = delta = min(ABS(i_macro), C)
    vbic        q6, q6, q9              @q6 = (bS > 0) and no threshold test failed
    vcge.s8     q4, q4, #0              @q4 = (i_macro >= 0)
    vand        q7, q7, q6              @zero delta where the samples must not be filtered
    vqadd.u8    q8, q2, q7              @q8 = p0 + delta (saturating)
    vqsub.u8    q2, q2, q7              @q2 = p0 - delta (saturating)
    vqadd.u8    q9, q0, q7              @q9 = q0 + delta (saturating)
    vqsub.u8    q0, q0, q7              @q0 = q0 - delta (saturating)
    vbif        q8, q2, q4              @q8 = (i_macro >= 0) ? (p0 + delta) : (p0 - delta)
    vbif        q0, q9, q4              @q0 = (i_macro >= 0) ? (q0 - delta) : (q0 + delta)
    vst2.8      {d16, d17}, [r6], r1    @write the p0 row
    vst2.8      {d0, d1}, [r6]          @write the q0 row
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r6, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block vertical edge for cases where the
|
|
@* boundary strength is less than 4
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.3 under the title
|
|
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @param[in] sp(0) - u4_bs
|
|
@* Packed Boundary strength array
|
|
@*
|
|
@* @param[in] sp(4) - pu1_cliptab
|
|
@* tc0_table
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_vert_bslt4_bp_a9
|
|
|
|
ih264_deblk_chroma_vert_bslt4_bp_a9:
    @ Register roles:
    @   r11 = u4_bs, r10 = pu1_cliptab, r12 = copy of the load pointer for
    @         the write-back
    @   After the transposing vld4.16 lane loads (eight rows):
    @   q0 = p1, q1 = p0, q2 = q0, q3 = q1
    @   q11 = alpha, q12 = beta (q12/d24 is reused later for the clip table
    @         and then for the constant 1)
    @   q4  = final filter mask, q6 = tC = tC0 + 1
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r10-r12, r14}

    sub         r0, r0, #4              @step back 4 bytes: r0 -> p1u of row 0
    ldr         r11, [sp, #16]          @r11 = u4_bs (first stack argument)

    ldr         r10, [sp, #20]          @r10 = pu1_cliptab (second stack argument)
    mov         r12, r0                 @keep a copy of r0 for the buffer write
    vpush       {d8 - d15}
    vld4.16     {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16     {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16     {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16     {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16     {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16     {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16     {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16     {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.8      q11, r2                 @q11 = alpha
    vabd.u8     q4, q1, q2              @|p0-q0|
    vdup.8      q12, r3                 @q12 = beta
    vabd.u8     q5, q3, q2              @|q1-q0|
    vabd.u8     q6, q0, q1              @|p1-p0|
    vclt.u8     q4, q4, q11             @|p0-q0| < alpha ?
    vsubl.u8    q7, d0, d6
    vclt.u8     q5, q5, q12             @|q1-q0| < beta ?
    vsubl.u8    q8, d1, d7              @q7,q8 = (p1 - q1)
    vclt.u8     q6, q6, q12             @|p1-p0| < beta ?
    vsubl.u8    q9, d4, d2
    vand.u8     q4, q4, q5              @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8    q10, d5, d3             @q9,q10 = (q0 - p0)
    vmov.u16    q14, #4
    vld1.32     {d24[0]}, [r10]         @d24[0] = first four clip table bytes (beta is no longer needed)
    rev         r11, r11                @reverse the byte order of the packed bS word
    vand.u8     q4, q4, q6              @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta

    vmov.32     d10[0], r11             @d10[0] = four bS bytes

    vmla.s16    q7, q9, q14
    vmla.s16    q8, q10, q14            @q7,q8 = 4*(q0 - p0) + (p1 - q1)

    @ Expand each bS byte to four consecutive bytes of q5:
    @ byte -> halfword -> both bytes of it -> word -> both halves of it.
    vmovl.u8    q5, d10

    vsli.u16    d10, d10, #8
    vmovl.u16   q5, d10
    vsli.u32    q5, q5, #16
    vtbl.8      d12, {d24}, d10
    vtbl.8      d13, {d24}, d11         @q6 = tC0 = cliptab[bS] per byte
    vmov.u8     q12, #1
    vadd.u8     q6, q6, q12             @q6 = tC0 + 1
    vcge.u8     q5, q5, q12             @bS >= 1, i.e. u4_bs > 0 ?
    vand.u8     q4, q4, q5              @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ q0 - q3 (inputs),
    @ q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ q6 (tC)

    vrshr.s16   q7, q7, #3
    vrshr.s16   q8, q8, #3              @q7,q8 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3

    vcgt.s16    q9, q7, #0
    vcgt.s16    q10, q8, #0
    vmovn.i16   d18, q9
    vmovn.i16   d19, q10                @q9 = (delta > 0) mask, one byte per sample
    vabs.s16    q7, q7
    vabs.s16    q8, q8
    vmovn.u16   d14, q7
    vmovn.u16   d15, q8                 @q7 = low byte of |delta| per sample
    vmin.u8     q7, q7, q6              @q7 = min(|delta|, tC)

    vqadd.u8    q10, q1, q7             @p0+|delta|
    vqadd.u8    q11, q2, q7             @q0+|delta|
    vqsub.u8    q12, q1, q7             @p0-|delta|
    vqsub.u8    q13, q2, q7             @q0-|delta|

    vbit        q12, q10, q9            @q12 = p0 + delta (takes p0+|delta| where delta > 0)
    vbit        q11, q13, q9            @q11 = q0 - delta (takes q0-|delta| where delta > 0)

    vbit        q1, q12, q4             @p0 <- filtered p0 where the mask is set
    vbit        q2, q11, q4             @q0 <- filtered q0 where the mask is set

    @ Write all four columns back; p1 and q1 are stored unchanged.
    vst4.16     {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16     {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16     {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16     {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16     {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16     {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16     {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16     {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop        {d8 - d15}
    ldmfd       sp!, {r10-r12, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block vertical edge when the
|
|
@* boundary strength is set to 4 on calling twice
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.4 under the title
|
|
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_vert_bs4_mbaff_bp_a9
|
|
|
|
ih264_deblk_chroma_vert_bs4_mbaff_bp_a9:
    @ Four rows are processed per call. After the transposing vld4.16 lane
    @ loads:
    @   d0 = p1, d1 = p0, d2 = q0, d3 = q1
    @   d11 = alpha, d12 = beta, d31 = constant 2
    @   d4  = filter mask, d7 = filtered p0, d9 = filtered q0
    @   r12 = copy of the load pointer, used for the write-back
    @ d9, d11 and d12 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r12, r14}
    vpush       {d8 - d15}
    sub         r0, r0, #4              @step back 4 bytes: r0 -> p1u of row 0
    mov         r12, r0                 @keep a copy of r0 for the buffer write

    vld4.16     {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16     {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16     {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16     {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.8      d11, r2                 @d11 = alpha
    vdup.8      d12, r3                 @d12 = beta
    vmov.i8     d31, #2

    vabd.u8     d4, d1, d2              @|p0-q0|
    vabd.u8     d5, d3, d2              @|q1-q0|
    vabd.u8     d6, d0, d1              @|p1-p0|
    vaddl.u8    q14, d1, d3             @(p0 + q1)
    vclt.u8     d4, d4, d11             @|p0-q0| < alpha ?
    vclt.u8     d5, d5, d12             @|q1-q0| < beta ?
    vclt.u8     d6, d6, d12             @|p1-p0| < beta ?
    vmlal.u8    q14, d0, d31            @2*p1 + (p0 + q1)
    vaddl.u8    q13, d0, d2             @(p1 + q0)
    vand.u8     d4, d4, d5              @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8    q13, d3, d31            @2*q1 + (p1 + q0)

    vrshrn.i16  d7, q14, #2             @d7 = (2*p1 + (p0 + q1) + 2) >> 2 = filtered p0
    vand.u8     d4, d4, d6              @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16  d9, q13, #2             @d9 = (2*q1 + (p1 + q0) + 2) >> 2 = filtered q0

    vbit        d1, d7, d4              @p0 <- filtered p0 where the mask is set
    vbit        d2, d9, d4              @q0 <- filtered q0 where the mask is set

    @ Write all four columns back; p1 and q1 are stored unchanged.
    vst4.16     {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16     {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16     {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16     {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop        {d8 - d15}
    ldmfd       sp!, {r12, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block vertical edge for cases where the
|
|
@* boundary strength is less than 4 on calling twice
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.3 under the title
|
|
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha
|
|
@* Alpha Value for the boundary
|
|
@*
|
|
@* @param[in] r3 - beta
|
|
@* Beta Value for the boundary
|
|
@*
|
|
@* @param[in] sp(0) - u4_bs
|
|
@* Packed Boundary strength array
|
|
@*
|
|
@* @param[in] sp(4) - pu1_cliptab
|
|
@* tc0_table
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9
|
|
|
|
ih264_deblk_chroma_vert_bslt4_mbaff_bp_a9:
    @ Four rows are processed per call. Register roles:
    @   r11 = u4_bs, r10 = pu1_cliptab, r12 = copy of the load pointer for
    @         the write-back
    @   After the transposing vld4.16 lane loads:
    @   d0 = p1, d1 = p0, d2 = q0, d3 = q1
    @   d11 = alpha, d12 = beta (both reused as scratch further down)
    @   d4  = final filter mask, d6 = tC = tC0 + 1
    @ d9-d13 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r10-r12, r14}

    sub         r0, r0, #4              @step back 4 bytes: r0 -> p1u of row 0
    ldr         r11, [sp, #16]          @r11 = u4_bs (first stack argument)

    ldr         r10, [sp, #20]          @r10 = pu1_cliptab (second stack argument)
    mov         r12, r0                 @keep a copy of r0 for the buffer write
    vpush       {d8 - d15}
    vld4.16     {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16     {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16     {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16     {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.8      d11, r2                 @d11 = alpha
    vabd.u8     d4, d1, d2              @|p0-q0|
    vdup.8      d12, r3                 @d12 = beta
    vabd.u8     d5, d3, d2              @|q1-q0|
    vabd.u8     d6, d0, d1              @|p1-p0|
    vclt.u8     d4, d4, d11             @|p0-q0| < alpha ?
    vclt.u8     d5, d5, d12             @|q1-q0| < beta ?
    vsubl.u8    q14, d0, d3             @(p1 - q1)
    vclt.u8     d6, d6, d12             @|p1-p0| < beta ?
    vand.u8     d4, d4, d5              @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8    q12, d2, d1             @(q0 - p0)
    vmov.u16    q10, #4

    vld1.32     {d31[0]}, [r10]         @d31[0] = first four clip table bytes
    rev         r11, r11                @reverse the byte order of the packed bS word
    vand.u8     d4, d4, d6              @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vmov.32     d22[0], r11             @d22[0] = four bS bytes
    vmla.s16    q14, q12, q10           @4*(q0 - p0) + (p1 - q1)
    vmovl.u8    q11, d22                @d22 = bS widened to one 16-bit lane each
    vsli.u16    d22, d22, #8            @copy bS into both bytes of each 16-bit lane
    vtbl.8      d6, {d31}, d22          @d6 = tC0 = cliptab[bS] per byte
    vmov.u8     d12, #1
    vadd.u8     d6, d6, d12             @d6 = tC0 + 1
    vcge.u8     d5, d22, d12            @bS >= 1, i.e. u4_bs > 0 ?
    vand.u8     d4, d4, d5              @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ d0 - d3 (inputs),
    @ d4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ d6 (tC)

    vrshr.s16   q14, q14, #3            @q14 = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3

    vcgt.s16    q13, q14, #0
    vmovn.i16   d9, q13                 @d9 = (delta > 0) mask, one byte per sample
    vabs.s16    q14, q14
    vmovn.u16   d7, q14                 @d7 = low byte of |delta| per sample
    vmin.u8     d7, d7, d6              @d7 = min(|delta|, tC)

    vqadd.u8    d10, d1, d7             @p0+|delta|
    vqadd.u8    d11, d2, d7             @q0+|delta|
    vqsub.u8    d12, d1, d7             @p0-|delta|
    vqsub.u8    d13, d2, d7             @q0-|delta|

    vbit        d12, d10, d9            @d12 = p0 + delta (takes p0+|delta| where delta > 0)
    vbit        d11, d13, d9            @d11 = q0 - delta (takes q0-|delta| where delta > 0)

    vbit        d1, d12, d4             @p0 <- filtered p0 where the mask is set
    vbit        d2, d11, d4             @q0 <- filtered q0 where the mask is set

    @ Write all four columns back; p1 and q1 are stored unchanged.
    vst4.16     {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16     {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16     {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16     {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop        {d8 - d15}
    ldmfd       sp!, {r10-r12, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block horizontal edge when the
|
|
@* boundary strength is set to 4 in high profile
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.4 under the title
|
|
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha_cb
|
|
@* Alpha Value for the boundary in U
|
|
@*
|
|
@* @param[in] r3 - beta_cb
|
|
@* Beta Value for the boundary in U
|
|
@*
|
|
@* @param[in] sp(0) - alpha_cr
|
|
@* Alpha Value for the boundary in V
|
|
@*
|
|
@* @param[in] sp(4) - beta_cr
|
|
@* Beta Value for the boundary in V
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_horz_bs4_a9
|
|
|
|
ih264_deblk_chroma_horz_bs4_a9:
    @ Same filter as the single-threshold bS==4 horizontal routine, but with
    @ separate thresholds for the two de-interleaved halves of each row.
    @ Register roles:
    @   r5 = alpha_cr, r6 = beta_cr (stack arguments)
    @   r4 = pointer to the p0 row, used for the two stores
    @   q3 = p1, q2 = p0, q0 = q0, q1 = q1 (first d reg = U, second = V)
    @   q10 = {d20 = alpha_cb, d21 = alpha_cr}
    @   q8  = {d16 = beta_cb,  d17 = beta_cr}
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r4-r6, lr}

    ldr         r5, [sp, #16]           @r5 = alpha_cr
    ldr         r6, [sp, #20]           @r6 = beta_cr
    vpush       {d8 - d15}
    sub         r0, r0, r1, lsl #1      @r0 = pu1_src - 2*src_strd -> p1 row
    vld2.8      {d6, d7}, [r0], r1      @d6 = p1u, d7 = p1v
    mov         r4, r0                  @r4 -> p0 row (backup for the stores)
    vld2.8      {d4, d5}, [r0], r1      @d4 = p0u, d5 = p0v
    vdup.8      d20, r2                 @d20 = alpha_cb
    vdup.8      d21, r5                 @d21 = alpha_cr
    vld2.8      {d0, d1}, [r0], r1      @d0 = q0u, d1 = q0v
    vaddl.u8    q4, d6, d0
    vaddl.u8    q5, d7, d1              @q4,q5 = p1 + q0 (16 bit)
    vmov.i8     d31, #2
    vld2.8      {d2, d3}, [r0]          @d2 = q1u, d3 = q1v
    vabd.u8     q13, q3, q2             @q13 = ABS(p1 - p0)
    vmlal.u8    q4, d2, d31
    vmlal.u8    q5, d3, d31             @q4,q5 = 2*q1 + q0 + p1
    vabd.u8     q11, q2, q0             @q11 = ABS(p0 - q0)
    vabd.u8     q12, q1, q0             @q12 = ABS(q1 - q0)
    vaddl.u8    q7, d4, d2
    vaddl.u8    q14, d5, d3             @q7,q14 = p0 + q1 (16 bit)
    vdup.8      d16, r3                 @d16 = beta_cb
    vdup.8      d17, r6                 @d17 = beta_cr
    vmlal.u8    q7, d6, d31
    vmlal.u8    q14, d7, d31            @q7,q14 = 2*p1 + p0 + q1
    vcge.u8     q9, q11, q10            @q9  = (ABS(p0 - q0) >= alpha)
    vcge.u8     q12, q12, q8            @q12 = (ABS(q1 - q0) >= beta)
    vcge.u8     q13, q13, q8            @q13 = (ABS(p1 - p0) >= beta)
    vrshrn.u16  d8, q4, #2
    vrshrn.u16  d9, q5, #2              @q4 = (2*q1 + q0 + p1 + 2) >> 2 = filtered q0
    vorr        q9, q9, q12
    vrshrn.u16  d10, q7, #2
    vrshrn.u16  d11, q14, #2            @q5 = (2*p1 + p0 + q1 + 2) >> 2 = filtered p0
    vorr        q9, q9, q13             @q9 = "leave unfiltered" mask (any threshold test failed)
    vbit        q5, q2, q9              @restore the original p0 where the mask is set
    vbit        q4, q0, q9              @restore the original q0 where the mask is set
    vst2.8      {d10, d11}, [r4], r1    @write the p0 row, re-interleaving U and V
    vst2.8      {d8, d9}, [r4]          @write the q0 row
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r6, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block vertical edge when the
|
|
@* boundary strength is set to 4 in high profile
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.4 under the title
|
|
@* "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha_cb
|
|
@* Alpha Value for the boundary in U
|
|
@*
|
|
@* @param[in] r3 - beta_cb
|
|
@* Beta Value for the boundary in U
|
|
@*
|
|
@* @param[in] sp(0) - alpha_cr
|
|
@* Alpha Value for the boundary in V
|
|
@*
|
|
@* @param[in] sp(4) - beta_cr
|
|
@* Beta Value for the boundary in V
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_vert_bs4_a9
|
|
|
|
ih264_deblk_chroma_vert_bs4_a9:
    @ Eight rows are processed. Register roles:
    @   r4 = alpha_cr, r5 = beta_cr (stack arguments)
    @   r2 = alpha_cb + (alpha_cr << 8), r3 = beta_cb + (beta_cr << 8);
    @        vdup.16 then yields alternating cb/cr threshold bytes, matching
    @        the byte order of the samples inside each 16-bit lane.
    @        NOTE(review): the add only packs cleanly if alpha_cb/beta_cb
    @        fit in 8 bits -- presumably guaranteed by the caller.
    @   After the transposing vld4.16 lane loads:
    @   q0 = p1, q1 = p0, q2 = q0, q3 = q1
    @   q11 = alpha, q12 = beta, d31 = constant 2
    @   r12 = copy of the load pointer, used for the write-back
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r4, r5, r12, r14}

    sub         r0, r0, #4              @step back 4 bytes: r0 -> p1u of row 0
    mov         r12, r0                 @keep a copy of r0 for the buffer write

    ldr         r4, [sp, #16]           @r4 = alpha_cr
    ldr         r5, [sp, #20]           @r5 = beta_cr
    add         r2, r2, r4, lsl #8      @r2 = (alpha_cr,alpha_cb)
    add         r3, r3, r5, lsl #8      @r3 = (beta_cr,beta_cb)
    vpush       {d8 - d15}
    vld4.16     {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16     {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16     {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16     {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16     {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16     {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16     {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16     {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.16     q11, r2                 @q11 = alpha (cb,cr alternating)
    vdup.16     q12, r3                 @q12 = beta  (cb,cr alternating)
    vmov.i8     d31, #2

    vabd.u8     q4, q1, q2              @|p0-q0|
    vabd.u8     q5, q3, q2              @|q1-q0|
    vabd.u8     q6, q0, q1              @|p1-p0|
    vaddl.u8    q7, d2, d6
    vaddl.u8    q8, d3, d7              @(p0 + q1)
    vclt.u8     q4, q4, q11             @|p0-q0| < alpha ?
    vclt.u8     q5, q5, q12             @|q1-q0| < beta ?
    vclt.u8     q6, q6, q12             @|p1-p0| < beta ?
    vmlal.u8    q7, d0, d31
    vmlal.u8    q8, d1, d31             @2*p1 + (p0 + q1)
    vaddl.u8    q9, d0, d4
    vaddl.u8    q10, d1, d5             @(p1 + q0)
    vand.u8     q4, q4, q5              @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8    q9, d6, d31
    vmlal.u8    q10, d7, d31            @2*q1 + (p1 + q0)

    vrshrn.i16  d14, q7, #2
    vrshrn.i16  d15, q8, #2             @q7 = (2*p1 + (p0 + q1) + 2) >> 2 = filtered p0
    vand.u8     q4, q4, q6              @q4 = filter mask: all three threshold tests passed
    vrshrn.i16  d18, q9, #2
    vrshrn.i16  d19, q10, #2            @q9 = (2*q1 + (p1 + q0) + 2) >> 2 = filtered q0

    vbit        q1, q7, q4              @p0 <- filtered p0 where the mask is set
    vbit        q2, q9, q4              @q0 <- filtered q0 where the mask is set

    @ Write all four columns back; p1 and q1 are stored unchanged.
    vst4.16     {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16     {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16     {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16     {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16     {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16     {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16     {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16     {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop        {d8 - d15}
    ldmfd       sp!, {r4, r5, r12, pc}
|
|
|
|
|
|
|
|
@**
|
|
@*******************************************************************************
|
|
@*
|
|
@* @brief
|
|
@* Performs filtering of a chroma block horizontal edge for cases where the
|
|
@* boundary strength is less than 4 in high profile
|
|
@*
|
|
@* @par Description:
|
|
@* This operation is described in Sec. 8.7.2.3 under the title
|
|
@* "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
|
|
@*
|
|
@* @param[in] r0 - pu1_src
|
|
@* Pointer to the src sample q0
|
|
@*
|
|
@* @param[in] r1 - src_strd
|
|
@* Source stride
|
|
@*
|
|
@* @param[in] r2 - alpha_cb
|
|
@* Alpha Value for the boundary in U
|
|
@*
|
|
@* @param[in] r3 - beta_cb
|
|
@* Beta Value for the boundary in U
|
|
@*
|
|
@* @param[in] sp(0) - alpha_cr
|
|
@* Alpha Value for the boundary in V
|
|
@*
|
|
@* @param[in] sp(4) - beta_cr
|
|
@* Beta Value for the boundary in V
|
|
@*
|
|
@* @param[in] sp(8) - u4_bs
|
|
@* Packed Boundary strength array
|
|
@*
|
|
@* @param[in] sp(12) - pu1_cliptab_cb
|
|
@* tc0_table for U
|
|
@*
|
|
@* @param[in] sp(16) - pu1_cliptab_cr
|
|
@* tc0_table for V
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
|
|
.global ih264_deblk_chroma_horz_bslt4_a9
|
|
|
|
ih264_deblk_chroma_horz_bslt4_a9:
    @ Register roles:
    @   r4 = alpha_cr, r5 = beta_cr, r7 = u4_bs (byte-reversed below)
    @   r8 = pu1_cliptab_cb, r9 = pu1_cliptab_cr
    @   r6 = pointer to the p0 row, used for the two stores
    @   q3 = p1, q2 = p0, q0 = q0, q1 = q1 (first d reg = U, second = V)
    @   q10 = {alpha_cb, alpha_cr}, q8 = {beta_cb, beta_cr}
    @         (d16/d17 first hold the two clip tables)
    @   q6  = (bS > 0) mask, q7 = {C for U, C for V}, later the clipped |delta|
    @ d8-d15 are used as scratch, hence the vpush/vpop pair.

    stmfd       sp!, {r4-r9, lr}

    @ Seven registers (28 bytes) were pushed, so stack argument k sits at
    @ [sp, #28 + 4*k].
    ldrd        r4, r5, [sp, #28]       @r4 = alpha_cr, r5 = beta_cr
    ldr         r7, [sp, #36]           @r7 = u4_bs
    ldrd        r8, r9, [sp, #40]       @r8 = pu1_cliptab_cb, r9 = pu1_cliptab_cr
    sub         r0, r0, r1, lsl #1      @r0 = pu1_src - 2*src_strd -> p1 row
    vpush       {d8 - d15}
    rev         r7, r7                  @reverse the byte order of the packed bS word
    vmov.32     d12[0], r7              @d12[0] = four bS bytes

    vld1.32     d16[0], [r8]            @d16[0] = first four bytes of cliptab_cb
    vld1.32     d17[0], [r9]            @d17[0] = first four bytes of cliptab_cr
    vld2.8      {d6, d7}, [r0], r1      @q3 = p1
    vtbl.8      d14, {d16}, d12         @d14[i] = cliptab_cb[bS[i]] (C0 for U)
    vtbl.8      d28, {d17}, d12         @d28[i] = cliptab_cr[bS[i]] (C0 for V)
    vmovl.u8    q6, d12                 @d12 = bS widened to one 16-bit lane each
    mov         r6, r0                  @r6 -> p0 row (backup for the stores)
    vld2.8      {d4, d5}, [r0], r1      @q2 = p0
    vmov.i8     d30, #1
    vdup.8      d20, r2                 @d20 = alpha_cb
    vdup.8      d21, r4                 @d21 = alpha_cr
    vld2.8      {d0, d1}, [r0], r1      @q0 = q0
    vmovl.u8    q7, d14                 @d14 = U C0 widened to 16-bit lanes
    vmovl.u8    q14, d28                @d28 = V C0 widened to 16-bit lanes
    vmov.i16    d15, d28                @q7 = {d14 = C0 for U, d15 = C0 for V}
    vld2.8      {d2, d3}, [r0]          @q1 = q1
    vsubl.u8    q5, d1, d5
    vsubl.u8    q4, d0, d4              @q5,q4 = (q0 - p0)
    vabd.u8     q13, q3, q2             @q13 = ABS(p1 - p0)
    vshl.i16    q5, q5, #2              @q5 = (q0 - p0) << 2
    vabd.u8     q11, q2, q0             @q11 = ABS(p0 - q0)
    vshl.i16    q4, q4, #2              @q4 = (q0 - p0) << 2
    vsli.16     q7, q7, #8              @copy C0 into both bytes of each 16-bit lane
    vabd.u8     q12, q1, q0             @q12 = ABS(q1 - q0)
    vcge.u8     q9, q11, q10            @q9 = (ABS(p0 - q0) >= alpha)
    vsubl.u8    q10, d6, d2             @q10 = (p1 - q1), U half (alpha no longer needed)
    vsubl.u8    q3, d7, d3              @q3  = (p1 - q1), V half
    vdup.8      d16, r3                 @d16 = beta_cb (clip tables already consumed)
    vdup.8      d17, r5                 @d17 = beta_cr
    vadd.i16    q4, q4, q10
    vadd.i16    q5, q5, q3              @q5,q4 = ((q0 - p0) << 2) + (p1 - q1)
    vcge.u8     q12, q12, q8            @q12 = (ABS(q1 - q0) >= beta)
    vcgt.s16    d12, d12, #0            @d12 = (bS > 0), one 16-bit lane per bS value
    vqrshrn.s16 d8, q4, #3
    vqrshrn.s16 d9, q5, #3              @q4 = i_macro = (((q0 - p0) << 2) + (p1 - q1) + 4) >> 3, saturated to s8
    vadd.i8     d14, d14, d30           @d14 = C = C0 + 1 for U
    vcge.u8     q13, q13, q8            @q13 = (ABS(p1 - p0) >= beta)
    vorr        q9, q9, q12             @q9 = (ABS(p0 - q0) >= alpha) | (ABS(q1 - q0) >= beta)
    vabs.s8     q3, q4                  @q3 = ABS(i_macro)
    vadd.i8     d15, d15, d30           @d15 = C = C0 + 1 for V
    vmov.i8     d13, d12                @same bS mask for the V half of q6
    vorr        q9, q9, q13             @q9 |= (ABS(p1 - p0) >= beta)
    vmin.u8     q7, q3, q7              @q7 = delta = min(ABS(i_macro), C)
    vbic        q6, q6, q9              @q6 = (bS > 0) and no threshold test failed
    vcge.s8     q4, q4, #0              @q4 = (i_macro >= 0)
    vand        q7, q7, q6              @zero delta where the samples must not be filtered
    vqadd.u8    q8, q2, q7              @q8 = p0 + delta (saturating)
    vqsub.u8    q2, q2, q7              @q2 = p0 - delta (saturating)
    vqadd.u8    q9, q0, q7              @q9 = q0 + delta (saturating)
    vqsub.u8    q0, q0, q7              @q0 = q0 - delta (saturating)
    vbif        q8, q2, q4              @q8 = (i_macro >= 0) ? (p0 + delta) : (p0 - delta)
    vbif        q0, q9, q4              @q0 = (i_macro >= 0) ? (q0 - delta) : (q0 + delta)
    vst2.8      {d16, d17}, [r6], r1    @write the p0 row
    vst2.8      {d0, d1}, [r6]          @write the q0 row
    vpop        {d8 - d15}
    ldmfd       sp!, {r4-r9, pc}
|
|
|
|
|
|
|
|
@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4 in high profile
@*
@* @par Description:
@*     This operation is described in Sec. 8.7.2.3 under the title
@*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*     Pointer to the src sample q0 (the routine steps back 4 bytes to reach p1)
@*
@* @param[in] r1 - src_strd
@*     Source stride
@*
@* @param[in] r2 - alpha_cb
@*     Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*     Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*     Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*     Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@*     Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@*     tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@*     tc0_table for V
@*
@* @returns
@*     None
@*
@* @remarks
@*     Eight rows are processed per call; each row contributes four 16-bit
@*     elements (p1, p0, q0, q1), each presumably one interleaved U,V byte pair.
@*     After the loads: Q0 = p1, Q1 = p0, Q2 = q0, Q3 = q1, with rows 0-3 in the
@*     low D register and rows 4-7 in the high D register of each Q.
@*     The stack offsets above are relative to sp on entry.
@*
@*******************************************************************************
@*

    .global ih264_deblk_chroma_vert_bslt4_a9

ih264_deblk_chroma_vert_bslt4_a9:

    stmfd         sp!, {r4-r7, r10-r12, r14} @8 words pushed, stack arguments now start at sp + 32

    sub           r0, r0, #4            @point r0 to p1u of row0.
    ldrd          r4, r5, [sp, #32]     @R4 = alpha_cr , R5 = beta_cr
    add           r2, r2, r4, lsl #8    @R2 = alpha_cb + (alpha_cr << 8), assumes alpha_cb fits in a byte
    add           r3, r3, r5, lsl #8    @R3 = beta_cb + (beta_cr << 8), assumes beta_cb fits in a byte
    ldr           r6, [sp, #40]         @R6 = u4_bs
    ldrd          r10, r11, [sp, #44]   @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
    vpush         {d8 - d15}            @d8-d15 are used as scratch below and are restored before return
    mov           r12, r0               @keep a back up of R0 for buffer write

    @Lane loads transpose the block: one 16-bit element of each row goes to each of Q0..Q3
    vld4.16       {d0[0], d2[0], d4[0], d6[0]}, [r0], r1
    vld4.16       {d0[1], d2[1], d4[1], d6[1]}, [r0], r1
    vld4.16       {d0[2], d2[2], d4[2], d6[2]}, [r0], r1
    vld4.16       {d0[3], d2[3], d4[3], d6[3]}, [r0], r1

    vld4.16       {d1[0], d3[0], d5[0], d7[0]}, [r0], r1
    vld4.16       {d1[1], d3[1], d5[1], d7[1]}, [r0], r1
    vld4.16       {d1[2], d3[2], d5[2], d7[2]}, [r0], r1
    vld4.16       {d1[3], d3[3], d5[3], d7[3]}, [r0], r1

    vdup.16       q11, r2               @Q11 = alpha (cb in even bytes, cr in odd bytes)
    vabd.u8       q4, q1, q2            @|p0-q0|
    vdup.16       q12, r3               @Q12 = beta (cb in even bytes, cr in odd bytes)
    vabd.u8       q5, q3, q2            @|q1-q0|
    vabd.u8       q6, q0, q1            @|p1-p0|
    vclt.u8       q4, q4, q11           @|p0-q0| < alpha ?
    vsubl.u8      q7, d0, d6            @(p1 - q1), rows 0-3
    vclt.u8       q5, q5, q12           @|q1-q0| < beta ?
    vsubl.u8      q8, d1, d7            @(p1 - q1), rows 4-7
    vclt.u8       q6, q6, q12           @|p1-p0| < beta ?
    vsubl.u8      q9, d4, d2            @(q0 - p0), rows 0-3
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q10, d5, d3           @(q0 - p0), rows 4-7
    vmov.u16      q14, #4
    vld1.32       {d24[0]}, [r10]       @Load ClipTable for U (4 bytes); Q12 no longer needed as beta
    vld1.32       {d25[0]}, [r11]       @Load ClipTable for V (4 bytes)
    rev           r6, r6                @Blocking strengths: byte-reverse so the top byte of u4_bs lands in lane 0
    vand.u8       q4, q4, q6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta

    vmov.32       d10[0], r6            @low 4 bytes of D10 = the four bS bytes

    vmla.s16      q7, q9, q14           @4*(q0 - p0) + (p1 - q1), rows 0-3
    vmla.s16      q8, q10, q14          @4*(q0 - p0) + (p1 - q1), rows 4-7

    vmovl.u8      q5, d10               @D10 = four bS values widened to 16 bit (D11 is overwritten below)
    vsli.u16      d10, d10, #8          @D10 = each bS duplicated into both bytes of its halfword
    vtbl.8        d12, {d24}, d10       @tC0 for U; only table bytes 0-3 were loaded, so this assumes bS <= 3
    vtbl.8        d13, {d25}, d10       @tC0 for V
    vzip.8        d12, d13              @Q6 = tC0 interleaved U,V; each bS value covers two consecutive rows
    vmovl.u16     q5, d10
    vsli.u32      q5, q5, #16           @Q5 = each bS replicated over the 4 bytes of its two rows
    vmov.u8       q12, #1
    vadd.u8       q6, q6, q12           @tC0 + 1
    vcge.u8       q5, q5, q12           @u4_bS > 0 ? (computed as bS >= 1)
    vand.u8       q4, q4, q5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ Q0 - Q3(inputs),
    @ Q4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ Q6 (tC)

    vrshr.s16     q7, q7, #3
    vrshr.s16     q8, q8, #3            @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q9, q7, #0
    vcgt.s16      q10, q8, #0
    vmovn.i16     d18, q9
    vmovn.i16     d19, q10              @Q9 = sign(delta): all ones where delta > 0
    vabs.s16      q7, q7
    vabs.s16      q8, q8
    vmovn.u16     d14, q7
    vmovn.u16     d15, q8
    vmin.u8       q7, q7, q6            @Q7 = |delta| clipped to tC

    vqadd.u8      q10, q1, q7           @p0+|delta|, saturated to 255
    vqadd.u8      q11, q2, q7           @q0+|delta|, saturated to 255
    vqsub.u8      q12, q1, q7           @p0-|delta|, saturated to 0
    vqsub.u8      q13, q2, q7           @q0-|delta|, saturated to 0

    vbit          q12, q10, q9          @p0 + delta (take p0+|delta| where delta > 0)
    vbit          q11, q13, q9          @q0 - delta (take q0-|delta| where delta > 0)

    vbit          q1, q12, q4           @replace p0 only where the filter condition holds
    vbit          q2, q11, q4           @replace q0 only where the filter condition holds

    @Write all four columns back; p1 (Q0) and q1 (Q3) are stored unmodified
    vst4.16       {d0[0], d2[0], d4[0], d6[0]}, [r12], r1
    vst4.16       {d0[1], d2[1], d4[1], d6[1]}, [r12], r1
    vst4.16       {d0[2], d2[2], d4[2], d6[2]}, [r12], r1
    vst4.16       {d0[3], d2[3], d4[3], d6[3]}, [r12], r1

    vst4.16       {d1[0], d3[0], d5[0], d7[0]}, [r12], r1
    vst4.16       {d1[1], d3[1], d5[1], d7[1]}, [r12], r1
    vst4.16       {d1[2], d3[2], d5[2], d7[2]}, [r12], r1
    vst4.16       {d1[3], d3[3], d5[3], d7[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r7, r10-r12, pc}
|
|
|
|
|
|
|
|
@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge when the
@*     boundary strength is set to 4 on calling twice in high profile
@*
@* @par Description:
@*     This operation is described in Sec. 8.7.2.4 under the title
@*     "Filtering process for edges for bS equal to 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*     Pointer to the src sample q0 (the routine steps back 4 bytes to reach p1)
@*
@* @param[in] r1 - src_strd
@*     Source stride
@*
@* @param[in] r2 - alpha_cb
@*     Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*     Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*     Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*     Beta Value for the boundary in V
@*
@* @returns
@*     None
@*
@* @remarks
@*     Four rows are processed per call; each row contributes four 16-bit
@*     elements (p1, p0, q0, q1), each presumably one interleaved U,V byte pair.
@*     After the loads: D0 = p1, D1 = p0, D2 = q0, D3 = q1.
@*     The stack offsets above are relative to sp on entry.
@*
@*******************************************************************************
@*

    .global ih264_deblk_chroma_vert_bs4_mbaff_a9

ih264_deblk_chroma_vert_bs4_mbaff_a9:

    stmfd         sp!, {r4, r5, r12, r14} @4 words pushed, stack arguments now start at sp + 16

    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write
    ldrd          r4, r5, [sp, #16]     @R4 = alpha_cr , R5 = beta_cr
    add           r2, r2, r4, lsl #8    @R2 = alpha_cb + (alpha_cr << 8), assumes alpha_cb fits in a byte
    add           r3, r3, r5, lsl #8    @R3 = beta_cb + (beta_cr << 8), assumes beta_cb fits in a byte
    vpush         {d8 - d15}            @d9, d11 and d12 are used as scratch below
    @Lane loads transpose the block: one 16-bit element of each row goes to each of D0..D3
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.16       d11, r2               @D11 = alpha (cb in even bytes, cr in odd bytes)
    vdup.16       d12, r3               @D12 = beta (cb in even bytes, cr in odd bytes)
    vmov.i8       d31, #2               @multiplier for the 2*p1 and 2*q1 terms

    vabd.u8       d4, d1, d2            @|p0-q0|
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vaddl.u8      q14, d1, d3           @(p0 + q1)
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vmlal.u8      q14, d0, d31          @2*p1 + (p0 + q1)
    vaddl.u8      q13, d0, d2           @(p1 + q0)
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vmlal.u8      q13, d3, d31          @2*q1 + (p1 + q0)

    vrshrn.i16    d7, q14, #2           @(2*p1 + (p0 + q1) + 2) >> 2
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vrshrn.i16    d9, q13, #2           @(2*q1 + (p1 + q0) + 2) >> 2

    vbit          d1, d7, d4            @replace p0 only where the filter condition holds
    vbit          d2, d9, d4            @replace q0 only where the filter condition holds

    @Write all four columns back; p1 (D0) and q1 (D3) are stored unmodified
    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4, r5, r12, pc}
|
|
|
|
|
|
|
|
@**
@*******************************************************************************
@*
@* @brief
@*     Performs filtering of a chroma block vertical edge for cases where the
@*     boundary strength is less than 4 on calling twice in high profile
@*
@* @par Description:
@*     This operation is described in Sec. 8.7.2.3 under the title
@*     "Filtering process for edges with bS less than 4" in ITU T Rec H.264.
@*
@* @param[in] r0 - pu1_src
@*     Pointer to the src sample q0 (the routine steps back 4 bytes to reach p1)
@*
@* @param[in] r1 - src_strd
@*     Source stride
@*
@* @param[in] r2 - alpha_cb
@*     Alpha Value for the boundary in U
@*
@* @param[in] r3 - beta_cb
@*     Beta Value for the boundary in U
@*
@* @param[in] sp(0) - alpha_cr
@*     Alpha Value for the boundary in V
@*
@* @param[in] sp(4) - beta_cr
@*     Beta Value for the boundary in V
@*
@* @param[in] sp(8) - u4_bs
@*     Packed Boundary strength array
@*
@* @param[in] sp(12) - pu1_cliptab_cb
@*     tc0_table for U
@*
@* @param[in] sp(16) - pu1_cliptab_cr
@*     tc0_table for V
@*
@* @returns
@*     None
@*
@* @remarks
@*     Four rows are processed per call, one bS byte per row; each row
@*     contributes four 16-bit elements (p1, p0, q0, q1), each presumably one
@*     interleaved U,V byte pair.
@*     After the loads: D0 = p1, D1 = p0, D2 = q0, D3 = q1.
@*     The stack offsets above are relative to sp on entry.
@*
@*******************************************************************************
@*

    .global ih264_deblk_chroma_vert_bslt4_mbaff_a9

ih264_deblk_chroma_vert_bslt4_mbaff_a9:

    stmfd         sp!, {r4-r6, r10-r12, r14} @7 words pushed, stack arguments now start at sp + 28

    sub           r0, r0, #4            @point r0 to p1u of row0.
    mov           r12, r0               @keep a back up of r0 for buffer write

    ldrd          r4, r5, [sp, #28]     @R4 = alpha_cr , R5 = beta_cr
    add           r2, r2, r4, lsl #8    @R2 = alpha_cb + (alpha_cr << 8), assumes alpha_cb fits in a byte
    add           r3, r3, r5, lsl #8    @R3 = beta_cb + (beta_cr << 8), assumes beta_cb fits in a byte
    ldr           r6, [sp, #36]         @R6 = u4_bs
    ldrd          r10, r11, [sp, #40]   @R10 = pu1_cliptab_cb , R11 = pu1_cliptab_cr
    vpush         {d8 - d15}            @d9-d13 are used as scratch below
    @Lane loads transpose the block: one 16-bit element of each row goes to each of D0..D3
    vld4.16       {d0[0], d1[0], d2[0], d3[0]}, [r0], r1
    vld4.16       {d0[1], d1[1], d2[1], d3[1]}, [r0], r1
    vld4.16       {d0[2], d1[2], d2[2], d3[2]}, [r0], r1
    vld4.16       {d0[3], d1[3], d2[3], d3[3]}, [r0], r1

    vdup.16       d11, r2               @D11 = alpha (cb in even bytes, cr in odd bytes)
    vabd.u8       d4, d1, d2            @|p0-q0|
    vdup.16       d12, r3               @D12 = beta (cb in even bytes, cr in odd bytes)
    vabd.u8       d5, d3, d2            @|q1-q0|
    vabd.u8       d6, d0, d1            @|p1-p0|
    vclt.u8       d4, d4, d11           @|p0-q0| < alpha ?
    vclt.u8       d5, d5, d12           @|q1-q0| < beta ?
    vsubl.u8      q14, d0, d3           @(p1 - q1)
    vclt.u8       d6, d6, d12           @|p1-p0| < beta ?
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta
    vsubl.u8      q12, d2, d1           @(q0 - p0)
    vmov.u16      q10, #4

    vld1.32       {d31[1]}, [r10]       @Load ClipTable for U into bytes 4-7 of D31
    vld1.32       {d31[0]}, [r11]       @Load ClipTable for V into bytes 0-3 of D31
    rev           r6, r6                @Blocking strengths: byte-reverse so the top byte of u4_bs lands in lane 0
    vand.u8       d4, d4, d6            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta
    vmov.32       d22[0], r6            @low 4 bytes of D22 = the four bS bytes
    vmla.s16      q14, q12, q10         @4*(q0 - p0) + (p1 - q1)
    vmovl.u8      q11, d22              @D22 = four bS values widened to 16 bit (D23 is not used)
    vsli.u16      d22, d22, #8          @D22 = each bS duplicated into both bytes of its halfword
    vmov.u16      d13, #4               @byte pattern 4,0,4,0,...: bias for the even (U) bytes only
    vadd.u8       d22, d22, d13         @U bytes index D31 bytes 4-7, V bytes index D31 bytes 0-3
    vtbl.8        d6, {d31}, d22        @tC0 (U,V interleaved); assumes bS <= 3
    vmov.u8       d12, #1               @D12 no longer needed as beta
    vsub.u8       d22, d22, d13         @remove the index bias, D22 = bS again
    vadd.u8       d6, d6, d12           @tC0 + 1
    vcge.u8       d5, d22, d12          @u4_bS > 0 ? (computed as bS >= 1)
    vand.u8       d4, d4, d5            @|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0

    @ D0 - D3(inputs),
    @ D4 (|p0-q0| < alpha && |q1-q0| < beta && |p1-p0| < beta && u4_bs != 0),
    @ D6 (tC)

    vrshr.s16     q14, q14, #3          @(((q0 - p0) << 2) + (p1 - q1) + 4) >> 3)

    vcgt.s16      q13, q14, #0
    vmovn.i16     d9, q13               @D9 = sign(delta): all ones where delta > 0
    vabs.s16      q14, q14
    vmovn.u16     d7, q14
    vmin.u8       d7, d7, d6            @D7 = |delta| clipped to tC

    vqadd.u8      d10, d1, d7           @p0+|delta|, saturated to 255
    vqadd.u8      d11, d2, d7           @q0+|delta|, saturated to 255
    vqsub.u8      d12, d1, d7           @p0-|delta|, saturated to 0
    vqsub.u8      d13, d2, d7           @q0-|delta|, saturated to 0

    vbit          d12, d10, d9          @p0 + delta (take p0+|delta| where delta > 0)
    vbit          d11, d13, d9          @q0 - delta (take q0-|delta| where delta > 0)

    vbit          d1, d12, d4           @replace p0 only where the filter condition holds
    vbit          d2, d11, d4           @replace q0 only where the filter condition holds

    @Write all four columns back; p1 (D0) and q1 (D3) are stored unmodified
    vst4.16       {d0[0], d1[0], d2[0], d3[0]}, [r12], r1
    vst4.16       {d0[1], d1[1], d2[1], d3[1]}, [r12], r1
    vst4.16       {d0[2], d1[2], d2[2], d3[2]}, [r12], r1
    vst4.16       {d0[3], d1[3], d2[3], d3[3]}, [r12], r1
    vpop          {d8 - d15}
    ldmfd         sp!, {r4-r6, r10-r12, pc}
|
|
|
|
|
|
|