Cemu/dependencies/ih264d/common/arm/ih264_intra_pred_chroma_a9q.s

544 lines
15 KiB
ArmAsm
Raw Permalink Normal View History

2022-08-22 22:21:23 +02:00
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@******************************************************************************
@* @file
@* ih264_intra_pred_chroma_a9q.s
@*
@* @brief
@* Contains function definitions for intra chroma prediction .
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@*
@* - ih264_intra_pred_chroma_mode_horz_a9q()
@* - ih264_intra_pred_chroma_8x8_mode_vert_a9q()
@* - ih264_intra_pred_chroma_mode_dc_a9q()
@* - ih264_intra_pred_chroma_mode_plane_a9q()
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
@* All the functions here are replicated from ih264_chroma_intra_pred_filters.c
@
@ Everything below (code and the two literal words) is emitted into .text,
@ aligned to a 4-byte boundary.
.text
.p2align 2
@ Coefficient tables read by the plane predictor. They are declared external
@ here and marked with hidden ELF visibility.
.extern ih264_gai1_intrapred_chroma_plane_coeffs1
.hidden ih264_gai1_intrapred_chroma_plane_coeffs1
.extern ih264_gai1_intrapred_chroma_plane_coeffs2
.hidden ih264_gai1_intrapred_chroma_plane_coeffs2
@ Position-independent table addresses: each word stores
@ (table - label - 8). The plane function loads the word and executes
@ "add r12, r12, pc" at the matching label (scrlblc1 / scrlblc2); the -8
@ compensates for pc reading as "address of that instruction + 8" in ARM state.
@ NOTE(review): the -8 bias is only right when this file is assembled as ARM
@ (not Thumb) code - confirm the build flags.
scratch_chroma_intrapred_addr1:
.long ih264_gai1_intrapred_chroma_plane_coeffs1 - scrlblc1 - 8
scratch_intrapred_chroma_plane_addr1:
.long ih264_gai1_intrapred_chroma_plane_coeffs2 - scrlblc2 - 8
@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_dc
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:DC
@*
@* @par Description:
@* Perform Intra prediction for chroma_8x8 mode:DC ,described in sec 8.3.4.1
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@** @param[in] ui_neighboravailability
@* availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_dc(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
@ WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability
@-----------------------------------------------------------------------------
@ void ih264_intra_pred_chroma_8x8_mode_dc_a9q(UWORD8 *pu1_src, UWORD8 *pu1_dst,
@                                              WORD32 src_strd, WORD32 dst_strd,
@                                              WORD32 ui_neighboravailability)
@ In:    r0   = pu1_src  (neighbour buffer: bytes 0..15 are read as the left
@                         neighbours, bytes 18..33 as the top neighbours)
@        r1   = pu1_dst  (8 rows of 16 bytes are written, dst_strd apart)
@        r2   = src_strd (not read)
@        r3   = dst_strd
@        [sp] = ui_neighboravailability (bit 0x01 = left, bit 0x04 = top)
@ Out:   none (memory at pu1_dst)
@ Clobb: r0, r1, q0-q3, flags. r4 and d8-d15 are saved and restored.
@
@ The 8x8 block of interleaved U/V pairs is filled as four 4x4 quadrants.
@ q4 holds the 16 bytes written to rows 0-3, q5 the 16 bytes written to
@ rows 4-7; the low half of each register is the left quadrant.
@-----------------------------------------------------------------------------
.global ih264_intra_pred_chroma_8x8_mode_dc_a9q
ih264_intra_pred_chroma_8x8_mode_dc_a9q:
    stmfd       sp!, {r4, r14}          @ save r4 (scratch below) and return address
    ldr         r4, [sp, #8]            @ r4 = ui_neighboravailability (first stack arg, above the 2 saved words)
    vpush       {d8-d15}                @ q4-q7 are callee-saved and used below
    tst         r4, #0x01               @ left available? (tst: flags only, no register clobbered)
    beq         top_available           @ no  -> top-only / none
    tst         r4, #0x04               @ top available?
    beq         left_available          @ no  -> left-only

    @ ---- Both left and top available ----
    vld1.u8     {q0}, [r0]              @ d0 = src[0..7], d1 = src[8..15]      (left)
    add         r0, r0, #18
    vld1.u8     {q1}, [r0]              @ d2 = src[18..25], d3 = src[26..33]   (top)
    vaddl.u8    q2, d1, d2              @ src[8..15] + src[18..25]   -> rows 0-3, left quadrant
    vaddl.u8    q3, d0, d3              @ src[0..7]  + src[26..33]   -> rows 4-7, right quadrant
    vmovl.u8    q1, d3                  @ src[26..33] widened        -> rows 0-3, right quadrant
    vmovl.u8    q0, d0                  @ src[0..7]   widened        -> rows 4-7, left quadrant
    vadd.u16    d12, d4, d5             @ fold each 8-lane sum down to 4 lanes (U,V,U,V)
    vadd.u16    d13, d2, d3
    vadd.u16    d15, d6, d7
    vadd.u16    d14, d0, d1
    vpadd.u32   d12, d12, d15           @ 32-bit pairwise add keeps U in the low and V in the high
    vpadd.u32   d14, d13, d14           @ halfword: each 32-bit lane is now one (Usum, Vsum) pair
    vqrshrun.s16 d12, q6, #3            @ 8-sample quadrants: (sum + 4) >> 3, narrowed to bytes
    vqrshrun.s16 d14, q7, #2            @ 4-sample quadrants: (sum + 2) >> 2, narrowed to bytes
    vdup.u16    d8, d12[0]              @ rows 0-3, left  quadrant
    vdup.u16    d9, d14[0]              @ rows 0-3, right quadrant
    vdup.u16    d10, d14[1]             @ rows 4-7, left  quadrant
    vdup.u16    d11, d12[1]             @ rows 4-7, right quadrant
    b           str_pred

top_available:                          @ left is NOT available here
    tst         r4, #0x04               @ top available?
    beq         none_available
    @ ---- Only top available ----
    add         r0, r0, #18
    vld1.u8     {q0}, [r0]              @ src[18..33]
    vmovl.u8    q1, d0
    vmovl.u8    q2, d1
    vadd.u16    d0, d2, d3              @ partial (U,V,U,V) sums of src[18..25]
    vadd.u16    d1, d4, d5              @ partial (U,V,U,V) sums of src[26..33]
    vpaddl.u32  q0, q0                  @ finish the sums; U stays in halfword 0, V in halfword 1
    vqrshrun.s16 d0, q0, #2             @ (sum + 2) >> 2 ; (U,V) bytes land in u16 lanes 0 and 2
    vdup.u16    d8, d0[0]               @ left  quadrant DC pair
    vdup.u16    d9, d0[2]               @ right quadrant DC pair
    vmov        q5, q4                  @ rows 4-7 repeat rows 0-3
    b           str_pred

left_available:
    @ ---- Only left available ----
    vld1.u8     {q0}, [r0]              @ src[0..15]
    vmovl.u8    q1, d0
    vmovl.u8    q2, d1
    vadd.u16    d0, d2, d3              @ partial sums of src[0..7]
    vadd.u16    d1, d4, d5              @ partial sums of src[8..15]
    vpaddl.u32  q0, q0
    vqrshrun.s16 d0, q0, #2             @ (sum + 2) >> 2
    vdup.u16    q5, d0[0]               @ src[0..7]  DC fills rows 4-7
    vdup.u16    q4, d0[2]               @ src[8..15] DC fills rows 0-3
    b           str_pred

none_available:
    @ ---- Nothing available: every output byte is 128 ----
    vmov.u8     q4, #128
    vmov.u8     q5, #128

str_pred:
    vst1.8      {q4}, [r1], r3          @ rows 0-3
    vst1.8      {q4}, [r1], r3
    vst1.8      {q4}, [r1], r3
    vst1.8      {q4}, [r1], r3
    vst1.8      {q5}, [r1], r3          @ rows 4-7
    vst1.8      {q5}, [r1], r3
    vst1.8      {q5}, [r1], r3
    vst1.8      {q5}, [r1], r3
    vpop        {d8-d15}
    ldmfd       sp!, {r4, pc}           @ restore r4 and return
@******************************************************************************
@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_horz
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:Horizontal
@*
@* @par Description:
@* Perform Intra prediction for chroma_8x8 mode:Horizontal ,described in sec 8.3.4.2
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels(Not used in this function)
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@*
@void ih264_intra_pred_chroma_8x8_mode_horz(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
@ WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability
@-----------------------------------------------------------------------------
@ Horizontal prediction for an 8x8 block of interleaved U/V pairs.
@ In:    r0 = pu1_src  (16 bytes are read: eight 2-byte left-neighbour pairs)
@        r1 = pu1_dst  (8 rows of 16 bytes are written, dst_strd apart)
@        r3 = dst_strd
@        src_strd (r2) and ui_neighboravailability (stack) are not read.
@ Out:   none (memory at pu1_dst)
@ Clobb: r1, r2, q0-q2, flags
@
@ Leaf routine: nothing is called and no callee-saved register is touched,
@ so no stack frame is needed and the return is a plain "bx lr".
@
@ Rows are produced from the highest 16-bit lane of q0 downwards: output
@ row 0 replicates lane 7, row 1 lane 6, ... row 7 lane 0.
@ (presumably the left neighbours are stored bottom-to-top - confirm with
@ the code that fills the neighbour buffer)
@-----------------------------------------------------------------------------
.global ih264_intra_pred_chroma_8x8_mode_horz_a9q
ih264_intra_pred_chroma_8x8_mode_horz_a9q:
    vld1.u8     {q0}, [r0]              @ q0 = 8 left (U,V) pairs
    mov         r2, #6                  @ rows left for the loop (2 per iteration)
    vdup.u16    q1, d1[3]               @ row 0 pattern (lane 7)
    vdup.u16    q2, d1[2]               @ row 1 pattern (lane 6)
    vst1.8      {q1}, [r1], r3          @ store row 0
loop_8x8_horz:
    @ Invariant on entry: q2 holds the next row to store, q0 still has the
    @ two pairs that were just consumed in lanes 7 and 6.
    vext.8      q0, q0, q0, #12         @ rotate so the next two pairs land in lanes 7 and 6
    vst1.8      {q2}, [r1], r3          @ store odd row prepared earlier
    vdup.u16    q1, d1[3]               @ next even row
    subs        r2, r2, #2
    vdup.u16    q2, d1[2]               @ next odd row
    vst1.8      {q1}, [r1], r3          @ store even row
    bne         loop_8x8_horz
    vst1.8      {q2}, [r1], r3          @ store row 7
    bx          lr
@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_vert
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:vertical
@*
@* @par Description:
@*Perform Intra prediction for chroma_8x8 mode:vertical ,described in sec 8.3.4.3
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels(Not used in this function)
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_vert(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
@ WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability
@-----------------------------------------------------------------------------
@ Vertical prediction for an 8x8 block of interleaved U/V pairs: the 16 bytes
@ at pu1_src + 18 are copied unchanged into each of the 8 destination rows.
@ In:    r0 = pu1_src  (bytes 18..33 are read)
@        r1 = pu1_dst  (8 rows of 16 bytes are written, dst_strd apart)
@        r3 = dst_strd
@        src_strd (r2) and ui_neighboravailability (stack) are not read.
@ Out:   none (memory at pu1_dst)
@ Clobb: r0, r1, q0
@
@ Leaf routine using only caller-saved registers, so nothing is pushed and
@ the return is a plain "bx lr".
@-----------------------------------------------------------------------------
.global ih264_intra_pred_chroma_8x8_mode_vert_a9q
ih264_intra_pred_chroma_8x8_mode_vert_a9q:
    add         r0, r0, #18             @ skip to the row read as the top neighbours
    vld1.8      {q0}, [r0]              @ q0 = 8 (U,V) pairs
    vst1.8      {q0}, [r1], r3          @ rows 0-7: same 16 bytes, r1 advances by dst_strd
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    vst1.8      {q0}, [r1], r3
    bx          lr
@******************************************************************************
@**
@*******************************************************************************
@*
@*ih264_intra_pred_chroma_8x8_mode_plane
@*
@* @brief
@* Perform Intra prediction for chroma_8x8 mode:PLANE
@*
@* @par Description:
@* Perform Intra prediction for chroma_8x8 mode:PLANE ,described in sec 8.3.4.4
@*
@* @param[in] pu1_src
@* UWORD8 pointer to the source containing alternate U and V samples
@*
@* @param[out] pu1_dst
@* UWORD8 pointer to the destination with alternate U and V samples
@*
@* @param[in] src_strd
@* integer source stride
@*
@* @param[in] dst_strd
@* integer destination stride
@*
@* @param[in] ui_neighboravailability
@* availability of neighbouring pixels
@*
@* @returns
@*
@* @remarks
@* None
@*
@*******************************************************************************
@void ih264_intra_pred_chroma_8x8_mode_plane(UWORD8 *pu1_src,
@ UWORD8 *pu1_dst,
@ WORD32 src_strd,
@ WORD32 dst_strd,
@ WORD32 ui_neighboravailability)
@**************Variables Vs Registers*****************************************
@ r0 => *pu1_src
@ r1 => *pu1_dst
@ r2 => src_strd
@ r3 => dst_strd
@ r4 => ui_neighboravailability
@ Plane prediction for an 8x8 block of interleaved U/V pairs.
@ In:    r0 = pu1_src  (bytes 0..7, 10..17, 16..23 and 26..33 are read)
@        r1 = pu1_dst  (8 rows of 16 bytes are written, dst_strd apart)
@        r3 = dst_strd
@        src_strd (r2) and the stack argument are not read.
@ Saved/restored: r4-r10, r12, d8-d15.
@
@ Outline (per channel; even 16-bit lanes are one channel, odd lanes the other):
@   1. Weighted sums of neighbour differences, one from src[0..17] and one
@      from src[16..33], each scaled as (34 * sum + 32) >> 6.
@   2. Base term (src[0/1] + src[32/33]) << 4.
@   3. Each output byte = saturate_u8((base + gx * k[x] + gy * k[y] + 16) >> 5),
@      where k[] comes from the coeffs2 table, gx is the src[16..33] gradient
@      and gy the src[0..17] gradient.
@ The contents of both coefficient tables are defined outside this file.
.global ih264_intra_pred_chroma_8x8_mode_plane_a9q
ih264_intra_pred_chroma_8x8_mode_plane_a9q:
stmfd sp!, {r4-r10, r12, lr}
vpush {d8-d15}
@ ---- Step 1: load the neighbour segments and form the differences ----
@ d0 = src[0..7]
vld1.32 d0, [r0]
add r10, r0, #10
@ d1 = src[10..17]
vld1.32 d1, [r10]
@ r10 = src + 16
add r10, r10, #6
@ d5 = d0 with its four 16-bit (pair) lanes in reverse order
vrev64.16 d5, d0
@ d2 = src[16..23]; the writeback advances r10 by the 8 bytes loaded
vld1.32 d2, [r10]!
@ r10 = src + 26
add r10, r10, #2
@ d7 = d2 with its four pair lanes reversed
vrev64.16 d7, d2
@ d3 = src[26..33]
vld1.32 d3, [r10]
@ NOTE(review): r5 is never read after this instruction - looks like dead code.
sub r5, r3, #8
@ r12 = address of ih264_gai1_intrapred_chroma_plane_coeffs1, built
@ position-independently: literal word + pc. The add must stay at scrlblc1
@ because the literal was computed relative to this label.
ldr r12, scratch_chroma_intrapred_addr1
scrlblc1:
add r12, r12, pc
@ q5 = reversed src[0..7] - src[10..17], widened to 16 bits
vsubl.u8 q5, d5, d1
vld1.64 {q4}, [r12] @ q4 = eight 16-bit weights from the coeffs1 table
@ q6 = src[26..33] - reversed src[16..23], widened to 16 bits
vsubl.u8 q6, d3, d7
@ weight the differences
vmul.s16 q7, q5, q4
vmul.s16 q8, q6, q4
@ De-interleave: d14 = even lanes of the q5 products, d15 = even lanes of the
@ q6 products, d16 / d17 = the corresponding odd lanes.
vuzp.16 q7, q8
@ Two rounds of pairwise adds leave the 4-lane total in every lane.
vpadd.s16 d14, d14
vpadd.s16 d15, d15
vpadd.s16 d16, d16
vpadd.s16 d17, d17
vpadd.s16 d14, d14
vpadd.s16 d15, d15
vpadd.s16 d16, d16
vpadd.s16 d17, d17
@ Scale each total: (34 * sum + 32) >> 6
mov r6, #34
vdup.16 q9, r6
vmull.s16 q11, d14, d18
vmull.s16 q12, d15, d18
vmull.s16 q13, d16, d18
vmull.s16 q14, d17, d18
@ d10 = src[0..17] gradient, even channel   d12 = src[16..33] gradient, even channel
@ d13 = src[0..17] gradient, odd channel    d14 = src[16..33] gradient, odd channel
vrshrn.s32 d10, q11, #6
vrshrn.s32 d12, q12, #6
vrshrn.s32 d13, q13, #6
vrshrn.s32 d14, q14, #6
@ ---- Step 2: base term per channel ----
@ r6 = src[0], r8 = src[1], r7 = src[32], r9 = src[33]
ldrb r6, [r0], #1
add r10, r0, #31
ldrb r8, [r0], #1
ldrb r7, [r10], #1
ldrb r9, [r10], #1
add r6, r6, r7
add r8, r8, r9
@ r6 = (src[0] + src[32]) << 4, r8 = (src[1] + src[33]) << 4
lsl r6, r6, #4
lsl r8, r8, #4
vdup.16 q0, r6
vdup.16 q1, r8
@ Broadcast the four gradients, then zip so every register alternates
@ even-channel / odd-channel values like the pixel data does.
vdup.16 q2, d12[0]
vdup.16 q3, d10[0]
vdup.16 q12, d14[0]
vdup.16 q13, d13[0]
@ q2 = interleaved src[16..33] gradients (multiplied by the column factor below)
vzip.16 q2, q12
@ q3 = interleaved src[0..17] gradients (multiplied by the row factor below)
vzip.16 q3, q13
@ q0 = interleaved base terms
vzip.16 q0, q1
@ r12 = address of ih264_gai1_intrapred_chroma_plane_coeffs2 (same pc-relative
@ scheme; the add must stay at scrlblc2).
ldr r12, scratch_intrapred_chroma_plane_addr1
scrlblc2:
add r12, r12, pc
@ q4 = eight 16-bit position factors k[0..7]
vld1.64 {q4}, [r12]
vmov.16 q5, q4
@ q11 keeps an unzipped copy; its lanes are used as the per-row factor k[y]
vmov q11, q4
@ After the zip: q4 = k0,k0,k1,k1,k2,k2,k3,k3 and q5 = k4,k4,...,k7,k7
vzip.16 q4, q5
@ ---- Step 3: column part, computed once ----
@ q6 = base + gx * k[x] for pixel pairs 0-3, q8 = the same for pairs 4-7
vmul.s16 q6, q2, q4
vmul.s16 q8, q2, q5
vadd.s16 q6, q0, q6
vadd.s16 q8, q0, q8
@ ---- Rows 0..7, fully unrolled and software-pipelined ----
@ For row y: q10 or q15 = k[y] broadcast; row term = q3 * k[y];
@ result = saturating (value + 16) >> 5 narrowed to bytes in d28 (pairs 0-3)
@ and d29 (pairs 4-7), stored as one 16-byte row.
@ NOTE(review): q2/q9 are computed from identical operands, as are q7/q4;
@ each pair holds the same row term.
vdup.16 q10, d22[0]
vmul.s16 q2, q3, q10
vdup.16 q15, d22[1]
vmul.s16 q9, q3, q10
vmul.s16 q7, q3, q15
vmul.s16 q4, q3, q15
vadd.s16 q12, q6, q2
vadd.s16 q0, q8, q9
vadd.s16 q1, q6, q7
vqrshrun.s16 d28, q12, #5
vadd.s16 q13, q8, q4
vqrshrun.s16 d29, q0, #5
vdup.16 q10, d22[2]
@ store row 0
vst1.8 {q14}, [r1], r3
vqrshrun.s16 d28, q1, #5
vqrshrun.s16 d29, q13, #5
vmul.s16 q2, q3, q10
vmul.s16 q9, q3, q10
@ store row 1
vst1.8 {q14}, [r1], r3
vadd.s16 q12, q6, q2
vadd.s16 q0, q8, q9
vdup.16 q15, d22[3]
vqrshrun.s16 d28, q12, #5
vqrshrun.s16 d29, q0, #5
vmul.s16 q7, q3, q15
vmul.s16 q4, q3, q15
@ store row 2
vst1.8 {q14}, [r1], r3
vadd.s16 q1, q6, q7
vadd.s16 q13, q8, q4
vdup.16 q10, d23[0]
vqrshrun.s16 d28, q1, #5
vqrshrun.s16 d29, q13, #5
vmul.s16 q2, q3, q10
vmul.s16 q9, q3, q10
@ store row 3
vst1.8 {q14}, [r1], r3
vadd.s16 q12, q6, q2
vadd.s16 q0, q8, q9
vdup.16 q15, d23[1]
vqrshrun.s16 d28, q12, #5
vqrshrun.s16 d29, q0, #5
vmul.s16 q7, q3, q15
vmul.s16 q4, q3, q15
@ store row 4
vst1.8 {q14}, [r1], r3
vadd.s16 q1, q6, q7
vadd.s16 q13, q8, q4
vdup.16 q10, d23[2]
vqrshrun.s16 d28, q1, #5
vqrshrun.s16 d29, q13, #5
vmul.s16 q2, q3, q10
vmul.s16 q9, q3, q10
@ store row 5
vst1.8 {q14}, [r1], r3
vadd.s16 q12, q6, q2
vadd.s16 q0, q8, q9
vdup.16 q15, d23[3]
vqrshrun.s16 d28, q12, #5
vqrshrun.s16 d29, q0, #5
vmul.s16 q7, q3, q15
vmul.s16 q4, q3, q15
@ store row 6
vst1.8 {q14}, [r1], r3
vadd.s16 q1, q6, q7
vadd.s16 q13, q8, q4
vqrshrun.s16 d28, q1, #5
vqrshrun.s16 d29, q13, #5
@ store row 7
vst1.8 {q14}, [r1], r3
@ Common exit: restore the callee-saved NEON and core registers and return.
end_func_plane:
vpop {d8-d15}
ldmfd sp!, {r4-r10, r12, pc}