mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-10 08:59:24 +01:00
695 lines
24 KiB
ArmAsm
695 lines
24 KiB
ArmAsm
@/******************************************************************************
@ *
@ * Copyright (C) 2015 The Android Open Source Project
@ *
@ * Licensed under the Apache License, Version 2.0 (the "License");
@ * you may not use this file except in compliance with the License.
@ * You may obtain a copy of the License at:
@ *
@ * http://www.apache.org/licenses/LICENSE-2.0
@ *
@ * Unless required by applicable law or agreed to in writing, software
@ * distributed under the License is distributed on an "AS IS" BASIS,
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@ * See the License for the specific language governing permissions and
@ * limitations under the License.
@ *
@ *****************************************************************************
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
@*/
@**
@*******************************************************************************
@* @file
@* ih264_resi_trans_quant_a9.s
@*
@* @brief
@* Contains function definitions for residual and forward trans
@*
@* @author
@* Ittiam
@*
@* @par List of Functions:
@* ih264_resi_trans_quant_4x4_a9
@* ih264_resi_trans_quant_8x8_a9
@* ih264_resi_trans_quant_chroma_4x4_a9
@* ih264_hadamard_quant_4x4_a9
@* ih264_hadamard_quant_2x2_uv_a9
@*
@* @remarks
@* None
@*
@*******************************************************************************


.text
.p2align 2

@ -----------------------------------------------------------------------------
@ ih264_resi_trans_quant_4x4_a9
@
@ Computes the residual (src - pred) of one 4x4 block, runs the 4x4 forward
@ transform on it (horizontal pass, transpose, vertical pass), quantizes the
@ 16 coefficients and counts how many of them are non-zero.
@
@ Register arguments
@   r0 : pointer to src buffer
@   r1 : pointer to pred buffer
@   r2 : pointer to dst buffer (16 signed 16-bit values, written contiguously)
@   r3 : source stride
@
@ Stack arguments (seven consecutive words, in the order they are read)
@   [sp, #0]  : pred stride
@   [sp, #4]  : pointer to scaling matrix (16 x 16-bit values)
@   [sp, #8]  : pointer to threshold matrix (loaded into r6, never used)
@   [sp, #12] : qbits
@   [sp, #16] : rounding factor
@   [sp, #20] : pointer that receives the nnz count (one byte)
@   [sp, #24] : pointer that receives the non-quantized DC value (16 bits)
@ No dst-stride word is read: the word that follows the pred stride is used
@ directly as the scaling-matrix pointer.
@
@ Per coefficient:  level = sign(c) * ((|c| * scale + round) >> qbits)
@
@ Returns      : nothing
@ Stack usage  : 104 bytes (10 core registers + d8-d15)
@ Clobbers     : r0, r1 (post-incremented), q0-q3, q8-q15.
@                r4-r12 and d8-d15 are saved on entry and restored on exit.
@ Memory note  : every row load fetches 8 bytes although only the first 4
@                pixels of each row take part in the computation.
@ -----------------------------------------------------------------------------

.global ih264_resi_trans_quant_4x4_a9
ih264_resi_trans_quant_4x4_a9:

    stmfd       sp!, {r4-r12, lr}       @ save core registers (10 words = 40 bytes)

    add         r12, sp, #40            @ r12 -> first stack-passed argument
    ldmia       r12, {r4-r10}           @ r4 = pred stride   r5 = scale matrix
                                        @ r6 = threshold mat r7 = qbits
                                        @ r8 = round factor  r9 = nnz pointer
                                        @ r10 = dc pointer

    vpush       {d8-d15}                @ d8-d15 are overwritten below

    rsb         r7, r7, #0              @ r7 = -qbits: VSHL by a negative count shifts right

    @ ---- load four rows of src/pred and form the 16-bit residual -----------
    vld1.8      {d30}, [r0], r3         @ src  row 0 (8 bytes fetched)
    vld1.8      {d31}, [r1], r4         @ pred row 0
    vld1.8      {d28}, [r0], r3         @ src  row 1
    vld1.8      {d29}, [r1], r4         @ pred row 1
    vld1.8      {d26}, [r0], r3         @ src  row 2
    vld1.8      {d27}, [r1], r4         @ pred row 2
    vsubl.u8    q0, d30, d31            @ d0 = residual row 0 (d1 is not used)
    vld1.8      {d24}, [r0], r3         @ src  row 3
    vld1.8      {d25}, [r1], r4         @ pred row 3
    vsubl.u8    q1, d28, d29            @ d2 = residual row 1
    vsubl.u8    q2, d26, d27            @ d4 = residual row 2
    vsubl.u8    q3, d24, d25            @ d6 = residual row 3

    @ ---- 4x4 transpose: d0/d2/d4/d6 become columns 0..3 --------------------
    vtrn.16     d0, d2
    vtrn.16     d4, d6
    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @ ---- horizontal pass (each lane handles one row) -----------------------
    vadd.i16    d8 , d0, d6             @ a = c0 + c3
    vadd.i16    d9 , d2, d4             @ b = c1 + c2
    vsub.i16    d10, d2, d4             @ c = c1 - c2
    vsub.i16    d11, d0, d6             @ d = c0 - c3

    vshl.i16    d12, d10, #1            @ 2c
    vshl.i16    d13, d11, #1            @ 2d

    vadd.i16    d14, d8, d9             @ out0 = a + b
    vsub.i16    d16, d8, d9             @ out2 = a - b
    vadd.i16    d15, d13, d10           @ out1 = 2d + c
    vsub.i16    d17, d11, d12           @ out3 = d - 2c

    @ ---- transpose back so the same butterfly runs down the columns --------
    vtrn.16     d14, d15
    vtrn.16     d16, d17
    vtrn.32     d14, d16
    vtrn.32     d15, d17

    @ ---- vertical pass: same butterfly as above ----------------------------
    vadd.i16    d18, d14, d17           @ a = r0 + r3
    vadd.i16    d19, d15, d16           @ b = r1 + r2
    vsub.i16    d20, d15, d16           @ c = r1 - r2
    vsub.i16    d21, d14, d17           @ d = r0 - r3

    vshl.i16    d22, d20, #1            @ 2c
    vshl.i16    d23, d21, #1            @ 2d

    vdup.32     q4, r8                  @ q4 = round factor in all four lanes

    vadd.i16    d24, d18, d19           @ coefficient row 0 = a + b
    vsub.i16    d26, d18, d19           @ coefficient row 2 = a - b
    vadd.i16    d25, d23, d20           @ coefficient row 1 = 2d + c
    vsub.i16    d27, d21, d22           @ coefficient row 3 = d - 2c
    vdup.32     q10, r7                 @ q10 = -qbits (d20/d21 are dead by now)

    vst1.16     {d24[0]}, [r10]         @ write the non-quantized DC coefficient

    @ ---- quantization: (|c| * scale + round) >> qbits, then restore sign ---
    vld1.16     {d28-d31}, [r5]         @ q14,q15 = 16 scaling values

    vabs.s16    q0, q12                 @ |rows 0,1|
    vabs.s16    q1, q13                 @ |rows 2,3|

    vmov        q5, q4                  @ one round-factor accumulator per row
    vmov        q6, q4
    vclt.s16    q2, q12, #0             @ sign mask rows 0,1 (all ones where negative)
    vmov        q7, q4
    vclt.s16    q3, q13, #0             @ sign mask rows 2,3

    vmlal.s16   q4, d0, d28             @ row 0: round + |c| * scale
    vmlal.s16   q5, d1, d29             @ row 1
    vmlal.s16   q6, d2, d30             @ row 2
    vmlal.s16   q7, d3, d31             @ row 3

    vshl.s32    q11, q4, q10            @ >> qbits, row 0
    vshl.s32    q12, q5, q10            @ >> qbits, row 1
    vshl.s32    q13, q6, q10            @ >> qbits, row 2
    vshl.s32    q14, q7, q10            @ >> qbits, row 3

    vmovn.i32   d30, q11                @ q15 = quantized magnitudes, rows 0,1
    vmovn.i32   d31, q12
    vmovn.i32   d0 , q13                @ q0  = quantized magnitudes, rows 2,3
    vmovn.i32   d1 , q14

    vneg.s16    q1, q15                 @ negated magnitudes, rows 0,1
    vneg.s16    q4, q0                  @ negated magnitudes, rows 2,3

    vceq.i16    q5, q15, #0             @ all ones where a level is zero, rows 0,1
    vceq.i16    q6, q0 , #0             @ same for rows 2,3

    vbsl        q2, q1, q15             @ q2 = signed levels, rows 0,1
    vbsl        q3, q4, q0              @ q3 = signed levels, rows 2,3

    @ ---- nnz = 16 - (number of zero levels) --------------------------------
    vmovn.i16   d14, q5                 @ 0xFF per zero level, rows 0,1
    vmovn.i16   d15, q6                 @ 0xFF per zero level, rows 2,3

    vshr.u8     q8, q7, #7              @ 0xFF -> 1: one count per zero level

    @ Pairwise reduction. From the second step on, the second source operand
    @ holds leftover data; it only feeds upper lanes, and lane 0 of d24 ends up
    @ as the sum of the 16 flags in d16/d17.
    vpadd.i8    d18, d16, d17
    vpadd.i8    d20, d18, d19
    vpadd.i8    d22, d20, d21
    vpadd.i8    d24, d22, d23
    vst1.16     {d4-d7}, [r2]           @ store the 16 quantized levels

    vmov.i8     d25, #16                @ number of coefficients in the block
    vsub.i8     d26, d25, d24           @ lane 0 = 16 - zero count

    vst1.8      {d26[0]}, [r9]          @ write nnz (one byte)

    vpop        {d8-d15}
    ldmfd       sp!, {r4-r12, pc}       @ restore core registers and return


@ -----------------------------------------------------------------------------
@ ih264_resi_trans_quant_chroma_4x4_a9
@
@ Residual computation, 4x4 forward transform and quantization for a 4x4
@ chroma block. Rows of src and pred are read with a 2-way de-interleaving
@ load and only the samples at even byte offsets are processed
@ (presumably one plane of interleaved chroma data; confirm against callers).
@
@ Register arguments
@   r0 : pointer to src buffer
@   r1 : pointer to pred buffer
@   r2 : pointer to dst buffer (16 signed 16-bit values, written contiguously)
@   r3 : source stride
@
@ Stack arguments (seven consecutive words, in the order they are read)
@   [sp, #0]  : pred stride
@   [sp, #4]  : pointer to scaling matrix (16 x 16-bit values)
@   [sp, #8]  : pointer to threshold matrix (loaded into r6, never used)
@   [sp, #12] : qbits
@   [sp, #16] : rounding factor
@   [sp, #20] : pointer that receives the nnz count (one byte)
@   [sp, #24] : pointer that receives the non-quantized DC value (16 bits)
@ No dst-stride word is read: the word that follows the pred stride is used
@ directly as the scaling-matrix pointer.
@
@ Per coefficient:  level = sign(c) * ((|c| * scale + round) >> qbits)
@
@ Returns      : nothing
@ Stack usage  : 104 bytes (10 core registers + d8-d15)
@ Clobbers     : r0, r1 (post-incremented), q0-q3, q8-q15.
@                r4-r12 and d8-d15 are saved on entry and restored on exit.
@ Memory note  : every row load fetches 16 bytes although only 4 samples
@                (even byte offsets 0, 2, 4, 6) of each row are used.
@ -----------------------------------------------------------------------------

.global ih264_resi_trans_quant_chroma_4x4_a9
ih264_resi_trans_quant_chroma_4x4_a9:

    stmfd       sp!, {r4-r12, lr}       @ save core registers (10 words = 40 bytes)

    add         r12, sp, #40            @ r12 -> first stack-passed argument
    ldmia       r12, {r4-r10}           @ r4 = pred stride   r5 = scale matrix
                                        @ r6 = threshold mat r7 = qbits
                                        @ r8 = round factor  r9 = nnz pointer
                                        @ r10 = dc pointer

    vpush       {d8-d15}                @ d8-d15 are overwritten below
    rsb         r7, r7, #0              @ r7 = -qbits: VSHL by a negative count shifts right

    @ ---- de-interleaving loads ---------------------------------------------
    @ Each VLD2 puts even bytes in its first register and odd bytes in its
    @ second. The pred load deliberately targets the src load's second
    @ register, so after each pair: first reg = src even bytes, second reg =
    @ pred even bytes, and the odd bytes are discarded.
    vld2.8      {d10, d11}, [r0], r3    @ src  row 0: d10 = even bytes
    vld2.8      {d11, d12}, [r1], r4    @ pred row 0: d11 = even bytes
    vld2.8      {d28, d29}, [r0], r3    @ src  row 1: d28 = even bytes
    vld2.8      {d29, d30}, [r1], r4    @ pred row 1: d29 = even bytes
    vld2.8      {d25, d26}, [r0], r3    @ src  row 2: d25 = even bytes
    vld2.8      {d26, d27}, [r1], r4    @ pred row 2: d26 = even bytes
    vsubl.u8    q0, d10, d11            @ d0 = residual row 0 (d1 is not used)
    vld2.8      {d22, d23}, [r0], r3    @ src  row 3: d22 = even bytes
    vld2.8      {d23, d24}, [r1], r4    @ pred row 3: d23 = even bytes
    vsubl.u8    q1, d28, d29            @ d2 = residual row 1
    vsubl.u8    q2, d25, d26            @ d4 = residual row 2
    vsubl.u8    q3, d22, d23            @ d6 = residual row 3

    @ ---- 4x4 transpose: d0/d2/d4/d6 become columns 0..3 --------------------
    vtrn.16     d0, d2
    vtrn.16     d4, d6
    vtrn.32     d0, d4
    vtrn.32     d2, d6

    @ ---- horizontal pass (each lane handles one row) -----------------------
    vadd.i16    d8 , d0, d6             @ a = c0 + c3
    vadd.i16    d9 , d2, d4             @ b = c1 + c2
    vsub.i16    d10, d2, d4             @ c = c1 - c2
    vsub.i16    d11, d0, d6             @ d = c0 - c3

    vshl.i16    d12, d10, #1            @ 2c
    vshl.i16    d13, d11, #1            @ 2d

    vadd.i16    d14, d8, d9             @ out0 = a + b
    vsub.i16    d16, d8, d9             @ out2 = a - b
    vadd.i16    d15, d13, d10           @ out1 = 2d + c
    vsub.i16    d17, d11, d12           @ out3 = d - 2c

    @ ---- transpose back so the same butterfly runs down the columns --------
    vtrn.16     d14, d15
    vtrn.16     d16, d17
    vtrn.32     d14, d16
    vtrn.32     d15, d17

    @ ---- vertical pass: same butterfly as above ----------------------------
    vadd.i16    d18, d14, d17           @ a = r0 + r3
    vadd.i16    d19, d15, d16           @ b = r1 + r2
    vsub.i16    d20, d15, d16           @ c = r1 - r2
    vsub.i16    d21, d14, d17           @ d = r0 - r3

    vshl.i16    d22, d20, #1            @ 2c
    vshl.i16    d23, d21, #1            @ 2d

    vdup.32     q4, r8                  @ q4 = round factor in all four lanes

    vadd.i16    d24, d18, d19           @ coefficient row 0 = a + b
    vsub.i16    d26, d18, d19           @ coefficient row 2 = a - b
    vadd.i16    d25, d23, d20           @ coefficient row 1 = 2d + c
    vsub.i16    d27, d21, d22           @ coefficient row 3 = d - 2c
    vdup.32     q10, r7                 @ q10 = -qbits (d20/d21 are dead by now)

    vst1.16     {d24[0]}, [r10]         @ write the non-quantized DC coefficient

    @ ---- quantization: (|c| * scale + round) >> qbits, then restore sign ---
    vld1.16     {d28-d31}, [r5]         @ q14,q15 = 16 scaling values

    vabs.s16    q0, q12                 @ |rows 0,1|
    vabs.s16    q1, q13                 @ |rows 2,3|

    vmov        q5, q4                  @ one round-factor accumulator per row
    vmov        q6, q4
    vclt.s16    q2, q12, #0             @ sign mask rows 0,1 (all ones where negative)
    vmov        q7, q4
    vclt.s16    q3, q13, #0             @ sign mask rows 2,3

    vmlal.s16   q4, d0, d28             @ row 0: round + |c| * scale
    vmlal.s16   q5, d1, d29             @ row 1
    vmlal.s16   q6, d2, d30             @ row 2
    vmlal.s16   q7, d3, d31             @ row 3

    vshl.s32    q11, q4, q10            @ >> qbits, row 0
    vshl.s32    q12, q5, q10            @ >> qbits, row 1
    vshl.s32    q13, q6, q10            @ >> qbits, row 2
    vshl.s32    q14, q7, q10            @ >> qbits, row 3

    vmovn.i32   d30, q11                @ q15 = quantized magnitudes, rows 0,1
    vmovn.i32   d31, q12
    vmovn.i32   d0 , q13                @ q0  = quantized magnitudes, rows 2,3
    vmovn.i32   d1 , q14

    vneg.s16    q1, q15                 @ negated magnitudes, rows 0,1
    vneg.s16    q4, q0                  @ negated magnitudes, rows 2,3

    vceq.i16    q5, q15, #0             @ all ones where a level is zero, rows 0,1
    vceq.i16    q6, q0 , #0             @ same for rows 2,3

    vbsl        q2, q1, q15             @ q2 = signed levels, rows 0,1
    vbsl        q3, q4, q0              @ q3 = signed levels, rows 2,3

    @ ---- nnz = 16 - (number of zero levels) --------------------------------
    vmovn.i16   d14, q5                 @ 0xFF per zero level, rows 0,1
    vmovn.i16   d15, q6                 @ 0xFF per zero level, rows 2,3

    vshr.u8     q8, q7, #7              @ 0xFF -> 1: one count per zero level

    @ Pairwise reduction. From the second step on, the second source operand
    @ holds leftover data; it only feeds upper lanes, and lane 0 of d24 ends up
    @ as the sum of the 16 flags in d16/d17.
    vpadd.i8    d18, d16, d17
    vpadd.i8    d20, d18, d19
    vpadd.i8    d22, d20, d21
    vpadd.i8    d24, d22, d23
    vst1.16     {d4-d7}, [r2]           @ store the 16 quantized levels

    vmov.i8     d25, #16                @ number of coefficients in the block
    vsub.i8     d26, d25, d24           @ lane 0 = 16 - zero count

    vst1.8      {d26[0]}, [r9]          @ write nnz (one byte)

    vpop        {d8-d15}
    ldmfd       sp!, {r4-r12, pc}       @ restore core registers and return


@ -----------------------------------------------------------------------------
@ ih264_hadamard_quant_4x4_a9
@
@ Forward 4x4 Hadamard transform followed by quantization, for the luma DC
@ block. Every output is quantized with the first scaling-matrix entry only.
@
@ ih264_hadamard_quant_4x4_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                             const UWORD16 *pu2_scale_matrix,
@                             const UWORD16 *pu2_threshold_matrix,
@                             UWORD32 u4_qbits, UWORD32 u4_round_factor,
@                             UWORD8 *pu1_nnz)
@
@ Register arguments
@   r0 : pi2_src  (16 contiguous 16-bit values)
@   r1 : pi2_dst  (16 contiguous 16-bit values)
@   r2 : pu2_scale_matrix (only element 0 is read)
@   r3 : pu2_threshold_matrix (never read; r3 is reused as a scratch pointer)
@ Stack arguments
@   [sp, #0] : u4_qbits
@   [sp, #4] : u4_round_factor
@   [sp, #8] : pu1_nnz (receives one byte)
@
@ Per value:  v     = transform output >> 1
@             level = sign(v) * ((|v| * scale[0] + round) >> qbits)
@
@ Returns      : nothing
@ Stack usage  : 64 bytes (d8-d15)
@ Clobbers     : r0, r1 (post-incremented), r3, q0-q3, q8-q15.
@                d8-d15 are saved on entry and restored on exit.
@ -----------------------------------------------------------------------------

.global ih264_hadamard_quant_4x4_a9
ih264_hadamard_quant_4x4_a9:

    vld4.16     {d0-d3}, [r0]!          @ de-interleave by 4: dN = elements N, N+4, N+8, N+12
    vpush       {d8-d15}                @ 64 bytes: stack arguments now start at sp + 64

    vld1.16     {d30[0]}, [r2]          @ d30[0] = pu2_scale_matrix[0]

    @ ---- first butterfly, widened to 32 bits -------------------------------
    vaddl.s16   q3, d0, d3              @ x0 = e0 + e3
    vaddl.s16   q4, d1, d2              @ x1 = e1 + e2
    vsubl.s16   q5, d1, d2              @ x2 = e1 - e2
    vsubl.s16   q6, d0, d3              @ x3 = e0 - e3

    vdup.16     d30, d30[0]             @ broadcast scale[0] to all four lanes

    vadd.i32    q7, q3, q4              @ x0 + x1
    vadd.i32    q8, q6, q5              @ x3 + x2
    add         r3, sp, #68             @ r3 -> u4_round_factor
    vsub.i32    q9, q3, q4              @ x0 - x1
    vsub.i32    q10, q6, q5             @ x3 - x2

    @ ---- 4x4 transpose of the 32-bit intermediates in q7..q10 --------------
    vtrn.32     q7, q8
    vtrn.32     q9, q10
    vld1.32     {d0[0]}, [r3]           @ d0[0] = u4_round_factor
    vswp        d15, d18
    vswp        d17, d20

    @ ---- second butterfly --------------------------------------------------
    add         r3, sp, #64             @ r3 -> u4_qbits
    vadd.i32    q11, q7, q10            @ x0 = t0 + t3
    vadd.i32    q12, q8, q9             @ x1 = t1 + t2
    vld1.32     {d31[0]}, [r3]          @ d31[0] = u4_qbits
    vsub.i32    q13, q8, q9             @ x2 = t1 - t2
    vsub.i32    q14, q7, q10            @ x3 = t0 - t3

    vdup.32     q7, d0[0]               @ q7 = round factor in every lane

    vadd.i32    q0, q11, q12            @ x0 + x1
    vadd.i32    q1, q14, q13            @ x3 + x2
    vsub.i32    q2, q11, q12            @ x0 - x1
    vsub.i32    q3, q14, q13            @ x3 - x2

    vdup.32     q11, d31[0]             @ q11 = qbits in every lane

    vshrn.i32   d0, q0, #1              @ (x0 + x1) >> 1, narrowed to 16 bits
    vshrn.i32   d1, q1, #1              @ (x3 + x2) >> 1
    vshrn.i32   d2, q2, #1              @ (x0 - x1) >> 1
    vshrn.i32   d3, q3, #1              @ (x3 - x2) >> 1

    @ ---- quantization ------------------------------------------------------
    vabs.s16    q5, q0                  @ magnitudes, values 0..7
    vabs.s16    q6, q1                  @ magnitudes, values 8..15

    vmov        q8, q7                  @ one round-factor accumulator per group of 4
    vmov        q9, q7
    vmov        q10, q7

    vclt.s16    q3, q0, #0              @ sign mask, values 0..7 (all ones where negative)
    vclt.s16    q4, q1, #0              @ sign mask, values 8..15

    vneg.s32    q11, q11                @ q11 = -qbits: VSHL by it shifts right

    vmlal.u16   q7, d10, d30            @ round + |v| * scale[0]
    vmlal.u16   q8, d11, d30
    vmlal.u16   q9, d12, d30
    vmlal.u16   q10, d13, d30

    vshl.u32    q7, q7, q11             @ >> qbits
    vshl.u32    q8, q8, q11
    vshl.u32    q9, q9, q11
    vshl.u32    q10, q10, q11

    vqmovn.u32  d22, q7                 @ q11 = quantized magnitudes, values 0..7
    vqmovn.u32  d23, q8
    vqmovn.u32  d24, q9                 @ q12 = quantized magnitudes, values 8..15
    vqmovn.u32  d25, q10

    vneg.s16    q13, q11                @ negated magnitudes
    vneg.s16    q14, q12

    vbsl        q3, q13, q11            @ q3 = signed levels, values 0..7
    vbsl        q4, q14, q12            @ q4 = signed levels, values 8..15

    vceq.i16    q5, q11, #0             @ all ones where a level is zero
    vceq.i16    q6, q12, #0

    vst1.16     {d6, d7}, [r1]!         @ store levels 0..7

    @ ---- nnz = 16 - (number of zero levels) --------------------------------
    vshrn.i16   d14, q5, #8             @ 0xFF per zero level, values 0..7
    vshrn.i16   d15, q6, #8             @ 0xFF per zero level, values 8..15

    ldr         r3, [sp, #72]           @ r3 = pu1_nnz

    vshr.u8     q7, q7, #7              @ 0xFF -> 1: one count per zero level

    vst1.16     {d8, d9}, [r1]!         @ store levels 8..15

    vadd.i8     d16, d14, d15           @ 8 partial zero counts
    vmov.i8     d20, #16                @ number of coefficients in the block
    vpadd.i8    d17, d16, d16           @ 4 partial counts in lanes 0..3
    vpadd.i8    d18, d17, d17           @ 2 partial counts in lanes 0..1
    vpadd.i8    d19, d18, d18           @ lane 0 = total zero count
    vsub.i8     d20, d20, d19           @ lane 0 = 16 - zero count
    vst1.8      {d20[0]}, [r3]          @ write nnz (one byte)

    vpop        {d8-d15}
    bx          lr


@ -----------------------------------------------------------------------------
@ ih264_hadamard_quant_2x2_uv_a9
@
@ Forward 2x2 Hadamard transform followed by quantization for the chroma DC
@ values of both planes. The source holds eight 16-bit values: the first four
@ form one 2x2 block and the last four form the other. Each block yields four
@ levels and its own nnz byte.
@
@ ih264_hadamard_quant_2x2_uv_a9(WORD16 *pi2_src, WORD16 *pi2_dst,
@                                const UWORD16 *pu2_scale_matrix,
@                                const UWORD16 *pu2_threshold_matrix,
@                                UWORD32 u4_qbits, UWORD32 u4_round_factor,
@                                UWORD8 *pu1_nnz)
@
@ Register arguments
@   r0 : pi2_src  (8 contiguous 16-bit values; r0 is later reused as scratch)
@   r1 : pi2_dst  (8 contiguous 16-bit values: block 0 levels, then block 1)
@   r2 : pu2_scale_matrix (only element 0 is read)
@   r3 : pu2_threshold_matrix (never read; r3 is reused as a scratch pointer)
@ Stack arguments
@   [sp, #0] : u4_qbits
@   [sp, #4] : u4_round_factor
@   [sp, #8] : pu1_nnz (receives two bytes: nnz of block 0, nnz of block 1)
@
@ Per value:  level = sign(v) * ((|v| * scale[0] + round) >> qbits)
@
@ Returns      : nothing
@ Stack usage  : 64 bytes (d8-d15)
@ Clobbers     : r0, r1 (post-incremented), r3, q0-q3, d17, d20, q12-q15.
@                d8-d15 are saved on entry and restored on exit.
@ -----------------------------------------------------------------------------

.global ih264_hadamard_quant_2x2_uv_a9
ih264_hadamard_quant_2x2_uv_a9:

    vpush       {d8-d15}                @ 64 bytes: stack arguments now start at sp + 64
    vld2.16     {d0-d1}, [r0]           @ d0 = s0,s2,s4,s6   d1 = s1,s3,s5,s7

    add         r3, sp, #68             @ r3 -> u4_round_factor

    @ ---- first butterfly, widened to 32 bits -------------------------------
    vaddl.s16   q3, d0, d1              @ sums  of adjacent pairs
    vld1.16     {d30[0]}, [r2]          @ d30[0] = pu2_scale_matrix[0]
    vsubl.s16   q4, d0, d1              @ diffs of adjacent pairs

    add         r0, sp, #64             @ r0 -> u4_qbits
    vld1.32     {d28[0]}, [r3]          @ d28[0] = u4_round_factor
    vtrn.32     q3, q4                  @ q3 = sum/diff of pairs 0 and 2, q4 = of pairs 1 and 3

    @ ---- second butterfly --------------------------------------------------
    vadd.i32    q0, q3, q4              @ outputs 0,1 of block 0 | outputs 0,1 of block 1
    vld1.32     {d24[0]}, [r0]          @ d24[0] = u4_qbits
    vsub.i32    q1, q3, q4              @ outputs 2,3 of block 0 | outputs 2,3 of block 1

    vdup.16     d30, d30[0]             @ broadcast scale[0]

    @ ---- quantization ------------------------------------------------------
    vabs.s32    q2, q0                  @ magnitudes
    vabs.s32    q3, q1

    vdup.32     q14, d28[0]             @ q14 = round factor in every lane

    vmovl.u16   q15, d30                @ scale[0] widened to 32 bits

    vclt.s32    q4, q0, #0              @ sign mask (all ones where negative)
    vdup.32     q12, d24[0]             @ q12 = qbits in every lane
    vclt.s32    q5, q1, #0

    @ Narrow the 32-bit sign masks into q4 as 16-bit masks. The unsigned and
    @ the signed saturating narrow both turn an all-ones lane into 0xFFFF and
    @ leave a zero lane at zero.
    vqmovn.u32  d8, q4
    vqmovn.s32  d9, q5

    vmov        q13, q14                @ second round-factor accumulator
    vneg.s32    q12, q12                @ q12 = -qbits: VSHL by it shifts right

    vmla.i32    q13, q2, q15            @ round + |v| * scale[0]
    vmla.i32    q14, q3, q15

    vshl.u32    q13, q13, q12           @ >> qbits
    vshl.u32    q14, q14, q12           @ >> qbits

    vqmovn.u32  d10, q13                @ q5 = quantized magnitudes
    vqmovn.u32  d11, q14

    vneg.s16    q6, q5                  @ negated magnitudes

    vbsl        q4, q6, q5              @ q4 = signed levels

    vtrn.32     d8, d9                  @ regroup: d8 = 4 levels of block 0, d9 = 4 levels of block 1

    @ ---- nnz per block = 4 - (number of zero levels) -----------------------
    vceq.i16    q7, q4, #0              @ all ones where a level is zero

    vshrn.i16   d14, q7, #8             @ 0xFF per zero level

    ldr         r3, [sp, #72]           @ r3 = pu1_nnz
    vshr.u8     d14, d14, #7            @ 0xFF -> 1: one count per zero level
    vmov.i8     d20, #4                 @ number of levels per block
    vpadd.i8    d17, d14, d14           @ partial zero counts

    vst1.16     {d8, d9}, [r1]!         @ store the 8 levels

    vpadd.i8    d17, d17, d17           @ lane 0 = zeros in block 0, lane 1 = zeros in block 1
    vsub.i8     d20, d20, d17           @ 4 - zero count
    vst1.16     {d20[0]}, [r3]          @ write both nnz bytes with one 16-bit store

    vpop        {d8-d15}
    bx          lr