mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-23 07:11:12 +01:00
643 lines
29 KiB
ArmAsm
643 lines
29 KiB
ArmAsm
@/******************************************************************************
|
|
@ *
|
|
@ * Copyright (C) 2015 The Android Open Source Project
|
|
@ *
|
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
|
@ * you may not use this file except in compliance with the License.
|
|
@ * You may obtain a copy of the License at:
|
|
@ *
|
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
|
@ *
|
|
@ * Unless required by applicable law or agreed to in writing, software
|
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
@ * See the License for the specific language governing permissions and
|
|
@ * limitations under the License.
|
|
@ *
|
|
@ *****************************************************************************
|
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
@*/
|
|
@**
|
|
@******************************************************************************
|
|
@* @file
|
|
@* ih264_weighted_bi_pred_a9q.s
|
|
@*
|
|
@* @brief
|
|
@* Contains function definitions for weighted biprediction.
|
|
@*
|
|
@* @author
|
|
@* Kaushik Senthoor R
|
|
@*
|
|
@* @par List of Functions:
|
|
@*
|
|
@* - ih264_weighted_bi_pred_luma_a9q()
|
|
@* - ih264_weighted_bi_pred_chroma_a9q()
|
|
@*
|
|
@* @remarks
|
|
@* None
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
@*******************************************************************************
|
|
@* @function
|
|
@* ih264_weighted_bi_pred_luma_a9q()
|
|
@*
|
|
@* @brief
|
|
@* This routine performs the weighted biprediction as described in sec
|
|
@* 8.4.2.3.2 titled "Weighted sample prediction process" for luma.
|
|
@*
|
|
@* @par Description:
|
|
@* This function gets two ht x wd blocks, calculates the weighted samples,
|
|
@* rounds off, adds offset and stores it in the destination block.
|
|
@*
|
|
@* @param[in] pu1_src1
|
|
@* UWORD8 Pointer to the buffer containing the input block 1.
|
|
@*
|
|
@* @param[in] pu1_src2
|
|
@* UWORD8 Pointer to the buffer containing the input block 2.
|
|
@*
|
|
@* @param[out] pu1_dst
|
|
@* UWORD8 pointer to the destination where the output block is stored.
|
|
@*
|
|
@* @param[in] src_strd1
|
|
@* Stride of the input buffer 1
|
|
@*
|
|
@* @param[in] src_strd2
|
|
@* Stride of the input buffer 2
|
|
@*
|
|
@* @param[in] dst_strd
|
|
@* Stride of the destination buffer
|
|
@*
|
|
@* @param[in] log_wd
|
|
@* number of bits to be rounded off
|
|
@*
|
|
@* @param[in] wt1
|
|
@* weight for the weighted prediction
|
|
@*
|
|
@* @param[in] wt2
|
|
@* weight for the weighted prediction
|
|
@*
|
|
@* @param[in] ofst1
|
|
@* offset 1 used after rounding off
|
|
@*
|
|
@* @param[in] ofst2
|
|
@* offset 2 used after rounding off
|
|
@*
|
|
@* @param[in] ht
|
|
@* integer height of the array
|
|
@*
|
|
@* @param[in] wd
|
|
@* integer width of the array
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* (ht,wd) can be (4,4), (4,8), (8,4), (8,8), (8,16), (16,8) or (16,16).
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
@void ih264_weighted_bi_pred_luma_a9q(UWORD8 *pu1_src1,
|
|
@ UWORD8 *pu1_src2,
|
|
@ UWORD8 *pu1_dst,
|
|
@ WORD32 src_strd1,
|
|
@ WORD32 src_strd2,
|
|
@ WORD32 dst_strd,
|
|
@ WORD32 log_wd,
|
|
@ WORD32 wt1,
|
|
@ WORD32 wt2,
|
|
@ WORD32 ofst1,
|
|
@ WORD32 ofst2,
|
|
@ WORD32 ht,
|
|
@ WORD32 wd)
|
|
@
|
|
@**************Variables Vs Registers*****************************************
|
|
@ r0 => pu1_src1
|
|
@ r1 => pu1_src2
|
|
@ r2 => pu1_dst
|
|
@ r3 => src_strd1
|
|
@ [sp] => src_strd2 (r4)
|
|
@ [sp+4] => dst_strd (r5)
|
|
@ [sp+8] => log_wd (r6)
|
|
@ [sp+12] => wt1 (r7)
|
|
@ [sp+16] => wt2 (r8)
|
|
@ [sp+20] => ofst1 (r9)
|
|
@ [sp+24] => ofst2 (r10)
|
|
@ [sp+28] => ht (r11)
|
|
@ [sp+32] => wd (r12)
|
|
@
|
|
.text
|
|
.p2align 2
|
|
|
|
.global ih264_weighted_bi_pred_luma_a9q
|
|
|
|
ih264_weighted_bi_pred_luma_a9q:
|
|
|
|
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
|
|
ldr r6, [sp, #48] @Load log_wd in r6
|
|
ldr r7, [sp, #52] @Load wt1 in r7
|
|
ldr r8, [sp, #56] @Load wt2 in r8
|
|
ldr r9, [sp, #60] @Load ofst1 in r9
|
|
|
|
add r6, r6, #1 @r6 = log_wd + 1
|
|
sxtb r7, r7 @sign-extend 16-bit wt1 to 32-bit
|
|
ldr r4, [sp, #40] @Load src_strd2 in r4
|
|
ldr r5, [sp, #44] @Load dst_strd in r5
|
|
sxtb r9, r9 @sign-extend 8-bit ofst1 to 32-bit
|
|
neg r10, r6 @r10 = -(log_wd + 1)
|
|
ldr r11, [sp, #68] @Load ht in r11
|
|
ldr r12, [sp, #72] @Load wd in r12
|
|
vdup.16 q0, r10 @Q0 = -(log_wd + 1) (32-bit)
|
|
add r9, r9, #1 @r9 = ofst1 + 1
|
|
|
|
ldr r10, [sp, #64] @Load ofst2 in r10
|
|
sxtb r8, r8 @sign-extend 16-bit wt2 to 32-bit
|
|
cmp r12, #16 @check if wd is 16
|
|
vpush {d8-d15}
|
|
sxtb r10, r10 @sign-extend 8-bit ofst2 to 32-bit
|
|
add r9, r9, r10 @r9 = ofst1 + ofst2 + 1
|
|
vmov d2, r7, r8 @D2 = {wt1(32-bit), wt2(32-bit)}
|
|
asr r9, r9, #1 @r9 = ofst = (ofst1 + ofst2 + 1) >> 1
|
|
vdup.8 d3, r9 @D3 = ofst (8-bit)
|
|
beq loop_16 @branch if wd is 16
|
|
|
|
cmp r12, #8 @check if wd is 8
|
|
beq loop_8 @branch if wd is 8
|
|
|
|
loop_4: @each iteration processes four rows
|
|
|
|
vld1.32 d4[0], [r0], r3 @load row 1 in source 1
|
|
vld1.32 d4[1], [r0], r3 @load row 2 in source 1
|
|
vld1.32 d6[0], [r1], r4 @load row 1 in source 2
|
|
vld1.32 d6[1], [r1], r4 @load row 2 in source 2
|
|
|
|
vmovl.u8 q2, d4 @converting rows 1,2 in source 1 to 16-bit
|
|
vld1.32 d8[0], [r0], r3 @load row 3 in source 1
|
|
vld1.32 d8[1], [r0], r3 @load row 4 in source 1
|
|
vmovl.u8 q3, d6 @converting rows 1,2 in source 2 to 16-bit
|
|
vld1.32 d10[0], [r1], r4 @load row 3 in source 2
|
|
vld1.32 d10[1], [r1], r4 @load row 4 in source 2
|
|
|
|
vmovl.u8 q4, d8 @converting rows 3,4 in source 1 to 16-bit
|
|
vmovl.u8 q5, d10 @converting rows 3,4 in source 2 to 16-bit
|
|
|
|
vmul.s16 q2, q2, d2[0] @weight 1 mult. for rows 1,2
|
|
vmla.s16 q2, q3, d2[2] @weight 2 mult. for rows 1,2
|
|
vmul.s16 q4, q4, d2[0] @weight 1 mult. for rows 3,4
|
|
vmla.s16 q4, q5, d2[2] @weight 2 mult. for rows 3,4
|
|
|
|
subs r11, r11, #4 @decrement ht by 4
|
|
vrshl.s16 q2, q2, q0 @rounds off the weighted samples from rows 1,2
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 3,4
|
|
|
|
vaddw.s8 q2, q2, d3 @adding offset for rows 1,2
|
|
vaddw.s8 q4, q4, d3 @adding offset for rows 3,4
|
|
|
|
vqmovun.s16 d4, q2 @saturating rows 1,2 to unsigned 8-bit
|
|
vqmovun.s16 d8, q4 @saturating rows 3,4 to unsigned 8-bit
|
|
|
|
vst1.32 d4[0], [r2], r5 @store row 1 in destination
|
|
vst1.32 d4[1], [r2], r5 @store row 2 in destination
|
|
vst1.32 d8[0], [r2], r5 @store row 3 in destination
|
|
vst1.32 d8[1], [r2], r5 @store row 4 in destination
|
|
|
|
bgt loop_4 @if greater than 0 repeat the loop again
|
|
|
|
b end_loops
|
|
|
|
loop_8: @each iteration processes four rows
|
|
|
|
vld1.8 d4, [r0], r3 @load row 1 in source 1
|
|
vld1.8 d6, [r1], r4 @load row 1 in source 2
|
|
vld1.8 d8, [r0], r3 @load row 2 in source 1
|
|
vld1.8 d10, [r1], r4 @load row 2 in source 2
|
|
vmovl.u8 q2, d4 @converting row 1 in source 1 to 16-bit
|
|
vld1.8 d12, [r0], r3 @load row 3 in source 1
|
|
vld1.8 d14, [r1], r4 @load row 3 in source 2
|
|
vmovl.u8 q3, d6 @converting row 1 in source 2 to 16-bit
|
|
vld1.8 d16, [r0], r3 @load row 4 in source 1
|
|
vld1.8 d18, [r1], r4 @load row 4 in source 2
|
|
|
|
vmovl.u8 q4, d8 @converting row 2 in source 1 to 16-bit
|
|
vmovl.u8 q5, d10 @converting row 2 in source 2 to 16-bit
|
|
|
|
vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1
|
|
vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1
|
|
vmovl.u8 q6, d12 @converting row 3 in source 1 to 16-bit
|
|
vmovl.u8 q7, d14 @converting row 3 in source 2 to 16-bit
|
|
vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2
|
|
vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2
|
|
vmovl.u8 q8, d16 @converting row 4 in source 1 to 16-bit
|
|
vmovl.u8 q9, d18 @converting row 4 in source 2 to 16-bit
|
|
|
|
vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3
|
|
vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3
|
|
vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4
|
|
vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4
|
|
|
|
vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2
|
|
vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3
|
|
vaddw.s8 q2, q2, d3 @adding offset for row 1
|
|
vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4
|
|
vaddw.s8 q4, q4, d3 @adding offset for row 2
|
|
|
|
vaddw.s8 q6, q6, d3 @adding offset for row 3
|
|
vqmovun.s16 d4, q2 @saturating row 1 to unsigned 8-bit
|
|
vaddw.s8 q8, q8, d3 @adding offset for row 4
|
|
vqmovun.s16 d8, q4 @saturating row 2 to unsigned 8-bit
|
|
|
|
vqmovun.s16 d12, q6 @saturating row 3 to unsigned 8-bit
|
|
vqmovun.s16 d16, q8 @saturating row 4 to unsigned 8-bit
|
|
|
|
vst1.8 d4, [r2], r5 @store row 1 in destination
|
|
vst1.8 d8, [r2], r5 @store row 2 in destination
|
|
subs r11, r11, #4 @decrement ht by 4
|
|
vst1.8 d12, [r2], r5 @store row 3 in destination
|
|
vst1.8 d16, [r2], r5 @store row 4 in destination
|
|
|
|
bgt loop_8 @if greater than 0 repeat the loop again
|
|
|
|
b end_loops
|
|
|
|
loop_16: @each iteration processes two rows
|
|
|
|
vld1.8 {q2}, [r0], r3 @load row 1 in source 1
|
|
vld1.8 {q3}, [r1], r4 @load row 1 in source 2
|
|
vld1.8 {q4}, [r0], r3 @load row 2 in source 1
|
|
vld1.8 {q5}, [r1], r4 @load row 2 in source 2
|
|
vmovl.u8 q10, d4 @converting row 1L in source 1 to 16-bit
|
|
vld1.8 {q6}, [r0], r3 @load row 3 in source 1
|
|
vld1.8 {q7}, [r1], r4 @load row 3 in source 2
|
|
vmovl.u8 q11, d6 @converting row 1L in source 2 to 16-bit
|
|
vld1.8 {q8}, [r0], r3 @load row 4 in source 1
|
|
vld1.8 {q9}, [r1], r4 @load row 4 in source 2
|
|
|
|
vmovl.u8 q2, d5 @converting row 1H in source 1 to 16-bit
|
|
vmovl.u8 q3, d7 @converting row 1H in source 2 to 16-bit
|
|
|
|
vmul.s16 q10, q10, d2[0] @weight 1 mult. for row 1L
|
|
vmla.s16 q10, q11, d2[2] @weight 2 mult. for row 1L
|
|
vmovl.u8 q12, d8 @converting row 2L in source 1 to 16-bit
|
|
vmovl.u8 q13, d10 @converting row 2L in source 2 to 16-bit
|
|
|
|
vmul.s16 q2, q2, d2[0] @weight 1 mult. for row 1H
|
|
vmla.s16 q2, q3, d2[2] @weight 2 mult. for row 1H
|
|
vmovl.u8 q4, d9 @converting row 2H in source 1 to 16-bit
|
|
vmovl.u8 q5, d11 @converting row 2H in source 2 to 16-bit
|
|
|
|
vmul.s16 q12, q12, d2[0] @weight 1 mult. for row 2L
|
|
vmla.s16 q12, q13, d2[2] @weight 2 mult. for row 2L
|
|
vmovl.u8 q14, d12 @converting row 3L in source 1 to 16-bit
|
|
vmovl.u8 q15, d14 @converting row 3L in source 2 to 16-bit
|
|
|
|
vmul.s16 q4, q4, d2[0] @weight 1 mult. for row 2H
|
|
vmla.s16 q4, q5, d2[2] @weight 2 mult. for row 2H
|
|
vmovl.u8 q6, d13 @converting row 3H in source 1 to 16-bit
|
|
vmovl.u8 q7, d15 @converting row 3H in source 2 to 16-bit
|
|
|
|
vmul.s16 q14, q14, d2[0] @weight 1 mult. for row 3L
|
|
vmla.s16 q14, q15, d2[2] @weight 2 mult. for row 3L
|
|
vmovl.u8 q11, d16 @converting row 4L in source 1 to 16-bit
|
|
vmovl.u8 q3, d18 @converting row 4L in source 2 to 16-bit
|
|
|
|
vmul.s16 q6, q6, d2[0] @weight 1 mult. for row 3H
|
|
vmla.s16 q6, q7, d2[2] @weight 2 mult. for row 3H
|
|
vmovl.u8 q8, d17 @converting row 4H in source 1 to 16-bit
|
|
vmovl.u8 q9, d19 @converting row 4H in source 2 to 16-bit
|
|
|
|
vmul.s16 q11, q11, d2[0] @weight 1 mult. for row 4L
|
|
vmla.s16 q11, q3, d2[2] @weight 2 mult. for row 4L
|
|
vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 1L
|
|
|
|
vmul.s16 q8, q8, d2[0] @weight 1 mult. for row 4H
|
|
vmla.s16 q8, q9, d2[2] @weight 2 mult. for row 4H
|
|
vrshl.s16 q2, q2, q0 @rounds off the weighted samples from row 1H
|
|
|
|
vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 2L
|
|
vaddw.s8 q10, q10, d3 @adding offset for row 1L
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 2H
|
|
vaddw.s8 q2, q2, d3 @adding offset for row 1H
|
|
vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 3L
|
|
vaddw.s8 q12, q12, d3 @adding offset for row 2L
|
|
vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 3H
|
|
vaddw.s8 q4, q4, d3 @adding offset for row 2H
|
|
vrshl.s16 q11, q11, q0 @rounds off the weighted samples from row 4L
|
|
vaddw.s8 q14, q14, d3 @adding offset for row 3L
|
|
vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 4H
|
|
vaddw.s8 q6, q6, d3 @adding offset for row 3H
|
|
|
|
vqmovun.s16 d26, q10 @saturating row 1L to unsigned 8-bit
|
|
vaddw.s8 q11, q11, d3 @adding offset for row 4L
|
|
vqmovun.s16 d27, q2 @saturating row 1H to unsigned 8-bit
|
|
vaddw.s8 q8, q8, d3 @adding offset for row 4H
|
|
|
|
vqmovun.s16 d10, q12 @saturating row 2L to unsigned 8-bit
|
|
vqmovun.s16 d11, q4 @saturating row 2H to unsigned 8-bit
|
|
vqmovun.s16 d30, q14 @saturating row 3L to unsigned 8-bit
|
|
vqmovun.s16 d31, q6 @saturating row 3H to unsigned 8-bit
|
|
vst1.8 {q13}, [r2], r5 @store row 1 in destination
|
|
vqmovun.s16 d14, q11 @saturating row 4L to unsigned 8-bit
|
|
vqmovun.s16 d15, q8 @saturating row 4H to unsigned 8-bit
|
|
|
|
vst1.8 {q5}, [r2], r5 @store row 2 in destination
|
|
subs r11, r11, #4 @decrement ht by 4
|
|
vst1.8 {q15}, [r2], r5 @store row 3 in destination
|
|
vst1.8 {q7}, [r2], r5 @store row 4 in destination
|
|
|
|
bgt loop_16 @if greater than 0 repeat the loop again
|
|
|
|
end_loops:
|
|
|
|
vpop {d8-d15}
|
|
ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
|
|
|
|
|
|
@*******************************************************************************
|
|
@* @function
|
|
@* ih264_weighted_bi_pred_chroma_a9q()
|
|
@*
|
|
@* @brief
|
|
@* This routine performs the default weighted prediction as described in sec
|
|
@* 8.4.2.3.2 titled "Weighted sample prediction process" for chroma.
|
|
@*
|
|
@* @par Description:
|
|
@* This function gets two ht x wd blocks, calculates the weighted samples,
|
|
@* rounds off, adds offset and stores it in the destination block for U and V.
|
|
@*
|
|
@* @param[in] pu1_src1
|
|
@* UWORD8 Pointer to the buffer containing the input block 1.
|
|
@*
|
|
@* @param[in] pu1_src2
|
|
@* UWORD8 Pointer to the buffer containing the input block 2.
|
|
@*
|
|
@* @param[out] pu1_dst
|
|
@* UWORD8 pointer to the destination where the output block is stored.
|
|
@*
|
|
@* @param[in] src_strd1
|
|
@* Stride of the input buffer 1
|
|
@*
|
|
@* @param[in] src_strd2
|
|
@* Stride of the input buffer 2
|
|
@*
|
|
@* @param[in] dst_strd
|
|
@* Stride of the destination buffer
|
|
@*
|
|
@* @param[in] log_wd
|
|
@* number of bits to be rounded off
|
|
@*
|
|
@* @param[in] wt1
|
|
@* weights for the weighted prediction in U and V
|
|
@*
|
|
@* @param[in] wt2
|
|
@* weights for the weighted prediction in U and V
|
|
@*
|
|
@* @param[in] ofst1
|
|
@* offset 1 used after rounding off for U an dV
|
|
@*
|
|
@* @param[in] ofst2
|
|
@* offset 2 used after rounding off for U and V
|
|
@*
|
|
@* @param[in] ht
|
|
@* integer height of the array
|
|
@*
|
|
@* @param[in] wd
|
|
@* integer width of the array
|
|
@*
|
|
@* @returns
|
|
@* None
|
|
@*
|
|
@* @remarks
|
|
@* (ht,wd) can be (2,2), (2,4), (4,2), (4,4), (4,8), (8,4) or (8,8).
|
|
@*
|
|
@*******************************************************************************
|
|
@*
|
|
@void ih264_weighted_bi_pred_chroma_a9q(UWORD8 *pu1_src1,
|
|
@ UWORD8 *pu1_src2,
|
|
@ UWORD8 *pu1_dst,
|
|
@ WORD32 src_strd1,
|
|
@ WORD32 src_strd2,
|
|
@ WORD32 dst_strd,
|
|
@ WORD32 log_wd,
|
|
@ WORD32 wt1,
|
|
@ WORD32 wt2,
|
|
@ WORD32 ofst1,
|
|
@ WORD32 ofst2,
|
|
@ WORD32 ht,
|
|
@ WORD32 wd)
|
|
@
|
|
@**************Variables Vs Registers*****************************************
|
|
@ r0 => pu1_src1
|
|
@ r1 => pu1_src2
|
|
@ r2 => pu1_dst
|
|
@ r3 => src_strd1
|
|
@ [sp] => src_strd2 (r4)
|
|
@ [sp+4] => dst_strd (r5)
|
|
@ [sp+8] => log_wd (r6)
|
|
@ [sp+12] => wt1 (r7)
|
|
@ [sp+16] => wt2 (r8)
|
|
@ [sp+20] => ofst1 (r9)
|
|
@ [sp+24] => ofst2 (r10)
|
|
@ [sp+28] => ht (r11)
|
|
@ [sp+32] => wd (r12)
|
|
@
|
|
|
|
|
|
.global ih264_weighted_bi_pred_chroma_a9q
|
|
|
|
ih264_weighted_bi_pred_chroma_a9q:
|
|
|
|
stmfd sp!, {r4-r12, r14} @stack stores the values of the arguments
|
|
|
|
ldr r6, [sp, #48] @Load log_wd in r6
|
|
ldr r7, [sp, #52] @Load wt1 in r7
|
|
ldr r8, [sp, #56] @Load wt2 in r8
|
|
add r6, r6, #1 @r6 = log_wd + 1
|
|
ldr r9, [sp, #60] @Load ofst1 in r9
|
|
ldr r10, [sp, #64] @Load ofst2 in r10
|
|
|
|
neg r12, r6 @r12 = -(log_wd + 1)
|
|
ldr r4, [sp, #40] @Load src_strd2 in r4
|
|
ldr r5, [sp, #44] @Load dst_strd in r5
|
|
vdup.16 q0, r12 @Q0 = -(log_wd + 1) (16-bit)
|
|
|
|
ldr r11, [sp, #68] @Load ht in r11
|
|
vdup.32 q1, r7 @Q1 = (wt1_u, wt1_v) (32-bit)
|
|
ldr r12, [sp, #72] @Load wd in r12
|
|
vdup.32 q2, r8 @Q2 = (wt2_u, wt2_v) (32-bit)
|
|
asr r7, r9, #8 @r7 = ofst1_v
|
|
asr r8, r10, #8 @r8 = ofst2_v
|
|
vpush {d8-d15}
|
|
sxtb r9, r9 @sign-extend 8-bit ofst1_u to 32-bit
|
|
sxtb r10, r10 @sign-extend 8-bit ofst2_u to 32-bit
|
|
sxtb r7, r7 @sign-extend 8-bit ofst1_v to 32-bit
|
|
sxtb r8, r8 @sign-extend 8-bit ofst2_v to 32-bit
|
|
|
|
add r9, r9, #1 @r9 = ofst1_u + 1
|
|
add r7, r7, #1 @r7 = ofst1_v + 1
|
|
add r9, r9, r10 @r9 = ofst1_u + ofst2_u + 1
|
|
add r7, r7, r8 @r7 = ofst1_v + ofst2_v + 1
|
|
asr r9, r9, #1 @r9 = ofst_u = (ofst1_u + ofst2_u + 1) >> 1
|
|
asr r7, r7, #1 @r7 = ofst_v = (ofst1_v + ofst2_v + 1) >> 1
|
|
cmp r12, #8 @check if wd is 8
|
|
pkhbt r9, r9, r7, lsl #16 @r9 = {ofst_u(16-bit), ofst_v(16-bit)}
|
|
vdup.32 q3, r9 @Q3 = {ofst_u(16-bit), ofst_v(16-bit)}
|
|
beq loop_8_uv @branch if wd is 8
|
|
|
|
cmp r12, #4 @check if wd is 4
|
|
beq loop_4_uv @branch if wd is 4
|
|
|
|
loop_2_uv: @each iteration processes two rows
|
|
|
|
vld1.32 d8[0], [r0], r3 @load row 1 in source 1
|
|
vld1.32 d8[1], [r0], r3 @load row 2 in source 1
|
|
vld1.32 d10[0], [r1], r4 @load row 1 in source 2
|
|
vld1.32 d10[1], [r1], r4 @load row 2 in source 2
|
|
|
|
vmovl.u8 q4, d8 @converting rows 1,2 in source 1 to 16-bit
|
|
vmovl.u8 q5, d10 @converting rows 1,2 in source 2 to 16-bit
|
|
|
|
vmul.s16 q4, q4, q1 @weight 1 mult. for rows 1,2
|
|
vmla.s16 q4, q5, q2 @weight 2 mult. for rows 1,2
|
|
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from rows 1,2
|
|
|
|
vadd.s16 q4, q4, q3 @adding offset for rows 1,2
|
|
|
|
vqmovun.s16 d8, q4 @saturating rows 1,2 to unsigned 8-bit
|
|
|
|
vst1.32 d8[0], [r2], r5 @store row 1 in destination
|
|
vst1.32 d8[1], [r2], r5 @store row 2 in destination
|
|
|
|
subs r11, r11, #2 @decrement ht by 2
|
|
bgt loop_2_uv @if greater than 0 repeat the loop again
|
|
|
|
b end_loops_uv
|
|
|
|
loop_4_uv: @each iteration processes two rows
|
|
|
|
vld1.8 d8, [r0], r3 @load row 1 in source 1
|
|
vld1.8 d10, [r1], r4 @load row 1 in source 2
|
|
vmovl.u8 q4, d8 @converting row 1 in source 1 to 16-bit
|
|
vld1.8 d12, [r0], r3 @load row 2 in source 1
|
|
vmovl.u8 q5, d10 @converting row 1 in source 2 to 16-bit
|
|
vld1.8 d14, [r1], r4 @load row 2 in source 2
|
|
|
|
vmovl.u8 q6, d12 @converting row 2 in source 1 to 16-bit
|
|
vmul.s16 q4, q4, q1 @weight 1 mult. for row 1
|
|
vmla.s16 q4, q5, q2 @weight 2 mult. for row 1
|
|
vmovl.u8 q7, d14 @converting row 2 in source 2 to 16-bit
|
|
|
|
vmul.s16 q6, q6, q1 @weight 1 mult. for row 2
|
|
vmla.s16 q6, q7, q2 @weight 2 mult. for row 2
|
|
|
|
subs r11, r11, #2 @decrement ht by 2
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1
|
|
vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2
|
|
vadd.s16 q4, q4, q3 @adding offset for row 1
|
|
vadd.s16 q6, q6, q3 @adding offset for row 2
|
|
|
|
vqmovun.s16 d8, q4 @saturating row 1 to unsigned 8-bit
|
|
vqmovun.s16 d12, q6 @saturating row 2 to unsigned 8-bit
|
|
|
|
vst1.8 d8, [r2], r5 @store row 1 in destination
|
|
vst1.8 d12, [r2], r5 @store row 2 in destination
|
|
|
|
bgt loop_4_uv @if greater than 0 repeat the loop again
|
|
|
|
b end_loops_uv
|
|
|
|
loop_8_uv: @each iteration processes two rows
|
|
|
|
vld1.8 {q4}, [r0], r3 @load row 1 in source 1
|
|
vld1.8 {q5}, [r1], r4 @load row 1 in source 2
|
|
vld1.8 {q6}, [r0], r3 @load row 2 in source 1
|
|
vld1.8 {q7}, [r1], r4 @load row 2 in source 2
|
|
vmovl.u8 q12, d8 @converting row 1L in source 1 to 16-bit
|
|
vld1.8 {q8}, [r0], r3 @load row 3 in source 1
|
|
vld1.8 {q9}, [r1], r4 @load row 3 in source 2
|
|
vmovl.u8 q13, d10 @converting row 1L in source 2 to 16-bit
|
|
vld1.8 {q10}, [r0], r3 @load row 4 in source 1
|
|
vld1.8 {q11}, [r1], r4 @load row 4 in source 2
|
|
|
|
vmovl.u8 q4, d9 @converting row 1H in source 1 to 16-bit
|
|
vmovl.u8 q5, d11 @converting row 1H in source 2 to 16-bit
|
|
|
|
vmul.s16 q12, q12, q1 @weight 1 mult. for row 1L
|
|
vmla.s16 q12, q13, q2 @weight 2 mult. for row 1L
|
|
vmovl.u8 q14, d12 @converting row 2L in source 1 to 16-bit
|
|
vmovl.u8 q15, d14 @converting row 2L in source 2 to 16-bit
|
|
|
|
vmul.s16 q4, q4, q1 @weight 1 mult. for row 1H
|
|
vmla.s16 q4, q5, q2 @weight 2 mult. for row 1H
|
|
vmovl.u8 q6, d13 @converting row 2H in source 1 to 16-bit
|
|
vmovl.u8 q7, d15 @converting row 2H in source 2 to 16-bit
|
|
|
|
vmul.s16 q14, q14, q1 @weight 1 mult. for row 2L
|
|
vmla.s16 q14, q15, q2 @weight 2 mult. for row 2L
|
|
vmovl.u8 q13, d16 @converting row 3L in source 1 to 16-bit
|
|
vmovl.u8 q5, d18 @converting row 3L in source 2 to 16-bit
|
|
|
|
vmul.s16 q6, q6, q1 @weight 1 mult. for row 2H
|
|
vmla.s16 q6, q7, q2 @weight 2 mult. for row 2H
|
|
vmovl.u8 q8, d17 @converting row 3H in source 1 to 16-bit
|
|
vmovl.u8 q9, d19 @converting row 3H in source 2 to 16-bit
|
|
|
|
vmul.s16 q13, q13, q1 @weight 1 mult. for row 3L
|
|
vmla.s16 q13, q5, q2 @weight 2 mult. for row 3L
|
|
vmovl.u8 q15, d20 @converting row 4L in source 1 to 16-bit
|
|
vmovl.u8 q7, d22 @converting row 4L in source 2 to 16-bit
|
|
|
|
vmul.s16 q8, q8, q1 @weight 1 mult. for row 3H
|
|
vmla.s16 q8, q9, q2 @weight 2 mult. for row 3H
|
|
vmovl.u8 q10, d21 @converting row 4H in source 1 to 16-bit
|
|
vmovl.u8 q11, d23 @converting row 4H in source 2 to 16-bit
|
|
|
|
vmul.s16 q15, q15, q1 @weight 1 mult. for row 4L
|
|
vmla.s16 q15, q7, q2 @weight 2 mult. for row 4L
|
|
vrshl.s16 q12, q12, q0 @rounds off the weighted samples from row 1L
|
|
|
|
vmul.s16 q10, q10, q1 @weight 1 mult. for row 4H
|
|
vmla.s16 q10, q11, q2 @weight 2 mult. for row 4H
|
|
vrshl.s16 q4, q4, q0 @rounds off the weighted samples from row 1H
|
|
|
|
vrshl.s16 q14, q14, q0 @rounds off the weighted samples from row 2L
|
|
vadd.s16 q12, q12, q3 @adding offset for row 1L
|
|
vrshl.s16 q6, q6, q0 @rounds off the weighted samples from row 2H
|
|
vadd.s16 q4, q4, q3 @adding offset for row 1H
|
|
vrshl.s16 q13, q13, q0 @rounds off the weighted samples from row 3L
|
|
vadd.s16 q14, q14, q3 @adding offset for row 2L
|
|
vrshl.s16 q8, q8, q0 @rounds off the weighted samples from row 3H
|
|
vadd.s16 q6, q6, q3 @adding offset for row 2H
|
|
vrshl.s16 q15, q15, q0 @rounds off the weighted samples from row 4L
|
|
vadd.s16 q13, q13, q3 @adding offset for row 3L
|
|
vrshl.s16 q10, q10, q0 @rounds off the weighted samples from row 4H
|
|
vadd.s16 q8, q8, q3 @adding offset for row 3H
|
|
|
|
vqmovun.s16 d10, q12 @saturating row 1L to unsigned 8-bit
|
|
vadd.s16 q15, q15, q3 @adding offset for row 4L
|
|
vqmovun.s16 d11, q4 @saturating row 1H to unsigned 8-bit
|
|
vadd.s16 q10, q10, q3 @adding offset for row 4H
|
|
|
|
vqmovun.s16 d18, q14 @saturating row 2L to unsigned 8-bit
|
|
vqmovun.s16 d19, q6 @saturating row 2H to unsigned 8-bit
|
|
vqmovun.s16 d14, q13 @saturating row 3L to unsigned 8-bit
|
|
vqmovun.s16 d15, q8 @saturating row 3H to unsigned 8-bit
|
|
vst1.8 {q5}, [r2], r5 @store row 1 in destination
|
|
vqmovun.s16 d22, q15 @saturating row 4L to unsigned 8-bit
|
|
vqmovun.s16 d23, q10 @saturating row 4H to unsigned 8-bit
|
|
|
|
vst1.8 {q9}, [r2], r5 @store row 2 in destination
|
|
subs r11, r11, #4 @decrement ht by 4
|
|
vst1.8 {q7}, [r2], r5 @store row 3 in destination
|
|
vst1.8 {q11}, [r2], r5 @store row 4 in destination
|
|
|
|
bgt loop_8_uv @if greater than 0 repeat the loop again
|
|
|
|
end_loops_uv:
|
|
|
|
vpop {d8-d15}
|
|
ldmfd sp!, {r4-r12, r15} @Reload the registers from sp
|
|
|
|
|