mirror of
https://github.com/cemu-project/Cemu.git
synced 2025-01-12 09:59:11 +01:00
648 lines
18 KiB
ArmAsm
648 lines
18 KiB
ArmAsm
|
@/******************************************************************************
|
||
|
@ *
|
||
|
@ * Copyright (C) 2015 The Android Open Source Project
|
||
|
@ *
|
||
|
@ * Licensed under the Apache License, Version 2.0 (the "License");
|
||
|
@ * you may not use this file except in compliance with the License.
|
||
|
@ * You may obtain a copy of the License at:
|
||
|
@ *
|
||
|
@ * http://www.apache.org/licenses/LICENSE-2.0
|
||
|
@ *
|
||
|
@ * Unless required by applicable law or agreed to in writing, software
|
||
|
@ * distributed under the License is distributed on an "AS IS" BASIS,
|
||
|
@ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
|
@ * See the License for the specific language governing permissions and
|
||
|
@ * limitations under the License.
|
||
|
@ *
|
||
|
@ *****************************************************************************
|
||
|
@ * Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
||
|
@*/
|
||
|
@*
|
||
|
@ *******************************************************************************
|
||
|
@ * @file
|
||
|
@ * ih264_padding_neon.s
|
||
|
@ *
|
||
|
@ * @brief
|
||
|
@ * Contains function definitions padding
|
||
|
@ *
|
||
|
@ * @author
|
||
|
@ * Ittiam
|
||
|
@ *
|
||
|
@ * @par List of Functions:
|
||
|
@ * - ih264_pad_top_a9q()
|
||
|
@ * - ih264_pad_left_luma_a9q()
|
||
|
@ * - ih264_pad_left_chroma_a9q()
|
||
|
@ * - ih264_pad_right_luma_a9q()
|
||
|
@ * - ih264_pad_right_chroma_a9q()
|
||
|
@ *
|
||
|
@ * @remarks
|
||
|
@ * None
|
||
|
@ *
|
||
|
@ *******************************************************************************
|
||
|
@*
|
||
|
|
||
|
|
||
|
@**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief pad at the top of a 2d array
|
||
|
@*
|
||
|
@* @par Description:
|
||
|
@* The top row of a 2d array is replicated for pad_size times at the top
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* UWORD8 pointer to the source
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @param[in] pad_size
|
||
|
@* integer -padding size of the array
|
||
|
@*
|
||
|
@* @returns none
|
||
|
@*
|
||
|
@* @remarks none
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@void ih264_pad_top(UWORD8 *pu1_src,
|
||
|
@ WORD32 src_strd,
|
||
|
@ WORD32 wd,
|
||
|
@ WORD32 pad_size)
|
||
|
@**************Variables Vs Registers*************************
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => src_strd
|
||
|
@ r2 => wd
|
||
|
@ r3 => pad_size
|
||
|
|
||
|
.text
.p2align 2

.global ih264_pad_top_a9q

@ void ih264_pad_top_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                        WORD32 wd, WORD32 pad_size)
@
@   r0 = pu1_src  : first byte of the top row
@   r1 = src_strd : distance in bytes between consecutive rows
@   r2 = wd       : bytes of the top row to replicate, consumed 16 at a time
@   r3 = pad_size : number of copies written above the top row
@
@ Every 16-byte strip of the top row is loaded once and then stored
@ pad_size times, moving one stride upwards after each store.
@
@ NOTE(review): both loops only test their counter for zero after the
@ first pass, so this presumably expects wd to be a non-zero multiple of
@ 16 and pad_size >= 1 - confirm against the callers.
ih264_pad_top_a9q:
        push    {r4-r11, lr}                @ keep r4-r11 and the return address

        rsb     r6, r1, #0                  @ r6 = -src_strd, i.e. "one row up"
        sub     r5, r0, r1                  @ r5 = row directly above the top row

.Lpad_top_next_strip:
        vld1.8  {q0}, [r0]!                 @ q0 = next 16 source bytes, r0 += 16
        mov     r7, r3                      @ r7 = copies still to write for this strip
        mov     r4, r5                      @ r4 = write cursor for this strip
        add     r5, r5, #16                 @ next strip starts 16 bytes to the right

.Lpad_top_next_row:
        subs    r7, r7, #1                  @ flags survive the store below
        vst1.8  {q0}, [r4], r6              @ write the strip, then step one row up
        bne     .Lpad_top_next_row

        subs    r2, r2, #16                 @ one 16-byte strip of the width is done
        bne     .Lpad_top_next_strip

        pop     {r4-r11, pc}                @ restore registers and return
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
@**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* Padding (luma block) at the left of a 2d array
|
||
|
@*
|
||
|
@* @par Description:
|
||
|
@* The left column of a 2d array is replicated for pad_size times at the left
|
||
|
@*
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* UWORD8 pointer to the source
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @param[in] pad_size
|
||
|
@* integer -padding size of the array
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @returns
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* None
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@#if PAD_LEFT_LUMA == C
|
||
|
@void ih264_pad_left_luma(UWORD8 *pu1_src,
|
||
|
@ WORD32 src_strd,
|
||
|
@ WORD32 ht,
|
||
|
@ WORD32 pad_size)
|
||
|
@**************Variables Vs Registers*************************
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => src_strd
|
||
|
@ r2 => ht
|
||
|
@ r3 => pad_size
|
||
|
|
||
|
|
||
|
|
||
|
.global ih264_pad_left_luma_a9q

@ void ih264_pad_left_luma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                              WORD32 ht, WORD32 pad_size)
@
@   r0 = pu1_src  : first (left-most) byte of the first row
@   r1 = src_strd : distance in bytes between consecutive rows
@   r2 = ht       : number of rows, consumed 8 per loop pass
@   r3 = pad_size : the padding starts at pu1_src - pad_size
@
@ For each row the byte at pu1_src is broadcast into a NEON register and
@ stored to the left of it.  pad_size == 16 writes 16 bytes per row; any
@ other value takes the path that writes 32 bytes per row.
@
@ NOTE(review): the row counter is only compared with zero after every
@ 8 rows, so ht is presumably a non-zero multiple of 8 - confirm.
ih264_pad_left_luma_a9q:
        push    {r4-r11, lr}                @ keep r4-r11 and the return address

        sub     r4, r0, r3                  @ r4 = write cursor = pu1_src - pad_size
        sub     r6, r1, #16                 @ r6 = stride left after the first 16-byte half
        cmp     r3, #16
        bne     loop_32                     @ anything but 16 uses the 32-byte path

.Lpad_left_luma_16:                         @ 16 bytes per row, 8 rows per pass
        ldrb    r8, [r0], r1
        vdup.8  q0, r8
        ldrb    r9, [r0], r1
        vdup.8  q1, r9
        ldrb    r10, [r0], r1
        vdup.8  q2, r10
        vst1.8  {d0, d1}, [r4], r1          @ row 0
        vst1.8  {d2, d3}, [r4], r1          @ row 1
        ldrb    r11, [r0], r1
        vdup.8  q3, r11
        vst1.8  {d4, d5}, [r4], r1          @ row 2
        ldrb    r8, [r0], r1
        vdup.8  q0, r8
        vst1.8  {d6, d7}, [r4], r1          @ row 3
        ldrb    r9, [r0], r1
        vdup.8  q1, r9
        ldrb    r10, [r0], r1
        vdup.8  q2, r10
        vst1.8  {d0, d1}, [r4], r1          @ row 4
        ldrb    r11, [r0], r1
        vdup.8  q3, r11
        vst1.8  {d2, d3}, [r4], r1          @ row 5
        subs    r2, r2, #8                  @ flags survive the two stores below
        vst1.8  {d4, d5}, [r4], r1          @ row 6
        vst1.8  {d6, d7}, [r4], r1          @ row 7
        bne     .Lpad_left_luma_16

        pop     {r4-r11, pc}                @ restore registers and return

@ This label name is kept as-is: it is also a branch target of
@ ih264_pad_right_luma_a9q further down in this file.
loop_32:                                    @ 32 bytes per row, 8 rows per pass
        ldrb    r8, [r0], r1
        vdup.8  q0, r8
        ldrb    r9, [r0], r1
        vdup.8  q1, r9
        ldrb    r10, [r0], r1
        vdup.8  q2, r10
        vst1.8  {d0, d1}, [r4]!             @ row 0, first half
        vst1.8  {d0, d1}, [r4], r6          @ row 0, second half, then next row
        vst1.8  {d2, d3}, [r4]!             @ row 1
        vst1.8  {d2, d3}, [r4], r6
        ldrb    r11, [r0], r1
        vdup.8  q3, r11
        vst1.8  {d4, d5}, [r4]!             @ row 2
        vst1.8  {d4, d5}, [r4], r6
        ldrb    r8, [r0], r1
        vdup.8  q0, r8
        vst1.8  {d6, d7}, [r4]!             @ row 3
        ldrb    r9, [r0], r1
        vdup.8  q1, r9
        vst1.8  {d6, d7}, [r4], r6
        ldrb    r10, [r0], r1
        vdup.8  q2, r10
        vst1.8  {d0, d1}, [r4]!             @ row 4
        vst1.8  {d0, d1}, [r4], r6
        ldrb    r11, [r0], r1
        vdup.8  q3, r11
        vst1.8  {d2, d3}, [r4]!             @ row 5
        vst1.8  {d2, d3}, [r4], r6
        vst1.8  {d4, d5}, [r4]!             @ row 6
        vst1.8  {d4, d5}, [r4], r6
        subs    r2, r2, #8                  @ flags survive the two stores below
        vst1.8  {d6, d7}, [r4]!             @ row 7
        vst1.8  {d6, d7}, [r4], r6
        bne     loop_32

        pop     {r4-r11, pc}                @ restore registers and return
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
@**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* Padding (chroma block) at the left of a 2d array
|
||
|
@*
|
||
|
@* @par Description:
|
||
|
@* The left column of a 2d array is replicated for pad_size times at the left
|
||
|
@*
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* UWORD8 pointer to the source
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array (each colour component)
|
||
|
@*
|
||
|
@* @param[in] pad_size
|
||
|
@* integer -padding size of the array
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @returns
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* None
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@#if PAD_LEFT_CHROMA == C
|
||
|
@void ih264_pad_left_chroma(UWORD8 *pu1_src,
|
||
|
@ WORD32 src_strd,
|
||
|
@ WORD32 ht,
|
||
|
@ WORD32 pad_size)
|
||
|
@{
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => src_strd
|
||
|
@ r2 => ht
|
||
|
@ r3 => pad_size
|
||
|
|
||
|
|
||
|
|
||
|
.global ih264_pad_left_chroma_a9q

@ void ih264_pad_left_chroma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                                WORD32 ht, WORD32 pad_size)
@
@   r0 = pu1_src  : first (left-most) byte pair of the first row
@   r1 = src_strd : distance in bytes between consecutive rows
@   r2 = ht       : number of rows, consumed 4 at a time
@   r3 = pad_size : the padding starts at pu1_src - pad_size
@
@ For each row the 16-bit value at pu1_src is broadcast into a NEON
@ register and 32 bytes of it are stored starting at pu1_src - pad_size.
@ (The two bytes are presumably one interleaved chroma sample pair -
@ confirm against the caller.)
@
@ The loop body is unrolled twice (2 x 4 rows) with an exit test after
@ each group, so any non-zero multiple of 4 rows is handled.  The former
@ third group of 4 rows that followed a "beq end / bne loop" pair could
@ never be reached and has been removed.
@
@ NOTE(review): 32 bytes per row are written regardless of pad_size, and
@ the row counter is only compared with zero after each group of 4 rows;
@ presumably pad_size == 32 and ht is a non-zero multiple of 4 - confirm.
ih264_pad_left_chroma_a9q:

        stmfd   sp!, {r4-r11, lr}           @ save r4-r11 and the return address

        sub     r4, r0, r3                  @ r4 = write cursor = pu1_src - pad_size
        sub     r6, r1, #16                 @ r6 = stride left after the first 16-byte half

loop_32_l_c:                                @ 32 bytes per row
        @ ---- rows 0..3 of this pass ----
        ldrh    r8, [r0], r1
        ldrh    r9, [r0], r1
        vdup.u16 q0, r8
        ldrh    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 0, first 16 bytes
        vdup.u16 q1, r9
        vst1.8  {q0}, [r4], r6              @ row 0, last 16 bytes, then next row
        ldrh    r11, [r0], r1
        vst1.8  {q1}, [r4]!                 @ row 1
        vdup.u16 q2, r10
        vst1.8  {q1}, [r4], r6
        vdup.u16 q3, r11
        vst1.8  {q2}, [r4]!                 @ row 2
        vst1.8  {q2}, [r4], r6
        subs    r2, r2, #4                  @ flags survive the two stores below
        vst1.8  {q3}, [r4]!                 @ row 3
        vst1.8  {q3}, [r4], r6

        beq     end_func_l_c                @ all rows done after this group

        @ ---- rows 4..7 of this pass ----
        ldrh    r8, [r0], r1
        ldrh    r9, [r0], r1
        vdup.u16 q0, r8
        ldrh    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 4
        vdup.u16 q1, r9
        vst1.8  {q0}, [r4], r6
        ldrh    r11, [r0], r1
        vst1.8  {q1}, [r4]!                 @ row 5
        vdup.u16 q2, r10
        vst1.8  {q1}, [r4], r6
        vdup.u16 q3, r11
        vst1.8  {q2}, [r4]!                 @ row 6
        vst1.8  {q2}, [r4], r6
        subs    r2, r2, #4                  @ flags survive the two stores below
        vst1.8  {q3}, [r4]!                 @ row 7
        vst1.8  {q3}, [r4], r6

        bne     loop_32_l_c                 @ more rows left: run another pass

end_func_l_c:
        ldmfd   sp!, {r4-r11, pc}           @ restore registers and return
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
@**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* Padding (luma block) at the right of a 2d array
|
||
|
@*
|
||
|
@* @par Description:
|
||
|
@* The right column of a 2d array is replicated for pad_size times at the right
|
||
|
@*
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* UWORD8 pointer to the source
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @param[in] pad_size
|
||
|
@* integer -padding size of the array
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @returns
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* None
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@#if PAD_RIGHT_LUMA == C
|
||
|
@void ih264_pad_right_luma(UWORD8 *pu1_src,
|
||
|
@ WORD32 src_strd,
|
||
|
@ WORD32 ht,
|
||
|
@ WORD32 pad_size)
|
||
|
@{
|
||
|
@ WORD32 row;
|
||
|
@
|
||
|
@ for(row = 0; row < ht; row++)
|
||
|
@ {
|
||
|
@ memset(pu1_src, *(pu1_src -1), pad_size);
|
||
|
@
|
||
|
@ pu1_src += src_strd;
|
||
|
@ }
|
||
|
@}
|
||
|
@
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => src_strd
|
||
|
@ r2 => ht
|
||
|
@ r3 => pad_size
|
||
|
|
||
|
|
||
|
|
||
|
.global ih264_pad_right_luma_a9q

@ void ih264_pad_right_luma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                               WORD32 ht, WORD32 pad_size)
@
@   r0 = pu1_src  : first byte to be written in the first row; the value
@                   replicated is the byte just before it (pu1_src - 1)
@   r1 = src_strd : distance in bytes between consecutive rows
@   r2 = ht       : number of rows, consumed 8 per loop pass
@   r3 = pad_size : 16 selects the 16-byte path, any other value the
@                   32-byte path
@
@ For each row the byte at pu1_src - 1 is broadcast into a NEON register
@ and stored starting at pu1_src.
@
@ The 32-byte path now branches to this function's own loop_32_r.  It
@ used to branch to loop_32 inside ih264_pad_left_luma_a9q and return
@ through that function's epilogue, which only worked because both loops
@ use the same registers.
@
@ NOTE(review): the row counter is only compared with zero after every
@ 8 rows, so ht is presumably a non-zero multiple of 8 - confirm.
ih264_pad_right_luma_a9q:

        stmfd   sp!, {r4-r11, lr}           @ save r4-r11 and the return address

        mov     r4, r0                      @ r4 = write cursor = pu1_src
        sub     r6, r1, #16                 @ r6 = stride left after the first 16-byte half
        sub     r0, r0, #1                  @ r0 = read cursor = last byte before the padding
        subs    r5, r3, #16                 @ r5 is only used to set the flags
        bne     loop_32_r                   @ pad_size != 16: 32-byte path

loop_16_r:                                  @ 16 bytes per row, 8 rows per pass
        ldrb    r8, [r0], r1
        ldrb    r9, [r0], r1
        vdup.u8 q0, r8
        ldrb    r10, [r0], r1
        vst1.8  {q0}, [r4], r1              @ row 0
        vdup.u8 q1, r9
        vst1.8  {q1}, [r4], r1              @ row 1
        ldrb    r11, [r0], r1
        vdup.u8 q2, r10
        vdup.u8 q3, r11
        vst1.8  {q2}, [r4], r1              @ row 2
        ldrb    r8, [r0], r1
        vst1.8  {q3}, [r4], r1              @ row 3
        ldrb    r9, [r0], r1
        vdup.u8 q0, r8
        ldrb    r10, [r0], r1
        vst1.8  {q0}, [r4], r1              @ row 4
        vdup.u8 q1, r9
        ldrb    r11, [r0], r1
        vst1.8  {q1}, [r4], r1              @ row 5
        vdup.u8 q2, r10
        vdup.u8 q3, r11
        subs    r2, r2, #8                  @ flags survive the two stores below
        vst1.8  {q2}, [r4], r1              @ row 6
        vst1.8  {q3}, [r4], r1              @ row 7
        bne     loop_16_r
        b       end_func_r

loop_32_r:                                  @ 32 bytes per row, 8 rows per pass
        ldrb    r8, [r0], r1
        ldrb    r9, [r0], r1
        vdup.u8 q0, r8
        ldrb    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 0, first 16 bytes
        vdup.u8 q1, r9
        vst1.8  {q0}, [r4], r6              @ row 0, last 16 bytes, then next row
        vst1.8  {q1}, [r4]!                 @ row 1
        vdup.u8 q2, r10
        vst1.8  {q1}, [r4], r6
        ldrb    r11, [r0], r1
        vst1.8  {q2}, [r4]!                 @ row 2
        vdup.u8 q3, r11
        vst1.8  {q2}, [r4], r6
        ldrb    r8, [r0], r1
        vst1.8  {q3}, [r4]!                 @ row 3
        ldrb    r9, [r0], r1
        vdup.u8 q0, r8
        vst1.8  {q3}, [r4], r6
        ldrb    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 4
        vdup.u8 q1, r9
        vst1.8  {q0}, [r4], r6
        ldrb    r11, [r0], r1
        vst1.8  {q1}, [r4]!                 @ row 5
        vdup.u8 q2, r10
        vst1.8  {q1}, [r4], r6
        vst1.8  {q2}, [r4]!                 @ row 6
        vdup.u8 q3, r11
        vst1.8  {q2}, [r4], r6
        subs    r2, r2, #8                  @ flags survive the two stores below
        vst1.8  {q3}, [r4]!                 @ row 7
        vst1.8  {q3}, [r4], r6
        bne     loop_32_r

end_func_r:
        ldmfd   sp!, {r4-r11, pc}           @ restore registers and return
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
@**
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@* @brief
|
||
|
@* Padding (chroma block) at the right of a 2d array
|
||
|
@*
|
||
|
@* @par Description:
|
||
|
@* The right column of a 2d array is replicated for pad_size times at the right
|
||
|
@*
|
||
|
@*
|
||
|
@* @param[in] pu1_src
|
||
|
@* UWORD8 pointer to the source
|
||
|
@*
|
||
|
@* @param[in] src_strd
|
||
|
@* integer source stride
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array (each colour component)
|
||
|
@*
|
||
|
@* @param[in] pad_size
|
||
|
@* integer -padding size of the array
|
||
|
@*
|
||
|
@* @param[in] ht
|
||
|
@* integer height of the array
|
||
|
@*
|
||
|
@* @param[in] wd
|
||
|
@* integer width of the array
|
||
|
@*
|
||
|
@* @returns
|
||
|
@*
|
||
|
@* @remarks
|
||
|
@* None
|
||
|
@*
|
||
|
@*******************************************************************************
|
||
|
@*
|
||
|
@#if PAD_RIGHT_CHROMA == C
|
||
|
@void ih264_pad_right_chroma(UWORD8 *pu1_src,
|
||
|
@ WORD32 src_strd,
|
||
|
@ WORD32 ht,
|
||
|
@ WORD32 pad_size)
|
||
|
@ r0 => *pu1_src
|
||
|
@ r1 => src_strd
|
||
|
@ r2 => ht
|
||
|
@ r3 => pad_size
|
||
|
|
||
|
|
||
|
|
||
|
.global ih264_pad_right_chroma_a9q

@ void ih264_pad_right_chroma_a9q(UWORD8 *pu1_src, WORD32 src_strd,
@                                 WORD32 ht, WORD32 pad_size)
@
@   r0 = pu1_src  : first byte to be written in the first row; the value
@                   replicated is the 16-bit value just before it
@                   (pu1_src - 2)
@   r1 = src_strd : distance in bytes between consecutive rows
@   r2 = ht       : number of rows, consumed 4 at a time
@   r3 = pad_size : not read by this routine
@
@ For each row the 16-bit value at pu1_src - 2 is broadcast into a NEON
@ register and 32 bytes of it are stored starting at pu1_src.
@ (The two bytes are presumably one interleaved chroma sample pair -
@ confirm against the caller.)
@
@ The loop body is unrolled twice (2 x 4 rows) with an exit test after
@ each group, so any non-zero multiple of 4 rows is handled.  The former
@ third group of 4 rows that followed a "beq end / bne loop" pair could
@ never be reached and has been removed.
@
@ NOTE(review): the row counter is only compared with zero after each
@ group of 4 rows, so ht is presumably a non-zero multiple of 4 - confirm.
ih264_pad_right_chroma_a9q:

        stmfd   sp!, {r4-r11, lr}           @ save r4-r11 and the return address

        mov     r4, r0                      @ r4 = write cursor = pu1_src
        sub     r6, r1, #16                 @ r6 = stride left after the first 16-byte half
        sub     r0, r0, #2                  @ r0 = read cursor = last byte pair before the padding

loop_32_r_c:                                @ 32 bytes per row
        @ ---- rows 0..3 of this pass ----
        ldrh    r8, [r0], r1
        ldrh    r9, [r0], r1
        vdup.u16 q0, r8
        ldrh    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 0, first 16 bytes
        vdup.u16 q1, r9
        vst1.8  {q0}, [r4], r6              @ row 0, last 16 bytes, then next row
        vst1.8  {q1}, [r4]!                 @ row 1
        vdup.u16 q2, r10
        vst1.8  {q1}, [r4], r6
        subs    r2, r2, #4                  @ flags survive the loads/stores below
        ldrh    r11, [r0], r1
        vst1.8  {q2}, [r4]!                 @ row 2
        vdup.u16 q3, r11
        vst1.8  {q2}, [r4], r6
        vst1.8  {q3}, [r4]!                 @ row 3
        vst1.8  {q3}, [r4], r6

        beq     end_func_r_c                @ all rows done after this group

        @ ---- rows 4..7 of this pass ----
        ldrh    r8, [r0], r1
        vdup.u16 q0, r8
        ldrh    r9, [r0], r1
        ldrh    r10, [r0], r1
        vst1.8  {q0}, [r4]!                 @ row 4
        vdup.u16 q1, r9
        vst1.8  {q0}, [r4], r6
        ldrh    r11, [r0], r1
        vst1.8  {q1}, [r4]!                 @ row 5
        vdup.u16 q2, r10
        vst1.8  {q1}, [r4], r6
        vst1.8  {q2}, [r4]!                 @ row 6
        vdup.u16 q3, r11
        vst1.8  {q2}, [r4], r6
        subs    r2, r2, #4                  @ flags survive the two stores below
        vst1.8  {q3}, [r4]!                 @ row 7
        vst1.8  {q3}, [r4], r6

        bne     loop_32_r_c                 @ more rows left: run another pass

end_func_r_c:
        ldmfd   sp!, {r4-r11, pc}           @ restore registers and return
|
||
|
|
||
|
|
||
|
|
||
|
|
||
|
|