mirror of
https://github.com/cemu-project/Cemu.git
synced 2024-12-27 02:01:51 +01:00
737 lines
20 KiB
ArmAsm
737 lines
20 KiB
ArmAsm
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
///**
|
|
// *******************************************************************************
|
|
// * @file
|
|
// * ih264_padding_neon.s
|
|
// *
|
|
// * @brief
|
|
// * Contains function definitions padding
|
|
// *
|
|
// * @author
|
|
// * Ittiam
|
|
// *
|
|
// * @par List of Functions:
|
|
// * - ih264_pad_top_av8()
|
|
// * - ih264_pad_left_luma_av8()
|
|
// * - ih264_pad_left_chroma_av8()
|
|
// * - ih264_pad_right_luma_av8()
|
|
// * - ih264_pad_right_chroma_av8()
|
|
// *
|
|
// * @remarks
|
|
// * None
|
|
// *
|
|
// *******************************************************************************
|
|
//*/
|
|
|
|
.text
|
|
.p2align 2
|
|
.include "ih264_neon_macros.s"
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief pad at the top of a 2d array
|
|
//*
|
|
//* @par Description:
|
|
//* The top row of a 2d array is replicated for pad_size times at the top
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @param[in] pad_size
|
|
//* integer -padding size of the array
|
|
//*
|
|
//* @returns none
|
|
//*
|
|
//* @remarks none
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//void ih264_pad_top(UWORD8 *pu1_src,
|
|
// WORD32 src_strd,
|
|
// WORD32 wd,
|
|
// WORD32 pad_size)
|
|
//**************Variables Vs Registers*************************
|
|
// x0 => *pu1_src
|
|
// w1 => src_strd
|
|
// w2 => wd
|
|
// w3 => pad_size
|
|
|
|
.global ih264_pad_top_av8
|
|
|
|
ih264_pad_top_av8:
|
|
|
|
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
|
|
push_v_regs
|
|
sxtw x1, w1
|
|
stp x19, x20, [sp, #-16]!
|
|
|
|
sub x5, x0, x1
|
|
neg x6, x1
|
|
|
|
loop_neon_memcpy_mul_16:
|
|
// Load 16 bytes
|
|
ld1 {v0.8b, v1.8b}, [x0], #16
|
|
mov x4, x5
|
|
mov w7, w3
|
|
add x5, x5, #16
|
|
|
|
loop_neon_pad_top:
|
|
st1 {v0.8b, v1.8b}, [x4], x6
|
|
subs w7, w7, #1
|
|
bne loop_neon_pad_top
|
|
|
|
subs w2, w2, #16
|
|
bne loop_neon_memcpy_mul_16
|
|
|
|
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
|
|
ldp x19, x20, [sp], #16
|
|
pop_v_regs
|
|
ret
|
|
|
|
|
|
|
|
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//* Padding (luma block) at the left of a 2d array
|
|
//*
|
|
//* @par Description:
|
|
//* The left column of a 2d array is replicated for pad_size times at the left
|
|
//*
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @param[in] pad_size
|
|
//* integer -padding size of the array
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @returns
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//#if PAD_LEFT_LUMA == C
|
|
//void ih264_pad_left_luma(UWORD8 *pu1_src,
|
|
// WORD32 src_strd,
|
|
// WORD32 ht,
|
|
// WORD32 pad_size)
|
|
//**************Variables Vs Registers*************************
|
|
// x0 => *pu1_src
|
|
// w1 => src_strd
|
|
// w2 => ht
|
|
// w3 => pad_size
|
|
|
|
|
|
|
|
.global ih264_pad_left_luma_av8
|
|
|
|
ih264_pad_left_luma_av8:
|
|
|
|
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
|
|
push_v_regs
|
|
sxtw x1, w1
|
|
sxtw x3, w3
|
|
stp x19, x20, [sp, #-16]!
|
|
|
|
|
|
sub x4, x0, x3
|
|
sub x6, x1, #16
|
|
subs x5, x3, #16
|
|
bne loop_32
|
|
loop_16: // /*hard coded for width=16 ,height =8,16*/
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], x1 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v2.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
dup v4.16b, w10
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
st1 {v6.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], x1 // 16 bytes store
|
|
dup v2.16b, w9
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], x1 // 16 bytes store
|
|
dup v4.16b, w10
|
|
dup v6.16b, w11
|
|
subs w2, w2, #8
|
|
st1 {v4.16b}, [x4], x1 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x1 // 16 bytes store
|
|
bne loop_16
|
|
b end_func
|
|
|
|
loop_32: // /*hard coded for width=32 ,height =8,16*/
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v0.16b}, [x4], x6
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.16b, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
dup v0.16b, w8
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v0.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.16b, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #8
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
bne loop_32
|
|
|
|
|
|
|
|
end_func:
|
|
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
|
|
ldp x19, x20, [sp], #16
|
|
pop_v_regs
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//* Padding (chroma block) at the left of a 2d array
|
|
//*
|
|
//* @par Description:
|
|
//* The left column of a 2d array is replicated for pad_size times at the left
|
|
//*
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array (each colour component)
|
|
//*
|
|
//* @param[in] pad_size
|
|
//* integer -padding size of the array
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @returns
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//#if PAD_LEFT_CHROMA == C
|
|
//void ih264_pad_left_chroma(UWORD8 *pu1_src,
|
|
// WORD32 src_strd,
|
|
// WORD32 ht,
|
|
// WORD32 pad_size)
|
|
//{
|
|
// x0 => *pu1_src
|
|
// w1 => src_strd
|
|
// w2 => ht
|
|
// w3 => pad_size
|
|
|
|
|
|
|
|
.global ih264_pad_left_chroma_av8
|
|
|
|
ih264_pad_left_chroma_av8:
|
|
|
|
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
|
|
push_v_regs
|
|
sxtw x1, w1
|
|
sxtw x3, w3
|
|
stp x19, x20, [sp, #-16]!
|
|
|
|
sub x4, x0, x3
|
|
sub x6, x1, #16
|
|
|
|
|
|
loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6 // 16 bytes store
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #4
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
|
|
beq end_func_l_c ///* Branching when ht=4*/
|
|
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #4
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
beq end_func_l_c ///* Branching when ht=8*/
|
|
bne loop_32_l_c
|
|
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
end_func_l_c:
|
|
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
|
|
ldp x19, x20, [sp], #16
|
|
pop_v_regs
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//* Padding (luma block) at the right of a 2d array
|
|
//*
|
|
//* @par Description:
|
|
//* The right column of a 2d array is replicated for pad_size times at the right
|
|
//*
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @param[in] pad_size
|
|
//* integer -padding size of the array
|
|
//*
|
|
//* @param[in] ht
|
|
//* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @returns
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//#if PAD_RIGHT_LUMA == C
|
|
//void ih264_pad_right_luma(UWORD8 *pu1_src,
|
|
// WORD32 src_strd,
|
|
// WORD32 ht,
|
|
// WORD32 pad_size)
|
|
//{
|
|
// WORD32 row;
|
|
//
|
|
// for(row = 0; row < ht; row++)
|
|
// {
|
|
// memset(pu1_src, *(pu1_src -1), pad_size);
|
|
//
|
|
// pu1_src += src_strd;
|
|
// }
|
|
//}
|
|
//
|
|
// x0 => *pu1_src
|
|
// w1 => src_strd
|
|
// w2 => ht
|
|
// w3 => pad_size
|
|
|
|
|
|
|
|
.global ih264_pad_right_luma_av8
|
|
|
|
ih264_pad_right_luma_av8:
|
|
|
|
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
|
|
push_v_regs
|
|
sxtw x1, w1
|
|
sxtw x3, w3
|
|
stp x19, x20, [sp, #-16]!
|
|
|
|
mov x4, x0
|
|
sub x6, x1, #16
|
|
sub x0, x0, #1
|
|
subs x5, x3, #16
|
|
bne loop_32
|
|
loop_16_r: // /*hard coded for width=16 ,height =8,16*/
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], x1 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v2.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
dup v4.16b, w10
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
st1 {v6.16b}, [x4], x1 // 16 bytes store
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], x1 // 16 bytes store
|
|
dup v2.16b, w9
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], x1 // 16 bytes store
|
|
dup v4.16b, w10
|
|
dup v6.16b, w11
|
|
subs w2, w2, #8
|
|
st1 {v4.16b}, [x4], x1 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x1 // 16 bytes store
|
|
bne loop_16_r
|
|
b end_func_r
|
|
|
|
loop_32_r: // /*hard coded for width=32 ,height =8,16*/
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v0.16b}, [x4], x6
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.16b, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w8, [x0]
|
|
add x0, x0, x1
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
ldrb w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.16b, w8
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.16b, w9
|
|
st1 {v0.16b}, [x4], x6 // 16 bytes store
|
|
ldrb w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.16b, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.16b, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #8
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
bne loop_32_r
|
|
|
|
|
|
|
|
end_func_r:
|
|
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
|
|
ldp x19, x20, [sp], #16
|
|
pop_v_regs
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
///**
|
|
//*******************************************************************************
|
|
//*
|
|
//* @brief
|
|
//;* Padding (chroma block) at the right of a 2d array
|
|
//*
|
|
//* @par Description:
|
|
//* The right column of a 2d array is replicated for pad_size times at the right
|
|
//*
|
|
//*
|
|
//* @param[in] pu1_src
|
|
//;* UWORD8 pointer to the source
|
|
//*
|
|
//* @param[in] src_strd
|
|
//* integer source stride
|
|
//*
|
|
//* @param[in] ht
|
|
//;* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array (each colour component)
|
|
//*
|
|
//* @param[in] pad_size
|
|
//* integer -padding size of the array
|
|
//*
|
|
//* @param[in] ht
|
|
//;* integer height of the array
|
|
//*
|
|
//* @param[in] wd
|
|
//* integer width of the array
|
|
//*
|
|
//* @returns
|
|
//*
|
|
//* @remarks
|
|
//* None
|
|
//*
|
|
//*******************************************************************************
|
|
//*/
|
|
//#if PAD_RIGHT_CHROMA == C
|
|
//void ih264_pad_right_chroma(UWORD8 *pu1_src,
|
|
// WORD32 src_strd,
|
|
// WORD32 ht,
|
|
// WORD32 pad_size)
|
|
// x0 => *pu1_src
|
|
// w1 => src_strd
|
|
// w2 => ht
|
|
// w3 => pad_size
|
|
|
|
|
|
|
|
.global ih264_pad_right_chroma_av8
|
|
|
|
ih264_pad_right_chroma_av8:
|
|
|
|
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
|
|
push_v_regs
|
|
sxtw x1, w1
|
|
sxtw x3, w3
|
|
stp x19, x20, [sp, #-16]!
|
|
|
|
mov x4, x0
|
|
sub x6, x1, #16
|
|
sub x0, x0, #2
|
|
loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #4
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
beq end_func_r_c ///* Branching when ht=4*/
|
|
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6 // 16 bytes store
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
subs w2, w2, #4
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
beq end_func_r_c ///* Branching when ht=8*/
|
|
bne loop_32_r_c
|
|
ldrh w8, [x0]
|
|
add x0, x0, x1
|
|
dup v0.8h, w8
|
|
ldrh w9, [x0]
|
|
add x0, x0, x1
|
|
ldrh w10, [x0]
|
|
add x0, x0, x1
|
|
st1 {v0.16b}, [x4], #16 // 16 bytes store
|
|
dup v2.8h, w9
|
|
st1 {v0.16b}, [x4], x6 // 16 bytes store
|
|
ldrh w11, [x0]
|
|
add x0, x0, x1
|
|
st1 {v2.16b}, [x4], #16 // 16 bytes store
|
|
dup v4.8h, w10
|
|
st1 {v2.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v4.16b}, [x4], #16 // 16 bytes store
|
|
dup v6.8h, w11
|
|
st1 {v4.16b}, [x4], x6 // 16 bytes store
|
|
st1 {v6.16b}, [x4], #16 // 16 bytes store
|
|
st1 {v6.16b}, [x4], x6 // 16 bytes store
|
|
|
|
end_func_r_c:
|
|
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
|
|
ldp x19, x20, [sp], #16
|
|
pop_v_regs
|
|
ret
|
|
|
|
|
|
|
|
|
|
|
|
|