Cemu/dependencies/ih264d/common/armv8/ih264_padding_neon_av8.s
2022-08-22 22:21:23 +02:00

737 lines
20 KiB
ArmAsm

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// * ih264_padding_neon.s
// *
// * @brief
// * Contains function definitions padding
// *
// * @author
// * Ittiam
// *
// * @par List of Functions:
// * - ih264_pad_top_av8()
// * - ih264_pad_left_luma_av8()
// * - ih264_pad_left_chroma_av8()
// * - ih264_pad_right_luma_av8()
// * - ih264_pad_right_chroma_av8()
// *
// * @remarks
// * None
// *
// *******************************************************************************
//*/
.text
.p2align 2
.include "ih264_neon_macros.s"
///**
//*******************************************************************************
//*
//* @brief pad at the top of a 2d array
//*
//* @par Description:
//* The top row of a 2d array is replicated for pad_size times at the top
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] wd
//* integer width of the array
//*
//* @param[in] pad_size
//* integer -padding size of the array
//*
//* @returns none
//*
//* @remarks none
//*
//*******************************************************************************
//*/
//void ih264_pad_top(UWORD8 *pu1_src,
// WORD32 src_strd,
// WORD32 wd,
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
// w1 => src_strd
// w2 => wd
// w3 => pad_size
.global ih264_pad_top_av8
ih264_pad_top_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
sxtw x1, w1
stp x19, x20, [sp, #-16]!
sub x5, x0, x1
neg x6, x1
loop_neon_memcpy_mul_16:
// Load 16 bytes
ld1 {v0.8b, v1.8b}, [x0], #16
mov x4, x5
mov w7, w3
add x5, x5, #16
loop_neon_pad_top:
st1 {v0.8b, v1.8b}, [x4], x6
subs w7, w7, #1
bne loop_neon_pad_top
subs w2, w2, #16
bne loop_neon_memcpy_mul_16
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Padding (luma block) at the left of a 2d array
//*
//* @par Description:
//* The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @param[in] pad_size
//* integer -padding size of the array
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_LUMA == C
//void ih264_pad_left_luma(UWORD8 *pu1_src,
// WORD32 src_strd,
// WORD32 ht,
// WORD32 pad_size)
//**************Variables Vs Registers*************************
// x0 => *pu1_src
// w1 => src_strd
// w2 => ht
// w3 => pad_size
.global ih264_pad_left_luma_av8
ih264_pad_left_luma_av8:
// STMFD sp!, {x4-x11,x14} //stack stores the values of the arguments
push_v_regs
sxtw x1, w1
sxtw x3, w3
stp x19, x20, [sp, #-16]!
sub x4, x0, x3
sub x6, x1, #16
subs x5, x3, #16
bne loop_32
loop_16: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16
b end_func
loop_32: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
st1 {v6.16b}, [x4], #16 // 16 bytes store
dup v0.16b, w8
ldrb w9, [x0]
add x0, x0, x1
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32
end_func:
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Padding (chroma block) at the left of a 2d array
//*
//* @par Description:
//* The left column of a 2d array is replicated for pad_size times at the left
//*
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array (each colour component)
//*
//* @param[in] pad_size
//* integer -padding size of the array
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//#if PAD_LEFT_CHROMA == C
//void ih264_pad_left_chroma(UWORD8 *pu1_src,
// WORD32 src_strd,
// WORD32 ht,
// WORD32 pad_size)
//{
// x0 => *pu1_src
// w1 => src_strd
// w2 => ht
// w3 => pad_size
.global ih264_pad_left_chroma_av8
ih264_pad_left_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
sxtw x1, w1
sxtw x3, w3
stp x19, x20, [sp, #-16]!
sub x4, x0, x3
sub x6, x1, #16
loop_32_l_c: // /*hard coded for width=32 ,height =4,8,12*/
ldrh w8, [x0]
add x0, x0, x1
ldrh w9, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
beq end_func_l_c ///* Branching when ht=4*/
ldrh w8, [x0]
add x0, x0, x1
ldrh w9, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
beq end_func_l_c ///* Branching when ht=8*/
bne loop_32_l_c
ldrh w8, [x0]
add x0, x0, x1
ldrh w9, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
ldrh w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], #16 // 16 bytes store
st1 {v4.16b}, [x4], x6 // 16 bytes store
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
end_func_l_c:
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//* Padding (luma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @param[in] pad_size
//* integer -padding size of the array
//*
//* @param[in] ht
//* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_LUMA == C
//void ih264_pad_right_luma(UWORD8 *pu1_src,
// WORD32 src_strd,
// WORD32 ht,
// WORD32 pad_size)
//{
// WORD32 row;
//
// for(row = 0; row < ht; row++)
// {
// memset(pu1_src, *(pu1_src -1), pad_size);
//
// pu1_src += src_strd;
// }
//}
//
// x0 => *pu1_src
// w1 => src_strd
// w2 => ht
// w3 => pad_size
.global ih264_pad_right_luma_av8
ih264_pad_right_luma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
sxtw x1, w1
sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
sub x6, x1, #16
sub x0, x0, #1
subs x5, x3, #16
bne loop_32
loop_16_r: // /*hard coded for width=16 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
st1 {v2.16b}, [x4], x1 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
dup v4.16b, w10
dup v6.16b, w11
st1 {v4.16b}, [x4], x1 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
st1 {v6.16b}, [x4], x1 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], x1 // 16 bytes store
dup v2.16b, w9
ldrb w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], x1 // 16 bytes store
dup v4.16b, w10
dup v6.16b, w11
subs w2, w2, #8
st1 {v4.16b}, [x4], x1 // 16 bytes store
st1 {v6.16b}, [x4], x1 // 16 bytes store
bne loop_16_r
b end_func_r
loop_32_r: // /*hard coded for width=32 ,height =8,16*/
ldrb w8, [x0]
add x0, x0, x1
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
ldrb w8, [x0]
add x0, x0, x1
st1 {v6.16b}, [x4], #16 // 16 bytes store
ldrb w9, [x0]
add x0, x0, x1
dup v0.16b, w8
st1 {v6.16b}, [x4], x6 // 16 bytes store
ldrb w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.16b, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrb w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.16b, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.16b, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #8
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
bne loop_32_r
end_func_r:
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
ldp x19, x20, [sp], #16
pop_v_regs
ret
///**
//*******************************************************************************
//*
//* @brief
//;* Padding (chroma block) at the right of a 2d array
//*
//* @par Description:
//* The right column of a 2d array is replicated for pad_size times at the right
//*
//*
//* @param[in] pu1_src
//;* UWORD8 pointer to the source
//*
//* @param[in] src_strd
//* integer source stride
//*
//* @param[in] ht
//;* integer height of the array
//*
//* @param[in] wd
//* integer width of the array (each colour component)
//*
//* @param[in] pad_size
//* integer -padding size of the array
//*
//* @param[in] ht
//;* integer height of the array
//*
//* @param[in] wd
//* integer width of the array
//*
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//#if PAD_RIGHT_CHROMA == C
//void ih264_pad_right_chroma(UWORD8 *pu1_src,
// WORD32 src_strd,
// WORD32 ht,
// WORD32 pad_size)
// x0 => *pu1_src
// w1 => src_strd
// w2 => ht
// w3 => pad_size
.global ih264_pad_right_chroma_av8
ih264_pad_right_chroma_av8:
// STMFD sp!, {x4-x11, x14} //stack stores the values of the arguments
push_v_regs
sxtw x1, w1
sxtw x3, w3
stp x19, x20, [sp, #-16]!
mov x4, x0
sub x6, x1, #16
sub x0, x0, #2
loop_32_r_c: // /*hard coded for width=32 ,height =8,4*/
ldrh w8, [x0]
add x0, x0, x1
ldrh w9, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #4
ldrh w11, [x0]
add x0, x0, x1
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
beq end_func_r_c ///* Branching when ht=4*/
ldrh w8, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
subs w2, w2, #4
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
beq end_func_r_c ///* Branching when ht=8*/
bne loop_32_r_c
ldrh w8, [x0]
add x0, x0, x1
dup v0.8h, w8
ldrh w9, [x0]
add x0, x0, x1
ldrh w10, [x0]
add x0, x0, x1
st1 {v0.16b}, [x4], #16 // 16 bytes store
dup v2.8h, w9
st1 {v0.16b}, [x4], x6 // 16 bytes store
ldrh w11, [x0]
add x0, x0, x1
st1 {v2.16b}, [x4], #16 // 16 bytes store
dup v4.8h, w10
st1 {v2.16b}, [x4], x6 // 16 bytes store
st1 {v4.16b}, [x4], #16 // 16 bytes store
dup v6.8h, w11
st1 {v4.16b}, [x4], x6 // 16 bytes store
st1 {v6.16b}, [x4], #16 // 16 bytes store
st1 {v6.16b}, [x4], x6 // 16 bytes store
end_func_r_c:
// LDMFD sp!,{x4-x11,pc} //Reload the registers from SP
ldp x19, x20, [sp], #16
pop_v_regs
ret