Cemu/dependencies/ih264d/common/armv8/ih264_mem_fns_neon_av8.s
2022-08-22 22:21:23 +02:00

271 lines
5.9 KiB
ArmAsm

//******************************************************************************
//*
//* Copyright (C) 2015 The Android Open Source Project
//*
//* Licensed under the Apache License, Version 2.0 (the "License");
//* you may not use this file except in compliance with the License.
//* You may obtain a copy of the License at:
//*
//* http://www.apache.org/licenses/LICENSE-2.0
//*
//* Unless required by applicable law or agreed to in writing, software
//* distributed under the License is distributed on an "AS IS" BASIS,
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//* See the License for the specific language governing permissions and
//* limitations under the License.
//*
//*****************************************************************************
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
//*/
///**
// *******************************************************************************
// * @file
// * ih264_mem_fns_neon_av8.s
// *
// * @brief
// * Contains function definitions for memory manipulation
// *
// * @author
// * Naveen SR
// *
// * @par List of Functions:
// * - ih264_memcpy_av8()
// * - ih264_memcpy_mul_8_av8()
// * - ih264_memset_mul_8_av8()
// * - ih264_memset_av8()
// * - ih264_memset_16bit_mul_8_av8()
// * - ih264_memset_16bit_av8()
// *
// * @remarks
// * None
// *
// *******************************************************************************
//*/
// Everything below is emitted into the code section, aligned to 2^2 = 4 bytes.
.text
.p2align 2
// Pulls in the shared macro file; presumably NEON helper macros -- none of
// them are visibly invoked by the functions in this file (TODO confirm).
.include "ih264_neon_macros.s"
///**
//*******************************************************************************
//*
//* @brief
//* memcpy of a 1d array
//*
//* @par Description:
//* Copies 8-bit data from source to destination; the byte count is 8, 16 or 32
//*
//* @param[out] pu1_dst
//* UWORD8 pointer to the destination
//*
//* @param[in] pu1_src
//* UWORD8 pointer to the source
//*
//* @param[in] num_bytes
//* number of bytes to copy
//* @returns
//*
//* @remarks
//* None
//*
//*******************************************************************************
//*/
//void ih264_memcpy_mul_8(UWORD8 *pu1_dst,
// UWORD8 *pu1_src,
// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// x1 => *pu1_src
// w2 => num_bytes
//------------------------------------------------------------------------------
// ih264_memcpy_mul_8_av8 -- copy num_bytes bytes, one 8-byte vector at a time.
//   In:    x0 = pu1_dst, x1 = pu1_src, w2 = num_bytes
//   Out:   none
//   Clobb: x0, x1, w2, v0, flags
//   Note:  the loop terminates only when the counter reaches exactly zero, so
//          num_bytes must be a non-zero multiple of 8.
//------------------------------------------------------------------------------
.global ih264_memcpy_mul_8_av8
ih264_memcpy_mul_8_av8:
1:
    ld1     {v0.8b}, [x1], #8           // fetch 8 source bytes, src += 8
    subs    w2, w2, #8                  // remaining -= 8 (sets Z for the branch)
    st1     {v0.8b}, [x0], #8           // store them, dst += 8 (flags untouched)
    b.ne    1b                          // keep going until remaining == 0
    ret
//*******************************************************************************
//*/
//void ih264_memcpy(UWORD8 *pu1_dst,
// UWORD8 *pu1_src,
// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// x1 => *pu1_src
// w2 => num_bytes
//------------------------------------------------------------------------------
// ih264_memcpy_av8 -- copy an arbitrary number of bytes.
//   In:    x0 = pu1_dst, x1 = pu1_src, w2 = num_bytes
//   Out:   none
//   Clobb: x0, x1, w2, w3, v0, flags
//   Bulk of the data moves in 8-byte NEON chunks; the remaining 1..7 bytes
//   are copied one at a time. A zero-length request returns immediately.
//------------------------------------------------------------------------------
.global ih264_memcpy_av8
ih264_memcpy_av8:
    // Without this guard a count of 0 falls into the byte loop, whose
    // counter then wraps below zero and never hits the exit condition.
    cbz     w2, end_func1
    subs    w2, w2, #8                  // w2 = remaining - 8
    blt     arm_memcpy                  // fewer than 8 bytes: byte loop only
loop_neon_memcpy:
    // Copy 8 bytes
    ld1     {v0.8b}, [x1], #8
    st1     {v0.8b}, [x0], #8
    subs    w2, w2, #8
    bge     loop_neon_memcpy            // at least 8 more bytes pending
    cmn     w2, #8                      // w2 == -8  <=>  nothing left over
    beq     end_func1
arm_memcpy:
    add     w2, w2, #8                  // undo the bias: w2 = 1..7 leftover bytes
loop_arm_memcpy:
    ldrb    w3, [x1], #1
    strb    w3, [x0], #1
    subs    w2, w2, #1
    bne     loop_arm_memcpy
end_func1:
    ret
//void ih264_memset_mul_8(UWORD8 *pu1_dst,
// UWORD8 value,
// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// w1 => value
// w2 => num_bytes
//------------------------------------------------------------------------------
// ih264_memset_mul_8_av8 -- fill num_bytes bytes with a byte value, 8 at a time.
//   In:    x0 = pu1_dst, w1 = value (low byte used), w2 = num_bytes
//   Out:   none
//   Clobb: x0, w2, v0, flags
//   Assumes num_bytes is 8, 16 or 32; the loop exits only when the counter
//   reaches exactly zero.
//------------------------------------------------------------------------------
.global ih264_memset_mul_8_av8
ih264_memset_mul_8_av8:
    dup     v0.8b, w1                   // replicate the fill byte into 8 lanes
1:
    subs    w2, w2, #8                  // remaining -= 8 (sets Z for the branch)
    st1     {v0.8b}, [x0], #8           // write 8 bytes, dst += 8 (flags untouched)
    b.ne    1b                          // loop until remaining == 0
    ret
//void ih264_memset(UWORD8 *pu1_dst,
// UWORD8 value,
// UWORD32 num_bytes)
//**************Variables Vs Registers*************************
// x0 => *pu1_dst
// w1 => value
// w2 => num_bytes
//------------------------------------------------------------------------------
// ih264_memset_av8 -- fill an arbitrary number of bytes with a byte value.
//   In:    x0 = pu1_dst, w1 = value (low byte used), w2 = num_bytes
//   Out:   none
//   Clobb: x0, w2, v0, flags
//   Fills in 8-byte NEON chunks, then writes the remaining 1..7 bytes one at
//   a time. A zero-length request returns immediately.
//------------------------------------------------------------------------------
.global ih264_memset_av8
ih264_memset_av8:
    // Without this guard a count of 0 falls into the byte loop, whose
    // counter then wraps below zero and never hits the exit condition.
    cbz     w2, end_func2
    subs    w2, w2, #8                  // w2 = remaining - 8
    blt     arm_memset                  // fewer than 8 bytes: byte loop only
    dup     v0.8b, w1                   // replicate the fill byte into 8 lanes
loop_neon_memset:
    // Memset 8 bytes
    st1     {v0.8b}, [x0], #8
    subs    w2, w2, #8
    bge     loop_neon_memset            // at least 8 more bytes pending
    cmn     w2, #8                      // w2 == -8  <=>  nothing left over
    beq     end_func2
arm_memset:
    add     w2, w2, #8                  // undo the bias: w2 = 1..7 leftover bytes
loop_arm_memset:
    strb    w1, [x0], #1
    subs    w2, w2, #1
    bne     loop_arm_memset
end_func2:
    ret
//void ih264_memset_16bit_mul_8(UWORD16 *pu2_dst,
// UWORD16 value,
// UWORD32 num_words)
//**************Variables Vs Registers*************************
// x0 => *pu2_dst
// w1 => value
// w2 => num_words
//------------------------------------------------------------------------------
// ih264_memset_16bit_mul_8_av8 -- fill num_words 16-bit elements with a value,
// eight elements (16 bytes) per iteration.
//   In:    x0 = pu2_dst, w1 = value (low 16 bits used), w2 = num_words
//   Out:   none
//   Clobb: x0, w2, v0, flags
//   Assumes num_words is 8, 16 or 32; the loop exits only when the counter
//   reaches exactly zero.
//------------------------------------------------------------------------------
.global ih264_memset_16bit_mul_8_av8
ih264_memset_16bit_mul_8_av8:
    dup     v0.8h, w1                   // replicate the halfword into 8 lanes
1:
    st1     {v0.8h}, [x0], #16          // write 8 halfwords, dst += 16 bytes
    subs    w2, w2, #8                  // remaining words -= 8
    b.ne    1b                          // loop until remaining == 0
    ret
//void ih264_memset_16bit(UWORD16 *pu2_dst,
// UWORD16 value,
// UWORD32 num_words)
//**************Variables Vs Registers*************************
// x0 => *pu2_dst
// w1 => value
// w2 => num_words
//------------------------------------------------------------------------------
// ih264_memset_16bit_av8 -- fill an arbitrary number of 16-bit elements.
//   In:    x0 = pu2_dst, w1 = value (low 16 bits used), w2 = num_words
//   Out:   none
//   Clobb: x0, w2, v0, flags
//   Fills eight elements (16 bytes) per NEON iteration, then writes the
//   remaining 1..7 elements one halfword at a time. A zero-length request
//   returns immediately.
//------------------------------------------------------------------------------
.global ih264_memset_16bit_av8
ih264_memset_16bit_av8:
    // Without this guard a count of 0 falls into the halfword loop, whose
    // counter then wraps below zero and never hits the exit condition.
    cbz     w2, end_func3
    subs    w2, w2, #8                  // w2 = remaining words - 8
    blt     arm_memset_16bit            // fewer than 8 words: halfword loop only
    dup     v0.4h, w1                   // replicate the halfword into 4 lanes
loop_neon_memset_16bit:
    // Memset 8 words (two 4-halfword stores)
    st1     {v0.4h}, [x0], #8
    st1     {v0.4h}, [x0], #8
    subs    w2, w2, #8
    bge     loop_neon_memset_16bit      // at least 8 more words pending
    cmn     w2, #8                      // w2 == -8  <=>  nothing left over
    beq     end_func3
arm_memset_16bit:
    add     w2, w2, #8                  // undo the bias: w2 = 1..7 leftover words
loop_arm_memset_16bit:
    strh    w1, [x0], #2
    subs    w2, w2, #1
    bne     loop_arm_memset_16bit
end_func3:
    ret