mirror of
https://github.com/cemu-project/Cemu.git
synced 2024-12-29 11:11:51 +01:00
251 lines
8.6 KiB
ArmAsm
251 lines
8.6 KiB
ArmAsm
//******************************************************************************
|
|
//*
|
|
//* Copyright (C) 2015 The Android Open Source Project
|
|
//*
|
|
//* Licensed under the Apache License, Version 2.0 (the "License");
|
|
//* you may not use this file except in compliance with the License.
|
|
//* You may obtain a copy of the License at:
|
|
//*
|
|
//* http://www.apache.org/licenses/LICENSE-2.0
|
|
//*
|
|
//* Unless required by applicable law or agreed to in writing, software
|
|
//* distributed under the License is distributed on an "AS IS" BASIS,
|
|
//* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
//* See the License for the specific language governing permissions and
|
|
//* limitations under the License.
|
|
//*
|
|
//*****************************************************************************
|
|
//* Originally developed and contributed by Ittiam Systems Pvt. Ltd, Bangalore
|
|
//*/
|
|
///**
|
|
// *******************************************************************************
|
|
// * @file
|
|
// * ih264_ihadamard_scaling_av8.s
|
|
// *
|
|
// * @brief
|
|
// * Contains function definitions for inverse hadamard transform on 4x4 DC outputs
|
|
// * of 16x16 intra-prediction
|
|
// *
|
|
// * @author
|
|
// * Mohit
|
|
// *
|
|
// * @par List of Functions:
|
|
// * - ih264_ihadamard_scaling_4x4_av8()
|
|
// *
|
|
// * @remarks
|
|
// * None
|
|
// *
|
|
.include "ih264_neon_macros.s"
|
|
|
|
// *******************************************************************************
|
|
// */
|
|
// * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
|
|
// * of a 16x16 intra prediction macroblock, and then performs scaling.
|
|
// * prediction buffer
|
|
// *
|
|
// * @par Description:
|
|
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
|
|
// * The inverse-transformed content is then scaled based on the Qp value.
|
|
// *
|
|
// * @param[in] pi2_src
|
|
// * input 4x4 block of DC coefficients
|
|
// *
|
|
// * @param[out] pi2_out
|
|
// * output 4x4 block
|
|
// *
|
|
// * @param[in] pu2_iscal_mat
|
|
// * pointer to scaling list
|
|
// *
|
|
// * @param[in] pu2_weigh_mat
|
|
// * pointer to weight matrix
|
|
// *
|
|
// * @param[in] u4_qp_div_6
|
|
// * Floor (qp/6)
|
|
// *
|
|
// * @param[in] pi4_tmp
|
|
// * temporary buffer of size 1*16
|
|
// *
|
|
// * @returns none
|
|
// *
|
|
// * @remarks none
|
|
// *
|
|
// *******************************************************************************
|
|
// */
|
|
// *
|
|
// *******************************************************************************
|
|
// */
|
|
// void ih264_ihadamard_scaling_4x4(word16* pi2_src,
|
|
// word16* pi2_out,
|
|
// const uword16 *pu2_iscal_mat,
|
|
// const uword16 *pu2_weigh_mat,
|
|
// uword32 u4_qp_div_6,
|
|
// word32* pi4_tmp)
|
|
//**************variables vs registers*****************************************
|
|
//x0 => *pi2_src
|
|
//x1 => *pi2_out
|
|
//x2 => *pu2_iscal_mat
|
|
//x3 => *pu2_weigh_mat
|
|
//x4=> u4_qp_div_6
|
|
|
|
.text
|
|
.p2align 2
|
|
|
|
.global ih264_ihadamard_scaling_4x4_av8

//------------------------------------------------------------------------------
// ih264_ihadamard_scaling_4x4_av8
//
// Two butterfly passes over a 4x4 block of signed 16-bit values (with a
// transpose in between), followed by a uniform scale, a left shift and a
// rounding/saturating narrow back to 16 bits.
//
// In:    x0 = pi2_src        16 halfwords are read
//        x1 = pi2_out        16 halfwords are written
//        x2 = pu2_iscal_mat  only element [0] is read
//        x3 = pu2_weigh_mat  only element [0] is read
//        w4 = u4_qp_div_6    used directly as the per-lane left-shift count
//        (a sixth argument, pi4_tmp, is never referenced here)
// Vector registers written: v0-v7, v14, v15, v16
//
// Per element the arithmetic performed is
//        out = sat16( ((t * iscal[0] * weigh[0]) << u4_qp_div_6) + 32 ) >> 6 )
// where t is the twice-transformed value held in 32 bits.
//------------------------------------------------------------------------------
ih264_ihadamard_scaling_4x4_av8:

    // push_v_regs / pop_v_regs are macros defined outside this file
    // (presumably in ih264_neon_macros.s; they are expected to preserve the
    // callee-saved NEON registers, of which v14 and v15 are used below).
    push_v_regs

    //---- operand fetch ------------------------------------------------------
    ld1     {v16.h}[0], [x2]                     // v16.h[0] = pu2_iscal_mat[0]
    ld1     {v15.h}[0], [x3]                     // v15.h[0] = pu2_weigh_mat[0]
    dup     v14.4s, w4                           // v14 = {qp/6, qp/6, qp/6, qp/6}
    ld4     {v0.4h, v1.4h, v2.4h, v3.4h}, [x0]   // de-interleave: vK.h[r] = pi2_src[4*r + K]

    //---- combined scale factor ----------------------------------------------
    // Only lane 0 of v15/v16 was loaded, so only lane 0 of the product is
    // meaningful; it is then broadcast to all four lanes.
    umull   v15.4s, v15.4h, v16.4h               // lane 0 = weigh[0] * iscal[0]
    dup     v15.4s, v15.s[0]                     // v15 = scale in every lane

    //---- pass 1: butterflies, widening 16 -> 32 bits ------------------------
    saddl   v4.4s, v0.4h, v3.4h                  // a = s0 + s3
    saddl   v5.4s, v1.4h, v2.4h                  // b = s1 + s2
    ssubl   v6.4s, v1.4h, v2.4h                  // c = s1 - s2
    ssubl   v7.4s, v0.4h, v3.4h                  // d = s0 - s3

    add     v0.4s, v4.4s, v5.4s                  // t0 = a + b
    add     v1.4s, v7.4s, v6.4s                  // t1 = d + c
    sub     v2.4s, v4.4s, v5.4s                  // t2 = a - b
    sub     v3.4s, v7.4s, v6.4s                  // t3 = d - c

    //---- 4x4 transpose of 32-bit lanes (v0..v3 -> v0..v3) -------------------
    trn1    v4.4s, v0.4s, v1.4s                  // {t0[0], t1[0], t0[2], t1[2]}
    trn2    v5.4s, v0.4s, v1.4s                  // {t0[1], t1[1], t0[3], t1[3]}
    trn1    v6.4s, v2.4s, v3.4s                  // {t2[0], t3[0], t2[2], t3[2]}
    trn2    v7.4s, v2.4s, v3.4s                  // {t2[1], t3[1], t2[3], t3[3]}

    trn1    v0.2d, v4.2d, v6.2d                  // {t0[0], t1[0], t2[0], t3[0]}
    trn1    v1.2d, v5.2d, v7.2d                  // {t0[1], t1[1], t2[1], t3[1]}
    trn2    v2.2d, v4.2d, v6.2d                  // {t0[2], t1[2], t2[2], t3[2]}
    trn2    v3.2d, v5.2d, v7.2d                  // {t0[3], t1[3], t2[3], t3[3]}

    //---- pass 2: same butterflies on the transposed data (32-bit) -----------
    add     v4.4s, v0.4s, v3.4s                  // a = r0 + r3
    add     v5.4s, v1.4s, v2.4s                  // b = r1 + r2
    sub     v6.4s, v1.4s, v2.4s                  // c = r1 - r2
    sub     v7.4s, v0.4s, v3.4s                  // d = r0 - r3

    add     v0.4s, v4.4s, v5.4s                  // elements  0..3  = a + b
    add     v1.4s, v7.4s, v6.4s                  // elements  4..7  = d + c
    sub     v2.4s, v4.4s, v5.4s                  // elements  8..11 = a - b
    sub     v3.4s, v7.4s, v6.4s                  // elements 12..15 = d - c

    //---- scaling: p = t * scale ---------------------------------------------
    mul     v0.4s, v0.4s, v15.4s
    mul     v1.4s, v1.4s, v15.4s
    mul     v2.4s, v2.4s, v15.4s
    mul     v3.4s, v3.4s, v15.4s

    //---- q = p << (qp/6) ----------------------------------------------------
    sshl    v0.4s, v0.4s, v14.4s
    sshl    v1.4s, v1.4s, v14.4s
    sshl    v2.4s, v2.4s, v14.4s
    sshl    v3.4s, v3.4s, v14.4s

    //---- out = sat16((q + 32) >> 6): rounding, saturating narrow ------------
    sqrshrn v0.4h, v0.4s, #6
    sqrshrn v1.4h, v1.4s, #6
    sqrshrn v2.4h, v2.4s, #6
    sqrshrn v3.4h, v3.4s, #6

    st1     {v0.4h, v1.4h, v2.4h, v3.4h}, [x1]   // 16 halfwords, elements 0..15 in order

    pop_v_regs
    ret
|
|
|
|
|
|
// *******************************************************************************
|
|
// */
|
|
// * @brief This function performs a 2x2 inverse hadamard transform for chroma block
|
|
// *
|
|
// * @par Description:
|
|
// * The DC coefficients pass through a 2-stage inverse hadamard transform.
|
|
// * The inverse-transformed content is then scaled based on the Qp value.
|
|
// * The DC blocks of both the U and V planes are processed.
|
|
// *
|
|
// * @param[in] pi2_src
|
|
// * input 1x8 block of coeffs. The first 4 are from U and the next 4 are from V
|
|
// *
|
|
// * @param[out] pi2_out
|
|
// * output 1x8 block
|
|
// *
|
|
// * @param[in] pu2_iscal_mat
|
|
// * pointer to scaling list
|
|
// *
|
|
// * @param[in] pu2_weigh_mat
|
|
// * pointer to weight matrix
|
|
// *
|
|
// * @param[in] u4_qp_div_6
|
|
// * Floor (qp/6)
|
|
// *
|
|
// * @returns none
|
|
// *
|
|
// * @remarks none
|
|
// *
|
|
// *******************************************************************************
|
|
// */
|
|
// *
|
|
// *******************************************************************************
|
|
// */
|
|
// void ih264_ihadamard_scaling_2x2_uv(WORD16* pi2_src,
|
|
// WORD16* pi2_out,
|
|
// const UWORD16 *pu2_iscal_mat,
|
|
// const UWORD16 *pu2_weigh_mat,
|
|
// UWORD32 u4_qp_div_6)
|
|
|
|
.global ih264_ihadamard_scaling_2x2_uv_av8

//------------------------------------------------------------------------------
// ih264_ihadamard_scaling_2x2_uv_av8
//
// 2x2 butterfly transform applied to two groups of four signed 16-bit
// coefficients at once (the file header describes them as U followed by V),
// then scaled and shifted.
//
// In:    x0 = pi2_src        8 halfwords are read
//        x1 = pi2_out        8 halfwords (16 bytes) are written
//        x2 = pu2_iscal_mat  only element [0] is read
//        x3 = pu2_weigh_mat  only element [0] is read
//        w4 = u4_qp_div_6    (overwritten with u4_qp_div_6 - 5)
// Vector registers written: v0-v4, v26, v27, v28, v30
//
// Per element:  out = (int16)( (t * iscal[0] * weigh[0]) shifted by (qp/6 - 5) )
// where a negative shift count is an arithmetic right shift (sshl semantics)
// and the narrowing simply keeps the low 16 bits (xtn, no saturation).
//------------------------------------------------------------------------------
ih264_ihadamard_scaling_2x2_uv_av8:

    // push_v_regs / pop_v_regs are macros defined outside this file.
    // NOTE(review): none of the vector registers written here lie in v8-v15,
    // so the save/restore looks removable - confirm against the macro bodies.
    push_v_regs

    ld1     {v26.h}[0], [x2]                // v26.h[0] = pu2_iscal_mat[0]
    ld1     {v27.h}[0], [x3]                // v27.h[0] = pu2_weigh_mat[0]

    sub     w4, w4, #5                      // shift = qp/6 - 5 (negative => right shift)
    dup     v28.4s, w4                      // same shift count in every lane

    ld2     {v0.4h, v1.4h}, [x0]            // de-interleave the 8 DC coeffs:
                                            //   v0 = {x4, x6, y4, y6}  (src[0], [2], [4], [6])
                                            //   v1 = {x5, x7, y5, y7}  (src[1], [3], [5], [7])

    saddl   v2.4s, v0.4h, v1.4h             // {x0, x2, y0, y2}: x0 = x4 + x5, x2 = x6 + x7
    ssubl   v4.4s, v0.4h, v1.4h             // {x1, x3, y1, y3}: x1 = x4 - x5, x3 = x6 - x7

    // Only lane 0 of v26/v27 was loaded, so only lane 0 of the product is
    // meaningful; broadcast it to all four lanes.
    umull   v30.4s, v26.4h, v27.4h          // lane 0 = pu2_iscal_mat[0] * pu2_weigh_mat[0]
    dup     v30.4s, v30.s[0]

    trn1    v0.4s, v2.4s, v4.4s             // {x0, x1, y0, y1}
    trn2    v1.4s, v2.4s, v4.4s             // {x2, x3, y2, y3}

    add     v2.4s, v0.4s, v1.4s             // {x4', x5', y4', y5'} = {x0 + x2, x1 + x3, ...}
    sub     v3.4s, v0.4s, v1.4s             // {x6', x7', y6', y7'} = {x0 - x2, x1 - x3, ...}

    mul     v2.4s, v2.4s, v30.4s            // apply the combined scale factor
    mul     v3.4s, v3.4s, v30.4s

    sshl    v2.4s, v2.4s, v28.4s            // shift by (qp/6 - 5)
    sshl    v3.4s, v3.4s, v28.4s

    xtn     v0.4h, v2.4s                    // v0.4h = {x4', x5', y4', y5'}; upper 64 bits cleared
    xtn     v1.4h, v3.4s                    // v1.4h = {x6', x7', y6', y7'}; upper 64 bits cleared

    // Interleave 32-bit pairs so memory receives
    //   x4' x5' x6' x7' y4' y5' y6' y7'
    // Only the low 64 bits of v0/v1 hold results, so store the .2s form
    // (16 bytes). The .4s form would also write the zeroed upper halves,
    // i.e. 16 extra bytes past the 8-halfword output block.
    st2     {v0.2s, v1.2s}, [x1]

    pop_v_regs
    ret
|
|
|
|
|
|
|